author     Wilco Dijkstra <wdijkstr@arm.com>    2020-03-11 17:15:25 +0000
committer  Wilco Dijkstra <wdijkstr@arm.com>    2020-03-11 17:15:25 +0000
commit     700065132744e0dfa6d4d9142d63f6e3a1934726 (patch)
tree       bdac82e2bad3a8a9d41953f1a1e3bfe52358fa77
parent     15ab195229dc288d1d49612c3de14a33b88065ed (diff)
[AArch64] Improve integer memcpy
Further optimize integer memcpy. Small cases now include copies up to 32 bytes. 64-128 byte copies are split into two cases to improve performance of 64-96 byte copies. Comments have been rewritten.
-rw-r--r--  sysdeps/aarch64/memcpy.S  197
1 file changed, 101 insertions, 96 deletions
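
The restructuring described in the commit message is easier to follow in C than in the assembly diff below. The following is a rough, hypothetical sketch (the helpers copy_small and copy_medium and their exact shapes are illustrative, not the glibc code): small copies now cover 0..32 bytes, medium copies cover 33..128 bytes with an extra branch at 96 bytes so that 65..96 byte copies skip one pair of loads and stores, and both paths issue all loads before any store, which is what lets memmove reuse them unchanged for overlapping buffers.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical copy_small: 0..32 bytes.  Each size class copies one chunk
       from the start and one from the end; the two may overlap in the middle.
       The 1..3 byte case is branchless: it writes src[0], src[count/2] and
       src[count-1], which collapse to the same byte for count == 1.  */
    static void
    copy_small (unsigned char *dst, const unsigned char *src, size_t count)
    {
      if (count >= 16)                       /* 16..32 bytes.  */
        {
          uint64_t a[2], d[2];
          memcpy (a, src, 16);
          memcpy (d, src + count - 16, 16);
          memcpy (dst, a, 16);
          memcpy (dst + count - 16, d, 16);
        }
      else if (count >= 8)                   /* 8..15 bytes.  */
        {
          uint64_t a, b;
          memcpy (&a, src, 8);
          memcpy (&b, src + count - 8, 8);
          memcpy (dst, &a, 8);
          memcpy (dst + count - 8, &b, 8);
        }
      else if (count >= 4)                   /* 4..7 bytes.  */
        {
          uint32_t a, b;
          memcpy (&a, src, 4);
          memcpy (&b, src + count - 4, 4);
          memcpy (dst, &a, 4);
          memcpy (dst + count - 4, &b, 4);
        }
      else if (count != 0)                   /* 1..3 bytes, branchless.  */
        {
          unsigned char a = src[0], b = src[count >> 1], c = src[count - 1];
          dst[0] = a;
          dst[count >> 1] = b;
          dst[count - 1] = c;
        }
    }

    /* Hypothetical copy_medium: 33..128 bytes.  All loads happen before any
       store, so overlapping inputs (memmove) are handled for free.  Only
       97..128 byte copies need the extra G/H pair; 65..96 byte copies skip it,
       which is the new split at 96 bytes.  */
    static void
    copy_medium (unsigned char *dst, const unsigned char *src, size_t count)
    {
      uint64_t a[2], b[2], c[2], d[2], e[2], f[2], g[2], h[2];

      memcpy (a, src, 16);
      memcpy (b, src + 16, 16);
      memcpy (c, src + count - 32, 16);
      memcpy (d, src + count - 16, 16);
      if (count > 64)
        {
          memcpy (e, src + 32, 16);
          memcpy (f, src + 48, 16);
          if (count > 96)
            {
              memcpy (g, src + count - 64, 16);
              memcpy (h, src + count - 48, 16);
              memcpy (dst + count - 64, g, 16);
              memcpy (dst + count - 48, h, 16);
            }
          memcpy (dst + 32, e, 16);
          memcpy (dst + 48, f, 16);
        }
      memcpy (dst, a, 16);
      memcpy (dst + 16, b, 16);
      memcpy (dst + count - 32, c, 16);
      memcpy (dst + count - 16, d, 16);
    }
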
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index d0d47e9..e0b4c45 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -33,11 +33,11 @@
#define A_l x6
#define A_lw w6
#define A_h x7
-#define A_hw w7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_l x10
+#define C_lw w10
#define C_h x11
#define D_l x12
#define D_h x13
@@ -51,16 +51,6 @@
#define H_h srcend
#define tmp1 x14
-/* Copies are split into 3 main cases: small copies of up to 32 bytes,
- medium copies of 33..128 bytes which are fully unrolled. Large copies
- of more than 128 bytes align the destination and use an unrolled loop
- processing 64 bytes per iteration.
- In order to share code with memmove, small and medium copies read all
- data before writing, allowing any kind of overlap. So small, medium
- and large backwards memmoves are handled by falling through into memcpy.
- Overlapping large forward memmoves use a loop that copies backwards.
-*/
-
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
@@ -68,118 +58,115 @@
# define MEMCPY memcpy
#endif
-ENTRY_ALIGN (MEMMOVE, 6)
+/* This implementation supports both memcpy and memmove and shares most code.
+ It uses unaligned accesses and branchless sequences to keep the code small,
+ simple and improve performance.
- DELOUSE (0)
- DELOUSE (1)
- DELOUSE (2)
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check in memmove is negligible since it is only required for large copies.
- sub tmp1, dstin, src
- cmp count, 128
- ccmp tmp1, count, 2, hi
- b.lo L(move_long)
-
- /* Common case falls through into memcpy. */
-END (MEMMOVE)
-libc_hidden_builtin_def (MEMMOVE)
-ENTRY (MEMCPY)
+ Large copies use a software pipelined loop processing 64 bytes per
+ iteration. The destination pointer is 16-byte aligned to minimize
+ unaligned accesses. The loop tail is handled by always copying 64 bytes
+ from the end.
+*/
+ENTRY_ALIGN (MEMCPY, 6)
DELOUSE (0)
DELOUSE (1)
DELOUSE (2)
- prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
- cmp count, 32
- b.ls L(copy32)
cmp count, 128
b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
- /* Medium copies: 33..128 bytes. */
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
ldp D_l, D_h, [srcend, -16]
- cmp count, 64
- b.hi L(copy128)
stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
stp D_l, D_h, [dstend, -16]
ret
- .p2align 4
- /* Small copies: 0..32 bytes. */
-L(copy32):
- /* 16-32 bytes. */
- cmp count, 16
- b.lo 1f
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstend, -16]
- ret
- .p2align 4
-1:
- /* 8-15 bytes. */
- tbz count, 3, 1f
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
- .p2align 4
-1:
- /* 4-7 bytes. */
- tbz count, 2, 1f
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
+ ldr B_lw, [srcend, -4]
str A_lw, [dstin]
- str A_hw, [dstend, -4]
+ str B_lw, [dstend, -4]
ret
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
+ ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
.p2align 4
- /* Copy 65..128 bytes. Copy 64 bytes from the start and
- 64 bytes from the end. */
+ /* Copy 65..128 bytes. */
L(copy128):
ldp E_l, E_h, [src, 32]
ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
ldp G_l, G_h, [srcend, -64]
ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp E_l, E_h, [dstin, 32]
stp F_l, F_h, [dstin, 48]
- stp G_l, G_h, [dstend, -64]
- stp H_l, H_h, [dstend, -48]
stp C_l, C_h, [dstend, -32]
stp D_l, D_h, [dstend, -16]
ret
- /* Align DST to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 128 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
.p2align 4
+ /* Copy more than 128 bytes. */
L(copy_long):
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+ ldp D_l, D_h, [src]
and tmp1, dstin, 15
bic dst, dstin, 15
- ldp D_l, D_h, [src]
sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_l, A_h, [src, 16]
@@ -188,7 +175,8 @@ L(copy_long):
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls L(last64)
+ b.ls L(copy64_from_end)
+
L(loop64):
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
@@ -201,10 +189,8 @@ L(loop64):
subs count, count, 64
b.hi L(loop64)
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-L(last64):
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
ldp E_l, E_h, [srcend, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend, -48]
@@ -219,20 +205,42 @@ L(last64):
stp C_l, C_h, [dstend, -16]
ret
- .p2align 4
-L(move_long):
- cbz tmp1, 3f
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+
+ENTRY_ALIGN (MEMMOVE, 4)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
add srcend, src, count
add dstend, dstin, count
+ cmp count, 128
+ b.hi L(move_long)
+ cmp count, 32
+ b.hi L(copy32_128)
- /* Align dstend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 128 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
- and tmp1, dstend, 15
+ .p2align 4
+L(move_long):
+ /* Only use backward copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.hs L(copy_long)
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
sub srcend, srcend, tmp1
sub count, count, tmp1
ldp A_l, A_h, [srcend, -16]
@@ -242,10 +250,9 @@ L(move_long):
ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp1
subs count, count, 128
- b.ls 2f
+ b.ls L(copy64_from_start)
- nop
-1:
+L(loop64_backwards):
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [srcend, -16]
stp B_l, B_h, [dstend, -32]
@@ -255,12 +262,10 @@ L(move_long):
stp D_l, D_h, [dstend, -64]!
ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
- b.hi 1b
+ b.hi L(loop64_backwards)
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
ldp G_l, G_h, [src, 48]
stp A_l, A_h, [dstend, -16]
ldp A_l, A_h, [src, 32]
@@ -273,7 +278,7 @@ L(move_long):
stp A_l, A_h, [dstin, 32]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
-3: ret
+ ret
-END (MEMCPY)
-libc_hidden_builtin_def (MEMCPY)
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
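
The large-copy and memmove changes can be summarized the same way. The sketch below is again only an illustration (hypothetical copy_large and move_large helpers, with plain byte loops standing in for the unrolled, software-pipelined assembly loops): the forward loop finishes by unconditionally copying the last 64 bytes from the end instead of running a variable-length tail, and memmove now performs its overlap check only on the large path, falling back to a backward copy when the destination overlaps the source from above.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical copy_large: more than 128 bytes, buffers assumed not to
       overlap (the plain memcpy case).  Copy 64 bytes per iteration while more
       than 64 bytes remain, then unconditionally copy the last 64 bytes from
       the end: the 1..64 byte remainder is covered, and rewriting a few
       already-copied bytes with identical values is harmless.  */
    static void
    copy_large (unsigned char *dst, const unsigned char *src, size_t count)
    {
      size_t i = 0;

      while (count - i > 64)
        {
          memcpy (dst + i, src + i, 64);
          i += 64;
        }
      memcpy (dst + count - 64, src + count - 64, 64);
    }

    /* Hypothetical move_large: the overlap check is only done on the large
       path, so small and medium copies never pay for it.  The byte loops stand
       in for the unrolled forward and backward assembly loops; the real code
       reuses its forward memcpy path for the "no harmful overlap" case, which
       is safe there because of how its loads and stores are interleaved.  */
    static void
    move_large (unsigned char *dst, const unsigned char *src, size_t count)
    {
      uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;

      if (diff == 0)
        return;                              /* dst == src: nothing to do.  */

      if (diff >= count)                     /* dst below src, or disjoint:  */
        for (size_t i = 0; i < count; i++)   /* a forward copy never reads a */
          dst[i] = src[i];                   /* byte it has already written. */
      else                                   /* dst overlaps src from above: */
        while (count--)                      /* copy backwards instead.      */
          dst[count] = src[count];
    }
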