-rw-r--r--  ChangeLog                                    |  11
-rw-r--r--  sysdeps/aarch64/multiarch/ifunc-impl-list.c  |   1
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_thunderx2.S | 572
-rw-r--r--  sysdeps/aarch64/multiarch/memmove.c          |   5
4 files changed, 227 insertions(+), 362 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 3c4dcb3..c71f249 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2019-05-03 Anton Youdkevitch <anton.youdkevitch@bell-sw.com>
+
+ * sysdeps/aarch64/multiarch/ifunc-impl-list.c: Added
+ __memmove_thunderx2 to the list of implementations.
+ * sysdeps/aarch64/multiarch/memmove.c: Likewise.
+ * sysdeps/aarch64/multiarch/memcpy_thunderx2.S:
+ (__memmove_thunderx2): Rewritten using SIMD ld/st.
+ (__memcpy_thunderx2): Fixed handling of overlapping cases.
+ Used ldp/stp instead of ldr/str where possible. Made loop
+ tails branchless.
+
2019-05-03 Florian Weimer <fweimer@redhat.com>

* misc/tst-tsearch.c (walk_tree): Add more error checking.
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 5f43362..10ff7d4 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -45,6 +45,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx2)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
IFUNC_IMPL (i, name, memset,
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index 45e9a29..058db48 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -31,8 +31,8 @@
#define dst x3
#define srcend x4
#define dstend x5
-#define tmp2 x6
-#define tmp3 x7
+#define tmp2 x6
+#define tmp3 x7
#define tmp3w w7
#define A_l x6
#define A_lw w6
@@ -53,26 +53,26 @@
#define G_h dst
#define tmp1 x14
-#define A_q q0
-#define B_q q1
-#define C_q q2
-#define D_q q3
-#define E_q q4
-#define F_q q5
-#define G_q q6
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
#define H_q q7
#define I_q q16
#define J_q q17
-#define A_v v0
-#define B_v v1
-#define C_v v2
-#define D_v v3
-#define E_v v4
-#define F_v v5
-#define G_v v6
-#define H_v v7
-#define I_v v16
+#define A_v v0
+#define B_v v1
+#define C_v v2
+#define D_v v3
+#define E_v v4
+#define F_v v5
+#define G_v v6
+#define H_v v7
+#define I_v v16
#define J_v v17
#ifndef MEMMOVE
@@ -85,16 +85,14 @@
#if IS_IN (libc)
#undef MEMCPY
-#undef MEMMOVE
#define MEMCPY __memcpy_thunderx2
+#undef MEMMOVE
#define MEMMOVE __memmove_thunderx2
-/* Moves are split into 3 main cases: small copies of up to 16 bytes,
- medium copies of 17..96 bytes which are fully unrolled. Large copies
- of more than 96 bytes align the destination and use an unrolled loop
- processing 64 bytes per iteration.
- Overlapping large forward memmoves use a loop that copies backwards.
+/* Overlapping large forward memmoves use a loop that copies backwards.
+   Otherwise memcpy is used. Small moves branch to L(memcopy16) directly;
+   the remaining larger cases fall through into the memcpy head.
*/
ENTRY_ALIGN (MEMMOVE, 6)
@@ -103,188 +101,14 @@ ENTRY_ALIGN (MEMMOVE, 6)
DELOUSE (1)
DELOUSE (2)
+ add srcend, src, count
+ cmp count, 16
+ b.ls L(memcopy16)
sub tmp1, dstin, src
cmp count, 96
ccmp tmp1, count, 2, hi
b.lo L(move_long)
- prfm PLDL1KEEP, [src]
- add srcend, src, count
- add dstend, dstin, count
- cmp count, 16
- b.ls L(copy16)
- cmp count, 96
- b.hi L(copy_long)
-
- /* Medium copies: 17..96 bytes. */
- sub tmp1, count, 1
- ldp A_l, A_h, [src]
- tbnz tmp1, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz tmp1, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
- /* Small copies: 0..16 bytes. */
-L(copy16):
- cmp count, 8
- b.lo 1f
- ldr A_l, [src]
- ldr A_h, [srcend, -8]
- str A_l, [dstin]
- str A_h, [dstend, -8]
- ret
- .p2align 4
-1:
- tbz count, 2, 1f
- ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
- str A_lw, [dstin]
- str A_hw, [dstend, -4]
- ret
-
- /* Copy 0..3 bytes. Use a branchless sequence that copies the same
- byte 3 times if count==1, or the 2nd byte twice if count==2. */
-1:
- cbz count, 2f
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
- ldrb B_lw, [src, tmp1]
- strb A_lw, [dstin]
- strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-2: ret
-
- .p2align 4
- /* Copy 64..96 bytes. Copy 64 bytes from the start and
- 32 bytes from the end. */
-L(copy96):
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [src, 32]
- ldp D_l, D_h, [src, 48]
- ldp E_l, E_h, [srcend, -32]
- ldp F_l, F_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin, 32]
- stp D_l, D_h, [dstin, 48]
- stp E_l, E_h, [dstend, -32]
- stp F_l, F_h, [dstend, -16]
- ret
-
- /* Align DST to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
- .p2align 4
-L(copy_long):
- and tmp1, dstin, 15
- bic dst, dstin, 15
- ldp D_l, D_h, [src]
- sub src, src, tmp1
- add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
- subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls L(last64)
-L(loop64):
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]!
- ldp D_l, D_h, [src, 64]!
- subs count, count, 64
- b.hi L(loop64)
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the end even if
- there is just 1 byte left. */
-L(last64):
- ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
- ret
-
- .p2align 4
-L(move_long):
- cbz tmp1, 3f
-
- add srcend, src, count
- add dstend, dstin, count
-
- /* Align dstend to 16 byte alignment so that we don't cross cache line
- boundaries on both loads and stores. There are at least 96 bytes
- to copy, so copy 16 bytes unaligned and then align. The loop
- copies 64 bytes per iteration and prefetches one iteration ahead. */
-
- and tmp1, dstend, 15
- ldp D_l, D_h, [srcend, -16]
- sub srcend, srcend, tmp1
- sub count, count, tmp1
- ldp A_l, A_h, [srcend, -16]
- stp D_l, D_h, [dstend, -16]
- ldp B_l, B_h, [srcend, -32]
- ldp C_l, C_h, [srcend, -48]
- ldp D_l, D_h, [srcend, -64]!
- sub dstend, dstend, tmp1
- subs count, count, 128
- b.ls 2f
-
- nop
-1:
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [srcend, -16]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [srcend, -48]
- stp D_l, D_h, [dstend, -64]!
- ldp D_l, D_h, [srcend, -64]!
- subs count, count, 64
- b.hi 1b
-
- /* Write the last full set of 64 bytes. The remainder is at most 64
- bytes, so it is safe to always copy 64 bytes from the start even if
- there is just 1 byte left. */
-2:
- ldp G_l, G_h, [src, 48]
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [src, 32]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [src, 16]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [src]
- stp D_l, D_h, [dstend, -64]
- stp G_l, G_h, [dstin, 48]
- stp A_l, A_h, [dstin, 32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin]
-3: ret
-
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
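
Note (not part of the patch): the new __memmove_thunderx2 entry sequence
above reduces to a small dispatch, sketched in C below. The helper names
are made up for illustration and the byte loops stand in for the
q-register ldp/stp sequences. Moves of up to 96 bytes are handled by code
that issues every load before any store (so overlap is harmless), large
moves whose destination overlaps the source from above take the
backward-copying L(move_long) loop, and the remaining large moves fall
through into memcpy.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Backward byte copy, modelling L(move_long): highest addresses first.  */
static void
copy_backwards (unsigned char *d, const unsigned char *s, size_t n)
{
  while (n--)
    d[n] = s[n];
}

/* Forward byte copy, standing in for the non-overlapping memcpy body.  */
static void
copy_forwards (unsigned char *d, const unsigned char *s, size_t n)
{
  for (size_t i = 0; i < n; i++)
    d[i] = s[i];
}

void *
sketch_memmove (void *dstin, const void *src, size_t count)
{
  unsigned char *d = dstin;
  const unsigned char *s = src;
  /* sub tmp1, dstin, src: the move overlaps "forward" iff tmp1 < count.  */
  uintptr_t tmp1 = (uintptr_t) d - (uintptr_t) s;

  if (count <= 96)
    {
      /* The 0..16 and 17..96 byte paths load everything before storing
         anything, so overlap is harmless; model that with a buffer.  */
      unsigned char tmp[96];
      memcpy (tmp, s, count);
      memcpy (d, tmp, count);
    }
  else if (tmp1 < count)
    copy_backwards (d, s, count);   /* b.lo L(move_long) */
  else
    copy_forwards (d, s, count);    /* falls through into memcpy */
  return dstin;
}

For any pair of possibly overlapping buffers this model gives the same
result as memmove; the real routine differs only in how each branch is
implemented.
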
@@ -293,227 +117,217 @@ libc_hidden_builtin_def (MEMMOVE)
medium copies of 17..96 bytes which are fully unrolled. Large copies
of more than 96 bytes align the destination and use load-and-merge
approach in the case src and dst addresses are unaligned not evenly,
- so that, loads and stores are always aligned.
- Large copies use an unrolled loop processing 64 bytes per iteration.
- The current optimized memcpy implementation is not compatible with
- memmove and is separated from it completely.
-
- memcpy implementation below is not compatible with memmove
- because of pipelined loads/stores, which are faster, but they
- can't be used in the case of overlapping memmove arrays */
+   so that the actual loads and stores are always aligned.
+   Large copies use loops processing 64 bytes per iteration for the
+   unaligned case and 128 bytes per iteration for the aligned one.
+*/
#define MEMCPY_PREFETCH_LDR 640
+ .p2align 4
ENTRY (MEMCPY)
+
DELOUSE (0)
DELOUSE (1)
DELOUSE (2)
- add srcend, src, count
- cmp count, 16
- b.ls L(memcopy16)
- ldr A_q, [src], #16
- add dstend, dstin, count
- and tmp1, src, 15
- cmp count, 96
- b.hi L(memcopy_long)
+ add srcend, src, count
+ cmp count, 16
+ b.ls L(memcopy16)
+ ldr A_q, [src], #16
+ add dstend, dstin, count
+ and tmp1, src, 15
+ cmp count, 96
+ b.hi L(memcopy_long)
/* Medium copies: 17..96 bytes. */
- ldr E_q, [srcend, -16]
- cmp count, 64
- b.gt L(memcpy_copy96)
- cmp count, 48
- b.le L(bytes_17_to_48)
+ ldr E_q, [srcend, -16]
+ cmp count, 64
+ b.gt L(memcpy_copy96)
+ cmp count, 48
+ b.le L(bytes_17_to_48)
/* 49..64 bytes */
- ldp B_q, C_q, [src]
- str E_q, [dstend, -16]
- stp A_q, B_q, [dstin]
- str C_q, [dstin, 32]
+ ldp B_q, C_q, [src]
+ str E_q, [dstend, -16]
+ stp A_q, B_q, [dstin]
+ str C_q, [dstin, 32]
ret
L(bytes_17_to_48):
/* 17..48 bytes*/
- cmp count, 32
- b.gt L(bytes_32_to_48)
+ cmp count, 32
+ b.gt L(bytes_32_to_48)
/* 17..32 bytes*/
- str A_q, [dstin]
- str E_q, [dstend, -16]
+ str A_q, [dstin]
+ str E_q, [dstend, -16]
ret
L(bytes_32_to_48):
/* 32..48 */
- ldr B_q, [src]
- str A_q, [dstin]
- str E_q, [dstend, -16]
- str B_q, [dstin, 16]
+ ldr B_q, [src]
+ str A_q, [dstin]
+ str E_q, [dstend, -16]
+ str B_q, [dstin, 16]
ret
.p2align 4
/* Small copies: 0..16 bytes. */
L(memcopy16):
- cmp count, 8
- b.lo L(bytes_0_to_8)
- ldr A_l, [src]
- ldr A_h, [srcend, -8]
- add dstend, dstin, count
- str A_l, [dstin]
- str A_h, [dstend, -8]
+ cmp count, 8
+ b.lo L(bytes_0_to_8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ add dstend, dstin, count
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
ret
.p2align 4
L(bytes_0_to_8):
- tbz count, 2, L(bytes_0_to_3)
- ldr A_lw, [src]
- ldr A_hw, [srcend, -4]
- add dstend, dstin, count
- str A_lw, [dstin]
- str A_hw, [dstend, -4]
+ tbz count, 2, L(bytes_0_to_3)
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ add dstend, dstin, count
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
ret
/* Copy 0..3 bytes. Use a branchless sequence that copies the same
byte 3 times if count==1, or the 2nd byte twice if count==2. */
L(bytes_0_to_3):
- cbz count, L(end)
- lsr tmp1, count, 1
- ldrb A_lw, [src]
- ldrb A_hw, [srcend, -1]
- add dstend, dstin, count
- ldrb B_lw, [src, tmp1]
- strb A_lw, [dstin]
- strb B_lw, [dstin, tmp1]
- strb A_hw, [dstend, -1]
-L(end):
+ cbz count, 1f
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb A_hw, [srcend, -1]
+ add dstend, dstin, count
+ ldrb B_lw, [src, tmp1]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
+ strb A_lw, [dstin]
+1:
ret
.p2align 4
L(memcpy_copy96):
/* Copying 65..96 bytes. A_q (first 16 bytes) and
- E_q(last 16 bytes) are already loaded.
-
- The size is large enough to benefit from aligned
- loads */
- bic src, src, 15
- ldp B_q, C_q, [src]
- str A_q, [dstin]
+   E_q (last 16 bytes) are already loaded. The size
+   is large enough to benefit from aligned loads. */
+ bic src, src, 15
+ ldp B_q, C_q, [src]
/* Loaded 64 bytes, second 16-bytes chunk can be
overlapping with the first chunk by tmp1 bytes.
Stored 16 bytes. */
- sub dst, dstin, tmp1
- add count, count, tmp1
+ sub dst, dstin, tmp1
+ add count, count, tmp1
/* The range of count being [65..96] becomes [65..111]
after tmp [0..15] gets added to it,
count now is <bytes-left-to-load>+48 */
- cmp count, 80
- b.gt L(copy96_medium)
- ldr D_q, [src, 32]
- stp B_q, C_q, [dst, 16]
- str E_q, [dstend, -16]
- str D_q, [dst, 48]
+ cmp count, 80
+ b.gt L(copy96_medium)
+ ldr D_q, [src, 32]
+ stp B_q, C_q, [dst, 16]
+ str D_q, [dst, 48]
+ str A_q, [dstin]
+ str E_q, [dstend, -16]
ret
.p2align 4
L(copy96_medium):
- ldp D_q, A_q, [src, 32]
- str B_q, [dst, 16]
- cmp count, 96
- b.gt L(copy96_large)
- str E_q, [dstend, -16]
- stp C_q, D_q, [dst, 32]
- str A_q, [dst, 64]
+ ldp D_q, G_q, [src, 32]
+ cmp count, 96
+ b.gt L(copy96_large)
+ stp B_q, C_q, [dst, 16]
+ stp D_q, G_q, [dst, 48]
+ str A_q, [dstin]
+ str E_q, [dstend, -16]
ret
L(copy96_large):
- ldr F_q, [src, 64]
- stp C_q, D_q, [dst, 32]
- str E_q, [dstend, -16]
- stp A_q, F_q, [dst, 64]
+ ldr F_q, [src, 64]
+ str B_q, [dst, 16]
+ stp C_q, D_q, [dst, 32]
+ stp G_q, F_q, [dst, 64]
+ str A_q, [dstin]
+ str E_q, [dstend, -16]
ret
.p2align 4
L(memcopy_long):
- bic src, src, 15
- ldp B_q, C_q, [src], #32
- str A_q, [dstin]
- sub dst, dstin, tmp1
- add count, count, tmp1
- add dst, dst, 16
+ bic src, src, 15
+ ldp B_q, C_q, [src], #32
+ sub dst, dstin, tmp1
+ add count, count, tmp1
+ add dst, dst, 16
and tmp1, dst, 15
- ldp D_q, E_q, [src], #32
- str B_q, [dst], #16
+ ldp D_q, E_q, [src], #32
+ str A_q, [dstin]
/* Already loaded 64+16 bytes. Check if at
least 64 more bytes left */
- subs count, count, 64+64+16
- b.lt L(loop128_exit2)
- cmp count, MEMCPY_PREFETCH_LDR + 64 + 32
- b.lt L(loop128)
+ subs count, count, 64+64+16
+ b.lt L(loop128_exit0)
+ cmp count, MEMCPY_PREFETCH_LDR + 64 + 32
+ b.lt L(loop128)
cbnz tmp1, L(dst_unaligned)
- sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32
+ sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32
.p2align 4
L(loop128_prefetch):
- str C_q, [dst], #16
- prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
- str D_q, [dst], #16
- ldp F_q, G_q, [src], #32
- str E_q, [dst], #16
- ldp H_q, A_q, [src], #32
- str F_q, [dst], #16
- prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
- str G_q, [dst], #16
- ldp B_q, C_q, [src], #32
- str H_q, [dst], #16
- ldp D_q, E_q, [src], #32
- stp A_q, B_q, [dst], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp F_q, G_q, [src], #32
+ stp B_q, C_q, [dst], #32
+ ldp H_q, I_q, [src], #32
+ prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+ ldp B_q, C_q, [src], #32
+ stp D_q, E_q, [dst], #32
+ ldp D_q, E_q, [src], #32
+ stp F_q, G_q, [dst], #32
+ stp H_q, I_q, [dst], #32
subs count, count, 128
- b.ge L(loop128_prefetch)
+ b.ge L(loop128_prefetch)
-L(preloop128):
add count, count, MEMCPY_PREFETCH_LDR + 64 + 32
.p2align 4
L(loop128):
- ldp F_q, G_q, [src], #32
- str C_q, [dst], #16
- ldp B_q, A_q, [src], #32
- str D_q, [dst], #16
- stp E_q, F_q, [dst], #32
- stp G_q, B_q, [dst], #32
- subs count, count, 64
- b.lt L(loop128_exit1)
-L(loop128_proceed):
- ldp B_q, C_q, [src], #32
- str A_q, [dst], #16
- ldp D_q, E_q, [src], #32
- str B_q, [dst], #16
- subs count, count, 64
- b.ge L(loop128)
-
- .p2align 4
-L(loop128_exit2):
- stp C_q, D_q, [dst], #32
- str E_q, [dst], #16
- b L(copy_long_check32);
-
+ ldp F_q, G_q, [src], #32
+ ldp H_q, I_q, [src], #32
+ stp B_q, C_q, [dst], #32
+ stp D_q, E_q, [dst], #32
+ subs count, count, 64
+ b.lt L(loop128_exit1)
+ ldp B_q, C_q, [src], #32
+ ldp D_q, E_q, [src], #32
+ stp F_q, G_q, [dst], #32
+ stp H_q, I_q, [dst], #32
+ subs count, count, 64
+ b.ge L(loop128)
+L(loop128_exit0):
+ ldp F_q, G_q, [srcend, -64]
+ ldp H_q, I_q, [srcend, -32]
+ stp B_q, C_q, [dst], #32
+ stp D_q, E_q, [dst]
+ stp F_q, G_q, [dstend, -64]
+ stp H_q, I_q, [dstend, -32]
+ ret
L(loop128_exit1):
- /* A_q is still not stored and 0..63 bytes left,
- so, count is -64..-1.
- Check if less than 32 bytes left (count < -32) */
- str A_q, [dst], #16
-L(copy_long_check32):
- cmn count, 64
- b.eq L(copy_long_done)
- cmn count, 32
- b.le L(copy_long_last32)
- ldp B_q, C_q, [src]
- stp B_q, C_q, [dst]
-
-L(copy_long_last32):
- ldp F_q, G_q, [srcend, -32]
- stp F_q, G_q, [dstend, -32]
-
-L(copy_long_done):
+ ldp B_q, C_q, [srcend, -64]
+ ldp D_q, E_q, [srcend, -32]
+ stp F_q, G_q, [dst], #32
+ stp H_q, I_q, [dst]
+ stp B_q, C_q, [dstend, -64]
+ stp D_q, E_q, [dstend, -32]
+ ret
+
+L(dst_unaligned_tail):
+ ldp C_q, D_q, [srcend, -64]
+ ldp E_q, F_q, [srcend, -32]
+ stp A_q, B_q, [dst], #32
+ stp H_q, I_q, [dst], #16
+ str G_q, [dst, tmp1]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstend, -32]
ret
L(dst_unaligned):
@@ -542,17 +356,20 @@ L(dst_unaligned):
/* Store the 16 bytes to dst and align dst for further
operations, several bytes will be stored at this
address once more */
- str C_q, [dst], #16
- ldp F_q, G_q, [src], #32
+
+ ldp F_q, G_q, [src], #32
+ stp B_q, C_q, [dst], #32
bic dst, dst, 15
- subs count, count, 32
+ sub count, count, 32
adrp tmp2, L(ext_table)
add tmp2, tmp2, :lo12:L(ext_table)
add tmp2, tmp2, tmp1, LSL #2
ldr tmp3w, [tmp2]
add tmp2, tmp2, tmp3w, SXTW
br tmp2
-.p2align 4 ;\
+
+.p2align 4
+ /* To make the loop in each chunk 16-byte aligned. */
nop
#define EXT_CHUNK(shft) \
L(ext_size_ ## shft):;\
@@ -573,7 +390,7 @@ L(ext_size_ ## shft):;\
b.ge 1b;\
2:;\
ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\
- b L(ext_tail);
+ b L(dst_unaligned_tail);
EXT_CHUNK(1)
EXT_CHUNK(2)
@@ -591,12 +408,45 @@ EXT_CHUNK(13)
EXT_CHUNK(14)
EXT_CHUNK(15)
-L(ext_tail):
- stp A_q, B_q, [dst], #32
- stp H_q, I_q, [dst], #16
- add dst, dst, tmp1
- str G_q, [dst], #16
- b L(copy_long_check32)
+L(move_long):
+ .p2align 4
+1:
+ cbz tmp1, 3f
+
+ add srcend, src, count
+ add dstend, dstin, count
+
+ and tmp1, srcend, 15
+ ldr D_q, [srcend, -16]
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls 2f
+
+ .p2align 4
+1:
+ subs count, count, 64
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -64]!
+ b.hi 1b
+
+ /* Write the last full set of 64 bytes. The remainder is at most 64
+ bytes, so it is safe to always copy 64 bytes from the start even if
+ there is just 1 byte left. */
+2:
+ ldp E_q, F_q, [src, 32]
+ ldp G_q, H_q, [src]
+ stp A_q, B_q, [dstend, -32]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp G_q, H_q, [dstin]
+3: ret
END (MEMCPY)
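
Note (not part of the patch): the "branchless tails" mentioned in the
ChangeLog follow the pattern sketched below in C. The sketch aligns the
destination, copies whole 64-byte blocks, and then unconditionally
re-copies the final 64 bytes addressed from the end of both buffers, so
the 1..64 leftover bytes never need size-specific branches. The new
assembly differs in detail: it aligns the source for its loads, moves to
128-byte iterations when the destination is aligned as well, and uses the
ext-based load-and-merge path when it is not. The memcpy calls below
merely stand in for q-register ldp/stp pairs.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Shape of the large-copy path: count > 96, non-overlapping buffers.  */
void
sketch_copy_large (unsigned char *dstin, const unsigned char *src,
                   size_t count)
{
  unsigned char *dstend = dstin + count;
  const unsigned char *srcend = src + count;

  /* Head: one unaligned 16-byte copy, then advance so the loop starts
     on a 16-byte boundary of the destination.  A few bytes may be
     copied twice, which is harmless.  */
  memcpy (dstin, src, 16);
  size_t skew = 16 - ((uintptr_t) dstin & 15);
  unsigned char *dst = dstin + skew;
  src += skew;
  count -= skew;

  /* Main loop: whole 64-byte blocks, always leaving 1..64 bytes over.  */
  while (count > 64)
    {
      memcpy (dst, src, 64);
      dst += 64;
      src += 64;
      count -= 64;
    }

  /* Branchless tail: always store the last 64 bytes, addressed from the
     end of the buffers; this may overlap bytes already written.  */
  memcpy (dstend - 64, srcend - 64, 64);
}
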
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index f58dde3..f3d341b 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -30,6 +30,7 @@ extern __typeof (__redirect_memmove) __libc_memmove;
extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
libc_ifunc (__libc_memmove,
@@ -37,7 +38,9 @@ libc_ifunc (__libc_memmove,
? __memmove_thunderx
: (IS_FALKOR (midr) || IS_PHECDA (midr)
? __memmove_falkor
- : __memmove_generic)));
+ : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+ ? __memmove_thunderx2
+ : __memmove_generic))));
# undef memmove
strong_alias (__libc_memmove, memmove);
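
Note (not part of the patch): the nested conditional in the libc_ifunc
call above selects an implementation in the order restated by the
standalone helper below. The boolean parameters stand in for glibc's
IS_THUNDERX, IS_FALKOR/IS_PHECDA and IS_THUNDERX2/IS_THUNDERX2PA tests on
midr, and the function-pointer parameters stand in for the candidate
routines; only the selection order is taken from the patch.

#include <stddef.h>

typedef void *(*memmove_fn) (void *, const void *, size_t);

/* Plain if/else restatement of the ifunc selector's order.  */
static memmove_fn
select_memmove (int is_thunderx, int is_falkor_or_phecda,
                int is_thunderx2_or_thunderx2pa,
                memmove_fn thunderx, memmove_fn falkor,
                memmove_fn thunderx2, memmove_fn generic)
{
  if (is_thunderx)
    return thunderx;
  if (is_falkor_or_phecda)
    return falkor;
  if (is_thunderx2_or_thunderx2pa)    /* new case added by this patch */
    return thunderx2;
  return generic;
}

The ThunderX2 check sits after the existing ThunderX and Falkor/Phecda
checks, so the selection on those CPUs is unchanged.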