author:    Adhemerval Zanella <adhemerval.zanella@linaro.org>  2024-03-05 14:02:57 -0300
committer: Adhemerval Zanella <adhemerval.zanella@linaro.org>  2024-03-12 14:38:08 -0300
commit:    2149da36836bc32cd66359ca37bab5884af7e81f (patch)
tree:      a238d5f1dfcacdd229f9c4c3d479e8d304195116 /sysdeps/riscv/multiarch
parent:    2173173d57971d042c0ad4b281431ae127e9b5b8 (diff)
riscv: Fix alignment-ignorant memcpy implementation
The memcpy optimization (commit 587a1290a1af7bee6db) has a series of
mistakes:

  - The implementation is wrong: the chunk size calculation is wrong,
    leading to invalid memory accesses.

  - It adds ifunc support as the default, so --disable-multi-arch does
    not work as expected for riscv.

  - It mixes Linux files (the memcpy ifunc selection, which requires the
    vDSO/syscall mechanism) with generic support (the memcpy optimization
    itself).

  - There is no __libc_ifunc_impl_list, which makes testing check only
    the selected implementation instead of all the implementations
    supported by the system.

This patch also simplifies the bits required to enable ifunc: there is
no need for memcopy.h, nor for Linux-specific files.

The __memcpy_noalignment tail handling now uses a branchless strategy
similar to aarch64 (overlapping 32-bit copies for sizes 4..7 and byte
copies for sizes 1..3).

Checked on riscv64 and riscv32 by explicitly enabling the function in
__libc_ifunc_impl_list on qemu-system.

Changes from v1:
* Implement the memcpy in assembly to correctly handle RISC-V
  strict alignment.

Reviewed-by: Evan Green <evan@rivosinc.com>
Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
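For illustration, the branchless tail described above can be rendered in C
roughly as follows. This is a sketch for exposition only, assuming unaligned
accesses are cheap (RISCV_HWPROBE_MISALIGNED_FAST); copy_tail is a
hypothetical name, and the patch itself implements this in assembly:

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  /* Branchless tail copy: sizes 4..7 use two possibly overlapping 32-bit
     copies; sizes 1..3 use the first, last, and middle byte.  */
  static void
  copy_tail (unsigned char *dst, const unsigned char *src, size_t len)
  {
    if (len & 4)                 /* len is 4..7.  */
      {
        uint32_t head, tail;
        memcpy (&head, src, 4);            /* Both loads first, then both  */
        memcpy (&tail, src + len - 4, 4);  /* stores, matching the
                                              lw/lw/sw/sw order below.  */
        memcpy (dst, &head, 4);
        memcpy (dst + len - 4, &tail, 4);
      }
    else if (len != 0)           /* len is 1..3.  */
      {
        dst[0] = src[0];
        dst[len - 1] = src[len - 1];
        dst[len / 2] = src[len / 2];
      }
  }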
Diffstat (limited to 'sysdeps/riscv/multiarch')
-rw-r--r--  sysdeps/riscv/multiarch/memcpy-generic.c       26
-rw-r--r--  sysdeps/riscv/multiarch/memcpy_noalignment.S  162
2 files changed, 188 insertions, 0 deletions
diff --git a/sysdeps/riscv/multiarch/memcpy-generic.c b/sysdeps/riscv/multiarch/memcpy-generic.c
new file mode 100644
index 0000000..4235d33
--- /dev/null
+++ b/sysdeps/riscv/multiarch/memcpy-generic.c
@@ -0,0 +1,26 @@
+/* Re-include the default memcpy implementation.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <string.h>
+
+#if IS_IN(libc)
+# define MEMCPY __memcpy_generic
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(x)
+#endif
+#include <string/memcpy.c>
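memcpy-generic.c above re-includes the default C memcpy under the name
__memcpy_generic so it can serve as the ifunc fallback. As a rough sketch of
how the runtime selection fits together (hypothetical code: the real resolver
lives in the Linux-specific sysdeps tree and queries the kernel through the
hwprobe vDSO/syscall mechanism; unaligned_access_is_fast is an assumed
stand-in for that query):

  #include <stddef.h>

  extern void *__memcpy_generic (void *, const void *, size_t);
  extern void *__memcpy_noalignment (void *, const void *, size_t);

  /* Hypothetical stand-in for the Linux hwprobe query that reports
     RISCV_HWPROBE_MISALIGNED_FAST; not a real glibc interface.  */
  static int
  unaligned_access_is_fast (void)
  {
    return 0;   /* Conservative default for this sketch.  */
  }

  /* The resolver runs once at load time and picks the implementation.  */
  static void *
  memcpy_resolver (void)
  {
    return unaligned_access_is_fast ()
           ? (void *) __memcpy_noalignment
           : (void *) __memcpy_generic;
  }

  void *memcpy (void *, const void *, size_t)
       __attribute__ ((ifunc ("memcpy_resolver")));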
diff --git a/sysdeps/riscv/multiarch/memcpy_noalignment.S b/sysdeps/riscv/multiarch/memcpy_noalignment.S
new file mode 100644
index 0000000..fa39be2
--- /dev/null
+++ b/sysdeps/riscv/multiarch/memcpy_noalignment.S
@@ -0,0 +1,162 @@
+/* memcpy for RISC-V, ignoring buffer alignment
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <sys/asm.h>
+
+/* memcpy optimization for CPUs with fast unaligned support
+ (RISCV_HWPROBE_MISALIGNED_FAST).
+
+ Copies are split into 3 main cases: small copies up to SZREG, copies up to
+ BLOCK_SIZE (128 for 64 bits, 64 for 32 bits), and copies larger than BLOCK_SIZE.
+
+ Large copies use a software pipelined loop processing BLOCK_SIZE bytes per
+ iteration. The destination pointer is SZREG-byte aligned to minimize store
+ unaligned accesses.
+
+ The tail is handled with branchless copies. */
+
+#define BLOCK_SIZE (16 * SZREG)
+
+ .attribute unaligned_access, 1
+ENTRY (__memcpy_noalignment)
+ beq a2, zero, L(ret)
+
+ /* If LEN < SZREG, jump to tail handling.  */
+ li a5, SZREG-1
+ mv a6, a0
+ bleu a2, a5, L(tail)
+
+ /* Copy the first word, align DEST to word, and adjust DEST/SRC/LEN
+ based on the amount adjusted to align DEST. */
+ REG_L a3, 0(a1)
+ andi a5, a0, SZREG-1
+ addi a2, a2, -SZREG
+ li a4, SZREG
+ sub a4, a4, a5
+ REG_S a3, 0(a0)
+ add a2, a5, a2
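+ /* a4 = SZREG - (DEST % SZREG) is the alignment adjustment; a2 is now
+    LEN - a4, the length remaining once DEST is word-aligned.  */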
+
+ /* If LEN < BLOCK_SIZE jump to word copy. */
+ li a3, BLOCK_SIZE-1
+ add a5, a0, a4
+ add a1, a1, a4
+ bleu a2, a3, L(word_copy_adjust)
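+ /* a7 = LEN rounded down to a multiple of BLOCK_SIZE (LEN >= BLOCK_SIZE
+    here): the number of bytes the block loop will copy.  */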
+ addi a7, a2, -BLOCK_SIZE
+ andi a7, a7, -BLOCK_SIZE
+ addi a7, a7, BLOCK_SIZE
+ add a3, a5, a7
+ mv a4, a1
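+ /* Block loop registers: a4 = SRC cursor, a5 = DEST cursor, a3 = DEST
+    end of the block-copy region.  */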
+L(block_copy):
+ REG_L a6, 0(a4)
+ REG_L t0, SZREG(a4)
+ REG_L t1, (2*SZREG)(a4)
+ REG_L t2, (3*SZREG)(a4)
+ REG_L t3, (4*SZREG)(a4)
+ REG_L t4, (5*SZREG)(a4)
+ REG_L t5, (6*SZREG)(a4)
+ REG_L t6, (7*SZREG)(a4)
+ REG_S a6, 0(a5)
+ REG_S t0, SZREG(a5)
+ REG_S t1, (2*SZREG)(a5)
+ REG_S t2, (3*SZREG)(a5)
+ REG_S t3, (4*SZREG)(a5)
+ REG_S t4, (5*SZREG)(a5)
+ REG_S t5, (6*SZREG)(a5)
+ REG_S t6, (7*SZREG)(a5)
+ REG_L a6, (8*SZREG)(a4)
+ REG_L t0, (9*SZREG)(a4)
+ REG_L t1, (10*SZREG)(a4)
+ REG_L t2, (11*SZREG)(a4)
+ REG_L t3, (12*SZREG)(a4)
+ REG_L t4, (13*SZREG)(a4)
+ REG_L t5, (14*SZREG)(a4)
+ REG_L t6, (15*SZREG)(a4)
+ addi a4, a4, BLOCK_SIZE
+ REG_S a6, (8*SZREG)(a5)
+ REG_S t0, (9*SZREG)(a5)
+ REG_S t1, (10*SZREG)(a5)
+ REG_S t2, (11*SZREG)(a5)
+ REG_S t3, (12*SZREG)(a5)
+ REG_S t4, (13*SZREG)(a5)
+ REG_S t5, (14*SZREG)(a5)
+ REG_S t6, (15*SZREG)(a5)
+ addi a5, a5, BLOCK_SIZE
+ bne a5, a3, L(block_copy)
+ add a1, a1, a7
+ andi a2, a2, BLOCK_SIZE-1
+
+ /* 0 <= a2/LEN < BLOCK_SIZE. */
+L(word_copy):
+ li a5, SZREG-1
+ /* If LEN < SZREG, jump to tail handling.  */
+ bleu a2, a5, L(tail_adjust)
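+ /* a7 = LEN rounded down to a multiple of SZREG: the number of bytes
+    the word loop will copy.  */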
+ addi a7, a2, -SZREG
+ andi a7, a7, -SZREG
+ addi a7, a7, SZREG
+ add a6, a3, a7
+ mv a5, a1
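+ /* Word loop registers: a5 = SRC cursor, a3 = DEST cursor, a6 = DEST
+    end of the word-copy region.  */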
+L(word_copy_loop):
+ REG_L a4, 0(a5)
+ addi a3, a3, SZREG
+ addi a5, a5, SZREG
+ REG_S a4, -SZREG(a3)
+ bne a3, a6, L(word_copy_loop)
+ add a1, a1, a7
+ andi a2, a2, SZREG-1
+
+ /* Copy the last word unaligned; it may overlap bytes already copied
+    (a harmless re-copy when the remaining LEN is 0).  */
+ add a3, a1, a2
+ add a4, a6, a2
+ REG_L t0, -SZREG(a3)
+ REG_S t0, -SZREG(a4)
+ ret
+
+L(tail):
+ /* Copy 4-7 bytes. */
+ andi a5, a2, 4
+ add a3, a1, a2
+ add a4, a6, a2
+ beq a5, zero, L(copy_0_3)
+ lw t0, 0(a1)
+ lw t1, -4(a3)
+ sw t0, 0(a6)
+ sw t1, -4(a4)
+ ret
+
+ /* Copy 0-3 bytes. */
+L(copy_0_3):
+ beq a2, zero, L(ret)
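+ /* LEN is 1..3: store the first byte, the last byte, and the byte at
+    LEN/2.  The possibly overlapping stores cover the whole range.  */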
+ srli a2, a2, 1
+ add t4, a1, a2
+ add t5, a6, a2
+ lbu t0, 0(a1)
+ lbu t1, -1(a3)
+ lbu t2, 0(t4)
+ sb t0, 0(a6)
+ sb t1, -1(a4)
+ sb t2, 0(t5)
+L(ret):
+ ret
+L(tail_adjust):
+ mv a6, a3
+ j L(tail)
+L(word_copy_adjust):
+ mv a3, a5
+ j L(word_copy)
+END (__memcpy_noalignment)
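For readers following the assembly, the overall control flow of
__memcpy_noalignment corresponds roughly to this C sketch (exposition only:
WORD and BLOCK are illustrative stand-ins for SZREG and BLOCK_SIZE, and plain
memcpy calls model the unaligned REG_L/REG_S pairs):

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  #define WORD  sizeof (uintptr_t)   /* Stands in for SZREG bytes.  */
  #define BLOCK (16 * WORD)          /* Stands in for BLOCK_SIZE.  */

  static void *
  memcpy_noalignment (void *dest, const void *src, size_t len)
  {
    unsigned char *d = dest;
    const unsigned char *s = src;

    if (len < WORD)
      {
        /* Branchless tail, mirroring L(tail)/L(copy_0_3) above.  */
        if (len & 4)
          {
            memcpy (d, s, 4);
            memcpy (d + len - 4, s + len - 4, 4);
          }
        else if (len != 0)
          {
            d[0] = s[0];
            d[len - 1] = s[len - 1];
            d[len / 2] = s[len / 2];
          }
        return dest;
      }

    /* Copy the first word unaligned, then advance both pointers so that
       the destination becomes word-aligned.  */
    memcpy (d, s, WORD);
    size_t adjust = WORD - ((uintptr_t) d & (WORD - 1));
    d += adjust; s += adjust; len -= adjust;

    /* Main loop: BLOCK bytes per iteration (L(block_copy) above).  */
    for (; len >= BLOCK; d += BLOCK, s += BLOCK, len -= BLOCK)
      memcpy (d, s, BLOCK);

    /* Word loop for the remaining 0..BLOCK-1 bytes.  */
    for (; len >= WORD; d += WORD, s += WORD, len -= WORD)
      memcpy (d, s, WORD);

    /* Copy the last WORD bytes of the buffers, overlapping data already
       copied; valid because len >= WORD was checked on entry.  */
    memcpy (d + len - WORD, s + len - WORD, WORD);
    return dest;
  }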