aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Rini <trini@konsulko.com>2021-09-24 10:13:44 -0400
committerTom Rini <trini@konsulko.com>2021-09-24 10:13:44 -0400
commit7d1fcaea128ff689aea75deaf7f75d75d1b553d3 (patch)
tree8e378b623cd2b5f2d9883b93aa7afa9feb5ff42e
parent2c14ff587998e2b0a94bc2b693bcd43094a40b8c (diff)
parent4e062fc955b684d004b252b33b006a6a16899f5c (diff)
downloadu-boot-7d1fcaea128ff689aea75deaf7f75d75d1b553d3.zip
u-boot-7d1fcaea128ff689aea75deaf7f75d75d1b553d3.tar.gz
u-boot-7d1fcaea128ff689aea75deaf7f75d75d1b553d3.tar.bz2
Merge branch '2021-09-24-arm64-optimized-str-funcs' into next
- Bring in, but disable by default, asm optimized string functions for arm64.
-rw-r--r--arch/arm/Kconfig43
-rw-r--r--arch/arm/include/asm/string.h4
-rw-r--r--arch/arm/lib/Makefile5
-rw-r--r--arch/arm/lib/asmdefs.h98
-rw-r--r--arch/arm/lib/memcpy-arm64.S242
-rw-r--r--arch/arm/lib/memset-arm64.S148
6 files changed, 532 insertions, 8 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 95102d3..9c4787f 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -455,8 +455,8 @@ config ARM_CORTEX_CPU_IS_UP
config USE_ARCH_MEMCPY
bool "Use an assembly optimized implementation of memcpy"
- default y
- depends on !ARM64
+ default y if !ARM64
+ depends on !ARM64 || (ARM64 && (GCC_VERSION >= 90400))
help
Enable the generation of an optimized version of memcpy.
Such an implementation may be faster under some conditions
@@ -465,7 +465,7 @@ config USE_ARCH_MEMCPY
config SPL_USE_ARCH_MEMCPY
bool "Use an assembly optimized implementation of memcpy for SPL"
default y if USE_ARCH_MEMCPY
- depends on !ARM64 && SPL
+ depends on SPL
help
Enable the generation of an optimized version of memcpy.
Such an implementation may be faster under some conditions
@@ -474,16 +474,43 @@ config SPL_USE_ARCH_MEMCPY
config TPL_USE_ARCH_MEMCPY
bool "Use an assembly optimized implementation of memcpy for TPL"
default y if USE_ARCH_MEMCPY
- depends on !ARM64 && TPL
+ depends on TPL
help
Enable the generation of an optimized version of memcpy.
Such an implementation may be faster under some conditions
but may increase the binary size.
+config USE_ARCH_MEMMOVE
+ bool "Use an assembly optimized implementation of memmove" if !ARM64
+ default USE_ARCH_MEMCPY if ARM64
+ depends on ARM64
+ help
+ Enable the generation of an optimized version of memmove.
+ Such an implementation may be faster under some conditions
+ but may increase the binary size.
+
+config SPL_USE_ARCH_MEMMOVE
+ bool "Use an assembly optimized implementation of memmove for SPL" if !ARM64
+ default SPL_USE_ARCH_MEMCPY if ARM64
+ depends on SPL && ARM64
+ help
+ Enable the generation of an optimized version of memmove.
+ Such an implementation may be faster under some conditions
+ but may increase the binary size.
+
+config TPL_USE_ARCH_MEMMOVE
+ bool "Use an assembly optimized implementation of memmove for TPL" if !ARM64
+ default TPL_USE_ARCH_MEMCPY if ARM64
+ depends on TPL && ARM64
+ help
+ Enable the generation of an optimized version of memmove.
+ Such an implementation may be faster under some conditions
+ but may increase the binary size.
+
config USE_ARCH_MEMSET
bool "Use an assembly optimized implementation of memset"
- default y
- depends on !ARM64
+ default y if !ARM64
+ depends on !ARM64 || (ARM64 && (GCC_VERSION >= 90400))
help
Enable the generation of an optimized version of memset.
Such an implementation may be faster under some conditions
@@ -492,7 +519,7 @@ config USE_ARCH_MEMSET
config SPL_USE_ARCH_MEMSET
bool "Use an assembly optimized implementation of memset for SPL"
default y if USE_ARCH_MEMSET
- depends on !ARM64 && SPL
+ depends on SPL
help
Enable the generation of an optimized version of memset.
Such an implementation may be faster under some conditions
@@ -501,7 +528,7 @@ config SPL_USE_ARCH_MEMSET
config TPL_USE_ARCH_MEMSET
bool "Use an assembly optimized implementation of memset for TPL"
default y if USE_ARCH_MEMSET
- depends on !ARM64 && TPL
+ depends on TPL
help
Enable the generation of an optimized version of memset.
Such an implementation may be faster under some conditions
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index 11eaa34..ead3f2c 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -19,7 +19,11 @@ extern char * strchr(const char * s, int c);
#endif
extern void * memcpy(void *, const void *, __kernel_size_t);
+#if CONFIG_IS_ENABLED(USE_ARCH_MEMMOVE)
+#define __HAVE_ARCH_MEMMOVE
+#else
#undef __HAVE_ARCH_MEMMOVE
+#endif
extern void * memmove(void *, const void *, __kernel_size_t);
#undef __HAVE_ARCH_MEMCHR
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 7f66332..c48e1f6 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -39,8 +39,13 @@ obj-$(CONFIG_$(SPL_TPL_)FRAMEWORK) += spl.o
obj-$(CONFIG_SPL_FRAMEWORK) += zimage.o
obj-$(CONFIG_OF_LIBFDT) += bootm-fdt.o
endif
+ifdef CONFIG_ARM64
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset-arm64.o
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy-arm64.o
+else
obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset.o
obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy.o
+endif
obj-$(CONFIG_SEMIHOSTING) += semihosting.o
obj-y += bdinfo.o
diff --git a/arch/arm/lib/asmdefs.h b/arch/arm/lib/asmdefs.h
new file mode 100644
index 0000000..d307a3a
--- /dev/null
+++ b/arch/arm/lib/asmdefs.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Macros for asm code.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+#if defined(__aarch64__)
+
+/* Branch Target Identitication support. */
+#define BTI_C hint 34
+#define BTI_J hint 36
+/* Return address signing support (pac-ret). */
+#define PACIASP hint 25; .cfi_window_save
+#define AUTIASP hint 29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .text
+
+/* If set then the GNU Property Note section will be added to
+ mark objects to support BTI and PAC-RET. */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files. */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .cfi_startproc; \
+ BTI_C;
+
+#else
+
+#define END_FILE
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .cfi_startproc;
+
+#endif
+
+#define ENTRY(name) ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name) \
+ .global name; \
+ .type name,%function; \
+ name:
+
+#define END(name) \
+ .cfi_endproc; \
+ .size name, .-name;
+
+#define L(l) .L ## l
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n) mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of size arguments as per aapcs64 */
+#define SIZE_ARG(n) mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
+#endif
diff --git a/arch/arm/lib/memcpy-arm64.S b/arch/arm/lib/memcpy-arm64.S
new file mode 100644
index 0000000..507054d
--- /dev/null
+++ b/arch/arm/lib/memcpy-arm64.S
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
+#define tmp1 x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (memmove)
+ENTRY (memcpy)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+ ret
+
+END (memcpy)
diff --git a/arch/arm/lib/memset-arm64.S b/arch/arm/lib/memset-arm64.S
new file mode 100644
index 0000000..ee9f9a9
--- /dev/null
+++ b/arch/arm/lib/memset-arm64.S
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2021, Arm Limited.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#include <asm/macro.h>
+#include "asmdefs.h"
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+
+ENTRY (memset)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+
+ /*
+ * The optimized memset uses the dc opcode, which causes problems
+ * when the cache is disabled. Let's check if the cache is disabled
+ * and use a very simple memset implementation in this case. Otherwise
+ * jump to the optimized version.
+ */
+ switch_el x6, 3f, 2f, 1f
+3: mrs x6, sctlr_el3
+ b 0f
+2: mrs x6, sctlr_el2
+ b 0f
+1: mrs x6, sctlr_el1
+0:
+ tst x6, #CR_C
+ bne 9f
+
+ /*
+ * A very "simple" memset implementation without the use of the
+ * dc opcode. Can be run with caches disabled.
+ */
+ mov x3, #0x0
+ cmp count, x3 /* check for zero length */
+ beq 8f
+4: strb valw, [dstin, x3]
+ add x3, x3, #0x1
+ cmp count, x3
+ bne 4b
+8: ret
+9:
+
+ /* Here the optimized memset version starts */
+ dup v0.16B, valw
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+ mov val, v0.D[0]
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ .p2align 4
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+ /* Set 17..96 bytes. */
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+L(set_long):
+ and valw, valw, 255
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 160
+ ccmp valw, 0, 0, hs
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub dst, dst, 16 /* Dst is biased by -32. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+END (memset)