diff options
author | Wilco Dijkstra <wilco.dijkstra@arm.com> | 2023-10-26 17:07:21 +0100 |
---|---|---|
committer | Wilco Dijkstra <wilco.dijkstra@arm.com> | 2023-11-13 16:50:44 +0000 |
commit | 3d7090f14b13312320e425b27dcf0fe72de026fd (patch) | |
tree | 907257f5525b5a709cc5e3ad04845f95c77ff2b9 | |
parent | 9627ab99b50d250c6dd3001a3355aa03692f7fe5 (diff) | |
download | glibc-3d7090f14b13312320e425b27dcf0fe72de026fd.zip glibc-3d7090f14b13312320e425b27dcf0fe72de026fd.tar.gz glibc-3d7090f14b13312320e425b27dcf0fe72de026fd.tar.bz2 |
AArch64: Add memset_zva64
Add a specialized memset for the common ZVA size of 64 bytes to avoid the
overhead of reading the ZVA size from DCZID_EL0 at run time. Since the
code is identical to __memset_falkor, remove the latter.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
-rw-r--r-- | sysdeps/aarch64/memset.S | 10 | ||||
-rw-r--r-- | sysdeps/aarch64/multiarch/Makefile | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/multiarch/ifunc-impl-list.c | 4 | ||||
-rw-r--r-- | sysdeps/aarch64/multiarch/memset.c | 9 | ||||
-rw-r--r-- | sysdeps/aarch64/multiarch/memset_falkor.S | 54 | ||||
-rw-r--r-- | sysdeps/aarch64/multiarch/memset_zva64.S | 27 |
6 files changed, 38 insertions, 68 deletions
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S index bf3cf85..bbfb718 100644 --- a/sysdeps/aarch64/memset.S +++ b/sysdeps/aarch64/memset.S @@ -101,19 +101,19 @@ L(tail64): ret L(try_zva): -#ifdef ZVA_MACRO - zva_macro -#else +#ifndef ZVA64_ONLY .p2align 3 mrs tmp1, dczid_el0 tbnz tmp1w, 4, L(no_zva) and tmp1w, tmp1w, 15 cmp tmp1w, 4 /* ZVA size is 64 bytes. */ b.ne L(zva_128) - + nop +#endif /* Write the first and last 64 byte aligned block using stp rather than using DC ZVA. This is faster on some cores. */ + .p2align 4 L(zva_64): str q0, [dst, 16] stp q0, q0, [dst, 32] @@ -123,7 +123,6 @@ L(zva_64): sub count, dstend, dst /* Count is now 128 too large. */ sub count, count, 128+64+64 /* Adjust count and bias for loop. */ add dst, dst, 128 - nop 1: dc zva, dst add dst, dst, 64 subs count, count, 64 @@ -134,6 +133,7 @@ L(zva_64): stp q0, q0, [dstend, -32] ret +#ifndef ZVA64_ONLY .p2align 3 L(zva_128): cmp tmp1w, 5 /* ZVA size is 128 bytes. */ diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index a1a4de3..171ca5e 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -12,10 +12,10 @@ sysdep_routines += \ memmove_mops \ memset_a64fx \ memset_emag \ - memset_falkor \ memset_generic \ memset_kunpeng \ memset_mops \ + memset_zva64 \ strlen_asimd \ strlen_generic \ # sysdep_routines diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index 3596d3c..fdd9ea9 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -54,9 +54,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, mops, __memmove_mops) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic)) IFUNC_IMPL (i, name, memset, - /* Enable this on non-falkor processors too so that other cores - can do a comparative analysis with __memset_generic. 
*/ - IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor) + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva64) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng) #if HAVE_AARCH64_SVE_ASM diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c index 9193b19..6deb686 100644 --- a/sysdeps/aarch64/multiarch/memset.c +++ b/sysdeps/aarch64/multiarch/memset.c @@ -28,7 +28,7 @@ extern __typeof (__redirect_memset) __libc_memset; -extern __typeof (__redirect_memset) __memset_falkor attribute_hidden; +extern __typeof (__redirect_memset) __memset_zva64 attribute_hidden; extern __typeof (__redirect_memset) __memset_emag attribute_hidden; extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden; extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden; @@ -47,18 +47,17 @@ select_memset_ifunc (void) { if (IS_A64FX (midr) && zva_size == 256) return __memset_a64fx; - return __memset_generic; } if (IS_KUNPENG920 (midr)) return __memset_kunpeng; - if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64) - return __memset_falkor; - if (IS_EMAG (midr)) return __memset_emag; + if (zva_size == 64) + return __memset_zva64; + return __memset_generic; } diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S deleted file mode 100644 index c6946a8..0000000 --- a/sysdeps/aarch64/multiarch/memset_falkor.S +++ /dev/null @@ -1,54 +0,0 @@ -/* Memset for falkor. - Copyright (C) 2017-2023 Free Software Foundation, Inc. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <memset-reg.h> - -/* Reading dczid_el0 is expensive on falkor so move it into the ifunc - resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to - use this function only when ZVA is enabled. */ - -#if IS_IN (libc) -.macro zva_macro - .p2align 4 - /* Write the first and last 64 byte aligned block using stp rather - than using DC ZVA. This is faster on some cores. */ - str q0, [dst, 16] - stp q0, q0, [dst, 32] - bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ - add dst, dst, 128 -1: dc zva, dst - add dst, dst, 64 - subs count, count, 64 - b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret -.endm - -# define ZVA_MACRO zva_macro -# define MEMSET __memset_falkor -# include <sysdeps/aarch64/memset.S> -#endif diff --git a/sysdeps/aarch64/multiarch/memset_zva64.S b/sysdeps/aarch64/multiarch/memset_zva64.S new file mode 100644 index 0000000..13f45fd --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset_zva64.S @@ -0,0 +1,27 @@ +/* Optimized memset for zva size = 64. + Copyright (C) 2023 Free Software Foundation, Inc. + + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define ZVA64_ONLY 1 +#define MEMSET __memset_zva64 +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(X) + +#include "../memset.S" |