diff options
author | Feng Xue <fxue@os.amperecomputing.com> | 2018-08-09 04:38:03 -0400 |
---|---|---|
committer | Feng Xue <fxue@os.amperecomputing.com> | 2019-02-01 08:14:21 -0500 |
commit | 83d1cc42d8e6b18a4b6ba53addfdae98c694ea36 (patch) | |
tree | 31baf4b5309964aadf357cb81b21f5e6614a983e | |
parent | c7d3890ff51bceb38fac0947ce1f2bb0c34f6b15 (diff) | |
download | glibc-83d1cc42d8e6b18a4b6ba53addfdae98c694ea36.zip glibc-83d1cc42d8e6b18a4b6ba53addfdae98c694ea36.tar.gz glibc-83d1cc42d8e6b18a4b6ba53addfdae98c694ea36.tar.bz2 |
aarch64: Optimized memchr specific to AmpereComputing emag
This version uses general register based memory instruction to load
data, because vector register based is slightly slower in emag.
Character-matching is performed on 16-byte (both size and alignment)
memory block in parallel each iteration.
* sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
[!MEMCHR](MEMCHR): Set to __memchr.
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
Add memchr_generic and memchr_nosimd.
* sysdeps/aarch64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add memchr ifuncs.
* sysdeps/aarch64/multiarch/memchr.c: New file.
* sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
* sysdeps/aarch64/multiarch/memchr_nosimd.S: Likewise.
-rw-r--r-- | ChangeLog | 12 | ||||
-rw-r--r-- | sysdeps/aarch64/memchr.S | 10 | ||||
-rw-r--r-- | sysdeps/aarch64/multiarch/Makefile | 1 | ||||
-rw-r--r-- | sysdeps/aarch64/multiarch/ifunc-impl-list.c | 3 | ||||
-rw-r--r-- | sysdeps/aarch64/multiarch/memchr.c | 41 | ||||
-rw-r--r-- | sysdeps/aarch64/multiarch/memchr_generic.S | 33 | ||||
-rw-r--r-- | sysdeps/aarch64/multiarch/memchr_nosimd.S | 223 |
7 files changed, 320 insertions, 3 deletions
@@ -1,5 +1,17 @@ 2019-02-01 Feng Xue <fxue@os.amperecomputing.com> + * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR. + [!MEMCHR](MEMCHR): Set to __memchr. + * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): + Add memchr_generic and memchr_nosimd. + * sysdeps/aarch64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Add memchr ifuncs. + * sysdeps/aarch64/multiarch/memchr.c: New file. + * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise. + * sysdeps/aarch64/multiarch/memchr_nosimd.S: Likewise. + +2019-02-01 Feng Xue <fxue@os.amperecomputing.com> + * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add memset_emag. * sysdeps/aarch64/multiarch/ifunc-impl-list.c diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S index bb7b971..7023d91 100644 --- a/sysdeps/aarch64/memchr.S +++ b/sysdeps/aarch64/memchr.S @@ -26,6 +26,10 @@ * Neon Available. */ +#ifndef MEMCHR +# define MEMCHR __memchr +#endif + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -59,7 +63,7 @@ * identify exactly which byte has matched. */ -ENTRY (__memchr) +ENTRY (MEMCHR) /* Do not dereference srcin if no bytes to compare. */ cbz cntin, L(zero_length) /* @@ -152,6 +156,6 @@ L(tail): L(zero_length): mov result, #0 ret -END (__memchr) -weak_alias (__memchr, memchr) +END (MEMCHR) +weak_alias (MEMCHR, memchr) libc_hidden_builtin_def (memchr) diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index 3c6c879..4150b89 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -2,5 +2,6 @@ ifeq ($(subdir),string) sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \ memcpy_falkor memmove_falkor \ memset_generic memset_falkor memset_emag \ + memchr_generic memchr_nosimd \ strlen_generic strlen_asimd endif diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index d1f7df6..5f43362 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -53,6 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor) IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) + IFUNC_IMPL (i, name, memchr, + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_nosimd) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_generic)) IFUNC_IMPL (i, name, strlen, IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd) diff --git a/sysdeps/aarch64/multiarch/memchr.c b/sysdeps/aarch64/multiarch/memchr.c new file mode 100644 index 0000000..3a2f1d1 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memchr.c @@ -0,0 +1,41 @@ +/* Multiple versions of memchr. AARCH64 version. + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) +/* Redefine memchr so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef memchr +# define memchr __redirect_memchr +# include <string.h> +# include <init-arch.h> + +extern __typeof (__redirect_memchr) __memchr; + +extern __typeof (__redirect_memchr) __memchr_generic attribute_hidden; +extern __typeof (__redirect_memchr) __memchr_nosimd attribute_hidden; + +libc_ifunc (__memchr, + ((IS_EMAG (midr) + ? __memchr_nosimd + : __memchr_generic))); + +# undef memchr +strong_alias (__memchr, memchr); +#endif diff --git a/sysdeps/aarch64/multiarch/memchr_generic.S b/sysdeps/aarch64/multiarch/memchr_generic.S new file mode 100644 index 0000000..707148b --- /dev/null +++ b/sysdeps/aarch64/multiarch/memchr_generic.S @@ -0,0 +1,33 @@ +/* Memchr for aarch64, default version for internal use. + Copyright (C) 2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define MEMCHR __memchr_generic + +/* Do not hide the generic version of memchr, we use it internally. */ +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) + +/* Add a hidden definition for use within libc.so. */ +# ifdef SHARED + .globl __GI_memchr; __GI_memchr = __memchr_generic +# endif +#endif + +# include "../memchr.S" diff --git a/sysdeps/aarch64/multiarch/memchr_nosimd.S b/sysdeps/aarch64/multiarch/memchr_nosimd.S new file mode 100644 index 0000000..5ce8eb7 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memchr_nosimd.S @@ -0,0 +1,223 @@ +/* memchr - find a character in a memory zone using base integer registers + + Copyright (C) 2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Use base integer registers. + */ + +#ifndef MEMCHR +# define MEMCHR __memchr_nosimd +#endif + +/* Arguments and results. */ +#define srcin x0 +#define chrin x1 +#define cntin x2 + +#define result x0 + +#define repchr x1 + +#define tmp1 x2 +#define tmp2 x3 +#define tmp3 x4 +#define tmp4 x5 + +#define src x6 +#define srcend x7 +#define srcend16 x8 + +#define anymore x9 + +#define zeroones x10 + +#define data1 x11 +#define data2 x12 + +#define has_chr1 x13 +#define has_chr2 x14 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + + +ENTRY_ALIGN (MEMCHR, 6) + + DELOUSE (0) + DELOUSE (2) + + /* Do not dereference srcin if no bytes to compare. */ + cbz cntin, L(none_chr) + + /* Start address is 16-byte aligned or not? */ + tst srcin, 15 + bic src, srcin, 15 + + mov zeroones, REP8_01 + and repchr, chrin, 255 + /* Generate a qword integer as |c|c|c|c|c|c|c|c|. */ + mul repchr, repchr, zeroones + + add srcend, srcin, cntin + /* + * srcend16 is address of the block following the last block. + * + * [A block is 16-byte aligned and sized.] + */ + add srcend16, srcend, 15 + bic srcend16, srcend16, 15 + + b.eq L(loop) + + /* Load the first block containing start address. */ + ldp data1, data2, [src], 16 + + lsl tmp1, srcin, 3 + mov tmp2, ~0 +#ifdef __AARCH64EB__ + lsr tmp3, tmp2, tmp1 +#else + lsl tmp3, tmp2, tmp1 +#endif + /* Start address is in the first or the second qword? */ + tst srcin, 8 + + /* + * Transform any byte in the block to zero using XOR operation, + * if that byte equals the char to search. In this way, searching + * the char becomes detecting zero in the resulting two qwords. + */ + eor data1, data1, repchr + eor data2, data2, repchr + + /* + * Set those unused bytes(before start address) to 0xff, so + * that they will not hit any zero detection. + */ + orn tmp1, data1, tmp3 + orn tmp2, data2, tmp3 + + csinv data1, tmp1, xzr, eq + csel data2, data2, tmp2, eq + + /* + * When the first and last block are the same, there are two cases: + * o. Memory range to search is just in one block. + * ( start address - end address) < 0 + * + * o. Memory range is so large that end address wrap-around. + * ( start address - end address) > 0 + */ + cmp srcin, srcend + ccmp src, srcend16, 0, mi + csetm anymore, ne + b L(find_chr) + + .p2align 4 +L(loop): + ldp data1, data2, [src], 16 + + subs anymore, src, srcend16 + + /* + * Transform any byte in the block to zero using XOR operation, + * if that byte equals the char to search. + */ + eor data1, data1, repchr + eor data2, data2, repchr + +L(find_chr): + /* + * Use the following integer test to find out if any byte in a + * qword is zero. If do not contain zero-valued byte, test result + * is zero. + * + * (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080 + * = + * (qword - 0x0101010101010101) & ~(qword | 0x7f7f7f7f7f7f7f7f) + * + */ + sub tmp1, data1, zeroones + sub tmp2, data2, zeroones + + orr tmp3, data1, REP8_7f + orr tmp4, data2, REP8_7f + + bic has_chr1, tmp1, tmp3 + bic has_chr2, tmp2, tmp4 + + orr tmp1, has_chr1, has_chr2 + ccmp tmp1, 0, 0, ne + + b.eq L(loop) + + cbz has_chr1, 1f + sub result, src, 16 +#ifdef __AARCH64EB__ + rev data1, data1 +#else + rev has_chr1, has_chr1 +#endif + b L(done) + +1: cbz has_chr2, L(none_chr) + sub result, src, 8 +#ifdef __AARCH64EB__ + rev data1, data2 +#else + rev has_chr1, has_chr2 +#endif + +L(done): +#ifdef __AARCH64EB__ + /* + * For big-endian, can not directly use has_chr1/has_chr2 because + * two qwords has been reversed after loading from memory. + * Thus, have to perform char detection on two qwords again, which + * should be byte-swapped this time. + */ + sub tmp1, data1, zeroones + orr tmp3, data1, REP8_7f + bic has_chr1, tmp1, tmp3 + rev has_chr1, has_chr1 +#endif + + /* + * If the specified char is found in a qword, the corresponding + * byte of in has_chr has value of 1, while this is only true for + * the first occurrence, not other occurrences. + */ + cmp anymore, 0 + clz tmp1, has_chr1 + add result, result, tmp1, lsr 3 + ccmp result, srcend, 8, eq /* NZCV = 8000 */ + csel result, result, xzr, mi + ret + +L(none_chr): + mov result, 0 + ret + +END (MEMCHR) +libc_hidden_builtin_def (MEMCHR) |