diff options
author | H.J. Lu <hjl.tools@gmail.com> | 2017-06-05 11:09:48 -0700 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2017-06-05 11:09:59 -0700 |
commit | ef9c4cb6c7abb6340b52e19de31d2a56c8de5844 (patch) | |
tree | e97752428e31d5d595c22a44b1905b5a91948d24 /sysdeps/x86_64/multiarch | |
parent | 9cd30491dd6d9d4c5e9372d7a5c75f78f3a11260 (diff) | |
download | glibc-ef9c4cb6c7abb6340b52e19de31d2a56c8de5844.zip glibc-ef9c4cb6c7abb6340b52e19de31d2a56c8de5844.tar.gz glibc-ef9c4cb6c7abb6340b52e19de31d2a56c8de5844.tar.bz2 |
x86-64: Optimize wmemset with SSE2/AVX2/AVX512
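The only difference between memset and wmemset is the element size: byte
vs. int. Add wmemset stubs to the SSE2/AVX2/AVX512 memset implementations
that broadcast the 32-bit value and scale the count to bytes before
jumping into the shared memset body: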
SSE2 wmemset:
shl $0x2,%rdx
movd %esi,%xmm0
mov %rdi,%rax
pshufd $0x0,%xmm0,%xmm0
jmp entry_from_wmemset
SSE2 memset:
movd %esi,%xmm0
mov %rdi,%rax
punpcklbw %xmm0,%xmm0
punpcklwd %xmm0,%xmm0
pshufd $0x0,%xmm0,%xmm0
entry_from_wmemset:
Since the ERMS versions of wmemset require "rep stosl" instead of
"rep stosb", only the vector store stubs of SSE2/AVX2/AVX512 wmemset
are added. The SSE2 wmemset is about 3X faster and the AVX2 wmemset
is about 6X faster on Haswell.
* include/wchar.h (__wmemset_chk): New.
* sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_CHK_SYMBOL): Likewise.
(WMEMSET_SYMBOL): Likewise.
(__wmemset): Add hidden definition.
(wmemset): Add weak hidden definition.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
wmemset_chk-nonshared.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
__wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
__wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
and __wmemset_chk_avx512_unaligned.
* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_SYMBOL): Likewise.
* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
(WMEMSET_SYMBOL): Likewise.
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
(WMEMSET_CHK_SYMBOL): New.
(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
(WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
* sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
(libc_hidden_builtin_def): Also define __GI_wmemset and
__GI___wmemset.
(weak_alias): New.
* sysdeps/x86_64/multiarch/wmemset.c: New file.
* sysdeps/x86_64/multiarch/wmemset.h: Likewise.
* sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S: Likewise.
* sysdeps/x86_64/multiarch/wmemset_chk.c: Likewise.
* sysdeps/x86_64/wmemset.c: Likewise.
* sysdeps/x86_64/wmemset_chk.c: Likewise.
Diffstat (limited to 'sysdeps/x86_64/multiarch')
-rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 4 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S | 8 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S | 9 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 24 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memset.S | 13 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wmemset.c | 33 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wmemset.h | 42 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S | 21 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wmemset_chk.c | 31 |
10 files changed, 199 insertions, 8 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 3736f54..65a545b 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -32,3 +32,7 @@ endif ifeq ($(subdir),wcsmbs) sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c endif + +ifeq ($(subdir),debug) +sysdep_routines += wmemset_chk-nonshared +endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 06d9a9d..a91d2f9 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -300,6 +300,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __wmemcmp_ssse3) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) + /* Support sysdeps/x86_64/multiarch/wmemset.S. */ + IFUNC_IMPL (i, name, wmemset, + IFUNC_IMPL_ADD (array, i, wmemset, 1, + __wmemset_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemset_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __wmemset_avx512_unaligned)) + #ifdef SHARED /* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */ IFUNC_IMPL (i, name, __memcpy_chk, @@ -417,6 +428,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3), __strncmp_ssse3) IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemset_chk.S. 
*/ + IFUNC_IMPL (i, name, __wmemset_chk, + IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1, + __wmemset_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __wmemset_chk_avx512_unaligned)) #endif return i; diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index 79975e0..7ab3d89 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -4,13 +4,19 @@ # define VMOVU vmovdqu # define VMOVA vmovdqa -# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ vmovd d, %xmm0; \ movq r, %rax; \ vpbroadcastb %xmm0, %ymm0 +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastd %xmm0, %ymm0 + # define SECTION(p) p##.avx # define MEMSET_SYMBOL(p,s) p##_avx2_##s +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s # include "memset-vec-unaligned-erms.S" #endif diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S index a5ec349..0783979 100644 --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -4,14 +4,21 @@ # define VMOVU vmovdqu64 # define VMOVA vmovdqa64 -# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ vmovd d, %xmm0; \ movq r, %rax; \ vpbroadcastb %xmm0, %xmm0; \ vpbroadcastq %xmm0, %zmm0 +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastd %xmm0, %xmm0; \ + vpbroadcastq %xmm0, %zmm0 + # define SECTION(p) p##.avx512 # define MEMSET_SYMBOL(p,s) p##_avx512_##s +# define WMEMSET_SYMBOL(p,s) p##_avx512_##s # include "memset-vec-unaligned-erms.S" #endif diff 
--git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 704eed9..2eb9e37 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -30,6 +30,10 @@ # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) #endif +#ifndef WMEMSET_CHK_SYMBOL +# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) +#endif + #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper @@ -79,6 +83,21 @@ END (__bzero) weak_alias (__bzero, bzero) #endif +#if IS_IN (libc) +# if defined SHARED +ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +# endif + +ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shlq $2, %rdx + WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + jmp L(entry_from_bzero) +END (WMEMSET_SYMBOL (__wmemset, unaligned)) +#endif + #if defined SHARED && IS_IN (libc) ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) cmpq %rdx, %rcx @@ -87,8 +106,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) #endif ENTRY (MEMSET_SYMBOL (__memset, unaligned)) -L(memset_entry): - VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) L(entry_from_bzero): cmpq $VEC_SIZE, %rdx jb L(less_vec) @@ -132,7 +150,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) # endif ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) - VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) cmpq $VEC_SIZE, %rdx jb L(less_vec) cmpq $(VEC_SIZE * 2), %rdx diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S index 9d33118..11f2737 100644 --- a/sysdeps/x86_64/multiarch/memset.S +++ b/sysdeps/x86_64/multiarch/memset.S @@ -58,16 +58,23 @@ END(memset) #if IS_IN (libc) # define MEMSET_SYMBOL(p,s) p##_sse2_##s +# define WMEMSET_SYMBOL(p,s) 
p##_sse2_##s # ifdef SHARED -# undef libc_hidden_builtin_def +# undef libc_hidden_builtin_def /* It doesn't make sense to send libc-internal memset calls through a PLT. The speedup we get from using SSE2 instructions is likely eaten away by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memset; __GI_memset = __memset_sse2_unaligned +# define libc_hidden_builtin_def(name) \ + .globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \ + .globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \ + .globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned # endif +# undef weak_alias +# define weak_alias(original, alias) \ + .weak bzero; bzero = __bzero + # undef strong_alias # define strong_alias(original, alias) #endif diff --git a/sysdeps/x86_64/multiarch/wmemset.c b/sysdeps/x86_64/multiarch/wmemset.c new file mode 100644 index 0000000..61626a9 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemset.c @@ -0,0 +1,33 @@ +/* Multiple versions of wmemset. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. 
*/ +#if IS_IN (libc) +# define wmemset __redirect_wmemset +# define __wmemset __redirect___wmemset +# include <wchar.h> +# undef wmemset +# undef __wmemset + +# define SYMBOL_NAME wmemset +# include "wmemset.h" + +libc_ifunc_redirected (__redirect_wmemset, __wmemset, IFUNC_SELECTOR ()); +weak_alias (__wmemset, wmemset) +#endif diff --git a/sysdeps/x86_64/multiarch/wmemset.h b/sysdeps/x86_64/multiarch/wmemset.h new file mode 100644 index 0000000..d761985 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemset.h @@ -0,0 +1,42 @@ +/* Common definition for wmemset/wmemset_chk ifunc selections. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + return OPTIMIZE (avx512_unaligned); + else + return OPTIMIZE (avx2_unaligned); + } + + return OPTIMIZE (sse2_unaligned); +} diff --git a/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S b/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S new file mode 100644 index 0000000..0a537fe --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S @@ -0,0 +1,21 @@ +/* Non-shared version of wmemset_chk for x86-64. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) && !defined SHARED +# include "../wmemset_chk.S" +#endif diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.c b/sysdeps/x86_64/multiarch/wmemset_chk.c new file mode 100644 index 0000000..2c039a5 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemset_chk.c @@ -0,0 +1,31 @@ +/* Multiple versions of wmemset_chk. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc.so. */ +#if IS_IN (libc) && defined SHARED +# define __wmemset_chk __redirect_wmemset_chk +# include <wchar.h> +# undef __wmemset_chk + +# define SYMBOL_NAME wmemset_chk +# include "wmemset.h" + +libc_ifunc_redirected (__redirect_wmemset_chk, __wmemset_chk, + IFUNC_SELECTOR ()); +#endif |