diff options
author | H.J. Lu <hjl.tools@gmail.com> | 2015-01-30 06:50:20 -0800 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2015-01-30 15:37:58 -0800 |
commit | 5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97 (patch) | |
tree | 8eabf127206283d2421bc40b6bc44e123e346598 | |
parent | b658fdd82b4524cf6a39881d092caa23f63d93ac (diff) | |
download | glibc-5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97.zip glibc-5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97.tar.gz glibc-5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97.tar.bz2 |
Use AVX unaligned memcpy only if AVX2 is available
memcpy with unaligned 256-bit AVX register loads/stores are slow on older
processorsl like Sandy Bridge. This patch adds bit_AVX_Fast_Unaligned_Load
and sets it only when AVX2 is available.
[BZ #17801]
* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
New.
(index_AVX_Fast_Unaligned_Load): Likewise.
(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
-rw-r--r-- | ChangeLog | 18 | ||||
-rw-r--r-- | NEWS | 4 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/init-arch.c | 9 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/init-arch.h | 4 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memcpy.S | 2 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memcpy_chk.S | 2 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memmove.c | 2 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memmove_chk.c | 2 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/mempcpy.S | 2 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/mempcpy_chk.S | 2 |
10 files changed, 37 insertions, 10 deletions
@@ -1,3 +1,21 @@ +2015-01-30 H.J. Lu <hongjiu.lu@intel.com> + + [BZ #17801] + * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): + Set the bit_AVX_Fast_Unaligned_Load bit for AVX2. + * sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load): + New. + (index_AVX_Fast_Unaligned_Load): Likewise. + (HAS_AVX_FAST_UNALIGNED_LOAD): Likewise. + * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the + bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit. + * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise. + * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise. + * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise. + * sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace + HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD. + * sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise. + 2015-01-29 Andreas Schwab <schwab@suse.de> * sysdeps/nptl/allocrtsig.c: Include <signal.h>. @@ -17,8 +17,8 @@ Version 2.21 17601, 17608, 17616, 17625, 17630, 17633, 17634, 17635, 17647, 17653, 17657, 17658, 17664, 17665, 17668, 17682, 17702, 17717, 17719, 17722, 17723, 17724, 17725, 17732, 17733, 17744, 17745, 17746, 17747, 17748, - 17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17803, - 17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892. + 17775, 17777, 17780, 17781, 17782, 17791, 17793, 17796, 17797, 17801, + 17803, 17806, 17834, 17844, 17848, 17868, 17869, 17870, 17885, 17892. * A new semaphore algorithm has been implemented in generic C code for all machines. Previous custom assembly implementations of semaphore were diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index 9299360..7dec218 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -171,9 +171,14 @@ __init_cpu_features (void) /* Determine if AVX is usable. */ if (CPUID_AVX) __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable; - /* Determine if AVX2 is usable. */ +#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load +# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load +#endif + /* Determine if AVX2 is usable. Unaligned load with 256-bit + AVX registers are faster on processors with AVX2. */ if (CPUID_AVX2) - __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable; + __cpu_features.feature[index_AVX2_Usable] + |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load; /* Determine if FMA is usable. */ if (CPUID_FMA) __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable; diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index 55f1c5b..e6b5ba5 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -25,6 +25,7 @@ #define bit_FMA4_Usable (1 << 8) #define bit_Slow_SSE4_2 (1 << 9) #define bit_AVX2_Usable (1 << 10) +#define bit_AVX_Fast_Unaligned_Load (1 << 11) /* CPUID Feature flags. */ @@ -74,6 +75,7 @@ # define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE # define index_Slow_SSE4_2 FEATURE_INDEX_1*FEATURE_SIZE # define index_AVX2_Usable FEATURE_INDEX_1*FEATURE_SIZE +# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE #else /* __ASSEMBLER__ */ @@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void) # define index_FMA4_Usable FEATURE_INDEX_1 # define index_Slow_SSE4_2 FEATURE_INDEX_1 # define index_AVX2_Usable FEATURE_INDEX_1 +# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1 # define HAS_ARCH_FEATURE(name) \ ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0) @@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void) # define HAS_AVX2 HAS_ARCH_FEATURE (AVX2_Usable) # define HAS_FMA HAS_ARCH_FEATURE (FMA_Usable) # define HAS_FMA4 HAS_ARCH_FEATURE (FMA4_Usable) +# define HAS_AVX_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) #endif /* __ASSEMBLER__ */ diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S index 992e40d..4e18cd3 100644 --- a/sysdeps/x86_64/multiarch/memcpy.S +++ b/sysdeps/x86_64/multiarch/memcpy.S @@ -33,7 +33,7 @@ ENTRY(__new_memcpy) jne 1f call __init_cpu_features 1: leaq __memcpy_avx_unaligned(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jz 1f ret 1: leaq __memcpy_sse2(%rip), %rax diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S index 5e9cf00..1e756ea 100644 --- a/sysdeps/x86_64/multiarch/memcpy_chk.S +++ b/sysdeps/x86_64/multiarch/memcpy_chk.S @@ -39,7 +39,7 @@ ENTRY(__memcpy_chk) testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) jz 2f leaq __memcpy_chk_ssse3_back(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jz 2f leaq __memcpy_chk_avx_unaligned(%rip), %rax 2: ret diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c index d93bfd0..dd153a3 100644 --- a/sysdeps/x86_64/multiarch/memmove.c +++ b/sysdeps/x86_64/multiarch/memmove.c @@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden; ifunc symbol properly. */ extern __typeof (__redirect_memmove) __libc_memmove; libc_ifunc (__libc_memmove, - HAS_AVX + HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_avx_unaligned : (HAS_SSSE3 ? (HAS_FAST_COPY_BACKWARD diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c index 743ca2a..8b12d00 100644 --- a/sysdeps/x86_64/multiarch/memmove_chk.c +++ b/sysdeps/x86_64/multiarch/memmove_chk.c @@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden; #include "debug/memmove_chk.c" libc_ifunc (__memmove_chk, - HAS_AVX ? __memmove_chk_avx_unaligned : + HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned : (HAS_SSSE3 ? (HAS_FAST_COPY_BACKWARD ? __memmove_chk_ssse3_back : __memmove_chk_ssse3) diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S index cdf1dab..2eaacdf 100644 --- a/sysdeps/x86_64/multiarch/mempcpy.S +++ b/sysdeps/x86_64/multiarch/mempcpy.S @@ -37,7 +37,7 @@ ENTRY(__mempcpy) testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) jz 2f leaq __mempcpy_ssse3_back(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jz 2f leaq __mempcpy_avx_unaligned(%rip), %rax 2: ret diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S index b7f9e89..17b8470 100644 --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S @@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk) testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) jz 2f leaq __mempcpy_chk_ssse3_back(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jz 2f leaq __mempcpy_chk_avx_unaligned(%rip), %rax 2: ret |