From 5f3d0b78e011d2a72f9e88b0e9ef5bc081d18f97 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 30 Jan 2015 06:50:20 -0800 Subject: Use AVX unaligned memcpy only if AVX2 is available memcpy with unaligned 256-bit AVX register loads/stores is slow on older processors like Sandy Bridge. This patch adds bit_AVX_Fast_Unaligned_Load and sets it only when AVX2 is available. [BZ #17801] * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Set the bit_AVX_Fast_Unaligned_Load bit for AVX2. * sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load): New. (index_AVX_Fast_Unaligned_Load): Likewise. (HAS_AVX_FAST_UNALIGNED_LOAD): Likewise. * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit. * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise. * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise. * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise. * sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD. * sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise. --- sysdeps/x86_64/multiarch/init-arch.c | 9 +++++++-- sysdeps/x86_64/multiarch/init-arch.h | 4 ++++ sysdeps/x86_64/multiarch/memcpy.S | 2 +- sysdeps/x86_64/multiarch/memcpy_chk.S | 2 +- sysdeps/x86_64/multiarch/memmove.c | 2 +- sysdeps/x86_64/multiarch/memmove_chk.c | 2 +- sysdeps/x86_64/multiarch/mempcpy.S | 2 +- sysdeps/x86_64/multiarch/mempcpy_chk.S | 2 +- 8 files changed, 17 insertions(+), 8 deletions(-) (limited to 'sysdeps/x86_64') diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index 9299360..7dec218 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -171,9 +171,14 @@ __init_cpu_features (void) /* Determine if AVX is usable. */ if (CPUID_AVX) __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable; - /* Determine if AVX2 is usable. 
*/ +#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load +# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load +#endif + /* Determine if AVX2 is usable. Unaligned load with 256-bit + AVX registers are faster on processors with AVX2. */ if (CPUID_AVX2) - __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable; + __cpu_features.feature[index_AVX2_Usable] + |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load; /* Determine if FMA is usable. */ if (CPUID_FMA) __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable; diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index 55f1c5b..e6b5ba5 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -25,6 +25,7 @@ #define bit_FMA4_Usable (1 << 8) #define bit_Slow_SSE4_2 (1 << 9) #define bit_AVX2_Usable (1 << 10) +#define bit_AVX_Fast_Unaligned_Load (1 << 11) /* CPUID Feature flags. */ @@ -74,6 +75,7 @@ # define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE # define index_Slow_SSE4_2 FEATURE_INDEX_1*FEATURE_SIZE # define index_AVX2_Usable FEATURE_INDEX_1*FEATURE_SIZE +# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE #else /* __ASSEMBLER__ */ @@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void) # define index_FMA4_Usable FEATURE_INDEX_1 # define index_Slow_SSE4_2 FEATURE_INDEX_1 # define index_AVX2_Usable FEATURE_INDEX_1 +# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1 # define HAS_ARCH_FEATURE(name) \ ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0) @@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void) # define HAS_AVX2 HAS_ARCH_FEATURE (AVX2_Usable) # define HAS_FMA HAS_ARCH_FEATURE (FMA_Usable) # define HAS_FMA4 HAS_ARCH_FEATURE (FMA4_Usable) +# define HAS_AVX_FAST_UNALIGNED_LOAD HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) #endif /* __ASSEMBLER__ */ diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S index 
992e40d..4e18cd3 100644 --- a/sysdeps/x86_64/multiarch/memcpy.S +++ b/sysdeps/x86_64/multiarch/memcpy.S @@ -33,7 +33,7 @@ ENTRY(__new_memcpy) jne 1f call __init_cpu_features 1: leaq __memcpy_avx_unaligned(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jz 1f ret 1: leaq __memcpy_sse2(%rip), %rax diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S index 5e9cf00..1e756ea 100644 --- a/sysdeps/x86_64/multiarch/memcpy_chk.S +++ b/sysdeps/x86_64/multiarch/memcpy_chk.S @@ -39,7 +39,7 @@ ENTRY(__memcpy_chk) testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) jz 2f leaq __memcpy_chk_ssse3_back(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jz 2f leaq __memcpy_chk_avx_unaligned(%rip), %rax 2: ret diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c index d93bfd0..dd153a3 100644 --- a/sysdeps/x86_64/multiarch/memmove.c +++ b/sysdeps/x86_64/multiarch/memmove.c @@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden; ifunc symbol properly. */ extern __typeof (__redirect_memmove) __libc_memmove; libc_ifunc (__libc_memmove, - HAS_AVX + HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_avx_unaligned : (HAS_SSSE3 ? (HAS_FAST_COPY_BACKWARD diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c index 743ca2a..8b12d00 100644 --- a/sysdeps/x86_64/multiarch/memmove_chk.c +++ b/sysdeps/x86_64/multiarch/memmove_chk.c @@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden; #include "debug/memmove_chk.c" libc_ifunc (__memmove_chk, - HAS_AVX ? 
__memmove_chk_avx_unaligned : + HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned : (HAS_SSSE3 ? (HAS_FAST_COPY_BACKWARD ? __memmove_chk_ssse3_back : __memmove_chk_ssse3) diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S index cdf1dab..2eaacdf 100644 --- a/sysdeps/x86_64/multiarch/mempcpy.S +++ b/sysdeps/x86_64/multiarch/mempcpy.S @@ -37,7 +37,7 @@ ENTRY(__mempcpy) testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) jz 2f leaq __mempcpy_ssse3_back(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jz 2f leaq __mempcpy_avx_unaligned(%rip), %rax 2: ret diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S index b7f9e89..17b8470 100644 --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S @@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk) testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip) jz 2f leaq __mempcpy_chk_ssse3_back(%rip), %rax - testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip) + testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip) jz 2f leaq __mempcpy_chk_avx_unaligned(%rip), %rax 2: ret -- cgit v1.1