diff options
-rw-r--r-- | sysdeps/x86/dl-cacheinfo.h | 8 | ||||
-rw-r--r-- | sysdeps/x86/dl-tunables.list | 26 |
2 files changed, 20 insertions, 14 deletions
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index e6c94df..2e43e67 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */ unsigned int minimum_rep_movsb_threshold; #endif - /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */ + /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for + VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB + threshold is 2048 * (VEC_SIZE / 16). */ unsigned int rep_movsb_threshold; if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512)) { - rep_movsb_threshold = 2048 * (64 / 16); + rep_movsb_threshold = 4096 * (64 / 16); #if HAVE_TUNABLES minimum_rep_movsb_threshold = 64 * 8; #endif @@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) else if (CPU_FEATURE_PREFERRED_P (cpu_features, AVX_Fast_Unaligned_Load)) { - rep_movsb_threshold = 2048 * (32 / 16); + rep_movsb_threshold = 4096 * (32 / 16); #if HAVE_TUNABLES minimum_rep_movsb_threshold = 32 * 8; #endif diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list index dd6e1d6..4193138 100644 --- a/sysdeps/x86/dl-tunables.list +++ b/sysdeps/x86/dl-tunables.list @@ -32,17 +32,21 @@ glibc { } x86_rep_movsb_threshold { type: SIZE_T - # Since there is overhead to set up REP MOVSB operation, REP MOVSB - # isn't faster on short data. The memcpy micro benchmark in glibc - # shows that 2KB is the approximate value above which REP MOVSB - # becomes faster than SSE2 optimization on processors with Enhanced - # REP MOVSB. Since larger register size can move more data with a - # single load and store, the threshold is higher with larger register - # size. 
Note: Since the REP MOVSB threshold must be greater than 8 - # times of vector size and the default value is 2048 * (vector size - # / 16), the default value and the minimum value must be updated at - # run-time. NB: Don't set the default value since we can't tell if - # the tunable value is set by user or not [BZ #27069]. + # Since there is overhead to set up REP MOVSB operation, REP + # MOVSB isn't faster on short data. The memcpy micro benchmark + # in glibc shows that 2KB is the approximate value above which + # REP MOVSB becomes faster than SSE2 optimization on processors + # with Enhanced REP MOVSB. Since larger register size can move + # more data with a single load and store, the threshold is + # higher with larger register size. Micro benchmarks show AVX + # REP MOVSB becomes faster approximately at 8KB. The AVX512 + # threshold is extrapolated to 16KB. For machines with FSRM the + # threshold is universally set at 2112 bytes. Note: Since the + # REP MOVSB threshold must be greater than 8 times of vector + # size and the default value is 4096 * (vector size / 16), the + # default value and the minimum value must be updated at + # run-time. NB: Don't set the default value since we can't tell + # if the tunable value is set by user or not [BZ #27069]. minval: 1 } x86_rep_stosb_threshold { |