diff options
-rw-r--r-- | sysdeps/x86/cacheinfo.c | 76 |
1 files changed, 49 insertions, 27 deletions
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c index b9444dd..4354c01 100644 --- a/sysdeps/x86/cacheinfo.c +++ b/sysdeps/x86/cacheinfo.c @@ -494,6 +494,7 @@ init_cacheinfo (void) int max_cpuid_ex; long int data = -1; long int shared = -1; + long int shared_per_thread = -1; unsigned int level; unsigned int threads = 0; const struct cpu_features *cpu_features = __get_cpu_features (); @@ -509,7 +510,7 @@ init_cacheinfo (void) /* Try L3 first. */ level = 3; shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); - + shared_per_thread = shared; /* Number of logical processors sharing L2 cache. */ int threads_l2; @@ -521,6 +522,7 @@ init_cacheinfo (void) /* Try L2 otherwise. */ level = 2; shared = core; + shared_per_thread = core; threads_l2 = 0; threads_l3 = -1; } @@ -677,26 +679,25 @@ init_cacheinfo (void) } else { -intel_bug_no_cache_info: + intel_bug_no_cache_info: /* Assume that all logical threads share the highest cache level. */ - threads - = ((cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx - >> 16) & 0xff); - } + threads = ((cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx >> 16) + & 0xff); - /* Cap usage of highest cache level to the number of supported - threads. */ - if (shared > 0 && threads > 0) - shared /= threads; + /* Cap usage of highest cache level to the number of supported + threads. */ + if (shared_per_thread > 0 && threads > 0) + shared_per_thread /= threads; + } } /* Account for non-inclusive L2 and L3 caches. */ if (!inclusive_cache) { - if (threads_l2 > 0) - core /= threads_l2; + if (threads_l2 > 0) + shared_per_thread += core / threads_l2; shared += core; } } @@ -705,13 +706,17 @@ intel_bug_no_cache_info: data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); + shared_per_thread = shared; /* Get maximum extended function. */ __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx); if (shared <= 0) - /* No shared L3 cache. 
All we have is the L2 cache. */ - shared = core; + { + /* No shared L3 cache. All we have is the L2 cache. */ + shared = core; + shared_per_thread = core; + } else { /* Figure out the number of logical threads that share L3. */ @@ -735,10 +740,11 @@ intel_bug_no_cache_info: /* Cap usage of highest cache level to the number of supported threads. */ if (threads > 0) - shared /= threads; + shared_per_thread /= threads; /* Account for exclusive L2 and L3 caches. */ shared += core; + shared_per_thread += core; } #ifndef DISABLE_PREFETCHW @@ -766,26 +772,42 @@ intel_bug_no_cache_info: } if (cpu_features->shared_cache_size != 0) - shared = cpu_features->shared_cache_size; + shared_per_thread = cpu_features->shared_cache_size; - if (shared > 0) + if (shared_per_thread > 0) { - __x86_raw_shared_cache_size_half = shared / 2; - __x86_raw_shared_cache_size = shared; + __x86_raw_shared_cache_size_half = shared_per_thread / 2; + __x86_raw_shared_cache_size = shared_per_thread; /* Round shared cache size to multiple of 256 bytes. */ - shared = shared & ~255L; - __x86_shared_cache_size_half = shared / 2; - __x86_shared_cache_size = shared; + shared_per_thread = shared_per_thread & ~255L; + __x86_shared_cache_size_half = shared_per_thread / 2; + __x86_shared_cache_size = shared_per_thread; } - /* The large memcpy micro benchmark in glibc shows that 6 times of - shared cache size is the approximate value above which non-temporal - store becomes faster on a 8-core processor. This is the 3/4 of the - total shared cache size. */ + /* The default setting for the non_temporal threshold is [1/8, 1/2] of size + of the chip's cache (depending on `cachesize_non_temporal_divisor` which + is microarch specific. The default is 1/4). For most Intel processors + with an initial release date between 2017 and 2023, a thread's + typical share of the cache is from 18-64MB. 
Using a reasonable size + fraction of L3 is meant to estimate the point where non-temporal stores + begin out-competing REP MOVSB. As well the point where the fact that + non-temporal stores are forced back to main memory would have already occurred + to the majority of the lines in the copy. Note, concerns about the entire + L3 cache being evicted by the copy are mostly alleviated by the fact that + modern HW detects streaming patterns and provides proper LRU hints so that + the maximum thrashing is capped at 1/associativity. */ + unsigned long int non_temporal_threshold = shared / 4; + /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run + a higher risk of actually thrashing the cache as they don't have a HW LRU + hint. As well, their performance in highly parallel situations is + noticeably worse. */ + if (!CPU_FEATURES_CPU_P (cpu_features, ERMS)) + non_temporal_threshold = shared_per_thread * 3 / 4; + __x86_shared_non_temporal_threshold = (cpu_features->non_temporal_threshold != 0 ? cpu_features->non_temporal_threshold - : __x86_shared_cache_size * threads * 3 / 4); + : non_temporal_threshold); } #endif |