Diffstat (limited to 'sysdeps/x86/cacheinfo.c')
-rw-r--r--  sysdeps/x86/cacheinfo.c | 76
1 file changed, 49 insertions(+), 27 deletions(-)
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 02c886c..ac98741 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -494,6 +494,7 @@ init_cacheinfo (void)
int max_cpuid_ex;
long int data = -1;
long int shared = -1;
+ long int shared_per_thread = -1;
unsigned int level;
unsigned int threads = 0;
const struct cpu_features *cpu_features = __get_cpu_features ();
@@ -509,7 +510,7 @@ init_cacheinfo (void)
/* Try L3 first. */
level = 3;
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
-
+ shared_per_thread = shared;
/* Number of logical processors sharing L2 cache. */
int threads_l2;
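[Editor's note: for reference, the value handle_intel feeds into `shared` here is the same cache size user code can query through sysconf. A minimal sketch, assuming a glibc system that exposes the non-standard _SC_LEVEL*_CACHE_SIZE extensions (a return of 0 or -1 means the size is unknown):

  /* Query the per-level cache sizes that init_cacheinfo works from.  */
  #include <stdio.h>
  #include <unistd.h>

  int
  main (void)
  {
    long l2 = sysconf (_SC_LEVEL2_CACHE_SIZE);
    long l3 = sysconf (_SC_LEVEL3_CACHE_SIZE);
    printf ("L2: %ld bytes, L3: %ld bytes\n", l2, l3);
    return 0;
  }
]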
@@ -521,6 +522,7 @@ init_cacheinfo (void)
/* Try L2 otherwise. */
level = 2;
shared = core;
+ shared_per_thread = core;
threads_l2 = 0;
threads_l3 = -1;
}
@@ -677,26 +679,25 @@ init_cacheinfo (void)
}
else
{
-intel_bug_no_cache_info:
+ intel_bug_no_cache_info:
/* Assume that all logical threads share the highest cache
level. */
- threads
- = ((cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx
- >> 16) & 0xff);
- }
+ threads = ((cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx >> 16)
+ & 0xff);
- /* Cap usage of highest cache level to the number of supported
- threads. */
- if (shared > 0 && threads > 0)
- shared /= threads;
+ /* Cap usage of highest cache level to the number of supported
+ threads. */
+ if (shared_per_thread > 0 && threads > 0)
+ shared_per_thread /= threads;
+ }
}
/* Account for non-inclusive L2 and L3 caches. */
if (!inclusive_cache)
{
- if (threads_l2 > 0)
- core /= threads_l2;
+ if (threads_l2 > 0)
+ shared_per_thread += core / threads_l2;
shared += core;
}
}
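[Editor's note: the intel_bug_no_cache_info fallback above reads the legacy logical-processor count from CPUID leaf 1, EBX bits 23:16, and divides the per-thread share by it. A standalone sketch of the same query, assuming x86 with GCC or Clang (cpuid.h):

  /* Read the legacy logical-processor-count field used by the
     fallback: CPUID leaf 1, EBX[23:16].  */
  #include <cpuid.h>
  #include <stdio.h>

  int
  main (void)
  {
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
      return 1;
    unsigned int threads = (ebx >> 16) & 0xff;
    printf ("logical processors (legacy CPUID field): %u\n", threads);
    return 0;
  }
]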
@@ -705,13 +706,17 @@ intel_bug_no_cache_info:
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
+ shared_per_thread = shared;
/* Get maximum extended function. */
__cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
if (shared <= 0)
- /* No shared L3 cache. All we have is the L2 cache. */
- shared = core;
+ {
+ /* No shared L3 cache. All we have is the L2 cache. */
+ shared = core;
+ shared_per_thread = core;
+ }
else
{
/* Figure out the number of logical threads that share L3. */
@@ -735,10 +740,11 @@ intel_bug_no_cache_info:
/* Cap usage of highest cache level to the number of
supported threads. */
if (threads > 0)
- shared /= threads;
+ shared_per_thread /= threads;
/* Account for exclusive L2 and L3 caches. */
shared += core;
+ shared_per_thread += core;
}
#ifndef DISABLE_PREFETCHW
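[Editor's note: to see what the AMD exclusive-cache accounting produces, a worked example with hypothetical numbers (not taken from any real part): a 16 MiB victim L3 shared by 8 threads plus a 512 KiB per-core L2:

  /* Hypothetical AMD numbers, mirroring the arithmetic above.  */
  #include <stdio.h>

  int
  main (void)
  {
    long core = 512L * 1024;              /* per-core L2 */
    long shared = 16L * 1024 * 1024;      /* total L3 */
    unsigned int threads = 8;             /* threads sharing L3 */
    long shared_per_thread = shared / threads;
    /* Exclusive L2/L3: the L2 capacity adds to the effective size.  */
    shared += core;
    shared_per_thread += core;
    printf ("shared=%ld shared_per_thread=%ld\n", shared, shared_per_thread);
    return 0;
  }

With these inputs, shared_per_thread comes out to 2.5 MiB and shared to 16.5 MiB.]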
@@ -766,26 +772,42 @@ intel_bug_no_cache_info:
}
if (cpu_features->shared_cache_size != 0)
- shared = cpu_features->shared_cache_size;
+ shared_per_thread = cpu_features->shared_cache_size;
- if (shared > 0)
+ if (shared_per_thread > 0)
{
- __x86_raw_shared_cache_size_half = shared / 2;
- __x86_raw_shared_cache_size = shared;
+ __x86_raw_shared_cache_size_half = shared_per_thread / 2;
+ __x86_raw_shared_cache_size = shared_per_thread;
/* Round shared cache size to multiple of 256 bytes. */
- shared = shared & ~255L;
- __x86_shared_cache_size_half = shared / 2;
- __x86_shared_cache_size = shared;
+ shared_per_thread = shared_per_thread & ~255L;
+ __x86_shared_cache_size_half = shared_per_thread / 2;
+ __x86_shared_cache_size = shared_per_thread;
}
- /* The large memcpy micro benchmark in glibc shows that 6 times of
- shared cache size is the approximate value above which non-temporal
- store becomes faster on a 8-core processor. This is the 3/4 of the
- total shared cache size. */
+ /* The default setting for the non_temporal threshold is [1/8, 1/2] of the
+ size of the chip's cache (depending on `cachesize_non_temporal_divisor`,
+ which is microarch specific; the default is 1/4).  For most Intel
+ processors with an initial release date between 2017 and 2023, a thread's
+ typical share of the cache is 18-64MB.  Using a reasonable fraction of
+ the L3 size is meant to estimate the point where non-temporal stores
+ begin out-competing REP MOVSB, as well as the point where the majority of
+ the lines in the copy would already have been written back to main memory
+ by non-temporal stores.  Note, concerns about the entire L3 cache being
+ evicted by the copy are mostly alleviated by the fact that modern HW
+ detects streaming patterns and provides proper LRU hints, so the maximum
+ thrashing is capped at 1/associativity.  */
+ unsigned long int non_temporal_threshold = shared / 4;
+ /* If no ERMS, use the per-thread L3 chunking.  Normal cacheable stores
+ run a higher risk of actually thrashing the cache, as they don't have a
+ HW LRU hint.  Their performance in highly parallel situations is also
+ noticeably worse.  */
+ if (!CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ non_temporal_threshold = shared_per_thread * 3 / 4;
+
__x86_shared_non_temporal_threshold
= (cpu_features->non_temporal_threshold != 0
? cpu_features->non_temporal_threshold
- : __x86_shared_cache_size * threads * 3 / 4);
+ : non_temporal_threshold);
}
#endif
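[Editor's note: putting the final selection together, a minimal sketch of the patched threshold logic with hypothetical values (32 MiB total L3 shared by 16 threads), showing how the ERMS feature flips the heuristic between 1/4 of the total L3 and 3/4 of the per-thread share:

  /* Sketch of the patched threshold choice; the cache numbers are
     hypothetical, not measured from real hardware.  */
  #include <stdio.h>

  int
  main (void)
  {
    long shared = 32L * 1024 * 1024;          /* total L3 */
    long shared_per_thread = shared / 16;     /* 16 threads share it */

    for (int erms = 0; erms <= 1; erms++)
      {
        /* With ERMS, REP MOVSB stays competitive up to ~1/4 of the
           whole chip's L3; without it, cacheable stores risk
           thrashing, so fall back to 3/4 of the per-thread share.  */
        long threshold = erms ? shared / 4 : shared_per_thread * 3 / 4;
        printf ("ERMS=%d -> non_temporal_threshold=%ld bytes\n",
                erms, threshold);
      }
    return 0;
  }
]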