Diffstat (limited to 'sysdeps/x86/cacheinfo.c')
-rw-r--r--  sysdeps/x86/cacheinfo.c | 76
1 file changed, 49 insertions(+), 27 deletions(-)
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 02c886c..ac98741 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -494,6 +494,7 @@ init_cacheinfo (void)
int max_cpuid_ex;
long int data = -1;
long int shared = -1;
+ long int shared_per_thread = -1;
unsigned int level;
unsigned int threads = 0;
const struct cpu_features *cpu_features = __get_cpu_features ();
@@ -509,7 +510,7 @@ init_cacheinfo (void)
/* Try L3 first. */
level = 3;
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
-
+ shared_per_thread = shared;
/* Number of logical processors sharing L2 cache. */
int threads_l2;
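[Editor's note: for reference, the value handle_intel feeds into `shared` here is the same cache size user code can query through sysconf. A minimal sketch, assuming a glibc system that exposes the non-standard _SC_LEVEL*_CACHE_SIZE extensions (a return of 0 or -1 means the size is unknown):

  /* Query the per-level cache sizes that init_cacheinfo works from.  */
  #include <stdio.h>
  #include <unistd.h>

  int
  main (void)
  {
    long l2 = sysconf (_SC_LEVEL2_CACHE_SIZE);
    long l3 = sysconf (_SC_LEVEL3_CACHE_SIZE);
    printf ("L2: %ld bytes, L3: %ld bytes\n", l2, l3);
    return 0;
  }
]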
@@ -521,6 +522,7 @@ init_cacheinfo (void)
/* Try L2 otherwise. */
level = 2;
shared = core;
+ shared_per_thread = core;
threads_l2 = 0;
threads_l3 = -1;
}
@@ -677,26 +679,25 @@ init_cacheinfo (void)
}
else
{
-intel_bug_no_cache_info:
+ intel_bug_no_cache_info:
/* Assume that all logical threads share the highest cache
level. */
- threads
- = ((cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx
- >> 16) & 0xff);
- }
+ threads = ((cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx >> 16)
+ & 0xff);
- /* Cap usage of highest cache level to the number of supported
- threads. */
- if (shared > 0 && threads > 0)
- shared /= threads;
+ /* Cap usage of highest cache level to the number of supported
+ threads. */
+ if (shared_per_thread > 0 && threads > 0)
+ shared_per_thread /= threads;
+ }
}
/* Account for non-inclusive L2 and L3 caches. */
if (!inclusive_cache)
{
- if (threads_l2 > 0)
- core /= threads_l2;
+ if (threads_l2 > 0)
+ shared_per_thread += core / threads_l2;
shared += core;
}
}
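[Editor's note: the intel_bug_no_cache_info fallback above reads the legacy logical-processor count from CPUID leaf 1, EBX bits 23:16, and divides the per-thread share by it. A standalone sketch of the same query, assuming x86 with GCC or Clang (cpuid.h):

  /* Read the legacy logical-processor-count field used by the
     fallback: CPUID leaf 1, EBX[23:16].  */
  #include <cpuid.h>
  #include <stdio.h>

  int
  main (void)
  {
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
      return 1;
    unsigned int threads = (ebx >> 16) & 0xff;
    printf ("logical processors (legacy CPUID field): %u\n", threads);
    return 0;
  }
]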
@@ -705,13 +706,17 @@ intel_bug_no_cache_info:
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
+ shared_per_thread = shared;
/* Get maximum extended function. */
__cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
if (shared <= 0)
- /* No shared L3 cache. All we have is the L2 cache. */
- shared = core;
+ {
+ /* No shared L3 cache. All we have is the L2 cache. */
+ shared = core;
+ shared_per_thread = core;
+ }
else
{
/* Figure out the number of logical threads that share L3. */
@@ -735,10 +740,11 @@ intel_bug_no_cache_info:
/* Cap usage of highest cache level to the number of
supported threads. */
if (threads > 0)
- shared /= threads;
+ shared_per_thread /= threads;
/* Account for exclusive L2 and L3 caches. */
shared += core;
+ shared_per_thread += core;
}
#ifndef DISABLE_PREFETCHW
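[Editor's note: to see what the AMD exclusive-cache accounting produces, a worked example with hypothetical numbers (not taken from any real part): a 16 MiB victim L3 shared by 8 threads plus a 512 KiB per-core L2:

  /* Hypothetical AMD numbers, mirroring the arithmetic above.  */
  #include <stdio.h>

  int
  main (void)
  {
    long core = 512L * 1024;              /* per-core L2 */
    long shared = 16L * 1024 * 1024;      /* total L3 */
    unsigned int threads = 8;             /* threads sharing L3 */
    long shared_per_thread = shared / threads;
    /* Exclusive L2/L3: the L2 capacity adds to the effective size.  */
    shared += core;
    shared_per_thread += core;
    printf ("shared=%ld shared_per_thread=%ld\n", shared, shared_per_thread);
    return 0;
  }

With these inputs, shared_per_thread comes out to 2.5 MiB and shared to 16.5 MiB.]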
@@ -766,26 +772,42 @@ intel_bug_no_cache_info:
}
if (cpu_features->shared_cache_size != 0)
- shared = cpu_features->shared_cache_size;
+ shared_per_thread = cpu_features->shared_cache_size;
- if (shared > 0)
+ if (shared_per_thread > 0)
{
- __x86_raw_shared_cache_size_half = shared / 2;
- __x86_raw_shared_cache_size = shared;
+ __x86_raw_shared_cache_size_half = shared_per_thread / 2;
+ __x86_raw_shared_cache_size = shared_per_thread;
/* Round shared cache size to multiple of 256 bytes. */
- shared = shared & ~255L;
- __x86_shared_cache_size_half = shared / 2;
- __x86_shared_cache_size = shared;
+ shared_per_thread = shared_per_thread & ~255L;
+ __x86_shared_cache_size_half = shared_per_thread / 2;
+ __x86_shared_cache_size = shared_per_thread;
}
- /* The large memcpy micro benchmark in glibc shows that 6 times of
- shared cache size is the approximate value above which non-temporal
- store becomes faster on a 8-core processor. This is the 3/4 of the
- total shared cache size. */
+ /* The default setting for the non_temporal threshold is [1/8, 1/2] of the
+ size of the chip's cache (depending on `cachesize_non_temporal_divisor`,
+ which is microarch specific; the default is 1/4).  For most Intel
+ processors with an initial release date between 2017 and 2023, a thread's
+ typical share of the cache is 18-64MB.  Using a reasonable fraction of
+ the L3 size is meant to estimate the point where non-temporal stores
+ begin out-competing REP MOVSB, as well as the point where the majority of
+ the lines in the copy would already have been written back to main memory
+ by non-temporal stores.  Note, concerns about the entire L3 cache being
+ evicted by the copy are mostly alleviated by the fact that modern HW
+ detects streaming patterns and provides proper LRU hints, so the maximum
+ thrashing is capped at 1/associativity.  */
+ unsigned long int non_temporal_threshold = shared / 4;
+ /* If no ERMS, use the per-thread L3 chunking.  Normal cacheable stores
+ run a higher risk of actually thrashing the cache, as they don't have a
+ HW LRU hint.  Their performance in highly parallel situations is also
+ noticeably worse.  */
+ if (!CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ non_temporal_threshold = shared_per_thread * 3 / 4;
+
__x86_shared_non_temporal_threshold
= (cpu_features->non_temporal_threshold != 0
? cpu_features->non_temporal_threshold
- : __x86_shared_cache_size * threads * 3 / 4);
+ : non_temporal_threshold);
}
#endif
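[Editor's note: putting the final selection together, a minimal sketch of the patched threshold logic with hypothetical values (32 MiB total L3 shared by 16 threads), showing how the ERMS feature flips the heuristic between 1/4 of the total L3 and 3/4 of the per-thread share:

  /* Sketch of the patched threshold choice; the cache numbers are
     hypothetical, not measured from real hardware.  */
  #include <stdio.h>

  int
  main (void)
  {
    long shared = 32L * 1024 * 1024;          /* total L3 */
    long shared_per_thread = shared / 16;     /* 16 threads share it */

    for (int erms = 0; erms <= 1; erms++)
      {
        /* With ERMS, REP MOVSB stays competitive up to ~1/4 of the
           whole chip's L3; without it, cacheable stores risk
           thrashing, so fall back to 3/4 of the per-thread share.  */
        long threshold = erms ? shared / 4 : shared_per_thread * 3 / 4;
        printf ("ERMS=%d -> non_temporal_threshold=%ld bytes\n",
                erms, threshold);
      }
    return 0;
  }
]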