Diffstat (limited to 'sysdeps/x86')
-rw-r--r--  sysdeps/x86/Makefile                          |  52
-rw-r--r--  sysdeps/x86/cpu-features.c                    | 391
-rw-r--r--  sysdeps/x86/cpu-tunables.c                    |   4
-rw-r--r--  sysdeps/x86/dl-diagnostics-cpu.c              |   2
-rw-r--r--  sysdeps/x86/include/cpu-features.h            |   9
-rw-r--r--  sysdeps/x86/sysdep.h                          |  29
-rw-r--r--  sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c       |   1
-rw-r--r--  sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c      |   1
-rw-r--r--  sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c |   1
-rw-r--r--  sysdeps/x86/tst-gnu2-tls2.c                   |  36
-rw-r--r--  sysdeps/x86/tst-gnu2-tls2.h                   |  37
-rw-r--r--  sysdeps/x86/tst-tls23.c                       |  22
-rw-r--r--  sysdeps/x86/tst-tls23.h                       |  35
13 files changed, 399 insertions(+), 221 deletions(-)
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 5311b59..4fbd48e 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -4,7 +4,13 @@ endif
ifeq ($(subdir),elf)
sysdep_routines += get-cpuid-feature-leaf
-sysdep-dl-routines += dl-get-cpu-features
+sysdep-dl-routines += \
+ dl-get-cpu-features \
+ dl-tlsdesc \
+ tls_get_addr \
+ tlsdesc \
+# sysdep-dl-routines
+
sysdep_headers += \
bits/platform/features.h \
bits/platform/x86.h \
@@ -21,6 +27,9 @@ tests += \
tst-cpu-features-supports-static \
tst-get-cpu-features \
tst-get-cpu-features-static \
+ tst-gnu2-tls2-x86-noxsave \
+ tst-gnu2-tls2-x86-noxsavec \
+ tst-gnu2-tls2-x86-noxsavexsavec \
tst-hwcap-tunables \
# tests
tests-static += \
@@ -87,10 +96,49 @@ tst-ifunc-isa-2-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-SSE4_2,-AVX,-AVX2,-AVX512
tst-ifunc-isa-2-static-ENV = $(tst-ifunc-isa-2-ENV)
tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
-CFLAGS-tst-gnu2-tls2.c += -msse
+CFLAGS-tst-gnu2-tls2.c += -msse2
CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
+
+LDFLAGS-tst-gnu2-tls2 += -rdynamic
+LDFLAGS-tst-gnu2-tls2mod0.so += -Wl,-z,undefs
+LDFLAGS-tst-gnu2-tls2mod1.so += -Wl,-z,undefs
+LDFLAGS-tst-gnu2-tls2mod2.so += -Wl,-z,undefs
+
+CFLAGS-tst-gnu2-tls2-x86-noxsave.c += -msse2
+CFLAGS-tst-gnu2-tls2-x86-noxsavec.c += -msse2
+CFLAGS-tst-gnu2-tls2-x86-noxsavexsavec.c += -msse2
+LDFLAGS-tst-gnu2-tls2-x86-noxsave += -Wl,-z,lazy -rdynamic
+LDFLAGS-tst-gnu2-tls2-x86-noxsavec += -Wl,-z,lazy -rdynamic
+LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy -rdynamic
+
+# Test for bug 32810: incorrect XSAVE state size if XSAVEC is disabled
+# via tunable.
+tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE
+tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
+tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC
+$(objpfx)tst-gnu2-tls2-x86-noxsave: $(shared-thread-library)
+$(objpfx)tst-gnu2-tls2-x86-noxsavec: $(shared-thread-library)
+$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec: $(shared-thread-library)
+$(objpfx)tst-gnu2-tls2-x86-noxsave.out \
+$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \
+$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \
+ $(objpfx)tst-gnu2-tls2mod0.so \
+ $(objpfx)tst-gnu2-tls2mod1.so \
+ $(objpfx)tst-gnu2-tls2mod2.so
+
+CFLAGS-tst-tls23.c += -msse2
+CFLAGS-tst-tls23-mod.c += -msse2 -mtune=haswell
+
+LDFLAGS-tst-tls23 += -rdynamic
+tst-tls23-mod.so-no-z-defs = yes
+
+$(objpfx)tst-tls23-mod.so: $(libsupport)
+endif
+
+ifeq ($(subdir),gmon)
+CFLAGS-mcount.c += -mgeneral-regs-only
endif
ifeq ($(subdir),math)
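The three new noxsave tests rely on GLIBC_TUNABLES to mask XSAVE/XSAVEC at
startup and then exercise the TLSDESC path (bug 32810).  A minimal sketch of
how a program can observe the effect of those tunables, using the same
<sys/platform/x86.h> interface the test sources include; run it under the
GLIBC_TUNABLES settings from the *-ENV variables above:

#include <stdio.h>
#include <sys/platform/x86.h>

int
main (void)
{
  /* CPU_FEATURE_ACTIVE reports glibc's view of a feature after tunables
     are applied, not the raw CPUID bit, so running under
     GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC flips the second line.  */
  printf ("XSAVE  active: %d\n", CPU_FEATURE_ACTIVE (XSAVE) != 0);
  printf ("XSAVEC active: %d\n", CPU_FEATURE_ACTIVE (XSAVEC) != 0);
  return 0;
}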
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 27abaca..b7d1506 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -24,6 +24,7 @@
#include <dl-cacheinfo.h>
#include <dl-minsigstacksize.h>
#include <dl-hwcap2.h>
+#include <gcc-macros.h>
extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
attribute_hidden;
@@ -83,6 +84,8 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *)
# include <dl-cet.h>
#endif
+unsigned long int _dl_x86_features_tlsdesc_state_size;
+
static void
update_active (struct cpu_features *cpu_features)
{
@@ -317,17 +320,13 @@ update_active (struct cpu_features *cpu_features)
= xsave_state_full_size;
cpu_features->xsave_state_full_size
= xsave_state_full_size;
+ _dl_x86_features_tlsdesc_state_size = xsave_state_full_size;
/* Check if XSAVEC is available. */
if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC))
{
- unsigned int xstate_comp_offsets[32];
- unsigned int xstate_comp_sizes[32];
-#ifdef __x86_64__
- unsigned int xstate_amx_comp_offsets[32];
- unsigned int xstate_amx_comp_sizes[32];
- unsigned int amx_ecx;
-#endif
+ unsigned int xstate_comp_offsets[X86_XSTATE_MAX_ID + 1];
+ unsigned int xstate_comp_sizes[X86_XSTATE_MAX_ID + 1];
unsigned int i;
xstate_comp_offsets[0] = 0;
@@ -335,39 +334,16 @@ update_active (struct cpu_features *cpu_features)
xstate_comp_offsets[2] = 576;
xstate_comp_sizes[0] = 160;
xstate_comp_sizes[1] = 256;
-#ifdef __x86_64__
- xstate_amx_comp_offsets[0] = 0;
- xstate_amx_comp_offsets[1] = 160;
- xstate_amx_comp_offsets[2] = 576;
- xstate_amx_comp_sizes[0] = 160;
- xstate_amx_comp_sizes[1] = 256;
-#endif
- for (i = 2; i < 32; i++)
+ for (i = 2; i <= X86_XSTATE_MAX_ID; i++)
{
if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0)
{
__cpuid_count (0xd, i, eax, ebx, ecx, edx);
-#ifdef __x86_64__
- /* Include this in xsave_state_full_size. */
- amx_ecx = ecx;
- xstate_amx_comp_sizes[i] = eax;
- if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0)
- {
- /* Exclude this from xsave_state_size. */
- ecx = 0;
- xstate_comp_sizes[i] = 0;
- }
- else
-#endif
- xstate_comp_sizes[i] = eax;
+ xstate_comp_sizes[i] = eax;
}
else
{
-#ifdef __x86_64__
- amx_ecx = 0;
- xstate_amx_comp_sizes[i] = 0;
-#endif
ecx = 0;
xstate_comp_sizes[i] = 0;
}
@@ -376,44 +352,32 @@ update_active (struct cpu_features *cpu_features)
{
xstate_comp_offsets[i]
= (xstate_comp_offsets[i - 1]
- + xstate_comp_sizes[i -1]);
+ + xstate_comp_sizes[i - 1]);
if ((ecx & (1 << 1)) != 0)
xstate_comp_offsets[i]
= ALIGN_UP (xstate_comp_offsets[i], 64);
-#ifdef __x86_64__
- xstate_amx_comp_offsets[i]
- = (xstate_amx_comp_offsets[i - 1]
- + xstate_amx_comp_sizes[i - 1]);
- if ((amx_ecx & (1 << 1)) != 0)
- xstate_amx_comp_offsets[i]
- = ALIGN_UP (xstate_amx_comp_offsets[i],
- 64);
-#endif
}
}
/* Use XSAVEC. */
unsigned int size
- = xstate_comp_offsets[31] + xstate_comp_sizes[31];
+ = (xstate_comp_offsets[X86_XSTATE_MAX_ID]
+ + xstate_comp_sizes[X86_XSTATE_MAX_ID]);
if (size)
{
+ size = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
+ 64);
#ifdef __x86_64__
- unsigned int amx_size
- = (xstate_amx_comp_offsets[31]
- + xstate_amx_comp_sizes[31]);
- amx_size
- = ALIGN_UP ((amx_size
- + TLSDESC_CALL_REGISTER_SAVE_AREA),
- 64);
- /* Set xsave_state_full_size to the compact AMX
- state size for XSAVEC. NB: xsave_state_full_size
- is only used in _dl_tlsdesc_dynamic_xsave and
- _dl_tlsdesc_dynamic_xsavec. */
- cpu_features->xsave_state_full_size = amx_size;
+ _dl_x86_features_tlsdesc_state_size = size;
+ /* Exclude the AMX space from the start of the TILECFG
+ space to the end of the TILEDATA space.  If the CPU
+ doesn't support AMX, the TILECFG offset equals the
+ offset just past TILEDATA, so nothing is subtracted.
+ Otherwise, both offsets are multiples of 64. */
+ size -= (xstate_comp_offsets[X86_XSTATE_TILEDATA_ID + 1]
+ - xstate_comp_offsets[X86_XSTATE_TILECFG_ID]);
#endif
- cpu_features->xsave_state_size
- = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
- 64);
+ cpu_features->xsave_state_size = size;
CPU_FEATURE_SET (cpu_features, XSAVEC);
}
}
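For reference, the compacted-format sizing the rewritten block performs can be
approximated in user code by walking CPUID leaf 0xd, as the glibc code does
with __cpuid_count.  A simplified sketch: unlike the code above, it does not
filter by FULL_STATE_SAVE_MASK or track per-component offsets, and it skips
supervisor states (ECX bit 0) instead:

#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid (0, &eax, &ebx, &ecx, &edx) || eax < 0xd)
    return 1;

  /* 512-byte legacy area plus 64-byte XSAVE header, i.e. the fixed
     xstate_comp_offsets[2] == 576 above.  */
  unsigned int size = 576;

  /* Walk sub-leaves 2..19 (19 == X86_XSTATE_APX_F_ID, the new
     X86_XSTATE_MAX_ID).  In the compacted format each component is
     appended in ID order, 64-byte aligned when ECX bit 1 is set.  */
  for (unsigned int i = 2; i <= 19; i++)
    {
      __cpuid_count (0xd, i, eax, ebx, ecx, edx);
      if (eax == 0 || (ecx & 1) != 0)
	continue;
      if ((ecx & 2) != 0)
	size = (size + 63) & ~63u;
      size += eax;
    }
  printf ("approximate compact XSAVE size: %u bytes\n", size);
  return 0;
}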
@@ -538,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load
"Incorrect index_arch_Fast_Unaligned_Load");
-/* Intel Family-6 microarch list. */
-enum
+/* Intel microarch list. */
+enum intel_microarch
{
/* Atom processors. */
INTEL_ATOM_BONNELL,
@@ -548,6 +512,7 @@ enum
INTEL_ATOM_GOLDMONT,
INTEL_ATOM_GOLDMONT_PLUS,
INTEL_ATOM_SIERRAFOREST,
+ INTEL_ATOM_CLEARWATERFOREST,
INTEL_ATOM_GRANDRIDGE,
INTEL_ATOM_TREMONT,
@@ -575,7 +540,9 @@ enum
INTEL_BIGCORE_METEORLAKE,
INTEL_BIGCORE_LUNARLAKE,
INTEL_BIGCORE_ARROWLAKE,
+ INTEL_BIGCORE_PANTHERLAKE,
INTEL_BIGCORE_GRANITERAPIDS,
+ INTEL_BIGCORE_DIAMONDRAPIDS,
/* Mixed (bigcore + atom SOC). */
INTEL_MIXED_LAKEFIELD,
@@ -589,7 +556,7 @@ enum
INTEL_UNKNOWN,
};
-static unsigned int
+static enum intel_microarch
intel_get_fam6_microarch (unsigned int model,
__attribute__ ((unused)) unsigned int stepping)
{
@@ -620,6 +587,8 @@ intel_get_fam6_microarch (unsigned int model,
return INTEL_ATOM_GOLDMONT_PLUS;
case 0xAF:
return INTEL_ATOM_SIERRAFOREST;
+ case 0xDD:
+ return INTEL_ATOM_CLEARWATERFOREST;
case 0xB6:
return INTEL_ATOM_GRANDRIDGE;
case 0x86:
@@ -727,8 +696,12 @@ intel_get_fam6_microarch (unsigned int model,
return INTEL_BIGCORE_METEORLAKE;
case 0xbd:
return INTEL_BIGCORE_LUNARLAKE;
+ case 0xb5:
+ case 0xc5:
case 0xc6:
return INTEL_BIGCORE_ARROWLAKE;
+ case 0xCC:
+ return INTEL_BIGCORE_PANTHERLAKE;
case 0xAD:
case 0xAE:
return INTEL_BIGCORE_GRANITERAPIDS;
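The model numbers in this switch are the combined model, i.e. the base model
plus the extended model from CPUID leaf 1.  A short sketch of the usual
decode, showing how family 19 (Diamond Rapids, handled further down) comes out
of the extended family field:

#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;
  __cpuid (1, eax, ebx, ecx, edx);

  unsigned int family = (eax >> 8) & 0xf;
  unsigned int model = (eax >> 4) & 0xf;
  unsigned int stepping = eax & 0xf;
  unsigned int extended_model = (eax >> 12) & 0xf0;

  if (family == 0xf)
    {
      /* Family 19 is base family 0xf plus extended family 0x4.  */
      family += (eax >> 20) & 0xff;
      model += extended_model;
    }
  else if (family == 0x6)
    model += extended_model;

  printf ("family %u model 0x%x stepping 0x%x\n", family, model, stepping);
  return 0;
}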
@@ -792,133 +765,20 @@ init_cpu_features (struct cpu_features *cpu_features)
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
&= ~bit_arch_Avoid_Non_Temporal_Memset;
+ enum intel_microarch microarch = INTEL_UNKNOWN;
if (family == 0x06)
{
model += extended_model;
- unsigned int microarch
- = intel_get_fam6_microarch (model, stepping);
+ microarch = intel_get_fam6_microarch (model, stepping);
+ /* Disable TSX on some processors to avoid TSX on kernels that
+ weren't updated with the latest microcode package (which
+ disables the broken feature by default). */
switch (microarch)
{
- /* Atom / KNL tuning. */
- case INTEL_ATOM_BONNELL:
- /* BSF is slow on Bonnell. */
- cpu_features->preferred[index_arch_Slow_BSF]
- |= bit_arch_Slow_BSF;
- break;
-
- /* Unaligned load versions are faster than SSSE3
- on Airmont, Silvermont, Goldmont, and Goldmont Plus. */
- case INTEL_ATOM_AIRMONT:
- case INTEL_ATOM_SILVERMONT:
- case INTEL_ATOM_GOLDMONT:
- case INTEL_ATOM_GOLDMONT_PLUS:
-
- /* Knights Landing. Enable Silvermont optimizations. */
- case INTEL_KNIGHTS_LANDING:
-
- cpu_features->preferred[index_arch_Fast_Unaligned_Load]
- |= (bit_arch_Fast_Unaligned_Load
- | bit_arch_Fast_Unaligned_Copy
- | bit_arch_Prefer_PMINUB_for_stringop
- | bit_arch_Slow_SSE4_2);
- break;
-
- case INTEL_ATOM_TREMONT:
- /* Enable rep string instructions, unaligned load, unaligned
- copy, pminub and avoid SSE 4.2 on Tremont. */
- cpu_features->preferred[index_arch_Fast_Rep_String]
- |= (bit_arch_Fast_Rep_String
- | bit_arch_Fast_Unaligned_Load
- | bit_arch_Fast_Unaligned_Copy
- | bit_arch_Prefer_PMINUB_for_stringop
- | bit_arch_Slow_SSE4_2);
- break;
-
- /*
- Default tuned Knights microarch.
- case INTEL_KNIGHTS_MILL:
- */
-
- /*
- Default tuned atom microarch.
- case INTEL_ATOM_SIERRAFOREST:
- case INTEL_ATOM_GRANDRIDGE:
- */
-
- /* Bigcore/Default Tuning. */
default:
- default_tuning:
- /* Unknown family 0x06 processors. Assuming this is one
- of Core i3/i5/i7 processors if AVX is available. */
- if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
- break;
-
- enable_modern_features:
- /* Rep string instructions, unaligned load, unaligned copy,
- and pminub are fast on Intel Core i3, i5 and i7. */
- cpu_features->preferred[index_arch_Fast_Rep_String]
- |= (bit_arch_Fast_Rep_String
- | bit_arch_Fast_Unaligned_Load
- | bit_arch_Fast_Unaligned_Copy
- | bit_arch_Prefer_PMINUB_for_stringop);
break;
- case INTEL_BIGCORE_NEHALEM:
- case INTEL_BIGCORE_WESTMERE:
- /* Older CPUs prefer non-temporal stores at lower threshold. */
- cpu_features->cachesize_non_temporal_divisor = 8;
- goto enable_modern_features;
-
- /* Older Bigcore microarch (smaller non-temporal store
- threshold). */
- case INTEL_BIGCORE_SANDYBRIDGE:
- case INTEL_BIGCORE_IVYBRIDGE:
- case INTEL_BIGCORE_HASWELL:
- case INTEL_BIGCORE_BROADWELL:
- cpu_features->cachesize_non_temporal_divisor = 8;
- goto default_tuning;
-
- /* Newer Bigcore microarch (larger non-temporal store
- threshold). */
- case INTEL_BIGCORE_SKYLAKE_AVX512:
- case INTEL_BIGCORE_CANNONLAKE:
- /* Benchmarks indicate non-temporal memset is not
- necessarily profitable on SKX (and in some cases much
- worse). This is likely unique to SKX due its it unique
- mesh interconnect (not present on ICX or BWD). Disable
- non-temporal on all Skylake servers. */
- cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
- |= bit_arch_Avoid_Non_Temporal_Memset;
- /* fallthrough */
- case INTEL_BIGCORE_COMETLAKE:
- case INTEL_BIGCORE_SKYLAKE:
- case INTEL_BIGCORE_KABYLAKE:
- case INTEL_BIGCORE_ICELAKE:
- case INTEL_BIGCORE_TIGERLAKE:
- case INTEL_BIGCORE_ROCKETLAKE:
- case INTEL_BIGCORE_RAPTORLAKE:
- case INTEL_BIGCORE_METEORLAKE:
- case INTEL_BIGCORE_LUNARLAKE:
- case INTEL_BIGCORE_ARROWLAKE:
- case INTEL_BIGCORE_SAPPHIRERAPIDS:
- case INTEL_BIGCORE_EMERALDRAPIDS:
- case INTEL_BIGCORE_GRANITERAPIDS:
- cpu_features->cachesize_non_temporal_divisor = 2;
- goto default_tuning;
-
- /* Default tuned Mixed (bigcore + atom SOC). */
- case INTEL_MIXED_LAKEFIELD:
- case INTEL_MIXED_ALDERLAKE:
- cpu_features->cachesize_non_temporal_divisor = 2;
- goto default_tuning;
- }
-
- /* Disable TSX on some processors to avoid TSX on kernels that
- weren't updated with the latest microcode package (which
- disables broken feature by default). */
- switch (microarch)
- {
case INTEL_BIGCORE_SKYLAKE_AVX512:
/* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */
if (stepping <= 5)
@@ -927,38 +787,163 @@ init_cpu_features (struct cpu_features *cpu_features)
case INTEL_BIGCORE_KABYLAKE:
/* NB: Although the errata documents that for model == 0x8e
- (kabylake skylake client), only 0xb stepping or lower are
- impacted, the intention of the errata was to disable TSX on
- all client processors on all steppings. Include 0xc
- stepping which is an Intel Core i7-8665U, a client mobile
- processor. */
+ (kabylake skylake client), only 0xb stepping or lower are
+ impacted, the intention of the errata was to disable TSX on
+ all client processors on all steppings. Include 0xc
+ stepping which is an Intel Core i7-8665U, a client mobile
+ processor. */
if (stepping > 0xc)
break;
/* Fall through. */
case INTEL_BIGCORE_SKYLAKE:
- /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
- processors listed in:
-
-https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
- */
- disable_tsx:
- CPU_FEATURE_UNSET (cpu_features, HLE);
- CPU_FEATURE_UNSET (cpu_features, RTM);
- CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
- break;
+ /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
+ processors listed in:
+
+ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
+ */
+disable_tsx:
+ CPU_FEATURE_UNSET (cpu_features, HLE);
+ CPU_FEATURE_UNSET (cpu_features, RTM);
+ CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
+ break;
case INTEL_BIGCORE_HASWELL:
- /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
- TSX. Haswell also include other model numbers that have
- working TSX. */
- if (model == 0x3f && stepping >= 4)
+ /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
+ TSX. Haswell also includes other model numbers that have
+ working TSX. */
+ if (model == 0x3f && stepping >= 4)
break;
- CPU_FEATURE_UNSET (cpu_features, RTM);
- break;
+ CPU_FEATURE_UNSET (cpu_features, RTM);
+ break;
}
}
+ else if (family == 19)
+ switch (model)
+ {
+ case 0x01:
+ microarch = INTEL_BIGCORE_DIAMONDRAPIDS;
+ break;
+ default:
+ break;
+ }
+
+ switch (microarch)
+ {
+ /* Atom / KNL tuning. */
+ case INTEL_ATOM_BONNELL:
+ /* BSF is slow on Bonnell. */
+ cpu_features->preferred[index_arch_Slow_BSF]
+ |= bit_arch_Slow_BSF;
+ break;
+
+ /* Unaligned load versions are faster than SSSE3
+ on Airmont, Silvermont, Goldmont, and Goldmont Plus. */
+ case INTEL_ATOM_AIRMONT:
+ case INTEL_ATOM_SILVERMONT:
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+
+ /* Knights Landing. Enable Silvermont optimizations. */
+ case INTEL_KNIGHTS_LANDING:
+
+ cpu_features->preferred[index_arch_Fast_Unaligned_Load]
+ |= (bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Unaligned_Copy
+ | bit_arch_Prefer_PMINUB_for_stringop
+ | bit_arch_Slow_SSE4_2);
+ break;
+
+ case INTEL_ATOM_TREMONT:
+ /* Enable rep string instructions, unaligned load, unaligned
+ copy, pminub and avoid SSE 4.2 on Tremont. */
+ cpu_features->preferred[index_arch_Fast_Rep_String]
+ |= (bit_arch_Fast_Rep_String
+ | bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Unaligned_Copy
+ | bit_arch_Prefer_PMINUB_for_stringop
+ | bit_arch_Slow_SSE4_2);
+ break;
+
+ /*
+ Default tuned Knights microarch.
+ case INTEL_KNIGHTS_MILL:
+ */
+
+ /*
+ Default tuned atom microarch.
+ case INTEL_ATOM_SIERRAFOREST:
+ case INTEL_ATOM_GRANDRIDGE:
+ case INTEL_ATOM_CLEARWATERFOREST:
+ */
+
+ /* Bigcore/Default Tuning. */
+ default:
+ default_tuning:
+ /* Unknown Intel processors.  Assume this is one of the Core
+ i3/i5/i7 processors if AVX is available. */
+ if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
+ break;
+
+ enable_modern_features:
+ /* Rep string instructions, unaligned load, unaligned copy,
+ and pminub are fast on Intel Core i3, i5 and i7. */
+ cpu_features->preferred[index_arch_Fast_Rep_String]
+ |= (bit_arch_Fast_Rep_String
+ | bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Unaligned_Copy
+ | bit_arch_Prefer_PMINUB_for_stringop);
+ break;
+
+ case INTEL_BIGCORE_NEHALEM:
+ case INTEL_BIGCORE_WESTMERE:
+ /* Older CPUs prefer non-temporal stores at lower threshold. */
+ cpu_features->cachesize_non_temporal_divisor = 8;
+ goto enable_modern_features;
+
+ /* Older Bigcore microarch (smaller non-temporal store
+ threshold). */
+ case INTEL_BIGCORE_SANDYBRIDGE:
+ case INTEL_BIGCORE_IVYBRIDGE:
+ case INTEL_BIGCORE_HASWELL:
+ case INTEL_BIGCORE_BROADWELL:
+ cpu_features->cachesize_non_temporal_divisor = 8;
+ goto default_tuning;
+
+ /* Newer Bigcore microarch (larger non-temporal store
+ threshold). */
+ case INTEL_BIGCORE_SKYLAKE_AVX512:
+ case INTEL_BIGCORE_CANNONLAKE:
+ /* Benchmarks indicate non-temporal memset is not
+ necessarily profitable on SKX (and in some cases much
+ worse). This is likely unique to SKX due to its unique
+ mesh interconnect (not present on ICX or BWD). Disable
+ non-temporal on all Skylake servers. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+ /* fallthrough */
+ case INTEL_BIGCORE_COMETLAKE:
+ case INTEL_BIGCORE_SKYLAKE:
+ case INTEL_BIGCORE_KABYLAKE:
+ case INTEL_BIGCORE_ICELAKE:
+ case INTEL_BIGCORE_TIGERLAKE:
+ case INTEL_BIGCORE_ROCKETLAKE:
+ case INTEL_BIGCORE_RAPTORLAKE:
+ case INTEL_BIGCORE_METEORLAKE:
+ case INTEL_BIGCORE_LUNARLAKE:
+ case INTEL_BIGCORE_ARROWLAKE:
+ case INTEL_BIGCORE_PANTHERLAKE:
+ case INTEL_BIGCORE_SAPPHIRERAPIDS:
+ case INTEL_BIGCORE_EMERALDRAPIDS:
+ case INTEL_BIGCORE_GRANITERAPIDS:
+ case INTEL_BIGCORE_DIAMONDRAPIDS:
+ /* Default tuned Mixed (bigcore + atom SOC). */
+ case INTEL_MIXED_LAKEFIELD:
+ case INTEL_MIXED_ALDERLAKE:
+ cpu_features->cachesize_non_temporal_divisor = 2;
+ goto default_tuning;
+ }
/* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER
if AVX512ER is available. Don't use AVX512 to avoid lower CPU
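Because glibc clears HLE/RTM here and reports RTM_ALWAYS_ABORT instead,
applications should ask glibc rather than raw CPUID whether TSX is usable.
A minimal sketch:

#include <stdio.h>
#include <sys/platform/x86.h>

int
main (void)
{
  /* CPU_FEATURE_ACTIVE reflects the post-errata view, so RTM reads as
     unusable on the masked Skylake/Kaby Lake parts even though raw
     CPUID still advertises it.  */
  if (CPU_FEATURE_ACTIVE (RTM))
    puts ("RTM is usable");
  else
    puts ("RTM is unusable or disabled");
  return 0;
}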
@@ -1159,6 +1144,9 @@ no_cpuid:
TUNABLE_CALLBACK (set_prefer_map_32bit_exec));
#endif
+ /* Do not add the logic to disable XSAVE/XSAVEC if this glibc build
+ requires AVX and therefore XSAVE or XSAVEC support. */
+#ifndef GCCMACRO__AVX__
bool disable_xsave_features = false;
if (!CPU_FEATURE_USABLE_P (cpu_features, OSXSAVE))
@@ -1212,6 +1200,7 @@ no_cpuid:
CPU_FEATURE_UNSET (cpu_features, FMA4);
}
+#endif
#ifdef __x86_64__
GLRO(dl_hwcap) = HWCAP_X86_64;
@@ -1267,7 +1256,7 @@ no_cpuid:
#endif
if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
- || (GLRO(dl_x86_cpu_features).xsave_state_size != 0))
+ || cpu_features->xsave_state_size != 0)
{
if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
{
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 3423176..74cd5b9 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -96,7 +96,7 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,-zzz,....
can be used to enable CPU/ARCH feature yyy, disable CPU/ARCH feature
- yyy and zzz, where the feature name is case-sensitive and has to
+ xxx and zzz, where the feature name is case-sensitive and has to
match the ones in cpu-features.h. It can be used by glibc developers
to tune for a new processor or override the IFUNC selection to
improve performance for a particular workload.
@@ -164,6 +164,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
/* Update xsave_state_size to XSAVE state size. */
cpu_features->xsave_state_size
= cpu_features->xsave_state_full_size;
+ _dl_x86_features_tlsdesc_state_size
+ = cpu_features->xsave_state_full_size;
CPU_FEATURE_UNSET (cpu_features, XSAVEC);
}
}
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
index 7d03736..870b126 100644
--- a/sysdeps/x86/dl-diagnostics-cpu.c
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
@@ -89,6 +89,8 @@ _dl_diagnostics_cpu (void)
cpu_features->xsave_state_size);
print_cpu_features_value ("xsave_state_full_size",
cpu_features->xsave_state_full_size);
+ print_cpu_features_value ("tlsdesc_state_full_size",
+ _dl_x86_features_tlsdesc_state_size);
print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size);
print_cpu_features_value ("shared_cache_size",
cpu_features->shared_cache_size);
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index 9c485d3..fbf1b89 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -935,8 +935,6 @@ struct cpu_features
/* The full state size for XSAVE when XSAVEC is disabled by
GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
-
- and the AMX state size when XSAVEC is available.
*/
unsigned int xsave_state_full_size;
/* Data cache size for use in memory and string routines, typically
@@ -990,6 +988,13 @@ extern const struct cpu_features *_dl_x86_get_cpu_features (void)
#define __get_cpu_features() _dl_x86_get_cpu_features()
+#if IS_IN (rtld) || IS_IN (libc)
+/* XSAVE/XSAVEC state size used by TLS descriptors. Compared to
+ xsave_state_size from struct cpu_features, this includes additional
+ registers. */
+extern unsigned long int _dl_x86_features_tlsdesc_state_size attribute_hidden;
+#endif
+
#if defined (_LIBC) && !IS_IN (nonlib)
/* Unused for x86. */
# define INIT_ARCH()
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 541393f..b8e963b 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -102,6 +102,9 @@
| (1 << X86_XSTATE_ZMM_ID) \
| (1 << X86_XSTATE_APX_F_ID))
+/* The maximum supported xstate ID. */
+# define X86_XSTATE_MAX_ID X86_XSTATE_APX_F_ID
+
/* AMX state mask. */
# define AMX_STATE_SAVE_MASK \
((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID))
@@ -123,6 +126,9 @@
| (1 << X86_XSTATE_K_ID) \
| (1 << X86_XSTATE_ZMM_H_ID))
+/* The maximum supported xstate ID. */
+# define X86_XSTATE_MAX_ID X86_XSTATE_ZMM_H_ID
+
/* States to be included in xsave_state_size. */
# define FULL_STATE_SAVE_MASK STATE_SAVE_MASK
#endif
@@ -177,6 +183,29 @@
#define atom_text_section .section ".text.atom", "ax"
+#ifndef DL_STACK_ALIGNMENT
+/* Due to GCC bug:
+
+ https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+
+ __tls_get_addr may be called with 8-byte/4-byte stack alignment.
+ Although this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't
+ assume that the stack will always be aligned at 16 bytes. */
+# ifdef __x86_64__
+# define DL_STACK_ALIGNMENT 8
+# define MINIMUM_ALIGNMENT 16
+# else
+# define DL_STACK_ALIGNMENT 4
+# endif
+#endif
+
+/* True if _dl_runtime_resolve/_dl_tlsdesc_dynamic should align stack for
+ STATE_SAVE or align stack to MINIMUM_ALIGNMENT bytes before calling
+ _dl_fixup/__tls_get_addr. */
+#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
+ (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
+ || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT)
+
#endif /* __ASSEMBLER__ */
#endif /* _X86_SYSDEP_H */
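DL_RUNTIME_RESOLVE_REALIGN_STACK being true means the trampolines must realign
the stack pointer before using XSAVE: only DL_STACK_ALIGNMENT bytes (8 on
x86-64, 4 on i386) can be assumed on entry, while the XSAVE area needs
STATE_SAVE_ALIGNMENT (64).  A C rendering of that pointer arithmetic
(align_down_64 is a hypothetical name; the assembly equivalent is an
"and $-64" on the stack pointer):

#include <stdint.h>

/* Stacks grow down, so masking the low bits moves the pointer to the
   next lower 64-byte boundary, where the XSAVE buffer can live.  */
static inline void *
align_down_64 (void *sp)
{
  return (void *) ((uintptr_t) sp & ~(uintptr_t) 63);
}

int
main (void)
{
  char buf[128];
  void *aligned = align_down_64 (buf + sizeof buf);
  return ((uintptr_t) aligned % 64) != 0;
}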
diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c
new file mode 100644
index 0000000..963c4f3
--- /dev/null
+++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c
@@ -0,0 +1 @@
+#include <tst-gnu2-tls2.c>
diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c
new file mode 100644
index 0000000..963c4f3
--- /dev/null
+++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c
@@ -0,0 +1 @@
+#include <tst-gnu2-tls2.c>
diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c
new file mode 100644
index 0000000..963c4f3
--- /dev/null
+++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c
@@ -0,0 +1 @@
+#include <tst-gnu2-tls2.c>
diff --git a/sysdeps/x86/tst-gnu2-tls2.c b/sysdeps/x86/tst-gnu2-tls2.c
index de900a4..b3195ff 100644
--- a/sysdeps/x86/tst-gnu2-tls2.c
+++ b/sysdeps/x86/tst-gnu2-tls2.c
@@ -1,20 +1,26 @@
-#ifndef __x86_64__
-#include <sys/platform/x86.h>
+#ifndef TEST_AMX
+# ifndef __x86_64__
+# include <sys/platform/x86.h>
-#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2)
-#endif
+# define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2)
+# endif
-/* Clear XMM0...XMM7 */
-#define PREPARE_MALLOC() \
-{ \
- asm volatile ("xorps %%xmm0, %%xmm0" : : : "xmm0" ); \
- asm volatile ("xorps %%xmm1, %%xmm1" : : : "xmm1" ); \
- asm volatile ("xorps %%xmm2, %%xmm2" : : : "xmm2" ); \
- asm volatile ("xorps %%xmm3, %%xmm3" : : : "xmm3" ); \
- asm volatile ("xorps %%xmm4, %%xmm4" : : : "xmm4" ); \
- asm volatile ("xorps %%xmm5, %%xmm5" : : : "xmm5" ); \
- asm volatile ("xorps %%xmm6, %%xmm6" : : : "xmm6" ); \
- asm volatile ("xorps %%xmm7, %%xmm7" : : : "xmm7" ); \
+/* Set XMM0...XMM7 to all 1s. */
+# define PREPARE_MALLOC() \
+{ \
+ asm volatile ("pcmpeqd %%xmm0, %%xmm0" : : : "xmm0" ); \
+ asm volatile ("pcmpeqd %%xmm1, %%xmm1" : : : "xmm1" ); \
+ asm volatile ("pcmpeqd %%xmm2, %%xmm2" : : : "xmm2" ); \
+ asm volatile ("pcmpeqd %%xmm3, %%xmm3" : : : "xmm3" ); \
+ asm volatile ("pcmpeqd %%xmm4, %%xmm4" : : : "xmm4" ); \
+ asm volatile ("pcmpeqd %%xmm5, %%xmm5" : : : "xmm5" ); \
+ asm volatile ("pcmpeqd %%xmm6, %%xmm6" : : : "xmm6" ); \
+ asm volatile ("pcmpeqd %%xmm7, %%xmm7" : : : "xmm7" ); \
}
+#endif
#include <elf/tst-gnu2-tls2.c>
+
+#ifndef TEST_AMX
+v2di v1, v2, v3;
+#endif
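The switch from xorps to pcmpeqd changes the clobber pattern from all-zeros to
all-ones, which the checks can then distinguish from freshly zeroed state.
The idiom works because comparing a register with itself makes every lane
compare equal, setting every bit.  A standalone sketch (compile with -msse2 on
i386):

#include <stdio.h>

typedef long long v2di __attribute__ ((vector_size (16)));

int
main (void)
{
  v2di x = { 0, 0 };
  /* pcmpeqd with identical operands writes all 1s -- a compact way to
     dirty an XMM register.  */
  asm ("pcmpeqd %0, %0" : "+x" (x));
  printf ("%llx %llx\n", (unsigned long long) x[0],
	  (unsigned long long) x[1]);
  return 0;
}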
diff --git a/sysdeps/x86/tst-gnu2-tls2.h b/sysdeps/x86/tst-gnu2-tls2.h
new file mode 100644
index 0000000..fdbb565
--- /dev/null
+++ b/sysdeps/x86/tst-gnu2-tls2.h
@@ -0,0 +1,37 @@
+/* Test TLSDESC relocation, x86 version.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef TEST_AMX
+# include <support/check.h>
+
+typedef long long v2di __attribute__((vector_size(16)));
+extern v2di v1, v2, v3;
+
+# define BEFORE_TLSDESC_CALL() \
+ v1 = __extension__(v2di){0, 0}; \
+ v2 = __extension__(v2di){0, 0};
+
+# define AFTER_TLSDESC_CALL() \
+ v3 = __extension__(v2di){0, 0}; \
+ asm volatile ("" : "+x" (v3)); \
+ union { v2di x; long long a[2]; } u; \
+ u.x = v3; \
+ TEST_VERIFY_EXIT (u.a[0] == 0 && u.a[1] == 0);
+#endif
+
+#include <elf/tst-gnu2-tls2.h>
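The empty asm in AFTER_TLSDESC_CALL is the load-bearing part of the check: the
"+x" constraint forces v3 through an XMM register at that point, so a TLSDESC
call that failed to preserve XMM state corrupts the observed value instead of
being optimized away.  An isolated sketch of the barrier (compile with -msse2
on i386):

#include <stdio.h>

typedef long long v2di __attribute__ ((vector_size (16)));
v2di v3;

int
main (void)
{
  v3 = (v2di){ 0, 0 };
  /* Force v3 into an XMM register; without this barrier the compiler
     may constant-fold the zeros and never touch XMM state at all.  */
  asm volatile ("" : "+x" (v3));
  printf ("%lld %lld\n", v3[0], v3[1]);
  return 0;
}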
diff --git a/sysdeps/x86/tst-tls23.c b/sysdeps/x86/tst-tls23.c
new file mode 100644
index 0000000..6130d91
--- /dev/null
+++ b/sysdeps/x86/tst-tls23.c
@@ -0,0 +1,22 @@
+#ifndef __x86_64__
+#include <sys/platform/x86.h>
+
+#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2)
+#endif
+
+/* Set XMM0...XMM7 to all 1s. */
+#define PREPARE_MALLOC() \
+{ \
+ asm volatile ("pcmpeqd %%xmm0, %%xmm0" : : : "xmm0" ); \
+ asm volatile ("pcmpeqd %%xmm1, %%xmm1" : : : "xmm1" ); \
+ asm volatile ("pcmpeqd %%xmm2, %%xmm2" : : : "xmm2" ); \
+ asm volatile ("pcmpeqd %%xmm3, %%xmm3" : : : "xmm3" ); \
+ asm volatile ("pcmpeqd %%xmm4, %%xmm4" : : : "xmm4" ); \
+ asm volatile ("pcmpeqd %%xmm5, %%xmm5" : : : "xmm5" ); \
+ asm volatile ("pcmpeqd %%xmm6, %%xmm6" : : : "xmm6" ); \
+ asm volatile ("pcmpeqd %%xmm7, %%xmm7" : : : "xmm7" ); \
+}
+
+#include <elf/tst-tls23.c>
+
+v2di v1, v2, v3;
diff --git a/sysdeps/x86/tst-tls23.h b/sysdeps/x86/tst-tls23.h
new file mode 100644
index 0000000..21cee4c
--- /dev/null
+++ b/sysdeps/x86/tst-tls23.h
@@ -0,0 +1,35 @@
+/* Test that __tls_get_addr preserves XMM registers.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <support/check.h>
+
+typedef long long v2di __attribute__((vector_size(16)));
+extern v2di v1, v2, v3;
+
+#define BEFORE_TLS_CALL() \
+ v1 = __extension__(v2di){0, 0}; \
+ v2 = __extension__(v2di){0, 0};
+
+#define AFTER_TLS_CALL() \
+ v3 = __extension__(v2di){0, 0}; \
+ asm volatile ("" : "+x" (v3)); \
+ union { v2di x; long long a[2]; } u; \
+ u.x = v3; \
+ TEST_VERIFY_EXIT (u.a[0] == 0 && u.a[1] == 0);
+
+#include <elf/tst-tls23.h>