61 files changed, 996 insertions, 1317 deletions
@@ -99,6 +99,8 @@ The following bugs are resolved with this release: [32231] elf: Change ldconfig auxcache magic number [32470] x86: Avoid integer truncation with large cache sizes [32582] Fix underallocation of abort_msg_s struct (CVE-2025-0395) + [32810] Crash on x86-64 if XSAVEC disable via tunable + [32987] elf: Fix subprocess status handling for tst-dlopen-sgid Version 2.39 diff --git a/benchtests/atanh-inputs b/benchtests/atanh-inputs index 455aa65..4985293 100644 --- a/benchtests/atanh-inputs +++ b/benchtests/atanh-inputs @@ -1,6 +1,7 @@ ## args: double ## ret: double ## includes: math.h +## name: workload-random 0x1.5a2730bacd94ap-1 -0x1.b57eb40fc048ep-21 -0x1.c0b185fb450e2p-17 diff --git a/elf/Makefile b/elf/Makefile index 8a5678a..51d52b5 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -266,6 +266,7 @@ tests-static-normal := \ tst-array1-static \ tst-array5-static \ tst-dl-iter-static \ + tst-dlopen-sgid \ tst-dst-static \ tst-env-setuid-static \ tst-getauxval-static \ @@ -376,6 +377,7 @@ tests += \ tst-align3 \ tst-audit-tlsdesc \ tst-audit-tlsdesc-dlopen \ + tst-audit-tlsdesc-dlopen2 \ tst-audit1 \ tst-audit2 \ tst-audit8 \ @@ -802,6 +804,7 @@ modules-names += \ tst-auditmanymod8 \ tst-auditmanymod9 \ tst-auditmod-tlsdesc \ + tst-auditmod-tlsdesc2 \ tst-auditmod1 \ tst-auditmod11 \ tst-auditmod12 \ @@ -842,6 +845,7 @@ modules-names += \ tst-dlmopen-twice-mod1 \ tst-dlmopen-twice-mod2 \ tst-dlmopen1mod \ + tst-dlopen-sgid-mod \ tst-dlopen-tlsreinitmod1 \ tst-dlopen-tlsreinitmod2 \ tst-dlopen-tlsreinitmod3 \ @@ -3012,6 +3016,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so $(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so +$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \ + $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules)) +tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so $(objpfx)tst-dlmopen-twice.out: \ $(objpfx)tst-dlmopen-twice-mod1.so \ @@ -3120,3 +3127,5 @@ $(objpfx)tst-dlopen-tlsreinit3.out: $(objpfx)tst-auditmod1.so tst-dlopen-tlsreinit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so $(objpfx)tst-dlopen-tlsreinit4.out: $(objpfx)tst-auditmod1.so tst-dlopen-tlsreinit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so + +$(objpfx)tst-dlopen-sgid.out: $(objpfx)tst-dlopen-sgid-mod.so diff --git a/elf/dl-tls.c b/elf/dl-tls.c index 3d529b7..b13e752 100644 --- a/elf/dl-tls.c +++ b/elf/dl-tls.c @@ -528,6 +528,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid) if (newp == NULL) oom (); memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t)); +#ifdef SHARED + /* Auditors can trigger a DTV resize event while the full malloc + is not yet in use. Mark the new DTV allocation as the + initial allocation. */ + if (!__rtld_malloc_is_complete ()) + GL(dl_initial_dtv) = &newp[1]; +#endif } else { diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c new file mode 100644 index 0000000..7ba2c41 --- /dev/null +++ b/elf/tst-audit-tlsdesc-dlopen2.c @@ -0,0 +1,46 @@ +/* Loading TLS-using modules from auditors (bug 32412). Main program. + Copyright (C) 2021-2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <support/xdlfcn.h> +#include <stdio.h> + +static int +do_test (void) +{ + puts ("info: start of main program"); + + /* Load TLS-using modules, to trigger DTV resizing. The dynamic + linker will load them again (requiring their own TLS) because the + dlopen calls from the auditor were in the auditing namespace. */ + for (int i = 1; i <= 19; ++i) + { + char dso[30]; + snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i); + char sym[30]; + snprintf (sym, sizeof(sym), "tlsmod17a%d", i); + + void *handle = xdlopen (dso, RTLD_LAZY); + int (*func) (void) = xdlsym (handle, sym); + /* Trigger TLS allocation. */ + func (); + } + + return 0; +} + +#include <support/test-driver.c> diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c new file mode 100644 index 0000000..50275cd --- /dev/null +++ b/elf/tst-auditmod-tlsdesc2.c @@ -0,0 +1,59 @@ +/* Loading TLS-using modules from auditors (bug 32412). Audit module. + Copyright (C) 2021-2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <dlfcn.h> +#include <link.h> +#include <stdbool.h> +#include <stdio.h> +#include <unistd.h> + +unsigned int +la_version (unsigned int version) +{ + /* Open some modules, to trigger DTV resizing before the switch to + the main malloc. */ + for (int i = 1; i <= 19; ++i) + { + char dso[30]; + snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i); + char sym[30]; + snprintf (sym, sizeof(sym), "tlsmod17a%d", i); + + void *handle = dlopen (dso, RTLD_LAZY); + if (handle == NULL) + { + printf ("error: dlmopen from auditor: %s\n", dlerror ()); + fflush (stdout); + _exit (1); + } + int (*func) (void) = dlsym (handle, sym); + if (func == NULL) + { + printf ("error: dlsym from auditor: %s\n", dlerror ()); + fflush (stdout); + _exit (1); + } + /* Trigger TLS allocation. */ + func (); + } + + puts ("info: TLS-using modules loaded from auditor"); + fflush (stdout); + + return LAV_CURRENT; +} diff --git a/elf/tst-dlopen-sgid-mod.c b/elf/tst-dlopen-sgid-mod.c new file mode 100644 index 0000000..5eb79ee --- /dev/null +++ b/elf/tst-dlopen-sgid-mod.c @@ -0,0 +1 @@ +/* Opening this object should not succeed. 
*/ diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c new file mode 100644 index 0000000..8aec52e --- /dev/null +++ b/elf/tst-dlopen-sgid.c @@ -0,0 +1,106 @@ +/* Test case for ignored LD_LIBRARY_PATH in static startug (bug 32976). + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <dlfcn.h> +#include <gnu/lib-names.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <support/capture_subprocess.h> +#include <support/check.h> +#include <support/support.h> +#include <support/temp_file.h> +#include <support/test-driver.h> +#include <sys/wait.h> +#include <unistd.h> + +/* This is the name of our test object. Use a custom module for + testing, so that this object does not get picked up from the system + path. */ +static const char dso_name[] = "tst-dlopen-sgid-mod.so"; + +/* Used to mark the recursive invocation. */ +static const char magic_argument[] = "run-actual-test"; + +static int +do_test (void) +{ +/* Pathname of the directory that receives the shared objects this + test attempts to load. */ + char *libdir = support_create_temp_directory ("tst-dlopen-sgid-"); + + /* This is supposed to be ignored and stripped. */ + TEST_COMPARE (setenv ("LD_LIBRARY_PATH", libdir, 1), 0); + + /* Copy of libc.so.6. */ + { + char *from = xasprintf ("%s/%s", support_objdir_root, LIBC_SO); + char *to = xasprintf ("%s/%s", libdir, LIBC_SO); + add_temp_file (to); + support_copy_file (from, to); + free (to); + free (from); + } + + /* Copy of the test object. */ + { + char *from = xasprintf ("%s/elf/%s", support_objdir_root, dso_name); + char *to = xasprintf ("%s/%s", libdir, dso_name); + add_temp_file (to); + support_copy_file (from, to); + free (to); + free (from); + } + + free (libdir); + + support_capture_subprogram_self_sgid (magic_argument); + + return 0; +} + +static void +alternative_main (int argc, char **argv) +{ + if (argc == 2 && strcmp (argv[1], magic_argument) == 0) + { + if (getgid () == getegid ()) + /* This can happen if the file system is mounted nosuid. */ + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", + (intmax_t) getgid ()); + + /* Should be removed due to SGID. 
*/ + TEST_COMPARE_STRING (getenv ("LD_LIBRARY_PATH"), NULL); + + TEST_VERIFY (dlopen (dso_name, RTLD_NOW) == NULL); + { + const char *message = dlerror (); + TEST_COMPARE_STRING (message, + "tst-dlopen-sgid-mod.so:" + " cannot open shared object file:" + " No such file or directory"); + } + + support_record_failure_barrier (); + exit (EXIT_SUCCESS); + } +} + +#define PREPARE alternative_main +#include <support/test-driver.c> diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c index a472190..233eec76 100644 --- a/elf/tst-env-setuid-tunables.c +++ b/elf/tst-env-setuid-tunables.c @@ -105,10 +105,7 @@ do_test (int argc, char **argv) if (ret != 0) exit (1); - - /* Special return code to make sure that the child executed all the way - through. */ - exit (42); + return 0; } else { @@ -127,18 +124,7 @@ do_test (int argc, char **argv) continue; } - int status = support_capture_subprogram_self_sgid (buf); - - /* Bail out early if unsupported. */ - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) - return EXIT_UNSUPPORTED; - - if (WEXITSTATUS (status) != 42) - { - printf (" [%d] child failed with status %d\n", i, - WEXITSTATUS (status)); - support_record_failure (); - } + support_capture_subprogram_self_sgid (buf); } return 0; } diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c index 43047c4..c084aa4 100644 --- a/elf/tst-env-setuid.c +++ b/elf/tst-env-setuid.c @@ -148,10 +148,7 @@ do_test (int argc, char **argv) if (ret != 0) exit (1); - - /* Special return code to make sure that the child executed all the way - through. */ - exit (42); + return 0; } else { @@ -175,17 +172,7 @@ do_test (int argc, char **argv) free (profilepath); } - int status = support_capture_subprogram_self_sgid (SETGID_CHILD); - - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) - exit (EXIT_UNSUPPORTED); - - if (WEXITSTATUS (status) != 42) - { - printf (" child failed with status %d\n", - WEXITSTATUS (status)); - support_record_failure (); - } + support_capture_subprogram_self_sgid (SETGID_CHILD); return 0; } diff --git a/libio/Makefile b/libio/Makefile index b92aeaf..92d6c6b 100644 --- a/libio/Makefile +++ b/libio/Makefile @@ -68,22 +68,79 @@ routines_no_fortify += \ wprintf \ # routines_no_fortify -tests = tst_swprintf tst_wprintf tst_swscanf tst_wscanf tst_getwc tst_putwc \ - tst_wprintf2 tst-widetext test-fmemopen tst-ext tst-ext2 \ - tst-fgetws tst-ungetwc1 tst-ungetwc2 tst-swscanf tst-sscanf \ - tst-mmap-setvbuf bug-ungetwc1 bug-ungetwc2 tst-atime tst-eof \ - tst-freopen bug-rewind bug-rewind2 bug-ungetc bug-fseek \ - tst-mmap-eofsync tst-mmap-fflushsync bug-mmap-fflush \ - tst-mmap2-eofsync tst-mmap-offend bug-fopena+ bug-wfflush \ - bug-ungetc2 bug-ftell bug-ungetc3 bug-ungetc4 tst-fopenloc2 \ - tst-memstream1 tst-memstream2 tst-memstream3 tst-memstream4 \ - tst-wmemstream1 tst-wmemstream2 tst-wmemstream3 tst-wmemstream4 \ - tst-wmemstream5 bug-memstream1 bug-wmemstream1 \ - tst-setvbuf1 tst-popen1 tst-fgetwc bug-wsetpos tst-fseek \ - tst-fwrite-error tst-ftell-partial-wide tst-ftell-active-handler \ - tst-ftell-append tst-fputws tst-bz22415 tst-fgetc-after-eof \ - tst-sprintf-ub tst-sprintf-chk-ub tst-bz24051 tst-bz24153 \ - tst-wfile-sync tst-bz28828 tst-getdelim +tests = \ + bug-fopena+ \ + bug-fseek \ + bug-ftell \ + bug-memstream1 \ + bug-mmap-fflush \ + bug-rewind \ + bug-rewind2 \ + bug-ungetc \ + bug-ungetc2 \ + bug-ungetc3 \ + bug-ungetc4 \ + bug-ungetwc1 \ + bug-ungetwc2 \ + bug-wfflush \ + bug-wmemstream1 \ + bug-wsetpos \ + test-fmemopen \ + tst-atime \ + tst-bz22415 \ + tst-bz24051 \ 
+ tst-bz24153 \ + tst-bz28828 \ + tst-eof \ + tst-ext \ + tst-ext2 \ + tst-fgetc-after-eof \ + tst-fgetwc \ + tst-fgetws \ + tst-fopenloc2 \ + tst-fputws \ + tst-freopen \ + tst-fseek \ + tst-ftell-active-handler \ + tst-ftell-append \ + tst-ftell-partial-wide \ + tst-fwrite-error \ + tst-getdelim \ + tst-memstream1 \ + tst-memstream2 \ + tst-memstream3 \ + tst-memstream4 \ + tst-mmap-eofsync \ + tst-mmap-fflushsync \ + tst-mmap-offend \ + tst-mmap-setvbuf \ + tst-mmap2-eofsync \ + tst-popen-fork \ + tst-popen1 \ + tst-setvbuf1 \ + tst-sprintf-chk-ub \ + tst-sprintf-ub \ + tst-sscanf \ + tst-swscanf \ + tst-ungetwc1 \ + tst-ungetwc2 \ + tst-wfile-sync \ + tst-widetext \ + tst-wmemstream1 \ + tst-wmemstream2 \ + tst-wmemstream3 \ + tst-wmemstream4 \ + tst-wmemstream5 \ + tst_getwc \ + tst_putwc \ + tst_swprintf \ + tst_swscanf \ + tst_wprintf \ + tst_wprintf2 \ + tst_wscanf \ + # tests + +$(objpfx)tst-popen-fork: $(shared-thread-library) tests-internal = tst-vtables tst-vtables-interposed @@ -235,16 +292,26 @@ tests-special += $(objpfx)tst-fopenloc-cmp.out $(objpfx)tst-fopenloc-mem.out \ $(objpfx)tst-bz24228-mem.out endif -tests += tst-cleanup-default tst-cleanup-default-static +tests += \ + tst-cleanup-default \ + tst-cleanup-default-static \ + # tests tests-static += tst-cleanup-default-static tests-special += $(objpfx)tst-cleanup-default-cmp.out $(objpfx)tst-cleanup-default-static-cmp.out LDFLAGS-tst-cleanup-default = -Wl,--gc-sections LDFLAGS-tst-cleanup-default-static = -Wl,--gc-sections ifeq ($(have-gnu-retain)$(have-z-start-stop-gc),yesyes) -tests += tst-cleanup-start-stop-gc tst-cleanup-start-stop-gc-static \ - tst-cleanup-nostart-stop-gc tst-cleanup-nostart-stop-gc-static -tests-static += tst-cleanup-start-stop-gc-static tst-cleanup-nostart-stop-gc-static +tests += \ + tst-cleanup-nostart-stop-gc \ + tst-cleanup-nostart-stop-gc-static \ + tst-cleanup-start-stop-gc \ + tst-cleanup-start-stop-gc-static \ + # tests +tests-static += \ + tst-cleanup-nostart-stop-gc-static \ + tst-cleanup-start-stop-gc-static \ + # tests-static tests-special += $(objpfx)tst-cleanup-start-stop-gc-cmp.out \ $(objpfx)tst-cleanup-start-stop-gc-static-cmp.out \ $(objpfx)tst-cleanup-nostart-stop-gc-cmp.out \ diff --git a/libio/iopopen.c b/libio/iopopen.c index d01cb06..352513a 100644 --- a/libio/iopopen.c +++ b/libio/iopopen.c @@ -57,6 +57,26 @@ unlock (void *not_used) } #endif +/* These lock/unlock/resetlock functions are used during fork. */ + +void +_IO_proc_file_chain_lock (void) +{ + _IO_lock_lock (proc_file_chain_lock); +} + +void +_IO_proc_file_chain_unlock (void) +{ + _IO_lock_unlock (proc_file_chain_lock); +} + +void +_IO_proc_file_chain_resetlock (void) +{ + _IO_lock_init (proc_file_chain_lock); +} + /* POSIX states popen shall ensure that any streams from previous popen() calls that remain open in the parent process should be closed in the new child process. diff --git a/libio/libioP.h b/libio/libioP.h index 616253f..a83a411 100644 --- a/libio/libioP.h +++ b/libio/libioP.h @@ -429,6 +429,12 @@ libc_hidden_proto (_IO_list_resetlock) extern void _IO_enable_locks (void) __THROW; libc_hidden_proto (_IO_enable_locks) +/* Functions for operating popen's proc_file_chain_lock during fork. */ + +extern void _IO_proc_file_chain_lock (void) __THROW attribute_hidden; +extern void _IO_proc_file_chain_unlock (void) __THROW attribute_hidden; +extern void _IO_proc_file_chain_resetlock (void) __THROW attribute_hidden; + /* Default jumptable functions. 
*/ extern int _IO_default_underflow (FILE *) __THROW; diff --git a/libio/tst-popen-fork.c b/libio/tst-popen-fork.c new file mode 100644 index 0000000..1df30fc --- /dev/null +++ b/libio/tst-popen-fork.c @@ -0,0 +1,80 @@ +/* Test concurrent popen and fork. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <stdatomic.h> +#include <pthread.h> +#include <unistd.h> +#include <sys/wait.h> + +#include <support/check.h> +#include <support/xthread.h> +#include <support/xunistd.h> + +static void +popen_and_pclose (void) +{ + FILE *f = popen ("true", "r"); + TEST_VERIFY_EXIT (f != NULL); + pclose (f); + return; +} + +static atomic_bool done = ATOMIC_VAR_INIT (0); + +static void * +popen_and_pclose_forever (__attribute__ ((unused)) + void *arg) +{ + while (!atomic_load_explicit (&done, memory_order_acquire)) + popen_and_pclose (); + return NULL; +} + +static int +do_test (void) +{ + + /* Repeatedly call popen in a loop during the entire test. */ + pthread_t t = xpthread_create (NULL, popen_and_pclose_forever, NULL); + + /* Repeatedly fork off and reap child processes one-by-one. + Each child calls popen once, then exits, leading to the possibility + that a child forks *during* our own popen call, thus inheriting any + intermediate popen state, possibly including lock state(s). */ + for (int i = 0; i < 100; i++) + { + int cpid = xfork (); + + if (cpid == 0) + { + popen_and_pclose (); + _exit (0); + } + else + xwaitpid (cpid, NULL, 0); + } + + /* Stop calling popen. */ + atomic_store_explicit (&done, 1, memory_order_release); + xpthread_join (t); + + return 0; +} + +#include <support/test-driver.c> diff --git a/manual/tunables.texi b/manual/tunables.texi index be97190..b255a14 100644 --- a/manual/tunables.texi +++ b/manual/tunables.texi @@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647) glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff) glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff) glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff) +glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff) glibc.cpu.x86_shstk: glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff) glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff) @@ -485,7 +486,8 @@ thread stack originally backup by Huge Pages to default pages. 
@cindex shared_cache_size tunables @cindex tunables, shared_cache_size @cindex non_temporal_threshold tunables -@cindex tunables, non_temporal_threshold +@cindex memset_non_temporal_threshold tunables +@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold @deftp {Tunable namespace} glibc.cpu Behavior of @theglibc{} can be tuned to assume specific hardware capabilities @@ -561,6 +563,18 @@ like memmove and memcpy. This tunable is specific to i386 and x86-64. @end deftp +@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold +The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows +the user to set threshold in bytes for non temporal store in +memset. Non temporal stores give a hint to the hardware to move data +directly to memory without displacing other data from the cache. This +tunable is used by some platforms to determine when to use non +temporal stores memset. + +This tunable is specific to i386 and x86-64. +@end deftp + + @deftp Tunable glibc.cpu.x86_rep_movsb_threshold The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to set threshold in bytes to start using "rep movsb". The value must be diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c index aada916..51afa62 100644 --- a/nptl/pthread_cond_broadcast.c +++ b/nptl/pthread_cond_broadcast.c @@ -57,10 +57,10 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) { /* Add as many signals as the remaining size of the group. */ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, - cond->__data.__g_size[g1] << 1); + cond->__data.__g_size[g1]); cond->__data.__g_size[g1] = 0; - /* We need to wake G1 waiters before we quiesce G1 below. */ + /* We need to wake G1 waiters before we switch G1 below. */ /* TODO Only set it if there are indeed futex waiters. We could also try to move this out of the critical section in cases when G2 is empty (and we don't need to quiesce). */ @@ -69,11 +69,11 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) /* G1 is complete. Step (2) is next unless there are no waiters in G2, in which case we can stop. */ - if (__condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private)) + if (__condvar_switch_g1 (cond, wseq, &g1, private)) { /* Step (3): Send signals to all waiters in the old G2 / new G1. */ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, - cond->__data.__g_size[g1] << 1); + cond->__data.__g_size[g1]); cond->__data.__g_size[g1] = 0; /* TODO Only set it if there are indeed futex waiters. */ do_futex_wake = true; diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c index 3487557..3894029 100644 --- a/nptl/pthread_cond_common.c +++ b/nptl/pthread_cond_common.c @@ -189,19 +189,17 @@ __condvar_get_private (int flags) return FUTEX_SHARED; } -/* This closes G1 (whose index is in G1INDEX), waits for all futex waiters to - leave G1, converts G1 into a fresh G2, and then switches group roles so that - the former G2 becomes the new G1 ending at the current __wseq value when we - eventually make the switch (WSEQ is just an observation of __wseq by the - signaler). +/* This closes G1 (whose index is in G1INDEX), converts G1 into a fresh G2, + and then switches group roles so that the former G2 becomes the new G1 + ending at the current __wseq value when we eventually make the switch + (WSEQ is just an observation of __wseq by the signaler). If G2 is empty, it will not switch groups because then it would create an empty G1 which would require switching groups again on the next signal. 
Returns false iff groups were not switched because G2 was empty. */ static bool __attribute__ ((unused)) -__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, +__condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, unsigned int *g1index, int private) { - const unsigned int maxspin = 0; unsigned int g1 = *g1index; /* If there is no waiter in G2, we don't do anything. The expression may @@ -210,96 +208,23 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, behavior. Note that this works correctly for a zero-initialized condvar too. */ unsigned int old_orig_size = __condvar_get_orig_size (cond); - uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond) >> 1; - if (((unsigned) (wseq - old_g1_start - old_orig_size) - + cond->__data.__g_size[g1 ^ 1]) == 0) + uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond); + uint64_t new_g1_start = old_g1_start + old_orig_size; + if (((unsigned) (wseq - new_g1_start) + cond->__data.__g_size[g1 ^ 1]) == 0) return false; - /* Now try to close and quiesce G1. We have to consider the following kinds - of waiters: + /* We have to consider the following kinds of waiters: * Waiters from less recent groups than G1 are not affected because nothing will change for them apart from __g1_start getting larger. * New waiters arriving concurrently with the group switching will all go into G2 until we atomically make the switch. Waiters existing in G2 are not affected. - * Waiters in G1 will be closed out immediately by setting a flag in - __g_signals, which will prevent waiters from blocking using a futex on - __g_signals and also notifies them that the group is closed. As a - result, they will eventually remove their group reference, allowing us - to close switch group roles. */ + * Waiters in G1 have already received a signal and been woken. */ - /* First, set the closed flag on __g_signals. This tells waiters that are - about to wait that they shouldn't do that anymore. This basically - serves as an advance notification of the upcoming change to __g1_start; - waiters interpret it as if __g1_start was larger than their waiter - sequence position. This allows us to change __g1_start after waiting - for all existing waiters with group references to leave, which in turn - makes recovery after stealing a signal simpler because it then can be - skipped if __g1_start indicates that the group is closed (otherwise, - we would have to recover always because waiters don't know how big their - groups are). Relaxed MO is fine. */ - atomic_fetch_or_relaxed (cond->__data.__g_signals + g1, 1); - - /* Wait until there are no group references anymore. The fetch-or operation - injects us into the modification order of __g_refs; release MO ensures - that waiters incrementing __g_refs after our fetch-or see the previous - changes to __g_signals and to __g1_start that had to happen before we can - switch this G1 and alias with an older group (we have two groups, so - aliasing requires switching group roles twice). Note that nobody else - can have set the wake-request flag, so we do not have to act upon it. - - Also note that it is harmless if older waiters or waiters from this G1 - get a group reference after we have quiesced the group because it will - remain closed for them either because of the closed flag in __g_signals - or the later update to __g1_start. New waiters will never arrive here - but instead continue to go into the still current G2. 
*/ - unsigned r = atomic_fetch_or_release (cond->__data.__g_refs + g1, 0); - while ((r >> 1) > 0) - { - for (unsigned int spin = maxspin; ((r >> 1) > 0) && (spin > 0); spin--) - { - /* TODO Back off. */ - r = atomic_load_relaxed (cond->__data.__g_refs + g1); - } - if ((r >> 1) > 0) - { - /* There is still a waiter after spinning. Set the wake-request - flag and block. Relaxed MO is fine because this is just about - this futex word. - - Update r to include the set wake-request flag so that the upcoming - futex_wait only blocks if the flag is still set (otherwise, we'd - violate the basic client-side futex protocol). */ - r = atomic_fetch_or_relaxed (cond->__data.__g_refs + g1, 1) | 1; - - if ((r >> 1) > 0) - futex_wait_simple (cond->__data.__g_refs + g1, r, private); - /* Reload here so we eventually see the most recent value even if we - do not spin. */ - r = atomic_load_relaxed (cond->__data.__g_refs + g1); - } - } - /* Acquire MO so that we synchronize with the release operation that waiters - use to decrement __g_refs and thus happen after the waiters we waited - for. */ - atomic_thread_fence_acquire (); - - /* Update __g1_start, which finishes closing this group. The value we add - will never be negative because old_orig_size can only be zero when we - switch groups the first time after a condvar was initialized, in which - case G1 will be at index 1 and we will add a value of 1. See above for - why this takes place after waiting for quiescence of the group. - Relaxed MO is fine because the change comes with no additional - constraints that others would have to observe. */ - __condvar_add_g1_start_relaxed (cond, - (old_orig_size << 1) + (g1 == 1 ? 1 : - 1)); - - /* Now reopen the group, thus enabling waiters to again block using the - futex controlled by __g_signals. Release MO so that observers that see - no signals (and thus can block) also see the write __g1_start and thus - that this is now a new group (see __pthread_cond_wait_common for the - matching acquire MO loads). */ - atomic_store_release (cond->__data.__g_signals + g1, 0); + /* Update __g1_start, which closes this group. Relaxed MO is fine because + the change comes with no additional constraints that others would have + to observe. */ + __condvar_add_g1_start_relaxed (cond, old_orig_size); /* At this point, the old G1 is now a valid new G2 (but not in use yet). No old waiter can neither grab a signal nor acquire a reference without @@ -311,9 +236,13 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, g1 ^= 1; *g1index ^= 1; + /* Now advance the new G1 g_signals to the new g1_start, giving it + an effective signal count of 0 to start. */ + atomic_store_release (cond->__data.__g_signals + g1, (unsigned)new_g1_start); + /* These values are just observed by signalers, and thus protected by the lock. */ - unsigned int orig_size = wseq - (old_g1_start + old_orig_size); + unsigned int orig_size = wseq - new_g1_start; __condvar_set_orig_size (cond, orig_size); /* Use and addition to not loose track of cancellations in what was previously G2. */ diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c index 43d6286..fa3a5c3 100644 --- a/nptl/pthread_cond_signal.c +++ b/nptl/pthread_cond_signal.c @@ -69,19 +69,18 @@ ___pthread_cond_signal (pthread_cond_t *cond) bool do_futex_wake = false; /* If G1 is still receiving signals, we put the signal there. 
If not, we - check if G2 has waiters, and if so, quiesce and switch G1 to the former - G2; if this results in a new G1 with waiters (G2 might have cancellations - already, see __condvar_quiesce_and_switch_g1), we put the signal in the - new G1. */ + check if G2 has waiters, and if so, switch G1 to the former G2; if this + results in a new G1 with waiters (G2 might have cancellations already, + see __condvar_switch_g1), we put the signal in the new G1. */ if ((cond->__data.__g_size[g1] != 0) - || __condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private)) + || __condvar_switch_g1 (cond, wseq, &g1, private)) { /* Add a signal. Relaxed MO is fine because signaling does not need to - establish a happens-before relation (see above). We do not mask the - release-MO store when initializing a group in - __condvar_quiesce_and_switch_g1 because we use an atomic - read-modify-write and thus extend that store's release sequence. */ - atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2); + establish a happens-before relation (see above). We do not mask the + release-MO store when initializing a group in __condvar_switch_g1 + because we use an atomic read-modify-write and thus extend that + store's release sequence. */ + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 1); cond->__data.__g_size[g1]--; /* TODO Only set it if there are indeed futex waiters. */ do_futex_wake = true; diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c index 66786c7..0f1dfcb 100644 --- a/nptl/pthread_cond_wait.c +++ b/nptl/pthread_cond_wait.c @@ -84,7 +84,7 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g, not hold a reference on the group. */ __condvar_acquire_lock (cond, private); - uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1; + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); if (g1_start > seq) { /* Our group is closed, so someone provided enough signals for it. @@ -143,23 +143,6 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g, } } -/* Wake up any signalers that might be waiting. */ -static void -__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private) -{ - /* Release MO to synchronize-with the acquire load in - __condvar_quiesce_and_switch_g1. */ - if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3) - { - /* Clear the wake-up request flag before waking up. We do not need more - than relaxed MO and it doesn't matter if we apply this for an aliased - group because we wake all futex waiters right after clearing the - flag. */ - atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1); - futex_wake (cond->__data.__g_refs + g, INT_MAX, private); - } -} - /* Clean-up for cancellation of waiters waiting for normal signals. We cancel our registration as a waiter, confirm we have woken up, and re-acquire the mutex. */ @@ -171,8 +154,6 @@ __condvar_cleanup_waiting (void *arg) pthread_cond_t *cond = cbuffer->cond; unsigned g = cbuffer->wseq & 1; - __condvar_dec_grefs (cond, g, cbuffer->private); - __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private); /* FIXME With the current cancellation implementation, it is possible that a thread is cancelled after it has returned from a syscall. This could @@ -238,9 +219,7 @@ __condvar_cleanup_waiting (void *arg) signaled), and a reference count. The group reference count is used to maintain the number of waiters that - are using the group's futex. 
Before a group can change its role, the - reference count must show that no waiters are using the futex anymore; this - prevents ABA issues on the futex word. + are using the group's futex. To represent which intervals in the waiter sequence the groups cover (and thus also which group slot contains G1 or G2), we use a 64b counter to @@ -251,7 +230,7 @@ __condvar_cleanup_waiting (void *arg) figure out whether they are in a group that has already been completely signaled (i.e., if the current G1 starts at a later position that the waiter's position). Waiters cannot determine whether they are currently - in G2 or G1 -- but they do not have too because all they are interested in + in G2 or G1 -- but they do not have to because all they are interested in is whether there are available signals, and they always start in G2 (whose group slot they know because of the bit in the waiter sequence. Signalers will simply fill the right group until it is completely signaled and can @@ -280,7 +259,6 @@ __condvar_cleanup_waiting (void *arg) * Waiters fetch-add while having acquire the mutex associated with the condvar. Signalers load it and fetch-xor it concurrently. __g1_start: Starting position of G1 (inclusive) - * LSB is index of current G2. * Modified by signalers while having acquired the condvar-internal lock and observed concurrently by waiters. __g1_orig_size: Initial size of G1 @@ -300,11 +278,10 @@ __condvar_cleanup_waiting (void *arg) last reference. * Reference count used by waiters concurrently with signalers that have acquired the condvar-internal lock. - __g_signals: The number of signals that can still be consumed. + __g_signals: The number of signals that can still be consumed, relative to + the current g1_start. (i.e. g1_start with the signal count added) * Used as a futex word by waiters. Used concurrently by waiters and signalers. - * LSB is true iff this group has been completely signaled (i.e., it is - closed). __g_size: Waiters remaining in this group (i.e., which have not been signaled yet. * Accessed by signalers and waiters that cancel waiting (both do so only @@ -328,27 +305,6 @@ __condvar_cleanup_waiting (void *arg) sufficient because if a waiter can see a sufficiently large value, it could have also consume a signal in the waiters group. - Waiters try to grab a signal from __g_signals without holding a reference - count, which can lead to stealing a signal from a more recent group after - their own group was already closed. They cannot always detect whether they - in fact did because they do not know when they stole, but they can - conservatively add a signal back to the group they stole from; if they - did so unnecessarily, all that happens is a spurious wake-up. To make this - even less likely, __g1_start contains the index of the current g2 too, - which allows waiters to check if there aliasing on the group slots; if - there wasn't, they didn't steal from the current G1, which means that the - G1 they stole from must have been already closed and they do not need to - fix anything. - - It is essential that the last field in pthread_cond_t is __g_signals[1]: - The previous condvar used a pointer-sized field in pthread_cond_t, so a - PTHREAD_COND_INITIALIZER from that condvar implementation might only - initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes - in total instead of the 48 we need). 
__g_signals[1] is not accessed before - the first group switch (G2 starts at index 0), which will set its value to - zero after a harmless fetch-or whose return value is ignored. This - effectively completes initialization. - Limitations: * This condvar isn't designed to allow for more than @@ -379,7 +335,6 @@ static __always_inline int __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, clockid_t clockid, const struct __timespec64 *abstime) { - const int maxspin = 0; int err; int result = 0; @@ -396,8 +351,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, because we do not need to establish any happens-before relation with signalers (see __pthread_cond_signal); modification order alone establishes a total order of waiters/signals. We do need acquire MO - to synchronize with group reinitialization in - __condvar_quiesce_and_switch_g1. */ + to synchronize with group reinitialization in __condvar_switch_g1. */ uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2); /* Find our group's index. We always go into what was G2 when we acquired our position. */ @@ -424,178 +378,64 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, return err; } - /* Now wait until a signal is available in our group or it is closed. - Acquire MO so that if we observe a value of zero written after group - switching in __condvar_quiesce_and_switch_g1, we synchronize with that - store and will see the prior update of __g1_start done while switching - groups too. */ - unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); - do + while (1) { - while (1) - { - /* Spin-wait first. - Note that spinning first without checking whether a timeout - passed might lead to what looks like a spurious wake-up even - though we should return ETIMEDOUT (e.g., if the caller provides - an absolute timeout that is clearly in the past). However, - (1) spurious wake-ups are allowed, (2) it seems unlikely that a - user will (ab)use pthread_cond_wait as a check for whether a - point in time is in the past, and (3) spinning first without - having to compare against the current time seems to be the right - choice from a performance perspective for most use cases. */ - unsigned int spin = maxspin; - while (signals == 0 && spin > 0) - { - /* Check that we are not spinning on a group that's already - closed. */ - if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) - goto done; - - /* TODO Back off. */ - - /* Reload signals. See above for MO. */ - signals = atomic_load_acquire (cond->__data.__g_signals + g); - spin--; - } - - /* If our group will be closed as indicated by the flag on signals, - don't bother grabbing a signal. */ - if (signals & 1) - goto done; - - /* If there is an available signal, don't block. */ - if (signals != 0) - break; - - /* No signals available after spinning, so prepare to block. - We first acquire a group reference and use acquire MO for that so - that we synchronize with the dummy read-modify-write in - __condvar_quiesce_and_switch_g1 if we read from that. In turn, - in this case this will make us see the closed flag on __g_signals - that designates a concurrent attempt to reuse the group's slot. - We use acquire MO for the __g_signals check to make the - __g1_start check work (see spinning above). - Note that the group reference acquisition will not mask the - release MO when decrementing the reference count because we use - an atomic read-modify-write operation and thus extend the release - sequence. 
*/ - atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); - if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0) - || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))) - { - /* Our group is closed. Wake up any signalers that might be - waiting. */ - __condvar_dec_grefs (cond, g, private); - goto done; - } - - // Now block. - struct _pthread_cleanup_buffer buffer; - struct _condvar_cleanup_buffer cbuffer; - cbuffer.wseq = wseq; - cbuffer.cond = cond; - cbuffer.mutex = mutex; - cbuffer.private = private; - __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); - - err = __futex_abstimed_wait_cancelable64 ( - cond->__data.__g_signals + g, 0, clockid, abstime, private); - - __pthread_cleanup_pop (&buffer, 0); - - if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) - { - __condvar_dec_grefs (cond, g, private); - /* If we timed out, we effectively cancel waiting. Note that - we have decremented __g_refs before cancellation, so that a - deadlock between waiting for quiescence of our group in - __condvar_quiesce_and_switch_g1 and us trying to acquire - the lock during cancellation is not possible. */ - __condvar_cancel_waiting (cond, seq, g, private); - result = err; - goto done; - } - else - __condvar_dec_grefs (cond, g, private); - - /* Reload signals. See above for MO. */ - signals = atomic_load_acquire (cond->__data.__g_signals + g); + /* Now wait until a signal is available in our group or it is closed. + Acquire MO so that if we observe (signals == lowseq) after group + switching in __condvar_switch_g1, we synchronize with that store and + will see the prior update of __g1_start done while switching groups + too. */ + unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); + + if (seq < g1_start) + { + /* If the group is closed already, + then this waiter originally had enough extra signals to + consume, up until the time its group was closed. */ + break; + } + + /* If there is an available signal, don't block. + If __g1_start has advanced at all, then we must be in G1 + by now, perhaps in the process of switching back to an older + G2, but in either case we're allowed to consume the available + signal and should not block anymore. */ + if ((int)(signals - (unsigned int)g1_start) > 0) + { + /* Try to grab a signal. See above for MO. (if we do another loop + iteration we need to see the correct value of g1_start) */ + if (atomic_compare_exchange_weak_acquire ( + cond->__data.__g_signals + g, + &signals, signals - 1)) + break; + else + continue; } + // Now block. + struct _pthread_cleanup_buffer buffer; + struct _condvar_cleanup_buffer cbuffer; + cbuffer.wseq = wseq; + cbuffer.cond = cond; + cbuffer.mutex = mutex; + cbuffer.private = private; + __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); + + err = __futex_abstimed_wait_cancelable64 ( + cond->__data.__g_signals + g, signals, clockid, abstime, private); + + __pthread_cleanup_pop (&buffer, 0); + + if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) + { + /* If we timed out, we effectively cancel waiting. */ + __condvar_cancel_waiting (cond, seq, g, private); + result = err; + break; + } } - /* Try to grab a signal. Use acquire MO so that we see an up-to-date value - of __g1_start below (see spinning above for a similar case). In - particular, if we steal from a more recent group, we will also see a - more recent __g1_start below. 
*/ - while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g, - &signals, signals - 2)); - - /* We consumed a signal but we could have consumed from a more recent group - that aliased with ours due to being in the same group slot. If this - might be the case our group must be closed as visible through - __g1_start. */ - uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); - if (seq < (g1_start >> 1)) - { - /* We potentially stole a signal from a more recent group but we do not - know which group we really consumed from. - We do not care about groups older than current G1 because they are - closed; we could have stolen from these, but then we just add a - spurious wake-up for the current groups. - We will never steal a signal from current G2 that was really intended - for G2 because G2 never receives signals (until it becomes G1). We - could have stolen a signal from G2 that was conservatively added by a - previous waiter that also thought it stole a signal -- but given that - that signal was added unnecessarily, it's not a problem if we steal - it. - Thus, the remaining case is that we could have stolen from the current - G1, where "current" means the __g1_start value we observed. However, - if the current G1 does not have the same slot index as we do, we did - not steal from it and do not need to undo that. This is the reason - for putting a bit with G2's index into__g1_start as well. */ - if (((g1_start & 1) ^ 1) == g) - { - /* We have to conservatively undo our potential mistake of stealing - a signal. We can stop trying to do that when the current G1 - changes because other spinning waiters will notice this too and - __condvar_quiesce_and_switch_g1 has checked that there are no - futex waiters anymore before switching G1. - Relaxed MO is fine for the __g1_start load because we need to - merely be able to observe this fact and not have to observe - something else as well. - ??? Would it help to spin for a little while to see whether the - current G1 gets closed? This might be worthwhile if the group is - small or close to being closed. */ - unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g); - while (__condvar_load_g1_start_relaxed (cond) == g1_start) - { - /* Try to add a signal. We don't need to acquire the lock - because at worst we can cause a spurious wake-up. If the - group is in the process of being closed (LSB is true), this - has an effect similar to us adding a signal. */ - if (((s & 1) != 0) - || atomic_compare_exchange_weak_relaxed - (cond->__data.__g_signals + g, &s, s + 2)) - { - /* If we added a signal, we also need to add a wake-up on - the futex. We also need to do that if we skipped adding - a signal because the group is being closed because - while __condvar_quiesce_and_switch_g1 could have closed - the group, it might still be waiting for futex waiters to - leave (and one of those waiters might be the one we stole - the signal from, which cause it to block using the - futex). */ - futex_wake (cond->__data.__g_signals + g, 1, private); - break; - } - /* TODO Back off. */ - } - } - } - - done: /* Confirm that we have been woken. 
We do that before acquiring the mutex to allow for execution of pthread_cond_destroy while having acquired the diff --git a/nptl/tst-cond22.c b/nptl/tst-cond22.c index 1336e9c..bdcb45c 100644 --- a/nptl/tst-cond22.c +++ b/nptl/tst-cond22.c @@ -106,13 +106,13 @@ do_test (void) status = 1; } - printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n", + printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n", c.__data.__wseq.__value32.__high, c.__data.__wseq.__value32.__low, c.__data.__g1_start.__value32.__high, c.__data.__g1_start.__value32.__low, - c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0], - c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1], + c.__data.__g_signals[0], c.__data.__g_size[0], + c.__data.__g_signals[1], c.__data.__g_size[1], c.__data.__g1_orig_size, c.__data.__wrefs); if (pthread_create (&th, NULL, tf, (void *) 1l) != 0) @@ -152,13 +152,13 @@ do_test (void) status = 1; } - printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n", + printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n", c.__data.__wseq.__value32.__high, c.__data.__wseq.__value32.__low, c.__data.__g1_start.__value32.__high, c.__data.__g1_start.__value32.__low, - c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0], - c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1], + c.__data.__g_signals[0], c.__data.__g_size[0], + c.__data.__g_signals[1], c.__data.__g_size[1], c.__data.__g1_orig_size, c.__data.__wrefs); return status; diff --git a/posix/fork.c b/posix/fork.c index 298765a..cf9b80e 100644 --- a/posix/fork.c +++ b/posix/fork.c @@ -62,6 +62,7 @@ __libc_fork (void) call_function_static_weak (__nss_database_fork_prepare_parent, &nss_database_data); + _IO_proc_file_chain_lock (); _IO_list_lock (); /* Acquire malloc locks. This needs to come last because fork @@ -92,6 +93,7 @@ __libc_fork (void) /* Reset locks in the I/O code. */ _IO_list_resetlock (); + _IO_proc_file_chain_resetlock (); call_function_static_weak (__nss_database_fork_subprocess, &nss_database_data); @@ -121,6 +123,7 @@ __libc_fork (void) /* We execute this even if the 'fork' call failed. */ _IO_list_unlock (); + _IO_proc_file_chain_unlock (); } /* Run the handlers registered for the parent. */ diff --git a/scripts/sort-makefile-lines.py b/scripts/sort-makefile-lines.py index f65ee40..b2249ae 100755 --- a/scripts/sort-makefile-lines.py +++ b/scripts/sort-makefile-lines.py @@ -129,7 +129,7 @@ def sort_makefile_lines(): for i in range(len(lines)): # Look for things like "var = \", "var := \" or "var += \" # to start the sorted list. - var = re.search(r'^([a-zA-Z0-9-]*) [\+:]?\= \\$', lines[i]) + var = re.search(r'^([-_a-zA-Z0-9]*) [\+:]?\= \\$', lines[i]) if var: # Remember the index and the name. startmarks.append((i, var.group(1))) @@ -140,7 +140,7 @@ def sort_makefile_lines(): rangemarks = [] for sm in startmarks: # Look for things like " # var" to end the sorted list. - reg = r'^ # ' + sm[1] + r'$' + reg = r'^ *# ' + sm[1] + r'$' for j in range(sm[0] + 1, len(lines)): if re.search(reg, lines[j]): # Remember the block to sort (inclusive). 
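The rewritten wait loop in pthread_cond_wait.c above keys everything off an unsigned difference: __g_signals is now kept relative to the low 32 bits of __g1_start, and a waiter may consume a signal only while (int)(signals - (unsigned int)g1_start) > 0. A minimal standalone sketch of that comparison (not glibc code; the field names are reused and the sample values are made up purely for illustration):

/* Sketch: wraparound-safe "is a signal available" test, mirroring
   the relative __g_signals scheme introduced by the patch above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* True iff at least one signal can be consumed: g_signals has
   advanced past the low 32 bits of g1_start, modulo 2^32. */
static int
signal_available (unsigned int g_signals, uint64_t g1_start)
{
  return (int) (g_signals - (unsigned int) g1_start) > 0;
}

int
main (void)
{
  /* Freshly switched group: g_signals equals the low bits of
     g1_start, so the effective signal count is zero. */
  assert (!signal_available (0xfffffffeu, 0xfffffffeull));

  /* Two signals were added and the 32-bit word wrapped to 0; the
     unsigned difference is still 2, so the signal is visible. */
  assert (signal_available (0u, 0xfffffffeull));

  puts ("relative __g_signals comparison behaves as expected");
  return 0;
}

Because the difference is taken modulo 2^32 before the signed cast, the test keeps working when the 32-bit signal word wraps past zero, which is why the group-reference machinery and its closed-flag LSB could be removed.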
diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c index cc26ed6..cefee58 100644 --- a/stdlib/tst-secure-getenv.c +++ b/stdlib/tst-secure-getenv.c @@ -57,13 +57,7 @@ do_test (void) exit (1); } - int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); - - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) - return EXIT_UNSUPPORTED; - - if (!WIFEXITED (status)) - FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); + support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); return 0; } @@ -82,6 +76,7 @@ alternative_main (int argc, char **argv) if (secure_getenv ("PATH") != NULL) FAIL_EXIT (4, "PATH variable not filtered out\n"); + support_record_failure_barrier (); exit (EXIT_SUCCESS); } } diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h index 1ecbdfe..8cbdca3 100644 --- a/support/capture_subprocess.h +++ b/support/capture_subprocess.h @@ -41,11 +41,12 @@ struct support_capture_subprocess support_capture_subprocess struct support_capture_subprocess support_capture_subprogram (const char *file, char *const argv[]); -/* Copy the running program into a setgid binary and run it with CHILD_ID - argument. If execution is successful, return the exit status of the child - program, otherwise return a non-zero failure exit code. */ -int support_capture_subprogram_self_sgid - (char *child_id); +/* Copy the running program into a setgid binary and run it with + CHILD_ID argument. If the program exits with a non-zero status, + exit with that exit status (or status 1 if the program did not exit + normally). If the test cannot be performed, exit with + EXIT_UNSUPPORTED. */ +void support_capture_subprogram_self_sgid (const char *child_id); /* Deallocate the subprocess data captured by support_capture_subprocess. */ diff --git a/support/check.h b/support/check.h index 7ea22c7..8f41e5b 100644 --- a/support/check.h +++ b/support/check.h @@ -207,6 +207,9 @@ void support_record_failure_reset (void); failures or not. */ int support_record_failure_is_failed (void); +/* Terminate the process if any failures have been encountered so far. */ +void support_record_failure_barrier (void); + __END_DECLS #endif /* SUPPORT_CHECK_H */ diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c index ffced8a..8dc95f8 100644 --- a/support/support_capture_subprocess.c +++ b/support/support_capture_subprocess.c @@ -21,12 +21,17 @@ #include <errno.h> #include <fcntl.h> +#include <grp.h> +#include <scratch_buffer.h> +#include <stdio_ext.h> #include <stdlib.h> +#include <string.h> #include <support/check.h> #include <support/xunistd.h> #include <support/xsocket.h> #include <support/xspawn.h> #include <support/support.h> +#include <support/temp_file.h> #include <support/test-driver.h> static void @@ -108,111 +113,88 @@ support_capture_subprogram (const char *file, char *const argv[]) /* Copies the executable into a restricted directory, so that we can safely make it SGID with the TARGET group ID. Then runs the executable. 
*/ -static int -copy_and_spawn_sgid (char *child_id, gid_t gid) +static void +copy_and_spawn_sgid (const char *child_id, gid_t gid) { - char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", - test_dir, (intmax_t) getpid ()); + char *dirname = support_create_temp_directory ("tst-glibc-sgid-"); char *execname = xasprintf ("%s/bin", dirname); - int infd = -1; - int outfd = -1; - int ret = 1, status = 1; - - TEST_VERIFY (mkdir (dirname, 0700) == 0); - if (support_record_failure_is_failed ()) - goto err; + add_temp_file (execname); - infd = open ("/proc/self/exe", O_RDONLY); - if (infd < 0) + if (access ("/proc/self/exe", R_OK) != 0) FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n"); - outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); - TEST_VERIFY (outfd >= 0); - if (support_record_failure_is_failed ()) - goto err; - - char buf[4096]; - for (;;) - { - ssize_t rdcount = read (infd, buf, sizeof (buf)); - TEST_VERIFY (rdcount >= 0); - if (support_record_failure_is_failed ()) - goto err; - if (rdcount == 0) - break; - char *p = buf; - char *end = buf + rdcount; - while (p != end) - { - ssize_t wrcount = write (outfd, buf, end - p); - if (wrcount == 0) - errno = ENOSPC; - TEST_VERIFY (wrcount > 0); - if (support_record_failure_is_failed ()) - goto err; - p += wrcount; - } - } + support_copy_file ("/proc/self/exe", execname); - bool chowned = false; - TEST_VERIFY ((chowned = fchown (outfd, getuid (), gid) == 0) - || errno == EPERM); - if (support_record_failure_is_failed ()) - goto err; - else if (!chowned) - { - ret = 77; - goto err; - } + if (chown (execname, getuid (), gid) != 0) + FAIL_UNSUPPORTED ("cannot change group of \"%s\" to %jd: %m", + execname, (intmax_t) gid); - TEST_VERIFY (fchmod (outfd, 02750) == 0); - if (support_record_failure_is_failed ()) - goto err; - TEST_VERIFY (close (outfd) == 0); - if (support_record_failure_is_failed ()) - goto err; - TEST_VERIFY (close (infd) == 0); - if (support_record_failure_is_failed ()) - goto err; + if (chmod (execname, 02750) != 0) + FAIL_UNSUPPORTED ("cannot make \"%s\" SGID: %m ", execname); /* We have the binary, now spawn the subprocess. Avoid using support_subprogram because we only want the program exit status, not the contents. */ - ret = 0; - infd = outfd = -1; - char * const args[] = {execname, child_id, NULL}; + char * const args[] = {execname, (char *) child_id, NULL}; + int status = support_subprogram_wait (args[0], args); - status = support_subprogram_wait (args[0], args); + free (execname); + free (dirname); -err: - if (outfd >= 0) - close (outfd); - if (infd >= 0) - close (infd); - if (execname != NULL) + if (WIFEXITED (status)) { - unlink (execname); - free (execname); + if (WEXITSTATUS (status) == 0) + return; + else + exit (WEXITSTATUS (status)); } - if (dirname != NULL) + else + FAIL_EXIT1 ("subprogram failed with status %d", status); +} + +/* Returns true if a group with NAME has been found, and writes its + GID to *TARGET. */ +static bool +find_sgid_group (gid_t *target, const char *name) +{ + /* Do not use getgrname_r because it does not work in statically + linked binaries if the system libc is different. 
*/ + FILE *fp = fopen ("/etc/group", "rce"); + if (fp == NULL) + return false; + __fsetlocking (fp, FSETLOCKING_BYCALLER); + + bool ok = false; + struct scratch_buffer buf; + scratch_buffer_init (&buf); + while (true) { - rmdir (dirname); - free (dirname); + struct group grp; + struct group *result = NULL; + int status = fgetgrent_r (fp, &grp, buf.data, buf.length, &result); + if (status == 0 && result != NULL) + { + if (strcmp (result->gr_name, name) == 0) + { + *target = result->gr_gid; + ok = true; + break; + } + } + else if (errno != ERANGE) + break; + else if (!scratch_buffer_grow (&buf)) + break; } - - if (ret == 77) - FAIL_UNSUPPORTED ("Failed to make sgid executable for test\n"); - if (ret != 0) - FAIL_EXIT1 ("Failed to make sgid executable for test\n"); - - return status; + scratch_buffer_free (&buf); + fclose (fp); + return ok; } -int -support_capture_subprogram_self_sgid (char *child_id) +void +support_capture_subprogram_self_sgid (const char *child_id) { - gid_t target = 0; const int count = 64; gid_t groups[count]; @@ -224,6 +206,7 @@ support_capture_subprogram_self_sgid (char *child_id) (intmax_t) getuid ()); gid_t current = getgid (); + gid_t target = current; for (int i = 0; i < ret; ++i) { if (groups[i] != current) @@ -233,11 +216,18 @@ support_capture_subprogram_self_sgid (char *child_id) } } - if (target == 0) - FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", - (intmax_t) getuid ()); + if (target == current) + { + /* If running as root, try to find a harmless group for SGID. */ + if (getuid () != 0 + || (!find_sgid_group (&target, "nogroup") + && !find_sgid_group (&target, "bin") + && !find_sgid_group (&target, "daemon"))) + FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", + (intmax_t) getuid ()); + } - return copy_and_spawn_sgid (child_id, target); + copy_and_spawn_sgid (child_id, target); } void diff --git a/support/support_record_failure.c b/support/support_record_failure.c index 9781237..72ee2b2 100644 --- a/support/support_record_failure.c +++ b/support/support_record_failure.c @@ -112,3 +112,13 @@ support_record_failure_is_failed (void) synchronization for reliable test error reporting anyway. 
*/ return __atomic_load_n (&state->failed, __ATOMIC_RELAXED); } + +void +support_record_failure_barrier (void) +{ + if (__atomic_load_n (&state->failed, __ATOMIC_RELAXED)) + { + puts ("error: exiting due to previous errors"); + exit (1); + } +} diff --git a/sysdeps/ieee754/dbl-64/e_atanh.c b/sysdeps/ieee754/dbl-64/e_atanh.c index 11a2a45..05ac0a1 100644 --- a/sysdeps/ieee754/dbl-64/e_atanh.c +++ b/sysdeps/ieee754/dbl-64/e_atanh.c @@ -44,6 +44,11 @@ static const double huge = 1e300; +#ifndef SECTION +# define SECTION +#endif + +SECTION double __ieee754_atanh (double x) { @@ -73,4 +78,7 @@ __ieee754_atanh (double x) return copysign (t, x); } + +#ifndef __ieee754_atanh libm_alias_finite (__ieee754_atanh, __atanh) +#endif diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h index df54eef..bccc200 100644 --- a/sysdeps/nptl/bits/thread-shared-types.h +++ b/sysdeps/nptl/bits/thread-shared-types.h @@ -95,11 +95,12 @@ struct __pthread_cond_s { __atomic_wide_counter __wseq; __atomic_wide_counter __g1_start; - unsigned int __g_refs[2] __LOCK_ALIGNMENT; - unsigned int __g_size[2]; + unsigned int __g_size[2] __LOCK_ALIGNMENT; unsigned int __g1_orig_size; unsigned int __wrefs; unsigned int __g_signals[2]; + unsigned int __unused_initialized_1; + unsigned int __unused_initialized_2; }; typedef unsigned int __tss_t; diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h index 3d4f4a7..e0f2441 100644 --- a/sysdeps/nptl/pthread.h +++ b/sysdeps/nptl/pthread.h @@ -152,7 +152,7 @@ enum /* Conditional variable handling. */ -#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, {0, 0}, 0, 0, {0, 0} } } +#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0}, 0, 0 } } /* Cleanup buffers */ diff --git a/sysdeps/powerpc/powerpc64/le/power10/memchr.S b/sysdeps/powerpc/powerpc64/le/power10/memchr.S deleted file mode 100644 index 53e5716..0000000 --- a/sysdeps/powerpc/powerpc64/le/power10/memchr.S +++ /dev/null @@ -1,315 +0,0 @@ -/* Optimized memchr implementation for POWER10 LE. - Copyright (C) 2021-2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -# ifndef MEMCHR -# define MEMCHR __memchr -# endif -# define M_VREG_ZERO v20 -# define M_OFF_START_LOOP 256 -# define MEMCHR_SUBTRACT_VECTORS \ - vsububm v4,v4,v18; \ - vsububm v5,v5,v18; \ - vsububm v6,v6,v18; \ - vsububm v7,v7,v18; -# define M_TAIL(vreg,increment) \ - vctzlsbb r4,vreg; \ - cmpld r5,r4; \ - ble L(null); \ - addi r4,r4,increment; \ - add r3,r6,r4; \ - blr - -/* TODO: Replace macros by the actual instructions when minimum binutils becomes - >= 2.35. This is used to keep compatibility with older versions. 
*/ -#define M_VEXTRACTBM(rt,vrb) \ - .long(((4)<<(32-6)) \ - | ((rt)<<(32-11)) \ - | ((8)<<(32-16)) \ - | ((vrb)<<(32-21)) \ - | 1602) - -#define M_LXVP(xtp,dq,ra) \ - .long(((6)<<(32-6)) \ - | ((((xtp)-32)>>1)<<(32-10)) \ - | ((1)<<(32-11)) \ - | ((ra)<<(32-16)) \ - | dq) - -#define CHECK16B(vreg,offset,addr,label) \ - lxv vreg+32,offset(addr); \ - vcmpequb. vreg,vreg,v18; \ - bne cr6,L(label); \ - cmpldi r5,16; \ - ble L(null); \ - addi r5,r5,-16; - -/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has # - of bytes already checked. */ -#define CHECK64B(offset,addr,label) \ - M_LXVP(v4+32,offset,addr); \ - M_LXVP(v6+32,offset+32,addr); \ - MEMCHR_SUBTRACT_VECTORS; \ - vminub v14,v4,v5; \ - vminub v15,v6,v7; \ - vminub v16,v14,v15; \ - vcmpequb. v0,v16,M_VREG_ZERO; \ - beq cr6,$+12; \ - li r7,offset; \ - b L(label); \ - cmpldi r5,64; \ - ble L(null); \ - addi r5,r5,-64 - -/* Implements the function - void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]). */ - - .machine power9 - -ENTRY_TOCLESS (MEMCHR) - CALL_MCOUNT 3 - - cmpldi r5,0 - beq L(null) - mr r0,r5 - xori r6,r4,0xff - - mtvsrd v18+32,r4 /* matching char in v18 */ - mtvsrd v19+32,r6 /* non matching char in v19 */ - - vspltb v18,v18,7 /* replicate */ - vspltb v19,v19,7 /* replicate */ - vspltisb M_VREG_ZERO,0 - - /* Next 16B-aligned address. Prepare address for L(aligned). */ - addi r6,r3,16 - clrrdi r6,r6,4 - - /* Align data and fill bytes not loaded with non matching char. */ - lvx v0,0,r3 - lvsr v1,0,r3 - vperm v0,v19,v0,v1 - - vcmpequb. v6,v0,v18 - bne cr6,L(found) - sub r4,r6,r3 - cmpld r5,r4 - ble L(null) - sub r5,r5,r4 - - /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is - optimized for longer strings, so checking the first bytes in 16B - chunks benefits a lot small strings. */ - .p2align 5 -L(aligned): - cmpldi r5,0 - beq L(null) - - CHECK16B(v0,0,r6,tail1) - CHECK16B(v1,16,r6,tail2) - CHECK16B(v2,32,r6,tail3) - CHECK16B(v3,48,r6,tail4) - CHECK16B(v4,64,r6,tail5) - CHECK16B(v5,80,r6,tail6) - CHECK16B(v6,96,r6,tail7) - CHECK16B(v7,112,r6,tail8) - CHECK16B(v8,128,r6,tail9) - CHECK16B(v9,144,r6,tail10) - CHECK16B(v10,160,r6,tail11) - CHECK16B(v0,176,r6,tail12) - CHECK16B(v1,192,r6,tail13) - CHECK16B(v2,208,r6,tail14) - CHECK16B(v3,224,r6,tail15) - - cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to - choose how we will perform the main loop. */ - - /* Prepare address for the loop. */ - addi r4,r3,M_OFF_START_LOOP - clrrdi r4,r4,6 - sub r6,r4,r3 - sub r5,r0,r6 - addi r6,r4,128 - - /* If c == 0, use the loop without the vsububm. */ - beq cr5,L(loop) - - /* This is very similar to the block after L(loop), the difference is - that here MEMCHR_SUBTRACT_VECTORS is not empty, and we subtract - each byte loaded by the char we are looking for, this way we can keep - using vminub to merge the results and checking for nulls. */ - .p2align 5 -L(memchr_loop): - CHECK64B(0,r4,pre_tail_64b) - CHECK64B(64,r4,pre_tail_64b) - addi r4,r4,256 - - CHECK64B(0,r6,tail_64b) - CHECK64B(64,r6,tail_64b) - addi r6,r6,256 - - CHECK64B(0,r4,pre_tail_64b) - CHECK64B(64,r4,pre_tail_64b) - addi r4,r4,256 - - CHECK64B(0,r6,tail_64b) - CHECK64B(64,r6,tail_64b) - addi r6,r6,256 - - b L(memchr_loop) - /* Switch to a more aggressive approach checking 64B each time. Use 2 - pointers 128B apart and unroll the loop once to make the pointer - updates and usages separated enough to avoid stalls waiting for - address calculation. 
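A scalar sketch of the dual-pointer scheme this comment describes (editor's illustration in C, not glibc code; probe64 stands in for one 64-byte vector probe, and the sub-256-byte tail is assumed to be handled separately, as in the assembly):

    #include <stddef.h>
    #include <string.h>

    static const unsigned char *
    probe64 (const unsigned char *p, unsigned char c)
    {
      return memchr (p, c, 64);   /* stand-in for CHECK64B */
    }

    /* Two cursors 128 bytes apart (r4 and r6 above), each advanced by
       256 per round, so the address update of one stream overlaps the
       loads of the other instead of stalling them.  */
    static const unsigned char *
    scan (const unsigned char *base, unsigned char c, size_t n)
    {
      const unsigned char *a = base, *b = base + 128, *hit;
      for (size_t done = 0; done + 256 <= n; done += 256)
        {
          if ((hit = probe64 (a, c)) || (hit = probe64 (a + 64, c)))
            return hit;
          a += 256;
          if ((hit = probe64 (b, c)) || (hit = probe64 (b + 64, c)))
            return hit;
          b += 256;
        }
      return NULL;
    }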
*/ - .p2align 5 -L(loop): -#undef MEMCHR_SUBTRACT_VECTORS -#define MEMCHR_SUBTRACT_VECTORS /* nothing */ - CHECK64B(0,r4,pre_tail_64b) - CHECK64B(64,r4,pre_tail_64b) - addi r4,r4,256 - - CHECK64B(0,r6,tail_64b) - CHECK64B(64,r6,tail_64b) - addi r6,r6,256 - - CHECK64B(0,r4,pre_tail_64b) - CHECK64B(64,r4,pre_tail_64b) - addi r4,r4,256 - - CHECK64B(0,r6,tail_64b) - CHECK64B(64,r6,tail_64b) - addi r6,r6,256 - - b L(loop) - - .p2align 5 -L(pre_tail_64b): - mr r6,r4 -L(tail_64b): - /* OK, we found a null byte. Let's look for it in the current 64-byte - block and mark it in its corresponding VR. lxvp vx,0(ry) puts the - low 16B bytes into vx+1, and the high into vx, so the order here is - v5, v4, v7, v6. */ - vcmpequb v1,v5,M_VREG_ZERO - vcmpequb v2,v4,M_VREG_ZERO - vcmpequb v3,v7,M_VREG_ZERO - vcmpequb v4,v6,M_VREG_ZERO - - /* Take into account the other 64B blocks we had already checked. */ - add r6,r6,r7 - /* Extract first bit of each byte. */ - M_VEXTRACTBM(r8,v1) - M_VEXTRACTBM(r9,v2) - M_VEXTRACTBM(r10,v3) - M_VEXTRACTBM(r11,v4) - - /* Shift each value into their corresponding position. */ - sldi r9,r9,16 - sldi r10,r10,32 - sldi r11,r11,48 - - /* Merge the results. */ - or r8,r8,r9 - or r9,r10,r11 - or r11,r9,r8 - - cnttzd r0,r11 /* Count trailing zeros before the match. */ - cmpld r5,r0 - ble L(null) - add r3,r6,r0 /* Compute final address. */ - blr - - .p2align 5 -L(tail1): - M_TAIL(v0,0) - - .p2align 5 -L(tail2): - M_TAIL(v1,16) - - .p2align 5 -L(tail3): - M_TAIL(v2,32) - - .p2align 5 -L(tail4): - M_TAIL(v3,48) - - .p2align 5 -L(tail5): - M_TAIL(v4,64) - - .p2align 5 -L(tail6): - M_TAIL(v5,80) - - .p2align 5 -L(tail7): - M_TAIL(v6,96) - - .p2align 5 -L(tail8): - M_TAIL(v7,112) - - .p2align 5 -L(tail9): - M_TAIL(v8,128) - - .p2align 5 -L(tail10): - M_TAIL(v9,144) - - .p2align 5 -L(tail11): - M_TAIL(v10,160) - - .p2align 5 -L(tail12): - M_TAIL(v0,176) - - .p2align 5 -L(tail13): - M_TAIL(v1,192) - - .p2align 5 -L(tail14): - M_TAIL(v2,208) - - .p2align 5 -L(tail15): - M_TAIL(v3,224) - - .p2align 5 -L(found): - vctzlsbb r7,v6 - cmpld r5,r7 - ble L(null) - add r3,r3,r7 - blr - - .p2align 5 -L(null): - li r3,0 - blr - -END (MEMCHR) - -weak_alias (__memchr, memchr) -libc_hidden_builtin_def (memchr) diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S deleted file mode 100644 index f0d6732..0000000 --- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S +++ /dev/null @@ -1,233 +0,0 @@ -/* Optimized strcmp implementation for PowerPC64/POWER10. - Copyright (C) 2021-2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ -#include <sysdep.h> - -#ifndef STRCMP -# define STRCMP strcmp -#endif - -/* Implements the function - int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]). 
*/ - -/* TODO: Change this to actual instructions when minimum binutils is upgraded - to 2.27. Macros are defined below for these newer instructions in order - to maintain compatibility. */ - -#define LXVP(xtp,dq,ra) \ - .long(((6)<<(32-6)) \ - | ((((xtp)-32)>>1)<<(32-10)) \ - | ((1)<<(32-11)) \ - | ((ra)<<(32-16)) \ - | dq) - -#define COMPARE_16(vreg1,vreg2,offset) \ - lxv vreg1+32,offset(r3); \ - lxv vreg2+32,offset(r4); \ - vcmpnezb. v7,vreg1,vreg2; \ - bne cr6,L(different); \ - -#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \ - LXVP(vreg1+32,offset,r3); \ - LXVP(vreg2+32,offset,r4); \ - vcmpnezb. v7,vreg1+1,vreg2+1; \ - bne cr6,L(label1); \ - vcmpnezb. v7,vreg1,vreg2; \ - bne cr6,L(label2); \ - -#define TAIL(vreg1,vreg2) \ - vctzlsbb r6,v7; \ - vextubrx r5,r6,vreg1; \ - vextubrx r4,r6,vreg2; \ - subf r3,r4,r5; \ - blr; \ - -#define CHECK_N_BYTES(reg1,reg2,len_reg) \ - sldi r0,len_reg,56; \ - lxvl 32+v4,reg1,r0; \ - lxvl 32+v5,reg2,r0; \ - add reg1,reg1,len_reg; \ - add reg2,reg2,len_reg; \ - vcmpnezb v7,v4,v5; \ - vctzlsbb r6,v7; \ - cmpld cr7,r6,len_reg; \ - blt cr7,L(different); \ - - /* TODO: change this to .machine power10 when the minimum required - binutils allows it. */ - - .machine power9 -ENTRY_TOCLESS (STRCMP, 4) - andi. r7,r3,4095 - andi. r8,r4,4095 - cmpldi cr0,r7,4096-16 - cmpldi cr1,r8,4096-16 - bgt cr0,L(crosses) - bgt cr1,L(crosses) - COMPARE_16(v4,v5,0) - -L(crosses): - andi. r7,r3,15 - subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */ - andi. r9,r4,15 - subfic r5,r9,16 /* r5(nalign2) = 16 - (str2 & 15). */ - cmpld cr7,r7,r5 - beq cr7,L(same_aligned) - blt cr7,L(nalign1_min) - - /* nalign2 is minimum and s2 pointer is aligned. */ - CHECK_N_BYTES(r3,r4,r5) - /* Are we on the 64B hunk which crosses a page? */ - andi. r10,r3,63 /* Determine offset into 64B hunk. */ - andi. r8,r3,15 /* The offset into the 16B hunk. */ - neg r7,r3 - andi. r9,r7,15 /* Number of bytes after a 16B cross. */ - rlwinm. r7,r7,26,0x3F /* ((r3-4096))>>6&63. */ - beq L(compare_64_pagecross) - mtctr r7 - b L(compare_64B_unaligned) - - /* nalign1 is minimum and s1 pointer is aligned. */ -L(nalign1_min): - CHECK_N_BYTES(r3,r4,r7) - /* Are we on the 64B hunk which crosses a page? */ - andi. r10,r4,63 /* Determine offset into 64B hunk. */ - andi. r8,r4,15 /* The offset into the 16B hunk. */ - neg r7,r4 - andi. r9,r7,15 /* Number of bytes after a 16B cross. */ - rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ - beq L(compare_64_pagecross) - mtctr r7 - - .p2align 5 -L(compare_64B_unaligned): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - COMPARE_16(v4,v5,48) - addi r3,r3,64 - addi r4,r4,64 - bdnz L(compare_64B_unaligned) - - /* Cross the page boundary of s2, carefully. Only for first - iteration we have to get the count of 64B blocks to be checked. - From second iteration and beyond, loop counter is always 63. 
*/ -L(compare_64_pagecross): - li r11, 63 - mtctr r11 - cmpldi r10,16 - ble L(cross_4) - cmpldi r10,32 - ble L(cross_3) - cmpldi r10,48 - ble L(cross_2) -L(cross_1): - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - addi r3,r3,48 - addi r4,r4,48 - b L(compare_64B_unaligned) -L(cross_2): - COMPARE_16(v4,v5,0) - addi r3,r3,16 - addi r4,r4,16 - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - addi r3,r3,32 - addi r4,r4,32 - b L(compare_64B_unaligned) -L(cross_3): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - addi r3,r3,32 - addi r4,r4,32 - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - COMPARE_16(v4,v5,0) - addi r3,r3,16 - addi r4,r4,16 - b L(compare_64B_unaligned) -L(cross_4): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - addi r3,r3,48 - addi r4,r4,48 - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - b L(compare_64B_unaligned) - -L(same_aligned): - CHECK_N_BYTES(r3,r4,r7) - /* Align s1 to 32B and adjust s2 address. - Use lxvp only if both s1 and s2 are 32B aligned. */ - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - COMPARE_16(v4,v5,48) - addi r3,r3,64 - addi r4,r4,64 - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - - clrldi r6,r3,59 - subfic r5,r6,32 - add r3,r3,r5 - add r4,r4,r5 - andi. r5,r4,0x1F - beq cr0,L(32B_aligned_loop) - - .p2align 5 -L(16B_aligned_loop): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - COMPARE_16(v4,v5,48) - addi r3,r3,64 - addi r4,r4,64 - b L(16B_aligned_loop) - - /* Calculate and return the difference. */ -L(different): - TAIL(v4,v5) - - .p2align 5 -L(32B_aligned_loop): - COMPARE_32(v14,v16,0,tail1,tail2) - COMPARE_32(v18,v20,32,tail3,tail4) - COMPARE_32(v22,v24,64,tail5,tail6) - COMPARE_32(v26,v28,96,tail7,tail8) - addi r3,r3,128 - addi r4,r4,128 - b L(32B_aligned_loop) - -L(tail1): TAIL(v15,v17) -L(tail2): TAIL(v14,v16) -L(tail3): TAIL(v19,v21) -L(tail4): TAIL(v18,v20) -L(tail5): TAIL(v23,v25) -L(tail6): TAIL(v22,v24) -L(tail7): TAIL(v27,v29) -L(tail8): TAIL(v26,v28) - -END (STRCMP) -libc_hidden_builtin_def (strcmp) diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 594fbb8..27d8495 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -31,10 +31,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ strncase-power8 ifneq (,$(filter %le,$(config-machine))) -sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \ - memmove-power10 memset-power10 rawmemchr-power9 \ - rawmemchr-power10 strcmp-power9 strcmp-power10 \ - strncmp-power9 strcpy-power9 stpcpy-power9 \ +sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \ + rawmemchr-power9 rawmemchr-power10 \ + strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 5b2d6a9..ad6080f 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -226,12 +226,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c. 
*/ IFUNC_IMPL (i, name, memchr, -#ifdef __LITTLE_ENDIAN__ - IFUNC_IMPL_ADD (array, i, memchr, - hwcap2 & PPC_FEATURE2_ARCH_3_1 - && hwcap & PPC_FEATURE_HAS_VSX, - __memchr_power10) -#endif IFUNC_IMPL_ADD (array, i, memchr, hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC, @@ -384,10 +378,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcmp, #ifdef __LITTLE_ENDIAN__ IFUNC_IMPL_ADD (array, i, strcmp, - (hwcap2 & PPC_FEATURE2_ARCH_3_1) - && (hwcap & PPC_FEATURE_HAS_VSX), - __strcmp_power10) - IFUNC_IMPL_ADD (array, i, strcmp, hwcap2 & PPC_FEATURE2_ARCH_3_00 && hwcap & PPC_FEATURE_HAS_ALTIVEC, __strcmp_power9) diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c index 57d23e7..b4655df 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c +++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c @@ -25,23 +25,15 @@ extern __typeof (__memchr) __memchr_ppc attribute_hidden; extern __typeof (__memchr) __memchr_power7 attribute_hidden; extern __typeof (__memchr) __memchr_power8 attribute_hidden; -# ifdef __LITTLE_ENDIAN__ -extern __typeof (__memchr) __memchr_power10 attribute_hidden; -# endif /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ libc_ifunc (__memchr, -# ifdef __LITTLE_ENDIAN__ - (hwcap2 & PPC_FEATURE2_ARCH_3_1 - && hwcap & PPC_FEATURE_HAS_VSX) - ? __memchr_power10 : -# endif - (hwcap2 & PPC_FEATURE2_ARCH_2_07 - && hwcap & PPC_FEATURE_HAS_ALTIVEC) - ? __memchr_power8 : - (hwcap & PPC_FEATURE_ARCH_2_06) - ? __memchr_power7 - : __memchr_ppc); + (hwcap2 & PPC_FEATURE2_ARCH_2_07 + && hwcap & PPC_FEATURE_HAS_ALTIVEC) + ? __memchr_power8 : + (hwcap & PPC_FEATURE_ARCH_2_06) + ? __memchr_power7 + : __memchr_ppc); weak_alias (__memchr, memchr) libc_hidden_builtin_def (memchr) diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S deleted file mode 100644 index 1a9f606..0000000 --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S +++ /dev/null @@ -1,26 +0,0 @@ -/* Optimized strcmp implementation for POWER10/PPC64. - Copyright (C) 2021-2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -#if defined __LITTLE_ENDIAN__ && IS_IN (libc) -#define STRCMP __strcmp_power10 - -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) - -#include <sysdeps/powerpc/powerpc64/le/power10/strcmp.S> -#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */ diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c index ff32496..06b9b40 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c @@ -29,16 +29,12 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden; extern __typeof (strcmp) __strcmp_power8 attribute_hidden; # ifdef __LITTLE_ENDIAN__ extern __typeof (strcmp) __strcmp_power9 attribute_hidden; -extern __typeof (strcmp) __strcmp_power10 attribute_hidden; # endif # undef strcmp libc_ifunc_redirected (__redirect_strcmp, strcmp, # ifdef __LITTLE_ENDIAN__ - (hwcap2 & PPC_FEATURE2_ARCH_3_1 - && hwcap & PPC_FEATURE_HAS_VSX) - ? __strcmp_power10 : (hwcap2 & PPC_FEATURE2_ARCH_3_00 && hwcap & PPC_FEATURE_HAS_ALTIVEC) ? __strcmp_power9 : diff --git a/sysdeps/unix/sysv/linux/syscalls.list b/sysdeps/unix/sysv/linux/syscalls.list index 73e941e..9ac42c3 100644 --- a/sysdeps/unix/sysv/linux/syscalls.list +++ b/sysdeps/unix/sysv/linux/syscalls.list @@ -46,6 +46,7 @@ open_tree EXTRA open_tree i:isU open_tree pipe2 - pipe2 i:fi __pipe2 pipe2 pidfd_open EXTRA pidfd_open i:iU pidfd_open pidfd_getfd EXTRA pidfd_getfd i:iiU pidfd_getfd +prctl EXTRA prctl i:iiiii __prctl prctl __prctl_time64 pivot_root EXTRA pivot_root i:ss pivot_root pidfd_send_signal EXTRA pidfd_send_signal i:iiPU pidfd_send_signal process_madvise EXTRA process_madvise i:iPniU process_madvise diff --git a/sysdeps/unix/sysv/linux/x86_64/Makefile b/sysdeps/unix/sysv/linux/x86_64/Makefile index 9a1e7aa..fcbffd8 100644 --- a/sysdeps/unix/sysv/linux/x86_64/Makefile +++ b/sysdeps/unix/sysv/linux/x86_64/Makefile @@ -17,18 +17,21 @@ endif ifeq ($(subdir),elf) ifeq (yes,$(enable-x86-isa-level)) tests += \ - tst-glibc-hwcaps-2 + tst-glibc-hwcaps-2 \ +# tests ifeq (no,$(build-hardcoded-path-in-tests)) # This is an ld.so.cache test, and RPATH/RUNPATH in the executable # interferes with its test objectives. tests-container += \ - tst-glibc-hwcaps-2-cache + tst-glibc-hwcaps-2-cache \ +# tests-container endif modules-names += \ libx86-64-isa-level-1 \ libx86-64-isa-level-2 \ libx86-64-isa-level-3 \ - libx86-64-isa-level-4 + libx86-64-isa-level-4 \ +# modules-names $(objpfx)tst-glibc-hwcaps-2: $(objpfx)libx86-64-isa-level.so diff --git a/sysdeps/unix/sysv/linux/prctl.c b/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c index 52d234e..4bf1b47 100644 --- a/sysdeps/unix/sysv/linux/prctl.c +++ b/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c @@ -1,4 +1,4 @@ -/* prctl - Linux specific syscall. +/* prctl - Linux specific syscall. x86-64 x32 version. Copyright (C) 2020-2024 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -40,6 +40,3 @@ __prctl (int option, ...) 
libc_hidden_def (__prctl) weak_alias (__prctl, prctl) -#if __TIMESIZE != 64 -weak_alias (__prctl, __prctl_time64) -#endif diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile index 5311b59..01b0192 100644 --- a/sysdeps/x86/Makefile +++ b/sysdeps/x86/Makefile @@ -21,6 +21,9 @@ tests += \ tst-cpu-features-supports-static \ tst-get-cpu-features \ tst-get-cpu-features-static \ + tst-gnu2-tls2-x86-noxsave \ + tst-gnu2-tls2-x86-noxsavec \ + tst-gnu2-tls2-x86-noxsavexsavec \ tst-hwcap-tunables \ # tests tests-static += \ @@ -91,6 +94,25 @@ CFLAGS-tst-gnu2-tls2.c += -msse CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell + +LDFLAGS-tst-gnu2-tls2-x86-noxsave += -Wl,-z,lazy +LDFLAGS-tst-gnu2-tls2-x86-noxsavec += -Wl,-z,lazy +LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy + +# Test for bug 32810: incorrect XSAVE state size if XSAVEC is disabled +# via tunable. +tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE +tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC +tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC +$(objpfx)tst-gnu2-tls2-x86-noxsave: $(shared-thread-library) +$(objpfx)tst-gnu2-tls2-x86-noxsavec: $(shared-thread-library) +$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec: $(shared-thread-library) +$(objpfx)tst-gnu2-tls2-x86-noxsave.out \ +$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \ +$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \ + $(objpfx)tst-gnu2-tls2mod0.so \ + $(objpfx)tst-gnu2-tls2mod1.so \ + $(objpfx)tst-gnu2-tls2mod2.so endif ifeq ($(subdir),math) diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h index ab73556..8349160 100644 --- a/sysdeps/x86/cacheinfo.h +++ b/sysdeps/x86/cacheinfo.h @@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024; long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; long int __x86_shared_cache_size attribute_hidden = 1024 * 1024; -/* Threshold to use non temporal store. */ +/* Threshold to use non temporal store in memmove. */ long int __x86_shared_non_temporal_threshold attribute_hidden; +/* Threshold to use non temporal store in memset. */ +long int __x86_memset_non_temporal_threshold attribute_hidden; + /* Threshold to use Enhanced REP MOVSB. 
*/ long int __x86_rep_movsb_threshold attribute_hidden = 2048; @@ -77,6 +80,9 @@ init_cacheinfo (void) __x86_shared_non_temporal_threshold = cpu_features->non_temporal_threshold; + __x86_memset_non_temporal_threshold + = cpu_features->memset_non_temporal_threshold; + __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold; __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold; diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 3d7c281..47dc3b1 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -24,6 +24,7 @@ #include <dl-cacheinfo.h> #include <dl-minsigstacksize.h> #include <dl-hwcap2.h> +#include <gcc-macros.h> extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) attribute_hidden; @@ -83,6 +84,8 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *) # include <dl-cet.h> #endif +unsigned long int _dl_x86_features_tlsdesc_state_size; + static void update_active (struct cpu_features *cpu_features) { @@ -317,17 +320,13 @@ update_active (struct cpu_features *cpu_features) = xsave_state_full_size; cpu_features->xsave_state_full_size = xsave_state_full_size; + _dl_x86_features_tlsdesc_state_size = xsave_state_full_size; /* Check if XSAVEC is available. */ if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC)) { - unsigned int xstate_comp_offsets[32]; - unsigned int xstate_comp_sizes[32]; -#ifdef __x86_64__ - unsigned int xstate_amx_comp_offsets[32]; - unsigned int xstate_amx_comp_sizes[32]; - unsigned int amx_ecx; -#endif + unsigned int xstate_comp_offsets[X86_XSTATE_MAX_ID + 1]; + unsigned int xstate_comp_sizes[X86_XSTATE_MAX_ID + 1]; unsigned int i; xstate_comp_offsets[0] = 0; @@ -335,39 +334,16 @@ update_active (struct cpu_features *cpu_features) xstate_comp_offsets[2] = 576; xstate_comp_sizes[0] = 160; xstate_comp_sizes[1] = 256; -#ifdef __x86_64__ - xstate_amx_comp_offsets[0] = 0; - xstate_amx_comp_offsets[1] = 160; - xstate_amx_comp_offsets[2] = 576; - xstate_amx_comp_sizes[0] = 160; - xstate_amx_comp_sizes[1] = 256; -#endif - for (i = 2; i < 32; i++) + for (i = 2; i <= X86_XSTATE_MAX_ID; i++) { if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0) { __cpuid_count (0xd, i, eax, ebx, ecx, edx); -#ifdef __x86_64__ - /* Include this in xsave_state_full_size. */ - amx_ecx = ecx; - xstate_amx_comp_sizes[i] = eax; - if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0) - { - /* Exclude this from xsave_state_size. */ - ecx = 0; - xstate_comp_sizes[i] = 0; - } - else -#endif - xstate_comp_sizes[i] = eax; + xstate_comp_sizes[i] = eax; } else { -#ifdef __x86_64__ - amx_ecx = 0; - xstate_amx_comp_sizes[i] = 0; -#endif ecx = 0; xstate_comp_sizes[i] = 0; } @@ -376,44 +352,32 @@ update_active (struct cpu_features *cpu_features) { xstate_comp_offsets[i] = (xstate_comp_offsets[i - 1] - + xstate_comp_sizes[i -1]); + + xstate_comp_sizes[i - 1]); if ((ecx & (1 << 1)) != 0) xstate_comp_offsets[i] = ALIGN_UP (xstate_comp_offsets[i], 64); -#ifdef __x86_64__ - xstate_amx_comp_offsets[i] - = (xstate_amx_comp_offsets[i - 1] - + xstate_amx_comp_sizes[i - 1]); - if ((amx_ecx & (1 << 1)) != 0) - xstate_amx_comp_offsets[i] - = ALIGN_UP (xstate_amx_comp_offsets[i], - 64); -#endif } } /* Use XSAVEC. 
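The loop above obtains each component's size and alignment from CPUID leaf 0xD. A standalone sketch of that query (editor's illustration; the bound of 19 mirrors the new 64-bit X86_XSTATE_MAX_ID in the sysdeps/x86/sysdep.h hunk further down):

    #include <cpuid.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* CPUID.(EAX=0xD, ECX=i): EAX is the component's size in bytes;
         ECX bit 1 set means the component is aligned to the next
         64-byte boundary in the compacted XSAVE format.  */
      for (unsigned int i = 2; i <= 19; ++i)
        {
          unsigned int eax, ebx, ecx, edx;
          __cpuid_count (0xd, i, eax, ebx, ecx, edx);
          if (eax != 0)
            printf ("xstate component %u: %u bytes%s\n", i, eax,
                    (ecx & 2) ? " (64-byte aligned when compacted)" : "");
        }
      return 0;
    }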
*/
           unsigned int size
-            = xstate_comp_offsets[31] + xstate_comp_sizes[31];
+            = (xstate_comp_offsets[X86_XSTATE_MAX_ID]
+               + xstate_comp_sizes[X86_XSTATE_MAX_ID]);
           if (size)
             {
+              size = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
+                               64);
 #ifdef __x86_64__
-              unsigned int amx_size
-                = (xstate_amx_comp_offsets[31]
-                   + xstate_amx_comp_sizes[31]);
-              amx_size
-                = ALIGN_UP ((amx_size
-                             + TLSDESC_CALL_REGISTER_SAVE_AREA),
-                            64);
-              /* Set xsave_state_full_size to the compact AMX
-                 state size for XSAVEC.  NB: xsave_state_full_size
-                 is only used in _dl_tlsdesc_dynamic_xsave and
-                 _dl_tlsdesc_dynamic_xsavec.  */
-              cpu_features->xsave_state_full_size = amx_size;
+              _dl_x86_features_tlsdesc_state_size = size;
+              /* Exclude the AMX space from the start of TILECFG
+                 space to the end of TILEDATA space.  If CPU
+                 doesn't support AMX, TILECFG offset is the same
+                 as TILEDATA + 1 offset.  Otherwise, they are
+                 multiples of 64.  */
+              size -= (xstate_comp_offsets[X86_XSTATE_TILEDATA_ID + 1]
+                       - xstate_comp_offsets[X86_XSTATE_TILECFG_ID]);
 #endif
-              cpu_features->xsave_state_size
-                = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
-                            64);
+              cpu_features->xsave_state_size = size;
               CPU_FEATURE_SET (cpu_features, XSAVEC);
             }
         }
@@ -538,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load
                 "Incorrect index_arch_Fast_Unaligned_Load");

-/* Intel Family-6 microarch list.  */
-enum
+/* Intel microarch list.  */
+enum intel_microarch
 {
   /* Atom processors.  */
   INTEL_ATOM_BONNELL,
@@ -548,6 +512,7 @@ enum
   INTEL_ATOM_GOLDMONT,
   INTEL_ATOM_GOLDMONT_PLUS,
   INTEL_ATOM_SIERRAFOREST,
+  INTEL_ATOM_CLEARWATERFOREST,
   INTEL_ATOM_GRANDRIDGE,
   INTEL_ATOM_TREMONT,
@@ -575,7 +540,9 @@ enum
   INTEL_BIGCORE_METEORLAKE,
   INTEL_BIGCORE_LUNARLAKE,
   INTEL_BIGCORE_ARROWLAKE,
+  INTEL_BIGCORE_PANTHERLAKE,
   INTEL_BIGCORE_GRANITERAPIDS,
+  INTEL_BIGCORE_DIAMONDRAPIDS,

   /* Mixed (bigcore + atom SOC).  */
   INTEL_MIXED_LAKEFIELD,
@@ -589,7 +556,7 @@ enum
   INTEL_UNKNOWN,
 };

-static unsigned int
+static enum intel_microarch
 intel_get_fam6_microarch (unsigned int model,
                           __attribute__ ((unused)) unsigned int stepping)
 {
@@ -620,6 +587,8 @@ intel_get_fam6_microarch (unsigned int model,
       return INTEL_ATOM_GOLDMONT_PLUS;
     case 0xAF:
       return INTEL_ATOM_SIERRAFOREST;
+    case 0xDD:
+      return INTEL_ATOM_CLEARWATERFOREST;
     case 0xB6:
       return INTEL_ATOM_GRANDRIDGE;
     case 0x86:
@@ -727,8 +696,12 @@ intel_get_fam6_microarch (unsigned int model,
       return INTEL_BIGCORE_METEORLAKE;
     case 0xbd:
       return INTEL_BIGCORE_LUNARLAKE;
+    case 0xb5:
+    case 0xc5:
     case 0xc6:
       return INTEL_BIGCORE_ARROWLAKE;
+    case 0xCC:
+      return INTEL_BIGCORE_PANTHERLAKE;
     case 0xAD:
     case 0xAE:
       return INTEL_BIGCORE_GRANITERAPIDS;
@@ -756,6 +729,12 @@ init_cpu_features (struct cpu_features *cpu_features)
   unsigned int stepping = 0;
   enum cpu_features_kind kind;

+  /* Default is to avoid non-temporal memset on non-Intel/AMD hardware:
+     as of this writing, we only have benchmarks indicating its
+     profitability on Intel/AMD.  */
+  cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+    |= bit_arch_Avoid_Non_Temporal_Memset;
+
   cpu_features->cachesize_non_temporal_divisor = 4;
 #if !HAS_CPUID
   if (__get_cpuid_max (0, 0) == 0)
@@ -781,125 +760,25 @@ init_cpu_features (struct cpu_features *cpu_features)

   update_active (cpu_features);

+  /* Benchmarks indicate non-temporal memset can be profitable on Intel
+     hardware.
*/ + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] + &= ~bit_arch_Avoid_Non_Temporal_Memset; + + enum intel_microarch microarch = INTEL_UNKNOWN; if (family == 0x06) { model += extended_model; - unsigned int microarch - = intel_get_fam6_microarch (model, stepping); + microarch = intel_get_fam6_microarch (model, stepping); + /* Disable TSX on some processors to avoid TSX on kernels that + weren't updated with the latest microcode package (which + disables broken feature by default). */ switch (microarch) { - /* Atom / KNL tuning. */ - case INTEL_ATOM_BONNELL: - /* BSF is slow on Bonnell. */ - cpu_features->preferred[index_arch_Slow_BSF] - |= bit_arch_Slow_BSF; - break; - - /* Unaligned load versions are faster than SSSE3 - on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ - case INTEL_ATOM_AIRMONT: - case INTEL_ATOM_SILVERMONT: - case INTEL_ATOM_GOLDMONT: - case INTEL_ATOM_GOLDMONT_PLUS: - - /* Knights Landing. Enable Silvermont optimizations. */ - case INTEL_KNIGHTS_LANDING: - - cpu_features->preferred[index_arch_Fast_Unaligned_Load] - |= (bit_arch_Fast_Unaligned_Load - | bit_arch_Fast_Unaligned_Copy - | bit_arch_Prefer_PMINUB_for_stringop - | bit_arch_Slow_SSE4_2); - break; - - case INTEL_ATOM_TREMONT: - /* Enable rep string instructions, unaligned load, unaligned - copy, pminub and avoid SSE 4.2 on Tremont. */ - cpu_features->preferred[index_arch_Fast_Rep_String] - |= (bit_arch_Fast_Rep_String - | bit_arch_Fast_Unaligned_Load - | bit_arch_Fast_Unaligned_Copy - | bit_arch_Prefer_PMINUB_for_stringop - | bit_arch_Slow_SSE4_2); - break; - - /* - Default tuned Knights microarch. - case INTEL_KNIGHTS_MILL: - */ - - /* - Default tuned atom microarch. - case INTEL_ATOM_SIERRAFOREST: - case INTEL_ATOM_GRANDRIDGE: - */ - - /* Bigcore/Default Tuning. */ default: - default_tuning: - /* Unknown family 0x06 processors. Assuming this is one - of Core i3/i5/i7 processors if AVX is available. */ - if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) - break; - - enable_modern_features: - /* Rep string instructions, unaligned load, unaligned copy, - and pminub are fast on Intel Core i3, i5 and i7. */ - cpu_features->preferred[index_arch_Fast_Rep_String] - |= (bit_arch_Fast_Rep_String - | bit_arch_Fast_Unaligned_Load - | bit_arch_Fast_Unaligned_Copy - | bit_arch_Prefer_PMINUB_for_stringop); break; - case INTEL_BIGCORE_NEHALEM: - case INTEL_BIGCORE_WESTMERE: - /* Older CPUs prefer non-temporal stores at lower threshold. */ - cpu_features->cachesize_non_temporal_divisor = 8; - goto enable_modern_features; - - /* Older Bigcore microarch (smaller non-temporal store - threshold). */ - case INTEL_BIGCORE_SANDYBRIDGE: - case INTEL_BIGCORE_IVYBRIDGE: - case INTEL_BIGCORE_HASWELL: - case INTEL_BIGCORE_BROADWELL: - cpu_features->cachesize_non_temporal_divisor = 8; - goto default_tuning; - - /* Newer Bigcore microarch (larger non-temporal store - threshold). */ - case INTEL_BIGCORE_SKYLAKE: - case INTEL_BIGCORE_KABYLAKE: - case INTEL_BIGCORE_COMETLAKE: - case INTEL_BIGCORE_SKYLAKE_AVX512: - case INTEL_BIGCORE_CANNONLAKE: - case INTEL_BIGCORE_ICELAKE: - case INTEL_BIGCORE_TIGERLAKE: - case INTEL_BIGCORE_ROCKETLAKE: - case INTEL_BIGCORE_RAPTORLAKE: - case INTEL_BIGCORE_METEORLAKE: - case INTEL_BIGCORE_LUNARLAKE: - case INTEL_BIGCORE_ARROWLAKE: - case INTEL_BIGCORE_SAPPHIRERAPIDS: - case INTEL_BIGCORE_EMERALDRAPIDS: - case INTEL_BIGCORE_GRANITERAPIDS: - cpu_features->cachesize_non_temporal_divisor = 2; - goto default_tuning; - - /* Default tuned Mixed (bigcore + atom SOC). 
*/ - case INTEL_MIXED_LAKEFIELD: - case INTEL_MIXED_ALDERLAKE: - cpu_features->cachesize_non_temporal_divisor = 2; - goto default_tuning; - } - - /* Disable TSX on some processors to avoid TSX on kernels that - weren't updated with the latest microcode package (which - disables broken feature by default). */ - switch (microarch) - { case INTEL_BIGCORE_SKYLAKE_AVX512: /* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */ if (stepping <= 5) @@ -908,38 +787,163 @@ init_cpu_features (struct cpu_features *cpu_features) case INTEL_BIGCORE_KABYLAKE: /* NB: Although the errata documents that for model == 0x8e - (kabylake skylake client), only 0xb stepping or lower are - impacted, the intention of the errata was to disable TSX on - all client processors on all steppings. Include 0xc - stepping which is an Intel Core i7-8665U, a client mobile - processor. */ + (kabylake skylake client), only 0xb stepping or lower are + impacted, the intention of the errata was to disable TSX on + all client processors on all steppings. Include 0xc + stepping which is an Intel Core i7-8665U, a client mobile + processor. */ if (stepping > 0xc) break; /* Fall through. */ case INTEL_BIGCORE_SKYLAKE: - /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for - processors listed in: - -https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html - */ - disable_tsx: - CPU_FEATURE_UNSET (cpu_features, HLE); - CPU_FEATURE_UNSET (cpu_features, RTM); - CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); - break; + /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for + processors listed in: + + https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html + */ +disable_tsx: + CPU_FEATURE_UNSET (cpu_features, HLE); + CPU_FEATURE_UNSET (cpu_features, RTM); + CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); + break; case INTEL_BIGCORE_HASWELL: - /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working - TSX. Haswell also include other model numbers that have - working TSX. */ - if (model == 0x3f && stepping >= 4) + /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working + TSX. Haswell also includes other model numbers that have + working TSX. */ + if (model == 0x3f && stepping >= 4) break; - CPU_FEATURE_UNSET (cpu_features, RTM); - break; + CPU_FEATURE_UNSET (cpu_features, RTM); + break; } } + else if (family == 19) + switch (model) + { + case 0x01: + microarch = INTEL_BIGCORE_DIAMONDRAPIDS; + break; + default: + break; + } + + switch (microarch) + { + /* Atom / KNL tuning. */ + case INTEL_ATOM_BONNELL: + /* BSF is slow on Bonnell. */ + cpu_features->preferred[index_arch_Slow_BSF] + |= bit_arch_Slow_BSF; + break; + + /* Unaligned load versions are faster than SSSE3 + on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ + case INTEL_ATOM_AIRMONT: + case INTEL_ATOM_SILVERMONT: + case INTEL_ATOM_GOLDMONT: + case INTEL_ATOM_GOLDMONT_PLUS: + + /* Knights Landing. Enable Silvermont optimizations. */ + case INTEL_KNIGHTS_LANDING: + + cpu_features->preferred[index_arch_Fast_Unaligned_Load] + |= (bit_arch_Fast_Unaligned_Load + | bit_arch_Fast_Unaligned_Copy + | bit_arch_Prefer_PMINUB_for_stringop + | bit_arch_Slow_SSE4_2); + break; + + case INTEL_ATOM_TREMONT: + /* Enable rep string instructions, unaligned load, unaligned + copy, pminub and avoid SSE 4.2 on Tremont. 
*/ + cpu_features->preferred[index_arch_Fast_Rep_String] + |= (bit_arch_Fast_Rep_String + | bit_arch_Fast_Unaligned_Load + | bit_arch_Fast_Unaligned_Copy + | bit_arch_Prefer_PMINUB_for_stringop + | bit_arch_Slow_SSE4_2); + break; + + /* + Default tuned Knights microarch. + case INTEL_KNIGHTS_MILL: + */ + + /* + Default tuned atom microarch. + case INTEL_ATOM_SIERRAFOREST: + case INTEL_ATOM_GRANDRIDGE: + case INTEL_ATOM_CLEARWATERFOREST: + */ + + /* Bigcore/Default Tuning. */ + default: + default_tuning: + /* Unknown Intel processors. Assuming this is one of Core + i3/i5/i7 processors if AVX is available. */ + if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) + break; + + enable_modern_features: + /* Rep string instructions, unaligned load, unaligned copy, + and pminub are fast on Intel Core i3, i5 and i7. */ + cpu_features->preferred[index_arch_Fast_Rep_String] + |= (bit_arch_Fast_Rep_String + | bit_arch_Fast_Unaligned_Load + | bit_arch_Fast_Unaligned_Copy + | bit_arch_Prefer_PMINUB_for_stringop); + break; + + case INTEL_BIGCORE_NEHALEM: + case INTEL_BIGCORE_WESTMERE: + /* Older CPUs prefer non-temporal stores at lower threshold. */ + cpu_features->cachesize_non_temporal_divisor = 8; + goto enable_modern_features; + + /* Older Bigcore microarch (smaller non-temporal store + threshold). */ + case INTEL_BIGCORE_SANDYBRIDGE: + case INTEL_BIGCORE_IVYBRIDGE: + case INTEL_BIGCORE_HASWELL: + case INTEL_BIGCORE_BROADWELL: + cpu_features->cachesize_non_temporal_divisor = 8; + goto default_tuning; + + /* Newer Bigcore microarch (larger non-temporal store + threshold). */ + case INTEL_BIGCORE_SKYLAKE_AVX512: + case INTEL_BIGCORE_CANNONLAKE: + /* Benchmarks indicate non-temporal memset is not + necessarily profitable on SKX (and in some cases much + worse). This is likely unique to SKX due to its unique + mesh interconnect (not present on ICX or BWD). Disable + non-temporal on all Skylake servers. */ + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] + |= bit_arch_Avoid_Non_Temporal_Memset; + /* fallthrough */ + case INTEL_BIGCORE_COMETLAKE: + case INTEL_BIGCORE_SKYLAKE: + case INTEL_BIGCORE_KABYLAKE: + case INTEL_BIGCORE_ICELAKE: + case INTEL_BIGCORE_TIGERLAKE: + case INTEL_BIGCORE_ROCKETLAKE: + case INTEL_BIGCORE_RAPTORLAKE: + case INTEL_BIGCORE_METEORLAKE: + case INTEL_BIGCORE_LUNARLAKE: + case INTEL_BIGCORE_ARROWLAKE: + case INTEL_BIGCORE_PANTHERLAKE: + case INTEL_BIGCORE_SAPPHIRERAPIDS: + case INTEL_BIGCORE_EMERALDRAPIDS: + case INTEL_BIGCORE_GRANITERAPIDS: + case INTEL_BIGCORE_DIAMONDRAPIDS: + /* Default tuned Mixed (bigcore + atom SOC). */ + case INTEL_MIXED_LAKEFIELD: + case INTEL_MIXED_ALDERLAKE: + cpu_features->cachesize_non_temporal_divisor = 2; + goto default_tuning; + } /* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER if AVX512ER is available. Don't use AVX512 to avoid lower CPU @@ -984,6 +988,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx; + /* Benchmarks indicate non-temporal memset can be profitable on AMD + hardware. */ + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] + &= ~bit_arch_Avoid_Non_Temporal_Memset; + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) { /* Since the FMA4 bit is in CPUID_INDEX_80000001 and @@ -1092,6 +1101,9 @@ no_cpuid: TUNABLE_CALLBACK (set_prefer_map_32bit_exec)); #endif + /* Do not add the logic to disable XSAVE/XSAVEC if this glibc build + requires AVX and therefore XSAVE or XSAVEC support. 
*/
+#ifndef GCCMACRO__AVX__
   bool disable_xsave_features = false;

   if (!CPU_FEATURE_USABLE_P (cpu_features, OSXSAVE))
@@ -1145,6 +1157,7 @@ no_cpuid:
       CPU_FEATURE_UNSET (cpu_features, FMA4);
     }
+#endif

 #ifdef __x86_64__
   GLRO(dl_hwcap) = HWCAP_X86_64;
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 89da7a0..a0b31d8 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -164,6 +164,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 		  /* Update xsave_state_size to XSAVE state size.  */
 		  cpu_features->xsave_state_size
 		    = cpu_features->xsave_state_full_size;
+		  _dl_x86_features_tlsdesc_state_size
+		    = cpu_features->xsave_state_full_size;
 		  CPU_FEATURE_UNSET (cpu_features, XSAVEC);
 		}
 	    }
@@ -243,6 +245,12 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 	      (n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24);
 	  }
 	  break;
+	case 25:
+	  {
+	    CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+					      Avoid_Non_Temporal_Memset, 25);
+	  }
+	  break;
 	case 26:
 	  {
 	    CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 1f68968..10ad180 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -986,9 +986,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
     rep_movsb_threshold = 2112;

-   /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
-      cases slower than the vectorized path (and for some alignments,
-      it is really slow, check BZ #30994).  */
+  /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
+     cases slower than the vectorized path (and for some alignments,
+     it is really slow, check BZ #30994).  */
   if (cpu_features->basic.kind == arch_kind_amd)
     rep_movsb_threshold = non_temporal_threshold;

@@ -1007,11 +1007,23 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   if (tunable_size != 0)
     shared = tunable_size;

+  /* Non-temporal stores are more performant on some hardware above
+     non_temporal_threshold.  Currently Prefer_Non_Temporal is set for both
+     Intel and AMD hardware.
*/ + unsigned long int memset_non_temporal_threshold = SIZE_MAX; + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)) + memset_non_temporal_threshold = non_temporal_threshold; + tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); if (tunable_size > minimum_non_temporal_threshold && tunable_size <= maximum_non_temporal_threshold) non_temporal_threshold = tunable_size; + tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); + if (tunable_size > minimum_non_temporal_threshold + && tunable_size <= maximum_non_temporal_threshold) + memset_non_temporal_threshold = tunable_size; + tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL); if (tunable_size > minimum_rep_movsb_threshold) rep_movsb_threshold = tunable_size; @@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, minimum_non_temporal_threshold, maximum_non_temporal_threshold); + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, + memset_non_temporal_threshold, + minimum_non_temporal_threshold, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, minimum_rep_movsb_threshold, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, @@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) cpu_features->data_cache_size = data; cpu_features->shared_cache_size = shared; cpu_features->non_temporal_threshold = non_temporal_threshold; + cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold; cpu_features->rep_movsb_threshold = rep_movsb_threshold; cpu_features->rep_stosb_threshold = rep_stosb_threshold; cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold; diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c index c76ea3b..8113a93 100644 --- a/sysdeps/x86/dl-diagnostics-cpu.c +++ b/sysdeps/x86/dl-diagnostics-cpu.c @@ -78,11 +78,15 @@ _dl_diagnostics_cpu (void) cpu_features->xsave_state_size); print_cpu_features_value ("xsave_state_full_size", cpu_features->xsave_state_full_size); + print_cpu_features_value ("tlsdesc_state_full_size", + _dl_x86_features_tlsdesc_state_size); print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size); print_cpu_features_value ("shared_cache_size", cpu_features->shared_cache_size); print_cpu_features_value ("non_temporal_threshold", cpu_features->non_temporal_threshold); + print_cpu_features_value ("memset_non_temporal_threshold", + cpu_features->memset_non_temporal_threshold); print_cpu_features_value ("rep_movsb_threshold", cpu_features->rep_movsb_threshold); print_cpu_features_value ("rep_movsb_stop_threshold", diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list index 7d82da0..a0a1299 100644 --- a/sysdeps/x86/dl-tunables.list +++ b/sysdeps/x86/dl-tunables.list @@ -30,6 +30,9 @@ glibc { x86_non_temporal_threshold { type: SIZE_T } + x86_memset_non_temporal_threshold { + type: SIZE_T + } x86_rep_movsb_threshold { type: SIZE_T # Since there is overhead to set up REP MOVSB operation, REP diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def index 85e7f54..61bbbc2 100644 --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512) BIT (MathVec_Prefer_No_AVX512) 
BIT (Prefer_FSRM) BIT (Avoid_Short_Distance_REP_MOVSB) +BIT (Avoid_Non_Temporal_Memset) diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h index cd7bd27..03c7138 100644 --- a/sysdeps/x86/include/cpu-features.h +++ b/sysdeps/x86/include/cpu-features.h @@ -934,8 +934,6 @@ struct cpu_features /* The full state size for XSAVE when XSAVEC is disabled by GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC - - and the AMX state size when XSAVEC is available. */ unsigned int xsave_state_full_size; /* Data cache size for use in memory and string routines, typically @@ -944,8 +942,10 @@ struct cpu_features /* Shared cache size for use in memory and string routines, typically L2 or L3 size. */ unsigned long int shared_cache_size; - /* Threshold to use non temporal store. */ + /* Threshold to use non temporal store in memmove. */ unsigned long int non_temporal_threshold; + /* Threshold to use non temporal store in memset. */ + unsigned long int memset_non_temporal_threshold; /* Threshold to use "rep movsb". */ unsigned long int rep_movsb_threshold; /* Threshold to stop using "rep movsb". */ @@ -987,6 +987,13 @@ extern const struct cpu_features *_dl_x86_get_cpu_features (void) #define __get_cpu_features() _dl_x86_get_cpu_features() +#if IS_IN (rtld) || IS_IN (libc) +/* XSAVE/XSAVEC state size used by TLS descriptors. Compared to + xsave_state_size from struct cpu_features, this includes additional + registers. */ +extern unsigned long int _dl_x86_features_tlsdesc_state_size attribute_hidden; +#endif + #if defined (_LIBC) && !IS_IN (nonlib) /* Unused for x86. */ # define INIT_ARCH() diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h index 7359149..1d6cabd 100644 --- a/sysdeps/x86/sysdep.h +++ b/sysdeps/x86/sysdep.h @@ -102,6 +102,9 @@ | (1 << X86_XSTATE_ZMM_ID) \ | (1 << X86_XSTATE_APX_F_ID)) +/* The maximum supported xstate ID. */ +# define X86_XSTATE_MAX_ID X86_XSTATE_APX_F_ID + /* AMX state mask. */ # define AMX_STATE_SAVE_MASK \ ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID)) @@ -123,6 +126,9 @@ | (1 << X86_XSTATE_K_ID) \ | (1 << X86_XSTATE_ZMM_H_ID)) +/* The maximum supported xstate ID. */ +# define X86_XSTATE_MAX_ID X86_XSTATE_ZMM_H_ID + /* States to be included in xsave_state_size. */ # define FULL_STATE_SAVE_MASK STATE_SAVE_MASK #endif diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c new file mode 100644 index 0000000..f0024c1 --- /dev/null +++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c @@ -0,0 +1 @@ +#include <elf/tst-gnu2-tls2.c> diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c new file mode 100644 index 0000000..f0024c1 --- /dev/null +++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c @@ -0,0 +1 @@ +#include <elf/tst-gnu2-tls2.c> diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c new file mode 100644 index 0000000..f0024c1 --- /dev/null +++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c @@ -0,0 +1 @@ +#include <elf/tst-gnu2-tls2.c> diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c index f6a65b8..bc573c7 100644 --- a/sysdeps/x86/tst-hwcap-tunables.c +++ b/sysdeps/x86/tst-hwcap-tunables.c @@ -60,7 +60,7 @@ static const struct test_t /* Disable everything. 
*/ "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," - "-AVX_Fast_Unaligned_Load", + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", test_1, array_length (test_1) }, @@ -68,7 +68,7 @@ static const struct test_t /* Same as before, but with some empty suboptions. */ ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," - "-ERMS,-AVX_Fast_Unaligned_Load,-,", + "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", test_1, array_length (test_1) } diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index 0ede447..08ec882 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -32,7 +32,8 @@ sysdep_routines += \ # sysdep_routines gen-as-const-headers += locale-defines.sym tests += \ - tst-rsi-strlen + tst-rsi-strlen \ +# tests endif ifeq ($(subdir),elf) @@ -232,7 +233,8 @@ sysdep_routines += \ # sysdep_routines tests += \ - tst-rsi-wcslen + tst-rsi-wcslen \ +# tests endif diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h index 9f02cfc..44d9486 100644 --- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h @@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic: # endif #else /* Allocate stack space of the required size to save the state. */ - sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP + sub _dl_x86_features_tlsdesc_state_size(%rip), %RSP_LP #endif /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9, r10 and r11. */ diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile index b527cab..bc479b4 100644 --- a/sysdeps/x86_64/fpu/multiarch/Makefile +++ b/sysdeps/x86_64/fpu/multiarch/Makefile @@ -1,6 +1,7 @@ ifeq ($(subdir),math) CFLAGS-e_asin-fma.c = -mfma -mavx2 CFLAGS-e_atan2-fma.c = -mfma -mavx2 +CFLAGS-e_atanh-fma.c = -mfma -mavx2 CFLAGS-e_exp-fma.c = -mfma -mavx2 CFLAGS-e_log-fma.c = -mfma -mavx2 CFLAGS-e_log2-fma.c = -mfma -mavx2 @@ -57,6 +58,7 @@ libm-sysdep_routines += \ e_asin-fma \ e_atan2-avx \ e_atan2-fma \ + e_atanh-fma \ e_exp-avx \ e_exp-fma \ e_exp2f-fma \ diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c new file mode 100644 index 0000000..c3f2f9e --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c @@ -0,0 +1,6 @@ +#define __ieee754_atanh __ieee754_atanh_fma +#define __log1p __log1p_fma + +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/e_atanh.c> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S b/sysdeps/x86_64/fpu/multiarch/e_atanh.c index 7d35ef2..d2b785d 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S +++ b/sysdeps/x86_64/fpu/multiarch/e_atanh.c @@ -1,5 +1,5 @@ -/* Optimized memchr implementation for POWER10/PPC64. - Copyright (C) 2016-2024 Free Software Foundation, Inc. +/* Multiple versions of atanh. + Copyright (C) 2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,13 +16,19 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
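The wrapper at the top of the next hunk picks between the SSE2 and FMA builds of __ieee754_atanh at relocation time via libc_ifunc_redirected. A self-contained illustration of the same IFUNC mechanism, using the public GCC attribute rather than glibc's internal macro (editor's sketch; my_atanh and both implementations are hypothetical stand-ins; link with -lm):

    #include <math.h>
    #include <stdio.h>

    static double atanh_generic (double x) { return atanh (x); }
    /* In glibc the fast variant is a second build of the same source,
       compiled with -mfma -mavx2; this stand-in just reuses atanh.  */
    static double atanh_fast (double x) { return atanh (x); }

    /* The resolver runs once, during relocation, before the first call.  */
    static double (*resolve_my_atanh (void)) (double)
    {
      __builtin_cpu_init ();
      if (__builtin_cpu_supports ("fma") && __builtin_cpu_supports ("avx2"))
        return atanh_fast;
      return atanh_generic;
    }

    double my_atanh (double x) __attribute__ ((ifunc ("resolve_my_atanh")));

    int
    main (void)
    {
      printf ("atanh(0.5) = %f\n", my_atanh (0.5));
      return 0;
    }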
*/ -#if defined __LITTLE_ENDIAN__ && IS_IN (libc) -#define MEMCHR __memchr_power10 +#include <sysdeps/x86/isa-level.h> +#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL +# include <libm-alias-finite.h> -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) -#undef weak_alias -#define weak_alias(name,alias) +extern double __redirect_ieee754_atanh (double); -#include <sysdeps/powerpc/powerpc64/le/power10/memchr.S> +# define SYMBOL_NAME ieee754_atanh +# include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_ieee754_atanh, __ieee754_atanh, IFUNC_SELECTOR ()); + +libm_alias_finite (__ieee754_atanh, __atanh) + +# define __ieee754_atanh __ieee754_atanh_sse2 #endif +#include <sysdeps/ieee754/dbl-64/e_atanh.c> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index c4a21d4..c34c94c 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -928,7 +928,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (BMI2)), __wcsncpy_avx2) - X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy, + X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy, 1, __wcsncpy_generic)) @@ -958,7 +958,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (BMI2)), __wcpncpy_avx2) - X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy, + X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy, 1, __wcpncpy_generic)) diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 637caad..88bf08e 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -24,9 +24,9 @@ 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with 4 VEC stores and store 4 * VEC at a time until done. 6. On machines ERMS feature, if size is range - [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold) + [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold) then REP STOSB will be used. - 7. If size >= __x86_shared_non_temporal_threshold, use a + 7. If size >= __x86_memset_non_temporal_threshold, use a non-temporal stores. */ #include <sysdep.h> @@ -318,7 +318,7 @@ L(return_vzeroupper): /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in range for 2-byte jump encoding. */ L(stosb_local): - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP jae L(nt_memset) movzbl %sil, %eax mov %RDX_LP, %RCX_LP |
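Taken together with the tunable plumbing above, this final memset hunk gives steps 6 and 7 of the algorithm comment their own cutoff, overridable at run time via GLIBC_TUNABLES=glibc.cpu.x86_memset_non_temporal_threshold=<bytes>. A C sketch of the resulting size-based selection (editor's illustration; all names are hypothetical stand-ins for the assembly's code paths):

    #include <stddef.h>

    enum memset_path { MEMSET_VECTOR, MEMSET_REP_STOSB, MEMSET_NON_TEMPORAL };

    /* Mirrors the dispatch in memset-vec-unaligned-erms.S: vector stores
       below the REP STOSB threshold, REP STOSB up to the new
       memset-specific non-temporal threshold, and non-temporal (MOVNT)
       stores followed by a fence beyond it.  */
    static enum memset_path
    choose_memset_path (size_t n, size_t rep_stosb_threshold,
                        size_t memset_nt_threshold)
    {
      if (n < rep_stosb_threshold)
        return MEMSET_VECTOR;        /* steps 1-5: vector stores */
      if (n < memset_nt_threshold)
        return MEMSET_REP_STOSB;     /* step 6: REP STOSB */
      return MEMSET_NON_TEMPORAL;    /* step 7: non-temporal stores */
    }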