Diffstat (limited to 'sysdeps')
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_oryon1.S    40
-rw-r--r--  sysdeps/aarch64/multiarch/memset_oryon1.S    26
-rw-r--r--  sysdeps/mach/hurd/dup3.c                     62
-rw-r--r--  sysdeps/mach/hurd/fcntl.c                    53
-rw-r--r--  sysdeps/mach/hurd/futimens.c                  8
-rw-r--r--  sysdeps/mach/hurd/futimes.c                   8
-rw-r--r--  sysdeps/mach/hurd/i386/bits/sigcontext.h      2
-rw-r--r--  sysdeps/mach/hurd/i386/sigreturn.c           35
-rw-r--r--  sysdeps/mach/hurd/symlinkat.c                 2
-rw-r--r--  sysdeps/mach/hurd/utime-helper.c             62
-rw-r--r--  sysdeps/mach/hurd/x86/trampoline.c          105
-rw-r--r--  sysdeps/mach/hurd/x86_64/bits/sigcontext.h    2
-rw-r--r--  sysdeps/mach/hurd/x86_64/sigreturn.c         35
-rw-r--r--  sysdeps/powerpc/powerpc64/dl-machine.h       15
-rw-r--r--  sysdeps/pthread/tst-stdio2.c                  2
-rw-r--r--  sysdeps/unix/sysv/linux/bits/fcntl-linux.h    2
-rw-r--r--  sysdeps/x86/cpu-features.c                  305
-rw-r--r--  sysdeps/x86_64/Makefile                       1
-rw-r--r--  sysdeps/x86_64/tst-auditmod10b.c            109
19 files changed, 591 insertions, 283 deletions
diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
index e86d8b0..cc267db 100644
--- a/sysdeps/aarch64/multiarch/memcpy_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
@@ -152,6 +152,46 @@ L(copy96):
.p2align 6
L(copy_long):
+ /* On oryon1 cores, large memcpy calls benefit from using ldnp/stnp.
+ This loop is identical to the one below it but uses ldnp/stnp
+ instructions.  For copies of fewer than 32768 bytes, the ldnp/stnp
+ instructions do not help and cause a slowdown, so only use the
+ ldnp/stnp loop for the largest sizes. */
+
+ cmp count, #32768
+ b.lo L(copy_long_without_nontemp)
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldnp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldnp A_l, A_h, [src, 16]
+ stnp D_l, D_h, [dstin]
+ ldnp B_l, B_h, [src, 32]
+ ldnp C_l, C_h, [src, 48]
+ ldnp D_l, D_h, [src, 64]
+ add src, src, #64
+ subs count, count, 128 + 16 /* Test and readjust count. */
+
+L(nontemp_loop64):
+ tbz src, #6, 1f
+1:
+ stnp A_l, A_h, [dst, 16]
+ ldnp A_l, A_h, [src, 16]
+ stnp B_l, B_h, [dst, 32]
+ ldnp B_l, B_h, [src, 32]
+ stnp C_l, C_h, [dst, 48]
+ ldnp C_l, C_h, [src, 48]
+ stnp D_l, D_h, [dst, 64]
+ ldnp D_l, D_h, [src, 64]
+ add src, src, #64
+ add dst, dst, #64
+ subs count, count, 64
+ b.hi L(nontemp_loop64)
+ b L(last64)
+
+L(copy_long_without_nontemp):
+
and tmp1, dstin, 15
bic dst, dstin, 15
ldp D_l, D_h, [src]
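
The dispatch added above reduces to a single size check. Below is a minimal C sketch of the same decision; the function names are illustrative and not part of glibc, and the non-temporal helper merely stands in for the L(nontemp_loop64) assembly loop.

#include <stddef.h>
#include <string.h>

/* Threshold taken from the `cmp count, #32768` check above.  */
#define ORYON1_NT_THRESHOLD 32768

/* Hypothetical stand-in for the ldnp/stnp loop; shown as a plain copy.  */
static void *
copy_nontemporal (void *dst, const void *src, size_t n)
{
  return memcpy (dst, src, n);
}

/* Equivalent of the new branch: smaller copies keep the existing
   ldp/stp path, only very large ones use non-temporal pairs.  */
void *
oryon1_copy (void *dst, const void *src, size_t n)
{
  if (n < ORYON1_NT_THRESHOLD)
    return memcpy (dst, src, n);        /* L(copy_long_without_nontemp) path */
  return copy_nontemporal (dst, src, n);
}
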
diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
index 0f9b718..88f4ef4 100644
--- a/sysdeps/aarch64/multiarch/memset_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
@@ -90,6 +90,8 @@ L(set_long):
cmp count, 256
ccmp valw, 0, 0, cs
b.eq L(try_zva)
+ cmp count, #32768
+ b.hi L(set_long_with_nontemp)
/* Small-size or non-zero memset does not use DC ZVA. */
sub count, dstend, dst
@@ -112,6 +114,30 @@ L(set_long):
stp val, val, [dstend, -16]
ret
+L(set_long_with_nontemp):
+ /* Small-size or non-zero memset does not use DC ZVA. */
+ sub count, dstend, dst
+
+ /* Adjust count and bias for the loop.  Subtracting an extra 1 from
+ count lets a tbz instruction check whether the remaining count after
+ the loop is less than 33 bytes, so 2 unnecessary stps can be bypassed. */
+ sub count, count, 64+16+1
+
+1: stnp val, val, [dst, 16]
+ stnp val, val, [dst, 32]
+ stnp val, val, [dst, 48]
+ stnp val, val, [dst, 64]
+ add dst, dst, #64
+ subs count, count, 64
+ b.hs 1b
+
+ tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
+ stnp val, val, [dst, 16]
+ stnp val, val, [dst, 32]
+1: stnp val, val, [dstend, -32]
+ stnp val, val, [dstend, -16]
+ ret
+
L(try_zva):
/* Write the first and last 64 byte aligned block using stp rather
than using DC ZVA as it is faster. */
diff --git a/sysdeps/mach/hurd/dup3.c b/sysdeps/mach/hurd/dup3.c
index 22af45b..49545ae 100644
--- a/sysdeps/mach/hurd/dup3.c
+++ b/sysdeps/mach/hurd/dup3.c
@@ -69,6 +69,7 @@ __dup3 (int fd, int fd2, int flags)
{
/* Get a hold of the destination descriptor. */
struct hurd_fd *d2;
+ error_t err;
__mutex_lock (&_hurd_dtable_lock);
@@ -107,22 +108,51 @@ __dup3 (int fd, int fd2, int flags)
}
else
{
- /* Give the ports each a user ref for the new descriptor. */
- __mach_port_mod_refs (__mach_task_self (), port,
- MACH_PORT_RIGHT_SEND, 1);
- if (ctty != MACH_PORT_NULL)
- __mach_port_mod_refs (__mach_task_self (), ctty,
- MACH_PORT_RIGHT_SEND, 1);
-
- /* Install the ports and flags in the new descriptor slot. */
- __spin_lock (&d2->port.lock);
- if (flags & O_CLOEXEC)
- d2->flags = d_flags | FD_CLOEXEC;
- else
- /* dup clears FD_CLOEXEC. */
- d2->flags = d_flags & ~FD_CLOEXEC;
- _hurd_port_set (&d2->ctty, ctty);
- _hurd_port_locked_set (&d2->port, port); /* Unlocks D2. */
+ /* Give the io server port a user ref for the new descriptor. */
+ err = __mach_port_mod_refs (__mach_task_self (), port,
+ MACH_PORT_RIGHT_SEND, 1);
+
+ if (err == KERN_UREFS_OVERFLOW)
+ fd2 = __hurd_fail (EMFILE);
+ else if (err)
+ fd2 = __hurd_fail (EINVAL);
+ else if (ctty != MACH_PORT_NULL)
+ {
+ /* The io server port now holds a user ref for the new
+ descriptor; give the ctty port a user ref for it as
+ well. */
+ err = __mach_port_mod_refs (__mach_task_self (), ctty,
+ MACH_PORT_RIGHT_SEND, 1);
+
+ if (err)
+ {
+ /* The io server port got its ref but the ctty port did
+ not, so drop the ref we just added to the io server
+ port. */
+ __mach_port_mod_refs (__mach_task_self (), port,
+ MACH_PORT_RIGHT_SEND, -1);
+
+ if (err == KERN_UREFS_OVERFLOW)
+ fd2 = __hurd_fail (EMFILE);
+ else
+ fd2 = __hurd_fail (EINVAL);
+ }
+ }
+
+ if (!err)
+ {
+ /* The ref counts of the ports are incremented
+ successfully. */
+ /* Install the ports and flags in the new descriptor slot. */
+ __spin_lock (&d2->port.lock);
+ if (flags & O_CLOEXEC)
+ d2->flags = d_flags | FD_CLOEXEC;
+ else
+ /* dup clears FD_CLOEXEC. */
+ d2->flags = d_flags & ~FD_CLOEXEC;
+ _hurd_port_set (&d2->ctty, ctty);
+ _hurd_port_locked_set (&d2->port, port); /* Unlocks D2. */
+ }
}
}
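
The same take-a-ref-then-roll-back pattern is used again in fcntl.c below. A compact C sketch of the logic follows; the wrapper name is hypothetical, it assumes a Hurd/Mach build environment, and it uses the public Mach names where the diff uses the glibc-internal __mach_* variants.

#define _GNU_SOURCE
#include <errno.h>
#include <mach.h>

/* Take a user ref on the io server port, then on the ctty port, and
   undo the first ref if the second one cannot be obtained.  */
static error_t
take_descriptor_refs (mach_port_t port, mach_port_t ctty)
{
  error_t err = mach_port_mod_refs (mach_task_self (), port,
                                    MACH_PORT_RIGHT_SEND, 1);
  if (err)
    return err == KERN_UREFS_OVERFLOW ? EMFILE : EINVAL;

  if (ctty != MACH_PORT_NULL)
    {
      err = mach_port_mod_refs (mach_task_self (), ctty,
                                MACH_PORT_RIGHT_SEND, 1);
      if (err)
        {
          /* Drop the ref just taken on the io server port so nothing
             leaks.  */
          mach_port_mod_refs (mach_task_self (), port,
                              MACH_PORT_RIGHT_SEND, -1);
          return err == KERN_UREFS_OVERFLOW ? EMFILE : EINVAL;
        }
    }
  return 0;
}
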
diff --git a/sysdeps/mach/hurd/fcntl.c b/sysdeps/mach/hurd/fcntl.c
index a65c190..de576af 100644
--- a/sysdeps/mach/hurd/fcntl.c
+++ b/sysdeps/mach/hurd/fcntl.c
@@ -83,18 +83,47 @@ __libc_fcntl (int fd, int cmd, ...)
result = -1;
else
{
- /* Give the ports each a user ref for the new descriptor. */
- __mach_port_mod_refs (__mach_task_self (), port,
- MACH_PORT_RIGHT_SEND, 1);
- if (ctty != MACH_PORT_NULL)
- __mach_port_mod_refs (__mach_task_self (), ctty,
- MACH_PORT_RIGHT_SEND, 1);
-
- /* Install the ports and flags in the new descriptor. */
- if (ctty != MACH_PORT_NULL)
- _hurd_port_set (&new->ctty, ctty);
- new->flags = flags;
- _hurd_port_locked_set (&new->port, port); /* Unlocks NEW. */
+ /* Give the io server port a user ref for the new descriptor. */
+ err = __mach_port_mod_refs (__mach_task_self (), port,
+ MACH_PORT_RIGHT_SEND, 1);
+
+ if (err == KERN_UREFS_OVERFLOW)
+ result = __hurd_fail (EMFILE);
+ else if (err)
+ result = __hurd_fail (EINVAL);
+ else if (ctty != MACH_PORT_NULL)
+ {
+ /* The io server port now holds a user ref for the new
+ descriptor; give the ctty port a user ref for it as
+ well. */
+ err = __mach_port_mod_refs (__mach_task_self (), ctty,
+ MACH_PORT_RIGHT_SEND, 1);
+
+ if (err)
+ {
+ /* The io server port got its ref but the ctty port did
+ not, so drop the ref we just added to the io server
+ port. */
+ __mach_port_mod_refs (__mach_task_self (), port,
+ MACH_PORT_RIGHT_SEND, -1);
+
+ if (err == KERN_UREFS_OVERFLOW)
+ result = __hurd_fail (EMFILE);
+ else
+ result = __hurd_fail (EINVAL);
+ }
+ }
+
+ if (!err)
+ {
+ /* The ref counts of the ports are incremented successfully. */
+ /* Install the ports and flags in the new descriptor. */
+ if (ctty != MACH_PORT_NULL)
+ _hurd_port_set (&new->ctty, ctty);
+ new->flags = flags;
+ /* Unlocks NEW. */
+ _hurd_port_locked_set (&new->port, port);
+ }
}
HURD_CRITICAL_END;
diff --git a/sysdeps/mach/hurd/futimens.c b/sysdeps/mach/hurd/futimens.c
index 30ef0a6..1212529 100644
--- a/sysdeps/mach/hurd/futimens.c
+++ b/sysdeps/mach/hurd/futimens.c
@@ -32,7 +32,9 @@ __futimens (int fd, const struct timespec tsp[2])
struct timespec atime, mtime;
error_t err;
- utime_ts_from_tspec (tsp, &atime, &mtime);
+ err = utime_ts_from_tspec (tsp, &atime, &mtime);
+ if (err)
+ return err;
err = HURD_DPORT_USE (fd, __file_utimens (port, atime, mtime));
@@ -40,7 +42,9 @@ __futimens (int fd, const struct timespec tsp[2])
{
time_value_t atim, mtim;
- utime_tvalue_from_tspec (tsp, &atim, &mtim);
+ err = utime_tvalue_from_tspec (tsp, &atim, &mtim);
+ if (err)
+ return err;
err = HURD_DPORT_USE (fd, __file_utimes (port, atim, mtim));
}
diff --git a/sysdeps/mach/hurd/futimes.c b/sysdeps/mach/hurd/futimes.c
index 20f47f3..97385d7 100644
--- a/sysdeps/mach/hurd/futimes.c
+++ b/sysdeps/mach/hurd/futimes.c
@@ -32,7 +32,9 @@ __futimes (int fd, const struct timeval tvp[2])
struct timespec atime, mtime;
error_t err;
- utime_ts_from_tval (tvp, &atime, &mtime);
+ err = utime_ts_from_tval (tvp, &atime, &mtime);
+ if (err)
+ return err;
err = HURD_DPORT_USE (fd, __file_utimens (port, atime, mtime));
@@ -40,7 +42,9 @@ __futimes (int fd, const struct timeval tvp[2])
{
time_value_t atim, mtim;
- utime_tvalue_from_tval (tvp, &atim, &mtim);
+ err = utime_tvalue_from_tval (tvp, &atim, &mtim);
+ if (err)
+ return err;
err = HURD_DPORT_USE (fd, __file_utimes (port, atim, mtim));
}
diff --git a/sysdeps/mach/hurd/i386/bits/sigcontext.h b/sysdeps/mach/hurd/i386/bits/sigcontext.h
index 6e5e220..c44e4de 100644
--- a/sysdeps/mach/hurd/i386/bits/sigcontext.h
+++ b/sysdeps/mach/hurd/i386/bits/sigcontext.h
@@ -88,6 +88,8 @@ struct sigcontext
struct i386_fp_save sc_fpsave;
struct i386_fp_regs sc_fpregs;
int sc_fpexcsr; /* FPSR including exception bits. */
+
+ struct i386_xfloat_state *xstate;
};
/* Traditional BSD names for some members. */
diff --git a/sysdeps/mach/hurd/i386/sigreturn.c b/sysdeps/mach/hurd/i386/sigreturn.c
index ce8df8d..dc57d61 100644
--- a/sysdeps/mach/hurd/i386/sigreturn.c
+++ b/sysdeps/mach/hurd/i386/sigreturn.c
@@ -21,6 +21,8 @@
#include <stdlib.h>
#include <string.h>
+#include <cpuid.h>
+
/* This is run on the thread stack after restoring it, to be able to
unlock SS off sigstack. */
static void
@@ -123,10 +125,35 @@ __sigreturn (struct sigcontext *scp)
if (scp->sc_onstack)
ss->sigaltstack.ss_flags &= ~SS_ONSTACK;
- if (scp->sc_fpused)
- /* Restore the FPU state. Mach conveniently stores the state
- in the format the i387 `frstor' instruction uses to restore it. */
- asm volatile ("frstor %0" : : "m" (scp->sc_fpsave));
+#ifdef i386_XFLOAT_STATE
+ if (scp->xstate)
+ {
+ if (scp->xstate->initialized)
+ {
+ unsigned eax, ebx, ecx, edx;
+ __cpuid_count(0xd, 0, eax, ebx, ecx, edx);
+ switch (scp->xstate->fp_save_kind)
+ {
+ case 0: // FNSAVE
+ asm volatile("frstor %0" : : "m" (scp->xstate->hw_state));
+ break;
+ case 1: // FXSAVE
+ asm volatile("fxrstor %0" : : "m" (scp->xstate->hw_state), \
+ "a" (eax), "d" (edx));
+ break;
+ default: // XSAVE, XSAVEOPT, XSAVEC, XSAVES
+ asm volatile("xrstor %0" : : "m" (scp->xstate->hw_state), \
+ "a" (eax), "d" (edx));
+ break;
+ }
+ }
+ }
+ else
+#endif
+ if (scp->sc_fpused)
+ /* Restore the FPU state. Mach conveniently stores the state
+ in the format the i387 `frstor' instruction uses to restore it. */
+ asm volatile ("frstor %0" : : "m" (scp->sc_fpsave));
{
/* There are convenient instructions to pop state off the stack, so we
diff --git a/sysdeps/mach/hurd/symlinkat.c b/sysdeps/mach/hurd/symlinkat.c
index e7dfb67..cb6250e 100644
--- a/sysdeps/mach/hurd/symlinkat.c
+++ b/sysdeps/mach/hurd/symlinkat.c
@@ -47,7 +47,7 @@ __symlinkat (const char *from, int fd, const char *to)
if (! *name)
/* Can't link to the existing directory itself. */
- err = ENOTDIR;
+ err = EEXIST;
else
/* Create a new, unlinked node in the target directory. */
err = __dir_mkfile (dir, O_WRITE, 0777 & ~_hurd_umask, &node);
diff --git a/sysdeps/mach/hurd/utime-helper.c b/sysdeps/mach/hurd/utime-helper.c
index d88bccd..6afa871 100644
--- a/sysdeps/mach/hurd/utime-helper.c
+++ b/sysdeps/mach/hurd/utime-helper.c
@@ -21,8 +21,14 @@
#include <stddef.h>
#include <sys/time.h>
+static inline bool
+check_tval (const struct timeval *tvp)
+{
+ return tvp->tv_usec >= 0 && tvp->tv_usec < USEC_PER_SEC;
+}
+
/* Initializes atime/mtime timespec structures from an array of timeval. */
-static inline void
+static inline error_t
utime_ts_from_tval (const struct timeval tvp[2],
struct timespec *atime, struct timespec *mtime)
{
@@ -37,13 +43,19 @@ utime_ts_from_tval (const struct timeval tvp[2],
}
else
{
+ if (!check_tval (&tvp[0]))
+ return EINVAL;
+ if (!check_tval (&tvp[1]))
+ return EINVAL;
+
TIMEVAL_TO_TIMESPEC (&tvp[0], atime);
TIMEVAL_TO_TIMESPEC (&tvp[1], mtime);
}
+ return 0;
}
/* Initializes atime/mtime time_value_t structures from an array of timeval. */
-static inline void
+static inline error_t
utime_tvalue_from_tval (const struct timeval tvp[2],
time_value_t *atime, time_value_t *mtime)
{
@@ -53,11 +65,17 @@ utime_tvalue_from_tval (const struct timeval tvp[2],
atime->microseconds = mtime->microseconds = -1;
else
{
+ if (!check_tval (&tvp[0]))
+ return EINVAL;
+ if (!check_tval (&tvp[1]))
+ return EINVAL;
+
atime->seconds = tvp[0].tv_sec;
atime->microseconds = tvp[0].tv_usec;
mtime->seconds = tvp[1].tv_sec;
mtime->microseconds = tvp[1].tv_usec;
}
+ return 0;
}
/* Changes the access time of the file behind PORT using a timeval array. */
@@ -67,7 +85,9 @@ hurd_futimes (const file_t port, const struct timeval tvp[2])
error_t err;
struct timespec atime, mtime;
- utime_ts_from_tval (tvp, &atime, &mtime);
+ err = utime_ts_from_tval (tvp, &atime, &mtime);
+ if (err)
+ return err;
err = __file_utimens (port, atime, mtime);
@@ -75,7 +95,9 @@ hurd_futimes (const file_t port, const struct timeval tvp[2])
{
time_value_t atim, mtim;
- utime_tvalue_from_tval (tvp, &atim, &mtim);
+ err = utime_tvalue_from_tval (tvp, &atim, &mtim);
+ if (err)
+ return err;
err = __file_utimes (port, atim, mtim);
}
@@ -83,8 +105,16 @@ hurd_futimes (const file_t port, const struct timeval tvp[2])
return err;
}
+static inline bool
+check_tspec (const struct timespec *tsp)
+{
+ return tsp->tv_nsec == UTIME_NOW
+ || tsp->tv_nsec == UTIME_OMIT
+ || (tsp->tv_nsec >= 0 && tsp->tv_nsec < NSEC_PER_SEC);
+}
+
/* Initializes atime/mtime timespec structures from an array of timespec. */
-static inline void
+static inline error_t
utime_ts_from_tspec (const struct timespec tsp[2],
struct timespec *atime, struct timespec *mtime)
{
@@ -99,13 +129,19 @@ utime_ts_from_tspec (const struct timespec tsp[2],
}
else
{
+ if (!check_tspec (&tsp[0]))
+ return EINVAL;
+ if (!check_tspec (&tsp[1]))
+ return EINVAL;
+
*atime = tsp[0];
*mtime = tsp[1];
}
+ return 0;
}
/* Initializes atime/mtime time_value_t structures from an array of timespec. */
-static inline void
+static inline error_t
utime_tvalue_from_tspec (const struct timespec tsp[2],
time_value_t *atime, time_value_t *mtime)
{
@@ -115,6 +151,11 @@ utime_tvalue_from_tspec (const struct timespec tsp[2],
atime->microseconds = mtime->microseconds = -1;
else
{
+ if (!check_tspec (&tsp[0]))
+ return EINVAL;
+ if (!check_tspec (&tsp[1]))
+ return EINVAL;
+
if (tsp[0].tv_nsec == UTIME_NOW)
atime->microseconds = -1;
else if (tsp[0].tv_nsec == UTIME_OMIT)
@@ -128,6 +169,7 @@ utime_tvalue_from_tspec (const struct timespec tsp[2],
else
TIMESPEC_TO_TIME_VALUE (mtime, &(tsp[1]));
}
+ return 0;
}
/* Changes the access time of the file behind PORT using a timespec array. */
@@ -137,7 +179,9 @@ hurd_futimens (const file_t port, const struct timespec tsp[2])
error_t err;
struct timespec atime, mtime;
- utime_ts_from_tspec (tsp, &atime, &mtime);
+ err = utime_ts_from_tspec (tsp, &atime, &mtime);
+ if (err)
+ return err;
err = __file_utimens (port, atime, mtime);
@@ -145,7 +189,9 @@ hurd_futimens (const file_t port, const struct timespec tsp[2])
{
time_value_t atim, mtim;
- utime_tvalue_from_tspec (tsp, &atim, &mtim);
+ err = utime_tvalue_from_tspec (tsp, &atim, &mtim);
+ if (err)
+ return err;
err = __file_utimes (port, atim, mtim);
}
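
With the new check_tval/check_tspec helpers, out-of-range timestamps are rejected before any RPC is made. The short program below illustrates the effect; it assumes a system where these Hurd code paths are in use and deliberately does not assert the exact errno value.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

int
main (void)
{
  /* tv_usec must be in [0, 1000000); 1000000 is out of range, so the
     call is now expected to be rejected rather than forwarded.  */
  struct timeval tv[2] =
    {
      { .tv_sec = 0, .tv_usec = 1000000 },  /* invalid access time */
      { .tv_sec = 0, .tv_usec = 0 }         /* valid modification time */
    };

  if (futimes (STDOUT_FILENO, tv) < 0)
    printf ("futimes rejected the bad timeval: %s\n", strerror (errno));
  return 0;
}
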
diff --git a/sysdeps/mach/hurd/x86/trampoline.c b/sysdeps/mach/hurd/x86/trampoline.c
index 8e2890f..6f23c56 100644
--- a/sysdeps/mach/hurd/x86/trampoline.c
+++ b/sysdeps/mach/hurd/x86/trampoline.c
@@ -26,7 +26,11 @@
#include "hurdfault.h"
#include <intr-msg.h>
#include <sys/ucontext.h>
-
+#ifdef __x86_64__
+#include <mach/x86_64/mach_i386.h>
+#else
+#include <mach/i386/mach_i386.h>
+#endif
/* Fill in a siginfo_t structure for SA_SIGINFO-enabled handlers. */
static void fill_siginfo (siginfo_t *si, int signo,
@@ -106,6 +110,7 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action
void firewall (void);
void *sigsp;
struct sigcontext *scp;
+ vm_size_t xstate_size;
struct
{
union
@@ -145,6 +150,14 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action
struct hurd_userlink link;
ucontext_t ucontext;
siginfo_t siginfo;
+#ifdef __x86_64__
+ char _pad2[56];
+#else
+ char _pad2[20];
+#endif
+ char xstate[];
+ /* Don't add anything after xstate, as it's dynamically
+ sized. */
} *stackframe;
#ifdef __x86_64__
@@ -170,6 +183,17 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action
if (! machine_get_basic_state (ss->thread, state))
return NULL;
+ /* Initialize the size of the CPU extended state, to be saved during
+ * signal handling */
+#ifdef i386_XFLOAT_STATE
+ _Static_assert ((sizeof(*stackframe) + sizeof(struct i386_xfloat_state)) % 64 == 0,
+ "stackframe size must be multiple of 64-byte minus "
+ "sizeof(struct i386_xfloat_state), please adjust _pad2");
+
+ if (__i386_get_xstate_size(__mach_host_self(), &xstate_size))
+#endif
+ xstate_size = 0;
+
/* Save the original SP in the gratuitous `esp' slot.
We may need to reset the SP (the `uesp' slot) to avoid clobbering an
interrupted RPC frame. */
@@ -196,14 +220,21 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action
#endif
}
- /* Push the arguments to call `trampoline' on the stack. */
- sigsp -= sizeof (*stackframe);
-#ifdef __x86_64__
- /* Align SP at 16 bytes. Coupled with the fact that sigreturn_addr is
- 16-byte aligned within the stackframe struct, this ensures that it ends
- up on a 16-byte aligned address, as required by the ABI. */
- sigsp = (void *) ((uintptr_t) sigsp & ~15UL);
-#endif
+ /* Push the arguments to call `trampoline' on the stack.
+ * The extended state might have a variable size depending on the platform,
+ * so we dynamically allocate it on the stack frame.*/
+ sigsp -= sizeof (*stackframe) + xstate_size;
+
+ /* Align SP at 64 bytes. This is needed for two reasons:
+ * - sigreturn_addr is 16-byte aligned within the stackframe
+ * struct, and this ensures that it ends up on a 16-byte aligned
+ * address, as required by the ABI.
+ * - the XSAVE state needs to be aligned at 64 bytes (on both i386 and
+ * x86_64), so we align the stackframe also at 64 bytes and add the
+ * required padding at the end, see the _pad2 field.
+ */
+ sigsp = (void *) ((uintptr_t) sigsp & ~63UL);
+
stackframe = sigsp;
if (_hurdsig_catch_memory_fault (stackframe))
@@ -248,14 +279,40 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action
memcpy (&scp->sc_i386_thread_state,
&state->basic, sizeof (state->basic));
- /* struct sigcontext is laid out so that starting at sc_fpkind mimics
- a struct i386_float_state. */
- _Static_assert (offsetof (struct sigcontext, sc_i386_float_state)
- % __alignof__ (struct i386_float_state) == 0,
- "sc_i386_float_state layout mismatch");
- ok = machine_get_state (ss->thread, state, i386_FLOAT_STATE,
- &state->fpu, &scp->sc_i386_float_state,
- sizeof (state->fpu));
+ scp->xstate = NULL;
+#ifdef i386_XFLOAT_STATE
+ if (xstate_size > 0)
+ {
+ mach_msg_type_number_t got = (xstate_size / sizeof (int));
+
+ ok = (! __thread_get_state (ss->thread, i386_XFLOAT_STATE,
+ (thread_state_t) stackframe->xstate, &got)
+ && got == (xstate_size / sizeof (int)));
+
+ if (ok && ((struct i386_xfloat_state*) stackframe->xstate)->fp_save_kind > 5)
+ /* We support up to XSAVES */
+ ok = 0;
+
+ if (ok)
+ {
+ scp->xstate = (struct i386_xfloat_state*) stackframe->xstate;
+ assert((uintptr_t)scp->xstate->hw_state % 64 == 0);
+ }
+ }
+ else
+#endif
+ ok = 0;
+ if (!ok)
+ {
+ /* struct sigcontext is laid out so that starting at sc_fpkind mimics
+ a struct i386_float_state. */
+ _Static_assert (offsetof (struct sigcontext, sc_i386_float_state)
+ % __alignof__ (struct i386_float_state) == 0,
+ "sc_i386_float_state layout mismatch");
+ ok = machine_get_state (ss->thread, state, i386_FLOAT_STATE,
+ &state->fpu, &scp->sc_i386_float_state,
+ sizeof (state->fpu));
+ }
/* Set up the arguments for the signal handler. */
stackframe->signo = signo;
@@ -404,7 +461,10 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action
- in gdb: gdb/i386-gnu-tdep.c gnu_sigtramp_code. */
#ifdef __x86_64__
-asm ("rpc_wait_trampoline:\n"
+asm ("trampoline:\n"
+ "fnclex\n" /* Clear any pending exception. */
+ "jmp _trampoline\n"
+ "rpc_wait_trampoline:\n"
/* This is the entry point when we have an RPC reply message to receive
before running the handler. The MACH_MSG_SEND bit has already been
cleared in the OPTION argument in our %rsi. The interrupted user
@@ -423,7 +483,7 @@ asm ("rpc_wait_trampoline:\n"
/* Switch to the signal stack. */
"movq %rbx, %rsp\n"
- "trampoline:\n"
+ "_trampoline:\n"
/* Entry point for running the handler normally. The arguments to the
handler function are on the top of the stack, same as in the i386
version:
@@ -449,7 +509,10 @@ asm ("rpc_wait_trampoline:\n"
"movq 16(%rsp), %rdi\n"
"ret");
#else
-asm ("rpc_wait_trampoline:\n");
+asm ("trampoline:\n"
+ "fnclex\n" /* Clear any pending exception. */
+ "jmp _trampoline\n"
+ "rpc_wait_trampoline:\n");
/* This is the entry point when we have an RPC reply message to receive
before running the handler. The MACH_MSG_SEND bit has already been
cleared in the OPTION argument on our stack. The interrupted user
@@ -469,7 +532,7 @@ asm (/* Retry the interrupted mach_msg system call. */
/* Switch to the signal stack. */
"movl %ebx, %esp\n");
- asm ("trampoline:\n");
+asm ("_trampoline:\n");
/* Entry point for running the handler normally. The arguments to the
handler function are already on the top of the stack:
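
As the comment in the hunk above notes, rounding the signal stack pointer down to a 64-byte boundary satisfies both the XSAVE alignment and the 16-byte ABI alignment, since the latter is implied by the former. A self-contained check of the same mask operation (the sample address is arbitrary):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uintptr_t sigsp = 0x7ffe1234u;       /* arbitrary example stack pointer */
  sigsp &= ~(uintptr_t) 63;            /* same rounding as in the diff */
  assert (sigsp % 64 == 0);            /* XSAVE area alignment */
  assert (sigsp % 16 == 0);            /* ABI stack alignment follows */
  return 0;
}
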
diff --git a/sysdeps/mach/hurd/x86_64/bits/sigcontext.h b/sysdeps/mach/hurd/x86_64/bits/sigcontext.h
index 7bac881..d83795f 100644
--- a/sysdeps/mach/hurd/x86_64/bits/sigcontext.h
+++ b/sysdeps/mach/hurd/x86_64/bits/sigcontext.h
@@ -96,6 +96,8 @@ struct sigcontext
struct i386_fp_save sc_fpsave;
struct i386_fp_regs sc_fpregs;
int sc_fpexcsr; /* FPSR including exception bits. */
+
+ struct i386_xfloat_state *xstate;
};
/* Traditional BSD names for some members. */
diff --git a/sysdeps/mach/hurd/x86_64/sigreturn.c b/sysdeps/mach/hurd/x86_64/sigreturn.c
index 81a2d3b..773c00f 100644
--- a/sysdeps/mach/hurd/x86_64/sigreturn.c
+++ b/sysdeps/mach/hurd/x86_64/sigreturn.c
@@ -20,6 +20,8 @@
#include <hurd/msg.h>
#include <stdlib.h>
+#include <cpuid.h>
+
/* This is run on the thread stack after restoring it, to be able to
unlock SS off sigstack. */
void
@@ -116,10 +118,35 @@ __sigreturn (struct sigcontext *scp)
if (scp->sc_onstack)
ss->sigaltstack.ss_flags &= ~SS_ONSTACK;
- if (scp->sc_fpused)
- /* Restore the FPU state. Mach conveniently stores the state
- in the format the i387 `frstor' instruction uses to restore it. */
- asm volatile ("frstor %0" : : "m" (scp->sc_fpsave));
+#ifdef i386_XFLOAT_STATE
+ if (scp->xstate)
+ {
+ if (scp->xstate->initialized)
+ {
+ unsigned eax, ebx, ecx, edx;
+ __cpuid_count(0xd, 0, eax, ebx, ecx, edx);
+ switch (scp->xstate->fp_save_kind)
+ {
+ case 0: // FNSAVE
+ asm volatile("frstor %0" : : "m" (scp->xstate->hw_state));
+ break;
+ case 1: // FXSAVE
+ asm volatile("fxrstor %0" : : "m" (scp->xstate->hw_state), \
+ "a" (eax), "d" (edx));
+ break;
+ default: // XSAVE, XSAVEOPT, XSAVEC, XSAVES
+ asm volatile("xrstor %0" : : "m" (scp->xstate->hw_state), \
+ "a" (eax), "d" (edx));
+ break;
+ }
+ }
+ }
+ else
+#endif
+ if (scp->sc_fpused)
+ /* Restore the FPU state. Mach conveniently stores the state
+ in the format the i387 `frstor' instruction uses to restore it. */
+ asm volatile ("frstor %0" : : "m" (scp->sc_fpsave));
/* Copy the registers onto the user's stack, to be able to release the
altstack (by unlocking sigstate). Note that unless an altstack is used,
diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
index d8d7c8b..89e26bb 100644
--- a/sysdeps/powerpc/powerpc64/dl-machine.h
+++ b/sysdeps/powerpc/powerpc64/dl-machine.h
@@ -363,7 +363,6 @@ elf_machine_runtime_setup (struct link_map *map, struct r_scope_elem *scope[],
/ sizeof (Elf64_Rela));
Elf64_Addr l_addr = map->l_addr;
Elf64_Dyn **info = map->l_info;
- char *p;
extern void _dl_runtime_resolve (void);
extern void _dl_profile_resolve (void);
@@ -435,20 +434,6 @@ elf_machine_runtime_setup (struct link_map *map, struct r_scope_elem *scope[],
offset += PLT_ENTRY_WORDS;
glink_offset += GLINK_ENTRY_WORDS (i);
}
-
- /* Now, we've modified data. We need to write the changes from
- the data cache to a second-level unified cache, then make
- sure that stale data in the instruction cache is removed.
- (In a multiprocessor system, the effect is more complex.)
- Most of the PLT shouldn't be in the instruction cache, but
- there may be a little overlap at the start and the end.
-
- Assumes that dcbst and icbi apply to lines of 16 bytes or
- more. Current known line sizes are 16, 32, and 128 bytes. */
-
- for (p = (char *) plt; p < (char *) &plt[offset]; p += 16)
- PPC_DCBST (p);
- PPC_SYNC;
}
}
return lazy;
diff --git a/sysdeps/pthread/tst-stdio2.c b/sysdeps/pthread/tst-stdio2.c
index 08948cb..0876ed6 100644
--- a/sysdeps/pthread/tst-stdio2.c
+++ b/sysdeps/pthread/tst-stdio2.c
@@ -75,7 +75,7 @@ do_test (void)
exit (1);
}
- puts ("join returned succsefully");
+ puts ("join returned successfully");
return 0;
}
diff --git a/sysdeps/unix/sysv/linux/bits/fcntl-linux.h b/sysdeps/unix/sysv/linux/bits/fcntl-linux.h
index dfc554a..f425a4b 100644
--- a/sysdeps/unix/sysv/linux/bits/fcntl-linux.h
+++ b/sysdeps/unix/sysv/linux/bits/fcntl-linux.h
@@ -379,6 +379,8 @@ struct file_handle
identity and may not
be usable to
open_by_handle_at. */
+# define AT_HANDLE_MNT_ID_UNIQUE 1 /* Return the 64-bit unique mount
+ ID. */
#endif
__BEGIN_DECLS
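
A hedged usage sketch of the new AT_HANDLE_MNT_ID_UNIQUE flag follows. It assumes a kernel that implements the flag and that, when the flag is set, a 64-bit unique mount ID is written through the mount-ID argument (hence the uint64_t passed via the int * parameter); treat it as illustrative rather than authoritative.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  struct file_handle *fh;
  uint64_t mnt_id = 0;

  /* Allocate the largest handle the kernel may return.  */
  fh = malloc (sizeof (*fh) + MAX_HANDLE_SZ);
  if (fh == NULL)
    return 1;
  fh->handle_bytes = MAX_HANDLE_SZ;

  /* Assumption: with AT_HANDLE_MNT_ID_UNIQUE the kernel fills a 64-bit
     unique mount ID, so a uint64_t is passed through the int * slot.  */
  if (name_to_handle_at (AT_FDCWD, "/", fh, (int *) &mnt_id,
                         AT_HANDLE_MNT_ID_UNIQUE) == 0)
    printf ("unique mount id of /: %llu\n", (unsigned long long) mnt_id);
  else
    perror ("name_to_handle_at");

  free (fh);
  return 0;
}
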
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 9d136e4..e50f1d6 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -502,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load
"Incorrect index_arch_Fast_Unaligned_Load");
-/* Intel Family-6 microarch list. */
-enum
+/* Intel microarch list. */
+enum intel_microarch
{
/* Atom processors. */
INTEL_ATOM_BONNELL,
@@ -512,6 +512,7 @@ enum
INTEL_ATOM_GOLDMONT,
INTEL_ATOM_GOLDMONT_PLUS,
INTEL_ATOM_SIERRAFOREST,
+ INTEL_ATOM_CLEARWATERFOREST,
INTEL_ATOM_GRANDRIDGE,
INTEL_ATOM_TREMONT,
@@ -539,7 +540,9 @@ enum
INTEL_BIGCORE_METEORLAKE,
INTEL_BIGCORE_LUNARLAKE,
INTEL_BIGCORE_ARROWLAKE,
+ INTEL_BIGCORE_PANTHERLAKE,
INTEL_BIGCORE_GRANITERAPIDS,
+ INTEL_BIGCORE_DIAMONDRAPIDS,
/* Mixed (bigcore + atom SOC). */
INTEL_MIXED_LAKEFIELD,
@@ -553,7 +556,7 @@ enum
INTEL_UNKNOWN,
};
-static unsigned int
+static enum intel_microarch
intel_get_fam6_microarch (unsigned int model,
__attribute__ ((unused)) unsigned int stepping)
{
@@ -584,6 +587,8 @@ intel_get_fam6_microarch (unsigned int model,
return INTEL_ATOM_GOLDMONT_PLUS;
case 0xAF:
return INTEL_ATOM_SIERRAFOREST;
+ case 0xDD:
+ return INTEL_ATOM_CLEARWATERFOREST;
case 0xB6:
return INTEL_ATOM_GRANDRIDGE;
case 0x86:
@@ -691,8 +696,12 @@ intel_get_fam6_microarch (unsigned int model,
return INTEL_BIGCORE_METEORLAKE;
case 0xbd:
return INTEL_BIGCORE_LUNARLAKE;
+ case 0xb5:
+ case 0xc5:
case 0xc6:
return INTEL_BIGCORE_ARROWLAKE;
+ case 0xCC:
+ return INTEL_BIGCORE_PANTHERLAKE;
case 0xAD:
case 0xAE:
return INTEL_BIGCORE_GRANITERAPIDS;
@@ -756,133 +765,20 @@ init_cpu_features (struct cpu_features *cpu_features)
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
&= ~bit_arch_Avoid_Non_Temporal_Memset;
+ enum intel_microarch microarch = INTEL_UNKNOWN;
if (family == 0x06)
{
model += extended_model;
- unsigned int microarch
- = intel_get_fam6_microarch (model, stepping);
+ microarch = intel_get_fam6_microarch (model, stepping);
+ /* Disable TSX on some processors to avoid TSX on kernels that
+ weren't updated with the latest microcode package (which
+ disables the broken feature by default). */
switch (microarch)
{
- /* Atom / KNL tuning. */
- case INTEL_ATOM_BONNELL:
- /* BSF is slow on Bonnell. */
- cpu_features->preferred[index_arch_Slow_BSF]
- |= bit_arch_Slow_BSF;
- break;
-
- /* Unaligned load versions are faster than SSSE3
- on Airmont, Silvermont, Goldmont, and Goldmont Plus. */
- case INTEL_ATOM_AIRMONT:
- case INTEL_ATOM_SILVERMONT:
- case INTEL_ATOM_GOLDMONT:
- case INTEL_ATOM_GOLDMONT_PLUS:
-
- /* Knights Landing. Enable Silvermont optimizations. */
- case INTEL_KNIGHTS_LANDING:
-
- cpu_features->preferred[index_arch_Fast_Unaligned_Load]
- |= (bit_arch_Fast_Unaligned_Load
- | bit_arch_Fast_Unaligned_Copy
- | bit_arch_Prefer_PMINUB_for_stringop
- | bit_arch_Slow_SSE4_2);
- break;
-
- case INTEL_ATOM_TREMONT:
- /* Enable rep string instructions, unaligned load, unaligned
- copy, pminub and avoid SSE 4.2 on Tremont. */
- cpu_features->preferred[index_arch_Fast_Rep_String]
- |= (bit_arch_Fast_Rep_String
- | bit_arch_Fast_Unaligned_Load
- | bit_arch_Fast_Unaligned_Copy
- | bit_arch_Prefer_PMINUB_for_stringop
- | bit_arch_Slow_SSE4_2);
- break;
-
- /*
- Default tuned Knights microarch.
- case INTEL_KNIGHTS_MILL:
- */
-
- /*
- Default tuned atom microarch.
- case INTEL_ATOM_SIERRAFOREST:
- case INTEL_ATOM_GRANDRIDGE:
- */
-
- /* Bigcore/Default Tuning. */
default:
- default_tuning:
- /* Unknown family 0x06 processors. Assuming this is one
- of Core i3/i5/i7 processors if AVX is available. */
- if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
- break;
-
- enable_modern_features:
- /* Rep string instructions, unaligned load, unaligned copy,
- and pminub are fast on Intel Core i3, i5 and i7. */
- cpu_features->preferred[index_arch_Fast_Rep_String]
- |= (bit_arch_Fast_Rep_String
- | bit_arch_Fast_Unaligned_Load
- | bit_arch_Fast_Unaligned_Copy
- | bit_arch_Prefer_PMINUB_for_stringop);
break;
- case INTEL_BIGCORE_NEHALEM:
- case INTEL_BIGCORE_WESTMERE:
- /* Older CPUs prefer non-temporal stores at lower threshold. */
- cpu_features->cachesize_non_temporal_divisor = 8;
- goto enable_modern_features;
-
- /* Older Bigcore microarch (smaller non-temporal store
- threshold). */
- case INTEL_BIGCORE_SANDYBRIDGE:
- case INTEL_BIGCORE_IVYBRIDGE:
- case INTEL_BIGCORE_HASWELL:
- case INTEL_BIGCORE_BROADWELL:
- cpu_features->cachesize_non_temporal_divisor = 8;
- goto default_tuning;
-
- /* Newer Bigcore microarch (larger non-temporal store
- threshold). */
- case INTEL_BIGCORE_SKYLAKE_AVX512:
- case INTEL_BIGCORE_CANNONLAKE:
- /* Benchmarks indicate non-temporal memset is not
- necessarily profitable on SKX (and in some cases much
- worse). This is likely unique to SKX due its it unique
- mesh interconnect (not present on ICX or BWD). Disable
- non-temporal on all Skylake servers. */
- cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
- |= bit_arch_Avoid_Non_Temporal_Memset;
- /* fallthrough */
- case INTEL_BIGCORE_COMETLAKE:
- case INTEL_BIGCORE_SKYLAKE:
- case INTEL_BIGCORE_KABYLAKE:
- case INTEL_BIGCORE_ICELAKE:
- case INTEL_BIGCORE_TIGERLAKE:
- case INTEL_BIGCORE_ROCKETLAKE:
- case INTEL_BIGCORE_RAPTORLAKE:
- case INTEL_BIGCORE_METEORLAKE:
- case INTEL_BIGCORE_LUNARLAKE:
- case INTEL_BIGCORE_ARROWLAKE:
- case INTEL_BIGCORE_SAPPHIRERAPIDS:
- case INTEL_BIGCORE_EMERALDRAPIDS:
- case INTEL_BIGCORE_GRANITERAPIDS:
- cpu_features->cachesize_non_temporal_divisor = 2;
- goto default_tuning;
-
- /* Default tuned Mixed (bigcore + atom SOC). */
- case INTEL_MIXED_LAKEFIELD:
- case INTEL_MIXED_ALDERLAKE:
- cpu_features->cachesize_non_temporal_divisor = 2;
- goto default_tuning;
- }
-
- /* Disable TSX on some processors to avoid TSX on kernels that
- weren't updated with the latest microcode package (which
- disables broken feature by default). */
- switch (microarch)
- {
case INTEL_BIGCORE_SKYLAKE_AVX512:
/* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */
if (stepping <= 5)
@@ -891,38 +787,163 @@ init_cpu_features (struct cpu_features *cpu_features)
case INTEL_BIGCORE_KABYLAKE:
/* NB: Although the errata documents that for model == 0x8e
- (kabylake skylake client), only 0xb stepping or lower are
- impacted, the intention of the errata was to disable TSX on
- all client processors on all steppings. Include 0xc
- stepping which is an Intel Core i7-8665U, a client mobile
- processor. */
+ (kabylake skylake client), only 0xb stepping or lower are
+ impacted, the intention of the errata was to disable TSX on
+ all client processors on all steppings. Include 0xc
+ stepping which is an Intel Core i7-8665U, a client mobile
+ processor. */
if (stepping > 0xc)
break;
/* Fall through. */
case INTEL_BIGCORE_SKYLAKE:
- /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
- processors listed in:
-
-https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
- */
- disable_tsx:
- CPU_FEATURE_UNSET (cpu_features, HLE);
- CPU_FEATURE_UNSET (cpu_features, RTM);
- CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
- break;
+ /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
+ processors listed in:
+
+ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
+ */
+disable_tsx:
+ CPU_FEATURE_UNSET (cpu_features, HLE);
+ CPU_FEATURE_UNSET (cpu_features, RTM);
+ CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
+ break;
case INTEL_BIGCORE_HASWELL:
- /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
- TSX. Haswell also include other model numbers that have
- working TSX. */
- if (model == 0x3f && stepping >= 4)
+ /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
+ TSX. Haswell also includes other model numbers that have
+ working TSX. */
+ if (model == 0x3f && stepping >= 4)
break;
- CPU_FEATURE_UNSET (cpu_features, RTM);
- break;
+ CPU_FEATURE_UNSET (cpu_features, RTM);
+ break;
}
}
+ else if (family == 19)
+ switch (model)
+ {
+ case 0x01:
+ microarch = INTEL_BIGCORE_DIAMONDRAPIDS;
+ break;
+ default:
+ break;
+ }
+
+ switch (microarch)
+ {
+ /* Atom / KNL tuning. */
+ case INTEL_ATOM_BONNELL:
+ /* BSF is slow on Bonnell. */
+ cpu_features->preferred[index_arch_Slow_BSF]
+ |= bit_arch_Slow_BSF;
+ break;
+
+ /* Unaligned load versions are faster than SSSE3
+ on Airmont, Silvermont, Goldmont, and Goldmont Plus. */
+ case INTEL_ATOM_AIRMONT:
+ case INTEL_ATOM_SILVERMONT:
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+
+ /* Knights Landing. Enable Silvermont optimizations. */
+ case INTEL_KNIGHTS_LANDING:
+
+ cpu_features->preferred[index_arch_Fast_Unaligned_Load]
+ |= (bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Unaligned_Copy
+ | bit_arch_Prefer_PMINUB_for_stringop
+ | bit_arch_Slow_SSE4_2);
+ break;
+
+ case INTEL_ATOM_TREMONT:
+ /* Enable rep string instructions, unaligned load, unaligned
+ copy, pminub and avoid SSE 4.2 on Tremont. */
+ cpu_features->preferred[index_arch_Fast_Rep_String]
+ |= (bit_arch_Fast_Rep_String
+ | bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Unaligned_Copy
+ | bit_arch_Prefer_PMINUB_for_stringop
+ | bit_arch_Slow_SSE4_2);
+ break;
+
+ /*
+ Default tuned Knights microarch.
+ case INTEL_KNIGHTS_MILL:
+ */
+
+ /*
+ Default tuned atom microarch.
+ case INTEL_ATOM_SIERRAFOREST:
+ case INTEL_ATOM_GRANDRIDGE:
+ case INTEL_ATOM_CLEARWATERFOREST:
+ */
+
+ /* Bigcore/Default Tuning. */
+ default:
+ default_tuning:
+ /* Unknown Intel processors. Assuming this is one of Core
+ i3/i5/i7 processors if AVX is available. */
+ if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
+ break;
+
+ enable_modern_features:
+ /* Rep string instructions, unaligned load, unaligned copy,
+ and pminub are fast on Intel Core i3, i5 and i7. */
+ cpu_features->preferred[index_arch_Fast_Rep_String]
+ |= (bit_arch_Fast_Rep_String
+ | bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Unaligned_Copy
+ | bit_arch_Prefer_PMINUB_for_stringop);
+ break;
+
+ case INTEL_BIGCORE_NEHALEM:
+ case INTEL_BIGCORE_WESTMERE:
+ /* Older CPUs prefer non-temporal stores at lower threshold. */
+ cpu_features->cachesize_non_temporal_divisor = 8;
+ goto enable_modern_features;
+
+ /* Older Bigcore microarch (smaller non-temporal store
+ threshold). */
+ case INTEL_BIGCORE_SANDYBRIDGE:
+ case INTEL_BIGCORE_IVYBRIDGE:
+ case INTEL_BIGCORE_HASWELL:
+ case INTEL_BIGCORE_BROADWELL:
+ cpu_features->cachesize_non_temporal_divisor = 8;
+ goto default_tuning;
+
+ /* Newer Bigcore microarch (larger non-temporal store
+ threshold). */
+ case INTEL_BIGCORE_SKYLAKE_AVX512:
+ case INTEL_BIGCORE_CANNONLAKE:
+ /* Benchmarks indicate non-temporal memset is not
+ necessarily profitable on SKX (and in some cases much
+ worse). This is likely unique to SKX due to its unique
+ mesh interconnect (not present on ICX or BWD). Disable
+ non-temporal on all Skylake servers. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+ /* fallthrough */
+ case INTEL_BIGCORE_COMETLAKE:
+ case INTEL_BIGCORE_SKYLAKE:
+ case INTEL_BIGCORE_KABYLAKE:
+ case INTEL_BIGCORE_ICELAKE:
+ case INTEL_BIGCORE_TIGERLAKE:
+ case INTEL_BIGCORE_ROCKETLAKE:
+ case INTEL_BIGCORE_RAPTORLAKE:
+ case INTEL_BIGCORE_METEORLAKE:
+ case INTEL_BIGCORE_LUNARLAKE:
+ case INTEL_BIGCORE_ARROWLAKE:
+ case INTEL_BIGCORE_PANTHERLAKE:
+ case INTEL_BIGCORE_SAPPHIRERAPIDS:
+ case INTEL_BIGCORE_EMERALDRAPIDS:
+ case INTEL_BIGCORE_GRANITERAPIDS:
+ case INTEL_BIGCORE_DIAMONDRAPIDS:
+ /* Default tuned Mixed (bigcore + atom SOC). */
+ case INTEL_MIXED_LAKEFIELD:
+ case INTEL_MIXED_ALDERLAKE:
+ cpu_features->cachesize_non_temporal_divisor = 2;
+ goto default_tuning;
+ }
/* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER
if AVX512ER is available. Don't use AVX512 to avoid lower CPU
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 9d31685..5723ec1 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -142,7 +142,6 @@ CFLAGS-tst-avxmod.c += $(AVX-CFLAGS)
AVX512-CFLAGS = -mavx512f
CFLAGS-tst-audit10-aux.c += $(AVX512-CFLAGS)
CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS)
-CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS)
CFLAGS-tst-avx512-aux.c += $(AVX512-CFLAGS)
CFLAGS-tst-avx512mod.c += $(AVX512-CFLAGS)
diff --git a/sysdeps/x86_64/tst-auditmod10b.c b/sysdeps/x86_64/tst-auditmod10b.c
index 6eb21b6..0b994ef 100644
--- a/sysdeps/x86_64/tst-auditmod10b.c
+++ b/sysdeps/x86_64/tst-auditmod10b.c
@@ -125,7 +125,6 @@ la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook,
#include <tst-audit.h>
-#ifdef __AVX512F__
#include <immintrin.h>
#include <cpuid.h>
@@ -148,9 +147,37 @@ check_avx512 (void)
return (eax & 0xe6) == 0xe6;
}
-#else
-#include <emmintrin.h>
-#endif
+void
+__attribute__ ((target ("avx512f")))
+pltenter_avx512f (La_regs *regs, long int *framesizep)
+{
+ __m512i zero = _mm512_setzero_si512 ();
+ if (memcmp (&regs->lr_vector[0], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[1], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[2], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[3], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[4], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[5], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[6], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[7], &zero, sizeof (zero)))
+ abort ();
+
+ for (int i = 0; i < 8; i++)
+ regs->lr_vector[i].zmm[0]
+ = (La_x86_64_zmm) _mm512_set1_epi64 (i + 1);
+
+ __m512i zmm = _mm512_set1_epi64 (-1);
+ asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" );
+ asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" );
+ asm volatile ("vmovdqa64 %0, %%zmm2" : : "x" (zmm) : "xmm2" );
+ asm volatile ("vmovdqa64 %0, %%zmm3" : : "x" (zmm) : "xmm3" );
+ asm volatile ("vmovdqa64 %0, %%zmm4" : : "x" (zmm) : "xmm4" );
+ asm volatile ("vmovdqa64 %0, %%zmm5" : : "x" (zmm) : "xmm5" );
+ asm volatile ("vmovdqa64 %0, %%zmm6" : : "x" (zmm) : "xmm6" );
+ asm volatile ("vmovdqa64 %0, %%zmm7" : : "x" (zmm) : "xmm7" );
+
+ *framesizep = 1024;
+}
ElfW(Addr)
pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
@@ -160,39 +187,33 @@ pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
symname, (long int) sym->st_value, ndx, *flags);
-#ifdef __AVX512F__
if (check_avx512 () && strcmp (symname, "audit_test") == 0)
+ pltenter_avx512f (regs, framesizep);
+
+ return sym->st_value;
+}
+
+void
+__attribute__ ((target ("avx512f")))
+pltexit_avx512f (const La_regs *inregs, La_retval *outregs)
+{
+ __m512i zero = _mm512_setzero_si512 ();
+ if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero)))
+ abort ();
+
+ for (int i = 0; i < 8; i++)
{
- __m512i zero = _mm512_setzero_si512 ();
- if (memcmp (&regs->lr_vector[0], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[1], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[2], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[3], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[4], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[5], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[6], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[7], &zero, sizeof (zero)))
- abort ();
-
- for (int i = 0; i < 8; i++)
- regs->lr_vector[i].zmm[0]
- = (La_x86_64_zmm) _mm512_set1_epi64 (i + 1);
-
- __m512i zmm = _mm512_set1_epi64 (-1);
- asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" );
- asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" );
- asm volatile ("vmovdqa64 %0, %%zmm2" : : "x" (zmm) : "xmm2" );
- asm volatile ("vmovdqa64 %0, %%zmm3" : : "x" (zmm) : "xmm3" );
- asm volatile ("vmovdqa64 %0, %%zmm4" : : "x" (zmm) : "xmm4" );
- asm volatile ("vmovdqa64 %0, %%zmm5" : : "x" (zmm) : "xmm5" );
- asm volatile ("vmovdqa64 %0, %%zmm6" : : "x" (zmm) : "xmm6" );
- asm volatile ("vmovdqa64 %0, %%zmm7" : : "x" (zmm) : "xmm7" );
-
- *framesizep = 1024;
+ __m512i zmm = _mm512_set1_epi64 (i + 1);
+ if (memcmp (&inregs->lr_vector[i], &zmm, sizeof (zmm)) != 0)
+ abort ();
}
-#endif
- return sym->st_value;
+ outregs->lrv_vector0.zmm[0]
+ = (La_x86_64_zmm) _mm512_set1_epi64 (0x12349876);
+
+ __m512i zmm = _mm512_set1_epi64 (-1);
+ asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" );
+ asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" );
}
unsigned int
@@ -204,28 +225,8 @@ pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
symname, (long int) sym->st_value, ndx,
(ptrdiff_t) outregs->int_retval);
-#ifdef __AVX512F__
if (check_avx512 () && strcmp (symname, "audit_test") == 0)
- {
- __m512i zero = _mm512_setzero_si512 ();
- if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero)))
- abort ();
-
- for (int i = 0; i < 8; i++)
- {
- __m512i zmm = _mm512_set1_epi64 (i + 1);
- if (memcmp (&inregs->lr_vector[i], &zmm, sizeof (zmm)) != 0)
- abort ();
- }
-
- outregs->lrv_vector0.zmm[0]
- = (La_x86_64_zmm) _mm512_set1_epi64 (0x12349876);
-
- __m512i zmm = _mm512_set1_epi64 (-1);
- asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" );
- asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" );
- }
-#endif
+ pltexit_avx512f (inregs, outregs);
return 0;
}
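
The Makefile change above (dropping -mavx512f for tst-auditmod10b.c) works because the AVX-512 code is now confined to functions marked with the target attribute and only reached after a runtime CPU check. A standalone sketch of that technique, unrelated to the audit test itself:

#include <immintrin.h>
#include <stdio.h>

/* Only this function is built with AVX-512 enabled, via the target
   attribute; the rest of the file needs no special CFLAGS.  */
__attribute__ ((target ("avx512f")))
static long long
sum_plus_one_avx512 (const long long *v)
{
  __m512i x = _mm512_add_epi64 (_mm512_loadu_si512 ((const void *) v),
                                _mm512_set1_epi64 (1));
  long long out[8];
  _mm512_storeu_si512 ((void *) out, x);
  long long s = 0;
  for (int i = 0; i < 8; i++)
    s += out[i];
  return s;
}

int
main (void)
{
  long long v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  /* Runtime check mirrors check_avx512 () in the test above.  */
  if (__builtin_cpu_supports ("avx512f"))
    printf ("%lld\n", sum_plus_one_avx512 (v));
  else
    puts ("AVX-512 not supported here; skipping");
  return 0;
}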