Diffstat (limited to 'sysdeps')
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_oryon1.S    40
-rw-r--r--  sysdeps/aarch64/multiarch/memset_oryon1.S    26
-rw-r--r--  sysdeps/mach/hurd/dup3.c                     62
-rw-r--r--  sysdeps/mach/hurd/fcntl.c                    53
-rw-r--r--  sysdeps/mach/hurd/futimens.c                  8
-rw-r--r--  sysdeps/mach/hurd/futimes.c                   8
-rw-r--r--  sysdeps/mach/hurd/i386/bits/sigcontext.h      2
-rw-r--r--  sysdeps/mach/hurd/i386/sigreturn.c           35
-rw-r--r--  sysdeps/mach/hurd/symlinkat.c                 2
-rw-r--r--  sysdeps/mach/hurd/utime-helper.c             62
-rw-r--r--  sysdeps/mach/hurd/x86/trampoline.c          105
-rw-r--r--  sysdeps/mach/hurd/x86_64/bits/sigcontext.h    2
-rw-r--r--  sysdeps/mach/hurd/x86_64/sigreturn.c         35
-rw-r--r--  sysdeps/powerpc/powerpc64/dl-machine.h       15
-rw-r--r--  sysdeps/pthread/tst-stdio2.c                  2
-rw-r--r--  sysdeps/unix/sysv/linux/bits/fcntl-linux.h    2
-rw-r--r--  sysdeps/x86/cpu-features.c                  305
-rw-r--r--  sysdeps/x86_64/Makefile                       1
-rw-r--r--  sysdeps/x86_64/tst-auditmod10b.c            109
19 files changed, 591 insertions, 283 deletions
diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
index e86d8b0..cc267db 100644
--- a/sysdeps/aarch64/multiarch/memcpy_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S
@@ -152,6 +152,46 @@ L(copy96):
.p2align 6
L(copy_long):
+ /* On oryon1 cores, large memcpy calls benefit from using ldnp/stnp.
+ This loop is identical to the one below it but uses ldnp/stnp
+ instructions.  For copies of fewer than 32768 bytes, the ldnp/stnp
+ instructions do not help and cause a slowdown, so only use the
+ ldnp/stnp loop for the largest sizes. */
+
+ cmp count, #32768
+ b.lo L(copy_long_without_nontemp)
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ ldnp D_l, D_h, [src]
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldnp A_l, A_h, [src, 16]
+ stnp D_l, D_h, [dstin]
+ ldnp B_l, B_h, [src, 32]
+ ldnp C_l, C_h, [src, 48]
+ ldnp D_l, D_h, [src, 64]
+ add src, src, #64
+ subs count, count, 128 + 16 /* Test and readjust count. */
+
+L(nontemp_loop64):
+ tbz src, #6, 1f
+1:
+ stnp A_l, A_h, [dst, 16]
+ ldnp A_l, A_h, [src, 16]
+ stnp B_l, B_h, [dst, 32]
+ ldnp B_l, B_h, [src, 32]
+ stnp C_l, C_h, [dst, 48]
+ ldnp C_l, C_h, [src, 48]
+ stnp D_l, D_h, [dst, 64]
+ ldnp D_l, D_h, [src, 64]
+ add src, src, #64
+ add dst, dst, #64
+ subs count, count, 64
+ b.hi L(nontemp_loop64)
+ b L(last64)
+
+L(copy_long_without_nontemp):
+
and tmp1, dstin, 15
bic dst, dstin, 15
ldp D_l, D_h, [src]
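
The dispatch added above reduces to a single size check. Below is a minimal C sketch of the same decision; the function names are illustrative and not part of glibc, and the non-temporal helper merely stands in for the L(nontemp_loop64) assembly loop.

#include <stddef.h>
#include <string.h>

/* Threshold taken from the `cmp count, #32768` check above.  */
#define ORYON1_NT_THRESHOLD 32768

/* Hypothetical stand-in for the ldnp/stnp loop; shown as a plain copy.  */
static void *
copy_nontemporal (void *dst, const void *src, size_t n)
{
  return memcpy (dst, src, n);
}

/* Equivalent of the new branch: smaller copies keep the existing
   ldp/stp path, only very large ones use non-temporal pairs.  */
void *
oryon1_copy (void *dst, const void *src, size_t n)
{
  if (n < ORYON1_NT_THRESHOLD)
    return memcpy (dst, src, n);        /* L(copy_long_without_nontemp) path */
  return copy_nontemporal (dst, src, n);
}
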
diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
index 0f9b718..88f4ef4 100644
--- a/sysdeps/aarch64/multiarch/memset_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
@@ -90,6 +90,8 @@ L(set_long):
cmp count, 256
ccmp valw, 0, 0, cs
b.eq L(try_zva)
+ cmp count, #32768
+ b.hi L(set_long_with_nontemp)
/* Small-size or non-zero memset does not use DC ZVA. */
sub count, dstend, dst
@@ -112,6 +114,30 @@ L(set_long):
stp val, val, [dstend, -16]
ret
+L(set_long_with_nontemp):
+ /* Small-size or non-zero memset does not use DC ZVA. */
+ sub count, dstend, dst
+
+ /* Adjust count and bias for the loop.  Subtracting an extra 1 from
+ count lets a tbz instruction check whether the remaining count after
+ the loop is less than 33 bytes, so 2 unnecessary stps can be bypassed. */
+ sub count, count, 64+16+1
+
+1: stnp val, val, [dst, 16]
+ stnp val, val, [dst, 32]
+ stnp val, val, [dst, 48]
+ stnp val, val, [dst, 64]
+ add dst, dst, #64
+ subs count, count, 64
+ b.hs 1b
+
+ tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
+ stnp val, val, [dst, 16]
+ stnp val, val, [dst, 32]
+1: stnp val, val, [dstend, -32]
+ stnp val, val, [dstend, -16]
+ ret
+
L(try_zva):
/* Write the first and last 64 byte aligned block using stp rather
than using DC ZVA as it is faster. */
diff --git a/sysdeps/mach/hurd/dup3.c b/sysdeps/mach/hurd/dup3.c
index 22af45b..49545ae 100644
--- a/sysdeps/mach/hurd/dup3.c
+++ b/sysdeps/mach/hurd/dup3.c
@@ -69,6 +69,7 @@ __dup3 (int fd, int fd2, int flags)
{
/* Get a hold of the destination descriptor. */
struct hurd_fd *d2;
+ error_t err;
__mutex_lock (&_hurd_dtable_lock);
@@ -107,22 +108,51 @@ __dup3 (int fd, int fd2, int flags)
}
else
{
- /* Give the ports each a user ref for the new descriptor. */
- __mach_port_mod_refs (__mach_task_self (), port,
- MACH_PORT_RIGHT_SEND, 1);
- if (ctty != MACH_PORT_NULL)
- __mach_port_mod_refs (__mach_task_self (), ctty,
- MACH_PORT_RIGHT_SEND, 1);
-
- /* Install the ports and flags in the new descriptor slot. */
- __spin_lock (&d2->port.lock);
- if (flags & O_CLOEXEC)
- d2->flags = d_flags | FD_CLOEXEC;
- else
- /* dup clears FD_CLOEXEC. */
- d2->flags = d_flags & ~FD_CLOEXEC;
- _hurd_port_set (&d2->ctty, ctty);
- _hurd_port_locked_set (&d2->port, port); /* Unlocks D2. */
+ /* Give the io server port a user ref for the new descriptor. */
+ err = __mach_port_mod_refs (__mach_task_self (), port,
+ MACH_PORT_RIGHT_SEND, 1);
+
+ if (err == KERN_UREFS_OVERFLOW)
+ fd2 = __hurd_fail (EMFILE);
+ else if (err)
+ fd2 = __hurd_fail (EINVAL);
+ else if (ctty != MACH_PORT_NULL)
+ {
+ /* The io server port now holds a user ref for the new
+ descriptor; give the ctty port a user ref for it as
+ well. */
+ err = __mach_port_mod_refs (__mach_task_self (), ctty,
+ MACH_PORT_RIGHT_SEND, 1);
+
+ if (err)
+ {
+ /* The io server port got its ref but the ctty port did
+ not, so drop the ref we just added to the io server
+ port. */
+ __mach_port_mod_refs (__mach_task_self (), port,
+ MACH_PORT_RIGHT_SEND, -1);
+
+ if (err == KERN_UREFS_OVERFLOW)
+ fd2 = __hurd_fail (EMFILE);
+ else
+ fd2 = __hurd_fail (EINVAL);
+ }
+ }
+
+ if (!err)
+ {
+ /* The ref counts of the ports are incremented
+ successfully. */
+ /* Install the ports and flags in the new descriptor slot. */
+ __spin_lock (&d2->port.lock);
+ if (flags & O_CLOEXEC)
+ d2->flags = d_flags | FD_CLOEXEC;
+ else
+ /* dup clears FD_CLOEXEC. */
+ d2->flags = d_flags & ~FD_CLOEXEC;
+ _hurd_port_set (&d2->ctty, ctty);
+ _hurd_port_locked_set (&d2->port, port); /* Unlocks D2. */
+ }
}
}
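
The same take-a-ref-then-roll-back pattern is used again in fcntl.c below. A compact C sketch of the logic follows; the wrapper name is hypothetical, it assumes a Hurd/Mach build environment, and it uses the public Mach names where the diff uses the glibc-internal __mach_* variants.

#define _GNU_SOURCE
#include <errno.h>
#include <mach.h>

/* Take a user ref on the io server port, then on the ctty port, and
   undo the first ref if the second one cannot be obtained.  */
static error_t
take_descriptor_refs (mach_port_t port, mach_port_t ctty)
{
  error_t err = mach_port_mod_refs (mach_task_self (), port,
                                    MACH_PORT_RIGHT_SEND, 1);
  if (err)
    return err == KERN_UREFS_OVERFLOW ? EMFILE : EINVAL;

  if (ctty != MACH_PORT_NULL)
    {
      err = mach_port_mod_refs (mach_task_self (), ctty,
                                MACH_PORT_RIGHT_SEND, 1);
      if (err)
        {
          /* Drop the ref just taken on the io server port so nothing
             leaks.  */
          mach_port_mod_refs (mach_task_self (), port,
                              MACH_PORT_RIGHT_SEND, -1);
          return err == KERN_UREFS_OVERFLOW ? EMFILE : EINVAL;
        }
    }
  return 0;
}
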
diff --git a/sysdeps/mach/hurd/fcntl.c b/sysdeps/mach/hurd/fcntl.c
index a65c190..de576af 100644
--- a/sysdeps/mach/hurd/fcntl.c
+++ b/sysdeps/mach/hurd/fcntl.c
@@ -83,18 +83,47 @@ __libc_fcntl (int fd, int cmd, ...)
result = -1;
else
{
- /* Give the ports each a user ref for the new descriptor. */
- __mach_port_mod_refs (__mach_task_self (), port,
- MACH_PORT_RIGHT_SEND, 1);
- if (ctty != MACH_PORT_NULL)
- __mach_port_mod_refs (__mach_task_self (), ctty,
- MACH_PORT_RIGHT_SEND, 1);
-
- /* Install the ports and flags in the new descriptor. */
- if (ctty != MACH_PORT_NULL)
- _hurd_port_set (&new->ctty, ctty);
- new->flags = flags;
- _hurd_port_locked_set (&new->port, port); /* Unlocks NEW. */
+ /* Give the io server port a user ref for the new descriptor. */
+ err = __mach_port_mod_refs (__mach_task_self (), port,
+ MACH_PORT_RIGHT_SEND, 1);
+
+ if (err == KERN_UREFS_OVERFLOW)
+ result = __hurd_fail (EMFILE);
+ else if (err)
+ result = __hurd_fail (EINVAL);
+ else if (ctty != MACH_PORT_NULL)
+ {
+ /* The io server port now holds a user ref for the new
+ descriptor; give the ctty port a user ref for it as
+ well. */
+ err = __mach_port_mod_refs (__mach_task_self (), ctty,
+ MACH_PORT_RIGHT_SEND, 1);
+
+ if (err)
+ {
+ /* The io server port got its ref but the ctty port did
+ not, so drop the ref we just added to the io server
+ port. */
+ __mach_port_mod_refs (__mach_task_self (), port,
+ MACH_PORT_RIGHT_SEND, -1);
+
+ if (err == KERN_UREFS_OVERFLOW)
+ result = __hurd_fail (EMFILE);
+ else
+ result = __hurd_fail (EINVAL);
+ }
+ }
+
+ if (!err)
+ {
+ /* The ref counts of the ports are incremented successfully. */
+ /* Install the ports and flags in the new descriptor. */
+ if (ctty != MACH_PORT_NULL)
+ _hurd_port_set (&new->ctty, ctty);
+ new->flags = flags;
+ /* Unlocks NEW. */
+ _hurd_port_locked_set (&new->port, port);
+ }
}
HURD_CRITICAL_END;
diff --git a/sysdeps/mach/hurd/futimens.c b/sysdeps/mach/hurd/futimens.c
index 30ef0a6..1212529 100644
--- a/sysdeps/mach/hurd/futimens.c
+++ b/sysdeps/mach/hurd/futimens.c
@@ -32,7 +32,9 @@ __futimens (int fd, const struct timespec tsp[2])
struct timespec atime, mtime;
error_t err;
- utime_ts_from_tspec (tsp, &atime, &mtime);
+ err = utime_ts_from_tspec (tsp, &atime, &mtime);
+ if (err)
+ return err;
err = HURD_DPORT_USE (fd, __file_utimens (port, atime, mtime));
@@ -40,7 +42,9 @@ __futimens (int fd, const struct timespec tsp[2])
{
time_value_t atim, mtim;
- utime_tvalue_from_tspec (tsp, &atim, &mtim);
+ err = utime_tvalue_from_tspec (tsp, &atim, &mtim);
+ if (err)
+ return err;
err = HURD_DPORT_USE (fd, __file_utimes (port, atim, mtim));
}
diff --git a/sysdeps/mach/hurd/futimes.c b/sysdeps/mach/hurd/futimes.c
index 20f47f3..97385d7 100644
--- a/sysdeps/mach/hurd/futimes.c
+++ b/sysdeps/mach/hurd/futimes.c
@@ -32,7 +32,9 @@ __futimes (int fd, const struct timeval tvp[2])
struct timespec atime, mtime;
error_t err;
- utime_ts_from_tval (tvp, &atime, &mtime);
+ err = utime_ts_from_tval (tvp, &atime, &mtime);
+ if (err)
+ return err;
err = HURD_DPORT_USE (fd, __file_utimens (port, atime, mtime));
@@ -40,7 +42,9 @@ __futimes (int fd, const struct timeval tvp[2])
{
time_value_t atim, mtim;
- utime_tvalue_from_tval (tvp, &atim, &mtim);
+ err = utime_tvalue_from_tval (tvp, &atim, &mtim);
+ if (err)
+ return err;
err = HURD_DPORT_USE (fd, __file_utimes (port, atim, mtim));
}
diff --git a/sysdeps/mach/hurd/i386/bits/sigcontext.h b/sysdeps/mach/hurd/i386/bits/sigcontext.h
index 6e5e220..c44e4de 100644
--- a/sysdeps/mach/hurd/i386/bits/sigcontext.h
+++ b/sysdeps/mach/hurd/i386/bits/sigcontext.h
@@ -88,6 +88,8 @@ struct sigcontext
struct i386_fp_save sc_fpsave;
struct i386_fp_regs sc_fpregs;
int sc_fpexcsr; /* FPSR including exception bits. */
+
+ struct i386_xfloat_state *xstate;
};
/* Traditional BSD names for some members. */
diff --git a/sysdeps/mach/hurd/i386/sigreturn.c b/sysdeps/mach/hurd/i386/sigreturn.c
index ce8df8d..dc57d61 100644
--- a/sysdeps/mach/hurd/i386/sigreturn.c
+++ b/sysdeps/mach/hurd/i386/sigreturn.c
@@ -21,6 +21,8 @@
#include <stdlib.h>
#include <string.h>
+#include <cpuid.h>
+
/* This is run on the thread stack after restoring it, to be able to
unlock SS off sigstack. */
static void
@@ -123,10 +125,35 @@ __sigreturn (struct sigcontext *scp)
if (scp->sc_onstack)
ss->sigaltstack.ss_flags &= ~SS_ONSTACK;
- if (scp->sc_fpused)
- /* Restore the FPU state. Mach conveniently stores the state
- in the format the i387 `frstor' instruction uses to restore it. */
- asm volatile ("frstor %0" : : "m" (scp->sc_fpsave));
+#ifdef i386_XFLOAT_STATE
+ if (scp->xstate)
+ {
+ if (scp->xstate->initialized)
+ {
+ unsigned eax, ebx, ecx, edx;
+ __cpuid_count(0xd, 0, eax, ebx, ecx, edx);
+ switch (scp->xstate->fp_save_kind)
+ {
+ case 0: // FNSAVE
+ asm volatile("frstor %0" : : "m" (scp->xstate->hw_state));
+ break;
+ case 1: // FXSAVE
+ asm volatile("fxrstor %0" : : "m" (scp->xstate->hw_state), \
+ "a" (eax), "d" (edx));
+ break;
+ default: // XSAVE, XSAVEOPT, XSAVEC, XSAVES
+ asm volatile("xrstor %0" : : "m" (scp->xstate->hw_state), \
+ "a" (eax), "d" (edx));
+ break;
+ }
+ }
+ }
+ else
+#endif
+ if (scp->sc_fpused)
+ /* Restore the FPU state. Mach conveniently stores the state
+ in the format the i387 `frstor' instruction uses to restore it. */
+ asm volatile ("frstor %0" : : "m" (scp->sc_fpsave));
{
/* There are convenient instructions to pop state off the stack, so we
diff --git a/sysdeps/mach/hurd/symlinkat.c b/sysdeps/mach/hurd/symlinkat.c
index e7dfb67..cb6250e 100644
--- a/sysdeps/mach/hurd/symlinkat.c
+++ b/sysdeps/mach/hurd/symlinkat.c
@@ -47,7 +47,7 @@ __symlinkat (const char *from, int fd, const char *to)
if (! *name)
/* Can't link to the existing directory itself. */
- err = ENOTDIR;
+ err = EEXIST;
else
/* Create a new, unlinked node in the target directory. */
err = __dir_mkfile (dir, O_WRITE, 0777 & ~_hurd_umask, &node);
diff --git a/sysdeps/mach/hurd/utime-helper.c b/sysdeps/mach/hurd/utime-helper.c
index d88bccd..6afa871 100644
--- a/sysdeps/mach/hurd/utime-helper.c
+++ b/sysdeps/mach/hurd/utime-helper.c
@@ -21,8 +21,14 @@
#include <stddef.h>
#include <sys/time.h>
+static inline bool
+check_tval (const struct timeval *tvp)
+{
+ return tvp->tv_usec >= 0 && tvp->tv_usec < USEC_PER_SEC;
+}
+
/* Initializes atime/mtime timespec structures from an array of timeval. */
-static inline void
+static inline error_t
utime_ts_from_tval (const struct timeval tvp[2],
struct timespec *atime, struct timespec *mtime)
{
@@ -37,13 +43,19 @@ utime_ts_from_tval (const struct timeval tvp[2],
}
else
{
+ if (!check_tval (&tvp[0]))
+ return EINVAL;
+ if (!check_tval (&tvp[1]))
+ return EINVAL;
+
TIMEVAL_TO_TIMESPEC (&tvp[0], atime);
TIMEVAL_TO_TIMESPEC (&tvp[1], mtime);
}
+ return 0;
}
/* Initializes atime/mtime time_value_t structures from an array of timeval. */
-static inline void
+static inline error_t
utime_tvalue_from_tval (const struct timeval tvp[2],
time_value_t *atime, time_value_t *mtime)
{
@@ -53,11 +65,17 @@ utime_tvalue_from_tval (const struct timeval tvp[2],
atime->microseconds = mtime->microseconds = -1;
else
{
+ if (!check_tval (&tvp[0]))
+ return EINVAL;
+ if (!check_tval (&tvp[1]))
+ return EINVAL;
+
atime->seconds = tvp[0].tv_sec;
atime->microseconds = tvp[0].tv_usec;
mtime->seconds = tvp[1].tv_sec;
mtime->microseconds = tvp[1].tv_usec;
}
+ return 0;
}
/* Changes the access time of the file behind PORT using a timeval array. */
@@ -67,7 +85,9 @@ hurd_futimes (const file_t port, const struct timeval tvp[2])
error_t err;
struct timespec atime, mtime;
- utime_ts_from_tval (tvp, &atime, &mtime);
+ err = utime_ts_from_tval (tvp, &atime, &mtime);
+ if (err)
+ return err;
err = __file_utimens (port, atime, mtime);
@@ -75,7 +95,9 @@ hurd_futimes (const file_t port, const struct timeval tvp[2])
{
time_value_t atim, mtim;
- utime_tvalue_from_tval (tvp, &atim, &mtim);
+ err = utime_tvalue_from_tval (tvp, &atim, &mtim);
+ if (err)
+ return err;
err = __file_utimes (port, atim, mtim);
}
@@ -83,8 +105,16 @@ hurd_futimes (const file_t port, const struct timeval tvp[2])
return err;
}
+static inline bool
+check_tspec (const struct timespec *tsp)
+{
+ return tsp->tv_nsec == UTIME_NOW
+ || tsp->tv_nsec == UTIME_OMIT
+ || (tsp->tv_nsec >= 0 && tsp->tv_nsec < NSEC_PER_SEC);
+}
+
/* Initializes atime/mtime timespec structures from an array of timespec. */
-static inline void
+static inline error_t
utime_ts_from_tspec (const struct timespec tsp[2],
struct timespec *atime, struct timespec *mtime)
{
@@ -99,13 +129,19 @@ utime_ts_from_tspec (const struct timespec tsp[2],
}
else
{
+ if (!check_tspec (&tsp[0]))
+ return EINVAL;
+ if (!check_tspec (&tsp[1]))
+ return EINVAL;
+
*atime = tsp[0];
*mtime = tsp[1];
}
+ return 0;
}
/* Initializes atime/mtime time_value_t structures from an array of timespec. */
-static inline void
+static inline error_t
utime_tvalue_from_tspec (const struct timespec tsp[2],
time_value_t *atime, time_value_t *mtime)
{
@@ -115,6 +151,11 @@ utime_tvalue_from_tspec (const struct timespec tsp[2],
atime->microseconds = mtime->microseconds = -1;
else
{
+ if (!check_tspec (&tsp[0]))
+ return EINVAL;
+ if (!check_tspec (&tsp[1]))
+ return EINVAL;
+
if (tsp[0].tv_nsec == UTIME_NOW)
atime->microseconds = -1;
else if (tsp[0].tv_nsec == UTIME_OMIT)
@@ -128,6 +169,7 @@ utime_tvalue_from_tspec (const struct timespec tsp[2],
else
TIMESPEC_TO_TIME_VALUE (mtime, &(tsp[1]));
}
+ return 0;
}
/* Changes the access time of the file behind PORT using a timespec array. */
@@ -137,7 +179,9 @@ hurd_futimens (const file_t port, const struct timespec tsp[2])
error_t err;
struct timespec atime, mtime;
- utime_ts_from_tspec (tsp, &atime, &mtime);
+ err = utime_ts_from_tspec (tsp, &atime, &mtime);
+ if (err)
+ return err;
err = __file_utimens (port, atime, mtime);
@@ -145,7 +189,9 @@ hurd_futimens (const file_t port, const struct timespec tsp[2])
{
time_value_t atim, mtim;
- utime_tvalue_from_tspec (tsp, &atim, &mtim);
+ err = utime_tvalue_from_tspec (tsp, &atim, &mtim);
+ if (err)
+ return err;
err = __file_utimes (port, atim, mtim);
}
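
With the new check_tval/check_tspec helpers, out-of-range timestamps are rejected before any RPC is made. The short program below illustrates the effect; it assumes a system where these Hurd code paths are in use and deliberately does not assert the exact errno value.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

int
main (void)
{
  /* tv_usec must be in [0, 1000000); 1000000 is out of range, so the
     call is now expected to be rejected rather than forwarded.  */
  struct timeval tv[2] =
    {
      { .tv_sec = 0, .tv_usec = 1000000 },  /* invalid access time */
      { .tv_sec = 0, .tv_usec = 0 }         /* valid modification time */
    };

  if (futimes (STDOUT_FILENO, tv) < 0)
    printf ("futimes rejected the bad timeval: %s\n", strerror (errno));
  return 0;
}
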
diff --git a/sysdeps/mach/hurd/x86/trampoline.c b/sysdeps/mach/hurd/x86/trampoline.c
index 8e2890f..6f23c56 100644
--- a/sysdeps/mach/hurd/x86/trampoline.c
+++ b/sysdeps/mach/hurd/x86/trampoline.c
@@ -26,7 +26,11 @@
#include "hurdfault.h"
#include <intr-msg.h>
#include <sys/ucontext.h>
-
+#ifdef __x86_64__
+#include <mach/x86_64/mach_i386.h>
+#else
+#include <mach/i386/mach_i386.h>
+#endif
/* Fill in a siginfo_t structure for SA_SIGINFO-enabled handlers. */
static void fill_siginfo (siginfo_t *si, int signo,
@@ -106,6 +110,7 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action
void firewall (void);
void *sigsp;
struct sigcontext *scp;
+ vm_size_t xstate_size;
struct
{
union
@@ -145,6 +150,14 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action
struct hurd_userlink link;
ucontext_t ucontext;
siginfo_t siginfo;
+#ifdef __x86_64__
+ char _pad2[56];
+#else
+ char _pad2[20];
+#endif
+ char xstate[];
+ /* Don't add anything after xstate, as it's dynamically
+ sized. */
} *stackframe;
#ifdef __x86_64__
@@ -170,6 +183,17 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action
if (! machine_get_basic_state (ss->thread, state))
return NULL;
+ /* Initialize the size of the CPU extended state, to be saved during
+ * signal handling */
+#ifdef i386_XFLOAT_STATE
+ _Static_assert ((sizeof(*stackframe) + sizeof(struct i386_xfloat_state)) % 64 == 0,
+ "stackframe size must be multiple of 64-byte minus "
+ "sizeof(struct i386_xfloat_state), please adjust _pad2");
+
+ if (__i386_get_xstate_size(__mach_host_self(), &xstate_size))
+#endif
+ xstate_size = 0;
+
/* Save the original SP in the gratuitous `esp' slot.
We may need to reset the SP (the `uesp' slot) to avoid clobbering an
interrupted RPC frame. */
@@ -196,14 +220,21 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action
#endif
}
- /* Push the arguments to call `trampoline' on the stack. */
- sigsp -= sizeof (*stackframe);
-#ifdef __x86_64__
- /* Align SP at 16 bytes. Coupled with the fact that sigreturn_addr is
- 16-byte aligned within the stackframe struct, this ensures that it ends
- up on a 16-byte aligned address, as required by the ABI. */
- sigsp = (void *) ((uintptr_t) sigsp & ~15UL);
-#endif
+ /* Push the arguments to call `trampoline' on the stack.
+ * The extended state might have a variable size depending on the platform,
+ * so we dynamically allocate it on the stack frame.*/
+ sigsp -= sizeof (*stackframe) + xstate_size;
+
+ /* Align SP at 64 bytes. This is needed for two reasons:
+ * - sigreturn_addr is 16-byte aligned within the stackframe
+ * struct, and this ensures that it ends up on a 16-byte aligned
+ * address, as required by the ABI.
+ * - the XSAVE state needs to be aligned at 64 bytes (on both i386 and
+ * x86_64), so we align the stackframe also at 64 bytes and add the
+ * required padding at the end, see the _pad2 field.
+ */
+ sigsp = (void *) ((uintptr_t) sigsp & ~63UL);
+
stackframe = sigsp;
if (_hurdsig_catch_memory_fault (stackframe))
@@ -248,14 +279,40 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action
memcpy (&scp->sc_i386_thread_state,
&state->basic, sizeof (state->basic));
- /* struct sigcontext is laid out so that starting at sc_fpkind mimics
- a struct i386_float_state. */
- _Static_assert (offsetof (struct sigcontext, sc_i386_float_state)
- % __alignof__ (struct i386_float_state) == 0,
- "sc_i386_float_state layout mismatch");
- ok = machine_get_state (ss->thread, state, i386_FLOAT_STATE,
- &state->fpu, &scp->sc_i386_float_state,
- sizeof (state->fpu));
+ scp->xstate = NULL;
+#ifdef i386_XFLOAT_STATE
+ if (xstate_size > 0)
+ {
+ mach_msg_type_number_t got = (xstate_size / sizeof (int));
+
+ ok = (! __thread_get_state (ss->thread, i386_XFLOAT_STATE,
+ (thread_state_t) stackframe->xstate, &got)
+ && got == (xstate_size / sizeof (int)));
+
+ if (ok && ((struct i386_xfloat_state*) stackframe->xstate)->fp_save_kind > 5)
+ /* We support up to XSAVES */
+ ok = 0;
+
+ if (ok)
+ {
+ scp->xstate = (struct i386_xfloat_state*) stackframe->xstate;
+ assert((uintptr_t)scp->xstate->hw_state % 64 == 0);
+ }
+ }
+ else
+#endif
+ ok = 0;
+ if (!ok)
+ {
+ /* struct sigcontext is laid out so that starting at sc_fpkind mimics
+ a struct i386_float_state. */
+ _Static_assert (offsetof (struct sigcontext, sc_i386_float_state)
+ % __alignof__ (struct i386_float_state) == 0,
+ "sc_i386_float_state layout mismatch");
+ ok = machine_get_state (ss->thread, state, i386_FLOAT_STATE,
+ &state->fpu, &scp->sc_i386_float_state,
+ sizeof (state->fpu));
+ }
/* Set up the arguments for the signal handler. */
stackframe->signo = signo;
@@ -404,7 +461,10 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action
- in gdb: gdb/i386-gnu-tdep.c gnu_sigtramp_code. */
#ifdef __x86_64__
-asm ("rpc_wait_trampoline:\n"
+asm ("trampoline:\n"
+ "fnclex\n" /* Clear any pending exception. */
+ "jmp _trampoline\n"
+ "rpc_wait_trampoline:\n"
/* This is the entry point when we have an RPC reply message to receive
before running the handler. The MACH_MSG_SEND bit has already been
cleared in the OPTION argument in our %rsi. The interrupted user
@@ -423,7 +483,7 @@ asm ("rpc_wait_trampoline:\n"
/* Switch to the signal stack. */
"movq %rbx, %rsp\n"
- "trampoline:\n"
+ "_trampoline:\n"
/* Entry point for running the handler normally. The arguments to the
handler function are on the top of the stack, same as in the i386
version:
@@ -449,7 +509,10 @@ asm ("rpc_wait_trampoline:\n"
"movq 16(%rsp), %rdi\n"
"ret");
#else
-asm ("rpc_wait_trampoline:\n");
+asm ("trampoline:\n"
+ "fnclex\n" /* Clear any pending exception. */
+ "jmp _trampoline\n"
+ "rpc_wait_trampoline:\n");
/* This is the entry point when we have an RPC reply message to receive
before running the handler. The MACH_MSG_SEND bit has already been
cleared in the OPTION argument on our stack. The interrupted user
@@ -469,7 +532,7 @@ asm (/* Retry the interrupted mach_msg system call. */
/* Switch to the signal stack. */
"movl %ebx, %esp\n");
- asm ("trampoline:\n");
+asm ("_trampoline:\n");
/* Entry point for running the handler normally. The arguments to the
handler function are already on the top of the stack:
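
As the comment in the hunk above notes, rounding the signal stack pointer down to a 64-byte boundary satisfies both the XSAVE alignment and the 16-byte ABI alignment, since the latter is implied by the former. A self-contained check of the same mask operation (the sample address is arbitrary):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uintptr_t sigsp = 0x7ffe1234u;       /* arbitrary example stack pointer */
  sigsp &= ~(uintptr_t) 63;            /* same rounding as in the diff */
  assert (sigsp % 64 == 0);            /* XSAVE area alignment */
  assert (sigsp % 16 == 0);            /* ABI stack alignment follows */
  return 0;
}
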
diff --git a/sysdeps/mach/hurd/x86_64/bits/sigcontext.h b/sysdeps/mach/hurd/x86_64/bits/sigcontext.h
index 7bac881..d83795f 100644
--- a/sysdeps/mach/hurd/x86_64/bits/sigcontext.h
+++ b/sysdeps/mach/hurd/x86_64/bits/sigcontext.h
@@ -96,6 +96,8 @@ struct sigcontext
struct i386_fp_save sc_fpsave;
struct i386_fp_regs sc_fpregs;
int sc_fpexcsr; /* FPSR including exception bits. */
+
+ struct i386_xfloat_state *xstate;
};
/* Traditional BSD names for some members. */
diff --git a/sysdeps/mach/hurd/x86_64/sigreturn.c b/sysdeps/mach/hurd/x86_64/sigreturn.c
index 81a2d3b..773c00f 100644
--- a/sysdeps/mach/hurd/x86_64/sigreturn.c
+++ b/sysdeps/mach/hurd/x86_64/sigreturn.c
@@ -20,6 +20,8 @@
#include <hurd/msg.h>
#include <stdlib.h>
+#include <cpuid.h>
+
/* This is run on the thread stack after restoring it, to be able to
unlock SS off sigstack. */
void
@@ -116,10 +118,35 @@ __sigreturn (struct sigcontext *scp)
if (scp->sc_onstack)
ss->sigaltstack.ss_flags &= ~SS_ONSTACK;
- if (scp->sc_fpused)
- /* Restore the FPU state. Mach conveniently stores the state
- in the format the i387 `frstor' instruction uses to restore it. */
- asm volatile ("frstor %0" : : "m" (scp->sc_fpsave));
+#ifdef i386_XFLOAT_STATE
+ if (scp->xstate)
+ {
+ if (scp->xstate->initialized)
+ {
+ unsigned eax, ebx, ecx, edx;
+ __cpuid_count(0xd, 0, eax, ebx, ecx, edx);
+ switch (scp->xstate->fp_save_kind)
+ {
+ case 0: // FNSAVE
+ asm volatile("frstor %0" : : "m" (scp->xstate->hw_state));
+ break;
+ case 1: // FXSAVE
+ asm volatile("fxrstor %0" : : "m" (scp->xstate->hw_state), \
+ "a" (eax), "d" (edx));
+ break;
+ default: // XSAVE, XSAVEOPT, XSAVEC, XSAVES
+ asm volatile("xrstor %0" : : "m" (scp->xstate->hw_state), \
+ "a" (eax), "d" (edx));
+ break;
+ }
+ }
+ }
+ else
+#endif
+ if (scp->sc_fpused)
+ /* Restore the FPU state. Mach conveniently stores the state
+ in the format the i387 `frstor' instruction uses to restore it. */
+ asm volatile ("frstor %0" : : "m" (scp->sc_fpsave));
/* Copy the registers onto the user's stack, to be able to release the
altstack (by unlocking sigstate). Note that unless an altstack is used,
diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
index d8d7c8b..89e26bb 100644
--- a/sysdeps/powerpc/powerpc64/dl-machine.h
+++ b/sysdeps/powerpc/powerpc64/dl-machine.h
@@ -363,7 +363,6 @@ elf_machine_runtime_setup (struct link_map *map, struct r_scope_elem *scope[],
/ sizeof (Elf64_Rela));
Elf64_Addr l_addr = map->l_addr;
Elf64_Dyn **info = map->l_info;
- char *p;
extern void _dl_runtime_resolve (void);
extern void _dl_profile_resolve (void);
@@ -435,20 +434,6 @@ elf_machine_runtime_setup (struct link_map *map, struct r_scope_elem *scope[],
offset += PLT_ENTRY_WORDS;
glink_offset += GLINK_ENTRY_WORDS (i);
}
-
- /* Now, we've modified data. We need to write the changes from
- the data cache to a second-level unified cache, then make
- sure that stale data in the instruction cache is removed.
- (In a multiprocessor system, the effect is more complex.)
- Most of the PLT shouldn't be in the instruction cache, but
- there may be a little overlap at the start and the end.
-
- Assumes that dcbst and icbi apply to lines of 16 bytes or
- more. Current known line sizes are 16, 32, and 128 bytes. */
-
- for (p = (char *) plt; p < (char *) &plt[offset]; p += 16)
- PPC_DCBST (p);
- PPC_SYNC;
}
}
return lazy;
diff --git a/sysdeps/pthread/tst-stdio2.c b/sysdeps/pthread/tst-stdio2.c
index 08948cb..0876ed6 100644
--- a/sysdeps/pthread/tst-stdio2.c
+++ b/sysdeps/pthread/tst-stdio2.c
@@ -75,7 +75,7 @@ do_test (void)
exit (1);
}
- puts ("join returned succsefully");
+ puts ("join returned successfully");
return 0;
}
diff --git a/sysdeps/unix/sysv/linux/bits/fcntl-linux.h b/sysdeps/unix/sysv/linux/bits/fcntl-linux.h
index dfc554a..f425a4b 100644
--- a/sysdeps/unix/sysv/linux/bits/fcntl-linux.h
+++ b/sysdeps/unix/sysv/linux/bits/fcntl-linux.h
@@ -379,6 +379,8 @@ struct file_handle
identity and may not
be usable to
open_by_handle_at. */
+# define AT_HANDLE_MNT_ID_UNIQUE 1 /* Return the 64-bit unique mount
+ ID. */
#endif
__BEGIN_DECLS
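
A hedged usage sketch of the new AT_HANDLE_MNT_ID_UNIQUE flag follows. It assumes a kernel that implements the flag and that, when the flag is set, a 64-bit unique mount ID is written through the mount-ID argument (hence the uint64_t passed via the int * parameter); treat it as illustrative rather than authoritative.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  struct file_handle *fh;
  uint64_t mnt_id = 0;

  /* Allocate the largest handle the kernel may return.  */
  fh = malloc (sizeof (*fh) + MAX_HANDLE_SZ);
  if (fh == NULL)
    return 1;
  fh->handle_bytes = MAX_HANDLE_SZ;

  /* Assumption: with AT_HANDLE_MNT_ID_UNIQUE the kernel fills a 64-bit
     unique mount ID, so a uint64_t is passed through the int * slot.  */
  if (name_to_handle_at (AT_FDCWD, "/", fh, (int *) &mnt_id,
                         AT_HANDLE_MNT_ID_UNIQUE) == 0)
    printf ("unique mount id of /: %llu\n", (unsigned long long) mnt_id);
  else
    perror ("name_to_handle_at");

  free (fh);
  return 0;
}
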
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 9d136e4..e50f1d6 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -502,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load
"Incorrect index_arch_Fast_Unaligned_Load");
-/* Intel Family-6 microarch list. */
-enum
+/* Intel microarch list. */
+enum intel_microarch
{
/* Atom processors. */
INTEL_ATOM_BONNELL,
@@ -512,6 +512,7 @@ enum
INTEL_ATOM_GOLDMONT,
INTEL_ATOM_GOLDMONT_PLUS,
INTEL_ATOM_SIERRAFOREST,
+ INTEL_ATOM_CLEARWATERFOREST,
INTEL_ATOM_GRANDRIDGE,
INTEL_ATOM_TREMONT,
@@ -539,7 +540,9 @@ enum
INTEL_BIGCORE_METEORLAKE,
INTEL_BIGCORE_LUNARLAKE,
INTEL_BIGCORE_ARROWLAKE,
+ INTEL_BIGCORE_PANTHERLAKE,
INTEL_BIGCORE_GRANITERAPIDS,
+ INTEL_BIGCORE_DIAMONDRAPIDS,
/* Mixed (bigcore + atom SOC). */
INTEL_MIXED_LAKEFIELD,
@@ -553,7 +556,7 @@ enum
INTEL_UNKNOWN,
};
-static unsigned int
+static enum intel_microarch
intel_get_fam6_microarch (unsigned int model,
__attribute__ ((unused)) unsigned int stepping)
{
@@ -584,6 +587,8 @@ intel_get_fam6_microarch (unsigned int model,
return INTEL_ATOM_GOLDMONT_PLUS;
case 0xAF:
return INTEL_ATOM_SIERRAFOREST;
+ case 0xDD:
+ return INTEL_ATOM_CLEARWATERFOREST;
case 0xB6:
return INTEL_ATOM_GRANDRIDGE;
case 0x86:
@@ -691,8 +696,12 @@ intel_get_fam6_microarch (unsigned int model,
return INTEL_BIGCORE_METEORLAKE;
case 0xbd:
return INTEL_BIGCORE_LUNARLAKE;
+ case 0xb5:
+ case 0xc5:
case 0xc6:
return INTEL_BIGCORE_ARROWLAKE;
+ case 0xCC:
+ return INTEL_BIGCORE_PANTHERLAKE;
case 0xAD:
case 0xAE:
return INTEL_BIGCORE_GRANITERAPIDS;
@@ -756,133 +765,20 @@ init_cpu_features (struct cpu_features *cpu_features)
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
&= ~bit_arch_Avoid_Non_Temporal_Memset;
+ enum intel_microarch microarch = INTEL_UNKNOWN;
if (family == 0x06)
{
model += extended_model;
- unsigned int microarch
- = intel_get_fam6_microarch (model, stepping);
+ microarch = intel_get_fam6_microarch (model, stepping);
+ /* Disable TSX on some processors to avoid TSX on kernels that
+ weren't updated with the latest microcode package (which
+ disables the broken feature by default). */
switch (microarch)
{
- /* Atom / KNL tuning. */
- case INTEL_ATOM_BONNELL:
- /* BSF is slow on Bonnell. */
- cpu_features->preferred[index_arch_Slow_BSF]
- |= bit_arch_Slow_BSF;
- break;
-
- /* Unaligned load versions are faster than SSSE3
- on Airmont, Silvermont, Goldmont, and Goldmont Plus. */
- case INTEL_ATOM_AIRMONT:
- case INTEL_ATOM_SILVERMONT:
- case INTEL_ATOM_GOLDMONT:
- case INTEL_ATOM_GOLDMONT_PLUS:
-
- /* Knights Landing. Enable Silvermont optimizations. */
- case INTEL_KNIGHTS_LANDING:
-
- cpu_features->preferred[index_arch_Fast_Unaligned_Load]
- |= (bit_arch_Fast_Unaligned_Load
- | bit_arch_Fast_Unaligned_Copy
- | bit_arch_Prefer_PMINUB_for_stringop
- | bit_arch_Slow_SSE4_2);
- break;
-
- case INTEL_ATOM_TREMONT:
- /* Enable rep string instructions, unaligned load, unaligned
- copy, pminub and avoid SSE 4.2 on Tremont. */
- cpu_features->preferred[index_arch_Fast_Rep_String]
- |= (bit_arch_Fast_Rep_String
- | bit_arch_Fast_Unaligned_Load
- | bit_arch_Fast_Unaligned_Copy
- | bit_arch_Prefer_PMINUB_for_stringop
- | bit_arch_Slow_SSE4_2);
- break;
-
- /*
- Default tuned Knights microarch.
- case INTEL_KNIGHTS_MILL:
- */
-
- /*
- Default tuned atom microarch.
- case INTEL_ATOM_SIERRAFOREST:
- case INTEL_ATOM_GRANDRIDGE:
- */
-
- /* Bigcore/Default Tuning. */
default:
- default_tuning:
- /* Unknown family 0x06 processors. Assuming this is one
- of Core i3/i5/i7 processors if AVX is available. */
- if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
- break;
-
- enable_modern_features:
- /* Rep string instructions, unaligned load, unaligned copy,
- and pminub are fast on Intel Core i3, i5 and i7. */
- cpu_features->preferred[index_arch_Fast_Rep_String]
- |= (bit_arch_Fast_Rep_String
- | bit_arch_Fast_Unaligned_Load
- | bit_arch_Fast_Unaligned_Copy
- | bit_arch_Prefer_PMINUB_for_stringop);
break;
- case INTEL_BIGCORE_NEHALEM:
- case INTEL_BIGCORE_WESTMERE:
- /* Older CPUs prefer non-temporal stores at lower threshold. */
- cpu_features->cachesize_non_temporal_divisor = 8;
- goto enable_modern_features;
-
- /* Older Bigcore microarch (smaller non-temporal store
- threshold). */
- case INTEL_BIGCORE_SANDYBRIDGE:
- case INTEL_BIGCORE_IVYBRIDGE:
- case INTEL_BIGCORE_HASWELL:
- case INTEL_BIGCORE_BROADWELL:
- cpu_features->cachesize_non_temporal_divisor = 8;
- goto default_tuning;
-
- /* Newer Bigcore microarch (larger non-temporal store
- threshold). */
- case INTEL_BIGCORE_SKYLAKE_AVX512:
- case INTEL_BIGCORE_CANNONLAKE:
- /* Benchmarks indicate non-temporal memset is not
- necessarily profitable on SKX (and in some cases much
- worse). This is likely unique to SKX due its it unique
- mesh interconnect (not present on ICX or BWD). Disable
- non-temporal on all Skylake servers. */
- cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
- |= bit_arch_Avoid_Non_Temporal_Memset;
- /* fallthrough */
- case INTEL_BIGCORE_COMETLAKE:
- case INTEL_BIGCORE_SKYLAKE:
- case INTEL_BIGCORE_KABYLAKE:
- case INTEL_BIGCORE_ICELAKE:
- case INTEL_BIGCORE_TIGERLAKE:
- case INTEL_BIGCORE_ROCKETLAKE:
- case INTEL_BIGCORE_RAPTORLAKE:
- case INTEL_BIGCORE_METEORLAKE:
- case INTEL_BIGCORE_LUNARLAKE:
- case INTEL_BIGCORE_ARROWLAKE:
- case INTEL_BIGCORE_SAPPHIRERAPIDS:
- case INTEL_BIGCORE_EMERALDRAPIDS:
- case INTEL_BIGCORE_GRANITERAPIDS:
- cpu_features->cachesize_non_temporal_divisor = 2;
- goto default_tuning;
-
- /* Default tuned Mixed (bigcore + atom SOC). */
- case INTEL_MIXED_LAKEFIELD:
- case INTEL_MIXED_ALDERLAKE:
- cpu_features->cachesize_non_temporal_divisor = 2;
- goto default_tuning;
- }
-
- /* Disable TSX on some processors to avoid TSX on kernels that
- weren't updated with the latest microcode package (which
- disables broken feature by default). */
- switch (microarch)
- {
case INTEL_BIGCORE_SKYLAKE_AVX512:
/* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */
if (stepping <= 5)
@@ -891,38 +787,163 @@ init_cpu_features (struct cpu_features *cpu_features)
case INTEL_BIGCORE_KABYLAKE:
/* NB: Although the errata documents that for model == 0x8e
- (kabylake skylake client), only 0xb stepping or lower are
- impacted, the intention of the errata was to disable TSX on
- all client processors on all steppings. Include 0xc
- stepping which is an Intel Core i7-8665U, a client mobile
- processor. */
+ (kabylake skylake client), only 0xb stepping or lower are
+ impacted, the intention of the errata was to disable TSX on
+ all client processors on all steppings. Include 0xc
+ stepping which is an Intel Core i7-8665U, a client mobile
+ processor. */
if (stepping > 0xc)
break;
/* Fall through. */
case INTEL_BIGCORE_SKYLAKE:
- /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
- processors listed in:
-
-https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
- */
- disable_tsx:
- CPU_FEATURE_UNSET (cpu_features, HLE);
- CPU_FEATURE_UNSET (cpu_features, RTM);
- CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
- break;
+ /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
+ processors listed in:
+
+ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
+ */
+disable_tsx:
+ CPU_FEATURE_UNSET (cpu_features, HLE);
+ CPU_FEATURE_UNSET (cpu_features, RTM);
+ CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
+ break;
case INTEL_BIGCORE_HASWELL:
- /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
- TSX. Haswell also include other model numbers that have
- working TSX. */
- if (model == 0x3f && stepping >= 4)
+ /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
+ TSX. Haswell also includes other model numbers that have
+ working TSX. */
+ if (model == 0x3f && stepping >= 4)
break;
- CPU_FEATURE_UNSET (cpu_features, RTM);
- break;
+ CPU_FEATURE_UNSET (cpu_features, RTM);
+ break;
}
}
+ else if (family == 19)
+ switch (model)
+ {
+ case 0x01:
+ microarch = INTEL_BIGCORE_DIAMONDRAPIDS;
+ break;
+ default:
+ break;
+ }
+
+ switch (microarch)
+ {
+ /* Atom / KNL tuning. */
+ case INTEL_ATOM_BONNELL:
+ /* BSF is slow on Bonnell. */
+ cpu_features->preferred[index_arch_Slow_BSF]
+ |= bit_arch_Slow_BSF;
+ break;
+
+ /* Unaligned load versions are faster than SSSE3
+ on Airmont, Silvermont, Goldmont, and Goldmont Plus. */
+ case INTEL_ATOM_AIRMONT:
+ case INTEL_ATOM_SILVERMONT:
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+
+ /* Knights Landing. Enable Silvermont optimizations. */
+ case INTEL_KNIGHTS_LANDING:
+
+ cpu_features->preferred[index_arch_Fast_Unaligned_Load]
+ |= (bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Unaligned_Copy
+ | bit_arch_Prefer_PMINUB_for_stringop
+ | bit_arch_Slow_SSE4_2);
+ break;
+
+ case INTEL_ATOM_TREMONT:
+ /* Enable rep string instructions, unaligned load, unaligned
+ copy, pminub and avoid SSE 4.2 on Tremont. */
+ cpu_features->preferred[index_arch_Fast_Rep_String]
+ |= (bit_arch_Fast_Rep_String
+ | bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Unaligned_Copy
+ | bit_arch_Prefer_PMINUB_for_stringop
+ | bit_arch_Slow_SSE4_2);
+ break;
+
+ /*
+ Default tuned Knights microarch.
+ case INTEL_KNIGHTS_MILL:
+ */
+
+ /*
+ Default tuned atom microarch.
+ case INTEL_ATOM_SIERRAFOREST:
+ case INTEL_ATOM_GRANDRIDGE:
+ case INTEL_ATOM_CLEARWATERFOREST:
+ */
+
+ /* Bigcore/Default Tuning. */
+ default:
+ default_tuning:
+ /* Unknown Intel processors. Assuming this is one of Core
+ i3/i5/i7 processors if AVX is available. */
+ if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
+ break;
+
+ enable_modern_features:
+ /* Rep string instructions, unaligned load, unaligned copy,
+ and pminub are fast on Intel Core i3, i5 and i7. */
+ cpu_features->preferred[index_arch_Fast_Rep_String]
+ |= (bit_arch_Fast_Rep_String
+ | bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Unaligned_Copy
+ | bit_arch_Prefer_PMINUB_for_stringop);
+ break;
+
+ case INTEL_BIGCORE_NEHALEM:
+ case INTEL_BIGCORE_WESTMERE:
+ /* Older CPUs prefer non-temporal stores at lower threshold. */
+ cpu_features->cachesize_non_temporal_divisor = 8;
+ goto enable_modern_features;
+
+ /* Older Bigcore microarch (smaller non-temporal store
+ threshold). */
+ case INTEL_BIGCORE_SANDYBRIDGE:
+ case INTEL_BIGCORE_IVYBRIDGE:
+ case INTEL_BIGCORE_HASWELL:
+ case INTEL_BIGCORE_BROADWELL:
+ cpu_features->cachesize_non_temporal_divisor = 8;
+ goto default_tuning;
+
+ /* Newer Bigcore microarch (larger non-temporal store
+ threshold). */
+ case INTEL_BIGCORE_SKYLAKE_AVX512:
+ case INTEL_BIGCORE_CANNONLAKE:
+ /* Benchmarks indicate non-temporal memset is not
+ necessarily profitable on SKX (and in some cases much
+ worse). This is likely unique to SKX due to its unique
+ mesh interconnect (not present on ICX or BWD). Disable
+ non-temporal on all Skylake servers. */
+ cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+ |= bit_arch_Avoid_Non_Temporal_Memset;
+ /* fallthrough */
+ case INTEL_BIGCORE_COMETLAKE:
+ case INTEL_BIGCORE_SKYLAKE:
+ case INTEL_BIGCORE_KABYLAKE:
+ case INTEL_BIGCORE_ICELAKE:
+ case INTEL_BIGCORE_TIGERLAKE:
+ case INTEL_BIGCORE_ROCKETLAKE:
+ case INTEL_BIGCORE_RAPTORLAKE:
+ case INTEL_BIGCORE_METEORLAKE:
+ case INTEL_BIGCORE_LUNARLAKE:
+ case INTEL_BIGCORE_ARROWLAKE:
+ case INTEL_BIGCORE_PANTHERLAKE:
+ case INTEL_BIGCORE_SAPPHIRERAPIDS:
+ case INTEL_BIGCORE_EMERALDRAPIDS:
+ case INTEL_BIGCORE_GRANITERAPIDS:
+ case INTEL_BIGCORE_DIAMONDRAPIDS:
+ /* Default tuned Mixed (bigcore + atom SOC). */
+ case INTEL_MIXED_LAKEFIELD:
+ case INTEL_MIXED_ALDERLAKE:
+ cpu_features->cachesize_non_temporal_divisor = 2;
+ goto default_tuning;
+ }
/* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER
if AVX512ER is available. Don't use AVX512 to avoid lower CPU
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 9d31685..5723ec1 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -142,7 +142,6 @@ CFLAGS-tst-avxmod.c += $(AVX-CFLAGS)
AVX512-CFLAGS = -mavx512f
CFLAGS-tst-audit10-aux.c += $(AVX512-CFLAGS)
CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS)
-CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS)
CFLAGS-tst-avx512-aux.c += $(AVX512-CFLAGS)
CFLAGS-tst-avx512mod.c += $(AVX512-CFLAGS)
diff --git a/sysdeps/x86_64/tst-auditmod10b.c b/sysdeps/x86_64/tst-auditmod10b.c
index 6eb21b6..0b994ef 100644
--- a/sysdeps/x86_64/tst-auditmod10b.c
+++ b/sysdeps/x86_64/tst-auditmod10b.c
@@ -125,7 +125,6 @@ la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook,
#include <tst-audit.h>
-#ifdef __AVX512F__
#include <immintrin.h>
#include <cpuid.h>
@@ -148,9 +147,37 @@ check_avx512 (void)
return (eax & 0xe6) == 0xe6;
}
-#else
-#include <emmintrin.h>
-#endif
+void
+__attribute__ ((target ("avx512f")))
+pltenter_avx512f (La_regs *regs, long int *framesizep)
+{
+ __m512i zero = _mm512_setzero_si512 ();
+ if (memcmp (&regs->lr_vector[0], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[1], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[2], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[3], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[4], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[5], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[6], &zero, sizeof (zero))
+ || memcmp (&regs->lr_vector[7], &zero, sizeof (zero)))
+ abort ();
+
+ for (int i = 0; i < 8; i++)
+ regs->lr_vector[i].zmm[0]
+ = (La_x86_64_zmm) _mm512_set1_epi64 (i + 1);
+
+ __m512i zmm = _mm512_set1_epi64 (-1);
+ asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" );
+ asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" );
+ asm volatile ("vmovdqa64 %0, %%zmm2" : : "x" (zmm) : "xmm2" );
+ asm volatile ("vmovdqa64 %0, %%zmm3" : : "x" (zmm) : "xmm3" );
+ asm volatile ("vmovdqa64 %0, %%zmm4" : : "x" (zmm) : "xmm4" );
+ asm volatile ("vmovdqa64 %0, %%zmm5" : : "x" (zmm) : "xmm5" );
+ asm volatile ("vmovdqa64 %0, %%zmm6" : : "x" (zmm) : "xmm6" );
+ asm volatile ("vmovdqa64 %0, %%zmm7" : : "x" (zmm) : "xmm7" );
+
+ *framesizep = 1024;
+}
ElfW(Addr)
pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
@@ -160,39 +187,33 @@ pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
symname, (long int) sym->st_value, ndx, *flags);
-#ifdef __AVX512F__
if (check_avx512 () && strcmp (symname, "audit_test") == 0)
+ pltenter_avx512f (regs, framesizep);
+
+ return sym->st_value;
+}
+
+void
+__attribute__ ((target ("avx512f")))
+pltexit_avx512f (const La_regs *inregs, La_retval *outregs)
+{
+ __m512i zero = _mm512_setzero_si512 ();
+ if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero)))
+ abort ();
+
+ for (int i = 0; i < 8; i++)
{
- __m512i zero = _mm512_setzero_si512 ();
- if (memcmp (&regs->lr_vector[0], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[1], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[2], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[3], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[4], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[5], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[6], &zero, sizeof (zero))
- || memcmp (&regs->lr_vector[7], &zero, sizeof (zero)))
- abort ();
-
- for (int i = 0; i < 8; i++)
- regs->lr_vector[i].zmm[0]
- = (La_x86_64_zmm) _mm512_set1_epi64 (i + 1);
-
- __m512i zmm = _mm512_set1_epi64 (-1);
- asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" );
- asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" );
- asm volatile ("vmovdqa64 %0, %%zmm2" : : "x" (zmm) : "xmm2" );
- asm volatile ("vmovdqa64 %0, %%zmm3" : : "x" (zmm) : "xmm3" );
- asm volatile ("vmovdqa64 %0, %%zmm4" : : "x" (zmm) : "xmm4" );
- asm volatile ("vmovdqa64 %0, %%zmm5" : : "x" (zmm) : "xmm5" );
- asm volatile ("vmovdqa64 %0, %%zmm6" : : "x" (zmm) : "xmm6" );
- asm volatile ("vmovdqa64 %0, %%zmm7" : : "x" (zmm) : "xmm7" );
-
- *framesizep = 1024;
+ __m512i zmm = _mm512_set1_epi64 (i + 1);
+ if (memcmp (&inregs->lr_vector[i], &zmm, sizeof (zmm)) != 0)
+ abort ();
}
-#endif
- return sym->st_value;
+ outregs->lrv_vector0.zmm[0]
+ = (La_x86_64_zmm) _mm512_set1_epi64 (0x12349876);
+
+ __m512i zmm = _mm512_set1_epi64 (-1);
+ asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" );
+ asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" );
}
unsigned int
@@ -204,28 +225,8 @@ pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
symname, (long int) sym->st_value, ndx,
(ptrdiff_t) outregs->int_retval);
-#ifdef __AVX512F__
if (check_avx512 () && strcmp (symname, "audit_test") == 0)
- {
- __m512i zero = _mm512_setzero_si512 ();
- if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero)))
- abort ();
-
- for (int i = 0; i < 8; i++)
- {
- __m512i zmm = _mm512_set1_epi64 (i + 1);
- if (memcmp (&inregs->lr_vector[i], &zmm, sizeof (zmm)) != 0)
- abort ();
- }
-
- outregs->lrv_vector0.zmm[0]
- = (La_x86_64_zmm) _mm512_set1_epi64 (0x12349876);
-
- __m512i zmm = _mm512_set1_epi64 (-1);
- asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" );
- asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" );
- }
-#endif
+ pltexit_avx512f (inregs, outregs);
return 0;
}
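
The Makefile change above (dropping -mavx512f for tst-auditmod10b.c) works because the AVX-512 code is now confined to functions marked with the target attribute and only reached after a runtime CPU check. A standalone sketch of that technique, unrelated to the audit test itself:

#include <immintrin.h>
#include <stdio.h>

/* Only this function is built with AVX-512 enabled, via the target
   attribute; the rest of the file needs no special CFLAGS.  */
__attribute__ ((target ("avx512f")))
static long long
sum_plus_one_avx512 (const long long *v)
{
  __m512i x = _mm512_add_epi64 (_mm512_loadu_si512 ((const void *) v),
                                _mm512_set1_epi64 (1));
  long long out[8];
  _mm512_storeu_si512 ((void *) out, x);
  long long s = 0;
  for (int i = 0; i < 8; i++)
    s += out[i];
  return s;
}

int
main (void)
{
  long long v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  /* Runtime check mirrors check_avx512 () in the test above.  */
  if (__builtin_cpu_supports ("avx512f"))
    printf ("%lld\n", sum_plus_one_avx512 (v));
  else
    puts ("AVX-512 not supported here; skipping");
  return 0;
}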