Diffstat (limited to 'sysdeps')
-rw-r--r--  sysdeps/aarch64/multiarch/memcpy_oryon1.S   |  40
-rw-r--r--  sysdeps/aarch64/multiarch/memset_oryon1.S   |  26
-rw-r--r--  sysdeps/mach/hurd/dup3.c                    |  62
-rw-r--r--  sysdeps/mach/hurd/fcntl.c                   |  53
-rw-r--r--  sysdeps/mach/hurd/futimens.c                |   8
-rw-r--r--  sysdeps/mach/hurd/futimes.c                 |   8
-rw-r--r--  sysdeps/mach/hurd/i386/bits/sigcontext.h    |   2
-rw-r--r--  sysdeps/mach/hurd/i386/sigreturn.c          |  35
-rw-r--r--  sysdeps/mach/hurd/symlinkat.c               |   2
-rw-r--r--  sysdeps/mach/hurd/utime-helper.c            |  62
-rw-r--r--  sysdeps/mach/hurd/x86/trampoline.c          | 105
-rw-r--r--  sysdeps/mach/hurd/x86_64/bits/sigcontext.h  |   2
-rw-r--r--  sysdeps/mach/hurd/x86_64/sigreturn.c        |  35
-rw-r--r--  sysdeps/powerpc/powerpc64/dl-machine.h      |  15
-rw-r--r--  sysdeps/pthread/tst-stdio2.c                |   2
-rw-r--r--  sysdeps/unix/sysv/linux/bits/fcntl-linux.h  |   2
-rw-r--r--  sysdeps/x86/cpu-features.c                  | 305
-rw-r--r--  sysdeps/x86_64/Makefile                     |   1
-rw-r--r--  sysdeps/x86_64/tst-auditmod10b.c            | 109
19 files changed, 591 insertions, 283 deletions
diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S index e86d8b0..cc267db 100644 --- a/sysdeps/aarch64/multiarch/memcpy_oryon1.S +++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S @@ -152,6 +152,46 @@ L(copy96): .p2align 6 L(copy_long): + /* On oryon1 cores, large memcpy's are helped by using ldnp/stnp. + This loop is identical to the one below it but using ldnp/stnp + instructions. For loops that are less than 32768 bytes, + the ldnp/stnp instructions will not help and will cause a slow + down so only use the ldnp/stnp loop for the largest sizes. */ + + cmp count, #32768 + b.lo L(copy_long_without_nontemp) + and tmp1, dstin, 15 + bic dst, dstin, 15 + ldnp D_l, D_h, [src] + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldnp A_l, A_h, [src, 16] + stnp D_l, D_h, [dstin] + ldnp B_l, B_h, [src, 32] + ldnp C_l, C_h, [src, 48] + ldnp D_l, D_h, [src, 64] + add src, src, #64 + subs count, count, 128 + 16 /* Test and readjust count. */ + +L(nontemp_loop64): + tbz src, #6, 1f +1: + stnp A_l, A_h, [dst, 16] + ldnp A_l, A_h, [src, 16] + stnp B_l, B_h, [dst, 32] + ldnp B_l, B_h, [src, 32] + stnp C_l, C_h, [dst, 48] + ldnp C_l, C_h, [src, 48] + stnp D_l, D_h, [dst, 64] + ldnp D_l, D_h, [src, 64] + add src, src, #64 + add dst, dst, #64 + subs count, count, 64 + b.hi L(nontemp_loop64) + b L(last64) + +L(copy_long_without_nontemp): + and tmp1, dstin, 15 bic dst, dstin, 15 ldp D_l, D_h, [src] diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S index 0f9b718..88f4ef4 100644 --- a/sysdeps/aarch64/multiarch/memset_oryon1.S +++ b/sysdeps/aarch64/multiarch/memset_oryon1.S @@ -90,6 +90,8 @@ L(set_long): cmp count, 256 ccmp valw, 0, 0, cs b.eq L(try_zva) + cmp count, #32768 + b.hi L(set_long_with_nontemp) /* Small-size or non-zero memset does not use DC ZVA. */ sub count, dstend, dst @@ -112,6 +114,30 @@ L(set_long): stp val, val, [dstend, -16] ret +L(set_long_with_nontemp): + /* Small-size or non-zero memset does not use DC ZVA. */ + sub count, dstend, dst + + /* Adjust count and bias for loop. By subtracting extra 1 from count, + it is easy to use tbz instruction to check whether loop tailing + count is less than 33 bytes, so as to bypass 2 unnecessary stps. */ + sub count, count, 64+16+1 + +1: stnp val, val, [dst, 16] + stnp val, val, [dst, 32] + stnp val, val, [dst, 48] + stnp val, val, [dst, 64] + add dst, dst, #64 + subs count, count, 64 + b.hs 1b + + tbz count, 5, 1f /* Remaining count is less than 33 bytes? */ + stnp val, val, [dst, 16] + stnp val, val, [dst, 32] +1: stnp val, val, [dstend, -32] + stnp val, val, [dstend, -16] + ret + L(try_zva): /* Write the first and last 64 byte aligned block using stp rather than using DC ZVA as it is faster. */ diff --git a/sysdeps/mach/hurd/dup3.c b/sysdeps/mach/hurd/dup3.c index 22af45b..49545ae 100644 --- a/sysdeps/mach/hurd/dup3.c +++ b/sysdeps/mach/hurd/dup3.c @@ -69,6 +69,7 @@ __dup3 (int fd, int fd2, int flags) { /* Get a hold of the destination descriptor. */ struct hurd_fd *d2; + error_t err; __mutex_lock (&_hurd_dtable_lock); @@ -107,22 +108,51 @@ __dup3 (int fd, int fd2, int flags) } else { - /* Give the ports each a user ref for the new descriptor. */ - __mach_port_mod_refs (__mach_task_self (), port, - MACH_PORT_RIGHT_SEND, 1); - if (ctty != MACH_PORT_NULL) - __mach_port_mod_refs (__mach_task_self (), ctty, - MACH_PORT_RIGHT_SEND, 1); - - /* Install the ports and flags in the new descriptor slot. 
*/ - __spin_lock (&d2->port.lock); - if (flags & O_CLOEXEC) - d2->flags = d_flags | FD_CLOEXEC; - else - /* dup clears FD_CLOEXEC. */ - d2->flags = d_flags & ~FD_CLOEXEC; - _hurd_port_set (&d2->ctty, ctty); - _hurd_port_locked_set (&d2->port, port); /* Unlocks D2. */ + /* Give the io server port a user ref for the new descriptor. */ + err = __mach_port_mod_refs (__mach_task_self (), port, + MACH_PORT_RIGHT_SEND, 1); + + if (err == KERN_UREFS_OVERFLOW) + fd2 = __hurd_fail (EMFILE); + else if (err) + fd2 = __hurd_fail (EINVAL); + else if (ctty != MACH_PORT_NULL) + { + /* We have confirmed the io server port has got a user ref + count, now give ctty port a user ref for the new + descriptor. */ + err = __mach_port_mod_refs (__mach_task_self (), ctty, + MACH_PORT_RIGHT_SEND, 1); + + if (err) + { + /* In this case the io server port has got a ref count + but the ctty port failed to get one, so we need to + clean the ref count we just assigned. */ + __mach_port_mod_refs (__mach_task_self (), port, + MACH_PORT_RIGHT_SEND, -1); + + if (err == KERN_UREFS_OVERFLOW) + fd2 = __hurd_fail (EMFILE); + else + fd2 = __hurd_fail (EINVAL); + } + } + + if (!err) + { + /* The ref counts of the ports are incremented + successfully. */ + /* Install the ports and flags in the new descriptor slot. */ + __spin_lock (&d2->port.lock); + if (flags & O_CLOEXEC) + d2->flags = d_flags | FD_CLOEXEC; + else + /* dup clears FD_CLOEXEC. */ + d2->flags = d_flags & ~FD_CLOEXEC; + _hurd_port_set (&d2->ctty, ctty); + _hurd_port_locked_set (&d2->port, port); /* Unlocks D2. */ + } } } diff --git a/sysdeps/mach/hurd/fcntl.c b/sysdeps/mach/hurd/fcntl.c index a65c190..de576af 100644 --- a/sysdeps/mach/hurd/fcntl.c +++ b/sysdeps/mach/hurd/fcntl.c @@ -83,18 +83,47 @@ __libc_fcntl (int fd, int cmd, ...) result = -1; else { - /* Give the ports each a user ref for the new descriptor. */ - __mach_port_mod_refs (__mach_task_self (), port, - MACH_PORT_RIGHT_SEND, 1); - if (ctty != MACH_PORT_NULL) - __mach_port_mod_refs (__mach_task_self (), ctty, - MACH_PORT_RIGHT_SEND, 1); - - /* Install the ports and flags in the new descriptor. */ - if (ctty != MACH_PORT_NULL) - _hurd_port_set (&new->ctty, ctty); - new->flags = flags; - _hurd_port_locked_set (&new->port, port); /* Unlocks NEW. */ + /* Give the io server port a user ref for the new descriptor. */ + err = __mach_port_mod_refs (__mach_task_self (), port, + MACH_PORT_RIGHT_SEND, 1); + + if (err == KERN_UREFS_OVERFLOW) + result = __hurd_fail (EMFILE); + else if (err) + result = __hurd_fail (EINVAL); + else if (ctty != MACH_PORT_NULL) + { + /* We have confirmed the io server port has got a user ref + count, now give ctty port a user ref for the new + descriptor. */ + err = __mach_port_mod_refs (__mach_task_self (), ctty, + MACH_PORT_RIGHT_SEND, 1); + + if (err) + { + /* In this case the io server port has got a ref count + but the ctty port fails to get one, so we need to clean + the ref count we just assigned. */ + __mach_port_mod_refs (__mach_task_self (), port, + MACH_PORT_RIGHT_SEND, -1); + + if (err == KERN_UREFS_OVERFLOW) + result = __hurd_fail (EMFILE); + else + result = __hurd_fail (EINVAL); + } + } + + if (!err) + { + /* The ref counts of the ports are incremented successfully. */ + /* Install the ports and flags in the new descriptor. */ + if (ctty != MACH_PORT_NULL) + _hurd_port_set (&new->ctty, ctty); + new->flags = flags; + /* Unlocks NEW. 
*/ + _hurd_port_locked_set (&new->port, port); + } } HURD_CRITICAL_END; diff --git a/sysdeps/mach/hurd/futimens.c b/sysdeps/mach/hurd/futimens.c index 30ef0a6..1212529 100644 --- a/sysdeps/mach/hurd/futimens.c +++ b/sysdeps/mach/hurd/futimens.c @@ -32,7 +32,9 @@ __futimens (int fd, const struct timespec tsp[2]) struct timespec atime, mtime; error_t err; - utime_ts_from_tspec (tsp, &atime, &mtime); + err = utime_ts_from_tspec (tsp, &atime, &mtime); + if (err) + return err; err = HURD_DPORT_USE (fd, __file_utimens (port, atime, mtime)); @@ -40,7 +42,9 @@ __futimens (int fd, const struct timespec tsp[2]) { time_value_t atim, mtim; - utime_tvalue_from_tspec (tsp, &atim, &mtim); + err = utime_tvalue_from_tspec (tsp, &atim, &mtim); + if (err) + return err; err = HURD_DPORT_USE (fd, __file_utimes (port, atim, mtim)); } diff --git a/sysdeps/mach/hurd/futimes.c b/sysdeps/mach/hurd/futimes.c index 20f47f3..97385d7 100644 --- a/sysdeps/mach/hurd/futimes.c +++ b/sysdeps/mach/hurd/futimes.c @@ -32,7 +32,9 @@ __futimes (int fd, const struct timeval tvp[2]) struct timespec atime, mtime; error_t err; - utime_ts_from_tval (tvp, &atime, &mtime); + err = utime_ts_from_tval (tvp, &atime, &mtime); + if (err) + return err; err = HURD_DPORT_USE (fd, __file_utimens (port, atime, mtime)); @@ -40,7 +42,9 @@ __futimes (int fd, const struct timeval tvp[2]) { time_value_t atim, mtim; - utime_tvalue_from_tval (tvp, &atim, &mtim); + err = utime_tvalue_from_tval (tvp, &atim, &mtim); + if (err) + return err; err = HURD_DPORT_USE (fd, __file_utimes (port, atim, mtim)); } diff --git a/sysdeps/mach/hurd/i386/bits/sigcontext.h b/sysdeps/mach/hurd/i386/bits/sigcontext.h index 6e5e220..c44e4de 100644 --- a/sysdeps/mach/hurd/i386/bits/sigcontext.h +++ b/sysdeps/mach/hurd/i386/bits/sigcontext.h @@ -88,6 +88,8 @@ struct sigcontext struct i386_fp_save sc_fpsave; struct i386_fp_regs sc_fpregs; int sc_fpexcsr; /* FPSR including exception bits. */ + + struct i386_xfloat_state *xstate; }; /* Traditional BSD names for some members. */ diff --git a/sysdeps/mach/hurd/i386/sigreturn.c b/sysdeps/mach/hurd/i386/sigreturn.c index ce8df8d..dc57d61 100644 --- a/sysdeps/mach/hurd/i386/sigreturn.c +++ b/sysdeps/mach/hurd/i386/sigreturn.c @@ -21,6 +21,8 @@ #include <stdlib.h> #include <string.h> +#include <cpuid.h> + /* This is run on the thread stack after restoring it, to be able to unlock SS off sigstack. */ static void @@ -123,10 +125,35 @@ __sigreturn (struct sigcontext *scp) if (scp->sc_onstack) ss->sigaltstack.ss_flags &= ~SS_ONSTACK; - if (scp->sc_fpused) - /* Restore the FPU state. Mach conveniently stores the state - in the format the i387 `frstor' instruction uses to restore it. */ - asm volatile ("frstor %0" : : "m" (scp->sc_fpsave)); +#ifdef i386_XFLOAT_STATE + if (scp->xstate) + { + if (scp->xstate->initialized) + { + unsigned eax, ebx, ecx, edx; + __cpuid_count(0xd, 0, eax, ebx, ecx, edx); + switch (scp->xstate->fp_save_kind) + { + case 0: // FNSAVE + asm volatile("frstor %0" : : "m" (scp->xstate->hw_state)); + break; + case 1: // FXSAVE + asm volatile("fxrstor %0" : : "m" (scp->xstate->hw_state), \ + "a" (eax), "d" (edx)); + break; + default: // XSAVE, XSAVEOPT, XSAVEC, XSAVES + asm volatile("xrstor %0" : : "m" (scp->xstate->hw_state), \ + "a" (eax), "d" (edx)); + break; + } + } + } + else +#endif + if (scp->sc_fpused) + /* Restore the FPU state. Mach conveniently stores the state + in the format the i387 `frstor' instruction uses to restore it. 
*/ + asm volatile ("frstor %0" : : "m" (scp->sc_fpsave)); { /* There are convenient instructions to pop state off the stack, so we diff --git a/sysdeps/mach/hurd/symlinkat.c b/sysdeps/mach/hurd/symlinkat.c index e7dfb67..cb6250e 100644 --- a/sysdeps/mach/hurd/symlinkat.c +++ b/sysdeps/mach/hurd/symlinkat.c @@ -47,7 +47,7 @@ __symlinkat (const char *from, int fd, const char *to) if (! *name) /* Can't link to the existing directory itself. */ - err = ENOTDIR; + err = EEXIST; else /* Create a new, unlinked node in the target directory. */ err = __dir_mkfile (dir, O_WRITE, 0777 & ~_hurd_umask, &node); diff --git a/sysdeps/mach/hurd/utime-helper.c b/sysdeps/mach/hurd/utime-helper.c index d88bccd..6afa871 100644 --- a/sysdeps/mach/hurd/utime-helper.c +++ b/sysdeps/mach/hurd/utime-helper.c @@ -21,8 +21,14 @@ #include <stddef.h> #include <sys/time.h> +static inline bool +check_tval (const struct timeval *tvp) +{ + return tvp->tv_usec >= 0 && tvp->tv_usec < USEC_PER_SEC; +} + /* Initializes atime/mtime timespec structures from an array of timeval. */ -static inline void +static inline error_t utime_ts_from_tval (const struct timeval tvp[2], struct timespec *atime, struct timespec *mtime) { @@ -37,13 +43,19 @@ utime_ts_from_tval (const struct timeval tvp[2], } else { + if (!check_tval (&tvp[0])) + return EINVAL; + if (!check_tval (&tvp[1])) + return EINVAL; + TIMEVAL_TO_TIMESPEC (&tvp[0], atime); TIMEVAL_TO_TIMESPEC (&tvp[1], mtime); } + return 0; } /* Initializes atime/mtime time_value_t structures from an array of timeval. */ -static inline void +static inline error_t utime_tvalue_from_tval (const struct timeval tvp[2], time_value_t *atime, time_value_t *mtime) { @@ -53,11 +65,17 @@ utime_tvalue_from_tval (const struct timeval tvp[2], atime->microseconds = mtime->microseconds = -1; else { + if (!check_tval (&tvp[0])) + return EINVAL; + if (!check_tval (&tvp[1])) + return EINVAL; + atime->seconds = tvp[0].tv_sec; atime->microseconds = tvp[0].tv_usec; mtime->seconds = tvp[1].tv_sec; mtime->microseconds = tvp[1].tv_usec; } + return 0; } /* Changes the access time of the file behind PORT using a timeval array. */ @@ -67,7 +85,9 @@ hurd_futimes (const file_t port, const struct timeval tvp[2]) error_t err; struct timespec atime, mtime; - utime_ts_from_tval (tvp, &atime, &mtime); + err = utime_ts_from_tval (tvp, &atime, &mtime); + if (err) + return err; err = __file_utimens (port, atime, mtime); @@ -75,7 +95,9 @@ hurd_futimes (const file_t port, const struct timeval tvp[2]) { time_value_t atim, mtim; - utime_tvalue_from_tval (tvp, &atim, &mtim); + err = utime_tvalue_from_tval (tvp, &atim, &mtim); + if (err) + return err; err = __file_utimes (port, atim, mtim); } @@ -83,8 +105,16 @@ hurd_futimes (const file_t port, const struct timeval tvp[2]) return err; } +static inline bool +check_tspec (const struct timespec *tsp) +{ + return tsp->tv_nsec == UTIME_NOW + || tsp->tv_nsec == UTIME_OMIT + || tsp->tv_nsec >= 0 && tsp->tv_nsec < NSEC_PER_SEC; +} + /* Initializes atime/mtime timespec structures from an array of timespec. */ -static inline void +static inline error_t utime_ts_from_tspec (const struct timespec tsp[2], struct timespec *atime, struct timespec *mtime) { @@ -99,13 +129,19 @@ utime_ts_from_tspec (const struct timespec tsp[2], } else { + if (!check_tspec (&tsp[0])) + return EINVAL; + if (!check_tspec (&tsp[1])) + return EINVAL; + *atime = tsp[0]; *mtime = tsp[1]; } + return 0; } /* Initializes atime/mtime time_value_t structures from an array of timespec. 
*/ -static inline void +static inline error_t utime_tvalue_from_tspec (const struct timespec tsp[2], time_value_t *atime, time_value_t *mtime) { @@ -115,6 +151,11 @@ utime_tvalue_from_tspec (const struct timespec tsp[2], atime->microseconds = mtime->microseconds = -1; else { + if (!check_tspec (&tsp[0])) + return EINVAL; + if (!check_tspec (&tsp[1])) + return EINVAL; + if (tsp[0].tv_nsec == UTIME_NOW) atime->microseconds = -1; else if (tsp[0].tv_nsec == UTIME_OMIT) @@ -128,6 +169,7 @@ utime_tvalue_from_tspec (const struct timespec tsp[2], else TIMESPEC_TO_TIME_VALUE (mtime, &(tsp[1])); } + return 0; } /* Changes the access time of the file behind PORT using a timespec array. */ @@ -137,7 +179,9 @@ hurd_futimens (const file_t port, const struct timespec tsp[2]) error_t err; struct timespec atime, mtime; - utime_ts_from_tspec (tsp, &atime, &mtime); + err = utime_ts_from_tspec (tsp, &atime, &mtime); + if (err) + return err; err = __file_utimens (port, atime, mtime); @@ -145,7 +189,9 @@ hurd_futimens (const file_t port, const struct timespec tsp[2]) { time_value_t atim, mtim; - utime_tvalue_from_tspec (tsp, &atim, &mtim); + err = utime_tvalue_from_tspec (tsp, &atim, &mtim); + if (err) + return err; err = __file_utimes (port, atim, mtim); } diff --git a/sysdeps/mach/hurd/x86/trampoline.c b/sysdeps/mach/hurd/x86/trampoline.c index 8e2890f..6f23c56 100644 --- a/sysdeps/mach/hurd/x86/trampoline.c +++ b/sysdeps/mach/hurd/x86/trampoline.c @@ -26,7 +26,11 @@ #include "hurdfault.h" #include <intr-msg.h> #include <sys/ucontext.h> - +#ifdef __x86_64__ +#include <mach/x86_64/mach_i386.h> +#else +#include <mach/i386/mach_i386.h> +#endif /* Fill in a siginfo_t structure for SA_SIGINFO-enabled handlers. */ static void fill_siginfo (siginfo_t *si, int signo, @@ -106,6 +110,7 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action void firewall (void); void *sigsp; struct sigcontext *scp; + vm_size_t xstate_size; struct { union @@ -145,6 +150,14 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action struct hurd_userlink link; ucontext_t ucontext; siginfo_t siginfo; +#ifdef __x86_64__ + char _pad2[56]; +#else + char _pad2[20]; +#endif + char xstate[]; + /* Don't add anything after xstate, as it's dynamically + sized. */ } *stackframe; #ifdef __x86_64__ @@ -170,6 +183,17 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action if (! machine_get_basic_state (ss->thread, state)) return NULL; + /* Initialize the size of the CPU extended state, to be saved during + * signal handling */ +#ifdef i386_XFLOAT_STATE + _Static_assert ((sizeof(*stackframe) + sizeof(struct i386_xfloat_state)) % 64 == 0, + "stackframe size must be multiple of 64-byte minus " + "sizeof(struct i386_xfloat_state), please adjust _pad2"); + + if (__i386_get_xstate_size(__mach_host_self(), &xstate_size)) +#endif + xstate_size = 0; + /* Save the original SP in the gratuitous `esp' slot. We may need to reset the SP (the `uesp' slot) to avoid clobbering an interrupted RPC frame. */ @@ -196,14 +220,21 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action #endif } - /* Push the arguments to call `trampoline' on the stack. */ - sigsp -= sizeof (*stackframe); -#ifdef __x86_64__ - /* Align SP at 16 bytes. Coupled with the fact that sigreturn_addr is - 16-byte aligned within the stackframe struct, this ensures that it ends - up on a 16-byte aligned address, as required by the ABI. 
*/ - sigsp = (void *) ((uintptr_t) sigsp & ~15UL); -#endif + /* Push the arguments to call `trampoline' on the stack. + * The extended state might have a variable size depending on the platform, + * so we dynamically allocate it on the stack frame.*/ + sigsp -= sizeof (*stackframe) + xstate_size; + + /* Align SP at 64 bytes. This is needed for two reasons: + * - sigreturn_addr is 16-byte aligned within the stackframe + * struct, and this ensures that it ends up on a 16-byte aligned + * address, as required by the ABI. + * - the XSAVE state needs to be aligned at 64 bytes (on both i386 and + * x86_64), so we align the stackframe also at 64 bytes and add the + * required padding at the end, see the _pad2 field. + */ + sigsp = (void *) ((uintptr_t) sigsp & ~63UL); + stackframe = sigsp; if (_hurdsig_catch_memory_fault (stackframe)) @@ -248,14 +279,40 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action memcpy (&scp->sc_i386_thread_state, &state->basic, sizeof (state->basic)); - /* struct sigcontext is laid out so that starting at sc_fpkind mimics - a struct i386_float_state. */ - _Static_assert (offsetof (struct sigcontext, sc_i386_float_state) - % __alignof__ (struct i386_float_state) == 0, - "sc_i386_float_state layout mismatch"); - ok = machine_get_state (ss->thread, state, i386_FLOAT_STATE, - &state->fpu, &scp->sc_i386_float_state, - sizeof (state->fpu)); + scp->xstate = NULL; +#ifdef i386_XFLOAT_STATE + if (xstate_size > 0) + { + mach_msg_type_number_t got = (xstate_size / sizeof (int)); + + ok = (! __thread_get_state (ss->thread, i386_XFLOAT_STATE, + (thread_state_t) stackframe->xstate, &got) + && got == (xstate_size / sizeof (int))); + + if (ok && ((struct i386_xfloat_state*) stackframe->xstate)->fp_save_kind > 5) + /* We support up to XSAVES */ + ok = 0; + + if (ok) + { + scp->xstate = (struct i386_xfloat_state*) stackframe->xstate; + assert((uintptr_t)scp->xstate->hw_state % 64 == 0); + } + } + else +#endif + ok = 0; + if (!ok) + { + /* struct sigcontext is laid out so that starting at sc_fpkind mimics + a struct i386_float_state. */ + _Static_assert (offsetof (struct sigcontext, sc_i386_float_state) + % __alignof__ (struct i386_float_state) == 0, + "sc_i386_float_state layout mismatch"); + ok = machine_get_state (ss->thread, state, i386_FLOAT_STATE, + &state->fpu, &scp->sc_i386_float_state, + sizeof (state->fpu)); + } /* Set up the arguments for the signal handler. */ stackframe->signo = signo; @@ -404,7 +461,10 @@ _hurd_setup_sighandler (struct hurd_sigstate *ss, const struct sigaction *action - in gdb: gdb/i386-gnu-tdep.c gnu_sigtramp_code. */ #ifdef __x86_64__ -asm ("rpc_wait_trampoline:\n" +asm ("trampoline:\n" + "fnclex\n" /* Clear any pending exception. */ + "jmp _trampoline\n" + "rpc_wait_trampoline:\n" /* This is the entry point when we have an RPC reply message to receive before running the handler. The MACH_MSG_SEND bit has already been cleared in the OPTION argument in our %rsi. The interrupted user @@ -423,7 +483,7 @@ asm ("rpc_wait_trampoline:\n" /* Switch to the signal stack. */ "movq %rbx, %rsp\n" - "trampoline:\n" + "_trampoline:\n" /* Entry point for running the handler normally. The arguments to the handler function are on the top of the stack, same as in the i386 version: @@ -449,7 +509,10 @@ asm ("rpc_wait_trampoline:\n" "movq 16(%rsp), %rdi\n" "ret"); #else -asm ("rpc_wait_trampoline:\n"); +asm ("trampoline:\n" + "fnclex\n" /* Clear any pending exception. 
*/ + "jmp _trampoline\n" + "rpc_wait_trampoline:\n"); /* This is the entry point when we have an RPC reply message to receive before running the handler. The MACH_MSG_SEND bit has already been cleared in the OPTION argument on our stack. The interrupted user @@ -469,7 +532,7 @@ asm (/* Retry the interrupted mach_msg system call. */ /* Switch to the signal stack. */ "movl %ebx, %esp\n"); - asm ("trampoline:\n"); +asm ("_trampoline:\n"); /* Entry point for running the handler normally. The arguments to the handler function are already on the top of the stack: diff --git a/sysdeps/mach/hurd/x86_64/bits/sigcontext.h b/sysdeps/mach/hurd/x86_64/bits/sigcontext.h index 7bac881..d83795f 100644 --- a/sysdeps/mach/hurd/x86_64/bits/sigcontext.h +++ b/sysdeps/mach/hurd/x86_64/bits/sigcontext.h @@ -96,6 +96,8 @@ struct sigcontext struct i386_fp_save sc_fpsave; struct i386_fp_regs sc_fpregs; int sc_fpexcsr; /* FPSR including exception bits. */ + + struct i386_xfloat_state *xstate; }; /* Traditional BSD names for some members. */ diff --git a/sysdeps/mach/hurd/x86_64/sigreturn.c b/sysdeps/mach/hurd/x86_64/sigreturn.c index 81a2d3b..773c00f 100644 --- a/sysdeps/mach/hurd/x86_64/sigreturn.c +++ b/sysdeps/mach/hurd/x86_64/sigreturn.c @@ -20,6 +20,8 @@ #include <hurd/msg.h> #include <stdlib.h> +#include <cpuid.h> + /* This is run on the thread stack after restoring it, to be able to unlock SS off sigstack. */ void @@ -116,10 +118,35 @@ __sigreturn (struct sigcontext *scp) if (scp->sc_onstack) ss->sigaltstack.ss_flags &= ~SS_ONSTACK; - if (scp->sc_fpused) - /* Restore the FPU state. Mach conveniently stores the state - in the format the i387 `frstor' instruction uses to restore it. */ - asm volatile ("frstor %0" : : "m" (scp->sc_fpsave)); +#ifdef i386_XFLOAT_STATE + if (scp->xstate) + { + if (scp->xstate->initialized) + { + unsigned eax, ebx, ecx, edx; + __cpuid_count(0xd, 0, eax, ebx, ecx, edx); + switch (scp->xstate->fp_save_kind) + { + case 0: // FNSAVE + asm volatile("frstor %0" : : "m" (scp->xstate->hw_state)); + break; + case 1: // FXSAVE + asm volatile("fxrstor %0" : : "m" (scp->xstate->hw_state), \ + "a" (eax), "d" (edx)); + break; + default: // XSAVE, XSAVEOPT, XSAVEC, XSAVES + asm volatile("xrstor %0" : : "m" (scp->xstate->hw_state), \ + "a" (eax), "d" (edx)); + break; + } + } + } + else +#endif + if (scp->sc_fpused) + /* Restore the FPU state. Mach conveniently stores the state + in the format the i387 `frstor' instruction uses to restore it. */ + asm volatile ("frstor %0" : : "m" (scp->sc_fpsave)); /* Copy the registers onto the user's stack, to be able to release the altstack (by unlocking sigstate). Note that unless an altstack is used, diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h index d8d7c8b..89e26bb 100644 --- a/sysdeps/powerpc/powerpc64/dl-machine.h +++ b/sysdeps/powerpc/powerpc64/dl-machine.h @@ -363,7 +363,6 @@ elf_machine_runtime_setup (struct link_map *map, struct r_scope_elem *scope[], / sizeof (Elf64_Rela)); Elf64_Addr l_addr = map->l_addr; Elf64_Dyn **info = map->l_info; - char *p; extern void _dl_runtime_resolve (void); extern void _dl_profile_resolve (void); @@ -435,20 +434,6 @@ elf_machine_runtime_setup (struct link_map *map, struct r_scope_elem *scope[], offset += PLT_ENTRY_WORDS; glink_offset += GLINK_ENTRY_WORDS (i); } - - /* Now, we've modified data. We need to write the changes from - the data cache to a second-level unified cache, then make - sure that stale data in the instruction cache is removed. 
- (In a multiprocessor system, the effect is more complex.) - Most of the PLT shouldn't be in the instruction cache, but - there may be a little overlap at the start and the end. - - Assumes that dcbst and icbi apply to lines of 16 bytes or - more. Current known line sizes are 16, 32, and 128 bytes. */ - - for (p = (char *) plt; p < (char *) &plt[offset]; p += 16) - PPC_DCBST (p); - PPC_SYNC; } } return lazy; diff --git a/sysdeps/pthread/tst-stdio2.c b/sysdeps/pthread/tst-stdio2.c index 08948cb..0876ed6 100644 --- a/sysdeps/pthread/tst-stdio2.c +++ b/sysdeps/pthread/tst-stdio2.c @@ -75,7 +75,7 @@ do_test (void) exit (1); } - puts ("join returned succsefully"); + puts ("join returned successfully"); return 0; } diff --git a/sysdeps/unix/sysv/linux/bits/fcntl-linux.h b/sysdeps/unix/sysv/linux/bits/fcntl-linux.h index dfc554a..f425a4b 100644 --- a/sysdeps/unix/sysv/linux/bits/fcntl-linux.h +++ b/sysdeps/unix/sysv/linux/bits/fcntl-linux.h @@ -379,6 +379,8 @@ struct file_handle identity and may not be usable to open_by_handle_at. */ +# define AT_HANDLE_MNT_ID_UNIQUE 1 /* Return the 64-bit unique mount + ID. */ #endif __BEGIN_DECLS diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 9d136e4..e50f1d6 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -502,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load "Incorrect index_arch_Fast_Unaligned_Load"); -/* Intel Family-6 microarch list. */ -enum +/* Intel microarch list. */ +enum intel_microarch { /* Atom processors. */ INTEL_ATOM_BONNELL, @@ -512,6 +512,7 @@ enum INTEL_ATOM_GOLDMONT, INTEL_ATOM_GOLDMONT_PLUS, INTEL_ATOM_SIERRAFOREST, + INTEL_ATOM_CLEARWATERFOREST, INTEL_ATOM_GRANDRIDGE, INTEL_ATOM_TREMONT, @@ -539,7 +540,9 @@ enum INTEL_BIGCORE_METEORLAKE, INTEL_BIGCORE_LUNARLAKE, INTEL_BIGCORE_ARROWLAKE, + INTEL_BIGCORE_PANTHERLAKE, INTEL_BIGCORE_GRANITERAPIDS, + INTEL_BIGCORE_DIAMONDRAPIDS, /* Mixed (bigcore + atom SOC). */ INTEL_MIXED_LAKEFIELD, @@ -553,7 +556,7 @@ enum INTEL_UNKNOWN, }; -static unsigned int +static enum intel_microarch intel_get_fam6_microarch (unsigned int model, __attribute__ ((unused)) unsigned int stepping) { @@ -584,6 +587,8 @@ intel_get_fam6_microarch (unsigned int model, return INTEL_ATOM_GOLDMONT_PLUS; case 0xAF: return INTEL_ATOM_SIERRAFOREST; + case 0xDD: + return INTEL_ATOM_CLEARWATERFOREST; case 0xB6: return INTEL_ATOM_GRANDRIDGE; case 0x86: @@ -691,8 +696,12 @@ intel_get_fam6_microarch (unsigned int model, return INTEL_BIGCORE_METEORLAKE; case 0xbd: return INTEL_BIGCORE_LUNARLAKE; + case 0xb5: + case 0xc5: case 0xc6: return INTEL_BIGCORE_ARROWLAKE; + case 0xCC: + return INTEL_BIGCORE_PANTHERLAKE; case 0xAD: case 0xAE: return INTEL_BIGCORE_GRANITERAPIDS; @@ -756,133 +765,20 @@ init_cpu_features (struct cpu_features *cpu_features) cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] &= ~bit_arch_Avoid_Non_Temporal_Memset; + enum intel_microarch microarch = INTEL_UNKNOWN; if (family == 0x06) { model += extended_model; - unsigned int microarch - = intel_get_fam6_microarch (model, stepping); + microarch = intel_get_fam6_microarch (model, stepping); + /* Disable TSX on some processors to avoid TSX on kernels that + weren't updated with the latest microcode package (which + disables broken feature by default). */ switch (microarch) { - /* Atom / KNL tuning. */ - case INTEL_ATOM_BONNELL: - /* BSF is slow on Bonnell. 
*/ - cpu_features->preferred[index_arch_Slow_BSF] - |= bit_arch_Slow_BSF; - break; - - /* Unaligned load versions are faster than SSSE3 - on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ - case INTEL_ATOM_AIRMONT: - case INTEL_ATOM_SILVERMONT: - case INTEL_ATOM_GOLDMONT: - case INTEL_ATOM_GOLDMONT_PLUS: - - /* Knights Landing. Enable Silvermont optimizations. */ - case INTEL_KNIGHTS_LANDING: - - cpu_features->preferred[index_arch_Fast_Unaligned_Load] - |= (bit_arch_Fast_Unaligned_Load - | bit_arch_Fast_Unaligned_Copy - | bit_arch_Prefer_PMINUB_for_stringop - | bit_arch_Slow_SSE4_2); - break; - - case INTEL_ATOM_TREMONT: - /* Enable rep string instructions, unaligned load, unaligned - copy, pminub and avoid SSE 4.2 on Tremont. */ - cpu_features->preferred[index_arch_Fast_Rep_String] - |= (bit_arch_Fast_Rep_String - | bit_arch_Fast_Unaligned_Load - | bit_arch_Fast_Unaligned_Copy - | bit_arch_Prefer_PMINUB_for_stringop - | bit_arch_Slow_SSE4_2); - break; - - /* - Default tuned Knights microarch. - case INTEL_KNIGHTS_MILL: - */ - - /* - Default tuned atom microarch. - case INTEL_ATOM_SIERRAFOREST: - case INTEL_ATOM_GRANDRIDGE: - */ - - /* Bigcore/Default Tuning. */ default: - default_tuning: - /* Unknown family 0x06 processors. Assuming this is one - of Core i3/i5/i7 processors if AVX is available. */ - if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) - break; - - enable_modern_features: - /* Rep string instructions, unaligned load, unaligned copy, - and pminub are fast on Intel Core i3, i5 and i7. */ - cpu_features->preferred[index_arch_Fast_Rep_String] - |= (bit_arch_Fast_Rep_String - | bit_arch_Fast_Unaligned_Load - | bit_arch_Fast_Unaligned_Copy - | bit_arch_Prefer_PMINUB_for_stringop); break; - case INTEL_BIGCORE_NEHALEM: - case INTEL_BIGCORE_WESTMERE: - /* Older CPUs prefer non-temporal stores at lower threshold. */ - cpu_features->cachesize_non_temporal_divisor = 8; - goto enable_modern_features; - - /* Older Bigcore microarch (smaller non-temporal store - threshold). */ - case INTEL_BIGCORE_SANDYBRIDGE: - case INTEL_BIGCORE_IVYBRIDGE: - case INTEL_BIGCORE_HASWELL: - case INTEL_BIGCORE_BROADWELL: - cpu_features->cachesize_non_temporal_divisor = 8; - goto default_tuning; - - /* Newer Bigcore microarch (larger non-temporal store - threshold). */ - case INTEL_BIGCORE_SKYLAKE_AVX512: - case INTEL_BIGCORE_CANNONLAKE: - /* Benchmarks indicate non-temporal memset is not - necessarily profitable on SKX (and in some cases much - worse). This is likely unique to SKX due its it unique - mesh interconnect (not present on ICX or BWD). Disable - non-temporal on all Skylake servers. */ - cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] - |= bit_arch_Avoid_Non_Temporal_Memset; - /* fallthrough */ - case INTEL_BIGCORE_COMETLAKE: - case INTEL_BIGCORE_SKYLAKE: - case INTEL_BIGCORE_KABYLAKE: - case INTEL_BIGCORE_ICELAKE: - case INTEL_BIGCORE_TIGERLAKE: - case INTEL_BIGCORE_ROCKETLAKE: - case INTEL_BIGCORE_RAPTORLAKE: - case INTEL_BIGCORE_METEORLAKE: - case INTEL_BIGCORE_LUNARLAKE: - case INTEL_BIGCORE_ARROWLAKE: - case INTEL_BIGCORE_SAPPHIRERAPIDS: - case INTEL_BIGCORE_EMERALDRAPIDS: - case INTEL_BIGCORE_GRANITERAPIDS: - cpu_features->cachesize_non_temporal_divisor = 2; - goto default_tuning; - - /* Default tuned Mixed (bigcore + atom SOC). 
*/ - case INTEL_MIXED_LAKEFIELD: - case INTEL_MIXED_ALDERLAKE: - cpu_features->cachesize_non_temporal_divisor = 2; - goto default_tuning; - } - - /* Disable TSX on some processors to avoid TSX on kernels that - weren't updated with the latest microcode package (which - disables broken feature by default). */ - switch (microarch) - { case INTEL_BIGCORE_SKYLAKE_AVX512: /* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */ if (stepping <= 5) @@ -891,38 +787,163 @@ init_cpu_features (struct cpu_features *cpu_features) case INTEL_BIGCORE_KABYLAKE: /* NB: Although the errata documents that for model == 0x8e - (kabylake skylake client), only 0xb stepping or lower are - impacted, the intention of the errata was to disable TSX on - all client processors on all steppings. Include 0xc - stepping which is an Intel Core i7-8665U, a client mobile - processor. */ + (kabylake skylake client), only 0xb stepping or lower are + impacted, the intention of the errata was to disable TSX on + all client processors on all steppings. Include 0xc + stepping which is an Intel Core i7-8665U, a client mobile + processor. */ if (stepping > 0xc) break; /* Fall through. */ case INTEL_BIGCORE_SKYLAKE: - /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for - processors listed in: - -https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html - */ - disable_tsx: - CPU_FEATURE_UNSET (cpu_features, HLE); - CPU_FEATURE_UNSET (cpu_features, RTM); - CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); - break; + /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for + processors listed in: + + https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html + */ +disable_tsx: + CPU_FEATURE_UNSET (cpu_features, HLE); + CPU_FEATURE_UNSET (cpu_features, RTM); + CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); + break; case INTEL_BIGCORE_HASWELL: - /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working - TSX. Haswell also include other model numbers that have - working TSX. */ - if (model == 0x3f && stepping >= 4) + /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working + TSX. Haswell also includes other model numbers that have + working TSX. */ + if (model == 0x3f && stepping >= 4) break; - CPU_FEATURE_UNSET (cpu_features, RTM); - break; + CPU_FEATURE_UNSET (cpu_features, RTM); + break; } } + else if (family == 19) + switch (model) + { + case 0x01: + microarch = INTEL_BIGCORE_DIAMONDRAPIDS; + break; + default: + break; + } + + switch (microarch) + { + /* Atom / KNL tuning. */ + case INTEL_ATOM_BONNELL: + /* BSF is slow on Bonnell. */ + cpu_features->preferred[index_arch_Slow_BSF] + |= bit_arch_Slow_BSF; + break; + + /* Unaligned load versions are faster than SSSE3 + on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ + case INTEL_ATOM_AIRMONT: + case INTEL_ATOM_SILVERMONT: + case INTEL_ATOM_GOLDMONT: + case INTEL_ATOM_GOLDMONT_PLUS: + + /* Knights Landing. Enable Silvermont optimizations. */ + case INTEL_KNIGHTS_LANDING: + + cpu_features->preferred[index_arch_Fast_Unaligned_Load] + |= (bit_arch_Fast_Unaligned_Load + | bit_arch_Fast_Unaligned_Copy + | bit_arch_Prefer_PMINUB_for_stringop + | bit_arch_Slow_SSE4_2); + break; + + case INTEL_ATOM_TREMONT: + /* Enable rep string instructions, unaligned load, unaligned + copy, pminub and avoid SSE 4.2 on Tremont. 
*/ + cpu_features->preferred[index_arch_Fast_Rep_String] + |= (bit_arch_Fast_Rep_String + | bit_arch_Fast_Unaligned_Load + | bit_arch_Fast_Unaligned_Copy + | bit_arch_Prefer_PMINUB_for_stringop + | bit_arch_Slow_SSE4_2); + break; + + /* + Default tuned Knights microarch. + case INTEL_KNIGHTS_MILL: + */ + + /* + Default tuned atom microarch. + case INTEL_ATOM_SIERRAFOREST: + case INTEL_ATOM_GRANDRIDGE: + case INTEL_ATOM_CLEARWATERFOREST: + */ + + /* Bigcore/Default Tuning. */ + default: + default_tuning: + /* Unknown Intel processors. Assuming this is one of Core + i3/i5/i7 processors if AVX is available. */ + if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) + break; + + enable_modern_features: + /* Rep string instructions, unaligned load, unaligned copy, + and pminub are fast on Intel Core i3, i5 and i7. */ + cpu_features->preferred[index_arch_Fast_Rep_String] + |= (bit_arch_Fast_Rep_String + | bit_arch_Fast_Unaligned_Load + | bit_arch_Fast_Unaligned_Copy + | bit_arch_Prefer_PMINUB_for_stringop); + break; + + case INTEL_BIGCORE_NEHALEM: + case INTEL_BIGCORE_WESTMERE: + /* Older CPUs prefer non-temporal stores at lower threshold. */ + cpu_features->cachesize_non_temporal_divisor = 8; + goto enable_modern_features; + + /* Older Bigcore microarch (smaller non-temporal store + threshold). */ + case INTEL_BIGCORE_SANDYBRIDGE: + case INTEL_BIGCORE_IVYBRIDGE: + case INTEL_BIGCORE_HASWELL: + case INTEL_BIGCORE_BROADWELL: + cpu_features->cachesize_non_temporal_divisor = 8; + goto default_tuning; + + /* Newer Bigcore microarch (larger non-temporal store + threshold). */ + case INTEL_BIGCORE_SKYLAKE_AVX512: + case INTEL_BIGCORE_CANNONLAKE: + /* Benchmarks indicate non-temporal memset is not + necessarily profitable on SKX (and in some cases much + worse). This is likely unique to SKX due to its unique + mesh interconnect (not present on ICX or BWD). Disable + non-temporal on all Skylake servers. */ + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] + |= bit_arch_Avoid_Non_Temporal_Memset; + /* fallthrough */ + case INTEL_BIGCORE_COMETLAKE: + case INTEL_BIGCORE_SKYLAKE: + case INTEL_BIGCORE_KABYLAKE: + case INTEL_BIGCORE_ICELAKE: + case INTEL_BIGCORE_TIGERLAKE: + case INTEL_BIGCORE_ROCKETLAKE: + case INTEL_BIGCORE_RAPTORLAKE: + case INTEL_BIGCORE_METEORLAKE: + case INTEL_BIGCORE_LUNARLAKE: + case INTEL_BIGCORE_ARROWLAKE: + case INTEL_BIGCORE_PANTHERLAKE: + case INTEL_BIGCORE_SAPPHIRERAPIDS: + case INTEL_BIGCORE_EMERALDRAPIDS: + case INTEL_BIGCORE_GRANITERAPIDS: + case INTEL_BIGCORE_DIAMONDRAPIDS: + /* Default tuned Mixed (bigcore + atom SOC). */ + case INTEL_MIXED_LAKEFIELD: + case INTEL_MIXED_ALDERLAKE: + cpu_features->cachesize_non_temporal_divisor = 2; + goto default_tuning; + } /* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER if AVX512ER is available. 
Don't use AVX512 to avoid lower CPU diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index 9d31685..5723ec1 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -142,7 +142,6 @@ CFLAGS-tst-avxmod.c += $(AVX-CFLAGS) AVX512-CFLAGS = -mavx512f CFLAGS-tst-audit10-aux.c += $(AVX512-CFLAGS) CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS) -CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS) CFLAGS-tst-avx512-aux.c += $(AVX512-CFLAGS) CFLAGS-tst-avx512mod.c += $(AVX512-CFLAGS) diff --git a/sysdeps/x86_64/tst-auditmod10b.c b/sysdeps/x86_64/tst-auditmod10b.c index 6eb21b6..0b994ef 100644 --- a/sysdeps/x86_64/tst-auditmod10b.c +++ b/sysdeps/x86_64/tst-auditmod10b.c @@ -125,7 +125,6 @@ la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, #include <tst-audit.h> -#ifdef __AVX512F__ #include <immintrin.h> #include <cpuid.h> @@ -148,9 +147,37 @@ check_avx512 (void) return (eax & 0xe6) == 0xe6; } -#else -#include <emmintrin.h> -#endif +void +__attribute__ ((target ("avx512f"))) +pltenter_avx512f (La_regs *regs, long int *framesizep) +{ + __m512i zero = _mm512_setzero_si512 (); + if (memcmp (®s->lr_vector[0], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[1], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[2], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[3], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[4], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[5], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[6], &zero, sizeof (zero)) + || memcmp (®s->lr_vector[7], &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + regs->lr_vector[i].zmm[0] + = (La_x86_64_zmm) _mm512_set1_epi64 (i + 1); + + __m512i zmm = _mm512_set1_epi64 (-1); + asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" ); + asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" ); + asm volatile ("vmovdqa64 %0, %%zmm2" : : "x" (zmm) : "xmm2" ); + asm volatile ("vmovdqa64 %0, %%zmm3" : : "x" (zmm) : "xmm3" ); + asm volatile ("vmovdqa64 %0, %%zmm4" : : "x" (zmm) : "xmm4" ); + asm volatile ("vmovdqa64 %0, %%zmm5" : : "x" (zmm) : "xmm5" ); + asm volatile ("vmovdqa64 %0, %%zmm6" : : "x" (zmm) : "xmm6" ); + asm volatile ("vmovdqa64 %0, %%zmm7" : : "x" (zmm) : "xmm7" ); + + *framesizep = 1024; +} ElfW(Addr) pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, @@ -160,39 +187,33 @@ pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", symname, (long int) sym->st_value, ndx, *flags); -#ifdef __AVX512F__ if (check_avx512 () && strcmp (symname, "audit_test") == 0) + pltenter_avx512f (regs, framesizep); + + return sym->st_value; +} + +void +__attribute__ ((target ("avx512f"))) +pltexit_avx512f (const La_regs *inregs, La_retval *outregs) +{ + __m512i zero = _mm512_setzero_si512 (); + if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) { - __m512i zero = _mm512_setzero_si512 (); - if (memcmp (®s->lr_vector[0], &zero, sizeof (zero)) - || memcmp (®s->lr_vector[1], &zero, sizeof (zero)) - || memcmp (®s->lr_vector[2], &zero, sizeof (zero)) - || memcmp (®s->lr_vector[3], &zero, sizeof (zero)) - || memcmp (®s->lr_vector[4], &zero, sizeof (zero)) - || memcmp (®s->lr_vector[5], &zero, sizeof (zero)) - || memcmp (®s->lr_vector[6], &zero, sizeof (zero)) - || memcmp (®s->lr_vector[7], &zero, sizeof (zero))) - abort (); - - for (int i = 0; i < 8; i++) - regs->lr_vector[i].zmm[0] - = (La_x86_64_zmm) _mm512_set1_epi64 (i + 1); - - __m512i zmm = _mm512_set1_epi64 
(-1); - asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" ); - asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" ); - asm volatile ("vmovdqa64 %0, %%zmm2" : : "x" (zmm) : "xmm2" ); - asm volatile ("vmovdqa64 %0, %%zmm3" : : "x" (zmm) : "xmm3" ); - asm volatile ("vmovdqa64 %0, %%zmm4" : : "x" (zmm) : "xmm4" ); - asm volatile ("vmovdqa64 %0, %%zmm5" : : "x" (zmm) : "xmm5" ); - asm volatile ("vmovdqa64 %0, %%zmm6" : : "x" (zmm) : "xmm6" ); - asm volatile ("vmovdqa64 %0, %%zmm7" : : "x" (zmm) : "xmm7" ); - - *framesizep = 1024; + __m512i zmm = _mm512_set1_epi64 (i + 1); + if (memcmp (&inregs->lr_vector[i], &zmm, sizeof (zmm)) != 0) + abort (); } -#endif - return sym->st_value; + outregs->lrv_vector0.zmm[0] + = (La_x86_64_zmm) _mm512_set1_epi64 (0x12349876); + + __m512i zmm = _mm512_set1_epi64 (-1); + asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" ); + asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" ); } unsigned int @@ -204,28 +225,8 @@ pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, symname, (long int) sym->st_value, ndx, (ptrdiff_t) outregs->int_retval); -#ifdef __AVX512F__ if (check_avx512 () && strcmp (symname, "audit_test") == 0) - { - __m512i zero = _mm512_setzero_si512 (); - if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero))) - abort (); - - for (int i = 0; i < 8; i++) - { - __m512i zmm = _mm512_set1_epi64 (i + 1); - if (memcmp (&inregs->lr_vector[i], &zmm, sizeof (zmm)) != 0) - abort (); - } - - outregs->lrv_vector0.zmm[0] - = (La_x86_64_zmm) _mm512_set1_epi64 (0x12349876); - - __m512i zmm = _mm512_set1_epi64 (-1); - asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" ); - asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" ); - } -#endif + pltexit_avx512f (inregs, outregs); return 0; } |
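The dup3.c and fcntl.c hunks above share one pattern: __mach_port_mod_refs is now allowed to fail, KERN_UREFS_OVERFLOW is mapped to EMFILE, and when the second (ctty) ref cannot be taken the first ref is rolled back so the descriptor slot is left untouched. A minimal sketch of that pattern, using the public Mach names rather than glibc's internal __-prefixed aliases and __hurd_fail, could look like this (Hurd/Mach only):

```c
/* Hedged sketch of the ref-count handling added in dup3.c/fcntl.c.
   Public Mach API names are used here; the patch itself calls the
   internal __mach_port_mod_refs/__mach_task_self/__hurd_fail.  */
#include <errno.h>
#include <mach.h>

static int
take_descriptor_refs (mach_port_t port, mach_port_t ctty)
{
  kern_return_t err = mach_port_mod_refs (mach_task_self (), port,
                                          MACH_PORT_RIGHT_SEND, 1);
  if (err == KERN_UREFS_OVERFLOW)
    return EMFILE;            /* Too many user refs on the send right.  */
  if (err)
    return EINVAL;

  if (ctty != MACH_PORT_NULL)
    {
      err = mach_port_mod_refs (mach_task_self (), ctty,
                                MACH_PORT_RIGHT_SEND, 1);
      if (err)
        {
          /* Undo the ref already taken on PORT before failing.  */
          mach_port_mod_refs (mach_task_self (), port,
                              MACH_PORT_RIGHT_SEND, -1);
          return err == KERN_UREFS_OVERFLOW ? EMFILE : EINVAL;
        }
    }

  return 0;   /* Both refs held; the caller may install the descriptor.  */
}
```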
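Similarly, the utime-helper.c, futimens.c, and futimes.c hunks add up-front validation of the timeval/timespec arguments so that EINVAL is returned before any RPC is issued. A self-contained approximation of those checks follows; the literal 1000000 and 1000000000 bounds stand in for glibc's internal USEC_PER_SEC and NSEC_PER_SEC constants:

```c
/* Hedged sketch of the check_tval/check_tspec validation added in
   utime-helper.c, with glibc-internal constants replaced by literals.  */
#include <errno.h>
#include <stdbool.h>
#include <sys/stat.h>   /* UTIME_NOW, UTIME_OMIT */
#include <sys/time.h>

static bool
tval_ok (const struct timeval *tvp)
{
  return tvp->tv_usec >= 0 && tvp->tv_usec < 1000000;
}

static bool
tspec_ok (const struct timespec *tsp)
{
  return tsp->tv_nsec == UTIME_NOW
         || tsp->tv_nsec == UTIME_OMIT
         || (tsp->tv_nsec >= 0 && tsp->tv_nsec < 1000000000);
}

/* Callers reject bad input before forwarding it to the file server.  */
static int
validate_times (const struct timeval tvp[2])
{
  if (tvp != NULL && (!tval_ok (&tvp[0]) || !tval_ok (&tvp[1])))
    return EINVAL;
  return 0;
}
```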