diff options
179 files changed, 4463 insertions, 2562 deletions
@@ -40,6 +40,7 @@ The following CVEs were fixed in this release: The following bugs are resolved with this release: + [19341] ctype: Fallback initialization of TLS using relocations [19622] network: Support aliasing with struct sockaddr [27821] ungetc: Fix backup buffer leak on program exit [30081] resolv: Do not wait for non-existing second DNS response after error @@ -86,11 +87,13 @@ The following bugs are resolved with this release: (x86/tst-cpu-features-supports.c:69:3: error: parameter to builtin not valid: avx5124fmaps) [31798] pidfd_getpid.c is miscompiled by GCC 6.4 + [31840] Remove memory leak in fdopen [31867] build: "CPU ISA level is lower than required" on SSE2-free CPUs [31883] build: ISA level support configure check relies on bashism / is otherwise broken for arithmetic [31890] resolv: Allow short error responses to match any DNS query + [31943] _dl_find_object can fail if ld.so contains gaps between load segments [31965] rseq extension mechanism does not work as intended [31968] mremap implementation in C does not handle arguments correctly [32026] strerror/strsignal TLS not handled correctly for secondary namespaces @@ -99,6 +102,11 @@ The following bugs are resolved with this release: [32231] elf: Change ldconfig auxcache magic number [32470] x86: Avoid integer truncation with large cache sizes [32582] Fix underallocation of abort_msg_s struct (CVE-2025-0395) + [32810] Crash on x86-64 if XSAVEC disable via tunable + [32987] elf: Fix subprocess status handling for tst-dlopen-sgid + [33185] Fix double-free after allocation failure in regcomp + [32483] ctype macros segfault in multithreaded programs with multiple libc.so + [33234] Use TLS initial-exec model for __libc_tsd_CTYPE_* thread variables Version 2.39 diff --git a/benchtests/atanh-inputs b/benchtests/atanh-inputs index 455aa65..4985293 100644 --- a/benchtests/atanh-inputs +++ b/benchtests/atanh-inputs @@ -1,6 +1,7 @@ ## args: double ## ret: double ## includes: math.h +## name: workload-random 
0x1.5a2730bacd94ap-1 -0x1.b57eb40fc048ep-21 -0x1.c0b185fb450e2p-17 @@ -4763,6 +4763,9 @@ with_fp_cond=1 # A preconfigure script may define another name to TLS descriptor variant mtls_descriptor=gnu2 +# A preconfigure script may define another name to traditional TLS variant +mtls_traditional=gnu + if frags=`ls -d $srcdir/sysdeps/*/preconfigure 2> /dev/null` then { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for sysdeps preconfigure fragments" >&5 @@ -7042,6 +7045,39 @@ printf "%s\n" "$libc_cv_mtls_descriptor" >&6; } config_vars="$config_vars have-mtls-descriptor = $libc_cv_mtls_descriptor" +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for traditional tls support" >&5 +printf %s "checking for traditional tls support... " >&6; } +if test ${libc_cv_test_mtls_traditional+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) cat > conftest.c <<EOF +__thread int i; +void foo (void) +{ + i = 10; +} +EOF +if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=$mtls_traditional -nostdlib -nostartfiles + -shared conftest.c -o conftest 1>&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; } +then + libc_cv_test_mtls_traditional=$mtls_traditional +else + libc_cv_test_mtls_traditional=no +fi +rm -f conftest* ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_test_mtls_traditional" >&5 +printf "%s\n" "$libc_cv_test_mtls_traditional" >&6; } +config_vars="$config_vars +have-test-mtls-traditional = $libc_cv_test_mtls_traditional" + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking if -Wno-ignored-attributes is required for aliases" >&5 printf %s "checking if -Wno-ignored-attributes is required for aliases... 
" >&6; } if test ${libc_cv_wno_ignored_attributes+y} diff --git a/configure.ac b/configure.ac index bdc385d..a313397 100644 --- a/configure.ac +++ b/configure.ac @@ -445,6 +445,9 @@ with_fp_cond=1 # A preconfigure script may define another name to TLS descriptor variant mtls_descriptor=gnu2 +# A preconfigure script may define another name to traditional TLS variant +mtls_traditional=gnu + dnl Let sysdeps/*/preconfigure act here. LIBC_PRECONFIGURE([$srcdir], [for sysdeps]) @@ -1310,6 +1313,28 @@ rm -f conftest*]) AC_SUBST(libc_cv_mtls_descriptor) LIBC_CONFIG_VAR([have-mtls-descriptor], [$libc_cv_mtls_descriptor]) +dnl Check if CC supports traditional tls. +AC_CACHE_CHECK([for traditional tls support], + libc_cv_test_mtls_traditional, +[dnl +cat > conftest.c <<EOF +__thread int i; +void foo (void) +{ + i = 10; +} +EOF +if AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS -fPIC -mtls-dialect=$mtls_traditional -nostdlib -nostartfiles + -shared conftest.c -o conftest 1>&AS_MESSAGE_LOG_FD]) +then + libc_cv_test_mtls_traditional=$mtls_traditional +else + libc_cv_test_mtls_traditional=no +fi +rm -f conftest*]) +LIBC_CONFIG_VAR([have-test-mtls-traditional], + [$libc_cv_test_mtls_traditional]) + dnl clang emits an warning for a double alias redirection, to warn the dnl original symbol is sed even when weak definition overrides it. 
dnl It is a usual pattern for weak_alias, where multiple alias point to diff --git a/ctype/Makefile b/ctype/Makefile index 717d020..b7cd5f2 100644 --- a/ctype/Makefile +++ b/ctype/Makefile @@ -24,9 +24,35 @@ include ../Makeconfig headers := ctype.h -routines := ctype ctype-c99 ctype-extn ctype-c99_l ctype_l isctype -aux := ctype-info - -tests := test_ctype +routines := \ + ctype \ + ctype-c99 \ + ctype-c99_l \ + ctype-extn \ + ctype_l \ + isctype \ + # routines +aux := ctype-info + +tests := \ + test_ctype \ + tst-ctype-tls-dlmopen \ + tst-ctype-tls-dlopen-static \ + # tests + +tests-static := \ + tst-ctype-tls-dlopen-static \ + # tests-static + +modules-names := \ + tst-ctype-tls-mod \ + # modules-names include ../Rules + +$(objpfx)tst-ctype-tls-dlmopen: $(shared-thread-library) +$(objpfx)tst-ctype-tls-dlmopen.out: $(objpfx)tst-ctype-tls-mod.so +$(objpfx)tst-ctype-tls-dlopen-static: $(static-thread-library) +$(objpfx)tst-ctype-tls-dlopen-static.out: $(objpfx)tst-ctype-tls-mod.so +tst-ctype-tls-dlopen-static-ENV = \ + LD_LIBRARY_PATH=$(ld-library-path):$(common-objpfx):$(common-objpfx)elf diff --git a/ctype/ctype-info.c b/ctype/ctype-info.c index 9032547..b6cdf7e 100644 --- a/ctype/ctype-info.c +++ b/ctype/ctype-info.c @@ -19,20 +19,28 @@ #include <ctype.h> #include <locale/localeinfo.h> -__libc_tsd_define (, const uint16_t *, CTYPE_B) -__libc_tsd_define (, const int32_t *, CTYPE_TOLOWER) -__libc_tsd_define (, const int32_t *, CTYPE_TOUPPER) +/* Fallback initialization using relocations. See the _nl_C_locobj + initializers in locale/xlocale.c. Usually, this is overwritten by + __ctype_init before user code runs, but this does not happen for + threads in secondary namespaces. With the initializers, secondary + namespaces at least get locale data from the C locale. 
*/ +__thread const uint16_t * __libc_tsd_CTYPE_B attribute_tls_model_ie + = (const uint16_t *) _nl_C_LC_CTYPE_class + 128; +__thread const int32_t * __libc_tsd_CTYPE_TOLOWER attribute_tls_model_ie + = (const int32_t *) _nl_C_LC_CTYPE_tolower + 128; +__thread const int32_t * __libc_tsd_CTYPE_TOUPPER attribute_tls_model_ie + = (const int32_t *) _nl_C_LC_CTYPE_toupper + 128; void __ctype_init (void) { - const uint16_t **bp = __libc_tsd_address (const uint16_t *, CTYPE_B); - *bp = (const uint16_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_CLASS) + 128; - const int32_t **up = __libc_tsd_address (const int32_t *, CTYPE_TOUPPER); - *up = ((int32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TOUPPER) + 128); - const int32_t **lp = __libc_tsd_address (const int32_t *, CTYPE_TOLOWER); - *lp = ((int32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TOLOWER) + 128); + __libc_tsd_CTYPE_B + = ((const uint16_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_CLASS)) + 128; + __libc_tsd_CTYPE_TOUPPER + = ((const int32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TOUPPER)) + 128; + __libc_tsd_CTYPE_TOLOWER = + ((const int32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TOLOWER)) + 128; } libc_hidden_def (__ctype_init) @@ -41,10 +49,7 @@ libc_hidden_def (__ctype_init) #if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3) /* Defined in locale/C-ctype.c. 
*/ -extern const char _nl_C_LC_CTYPE_class[] attribute_hidden; extern const char _nl_C_LC_CTYPE_class32[] attribute_hidden; -extern const char _nl_C_LC_CTYPE_toupper[] attribute_hidden; -extern const char _nl_C_LC_CTYPE_tolower[] attribute_hidden; extern const char _nl_C_LC_CTYPE_class_upper[] attribute_hidden; extern const char _nl_C_LC_CTYPE_class_lower[] attribute_hidden; extern const char _nl_C_LC_CTYPE_class_alpha[] attribute_hidden; diff --git a/ctype/tst-ctype-tls-dlmopen.c b/ctype/tst-ctype-tls-dlmopen.c new file mode 100644 index 0000000..f7eeb65 --- /dev/null +++ b/ctype/tst-ctype-tls-dlmopen.c @@ -0,0 +1,2 @@ +#define DO_STATIC_TEST 0 +#include "tst-ctype-tls-skeleton.c" diff --git a/ctype/tst-ctype-tls-dlopen-static.c b/ctype/tst-ctype-tls-dlopen-static.c new file mode 100644 index 0000000..c2c09c3 --- /dev/null +++ b/ctype/tst-ctype-tls-dlopen-static.c @@ -0,0 +1,2 @@ +#define DO_STATIC_TEST 1 +#include "tst-ctype-tls-skeleton.c" diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/ctype/tst-ctype-tls-mod.c index 1a9f606..52cbb9d 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S +++ b/ctype/tst-ctype-tls-mod.c @@ -1,5 +1,5 @@ -/* Optimized strcmp implementation for POWER10/PPC64. - Copyright (C) 2021-2024 Free Software Foundation, Inc. +/* Wrappers for <ctype.h> macros in a secondary namespace. + Copyright (C) 2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,11 +16,22 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#if defined __LITTLE_ENDIAN__ && IS_IN (libc) -#define STRCMP __strcmp_power10 +#include <ctype.h> -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) +int +my_isalpha (int ch) +{ + return isalpha (ch); +} -#include <sysdeps/powerpc/powerpc64/le/power10/strcmp.S> -#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */ +int +my_toupper (int ch) +{ + return toupper (ch); +} + +int +my_tolower (int ch) +{ + return tolower (ch); +} diff --git a/ctype/tst-ctype-tls-skeleton.c b/ctype/tst-ctype-tls-skeleton.c new file mode 100644 index 0000000..8c53e35 --- /dev/null +++ b/ctype/tst-ctype-tls-skeleton.c @@ -0,0 +1,67 @@ +/* Test that <ctype.h> in a secondary namespace works. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Before this file is included, define DO_STATIC_TEST to 0 or 1. + With 0, dlmopen is used for the test. With 1, dlopen is used. 
*/ + +#include <stddef.h> +#include <stdlib.h> +#include <support/check.h> +#include <support/support.h> +#include <support/xdlfcn.h> +#include <support/xthread.h> + +static int (*my_isalpha) (int); +static int (*my_toupper) (int); +static int (*my_tolower) (int); + +static void * +checks (void *ignore) +{ + TEST_VERIFY (my_isalpha ('a')); + TEST_VERIFY (!my_isalpha ('0')); + TEST_COMPARE (my_toupper ('a'), 'A'); + TEST_COMPARE (my_toupper ('A'), 'A'); + TEST_COMPARE (my_tolower ('a'), 'a'); + TEST_COMPARE (my_tolower ('A'), 'a'); + return NULL; +} + +static int +do_test (void) +{ + char *dso = xasprintf ("%s/ctype/tst-ctype-tls-mod.so", support_objdir_root); +#if DO_STATIC_TEST + void *handle = xdlopen (dso, RTLD_LAZY); +#else + void *handle = xdlmopen (LM_ID_NEWLM, dso, RTLD_LAZY); +#endif + my_isalpha = xdlsym (handle, "my_isalpha"); + my_toupper = xdlsym (handle, "my_toupper"); + my_tolower = xdlsym (handle, "my_tolower"); + + checks (NULL); + xpthread_join (xpthread_create (NULL, checks, NULL)); + + xdlclose (handle); + free (dso); + + return 0; +} + +#include <support/test-driver.c> diff --git a/debug/Makefile b/debug/Makefile index 3903cc9..76c311d 100644 --- a/debug/Makefile +++ b/debug/Makefile @@ -287,6 +287,7 @@ tests = \ tst-fortify-wide \ tst-longjmp_chk \ tst-longjmp_chk2 \ + tst-longjmp_chk3 \ tst-realpath-chk \ tst-sprintf-fortify-rdonly \ tst-sprintf-fortify-unchecked \ diff --git a/debug/tst-longjmp_chk3.c b/debug/tst-longjmp_chk3.c index 9ff9977..9b9db3b 100644 --- a/debug/tst-longjmp_chk3.c +++ b/debug/tst-longjmp_chk3.c @@ -18,9 +18,13 @@ #include <setjmp.h> #include <signal.h> +#include <stdio.h> #include <string.h> +#include <unistd.h> -static char buf[SIGSTKSZ * 4]; +#include <support/support.h> + +static char *buf; static jmp_buf jb; static void @@ -49,8 +53,10 @@ do_test (void) set_fortify_handler (handler); /* Create a valid signal stack and enable it. 
*/ + size_t bufsize = SIGSTKSZ * 4; + buf = xmalloc (bufsize); ss.ss_sp = buf; - ss.ss_size = sizeof (buf); + ss.ss_size = bufsize; ss.ss_flags = 0; if (sigaltstack (&ss, NULL) < 0) { @@ -65,8 +71,8 @@ do_test (void) /* Shrink the signal stack so the jmpbuf is now invalid. We adjust the start & end to handle stacks that grow up & down. */ - ss.ss_sp = buf + sizeof (buf) / 2; - ss.ss_size = sizeof (buf) / 4; + ss.ss_sp = buf + bufsize / 2; + ss.ss_size = bufsize / 4; if (sigaltstack (&ss, NULL) < 0) { printf ("second sigaltstack failed: %m\n"); diff --git a/elf/Makefile b/elf/Makefile index 8a5678a..381b598 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -57,6 +57,7 @@ dl-routines = \ dl-close \ dl-debug \ dl-debug-symbols \ + dl-debug_state \ dl-deps \ dl-exception \ dl-execstack \ @@ -266,6 +267,7 @@ tests-static-normal := \ tst-array1-static \ tst-array5-static \ tst-dl-iter-static \ + tst-dlopen-sgid \ tst-dst-static \ tst-env-setuid-static \ tst-getauxval-static \ @@ -376,6 +378,7 @@ tests += \ tst-align3 \ tst-audit-tlsdesc \ tst-audit-tlsdesc-dlopen \ + tst-audit-tlsdesc-dlopen2 \ tst-audit1 \ tst-audit2 \ tst-audit8 \ @@ -412,6 +415,10 @@ tests += \ tst-dlmopen1 \ tst-dlmopen3 \ tst-dlmopen4 \ + tst-dlmopen4-nonpic \ + tst-dlmopen4-pic \ + tst-dlopen-auditdup \ + tst-dlopen-constructor-null \ tst-dlopen-self \ tst-dlopen-tlsmodid \ tst-dlopen-tlsreinit1 \ @@ -450,6 +457,9 @@ tests += \ tst-recursive-tls \ tst-relsort1 \ tst-ro-dynamic \ + tst-rtld-no-malloc \ + tst-rtld-no-malloc-audit \ + tst-rtld-no-malloc-preload \ tst-rtld-run-static \ tst-single_threaded \ tst-single_threaded-pthread \ @@ -474,6 +484,7 @@ tests += \ tst-tls19 \ tst-tls20 \ tst-tls21 \ + tst-tls23 \ tst-tlsalign \ tst-tlsalign-extern \ tst-tlsgap \ @@ -481,6 +492,7 @@ tests += \ tst-unique2 \ tst-unwind-ctor \ tst-unwind-main \ + tst-version-hash-zero \ unload3 \ unload4 \ unload5 \ @@ -511,6 +523,8 @@ tests-internal += \ tst-dl_find_object \ tst-dl_find_object-threads \ tst-dlmopen2 \ + 
tst-link-map-contiguous-ldso \ + tst-link-map-contiguous-libc \ tst-ptrguard1 \ tst-stackguard1 \ tst-tls-surplus \ @@ -522,6 +536,10 @@ tests-internal += \ unload2 \ # tests-internal +ifeq ($(build-hardcoded-path-in-tests),yes) +tests-internal += tst-link-map-contiguous-main +endif + tests-container += \ tst-dlopen-self-container \ tst-dlopen-tlsmodid-container \ @@ -802,6 +820,7 @@ modules-names += \ tst-auditmanymod8 \ tst-auditmanymod9 \ tst-auditmod-tlsdesc \ + tst-auditmod-tlsdesc2 \ tst-auditmod1 \ tst-auditmod11 \ tst-auditmod12 \ @@ -842,6 +861,11 @@ modules-names += \ tst-dlmopen-twice-mod1 \ tst-dlmopen-twice-mod2 \ tst-dlmopen1mod \ + tst-dlopen-auditdup-auditmod \ + tst-dlopen-auditdupmod \ + tst-dlopen-constructor-null-mod1 \ + tst-dlopen-constructor-null-mod2 \ + tst-dlopen-sgid-mod \ tst-dlopen-tlsreinitmod1 \ tst-dlopen-tlsreinitmod2 \ tst-dlopen-tlsreinitmod3 \ @@ -926,6 +950,7 @@ modules-names += \ tst-tls19mod3 \ tst-tls20mod-bad \ tst-tls21mod \ + tst-tls23-mod \ tst-tlsalign-lib \ tst-tlsgap-mod0 \ tst-tlsgap-mod1 \ @@ -956,6 +981,9 @@ modules-names += \ tst-unique2mod1 \ tst-unique2mod2 \ tst-unwind-ctor-lib \ + tst-version-hash-zero-linkmod \ + tst-version-hash-zero-mod \ + tst-version-hash-zero-refmod \ unload2dep \ unload2mod \ unload3mod1 \ @@ -2032,6 +2060,13 @@ $(objpfx)tst-dlmopen3.out: $(objpfx)tst-dlmopen1mod.so $(objpfx)tst-dlmopen4.out: $(objpfx)tst-dlmopen1mod.so +CFLAGS-tst-dlmopen4-pic.c += -fPIC +$(objpfx)tst-dlmopen4-pic.out: $(objpfx)tst-dlmopen1mod.so + +CFLAGS-tst-dlmopen4-nonpic.c += -fno-pie +tst-dlmopen4-nonpic-no-pie = yes +$(objpfx)tst-dlmopen4-nonpic.out: $(objpfx)tst-dlmopen1mod.so + $(objpfx)tst-audit1.out: $(objpfx)tst-auditmod1.so tst-audit1-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so @@ -3012,6 +3047,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so $(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so 
tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so +$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \ + $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules)) +tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so $(objpfx)tst-dlmopen-twice.out: \ $(objpfx)tst-dlmopen-twice-mod1.so \ @@ -3090,6 +3128,13 @@ CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=$(have-mtls-descriptor) CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=$(have-mtls-descriptor) endif +$(objpfx)tst-tls23: $(shared-thread-library) +$(objpfx)tst-tls23.out: $(objpfx)tst-tls23-mod.so + +ifneq (no,$(have-test-mtls-traditional)) +CFLAGS-tst-tls23-mod.c += -mtls-dialect=$(have-test-mtls-traditional) +endif + $(objpfx)tst-recursive-tls: $(objpfx)tst-recursive-tlsmallocmod.so # More objects than DTV_SURPLUS, to trigger DTV reallocation. $(objpfx)tst-recursive-tls.out: \ @@ -3120,3 +3165,55 @@ $(objpfx)tst-dlopen-tlsreinit3.out: $(objpfx)tst-auditmod1.so tst-dlopen-tlsreinit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so $(objpfx)tst-dlopen-tlsreinit4.out: $(objpfx)tst-auditmod1.so tst-dlopen-tlsreinit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so + +$(objpfx)tst-dlopen-sgid.out: $(objpfx)tst-dlopen-sgid-mod.so + +tst-dlopen-auditdup-ENV = LD_AUDIT=$(objpfx)tst-dlopen-auditdup-auditmod.so +$(objpfx)tst-dlopen-auditdup.out: \ + $(objpfx)tst-dlopen-auditdupmod.so $(objpfx)tst-dlopen-auditdup-auditmod.so + +# Reuse an audit module which provides ample debug logging. +tst-rtld-no-malloc-audit-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so + +# Any shared object should do. +tst-rtld-no-malloc-preload-ENV = LD_PRELOAD=$(objpfx)tst-auditmod1.so + +# These rules link and run the special elf/tst-nolink-libc-* tests if +# a port adds them to the tests variables. Neither test variant is +# linked against libc.so, but tst-nolink-libc-1 is linked against +# ld.so. The test is always run directly, not under the dynamic +# linker. 
+CFLAGS-tst-nolink-libc.c += $(no-stack-protector) +$(objpfx)tst-nolink-libc-1: $(objpfx)tst-nolink-libc.o $(objpfx)ld.so + $(LINK.o) -nostdlib -nostartfiles -o $@ $< \ + -Wl,--dynamic-linker=$(objpfx)ld.so,--no-as-needed $(objpfx)ld.so +$(objpfx)tst-nolink-libc-1.out: $(objpfx)tst-nolink-libc-1 $(objpfx)ld.so + $< > $@ 2>&1; $(evaluate-test) +$(objpfx)tst-nolink-libc-2: $(objpfx)tst-nolink-libc.o + $(LINK.o) -nostdlib -nostartfiles -o $@ $< \ + -Wl,--dynamic-linker=$(objpfx)ld.so +$(objpfx)tst-nolink-libc-2.out: $(objpfx)tst-nolink-libc-2 $(objpfx)ld.so + $< > $@ 2>&1; $(evaluate-test) + +$(objpfx)tst-version-hash-zero.out: \ + $(objpfx)tst-version-hash-zero-mod.so \ + $(objpfx)tst-version-hash-zero-refmod.so +LDFLAGS-tst-version-hash-zero-mod.so = \ + -Wl,--version-script=tst-version-hash-zero-mod.map +# The run-time test module tst-version-hash-zero-refmod.so is linked +# to a stub module, tst-version-hash-zero-linkmod.so, to produce an +# expected relocation error. +$(objpfx)tst-version-hash-zero-refmod.so: \ + $(objpfx)tst-version-hash-zero-linkmod.so +LDFLAGS-tst-version-hash-zero-linkmod.so = \ + -Wl,--version-script=tst-version-hash-zero-linkmod.map \ + -Wl,--soname=tst-version-hash-zero-mod.so +$(objpfx)tst-version-hash-zero-refmod.so: \ + $(objpfx)tst-version-hash-zero-linkmod.so +tst-version-hash-zero-refmod.so-no-z-defs = yes + +$(objpfx)tst-dlopen-constructor-null: \ + $(objpfx)tst-dlopen-constructor-null-mod1.so \ + $(objpfx)tst-dlopen-constructor-null-mod2.so +$(objpfx)tst-dlopen-constructor-null-mod2.so: \ + $(objpfx)tst-dlopen-constructor-null-mod1.so diff --git a/elf/dl-close.c b/elf/dl-close.c index 8822624..fb27a12 100644 --- a/elf/dl-close.c +++ b/elf/dl-close.c @@ -264,6 +264,12 @@ _dl_close_worker (struct link_map *map, bool force) _dl_catch_exception (NULL, _dl_call_fini, imap); #ifdef SHARED + /* Auditing checkpoint: we will start deleting objects. 
+ This is supposed to happen before la_objclose (see _dl_fini), + but only once per non-recursive dlclose call. */ + if (!unload_any) + _dl_audit_activity_nsid (nsid, LA_ACT_DELETE); + /* Auditing checkpoint: we remove an object. */ _dl_audit_objclose (imap); #endif @@ -424,15 +430,10 @@ _dl_close_worker (struct link_map *map, bool force) if (!unload_any) goto out; -#ifdef SHARED - /* Auditing checkpoint: we will start deleting objects. */ - _dl_audit_activity_nsid (nsid, LA_ACT_DELETE); -#endif - - /* Notify the debugger we are about to remove some loaded objects. */ + /* Notify the debugger we are about to remove some loaded objects. + LA_ACT_DELETE has already been signalled above for !unload_any. */ struct r_debug *r = _dl_debug_update (nsid); - r->r_state = RT_DELETE; - _dl_debug_state (); + _dl_debug_change_state (r, RT_DELETE); LIBC_PROBE (unmap_start, 2, nsid, r); if (unload_global) @@ -723,6 +724,10 @@ _dl_close_worker (struct link_map *map, bool force) /* TLS is cleaned up for the unloaded modules. */ __rtld_lock_unlock_recursive (GL(dl_load_tls_lock)); + /* Notify the debugger those objects are finalized and gone. */ + _dl_debug_change_state (r, RT_CONSISTENT); + LIBC_PROBE (unmap_complete, 2, nsid, r); + #ifdef SHARED /* Auditing checkpoint: we have deleted all objects. Also, do not notify auditors of the cleanup of a failed audit module loading attempt. */ @@ -735,11 +740,6 @@ _dl_close_worker (struct link_map *map, bool force) --GL(dl_nns); while (GL(dl_ns)[GL(dl_nns) - 1]._ns_loaded == NULL); - /* Notify the debugger those objects are finalized and gone. */ - r->r_state = RT_CONSISTENT; - _dl_debug_state (); - LIBC_PROBE (unmap_complete, 2, nsid, r); - /* Recheck if we need to retry, release the lock. 
*/ out: if (dl_close_state == rerun) diff --git a/elf/dl-debug-symbols.S b/elf/dl-debug-symbols.S index 4e35ade..33f0fc7 100644 --- a/elf/dl-debug-symbols.S +++ b/elf/dl-debug-symbols.S @@ -38,3 +38,4 @@ _r_debug: _r_debug_extended: .zero R_DEBUG_EXTENDED_SIZE +rtld_hidden_def (_r_debug) diff --git a/elf/dl-debug.c b/elf/dl-debug.c index ef56de7..df36d61 100644 --- a/elf/dl-debug.c +++ b/elf/dl-debug.c @@ -16,6 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ +#include <assert.h> #include <ldsodefs.h> @@ -30,23 +31,86 @@ extern const int verify_link_map_members[(VERIFY_MEMBER (l_addr) && VERIFY_MEMBER (l_prev)) ? 1 : -1]; +#ifdef SHARED +/* r_debug structs for secondary namespaces. The first namespace is + handled separately because its r_debug structure must overlap with + the public _r_debug symbol, so the first array element corresponds + to LM_ID_BASE + 1. See elf/dl-debug-symbols.S. */ +struct r_debug_extended _r_debug_array[DL_NNS - 1]; + +/* If not null, pointer to the _r_debug in the main executable. */ +static struct r_debug *_r_debug_main; + +void +_dl_debug_post_relocate (struct link_map *main_map) +{ + /* Perform a full symbol search in all objects, to maintain + compatibility if interposed _r_debug definitions. The lookup + cannot fail because there is a definition in ld.so, and this + function is only called if the ld.so search scope is not empty. */ + const ElfW(Sym) *sym = NULL; + lookup_t result =_dl_lookup_symbol_x ("_r_debug", main_map, &sym, + main_map->l_scope, NULL, 0, 0, NULL); + if (sym->st_size >= sizeof (struct r_debug)) + { + struct r_debug *main_r_debug = DL_SYMBOL_ADDRESS (result, sym); + if (main_r_debug != &_r_debug_extended.base) + { + /* The extended version of the struct is not available in + the main executable because a copy relocation has been + used. r_map etc. have already been copied as part of the + copy relocation processing. 
*/ + main_r_debug->r_version = 1; + + /* Record that dual updates of the initial link map are + required. */ + _r_debug_main = main_r_debug; + } + } +} + +/* Return the r_debug object for the namespace NS. */ +static inline struct r_debug_extended * +get_rdebug (Lmid_t ns) +{ + if (ns == LM_ID_BASE) + return &_r_debug_extended; + else + return &_r_debug_array[ns - 1]; +} +#else /* !SHARED */ +static inline struct r_debug_extended * +get_rdebug (Lmid_t ns) +{ + return &_r_debug_extended; /* There is just one namespace. */ +} +#endif /* !SHARED */ + /* Update the `r_map' member and return the address of `struct r_debug' of the namespace NS. */ struct r_debug * _dl_debug_update (Lmid_t ns) { - struct r_debug_extended *r; - if (ns == LM_ID_BASE) - r = &_r_debug_extended; - else - r = &GL(dl_ns)[ns]._ns_debug; + struct r_debug_extended *r = get_rdebug (ns); if (r->base.r_map == NULL) atomic_store_release (&r->base.r_map, (void *) GL(dl_ns)[ns]._ns_loaded); return &r->base; } +void +_dl_debug_change_state (struct r_debug *r, int state) +{ + atomic_store_release (&r->r_state, state); +#ifdef SHARED + if (r == &_r_debug_extended.base && _r_debug_main != NULL) + /* Update the copy-relocation of _r_debug. */ + atomic_store_release (&_r_debug_main->r_state, state); +#endif + _dl_debug_state (); +} + /* Initialize _r_debug_extended for the namespace NS. LDBASE is the run-time load address of the dynamic linker, to be put in _r_debug_extended.r_ldbase. Return the address of _r_debug. */ @@ -54,34 +118,7 @@ _dl_debug_update (Lmid_t ns) struct r_debug * _dl_debug_initialize (ElfW(Addr) ldbase, Lmid_t ns) { - struct r_debug_extended *r, **pp = NULL; - - if (ns == LM_ID_BASE) - { - r = &_r_debug_extended; - /* Initialize r_version to 1. */ - if (_r_debug_extended.base.r_version == 0) - _r_debug_extended.base.r_version = 1; - } - else if (DL_NNS > 1) - { - r = &GL(dl_ns)[ns]._ns_debug; - if (r->base.r_brk == 0) - { - /* Add the new namespace to the linked list. 
After a namespace - is initialized, r_brk becomes non-zero. A namespace becomes - empty (r_map == NULL) when it is unused. But it is never - removed from the linked list. */ - struct r_debug_extended *p; - for (pp = &_r_debug_extended.r_next; - (p = *pp) != NULL; - pp = &p->r_next) - ; - - r->base.r_version = 2; - } - } - + struct r_debug_extended *r = get_rdebug (ns); if (r->base.r_brk == 0) { /* Tell the debugger where to find the map of loaded objects. @@ -89,30 +126,44 @@ _dl_debug_initialize (ElfW(Addr) ldbase, Lmid_t ns) only once. */ r->base.r_ldbase = ldbase ?: _r_debug_extended.base.r_ldbase; r->base.r_brk = (ElfW(Addr)) &_dl_debug_state; - r->r_next = NULL; + +#ifdef SHARED + /* Add the new namespace to the linked list. This assumes that + namespaces are allocated in increasing order. After a + namespace is initialized, r_brk becomes non-zero. A + namespace becomes empty (r_map == NULL) when it is unused. + But it is never removed from the linked list. */ + + if (ns != LM_ID_BASE) + { + r->base.r_version = 2; + if (ns - 1 == LM_ID_BASE) + { + atomic_store_release (&_r_debug_extended.r_next, r); + /* Now there are multiple namespaces. Note that this + deliberately does not update the copy in the main + executable (if it exists). */ + atomic_store_release (&_r_debug_extended.base.r_version, 2); + } + else + /* Update r_debug_extended of the previous namespace. */ + atomic_store_release (&_r_debug_array[ns - 2].r_next, r); + } + else +#endif /* SHARED */ + r->base.r_version = 1; } if (r->base.r_map == NULL) - atomic_store_release (&r->base.r_map, - (void *) GL(dl_ns)[ns]._ns_loaded); - - if (pp != NULL) { - atomic_store_release (pp, r); - /* Bump r_version to 2 for the new namespace. 
*/ - atomic_store_release (&_r_debug_extended.base.r_version, 2); + struct link_map_public *l = (void *) GL(dl_ns)[ns]._ns_loaded; + atomic_store_release (&r->base.r_map, l); +#ifdef SHARED + if (ns == LM_ID_BASE && _r_debug_main != NULL) + /* Update the copy-relocation of _r_debug. */ + atomic_store_release (&_r_debug_main->r_map, l); +#endif } return &r->base; } - - -/* This function exists solely to have a breakpoint set on it by the - debugger. The debugger is supposed to find this function's address by - examining the r_brk member of struct r_debug, but GDB 4.15 in fact looks - for this particular symbol name in the PT_INTERP file. */ -void -_dl_debug_state (void) -{ -} -rtld_hidden_def (_dl_debug_state) diff --git a/elf/dl-debug_state.c b/elf/dl-debug_state.c new file mode 100644 index 0000000..40c134a --- /dev/null +++ b/elf/dl-debug_state.c @@ -0,0 +1,30 @@ +/* Debugger hook called after dynamic linker updates. + Copyright (C) 1996-2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <ldsodefs.h> + +/* This function exists solely to have a breakpoint set on it by the + debugger. 
The debugger is supposed to find this function's address by + examining the r_brk member of struct r_debug, but GDB 4.15 in fact looks + for this particular symbol name in the PT_INTERP file. Therefore, + this function must not be inlined. */ +void +_dl_debug_state (void) +{ +} +rtld_hidden_def (_dl_debug_state) diff --git a/elf/dl-find_object.c b/elf/dl-find_object.c index 940fa5c..f258665 100644 --- a/elf/dl-find_object.c +++ b/elf/dl-find_object.c @@ -465,6 +465,37 @@ _dl_find_object (void *pc1, struct dl_find_object *result) } rtld_hidden_def (_dl_find_object) +/* Subroutine of _dlfo_process_initial to split out noncontigous link + maps. NODELETE is the number of used _dlfo_nodelete_mappings + elements. It is incremented as needed, and the new NODELETE value + is returned. */ +static size_t +_dlfo_process_initial_noncontiguous_map (struct link_map *map, + size_t nodelete) +{ + struct dl_find_object_internal dlfo; + _dl_find_object_from_map (map, &dlfo); + + /* PT_LOAD segments for a non-contiguous link map are added to the + non-closeable mappings. */ + const ElfW(Phdr) *ph = map->l_phdr; + const ElfW(Phdr) *ph_end = map->l_phdr + map->l_phnum; + for (; ph < ph_end; ++ph) + if (ph->p_type == PT_LOAD) + { + if (_dlfo_nodelete_mappings != NULL) + { + /* Second pass only. */ + _dlfo_nodelete_mappings[nodelete] = dlfo; + ElfW(Addr) start = ph->p_vaddr + map->l_addr; + _dlfo_nodelete_mappings[nodelete].map_start = start; + _dlfo_nodelete_mappings[nodelete].map_end = start + ph->p_memsz; + } + ++nodelete; + } + return nodelete; +} + /* _dlfo_process_initial is called twice. First to compute the array sizes from the initial loaded mappings. Second to fill in the bases and infos arrays with the (still unsorted) data. 
Returns the @@ -476,29 +507,8 @@ _dlfo_process_initial (void) size_t nodelete = 0; if (!main_map->l_contiguous) - { - struct dl_find_object_internal dlfo; - _dl_find_object_from_map (main_map, &dlfo); - - /* PT_LOAD segments for a non-contiguous are added to the - non-closeable mappings. */ - for (const ElfW(Phdr) *ph = main_map->l_phdr, - *ph_end = main_map->l_phdr + main_map->l_phnum; - ph < ph_end; ++ph) - if (ph->p_type == PT_LOAD) - { - if (_dlfo_nodelete_mappings != NULL) - { - /* Second pass only. */ - _dlfo_nodelete_mappings[nodelete] = dlfo; - _dlfo_nodelete_mappings[nodelete].map_start - = ph->p_vaddr + main_map->l_addr; - _dlfo_nodelete_mappings[nodelete].map_end - = _dlfo_nodelete_mappings[nodelete].map_start + ph->p_memsz; - } - ++nodelete; - } - } + /* Contiguous case already handled in _dl_find_object_init. */ + nodelete = _dlfo_process_initial_noncontiguous_map (main_map, nodelete); size_t loaded = 0; for (Lmid_t ns = 0; ns < GL(dl_nns); ++ns) @@ -510,11 +520,22 @@ _dlfo_process_initial (void) /* lt_library link maps are implicitly NODELETE. */ if (l->l_type == lt_library || l->l_nodelete_active) { - if (_dlfo_nodelete_mappings != NULL) - /* Second pass only. */ - _dl_find_object_from_map - (l, _dlfo_nodelete_mappings + nodelete); - ++nodelete; + /* The kernel may have loaded ld.so with gaps. */ + if (!l->l_contiguous +#ifdef SHARED + && l == &GL(dl_rtld_map) +#endif + ) + nodelete + = _dlfo_process_initial_noncontiguous_map (l, nodelete); + else + { + if (_dlfo_nodelete_mappings != NULL) + /* Second pass only. */ + _dl_find_object_from_map + (l, _dlfo_nodelete_mappings + nodelete); + ++nodelete; + } } else if (l->l_type == lt_loaded) { @@ -661,6 +682,14 @@ _dl_find_object_update_1 (struct link_map **loaded, size_t count) = _dlfo_loaded_mappings[!active_idx]; size_t remaining_to_add = current_used + count; + /* remaining_to_add can be 0 if (current_used + count) wraps, but in practice + this is not possible as it represents counts of link maps. 
Link maps have + sizes larger than 1 byte, so the sum of any two link map counts will + always fit within a size_t without wrapping around. This check ensures + that target_seg is not erroneously considered potentially NULL by GCC. */ + if (remaining_to_add == 0) + __builtin_unreachable (); + /* Ensure that the new segment chain has enough space. */ { size_t new_allocated @@ -756,7 +785,6 @@ _dl_find_object_update_1 (struct link_map **loaded, size_t count) /* Prefer newly loaded link map. */ assert (loaded_index1 > 0); _dl_find_object_from_map (loaded[loaded_index1 - 1], dlfo); - loaded[loaded_index1 - 1]->l_find_object_processed = 1; --loaded_index1; } diff --git a/elf/dl-find_object.h b/elf/dl-find_object.h index 0915065..8894c66 100644 --- a/elf/dl-find_object.h +++ b/elf/dl-find_object.h @@ -87,7 +87,7 @@ _dl_find_object_to_external (struct dl_find_object_internal *internal, } /* Extract the object location data from a link map and writes it to - *RESULT using relaxed MO stores. */ + *RESULT using relaxed MO stores. Set L->l_find_object_processed. */ static void __attribute__ ((unused)) _dl_find_object_from_map (struct link_map *l, struct dl_find_object_internal *result) @@ -100,6 +100,8 @@ _dl_find_object_from_map (struct link_map *l, atomic_store_relaxed (&result->eh_dbase, (void *) l->l_info[DT_PLTGOT]); #endif + l->l_find_object_processed = 1; + for (const ElfW(Phdr) *ph = l->l_phdr, *ph_end = l->l_phdr + l->l_phnum; ph < ph_end; ++ph) if (ph->p_type == DLFO_EH_SEGMENT_TYPE) diff --git a/elf/dl-fini.c b/elf/dl-fini.c index db99627..a1a4c25 100644 --- a/elf/dl-fini.c +++ b/elf/dl-fini.c @@ -69,6 +69,7 @@ _dl_fini (void) unsigned int i; struct link_map *l; + struct link_map *proxy_link_map = NULL; assert (nloaded != 0 || GL(dl_ns)[ns]._ns_loaded == NULL); for (l = GL(dl_ns)[ns]._ns_loaded, i = 0; l != NULL; l = l->l_next) /* Do not handle ld.so in secondary namespaces. */ @@ -84,6 +85,11 @@ _dl_fini (void) are not dlclose()ed from underneath us. 
*/ ++l->l_direct_opencount; } + else + /* Used below to call la_objclose for the ld.so proxy + link map. */ + proxy_link_map = l; + assert (ns != LM_ID_BASE || i == nloaded); assert (ns == LM_ID_BASE || i == nloaded || i == nloaded - 1); unsigned int nmaps = i; @@ -122,6 +128,9 @@ _dl_fini (void) --l->l_direct_opencount; } + if (proxy_link_map != NULL) + _dl_audit_objclose (proxy_link_map); + #ifdef SHARED _dl_audit_activity_nsid (ns, LA_ACT_CONSISTENT); #endif diff --git a/elf/dl-load.c b/elf/dl-load.c index ce8fdea..8b08904 100644 --- a/elf/dl-load.c +++ b/elf/dl-load.c @@ -929,6 +929,36 @@ _dl_process_pt_gnu_property (struct link_map *l, int fd, const ElfW(Phdr) *ph) } } +static void +_dl_notify_new_object (int mode, Lmid_t nsid, struct link_map *l) +{ + /* Signal that we are going to add new objects. */ + struct r_debug *r = _dl_debug_update (nsid); + if (r->r_state == RT_CONSISTENT) + { +#ifdef SHARED + /* Auditing checkpoint: we are going to add new objects. Since this + is called after _dl_add_to_namespace_list the namespace is guaranteed + to not be empty. */ + if ((mode & __RTLD_AUDIT) == 0) + _dl_audit_activity_nsid (nsid, LA_ACT_ADD); +#endif + + /* Notify the debugger we have added some objects. We need to + call _dl_debug_initialize in a static program in case dynamic + linking has not been used before. */ + _dl_debug_change_state (r, RT_ADD); + LIBC_PROBE (map_start, 2, nsid, r); + } + else + assert (r->r_state == RT_ADD); + +#ifdef SHARED + /* Auditing checkpoint: we have a new object. */ + if (!GL(dl_ns)[l->l_ns]._ns_loaded->l_auditing) + _dl_audit_objopen (l, nsid); +#endif +} /* Map in the shared object NAME, actually located in REALNAME, and already opened on FD. */ @@ -1029,6 +1059,8 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd, /* Add the map for the mirrored object to the object list. 
*/ _dl_add_to_namespace_list (l, nsid); + _dl_notify_new_object (mode, nsid, l); + return l; } #endif @@ -1487,33 +1519,7 @@ cannot enable executable stack as shared object requires"); if (mode & __RTLD_SPROF) return l; - /* Signal that we are going to add new objects. */ - struct r_debug *r = _dl_debug_update (nsid); - if (r->r_state == RT_CONSISTENT) - { -#ifdef SHARED - /* Auditing checkpoint: we are going to add new objects. Since this - is called after _dl_add_to_namespace_list the namespace is guaranteed - to not be empty. */ - if ((mode & __RTLD_AUDIT) == 0) - _dl_audit_activity_nsid (nsid, LA_ACT_ADD); -#endif - - /* Notify the debugger we have added some objects. We need to - call _dl_debug_initialize in a static program in case dynamic - linking has not been used before. */ - r->r_state = RT_ADD; - _dl_debug_state (); - LIBC_PROBE (map_start, 2, nsid, r); - } - else - assert (r->r_state == RT_ADD); - -#ifdef SHARED - /* Auditing checkpoint: we have a new object. */ - if (!GL(dl_ns)[l->l_ns]._ns_loaded->l_auditing) - _dl_audit_objopen (l, nsid); -#endif + _dl_notify_new_object (mode, nsid, l); return l; } diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c index 19ad2a2..7a70f1d 100644 --- a/elf/dl-lookup.c +++ b/elf/dl-lookup.c @@ -113,12 +113,22 @@ check_match (const char *const undef_name, /* We can match the version information or use the default one if it is not hidden. */ ElfW(Half) ndx = verstab[symidx] & 0x7fff; - if ((map->l_versions[ndx].hash != version->hash - || strcmp (map->l_versions[ndx].name, version->name)) - && (version->hidden || map->l_versions[ndx].hash - || (verstab[symidx] & 0x8000))) - /* It's not the version we want. */ - return NULL; + if (map->l_versions[ndx].hash == version->hash + && strcmp (map->l_versions[ndx].name, version->name) == 0) + /* This is an exact version match. Return the symbol below. 
*/ + ; + else + { + if (!version->hidden + && map->l_versions[ndx].name[0] == '\0' + && (verstab[symidx] & 0x8000) == 0 + && (*num_versions)++ == 0) + /* This is the global default version. Store it as a + fallback match. */ + *versioned_sym = sym; + + return NULL; + } } } else diff --git a/elf/dl-open.c b/elf/dl-open.c index 8556e7b..6f6d3dd 100644 --- a/elf/dl-open.c +++ b/elf/dl-open.c @@ -576,6 +576,14 @@ dl_open_worker_begin (void *a) _dl_debug_printf ("opening file=%s [%lu]; direct_opencount=%u\n\n", new->l_name, new->l_ns, new->l_direct_opencount); +#ifdef SHARED + /* No relocation processing on this execution path. But + relocation has not been performed for static + position-dependent executables, so disable the assert for + static linking. */ + assert (new->l_relocated); +#endif + /* If the user requested the object to be in the global namespace but it is not so far, prepare to add it now. This can raise an exception to do a malloc failure. */ @@ -597,9 +605,15 @@ dl_open_worker_begin (void *a) if ((mode & RTLD_GLOBAL) && new->l_global == 0) add_to_global_update (new); - const int r_state __attribute__ ((unused)) - = _dl_debug_update (args->nsid)->r_state; - assert (r_state == RT_CONSISTENT); + /* It is not possible to run the ELF constructor for the new + link map if it has not executed yet: If this dlopen call came + from an ELF constructor that has not put that object into a + consistent state, completing initialization for the entire + scope will expose objects that have this partially + constructed object among its dependencies to this + inconsistent state. This could happen even with a benign + dlopen (NULL, RTLD_LAZY) call from a constructor of an + initially loaded shared object. */ return; } @@ -631,17 +645,6 @@ dl_open_worker_begin (void *a) #endif } -#ifdef SHARED - /* Auditing checkpoint: we have added all objects. 
*/ - _dl_audit_activity_nsid (new->l_ns, LA_ACT_CONSISTENT); -#endif - - /* Notify the debugger all new objects are now ready to go. */ - struct r_debug *r = _dl_debug_update (args->nsid); - r->r_state = RT_CONSISTENT; - _dl_debug_state (); - LIBC_PROBE (map_complete, 3, args->nsid, r, new); - _dl_open_check (new); /* Print scope information. */ @@ -688,6 +691,7 @@ dl_open_worker_begin (void *a) created dlmopen namespaces. Do not do this for static dlopen because libc has relocations against ld.so, which may not have been relocated at this point. */ + struct r_debug *r = _dl_debug_update (args->nsid); #ifdef SHARED if (GL(dl_ns)[args->nsid].libc_map != NULL) _dl_open_relocate_one_object (args, r, GL(dl_ns)[args->nsid].libc_map, @@ -779,6 +783,25 @@ dl_open_worker (void *a) __rtld_lock_unlock_recursive (GL(dl_load_tls_lock)); + /* Auditing checkpoint and debugger signalling. Do this even on + error, so that dlopen exits with consistent state. */ + if (args->nsid >= 0 || args->map != NULL) + { + Lmid_t nsid = args->map != NULL ? args->map->l_ns : args->nsid; + struct r_debug *r = _dl_debug_update (nsid); +#ifdef SHARED + bool was_not_consistent = r->r_state != RT_CONSISTENT; +#endif + _dl_debug_change_state (r, RT_CONSISTENT); + LIBC_PROBE (map_complete, 3, nsid, r, args->map); + +#ifdef SHARED + if (was_not_consistent) + /* Avoid redundant/recursive signalling. */ + _dl_audit_activity_nsid (nsid, LA_ACT_CONSISTENT); +#endif + } + if (__glibc_unlikely (ex.errstring != NULL)) /* Reraise the error. */ _dl_signal_exception (err, &ex, NULL); @@ -847,7 +870,7 @@ no more namespaces available for dlmopen()")); } GL(dl_ns)[nsid].libc_map = NULL; - _dl_debug_update (nsid)->r_state = RT_CONSISTENT; + _dl_debug_change_state (_dl_debug_update (nsid), RT_CONSISTENT); } /* Never allow loading a DSO in a namespace which is empty. Such direct placements is only causing problems. 
Also don't allow diff --git a/elf/dl-reloc.c b/elf/dl-reloc.c index 4bf7aec..76d1483 100644 --- a/elf/dl-reloc.c +++ b/elf/dl-reloc.c @@ -202,12 +202,9 @@ resolve_map (lookup_t l, struct r_scope_elem *scope[], const ElfW(Sym) **ref, #include "dynamic-link.h" void -_dl_relocate_object (struct link_map *l, struct r_scope_elem *scope[], - int reloc_mode, int consider_profiling) +_dl_relocate_object_no_relro (struct link_map *l, struct r_scope_elem *scope[], + int reloc_mode, int consider_profiling) { - if (l->l_relocated) - return; - struct textrels { caddr_t start; @@ -220,8 +217,8 @@ _dl_relocate_object (struct link_map *l, struct r_scope_elem *scope[], int lazy = reloc_mode & RTLD_LAZY; int skip_ifunc = reloc_mode & __RTLD_NOIFUNC; -#ifdef SHARED bool consider_symbind = false; +#ifdef SHARED /* If we are auditing, install the same handlers we need for profiling. */ if ((reloc_mode & __RTLD_AUDIT) == 0) { @@ -240,9 +237,7 @@ _dl_relocate_object (struct link_map *l, struct r_scope_elem *scope[], } #elif defined PROF /* Never use dynamic linker profiling for gprof profiling code. */ -# define consider_profiling 0 -#else -# define consider_symbind 0 + consider_profiling = 0; #endif /* If DT_BIND_NOW is set relocate all references in this object. We @@ -300,7 +295,6 @@ _dl_relocate_object (struct link_map *l, struct r_scope_elem *scope[], ELF_DYNAMIC_RELOCATE (l, scope, lazy, consider_profiling, skip_ifunc); -#ifndef PROF if ((consider_profiling || consider_symbind) && l->l_info[DT_PLTRELSZ] != NULL) { @@ -321,7 +315,6 @@ _dl_relocate_object (struct link_map *l, struct r_scope_elem *scope[], _dl_fatal_printf (errstring, RTLD_PROGNAME, l->l_name); } } -#endif } /* Mark the object so we know this work has been done. */ @@ -342,17 +335,24 @@ _dl_relocate_object (struct link_map *l, struct r_scope_elem *scope[], textrels = textrels->next; } - - /* In case we can protect the data now that the relocations are - done, do it. 
*/ - if (l->l_relro_size != 0) - _dl_protect_relro (l); } +void +_dl_relocate_object (struct link_map *l, struct r_scope_elem *scope[], + int reloc_mode, int consider_profiling) +{ + if (l->l_relocated) + return; + _dl_relocate_object_no_relro (l, scope, reloc_mode, consider_profiling); + _dl_protect_relro (l); +} void _dl_protect_relro (struct link_map *l) { + if (l->l_relro_size == 0) + return; + ElfW(Addr) start = ALIGN_DOWN((l->l_addr + l->l_relro_addr), GLRO(dl_pagesize)); diff --git a/elf/dl-support.c b/elf/dl-support.c index 451932d..ee590ed 100644 --- a/elf/dl-support.c +++ b/elf/dl-support.c @@ -338,8 +338,7 @@ _dl_non_dynamic_init (void) call_function_static_weak (_dl_find_object_init); /* Setup relro on the binary itself. */ - if (_dl_main_map.l_relro_size != 0) - _dl_protect_relro (&_dl_main_map); + _dl_protect_relro (&_dl_main_map); } #ifdef DL_SYSINFO_IMPLEMENTATION diff --git a/elf/dl-tls.c b/elf/dl-tls.c index 3d529b7..b13e752 100644 --- a/elf/dl-tls.c +++ b/elf/dl-tls.c @@ -528,6 +528,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid) if (newp == NULL) oom (); memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t)); +#ifdef SHARED + /* Auditors can trigger a DTV resize event while the full malloc + is not yet in use. Mark the new DTV allocation as the + initial allocation. */ + if (!__rtld_malloc_is_complete ()) + GL(dl_initial_dtv) = &newp[1]; +#endif } else { diff --git a/elf/dl-version.c b/elf/dl-version.c index 8966d61..708b1c9 100644 --- a/elf/dl-version.c +++ b/elf/dl-version.c @@ -357,6 +357,13 @@ _dl_check_map_versions (struct link_map *map, int verbose, int trace_mode) ent = (ElfW(Verdef) *) ((char *) ent + ent->vd_next); } } + + /* The empty string has ELF hash zero. This avoids a NULL check + before the version string comparison in check_match in + dl-lookup.c. 
*/ + for (unsigned int i = 0; i < map->l_nversions; ++i) + if (map->l_versions[i].name == NULL) + map->l_versions[i].name = ""; } /* When there is a DT_VERNEED entry with libc.so on DT_NEEDED, issue diff --git a/elf/endswith.h b/elf/endswith.h new file mode 100644 index 0000000..3954e57 --- /dev/null +++ b/elf/endswith.h @@ -0,0 +1,41 @@ +/* Copyright (C) 2023-2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <https://www.gnu.org/licenses/>. */ + +#ifndef _ENDSWITH_H +#define _ENDSWITH_H + +#include <stdbool.h> +#include <string.h> + +/* Return true if the N bytes at NAME end with the characters in + the string SUFFIX. (NAME[N + 1] does not have to be a null byte.) + Expected to be called with a string literal for SUFFIX. */ +static inline bool +endswithn (const char *name, size_t n, const char *suffix) +{ + return (n >= strlen (suffix) + && memcmp (name + n - strlen (suffix), suffix, + strlen (suffix)) == 0); +} + +/* Same as endswithn, but uses the entire SUBJECT for matching. 
*/ +static inline bool +endswith (const char *subject, const char *suffix) +{ + return endswithn (subject, strlen (subject), suffix); +} + +#endif /* _ENDSWITH_H */ diff --git a/elf/ldconfig.c b/elf/ldconfig.c index b64c54b..0f3ef70 100644 --- a/elf/ldconfig.c +++ b/elf/ldconfig.c @@ -40,6 +40,7 @@ #include <libgen.h> #include <ldconfig.h> +#include <endswith.h> #include <dl-cache.h> #include <dl-hwcaps.h> #include <dl-is_dso.h> @@ -661,17 +662,6 @@ struct dlib_entry struct dlib_entry *next; }; -/* Return true if the N bytes at NAME end with with the characters in - the string SUFFIX. (NAME[N + 1] does not have to be a null byte.) - Expected to be called with a string literal for SUFFIX. */ -static inline bool -endswithn (const char *name, size_t n, const char *suffix) -{ - return (n >= strlen (suffix) - && memcmp (name + n - strlen (suffix), suffix, - strlen (suffix)) == 0); -} - /* Skip some temporary DSO files. These files may be partially written and lead to ldconfig crashes when examined. */ static bool diff --git a/elf/readlib.c b/elf/readlib.c index 4d67c74..32e8b8e 100644 --- a/elf/readlib.c +++ b/elf/readlib.c @@ -33,6 +33,7 @@ #include <gnu/lib-names.h> #include <ldconfig.h> +#include <endswith.h> #define Elf32_CLASS ELFCLASS32 #define Elf64_CLASS ELFCLASS64 @@ -48,7 +49,7 @@ static bool is_gdb_python_file (const char *name) { size_t len = strlen (name); - return len > 7 && strcmp (name + len - 7, "-gdb.py") == 0; + return endswithn (name, len, "-gdb.py"); } /* Returns 0 if everything is ok, != 0 in case of error. */ @@ -1284,6 +1284,60 @@ rtld_setup_main_map (struct link_map *main_map) return has_interp; } +/* Set up the program header information for the dynamic linker + itself. It can be accessed via _r_debug and dl_iterate_phdr + callbacks, and it is used by _dl_find_object. 
*/ +static void +rtld_setup_phdr (void) +{ + /* Starting from binutils-2.23, the linker will define the magic + symbol __ehdr_start to point to our own ELF header if it is + visible in a segment that also includes the phdrs. */ + + const ElfW(Ehdr) *rtld_ehdr = &__ehdr_start; + assert (rtld_ehdr->e_ehsize == sizeof *rtld_ehdr); + assert (rtld_ehdr->e_phentsize == sizeof (ElfW(Phdr))); + + const ElfW(Phdr) *rtld_phdr = (const void *) rtld_ehdr + rtld_ehdr->e_phoff; + + GL(dl_rtld_map).l_phdr = rtld_phdr; + GL(dl_rtld_map).l_phnum = rtld_ehdr->e_phnum; + + + GL(dl_rtld_map).l_contiguous = 1; + /* The linker may not have produced a contiguous object. The kernel + will load the object with actual gaps (unlike the glibc loader + for shared objects, which always produces a contiguous mapping). + See similar logic in rtld_setup_main_map above. */ + { + ElfW(Addr) expected_load_address = 0; + for (const ElfW(Phdr) *ph = rtld_phdr; ph < &rtld_phdr[rtld_ehdr->e_phnum]; + ++ph) + if (ph->p_type == PT_LOAD) + { + ElfW(Addr) mapstart = ph->p_vaddr & ~(GLRO(dl_pagesize) - 1); + if (GL(dl_rtld_map).l_contiguous && expected_load_address != 0 + && expected_load_address != mapstart) + GL(dl_rtld_map).l_contiguous = 0; + ElfW(Addr) allocend = ph->p_vaddr + ph->p_memsz; + /* The next expected address is the page following this load + segment. */ + expected_load_address = ((allocend + GLRO(dl_pagesize) - 1) + & ~(GLRO(dl_pagesize) - 1)); + } + } + + /* PT_GNU_RELRO is usually the last phdr. */ + size_t cnt = rtld_ehdr->e_phnum; + while (cnt-- > 0) + if (rtld_phdr[cnt].p_type == PT_GNU_RELRO) + { + GL(dl_rtld_map).l_relro_addr = rtld_phdr[cnt].p_vaddr; + GL(dl_rtld_map).l_relro_size = rtld_phdr[cnt].p_memsz; + break; + } +} + /* Adjusts the contents of the stack and related globals for the user entry point. The ld.so processed skip_args arguments and bumped _dl_argv and _dl_argc accordingly. 
Those arguments are removed from @@ -1749,33 +1803,7 @@ dl_main (const ElfW(Phdr) *phdr, ++GL(dl_ns)[LM_ID_BASE]._ns_nloaded; ++GL(dl_load_adds); - /* Starting from binutils-2.23, the linker will define the magic symbol - __ehdr_start to point to our own ELF header if it is visible in a - segment that also includes the phdrs. If that's not available, we use - the old method that assumes the beginning of the file is part of the - lowest-addressed PT_LOAD segment. */ - - /* Set up the program header information for the dynamic linker - itself. It is needed in the dl_iterate_phdr callbacks. */ - const ElfW(Ehdr) *rtld_ehdr = &__ehdr_start; - assert (rtld_ehdr->e_ehsize == sizeof *rtld_ehdr); - assert (rtld_ehdr->e_phentsize == sizeof (ElfW(Phdr))); - - const ElfW(Phdr) *rtld_phdr = (const void *) rtld_ehdr + rtld_ehdr->e_phoff; - - GL(dl_rtld_map).l_phdr = rtld_phdr; - GL(dl_rtld_map).l_phnum = rtld_ehdr->e_phnum; - - - /* PT_GNU_RELRO is usually the last phdr. */ - size_t cnt = rtld_ehdr->e_phnum; - while (cnt-- > 0) - if (rtld_phdr[cnt].p_type == PT_GNU_RELRO) - { - GL(dl_rtld_map).l_relro_addr = rtld_phdr[cnt].p_vaddr; - GL(dl_rtld_map).l_relro_size = rtld_phdr[cnt].p_memsz; - break; - } + rtld_setup_phdr (); /* Add the dynamic linker to the TLS list if it also uses TLS. */ if (GL(dl_rtld_map).l_tls_blocksize != 0) @@ -1822,8 +1850,7 @@ dl_main (const ElfW(Phdr) *phdr, elf_setup_debug_entry (main_map, r); /* We start adding objects. */ - r->r_state = RT_ADD; - _dl_debug_state (); + _dl_debug_change_state (r, RT_ADD); LIBC_PROBE (init_start, 2, LM_ID_BASE, r); /* Auditing checkpoint: we are ready to signal that the initial map @@ -1982,43 +2009,37 @@ dl_main (const ElfW(Phdr) *phdr, if (main_map->l_searchlist.r_list[i] == &GL(dl_rtld_map)) break; - bool rtld_multiple_ref = false; - if (__glibc_likely (i < main_map->l_searchlist.r_nlist)) - { - /* Some DT_NEEDED entry referred to the interpreter object itself, so - put it back in the list of visible objects. 
We insert it into the - chain in symbol search order because gdb uses the chain's order as - its symbol search order. */ - rtld_multiple_ref = true; + /* Insert the link map for the dynamic loader into the chain in + symbol search order because gdb uses the chain's order as its + symbol search order. */ - GL(dl_rtld_map).l_prev = main_map->l_searchlist.r_list[i - 1]; - if (__glibc_likely (state.mode == rtld_mode_normal)) - { - GL(dl_rtld_map).l_next = (i + 1 < main_map->l_searchlist.r_nlist - ? main_map->l_searchlist.r_list[i + 1] - : NULL); + GL(dl_rtld_map).l_prev = main_map->l_searchlist.r_list[i - 1]; + if (__glibc_likely (state.mode == rtld_mode_normal)) + { + GL(dl_rtld_map).l_next = (i + 1 < main_map->l_searchlist.r_nlist + ? main_map->l_searchlist.r_list[i + 1] + : NULL); #ifdef NEED_DL_SYSINFO_DSO - if (GLRO(dl_sysinfo_map) != NULL - && GL(dl_rtld_map).l_prev->l_next == GLRO(dl_sysinfo_map) - && GL(dl_rtld_map).l_next != GLRO(dl_sysinfo_map)) - GL(dl_rtld_map).l_prev = GLRO(dl_sysinfo_map); + if (GLRO(dl_sysinfo_map) != NULL + && GL(dl_rtld_map).l_prev->l_next == GLRO(dl_sysinfo_map) + && GL(dl_rtld_map).l_next != GLRO(dl_sysinfo_map)) + GL(dl_rtld_map).l_prev = GLRO(dl_sysinfo_map); #endif - } - else - /* In trace mode there might be an invisible object (which we - could not find) after the previous one in the search list. - In this case it doesn't matter much where we put the - interpreter object, so we just initialize the list pointer so - that the assertion below holds. */ - GL(dl_rtld_map).l_next = GL(dl_rtld_map).l_prev->l_next; - - assert (GL(dl_rtld_map).l_prev->l_next == GL(dl_rtld_map).l_next); - GL(dl_rtld_map).l_prev->l_next = &GL(dl_rtld_map); - if (GL(dl_rtld_map).l_next != NULL) - { - assert (GL(dl_rtld_map).l_next->l_prev == GL(dl_rtld_map).l_prev); - GL(dl_rtld_map).l_next->l_prev = &GL(dl_rtld_map); - } + } + else + /* In trace mode there might be an invisible object (which we + could not find) after the previous one in the search list. 
+ In this case it doesn't matter much where we put the + interpreter object, so we just initialize the list pointer so + that the assertion below holds. */ + GL(dl_rtld_map).l_next = GL(dl_rtld_map).l_prev->l_next; + + assert (GL(dl_rtld_map).l_prev->l_next == GL(dl_rtld_map).l_next); + GL(dl_rtld_map).l_prev->l_next = &GL(dl_rtld_map); + if (GL(dl_rtld_map).l_next != NULL) + { + assert (GL(dl_rtld_map).l_next->l_prev == GL(dl_rtld_map).l_prev); + GL(dl_rtld_map).l_next->l_prev = &GL(dl_rtld_map); } /* Now let us see whether all libraries are available in the @@ -2268,25 +2289,25 @@ dl_main (const ElfW(Phdr) *phdr, _rtld_main_check (main_map, _dl_argv[0]); - /* Now we have all the objects loaded. Relocate them all except for - the dynamic linker itself. We do this in reverse order so that copy - relocs of earlier objects overwrite the data written by later - objects. We do not re-relocate the dynamic linker itself in this - loop because that could result in the GOT entries for functions we - call being changed, and that would break us. It is safe to relocate - the dynamic linker out of order because it has no copy relocations. - Likewise for libc, which is relocated early to ensure that IFUNC - resolvers in libc work. */ + /* Now we have all the objects loaded. */ int consider_profiling = GLRO(dl_profile) != NULL; /* If we are profiling we also must do lazy reloaction. */ GLRO(dl_lazy) |= consider_profiling; + /* If libc.so has been loaded, relocate it early, after the dynamic + loader itself. The initial self-relocation of ld.so should be + sufficient for IFUNC resolvers in libc.so. */ if (GL(dl_ns)[LM_ID_BASE].libc_map != NULL) - _dl_relocate_object (GL(dl_ns)[LM_ID_BASE].libc_map, - GL(dl_ns)[LM_ID_BASE].libc_map->l_scope, - GLRO(dl_lazy) ? RTLD_LAZY : 0, consider_profiling); + { + RTLD_TIMING_VAR (start); + rtld_timer_start (&start); + _dl_relocate_object (GL(dl_ns)[LM_ID_BASE].libc_map, + GL(dl_ns)[LM_ID_BASE].libc_map->l_scope, + GLRO(dl_lazy) ? 
RTLD_LAZY : 0, consider_profiling); + rtld_timer_accum (&relocate_time, start); + } RTLD_TIMING_VAR (start); rtld_timer_start (&start); @@ -2309,9 +2330,8 @@ dl_main (const ElfW(Phdr) *phdr, /* Also allocated with the fake malloc(). */ l->l_free_initfini = 0; - if (l != &GL(dl_rtld_map)) - _dl_relocate_object (l, l->l_scope, GLRO(dl_lazy) ? RTLD_LAZY : 0, - consider_profiling); + _dl_relocate_object (l, l->l_scope, GLRO(dl_lazy) ? RTLD_LAZY : 0, + consider_profiling); /* Add object to slot information data if necessasy. */ if (l->l_tls_blocksize != 0 && __rtld_tls_init_tp_called) @@ -2346,36 +2366,29 @@ dl_main (const ElfW(Phdr) *phdr, /* Make sure no new search directories have been added. */ assert (GLRO(dl_init_all_dirs) == GL(dl_all_dirs)); - if (rtld_multiple_ref) - { - /* There was an explicit ref to the dynamic linker as a shared lib. - Re-relocate ourselves with user-controlled symbol definitions. - - We must do this after TLS initialization in case after this - re-relocation, we might call a user-supplied function - (e.g. calloc from _dl_relocate_object) that uses TLS data. */ - - /* Set up the object lookup structures. */ - _dl_find_object_init (); - - /* The malloc implementation has been relocated, so resolving - its symbols (and potentially calling IFUNC resolvers) is safe - at this point. */ - __rtld_malloc_init_real (main_map); - - /* Likewise for the locking implementation. */ - __rtld_mutex_init (); + /* Set up the object lookup structures. */ + _dl_find_object_init (); + /* If libc.so was loaded, relocate ld.so against it. Complete ld.so + initialization with mutex symbols from libc.so and malloc symbols + from the global scope. */ + if (GL(dl_ns)[LM_ID_BASE].libc_map != NULL) + { RTLD_TIMING_VAR (start); rtld_timer_start (&start); + _dl_relocate_object_no_relro (&GL(dl_rtld_map), main_map->l_scope, 0, 0); + rtld_timer_accum (&relocate_time, start); - /* Mark the link map as not yet relocated again. 
*/ - GL(dl_rtld_map).l_relocated = 0; - _dl_relocate_object (&GL(dl_rtld_map), main_map->l_scope, 0, 0); + __rtld_mutex_init (); + __rtld_malloc_init_real (main_map); - rtld_timer_accum (&relocate_time, start); + /* Update copy-relocated _r_debug if necessary. */ + _dl_debug_post_relocate (main_map); } + /* All ld.so initialization is complete. Apply RELRO. */ + _dl_protect_relro (&GL(dl_rtld_map)); + /* Relocation is complete. Perform early libc initialization. This is the initial libc, even if audit modules have been loaded with other libcs. */ @@ -2388,16 +2401,15 @@ dl_main (const ElfW(Phdr) *phdr, _dl_relocate_object might need to call `mprotect' for DT_TEXTREL. */ _dl_sysdep_start_cleanup (); - /* Auditing checkpoint: we have added all objects. */ - _dl_audit_activity_nsid (LM_ID_BASE, LA_ACT_CONSISTENT); - /* Notify the debugger all new objects are now ready to go. We must re-get the address since by now the variable might be in another object. */ r = _dl_debug_update (LM_ID_BASE); - r->r_state = RT_CONSISTENT; - _dl_debug_state (); + _dl_debug_change_state (r, RT_CONSISTENT); LIBC_PROBE (init_complete, 2, LM_ID_BASE, r); + /* Auditing checkpoint: we have added all objects. */ + _dl_audit_activity_nsid (LM_ID_BASE, LA_ACT_CONSISTENT); + #if defined USE_LDCONFIG && !defined MAP_COPY /* We must munmap() the cache file. */ _dl_unload_cache (); diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c new file mode 100644 index 0000000..7ba2c41 --- /dev/null +++ b/elf/tst-audit-tlsdesc-dlopen2.c @@ -0,0 +1,46 @@ +/* Loading TLS-using modules from auditors (bug 32412). Main program. + Copyright (C) 2021-2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <support/xdlfcn.h> +#include <stdio.h> + +static int +do_test (void) +{ + puts ("info: start of main program"); + + /* Load TLS-using modules, to trigger DTV resizing. The dynamic + linker will load them again (requiring their own TLS) because the + dlopen calls from the auditor were in the auditing namespace. */ + for (int i = 1; i <= 19; ++i) + { + char dso[30]; + snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i); + char sym[30]; + snprintf (sym, sizeof(sym), "tlsmod17a%d", i); + + void *handle = xdlopen (dso, RTLD_LAZY); + int (*func) (void) = xdlsym (handle, sym); + /* Trigger TLS allocation. */ + func (); + } + + return 0; +} + +#include <support/test-driver.c> diff --git a/elf/tst-audit23.c b/elf/tst-audit23.c index 32e7c8b..1b76336 100644 --- a/elf/tst-audit23.c +++ b/elf/tst-audit23.c @@ -17,6 +17,7 @@ <https://www.gnu.org/licenses/>. 
*/ #include <array_length.h> +#include <endswith.h> #include <errno.h> #include <getopt.h> #include <link.h> @@ -30,16 +31,21 @@ #include <support/xstdio.h> #include <support/xdlfcn.h> #include <support/support.h> +#include <support/test-driver.h> static int restart; +static int do_dlclose; #define CMDLINE_OPTIONS \ - { "restart", no_argument, &restart, 1 }, + { "restart", no_argument, &restart, 1 }, \ + { "dlclose", no_argument, &do_dlclose, 1 }, \ static int handle_restart (void) { xdlopen ("tst-audit23mod.so", RTLD_NOW); - xdlmopen (LM_ID_NEWLM, LIBC_SO, RTLD_NOW); + void *handle = xdlmopen (LM_ID_NEWLM, LIBC_SO, RTLD_NOW); + if (do_dlclose) + xdlclose (handle); return 0; } @@ -59,8 +65,8 @@ is_vdso (const char *str) || startswith (str, "linux-vdso"); } -static int -do_test (int argc, char *argv[]) +static void +do_one_test (int argc, char *argv[], bool pass_dlclose_flag) { /* We must have either: - One or four parameters left if called initially: @@ -68,16 +74,15 @@ do_test (int argc, char *argv[]) + "--library-path" optional + the library path optional + the application name */ - if (restart) - return handle_restart (); - - char *spargv[9]; + char *spargv[10]; TEST_VERIFY_EXIT (((argc - 1) + 3) < array_length (spargv)); int i = 0; for (; i < argc - 1; i++) spargv[i] = argv[i + 1]; spargv[i++] = (char *) "--direct"; spargv[i++] = (char *) "--restart"; + if (pass_dlclose_flag) + spargv[i++] = (char *) "--dlclose"; spargv[i] = NULL; setenv ("LD_AUDIT", "tst-auditmod23.so", 0); @@ -85,14 +90,30 @@ do_test (int argc, char *argv[]) = support_capture_subprogram (spargv[0], spargv); support_capture_subprocess_check (&result, "tst-audit22", 0, sc_allow_stderr); + { + FILE *fp = fmemopen (result.err.buffer, result.err.length, "r"); + TEST_VERIFY (fp != NULL); + unsigned int line = 0; + char *buffer = NULL; + size_t buffer_length = 0; + puts ("info: *** audit log start ***"); + while (xgetline (&buffer, &buffer_length, fp)) + printf ("%6u\t%s", ++line, buffer); + puts 
("info: *** audit log end ***"); + free (buffer); + xfclose (fp); + } + /* The expected la_objopen/la_objclose: 1. executable 2. loader 3. libc.so - 4. tst-audit23mod.so - 5. libc.so (LM_ID_NEWLM). - 6. vdso (optional and ignored). */ - enum { max_objs = 6 }; + 4. libgcc_s.so (on some architectures, for libsupport) + 5. tst-audit23mod.so + 6. libc.so (LM_ID_NEWLM). + 7. loader (proxy link map in new namespace) + vdso (optional and ignored). */ + enum { max_objs = 7 }; struct la_obj_t { char *lname; @@ -115,8 +136,10 @@ do_test (int argc, char *argv[]) TEST_VERIFY (out != NULL); char *buffer = NULL; size_t buffer_length = 0; + unsigned int line = 0; while (xgetline (&buffer, &buffer_length, out)) { + ++line; if (startswith (buffer, "la_activity: ")) { uintptr_t cookie; @@ -127,8 +150,14 @@ do_test (int argc, char *argv[]) /* The cookie identifies the object at the head of the link map, so we only add a new namespace if it changes from the previous - one. This works since dlmopen is the last in the test body. */ - if (cookie != last_act_cookie && last_act_cookie != -1) + one. This works since dlmopen is the last in the test body. + + Currently, this does not work as expected because there + is no head link map if a namespace is completely deleted. + No LA_ACT_CONSISTENT event is generated in that case. + See the comment in _dl_audit_activity_nsid and bug 32068. 
*/ + if (cookie != last_act_cookie && last_act_cookie != -1 + && !pass_dlclose_flag) TEST_COMPARE (last_act, LA_ACT_CONSISTENT); if (this_act == LA_ACT_ADD && acts[nacts] != cookie) @@ -174,8 +203,8 @@ do_test (int argc, char *argv[]) if (is_vdso (lname)) continue; if (nobjs == max_objs) - FAIL_EXIT1 ("non expected la_objopen: %s %"PRIxPTR" %ld", - lname, laddr, lmid); + FAIL_EXIT1 ("(line %u) non expected la_objopen: %s %"PRIxPTR" %ld", + line, lname, laddr, lmid); objs[nobjs].lname = lname; objs[nobjs].laddr = laddr; objs[nobjs].lmid = lmid; @@ -217,11 +246,26 @@ do_test (int argc, char *argv[]) } } + Lmid_t lmid_other = LM_ID_NEWLM; + unsigned int other_namespace_count = 0; for (size_t i = 0; i < nobjs; i++) { + if (objs[i].lmid != LM_ID_BASE) + { + if (lmid_other == LM_ID_NEWLM) + lmid_other = objs[i].lmid; + TEST_COMPARE (objs[i].lmid, lmid_other); + ++other_namespace_count; + if (!(endswith (objs[i].lname, "/" LIBC_SO) + || endswith (objs[i].lname, "/" LD_SO))) + FAIL ("unexpected object in secondary namespace: %s", + objs[i].lname); + } TEST_COMPARE (objs[i].closed, true); free (objs[i].lname); } + /* Both libc.so and ld.so should be present. */ + TEST_COMPARE (other_namespace_count, 2); /* la_activity(LA_ACT_CONSISTENT) should be the last callback received. Since only one link map may be not-CONSISTENT at a time, this also @@ -231,7 +275,16 @@ do_test (int argc, char *argv[]) free (buffer); xfclose (out); +} + +static int +do_test (int argc, char *argv[]) +{ + if (restart) + return handle_restart (); + do_one_test (argc, argv, false); + do_one_test (argc, argv, true); return 0; } diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c new file mode 100644 index 0000000..50275cd --- /dev/null +++ b/elf/tst-auditmod-tlsdesc2.c @@ -0,0 +1,59 @@ +/* Loading TLS-using modules from auditors (bug 32412). Audit module. + Copyright (C) 2021-2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <dlfcn.h> +#include <link.h> +#include <stdbool.h> +#include <stdio.h> +#include <unistd.h> + +unsigned int +la_version (unsigned int version) +{ + /* Open some modules, to trigger DTV resizing before the switch to + the main malloc. */ + for (int i = 1; i <= 19; ++i) + { + char dso[30]; + snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i); + char sym[30]; + snprintf (sym, sizeof(sym), "tlsmod17a%d", i); + + void *handle = dlopen (dso, RTLD_LAZY); + if (handle == NULL) + { + printf ("error: dlmopen from auditor: %s\n", dlerror ()); + fflush (stdout); + _exit (1); + } + int (*func) (void) = dlsym (handle, sym); + if (func == NULL) + { + printf ("error: dlsym from auditor: %s\n", dlerror ()); + fflush (stdout); + _exit (1); + } + /* Trigger TLS allocation. 
*/ + func (); + } + + puts ("info: TLS-using modules loaded from auditor"); + fflush (stdout); + + return LAV_CURRENT; +} diff --git a/elf/tst-dlmopen4-nonpic.c b/elf/tst-dlmopen4-nonpic.c new file mode 100644 index 0000000..ad4e409 --- /dev/null +++ b/elf/tst-dlmopen4-nonpic.c @@ -0,0 +1,2 @@ +#define BUILD_FOR_NONPIC +#include "tst-dlmopen4.c" diff --git a/elf/tst-dlmopen4-pic.c b/elf/tst-dlmopen4-pic.c new file mode 100644 index 0000000..919fa85 --- /dev/null +++ b/elf/tst-dlmopen4-pic.c @@ -0,0 +1,2 @@ +#define BUILD_FOR_PIC +#include "tst-dlmopen4.c" diff --git a/elf/tst-dlmopen4.c b/elf/tst-dlmopen4.c index b1c5502..9e053fb 100644 --- a/elf/tst-dlmopen4.c +++ b/elf/tst-dlmopen4.c @@ -46,6 +46,15 @@ do_test (void) TEST_COMPARE (debug->base.r_version, 1); TEST_VERIFY_EXIT (debug->r_next == NULL); +#ifdef BUILD_FOR_PIC + /* In a PIC build, using _r_debug directly should give us the same + object. */ + TEST_VERIFY (&_r_debug == &debug->base); +#endif +#ifdef BUILD_FOR_NONPIC + TEST_COMPARE (_r_debug.r_version, 1); +#endif + void *h = xdlmopen (LM_ID_NEWLM, "$ORIGIN/tst-dlmopen1mod.so", RTLD_LAZY); @@ -57,6 +66,19 @@ do_test (void) const char *name = basename (debug->r_next->base.r_map->l_name); TEST_COMPARE_STRING (name, "tst-dlmopen1mod.so"); +#ifdef BUILD_FOR_NONPIC + /* If a copy relocation is used, it must be at version 1. */ + if (&_r_debug != &debug->base) + { + TEST_COMPARE (_r_debug.r_version, 1); + TEST_COMPARE ((uintptr_t) _r_debug.r_map, + (uintptr_t) debug->base.r_map); + TEST_COMPARE (_r_debug.r_brk, debug->base.r_brk); + TEST_COMPARE (_r_debug.r_state, debug->base.r_state); + TEST_COMPARE (_r_debug.r_ldbase, debug->base.r_ldbase); + } +#endif + xdlclose (h); return 0; diff --git a/elf/tst-dlopen-auditdup-auditmod.c b/elf/tst-dlopen-auditdup-auditmod.c new file mode 100644 index 0000000..270a595 --- /dev/null +++ b/elf/tst-dlopen-auditdup-auditmod.c @@ -0,0 +1,104 @@ +/* Auditor that opens again an object that just has been opened. 
+ Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <dlfcn.h> +#include <link.h> +#include <stdbool.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +unsigned int +la_version (unsigned int v) +{ + return LAV_CURRENT; +} + +static bool trigger_on_la_activity; + +unsigned int +la_objopen (struct link_map *map, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("info: la_objopen: \"%s\"\n", map->l_name); + if (strstr (map->l_name, "/tst-dlopen-auditdupmod.so") != NULL) + trigger_on_la_activity = true; + return 0; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + static unsigned int calls; + ++calls; + printf ("info: la_activity: call %u (flag %u)\n", calls, flag); + fflush (stdout); + if (trigger_on_la_activity) + { + /* Avoid triggering on the dlmopen call below. */ + static bool recursion; + if (recursion) + return; + recursion = true; + + puts ("info: about to dlmopen tst-dlopen-auditdupmod.so"); + fflush (stdout); + void *handle = dlmopen (LM_ID_BASE, "tst-dlopen-auditdupmod.so", + RTLD_NOW); + if (handle == NULL) + { + printf ("error: dlmopen: %s\n", dlerror ()); + fflush (stdout); + _exit (1); + } + + /* Check that the constructor has not run. 
Running the + constructor would require constructing its dependencies, but + the constructor call that triggered this auditing activity + has not completed, and constructors among the dependencies + may not be able to deal with that. */ + int *status = dlsym (handle, "auditdupmod_status"); + if (status == NULL) + { + printf ("error: dlsym: %s\n", dlerror ()); + fflush (stdout); + _exit (1); + } + printf ("info: auditdupmod_status == %d\n", *status); + if (*status != 0) + { + puts ("error: auditdupmod_status == 0 expected"); + fflush (stdout); + _exit (1); + } + /* Checked in the destructor and the main program. */ + ++*status; + printf ("info: auditdupmod_status == %d\n", *status); + + /* Check that the module has been relocated. */ + int **status_address = dlsym (handle, "auditdupmod_status_address"); + if (status_address == NULL || *status_address != status) + { + puts ("error: invalid auditdupmod_status address in" + " tst-dlopen-auditdupmod.so"); + fflush (stdout); + _exit (1); + } + + fflush (stdout); + } +} diff --git a/elf/tst-dlopen-auditdup.c b/elf/tst-dlopen-auditdup.c new file mode 100644 index 0000000..d022c58 --- /dev/null +++ b/elf/tst-dlopen-auditdup.c @@ -0,0 +1,36 @@ +/* Test that recursive dlopen from auditor works (bug 31986). + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <support/check.h> +#include <support/xdlfcn.h> + +static int +do_test (void) +{ + puts ("info: about to dlopen tst-dlopen-auditdupmod.so"); + fflush (stdout); + void *handle = xdlopen ("tst-dlopen-auditdupmod.so", RTLD_NOW); + int *status = xdlsym (handle, "auditdupmod_status"); + printf ("info: auditdupmod_status == %d (from main)\n", *status); + TEST_COMPARE (*status, 2); + xdlclose (handle); + return 0; +} + +#include <support/test-driver.c> diff --git a/elf/tst-dlopen-auditdupmod.c b/elf/tst-dlopen-auditdupmod.c new file mode 100644 index 0000000..59b7e21 --- /dev/null +++ b/elf/tst-dlopen-auditdupmod.c @@ -0,0 +1,48 @@ +/* Directly opened test module that gets reopened from the auditor. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <stdlib.h> +#include <support/xdlfcn.h> + +int auditdupmod_status; + +/* Used to check for successful relocation processing. 
*/ +int *auditdupmod_status_address = &auditdupmod_status; + +static void __attribute__ ((constructor)) +init (void) +{ + ++auditdupmod_status; + printf ("info: tst-dlopen-auditdupmod.so constructor called (status %d)\n", + auditdupmod_status); +} + +static void __attribute__ ((destructor)) +fini (void) +{ + /* The tst-dlopen-auditdup-auditmod.so auditor incremented + auditdupmod_status. */ + printf ("info: tst-dlopen-auditdupmod.so destructor called (status %d)\n", + auditdupmod_status); + if (auditdupmod_status != 2) + { + puts ("error: auditdupmod_status == 2 expected"); + exit (1); + } +} diff --git a/elf/tst-dlopen-constructor-null-mod1.c b/elf/tst-dlopen-constructor-null-mod1.c new file mode 100644 index 0000000..70a7a0a --- /dev/null +++ b/elf/tst-dlopen-constructor-null-mod1.c @@ -0,0 +1,55 @@ +/* Module calling dlopen (NULL, RTLD_LAZY) to obtain the global scope. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <dlfcn.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> + +int mod1_status; + +static void __attribute__ ((constructor)) +init (void) +{ + puts ("info: tst-dlopen-constructor-null-mod1.so constructor"); + + void *handle = dlopen (NULL, RTLD_LAZY); + if (handle == NULL) + { + printf ("error: %s\n", dlerror ()); + exit (1); + } + puts ("info: dlopen returned"); + if (dlsym (handle, "malloc") != malloc) + { + puts ("error: dlsym did not produce expected result"); + exit (1); + } + dlclose (handle); + + /* Check that the second module's constructor has not executed. */ + if (getenv ("mod2_status") != NULL) + { + printf ("error: mod2_status environment variable set: %s\n", + getenv ("mod2_status")); + exit (1); + } + + /* Communicate to the second module that the constructor executed. */ + mod1_status = 1; +} diff --git a/elf/tst-dlopen-constructor-null-mod2.c b/elf/tst-dlopen-constructor-null-mod2.c new file mode 100644 index 0000000..d6e945b --- /dev/null +++ b/elf/tst-dlopen-constructor-null-mod2.c @@ -0,0 +1,37 @@ +/* Module whose constructor should not be invoked by dlopen (NULL, RTLD_LAZY). + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <stdio.h> +#include <stdlib.h> + +extern int mod1_status; +int mod2_status; + +static void __attribute__ ((constructor)) +init (void) +{ + printf ("info: tst-dlopen-constructor-null-mod2.so constructor" + " (mod1_status=%d)", mod1_status); + if (!(mod1_status == 1 && mod2_status == 0)) + { + puts ("error: mod1_status == 1 && mod2_status == 0 expected"); + exit (1); + } + setenv ("mod2_status", "constructed", 1); + mod2_status = 1; +} diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/elf/tst-dlopen-constructor-null.c index 84eac4a..db90643 100644 --- a/sysdeps/x86_64/dl-trampoline-save.h +++ b/elf/tst-dlopen-constructor-null.c @@ -1,4 +1,4 @@ -/* x86-64 PLT trampoline register save macros. +/* Verify that dlopen (NULL, RTLD_LAZY) does not complete initialization. Copyright (C) 2024 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,19 +16,23 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#ifndef DL_STACK_ALIGNMENT -/* Due to GCC bug: +/* This test mimics what the glvndSetupPthreads function in libglvnd + does. */ - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066 +#include <stdlib.h> +#include <support/check.h> - __tls_get_addr may be called with 8-byte stack alignment. Although - this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume - that stack will be always aligned at 16 bytes. */ -# define DL_STACK_ALIGNMENT 8 -#endif +/* Defined and initialized in the shared objects. */ +extern int mod1_status; +extern int mod2_status; -/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align - stack to 16 bytes before calling _dl_fixup. 
*/ -#define DL_RUNTIME_RESOLVE_REALIGN_STACK \ - (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \ - || 16 > DL_STACK_ALIGNMENT) +static int +do_test (void) +{ + TEST_COMPARE (mod1_status, 1); + TEST_COMPARE (mod2_status, 1); + TEST_COMPARE_STRING (getenv ("mod2_status"), "constructed"); + return 0; +} + +#include <support/test-driver.c> diff --git a/elf/tst-dlopen-sgid-mod.c b/elf/tst-dlopen-sgid-mod.c new file mode 100644 index 0000000..5eb79ee --- /dev/null +++ b/elf/tst-dlopen-sgid-mod.c @@ -0,0 +1 @@ +/* Opening this object should not succeed. */ diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c new file mode 100644 index 0000000..8aec52e --- /dev/null +++ b/elf/tst-dlopen-sgid.c @@ -0,0 +1,106 @@ +/* Test case for ignored LD_LIBRARY_PATH in static startup (bug 32976). + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <dlfcn.h> +#include <gnu/lib-names.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <support/capture_subprocess.h> +#include <support/check.h> +#include <support/support.h> +#include <support/temp_file.h> +#include <support/test-driver.h> +#include <sys/wait.h> +#include <unistd.h> + +/* This is the name of our test object. 
Use a custom module for + testing, so that this object does not get picked up from the system + path. */ +static const char dso_name[] = "tst-dlopen-sgid-mod.so"; + +/* Used to mark the recursive invocation. */ +static const char magic_argument[] = "run-actual-test"; + +static int +do_test (void) +{ +/* Pathname of the directory that receives the shared objects this + test attempts to load. */ + char *libdir = support_create_temp_directory ("tst-dlopen-sgid-"); + + /* This is supposed to be ignored and stripped. */ + TEST_COMPARE (setenv ("LD_LIBRARY_PATH", libdir, 1), 0); + + /* Copy of libc.so.6. */ + { + char *from = xasprintf ("%s/%s", support_objdir_root, LIBC_SO); + char *to = xasprintf ("%s/%s", libdir, LIBC_SO); + add_temp_file (to); + support_copy_file (from, to); + free (to); + free (from); + } + + /* Copy of the test object. */ + { + char *from = xasprintf ("%s/elf/%s", support_objdir_root, dso_name); + char *to = xasprintf ("%s/%s", libdir, dso_name); + add_temp_file (to); + support_copy_file (from, to); + free (to); + free (from); + } + + free (libdir); + + support_capture_subprogram_self_sgid (magic_argument); + + return 0; +} + +static void +alternative_main (int argc, char **argv) +{ + if (argc == 2 && strcmp (argv[1], magic_argument) == 0) + { + if (getgid () == getegid ()) + /* This can happen if the file system is mounted nosuid. */ + FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n", + (intmax_t) getgid ()); + + /* Should be removed due to SGID. 
*/ + TEST_COMPARE_STRING (getenv ("LD_LIBRARY_PATH"), NULL); + + TEST_VERIFY (dlopen (dso_name, RTLD_NOW) == NULL); + { + const char *message = dlerror (); + TEST_COMPARE_STRING (message, + "tst-dlopen-sgid-mod.so:" + " cannot open shared object file:" + " No such file or directory"); + } + + support_record_failure_barrier (); + exit (EXIT_SUCCESS); + } +} + +#define PREPARE alternative_main +#include <support/test-driver.c> diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c index a472190..233eec76 100644 --- a/elf/tst-env-setuid-tunables.c +++ b/elf/tst-env-setuid-tunables.c @@ -105,10 +105,7 @@ do_test (int argc, char **argv) if (ret != 0) exit (1); - - /* Special return code to make sure that the child executed all the way - through. */ - exit (42); + return 0; } else { @@ -127,18 +124,7 @@ do_test (int argc, char **argv) continue; } - int status = support_capture_subprogram_self_sgid (buf); - - /* Bail out early if unsupported. */ - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) - return EXIT_UNSUPPORTED; - - if (WEXITSTATUS (status) != 42) - { - printf (" [%d] child failed with status %d\n", i, - WEXITSTATUS (status)); - support_record_failure (); - } + support_capture_subprogram_self_sgid (buf); } return 0; } diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c index 43047c4..c084aa4 100644 --- a/elf/tst-env-setuid.c +++ b/elf/tst-env-setuid.c @@ -148,10 +148,7 @@ do_test (int argc, char **argv) if (ret != 0) exit (1); - - /* Special return code to make sure that the child executed all the way - through. 
*/ - exit (42); + return 0; } else { @@ -175,17 +172,7 @@ do_test (int argc, char **argv) free (profilepath); } - int status = support_capture_subprogram_self_sgid (SETGID_CHILD); - - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) - exit (EXIT_UNSUPPORTED); - - if (WEXITSTATUS (status) != 42) - { - printf (" child failed with status %d\n", - WEXITSTATUS (status)); - support_record_failure (); - } + support_capture_subprogram_self_sgid (SETGID_CHILD); return 0; } diff --git a/elf/tst-link-map-contiguous-ldso.c b/elf/tst-link-map-contiguous-ldso.c new file mode 100644 index 0000000..04de808 --- /dev/null +++ b/elf/tst-link-map-contiguous-ldso.c @@ -0,0 +1,98 @@ +/* Check that _dl_find_object behavior matches up with gaps. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <dlfcn.h> +#include <gnu/lib-names.h> +#include <link.h> +#include <stdbool.h> +#include <stdio.h> +#include <support/check.h> +#include <support/xdlfcn.h> +#include <support/xunistd.h> +#include <sys/mman.h> +#include <unistd.h> + +static int +do_test (void) +{ + struct link_map *l = xdlopen (LD_SO, RTLD_NOW); + if (!l->l_contiguous) + { + puts ("info: ld.so link map is not contiguous"); + + /* Try to find holes by probing with mmap. 
*/ + int pagesize = getpagesize (); + bool gap_found = false; + ElfW(Addr) addr = l->l_map_start; + TEST_COMPARE (addr % pagesize, 0); + while (addr < l->l_map_end) + { + void *expected = (void *) addr; + void *ptr = xmmap (expected, 1, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1); + struct dl_find_object dlfo; + int dlfo_ret = _dl_find_object (expected, &dlfo); + if (ptr == expected) + { + if (dlfo_ret < 0) + { + TEST_COMPARE (dlfo_ret, -1); + printf ("info: hole without mapping data found at %p\n", ptr); + } + else + FAIL ("object \"%s\" found in gap at %p", + dlfo.dlfo_link_map->l_name, ptr); + gap_found = true; + } + else if (dlfo_ret == 0) + { + if ((void *) dlfo.dlfo_link_map != (void *) l) + { + printf ("info: object \"%s\" found at %p\n", + dlfo.dlfo_link_map->l_name, ptr); + gap_found = true; + } + } + else + TEST_COMPARE (dlfo_ret, -1); + xmunmap (ptr, 1); + addr += pagesize; + } + if (!gap_found) + FAIL ("no ld.so gap found"); + } + else + { + puts ("info: ld.so link map is contiguous"); + + /* Assert that ld.so is truly contiguous in memory. */ + volatile long int *p = (volatile long int *) l->l_map_start; + volatile long int *end = (volatile long int *) l->l_map_end; + while (p < end) + { + *p; + ++p; + } + } + + xdlclose (l); + + return 0; +} + +#include <support/test-driver.c> diff --git a/elf/tst-link-map-contiguous-libc.c b/elf/tst-link-map-contiguous-libc.c new file mode 100644 index 0000000..eb5728c --- /dev/null +++ b/elf/tst-link-map-contiguous-libc.c @@ -0,0 +1,57 @@ +/* Check that the entire libc.so program image is readable if contiguous. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <gnu/lib-names.h> +#include <link.h> +#include <support/check.h> +#include <support/xdlfcn.h> +#include <support/xunistd.h> +#include <sys/mman.h> +#include <unistd.h> + +static int +do_test (void) +{ + struct link_map *l = xdlopen (LIBC_SO, RTLD_NOW); + + /* The dynamic loader fills holes with PROT_NONE mappings. */ + if (!l->l_contiguous) + FAIL_EXIT1 ("libc.so link map is not contiguous"); + + /* Direct probing does not work because not everything is readable + due to PROT_NONE mappings. */ + int pagesize = getpagesize (); + ElfW(Addr) addr = l->l_map_start; + TEST_COMPARE (addr % pagesize, 0); + while (addr < l->l_map_end) + { + void *expected = (void *) addr; + void *ptr = xmmap (expected, 1, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1); + if (ptr == expected) + FAIL ("hole in libc.so memory image after %lu bytes", + (unsigned long int) (addr - l->l_map_start)); + xmunmap (ptr, 1); + addr += pagesize; + } + + xdlclose (l); + + return 0; +} +#include <support/test-driver.c> diff --git a/elf/tst-link-map-contiguous-main.c b/elf/tst-link-map-contiguous-main.c new file mode 100644 index 0000000..2d1a054 --- /dev/null +++ b/elf/tst-link-map-contiguous-main.c @@ -0,0 +1,45 @@ +/* Check that the entire main program image is readable if contiguous. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <link.h> +#include <support/check.h> +#include <support/xdlfcn.h> + +static int +do_test (void) +{ + struct link_map *l = xdlopen ("", RTLD_NOW); + if (!l->l_contiguous) + FAIL_UNSUPPORTED ("main link map is not contiguous"); + + /* This check only works if the kernel loaded the main program. The + dynamic loader replaces gaps with PROT_NONE mappings, resulting + in faults. */ + volatile long int *p = (volatile long int *) l->l_map_start; + volatile long int *end = (volatile long int *) l->l_map_end; + while (p < end) + { + *p; + ++p; + } + + xdlclose (l); + + return 0; +} +#include <support/test-driver.c> diff --git a/elf/tst-rtld-no-malloc-audit.c b/elf/tst-rtld-no-malloc-audit.c new file mode 100644 index 0000000..a028377 --- /dev/null +++ b/elf/tst-rtld-no-malloc-audit.c @@ -0,0 +1 @@ +#include "tst-rtld-no-malloc.c" diff --git a/elf/tst-rtld-no-malloc-preload.c b/elf/tst-rtld-no-malloc-preload.c new file mode 100644 index 0000000..a028377 --- /dev/null +++ b/elf/tst-rtld-no-malloc-preload.c @@ -0,0 +1 @@ +#include "tst-rtld-no-malloc.c" diff --git a/elf/tst-rtld-no-malloc.c b/elf/tst-rtld-no-malloc.c new file mode 100644 index 0000000..5f24d4b --- /dev/null +++ b/elf/tst-rtld-no-malloc.c @@ -0,0 +1,76 @@ +/* Test that program loading does not call malloc. 
+ Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include <string.h> +#include <unistd.h> + +static void +print (const char *s) +{ + const char *end = s + strlen (s); + while (s < end) + { + ssize_t ret = write (STDOUT_FILENO, s, end - s); + if (ret <= 0) + _exit (2); + s += ret; + } +} + +static void __attribute__ ((noreturn)) +unexpected_call (const char *function) +{ + print ("error: unexpected call to "); + print (function); + print ("\n"); + _exit (1); +} + +/* These are the malloc functions implemented in elf/dl-minimal.c. */ + +void +free (void *ignored) +{ + unexpected_call ("free"); +} + +void * +calloc (size_t ignored1, size_t ignored2) +{ + unexpected_call ("calloc"); +} + +void * +malloc (size_t ignored) +{ + unexpected_call ("malloc"); +} + +void * +realloc (void *ignored1, size_t ignored2) +{ + unexpected_call ("realloc"); +} + +int +main (void) +{ + /* Do not use the test wrapper, to avoid spurious malloc calls from it. */ + return 0; +} diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S b/elf/tst-tls23-mod.c index 7d35ef2..3ee4c70 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S +++ b/elf/tst-tls23-mod.c @@ -1,5 +1,5 @@ -/* Optimized memchr implementation for POWER10/PPC64. 
- Copyright (C) 2016-2024 Free Software Foundation, Inc. +/* DSO used by tst-tls23. + Copyright (C) 2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,13 +16,17 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if defined __LITTLE_ENDIAN__ && IS_IN (libc) -#define MEMCHR __memchr_power10 +#include <tst-tls23.h> -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) -#undef weak_alias -#define weak_alias(name,alias) +__thread struct tls tls_var0 __attribute__ ((visibility ("hidden"))); -#include <sysdeps/powerpc/powerpc64/le/power10/memchr.S> -#endif +struct tls * +apply_tls (struct tls *p) +{ + INIT_TLS_CALL (); + BEFORE_TLS_CALL (); + tls_var0 = *p; + struct tls *ret = &tls_var0; + AFTER_TLS_CALL (); + return ret; +} diff --git a/elf/tst-tls23.c b/elf/tst-tls23.c new file mode 100644 index 0000000..afe594c --- /dev/null +++ b/elf/tst-tls23.c @@ -0,0 +1,106 @@ +/* Test that __tls_get_addr preserves caller-saved registers. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <dlfcn.h> +#include <pthread.h> +#include <support/xdlfcn.h> +#include <support/xthread.h> +#include <support/check.h> +#include <support/test-driver.h> +#include <tst-tls23.h> + +#ifndef IS_SUPPORTED +# define IS_SUPPORTED() true +#endif + +/* An architecture can define it to clobber caller-saved registers in + malloc below to verify that __tls_get_addr won't change caller-saved + registers. */ +#ifndef PREPARE_MALLOC +# define PREPARE_MALLOC() +#endif + +extern void * __libc_malloc (size_t); + +size_t malloc_counter = 0; + +void * +malloc (size_t n) +{ + PREPARE_MALLOC (); + malloc_counter++; + return __libc_malloc (n); +} + +static void *mod; +static const char *modname = "tst-tls23-mod.so"; + +static void +open_mod (void) +{ + mod = xdlopen (modname, RTLD_LAZY); + printf ("open %s\n", modname); +} + +static void +close_mod (void) +{ + xdlclose (mod); + mod = NULL; + printf ("close %s\n", modname); +} + +static void +access_mod (const char *sym) +{ + struct tls var = { -4, -4, -4, -4 }; + struct tls *(*f) (struct tls *) = xdlsym (mod, sym); + /* Check that our malloc is called. */ + malloc_counter = 0; + struct tls *p = f (&var); + TEST_VERIFY (malloc_counter != 0); + printf ("access %s: %s() = %p\n", modname, sym, p); + TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0); + ++(p->a); +} + +static void * +start (void *arg) +{ + access_mod ("apply_tls"); + return arg; +} + +static int +do_test (void) +{ + if (!IS_SUPPORTED ()) + return EXIT_UNSUPPORTED; + + open_mod (); + pthread_t t = xpthread_create (NULL, start, NULL); + xpthread_join (t); + close_mod (); + + return 0; +} + +#include <support/test-driver.c> diff --git a/elf/tst-tls23.h b/elf/tst-tls23.h new file mode 100644 index 0000000..d0e7345 --- /dev/null +++ b/elf/tst-tls23.h @@ -0,0 +1,40 @@ +/* Test that __tls_get_addr preserves caller-saved registers. + Copyright (C) 2025 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <stdint.h> + +struct tls +{ + int64_t a, b, c, d; +}; + +extern struct tls *apply_tls (struct tls *); + +/* An architecture can define them to verify that caller-saved registers + aren't changed by __tls_get_addr. */ +#ifndef INIT_TLS_CALL +# define INIT_TLS_CALL() +#endif + +#ifndef BEFORE_TLS_CALL +# define BEFORE_TLS_CALL() +#endif + +#ifndef AFTER_TLS_CALL +# define AFTER_TLS_CALL() +#endif diff --git a/elf/tst-version-hash-zero-linkmod.c b/elf/tst-version-hash-zero-linkmod.c new file mode 100644 index 0000000..15e2506 --- /dev/null +++ b/elf/tst-version-hash-zero-linkmod.c @@ -0,0 +1,22 @@ +/* Stub module for linking tst-version-hash-zero-refmod.so. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; see the file COPYING.LIB. If + not, see <https://www.gnu.org/licenses/>. */ + +/* The version script assigns a different symbol version for the stub + module. Loading the module with the incorrect version is expected + to fail. */ +#include "tst-version-hash-zero-mod.c" diff --git a/elf/tst-version-hash-zero-linkmod.map b/elf/tst-version-hash-zero-linkmod.map new file mode 100644 index 0000000..2dba7c2 --- /dev/null +++ b/elf/tst-version-hash-zero-linkmod.map @@ -0,0 +1,7 @@ +Base { + local: *; +}; + +OTHER_VERSION { + global: global_variable; +} Base; diff --git a/elf/tst-version-hash-zero-mod.c b/elf/tst-version-hash-zero-mod.c new file mode 100644 index 0000000..ac6b0dc --- /dev/null +++ b/elf/tst-version-hash-zero-mod.c @@ -0,0 +1,20 @@ +/* Test module with a zero version symbol hash. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; see the file COPYING.LIB. If + not, see <https://www.gnu.org/licenses/>. */ + +/* The symbol version is assigned by version script. 
*/ +int global_variable; diff --git a/elf/tst-version-hash-zero-mod.map b/elf/tst-version-hash-zero-mod.map new file mode 100644 index 0000000..41eaff7 --- /dev/null +++ b/elf/tst-version-hash-zero-mod.map @@ -0,0 +1,13 @@ +Base { + local: *; +}; + +/* Define the version so that tst-version-hash-zero-refmod.so passes + the initial symbol version check. */ +OTHER_VERSION { +} Base; + +/* This version string hashes to zero. */ +PPPPPPPPPPPP { + global: global_variable; +} Base; diff --git a/elf/tst-version-hash-zero-refmod.c b/elf/tst-version-hash-zero-refmod.c new file mode 100644 index 0000000..cd8b3dc --- /dev/null +++ b/elf/tst-version-hash-zero-refmod.c @@ -0,0 +1,23 @@ +/* Test module that triggers a relocation failure in tst-version-hash-zero. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; see the file COPYING.LIB. If + not, see <https://www.gnu.org/licenses/>. */ + +/* This is bound to global_variable@@OTHER_VERSION via + tst-version-hash-zero-linkmod.so, but at run time, only + global_variable@PPPPPPPPPPPP exists. 
*/ +extern int global_variable; +int *pointer_variable = &global_variable; diff --git a/elf/tst-version-hash-zero.c b/elf/tst-version-hash-zero.c new file mode 100644 index 0000000..66a0db4 --- /dev/null +++ b/elf/tst-version-hash-zero.c @@ -0,0 +1,56 @@ +/* Symbols with version hash zero should not match any version (bug 29190). + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; see the file COPYING.LIB. If + not, see <https://www.gnu.org/licenses/>. */ + +#include <support/check.h> +#include <support/xdlfcn.h> +#include <stddef.h> +#include <string.h> + +static int +do_test (void) +{ + void *handle = xdlopen ("tst-version-hash-zero-mod.so", RTLD_NOW); + + /* This used to crash because some struct r_found_version entries + with hash zero did not have valid version strings. */ + TEST_VERIFY (xdlvsym (handle, "global_variable", "PPPPPPPPPPPP") != NULL); + + /* Consistency check. */ + TEST_VERIFY (xdlsym (handle, "global_variable") + == xdlvsym (handle, "global_variable", "PPPPPPPPPPPP")); + + /* This symbol version is supposed to be missing. */ + TEST_VERIFY (dlvsym (handle, "global_variable", "OTHER_VERSION") == NULL); + + /* tst-version-hash-zero-refmod.so references + global_variable@@OTHER_VERSION and is expected to fail to load. + dlvsym sets the hidden flag during lookup. 
Relocation does not, + so this exercises a different failure case. */ + TEST_VERIFY_EXIT (dlopen ("tst-version-hash-zero-refmod.so", RTLD_NOW) + == NULL); + const char *message = dlerror (); + if (strstr (message, + ": undefined symbol: global_variable, version OTHER_VERSION") + == NULL) + FAIL_EXIT1 ("unexpected dlopen failure: %s", message); + + xdlclose (handle); + return 0; +} + +#include <support/test-driver.c> diff --git a/include/ctype.h b/include/ctype.h index 493a6f8..a15e5b6 100644 --- a/include/ctype.h +++ b/include/ctype.h @@ -24,33 +24,35 @@ libc_hidden_proto (toupper) NL_CURRENT_INDIRECT. */ # include "../locale/localeinfo.h" -# include <libc-tsd.h> # ifndef CTYPE_EXTERN_INLINE /* Used by ctype/ctype-info.c, which see. */ # define CTYPE_EXTERN_INLINE extern inline # endif -__libc_tsd_define (extern, const uint16_t *, CTYPE_B) -__libc_tsd_define (extern, const int32_t *, CTYPE_TOUPPER) -__libc_tsd_define (extern, const int32_t *, CTYPE_TOLOWER) +extern __thread const uint16_t * __libc_tsd_CTYPE_B + attribute_hidden attribute_tls_model_ie; +extern __thread const int32_t * __libc_tsd_CTYPE_TOUPPER + attribute_hidden attribute_tls_model_ie; +extern __thread const int32_t * __libc_tsd_CTYPE_TOLOWER + attribute_hidden attribute_tls_model_ie; CTYPE_EXTERN_INLINE const uint16_t ** __attribute__ ((const)) __ctype_b_loc (void) { - return __libc_tsd_address (const uint16_t *, CTYPE_B); + return &__libc_tsd_CTYPE_B; } CTYPE_EXTERN_INLINE const int32_t ** __attribute__ ((const)) __ctype_toupper_loc (void) { - return __libc_tsd_address (const int32_t *, CTYPE_TOUPPER); + return &__libc_tsd_CTYPE_TOUPPER; } CTYPE_EXTERN_INLINE const int32_t ** __attribute__ ((const)) __ctype_tolower_loc (void) { - return __libc_tsd_address (const int32_t *, CTYPE_TOLOWER); + return &__libc_tsd_CTYPE_TOLOWER; } # ifndef __NO_CTYPE @@ -64,6 +66,11 @@ __ctype_tolower_loc (void) # define __isdigit_l(c, l) ({ int __c = (c); __c >= '0' && __c <= '9'; }) # endif /* Not __NO_CTYPE. 
*/ +/* For use in initializers. */ +extern const char _nl_C_LC_CTYPE_class[] attribute_hidden; +extern const uint32_t _nl_C_LC_CTYPE_toupper[] attribute_hidden; +extern const uint32_t _nl_C_LC_CTYPE_tolower[] attribute_hidden; + # endif /* IS_IN (libc). */ #endif /* Not _ISOMAC. */ diff --git a/include/link.h b/include/link.h index 5ed445d..7ca305f 100644 --- a/include/link.h +++ b/include/link.h @@ -365,6 +365,8 @@ struct auditstate dynamic linker. */ extern struct r_debug_extended _r_debug_extended attribute_hidden; +rtld_hidden_proto (_r_debug) + #if __ELF_NATIVE_CLASS == 32 # define symbind symbind32 # define LA_SYMBIND "la_symbind32" diff --git a/include/rpc/rpc.h b/include/rpc/rpc.h index f5cee6c..ba96783 100644 --- a/include/rpc/rpc.h +++ b/include/rpc/rpc.h @@ -3,8 +3,6 @@ # ifndef _ISOMAC -#include <libc-tsd.h> - /* Now define the internal interfaces. */ extern unsigned long _create_xid (void); @@ -47,7 +45,8 @@ extern void __rpc_thread_key_cleanup (void) attribute_hidden; extern void __rpc_thread_destroy (void) attribute_hidden; -__libc_tsd_define (extern, struct rpc_thread_variables *, RPC_VARS) +extern __thread struct rpc_thread_variables *__libc_tsd_RPC_VARS + attribute_hidden attribute_tls_model_ie; #define RPC_THREAD_VARIABLE(x) (__rpc_thread_variables()->x) diff --git a/libio/Makefile b/libio/Makefile index b92aeaf..b189455 100644 --- a/libio/Makefile +++ b/libio/Makefile @@ -68,22 +68,80 @@ routines_no_fortify += \ wprintf \ # routines_no_fortify -tests = tst_swprintf tst_wprintf tst_swscanf tst_wscanf tst_getwc tst_putwc \ - tst_wprintf2 tst-widetext test-fmemopen tst-ext tst-ext2 \ - tst-fgetws tst-ungetwc1 tst-ungetwc2 tst-swscanf tst-sscanf \ - tst-mmap-setvbuf bug-ungetwc1 bug-ungetwc2 tst-atime tst-eof \ - tst-freopen bug-rewind bug-rewind2 bug-ungetc bug-fseek \ - tst-mmap-eofsync tst-mmap-fflushsync bug-mmap-fflush \ - tst-mmap2-eofsync tst-mmap-offend bug-fopena+ bug-wfflush \ - bug-ungetc2 bug-ftell bug-ungetc3 bug-ungetc4 tst-fopenloc2 \ 
- tst-memstream1 tst-memstream2 tst-memstream3 tst-memstream4 \ - tst-wmemstream1 tst-wmemstream2 tst-wmemstream3 tst-wmemstream4 \ - tst-wmemstream5 bug-memstream1 bug-wmemstream1 \ - tst-setvbuf1 tst-popen1 tst-fgetwc bug-wsetpos tst-fseek \ - tst-fwrite-error tst-ftell-partial-wide tst-ftell-active-handler \ - tst-ftell-append tst-fputws tst-bz22415 tst-fgetc-after-eof \ - tst-sprintf-ub tst-sprintf-chk-ub tst-bz24051 tst-bz24153 \ - tst-wfile-sync tst-bz28828 tst-getdelim +tests = \ + bug-fopena+ \ + bug-fseek \ + bug-ftell \ + bug-memstream1 \ + bug-mmap-fflush \ + bug-rewind \ + bug-rewind2 \ + bug-ungetc \ + bug-ungetc2 \ + bug-ungetc3 \ + bug-ungetc4 \ + bug-ungetwc1 \ + bug-ungetwc2 \ + bug-wfflush \ + bug-wmemstream1 \ + bug-wsetpos \ + test-fmemopen \ + tst-atime \ + tst-bz22415 \ + tst-bz24051 \ + tst-bz24153 \ + tst-bz28828 \ + tst-eof \ + tst-ext \ + tst-ext2 \ + tst-fdopen-seek-failure \ + tst-fgetc-after-eof \ + tst-fgetwc \ + tst-fgetws \ + tst-fopenloc2 \ + tst-fputws \ + tst-freopen \ + tst-fseek \ + tst-ftell-active-handler \ + tst-ftell-append \ + tst-ftell-partial-wide \ + tst-fwrite-error \ + tst-getdelim \ + tst-memstream1 \ + tst-memstream2 \ + tst-memstream3 \ + tst-memstream4 \ + tst-mmap-eofsync \ + tst-mmap-fflushsync \ + tst-mmap-offend \ + tst-mmap-setvbuf \ + tst-mmap2-eofsync \ + tst-popen-fork \ + tst-popen1 \ + tst-setvbuf1 \ + tst-sprintf-chk-ub \ + tst-sprintf-ub \ + tst-sscanf \ + tst-swscanf \ + tst-ungetwc1 \ + tst-ungetwc2 \ + tst-wfile-sync \ + tst-widetext \ + tst-wmemstream1 \ + tst-wmemstream2 \ + tst-wmemstream3 \ + tst-wmemstream4 \ + tst-wmemstream5 \ + tst_getwc \ + tst_putwc \ + tst_swprintf \ + tst_swscanf \ + tst_wprintf \ + tst_wprintf2 \ + tst_wscanf \ + # tests + +$(objpfx)tst-popen-fork: $(shared-thread-library) tests-internal = tst-vtables tst-vtables-interposed @@ -196,6 +254,9 @@ tst_wprintf2-ARGS = "Some Text" test-fmemopen-ENV = MALLOC_TRACE=$(objpfx)test-fmemopen.mtrace \ 
LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so +tst-fdopen-seek-failure-ENV = \ + MALLOC_TRACE=$(objpfx)tst-fdopen-seek-failure.mtrace \ + LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so tst-fopenloc-ENV = MALLOC_TRACE=$(objpfx)tst-fopenloc.mtrace \ LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so tst-bz22415-ENV = MALLOC_TRACE=$(objpfx)tst-bz22415.mtrace \ @@ -204,6 +265,7 @@ tst-bz24228-ENV = MALLOC_TRACE=$(objpfx)tst-bz24228.mtrace \ LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so generated += test-fmemopen.mtrace test-fmemopen.check +generated += tst-fdopen-seek-failure.mtrace tst-fdopen-seek-failure.check generated += tst-fopenloc.mtrace tst-fopenloc.check generated += tst-bz22415.mtrace tst-bz22415.check @@ -226,8 +288,12 @@ shared-only-routines = oldiofopen oldiofdopen oldiofclose oldfileops \ oldiofsetpos64 ifeq ($(run-built-tests),yes) -tests-special += $(objpfx)test-freopen.out $(objpfx)test-fmemopen-mem.out \ - $(objpfx)tst-bz22415-mem.out +tests-special += \ + $(objpfx)test-fmemopen-mem.out \ + $(objpfx)test-freopen.out \ + $(objpfx)tst-bz22415-mem.out \ + $(objpfx)tst-fdopen-seek-failure-mem.out \ + # tests-special ifeq (yes,$(build-shared)) # Run tst-fopenloc-cmp.out and tst-openloc-mem.out only if shared # library is enabled since they depend on tst-fopenloc.out. 
@@ -235,16 +301,26 @@ tests-special += $(objpfx)tst-fopenloc-cmp.out $(objpfx)tst-fopenloc-mem.out \ $(objpfx)tst-bz24228-mem.out endif -tests += tst-cleanup-default tst-cleanup-default-static +tests += \ + tst-cleanup-default \ + tst-cleanup-default-static \ + # tests tests-static += tst-cleanup-default-static tests-special += $(objpfx)tst-cleanup-default-cmp.out $(objpfx)tst-cleanup-default-static-cmp.out LDFLAGS-tst-cleanup-default = -Wl,--gc-sections LDFLAGS-tst-cleanup-default-static = -Wl,--gc-sections ifeq ($(have-gnu-retain)$(have-z-start-stop-gc),yesyes) -tests += tst-cleanup-start-stop-gc tst-cleanup-start-stop-gc-static \ - tst-cleanup-nostart-stop-gc tst-cleanup-nostart-stop-gc-static -tests-static += tst-cleanup-start-stop-gc-static tst-cleanup-nostart-stop-gc-static +tests += \ + tst-cleanup-nostart-stop-gc \ + tst-cleanup-nostart-stop-gc-static \ + tst-cleanup-start-stop-gc \ + tst-cleanup-start-stop-gc-static \ + # tests +tests-static += \ + tst-cleanup-nostart-stop-gc-static \ + tst-cleanup-start-stop-gc-static \ + # tests-static tests-special += $(objpfx)tst-cleanup-start-stop-gc-cmp.out \ $(objpfx)tst-cleanup-start-stop-gc-static-cmp.out \ $(objpfx)tst-cleanup-nostart-stop-gc-cmp.out \ @@ -305,6 +381,11 @@ $(objpfx)test-fmemopen-mem.out: $(objpfx)test-fmemopen.out $(common-objpfx)malloc/mtrace $(objpfx)test-fmemopen.mtrace > $@; \ $(evaluate-test) +$(objpfx)tst-fdopen-seek-failure-mem.out: $(objpfx)tst-fdopen-seek-failure.out + $(common-objpfx)malloc/mtrace \ + $(objpfx)tst-fdopen-seek-failure.mtrace > $@; \ + $(evaluate-test) + $(objpfx)tst-fopenloc-mem.out: $(objpfx)tst-fopenloc.out $(common-objpfx)malloc/mtrace $(objpfx)tst-fopenloc.mtrace > $@; \ $(evaluate-test) diff --git a/libio/iofdopen.c b/libio/iofdopen.c index 2583fb8..14fbc7b 100644 --- a/libio/iofdopen.c +++ b/libio/iofdopen.c @@ -156,7 +156,11 @@ _IO_new_fdopen (int fd, const char *mode) { off64_t new_pos = _IO_SYSSEEK (&new_f->fp.file, 0, _IO_seek_end); if (new_pos == _IO_pos_BAD 
&& errno != ESPIPE) - return NULL; + { + _IO_un_link (&new_f->fp); + free (new_f); + return NULL; + } } return &new_f->fp.file; } diff --git a/libio/iopopen.c b/libio/iopopen.c index d01cb06..352513a 100644 --- a/libio/iopopen.c +++ b/libio/iopopen.c @@ -57,6 +57,26 @@ unlock (void *not_used) } #endif +/* These lock/unlock/resetlock functions are used during fork. */ + +void +_IO_proc_file_chain_lock (void) +{ + _IO_lock_lock (proc_file_chain_lock); +} + +void +_IO_proc_file_chain_unlock (void) +{ + _IO_lock_unlock (proc_file_chain_lock); +} + +void +_IO_proc_file_chain_resetlock (void) +{ + _IO_lock_init (proc_file_chain_lock); +} + /* POSIX states popen shall ensure that any streams from previous popen() calls that remain open in the parent process should be closed in the new child process. diff --git a/libio/libioP.h b/libio/libioP.h index 616253f..a83a411 100644 --- a/libio/libioP.h +++ b/libio/libioP.h @@ -429,6 +429,12 @@ libc_hidden_proto (_IO_list_resetlock) extern void _IO_enable_locks (void) __THROW; libc_hidden_proto (_IO_enable_locks) +/* Functions for operating popen's proc_file_chain_lock during fork. */ + +extern void _IO_proc_file_chain_lock (void) __THROW attribute_hidden; +extern void _IO_proc_file_chain_unlock (void) __THROW attribute_hidden; +extern void _IO_proc_file_chain_resetlock (void) __THROW attribute_hidden; + /* Default jumptable functions. */ extern int _IO_default_underflow (FILE *) __THROW; diff --git a/libio/tst-fdopen-seek-failure.c b/libio/tst-fdopen-seek-failure.c new file mode 100644 index 0000000..5c4d40a --- /dev/null +++ b/libio/tst-fdopen-seek-failure.c @@ -0,0 +1,48 @@ +/* Test for fdopen memory leak without SEEK_END support (bug 31840). + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include <errno.h> +#include <fcntl.h> +#include <mcheck.h> +#include <stddef.h> +#include <stdio.h> +#include <support/check.h> +#include <support/xunistd.h> +#include <unistd.h> + +static int +do_test (void) +{ + mtrace (); + + /* This file is special because it is seekable, but only + with SEEK_SET, not SEEK_END. */ + int fd = open ("/proc/self/mem", O_RDWR); + if (fd < 0) + FAIL_UNSUPPORTED ("/proc/self/mem not found: %m"); + FILE *fp = fdopen (fd, "a"); + /* The fdopen call should have failed because it tried to use + SEEK_END. */ + TEST_VERIFY (fp == NULL); + TEST_COMPARE (errno, EINVAL); + xclose (fd); + return 0; +} + +#include <support/test-driver.c> diff --git a/libio/tst-popen-fork.c b/libio/tst-popen-fork.c new file mode 100644 index 0000000..1df30fc --- /dev/null +++ b/libio/tst-popen-fork.c @@ -0,0 +1,80 @@ +/* Test concurrent popen and fork. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <stdatomic.h> +#include <pthread.h> +#include <unistd.h> +#include <sys/wait.h> + +#include <support/check.h> +#include <support/xthread.h> +#include <support/xunistd.h> + +static void +popen_and_pclose (void) +{ + FILE *f = popen ("true", "r"); + TEST_VERIFY_EXIT (f != NULL); + pclose (f); + return; +} + +static atomic_bool done = ATOMIC_VAR_INIT (0); + +static void * +popen_and_pclose_forever (__attribute__ ((unused)) + void *arg) +{ + while (!atomic_load_explicit (&done, memory_order_acquire)) + popen_and_pclose (); + return NULL; +} + +static int +do_test (void) +{ + + /* Repeatedly call popen in a loop during the entire test. */ + pthread_t t = xpthread_create (NULL, popen_and_pclose_forever, NULL); + + /* Repeatedly fork off and reap child processes one-by-one. + Each child calls popen once, then exits, leading to the possibility + that a child forks *during* our own popen call, thus inheriting any + intermediate popen state, possibly including lock state(s). */ + for (int i = 0; i < 100; i++) + { + int cpid = xfork (); + + if (cpid == 0) + { + popen_and_pclose (); + _exit (0); + } + else + xwaitpid (cpid, NULL, 0); + } + + /* Stop calling popen. */ + atomic_store_explicit (&done, 1, memory_order_release); + xpthread_join (t); + + return 0; +} + +#include <support/test-driver.c> diff --git a/locale/lc-ctype.c b/locale/lc-ctype.c index c77ec51..70556ac 100644 --- a/locale/lc-ctype.c +++ b/locale/lc-ctype.c @@ -64,12 +64,9 @@ _nl_postload_ctype (void) in fact using the global locale. 
*/ if (_NL_CURRENT_LOCALE == &_nl_global_locale) { - __libc_tsd_set (const uint16_t *, CTYPE_B, - (void *) _nl_global_locale.__ctype_b); - __libc_tsd_set (const int32_t *, CTYPE_TOUPPER, - (void *) _nl_global_locale.__ctype_toupper); - __libc_tsd_set (const int32_t *, CTYPE_TOLOWER, - (void *) _nl_global_locale.__ctype_tolower); + __libc_tsd_CTYPE_B = _nl_global_locale.__ctype_b; + __libc_tsd_CTYPE_TOUPPER = _nl_global_locale.__ctype_toupper; + __libc_tsd_CTYPE_TOLOWER = _nl_global_locale.__ctype_tolower; } #include <shlib-compat.h> diff --git a/locale/localeinfo.h b/locale/localeinfo.h index ed698fa..c3249d3 100644 --- a/locale/localeinfo.h +++ b/locale/localeinfo.h @@ -236,10 +236,9 @@ extern struct __locale_struct _nl_global_locale attribute_hidden; /* This fetches the thread-local locale_t pointer, either one set with uselocale or &_nl_global_locale. */ -#define _NL_CURRENT_LOCALE (__libc_tsd_get (locale_t, LOCALE)) -#include <libc-tsd.h> -__libc_tsd_define (extern, locale_t, LOCALE) - +#define _NL_CURRENT_LOCALE __libc_tsd_LOCALE +extern __thread locale_t __libc_tsd_LOCALE + attribute_hidden attribute_tls_model_ie; /* For static linking it is desireable to avoid always linking in the code and data for every category when we can tell at link time that they are diff --git a/locale/uselocale.c b/locale/uselocale.c index 8136caf..0b247a7 100644 --- a/locale/uselocale.c +++ b/locale/uselocale.c @@ -34,7 +34,7 @@ __uselocale (locale_t newloc) { const locale_t locobj = newloc == LC_GLOBAL_LOCALE ? &_nl_global_locale : newloc; - __libc_tsd_set (locale_t, LOCALE, locobj); + __libc_tsd_LOCALE = locobj; #ifdef NL_CURRENT_INDIRECT /* Now we must update all the per-category thread-local variables to @@ -62,11 +62,9 @@ __uselocale (locale_t newloc) #endif /* Update the special tsd cache of some locale data. 
*/ - __libc_tsd_set (const uint16_t *, CTYPE_B, (void *) locobj->__ctype_b); - __libc_tsd_set (const int32_t *, CTYPE_TOLOWER, - (void *) locobj->__ctype_tolower); - __libc_tsd_set (const int32_t *, CTYPE_TOUPPER, - (void *) locobj->__ctype_toupper); + __libc_tsd_CTYPE_B = locobj->__ctype_b; + __libc_tsd_CTYPE_TOLOWER = locobj->__ctype_tolower; + __libc_tsd_CTYPE_TOUPPER = locobj->__ctype_toupper; } return oldloc == &_nl_global_locale ? LC_GLOBAL_LOCALE : oldloc; diff --git a/locale/xlocale.c b/locale/xlocale.c index f2b9d03..d11c1cb 100644 --- a/locale/xlocale.c +++ b/locale/xlocale.c @@ -18,18 +18,13 @@ #include <locale.h> #include "localeinfo.h" +#include <ctype.h> #define DEFINE_CATEGORY(category, category_name, items, a) \ extern struct __locale_data _nl_C_##category; #include "categories.def" #undef DEFINE_CATEGORY -/* Defined in locale/C-ctype.c. */ -extern const char _nl_C_LC_CTYPE_class[] attribute_hidden; -extern const char _nl_C_LC_CTYPE_toupper[] attribute_hidden; -extern const char _nl_C_LC_CTYPE_tolower[] attribute_hidden; - - const struct __locale_struct _nl_C_locobj attribute_hidden = { .__locales = diff --git a/manual/tunables.texi b/manual/tunables.texi index be97190..b255a14 100644 --- a/manual/tunables.texi +++ b/manual/tunables.texi @@ -52,6 +52,7 @@ glibc.elision.skip_lock_busy: 3 (min: 0, max: 2147483647) glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0xffffffffffffffff) glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff) glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff) +glibc.cpu.x86_memset_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0xfffffffffffffff) glibc.cpu.x86_shstk: glibc.pthread.stack_cache_size: 0x2800000 (min: 0x0, max: 0xffffffffffffffff) glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff) @@ -485,7 +486,8 @@ thread stack originally backup by Huge Pages to default pages. 
@cindex shared_cache_size tunables @cindex tunables, shared_cache_size @cindex non_temporal_threshold tunables -@cindex tunables, non_temporal_threshold +@cindex memset_non_temporal_threshold tunables +@cindex tunables, non_temporal_threshold, memset_non_temporal_threshold @deftp {Tunable namespace} glibc.cpu Behavior of @theglibc{} can be tuned to assume specific hardware capabilities @@ -561,6 +563,18 @@ like memmove and memcpy. This tunable is specific to i386 and x86-64. @end deftp +@deftp Tunable glibc.cpu.x86_memset_non_temporal_threshold +The @code{glibc.cpu.x86_memset_non_temporal_threshold} tunable allows +the user to set the threshold in bytes for non temporal stores in +memset. Non temporal stores give a hint to the hardware to move data +directly to memory without displacing other data from the cache. This +tunable is used by some platforms to determine when to use non +temporal stores in memset. + +This tunable is specific to i386 and x86-64. +@end deftp + + @deftp Tunable glibc.cpu.x86_rep_movsb_threshold The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to set threshold in bytes to start using "rep movsb". 
The value must be diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in index d728f97..5a69002 100644 --- a/math/auto-libm-test-in +++ b/math/auto-libm-test-in @@ -5354,7 +5354,7 @@ exp2 -0x4.8ce878p-4 exp2 0xf.93d18bf7be8d272p-4 expm1 0 -expm1 -0 no-mathvec +expm1 -0 expm1 1 expm1 0.75 expm1 2 @@ -5419,7 +5419,7 @@ expm1 -0x1p-100 expm1 0x1p-600 expm1 -0x1p-600 expm1 0x1p-10000 -expm1 -0x1p-10000 no-mathvec +expm1 -0x1p-10000 expm1 0xe.4152ac57cd1ea7ap-60 expm1 0x6.660247486aed8p-4 expm1 0x6.289a78p-4 @@ -6577,7 +6577,7 @@ log10 0xf.bf1b2p-4 log10 0x1.6b5f7ap+96 log1p 0 -log1p -0 no-mathvec +log1p -0 log1p e-1 log1p -0.25 log1p -0.875 @@ -7318,7 +7318,7 @@ pow 0x1.7ac7cp+5 23 pow -0x1.7ac7cp+5 23 sin 0 -sin -0 no-mathvec +sin -0 sin pi/6 sin -pi/6 sin pi/2 @@ -7655,7 +7655,7 @@ sqrt min sqrt min_subnorm tan 0 -tan -0 no-mathvec +tan -0 tan pi/4 tan pi/2 tan -pi/2 diff --git a/math/auto-libm-test-out-expm1 b/math/auto-libm-test-out-expm1 index 91da41b..8483455 100644 --- a/math/auto-libm-test-out-expm1 +++ b/math/auto-libm-test-out-expm1 @@ -23,31 +23,31 @@ expm1 0 = expm1 tonearest ibm128 0x0p+0 : 0x0p+0 : inexact-ok = expm1 towardzero ibm128 0x0p+0 : 0x0p+0 : inexact-ok = expm1 upward ibm128 0x0p+0 : 0x0p+0 : inexact-ok -expm1 -0 no-mathvec -= expm1 downward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 tonearest binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 towardzero binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 upward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 downward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 tonearest binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 towardzero binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 upward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 downward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 tonearest intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 towardzero intel96 -0x0p+0 : 
-0x0p+0 : no-mathvec inexact-ok -= expm1 upward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 downward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 tonearest m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 towardzero m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 upward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 downward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 tonearest binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 towardzero binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 upward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 downward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 tonearest ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 towardzero ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 upward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok +expm1 -0 += expm1 downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 tonearest intel96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 towardzero intel96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 upward intel96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 downward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 tonearest m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 towardzero m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 upward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 downward binary128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 tonearest binary128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 towardzero binary128 
-0x0p+0 : -0x0p+0 : inexact-ok += expm1 upward binary128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 downward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 tonearest ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 towardzero ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 upward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok expm1 1 = expm1 downward binary32 0x1p+0 : 0x1.b7e15p+0 : inexact-ok = expm1 tonearest binary32 0x1p+0 : 0x1.b7e152p+0 : inexact-ok @@ -1880,87 +1880,87 @@ expm1 0x1p-10000 = expm1 tonearest binary128 0x1p-10000 : 0x1p-10000 : inexact-ok = expm1 towardzero binary128 0x1p-10000 : 0x1p-10000 : inexact-ok = expm1 upward binary128 0x1p-10000 : 0x1.0000000000000000000000000001p-10000 : inexact-ok -expm1 -0x1p-10000 no-mathvec -= expm1 downward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 tonearest binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 towardzero binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 upward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 downward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 tonearest binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 towardzero binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 upward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 downward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 tonearest intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 towardzero intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 upward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 downward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 tonearest m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 towardzero m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 upward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 downward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 tonearest binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 towardzero 
binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 upward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 downward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 tonearest ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 towardzero ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 upward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= expm1 downward binary32 -0x8p-152 : -0x8p-152 : no-mathvec inexact-ok underflow errno-erange-ok -= expm1 tonearest binary32 -0x8p-152 : -0x8p-152 : no-mathvec inexact-ok underflow errno-erange-ok -= expm1 towardzero binary32 -0x8p-152 : -0x0p+0 : no-mathvec inexact-ok underflow errno-erange-ok -= expm1 upward binary32 -0x8p-152 : -0x0p+0 : no-mathvec inexact-ok underflow errno-erange-ok -= expm1 downward binary64 -0x8p-152 : -0x8p-152 : no-mathvec inexact-ok -= expm1 tonearest binary64 -0x8p-152 : -0x8p-152 : no-mathvec inexact-ok -= expm1 towardzero binary64 -0x8p-152 : -0x7.ffffffffffffcp-152 : no-mathvec inexact-ok -= expm1 upward binary64 -0x8p-152 : -0x7.ffffffffffffcp-152 : no-mathvec inexact-ok -= expm1 downward intel96 -0x8p-152 : -0x8p-152 : no-mathvec inexact-ok -= expm1 tonearest intel96 -0x8p-152 : -0x8p-152 : no-mathvec inexact-ok -= expm1 towardzero intel96 -0x8p-152 : -0x7.fffffffffffffff8p-152 : no-mathvec inexact-ok -= expm1 upward intel96 -0x8p-152 : -0x7.fffffffffffffff8p-152 : no-mathvec inexact-ok -= expm1 downward m68k96 -0x8p-152 : -0x8p-152 : no-mathvec inexact-ok -= expm1 tonearest m68k96 -0x8p-152 : -0x8p-152 : no-mathvec inexact-ok -= expm1 towardzero m68k96 -0x8p-152 : -0x7.fffffffffffffff8p-152 : no-mathvec inexact-ok -= expm1 upward m68k96 -0x8p-152 : -0x7.fffffffffffffff8p-152 : no-mathvec inexact-ok -= expm1 downward binary128 -0x8p-152 : -0x8p-152 : no-mathvec inexact-ok -= expm1 tonearest binary128 -0x8p-152 : -0x8p-152 : no-mathvec inexact-ok -= expm1 towardzero binary128 -0x8p-152 : -0x7.fffffffffffffffffffffffffffcp-152 : no-mathvec inexact-ok 
-= expm1 upward binary128 -0x8p-152 : -0x7.fffffffffffffffffffffffffffcp-152 : no-mathvec inexact-ok -= expm1 downward ibm128 -0x8p-152 : -0x8p-152 : no-mathvec inexact-ok -= expm1 tonearest ibm128 -0x8p-152 : -0x8p-152 : no-mathvec inexact-ok -= expm1 towardzero ibm128 -0x8p-152 : -0x7.fffffffffffffffffffffffffep-152 : no-mathvec inexact-ok -= expm1 upward ibm128 -0x8p-152 : -0x7.fffffffffffffffffffffffffep-152 : no-mathvec inexact-ok -= expm1 downward binary64 -0x4p-1076 : -0x4p-1076 : no-mathvec inexact-ok underflow errno-erange-ok -= expm1 tonearest binary64 -0x4p-1076 : -0x4p-1076 : no-mathvec inexact-ok underflow errno-erange-ok -= expm1 towardzero binary64 -0x4p-1076 : -0x0p+0 : no-mathvec inexact-ok underflow errno-erange-ok -= expm1 upward binary64 -0x4p-1076 : -0x0p+0 : no-mathvec inexact-ok underflow errno-erange-ok -= expm1 downward intel96 -0x4p-1076 : -0x4p-1076 : no-mathvec inexact-ok -= expm1 tonearest intel96 -0x4p-1076 : -0x4p-1076 : no-mathvec inexact-ok -= expm1 towardzero intel96 -0x4p-1076 : -0x3.fffffffffffffffcp-1076 : no-mathvec inexact-ok -= expm1 upward intel96 -0x4p-1076 : -0x3.fffffffffffffffcp-1076 : no-mathvec inexact-ok -= expm1 downward m68k96 -0x4p-1076 : -0x4p-1076 : no-mathvec inexact-ok -= expm1 tonearest m68k96 -0x4p-1076 : -0x4p-1076 : no-mathvec inexact-ok -= expm1 towardzero m68k96 -0x4p-1076 : -0x3.fffffffffffffffcp-1076 : no-mathvec inexact-ok -= expm1 upward m68k96 -0x4p-1076 : -0x3.fffffffffffffffcp-1076 : no-mathvec inexact-ok -= expm1 downward binary128 -0x4p-1076 : -0x4p-1076 : no-mathvec inexact-ok -= expm1 tonearest binary128 -0x4p-1076 : -0x4p-1076 : no-mathvec inexact-ok -= expm1 towardzero binary128 -0x4p-1076 : -0x3.fffffffffffffffffffffffffffep-1076 : no-mathvec inexact-ok -= expm1 upward binary128 -0x4p-1076 : -0x3.fffffffffffffffffffffffffffep-1076 : no-mathvec inexact-ok -= expm1 downward ibm128 -0x4p-1076 : -0x4p-1076 : no-mathvec xfail:ibm128-libgcc inexact-ok underflow errno-erange-ok -= expm1 tonearest 
ibm128 -0x4p-1076 : -0x4p-1076 : no-mathvec inexact-ok underflow errno-erange-ok -= expm1 towardzero ibm128 -0x4p-1076 : -0x0p+0 : no-mathvec xfail:ibm128-libgcc inexact-ok underflow errno-erange-ok -= expm1 upward ibm128 -0x4p-1076 : -0x0p+0 : no-mathvec xfail:ibm128-libgcc inexact-ok underflow errno-erange-ok -= expm1 downward intel96 -0x1p-10000 : -0x1p-10000 : no-mathvec inexact-ok -= expm1 tonearest intel96 -0x1p-10000 : -0x1p-10000 : no-mathvec inexact-ok -= expm1 towardzero intel96 -0x1p-10000 : -0xf.fffffffffffffffp-10004 : no-mathvec inexact-ok -= expm1 upward intel96 -0x1p-10000 : -0xf.fffffffffffffffp-10004 : no-mathvec inexact-ok -= expm1 downward m68k96 -0x1p-10000 : -0x1p-10000 : no-mathvec inexact-ok -= expm1 tonearest m68k96 -0x1p-10000 : -0x1p-10000 : no-mathvec inexact-ok -= expm1 towardzero m68k96 -0x1p-10000 : -0xf.fffffffffffffffp-10004 : no-mathvec inexact-ok -= expm1 upward m68k96 -0x1p-10000 : -0xf.fffffffffffffffp-10004 : no-mathvec inexact-ok -= expm1 downward binary128 -0x1p-10000 : -0x1p-10000 : no-mathvec inexact-ok -= expm1 tonearest binary128 -0x1p-10000 : -0x1p-10000 : no-mathvec inexact-ok -= expm1 towardzero binary128 -0x1p-10000 : -0xf.fffffffffffffffffffffffffff8p-10004 : no-mathvec inexact-ok -= expm1 upward binary128 -0x1p-10000 : -0xf.fffffffffffffffffffffffffff8p-10004 : no-mathvec inexact-ok +expm1 -0x1p-10000 += expm1 downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 tonearest intel96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 towardzero intel96 -0x0p+0 : -0x0p+0 
: inexact-ok += expm1 upward intel96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 downward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 tonearest m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 towardzero m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 upward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 downward binary128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 tonearest binary128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 towardzero binary128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 upward binary128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 downward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 tonearest ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 towardzero ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 upward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += expm1 downward binary32 -0x8p-152 : -0x8p-152 : inexact-ok underflow errno-erange-ok += expm1 tonearest binary32 -0x8p-152 : -0x8p-152 : inexact-ok underflow errno-erange-ok += expm1 towardzero binary32 -0x8p-152 : -0x0p+0 : inexact-ok underflow errno-erange-ok += expm1 upward binary32 -0x8p-152 : -0x0p+0 : inexact-ok underflow errno-erange-ok += expm1 downward binary64 -0x8p-152 : -0x8p-152 : inexact-ok += expm1 tonearest binary64 -0x8p-152 : -0x8p-152 : inexact-ok += expm1 towardzero binary64 -0x8p-152 : -0x7.ffffffffffffcp-152 : inexact-ok += expm1 upward binary64 -0x8p-152 : -0x7.ffffffffffffcp-152 : inexact-ok += expm1 downward intel96 -0x8p-152 : -0x8p-152 : inexact-ok += expm1 tonearest intel96 -0x8p-152 : -0x8p-152 : inexact-ok += expm1 towardzero intel96 -0x8p-152 : -0x7.fffffffffffffff8p-152 : inexact-ok += expm1 upward intel96 -0x8p-152 : -0x7.fffffffffffffff8p-152 : inexact-ok += expm1 downward m68k96 -0x8p-152 : -0x8p-152 : inexact-ok += expm1 tonearest m68k96 -0x8p-152 : -0x8p-152 : inexact-ok += expm1 towardzero m68k96 -0x8p-152 : -0x7.fffffffffffffff8p-152 : inexact-ok += expm1 upward m68k96 -0x8p-152 : -0x7.fffffffffffffff8p-152 : inexact-ok += expm1 downward binary128 -0x8p-152 : -0x8p-152 : inexact-ok 
+= expm1 tonearest binary128 -0x8p-152 : -0x8p-152 : inexact-ok += expm1 towardzero binary128 -0x8p-152 : -0x7.fffffffffffffffffffffffffffcp-152 : inexact-ok += expm1 upward binary128 -0x8p-152 : -0x7.fffffffffffffffffffffffffffcp-152 : inexact-ok += expm1 downward ibm128 -0x8p-152 : -0x8p-152 : inexact-ok += expm1 tonearest ibm128 -0x8p-152 : -0x8p-152 : inexact-ok += expm1 towardzero ibm128 -0x8p-152 : -0x7.fffffffffffffffffffffffffep-152 : inexact-ok += expm1 upward ibm128 -0x8p-152 : -0x7.fffffffffffffffffffffffffep-152 : inexact-ok += expm1 downward binary64 -0x4p-1076 : -0x4p-1076 : inexact-ok underflow errno-erange-ok += expm1 tonearest binary64 -0x4p-1076 : -0x4p-1076 : inexact-ok underflow errno-erange-ok += expm1 towardzero binary64 -0x4p-1076 : -0x0p+0 : inexact-ok underflow errno-erange-ok += expm1 upward binary64 -0x4p-1076 : -0x0p+0 : inexact-ok underflow errno-erange-ok += expm1 downward intel96 -0x4p-1076 : -0x4p-1076 : inexact-ok += expm1 tonearest intel96 -0x4p-1076 : -0x4p-1076 : inexact-ok += expm1 towardzero intel96 -0x4p-1076 : -0x3.fffffffffffffffcp-1076 : inexact-ok += expm1 upward intel96 -0x4p-1076 : -0x3.fffffffffffffffcp-1076 : inexact-ok += expm1 downward m68k96 -0x4p-1076 : -0x4p-1076 : inexact-ok += expm1 tonearest m68k96 -0x4p-1076 : -0x4p-1076 : inexact-ok += expm1 towardzero m68k96 -0x4p-1076 : -0x3.fffffffffffffffcp-1076 : inexact-ok += expm1 upward m68k96 -0x4p-1076 : -0x3.fffffffffffffffcp-1076 : inexact-ok += expm1 downward binary128 -0x4p-1076 : -0x4p-1076 : inexact-ok += expm1 tonearest binary128 -0x4p-1076 : -0x4p-1076 : inexact-ok += expm1 towardzero binary128 -0x4p-1076 : -0x3.fffffffffffffffffffffffffffep-1076 : inexact-ok += expm1 upward binary128 -0x4p-1076 : -0x3.fffffffffffffffffffffffffffep-1076 : inexact-ok += expm1 downward ibm128 -0x4p-1076 : -0x4p-1076 : xfail:ibm128-libgcc inexact-ok underflow errno-erange-ok += expm1 tonearest ibm128 -0x4p-1076 : -0x4p-1076 : inexact-ok underflow errno-erange-ok += expm1 
towardzero ibm128 -0x4p-1076 : -0x0p+0 : xfail:ibm128-libgcc inexact-ok underflow errno-erange-ok += expm1 upward ibm128 -0x4p-1076 : -0x0p+0 : xfail:ibm128-libgcc inexact-ok underflow errno-erange-ok += expm1 downward intel96 -0x1p-10000 : -0x1p-10000 : inexact-ok += expm1 tonearest intel96 -0x1p-10000 : -0x1p-10000 : inexact-ok += expm1 towardzero intel96 -0x1p-10000 : -0xf.fffffffffffffffp-10004 : inexact-ok += expm1 upward intel96 -0x1p-10000 : -0xf.fffffffffffffffp-10004 : inexact-ok += expm1 downward m68k96 -0x1p-10000 : -0x1p-10000 : inexact-ok += expm1 tonearest m68k96 -0x1p-10000 : -0x1p-10000 : inexact-ok += expm1 towardzero m68k96 -0x1p-10000 : -0xf.fffffffffffffffp-10004 : inexact-ok += expm1 upward m68k96 -0x1p-10000 : -0xf.fffffffffffffffp-10004 : inexact-ok += expm1 downward binary128 -0x1p-10000 : -0x1p-10000 : inexact-ok += expm1 tonearest binary128 -0x1p-10000 : -0x1p-10000 : inexact-ok += expm1 towardzero binary128 -0x1p-10000 : -0xf.fffffffffffffffffffffffffff8p-10004 : inexact-ok += expm1 upward binary128 -0x1p-10000 : -0xf.fffffffffffffffffffffffffff8p-10004 : inexact-ok expm1 0xe.4152ac57cd1ea7ap-60 = expm1 downward binary32 0xe.4152bp-60 : 0xe.4152bp-60 : inexact-ok = expm1 tonearest binary32 0xe.4152bp-60 : 0xe.4152bp-60 : inexact-ok diff --git a/math/auto-libm-test-out-log1p b/math/auto-libm-test-out-log1p index f83241f..f7d3b35 100644 --- a/math/auto-libm-test-out-log1p +++ b/math/auto-libm-test-out-log1p @@ -23,31 +23,31 @@ log1p 0 = log1p tonearest ibm128 0x0p+0 : 0x0p+0 : inexact-ok = log1p towardzero ibm128 0x0p+0 : 0x0p+0 : inexact-ok = log1p upward ibm128 0x0p+0 : 0x0p+0 : inexact-ok -log1p -0 no-mathvec -= log1p downward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p tonearest binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p towardzero binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p upward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p downward binary64 -0x0p+0 : -0x0p+0 : no-mathvec 
inexact-ok -= log1p tonearest binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p towardzero binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p upward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p downward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p tonearest intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p towardzero intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p upward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p downward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p tonearest m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p towardzero m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p upward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p downward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p tonearest binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p towardzero binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p upward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p downward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p tonearest ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p towardzero ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= log1p upward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok +log1p -0 += log1p downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok += log1p tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok += log1p towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok += log1p upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok += log1p downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok += log1p tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok += log1p towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok += log1p upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok += log1p downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok += log1p tonearest intel96 -0x0p+0 : -0x0p+0 : inexact-ok += log1p towardzero intel96 -0x0p+0 : -0x0p+0 : inexact-ok += log1p upward intel96 -0x0p+0 : 
-0x0p+0 : inexact-ok += log1p downward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += log1p tonearest m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += log1p towardzero m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += log1p upward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += log1p downward binary128 -0x0p+0 : -0x0p+0 : inexact-ok += log1p tonearest binary128 -0x0p+0 : -0x0p+0 : inexact-ok += log1p towardzero binary128 -0x0p+0 : -0x0p+0 : inexact-ok += log1p upward binary128 -0x0p+0 : -0x0p+0 : inexact-ok += log1p downward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += log1p tonearest ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += log1p towardzero ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += log1p upward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok log1p e-1 = log1p downward binary32 0x1.b7e152p+0 : 0x1p+0 : inexact-ok = log1p tonearest binary32 0x1.b7e152p+0 : 0x1p+0 : inexact-ok diff --git a/math/auto-libm-test-out-sin b/math/auto-libm-test-out-sin index e1f6845..f1d21b1 100644 --- a/math/auto-libm-test-out-sin +++ b/math/auto-libm-test-out-sin @@ -23,31 +23,31 @@ sin 0 = sin tonearest ibm128 0x0p+0 : 0x0p+0 : inexact-ok = sin towardzero ibm128 0x0p+0 : 0x0p+0 : inexact-ok = sin upward ibm128 0x0p+0 : 0x0p+0 : inexact-ok -sin -0 no-mathvec -= sin downward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin tonearest binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin towardzero binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin upward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin downward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin tonearest binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin towardzero binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin upward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin downward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin tonearest intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin towardzero intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin upward intel96 -0x0p+0 : -0x0p+0 : 
no-mathvec inexact-ok -= sin downward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin tonearest m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin towardzero m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin upward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin downward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin tonearest binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin towardzero binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin upward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin downward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin tonearest ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin towardzero ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= sin upward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok +sin -0 += sin downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok += sin tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok += sin towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok += sin upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok += sin downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok += sin tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok += sin towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok += sin upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok += sin downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok += sin tonearest intel96 -0x0p+0 : -0x0p+0 : inexact-ok += sin towardzero intel96 -0x0p+0 : -0x0p+0 : inexact-ok += sin upward intel96 -0x0p+0 : -0x0p+0 : inexact-ok += sin downward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += sin tonearest m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += sin towardzero m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += sin upward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += sin downward binary128 -0x0p+0 : -0x0p+0 : inexact-ok += sin tonearest binary128 -0x0p+0 : -0x0p+0 : inexact-ok += sin towardzero binary128 -0x0p+0 : -0x0p+0 : inexact-ok += sin upward binary128 -0x0p+0 : -0x0p+0 : inexact-ok += sin downward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += 
sin tonearest ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += sin towardzero ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += sin upward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok sin pi/6 = sin downward binary32 0x8.60a92p-4 : 0x8p-4 : inexact-ok = sin tonearest binary32 0x8.60a92p-4 : 0x8p-4 : inexact-ok diff --git a/math/auto-libm-test-out-tan b/math/auto-libm-test-out-tan index f46fdc7..7d00d03 100644 --- a/math/auto-libm-test-out-tan +++ b/math/auto-libm-test-out-tan @@ -23,31 +23,31 @@ tan 0 = tan tonearest ibm128 0x0p+0 : 0x0p+0 : inexact-ok = tan towardzero ibm128 0x0p+0 : 0x0p+0 : inexact-ok = tan upward ibm128 0x0p+0 : 0x0p+0 : inexact-ok -tan -0 no-mathvec -= tan downward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan tonearest binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan towardzero binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan upward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan downward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan tonearest binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan towardzero binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan upward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan downward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan tonearest intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan towardzero intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan upward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan downward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan tonearest m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan towardzero m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan upward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan downward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan tonearest binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan towardzero binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan upward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan 
downward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan tonearest ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan towardzero ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok -= tan upward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok +tan -0 += tan downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok += tan tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok += tan towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok += tan upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok += tan downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok += tan tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok += tan towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok += tan upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok += tan downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok += tan tonearest intel96 -0x0p+0 : -0x0p+0 : inexact-ok += tan towardzero intel96 -0x0p+0 : -0x0p+0 : inexact-ok += tan upward intel96 -0x0p+0 : -0x0p+0 : inexact-ok += tan downward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += tan tonearest m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += tan towardzero m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += tan upward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok += tan downward binary128 -0x0p+0 : -0x0p+0 : inexact-ok += tan tonearest binary128 -0x0p+0 : -0x0p+0 : inexact-ok += tan towardzero binary128 -0x0p+0 : -0x0p+0 : inexact-ok += tan upward binary128 -0x0p+0 : -0x0p+0 : inexact-ok += tan downward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += tan tonearest ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += tan towardzero ibm128 -0x0p+0 : -0x0p+0 : inexact-ok += tan upward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok tan pi/4 = tan downward binary32 0xc.90fdbp-4 : 0x1p+0 : inexact-ok = tan tonearest binary32 0xc.90fdbp-4 : 0x1p+0 : inexact-ok diff --git a/math/gen-auto-libm-tests.c b/math/gen-auto-libm-tests.c index c35242b..4822724 100644 --- a/math/gen-auto-libm-tests.c +++ b/math/gen-auto-libm-tests.c @@ -96,8 +96,7 @@ zero and infinite results should be ignored; "xfail" indicates the test is disabled as 
expected to produce incorrect results, "xfail-rounding" indicates the test is disabled only in rounding - modes other than round-to-nearest; "no-mathvec" indicates the test - is disabled in vector math libraries. Otherwise, test flags are of + modes other than round-to-nearest. Otherwise, test flags are of the form "spurious-<exception>" and "missing-<exception>", for any exception ("overflow", "underflow", "inexact", "invalid", "divbyzero"), "spurious-errno" and "missing-errno", to indicate @@ -353,7 +352,6 @@ typedef enum flag_missing_overflow, flag_missing_underflow, flag_missing_errno, - flag_no_mathvec, num_input_flag_types, flag_first_flag = 0, flag_spurious_first = flag_spurious_divbyzero, @@ -379,7 +377,6 @@ static const char *const input_flags[num_input_flag_types] = "missing-overflow", "missing-underflow", "missing-errno", - "no-mathvec", }; /* An input flag, possibly conditional. */ @@ -2052,7 +2049,6 @@ output_for_one_input_case (FILE *fp, const char *filename, test_function *tf, { case flag_ignore_zero_inf_sign: case flag_xfail: - case flag_no_mathvec: if (fprintf (fp, " %s%s", input_flags[it->flags[i].type], (it->flags[i].cond diff --git a/math/gen-libm-test.py b/math/gen-libm-test.py index 397dbd3..6e8bb56 100755 --- a/math/gen-libm-test.py +++ b/math/gen-libm-test.py @@ -93,8 +93,7 @@ BEAUTIFY_MAP = {'minus_zero': '-0', # Flags in auto-libm-test-out that map directly to C flags. FLAGS_SIMPLE = {'ignore-zero-inf-sign': 'IGNORE_ZERO_INF_SIGN', - 'xfail': 'XFAIL_TEST', - 'no-mathvec': 'NO_TEST_MATHVEC'} + 'xfail': 'XFAIL_TEST'} # Exceptions in auto-libm-test-out, and their corresponding C flags # for being required, OK or required to be absent. 
diff --git a/math/libm-test-support.c b/math/libm-test-support.c index 1d60ac7..315229c 100644 --- a/math/libm-test-support.c +++ b/math/libm-test-support.c @@ -684,7 +684,7 @@ check_float_internal (const char *test_name, FLOAT computed, FLOAT expected, ulps = ULPDIFF (computed, expected); set_max_error (ulps, curr_max_error); print_diff = 1; - if ((exceptions & IGNORE_ZERO_INF_SIGN) == 0 + if (((exceptions & IGNORE_ZERO_INF_SIGN) == 0) && !flag_test_mathvec && computed == 0.0 && expected == 0.0 && signbit(computed) != signbit (expected)) ok = 0; diff --git a/nptl/pthread_cond_broadcast.c b/nptl/pthread_cond_broadcast.c index aada916..51afa62 100644 --- a/nptl/pthread_cond_broadcast.c +++ b/nptl/pthread_cond_broadcast.c @@ -57,10 +57,10 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) { /* Add as many signals as the remaining size of the group. */ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, - cond->__data.__g_size[g1] << 1); + cond->__data.__g_size[g1]); cond->__data.__g_size[g1] = 0; - /* We need to wake G1 waiters before we quiesce G1 below. */ + /* We need to wake G1 waiters before we switch G1 below. */ /* TODO Only set it if there are indeed futex waiters. We could also try to move this out of the critical section in cases when G2 is empty (and we don't need to quiesce). */ @@ -69,11 +69,11 @@ ___pthread_cond_broadcast (pthread_cond_t *cond) /* G1 is complete. Step (2) is next unless there are no waiters in G2, in which case we can stop. */ - if (__condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private)) + if (__condvar_switch_g1 (cond, wseq, &g1, private)) { /* Step (3): Send signals to all waiters in the old G2 / new G1. */ atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, - cond->__data.__g_size[g1] << 1); + cond->__data.__g_size[g1]); cond->__data.__g_size[g1] = 0; /* TODO Only set it if there are indeed futex waiters. 
*/ do_futex_wake = true; diff --git a/nptl/pthread_cond_common.c b/nptl/pthread_cond_common.c index 3487557..3894029 100644 --- a/nptl/pthread_cond_common.c +++ b/nptl/pthread_cond_common.c @@ -189,19 +189,17 @@ __condvar_get_private (int flags) return FUTEX_SHARED; } -/* This closes G1 (whose index is in G1INDEX), waits for all futex waiters to - leave G1, converts G1 into a fresh G2, and then switches group roles so that - the former G2 becomes the new G1 ending at the current __wseq value when we - eventually make the switch (WSEQ is just an observation of __wseq by the - signaler). +/* This closes G1 (whose index is in G1INDEX), converts G1 into a fresh G2, + and then switches group roles so that the former G2 becomes the new G1 + ending at the current __wseq value when we eventually make the switch + (WSEQ is just an observation of __wseq by the signaler). If G2 is empty, it will not switch groups because then it would create an empty G1 which would require switching groups again on the next signal. Returns false iff groups were not switched because G2 was empty. */ static bool __attribute__ ((unused)) -__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, +__condvar_switch_g1 (pthread_cond_t *cond, uint64_t wseq, unsigned int *g1index, int private) { - const unsigned int maxspin = 0; unsigned int g1 = *g1index; /* If there is no waiter in G2, we don't do anything. The expression may @@ -210,96 +208,23 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, behavior. Note that this works correctly for a zero-initialized condvar too. 
*/ unsigned int old_orig_size = __condvar_get_orig_size (cond); - uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond) >> 1; - if (((unsigned) (wseq - old_g1_start - old_orig_size) - + cond->__data.__g_size[g1 ^ 1]) == 0) + uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond); + uint64_t new_g1_start = old_g1_start + old_orig_size; + if (((unsigned) (wseq - new_g1_start) + cond->__data.__g_size[g1 ^ 1]) == 0) return false; - /* Now try to close and quiesce G1. We have to consider the following kinds - of waiters: + /* We have to consider the following kinds of waiters: * Waiters from less recent groups than G1 are not affected because nothing will change for them apart from __g1_start getting larger. * New waiters arriving concurrently with the group switching will all go into G2 until we atomically make the switch. Waiters existing in G2 are not affected. - * Waiters in G1 will be closed out immediately by setting a flag in - __g_signals, which will prevent waiters from blocking using a futex on - __g_signals and also notifies them that the group is closed. As a - result, they will eventually remove their group reference, allowing us - to close switch group roles. */ + * Waiters in G1 have already received a signal and been woken. */ - /* First, set the closed flag on __g_signals. This tells waiters that are - about to wait that they shouldn't do that anymore. This basically - serves as an advance notification of the upcoming change to __g1_start; - waiters interpret it as if __g1_start was larger than their waiter - sequence position. This allows us to change __g1_start after waiting - for all existing waiters with group references to leave, which in turn - makes recovery after stealing a signal simpler because it then can be - skipped if __g1_start indicates that the group is closed (otherwise, - we would have to recover always because waiters don't know how big their - groups are). Relaxed MO is fine. 
*/ - atomic_fetch_or_relaxed (cond->__data.__g_signals + g1, 1); - - /* Wait until there are no group references anymore. The fetch-or operation - injects us into the modification order of __g_refs; release MO ensures - that waiters incrementing __g_refs after our fetch-or see the previous - changes to __g_signals and to __g1_start that had to happen before we can - switch this G1 and alias with an older group (we have two groups, so - aliasing requires switching group roles twice). Note that nobody else - can have set the wake-request flag, so we do not have to act upon it. - - Also note that it is harmless if older waiters or waiters from this G1 - get a group reference after we have quiesced the group because it will - remain closed for them either because of the closed flag in __g_signals - or the later update to __g1_start. New waiters will never arrive here - but instead continue to go into the still current G2. */ - unsigned r = atomic_fetch_or_release (cond->__data.__g_refs + g1, 0); - while ((r >> 1) > 0) - { - for (unsigned int spin = maxspin; ((r >> 1) > 0) && (spin > 0); spin--) - { - /* TODO Back off. */ - r = atomic_load_relaxed (cond->__data.__g_refs + g1); - } - if ((r >> 1) > 0) - { - /* There is still a waiter after spinning. Set the wake-request - flag and block. Relaxed MO is fine because this is just about - this futex word. - - Update r to include the set wake-request flag so that the upcoming - futex_wait only blocks if the flag is still set (otherwise, we'd - violate the basic client-side futex protocol). */ - r = atomic_fetch_or_relaxed (cond->__data.__g_refs + g1, 1) | 1; - - if ((r >> 1) > 0) - futex_wait_simple (cond->__data.__g_refs + g1, r, private); - /* Reload here so we eventually see the most recent value even if we - do not spin. 
*/ - r = atomic_load_relaxed (cond->__data.__g_refs + g1); - } - } - /* Acquire MO so that we synchronize with the release operation that waiters - use to decrement __g_refs and thus happen after the waiters we waited - for. */ - atomic_thread_fence_acquire (); - - /* Update __g1_start, which finishes closing this group. The value we add - will never be negative because old_orig_size can only be zero when we - switch groups the first time after a condvar was initialized, in which - case G1 will be at index 1 and we will add a value of 1. See above for - why this takes place after waiting for quiescence of the group. - Relaxed MO is fine because the change comes with no additional - constraints that others would have to observe. */ - __condvar_add_g1_start_relaxed (cond, - (old_orig_size << 1) + (g1 == 1 ? 1 : - 1)); - - /* Now reopen the group, thus enabling waiters to again block using the - futex controlled by __g_signals. Release MO so that observers that see - no signals (and thus can block) also see the write __g1_start and thus - that this is now a new group (see __pthread_cond_wait_common for the - matching acquire MO loads). */ - atomic_store_release (cond->__data.__g_signals + g1, 0); + /* Update __g1_start, which closes this group. Relaxed MO is fine because + the change comes with no additional constraints that others would have + to observe. */ + __condvar_add_g1_start_relaxed (cond, old_orig_size); /* At this point, the old G1 is now a valid new G2 (but not in use yet). No old waiter can neither grab a signal nor acquire a reference without @@ -311,9 +236,13 @@ __condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq, g1 ^= 1; *g1index ^= 1; + /* Now advance the new G1 g_signals to the new g1_start, giving it + an effective signal count of 0 to start. */ + atomic_store_release (cond->__data.__g_signals + g1, (unsigned)new_g1_start); + /* These values are just observed by signalers, and thus protected by the lock. 
*/ - unsigned int orig_size = wseq - (old_g1_start + old_orig_size); + unsigned int orig_size = wseq - new_g1_start; __condvar_set_orig_size (cond, orig_size); /* Use and addition to not loose track of cancellations in what was previously G2. */ diff --git a/nptl/pthread_cond_signal.c b/nptl/pthread_cond_signal.c index 43d6286..fa3a5c3 100644 --- a/nptl/pthread_cond_signal.c +++ b/nptl/pthread_cond_signal.c @@ -69,19 +69,18 @@ ___pthread_cond_signal (pthread_cond_t *cond) bool do_futex_wake = false; /* If G1 is still receiving signals, we put the signal there. If not, we - check if G2 has waiters, and if so, quiesce and switch G1 to the former - G2; if this results in a new G1 with waiters (G2 might have cancellations - already, see __condvar_quiesce_and_switch_g1), we put the signal in the - new G1. */ + check if G2 has waiters, and if so, switch G1 to the former G2; if this + results in a new G1 with waiters (G2 might have cancellations already, + see __condvar_switch_g1), we put the signal in the new G1. */ if ((cond->__data.__g_size[g1] != 0) - || __condvar_quiesce_and_switch_g1 (cond, wseq, &g1, private)) + || __condvar_switch_g1 (cond, wseq, &g1, private)) { /* Add a signal. Relaxed MO is fine because signaling does not need to - establish a happens-before relation (see above). We do not mask the - release-MO store when initializing a group in - __condvar_quiesce_and_switch_g1 because we use an atomic - read-modify-write and thus extend that store's release sequence. */ - atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 2); + establish a happens-before relation (see above). We do not mask the + release-MO store when initializing a group in __condvar_switch_g1 + because we use an atomic read-modify-write and thus extend that + store's release sequence. */ + atomic_fetch_add_relaxed (cond->__data.__g_signals + g1, 1); cond->__data.__g_size[g1]--; /* TODO Only set it if there are indeed futex waiters. 
*/ do_futex_wake = true; diff --git a/nptl/pthread_cond_wait.c b/nptl/pthread_cond_wait.c index 66786c7..0f1dfcb 100644 --- a/nptl/pthread_cond_wait.c +++ b/nptl/pthread_cond_wait.c @@ -84,7 +84,7 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g, not hold a reference on the group. */ __condvar_acquire_lock (cond, private); - uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1; + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); if (g1_start > seq) { /* Our group is closed, so someone provided enough signals for it. @@ -143,23 +143,6 @@ __condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g, } } -/* Wake up any signalers that might be waiting. */ -static void -__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private) -{ - /* Release MO to synchronize-with the acquire load in - __condvar_quiesce_and_switch_g1. */ - if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3) - { - /* Clear the wake-up request flag before waking up. We do not need more - than relaxed MO and it doesn't matter if we apply this for an aliased - group because we wake all futex waiters right after clearing the - flag. */ - atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1); - futex_wake (cond->__data.__g_refs + g, INT_MAX, private); - } -} - /* Clean-up for cancellation of waiters waiting for normal signals. We cancel our registration as a waiter, confirm we have woken up, and re-acquire the mutex. */ @@ -171,8 +154,6 @@ __condvar_cleanup_waiting (void *arg) pthread_cond_t *cond = cbuffer->cond; unsigned g = cbuffer->wseq & 1; - __condvar_dec_grefs (cond, g, cbuffer->private); - __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private); /* FIXME With the current cancellation implementation, it is possible that a thread is cancelled after it has returned from a syscall. 
This could @@ -238,9 +219,7 @@ __condvar_cleanup_waiting (void *arg) signaled), and a reference count. The group reference count is used to maintain the number of waiters that - are using the group's futex. Before a group can change its role, the - reference count must show that no waiters are using the futex anymore; this - prevents ABA issues on the futex word. + are using the group's futex. To represent which intervals in the waiter sequence the groups cover (and thus also which group slot contains G1 or G2), we use a 64b counter to @@ -251,7 +230,7 @@ __condvar_cleanup_waiting (void *arg) figure out whether they are in a group that has already been completely signaled (i.e., if the current G1 starts at a later position that the waiter's position). Waiters cannot determine whether they are currently - in G2 or G1 -- but they do not have too because all they are interested in + in G2 or G1 -- but they do not have to because all they are interested in is whether there are available signals, and they always start in G2 (whose group slot they know because of the bit in the waiter sequence. Signalers will simply fill the right group until it is completely signaled and can @@ -280,7 +259,6 @@ __condvar_cleanup_waiting (void *arg) * Waiters fetch-add while having acquire the mutex associated with the condvar. Signalers load it and fetch-xor it concurrently. __g1_start: Starting position of G1 (inclusive) - * LSB is index of current G2. * Modified by signalers while having acquired the condvar-internal lock and observed concurrently by waiters. __g1_orig_size: Initial size of G1 @@ -300,11 +278,10 @@ __condvar_cleanup_waiting (void *arg) last reference. * Reference count used by waiters concurrently with signalers that have acquired the condvar-internal lock. - __g_signals: The number of signals that can still be consumed. + __g_signals: The number of signals that can still be consumed, relative to + the current g1_start. (i.e. 
g1_start with the signal count added) * Used as a futex word by waiters. Used concurrently by waiters and signalers. - * LSB is true iff this group has been completely signaled (i.e., it is - closed). __g_size: Waiters remaining in this group (i.e., which have not been signaled yet. * Accessed by signalers and waiters that cancel waiting (both do so only @@ -328,27 +305,6 @@ __condvar_cleanup_waiting (void *arg) sufficient because if a waiter can see a sufficiently large value, it could have also consume a signal in the waiters group. - Waiters try to grab a signal from __g_signals without holding a reference - count, which can lead to stealing a signal from a more recent group after - their own group was already closed. They cannot always detect whether they - in fact did because they do not know when they stole, but they can - conservatively add a signal back to the group they stole from; if they - did so unnecessarily, all that happens is a spurious wake-up. To make this - even less likely, __g1_start contains the index of the current g2 too, - which allows waiters to check if there aliasing on the group slots; if - there wasn't, they didn't steal from the current G1, which means that the - G1 they stole from must have been already closed and they do not need to - fix anything. - - It is essential that the last field in pthread_cond_t is __g_signals[1]: - The previous condvar used a pointer-sized field in pthread_cond_t, so a - PTHREAD_COND_INITIALIZER from that condvar implementation might only - initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes - in total instead of the 48 we need). __g_signals[1] is not accessed before - the first group switch (G2 starts at index 0), which will set its value to - zero after a harmless fetch-or whose return value is ignored. This - effectively completes initialization. 
- Limitations: * This condvar isn't designed to allow for more than @@ -379,7 +335,6 @@ static __always_inline int __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, clockid_t clockid, const struct __timespec64 *abstime) { - const int maxspin = 0; int err; int result = 0; @@ -396,8 +351,7 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, because we do not need to establish any happens-before relation with signalers (see __pthread_cond_signal); modification order alone establishes a total order of waiters/signals. We do need acquire MO - to synchronize with group reinitialization in - __condvar_quiesce_and_switch_g1. */ + to synchronize with group reinitialization in __condvar_switch_g1. */ uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2); /* Find our group's index. We always go into what was G2 when we acquired our position. */ @@ -424,178 +378,64 @@ __pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex, return err; } - /* Now wait until a signal is available in our group or it is closed. - Acquire MO so that if we observe a value of zero written after group - switching in __condvar_quiesce_and_switch_g1, we synchronize with that - store and will see the prior update of __g1_start done while switching - groups too. */ - unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); - do + while (1) { - while (1) - { - /* Spin-wait first. - Note that spinning first without checking whether a timeout - passed might lead to what looks like a spurious wake-up even - though we should return ETIMEDOUT (e.g., if the caller provides - an absolute timeout that is clearly in the past). 
However, - (1) spurious wake-ups are allowed, (2) it seems unlikely that a - user will (ab)use pthread_cond_wait as a check for whether a - point in time is in the past, and (3) spinning first without - having to compare against the current time seems to be the right - choice from a performance perspective for most use cases. */ - unsigned int spin = maxspin; - while (signals == 0 && spin > 0) - { - /* Check that we are not spinning on a group that's already - closed. */ - if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)) - goto done; - - /* TODO Back off. */ - - /* Reload signals. See above for MO. */ - signals = atomic_load_acquire (cond->__data.__g_signals + g); - spin--; - } - - /* If our group will be closed as indicated by the flag on signals, - don't bother grabbing a signal. */ - if (signals & 1) - goto done; - - /* If there is an available signal, don't block. */ - if (signals != 0) - break; - - /* No signals available after spinning, so prepare to block. - We first acquire a group reference and use acquire MO for that so - that we synchronize with the dummy read-modify-write in - __condvar_quiesce_and_switch_g1 if we read from that. In turn, - in this case this will make us see the closed flag on __g_signals - that designates a concurrent attempt to reuse the group's slot. - We use acquire MO for the __g_signals check to make the - __g1_start check work (see spinning above). - Note that the group reference acquisition will not mask the - release MO when decrementing the reference count because we use - an atomic read-modify-write operation and thus extend the release - sequence. */ - atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2); - if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0) - || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))) - { - /* Our group is closed. Wake up any signalers that might be - waiting. */ - __condvar_dec_grefs (cond, g, private); - goto done; - } - - // Now block. 
- struct _pthread_cleanup_buffer buffer; - struct _condvar_cleanup_buffer cbuffer; - cbuffer.wseq = wseq; - cbuffer.cond = cond; - cbuffer.mutex = mutex; - cbuffer.private = private; - __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); - - err = __futex_abstimed_wait_cancelable64 ( - cond->__data.__g_signals + g, 0, clockid, abstime, private); - - __pthread_cleanup_pop (&buffer, 0); - - if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) - { - __condvar_dec_grefs (cond, g, private); - /* If we timed out, we effectively cancel waiting. Note that - we have decremented __g_refs before cancellation, so that a - deadlock between waiting for quiescence of our group in - __condvar_quiesce_and_switch_g1 and us trying to acquire - the lock during cancellation is not possible. */ - __condvar_cancel_waiting (cond, seq, g, private); - result = err; - goto done; - } - else - __condvar_dec_grefs (cond, g, private); - - /* Reload signals. See above for MO. */ - signals = atomic_load_acquire (cond->__data.__g_signals + g); + /* Now wait until a signal is available in our group or it is closed. + Acquire MO so that if we observe (signals == lowseq) after group + switching in __condvar_switch_g1, we synchronize with that store and + will see the prior update of __g1_start done while switching groups + too. */ + unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g); + uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); + + if (seq < g1_start) + { + /* If the group is closed already, + then this waiter originally had enough extra signals to + consume, up until the time its group was closed. */ + break; + } + + /* If there is an available signal, don't block. + If __g1_start has advanced at all, then we must be in G1 + by now, perhaps in the process of switching back to an older + G2, but in either case we're allowed to consume the available + signal and should not block anymore. 
*/ + if ((int)(signals - (unsigned int)g1_start) > 0) + { + /* Try to grab a signal. See above for MO. (if we do another loop + iteration we need to see the correct value of g1_start) */ + if (atomic_compare_exchange_weak_acquire ( + cond->__data.__g_signals + g, + &signals, signals - 1)) + break; + else + continue; } + // Now block. + struct _pthread_cleanup_buffer buffer; + struct _condvar_cleanup_buffer cbuffer; + cbuffer.wseq = wseq; + cbuffer.cond = cond; + cbuffer.mutex = mutex; + cbuffer.private = private; + __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer); + + err = __futex_abstimed_wait_cancelable64 ( + cond->__data.__g_signals + g, signals, clockid, abstime, private); + + __pthread_cleanup_pop (&buffer, 0); + + if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW)) + { + /* If we timed out, we effectively cancel waiting. */ + __condvar_cancel_waiting (cond, seq, g, private); + result = err; + break; + } } - /* Try to grab a signal. Use acquire MO so that we see an up-to-date value - of __g1_start below (see spinning above for a similar case). In - particular, if we steal from a more recent group, we will also see a - more recent __g1_start below. */ - while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g, - &signals, signals - 2)); - - /* We consumed a signal but we could have consumed from a more recent group - that aliased with ours due to being in the same group slot. If this - might be the case our group must be closed as visible through - __g1_start. */ - uint64_t g1_start = __condvar_load_g1_start_relaxed (cond); - if (seq < (g1_start >> 1)) - { - /* We potentially stole a signal from a more recent group but we do not - know which group we really consumed from. - We do not care about groups older than current G1 because they are - closed; we could have stolen from these, but then we just add a - spurious wake-up for the current groups. 
- We will never steal a signal from current G2 that was really intended - for G2 because G2 never receives signals (until it becomes G1). We - could have stolen a signal from G2 that was conservatively added by a - previous waiter that also thought it stole a signal -- but given that - that signal was added unnecessarily, it's not a problem if we steal - it. - Thus, the remaining case is that we could have stolen from the current - G1, where "current" means the __g1_start value we observed. However, - if the current G1 does not have the same slot index as we do, we did - not steal from it and do not need to undo that. This is the reason - for putting a bit with G2's index into__g1_start as well. */ - if (((g1_start & 1) ^ 1) == g) - { - /* We have to conservatively undo our potential mistake of stealing - a signal. We can stop trying to do that when the current G1 - changes because other spinning waiters will notice this too and - __condvar_quiesce_and_switch_g1 has checked that there are no - futex waiters anymore before switching G1. - Relaxed MO is fine for the __g1_start load because we need to - merely be able to observe this fact and not have to observe - something else as well. - ??? Would it help to spin for a little while to see whether the - current G1 gets closed? This might be worthwhile if the group is - small or close to being closed. */ - unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g); - while (__condvar_load_g1_start_relaxed (cond) == g1_start) - { - /* Try to add a signal. We don't need to acquire the lock - because at worst we can cause a spurious wake-up. If the - group is in the process of being closed (LSB is true), this - has an effect similar to us adding a signal. */ - if (((s & 1) != 0) - || atomic_compare_exchange_weak_relaxed - (cond->__data.__g_signals + g, &s, s + 2)) - { - /* If we added a signal, we also need to add a wake-up on - the futex. 
We also need to do that if we skipped adding - a signal because the group is being closed because - while __condvar_quiesce_and_switch_g1 could have closed - the group, it might still be waiting for futex waiters to - leave (and one of those waiters might be the one we stole - the signal from, which cause it to block using the - futex). */ - futex_wake (cond->__data.__g_signals + g, 1, private); - break; - } - /* TODO Back off. */ - } - } - } - - done: /* Confirm that we have been woken. We do that before acquiring the mutex to allow for execution of pthread_cond_destroy while having acquired the diff --git a/nptl/tst-cond22.c b/nptl/tst-cond22.c index 1336e9c..bdcb45c 100644 --- a/nptl/tst-cond22.c +++ b/nptl/tst-cond22.c @@ -106,13 +106,13 @@ do_test (void) status = 1; } - printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n", + printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n", c.__data.__wseq.__value32.__high, c.__data.__wseq.__value32.__low, c.__data.__g1_start.__value32.__high, c.__data.__g1_start.__value32.__low, - c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0], - c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1], + c.__data.__g_signals[0], c.__data.__g_size[0], + c.__data.__g_signals[1], c.__data.__g_size[1], c.__data.__g1_orig_size, c.__data.__wrefs); if (pthread_create (&th, NULL, tf, (void *) 1l) != 0) @@ -152,13 +152,13 @@ do_test (void) status = 1; } - printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u/%u, %u/%u/%u, %u, %u }\n", + printf ("cond = { 0x%x:%x, 0x%x:%x, %u/%u, %u/%u, %u, %u }\n", c.__data.__wseq.__value32.__high, c.__data.__wseq.__value32.__low, c.__data.__g1_start.__value32.__high, c.__data.__g1_start.__value32.__low, - c.__data.__g_signals[0], c.__data.__g_refs[0], c.__data.__g_size[0], - c.__data.__g_signals[1], c.__data.__g_refs[1], c.__data.__g_size[1], + c.__data.__g_signals[0], c.__data.__g_size[0], + c.__data.__g_signals[1], c.__data.__g_size[1], c.__data.__g1_orig_size, 
c.__data.__wrefs); return status; diff --git a/posix/Makefile b/posix/Makefile index a1e8485..18ddb8c 100644 --- a/posix/Makefile +++ b/posix/Makefile @@ -303,6 +303,7 @@ tests := \ tst-posix_spawn-setsid \ tst-preadwrite \ tst-preadwrite64 \ + tst-regcomp-bracket-free \ tst-regcomp-truncated \ tst-regex \ tst-regex2 \ diff --git a/posix/fork.c b/posix/fork.c index 298765a..cf9b80e 100644 --- a/posix/fork.c +++ b/posix/fork.c @@ -62,6 +62,7 @@ __libc_fork (void) call_function_static_weak (__nss_database_fork_prepare_parent, &nss_database_data); + _IO_proc_file_chain_lock (); _IO_list_lock (); /* Acquire malloc locks. This needs to come last because fork @@ -92,6 +93,7 @@ __libc_fork (void) /* Reset locks in the I/O code. */ _IO_list_resetlock (); + _IO_proc_file_chain_resetlock (); call_function_static_weak (__nss_database_fork_subprocess, &nss_database_data); @@ -121,6 +123,7 @@ __libc_fork (void) /* We execute this even if the 'fork' call failed. */ _IO_list_unlock (); + _IO_proc_file_chain_unlock (); } /* Run the handlers registered for the parent. */ diff --git a/posix/regcomp.c b/posix/regcomp.c index 5380d3c..6595bb3 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -3384,6 +3384,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, { #ifdef RE_ENABLE_I18N free_charset (mbcset); + mbcset = NULL; #endif /* Build a tree for simple bracket. 
*/ br_token.type = SIMPLE_BRACKET; @@ -3399,7 +3400,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token, parse_bracket_exp_free_return: re_free (sbcset); #ifdef RE_ENABLE_I18N - free_charset (mbcset); + if (__glibc_likely (mbcset != NULL)) + free_charset (mbcset); #endif /* RE_ENABLE_I18N */ return NULL; } diff --git a/posix/tst-regcomp-bracket-free.c b/posix/tst-regcomp-bracket-free.c new file mode 100644 index 0000000..3c091d8 --- /dev/null +++ b/posix/tst-regcomp-bracket-free.c @@ -0,0 +1,176 @@ +/* Test regcomp bracket parsing with injected allocation failures (bug 33185). + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* This test invokes regcomp multiple times, failing one memory + allocation in each call. The function call should fail with + REG_ESPACE (or succeed if it can recover from the allocation + failure). Previously, there was double-free bug. */ + +#include <errno.h> +#include <regex.h> +#include <stdio.h> +#include <string.h> +#include <support/check.h> +#include <support/namespace.h> +#include <support/support.h> + +/* Data structure allocated via MAP_SHARED, so that writes from the + subprocess are visible. */ +struct shared_data +{ + /* Number of tracked allocations performed so far. 
*/ + volatile unsigned int allocation_count; + + /* If this number is reached, one allocation fails. */ + volatile unsigned int failing_allocation; + + /* The subprocess stores the expected name here. */ + char name[100]; +}; + +/* Allocation count in shared mapping. */ +static struct shared_data *shared; + +/* Returns true if a failure should be injected for this allocation. */ +static bool +fail_this_allocation (void) +{ + if (shared != NULL) + { + unsigned int count = shared->allocation_count; + shared->allocation_count = count + 1; + return count == shared->failing_allocation; + } + else + return false; +} + +/* Failure-injecting wrappers for allocation functions used by glibc. */ + +void * +malloc (size_t size) +{ + if (fail_this_allocation ()) + { + errno = ENOMEM; + return NULL; + } + extern __typeof (malloc) __libc_malloc; + return __libc_malloc (size); +} + +void * +calloc (size_t a, size_t b) +{ + if (fail_this_allocation ()) + { + errno = ENOMEM; + return NULL; + } + extern __typeof (calloc) __libc_calloc; + return __libc_calloc (a, b); +} + +void * +realloc (void *ptr, size_t size) +{ + if (fail_this_allocation ()) + { + errno = ENOMEM; + return NULL; + } + extern __typeof (realloc) __libc_realloc; + return __libc_realloc (ptr, size); +} + +/* No-op subprocess to verify that support_isolate_in_subprocess does + not perform any heap allocations. */ +static void +no_op (void *ignored) +{ +} + +/* Perform a regcomp call in a subprocess. Used to count its + allocations. */ +static void +initialize (void *regexp1) +{ + const char *regexp = regexp1; + + shared->allocation_count = 0; + + regex_t reg; + TEST_COMPARE (regcomp (&reg, regexp, 0), 0); +} + +/* Perform regcomp in a subprocess with fault injection. 
*/ +static void +test_in_subprocess (void *regexp1) +{ + const char *regexp = regexp1; + unsigned int inject_at = shared->failing_allocation; + + regex_t reg; + int ret = regcomp (&reg, regexp, 0); + + if (ret != 0) + { + TEST_COMPARE (ret, REG_ESPACE); + printf ("info: allocation %u failure results in return value %d," + " error %s (%d)\n", + inject_at, ret, strerrorname_np (errno), errno); + } +} + +static int +do_test (void) +{ + char regexp[] = "[:alpha:]"; + + shared = support_shared_allocate (sizeof (*shared)); + + /* Disable fault injection. */ + shared->failing_allocation = ~0U; + + support_isolate_in_subprocess (no_op, NULL); + TEST_COMPARE (shared->allocation_count, 0); + + support_isolate_in_subprocess (initialize, regexp); + + /* The number of allocations in the successful case, plus some + slack. Once the number of expected allocations is exceeded, + injecting further failures does not make a difference. */ + unsigned int maximum_allocation_count = shared->allocation_count; + printf ("info: successful call performs %u allocations\n", + maximum_allocation_count); + maximum_allocation_count += 10; + + for (unsigned int inject_at = 0; inject_at <= maximum_allocation_count; + ++inject_at) + { + shared->allocation_count = 0; + shared->failing_allocation = inject_at; + support_isolate_in_subprocess (test_in_subprocess, regexp); + } + + support_shared_free (shared); + + return 0; +} + +#include <support/test-driver.c> diff --git a/scripts/sort-makefile-lines.py b/scripts/sort-makefile-lines.py index f65ee40..b2249ae 100755 --- a/scripts/sort-makefile-lines.py +++ b/scripts/sort-makefile-lines.py @@ -129,7 +129,7 @@ def sort_makefile_lines(): for i in range(len(lines)): # Look for things like "var = \", "var := \" or "var += \" # to start the sorted list. - var = re.search(r'^([a-zA-Z0-9-]*) [\+:]?\= \\$', lines[i]) + var = re.search(r'^([-_a-zA-Z0-9]*) [\+:]?\= \\$', lines[i]) if var: # Remember the index and the name. 
startmarks.append((i, var.group(1))) @@ -140,7 +140,7 @@ def sort_makefile_lines(): rangemarks = [] for sm in startmarks: # Look for things like " # var" to end the sorted list. - reg = r'^ # ' + sm[1] + r'$' + reg = r'^ *# ' + sm[1] + r'$' for j in range(sm[0] + 1, len(lines)): if re.search(reg, lines[j]): # Remember the block to sort (inclusive). diff --git a/stdio-common/printf-parsemb.c b/stdio-common/printf-parsemb.c index ab9fafb..8db18f1 100644 --- a/stdio-common/printf-parsemb.c +++ b/stdio-common/printf-parsemb.c @@ -17,6 +17,7 @@ <https://www.gnu.org/licenses/>. */ #include <ctype.h> +#include <errno.h> #include <limits.h> #include <stdlib.h> #include <string.h> diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c index cc26ed6..cefee58 100644 --- a/stdlib/tst-secure-getenv.c +++ b/stdlib/tst-secure-getenv.c @@ -57,13 +57,7 @@ do_test (void) exit (1); } - int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); - - if (WEXITSTATUS (status) == EXIT_UNSUPPORTED) - return EXIT_UNSUPPORTED; - - if (!WIFEXITED (status)) - FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status); + support_capture_subprogram_self_sgid (MAGIC_ARGUMENT); return 0; } @@ -82,6 +76,7 @@ alternative_main (int argc, char **argv) if (secure_getenv ("PATH") != NULL) FAIL_EXIT (4, "PATH variable not filtered out\n"); + support_record_failure_barrier (); exit (EXIT_SUCCESS); } } diff --git a/string/strerror.c b/string/strerror.c index 107d9d3..efa4e90 100644 --- a/string/strerror.c +++ b/string/strerror.c @@ -21,5 +21,5 @@ char * strerror (int errnum) { - return __strerror_l (errnum, __libc_tsd_get (locale_t, LOCALE)); + return __strerror_l (errnum, __libc_tsd_LOCALE); } diff --git a/sunrpc/rpc_thread.c b/sunrpc/rpc_thread.c index a04b7ec..e20f0a6 100644 --- a/sunrpc/rpc_thread.c +++ b/sunrpc/rpc_thread.c @@ -3,7 +3,6 @@ #include <assert.h> #include <libc-lock.h> -#include <libc-tsd.h> #include <shlib-compat.h> #include <libc-symbols.h> diff --git 
a/support/capture_subprocess.h b/support/capture_subprocess.h index 1ecbdfe..8cbdca3 100644 --- a/support/capture_subprocess.h +++ b/support/capture_subprocess.h @@ -41,11 +41,12 @@ struct support_capture_subprocess support_capture_subprocess struct support_capture_subprocess support_capture_subprogram (const char *file, char *const argv[]); -/* Copy the running program into a setgid binary and run it with CHILD_ID - argument. If execution is successful, return the exit status of the child - program, otherwise return a non-zero failure exit code. */ -int support_capture_subprogram_self_sgid - (char *child_id); +/* Copy the running program into a setgid binary and run it with + CHILD_ID argument. If the program exits with a non-zero status, + exit with that exit status (or status 1 if the program did not exit + normally). If the test cannot be performed, exit with + EXIT_UNSUPPORTED. */ +void support_capture_subprogram_self_sgid (const char *child_id); /* Deallocate the subprocess data captured by support_capture_subprocess. */ diff --git a/support/check.h b/support/check.h index 7ea22c7..8f41e5b 100644 --- a/support/check.h +++ b/support/check.h @@ -207,6 +207,9 @@ void support_record_failure_reset (void); failures or not. */ int support_record_failure_is_failed (void); +/* Terminate the process if any failures have been encountered so far. 
*/ +void support_record_failure_barrier (void); + __END_DECLS #endif /* SUPPORT_CHECK_H */ diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c index ffced8a..8dc95f8 100644 --- a/support/support_capture_subprocess.c +++ b/support/support_capture_subprocess.c @@ -21,12 +21,17 @@ #include <errno.h> #include <fcntl.h> +#include <grp.h> +#include <scratch_buffer.h> +#include <stdio_ext.h> #include <stdlib.h> +#include <string.h> #include <support/check.h> #include <support/xunistd.h> #include <support/xsocket.h> #include <support/xspawn.h> #include <support/support.h> +#include <support/temp_file.h> #include <support/test-driver.h> static void @@ -108,111 +113,88 @@ support_capture_subprogram (const char *file, char *const argv[]) /* Copies the executable into a restricted directory, so that we can safely make it SGID with the TARGET group ID. Then runs the executable. */ -static int -copy_and_spawn_sgid (char *child_id, gid_t gid) +static void +copy_and_spawn_sgid (const char *child_id, gid_t gid) { - char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd", - test_dir, (intmax_t) getpid ()); + char *dirname = support_create_temp_directory ("tst-glibc-sgid-"); char *execname = xasprintf ("%s/bin", dirname); - int infd = -1; - int outfd = -1; - int ret = 1, status = 1; - - TEST_VERIFY (mkdir (dirname, 0700) == 0); - if (support_record_failure_is_failed ()) - goto err; + add_temp_file (execname); - infd = open ("/proc/self/exe", O_RDONLY); - if (infd < 0) + if (access ("/proc/self/exe", R_OK) != 0) FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n"); - outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700); - TEST_VERIFY (outfd >= 0); - if (support_record_failure_is_failed ()) - goto err; - - char buf[4096]; - for (;;) - { - ssize_t rdcount = read (infd, buf, sizeof (buf)); - TEST_VERIFY (rdcount >= 0); - if (support_record_failure_is_failed ()) - goto err; - if (rdcount == 0) - break; - char *p = buf; - char *end = 
buf + rdcount; - while (p != end) - { - ssize_t wrcount = write (outfd, buf, end - p); - if (wrcount == 0) - errno = ENOSPC; - TEST_VERIFY (wrcount > 0); - if (support_record_failure_is_failed ()) - goto err; - p += wrcount; - } - } + support_copy_file ("/proc/self/exe", execname); - bool chowned = false; - TEST_VERIFY ((chowned = fchown (outfd, getuid (), gid) == 0) - || errno == EPERM); - if (support_record_failure_is_failed ()) - goto err; - else if (!chowned) - { - ret = 77; - goto err; - } + if (chown (execname, getuid (), gid) != 0) + FAIL_UNSUPPORTED ("cannot change group of \"%s\" to %jd: %m", + execname, (intmax_t) gid); - TEST_VERIFY (fchmod (outfd, 02750) == 0); - if (support_record_failure_is_failed ()) - goto err; - TEST_VERIFY (close (outfd) == 0); - if (support_record_failure_is_failed ()) - goto err; - TEST_VERIFY (close (infd) == 0); - if (support_record_failure_is_failed ()) - goto err; + if (chmod (execname, 02750) != 0) + FAIL_UNSUPPORTED ("cannot make \"%s\" SGID: %m ", execname); /* We have the binary, now spawn the subprocess. Avoid using support_subprogram because we only want the program exit status, not the contents. */ - ret = 0; - infd = outfd = -1; - char * const args[] = {execname, child_id, NULL}; + char * const args[] = {execname, (char *) child_id, NULL}; + int status = support_subprogram_wait (args[0], args); - status = support_subprogram_wait (args[0], args); + free (execname); + free (dirname); -err: - if (outfd >= 0) - close (outfd); - if (infd >= 0) - close (infd); - if (execname != NULL) + if (WIFEXITED (status)) { - unlink (execname); - free (execname); + if (WEXITSTATUS (status) == 0) + return; + else + exit (WEXITSTATUS (status)); } - if (dirname != NULL) + else + FAIL_EXIT1 ("subprogram failed with status %d", status); +} + +/* Returns true if a group with NAME has been found, and writes its + GID to *TARGET. 
*/ +static bool +find_sgid_group (gid_t *target, const char *name) +{ + /* Do not use getgrname_r because it does not work in statically + linked binaries if the system libc is different. */ + FILE *fp = fopen ("/etc/group", "rce"); + if (fp == NULL) + return false; + __fsetlocking (fp, FSETLOCKING_BYCALLER); + + bool ok = false; + struct scratch_buffer buf; + scratch_buffer_init (&buf); + while (true) { - rmdir (dirname); - free (dirname); + struct group grp; + struct group *result = NULL; + int status = fgetgrent_r (fp, &grp, buf.data, buf.length, &result); + if (status == 0 && result != NULL) + { + if (strcmp (result->gr_name, name) == 0) + { + *target = result->gr_gid; + ok = true; + break; + } + } + else if (errno != ERANGE) + break; + else if (!scratch_buffer_grow (&buf)) + break; } - - if (ret == 77) - FAIL_UNSUPPORTED ("Failed to make sgid executable for test\n"); - if (ret != 0) - FAIL_EXIT1 ("Failed to make sgid executable for test\n"); - - return status; + scratch_buffer_free (&buf); + fclose (fp); + return ok; } -int -support_capture_subprogram_self_sgid (char *child_id) +void +support_capture_subprogram_self_sgid (const char *child_id) { - gid_t target = 0; const int count = 64; gid_t groups[count]; @@ -224,6 +206,7 @@ support_capture_subprogram_self_sgid (char *child_id) (intmax_t) getuid ()); gid_t current = getgid (); + gid_t target = current; for (int i = 0; i < ret; ++i) { if (groups[i] != current) @@ -233,11 +216,18 @@ support_capture_subprogram_self_sgid (char *child_id) } } - if (target == 0) - FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", - (intmax_t) getuid ()); + if (target == current) + { + /* If running as root, try to find a harmless group for SGID. 
*/ + if (getuid () != 0 + || (!find_sgid_group (&target, "nogroup") + && !find_sgid_group (&target, "bin") + && !find_sgid_group (&target, "daemon"))) + FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n", + (intmax_t) getuid ()); + } - return copy_and_spawn_sgid (child_id, target); + copy_and_spawn_sgid (child_id, target); } void diff --git a/support/support_record_failure.c b/support/support_record_failure.c index 9781237..72ee2b2 100644 --- a/support/support_record_failure.c +++ b/support/support_record_failure.c @@ -112,3 +112,13 @@ support_record_failure_is_failed (void) synchronization for reliable test error reporting anyway. */ return __atomic_load_n (&state->failed, __ATOMIC_RELAXED); } + +void +support_record_failure_barrier (void) +{ + if (__atomic_load_n (&state->failed, __ATOMIC_RELAXED)) + { + puts ("error: exiting due to previous errors"); + exit (1); + } +} diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c index 0a86c98..aaf874f 100644 --- a/sysdeps/aarch64/fpu/acos_advsimd.c +++ b/sysdeps/aarch64/fpu/acos_advsimd.c @@ -18,24 +18,23 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" static const struct data { - float64x2_t poly[12]; - float64x2_t pi, pi_over_2; + double c1, c3, c5, c7, c9, c11; + float64x2_t c0, c2, c4, c6, c8, c10; uint64x2_t abs_mask; + float64x2_t pi, pi_over_2; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. 
*/ - .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), - V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), - V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), - V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), - V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), - V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, - .pi = V2 (0x1.921fb54442d18p+1), - .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, + .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, + .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, + .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, + .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, + .pi = V2 (0x1.921fb54442d18p+1), .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), }; @@ -63,7 +62,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) acos(x) ~ pi/2 - (x + x^3 P(x^2)). - The largest observed error in this region is 1.18 ulps, + The largest observed error in this region is 1.18 ulp: _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0 want 0x1.0d54d1985c069p+0. @@ -71,9 +70,9 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). - The largest observed error in this region is 1.52 ulps, - _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1 - want 0x1.edbbedf8a7d6cp-1. */ + The largest observed error in this region is 1.50 ulp: + _ZGVnN2v_acos (0x1.252a2cf3fb9acp-1) got 0x1.ec1a46aa82901p-1 + want 0x1.ec1a46aa829p-1. */ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -99,13 +98,32 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2)); /* Use a single polynomial approximation P for both intervals. 
*/ + float64x2_t z3 = vmulq_f64 (z2, z); float64x2_t z4 = vmulq_f64 (z2, z2); float64x2_t z8 = vmulq_f64 (z4, z4); - float64x2_t z16 = vmulq_f64 (z8, z8); - float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); - /* Finalize polynomial: z + z * z2 * P(z2). */ - p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); + /* Order-11 Estrin. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p411 = vfmaq_f64 (p47, z8, p811); + float64x2_t p = vfmaq_f64 (p03, z8, p411); + + /* Finalize polynomial: z + z3 * P(z2). */ + p = vfmaq_f64 (z, z3, p); /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 = 2 Q(|x|) , for 0.5 < x < 1.0 diff --git a/sysdeps/aarch64/fpu/acos_sve.c b/sysdeps/aarch64/fpu/acos_sve.c index 99dbfac..870d806 100644 --- a/sysdeps/aarch64/fpu/acos_sve.c +++ b/sysdeps/aarch64/fpu/acos_sve.c @@ -18,20 +18,21 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[12]; - float64_t pi, pi_over_2; + float64_t c1, c3, c5, c7, c9, c11; + float64_t c0, c2, c4, c6, c8, c10; + float64_t pi_over_2; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. 
*/ - .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5, - 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, - 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8, - 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, - .pi = 0x1.921fb54442d18p+1, + .c0 = 0x1.555555555554ep-3, .c1 = 0x1.3333333337233p-4, + .c2 = 0x1.6db6db67f6d9fp-5, .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = 0x1.6e8b264d467d6p-6, .c5 = 0x1.1c5997c357e9dp-6, + .c6 = 0x1.c86a22cd9389dp-7, .c7 = 0x1.856073c22ebbep-7, + .c8 = 0x1.fd1151acb6bedp-8, .c9 = 0x1.087182f799c1dp-6, + .c10 = -0x1.6602748120927p-7, .c11 = 0x1.cfa0dd1f9478p-6, .pi_over_2 = 0x1.921fb54442d18p+0, }; @@ -42,20 +43,21 @@ static const struct data acos(x) ~ pi/2 - (x + x^3 P(x^2)). - The largest observed error in this region is 1.18 ulps, - _ZGVsMxv_acos (0x1.fbc5fe28ee9e3p-2) got 0x1.0d4d0f55667f6p+0 - want 0x1.0d4d0f55667f7p+0. + The largest observed error in this region is 1.18 ulp: + _ZGVsMxv_acos (0x1.fbb7c9079b429p-2) got 0x1.0d51266607582p+0 + want 0x1.0d51266607583p+0. For |x| in [0.5, 1.0], use same approximation with a change of variable acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). - The largest observed error in this region is 1.52 ulps, - _ZGVsMxv_acos (0x1.24024271a500ap-1) got 0x1.ed82df4243f0dp-1 - want 0x1.ed82df4243f0bp-1. */ + The largest observed error in this region is 1.50 ulp: + _ZGVsMxv_acos (0x1.252a2cf3fb9acp-1) got 0x1.ec1a46aa82901p-1 + want 0x1.ec1a46aa829p-1. */ svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b64 (); svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); svfloat64_t ax = svabs_x (pg, x); @@ -70,24 +72,41 @@ svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) svfloat64_t z = svsqrt_m (ax, a_gt_half, z2); /* Use a single polynomial approximation P for both intervals. 
*/ - svfloat64_t z4 = svmul_x (pg, z2, z2); - svfloat64_t z8 = svmul_x (pg, z4, z4); - svfloat64_t z16 = svmul_x (pg, z8, z8); - svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); + svfloat64_t z3 = svmul_x (ptrue, z2, z); + svfloat64_t z4 = svmul_x (ptrue, z2, z2); + svfloat64_t z8 = svmul_x (ptrue, z4, z4); + + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + + svfloat64_t p411 = svmla_x (pg, p47, z8, p811); + svfloat64_t p = svmad_x (pg, p411, z8, p03); /* Finalize polynomial: z + z * z2 * P(z2). */ - p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + p = svmad_x (pg, p, z3, z); /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 = 2 Q(|x|) , for 0.5 < x < 1.0 = pi - 2 Q(|x|) , for -1.0 < x < -0.5. 
*/ - svfloat64_t y - = svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (p), sign)); - - svbool_t is_neg = svcmplt (pg, x, 0.0); - svfloat64_t off = svdup_f64_z (is_neg, d->pi); - svfloat64_t mul = svsel (a_gt_half, sv_f64 (2.0), sv_f64 (-1.0)); - svfloat64_t add = svsel (a_gt_half, off, sv_f64 (d->pi_over_2)); - - return svmla_x (pg, add, mul, y); + svfloat64_t mul = svreinterpret_f64 ( + svlsl_m (a_gt_half, svreinterpret_u64 (sv_f64 (1.0)), 10)); + mul = svreinterpret_f64 (sveor_x (ptrue, svreinterpret_u64 (mul), sign)); + svfloat64_t add = svreinterpret_f64 ( + svorr_x (ptrue, sign, svreinterpret_u64 (sv_f64 (d->pi_over_2)))); + add = svsub_m (a_gt_half, sv_f64 (d->pi_over_2), add); + + return svmsb_x (pg, p, mul, add); } diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c index 2de6eff..f488422 100644 --- a/sysdeps/aarch64/fpu/asin_advsimd.c +++ b/sysdeps/aarch64/fpu/asin_advsimd.c @@ -18,24 +18,23 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" static const struct data { - float64x2_t poly[12]; + float64x2_t c0, c2, c4, c6, c8, c10; float64x2_t pi_over_2; uint64x2_t abs_mask; + double c1, c3, c5, c7, c9, c11; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. 
*/ - .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), - V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), - V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), - V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), - V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), - V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, - .pi_over_2 = V2 (0x1.921fb54442d18p+0), - .abs_mask = V2 (0x7fffffffffffffff), + .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, + .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, + .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, + .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, + .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), }; #define AllMask v_u64 (0xffffffffffffffff) @@ -68,8 +67,8 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). The largest observed error in this region is 2.69 ulps, - _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 - want 0x1.110d7e85fdd53p-1. */ + _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1 + want 0x1.1111dd54ddf99p-1. 
*/ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -86,7 +85,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) return special_case (x, x, AllMask); #endif - uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5)); + uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5)); /* Evaluate polynomial Q(x) = y + y * z * P(z) with z = x ^ 2 and y = |x| , if |x| < 0.5 @@ -99,7 +98,26 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) float64x2_t z4 = vmulq_f64 (z2, z2); float64x2_t z8 = vmulq_f64 (z4, z4); float64x2_t z16 = vmulq_f64 (z8, z8); - float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); + + /* order-11 estrin. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); + float64x2_t p = vfmaq_f64 (p07, z16, p811); /* Finalize polynomial: z + z * z2 * P(z2). */ p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); diff --git a/sysdeps/aarch64/fpu/asin_sve.c b/sysdeps/aarch64/fpu/asin_sve.c index 9daa382..acbf1b3 100644 --- a/sysdeps/aarch64/fpu/asin_sve.c +++ b/sysdeps/aarch64/fpu/asin_sve.c @@ -18,45 +18,43 @@ <https://www.gnu.org/licenses/>. 
*/ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[12]; - float64_t pi_over_2f; + float64_t c1, c3, c5, c7, c9, c11; + float64_t c0, c2, c4, c6, c8, c10; + float64_t pi_over_2; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ - .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, - 0x1.6db6db67f6d9fp-5, 0x1.f1c71fbd29fbbp-6, - 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, - 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, - 0x1.fd1151acb6bedp-8, 0x1.087182f799c1dp-6, - -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, - .pi_over_2f = 0x1.921fb54442d18p+0, + .c0 = 0x1.555555555554ep-3, .c1 = 0x1.3333333337233p-4, + .c2 = 0x1.6db6db67f6d9fp-5, .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = 0x1.6e8b264d467d6p-6, .c5 = 0x1.1c5997c357e9dp-6, + .c6 = 0x1.c86a22cd9389dp-7, .c7 = 0x1.856073c22ebbep-7, + .c8 = 0x1.fd1151acb6bedp-8, .c9 = 0x1.087182f799c1dp-6, + .c10 = -0x1.6602748120927p-7, .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = 0x1.921fb54442d18p+0, }; -#define P(i) sv_f64 (d->poly[i]) - /* Double-precision SVE implementation of vector asin(x). For |x| in [0, 0.5], use an order 11 polynomial P such that the final approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). - The largest observed error in this region is 0.52 ulps, - _ZGVsMxv_asin(0x1.d95ae04998b6cp-2) got 0x1.ec13757305f27p-2 - want 0x1.ec13757305f26p-2. - - For |x| in [0.5, 1.0], use same approximation with a change of variable + The largest observed error in this region is 0.98 ulp: + _ZGVsMxv_asin (0x1.d98f6a748ed8ap-2) got 0x1.ec4eb661a73d3p-2 + want 0x1.ec4eb661a73d2p-2. - asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + For |x| in [0.5, 1.0], use same approximation with a change of variable: + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). 
- The largest observed error in this region is 2.69 ulps, - _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 - want 0x1.110d7e85fdd53p-1. */ + The largest observed error in this region is 2.66 ulp: + _ZGVsMxv_asin (0x1.04024f6e2a2fbp-1) got 0x1.10b9586f087a8p-1 + want 0x1.10b9586f087abp-1. */ svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b64 (); svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); svfloat64_t ax = svabs_x (pg, x); @@ -70,17 +68,37 @@ svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) svfloat64_t z = svsqrt_m (ax, a_ge_half, z2); /* Use a single polynomial approximation P for both intervals. */ + svfloat64_t z3 = svmul_x (pg, z2, z); svfloat64_t z4 = svmul_x (pg, z2, z2); svfloat64_t z8 = svmul_x (pg, z4, z4); - svfloat64_t z16 = svmul_x (pg, z8, z8); - svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); + + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + + /* Order-11 Estrin scheme. */ + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + + svfloat64_t p411 = svmla_x (pg, p47, z8, p811); + svfloat64_t p = svmla_x (pg, p03, z8, p411); + /* Finalize polynomial: z + z * z2 * P(z2). */ - p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + p = svmla_x (pg, z, z3, p); - /* asin(|x|) = Q(|x|) , for |x| < 0.5 - = pi/2 - 2 Q(|x|), for |x| >= 0.5. 
*/ - svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2f); + /* asin(|x|) = Q(|x|), for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2); - /* Copy sign. */ + /* Reinsert the sign from the argument. */ return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); } diff --git a/sysdeps/aarch64/fpu/asinf_advsimd.c b/sysdeps/aarch64/fpu/asinf_advsimd.c index 59d870c..88437f8 100644 --- a/sysdeps/aarch64/fpu/asinf_advsimd.c +++ b/sysdeps/aarch64/fpu/asinf_advsimd.c @@ -18,22 +18,21 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f32.h" static const struct data { - float32x4_t poly[5]; + float32x4_t c0, c2, c4; + float c1, c3; float32x4_t pi_over_2f; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ - .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), - V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, - .pi_over_2f = V4 (0x1.921fb6p+0f), + .c0 = V4 (0x1.55555ep-3f), .c1 = 0x1.33261ap-4f, + .c2 = V4 (0x1.70d7dcp-5f), .c3 = 0x1.b059dp-6f, + .c4 = V4 (0x1.3af7d8p-5f), .pi_over_2f = V4 (0x1.921fb6p+0f), }; #define AbsMask 0x7fffffff -#define Half 0x3f000000 #define One 0x3f800000 #define Small 0x39800000 /* 2^-12. */ @@ -47,11 +46,8 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) /* Single-precision implementation of vector asin(x). - For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct - rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the - following approximation. - For |x| in [Small, 0.5], use order 4 polynomial P such that the final + For |x| <0.5, use order 4 polynomial P such that the final approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). 
The largest observed error in this region is 0.83 ulps, @@ -80,24 +76,31 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x) #endif float32x4_t ax = vreinterpretq_f32_u32 (ia); - uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half)); + uint32x4_t a_lt_half = vcaltq_f32 (x, v_f32 (0.5f)); /* Evaluate polynomial Q(x) = y + y * z * P(z) with z = x ^ 2 and y = |x| , if |x| < 0.5 z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x), - vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); + vfmsq_n_f32 (v_f32 (0.5f), ax, 0.5f)); float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2)); /* Use a single polynomial approximation P for both intervals. */ - float32x4_t p = v_horner_4_f32 (z2, d->poly); + + /* PW Horner 3 evaluation scheme. */ + float32x4_t z4 = vmulq_f32 (z2, z2); + float32x4_t c13 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c13, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c13, 1); + float32x4_t p = vfmaq_f32 (p23, d->c4, z4); + p = vfmaq_f32 (p01, p, z4); /* Finalize polynomial: z + z * z2 * P(z2). */ p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); /* asin(|x|) = Q(|x|) , for |x| < 0.5 = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ float32x4_t y - = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0)); + = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0f)); /* Copy sign. */ return vbslq_f32 (v_u32 (AbsMask), y, x); diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c index 5df4b99..36be4ce 100644 --- a/sysdeps/aarch64/fpu/atan2_advsimd.c +++ b/sysdeps/aarch64/fpu/atan2_advsimd.c @@ -18,40 +18,38 @@ <https://www.gnu.org/licenses/>. 
*/ #include "v_math.h" -#include "poly_advsimd_f64.h" static const struct data { + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; float64x2_t pi_over_2; - double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; - uint64x2_t zeroinfnan, minustwo; + uint64x2_t zeroinfnan; } data = { - /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on - [2**-1022, 1.0]. */ - .c0 = V2 (-0x1.5555555555555p-2), - .c1 = 0x1.99999999996c1p-3, - .c2 = V2 (-0x1.2492492478f88p-3), - .c3 = 0x1.c71c71bc3951cp-4, - .c4 = V2 (-0x1.745d160a7e368p-4), - .c5 = 0x1.3b139b6a88ba1p-4, - .c6 = V2 (-0x1.11100ee084227p-4), - .c7 = 0x1.e1d0f9696f63bp-5, - .c8 = V2 (-0x1.aebfe7b418581p-5), - .c9 = 0x1.842dbe9b0d916p-5, - .c10 = V2 (-0x1.5d30140ae5e99p-5), - .c11 = 0x1.338e31eb2fbbcp-5, - .c12 = V2 (-0x1.00e6eece7de8p-5), - .c13 = 0x1.860897b29e5efp-6, - .c14 = V2 (-0x1.0051381722a59p-6), - .c15 = 0x1.14e9dc19a4a4ep-7, - .c16 = V2 (-0x1.d0062b42fe3bfp-9), - .c17 = 0x1.17739e210171ap-10, - .c18 = V2 (-0x1.ab24da7be7402p-13), - .c19 = 0x1.358851160a528p-16, + /* Coefficients of polynomial P such that + atan(x)~x+x*P(x^2) on [2^-1022, 1.0]. 
*/ + .c0 = V2 (-0x1.555555555552ap-2), + .c1 = 0x1.9999999995aebp-3, + .c2 = V2 (-0x1.24924923923f6p-3), + .c3 = 0x1.c71c7184288a2p-4, + .c4 = V2 (-0x1.745d11fb3d32bp-4), + .c5 = 0x1.3b136a18051b9p-4, + .c6 = V2 (-0x1.110e6d985f496p-4), + .c7 = 0x1.e1bcf7f08801dp-5, + .c8 = V2 (-0x1.ae644e28058c3p-5), + .c9 = 0x1.82eeb1fed85c6p-5, + .c10 = V2 (-0x1.59d7f901566cbp-5), + .c11 = 0x1.2c982855ab069p-5, + .c12 = V2 (-0x1.eb49592998177p-6), + .c13 = 0x1.69d8b396e3d38p-6, + .c14 = V2 (-0x1.ca980345c4204p-7), + .c15 = 0x1.dc050eafde0b3p-8, + .c16 = V2 (-0x1.7ea70755b8eccp-9), + .c17 = 0x1.ba3da3de903e8p-11, + .c18 = V2 (-0x1.44a4b059b6f67p-13), + .c19 = 0x1.c4a45029e5a91p-17, .pi_over_2 = V2 (0x1.921fb54442d18p+0), .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), - .minustwo = V2 (0xc000000000000000), }; #define SignMask v_u64 (0x8000000000000000) @@ -76,10 +74,9 @@ zeroinfnan (uint64x2_t i, const struct data *d) } /* Fast implementation of vector atan2. - Maximum observed error is 2.8 ulps: - _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) - got 0x1.92d628ab678ccp-1 - want 0x1.92d628ab678cfp-1. */ + Maximum observed error is 1.97 ulps: + _ZGVnN2vv_atan2 (0x1.42337dba73768p+5, 0x1.422d748cd3e29p+5) + got 0x1.9224810264efcp-1 want 0x1.9224810264efep-1. */ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -100,26 +97,29 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) uint64x2_t pred_xlt0 = vcltzq_f64 (x); uint64x2_t pred_aygtax = vcagtq_f64 (y, x); - /* Set up z for call to atan. */ - float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); - float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); - float64x2_t z = vdivq_f64 (n, q); - - /* Work out the correct shift. 
*/ - float64x2_t shift - = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); - shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); - shift = vmulq_f64 (shift, d->pi_over_2); - - /* Calculate the polynomial approximation. - Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of - full scheme to avoid underflow in x^16. - The order 19 polynomial P approximates - (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + /* Set up z for evaluation of atan. */ + float64x2_t num = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); + float64x2_t den = vbslq_f64 (pred_aygtax, ay, ax); + float64x2_t z = vdivq_f64 (num, den); + + /* Work out the correct shift for atan2: + Multiplication by pi/2 is done later. + -pi when x < 0 and ax > ay + -pi/2 when x < 0 and ax < ay + 0 when x >= 0 and ax > ay + pi/2 when x >= 0 and ax < ay. */ + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); + float64x2_t shift2 = vreinterpretq_f64_u64 ( + vandq_u64 (pred_aygtax, vreinterpretq_u64_f64 (v_f64 (1.0)))); + shift = vaddq_f64 (shift, shift2); + + /* Calculate the polynomial approximation. */ float64x2_t z2 = vmulq_f64 (z, z); - float64x2_t x2 = vmulq_f64 (z2, z2); - float64x2_t x4 = vmulq_f64 (x2, x2); - float64x2_t x8 = vmulq_f64 (x4, x4); + float64x2_t z3 = vmulq_f64 (z2, z); + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); float64x2_t c13 = vld1q_f64 (&d->c1); float64x2_t c57 = vld1q_f64 (&d->c5); @@ -127,45 +127,43 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) float64x2_t c1315 = vld1q_f64 (&d->c13); float64x2_t c1719 = vld1q_f64 (&d->c17); - /* estrin_7. */ + /* Order-7 Estrin. 
*/ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); - float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); - float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); - float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); - /* estrin_11. */ + /* Order-11 Estrin. */ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); - float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); - float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + float64x2_t p1215 = vfmaq_f64 (p1213, z4, p1415); float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); - float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + float64x2_t p1619 = vfmaq_f64 (p1617, z4, p1819); - float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); - float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + float64x2_t p815 = vfmaq_f64 (p811, z8, p1215); + float64x2_t p819 = vfmaq_f64 (p815, z16, p1619); - float64x2_t ret = vfmaq_f64 (p07, p819, x8); + float64x2_t poly = vfmaq_f64 (p07, p819, z16); /* Finalize. y = shift + z + z^3 * P(z^2). */ - ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); - ret = vaddq_f64 (ret, shift); + float64x2_t ret = vfmaq_f64 (z, shift, d->pi_over_2); + ret = vfmaq_f64 (ret, z3, poly); if (__glibc_unlikely (v_any_u64 (special_cases))) return special_case (y, x, ret, sign_xy, special_cases); /* Account for the sign of x and y. 
*/ - ret = vreinterpretq_f64_u64 ( + return vreinterpretq_f64_u64 ( veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); - - return ret; } diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c index 04fa71f..7af4931 100644 --- a/sysdeps/aarch64/fpu/atan2_sve.c +++ b/sysdeps/aarch64/fpu/atan2_sve.c @@ -18,25 +18,25 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[20]; - float64_t pi_over_2; + float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-1022, 1.0]. */ - .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, - 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, - -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, - 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, - -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, - 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, - -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, - .pi_over_2 = 0x1.921fb54442d18p+0, + .c0 = -0x1.555555555552ap-2, .c1 = 0x1.9999999995aebp-3, + .c2 = -0x1.24924923923f6p-3, .c3 = 0x1.c71c7184288a2p-4, + .c4 = -0x1.745d11fb3d32bp-4, .c5 = 0x1.3b136a18051b9p-4, + .c6 = -0x1.110e6d985f496p-4, .c7 = 0x1.e1bcf7f08801dp-5, + .c8 = -0x1.ae644e28058c3p-5, .c9 = 0x1.82eeb1fed85c6p-5, + .c10 = -0x1.59d7f901566cbp-5, .c11 = 0x1.2c982855ab069p-5, + .c12 = -0x1.eb49592998177p-6, .c13 = 0x1.69d8b396e3d38p-6, + .c14 = -0x1.ca980345c4204p-7, .c15 = 0x1.dc050eafde0b3p-8, + .c16 = -0x1.7ea70755b8eccp-9, .c17 = 0x1.ba3da3de903e8p-11, + .c18 = -0x1.44a4b059b6f67p-13, .c19 = 0x1.c4a45029e5a91p-17, }; - /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). 
*/ static svfloat64_t NOINLINE special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret, @@ -55,15 +55,17 @@ zeroinfnan (svuint64_t i, const svbool_t pg) } /* Fast implementation of SVE atan2. Errors are greatest when y and - x are reasonably close together. The greatest observed error is 2.28 ULP: - _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) - got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */ -svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) + x are reasonably close together. The greatest observed error is 1.94 ULP: + _ZGVsMxvv_atan2 (0x1.8a4bf7167228ap+5, 0x1.84971226bb57bp+5) + got 0x1.95db19dfef9ccp-1 want 0x1.95db19dfef9cep-1. */ +svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, + const svbool_t pg) { - const struct data *data_ptr = ptr_barrier (&data); + const struct data *d = ptr_barrier (&data); svuint64_t ix = svreinterpret_u64 (x); svuint64_t iy = svreinterpret_u64 (y); + svbool_t ptrue = svptrue_b64 (); svbool_t cmp_x = zeroinfnan (ix, pg); svbool_t cmp_y = zeroinfnan (iy, pg); @@ -80,32 +82,67 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) svbool_t pred_aygtax = svcmpgt (pg, ay, ax); - /* Set up z for call to atan. */ - svfloat64_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); - svfloat64_t d = svsel (pred_aygtax, ay, ax); - svfloat64_t z = svdiv_x (pg, n, d); - - /* Work out the correct shift. */ + /* Set up z for evaluation of atan. */ + svfloat64_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat64_t den = svsel (pred_aygtax, ay, ax); + svfloat64_t z = svdiv_x (pg, num, den); + + /* Work out the correct shift for atan2: + Multiplication by pi/2 is done later. + -pi when x < 0 and ax > ay + -pi/2 when x < 0 and ax < ay + 0 when x >= 0 and ax > ay + pi/2 when x >= 0 and ax < ay. 
*/ svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1)); + svfloat64_t shift_mul = svreinterpret_f64 ( + svorr_x (pg, sign_x, svreinterpret_u64 (sv_f64 (0x1.921fb54442d18p+0)))); shift = svsel (pred_aygtax, sv_f64 (1.0), shift); - shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift))); - shift = svmul_x (pg, shift, data_ptr->pi_over_2); + shift = svmla_x (pg, z, shift, shift_mul); /* Use split Estrin scheme for P(z^2) with deg(P)=19. */ svfloat64_t z2 = svmul_x (pg, z, z); - svfloat64_t x2 = svmul_x (pg, z2, z2); - svfloat64_t x4 = svmul_x (pg, x2, x2); - svfloat64_t x8 = svmul_x (pg, x4, x4); + svfloat64_t z3 = svmul_x (pg, z2, z); + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); + svfloat64_t z16 = svmul_x (pg, z8, z8); - svfloat64_t ret = svmla_x ( - pg, sv_estrin_7_f64_x (pg, z2, x2, x4, data_ptr->poly), - sv_estrin_11_f64_x (pg, z2, x2, x4, x8, data_ptr->poly + 8), x8); + /* Order-7 Estrin. */ + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); - /* y = shift + z + z^3 * P(z^2). */ - svfloat64_t z3 = svmul_x (pg, z2, z); - ret = svmla_x (pg, z, z3, ret); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + svfloat64_t p07 = svmla_x (pg, p03, z8, p47); + + /* Order-11 Estrin. 
*/ + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + svfloat64_t c1315 = svld1rq (ptrue, &d->c13); + svfloat64_t c1719 = svld1rq (ptrue, &d->c17); - ret = svadd_m (pg, ret, shift); + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), z2, c1315, 0); + svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), z2, c1315, 1); + svfloat64_t p1215 = svmla_x (pg, p1213, z4, p1415); + + svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), z2, c1719, 0); + svfloat64_t p1819 = svmla_lane (sv_f64 (d->c18), z2, c1719, 1); + svfloat64_t p1619 = svmla_x (pg, p1617, z4, p1819); + + svfloat64_t p815 = svmla_x (pg, p811, z8, p1215); + svfloat64_t p819 = svmla_x (pg, p815, z16, p1619); + + svfloat64_t poly = svmla_x (pg, p07, z16, p819); + + /* y = shift + z + z^3 * P(z^2). */ + svfloat64_t ret = svmla_x (pg, shift, z3, poly); /* Account for the sign of x and y. */ if (__glibc_unlikely (svptest_any (pg, cmp_xy))) diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c index 88daacd..23a825e 100644 --- a/sysdeps/aarch64/fpu/atan2f_advsimd.c +++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c @@ -18,22 +18,22 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f32.h" static const struct data { - float32x4_t c0, pi_over_2, c4, c6, c2; + float32x4_t c0, c4, c6, c2; float c1, c3, c5, c7; uint32x4_t comp_const; + float32x4_t pi; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. Generated using fpminimax between FLT_MIN and 1. 
*/ - .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, - .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, - .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, - .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, - .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), + .c0 = V4 (-0x1.5554dcp-2), .c1 = 0x1.9978ecp-3, + .c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4, + .c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5, + .c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9, + .pi = V4 (0x1.921fb6p+1f), .comp_const = V4 (2 * 0x7f800000lu - 1), }; #define SignMask v_u32 (0x80000000) @@ -54,13 +54,13 @@ static inline uint32x4_t zeroinfnan (uint32x4_t i, const struct data *d) { /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ - return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); + return vcgeq_u32 (vsubq_u32 (vshlq_n_u32 (i, 1), v_u32 (1)), d->comp_const); } /* Fast implementation of vector atan2f. Maximum observed error is - 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: - _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 - want 0x1.967f00p-1. */ + 2.13 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: + _ZGVnN4vv_atan2f (0x1.14a9d4p-87, 0x1.0eb886p-87) got 0x1.97aea2p-1 + want 0x1.97ae9ep-1. */ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -81,28 +81,31 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) uint32x4_t pred_xlt0 = vcltzq_f32 (x); uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax); - /* Set up z for call to atanf. */ - float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); - float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); - float32x4_t z = vdivq_f32 (n, q); - - /* Work out the correct shift. */ + /* Set up z for evaluation of atanf. 
*/ + float32x4_t num = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); + float32x4_t den = vbslq_f32 (pred_aygtax, ay, ax); + float32x4_t z = vdivq_f32 (num, den); + + /* Work out the correct shift for atan2: + Multiplication by pi is done later. + -pi when x < 0 and ax > ay + -pi/2 when x < 0 and ax < ay + 0 when x >= 0 and ax > ay + pi/2 when x >= 0 and ax < ay. */ float32x4_t shift = vreinterpretq_f32_u32 ( - vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); - shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); - shift = vmulq_f32 (shift, d->pi_over_2); - - /* Calculate the polynomial approximation. - Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, - a standard implementation using z8 creates spurious underflow - in the very last fma (when z^8 is small enough). - Therefore, we split the last fma into a mul and an fma. - Horner and single-level Estrin have higher errors that exceed - threshold. */ + vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-1.0f)))); + float32x4_t shift2 = vreinterpretq_f32_u32 ( + vandq_u32 (pred_aygtax, vreinterpretq_u32_f32 (v_f32 (0.5f)))); + shift = vaddq_f32 (shift, shift2); + + /* Calculate the polynomial approximation. */ float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z3 = vmulq_f32 (z2, z); float32x4_t z4 = vmulq_f32 (z2, z2); + float32x4_t z8 = vmulq_f32 (z4, z4); float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); @@ -110,10 +113,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) float32x4_t p03 = vfmaq_f32 (p01, z4, p23); float32x4_t p47 = vfmaq_f32 (p45, z4, p67); - float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); + float32x4_t poly = vfmaq_f32 (p03, z8, p47); /* y = shift + z * P(z^2). 
*/ - ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); + float32x4_t ret = vfmaq_f32 (z, shift, d->pi); + ret = vfmaq_f32 (ret, z3, poly); if (__glibc_unlikely (v_any_u32 (special_cases))) { diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c index 9ea1971..f7f6d40 100644 --- a/sysdeps/aarch64/fpu/atan2f_sve.c +++ b/sysdeps/aarch64/fpu/atan2f_sve.c @@ -18,18 +18,18 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f32.h" static const struct data { - float32_t poly[8]; + float32_t c0, c2, c4, c6; + float32_t c1, c3, c5, c7; float32_t pi_over_2; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. */ - .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, - -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, - .pi_over_2 = 0x1.921fb6p+0f, + .c0 = -0x1.5554dcp-2, .c1 = 0x1.9978ecp-3, .c2 = -0x1.230a94p-3, + .c3 = 0x1.b4debp-4, .c4 = -0x1.3550dap-4, .c5 = 0x1.61eebp-5, + .c6 = -0x1.0c17d4p-6, .c7 = 0x1.7ea694p-9, .pi_over_2 = 0x1.921fb6p+0f, }; /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ @@ -51,12 +51,14 @@ zeroinfnan (svuint32_t i, const svbool_t pg) /* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum - observed error is 2.95 ULP: - _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 - want 0x1.967f00p-1. */ -svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) + observed error is 2.21 ULP: + _ZGVsMxvv_atan2f (0x1.a04aa8p+6, 0x1.9a274p+6) got 0x1.95ed3ap-1 + want 0x1.95ed36p-1. 
*/ +svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, + const svbool_t pg) { - const struct data *data_ptr = ptr_barrier (&data); + const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b32 (); svuint32_t ix = svreinterpret_u32 (x); svuint32_t iy = svreinterpret_u32 (y); @@ -76,29 +78,42 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) svbool_t pred_aygtax = svcmpgt (pg, ay, ax); - /* Set up z for call to atan. */ - svfloat32_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); - svfloat32_t d = svsel (pred_aygtax, ay, ax); - svfloat32_t z = svdiv_x (pg, n, d); - - /* Work out the correct shift. */ + /* Set up z for evaluation of atanf. */ + svfloat32_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat32_t den = svsel (pred_aygtax, ay, ax); + svfloat32_t z = svdiv_x (ptrue, num, den); + + /* Work out the correct shift for atan2: + Multiplication by pi/2 is done later. + -pi when x < 0 and ax > ay + -pi/2 when x < 0 and ax < ay + 0 when x >= 0 and ax > ay + pi/2 when x >= 0 and ax < ay. */ svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1)); shift = svsel (pred_aygtax, sv_f32 (1.0), shift); shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift))); - shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2)); /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */ - svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z2 = svmul_x (ptrue, z, z); + svfloat32_t z3 = svmul_x (pg, z2, z); svfloat32_t z4 = svmul_x (pg, z2, z2); svfloat32_t z8 = svmul_x (pg, z4, z4); - svfloat32_t ret = sv_estrin_7_f32_x (pg, z2, z4, z8, data_ptr->poly); + svfloat32_t odd_coeffs = svld1rq (ptrue, &d->c1); - /* ret = shift + z + z^3 * P(z^2). 
*/ - svfloat32_t z3 = svmul_x (pg, z2, z); - ret = svmla_x (pg, z, z3, ret); + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); + svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), z2, odd_coeffs, 3); - ret = svadd_m (pg, ret, shift); + svfloat32_t p03 = svmla_x (pg, p01, z4, p23); + svfloat32_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat32_t poly = svmla_x (pg, p03, z8, p47); + + /* ret = shift + z + z^3 * P(z^2). */ + svfloat32_t ret = svmla_x (pg, z, shift, sv_f32 (d->pi_over_2)); + ret = svmla_x (pg, ret, z3, poly); /* Account for the sign of x and y. */ diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c index 14f1809..f835dae 100644 --- a/sysdeps/aarch64/fpu/atan_advsimd.c +++ b/sysdeps/aarch64/fpu/atan_advsimd.c @@ -18,7 +18,6 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" static const struct data { @@ -28,16 +27,16 @@ static const struct data } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-1022, 1.0]. 
*/ - .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, - .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, - .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, - .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, - .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, - .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, - .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, - .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, - .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10, - .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16, + .c0 = V2 (-0x1.555555555552ap-2), .c1 = 0x1.9999999995aebp-3, + .c2 = V2 (-0x1.24924923923f6p-3), .c3 = 0x1.c71c7184288a2p-4, + .c4 = V2 (-0x1.745d11fb3d32bp-4), .c5 = 0x1.3b136a18051b9p-4, + .c6 = V2 (-0x1.110e6d985f496p-4), .c7 = 0x1.e1bcf7f08801dp-5, + .c8 = V2 (-0x1.ae644e28058c3p-5), .c9 = 0x1.82eeb1fed85c6p-5, + .c10 = V2 (-0x1.59d7f901566cbp-5), .c11 = 0x1.2c982855ab069p-5, + .c12 = V2 (-0x1.eb49592998177p-6), .c13 = 0x1.69d8b396e3d38p-6, + .c14 = V2 (-0x1.ca980345c4204p-7), .c15 = 0x1.dc050eafde0b3p-8, + .c16 = V2 (-0x1.7ea70755b8eccp-9), .c17 = 0x1.ba3da3de903e8p-11, + .c18 = V2 (-0x1.44a4b059b6f67p-13), .c19 = 0x1.c4a45029e5a91p-17, .pi_over_2 = V2 (0x1.921fb54442d18p+0), }; @@ -47,9 +46,9 @@ static const struct data /* Fast implementation of vector atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using - z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: - _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 - want 0x1.9225645bdd7c3p-1. */ + z=1/x and shift = pi/2. Maximum observed error is 2.45 ulps: + _ZGVnN2v_atan (0x1.0008d737eb3e6p+0) got 0x1.92288c551a4c1p-1 + want 0x1.92288c551a4c3p-1. 
*/ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -78,59 +77,53 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0)); + uint64x2_t red = vcagtq_f64 (x, v_f64 (-1.0)); /* Avoid dependency in abs(x) in division (and comparison). */ - float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x); + float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (-1.0), x), x); + float64x2_t shift = vreinterpretq_f64_u64 ( vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2))); - /* Use absolute value only when needed (odd powers of z). */ - float64x2_t az = vbslq_f64 ( - SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z); - - /* Calculate the polynomial approximation. - Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of - full scheme to avoid underflow in x^16. - The order 19 polynomial P approximates - (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + + /* Reinsert sign bit from argument into the shift value. */ + shift = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (shift), sign)); + + /* Calculate polynomial approximation P(z^2) with deg(P)=19. */ float64x2_t z2 = vmulq_f64 (z, z); - float64x2_t x2 = vmulq_f64 (z2, z2); - float64x2_t x4 = vmulq_f64 (x2, x2); - float64x2_t x8 = vmulq_f64 (x4, x4); + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); - /* estrin_7. */ + /* Order-7 Estrin. 
*/ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); - float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); - float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); - float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); - /* estrin_11. */ + /* Order-11 Estrin. */ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); - float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); - float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + float64x2_t p1215 = vfmaq_f64 (p1213, z4, p1415); float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); - float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + float64x2_t p1619 = vfmaq_f64 (p1617, z4, p1819); - float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); - float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + float64x2_t p815 = vfmaq_f64 (p811, z8, p1215); + float64x2_t p819 = vfmaq_f64 (p815, z16, p1619); - float64x2_t y = vfmaq_f64 (p07, p819, x8); + float64x2_t y = vfmaq_f64 (p07, p819, z16); /* Finalize. y = shift + z + z^3 * P(z^2). */ - y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); - y = vaddq_f64 (y, shift); - - /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ - y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign)); - return y; + y = vfmsq_f64 (v_f64 (-1.0), z2, y); + return vfmsq_f64 (shift, z, y); } diff --git a/sysdeps/aarch64/fpu/atan_sve.c b/sysdeps/aarch64/fpu/atan_sve.c index fa16303..046c04f 100644 --- a/sysdeps/aarch64/fpu/atan_sve.c +++ b/sysdeps/aarch64/fpu/atan_sve.c @@ -18,23 +18,26 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[20]; - float64_t pi_over_2; + float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + float64_t shift_val, neg_one; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-1022, 1.0]. */ - .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, - 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, - -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, - 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, - -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, - 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, - -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, - .pi_over_2 = 0x1.921fb54442d18p+0, + .c0 = -0x1.555555555552ap-2, .c1 = 0x1.9999999995aebp-3, + .c2 = -0x1.24924923923f6p-3, .c3 = 0x1.c71c7184288a2p-4, + .c4 = -0x1.745d11fb3d32bp-4, .c5 = 0x1.3b136a18051b9p-4, + .c6 = -0x1.110e6d985f496p-4, .c7 = 0x1.e1bcf7f08801dp-5, + .c8 = -0x1.ae644e28058c3p-5, .c9 = 0x1.82eeb1fed85c6p-5, + .c10 = -0x1.59d7f901566cbp-5, .c11 = 0x1.2c982855ab069p-5, + .c12 = -0x1.eb49592998177p-6, .c13 = 0x1.69d8b396e3d38p-6, + .c14 = -0x1.ca980345c4204p-7, .c15 = 0x1.dc050eafde0b3p-8, + .c16 = -0x1.7ea70755b8eccp-9, .c17 = 0x1.ba3da3de903e8p-11, + .c18 = -0x1.44a4b059b6f67p-13, .c19 = 0x1.c4a45029e5a91p-17, + .shift_val = 0x1.490fdaa22168cp+1, .neg_one = -1, }; /* Useful constants. 
*/ @@ -43,15 +46,14 @@ static const struct data /* Fast implementation of SVE atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed - error is 2.27 ulps: - _ZGVsMxv_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 - want 0x1.9225645bdd7c3p-1. */ + error is 2.08 ulps: + _ZGVsMxv_atan (0x1.000a7c56975e8p+0) got 0x1.922a3163e15c2p-1 + want 0x1.922a3163e15c4p-1. */ svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - /* No need to trigger special case. Small cases, infs and nans - are supported by our approximation technique. */ + svbool_t ptrue = svptrue_b64 (); svuint64_t ix = svreinterpret_u64 (x); svuint64_t sign = svand_x (pg, ix, SignMask); @@ -59,32 +61,60 @@ svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - svbool_t red = svacgt (pg, x, 1.0); - /* Avoid dependency in abs(x) in division (and comparison). */ - svfloat64_t z = svsel (red, svdivr_x (pg, x, 1.0), x); - /* Use absolute value only when needed (odd powers of z). */ - svfloat64_t az = svabs_x (pg, z); - az = svneg_m (az, red, az); + svbool_t red = svacgt (pg, x, d->neg_one); + svfloat64_t z = svsel (red, svdiv_x (pg, sv_f64 (d->neg_one), x), x); + + /* Reuse -1.0 to reduce constant loads. + We need a shift value of pi/2, which is created via -1 + (1 + pi/2). */ + svfloat64_t shift + = svadd_z (red, sv_f64 (d->neg_one), sv_f64 (d->shift_val)); + + /* Reinserts the sign bit of the argument to handle the case of x < -1. */ + shift = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (shift), sign)); /* Use split Estrin scheme for P(z^2) with deg(P)=19. 
*/ - svfloat64_t z2 = svmul_x (pg, z, z); - svfloat64_t x2 = svmul_x (pg, z2, z2); - svfloat64_t x4 = svmul_x (pg, x2, x2); - svfloat64_t x8 = svmul_x (pg, x4, x4); + svfloat64_t z2 = svmul_x (ptrue, z, z); + svfloat64_t z4 = svmul_x (ptrue, z2, z2); + svfloat64_t z8 = svmul_x (ptrue, z4, z4); + svfloat64_t z16 = svmul_x (ptrue, z8, z8); - svfloat64_t y - = svmla_x (pg, sv_estrin_7_f64_x (pg, z2, x2, x4, d->poly), - sv_estrin_11_f64_x (pg, z2, x2, x4, x8, d->poly + 8), x8); + /* Order-7 Estrin. */ + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); - /* y = shift + z + z^3 * P(z^2). */ - svfloat64_t z3 = svmul_x (pg, z2, az); - y = svmla_x (pg, az, z3, y); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + svfloat64_t p07 = svmla_x (pg, p03, z8, p47); + + /* Order-11 Estrin. */ + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + svfloat64_t c1315 = svld1rq (ptrue, &d->c13); + svfloat64_t c1719 = svld1rq (ptrue, &d->c17); - /* Apply shift as indicated by `red` predicate. */ - y = svadd_m (red, y, d->pi_over_2); + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); - /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ - y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), z2, c1315, 0); + svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), z2, c1315, 1); + svfloat64_t p1215 = svmla_x (pg, p1213, z4, p1415); - return y; + svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), z2, c1719, 0); + svfloat64_t p1819 = svmla_lane (sv_f64 (d->c18), z2, c1719, 1); + svfloat64_t p1619 = svmla_x (pg, p1617, z4, p1819); + + svfloat64_t p815 = svmla_x (pg, p811, z8, p1215); + svfloat64_t p819 = svmla_x (pg, p815, z16, p1619); + + svfloat64_t y = svmla_x (pg, p07, z16, p819); + + /* y = shift + z + z^3 * P(z^2). */ + shift = svadd_m (red, z, shift); + y = svmul_x (pg, z2, y); + return svmla_x (pg, shift, z, y); } diff --git a/sysdeps/aarch64/fpu/atanf_advsimd.c b/sysdeps/aarch64/fpu/atanf_advsimd.c index d015cc7..e72074e 100644 --- a/sysdeps/aarch64/fpu/atanf_advsimd.c +++ b/sysdeps/aarch64/fpu/atanf_advsimd.c @@ -22,26 +22,35 @@ static const struct data { + uint32x4_t sign_mask, pi_over_2; + float32x4_t neg_one; +#if WANT_SIMD_EXCEPT float32x4_t poly[8]; - float32x4_t pi_over_2; +} data = { + .poly = { V4 (-0x1.5554dcp-2), V4 (0x1.9978ecp-3), V4 (-0x1.230a94p-3), + V4 (0x1.b4debp-4), V4 (-0x1.3550dap-4), V4 (0x1.61eebp-5), + V4 (-0x1.0c17d4p-6), V4 (0x1.7ea694p-9) }, +#else + float32x4_t c0, c2, c4, c6; + float c1, c3, c5, c7; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. Generated using fpminimax between FLT_MIN and 1. 
*/ - .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), - V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), - V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, - .pi_over_2 = V4 (0x1.921fb6p+0f), + .c0 = V4 (-0x1.5554dcp-2), .c1 = 0x1.9978ecp-3, + .c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4, + .c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5, + .c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9, +#endif + .pi_over_2 = V4 (0x3fc90fdb), + .neg_one = V4 (-1.0f), + .sign_mask = V4 (0x80000000), }; -#define SignMask v_u32 (0x80000000) - -#define P(i) d->poly[i] - +#if WANT_SIMD_EXCEPT #define TinyBound 0x30800000 /* asuint(0x1p-30). */ #define BigBound 0x4e800000 /* asuint(0x1p30). */ -#if WANT_SIMD_EXCEPT static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t x, float32x4_t y, uint32x4_t special) { @@ -51,19 +60,20 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) /* Fast implementation of vector atanf based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] - using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: - _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ + using z=-1/x and shift = pi/2. Maximum observed error is 2.02 ulps: + _ZGVnN4v_atanf (0x1.03d4cep+0) got 0x1.95ed3ap-1 + want 0x1.95ed36p-1. */ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - /* Small cases, infs and nans are supported by our approximation technique, - but do not set fenv flags correctly. Only trigger special case if we need - fenv. */ uint32x4_t ix = vreinterpretq_u32_f32 (x); - uint32x4_t sign = vandq_u32 (ix, SignMask); + uint32x4_t sign = vandq_u32 (ix, d->sign_mask); #if WANT_SIMD_EXCEPT + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. 
*/ uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000)); uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)), v_u32 (BigBound - TinyBound)); @@ -71,41 +81,52 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) if (__glibc_unlikely (v_any_u32 (special))) return special_case (x, x, v_u32 (-1)); #endif - /* Argument reduction: - y := arctan(x) for x < 1 - y := pi/2 + arctan(-1/x) for x > 1 - Hence, use z=-1/a if x>=1, otherwise z=a. */ - uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0)); - /* Avoid dependency in abs(x) in division (and comparison). */ - float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x); + y := arctan(x) for |x| < 1 + y := arctan(-1/x) + pi/2 for x > +1 + y := arctan(-1/x) - pi/2 for x < -1 + Hence, use z=-1/a if x>=|-1|, otherwise z=a. */ + uint32x4_t red = vcagtq_f32 (x, d->neg_one); + + float32x4_t z = vbslq_f32 (red, vdivq_f32 (d->neg_one, x), x); + + /* Shift is calculated as +-pi/2 or 0, depending on the argument case. */ float32x4_t shift = vreinterpretq_f32_u32 ( - vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2))); - /* Use absolute value only when needed (odd powers of z). */ - float32x4_t az = vbslq_f32 ( - SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z); + vandq_u32 (red, veorq_u32 (d->pi_over_2, sign))); + + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z3 = vmulq_f32 (z, z2); + float32x4_t z4 = vmulq_f32 (z2, z2); +#if WANT_SIMD_EXCEPT /* Calculate the polynomial approximation. Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, a standard implementation using z8 creates spurious underflow in the very last fma (when z^8 is small enough). - Therefore, we split the last fma into a mul and an fma. - Horner and single-level Estrin have higher errors that exceed - threshold. */ - float32x4_t z2 = vmulq_f32 (z, z); - float32x4_t z4 = vmulq_f32 (z2, z2); - + Therefore, we split the last fma into a mul and an fma. 
*/ float32x4_t y = vfmaq_f32 ( v_pairwise_poly_3_f32 (z2, z4, d->poly), z4, vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4))); - /* y = shift + z * P(z^2). */ - y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift); +#else + float32x4_t z8 = vmulq_f32 (z4, z4); + + /* Uses an Estrin scheme for polynomial approximation. */ + float32x4_t odd_coeffs = vld1q_f32 (&d->c1); + + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, odd_coeffs, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, odd_coeffs, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, odd_coeffs, 2); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, odd_coeffs, 3); - /* y = atan(x) if x>0, -atan(-x) otherwise. */ - y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign)); + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); + float32x4_t p47 = vfmaq_f32 (p45, z4, p67); - return y; + float32x4_t y = vfmaq_f32 (p03, z8, p47); +#endif + + /* y = shift + z * P(z^2). */ + return vfmaq_f32 (vaddq_f32 (shift, z), z3, y); } libmvec_hidden_def (V_NAME_F1 (atan)) HALF_WIDTH_ALIAS_F1 (atan) diff --git a/sysdeps/aarch64/fpu/atanf_sve.c b/sysdeps/aarch64/fpu/atanf_sve.c index 7b54094..0f5a054 100644 --- a/sysdeps/aarch64/fpu/atanf_sve.c +++ b/sysdeps/aarch64/fpu/atanf_sve.c @@ -18,18 +18,26 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f32.h" static const struct data { - float32_t poly[8]; - float32_t pi_over_2; + float32_t c1, c3, c5, c7; + float32_t c0, c2, c4, c6; + float32_t shift_val, neg_one; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. 
*/ - .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, - -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, - .pi_over_2 = 0x1.921fb6p+0f, + .c0 = -0x1.5554dcp-2, + .c1 = 0x1.9978ecp-3, + .c2 = -0x1.230a94p-3, + .c3 = 0x1.b4debp-4, + .c4 = -0x1.3550dap-4, + .c5 = 0x1.61eebp-5, + .c6 = -0x1.0c17d4p-6, + .c7 = 0x1.7ea694p-9, + /* pi/2, used as a shift value after reduction. */ + .shift_val = 0x1.921fb54442d18p+0, + .neg_one = -1.0f, }; #define SignMask (0x80000000) @@ -37,43 +45,49 @@ static const struct data /* Fast implementation of SVE atanf based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=-1/x and shift = pi/2. - Largest observed error is 2.9 ULP, close to +/-1.0: - _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1 - want -0x1.967fp-1. */ + Largest observed error is 2.12 ULP: + _ZGVsMxv_atanf (0x1.03d4cep+0) got 0x1.95ed3ap-1 + want 0x1.95ed36p-1. */ svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b32 (); /* No need to trigger special case. Small cases, infs and nans are supported by our approximation technique. */ svuint32_t ix = svreinterpret_u32 (x); - svuint32_t sign = svand_x (pg, ix, SignMask); + svuint32_t sign = svand_x (ptrue, ix, SignMask); /* Argument reduction: y := arctan(x) for x < 1 - y := pi/2 + arctan(-1/x) for x > 1 - Hence, use z=-1/a if x>=1, otherwise z=a. */ - svbool_t red = svacgt (pg, x, 1.0f); - /* Avoid dependency in abs(x) in division (and comparison). */ - svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (1.0f), x), x); - /* Use absolute value only when needed (odd powers of z). */ - svfloat32_t az = svabs_x (pg, z); - az = svneg_m (az, red, az); - - /* Use split Estrin scheme for P(z^2) with deg(P)=7. 
*/ - svfloat32_t z2 = svmul_x (pg, z, z); - svfloat32_t z4 = svmul_x (pg, z2, z2); - svfloat32_t z8 = svmul_x (pg, z4, z4); - - svfloat32_t y = sv_estrin_7_f32_x (pg, z2, z4, z8, d->poly); - - /* y = shift + z + z^3 * P(z^2). */ - svfloat32_t z3 = svmul_x (pg, z2, az); - y = svmla_x (pg, az, z3, y); - - /* Apply shift as indicated by 'red' predicate. */ - y = svadd_m (red, y, sv_f32 (d->pi_over_2)); - - /* y = atan(x) if x>0, -atan(-x) otherwise. */ - return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); + y := arctan(-1/x) + pi/2 for x > +1 + y := arctan(-1/x) - pi/2 for x < -1 + Hence, use z=-1/a if |x|>=|-1|, otherwise z=a. */ + svbool_t red = svacgt (pg, x, d->neg_one); + svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (d->neg_one), x), x); + + /* Reinserts the sign bit of the argument to handle the case of x < -1. */ + svfloat32_t shift = svreinterpret_f32 ( + sveor_x (red, svreinterpret_u32 (sv_f32 (d->shift_val)), sign)); + + svfloat32_t z2 = svmul_x (ptrue, z, z); + svfloat32_t z3 = svmul_x (ptrue, z2, z); + svfloat32_t z4 = svmul_x (ptrue, z2, z2); + svfloat32_t z8 = svmul_x (ptrue, z4, z4); + + svfloat32_t odd_coeffs = svld1rq (ptrue, &d->c1); + + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); + svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), z2, odd_coeffs, 3); + + svfloat32_t p03 = svmla_x (pg, p01, z4, p23); + svfloat32_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat32_t y = svmla_x (pg, p03, z8, p47); + + /* shift + z + z^3 * P(z^2). 
*/ + shift = svadd_m (red, z, shift); + return svmla_x (pg, shift, z3, y); } diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c index 04f7e57..5251f3c 100644 --- a/sysdeps/aarch64/fpu/log1p_sve.c +++ b/sysdeps/aarch64/fpu/log1p_sve.c @@ -22,19 +22,33 @@ static const struct data { - double poly[19]; + float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16; + float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; double ln2_hi, ln2_lo; uint64_t hfrt2_top, onemhfrt2_top, inf, mone; } data = { /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1]. Order 20 - polynomial, however first 2 coefficients are 0 and 1 so are not stored. */ - .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, - 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, - -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, - 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, - -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, - 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, - -0x1.cfa7385bdb37ep-6, }, + polynomial, however first 2 coefficients are 0 and 1 so are not + stored. */ + .c0 = -0x1.ffffffffffffbp-2, + .c1 = 0x1.55555555551a9p-2, + .c2 = -0x1.00000000008e3p-2, + .c3 = 0x1.9999999a32797p-3, + .c4 = -0x1.555555552fecfp-3, + .c5 = 0x1.249248e071e5ap-3, + .c6 = -0x1.ffffff8bf8482p-4, + .c7 = 0x1.c71c8f07da57ap-4, + .c8 = -0x1.9999ca4ccb617p-4, + .c9 = 0x1.7459ad2e1dfa3p-4, + .c10 = -0x1.554d2680a3ff2p-4, + .c11 = 0x1.3b4c54d487455p-4, + .c12 = -0x1.2548a9ffe80e6p-4, + .c13 = 0x1.0f389a24b2e07p-4, + .c14 = -0x1.eee4db15db335p-5, + .c15 = 0x1.e95b494d4a5ddp-5, + .c16 = -0x1.15fdf07cb7c73p-4, + .c17 = 0x1.0310b70800fcfp-4, + .c18 = -0x1.cfa7385bdb37ep-6, .ln2_hi = 0x1.62e42fefa3800p-1, .ln2_lo = 0x1.ef35793c76730p-45, /* top32(asuint64(sqrt(2)/2)) << 32. 
*/ @@ -49,7 +63,7 @@ static const struct data #define BottomMask 0xffffffff static svfloat64_t NOINLINE -special_case (svbool_t special, svfloat64_t x, svfloat64_t y) +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) { return sv_call_f64 (log1p, x, y, special); } @@ -91,8 +105,9 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ svuint64_t utop = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hfrt2_top); - svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask)); - svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1); + svuint64_t u_red + = svorr_x (pg, utop, svand_x (svptrue_b64 (), mi, BottomMask)); + svfloat64_t f = svsub_x (svptrue_b64 (), svreinterpret_f64 (u_red), 1); /* Correction term c/m. */ svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, 1)), m); @@ -103,16 +118,47 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) Hence approximation has the form f + f^2 * P(f) where P(x) = C0 + C1*x + C2x^2 + ... Assembling this all correctly is dealt with at the final step. */ - svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2), - f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8); - svfloat64_t p = sv_estrin_18_f64_x (pg, f, f2, f4, f8, f16, d->poly); + svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f), + f4 = svmul_x (svptrue_b64 (), f2, f2), + f8 = svmul_x (svptrue_b64 (), f4, f4), + f16 = svmul_x (svptrue_b64 (), f8, f8); + + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); + svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); + svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); + svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17); + + /* Order-18 Estrin scheme. 
*/ + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1); + + svfloat64_t p03 = svmla_x (pg, p01, f2, p23); + svfloat64_t p47 = svmla_x (pg, p45, f2, p67); + svfloat64_t p07 = svmla_x (pg, p03, f4, p47); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1); + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0); + svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1); + + svfloat64_t p811 = svmla_x (pg, p89, f2, p1011); + svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415); + svfloat64_t p815 = svmla_x (pg, p811, f4, p1215); + + svfloat64_t p015 = svmla_x (pg, p07, f8, p815); + svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0); + svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1); + svfloat64_t p = svmla_x (pg, p015, f16, p1618); svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo); svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi); - svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (special, x, y); - - return y; + return special_case ( + x, svmla_x (svptrue_b64 (), svadd_x (svptrue_b64 (), ylo, yhi), f2, p), + special); + return svmla_x (svptrue_b64 (), svadd_x (svptrue_b64 (), ylo, yhi), f2, p); } diff --git a/sysdeps/aarch64/preconfigure b/sysdeps/aarch64/preconfigure index 19657b6..e1b772c 100644 --- a/sysdeps/aarch64/preconfigure +++ b/sysdeps/aarch64/preconfigure @@ -3,5 +3,6 @@ aarch64*) base_machine=aarch64 machine=aarch64 mtls_descriptor=desc + mtls_traditional=trad ;; esac diff --git a/sysdeps/arm/find_exidx.c b/sysdeps/arm/find_exidx.c index d647865..4257c26 100644 --- a/sysdeps/arm/find_exidx.c +++ b/sysdeps/arm/find_exidx.c @@ -15,65 +15,17 @@ License along with the GNU C Library. 
If not, see <https://www.gnu.org/licenses/>. */ +#include <ldsodefs.h> #include <link.h> -#include <unwind.h> - -struct unw_eh_callback_data -{ - _Unwind_Ptr pc; - _Unwind_Ptr exidx_start; - int exidx_len; -}; - - -/* Callback to determines if the PC lies within an object, and remember the - location of the exception index table if it does. */ - -static int -find_exidx_callback (struct dl_phdr_info * info, size_t size, void * ptr) -{ - struct unw_eh_callback_data * data; - const ElfW(Phdr) *phdr; - int i; - int match; - _Unwind_Ptr load_base; - - data = (struct unw_eh_callback_data *) ptr; - load_base = info->dlpi_addr; - phdr = info->dlpi_phdr; - - match = 0; - for (i = info->dlpi_phnum; i > 0; i--, phdr++) - { - if (phdr->p_type == PT_LOAD) - { - _Unwind_Ptr vaddr = phdr->p_vaddr + load_base; - if (data->pc >= vaddr && data->pc < vaddr + phdr->p_memsz) - match = 1; - } - else if (phdr->p_type == PT_ARM_EXIDX) - { - data->exidx_start = (_Unwind_Ptr) (phdr->p_vaddr + load_base); - data->exidx_len = phdr->p_memsz; - } - } - - return match; -} - /* Find the exception index table containing PC. */ _Unwind_Ptr __gnu_Unwind_Find_exidx (_Unwind_Ptr pc, int * pcount) { - struct unw_eh_callback_data data; - - data.pc = pc; - data.exidx_start = 0; - if (__dl_iterate_phdr (find_exidx_callback, &data) <= 0) + struct dl_find_object data; + if (GLRO(dl_find_object) ((void *) pc, &data) < 0) return 0; - - *pcount = data.exidx_len / 8; - return data.exidx_start; + *pcount = data.dlfo_eh_count; + return (_Unwind_Ptr) data.dlfo_eh_frame; } diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h index 259ce2e..b4c6e6d 100644 --- a/sysdeps/generic/ldsodefs.h +++ b/sysdeps/generic/ldsodefs.h @@ -351,7 +351,7 @@ struct rtld_global void (*free) (void *); } _ns_unique_sym_table; /* Keep track of changes to each namespace' list. 
*/ - struct r_debug_extended _ns_debug; + struct r_debug_extended _ns_debug_unused; } _dl_ns[DL_NNS]; /* One higher than index of last used namespace. */ EXTERN size_t _dl_nns; @@ -1014,6 +1014,13 @@ extern void _dl_relocate_object (struct link_map *map, int reloc_mode, int consider_profiling) attribute_hidden; +/* Perform relocation, but do not apply RELRO. Does not check + L->relocated. Otherwise the same as _dl_relocate_object. */ +void _dl_relocate_object_no_relro (struct link_map *map, + struct r_scope_elem *scope[], + int reloc_mode, int consider_profiling) + attribute_hidden; + /* Protect PT_GNU_RELRO area. */ extern void _dl_protect_relro (struct link_map *map) attribute_hidden; @@ -1062,15 +1069,29 @@ extern void _dl_debug_state (void); rtld_hidden_proto (_dl_debug_state) /* Initialize `struct r_debug_extended' for the namespace NS. LDBASE - is the run-time load address of the dynamic linker, to be put in the - `r_ldbase' member. Return the address of the structure. */ + is the run-time load address of the dynamic linker, to be put in + the `r_ldbase' member. + + Return the address of the r_debug structure for the namespace. + This is not merely a convenience or optimization, but it is + necessary for the LIBC_PROBE Systemtap/debugger probes to work + reliably: direct variable access can create probes that tools + cannot consume. */ extern struct r_debug *_dl_debug_initialize (ElfW(Addr) ldbase, Lmid_t ns) attribute_hidden; +/* This is called after relocation processing to handle a potential + copy relocation for _r_debug. */ +void _dl_debug_post_relocate (struct link_map *main_map) attribute_hidden; + /* Update the `r_map' member and return the address of `struct r_debug' of the namespace NS. */ extern struct r_debug *_dl_debug_update (Lmid_t ns) attribute_hidden; +/* Update R->r_state to STATE and notify the debugger by calling + _dl_debug_state. 
*/ +void _dl_debug_change_state (struct r_debug *r, int state) attribute_hidden; + /* Initialize the basic data structure for the search paths. SOURCE is either "LD_LIBRARY_PATH" or "--library-path". GLIBC_HWCAPS_PREPEND adds additional glibc-hwcaps subdirectories to diff --git a/sysdeps/generic/libc-tsd.h b/sysdeps/generic/libc-tsd.h deleted file mode 100644 index ac0e99e..0000000 --- a/sysdeps/generic/libc-tsd.h +++ /dev/null @@ -1,60 +0,0 @@ -/* libc-internal interface for thread-specific data. Stub or TLS version. - Copyright (C) 1998-2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#ifndef _GENERIC_LIBC_TSD_H -#define _GENERIC_LIBC_TSD_H 1 - -/* This file defines the following macros for accessing a small fixed - set of thread-specific `void *' data used only internally by libc. - - __libc_tsd_define(CLASS, TYPE, KEY) -- Define or declare a datum with TYPE - for KEY. CLASS can be `static' for - keys used in only one source file, - empty for global definitions, or - `extern' for global declarations. - __libc_tsd_address(TYPE, KEY) -- Return the `TYPE *' pointing to - the current thread's datum for KEY. - __libc_tsd_get(TYPE, KEY) -- Return the `TYPE' datum for KEY. - __libc_tsd_set(TYPE, KEY, VALUE) -- Set the datum for KEY to VALUE. 
- - The set of available KEY's will usually be provided as an enum, - and contains (at least): - _LIBC_TSD_KEY_MALLOC - _LIBC_TSD_KEY_DL_ERROR - _LIBC_TSD_KEY_RPC_VARS - All uses must be the literal _LIBC_TSD_* name in the __libc_tsd_* macros. - Some implementations may not provide any enum at all and instead - using string pasting in the macros. */ - -#include <tls.h> - -/* When full support for __thread variables is available, this interface is - just a trivial wrapper for it. Without TLS, this is the generic/stub - implementation for wholly single-threaded systems. - - We don't define an enum for the possible key values, because the KEYs - translate directly into variables by macro magic. */ - -#define __libc_tsd_define(CLASS, TYPE, KEY) \ - CLASS __thread TYPE __libc_tsd_##KEY attribute_tls_model_ie; - -#define __libc_tsd_address(TYPE, KEY) (&__libc_tsd_##KEY) -#define __libc_tsd_get(TYPE, KEY) (__libc_tsd_##KEY) -#define __libc_tsd_set(TYPE, KEY, VALUE) (__libc_tsd_##KEY = (VALUE)) - -#endif /* libc-tsd.h */ diff --git a/sysdeps/i386/Makefile b/sysdeps/i386/Makefile index a2e8c0b..c0c017b 100644 --- a/sysdeps/i386/Makefile +++ b/sysdeps/i386/Makefile @@ -30,7 +30,9 @@ stack-align-test-flags += -malign-double endif ifeq ($(subdir),elf) -sysdep-dl-routines += tlsdesc dl-tlsdesc +sysdep-dl-routines += \ + dl-tls-get-addr \ +# sysdep-dl-routines tests += tst-audit3 modules-names += tst-auditmod3a tst-auditmod3b @@ -58,6 +60,15 @@ $(objpfx)tst-ld-sse-use.out: ../sysdeps/i386/tst-ld-sse-use.sh $(objpfx)ld.so @echo "Checking ld.so for SSE register use. This will take a few seconds..." 
$(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \ $(evaluate-test) + +tests-special += $(objpfx)check-gnu-tls.out + +$(objpfx)check-gnu-tls.out: $(common-objpfx)libc.so + LC_ALL=C $(READELF) -V -W $< \ + | sed -ne '/.gnu.version_d/, /.gnu.version_r/ p' \ + | grep GLIBC_ABI_GNU_TLS > $@; \ + $(evaluate-test) +generated += check-gnu-tls.out else CFLAGS-.os += $(if $(filter rtld-%.os,$(@F)), $(rtld-CFLAGS)) endif diff --git a/sysdeps/i386/Versions b/sysdeps/i386/Versions index 36e23b4..9c84c8e 100644 --- a/sysdeps/i386/Versions +++ b/sysdeps/i386/Versions @@ -28,6 +28,11 @@ libc { GLIBC_2.13 { __fentry__; } + GLIBC_ABI_GNU_TLS { + # This symbol is used only for empty version map and will be removed + # by scripts/versions.awk. + __placeholder_only_for_empty_version_map; + } } libm { GLIBC_2.1 { diff --git a/sysdeps/i386/dl-tls-get-addr.c b/sysdeps/i386/dl-tls-get-addr.c new file mode 100644 index 0000000..c97e5c5 --- /dev/null +++ b/sysdeps/i386/dl-tls-get-addr.c @@ -0,0 +1,68 @@ +/* Ifunc selector for ___tls_get_addr. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#ifdef SHARED +# define ___tls_get_addr __redirect____tls_get_addr +# include <dl-tls.h> +# undef ___tls_get_addr +# undef __tls_get_addr + +# define SYMBOL_NAME ___tls_get_addr +# include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (fnsave) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (fxsave) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (xsave) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (xsavec) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (cpu_features->xsave_state_size != 0) + { + if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)) + return OPTIMIZE (xsavec); + else + return OPTIMIZE (xsave); + } + else if (CPU_FEATURE_USABLE_P (cpu_features, FXSR)) + return OPTIMIZE (fxsave); + return OPTIMIZE (fnsave); +} + +libc_ifunc_redirected (__redirect____tls_get_addr, ___tls_get_addr, + IFUNC_SELECTOR ()); + +/* The special thing about the x86 TLS ABI is that we have two + variants of the __tls_get_addr function with different calling + conventions. The GNU version, which we are mostly concerned here, + takes the parameter in a register. The name is changed by adding + an additional underscore at the beginning. The Sun version uses + the normal calling convention. */ + +rtld_hidden_proto (___tls_get_addr) +rtld_hidden_def (___tls_get_addr) + +void * +__tls_get_addr (tls_index *ti) +{ + return ___tls_get_addr (ti); +} +#endif diff --git a/sysdeps/i386/dl-tls.h b/sysdeps/i386/dl-tls.h index f172867..2380ec1 100644 --- a/sysdeps/i386/dl-tls.h +++ b/sysdeps/i386/dl-tls.h @@ -29,33 +29,13 @@ typedef struct dl_tls_index /* This is the prototype for the GNU version. 
*/ extern void *___tls_get_addr (tls_index *ti) __attribute__ ((__regparm__ (1))); -extern void *___tls_get_addr_internal (tls_index *ti) - __attribute__ ((__regparm__ (1))) attribute_hidden; - # if IS_IN (rtld) -/* The special thing about the x86 TLS ABI is that we have two - variants of the __tls_get_addr function with different calling - conventions. The GNU version, which we are mostly concerned here, - takes the parameter in a register. The name is changed by adding - an additional underscore at the beginning. The Sun version uses - the normal calling convention. */ -void * -__tls_get_addr (tls_index *ti) -{ - return ___tls_get_addr_internal (ti); -} - - /* Prepare using the definition of __tls_get_addr in the generic version of this file. */ -# define __tls_get_addr __attribute__ ((__regparm__ (1))) ___tls_get_addr -strong_alias (___tls_get_addr, ___tls_get_addr_internal) -rtld_hidden_proto (___tls_get_addr) -rtld_hidden_def (___tls_get_addr) -#else - +# define __tls_get_addr \ + __attribute__ ((__regparm__ (1))) ___tls_get_addr_internal +# else /* Users should get the better interface. */ -# define __tls_get_addr ___tls_get_addr - +# define __tls_get_addr ___tls_get_addr # endif #endif diff --git a/sysdeps/i386/dl-tlsdesc-dynamic.h b/sysdeps/i386/dl-tlsdesc-dynamic.h index 3627028..8a59524 100644 --- a/sysdeps/i386/dl-tlsdesc-dynamic.h +++ b/sysdeps/i386/dl-tlsdesc-dynamic.h @@ -16,34 +16,6 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#undef REGISTER_SAVE_AREA - -#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0 -# error STATE_SAVE_ALIGNMENT must be multiple of 16 -#endif - -#if DL_RUNTIME_RESOLVE_REALIGN_STACK -# ifdef USE_FNSAVE -# error USE_FNSAVE shouldn't be defined -# endif -# ifdef USE_FXSAVE -/* Use fxsave to save all registers. */ -# define REGISTER_SAVE_AREA 512 -# endif -#else -# ifdef USE_FNSAVE -/* Use fnsave to save x87 FPU stack registers. 
*/ -# define REGISTER_SAVE_AREA 108 -# else -# ifndef USE_FXSAVE -# error USE_FXSAVE must be defined -# endif -/* Use fxsave to save all registers. Add 12 bytes to align the stack - to 16 bytes. */ -# define REGISTER_SAVE_AREA (512 + 12) -# endif -#endif - .hidden _dl_tlsdesc_dynamic .global _dl_tlsdesc_dynamic .type _dl_tlsdesc_dynamic,@function @@ -104,85 +76,7 @@ _dl_tlsdesc_dynamic: ret .p2align 4,,7 2: - cfi_adjust_cfa_offset (32) -#if DL_RUNTIME_RESOLVE_REALIGN_STACK - movl %ebx, -28(%esp) - movl %esp, %ebx - cfi_def_cfa_register(%ebx) - and $-STATE_SAVE_ALIGNMENT, %esp -#endif -#ifdef REGISTER_SAVE_AREA - subl $REGISTER_SAVE_AREA, %esp -# if !DL_RUNTIME_RESOLVE_REALIGN_STACK - cfi_adjust_cfa_offset(REGISTER_SAVE_AREA) -# endif -#else -# if !DL_RUNTIME_RESOLVE_REALIGN_STACK -# error DL_RUNTIME_RESOLVE_REALIGN_STACK must be true -# endif - /* Allocate stack space of the required size to save the state. */ - LOAD_PIC_REG (cx) - subl RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp -#endif -#ifdef USE_FNSAVE - fnsave (%esp) -#elif defined USE_FXSAVE - fxsave (%esp) -#else - /* Save the argument for ___tls_get_addr in EAX. */ - movl %eax, %ecx - movl $TLSDESC_CALL_STATE_SAVE_MASK, %eax - xorl %edx, %edx - /* Clear the XSAVE Header. */ -# ifdef USE_XSAVE - movl %edx, (512)(%esp) - movl %edx, (512 + 4 * 1)(%esp) - movl %edx, (512 + 4 * 2)(%esp) - movl %edx, (512 + 4 * 3)(%esp) -# endif - movl %edx, (512 + 4 * 4)(%esp) - movl %edx, (512 + 4 * 5)(%esp) - movl %edx, (512 + 4 * 6)(%esp) - movl %edx, (512 + 4 * 7)(%esp) - movl %edx, (512 + 4 * 8)(%esp) - movl %edx, (512 + 4 * 9)(%esp) - movl %edx, (512 + 4 * 10)(%esp) - movl %edx, (512 + 4 * 11)(%esp) - movl %edx, (512 + 4 * 12)(%esp) - movl %edx, (512 + 4 * 13)(%esp) - movl %edx, (512 + 4 * 14)(%esp) - movl %edx, (512 + 4 * 15)(%esp) -# ifdef USE_XSAVE - xsave (%esp) -# else - xsavec (%esp) -# endif - /* Restore the argument for ___tls_get_addr in EAX. 
*/ - movl %ecx, %eax -#endif - call HIDDEN_JUMPTARGET (___tls_get_addr) - /* Get register content back. */ -#ifdef USE_FNSAVE - frstor (%esp) -#elif defined USE_FXSAVE - fxrstor (%esp) -#else - /* Save and retore ___tls_get_addr return value stored in EAX. */ - movl %eax, %ecx - movl $TLSDESC_CALL_STATE_SAVE_MASK, %eax - xorl %edx, %edx - xrstor (%esp) - movl %ecx, %eax -#endif -#if DL_RUNTIME_RESOLVE_REALIGN_STACK - mov %ebx, %esp - cfi_def_cfa_register(%esp) - movl -28(%esp), %ebx - cfi_restore(%ebx) -#else - addl $REGISTER_SAVE_AREA, %esp - cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA) -#endif +#include "tls-get-addr-wrapper.h" jmp 1b cfi_endproc .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic diff --git a/sysdeps/i386/dl-tlsdesc.S b/sysdeps/i386/dl-tlsdesc.S index f002fee..dec7049 100644 --- a/sysdeps/i386/dl-tlsdesc.S +++ b/sysdeps/i386/dl-tlsdesc.S @@ -22,23 +22,6 @@ #include <features-offsets.h> #include "tlsdesc.h" -#ifndef DL_STACK_ALIGNMENT -/* Due to GCC bug: - - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066 - - __tls_get_addr may be called with 4-byte stack alignment. Although - this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume - that stack will be always aligned at 16 bytes. */ -# define DL_STACK_ALIGNMENT 4 -#endif - -/* True if _dl_tlsdesc_dynamic should align stack for STATE_SAVE or align - stack to MINIMUM_ALIGNMENT bytes before calling ___tls_get_addr. */ -#define DL_RUNTIME_RESOLVE_REALIGN_STACK \ - (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \ - || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT) - .text /* This function is used to compute the TP offset for symbols in diff --git a/sysdeps/i386/tls-get-addr-wrapper.h b/sysdeps/i386/tls-get-addr-wrapper.h new file mode 100644 index 0000000..0708e5a --- /dev/null +++ b/sysdeps/i386/tls-get-addr-wrapper.h @@ -0,0 +1,127 @@ +/* Wrapper of i386 ___tls_get_addr to save and restore vector registers. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#undef REGISTER_SAVE_AREA + +#if !defined USE_FNSAVE && (STATE_SAVE_ALIGNMENT % 16) != 0 +# error STATE_SAVE_ALIGNMENT must be multiple of 16 +#endif + +#if DL_RUNTIME_RESOLVE_REALIGN_STACK +# ifdef USE_FNSAVE +# error USE_FNSAVE shouldn't be defined +# endif +# ifdef USE_FXSAVE +/* Use fxsave to save all registers. */ +# define REGISTER_SAVE_AREA 512 +# endif +#else +# ifdef USE_FNSAVE +/* Use fnsave to save x87 FPU stack registers. */ +# define REGISTER_SAVE_AREA 108 +# else +# ifndef USE_FXSAVE +# error USE_FXSAVE must be defined +# endif +/* Use fxsave to save all registers. Add 12 bytes to align the stack + to 16 bytes. */ +# define REGISTER_SAVE_AREA (512 + 12) +# endif +#endif + +#if DL_RUNTIME_RESOLVE_REALIGN_STACK + movl %ebx, 28(%esp) + movl %esp, %ebx + cfi_def_cfa_register(%ebx) + and $-STATE_SAVE_ALIGNMENT, %esp +#endif +#ifdef REGISTER_SAVE_AREA + subl $REGISTER_SAVE_AREA, %esp +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK + cfi_adjust_cfa_offset(REGISTER_SAVE_AREA) +# endif +#else +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK +# error DL_RUNTIME_RESOLVE_REALIGN_STACK must be true +# endif + /* Allocate stack space of the required size to save the state. 
*/ + LOAD_PIC_REG (cx) + subl RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET \ + +XSAVE_STATE_SIZE_OFFSET+_rtld_local_ro@GOTOFF(%ecx), %esp +#endif +#ifdef USE_FNSAVE + fnsave (%esp) +#elif defined USE_FXSAVE + fxsave (%esp) +#else + /* Save the argument for ___tls_get_addr in EAX. */ + movl %eax, %ecx + movl $TLSDESC_CALL_STATE_SAVE_MASK, %eax + xorl %edx, %edx + /* Clear the XSAVE Header. */ +# ifdef USE_XSAVE + movl %edx, (512)(%esp) + movl %edx, (512 + 4 * 1)(%esp) + movl %edx, (512 + 4 * 2)(%esp) + movl %edx, (512 + 4 * 3)(%esp) +# endif + movl %edx, (512 + 4 * 4)(%esp) + movl %edx, (512 + 4 * 5)(%esp) + movl %edx, (512 + 4 * 6)(%esp) + movl %edx, (512 + 4 * 7)(%esp) + movl %edx, (512 + 4 * 8)(%esp) + movl %edx, (512 + 4 * 9)(%esp) + movl %edx, (512 + 4 * 10)(%esp) + movl %edx, (512 + 4 * 11)(%esp) + movl %edx, (512 + 4 * 12)(%esp) + movl %edx, (512 + 4 * 13)(%esp) + movl %edx, (512 + 4 * 14)(%esp) + movl %edx, (512 + 4 * 15)(%esp) +# ifdef USE_XSAVE + xsave (%esp) +# else + xsavec (%esp) +# endif + /* Restore the argument for ___tls_get_addr in EAX. */ + movl %ecx, %eax +#endif + call ___tls_get_addr_internal + /* Get register content back. */ +#ifdef USE_FNSAVE + frstor (%esp) +#elif defined USE_FXSAVE + fxrstor (%esp) +#else + /* Save and restore ___tls_get_addr return value stored in EAX. */ + movl %eax, %ecx + movl $TLSDESC_CALL_STATE_SAVE_MASK, %eax + xorl %edx, %edx + xrstor (%esp) + movl %ecx, %eax +#endif +#if DL_RUNTIME_RESOLVE_REALIGN_STACK + mov %ebx, %esp + cfi_def_cfa_register(%esp) + movl 28(%esp), %ebx + cfi_restore(%ebx) +#else + addl $REGISTER_SAVE_AREA, %esp + cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA) +#endif + +#undef STATE_SAVE_ALIGNMENT diff --git a/sysdeps/i386/tls_get_addr.S b/sysdeps/i386/tls_get_addr.S new file mode 100644 index 0000000..7d143d8 --- /dev/null +++ b/sysdeps/i386/tls_get_addr.S @@ -0,0 +1,57 @@ +/* Thread-local storage handling in the ELF dynamic linker. i386 version. + Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <tls.h> +#include <cpu-features-offsets.h> +#include <features-offsets.h> + + .text +#ifdef SHARED +# define USE_FNSAVE +# define MINIMUM_ALIGNMENT 4 +# define STATE_SAVE_ALIGNMENT 4 +# define ___tls_get_addr _____tls_get_addr_fnsave +# include "tls_get_addr.h" +# undef ___tls_get_addr +# undef MINIMUM_ALIGNMENT +# undef USE_FNSAVE + +# define MINIMUM_ALIGNMENT 16 + +# define USE_FXSAVE +# define STATE_SAVE_ALIGNMENT 16 +# define ___tls_get_addr _____tls_get_addr_fxsave +# include "tls_get_addr.h" +# undef ___tls_get_addr +# undef USE_FXSAVE + +# define USE_XSAVE +# define STATE_SAVE_ALIGNMENT 64 +# define ___tls_get_addr _____tls_get_addr_xsave +# include "tls_get_addr.h" +# undef ___tls_get_addr +# undef USE_XSAVE + +# define USE_XSAVEC +# define STATE_SAVE_ALIGNMENT 64 +# define ___tls_get_addr _____tls_get_addr_xsavec +# include "tls_get_addr.h" +# undef ___tls_get_addr +# undef USE_XSAVEC +#endif /* SHARED */ diff --git a/sysdeps/i386/tls_get_addr.h b/sysdeps/i386/tls_get_addr.h new file mode 100644 index 0000000..1825798 --- /dev/null +++ b/sysdeps/i386/tls_get_addr.h @@ -0,0 +1,42 @@ +/* Thread-local storage handling in the ELF dynamic linker. i386 version. + Copyright (C) 2025 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + .hidden ___tls_get_addr + .global ___tls_get_addr + .type ___tls_get_addr,@function + + /* This function is a wrapper of ___tls_get_addr_internal to + preserve caller-saved vector registers. */ + + cfi_startproc + .align 16 +___tls_get_addr: + /* Like all TLS resolvers, preserve call-clobbered registers. + We need two scratch regs anyway. 
*/ + subl $32, %esp + cfi_adjust_cfa_offset (32) + movl %ecx, 20(%esp) + movl %edx, 24(%esp) +#include "tls-get-addr-wrapper.h" + movl 20(%esp), %ecx + movl 24(%esp), %edx + addl $32, %esp + cfi_adjust_cfa_offset (-32) + ret + cfi_endproc + .size ___tls_get_addr, .-___tls_get_addr diff --git a/sysdeps/ieee754/dbl-64/e_atanh.c b/sysdeps/ieee754/dbl-64/e_atanh.c index 11a2a45..05ac0a1 100644 --- a/sysdeps/ieee754/dbl-64/e_atanh.c +++ b/sysdeps/ieee754/dbl-64/e_atanh.c @@ -44,6 +44,11 @@ static const double huge = 1e300; +#ifndef SECTION +# define SECTION +#endif + +SECTION double __ieee754_atanh (double x) { @@ -73,4 +78,7 @@ __ieee754_atanh (double x) return copysign (t, x); } + +#ifndef __ieee754_atanh libm_alias_finite (__ieee754_atanh, __atanh) +#endif diff --git a/sysdeps/loongarch/preconfigure b/sysdeps/loongarch/preconfigure index dfc7ecf..6b015ae 100644 --- a/sysdeps/loongarch/preconfigure +++ b/sysdeps/loongarch/preconfigure @@ -43,6 +43,7 @@ loongarch*) base_machine=loongarch + mtls_traditional=trad ;; esac diff --git a/sysdeps/loongarch/preconfigure.ac b/sysdeps/loongarch/preconfigure.ac index 67e4357..31e9579 100644 --- a/sysdeps/loongarch/preconfigure.ac +++ b/sysdeps/loongarch/preconfigure.ac @@ -41,6 +41,7 @@ loongarch*) AC_DEFINE_UNQUOTED([LOONGARCH_ABI_FRLEN], [$abi_flen]) base_machine=loongarch + mtls_traditional=trad ;; esac diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h index df54eef..bccc200 100644 --- a/sysdeps/nptl/bits/thread-shared-types.h +++ b/sysdeps/nptl/bits/thread-shared-types.h @@ -95,11 +95,12 @@ struct __pthread_cond_s { __atomic_wide_counter __wseq; __atomic_wide_counter __g1_start; - unsigned int __g_refs[2] __LOCK_ALIGNMENT; - unsigned int __g_size[2]; + unsigned int __g_size[2] __LOCK_ALIGNMENT; unsigned int __g1_orig_size; unsigned int __wrefs; unsigned int __g_signals[2]; + unsigned int __unused_initialized_1; + unsigned int __unused_initialized_2; }; typedef unsigned int __tss_t; 
diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h index 3d4f4a7..e0f2441 100644 --- a/sysdeps/nptl/pthread.h +++ b/sysdeps/nptl/pthread.h @@ -152,7 +152,7 @@ enum /* Conditional variable handling. */ -#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, {0, 0}, 0, 0, {0, 0} } } +#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0}, 0, 0 } } /* Cleanup buffers */ diff --git a/sysdeps/powerpc/Makefile b/sysdeps/powerpc/Makefile index 5e6cb07..5cdb64f 100644 --- a/sysdeps/powerpc/Makefile +++ b/sysdeps/powerpc/Makefile @@ -28,6 +28,11 @@ tst-cache-ppc-static-dlopen-ENV = LD_LIBRARY_PATH=$(objpfx):$(common-objpfx):$(c $(objpfx)tst-cache-ppc-static-dlopen.out: $(objpfx)mod-cache-ppc.so $(objpfx)tst-cache-ppc: $(objpfx)mod-cache-ppc.so + +# The test checks that __tls_get_addr does not clobber caller-saved +# registers, so disable the powerpc-specific optimization to force a +# __tls_get_addr call. +LDFLAGS-tst-tls23-mod.so = -Wl,--no-tls-get-addr-optimize endif ifneq (no,$(multi-arch)) diff --git a/sysdeps/powerpc/powerpc64/le/power10/memchr.S b/sysdeps/powerpc/powerpc64/le/power10/memchr.S deleted file mode 100644 index 53e5716..0000000 --- a/sysdeps/powerpc/powerpc64/le/power10/memchr.S +++ /dev/null @@ -1,315 +0,0 @@ -/* Optimized memchr implementation for POWER10 LE. - Copyright (C) 2021-2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -# ifndef MEMCHR -# define MEMCHR __memchr -# endif -# define M_VREG_ZERO v20 -# define M_OFF_START_LOOP 256 -# define MEMCHR_SUBTRACT_VECTORS \ - vsububm v4,v4,v18; \ - vsububm v5,v5,v18; \ - vsububm v6,v6,v18; \ - vsububm v7,v7,v18; -# define M_TAIL(vreg,increment) \ - vctzlsbb r4,vreg; \ - cmpld r5,r4; \ - ble L(null); \ - addi r4,r4,increment; \ - add r3,r6,r4; \ - blr - -/* TODO: Replace macros by the actual instructions when minimum binutils becomes - >= 2.35. This is used to keep compatibility with older versions. */ -#define M_VEXTRACTBM(rt,vrb) \ - .long(((4)<<(32-6)) \ - | ((rt)<<(32-11)) \ - | ((8)<<(32-16)) \ - | ((vrb)<<(32-21)) \ - | 1602) - -#define M_LXVP(xtp,dq,ra) \ - .long(((6)<<(32-6)) \ - | ((((xtp)-32)>>1)<<(32-10)) \ - | ((1)<<(32-11)) \ - | ((ra)<<(32-16)) \ - | dq) - -#define CHECK16B(vreg,offset,addr,label) \ - lxv vreg+32,offset(addr); \ - vcmpequb. vreg,vreg,v18; \ - bne cr6,L(label); \ - cmpldi r5,16; \ - ble L(null); \ - addi r5,r5,-16; - -/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has # - of bytes already checked. */ -#define CHECK64B(offset,addr,label) \ - M_LXVP(v4+32,offset,addr); \ - M_LXVP(v6+32,offset+32,addr); \ - MEMCHR_SUBTRACT_VECTORS; \ - vminub v14,v4,v5; \ - vminub v15,v6,v7; \ - vminub v16,v14,v15; \ - vcmpequb. v0,v16,M_VREG_ZERO; \ - beq cr6,$+12; \ - li r7,offset; \ - b L(label); \ - cmpldi r5,64; \ - ble L(null); \ - addi r5,r5,-64 - -/* Implements the function - void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]). 
*/ - - .machine power9 - -ENTRY_TOCLESS (MEMCHR) - CALL_MCOUNT 3 - - cmpldi r5,0 - beq L(null) - mr r0,r5 - xori r6,r4,0xff - - mtvsrd v18+32,r4 /* matching char in v18 */ - mtvsrd v19+32,r6 /* non matching char in v19 */ - - vspltb v18,v18,7 /* replicate */ - vspltb v19,v19,7 /* replicate */ - vspltisb M_VREG_ZERO,0 - - /* Next 16B-aligned address. Prepare address for L(aligned). */ - addi r6,r3,16 - clrrdi r6,r6,4 - - /* Align data and fill bytes not loaded with non matching char. */ - lvx v0,0,r3 - lvsr v1,0,r3 - vperm v0,v19,v0,v1 - - vcmpequb. v6,v0,v18 - bne cr6,L(found) - sub r4,r6,r3 - cmpld r5,r4 - ble L(null) - sub r5,r5,r4 - - /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is - optimized for longer strings, so checking the first bytes in 16B - chunks benefits a lot small strings. */ - .p2align 5 -L(aligned): - cmpldi r5,0 - beq L(null) - - CHECK16B(v0,0,r6,tail1) - CHECK16B(v1,16,r6,tail2) - CHECK16B(v2,32,r6,tail3) - CHECK16B(v3,48,r6,tail4) - CHECK16B(v4,64,r6,tail5) - CHECK16B(v5,80,r6,tail6) - CHECK16B(v6,96,r6,tail7) - CHECK16B(v7,112,r6,tail8) - CHECK16B(v8,128,r6,tail9) - CHECK16B(v9,144,r6,tail10) - CHECK16B(v10,160,r6,tail11) - CHECK16B(v0,176,r6,tail12) - CHECK16B(v1,192,r6,tail13) - CHECK16B(v2,208,r6,tail14) - CHECK16B(v3,224,r6,tail15) - - cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to - choose how we will perform the main loop. */ - - /* Prepare address for the loop. */ - addi r4,r3,M_OFF_START_LOOP - clrrdi r4,r4,6 - sub r6,r4,r3 - sub r5,r0,r6 - addi r6,r4,128 - - /* If c == 0, use the loop without the vsububm. */ - beq cr5,L(loop) - - /* This is very similar to the block after L(loop), the difference is - that here MEMCHR_SUBTRACT_VECTORS is not empty, and we subtract - each byte loaded by the char we are looking for, this way we can keep - using vminub to merge the results and checking for nulls. 
*/ - .p2align 5 -L(memchr_loop): - CHECK64B(0,r4,pre_tail_64b) - CHECK64B(64,r4,pre_tail_64b) - addi r4,r4,256 - - CHECK64B(0,r6,tail_64b) - CHECK64B(64,r6,tail_64b) - addi r6,r6,256 - - CHECK64B(0,r4,pre_tail_64b) - CHECK64B(64,r4,pre_tail_64b) - addi r4,r4,256 - - CHECK64B(0,r6,tail_64b) - CHECK64B(64,r6,tail_64b) - addi r6,r6,256 - - b L(memchr_loop) - /* Switch to a more aggressive approach checking 64B each time. Use 2 - pointers 128B apart and unroll the loop once to make the pointer - updates and usages separated enough to avoid stalls waiting for - address calculation. */ - .p2align 5 -L(loop): -#undef MEMCHR_SUBTRACT_VECTORS -#define MEMCHR_SUBTRACT_VECTORS /* nothing */ - CHECK64B(0,r4,pre_tail_64b) - CHECK64B(64,r4,pre_tail_64b) - addi r4,r4,256 - - CHECK64B(0,r6,tail_64b) - CHECK64B(64,r6,tail_64b) - addi r6,r6,256 - - CHECK64B(0,r4,pre_tail_64b) - CHECK64B(64,r4,pre_tail_64b) - addi r4,r4,256 - - CHECK64B(0,r6,tail_64b) - CHECK64B(64,r6,tail_64b) - addi r6,r6,256 - - b L(loop) - - .p2align 5 -L(pre_tail_64b): - mr r6,r4 -L(tail_64b): - /* OK, we found a null byte. Let's look for it in the current 64-byte - block and mark it in its corresponding VR. lxvp vx,0(ry) puts the - low 16B bytes into vx+1, and the high into vx, so the order here is - v5, v4, v7, v6. */ - vcmpequb v1,v5,M_VREG_ZERO - vcmpequb v2,v4,M_VREG_ZERO - vcmpequb v3,v7,M_VREG_ZERO - vcmpequb v4,v6,M_VREG_ZERO - - /* Take into account the other 64B blocks we had already checked. */ - add r6,r6,r7 - /* Extract first bit of each byte. */ - M_VEXTRACTBM(r8,v1) - M_VEXTRACTBM(r9,v2) - M_VEXTRACTBM(r10,v3) - M_VEXTRACTBM(r11,v4) - - /* Shift each value into their corresponding position. */ - sldi r9,r9,16 - sldi r10,r10,32 - sldi r11,r11,48 - - /* Merge the results. */ - or r8,r8,r9 - or r9,r10,r11 - or r11,r9,r8 - - cnttzd r0,r11 /* Count trailing zeros before the match. */ - cmpld r5,r0 - ble L(null) - add r3,r6,r0 /* Compute final address. 
*/ - blr - - .p2align 5 -L(tail1): - M_TAIL(v0,0) - - .p2align 5 -L(tail2): - M_TAIL(v1,16) - - .p2align 5 -L(tail3): - M_TAIL(v2,32) - - .p2align 5 -L(tail4): - M_TAIL(v3,48) - - .p2align 5 -L(tail5): - M_TAIL(v4,64) - - .p2align 5 -L(tail6): - M_TAIL(v5,80) - - .p2align 5 -L(tail7): - M_TAIL(v6,96) - - .p2align 5 -L(tail8): - M_TAIL(v7,112) - - .p2align 5 -L(tail9): - M_TAIL(v8,128) - - .p2align 5 -L(tail10): - M_TAIL(v9,144) - - .p2align 5 -L(tail11): - M_TAIL(v10,160) - - .p2align 5 -L(tail12): - M_TAIL(v0,176) - - .p2align 5 -L(tail13): - M_TAIL(v1,192) - - .p2align 5 -L(tail14): - M_TAIL(v2,208) - - .p2align 5 -L(tail15): - M_TAIL(v3,224) - - .p2align 5 -L(found): - vctzlsbb r7,v6 - cmpld r5,r7 - ble L(null) - add r3,r3,r7 - blr - - .p2align 5 -L(null): - li r3,0 - blr - -END (MEMCHR) - -weak_alias (__memchr, memchr) -libc_hidden_builtin_def (memchr) diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S deleted file mode 100644 index f0d6732..0000000 --- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S +++ /dev/null @@ -1,233 +0,0 @@ -/* Optimized strcmp implementation for PowerPC64/POWER10. - Copyright (C) 2021-2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ -#include <sysdep.h> - -#ifndef STRCMP -# define STRCMP strcmp -#endif - -/* Implements the function - int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]). */ - -/* TODO: Change this to actual instructions when minimum binutils is upgraded - to 2.27. Macros are defined below for these newer instructions in order - to maintain compatibility. */ - -#define LXVP(xtp,dq,ra) \ - .long(((6)<<(32-6)) \ - | ((((xtp)-32)>>1)<<(32-10)) \ - | ((1)<<(32-11)) \ - | ((ra)<<(32-16)) \ - | dq) - -#define COMPARE_16(vreg1,vreg2,offset) \ - lxv vreg1+32,offset(r3); \ - lxv vreg2+32,offset(r4); \ - vcmpnezb. v7,vreg1,vreg2; \ - bne cr6,L(different); \ - -#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \ - LXVP(vreg1+32,offset,r3); \ - LXVP(vreg2+32,offset,r4); \ - vcmpnezb. v7,vreg1+1,vreg2+1; \ - bne cr6,L(label1); \ - vcmpnezb. v7,vreg1,vreg2; \ - bne cr6,L(label2); \ - -#define TAIL(vreg1,vreg2) \ - vctzlsbb r6,v7; \ - vextubrx r5,r6,vreg1; \ - vextubrx r4,r6,vreg2; \ - subf r3,r4,r5; \ - blr; \ - -#define CHECK_N_BYTES(reg1,reg2,len_reg) \ - sldi r0,len_reg,56; \ - lxvl 32+v4,reg1,r0; \ - lxvl 32+v5,reg2,r0; \ - add reg1,reg1,len_reg; \ - add reg2,reg2,len_reg; \ - vcmpnezb v7,v4,v5; \ - vctzlsbb r6,v7; \ - cmpld cr7,r6,len_reg; \ - blt cr7,L(different); \ - - /* TODO: change this to .machine power10 when the minimum required - binutils allows it. */ - - .machine power9 -ENTRY_TOCLESS (STRCMP, 4) - andi. r7,r3,4095 - andi. r8,r4,4095 - cmpldi cr0,r7,4096-16 - cmpldi cr1,r8,4096-16 - bgt cr0,L(crosses) - bgt cr1,L(crosses) - COMPARE_16(v4,v5,0) - -L(crosses): - andi. r7,r3,15 - subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */ - andi. r9,r4,15 - subfic r5,r9,16 /* r5(nalign2) = 16 - (str2 & 15). */ - cmpld cr7,r7,r5 - beq cr7,L(same_aligned) - blt cr7,L(nalign1_min) - - /* nalign2 is minimum and s2 pointer is aligned. */ - CHECK_N_BYTES(r3,r4,r5) - /* Are we on the 64B hunk which crosses a page? */ - andi. r10,r3,63 /* Determine offset into 64B hunk. */ - andi. 
r8,r3,15 /* The offset into the 16B hunk. */ - neg r7,r3 - andi. r9,r7,15 /* Number of bytes after a 16B cross. */ - rlwinm. r7,r7,26,0x3F /* ((r3-4096))>>6&63. */ - beq L(compare_64_pagecross) - mtctr r7 - b L(compare_64B_unaligned) - - /* nalign1 is minimum and s1 pointer is aligned. */ -L(nalign1_min): - CHECK_N_BYTES(r3,r4,r7) - /* Are we on the 64B hunk which crosses a page? */ - andi. r10,r4,63 /* Determine offset into 64B hunk. */ - andi. r8,r4,15 /* The offset into the 16B hunk. */ - neg r7,r4 - andi. r9,r7,15 /* Number of bytes after a 16B cross. */ - rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */ - beq L(compare_64_pagecross) - mtctr r7 - - .p2align 5 -L(compare_64B_unaligned): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - COMPARE_16(v4,v5,48) - addi r3,r3,64 - addi r4,r4,64 - bdnz L(compare_64B_unaligned) - - /* Cross the page boundary of s2, carefully. Only for first - iteration we have to get the count of 64B blocks to be checked. - From second iteration and beyond, loop counter is always 63. 
*/ -L(compare_64_pagecross): - li r11, 63 - mtctr r11 - cmpldi r10,16 - ble L(cross_4) - cmpldi r10,32 - ble L(cross_3) - cmpldi r10,48 - ble L(cross_2) -L(cross_1): - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - addi r3,r3,48 - addi r4,r4,48 - b L(compare_64B_unaligned) -L(cross_2): - COMPARE_16(v4,v5,0) - addi r3,r3,16 - addi r4,r4,16 - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - addi r3,r3,32 - addi r4,r4,32 - b L(compare_64B_unaligned) -L(cross_3): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - addi r3,r3,32 - addi r4,r4,32 - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - COMPARE_16(v4,v5,0) - addi r3,r3,16 - addi r4,r4,16 - b L(compare_64B_unaligned) -L(cross_4): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - addi r3,r3,48 - addi r4,r4,48 - CHECK_N_BYTES(r3,r4,r9) - CHECK_N_BYTES(r3,r4,r8) - b L(compare_64B_unaligned) - -L(same_aligned): - CHECK_N_BYTES(r3,r4,r7) - /* Align s1 to 32B and adjust s2 address. - Use lxvp only if both s1 and s2 are 32B aligned. */ - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - COMPARE_16(v4,v5,48) - addi r3,r3,64 - addi r4,r4,64 - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - - clrldi r6,r3,59 - subfic r5,r6,32 - add r3,r3,r5 - add r4,r4,r5 - andi. r5,r4,0x1F - beq cr0,L(32B_aligned_loop) - - .p2align 5 -L(16B_aligned_loop): - COMPARE_16(v4,v5,0) - COMPARE_16(v4,v5,16) - COMPARE_16(v4,v5,32) - COMPARE_16(v4,v5,48) - addi r3,r3,64 - addi r4,r4,64 - b L(16B_aligned_loop) - - /* Calculate and return the difference. 
*/ -L(different): - TAIL(v4,v5) - - .p2align 5 -L(32B_aligned_loop): - COMPARE_32(v14,v16,0,tail1,tail2) - COMPARE_32(v18,v20,32,tail3,tail4) - COMPARE_32(v22,v24,64,tail5,tail6) - COMPARE_32(v26,v28,96,tail7,tail8) - addi r3,r3,128 - addi r4,r4,128 - b L(32B_aligned_loop) - -L(tail1): TAIL(v15,v17) -L(tail2): TAIL(v14,v16) -L(tail3): TAIL(v19,v21) -L(tail4): TAIL(v18,v20) -L(tail5): TAIL(v23,v25) -L(tail6): TAIL(v22,v24) -L(tail7): TAIL(v27,v29) -L(tail8): TAIL(v26,v28) - -END (STRCMP) -libc_hidden_builtin_def (strcmp) diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 594fbb8..27d8495 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -31,10 +31,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ strncase-power8 ifneq (,$(filter %le,$(config-machine))) -sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \ - memmove-power10 memset-power10 rawmemchr-power9 \ - rawmemchr-power10 strcmp-power9 strcmp-power10 \ - strncmp-power9 strcpy-power9 stpcpy-power9 \ +sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \ + rawmemchr-power9 rawmemchr-power10 \ + strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10 endif CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 5b2d6a9..ad6080f 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -226,12 +226,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c. 
*/ IFUNC_IMPL (i, name, memchr, -#ifdef __LITTLE_ENDIAN__ - IFUNC_IMPL_ADD (array, i, memchr, - hwcap2 & PPC_FEATURE2_ARCH_3_1 - && hwcap & PPC_FEATURE_HAS_VSX, - __memchr_power10) -#endif IFUNC_IMPL_ADD (array, i, memchr, hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC, @@ -384,10 +378,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcmp, #ifdef __LITTLE_ENDIAN__ IFUNC_IMPL_ADD (array, i, strcmp, - (hwcap2 & PPC_FEATURE2_ARCH_3_1) - && (hwcap & PPC_FEATURE_HAS_VSX), - __strcmp_power10) - IFUNC_IMPL_ADD (array, i, strcmp, hwcap2 & PPC_FEATURE2_ARCH_3_00 && hwcap & PPC_FEATURE_HAS_ALTIVEC, __strcmp_power9) diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c index 57d23e7..b4655df 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c +++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c @@ -25,23 +25,15 @@ extern __typeof (__memchr) __memchr_ppc attribute_hidden; extern __typeof (__memchr) __memchr_power7 attribute_hidden; extern __typeof (__memchr) __memchr_power8 attribute_hidden; -# ifdef __LITTLE_ENDIAN__ -extern __typeof (__memchr) __memchr_power10 attribute_hidden; -# endif /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ libc_ifunc (__memchr, -# ifdef __LITTLE_ENDIAN__ - (hwcap2 & PPC_FEATURE2_ARCH_3_1 - && hwcap & PPC_FEATURE_HAS_VSX) - ? __memchr_power10 : -# endif - (hwcap2 & PPC_FEATURE2_ARCH_2_07 - && hwcap & PPC_FEATURE_HAS_ALTIVEC) - ? __memchr_power8 : - (hwcap & PPC_FEATURE_ARCH_2_06) - ? __memchr_power7 - : __memchr_ppc); + (hwcap2 & PPC_FEATURE2_ARCH_2_07 + && hwcap & PPC_FEATURE_HAS_ALTIVEC) + ? __memchr_power8 : + (hwcap & PPC_FEATURE_ARCH_2_06) + ? 
__memchr_power7 + : __memchr_ppc); weak_alias (__memchr, memchr) libc_hidden_builtin_def (memchr) diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c index ff32496..06b9b40 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c @@ -29,16 +29,12 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden; extern __typeof (strcmp) __strcmp_power8 attribute_hidden; # ifdef __LITTLE_ENDIAN__ extern __typeof (strcmp) __strcmp_power9 attribute_hidden; -extern __typeof (strcmp) __strcmp_power10 attribute_hidden; # endif # undef strcmp libc_ifunc_redirected (__redirect_strcmp, strcmp, # ifdef __LITTLE_ENDIAN__ - (hwcap2 & PPC_FEATURE2_ARCH_3_1 - && hwcap & PPC_FEATURE_HAS_VSX) - ? __strcmp_power10 : (hwcap2 & PPC_FEATURE2_ARCH_3_00 && hwcap & PPC_FEATURE_HAS_ALTIVEC) ? __strcmp_power9 : diff --git a/sysdeps/pthread/Makefile b/sysdeps/pthread/Makefile index 04ea565..830d9cb 100644 --- a/sysdeps/pthread/Makefile +++ b/sysdeps/pthread/Makefile @@ -254,6 +254,7 @@ tests += \ tst-sem14 \ tst-sem15 \ tst-sem16 \ + tst-sem18 \ tst-setuid3 \ tst-signal1 \ tst-signal2 \ diff --git a/sysdeps/pthread/sem_open.c b/sysdeps/pthread/sem_open.c index e412361..dab7341 100644 --- a/sysdeps/pthread/sem_open.c +++ b/sysdeps/pthread/sem_open.c @@ -76,6 +76,7 @@ __sem_open (const char *name, int oflag, ...) goto try_create; /* Return. errno is already set. */ + result = SEM_FAILED; } else /* Check whether we already have this semaphore mapped and diff --git a/sysdeps/pthread/tst-sem18.c b/sysdeps/pthread/tst-sem18.c new file mode 100644 index 0000000..1be207b --- /dev/null +++ b/sysdeps/pthread/tst-sem18.c @@ -0,0 +1,35 @@ +/* Test sem_open with missing file. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <errno.h> +#include <semaphore.h> + +#include <support/check.h> + +int +do_test (void) +{ + sem_unlink ("/glibc-tst-sem18"); + errno = 0; + sem_t *s = sem_open ("/glibc-tst-sem18", 0); + TEST_VERIFY (s == SEM_FAILED); + TEST_COMPARE (errno, ENOENT); + return 0; +} + +#include <support/test-driver.c> diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile index eee91c7..a7938a8 100644 --- a/sysdeps/unix/sysv/linux/Makefile +++ b/sysdeps/unix/sysv/linux/Makefile @@ -637,7 +637,15 @@ install-bin += \ # install-bin $(objpfx)pldd: $(objpfx)xmalloc.o + +test-internal-extras += tst-nolink-libc +ifeq ($(run-built-tests),yes) +tests-special += \ + $(objpfx)tst-nolink-libc-1.out \ + $(objpfx)tst-nolink-libc-2.out \ + # tests-special endif +endif # $(subdir) == elf ifeq ($(subdir),rt) CFLAGS-mq_send.c += -fexceptions diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c index c0b047b..0ad55a0 100644 --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c @@ -23,6 +23,7 @@ #include <sys/prctl.h> #include <sys/utsname.h> #include <dl-tunables-parse.h> +#include <dl-symbol-redir-ifunc.h> #define DCZID_DZP_MASK (1 << 4) #define DCZID_BS_MASK (0xf) diff --git 
a/sysdeps/unix/sysv/linux/arm/Makefile b/sysdeps/unix/sysv/linux/arm/Makefile index a73c897..e73ce4f 100644 --- a/sysdeps/unix/sysv/linux/arm/Makefile +++ b/sysdeps/unix/sysv/linux/arm/Makefile @@ -1,5 +1,8 @@ ifeq ($(subdir),elf) sysdep-rtld-routines += aeabi_read_tp libc-do-syscall +# The test uses INTERNAL_SYSCALL_CALL. In thumb mode, this uses +# an undefined reference to __libc_do_syscall. +CFLAGS-tst-nolink-libc.c += -marm endif ifeq ($(subdir),misc) diff --git a/sysdeps/unix/sysv/linux/syscalls.list b/sysdeps/unix/sysv/linux/syscalls.list index 73e941e..9ac42c3 100644 --- a/sysdeps/unix/sysv/linux/syscalls.list +++ b/sysdeps/unix/sysv/linux/syscalls.list @@ -46,6 +46,7 @@ open_tree EXTRA open_tree i:isU open_tree pipe2 - pipe2 i:fi __pipe2 pipe2 pidfd_open EXTRA pidfd_open i:iU pidfd_open pidfd_getfd EXTRA pidfd_getfd i:iiU pidfd_getfd +prctl EXTRA prctl i:iiiii __prctl prctl __prctl_time64 pivot_root EXTRA pivot_root i:ss pivot_root pidfd_send_signal EXTRA pidfd_send_signal i:iiPU pidfd_send_signal process_madvise EXTRA process_madvise i:iPniU process_madvise diff --git a/sysdeps/unix/sysv/linux/tst-nolink-libc.c b/sysdeps/unix/sysv/linux/tst-nolink-libc.c new file mode 100644 index 0000000..817f377 --- /dev/null +++ b/sysdeps/unix/sysv/linux/tst-nolink-libc.c @@ -0,0 +1,25 @@ +/* Test program not linked against libc.so and not using any glibc functions. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +void +_start (void) +{ + INTERNAL_SYSCALL_CALL (exit_group, 0); +} diff --git a/sysdeps/unix/sysv/linux/x86_64/Makefile b/sysdeps/unix/sysv/linux/x86_64/Makefile index 9a1e7aa..fcbffd8 100644 --- a/sysdeps/unix/sysv/linux/x86_64/Makefile +++ b/sysdeps/unix/sysv/linux/x86_64/Makefile @@ -17,18 +17,21 @@ endif ifeq ($(subdir),elf) ifeq (yes,$(enable-x86-isa-level)) tests += \ - tst-glibc-hwcaps-2 + tst-glibc-hwcaps-2 \ +# tests ifeq (no,$(build-hardcoded-path-in-tests)) # This is an ld.so.cache test, and RPATH/RUNPATH in the executable # interferes with its test objectives. tests-container += \ - tst-glibc-hwcaps-2-cache + tst-glibc-hwcaps-2-cache \ +# tests-container endif modules-names += \ libx86-64-isa-level-1 \ libx86-64-isa-level-2 \ libx86-64-isa-level-3 \ - libx86-64-isa-level-4 + libx86-64-isa-level-4 \ +# modules-names $(objpfx)tst-glibc-hwcaps-2: $(objpfx)libx86-64-isa-level.so diff --git a/sysdeps/unix/sysv/linux/prctl.c b/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c index 52d234e..4bf1b47 100644 --- a/sysdeps/unix/sysv/linux/prctl.c +++ b/sysdeps/unix/sysv/linux/x86_64/x32/prctl.c @@ -1,4 +1,4 @@ -/* prctl - Linux specific syscall. +/* prctl - Linux specific syscall. x86-64 x32 version. Copyright (C) 2020-2024 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -40,6 +40,3 @@ __prctl (int option, ...) 
libc_hidden_def (__prctl) weak_alias (__prctl, prctl) -#if __TIMESIZE != 64 -weak_alias (__prctl, __prctl_time64) -#endif diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile index 5311b59..c814060 100644 --- a/sysdeps/x86/Makefile +++ b/sysdeps/x86/Makefile @@ -4,7 +4,13 @@ endif ifeq ($(subdir),elf) sysdep_routines += get-cpuid-feature-leaf -sysdep-dl-routines += dl-get-cpu-features +sysdep-dl-routines += \ + dl-get-cpu-features \ + dl-tlsdesc \ + tls_get_addr \ + tlsdesc \ +# sysdep-dl-routines + sysdep_headers += \ bits/platform/features.h \ bits/platform/x86.h \ @@ -21,6 +27,9 @@ tests += \ tst-cpu-features-supports-static \ tst-get-cpu-features \ tst-get-cpu-features-static \ + tst-gnu2-tls2-x86-noxsave \ + tst-gnu2-tls2-x86-noxsavec \ + tst-gnu2-tls2-x86-noxsavexsavec \ tst-hwcap-tunables \ # tests tests-static += \ @@ -91,6 +100,42 @@ CFLAGS-tst-gnu2-tls2.c += -msse CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell + +LDFLAGS-tst-gnu2-tls2-x86-noxsave += -Wl,-z,lazy +LDFLAGS-tst-gnu2-tls2-x86-noxsavec += -Wl,-z,lazy +LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy + +# Test for bug 32810: incorrect XSAVE state size if XSAVEC is disabled +# via tunable. 
+tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE +tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC +tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC +$(objpfx)tst-gnu2-tls2-x86-noxsave: $(shared-thread-library) +$(objpfx)tst-gnu2-tls2-x86-noxsavec: $(shared-thread-library) +$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec: $(shared-thread-library) +$(objpfx)tst-gnu2-tls2-x86-noxsave.out \ +$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \ +$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \ + $(objpfx)tst-gnu2-tls2mod0.so \ + $(objpfx)tst-gnu2-tls2mod1.so \ + $(objpfx)tst-gnu2-tls2mod2.so + +CFLAGS-tst-tls23.c += -msse2 +CFLAGS-tst-tls23-mod.c += -msse2 -mtune=haswell + +LDFLAGS-tst-tls23 += -rdynamic +tst-tls23-mod.so-no-z-defs = yes + +$(objpfx)tst-tls23-mod.so: $(libsupport) + +tests-special += $(objpfx)check-gnu2-tls.out + +$(objpfx)check-gnu2-tls.out: $(common-objpfx)libc.so + LC_ALL=C $(READELF) -V -W $< \ + | sed -ne '/.gnu.version_d/, /.gnu.version_r/ p' \ + | grep GLIBC_ABI_GNU2_TLS > $@; \ + $(evaluate-test) +generated += check-gnu2-tls.out endif ifeq ($(subdir),math) diff --git a/sysdeps/x86/Versions b/sysdeps/x86/Versions index 4b10c4b..e8dcfcc 100644 --- a/sysdeps/x86/Versions +++ b/sysdeps/x86/Versions @@ -7,4 +7,9 @@ libc { GLIBC_2.33 { __x86_get_cpuid_feature_leaf; } + GLIBC_ABI_GNU2_TLS { + # This symbol is used only for empty version map and will be removed + # by scripts/versions.awk. + __placeholder_only_for_empty_version_map; + } } diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h index ab73556..8349160 100644 --- a/sysdeps/x86/cacheinfo.h +++ b/sysdeps/x86/cacheinfo.h @@ -35,9 +35,12 @@ long int __x86_data_cache_size attribute_hidden = 32 * 1024; long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; long int __x86_shared_cache_size attribute_hidden = 1024 * 1024; -/* Threshold to use non temporal store. 
*/ +/* Threshold to use non temporal store in memmove. */ long int __x86_shared_non_temporal_threshold attribute_hidden; +/* Threshold to use non temporal store in memset. */ +long int __x86_memset_non_temporal_threshold attribute_hidden; + /* Threshold to use Enhanced REP MOVSB. */ long int __x86_rep_movsb_threshold attribute_hidden = 2048; @@ -77,6 +80,9 @@ init_cacheinfo (void) __x86_shared_non_temporal_threshold = cpu_features->non_temporal_threshold; + __x86_memset_non_temporal_threshold + = cpu_features->memset_non_temporal_threshold; + __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold; __x86_rep_movsb_stop_threshold = cpu_features->rep_movsb_stop_threshold; diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 3d7c281..47dc3b1 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -24,6 +24,7 @@ #include <dl-cacheinfo.h> #include <dl-minsigstacksize.h> #include <dl-hwcap2.h> +#include <gcc-macros.h> extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *) attribute_hidden; @@ -83,6 +84,8 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *) # include <dl-cet.h> #endif +unsigned long int _dl_x86_features_tlsdesc_state_size; + static void update_active (struct cpu_features *cpu_features) { @@ -317,17 +320,13 @@ update_active (struct cpu_features *cpu_features) = xsave_state_full_size; cpu_features->xsave_state_full_size = xsave_state_full_size; + _dl_x86_features_tlsdesc_state_size = xsave_state_full_size; /* Check if XSAVEC is available. 
*/ if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC)) { - unsigned int xstate_comp_offsets[32]; - unsigned int xstate_comp_sizes[32]; -#ifdef __x86_64__ - unsigned int xstate_amx_comp_offsets[32]; - unsigned int xstate_amx_comp_sizes[32]; - unsigned int amx_ecx; -#endif + unsigned int xstate_comp_offsets[X86_XSTATE_MAX_ID + 1]; + unsigned int xstate_comp_sizes[X86_XSTATE_MAX_ID + 1]; unsigned int i; xstate_comp_offsets[0] = 0; @@ -335,39 +334,16 @@ update_active (struct cpu_features *cpu_features) xstate_comp_offsets[2] = 576; xstate_comp_sizes[0] = 160; xstate_comp_sizes[1] = 256; -#ifdef __x86_64__ - xstate_amx_comp_offsets[0] = 0; - xstate_amx_comp_offsets[1] = 160; - xstate_amx_comp_offsets[2] = 576; - xstate_amx_comp_sizes[0] = 160; - xstate_amx_comp_sizes[1] = 256; -#endif - for (i = 2; i < 32; i++) + for (i = 2; i <= X86_XSTATE_MAX_ID; i++) { if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0) { __cpuid_count (0xd, i, eax, ebx, ecx, edx); -#ifdef __x86_64__ - /* Include this in xsave_state_full_size. */ - amx_ecx = ecx; - xstate_amx_comp_sizes[i] = eax; - if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0) - { - /* Exclude this from xsave_state_size. */ - ecx = 0; - xstate_comp_sizes[i] = 0; - } - else -#endif - xstate_comp_sizes[i] = eax; + xstate_comp_sizes[i] = eax; } else { -#ifdef __x86_64__ - amx_ecx = 0; - xstate_amx_comp_sizes[i] = 0; -#endif ecx = 0; xstate_comp_sizes[i] = 0; } @@ -376,44 +352,32 @@ update_active (struct cpu_features *cpu_features) { xstate_comp_offsets[i] = (xstate_comp_offsets[i - 1] - + xstate_comp_sizes[i -1]); + + xstate_comp_sizes[i - 1]); if ((ecx & (1 << 1)) != 0) xstate_comp_offsets[i] = ALIGN_UP (xstate_comp_offsets[i], 64); -#ifdef __x86_64__ - xstate_amx_comp_offsets[i] - = (xstate_amx_comp_offsets[i - 1] - + xstate_amx_comp_sizes[i - 1]); - if ((amx_ecx & (1 << 1)) != 0) - xstate_amx_comp_offsets[i] - = ALIGN_UP (xstate_amx_comp_offsets[i], - 64); -#endif } } /* Use XSAVEC. 
*/ unsigned int size - = xstate_comp_offsets[31] + xstate_comp_sizes[31]; + = (xstate_comp_offsets[X86_XSTATE_MAX_ID] + + xstate_comp_sizes[X86_XSTATE_MAX_ID]); if (size) { + size = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, + 64); #ifdef __x86_64__ - unsigned int amx_size - = (xstate_amx_comp_offsets[31] - + xstate_amx_comp_sizes[31]); - amx_size - = ALIGN_UP ((amx_size - + TLSDESC_CALL_REGISTER_SAVE_AREA), - 64); - /* Set xsave_state_full_size to the compact AMX - state size for XSAVEC. NB: xsave_state_full_size - is only used in _dl_tlsdesc_dynamic_xsave and - _dl_tlsdesc_dynamic_xsavec. */ - cpu_features->xsave_state_full_size = amx_size; + _dl_x86_features_tlsdesc_state_size = size; + /* Exclude the AMX space from the start of TILECFG + space to the end of TILEDATA space. If CPU + doesn't support AMX, TILECFG offset is the same + as TILEDATA + 1 offset. Otherwise, they are + multiples of 64. */ + size -= (xstate_comp_offsets[X86_XSTATE_TILEDATA_ID + 1] + - xstate_comp_offsets[X86_XSTATE_TILECFG_ID]); #endif - cpu_features->xsave_state_size - = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA, - 64); + cpu_features->xsave_state_size = size; CPU_FEATURE_SET (cpu_features, XSAVEC); } } @@ -538,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load "Incorrect index_arch_Fast_Unaligned_Load"); -/* Intel Family-6 microarch list. */ -enum +/* Intel microarch list. */ +enum intel_microarch { /* Atom processors. */ INTEL_ATOM_BONNELL, @@ -548,6 +512,7 @@ enum INTEL_ATOM_GOLDMONT, INTEL_ATOM_GOLDMONT_PLUS, INTEL_ATOM_SIERRAFOREST, + INTEL_ATOM_CLEARWATERFOREST, INTEL_ATOM_GRANDRIDGE, INTEL_ATOM_TREMONT, @@ -575,7 +540,9 @@ enum INTEL_BIGCORE_METEORLAKE, INTEL_BIGCORE_LUNARLAKE, INTEL_BIGCORE_ARROWLAKE, + INTEL_BIGCORE_PANTHERLAKE, INTEL_BIGCORE_GRANITERAPIDS, + INTEL_BIGCORE_DIAMONDRAPIDS, /* Mixed (bigcore + atom SOC). 
*/ INTEL_MIXED_LAKEFIELD, @@ -589,7 +556,7 @@ enum INTEL_UNKNOWN, }; -static unsigned int +static enum intel_microarch intel_get_fam6_microarch (unsigned int model, __attribute__ ((unused)) unsigned int stepping) { @@ -620,6 +587,8 @@ intel_get_fam6_microarch (unsigned int model, return INTEL_ATOM_GOLDMONT_PLUS; case 0xAF: return INTEL_ATOM_SIERRAFOREST; + case 0xDD: + return INTEL_ATOM_CLEARWATERFOREST; case 0xB6: return INTEL_ATOM_GRANDRIDGE; case 0x86: @@ -727,8 +696,12 @@ intel_get_fam6_microarch (unsigned int model, return INTEL_BIGCORE_METEORLAKE; case 0xbd: return INTEL_BIGCORE_LUNARLAKE; + case 0xb5: + case 0xc5: case 0xc6: return INTEL_BIGCORE_ARROWLAKE; + case 0xCC: + return INTEL_BIGCORE_PANTHERLAKE; case 0xAD: case 0xAE: return INTEL_BIGCORE_GRANITERAPIDS; @@ -756,6 +729,12 @@ init_cpu_features (struct cpu_features *cpu_features) unsigned int stepping = 0; enum cpu_features_kind kind; + /* Default is to avoid non-temporal memset for non Intel/AMD hardware. This is + because, as of this writing, we only have benchmarks indicating its + profitability on Intel/AMD. */ + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] + |= bit_arch_Avoid_Non_Temporal_Memset; + cpu_features->cachesize_non_temporal_divisor = 4; #if !HAS_CPUID if (__get_cpuid_max (0, 0) == 0) @@ -781,125 +760,25 @@ init_cpu_features (struct cpu_features *cpu_features) update_active (cpu_features); + /* Benchmarks indicate non-temporal memset can be profitable on Intel + hardware. */ + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] + &= ~bit_arch_Avoid_Non_Temporal_Memset; + + enum intel_microarch microarch = INTEL_UNKNOWN; if (family == 0x06) { model += extended_model; - unsigned int microarch - = intel_get_fam6_microarch (model, stepping); + microarch = intel_get_fam6_microarch (model, stepping); + /* Disable TSX on some processors to avoid TSX on kernels that + weren't updated with the latest microcode package (which + disables broken feature by default).
*/ switch (microarch) { - /* Atom / KNL tuning. */ - case INTEL_ATOM_BONNELL: - /* BSF is slow on Bonnell. */ - cpu_features->preferred[index_arch_Slow_BSF] - |= bit_arch_Slow_BSF; - break; - - /* Unaligned load versions are faster than SSSE3 - on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ - case INTEL_ATOM_AIRMONT: - case INTEL_ATOM_SILVERMONT: - case INTEL_ATOM_GOLDMONT: - case INTEL_ATOM_GOLDMONT_PLUS: - - /* Knights Landing. Enable Silvermont optimizations. */ - case INTEL_KNIGHTS_LANDING: - - cpu_features->preferred[index_arch_Fast_Unaligned_Load] - |= (bit_arch_Fast_Unaligned_Load - | bit_arch_Fast_Unaligned_Copy - | bit_arch_Prefer_PMINUB_for_stringop - | bit_arch_Slow_SSE4_2); - break; - - case INTEL_ATOM_TREMONT: - /* Enable rep string instructions, unaligned load, unaligned - copy, pminub and avoid SSE 4.2 on Tremont. */ - cpu_features->preferred[index_arch_Fast_Rep_String] - |= (bit_arch_Fast_Rep_String - | bit_arch_Fast_Unaligned_Load - | bit_arch_Fast_Unaligned_Copy - | bit_arch_Prefer_PMINUB_for_stringop - | bit_arch_Slow_SSE4_2); - break; - - /* - Default tuned Knights microarch. - case INTEL_KNIGHTS_MILL: - */ - - /* - Default tuned atom microarch. - case INTEL_ATOM_SIERRAFOREST: - case INTEL_ATOM_GRANDRIDGE: - */ - - /* Bigcore/Default Tuning. */ default: - default_tuning: - /* Unknown family 0x06 processors. Assuming this is one - of Core i3/i5/i7 processors if AVX is available. */ - if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) - break; - - enable_modern_features: - /* Rep string instructions, unaligned load, unaligned copy, - and pminub are fast on Intel Core i3, i5 and i7. */ - cpu_features->preferred[index_arch_Fast_Rep_String] - |= (bit_arch_Fast_Rep_String - | bit_arch_Fast_Unaligned_Load - | bit_arch_Fast_Unaligned_Copy - | bit_arch_Prefer_PMINUB_for_stringop); break; - case INTEL_BIGCORE_NEHALEM: - case INTEL_BIGCORE_WESTMERE: - /* Older CPUs prefer non-temporal stores at lower threshold. 
*/ - cpu_features->cachesize_non_temporal_divisor = 8; - goto enable_modern_features; - - /* Older Bigcore microarch (smaller non-temporal store - threshold). */ - case INTEL_BIGCORE_SANDYBRIDGE: - case INTEL_BIGCORE_IVYBRIDGE: - case INTEL_BIGCORE_HASWELL: - case INTEL_BIGCORE_BROADWELL: - cpu_features->cachesize_non_temporal_divisor = 8; - goto default_tuning; - - /* Newer Bigcore microarch (larger non-temporal store - threshold). */ - case INTEL_BIGCORE_SKYLAKE: - case INTEL_BIGCORE_KABYLAKE: - case INTEL_BIGCORE_COMETLAKE: - case INTEL_BIGCORE_SKYLAKE_AVX512: - case INTEL_BIGCORE_CANNONLAKE: - case INTEL_BIGCORE_ICELAKE: - case INTEL_BIGCORE_TIGERLAKE: - case INTEL_BIGCORE_ROCKETLAKE: - case INTEL_BIGCORE_RAPTORLAKE: - case INTEL_BIGCORE_METEORLAKE: - case INTEL_BIGCORE_LUNARLAKE: - case INTEL_BIGCORE_ARROWLAKE: - case INTEL_BIGCORE_SAPPHIRERAPIDS: - case INTEL_BIGCORE_EMERALDRAPIDS: - case INTEL_BIGCORE_GRANITERAPIDS: - cpu_features->cachesize_non_temporal_divisor = 2; - goto default_tuning; - - /* Default tuned Mixed (bigcore + atom SOC). */ - case INTEL_MIXED_LAKEFIELD: - case INTEL_MIXED_ALDERLAKE: - cpu_features->cachesize_non_temporal_divisor = 2; - goto default_tuning; - } - - /* Disable TSX on some processors to avoid TSX on kernels that - weren't updated with the latest microcode package (which - disables broken feature by default). */ - switch (microarch) - { case INTEL_BIGCORE_SKYLAKE_AVX512: /* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */ if (stepping <= 5) @@ -908,38 +787,163 @@ init_cpu_features (struct cpu_features *cpu_features) case INTEL_BIGCORE_KABYLAKE: /* NB: Although the errata documents that for model == 0x8e - (kabylake skylake client), only 0xb stepping or lower are - impacted, the intention of the errata was to disable TSX on - all client processors on all steppings. Include 0xc - stepping which is an Intel Core i7-8665U, a client mobile - processor. 
*/ + (kabylake skylake client), only 0xb stepping or lower are + impacted, the intention of the errata was to disable TSX on + all client processors on all steppings. Include 0xc + stepping which is an Intel Core i7-8665U, a client mobile + processor. */ if (stepping > 0xc) break; /* Fall through. */ case INTEL_BIGCORE_SKYLAKE: - /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for - processors listed in: - -https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html - */ - disable_tsx: - CPU_FEATURE_UNSET (cpu_features, HLE); - CPU_FEATURE_UNSET (cpu_features, RTM); - CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); - break; + /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for + processors listed in: + + https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html + */ +disable_tsx: + CPU_FEATURE_UNSET (cpu_features, HLE); + CPU_FEATURE_UNSET (cpu_features, RTM); + CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); + break; case INTEL_BIGCORE_HASWELL: - /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working - TSX. Haswell also include other model numbers that have - working TSX. */ - if (model == 0x3f && stepping >= 4) + /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working + TSX. Haswell also includes other model numbers that have + working TSX. */ + if (model == 0x3f && stepping >= 4) break; - CPU_FEATURE_UNSET (cpu_features, RTM); - break; + CPU_FEATURE_UNSET (cpu_features, RTM); + break; } } + else if (family == 19) + switch (model) + { + case 0x01: + microarch = INTEL_BIGCORE_DIAMONDRAPIDS; + break; + default: + break; + } + + switch (microarch) + { + /* Atom / KNL tuning. */ + case INTEL_ATOM_BONNELL: + /* BSF is slow on Bonnell. */ + cpu_features->preferred[index_arch_Slow_BSF] + |= bit_arch_Slow_BSF; + break; + + /* Unaligned load versions are faster than SSSE3 + on Airmont, Silvermont, Goldmont, and Goldmont Plus. 
*/ + case INTEL_ATOM_AIRMONT: + case INTEL_ATOM_SILVERMONT: + case INTEL_ATOM_GOLDMONT: + case INTEL_ATOM_GOLDMONT_PLUS: + + /* Knights Landing. Enable Silvermont optimizations. */ + case INTEL_KNIGHTS_LANDING: + + cpu_features->preferred[index_arch_Fast_Unaligned_Load] + |= (bit_arch_Fast_Unaligned_Load + | bit_arch_Fast_Unaligned_Copy + | bit_arch_Prefer_PMINUB_for_stringop + | bit_arch_Slow_SSE4_2); + break; + + case INTEL_ATOM_TREMONT: + /* Enable rep string instructions, unaligned load, unaligned + copy, pminub and avoid SSE 4.2 on Tremont. */ + cpu_features->preferred[index_arch_Fast_Rep_String] + |= (bit_arch_Fast_Rep_String + | bit_arch_Fast_Unaligned_Load + | bit_arch_Fast_Unaligned_Copy + | bit_arch_Prefer_PMINUB_for_stringop + | bit_arch_Slow_SSE4_2); + break; + + /* + Default tuned Knights microarch. + case INTEL_KNIGHTS_MILL: + */ + + /* + Default tuned atom microarch. + case INTEL_ATOM_SIERRAFOREST: + case INTEL_ATOM_GRANDRIDGE: + case INTEL_ATOM_CLEARWATERFOREST: + */ + + /* Bigcore/Default Tuning. */ + default: + default_tuning: + /* Unknown Intel processors. Assuming this is one of Core + i3/i5/i7 processors if AVX is available. */ + if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) + break; + + enable_modern_features: + /* Rep string instructions, unaligned load, unaligned copy, + and pminub are fast on Intel Core i3, i5 and i7. */ + cpu_features->preferred[index_arch_Fast_Rep_String] + |= (bit_arch_Fast_Rep_String + | bit_arch_Fast_Unaligned_Load + | bit_arch_Fast_Unaligned_Copy + | bit_arch_Prefer_PMINUB_for_stringop); + break; + + case INTEL_BIGCORE_NEHALEM: + case INTEL_BIGCORE_WESTMERE: + /* Older CPUs prefer non-temporal stores at lower threshold. */ + cpu_features->cachesize_non_temporal_divisor = 8; + goto enable_modern_features; + + /* Older Bigcore microarch (smaller non-temporal store + threshold). 
*/ + case INTEL_BIGCORE_SANDYBRIDGE: + case INTEL_BIGCORE_IVYBRIDGE: + case INTEL_BIGCORE_HASWELL: + case INTEL_BIGCORE_BROADWELL: + cpu_features->cachesize_non_temporal_divisor = 8; + goto default_tuning; + + /* Newer Bigcore microarch (larger non-temporal store + threshold). */ + case INTEL_BIGCORE_SKYLAKE_AVX512: + case INTEL_BIGCORE_CANNONLAKE: + /* Benchmarks indicate non-temporal memset is not + necessarily profitable on SKX (and in some cases much + worse). This is likely unique to SKX due to its unique + mesh interconnect (not present on ICX or BWD). Disable + non-temporal on all Skylake servers. */ + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] + |= bit_arch_Avoid_Non_Temporal_Memset; + /* fallthrough */ + case INTEL_BIGCORE_COMETLAKE: + case INTEL_BIGCORE_SKYLAKE: + case INTEL_BIGCORE_KABYLAKE: + case INTEL_BIGCORE_ICELAKE: + case INTEL_BIGCORE_TIGERLAKE: + case INTEL_BIGCORE_ROCKETLAKE: + case INTEL_BIGCORE_RAPTORLAKE: + case INTEL_BIGCORE_METEORLAKE: + case INTEL_BIGCORE_LUNARLAKE: + case INTEL_BIGCORE_ARROWLAKE: + case INTEL_BIGCORE_PANTHERLAKE: + case INTEL_BIGCORE_SAPPHIRERAPIDS: + case INTEL_BIGCORE_EMERALDRAPIDS: + case INTEL_BIGCORE_GRANITERAPIDS: + case INTEL_BIGCORE_DIAMONDRAPIDS: + /* Default tuned Mixed (bigcore + atom SOC). */ + case INTEL_MIXED_LAKEFIELD: + case INTEL_MIXED_ALDERLAKE: + cpu_features->cachesize_non_temporal_divisor = 2; + goto default_tuning; + } /* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER if AVX512ER is available. Don't use AVX512 to avoid lower CPU @@ -984,6 +988,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht ecx = cpu_features->features[CPUID_INDEX_1].cpuid.ecx; + /* Benchmarks indicate non-temporal memset can be profitable on AMD + hardware. 
*/ + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] + &= ~bit_arch_Avoid_Non_Temporal_Memset; + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) { /* Since the FMA4 bit is in CPUID_INDEX_80000001 and @@ -1092,6 +1101,9 @@ no_cpuid: TUNABLE_CALLBACK (set_prefer_map_32bit_exec)); #endif + /* Do not add the logic to disable XSAVE/XSAVEC if this glibc build + requires AVX and therefore XSAVE or XSAVEC support. */ +#ifndef GCCMACRO__AVX__ bool disable_xsave_features = false; if (!CPU_FEATURE_USABLE_P (cpu_features, OSXSAVE)) @@ -1145,6 +1157,7 @@ no_cpuid: CPU_FEATURE_UNSET (cpu_features, FMA4); } +#endif #ifdef __x86_64__ GLRO(dl_hwcap) = HWCAP_X86_64; diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c index 89da7a0..a0b31d8 100644 --- a/sysdeps/x86/cpu-tunables.c +++ b/sysdeps/x86/cpu-tunables.c @@ -164,6 +164,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) /* Update xsave_state_size to XSAVE state size. */ cpu_features->xsave_state_size = cpu_features->xsave_state_full_size; + _dl_x86_features_tlsdesc_state_size + = cpu_features->xsave_state_full_size; CPU_FEATURE_UNSET (cpu_features, XSAVEC); } } @@ -243,6 +245,12 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) (n, cpu_features, MathVec_Prefer_No_AVX512, AVX512F, 24); } break; + case 25: + { + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, + Avoid_Non_Temporal_Memset, 25); + } + break; case 26: { CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index 1f68968..10ad180 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -986,9 +986,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) rep_movsb_threshold = 2112; - /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of - cases slower than the vectorized path (and for some alignments, - it is really slow, check BZ #30994). 
*/ + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of + cases slower than the vectorized path (and for some alignments, + it is really slow, check BZ #30994). */ if (cpu_features->basic.kind == arch_kind_amd) rep_movsb_threshold = non_temporal_threshold; @@ -1007,11 +1007,23 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) if (tunable_size != 0) shared = tunable_size; + /* Non-temporal stores are more performant on some hardware above + non_temporal_threshold. Currently Avoid_Non_Temporal_Memset is cleared + for both Intel and AMD hardware. */ + unsigned long int memset_non_temporal_threshold = SIZE_MAX; + if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)) + memset_non_temporal_threshold = non_temporal_threshold; + tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL); if (tunable_size > minimum_non_temporal_threshold && tunable_size <= maximum_non_temporal_threshold) non_temporal_threshold = tunable_size; + tunable_size = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); + if (tunable_size > minimum_non_temporal_threshold + && tunable_size <= maximum_non_temporal_threshold) + memset_non_temporal_threshold = tunable_size; + tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL); if (tunable_size > minimum_rep_movsb_threshold) rep_movsb_threshold = tunable_size; @@ -1032,6 +1044,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, minimum_non_temporal_threshold, maximum_non_temporal_threshold); + TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, + memset_non_temporal_threshold, + minimum_non_temporal_threshold, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, minimum_rep_movsb_threshold, SIZE_MAX); TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, @@ -1045,6 +1060,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
cpu_features->data_cache_size = data; cpu_features->shared_cache_size = shared; cpu_features->non_temporal_threshold = non_temporal_threshold; + cpu_features->memset_non_temporal_threshold = memset_non_temporal_threshold; cpu_features->rep_movsb_threshold = rep_movsb_threshold; cpu_features->rep_stosb_threshold = rep_stosb_threshold; cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold; diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c index c76ea3b..8113a93 100644 --- a/sysdeps/x86/dl-diagnostics-cpu.c +++ b/sysdeps/x86/dl-diagnostics-cpu.c @@ -78,11 +78,15 @@ _dl_diagnostics_cpu (void) cpu_features->xsave_state_size); print_cpu_features_value ("xsave_state_full_size", cpu_features->xsave_state_full_size); + print_cpu_features_value ("tlsdesc_state_full_size", + _dl_x86_features_tlsdesc_state_size); print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size); print_cpu_features_value ("shared_cache_size", cpu_features->shared_cache_size); print_cpu_features_value ("non_temporal_threshold", cpu_features->non_temporal_threshold); + print_cpu_features_value ("memset_non_temporal_threshold", + cpu_features->memset_non_temporal_threshold); print_cpu_features_value ("rep_movsb_threshold", cpu_features->rep_movsb_threshold); print_cpu_features_value ("rep_movsb_stop_threshold", diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list index 7d82da0..a0a1299 100644 --- a/sysdeps/x86/dl-tunables.list +++ b/sysdeps/x86/dl-tunables.list @@ -30,6 +30,9 @@ glibc { x86_non_temporal_threshold { type: SIZE_T } + x86_memset_non_temporal_threshold { + type: SIZE_T + } x86_rep_movsb_threshold { type: SIZE_T # Since there is overhead to set up REP MOVSB operation, REP diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def index 85e7f54..61bbbc2 100644 --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +++ 
b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def @@ -33,3 +33,4 @@ BIT (Prefer_No_AVX512) BIT (MathVec_Prefer_No_AVX512) BIT (Prefer_FSRM) BIT (Avoid_Short_Distance_REP_MOVSB) +BIT (Avoid_Non_Temporal_Memset) diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h index cd7bd27..03c7138 100644 --- a/sysdeps/x86/include/cpu-features.h +++ b/sysdeps/x86/include/cpu-features.h @@ -934,8 +934,6 @@ struct cpu_features /* The full state size for XSAVE when XSAVEC is disabled by GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC - - and the AMX state size when XSAVEC is available. */ unsigned int xsave_state_full_size; /* Data cache size for use in memory and string routines, typically @@ -944,8 +942,10 @@ struct cpu_features /* Shared cache size for use in memory and string routines, typically L2 or L3 size. */ unsigned long int shared_cache_size; - /* Threshold to use non temporal store. */ + /* Threshold to use non temporal store in memmove. */ unsigned long int non_temporal_threshold; + /* Threshold to use non temporal store in memset. */ + unsigned long int memset_non_temporal_threshold; /* Threshold to use "rep movsb". */ unsigned long int rep_movsb_threshold; /* Threshold to stop using "rep movsb". */ @@ -987,6 +987,13 @@ extern const struct cpu_features *_dl_x86_get_cpu_features (void) #define __get_cpu_features() _dl_x86_get_cpu_features() +#if IS_IN (rtld) || IS_IN (libc) +/* XSAVE/XSAVEC state size used by TLS descriptors. Compared to + xsave_state_size from struct cpu_features, this includes additional + registers. */ +extern unsigned long int _dl_x86_features_tlsdesc_state_size attribute_hidden; +#endif + #if defined (_LIBC) && !IS_IN (nonlib) /* Unused for x86. 
*/ # define INIT_ARCH() diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h index 7359149..d5f5ec0 100644 --- a/sysdeps/x86/sysdep.h +++ b/sysdeps/x86/sysdep.h @@ -102,6 +102,9 @@ | (1 << X86_XSTATE_ZMM_ID) \ | (1 << X86_XSTATE_APX_F_ID)) +/* The maximum supported xstate ID. */ +# define X86_XSTATE_MAX_ID X86_XSTATE_APX_F_ID + /* AMX state mask. */ # define AMX_STATE_SAVE_MASK \ ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID)) @@ -123,6 +126,9 @@ | (1 << X86_XSTATE_K_ID) \ | (1 << X86_XSTATE_ZMM_H_ID)) +/* The maximum supported xstate ID. */ +# define X86_XSTATE_MAX_ID X86_XSTATE_ZMM_H_ID + /* States to be included in xsave_state_size. */ # define FULL_STATE_SAVE_MASK STATE_SAVE_MASK #endif @@ -177,6 +183,29 @@ #define atom_text_section .section ".text.atom", "ax" +#ifndef DL_STACK_ALIGNMENT +/* Due to GCC bug: + + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066 + + __tls_get_addr may be called with 8-byte/4-byte stack alignment. + Although this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't + assume that stack will be always aligned at 16 bytes. */ +# ifdef __x86_64__ +# define DL_STACK_ALIGNMENT 8 +# define MINIMUM_ALIGNMENT 16 +# else +# define DL_STACK_ALIGNMENT 4 +# endif +#endif + +/* True if _dl_runtime_resolve/_dl_tlsdesc_dynamic should align stack for + STATE_SAVE or align stack to MINIMUM_ALIGNMENT bytes before calling + _dl_fixup/__tls_get_addr. 
*/ +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \ + (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \ + || MINIMUM_ALIGNMENT > DL_STACK_ALIGNMENT) + #endif /* __ASSEMBLER__ */ #endif /* _X86_SYSDEP_H */ diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c new file mode 100644 index 0000000..f0024c1 --- /dev/null +++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c @@ -0,0 +1 @@ +#include <elf/tst-gnu2-tls2.c> diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c new file mode 100644 index 0000000..f0024c1 --- /dev/null +++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c @@ -0,0 +1 @@ +#include <elf/tst-gnu2-tls2.c> diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c new file mode 100644 index 0000000..f0024c1 --- /dev/null +++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c @@ -0,0 +1 @@ +#include <elf/tst-gnu2-tls2.c> diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c index f6a65b8..bc573c7 100644 --- a/sysdeps/x86/tst-hwcap-tunables.c +++ b/sysdeps/x86/tst-hwcap-tunables.c @@ -60,7 +60,7 @@ static const struct test_t /* Disable everything. */ "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," - "-AVX_Fast_Unaligned_Load", + "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", test_1, array_length (test_1) }, @@ -68,7 +68,7 @@ static const struct test_t /* Same as before, but with some empty suboptions. 
*/ ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," - "-ERMS,-AVX_Fast_Unaligned_Load,-,", + "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", test_1, array_length (test_1) } diff --git a/sysdeps/x86/tst-tls23.c b/sysdeps/x86/tst-tls23.c new file mode 100644 index 0000000..6130d91 --- /dev/null +++ b/sysdeps/x86/tst-tls23.c @@ -0,0 +1,22 @@ +#ifndef __x86_64__ +#include <sys/platform/x86.h> + +#define IS_SUPPORTED() CPU_FEATURE_ACTIVE (SSE2) +#endif + +/* Set XMM0...XMM7 to all 1s. */ +#define PREPARE_MALLOC() \ +{ \ + asm volatile ("pcmpeqd %%xmm0, %%xmm0" : : : "xmm0" ); \ + asm volatile ("pcmpeqd %%xmm1, %%xmm1" : : : "xmm1" ); \ + asm volatile ("pcmpeqd %%xmm2, %%xmm2" : : : "xmm2" ); \ + asm volatile ("pcmpeqd %%xmm3, %%xmm3" : : : "xmm3" ); \ + asm volatile ("pcmpeqd %%xmm4, %%xmm4" : : : "xmm4" ); \ + asm volatile ("pcmpeqd %%xmm5, %%xmm5" : : : "xmm5" ); \ + asm volatile ("pcmpeqd %%xmm6, %%xmm6" : : : "xmm6" ); \ + asm volatile ("pcmpeqd %%xmm7, %%xmm7" : : : "xmm7" ); \ +} + +#include <elf/tst-tls23.c> + +v2di v1, v2, v3; diff --git a/sysdeps/x86/tst-tls23.h b/sysdeps/x86/tst-tls23.h new file mode 100644 index 0000000..21cee4c --- /dev/null +++ b/sysdeps/x86/tst-tls23.h @@ -0,0 +1,35 @@ +/* Test that __tls_get_addr preserves XMM registers. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <support/check.h> + +typedef long long v2di __attribute__((vector_size(16))); +extern v2di v1, v2, v3; + +#define BEFORE_TLS_CALL() \ + v1 = __extension__(v2di){0, 0}; \ + v2 = __extension__(v2di){0, 0}; + +#define AFTER_TLS_CALL() \ + v3 = __extension__(v2di){0, 0}; \ + asm volatile ("" : "+x" (v3)); \ + union { v2di x; long long a[2]; } u; \ + u.x = v3; \ + TEST_VERIFY_EXIT (u.a[0] == 0 && u.a[1] == 0); + +#include <elf/tst-tls23.h> diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index 0ede447..c97b3ac 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -32,7 +32,8 @@ sysdep_routines += \ # sysdep_routines gen-as-const-headers += locale-defines.sym tests += \ - tst-rsi-strlen + tst-rsi-strlen \ +# tests endif ifeq ($(subdir),elf) @@ -40,9 +41,6 @@ ifeq ($(subdir),elf) CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\ -mno-mmx) -sysdep-dl-routines += tlsdesc dl-tlsdesc tls_get_addr - -tests += ifuncmain8 modules-names += ifuncmod8 $(objpfx)ifuncmain8: $(objpfx)ifuncmod8.so @@ -208,6 +206,15 @@ LDFLAGS-tst-plt-rewrite2 = -Wl,-z,now LDFLAGS-tst-plt-rewritemod2.so = -Wl,-z,now,-z,undefs tst-plt-rewrite2-ENV = GLIBC_TUNABLES=glibc.cpu.plt_rewrite=2 $(objpfx)tst-plt-rewrite2: $(objpfx)tst-plt-rewritemod2.so + +tests-special += $(objpfx)check-dt-x86-64-plt.out + +$(objpfx)check-dt-x86-64-plt.out: $(common-objpfx)libc.so + LC_ALL=C $(READELF) -V -W $< \ + | sed -ne '/.gnu.version_d/, /.gnu.version_r/ p' \ + | grep GLIBC_ABI_DT_X86_64_PLT > $@; \ + $(evaluate-test) +generated += check-dt-x86-64-plt.out endif test-internal-extras += tst-gnu2-tls2mod1 @@ -232,7 +239,8 @@ sysdep_routines += \ # sysdep_routines tests += \ - tst-rsi-wcslen + tst-rsi-wcslen \ +# tests endif diff --git a/sysdeps/x86_64/Versions b/sysdeps/x86_64/Versions index 
e94758b..6a989ad 100644 --- a/sysdeps/x86_64/Versions +++ b/sysdeps/x86_64/Versions @@ -5,6 +5,11 @@ libc { GLIBC_2.13 { __fentry__; } + GLIBC_ABI_DT_X86_64_PLT { + # This symbol is used only for empty version map and will be removed + # by scripts/versions.awk. + __placeholder_only_for_empty_version_map; + } } libm { GLIBC_2.1 { diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h index 9f02cfc..44d9486 100644 --- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h @@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic: # endif #else /* Allocate stack space of the required size to save the state. */ - sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP + sub _dl_x86_features_tlsdesc_state_size(%rip), %RSP_LP #endif /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9, r10 and r11. */ diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S index 057a108..5860792 100644 --- a/sysdeps/x86_64/dl-tlsdesc.S +++ b/sysdeps/x86_64/dl-tlsdesc.S @@ -22,7 +22,6 @@ #include <features-offsets.h> #include <isa-level.h> #include "tlsdesc.h" -#include "dl-trampoline-save.h" /* Area on stack to save and restore registers used for parameter passing when calling _dl_tlsdesc_dynamic. */ diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 87c5137..4c11fcf 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -22,7 +22,6 @@ #include <features-offsets.h> #include <link-defines.h> #include <isa-level.h> -#include "dl-trampoline-save.h" /* Area on stack to save and restore registers used for parameter passing when calling _dl_fixup. 
*/ diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile index b527cab..bc479b4 100644 --- a/sysdeps/x86_64/fpu/multiarch/Makefile +++ b/sysdeps/x86_64/fpu/multiarch/Makefile @@ -1,6 +1,7 @@ ifeq ($(subdir),math) CFLAGS-e_asin-fma.c = -mfma -mavx2 CFLAGS-e_atan2-fma.c = -mfma -mavx2 +CFLAGS-e_atanh-fma.c = -mfma -mavx2 CFLAGS-e_exp-fma.c = -mfma -mavx2 CFLAGS-e_log-fma.c = -mfma -mavx2 CFLAGS-e_log2-fma.c = -mfma -mavx2 @@ -57,6 +58,7 @@ libm-sysdep_routines += \ e_asin-fma \ e_atan2-avx \ e_atan2-fma \ + e_atanh-fma \ e_exp-avx \ e_exp-fma \ e_exp2f-fma \ diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c new file mode 100644 index 0000000..c3f2f9e --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c @@ -0,0 +1,6 @@ +#define __ieee754_atanh __ieee754_atanh_fma +#define __log1p __log1p_fma + +#define SECTION __attribute__ ((section (".text.fma"))) + +#include <sysdeps/ieee754/dbl-64/e_atanh.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh.c b/sysdeps/x86_64/fpu/multiarch/e_atanh.c new file mode 100644 index 0000000..d2b785d --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_atanh.c @@ -0,0 +1,34 @@ +/* Multiple versions of atanh. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdeps/x86/isa-level.h> +#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL +# include <libm-alias-finite.h> + +extern double __redirect_ieee754_atanh (double); + +# define SYMBOL_NAME ieee754_atanh +# include "ifunc-fma.h" + +libc_ifunc_redirected (__redirect_ieee754_atanh, __ieee754_atanh, IFUNC_SELECTOR ()); + +libm_alias_finite (__ieee754_atanh, __atanh) + +# define __ieee754_atanh __ieee754_atanh_sse2 +#endif +#include <sysdeps/ieee754/dbl-64/e_atanh.c> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index c4a21d4..c34c94c 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -928,7 +928,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (BMI2)), __wcsncpy_avx2) - X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy, + X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy, 1, __wcsncpy_generic)) @@ -958,7 +958,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (BMI2)), __wcpncpy_avx2) - X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy, + X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy, 1, __wcpncpy_generic)) diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 637caad..88bf08e 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -24,9 +24,9 @@ 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with 4 VEC stores and store 4 * VEC at a time until done. 6. 
On machines ERMS feature, if size is range - [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold) + [__x86_rep_stosb_threshold, __x86_memset_non_temporal_threshold) then REP STOSB will be used. - 7. If size >= __x86_shared_non_temporal_threshold, use a + 7. If size >= __x86_memset_non_temporal_threshold, use a non-temporal stores. */ #include <sysdep.h> @@ -318,7 +318,7 @@ L(return_vzeroupper): /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in range for 2-byte jump encoding. */ L(stosb_local): - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + cmp __x86_memset_non_temporal_threshold(%rip), %RDX_LP jae L(nt_memset) movzbl %sil, %eax mov %RDX_LP, %RCX_LP diff --git a/time/strftime_l.c b/time/strftime_l.c index 77adec9..066c839 100644 --- a/time/strftime_l.c +++ b/time/strftime_l.c @@ -40,6 +40,7 @@ #endif #include <ctype.h> +#include <errno.h> #include <sys/types.h> /* Some systems define `time_t' here. */ #ifdef TIME_WITH_SYS_TIME |