diff options
Diffstat (limited to 'sysdeps/aarch64')
316 files changed, 4088 insertions, 5903 deletions
diff --git a/sysdeps/aarch64/Makefile b/sysdeps/aarch64/Makefile index 141d7d9..4b7f8a5 100644 --- a/sysdeps/aarch64/Makefile +++ b/sysdeps/aarch64/Makefile @@ -9,7 +9,9 @@ LDFLAGS-rtld += -Wl,-z,force-bti,--fatal-warnings endif ifeq ($(subdir),elf) -sysdep-dl-routines += dl-bti +sysdep-dl-routines += \ + dl-bti \ + dl-gcs tests += tst-audit26 \ tst-audit27 @@ -35,7 +37,9 @@ endif ifeq ($(subdir),elf) sysdep-rtld-routines += dl-start sysdep-dl-routines += tlsdesc dl-tlsdesc -gen-as-const-headers += dl-link.sym +gen-as-const-headers += \ + dl-link.sym \ + rtld-global-offsets.sym tests-internal += tst-ifunc-arg-1 tst-ifunc-arg-2 @@ -67,7 +71,8 @@ sysdep_headers += sys/ifunc.h sysdep_routines += \ __mtag_tag_zero_region \ __mtag_tag_region \ - __arm_za_disable + __arm_za_disable \ + __alloc_gcs tests += \ tst-sme-jmp diff --git a/sysdeps/aarch64/__alloc_gcs.c b/sysdeps/aarch64/__alloc_gcs.c new file mode 100644 index 0000000..e70b459 --- /dev/null +++ b/sysdeps/aarch64/__alloc_gcs.c @@ -0,0 +1,63 @@ +/* AArch64 GCS allocation. + Copyright (C) 2024-2025 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <unistd.h> +#include <sys/mman.h> + +#ifndef SHADOW_STACK_SET_TOKEN +# define SHADOW_STACK_SET_TOKEN (1UL << 0) +# define SHADOW_STACK_SET_MARKER (1UL << 1) +#endif + +static void * +map_shadow_stack (void *addr, size_t size, unsigned long flags) +{ + return (void *) INLINE_SYSCALL_CALL (map_shadow_stack, addr, size, flags); +} + +#define GCS_MAX_SIZE (1UL << 31) +#define GCS_ALTSTACK_RESERVE 160 + +void * +__alloc_gcs (size_t stack_size, void **ss_base, size_t *ss_size) +{ + size_t size = (stack_size / 2 + GCS_ALTSTACK_RESERVE) & -8UL; + if (size > GCS_MAX_SIZE) + size = GCS_MAX_SIZE; + + unsigned long flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN; + void *base = map_shadow_stack (NULL, size, flags); + if (base == MAP_FAILED) + return NULL; + + *ss_base = base; + *ss_size = size; + + uint64_t *gcsp = (uint64_t *) ((char *) base + size); + /* Skip end of GCS token. */ + gcsp--; + /* Verify GCS cap token. */ + gcsp--; + if (((uint64_t)gcsp & 0xfffffffffffff000) + 1 != *gcsp) + { + __munmap (base, size); + return NULL; + } + /* Return the target GCS pointer for context switch. */ + return gcsp + 1; +} diff --git a/sysdeps/aarch64/__arm_za_disable.S b/sysdeps/aarch64/__arm_za_disable.S index 649891e..6290803 100644 --- a/sysdeps/aarch64/__arm_za_disable.S +++ b/sysdeps/aarch64/__arm_za_disable.S @@ -1,5 +1,5 @@ /* Libc internal support routine for SME. - Copyright (C) 2023 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S index 7b6add7..38efddb 100644 --- a/sysdeps/aarch64/__longjmp.S +++ b/sysdeps/aarch64/__longjmp.S @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. +/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
@@ -91,6 +91,36 @@ ENTRY (__longjmp) ldp d12, d13, [x0, #JB_D12<<3] ldp d14, d15, [x0, #JB_D14<<3] + /* GCS support. */ + mov x16, 1 + CHKFEAT_X16 + tbnz x16, 0, L(gcs_done) + MRS_GCSPR (x2) + ldr x3, [x0, #JB_GCSPR] + mov x4, x3 + /* x2: GCSPR now. x3, x4: target GCSPR. x5, x6: tmp regs. */ +L(gcs_scan): + cmp x2, x4 + b.eq L(gcs_pop) + sub x4, x4, 8 + /* Check for a cap token. */ + ldr x5, [x4] + and x6, x4, 0xfffffffffffff000 + orr x6, x6, 1 + cmp x5, x6 + b.ne L(gcs_scan) +L(gcs_switch): + add x2, x4, 8 + GCSSS1 (x4) + GCSSS2 (xzr) +L(gcs_pop): + cmp x2, x3 + b.eq L(gcs_done) + GCSPOPM (xzr) + add x2, x2, 8 + b L(gcs_pop) +L(gcs_done): + /* Originally this was implemented with a series of .cfi_restore() directives. diff --git a/sysdeps/aarch64/__mtag_tag_region.S b/sysdeps/aarch64/__mtag_tag_region.S index 22e8d8b..9b10596 100644 --- a/sysdeps/aarch64/__mtag_tag_region.S +++ b/sysdeps/aarch64/__mtag_tag_region.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2020-2024 Free Software Foundation, Inc. +/* Copyright (C) 2020-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/__mtag_tag_zero_region.S b/sysdeps/aarch64/__mtag_tag_zero_region.S index 566698e..e7d4117 100644 --- a/sysdeps/aarch64/__mtag_tag_zero_region.S +++ b/sysdeps/aarch64/__mtag_tag_zero_region.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2020-2024 Free Software Foundation, Inc. +/* Copyright (C) 2020-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/aarch64-gcs.h index 6c7f60b..162ef18 100644 --- a/sysdeps/aarch64/memset-reg.h +++ b/sysdeps/aarch64/aarch64-gcs.h @@ -1,6 +1,7 @@ -/* Register aliases for memset to be used across implementations. - Copyright (C) 2017-2024 Free Software Foundation, Inc. +/* AArch64 GCS (Guarded Control Stack) declarations. This file is part of the GNU C Library. + Copyright (C) 2024-2025 Free Software Foundation, Inc. 
+ Copyright The GNU Toolchain Authors. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -16,15 +17,12 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#define dstin x0 -#define val x1 -#define valw w1 -#define count x2 -#define dst x3 -#define dstend x4 -#define tmp1 x5 -#define tmp1w w5 -#define tmp2 x6 -#define tmp2w w6 -#define zva_len x7 -#define zva_lenw w7 +#ifndef _AARCH64_GCS_H +#define _AARCH64_GCS_H + +#include <stddef.h> +#include <stdbool.h> + +void *__alloc_gcs (size_t, void **, size_t *) attribute_hidden; + +#endif diff --git a/sysdeps/aarch64/atomic-machine.h b/sysdeps/aarch64/atomic-machine.h index 405f984..b4e8d85 100644 --- a/sysdeps/aarch64/atomic-machine.h +++ b/sysdeps/aarch64/atomic-machine.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2024 Free Software Foundation, Inc. +/* Copyright (C) 2003-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/bits/fenv.h b/sysdeps/aarch64/bits/fenv.h index cefa660..ce73980 100644 --- a/sysdeps/aarch64/bits/fenv.h +++ b/sysdeps/aarch64/bits/fenv.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2004-2024 Free Software Foundation, Inc. +/* Copyright (C) 2004-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/bits/fp-fast.h b/sysdeps/aarch64/bits/fp-fast.h index f36e636..6448bc1 100644 --- a/sysdeps/aarch64/bits/fp-fast.h +++ b/sysdeps/aarch64/bits/fp-fast.h @@ -1,5 +1,5 @@ /* Define FP_FAST_* macros. AArch64 version. - Copyright (C) 2016-2024 Free Software Foundation, Inc. + Copyright (C) 2016-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/bits/indirect-return.h b/sysdeps/aarch64/bits/indirect-return.h new file mode 100644 index 0000000..f0c4d2a --- /dev/null +++ b/sysdeps/aarch64/bits/indirect-return.h @@ -0,0 +1,36 @@ +/* Definition of __INDIRECT_RETURN. AArch64 version. + Copyright (C) 2024-2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _UCONTEXT_H +# error "Never include <bits/indirect-return.h> directly; use <ucontext.h> instead." +#endif + +/* __INDIRECT_RETURN indicates that swapcontext may return via + an indirect branch. This happens when GCS is enabled, so + add the attribute if available, otherwise returns_twice has + a similar effect, but it prevents some code transformations + that can cause build failures in some rare cases so it is + only used when GCS is enabled. 
*/ +#if __glibc_has_attribute (__indirect_return__) +# define __INDIRECT_RETURN __attribute__ ((__indirect_return__)) +#elif __glibc_has_attribute (__returns_twice__) \ + && defined __ARM_FEATURE_GCS_DEFAULT +# define __INDIRECT_RETURN __attribute__ ((__returns_twice__)) +#else +# define __INDIRECT_RETURN +#endif diff --git a/sysdeps/aarch64/bits/link.h b/sysdeps/aarch64/bits/link.h index e0be0a2..aacea19 100644 --- a/sysdeps/aarch64/bits/link.h +++ b/sysdeps/aarch64/bits/link.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2024 Free Software Foundation, Inc. +/* Copyright (C) 2005-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/bits/setjmp.h b/sysdeps/aarch64/bits/setjmp.h index e8a488d..54a338b 100644 --- a/sysdeps/aarch64/bits/setjmp.h +++ b/sysdeps/aarch64/bits/setjmp.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. +/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/bits/wordsize.h b/sysdeps/aarch64/bits/wordsize.h index 118e591..f56260c 100644 --- a/sysdeps/aarch64/bits/wordsize.h +++ b/sysdeps/aarch64/bits/wordsize.h @@ -1,6 +1,6 @@ /* Determine the wordsize from the preprocessor defines. - Copyright (C) 2016-2024 Free Software Foundation, Inc. + Copyright (C) 2016-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/cpu-features.h b/sysdeps/aarch64/cpu-features.h index bc8d842..ef4e947 100644 --- a/sysdeps/aarch64/cpu-features.h +++ b/sysdeps/aarch64/cpu-features.h @@ -1,6 +1,6 @@ /* Initialize CPU feature data. AArch64 version. This file is part of the GNU C Library. - Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright (C) 2017-2025 Free Software Foundation, Inc. Copyright The GNU Toolchain Authors. 
The GNU C Library is free software; you can redistribute it and/or @@ -40,14 +40,6 @@ #define MIDR_IMPLEMENTOR(midr) \ (((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT) -#define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \ - && MIDR_PARTNUM(midr) == 0x0a1) - -#define IS_THUNDERX2PA(midr) (MIDR_IMPLEMENTOR(midr) == 'B' \ - && MIDR_PARTNUM(midr) == 0x516) -#define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \ - && MIDR_PARTNUM(midr) == 0xaf) - #define IS_EMAG(midr) (MIDR_IMPLEMENTOR(midr) == 'P' \ && MIDR_PARTNUM(midr) == 0x000) diff --git a/sysdeps/aarch64/crti.S b/sysdeps/aarch64/crti.S index e54cb02..80da5dc 100644 --- a/sysdeps/aarch64/crti.S +++ b/sysdeps/aarch64/crti.S @@ -1,5 +1,5 @@ /* Special .init and .fini section support for AArch64. - Copyright (C) 1995-2024 Free Software Foundation, Inc. + Copyright (C) 1995-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/crtn.S b/sysdeps/aarch64/crtn.S index 3220e45..b52b10e 100644 --- a/sysdeps/aarch64/crtn.S +++ b/sysdeps/aarch64/crtn.S @@ -1,5 +1,5 @@ /* Special .init and .fini section support for AArch64. - Copyright (C) 1995-2024 Free Software Foundation, Inc. + Copyright (C) 1995-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/dl-audit-check.h b/sysdeps/aarch64/dl-audit-check.h index 51f81eb..17bae43 100644 --- a/sysdeps/aarch64/dl-audit-check.h +++ b/sysdeps/aarch64/dl-audit-check.h @@ -1,5 +1,5 @@ /* rtld-audit version check. AArch64 version. - Copyright (C) 2022-2024 Free Software Foundation, Inc. + Copyright (C) 2022-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/dl-bti.c b/sysdeps/aarch64/dl-bti.c index fd0d308..de61304 100644 --- a/sysdeps/aarch64/dl-bti.c +++ b/sysdeps/aarch64/dl-bti.c @@ -1,5 +1,5 @@ /* AArch64 BTI functions. 
- Copyright (C) 2020-2024 Free Software Foundation, Inc. + Copyright (C) 2020-2025 Free Software Foundation, Inc. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -84,10 +84,9 @@ _dl_bti_check (struct link_map *l, const char *program) if (l->l_mach.bti_fail) bti_failed (l, program); - unsigned int i = l->l_searchlist.r_nlist; - while (i-- > 0) + for (unsigned int i = 0; i < l->l_searchlist.r_nlist; i++) { - struct link_map *dep = l->l_initfini[i]; + struct link_map *dep = l->l_searchlist.r_list[i]; if (dep->l_mach.bti_fail) bti_failed (dep, program); } diff --git a/sysdeps/aarch64/dl-diagnostics-cpu.c b/sysdeps/aarch64/dl-diagnostics-cpu.c index e037e6e..9130d2b 100644 --- a/sysdeps/aarch64/dl-diagnostics-cpu.c +++ b/sysdeps/aarch64/dl-diagnostics-cpu.c @@ -1,5 +1,5 @@ /* Print CPU diagnostics data in ld.so. AArch64 version. - Copyright (C) 2021-2024 Free Software Foundation, Inc. + Copyright (C) 2021-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/dl-dtprocnum.h b/sysdeps/aarch64/dl-dtprocnum.h index cf17995..3358031 100644 --- a/sysdeps/aarch64/dl-dtprocnum.h +++ b/sysdeps/aarch64/dl-dtprocnum.h @@ -1,5 +1,5 @@ /* Configuration of lookup functions. AArch64 version. - Copyright (C) 2019-2024 Free Software Foundation, Inc. + Copyright (C) 2019-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/dl-gcs.c b/sysdeps/aarch64/dl-gcs.c new file mode 100644 index 0000000..4ac86a5 --- /dev/null +++ b/sysdeps/aarch64/dl-gcs.c @@ -0,0 +1,121 @@ +/* AArch64 GCS functions. + Copyright (C) 2024-2025 Free Software Foundation, Inc. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <unistd.h> +#include <ldsodefs.h> + +/* GCS is disabled. */ +#define GCS_POLICY_DISABLED 0 + +/* Enable GCS, abort if unmarked binary is found. */ +#define GCS_POLICY_ENFORCED 1 + +/* Optionally enable GCS if all startup dependencies are marked. */ +#define GCS_POLICY_OPTIONAL 2 + +/* Override binary marking and always enable GCS. */ +#define GCS_POLICY_OVERRIDE 3 + +static void +fail (struct link_map *l, const char *program) +{ + if (program && program[0]) + _dl_fatal_printf ("%s: %s: %s\n", program, l->l_name, "not GCS compatible"); + else if (program) + _dl_fatal_printf ("%s\n", "not GCS compatible"); + else + _dl_signal_error (0, l->l_name, "dlopen", "not GCS compatible"); +} + +static void +unsupported (void) +{ + _dl_fatal_printf ("%s\n", "unsupported GCS policy"); +} + +/* This function is called only when binary markings are not + ignored and GCS is supposed to be enabled. This occurs + for the GCS_POLICY_ENFORCED and GCS_POLICY_OPTIONAL policies. */ +static bool +check_gcs (struct link_map *l, const char *program, bool enforced) +{ +#ifdef SHARED + /* Ignore GCS marking on ld.so: its properties are not processed. */ + if (is_rtld_link_map (l->l_real)) + return true; +#endif + /* Binary is marked, all good.
*/ + if (l->l_mach.gcs) + return true; + /* Binary is not marked and loaded via dlopen: abort. */ + if (program == NULL) + fail (l, program); + /* Binary is not marked and we enforce GCS: abort. */ + if (enforced) + fail (l, program); + /* Binary is not marked but GCS is optional: disable GCS. */ + else + { + GL(dl_aarch64_gcs) = 0; + return false; + } + __builtin_unreachable (); +} + +/* Iterate over all dependencies and check GCS marking. + This function is called only when binary markings are not + ignored and GCS is supposed to be enabled. This occurs + for the GCS_POLICY_ENFORCED and GCS_POLICY_OPTIONAL policies. + We interrupt checking if GCS is optional and we already know + it is going to be disabled. */ +static void +check_gcs_depends (struct link_map *l, const char *program, bool enforced) +{ + if (check_gcs (l, program, enforced)) + for (unsigned int i = 0; i < l->l_searchlist.r_nlist; i++) + if (!check_gcs (l->l_searchlist.r_list[i], program, enforced)) + break; +} + +/* Apply GCS policy for L and its dependencies. + PROGRAM is NULL when this check is invoked for dl_open. */ +void +_dl_gcs_check (struct link_map *l, const char *program) +{ + unsigned long policy = GL (dl_aarch64_gcs); + switch (policy) + { + case GCS_POLICY_DISABLED: + case GCS_POLICY_OVERRIDE: + return; + case GCS_POLICY_ENFORCED: + check_gcs_depends (l, program, true); + return; + case GCS_POLICY_OPTIONAL: + check_gcs_depends (l, program, false); + return; + default: + /* All other policy values are not supported: abort. */ + unsupported (); + } +} + +/* Used to report error when prctl system call to enable GCS fails. */ +void _dl_gcs_enable_failed (int code) +{ + _dl_fatal_printf ("failed to enable GCS: %d\n", -code); +} diff --git a/sysdeps/aarch64/dl-irel.h b/sysdeps/aarch64/dl-irel.h index a0fbbbd..e157290 100644 --- a/sysdeps/aarch64/dl-irel.h +++ b/sysdeps/aarch64/dl-irel.h @@ -1,6 +1,6 @@ /* Machine-dependent ELF indirect relocation inline functions. AArch64 version.
- Copyright (C) 2012-2024 Free Software Foundation, Inc. + Copyright (C) 2012-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/dl-lookupcfg.h b/sysdeps/aarch64/dl-lookupcfg.h index 3cc8e34..52ce8a7 100644 --- a/sysdeps/aarch64/dl-lookupcfg.h +++ b/sysdeps/aarch64/dl-lookupcfg.h @@ -1,5 +1,5 @@ /* Configuration of lookup functions. - Copyright (C) 2006-2024 Free Software Foundation, Inc. + Copyright (C) 2006-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h index 71135aa..bb8f8a9 100644 --- a/sysdeps/aarch64/dl-machine.h +++ b/sysdeps/aarch64/dl-machine.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1995-2024 Free Software Foundation, Inc. +/* Copyright (C) 1995-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/dl-prop.h b/sysdeps/aarch64/dl-prop.h index df05c02..abca2be 100644 --- a/sysdeps/aarch64/dl-prop.h +++ b/sysdeps/aarch64/dl-prop.h @@ -1,5 +1,5 @@ /* Support for GNU properties. AArch64 version. - Copyright (C) 2018-2024 Free Software Foundation, Inc. + Copyright (C) 2018-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -24,16 +24,21 @@ extern void _dl_bti_protect (struct link_map *, int) attribute_hidden; extern void _dl_bti_check (struct link_map *, const char *) attribute_hidden; +extern void _dl_gcs_check (struct link_map *, const char *) + attribute_hidden; + static inline void __attribute__ ((always_inline)) _rtld_main_check (struct link_map *m, const char *program) { _dl_bti_check (m, program); + _dl_gcs_check (m, program); } static inline void __attribute__ ((always_inline)) _dl_open_check (struct link_map *m) { _dl_bti_check (m, NULL); + _dl_gcs_check (m, NULL); } static inline void __attribute__ ((always_inline)) @@ -45,10 +50,6 @@ static inline int _dl_process_gnu_property (struct link_map *l, int fd, uint32_t type, uint32_t datasz, void *data) { - if (!GLRO(dl_aarch64_cpu_features).bti) - /* Skip note processing. */ - return 0; - if (type == GNU_PROPERTY_AARCH64_FEATURE_1_AND) { /* Stop if the property note is ill-formed. */ @@ -57,7 +58,11 @@ _dl_process_gnu_property (struct link_map *l, int fd, uint32_t type, unsigned int feature_1 = *(unsigned int *) data; if (feature_1 & GNU_PROPERTY_AARCH64_FEATURE_1_BTI) - _dl_bti_protect (l, fd); + if (GLRO(dl_aarch64_cpu_features).bti) + _dl_bti_protect (l, fd); + + if (feature_1 & GNU_PROPERTY_AARCH64_FEATURE_1_GCS) + l->l_mach.gcs = 1; /* Stop if we processed the property note. */ return 0; diff --git a/sysdeps/aarch64/dl-start.S b/sysdeps/aarch64/dl-start.S index d645484..a249fda 100644 --- a/sysdeps/aarch64/dl-start.S +++ b/sysdeps/aarch64/dl-start.S @@ -1,5 +1,5 @@ /* ld.so _start code. - Copyright (C) 2022-2024 Free Software Foundation, Inc. + Copyright (C) 2022-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -18,6 +18,7 @@ <https://www.gnu.org/licenses/>. 
*/ #include <sysdep.h> +#include <rtld-global-offsets.h> ENTRY (_start) /* Create an initial frame with 0 LR and FP */ @@ -25,11 +26,33 @@ ENTRY (_start) mov x29, #0 mov x30, #0 + /* Load and relocate all library dependencies. */ mov x0, sp PTR_ARG (0) bl _dl_start /* Returns user entry point in x0. */ mov PTR_REG (21), PTR_REG (0) + + /* Use GL(dl_aarch64_gcs) to set the shadow stack status. */ + adrp x16, _rtld_local + add PTR_REG (16), PTR_REG (16), :lo12:_rtld_local + ldr x1, [x16, GL_DL_AARCH64_GCS_OFFSET] + cbz x1, L(skip_gcs_enable) + + /* Enable GCS before user code runs. Note that IFUNC resolvers and + LD_AUDIT hooks may run before, but should not create threads. */ +#define PR_SET_SHADOW_STACK_STATUS 75 +#define PR_SHADOW_STACK_ENABLE (1UL << 0) + mov x0, PR_SET_SHADOW_STACK_STATUS + mov x1, PR_SHADOW_STACK_ENABLE + mov x2, 0 + mov x3, 0 + mov x4, 0 + mov x8, #SYS_ify(prctl) + svc 0x0 + cbnz w0, L(failed_gcs_enable) +L(skip_gcs_enable): + .globl _dl_start_user .type _dl_start_user, %function _dl_start_user: @@ -40,14 +63,17 @@ _dl_start_user: /* Compute envp. */ add PTR_REG (3), PTR_REG (2), PTR_REG (1), lsl PTR_LOG_SIZE add PTR_REG (3), PTR_REG (3), PTR_SIZE - adrp x16, _rtld_local - add PTR_REG (16), PTR_REG (16), :lo12:_rtld_local + /* Run the init functions of the loaded modules. */ ldr PTR_REG (0), [x16] bl _dl_init /* Load the finalizer function. */ adrp x0, _dl_fini add PTR_REG (0), PTR_REG (0), :lo12:_dl_fini /* Jump to the user's entry point. */ - mov x16, x21 - br x16 + mov x16, x21 + br x16 + +L(failed_gcs_enable): + b _dl_gcs_enable_failed + END (_start) diff --git a/sysdeps/aarch64/dl-tls.h b/sysdeps/aarch64/dl-tls.h deleted file mode 100644 index c7c8c11..0000000 --- a/sysdeps/aarch64/dl-tls.h +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (C) 2005-2024 Free Software Foundation, Inc. - - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation; either version 2.1 of the - License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -/* Type used for the representation of TLS information in the GOT. */ -typedef struct -{ - unsigned long int ti_module; - unsigned long int ti_offset; -} tls_index; - - -extern void *__tls_get_addr (tls_index *ti); diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S index 4febf2a..6045b2e 100644 --- a/sysdeps/aarch64/dl-tlsdesc.S +++ b/sysdeps/aarch64/dl-tlsdesc.S @@ -1,6 +1,6 @@ /* Thread-local storage handling in the ELF dynamic linker. AArch64 version. - Copyright (C) 2011-2024 Free Software Foundation, Inc. + Copyright (C) 2011-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/dl-tlsdesc.h b/sysdeps/aarch64/dl-tlsdesc.h index 6a512cb..c850fc8 100644 --- a/sysdeps/aarch64/dl-tlsdesc.h +++ b/sysdeps/aarch64/dl-tlsdesc.h @@ -1,6 +1,6 @@ /* Thread-local storage descriptor handling in the ELF dynamic linker. AArch64 version. - Copyright (C) 2011-2024 Free Software Foundation, Inc. + Copyright (C) 2011-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S index 8360756..e9fce9a 100644 --- a/sysdeps/aarch64/dl-trampoline.S +++ b/sysdeps/aarch64/dl-trampoline.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2024 Free Software Foundation, Inc. +/* Copyright (C) 2005-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/dl-tunables.list b/sysdeps/aarch64/dl-tunables.list index 92c6cbf..d461c1e 100644 --- a/sysdeps/aarch64/dl-tunables.list +++ b/sysdeps/aarch64/dl-tunables.list @@ -1,5 +1,5 @@ # aarch64 specific tunables. -# Copyright (C) 2017-2024 Free Software Foundation, Inc. +# Copyright (C) 2017-2025 Free Software Foundation, Inc. # This file is part of the GNU C Library. # The GNU C Library is free software; you can redistribute it and/or @@ -21,5 +21,11 @@ glibc { name { type: STRING } + aarch64_gcs { + type: UINT_64 + minval: 0 + maxval: 3 + default: 0 + } } } diff --git a/sysdeps/aarch64/e_sqrtl.c b/sysdeps/aarch64/e_sqrtl.c index 0eda6c6..3885ab5 100644 --- a/sysdeps/aarch64/e_sqrtl.c +++ b/sysdeps/aarch64/e_sqrtl.c @@ -1,5 +1,5 @@ /* long double square root in software floating-point emulation. - Copyright (C) 1997-2024 Free Software Foundation, Inc. + Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/elf-initfini.h b/sysdeps/aarch64/elf-initfini.h index 9b5fdcd..e603415 100644 --- a/sysdeps/aarch64/elf-initfini.h +++ b/sysdeps/aarch64/elf-initfini.h @@ -1,5 +1,5 @@ /* Determine DT_INIT/DT_FINI support in the dynamic loader. AArch64 version. - Copyright (C) 2020-2024 Free Software Foundation, Inc. + Copyright (C) 2020-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile index 234a6c4..aadedf1 100644 --- a/sysdeps/aarch64/fpu/Makefile +++ b/sysdeps/aarch64/fpu/Makefile @@ -8,6 +8,7 @@ libmvec-supported-funcs = acos \ cbrt \ cos \ cosh \ + cospi \ erf \ erfc \ exp \ @@ -22,8 +23,10 @@ libmvec-supported-funcs = acos \ pow \ sin \ sinh \ + sinpi \ tan \ - tanh + tanh \ + tanpi float-advsimd-funcs = $(libmvec-supported-funcs) double-advsimd-funcs = $(libmvec-supported-funcs) @@ -41,8 +44,6 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \ v_log10_data \ erf_data \ erff_data \ - sv_erf_data \ - sv_erff_data \ v_exp_tail_data \ erfc_data \ erfcf_data \ diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions index cc15ce2..0f9503f 100644 --- a/sysdeps/aarch64/fpu/Versions +++ b/sysdeps/aarch64/fpu/Versions @@ -135,4 +135,26 @@ libmvec { _ZGVsMxv_tanh; _ZGVsMxv_tanhf; } + GLIBC_2.41 { + _ZGVnN2v_cospi; + _ZGVnN2v_cospif; + _ZGVnN4v_cospif; + _ZGVsMxv_cospi; + _ZGVsMxv_cospif; + _ZGVnN2v_logp1; + _ZGVnN2v_logp1f; + _ZGVnN4v_logp1f; + _ZGVsMxv_logp1; + _ZGVsMxv_logp1f; + _ZGVnN2v_sinpi; + _ZGVnN2v_sinpif; + _ZGVnN4v_sinpif; + _ZGVsMxv_sinpi; + _ZGVsMxv_sinpif; + _ZGVnN2v_tanpi; + _ZGVnN2v_tanpif; + _ZGVnN4v_tanpif; + _ZGVsMxv_tanpi; + _ZGVsMxv_tanpif; + } } diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c index 0a86c98..7709b54 100644 --- a/sysdeps/aarch64/fpu/acos_advsimd.c +++ b/sysdeps/aarch64/fpu/acos_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision AdvSIMD inverse cos - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/acos_sve.c b/sysdeps/aarch64/fpu/acos_sve.c index 99dbfac..74e2f7d 100644 --- a/sysdeps/aarch64/fpu/acos_sve.c +++ b/sysdeps/aarch64/fpu/acos_sve.c @@ -1,6 +1,6 @@ /* Double-precision SVE inverse cos - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/acosf_advsimd.c b/sysdeps/aarch64/fpu/acosf_advsimd.c index 2ee8fba..4bdccf6 100644 --- a/sysdeps/aarch64/fpu/acosf_advsimd.c +++ b/sysdeps/aarch64/fpu/acosf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision AdvSIMD inverse cos - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/acosf_sve.c b/sysdeps/aarch64/fpu/acosf_sve.c index 2701cce..95cf580 100644 --- a/sysdeps/aarch64/fpu/acosf_sve.c +++ b/sysdeps/aarch64/fpu/acosf_sve.c @@ -1,6 +1,6 @@ /* Single-precision SVE inverse cos - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c index c88283c..d7dab02 100644 --- a/sysdeps/aarch64/fpu/acosh_advsimd.c +++ b/sysdeps/aarch64/fpu/acosh_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) acosh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -54,9 +54,8 @@ VPCS_ATTR float64x2_t V_NAME_D1 (acosh) (float64x2_t x) x = vbslq_f64 (special, vreinterpretq_f64_u64 (d->one), x); #endif - float64x2_t xm1 = vsubq_f64 (x, v_f64 (1)); - float64x2_t y; - y = vaddq_f64 (x, v_f64 (1)); + float64x2_t xm1 = vsubq_f64 (x, v_f64 (1.0)); + float64x2_t y = vaddq_f64 (x, v_f64 (1.0)); y = vmulq_f64 (y, xm1); y = vsqrtq_f64 (y); y = vaddq_f64 (xm1, y); diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c index 3e4faaa..326b2cc 100644 --- a/sysdeps/aarch64/fpu/acosh_sve.c +++ b/sysdeps/aarch64/fpu/acosh_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) acosh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c index 8916dcbf4..61d41d1 100644 --- a/sysdeps/aarch64/fpu/acoshf_advsimd.c +++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) acosh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -25,35 +25,32 @@ const static struct data { struct v_log1pf_data log1pf_consts; uint32x4_t one; - uint16x4_t thresh; -} data = { - .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, - .one = V4 (0x3f800000), - .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */ -}; +} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) }; + +#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). 
*/ static float32x4_t NOINLINE VPCS_ATTR special_case (float32x4_t x, float32x4_t y, uint16x4_t special, - const struct v_log1pf_data d) + const struct v_log1pf_data *d) { return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special)); } /* Vector approximation for single-precision acosh, based on log1p. Maximum error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it - is 2.78 ULP: - __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3 - want 0x1.ef9ea2p-3. + is 3.00 ULP: + _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4 + want 0x1.ef0a7cp-4. With exceptions disabled, we can compute u with a shorter dependency chain, - which gives maximum error of 3.07 ULP: - __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4 - want 0x1.fbc7f4p-4. */ + which gives maximum error of 3.22 ULP: + _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5 + want 0x1.fdcdd2p-5. */ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x) { const struct data *d = ptr_barrier (&data); uint32x4_t ix = vreinterpretq_u32_f32 (x); - uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh); + uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh); #if WANT_SIMD_EXCEPT /* Mask special lanes with 1 to side-step spurious invalid or overflow. 
Use @@ -64,15 +61,16 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x) float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p); float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1); #else - float32x4_t xm1 = vsubq_f32 (x, v_f32 (1)); - float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f))); + float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one)); + float32x4_t u + = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one))); #endif float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u)); if (__glibc_unlikely (v_any_u16h (special))) - return special_case (x, y, special, d->log1pf_consts); - return log1pf_inline (y, d->log1pf_consts); + return special_case (x, y, special, &d->log1pf_consts); + return log1pf_inline (y, &d->log1pf_consts); } libmvec_hidden_def (V_NAME_F1 (acosh)) HALF_WIDTH_ALIAS_F1 (acosh) diff --git a/sysdeps/aarch64/fpu/acoshf_sve.c b/sysdeps/aarch64/fpu/acoshf_sve.c index 2110894..74adac7 100644 --- a/sysdeps/aarch64/fpu/acoshf_sve.c +++ b/sysdeps/aarch64/fpu/acoshf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) acosh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,23 +17,26 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ +#include "sv_math.h" +#include "sv_log1pf_inline.h" + #define One 0x3f800000 #define Thres 0x20000000 /* asuint(0x1p64) - One. */ -#include "sv_log1pf_inline.h" - static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +special_case (svfloat32_t xm1, svfloat32_t tmp, svbool_t special) { + svfloat32_t x = svadd_x (svptrue_b32 (), xm1, 1.0f); + svfloat32_t y = sv_log1pf_inline (tmp, svptrue_b32 ()); return sv_call_f32 (acoshf, x, y, special); } /* Single-precision SVE acosh(x) routine. 
Implements the same algorithm as vector acoshf and log1p. - Maximum error is 2.78 ULPs: - SV_NAME_F1 (acosh) (0x1.01e996p+0) got 0x1.f45b42p-4 - want 0x1.f45b3cp-4. */ + Maximum error is 2.47 ULPs: + SV_NAME_F1 (acosh) (0x1.01ca76p+0) got 0x1.e435a6p-4 + want 0x1.e435a2p-4. */ svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg) { svuint32_t ix = svreinterpret_u32 (x); @@ -41,9 +44,9 @@ svfloat32_t SV_NAME_F1 (acosh) (svfloat32_t x, const svbool_t pg) svfloat32_t xm1 = svsub_x (pg, x, 1.0f); svfloat32_t u = svmul_x (pg, xm1, svadd_x (pg, x, 1.0f)); - svfloat32_t y = sv_log1pf_inline (svadd_x (pg, xm1, svsqrt_x (pg, u)), pg); + svfloat32_t tmp = svadd_x (pg, xm1, svsqrt_x (pg, u)); if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, y, special); - return y; + return special_case (xm1, tmp, special); + return sv_log1pf_inline (tmp, pg); } diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h index 097d403..38681a4 100644 --- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h +++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h @@ -1,6 +1,6 @@ /* Hidden prototypes for single-precision AdvSIMD routines - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -26,6 +26,7 @@ libmvec_hidden_proto (V_NAME_F1(atanh)); libmvec_hidden_proto (V_NAME_F1(cbrt)); libmvec_hidden_proto (V_NAME_F1(cos)); libmvec_hidden_proto (V_NAME_F1(cosh)); +libmvec_hidden_proto (V_NAME_F1(cospi)); libmvec_hidden_proto (V_NAME_F1(erf)); libmvec_hidden_proto (V_NAME_F1(erfc)); libmvec_hidden_proto (V_NAME_F1(exp10)); @@ -36,10 +37,13 @@ libmvec_hidden_proto (V_NAME_F2(hypot)); libmvec_hidden_proto (V_NAME_F1(log10)); libmvec_hidden_proto (V_NAME_F1(log1p)); libmvec_hidden_proto (V_NAME_F1(log2)); +libmvec_hidden_proto (V_NAME_F1(logp1)); libmvec_hidden_proto (V_NAME_F1(log)); libmvec_hidden_proto (V_NAME_F2(pow)); libmvec_hidden_proto (V_NAME_F1(sin)); libmvec_hidden_proto (V_NAME_F1(sinh)); +libmvec_hidden_proto (V_NAME_F1(sinpi)); libmvec_hidden_proto (V_NAME_F1(tan)); libmvec_hidden_proto (V_NAME_F1(tanh)); +libmvec_hidden_proto (V_NAME_F1(tanpi)); libmvec_hidden_proto (V_NAME_F2(atan2)); diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c index 2de6eff..4142116 100644 --- a/sysdeps/aarch64/fpu/asin_advsimd.c +++ b/sysdeps/aarch64/fpu/asin_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision AdvSIMD inverse sin - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/asin_sve.c b/sysdeps/aarch64/fpu/asin_sve.c index 9daa382..9314466 100644 --- a/sysdeps/aarch64/fpu/asin_sve.c +++ b/sysdeps/aarch64/fpu/asin_sve.c @@ -1,6 +1,6 @@ /* Double-precision SVE inverse sin - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/asinf_advsimd.c b/sysdeps/aarch64/fpu/asinf_advsimd.c index 59d870c..52c7c0e 100644 --- a/sysdeps/aarch64/fpu/asinf_advsimd.c +++ b/sysdeps/aarch64/fpu/asinf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision AdvSIMD inverse sin - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/asinf_sve.c b/sysdeps/aarch64/fpu/asinf_sve.c index 37c51e6..f2182ac 100644 --- a/sysdeps/aarch64/fpu/asinf_sve.c +++ b/sysdeps/aarch64/fpu/asinf_sve.c @@ -1,6 +1,6 @@ /* Single-precision SVE inverse sin - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c index 6207e7d..1afc072 100644 --- a/sysdeps/aarch64/fpu/asinh_advsimd.c +++ b/sysdeps/aarch64/fpu/asinh_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) asinh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -20,41 +20,71 @@ #include "v_math.h" #include "poly_advsimd_f64.h" -#define A(i) v_f64 (__v_log_data.poly[i]) -#define N (1 << V_LOG_TABLE_BITS) -#define IndexMask (N - 1) - const static struct data { - float64x2_t poly[18]; - uint64x2_t off, huge_bound, abs_mask; - float64x2_t ln2, tiny_bound; + uint64x2_t huge_bound, abs_mask, off, mask; +#if WANT_SIMD_EXCEPT + float64x2_t tiny_bound; +#endif + float64x2_t lc0, lc2; + double lc1, lc3, ln2, lc4; + + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17; + double c1, c3, c5, c7, c9, c11, c13, c15; + } data = { - .off = V2 (0x3fe6900900000000), - .ln2 = V2 (0x1.62e42fefa39efp-1), - .huge_bound = V2 (0x5fe0000000000000), + +#if WANT_SIMD_EXCEPT .tiny_bound = V2 (0x1p-26), - .abs_mask = V2 (0x7fffffffffffffff), +#endif /* Even terms of polynomial s.t. asinh(x) is approximated by asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...). Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). 
*/ - .poly = { V2 (-0x1.55555555554a7p-3), V2 (0x1.3333333326c7p-4), - V2 (-0x1.6db6db68332e6p-5), V2 (0x1.f1c71b26fb40dp-6), - V2 (-0x1.6e8b8b654a621p-6), V2 (0x1.1c4daa9e67871p-6), - V2 (-0x1.c9871d10885afp-7), V2 (0x1.7a16e8d9d2ecfp-7), - V2 (-0x1.3ddca533e9f54p-7), V2 (0x1.0becef748dafcp-7), - V2 (-0x1.b90c7099dd397p-8), V2 (0x1.541f2bb1ffe51p-8), - V2 (-0x1.d217026a669ecp-9), V2 (0x1.0b5c7977aaf7p-9), - V2 (-0x1.e0f37daef9127p-11), V2 (0x1.388b5fe542a6p-12), - V2 (-0x1.021a48685e287p-14), V2 (0x1.93d4ba83d34dap-18) }, + + .c0 = V2 (-0x1.55555555554a7p-3), + .c1 = 0x1.3333333326c7p-4, + .c2 = V2 (-0x1.6db6db68332e6p-5), + .c3 = 0x1.f1c71b26fb40dp-6, + .c4 = V2 (-0x1.6e8b8b654a621p-6), + .c5 = 0x1.1c4daa9e67871p-6, + .c6 = V2 (-0x1.c9871d10885afp-7), + .c7 = 0x1.7a16e8d9d2ecfp-7, + .c8 = V2 (-0x1.3ddca533e9f54p-7), + .c9 = 0x1.0becef748dafcp-7, + .c10 = V2 (-0x1.b90c7099dd397p-8), + .c11 = 0x1.541f2bb1ffe51p-8, + .c12 = V2 (-0x1.d217026a669ecp-9), + .c13 = 0x1.0b5c7977aaf7p-9, + .c14 = V2 (-0x1.e0f37daef9127p-11), + .c15 = 0x1.388b5fe542a6p-12, + .c16 = V2 (-0x1.021a48685e287p-14), + .c17 = V2 (0x1.93d4ba83d34dap-18), + + .lc0 = V2 (-0x1.ffffffffffff7p-2), + .lc1 = 0x1.55555555170d4p-2, + .lc2 = V2 (-0x1.0000000399c27p-2), + .lc3 = 0x1.999b2e90e94cap-3, + .lc4 = -0x1.554e550bd501ep-3, + .ln2 = 0x1.62e42fefa39efp-1, + + .off = V2 (0x3fe6900900000000), + .huge_bound = V2 (0x5fe0000000000000), + .abs_mask = V2 (0x7fffffffffffffff), + .mask = V2 (0xfffULL << 52), }; static float64x2_t NOINLINE VPCS_ATTR -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask, + uint64x2_t special) { + /* Copy sign. 
*/ + y = vbslq_f64 (abs_mask, y, x); return v_call_f64 (asinh, x, y, special); } +#define N (1 << V_LOG_TABLE_BITS) +#define IndexMask (N - 1) + struct entry { float64x2_t invc; @@ -76,27 +106,34 @@ lookup (uint64x2_t i) } static inline float64x2_t -log_inline (float64x2_t x, const struct data *d) +log_inline (float64x2_t xm, const struct data *d) { - /* Double-precision vector log, copied from ordinary vector log with some - cosmetic modification and special-cases removed. */ - uint64x2_t ix = vreinterpretq_u64_f64 (x); - uint64x2_t tmp = vsubq_u64 (ix, d->off); - int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); - uint64x2_t iz - = vsubq_u64 (ix, vandq_u64 (tmp, vdupq_n_u64 (0xfffULL << 52))); + + uint64x2_t u = vreinterpretq_u64_f64 (xm); + uint64x2_t u_off = vsubq_u64 (u, d->off); + + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->mask)); float64x2_t z = vreinterpretq_f64_u64 (iz); - struct entry e = lookup (tmp); + + struct entry e = lookup (u_off); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); float64x2_t kd = vcvtq_f64_s64 (k); - float64x2_t hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); + + /* hi = r + log(c) + k*Ln2. */ + float64x2_t ln2_and_lc4 = vld1q_f64 (&d->ln2); + float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_lc4, 0); + + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ + float64x2_t odd_coeffs = vld1q_f64 (&d->lc1); float64x2_t r2 = vmulq_f64 (r, r); - float64x2_t y = vfmaq_f64 (A (2), A (3), r); - float64x2_t p = vfmaq_f64 (A (0), A (1), r); - y = vfmaq_f64 (y, A (4), r2); - y = vfmaq_f64 (p, y, r2); - y = vfmaq_f64 (hi, y, r2); - return y; + float64x2_t y = vfmaq_laneq_f64 (d->lc2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->lc0, r, odd_coeffs, 0); + y = vfmaq_laneq_f64 (y, r2, ln2_and_lc4, 1); + y = vfmaq_f64 (p, r2, y); + return vfmaq_f64 (hi, y, r2); } /* Double-precision implementation of vector asinh(x). @@ -106,23 +143,24 @@ log_inline (float64x2_t x, const struct data *d) asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise where log(x) is an optimized log approximation, and P(x) is a polynomial - shared with the scalar routine. The greatest observed error 3.29 ULP, in + shared with the scalar routine. The greatest observed error 2.79 ULP, in |x| >= 1: - __v_asinh(0x1.2cd9d717e2c9bp+0) got 0x1.ffffcfd0e234fp-1 - want 0x1.ffffcfd0e2352p-1. */ + _ZGVnN2v_asinh(0x1.2cd9d73ea76a6p+0) got 0x1.ffffd003219dap-1 + want 0x1.ffffd003219ddp-1. */ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) { const struct data *d = ptr_barrier (&data); - float64x2_t ax = vabsq_f64 (x); - uint64x2_t iax = vreinterpretq_u64_f64 (ax); uint64x2_t gt1 = vcgeq_f64 (ax, v_f64 (1)); - uint64x2_t special = vcgeq_u64 (iax, d->huge_bound); #if WANT_SIMD_EXCEPT + uint64x2_t iax = vreinterpretq_u64_f64 (ax); + uint64x2_t special = vcgeq_u64 (iax, (d->huge_bound)); uint64x2_t tiny = vcltq_f64 (ax, d->tiny_bound); special = vorrq_u64 (special, tiny); +#else + uint64x2_t special = vcgeq_f64 (ax, vreinterpretq_f64_u64 (d->huge_bound)); #endif /* Option 1: |x| >= 1. @@ -147,19 +185,45 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) overflow, and tiny lanes, which will underflow, by setting them to 0. 
They will be fixed later, either by selecting x or falling back to the scalar special-case. The largest observed error in this region is 1.47 ULPs: - __v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 - want 0x1.c1d6bf874019cp-1. */ + _ZGVnN2v_asinh(0x1.fdfcd00cc1e6ap-1) got 0x1.c1d6bf874019bp-1 + want 0x1.c1d6bf874019cp-1. */ float64x2_t option_2 = v_f64 (0); + if (__glibc_likely (v_any_u64 (vceqzq_u64 (gt1)))) { + #if WANT_SIMD_EXCEPT ax = v_zerofy_f64 (ax, vorrq_u64 (tiny, gt1)); #endif - float64x2_t x2 = vmulq_f64 (ax, ax), x3 = vmulq_f64 (ax, x2), - z2 = vmulq_f64 (x2, x2), z4 = vmulq_f64 (z2, z2), - z8 = vmulq_f64 (z4, z4), z16 = vmulq_f64 (z8, z8); - float64x2_t p = v_estrin_17_f64 (x2, z2, z4, z8, z16, d->poly); - option_2 = vfmaq_f64 (ax, p, x3); + float64x2_t x2 = vmulq_f64 (ax, ax), z2 = vmulq_f64 (x2, x2); + /* Order-17 Pairwise Horner scheme. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, x2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, x2, c13, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, x2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, x2, c57, 1); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, x2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, x2, c911, 1); + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, x2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, x2, c1315, 1); + float64x2_t p1617 = vfmaq_f64 (d->c16, x2, d->c17); + + float64x2_t p = vfmaq_f64 (p1415, z2, p1617); + p = vfmaq_f64 (p1213, z2, p); + p = vfmaq_f64 (p1011, z2, p); + p = vfmaq_f64 (p89, z2, p); + + p = vfmaq_f64 (p67, z2, p); + p = vfmaq_f64 (p45, z2, p); + + p = vfmaq_f64 (p23, z2, p); + + p = vfmaq_f64 (p01, z2, p); + option_2 = vfmaq_f64 (ax, p, vmulq_f64 (ax, x2)); #if WANT_SIMD_EXCEPT option_2 = vbslq_f64 (tiny, x, option_2); #endif @@ -167,10 +231,10 @@ VPCS_ATTR 
float64x2_t V_NAME_D1 (asinh) (float64x2_t x) /* Choose the right option for each lane. */ float64x2_t y = vbslq_f64 (gt1, option_1, option_2); - /* Copy sign. */ - y = vbslq_f64 (d->abs_mask, y, x); - if (__glibc_unlikely (v_any_u64 (special))) - return special_case (x, y, special); - return y; + { + return special_case (x, y, d->abs_mask, special); + } + /* Copy sign. */ + return vbslq_f64 (d->abs_mask, y, x); } diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c index 28dc5c4..ff6b713 100644 --- a/sysdeps/aarch64/fpu/asinh_sve.c +++ b/sysdeps/aarch64/fpu/asinh_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) asinh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,36 +18,49 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f64.h" #define SignMask (0x8000000000000000) #define One (0x3ff0000000000000) #define Thres (0x5fe0000000000000) /* asuint64 (0x1p511). */ +#define IndexMask (((1 << V_LOG_TABLE_BITS) - 1) << 1) static const struct data { - double poly[18]; - double ln2, p3, p1, p4, p0, p2; - uint64_t n; - uint64_t off; + double even_coeffs[9]; + double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17; + uint64_t off, mask; } data = { - /* Polynomial generated using Remez on [2^-26, 1]. */ - .poly - = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5, - 0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6, - -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7, - 0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8, - -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11, - 0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18 }, + /* Polynomial generated using Remez on [2^-26, 1]. 
*/ + .even_coeffs ={ + -0x1.55555555554a7p-3, + -0x1.6db6db68332e6p-5, + -0x1.6e8b8b654a621p-6, + -0x1.c9871d10885afp-7, + -0x1.3ddca533e9f54p-7, + -0x1.b90c7099dd397p-8, + -0x1.d217026a669ecp-9, + -0x1.e0f37daef9127p-11, + -0x1.021a48685e287p-14, }, + + .c1 = 0x1.3333333326c7p-4, + .c3 = 0x1.f1c71b26fb40dp-6, + .c5 = 0x1.1c4daa9e67871p-6, + .c7 = 0x1.7a16e8d9d2ecfp-7, + .c9 = 0x1.0becef748dafcp-7, + .c11 = 0x1.541f2bb1ffe51p-8, + .c13 = 0x1.0b5c7977aaf7p-9, + .c15 = 0x1.388b5fe542a6p-12, + .c17 = 0x1.93d4ba83d34dap-18, + .ln2 = 0x1.62e42fefa39efp-1, .p0 = -0x1.ffffffffffff7p-2, .p1 = 0x1.55555555170d4p-2, .p2 = -0x1.0000000399c27p-2, .p3 = 0x1.999b2e90e94cap-3, .p4 = -0x1.554e550bd501ep-3, - .n = 1 << V_LOG_TABLE_BITS, - .off = 0x3fe6900900000000 + .off = 0x3fe6900900000000, + .mask = 0xfffULL << 52, }; static svfloat64_t NOINLINE @@ -64,11 +77,10 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg) of the algorithm used. */ svuint64_t ix = svreinterpret_u64 (x); - svuint64_t tmp = svsub_x (pg, ix, d->off); - svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), - (d->n - 1) << 1); - svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); - svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); + svuint64_t i_off = svsub_x (pg, ix, d->off); + svuint64_t i + = svand_x (pg, svlsr_x (pg, i_off, (51 - V_LOG_TABLE_BITS)), IndexMask); + svuint64_t iz = svsub_x (pg, ix, svand_x (pg, i_off, d->mask)); svfloat64_t z = svreinterpret_f64 (iz); svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i); @@ -78,14 +90,14 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg) svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1); svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z); - svfloat64_t kd = svcvt_f64_x (pg, k); + svfloat64_t kd + = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (i_off), 52)); svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0); - svfloat64_t r2 
= svmul_x (pg, r, r); - + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1); - svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0); + y = svmla_lane (y, r2, p1_p4, 1); y = svmla_x (pg, p, r2, y); y = svmla_x (pg, hi, r2, y); @@ -111,7 +123,6 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) svuint64_t iax = svbic_x (pg, ix, SignMask); svuint64_t sign = svand_x (pg, ix, SignMask); svfloat64_t ax = svreinterpret_f64 (iax); - svbool_t ge1 = svcmpge (pg, iax, One); svbool_t special = svcmpge (pg, iax, Thres); @@ -120,7 +131,7 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) svfloat64_t option_1 = sv_f64 (0); if (__glibc_likely (svptest_any (pg, ge1))) { - svfloat64_t x2 = svmul_x (pg, ax, ax); + svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax); option_1 = __sv_log_inline ( svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg); } @@ -130,21 +141,53 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) The largest observed error in this region is 1.51 ULPs: _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1 want 0x1.c1e649ee2681dp-1. */ + svfloat64_t option_2 = sv_f64 (0); if (__glibc_likely (svptest_any (pg, svnot_z (pg, ge1)))) { - svfloat64_t x2 = svmul_x (pg, ax, ax); - svfloat64_t x4 = svmul_x (pg, x2, x2); - svfloat64_t p = sv_pw_horner_17_f64_x (pg, x2, x4, d->poly); - option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax)); + svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax); + svfloat64_t x4 = svmul_x (svptrue_b64 (), x2, x2); + /* Order-17 Pairwise Horner scheme. 
*/ + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); + svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); + svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); + + svfloat64_t p01 = svmla_lane (sv_f64 (d->even_coeffs[0]), x2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->even_coeffs[1]), x2, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->even_coeffs[2]), x2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->even_coeffs[3]), x2, c57, 1); + svfloat64_t p89 = svmla_lane (sv_f64 (d->even_coeffs[4]), x2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->even_coeffs[5]), x2, c911, 1); + svfloat64_t p1213 + = svmla_lane (sv_f64 (d->even_coeffs[6]), x2, c1315, 0); + svfloat64_t p1415 + = svmla_lane (sv_f64 (d->even_coeffs[7]), x2, c1315, 1); + svfloat64_t p1617 = svmla_x (pg, sv_f64 (d->even_coeffs[8]), x2, d->c17); + + svfloat64_t p = svmla_x (pg, p1415, x4, p1617); + p = svmla_x (pg, p1213, x4, p); + p = svmla_x (pg, p1011, x4, p); + p = svmla_x (pg, p89, x4, p); + + p = svmla_x (pg, p67, x4, p); + p = svmla_x (pg, p45, x4, p); + + p = svmla_x (pg, p23, x4, p); + + p = svmla_x (pg, p01, x4, p); + + option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax)); } - /* Choose the right option for each lane. */ - svfloat64_t y = svsel (ge1, option_1, option_2); - if (__glibc_unlikely (svptest_any (pg, special))) return special_case ( - x, svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)), + x, + svreinterpret_f64 (sveor_x ( + pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)), special); + + /* Choose the right option for each lane. 
*/ + svfloat64_t y = svsel (ge1, option_1, option_2); return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); } diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c index 09fd8a6..cff66e5 100644 --- a/sysdeps/aarch64/fpu/asinhf_advsimd.c +++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) asinh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,16 +20,16 @@ #include "v_math.h" #include "v_log1pf_inline.h" -#define SignMask v_u32 (0x80000000) - const static struct data { struct v_log1pf_data log1pf_consts; + float32x4_t one; uint32x4_t big_bound; #if WANT_SIMD_EXCEPT uint32x4_t tiny_bound; #endif } data = { + .one = V4 (1), .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .big_bound = V4 (0x5f800000), /* asuint(0x1p64). */ #if WANT_SIMD_EXCEPT @@ -38,20 +38,27 @@ const static struct data }; static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +special_case (float32x4_t x, uint32x4_t sign, float32x4_t y, + uint32x4_t special, const struct data *d) { - return v_call_f32 (asinhf, x, y, special); + return v_call_f32 ( + asinhf, x, + vreinterpretq_f32_u32 (veorq_u32 ( + sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))), + special); } /* Single-precision implementation of vector asinh(x), using vector log1p. - Worst-case error is 2.66 ULP, at roughly +/-0.25: - __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */ + Worst-case error is 2.59 ULP: + _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3 + want 0x1.d449c4p-3. 
*/ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x) { const struct data *dat = ptr_barrier (&data); - uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask); - float32x4_t ax = vreinterpretq_f32_u32 (iax); + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); uint32x4_t special = vcgeq_u32 (iax, dat->big_bound); + uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax); float32x4_t special_arg = x; #if WANT_SIMD_EXCEPT @@ -68,13 +75,13 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x) /* asinh(x) = log(x + sqrt(x * x + 1)). For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */ float32x4_t d - = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x))); - float32x4_t y = log1pf_inline ( - vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts); + = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax))); + float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)); if (__glibc_unlikely (v_any_u32 (special))) - return special_case (special_arg, vbslq_f32 (SignMask, x, y), special); - return vbslq_f32 (SignMask, x, y); + return special_case (special_arg, sign, y, special, dat); + return vreinterpretq_f32_u32 (veorq_u32 ( + sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts)))); } libmvec_hidden_def (V_NAME_F1 (asinh)) HALF_WIDTH_ALIAS_F1 (asinh) diff --git a/sysdeps/aarch64/fpu/asinhf_sve.c b/sysdeps/aarch64/fpu/asinhf_sve.c index d85c3a6..f07b8a2 100644 --- a/sysdeps/aarch64/fpu/asinhf_sve.c +++ b/sysdeps/aarch64/fpu/asinhf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) asinh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,20 +20,23 @@ #include "sv_math.h" #include "sv_log1pf_inline.h" -#define BigBound (0x5f800000) /* asuint(0x1p64). 
*/ +#define BigBound 0x5f800000 /* asuint(0x1p64). */ static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +special_case (svuint32_t iax, svuint32_t sign, svfloat32_t y, svbool_t special) { + svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign)); + y = svreinterpret_f32 ( + svorr_x (svptrue_b32 (), sign, svreinterpret_u32 (y))); return sv_call_f32 (asinhf, x, y, special); } /* Single-precision SVE asinh(x) routine. Implements the same algorithm as vector asinhf and log1p. - Maximum error is 2.48 ULPs: - SV_NAME_F1 (asinh) (0x1.008864p-3) got 0x1.ffbbbcp-4 - want 0x1.ffbbb8p-4. */ + Maximum error is 1.92 ULPs: + SV_NAME_F1 (asinh) (-0x1.0922ecp-1) got -0x1.fd0bccp-2 + want -0x1.fd0bc8p-2. */ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg) { svfloat32_t ax = svabs_x (pg, x); @@ -49,8 +52,6 @@ svfloat32_t SV_NAME_F1 (asinh) (svfloat32_t x, const svbool_t pg) = sv_log1pf_inline (svadd_x (pg, ax, svdiv_x (pg, ax2, d)), pg); if (__glibc_unlikely (svptest_any (pg, special))) - return special_case ( - x, svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))), - special); + return special_case (iax, sign, y, special); return svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (y))); } diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c index b1e7a9b..00b4a4f 100644 --- a/sysdeps/aarch64/fpu/atan2_advsimd.c +++ b/sysdeps/aarch64/fpu/atan2_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision AdvSIMD atan2 - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -23,40 +23,57 @@ static const struct data { + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; float64x2_t pi_over_2; - float64x2_t poly[20]; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + uint64x2_t zeroinfnan, minustwo; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on - the interval [2**-1022, 1.0]. */ - .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), - V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), - V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), - V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), - V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), - V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), - V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), - V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), - V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), - V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, + [2**-1022, 1.0]. */ + .c0 = V2 (-0x1.5555555555555p-2), + .c1 = 0x1.99999999996c1p-3, + .c2 = V2 (-0x1.2492492478f88p-3), + .c3 = 0x1.c71c71bc3951cp-4, + .c4 = V2 (-0x1.745d160a7e368p-4), + .c5 = 0x1.3b139b6a88ba1p-4, + .c6 = V2 (-0x1.11100ee084227p-4), + .c7 = 0x1.e1d0f9696f63bp-5, + .c8 = V2 (-0x1.aebfe7b418581p-5), + .c9 = 0x1.842dbe9b0d916p-5, + .c10 = V2 (-0x1.5d30140ae5e99p-5), + .c11 = 0x1.338e31eb2fbbcp-5, + .c12 = V2 (-0x1.00e6eece7de8p-5), + .c13 = 0x1.860897b29e5efp-6, + .c14 = V2 (-0x1.0051381722a59p-6), + .c15 = 0x1.14e9dc19a4a4ep-7, + .c16 = V2 (-0x1.d0062b42fe3bfp-9), + .c17 = 0x1.17739e210171ap-10, + .c18 = V2 (-0x1.ab24da7be7402p-13), + .c19 = 0x1.358851160a528p-16, .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), + .minustwo = V2 (0xc000000000000000), }; #define SignMask v_u64 (0x8000000000000000) /* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). 
*/ static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp) +special_case (float64x2_t y, float64x2_t x, float64x2_t ret, + uint64x2_t sign_xy, uint64x2_t cmp) { + /* Account for the sign of x and y. */ + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); return v_call2_f64 (atan2, y, x, ret, cmp); } /* Returns 1 if input is the bit representation of 0, infinity or nan. */ static inline uint64x2_t -zeroinfnan (uint64x2_t i) +zeroinfnan (uint64x2_t i, const struct data *d) { /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */ - return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), - v_u64 (2 * asuint64 (INFINITY) - 1)); + return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan); } /* Fast implementation of vector atan2. @@ -66,12 +83,13 @@ zeroinfnan (uint64x2_t i) want 0x1.92d628ab678cfp-1. */ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) { - const struct data *data_ptr = ptr_barrier (&data); + const struct data *d = ptr_barrier (&data); uint64x2_t ix = vreinterpretq_u64_f64 (x); uint64x2_t iy = vreinterpretq_u64_f64 (y); - uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy)); + uint64x2_t special_cases + = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d)); uint64x2_t sign_x = vandq_u64 (ix, SignMask); uint64x2_t sign_y = vandq_u64 (iy, SignMask); @@ -81,18 +99,18 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) float64x2_t ay = vabsq_f64 (y); uint64x2_t pred_xlt0 = vcltzq_f64 (x); - uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax); + uint64x2_t pred_aygtax = vcagtq_f64 (y, x); /* Set up z for call to atan. */ float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); - float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax); - float64x2_t z = vdivq_f64 (n, d); + float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); + float64x2_t z = vdivq_f64 (n, q); /* Work out the correct shift. 
*/ - float64x2_t shift = vreinterpretq_f64_u64 ( - vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); + float64x2_t shift + = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); - shift = vmulq_f64 (shift, data_ptr->pi_over_2); + shift = vmulq_f64 (shift, d->pi_over_2); /* Calculate the polynomial approximation. Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of @@ -103,20 +121,52 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) float64x2_t x2 = vmulq_f64 (z2, z2); float64x2_t x4 = vmulq_f64 (x2, x2); float64x2_t x8 = vmulq_f64 (x4, x4); - float64x2_t ret - = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly), - v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8); + + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1719 = vld1q_f64 (&d->c17); + + /* estrin_7. */ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + + float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + + /* estrin_11. 
*/ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); + float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); + float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + + float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); + float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + + float64x2_t ret = vfmaq_f64 (p07, p819, x8); /* Finalize. y = shift + z + z^3 * P(z^2). */ ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); ret = vaddq_f64 (ret, shift); + if (__glibc_unlikely (v_any_u64 (special_cases))) + return special_case (y, x, ret, sign_xy, special_cases); + /* Account for the sign of x and y. */ ret = vreinterpretq_f64_u64 ( veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); - if (__glibc_unlikely (v_any_u64 (special_cases))) - return special_case (y, x, ret, special_cases); - return ret; } diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c index ed9f683..163f613 100644 --- a/sysdeps/aarch64/fpu/atan2_sve.c +++ b/sysdeps/aarch64/fpu/atan2_sve.c @@ -1,6 +1,6 @@ /* Double-precision SVE atan2 - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c index 56e610c..e65406f 100644 --- a/sysdeps/aarch64/fpu/atan2f_advsimd.c +++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision AdvSIMD atan2 - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. 
This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,34 +22,39 @@ static const struct data { - float32x4_t poly[8]; - float32x4_t pi_over_2; + float32x4_t c0, pi_over_2, c4, c6, c2; + float c1, c3, c5, c7; + uint32x4_t comp_const; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. Generated using fpminimax between FLT_MIN and 1. */ - .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), - V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), - V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, - .pi_over_2 = V4 (0x1.921fb6p+0f), + .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, + .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, + .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, + .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, + .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), }; #define SignMask v_u32 (0x80000000) /* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp) +special_case (float32x4_t y, float32x4_t x, float32x4_t ret, + uint32x4_t sign_xy, uint32x4_t cmp) { + /* Account for the sign of y. */ + ret = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); return v_call2_f32 (atan2f, y, x, ret, cmp); } /* Returns 1 if input is the bit representation of 0, infinity or nan. */ static inline uint32x4_t -zeroinfnan (uint32x4_t i) +zeroinfnan (uint32x4_t i, const struct data *d) { /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ - return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), - v_u32 (2 * 0x7f800000lu - 1)); + return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); } /* Fast implementation of vector atan2f. Maximum observed error is @@ -58,12 +63,13 @@ zeroinfnan (uint32x4_t i) want 0x1.967f00p-1. 
*/ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) { - const struct data *data_ptr = ptr_barrier (&data); + const struct data *d = ptr_barrier (&data); uint32x4_t ix = vreinterpretq_u32_f32 (x); uint32x4_t iy = vreinterpretq_u32_f32 (y); - uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy)); + uint32x4_t special_cases + = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d)); uint32x4_t sign_x = vandq_u32 (ix, SignMask); uint32x4_t sign_y = vandq_u32 (iy, SignMask); @@ -77,14 +83,14 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) /* Set up z for call to atanf. */ float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); - float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax); - float32x4_t z = vdivq_f32 (n, d); + float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); + float32x4_t z = vdivq_f32 (n, q); /* Work out the correct shift. */ float32x4_t shift = vreinterpretq_f32_u32 ( vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); - shift = vmulq_f32 (shift, data_ptr->pi_over_2); + shift = vmulq_f32 (shift, d->pi_over_2); /* Calculate the polynomial approximation. Use 2-level Estrin scheme for P(z^2) with deg(P)=7. 
However, @@ -96,23 +102,27 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) float32x4_t z2 = vmulq_f32 (z, z); float32x4_t z4 = vmulq_f32 (z2, z2); - float32x4_t ret = vfmaq_f32 ( - v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4, - vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4))); + float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3); + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); + float32x4_t p47 = vfmaq_f32 (p45, z4, p67); + + float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); /* y = shift + z * P(z^2). */ ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); - /* Account for the sign of y. */ - ret = vreinterpretq_f32_u32 ( - veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); - if (__glibc_unlikely (v_any_u32 (special_cases))) { - return special_case (y, x, ret, special_cases); + return special_case (y, x, ret, sign_xy, special_cases); } - return ret; + /* Account for the sign of y. */ + return vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); } libmvec_hidden_def (V_NAME_F2 (atan2)) HALF_WIDTH_ALIAS_F2(atan2) diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c index 9ea1971..5f26e2a 100644 --- a/sysdeps/aarch64/fpu/atan2f_sve.c +++ b/sysdeps/aarch64/fpu/atan2f_sve.c @@ -1,6 +1,6 @@ /* Single-precision SVE atan2 - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c index a962be0..f024fd1 100644 --- a/sysdeps/aarch64/fpu/atan_advsimd.c +++ b/sysdeps/aarch64/fpu/atan_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision AdvSIMD inverse tan - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,21 +22,22 @@ static const struct data { + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; float64x2_t pi_over_2; - float64x2_t poly[20]; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-1022, 1.0]. */ - .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), - V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), - V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), - V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), - V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), - V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), - V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), - V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), - V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), - V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, + .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, + .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, + .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, + .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, + .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, + .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, + .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, + .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, + .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 
0x1.17739e210171ap-10, + .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16, .pi_over_2 = V2 (0x1.921fb54442d18p+0), }; @@ -52,6 +53,11 @@ static const struct data float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) { const struct data *d = ptr_barrier (&data); + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1719 = vld1q_f64 (&d->c17); /* Small cases, infs and nans are supported by our approximation technique, but do not set fenv flags correctly. Only trigger special case if we need @@ -90,9 +96,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) float64x2_t x2 = vmulq_f64 (z2, z2); float64x2_t x4 = vmulq_f64 (x2, x2); float64x2_t x8 = vmulq_f64 (x4, x4); - float64x2_t y - = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly), - v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8); + + /* estrin_7. */ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + + float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + + /* estrin_11. 
*/ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); + float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); + float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + + float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); + float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + + float64x2_t y = vfmaq_f64 (p07, p819, x8); /* Finalize. y = shift + z + z^3 * P(z^2). */ y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); diff --git a/sysdeps/aarch64/fpu/atan_sve.c b/sysdeps/aarch64/fpu/atan_sve.c index fa16303..3880ced 100644 --- a/sysdeps/aarch64/fpu/atan_sve.c +++ b/sysdeps/aarch64/fpu/atan_sve.c @@ -1,6 +1,6 @@ /* Double-precision SVE inverse tan - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/atanf_advsimd.c b/sysdeps/aarch64/fpu/atanf_advsimd.c index d015cc7..472865e 100644 --- a/sysdeps/aarch64/fpu/atanf_advsimd.c +++ b/sysdeps/aarch64/fpu/atanf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision AdvSIMD inverse tan - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/atanf_sve.c b/sysdeps/aarch64/fpu/atanf_sve.c index 7b54094..3a98d70 100644 --- a/sysdeps/aarch64/fpu/atanf_sve.c +++ b/sysdeps/aarch64/fpu/atanf_sve.c @@ -1,6 +1,6 @@ /* Single-precision SVE inverse tan - Copyright (C) 2023-2024 Free Software Foundation, Inc. 
+ Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c index 3c3d0bd..7fba309 100644 --- a/sysdeps/aarch64/fpu/atanh_advsimd.c +++ b/sysdeps/aarch64/fpu/atanh_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) atanh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -23,15 +23,19 @@ const static struct data { struct v_log1p_data log1p_consts; - uint64x2_t one, half; + uint64x2_t one; + uint64x2_t sign_mask; } data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE, .one = V2 (0x3ff0000000000000), - .half = V2 (0x3fe0000000000000) }; + .sign_mask = V2 (0x8000000000000000) }; static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y, + uint64x2_t special, const struct data *d) { - return v_call_f64 (atanh, x, y, special); + y = log1p_inline (y, &d->log1p_consts); + return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x), + vmulq_f64 (halfsign, y), special); } /* Approximation for vector double-precision atanh(x) using modified log1p. 
@@ -43,11 +47,10 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x) { const struct data *d = ptr_barrier (&data); + float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5)); float64x2_t ax = vabsq_f64 (x); uint64x2_t ia = vreinterpretq_u64_f64 (ax); - uint64x2_t sign = veorq_u64 (vreinterpretq_u64_f64 (x), ia); uint64x2_t special = vcgeq_u64 (ia, d->one); - float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->half)); #if WANT_SIMD_EXCEPT ax = v_zerofy_f64 (ax, special); @@ -55,10 +58,15 @@ float64x2_t V_NAME_D1 (atanh) (float64x2_t x) float64x2_t y; y = vaddq_f64 (ax, ax); - y = vdivq_f64 (y, vsubq_f64 (v_f64 (1), ax)); - y = log1p_inline (y, &d->log1p_consts); + y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax)); if (__glibc_unlikely (v_any_u64 (special))) - return special_case (x, vmulq_f64 (y, halfsign), special); +#if WANT_SIMD_EXCEPT + return special_case (x, halfsign, y, special, d); +#else + return special_case (ax, halfsign, y, special, d); +#endif + + y = log1p_inline (y, &d->log1p_consts); return vmulq_f64 (y, halfsign); } diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c index 7a52728..16a7cf6 100644 --- a/sysdeps/aarch64/fpu/atanh_sve.c +++ b/sysdeps/aarch64/fpu/atanh_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) atanh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c index ae488f7..ef23b89 100644 --- a/sysdeps/aarch64/fpu/atanhf_advsimd.c +++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) atanh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -40,15 +40,17 @@ const static struct data #define Half v_u32 (0x3f000000) static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y, + uint32x4_t special) { - return v_call_f32 (atanhf, x, y, special); + return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign), + vmulq_f32 (halfsign, y), special); } /* Approximation for vector single-precision atanh(x) using modified log1p. - The maximum error is 3.08 ULP: - __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5 - want 0x1.ffcb82p-5. */ + The maximum error is 2.93 ULP: + _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5 + want 0x1.f4dcf8p-5. */ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -68,11 +70,19 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x) uint32x4_t special = vcgeq_u32 (iax, d->one); #endif - float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax)); - y = log1pf_inline (y, d->log1pf_consts); + float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), + vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax)); + y = log1pf_inline (y, &d->log1pf_consts); + /* If exceptions not required, pass ax to special-case for shorter dependency + chain. If exceptions are required ax will have been zerofied, so have to + pass x. 
*/ if (__glibc_unlikely (v_any_u32 (special))) - return special_case (x, vmulq_f32 (halfsign, y), special); +#if WANT_SIMD_EXCEPT + return special_case (x, halfsign, y, special); +#else + return special_case (ax, halfsign, y, special); +#endif return vmulq_f32 (halfsign, y); } libmvec_hidden_def (V_NAME_F1 (atanh)) diff --git a/sysdeps/aarch64/fpu/atanhf_sve.c b/sysdeps/aarch64/fpu/atanhf_sve.c index dae8304..98e9950 100644 --- a/sysdeps/aarch64/fpu/atanhf_sve.c +++ b/sysdeps/aarch64/fpu/atanhf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) atanh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,21 +17,25 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ +#include "sv_math.h" #include "sv_log1pf_inline.h" #define One (0x3f800000) #define Half (0x3f000000) static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +special_case (svuint32_t iax, svuint32_t sign, svfloat32_t halfsign, + svfloat32_t y, svbool_t special) { + svfloat32_t x = svreinterpret_f32 (sveor_x (svptrue_b32 (), iax, sign)); + y = svmul_x (svptrue_b32 (), halfsign, y); return sv_call_f32 (atanhf, x, y, special); } /* Approximation for vector single-precision atanh(x) using modified log1p. - The maximum error is 2.28 ULP: - _ZGVsMxv_atanhf(0x1.ff1194p-5) got 0x1.ffbbbcp-5 - want 0x1.ffbbb6p-5. */ + The maximum error is 1.99 ULP: + _ZGVsMxv_atanhf(0x1.f1583p-5) got 0x1.f1f4fap-5 + want 0x1.f1f4f6p-5. 
*/ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg) { svfloat32_t ax = svabs_x (pg, x); @@ -48,7 +52,7 @@ svfloat32_t SV_NAME_F1 (atanh) (svfloat32_t x, const svbool_t pg) y = sv_log1pf_inline (y, pg); if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svmul_x (pg, halfsign, y), special); + return special_case (iax, sign, halfsign, y, special); return svmul_x (pg, halfsign, y); } diff --git a/sysdeps/aarch64/fpu/bench-libmvec-arch.h b/sysdeps/aarch64/fpu/bench-libmvec-arch.h index baa0513..f57f349 100644 --- a/sysdeps/aarch64/fpu/bench-libmvec-arch.h +++ b/sysdeps/aarch64/fpu/bench-libmvec-arch.h @@ -1,5 +1,5 @@ /* Runtime architecture check for libmvec benchtests. aarch64 version. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h index 7484150..5152c0d 100644 --- a/sysdeps/aarch64/fpu/bits/math-vector.h +++ b/sysdeps/aarch64/fpu/bits/math-vector.h @@ -1,6 +1,6 @@ /* Platform-specific SIMD declarations of math functions. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -69,6 +69,10 @@ # define __DECL_SIMD_cosh __DECL_SIMD_aarch64 # undef __DECL_SIMD_coshf # define __DECL_SIMD_coshf __DECL_SIMD_aarch64 +# undef __DECL_SIMD_cospi +# define __DECL_SIMD_cospi __DECL_SIMD_aarch64 +# undef __DECL_SIMD_cospif +# define __DECL_SIMD_cospif __DECL_SIMD_aarch64 # undef __DECL_SIMD_erf # define __DECL_SIMD_erf __DECL_SIMD_aarch64 # undef __DECL_SIMD_erff @@ -113,6 +117,10 @@ # define __DECL_SIMD_log2 __DECL_SIMD_aarch64 # undef __DECL_SIMD_log2f # define __DECL_SIMD_log2f __DECL_SIMD_aarch64 +# undef __DECL_SIMD_logp1 +# define __DECL_SIMD_logp1 __DECL_SIMD_aarch64 +# undef __DECL_SIMD_logp1f +# define __DECL_SIMD_logp1f __DECL_SIMD_aarch64 # undef __DECL_SIMD_pow # define __DECL_SIMD_pow __DECL_SIMD_aarch64 # undef __DECL_SIMD_powf @@ -125,6 +133,10 @@ # define __DECL_SIMD_sinh __DECL_SIMD_aarch64 # undef __DECL_SIMD_sinhf # define __DECL_SIMD_sinhf __DECL_SIMD_aarch64 +# undef __DECL_SIMD_sinpi +# define __DECL_SIMD_sinpi __DECL_SIMD_aarch64 +# undef __DECL_SIMD_sinpif +# define __DECL_SIMD_sinpif __DECL_SIMD_aarch64 # undef __DECL_SIMD_tan # define __DECL_SIMD_tan __DECL_SIMD_aarch64 # undef __DECL_SIMD_tanf @@ -133,6 +145,10 @@ # define __DECL_SIMD_tanh __DECL_SIMD_aarch64 # undef __DECL_SIMD_tanhf # define __DECL_SIMD_tanhf __DECL_SIMD_aarch64 +# undef __DECL_SIMD_tanpi +# define __DECL_SIMD_tanpi __DECL_SIMD_aarch64 +# undef __DECL_SIMD_tanpif +# define __DECL_SIMD_tanpif __DECL_SIMD_aarch64 #endif #if __GNUC_PREREQ(9, 0) @@ -169,6 +185,7 @@ __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_cospif (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_erfcf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); @@ -180,11 +197,14 @@ __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs 
__f32x4_t _ZGVnN4v_log10f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_logp1f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t); __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_sinpif (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_tanpif (__f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); __vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t); @@ -196,6 +216,7 @@ __vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_cospi (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_erfc (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); @@ -207,11 +228,14 @@ __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_logp1 (__f64x2_t); __vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t); __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_sinpi (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_tanh (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_tanpi (__f64x2_t); # undef __ADVSIMD_VEC_MATH_SUPPORTED #endif /* __ADVSIMD_VEC_MATH_SUPPORTED */ @@ -228,6 +252,7 @@ __sv_f32_t _ZGVsMxv_atanhf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_cbrtf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_cospif (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_erfcf 
(__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t); @@ -239,11 +264,14 @@ __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_logp1f (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxvv_powf (__sv_f32_t, __sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_sinpif (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_tanhf (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_tanpif (__sv_f32_t, __sv_bool_t); __sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t); @@ -255,6 +283,7 @@ __sv_f64_t _ZGVsMxv_atanh (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_cbrt (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_cospi (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_erfc (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t); @@ -266,11 +295,14 @@ __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_logp1 (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxvv_pow (__sv_f64_t, __sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_sinpi (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_tan (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_tanh (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_tanpi (__sv_f64_t, __sv_bool_t); # undef 
__SVE_VEC_MATH_SUPPORTED #endif /* __SVE_VEC_MATH_SUPPORTED */ diff --git a/sysdeps/aarch64/fpu/cbrt_advsimd.c b/sysdeps/aarch64/fpu/cbrt_advsimd.c index adfbb60..78ee9e2 100644 --- a/sysdeps/aarch64/fpu/cbrt_advsimd.c +++ b/sysdeps/aarch64/fpu/cbrt_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (AdvSIMD) cbrt function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/cbrt_sve.c b/sysdeps/aarch64/fpu/cbrt_sve.c index fc976ed..342b530 100644 --- a/sysdeps/aarch64/fpu/cbrt_sve.c +++ b/sysdeps/aarch64/fpu/cbrt_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) cbrt function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/cbrtf_advsimd.c b/sysdeps/aarch64/fpu/cbrtf_advsimd.c index 27debb8..f9ae1c6 100644 --- a/sysdeps/aarch64/fpu/cbrtf_advsimd.c +++ b/sysdeps/aarch64/fpu/cbrtf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (AdvSIMD) cbrt function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/cbrtf_sve.c b/sysdeps/aarch64/fpu/cbrtf_sve.c index 23c220c..3a66aed 100644 --- a/sysdeps/aarch64/fpu/cbrtf_sve.c +++ b/sysdeps/aarch64/fpu/cbrtf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) cbrt function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c index 3924c9c..0b5b7af 100644 --- a/sysdeps/aarch64/fpu/cos_advsimd.c +++ b/sysdeps/aarch64/fpu/cos_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) cos function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ static const struct data { float64x2_t poly[7]; - float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; + float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; } data = { /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), @@ -30,11 +30,9 @@ static const struct data V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), V2 (-0x1.9e9540300a1p-41) }, .inv_pi = V2 (0x1.45f306dc9c883p-2), - .half_pi = V2 (0x1.921fb54442d18p+0), .pi_1 = V2 (0x1.921fb54442d18p+1), .pi_2 = V2 (0x1.1a62633145c06p-53), .pi_3 = V2 (0x1.c1cd129024e09p-106), - .shift = V2 (0x1.8p52), .range_val = V2 (0x1p23) }; @@ -68,10 +66,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) #endif /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); - odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); - n = vsubq_f64 (n, d->shift); - n = vsubq_f64 (n, v_f64 (0.5)); + n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); + n = vsubq_f64 (n, v_f64 (0.5f)); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). 
*/ r = vfmsq_f64 (r, d->pi_1, n); diff --git a/sysdeps/aarch64/fpu/cos_sve.c b/sysdeps/aarch64/fpu/cos_sve.c index 1c3e25b..1931faf 100644 --- a/sysdeps/aarch64/fpu/cos_sve.c +++ b/sysdeps/aarch64/fpu/cos_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) cos function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c index d0c285b..4d51679 100644 --- a/sysdeps/aarch64/fpu/cosf_advsimd.c +++ b/sysdeps/aarch64/fpu/cosf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) cos function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ static const struct data { float32x4_t poly[4]; - float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; + float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; } data = { /* 1.886 ulp error. */ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), @@ -33,8 +33,6 @@ static const struct data .pi_3 = V4 (-0x1.ee59dap-49f), .inv_pi = V4 (0x1.45f306p-2f), - .shift = V4 (0x1.8p+23f), - .half_pi = V4 (0x1.921fb6p0f), .range_val = V4 (0x1p20f) }; @@ -69,9 +67,8 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x) #endif /* n = rint((|x|+pi/2)/pi) - 0.5. */ - n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); - odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); - n = vsubq_f32 (n, d->shift); + n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi)); + odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); n = vsubq_f32 (n, v_f32 (0.5f)); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). 
*/ diff --git a/sysdeps/aarch64/fpu/cosf_sve.c b/sysdeps/aarch64/fpu/cosf_sve.c index 4d7f4ea..31a4f63 100644 --- a/sysdeps/aarch64/fpu/cosf_sve.c +++ b/sysdeps/aarch64/fpu/cosf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) cos function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/cosh_advsimd.c b/sysdeps/aarch64/fpu/cosh_advsimd.c index 4bee734..302d229 100644 --- a/sysdeps/aarch64/fpu/cosh_advsimd.c +++ b/sysdeps/aarch64/fpu/cosh_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (AdvSIMD) cosh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c index 919f346..77e58e1 100644 --- a/sysdeps/aarch64/fpu/cosh_sve.c +++ b/sysdeps/aarch64/fpu/cosh_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) cosh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -23,7 +23,7 @@ static const struct data { float64_t poly[3]; float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; - uint64_t index_mask, special_bound; + uint64_t special_bound; } data = { .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, 0x1.5555576a59599p-5, }, @@ -35,14 +35,16 @@ static const struct data .shift = 0x1.8p+52, .thres = 704.0, - .index_mask = 0xff, /* 0x1.6p9, above which exp overflows. 
*/ .special_bound = 0x4086000000000000, }; static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special) { + svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); + svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); + svfloat64_t y = svadd_x (pg, half_t, half_over_t); return sv_call_f64 (cosh, x, y, special); } @@ -60,12 +62,12 @@ exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) svuint64_t u = svreinterpret_u64 (z); svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); - svuint64_t i = svand_x (pg, u, d->index_mask); + svuint64_t i = svand_x (svptrue_b64 (), u, 0xff); svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); y = svmla_x (pg, sv_f64 (1.0), r, y); - y = svmul_x (pg, r, y); + y = svmul_x (svptrue_b64 (), r, y); /* s = 2^(n/N). */ u = svld1_gather_index (pg, __v_exp_tail_data, i); @@ -94,12 +96,12 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) /* Up to the point that exp overflows, we can use it to calculate cosh by exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ svfloat64_t t = exp_inline (ax, pg, d); - svfloat64_t half_t = svmul_x (pg, t, 0.5); - svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); /* Fall back to scalar for any special cases. 
*/ if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svadd_x (pg, half_t, half_over_t), special); + return special_case (x, pg, t, special); + svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); + svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); return svadd_x (pg, half_t, half_over_t); } diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c index c1ab492..35e1562 100644 --- a/sysdeps/aarch64/fpu/coshf_advsimd.c +++ b/sysdeps/aarch64/fpu/coshf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (AdvSIMD) cosh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -23,19 +23,27 @@ static const struct data { struct v_expf_data expf_consts; - uint32x4_t tiny_bound, special_bound; + uint32x4_t tiny_bound; + float32x4_t bound; +#if WANT_SIMD_EXCEPT + uint32x4_t special_bound; +#endif } data = { .expf_consts = V_EXPF_DATA, .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. 
*/ + .bound = V4 (0x1.5a92d8p+6), +#if WANT_SIMD_EXCEPT .special_bound = V4 (0x42ad496c), +#endif }; #if !WANT_SIMD_EXCEPT static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t, + uint32x4_t special) { - return v_call_f32 (coshf, x, y, special); + return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special); } #endif @@ -47,14 +55,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t ax = vabsq_f32 (x); - uint32x4_t iax = vreinterpretq_u32_f32 (ax); - uint32x4_t special = vcgeq_u32 (iax, d->special_bound); - #if WANT_SIMD_EXCEPT /* If fp exceptions are to be triggered correctly, fall back to the scalar variant for all inputs if any input is a special value or above the bound at which expf overflows. */ + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t special = vcgeq_u32 (iax, d->special_bound); if (__glibc_unlikely (v_any_u32 (special))) return v_call_f32 (coshf, x, x, v_u32 (-1)); @@ -63,10 +70,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) input to 0, which will generate no exceptions. */ if (__glibc_unlikely (v_any_u32 (tiny))) ax = v_zerofy_f32 (ax, tiny); + float32x4_t t = v_expf_inline (ax, &d->expf_consts); +#else + uint32x4_t special = vcageq_f32 (x, d->bound); + float32x4_t t = v_expf_inline (x, &d->expf_consts); #endif /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. 
*/ - float32x4_t t = v_expf_inline (ax, &d->expf_consts); float32x4_t half_t = vmulq_n_f32 (t, 0.5); float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t); @@ -75,7 +85,7 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t)); #else if (__glibc_unlikely (v_any_u32 (special))) - return special_case (x, vaddq_f32 (half_t, half_over_t), special); + return special_case (x, half_t, half_over_t, special); #endif return vaddq_f32 (half_t, half_over_t); diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c index e5d8a29..fb8e06c 100644 --- a/sysdeps/aarch64/fpu/coshf_sve.c +++ b/sysdeps/aarch64/fpu/coshf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) cosh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -23,37 +23,42 @@ static const struct data { struct sv_expf_data expf_consts; - uint32_t special_bound; + float special_bound; } data = { .expf_consts = SV_EXPF_DATA, /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ - .special_bound = 0x42ad496c, + .special_bound = 0x1.5a92d8p+6, }; static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t pg) +special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e, + svbool_t pg) { - return sv_call_f32 (coshf, x, y, pg); + return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e), + pg); } /* Single-precision vector cosh, using vector expf. - Maximum error is 1.89 ULP: - _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127 - want 0x1.f00adcp+127. */ + Maximum error is 2.77 ULP: + _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2 + want 0x1.e4594cp+2. 
*/ svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg) { const struct data *d = ptr_barrier (&data); - svfloat32_t ax = svabs_x (pg, x); - svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound); + svbool_t special = svacge (pg, x, d->special_bound); - /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */ - svfloat32_t t = expf_inline (ax, pg, &d->expf_consts); - svfloat32_t half_t = svmul_x (pg, t, 0.5); - svfloat32_t half_over_t = svdivr_x (pg, t, 0.5); + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. + Note that x is passed to exp here, rather than |x|. This is to avoid using + destructive unary ABS for better register usage. However it means the + routine is not exactly symmetrical, as the exp helper is slightly less + accurate in the negative range. */ + svfloat32_t e = expf_inline (x, pg, &d->expf_consts); + svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5); + svfloat32_t half_over_e = svdivr_x (pg, e, 0.5); if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svadd_x (pg, half_t, half_over_t), special); + return special_case (x, half_e, half_over_e, special); - return svadd_x (pg, half_t, half_over_t); + return svadd_x (svptrue_b32 (), half_e, half_over_e); } diff --git a/sysdeps/aarch64/fpu/cospi_advsimd.c b/sysdeps/aarch64/fpu/cospi_advsimd.c new file mode 100644 index 0000000..dcd12c8 --- /dev/null +++ b/sysdeps/aarch64/fpu/cospi_advsimd.c @@ -0,0 +1,87 @@ +/* Double-precision (Advanced SIMD) cospi function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" +#include "poly_advsimd_f64.h" + +static const struct data +{ + float64x2_t poly[10]; + float64x2_t range_val; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2), + V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1), + V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8), + V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16), + V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) }, + .range_val = V2 (0x1p63), +}; + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (cospi, x, y, cmp); +} + +/* Approximation for vector double-precision cospi(x). + Maximum Error 3.06 ULP: + _ZGVnN2v_cospi(0x1.7dd4c0b03cc66p-5) got 0x1.fa854babfb6bep-1 + want 0x1.fa854babfb6c1p-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (cospi) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + float64x2_t r = vabsq_f64 (x); + uint64x2_t cmp = vcaleq_f64 (v_f64 (0x1p64), x); + + /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd + to avoid them overflowing and throwing exceptions. 
*/ + r = v_zerofy_f64 (r, cmp); + uint64x2_t odd = vshlq_n_u64 (vcvtnq_u64_f64 (r), 63); + +#else + float64x2_t r = x; + uint64x2_t cmp = vcageq_f64 (r, d->range_val); + uint64x2_t odd + = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63); + +#endif + + r = vsubq_f64 (r, vrndaq_f64 (r)); + + /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */ + r = vsubq_f64 (v_f64 (0.5), vabsq_f64 (r)); + + /* y = sin(r). */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t r4 = vmulq_f64 (r2, r2); + float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r); + + /* Fallback to scalar. */ + if (__glibc_unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); + + /* Reintroduce the sign bit for inputs which round to odd. */ + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/sysdeps/aarch64/fpu/cospi_sve.c b/sysdeps/aarch64/fpu/cospi_sve.c new file mode 100644 index 0000000..dd98815 --- /dev/null +++ b/sysdeps/aarch64/fpu/cospi_sve.c @@ -0,0 +1,65 @@ +/* Double-precision (SVE) cospi function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "sv_math.h" +#include "poly_sve_f64.h" + +static const struct data +{ + double poly[10]; + double range_val; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { 0x1.921fb54442d184p1, -0x1.4abbce625be53p2, 0x1.466bc6775ab16p1, + -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8, + 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, + 0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 }, + .range_val = 0x1p53, +}; + +/* A fast SVE implementation of cospi. + Maximum error 3.20 ULP: + _ZGVsMxv_cospi(0x1.f18ba32c63159p-6) got 0x1.fdabf595f9763p-1 + want 0x1.fdabf595f9766p-1. */ +svfloat64_t SV_NAME_D1 (cospi) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Using cospi(x) = sinpi(0.5 - x) + range reduction and offset into sinpi range -1/2 .. 1/2 + r = 0.5 - |x - rint(x)|. */ + svfloat64_t n = svrinta_x (pg, x); + svfloat64_t r = svsub_x (pg, x, n); + r = svsub_x (pg, sv_f64 (0.5), svabs_x (pg, r)); + + /* Result should be negated based on if n is odd or not. + If ax >= 2^53, the result will always be positive. */ + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint64_t intn = svreinterpret_u64 (svcvt_s64_z (pg, n)); + svuint64_t sign = svlsl_z (cmp, intn, 63); + + /* y = sin(r). */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t r4 = svmul_x (pg, r2, r2); + svfloat64_t y = sv_pw_horner_9_f64_x (pg, r2, r4, d->poly); + y = svmul_x (pg, y, r); + + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); +} diff --git a/sysdeps/aarch64/fpu/cospif_advsimd.c b/sysdeps/aarch64/fpu/cospif_advsimd.c new file mode 100644 index 0000000..a81471f --- /dev/null +++ b/sysdeps/aarch64/fpu/cospif_advsimd.c @@ -0,0 +1,87 @@ +/* Single-precision (Advanced SIMD) cospi function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" +#include "poly_advsimd_f32.h" + +static const struct data +{ + float32x4_t poly[6]; + float32x4_t range_val; +} data = { + /* Taylor series coefficients for sin(pi * x). */ + .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f), + V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) }, + .range_val = V4 (0x1p31f), +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (cospif, x, y, cmp); +} + +/* Approximation for vector single-precision cospi(x) + Maximum Error: 3.17 ULP: + _ZGVnN4v_cospif(0x1.d341a8p-5) got 0x1.f7cd56p-1 + want 0x1.f7cd5p-1. */ +float32x4_t VPCS_ATTR V_NAME_F1 (cospi) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + float32x4_t r = vabsq_f32 (x); + uint32x4_t cmp = vcaleq_f32 (v_f32 (0x1p32f), x); + + /* When WANT_SIMD_EXCEPT = 1, special lanes should be zero'd + to avoid them overflowing and throwing exceptions. 
*/ + r = v_zerofy_f32 (r, cmp); + uint32x4_t odd = vshlq_n_u32 (vcvtnq_u32_f32 (r), 31); + +#else + float32x4_t r = x; + uint32x4_t cmp = vcageq_f32 (r, d->range_val); + + uint32x4_t odd + = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31); + +#endif + + /* r = x - rint(x). */ + r = vsubq_f32 (r, vrndaq_f32 (r)); + + /* cospi(x) = sinpi(0.5 - abs(x)) for values -1/2 .. 1/2. */ + r = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (r)); + + /* Pairwise Horner approximation for y = sin(r * pi). */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r); + + /* Fallback to scalar. */ + if (__glibc_unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); + + /* Reintroduce the sign bit for inputs which round to odd. */ + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} + +libmvec_hidden_def (V_NAME_F1 (cospi)) +HALF_WIDTH_ALIAS_F1 (cospi) diff --git a/sysdeps/aarch64/fpu/cospif_sve.c b/sysdeps/aarch64/fpu/cospif_sve.c new file mode 100644 index 0000000..e8980da --- /dev/null +++ b/sysdeps/aarch64/fpu/cospif_sve.c @@ -0,0 +1,61 @@ +/* Single-precision (SVE) cospi function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "sv_math.h" +#include "poly_sve_f32.h" + +static const struct data +{ + float poly[6]; + float range_val; +} data = { + /* Taylor series coefficients for sin(pi * x). */ + .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f, + 0x1.50783p-4f, -0x1.e30750p-8f }, + .range_val = 0x1p31f, +}; + +/* A fast SVE implementation of cospif. + Maximum error: 2.60 ULP: + _ZGVsMxv_cospif(+/-0x1.cae664p-4) got 0x1.e09c9ep-1 + want 0x1.e09c98p-1. */ +svfloat32_t SV_NAME_F1 (cospi) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* Using cospi(x) = sinpi(0.5 - x) + range reduction and offset into sinpi range -1/2 .. 1/2 + r = 0.5 - |x - rint(x)|. */ + svfloat32_t n = svrinta_x (pg, x); + svfloat32_t r = svsub_x (pg, x, n); + r = svsub_x (pg, sv_f32 (0.5f), svabs_x (pg, r)); + + /* Result should be negated based on if n is odd or not. + If ax >= 2^31, the result will always be positive. */ + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint32_t intn = svreinterpret_u32 (svcvt_s32_x (pg, n)); + svuint32_t sign = svlsl_z (cmp, intn, 31); + + /* y = sin(r). */ + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t y = sv_horner_5_f32_x (pg, r2, d->poly); + y = svmul_x (pg, y, r); + + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); +} diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c index 19cbb7d..ec1ad0b 100644 --- a/sysdeps/aarch64/fpu/erf_advsimd.c +++ b/sysdeps/aarch64/fpu/erf_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) erf function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -22,19 +22,21 @@ static const struct data { float64x2_t third; - float64x2_t tenth, two_over_five, two_over_fifteen; - float64x2_t two_over_nine, two_over_fortyfive; + float64x2_t tenth, two_over_five, two_over_nine; + double two_over_fifteen, two_over_fortyfive; float64x2_t max, shift; + uint64x2_t max_idx; #if WANT_SIMD_EXCEPT float64x2_t tiny_bound, huge_bound, scale_minus_one; #endif } data = { + .max_idx = V2 (768), .third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */ - .two_over_fifteen = V2 (0x1.1111111111111p-3), + .two_over_fifteen = 0x1.1111111111111p-3, .tenth = V2 (-0x1.999999999999ap-4), .two_over_five = V2 (-0x1.999999999999ap-2), .two_over_nine = V2 (-0x1.c71c71c71c71cp-3), - .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5), + .two_over_fortyfive = 0x1.6c16c16c16c17p-5, .max = V2 (5.9921875), /* 6 - 1/128. */ .shift = V2 (0x1p45), #if WANT_SIMD_EXCEPT @@ -56,8 +58,8 @@ static inline struct entry lookup (uint64x2_t i) { struct entry e; - float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf), - e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf); + float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf), + e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf); e.erf = vuzp1q_f64 (e1, e2); e.scale = vuzp2q_f64 (e1, e2); return e; @@ -87,8 +89,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) float64x2_t a = vabsq_f64 (x); /* Reciprocal conditions that do not catch NaNs so they can be used in BSLs to return expected results. */ - uint64x2_t a_le_max = vcleq_f64 (a, dat->max); - uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max); + uint64x2_t a_le_max = vcaleq_f64 (x, dat->max); + uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max); #if WANT_SIMD_EXCEPT /* |x| huge or tiny. */ @@ -115,7 +117,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) segfault. 
*/ uint64x2_t i = vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift)); - i = vbslq_u64 (a_le_max, i, v_u64 (768)); + i = vbslq_u64 (a_le_max, i, dat->max_idx); struct entry e = lookup (i); float64x2_t r = vsubq_f64 (z, shift); @@ -125,14 +127,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x) float64x2_t d2 = vmulq_f64 (d, d); float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t two_over_fifteen_and_fortyfive + = vld1q_f64 (&dat->two_over_fifteen); + /* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */ float64x2_t p1 = r; float64x2_t p2 = vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third)); float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third)); - float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen); + float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2, + two_over_fifteen_and_fortyfive, 0); p4 = vfmsq_f64 (dat->tenth, r2, p4); - float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive); + float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2, + two_over_fifteen_and_fortyfive, 1); p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5)); float64x2_t p34 = vfmaq_f64 (p3, d, p4); diff --git a/sysdeps/aarch64/fpu/erf_data.c b/sysdeps/aarch64/fpu/erf_data.c index 6d2dcd2..80c3f6b 100644 --- a/sysdeps/aarch64/fpu/erf_data.c +++ b/sysdeps/aarch64/fpu/erf_data.c @@ -1,6 +1,6 @@ /* Table for Advanced SIMD erf approximation - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,14 +19,14 @@ #include "vecmath_config.h" -/* Lookup table used in erf. +/* Lookup table used in vector erf. 
For each possible rounded input r (multiples of 1/128), between r = 0.0 and r = 6.0 (769 values): - - the first entry __erff_data.tab.erf contains the values of erf(r), - - the second entry __erff_data.tab.scale contains the values of + - the first entry __v_erf_data.tab.erf contains the values of erf(r), + - the second entry __v_erf_data.tab.scale contains the values of 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the algorithm, since lookup is performed only for x >= 1/64-1/512. */ -const struct erf_data __erf_data = { +const struct v_erf_data __v_erf_data = { .tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 }, { 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 }, { 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 }, diff --git a/sysdeps/aarch64/fpu/erf_sve.c b/sysdeps/aarch64/fpu/erf_sve.c index 7d51417..44241d4 100644 --- a/sysdeps/aarch64/fpu/erf_sve.c +++ b/sysdeps/aarch64/fpu/erf_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) erf function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -67,14 +67,16 @@ svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg) svfloat64_t a = svabs_x (pg, x); svfloat64_t shift = sv_f64 (dat->shift); svfloat64_t z = svadd_x (pg, a, shift); - svuint64_t i - = svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift)); + svuint64_t i = svand_x (pg, svreinterpret_u64 (z), 0xfff); + i = svadd_x (pg, i, i); /* Lookup without shortcut for small values but with predicate to avoid segfault for large values and NaNs. 
*/ svfloat64_t r = svsub_x (pg, z, shift); - svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i); - svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i); + svfloat64_t erfr + = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].erf, i); + svfloat64_t scale + = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].scale, i); /* erf(x) ~ erf(r) + scale * d * poly (r, d). */ svfloat64_t d = svsub_x (pg, a, r); diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c index f1b3bfe..6b0fadb 100644 --- a/sysdeps/aarch64/fpu/erfc_advsimd.c +++ b/sysdeps/aarch64/fpu/erfc_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) erfc function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -24,8 +24,8 @@ static const struct data { uint64x2_t offset, table_scale; float64x2_t max, shift; - float64x2_t p20, p40, p41, p42; - float64x2_t p51, p52; + float64x2_t p20, p40, p41, p51; + double p42, p52; double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2]; #if WANT_SIMD_EXCEPT float64x2_t uflow_bound; @@ -41,9 +41,9 @@ static const struct data .p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */ .p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */ .p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */ - .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */ + .p42 = 0x1.1111111111111p-3, /* 2/15. */ .p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */ - .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */ + .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */ /* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. 
*/ .qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 }, .qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 }, @@ -69,9 +69,9 @@ lookup (uint64x2_t i) { struct entry e; float64x2_t e1 - = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc); + = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc); float64x2_t e2 - = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc); + = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc); e.erfc = vuzp1q_f64 (e1, e2); e.scale = vuzp2q_f64 (e1, e2); return e; @@ -157,9 +157,10 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x) float64x2_t p1 = r; float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20)); float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20)); - float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42); + float64x2_t p42_p52 = vld1q_f64 (&dat->p42); + float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0); p4 = vfmsq_f64 (dat->p40, r2, p4); - float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52); + float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1); p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5)); /* Compute p_i using recurrence relation: p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */ diff --git a/sysdeps/aarch64/fpu/erfc_data.c b/sysdeps/aarch64/fpu/erfc_data.c index 76a94e4..e4b61a0 100644 --- a/sysdeps/aarch64/fpu/erfc_data.c +++ b/sysdeps/aarch64/fpu/erfc_data.c @@ -1,6 +1,6 @@ /* Table for Advanced SIMD erfc - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,14 +19,14 @@ #include "vecmath_config.h" -/* Lookup table used in erfc. +/* Lookup table used in vector erfc. 
For each possible rounded input r (multiples of 1/128), between r = 0.0 and r = ~27.0 (3488 values): - - the first entry __erfc_data.tab.erfc contains the values of erfc(r), - - the second entry __erfc_data.tab.scale contains the values of + - the first entry __v_erfc_data.tab.erfc contains the values of erfc(r), + - the second entry __v_erfc_data.tab.scale contains the values of 2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore they are scaled by a large enough value 2^128 (fits in 8bit). */ -const struct erfc_data __erfc_data = { +const struct v_erfc_data __v_erfc_data = { .tab = { { 0x1p128, 0x1.20dd750429b6dp128 }, { 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 }, { 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 }, diff --git a/sysdeps/aarch64/fpu/erfc_sve.c b/sysdeps/aarch64/fpu/erfc_sve.c index c17d3e4..036d823 100644 --- a/sysdeps/aarch64/fpu/erfc_sve.c +++ b/sysdeps/aarch64/fpu/erfc_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) erfc function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -104,7 +104,7 @@ svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg) /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */ i = svadd_x (pg, i, i); - const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr; + const float64_t *p = &__v_erfc_data.tab[0].erfc - 2 * dat->off_arr; svfloat64_t erfcr = svld1_gather_index (pg, p, i); svfloat64_t scale = svld1_gather_index (pg, p + 1, i); diff --git a/sysdeps/aarch64/fpu/erfcf_advsimd.c b/sysdeps/aarch64/fpu/erfcf_advsimd.c index ca5bc3a..c469004 100644 --- a/sysdeps/aarch64/fpu/erfcf_advsimd.c +++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) erfc function - Copyright (C) 2024 Free Software Foundation, Inc. 
+ Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -62,13 +62,13 @@ lookup (uint32x4_t i) { struct entry e; float32x2_t t0 - = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc); float32x2_t t1 - = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc); float32x2_t t2 - = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc); float32x2_t t3 - = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); + = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc); float32x4_t e1 = vcombine_f32 (t0, t1); float32x4_t e2 = vcombine_f32 (t2, t3); e.erfc = vuzp1q_f32 (e1, e2); diff --git a/sysdeps/aarch64/fpu/erfcf_data.c b/sysdeps/aarch64/fpu/erfcf_data.c index 77fb889..b150532 100644 --- a/sysdeps/aarch64/fpu/erfcf_data.c +++ b/sysdeps/aarch64/fpu/erfcf_data.c @@ -1,6 +1,6 @@ /* Table for Advanced SIMD erfcf - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,14 +19,14 @@ #include "vecmath_config.h" -/* Lookup table used in erfcf. +/* Lookup table used in vector erfcf. For each possible rounded input r (multiples of 1/64), between r = 0.0 and r = 10.0625 (645 values): - - the first entry __erfcf_data.tab.erfc contains the values of erfc(r), - - the second entry __erfcf_data.tab.scale contains the values of + - the first entry __v_erfcf_data.tab.erfc contains the values of erfc(r), + - the second entry __v_erfcf_data.tab.scale contains the values of 2/sqrt(pi)*exp(-r^2). 
Both values may go into subnormal range, therefore they are scaled by a large enough value 2^47 (fits in 8 bits). */ -const struct erfcf_data __erfcf_data = { +const struct v_erfcf_data __v_erfcf_data = { .tab = { { 0x1p47, 0x1.20dd76p47 }, { 0x1.f6f944p46, 0x1.20cb68p47 }, { 0x1.edf3aap46, 0x1.209546p47 }, diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c index 48d1677..b57ab51 100644 --- a/sysdeps/aarch64/fpu/erfcf_sve.c +++ b/sysdeps/aarch64/fpu/erfcf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) erfc function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -76,23 +76,23 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg) svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx); /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */ - i = svmul_x (pg, i, 2); - const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr; + i = svlsl_x (svptrue_b32 (), i, 1); + const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr; svfloat32_t erfcr = svld1_gather_index (pg, p, i); svfloat32_t scale = svld1_gather_index (pg, p + 1, i); /* erfc(x) ~ erfc(r) - scale * d * poly(r, d). 
*/ svfloat32_t r = svsub_x (pg, z, shift); svfloat32_t d = svsub_x (pg, a, r); - svfloat32_t d2 = svmul_x (pg, d, d); - svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third); - svfloat32_t third = svdup_lane (coeffs, 0); svfloat32_t p1 = r; - svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1); - svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0)); + svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1); + svfloat32_t p3 + = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0)); svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2); p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4); diff --git a/sysdeps/aarch64/fpu/erff_advsimd.c b/sysdeps/aarch64/fpu/erff_advsimd.c index f2fe6ff..72e49a7 100644 --- a/sysdeps/aarch64/fpu/erff_advsimd.c +++ b/sysdeps/aarch64/fpu/erff_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) erf function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -47,10 +47,10 @@ static inline struct entry lookup (uint32x4_t i) { struct entry e; - float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf); - float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf); - float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf); - float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf); + float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf); + float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf); + float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf); + float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf); float32x4_t e1 = vcombine_f32 (t0, t1); float32x4_t e2 = vcombine_f32 (t2, t3); e.erf = vuzp1q_f32 (e1, e2); diff --git a/sysdeps/aarch64/fpu/erff_data.c b/sysdeps/aarch64/fpu/erff_data.c index 9a32940..e474015 100644 --- a/sysdeps/aarch64/fpu/erff_data.c +++ b/sysdeps/aarch64/fpu/erff_data.c @@ -1,6 +1,6 @@ /* Table for Advanced SIMD erff approximation - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,14 +19,14 @@ #include "vecmath_config.h" -/* Lookup table used in erff. +/* Lookup table used in vector erff. For each possible rounded input r (multiples of 1/128), between r = 0.0 and r = 4.0 (513 values): - - the first entry __erff_data.tab.erf contains the values of erf(r), - - the second entry __erff_data.tab.scale contains the values of + - the first entry __v_erff_data.tab.erf contains the values of erf(r), + - the second entry __v_erff_data.tab.scale contains the values of 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the algorithm, since lookup is performed only for x >= 1/64-1/512. 
*/ -const struct erff_data __erff_data = { +const struct v_erff_data __v_erff_data = { .tab = { { 0x0.000000p+0, 0x1.20dd76p+0 }, { 0x1.20dbf4p-7, 0x1.20d8f2p+0 }, { 0x1.20d770p-6, 0x1.20cb68p+0 }, diff --git a/sysdeps/aarch64/fpu/erff_sve.c b/sysdeps/aarch64/fpu/erff_sve.c index 38f00db..e9345fb 100644 --- a/sysdeps/aarch64/fpu/erff_sve.c +++ b/sysdeps/aarch64/fpu/erff_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) erf function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -62,18 +62,17 @@ svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg) svfloat32_t shift = sv_f32 (dat->shift); svfloat32_t z = svadd_x (pg, a, shift); - svuint32_t i - = svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift)); - - /* Saturate lookup index. */ - i = svsel (a_ge_max, sv_u32 (512), i); + svuint32_t i = svand_x (pg, svreinterpret_u32 (z), 0xfff); + i = svadd_x (pg, i, i); /* r and erf(r) set to 0 for |x| below min. */ svfloat32_t r = svsub_z (a_gt_min, z, shift); - svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i); + svfloat32_t erfr + = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].erf, i); /* scale set to 2/sqrt(pi) for |x| below min. */ - svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i); + svfloat32_t scale + = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].scale, i); scale = svsel (a_gt_min, scale, sv_f32 (dat->scale)); /* erf(x) ~ erf(r) + scale * d * (1 - r * d + 1/3 * d^2). */ diff --git a/sysdeps/aarch64/fpu/exp10_advsimd.c b/sysdeps/aarch64/fpu/exp10_advsimd.c index eeb31ca..1417787 100644 --- a/sysdeps/aarch64/fpu/exp10_advsimd.c +++ b/sysdeps/aarch64/fpu/exp10_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (AdvSIMD) exp10 function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. 
+ Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/exp10_sve.c b/sysdeps/aarch64/fpu/exp10_sve.c index ddf6470..53b2893 100644 --- a/sysdeps/aarch64/fpu/exp10_sve.c +++ b/sysdeps/aarch64/fpu/exp10_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) exp10 function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,21 +18,23 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f64.h" #define SpecialBound 307.0 /* floor (log10 (2^1023)). */ static const struct data { - double poly[5]; + double c1, c3, c2, c4, c0; double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound; } data = { /* Coefficients generated using Remez algorithm. rel error: 0x1.9fcb9b3p-60 abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ] max ulp err 0.52 +0.5. */ - .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1, - 0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 }, + .c0 = 0x1.26bb1bbb55516p1, + .c1 = 0x1.53524c73cd32ap1, + .c2 = 0x1.0470591daeafbp1, + .c3 = 0x1.2bd77b1361ef6p0, + .c4 = 0x1.142b5d54e9621p-1, /* 1.5*2^46+1023. This value is further explained below. */ .shift = 0x1.800000000ffc0p+46, .log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */ @@ -70,9 +72,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, /* |n| > 1280 => 2^(n) overflows. 
*/ svbool_t p_cmp = svacgt (pg, n, d->scale_thres); - svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); svfloat64_t r2 = svmla_x (pg, s2, s2, y); - svfloat64_t r0 = svmul_x (pg, r2, s1); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); return svsel (p_cmp, r1, r0); } @@ -103,11 +105,14 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg) comes at significant performance cost. */ svuint64_t u = svreinterpret_u64 (z); svfloat64_t scale = svexpa (u); - + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); /* Approximate exp10(r) using polynomial. */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2, - sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1)); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); + svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); + svfloat64_t p14 = svmla_x (pg, p12, p34, r2); + + svfloat64_t y = svmla_x (pg, svmul_x (svptrue_b64 (), r, d->c0), r2, p14); /* Assemble result as exp10(x) = 2^n * exp10(r). If |x| > SpecialBound multiplication may overflow, so use special case routine. */ diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c index cf53e73..47726ed 100644 --- a/sysdeps/aarch64/fpu/exp10f_advsimd.c +++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (AdvSIMD) exp10 function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,16 +18,15 @@ <https://www.gnu.org/licenses/>. 
*/ #include "v_math.h" -#include "poly_advsimd_f32.h" #define ScaleBound 192.0f static const struct data { - float32x4_t poly[5]; - float log10_2_and_inv[4]; - float32x4_t shift; - + float32x4_t c0, c1, c3; + float log10_2_high, log10_2_low, c2, c4; + float32x4_t inv_log10_2, special_bound; + uint32x4_t exponent_bias, special_offset, special_bias; #if !WANT_SIMD_EXCEPT float32x4_t scale_thresh; #endif @@ -37,19 +36,24 @@ static const struct data rel error: 0x1.89dafa3p-24 abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] maxerr: 1.85943 +0.5 ulp. */ - .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f), - V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) }, - .shift = V4 (0x1.8p23f), - - /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */ - .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 }, + .c0 = V4 (0x1.26bb16p+1f), + .c1 = V4 (0x1.5350d2p+1f), + .c2 = 0x1.04744ap+1f, + .c3 = V4 (0x1.2d8176p+0f), + .c4 = 0x1.12b41ap-1f, + .inv_log10_2 = V4 (0x1.a934fp+1), + .log10_2_high = 0x1.344136p-2, + .log10_2_low = 0x1.ec10cp-27, + /* rint (log2 (2^127 / (1 + sqrt (2)))). */ + .special_bound = V4 (126.0f), + .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), #if !WANT_SIMD_EXCEPT .scale_thresh = V4 (ScaleBound) #endif }; -#define ExponentBias v_u32 (0x3f800000) - #if WANT_SIMD_EXCEPT # define SpecialBound 38.0f /* rint(log10(2^127)). */ @@ -67,17 +71,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) #else -# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) +# define SpecialBound 126.0f static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) { /* 2^n may overflow, break it up into s1*s2. 
*/ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); float32x4_t r2 = vmulq_f32 (s1, s1); @@ -112,23 +114,23 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x) /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), with poly(r) in [1/sqrt(2), sqrt(2)] and x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */ - float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv); - float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0); - float32x4_t n = vsubq_f32 (z, d->shift); - float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1); - r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2); - uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high); + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2)); + float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0); + r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23); - float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); #if !WANT_SIMD_EXCEPT - uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound)); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); #endif float32x4_t r2 = vmulq_f32 (r, r); - float32x4_t poly - = vfmaq_f32 (vmulq_f32 (r, d->poly[0]), - v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2); + float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2); + float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3); + float32x4_t p14 = vfmaq_f32 (p12, r2, p34); + float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2); if 
(__glibc_unlikely (v_any_u32 (cmp))) #if WANT_SIMD_EXCEPT diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c index e09b2f3..1a74db2 100644 --- a/sysdeps/aarch64/fpu/exp10f_sve.c +++ b/sysdeps/aarch64/fpu/exp10f_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) exp10 function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,74 +18,83 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f32.h" -/* For x < -SpecialBound, the result is subnormal and not handled correctly by +/* For x < -Thres, the result is subnormal and not handled correctly by FEXPA. */ -#define SpecialBound 37.9 +#define Thres 37.9 static const struct data { - float poly[5]; - float shift, log10_2, log2_10_hi, log2_10_lo, special_bound; + float log2_10_lo, c0, c2, c4; + float c1, c3, log10_2; + float shift, log2_10_hi, thres; } data = { /* Coefficients generated using Remez algorithm with minimisation of relative error. rel error: 0x1.89dafa3p-24 abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] maxerr: 0.52 +0.5 ulp. */ - .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f, - 0x1.12b41ap-1f }, + .c0 = 0x1.26bb16p+1f, + .c1 = 0x1.5350d2p+1f, + .c2 = 0x1.04744ap+1f, + .c3 = 0x1.2d8176p+0f, + .c4 = 0x1.12b41ap-1f, /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */ - .shift = 0x1.903f8p17f, + .shift = 0x1.803f8p17f, .log10_2 = 0x1.a934fp+1, .log2_10_hi = 0x1.344136p-2, .log2_10_lo = -0x1.ec10cp-27, - .special_bound = SpecialBound, + .thres = Thres, }; -static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +static inline svfloat32_t +sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) { - return sv_call_f32 (exp10f, x, y, special); -} - -/* Single-precision SVE exp10f routine. 
Implements the same algorithm - as AdvSIMD exp10f. - Worst case error is 1.02 ULPs. - _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 - want 0x1.ba5f9cp-1. */ -svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) -{ - const struct data *d = ptr_barrier (&data); /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)), with poly(r) in [1/sqrt(2), sqrt(2)] and x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */ - /* Load some constants in quad-word chunks to minimise memory access (last - lane is wasted). */ - svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2); + svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo); /* n = round(x/(log10(2)/N)). */ svfloat32_t shift = sv_f32 (d->shift); - svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0); - svfloat32_t n = svsub_x (pg, z, shift); + svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift); + svfloat32_t n = svsub_x (svptrue_b32 (), z, shift); /* r = x - n*log10(2)/N. */ - svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1); - r = svmls_lane (r, n, log10_2_and_inv, 2); + svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x); + r = svmls_lane (r, n, lane_consts, 0); - svbool_t special = svacgt (pg, x, d->special_bound); svfloat32_t scale = svexpa (svreinterpret_u32 (z)); /* Polynomial evaluation: poly(r) ~ exp10(r)-1. 
*/ - svfloat32_t r2 = svmul_x (pg, r, r); - svfloat32_t poly - = svmla_x (pg, svmul_x (pg, r, d->poly[0]), - sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2); - - if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svmla_x (pg, scale, scale, poly), special); + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); + svfloat32_t p0 = svmul_lane (r, lane_consts, 1); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); return svmla_x (pg, scale, scale, poly); } + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t special, const struct data *d) +{ + return sv_call_f32 (exp10f, x, sv_exp10f_inline (x, svptrue_b32 (), d), + special); +} + +/* Single-precision SVE exp10f routine. Implements the same algorithm + as AdvSIMD exp10f. + Worst case error is 1.02 ULPs. + _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 + want 0x1.ba5f9cp-1. */ +svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t special = svacgt (pg, x, d->thres); + if (__glibc_unlikely (svptest_any (special, special))) + return special_case (x, special, d); + return sv_exp10f_inline (x, pg, d); +} diff --git a/sysdeps/aarch64/fpu/exp2_advsimd.c b/sysdeps/aarch64/fpu/exp2_advsimd.c index ae1e63d..71affe8 100644 --- a/sysdeps/aarch64/fpu/exp2_advsimd.c +++ b/sysdeps/aarch64/fpu/exp2_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (AdvSIMD) exp2 function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c index 22848eb..6db8526 100644 --- a/sysdeps/aarch64/fpu/exp2_sve.c +++ b/sysdeps/aarch64/fpu/exp2_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) exp2 function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,7 +18,6 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f64.h" #define N (1 << V_EXP_TABLE_BITS) @@ -27,15 +26,15 @@ static const struct data { - double poly[4]; + double c0, c2; + double c1, c3; double shift, big_bound, uoflow_bound; } data = { /* Coefficients are computed using Remez algorithm with minimisation of the absolute error. */ - .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5, - 0x1.3b2abf5571ad8p-7 }, - .shift = 0x1.8p52 / N, - .uoflow_bound = UOFlowBound, + .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3, + .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7, + .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound, .big_bound = BigBound, }; @@ -67,9 +66,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, /* |n| > 1280 => 2^(n) overflows. 
*/ svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound); - svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); svfloat64_t r2 = svmla_x (pg, s2, s2, y); - svfloat64_t r0 = svmul_x (pg, r2, s1); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); return svsel (p_cmp, r1, r0); } @@ -99,11 +98,14 @@ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg) svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS); svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top)); + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); /* Approximate exp2(r) using polynomial. */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly); - svfloat64_t y = svmul_x (pg, r, p); - + /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); + svfloat64_t p = svmla_x (pg, p01, p23, r2); + svfloat64_t y = svmul_x (svptrue_b64 (), r, p); /* Assemble exp2(x) = exp2(r) * scale. */ if (__glibc_unlikely (svptest_any (pg, special))) return special_case (pg, scale, y, kd, d); diff --git a/sysdeps/aarch64/fpu/exp2f_advsimd.c b/sysdeps/aarch64/fpu/exp2f_advsimd.c index 69e0b19..7cbce0d 100644 --- a/sysdeps/aarch64/fpu/exp2f_advsimd.c +++ b/sysdeps/aarch64/fpu/exp2f_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (AdvSIMD) exp2 function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -21,24 +21,28 @@ static const struct data { - float32x4_t poly[5]; - uint32x4_t exponent_bias; + float32x4_t c1, c3; + uint32x4_t exponent_bias, special_offset, special_bias; #if !WANT_SIMD_EXCEPT - float32x4_t special_bound, scale_thresh; + float32x4_t scale_thresh, special_bound; #endif + float c0, c2, c4, zero; } data = { /* maxerr: 1.962 ulp. */ - .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), - V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, + .c0 = 0x1.59977ap-10f, + .c1 = V4 (0x1.3ce9e4p-7f), + .c2 = 0x1.c6bd32p-5f, + .c3 = V4 (0x1.ebf9bcp-3f), + .c4 = 0x1.62e422p-1f, .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), #if !WANT_SIMD_EXCEPT .special_bound = V4 (126.0f), .scale_thresh = V4 (192.0f), #endif }; -#define C(i) d->poly[i] - #if WANT_SIMD_EXCEPT # define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ @@ -55,16 +59,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) #else -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) - static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) { /* 2^n may overflow, break it up into s1*s2. 
*/ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); float32x4_t r2 = vmulq_f32 (s1, s1); @@ -80,13 +81,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, scale, p, q, poly; - uint32x4_t cmp, e; #if WANT_SIMD_EXCEPT /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); - cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); float32x4_t xm = x; /* If any lanes are special, mask them with 1 and retain a copy of x to allow special_case to fix special lanes later. This is only necessary if fenv @@ -95,23 +94,24 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) x = vbslq_f32 (cmp, v_f32 (1), x); #endif - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ - n = vrndaq_f32 (x); - r = vsubq_f32 (x, n); - e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. 
*/ + float32x4_t n = vrndaq_f32 (x); + float32x4_t r = vsubq_f32 (x, n); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); #if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32 (n, d->special_bound); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); #endif - r2 = vmulq_f32 (r, r); - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); + float32x4_t c024 = vld1q_f32 (&d->c0); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1); q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); + p = vmulq_laneq_f32 (r, c024, 2); + float32x4_t poly = vfmaq_f32 (p, q, r2); if (__glibc_unlikely (v_any_u32 (cmp))) #if WANT_SIMD_EXCEPT diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c index 8a686e3..fcd7830 100644 --- a/sysdeps/aarch64/fpu/exp2f_sve.c +++ b/sysdeps/aarch64/fpu/exp2f_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) exp2 function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -24,54 +24,64 @@ static const struct data { - float poly[5]; + float c0, c2, c4, c1, c3; float shift, thres; } data = { - /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for - compatibility with polynomial helpers. */ - .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f, - 0x1.59977ap-10f }, + /* Coefficients copied from the polynomial in AdvSIMD variant. */ + .c0 = 0x1.62e422p-1f, + .c1 = 0x1.ebf9bcp-3f, + .c2 = 0x1.c6bd32p-5f, + .c3 = 0x1.3ce9e4p-7f, + .c4 = 0x1.59977ap-10f, /* 1.5*2^17 + 127. */ - .shift = 0x1.903f8p17f, + .shift = 0x1.803f8p17f, /* Roughly 87.3. 
For x < -Thres, the result is subnormal and not handled correctly by FEXPA. */ .thres = Thres, }; -static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) -{ - return sv_call_f32 (exp2f, x, y, special); -} - -/* Single-precision SVE exp2f routine. Implements the same algorithm - as AdvSIMD exp2f. - Worst case error is 1.04 ULPs. - SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0 - want 0x1.ba7ebp+0. */ -svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) +static inline svfloat32_t +sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) { - const struct data *d = ptr_barrier (&data); /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = n + r, with r in [-1/2, 1/2]. */ - svfloat32_t shift = sv_f32 (d->shift); - svfloat32_t z = svadd_x (pg, x, shift); - svfloat32_t n = svsub_x (pg, z, shift); - svfloat32_t r = svsub_x (pg, x, n); + svfloat32_t z = svadd_x (svptrue_b32 (), x, d->shift); + svfloat32_t n = svsub_x (svptrue_b32 (), z, d->shift); + svfloat32_t r = svsub_x (svptrue_b32 (), x, n); - svbool_t special = svacgt (pg, x, d->thres); svfloat32_t scale = svexpa (svreinterpret_u32 (z)); /* Polynomial evaluation: poly(r) ~ exp2(r)-1. Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for coefficients 1 to 4, and apply most significant coefficient directly. 
*/ - svfloat32_t r2 = svmul_x (pg, r, r); - svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1); - svfloat32_t p0 = svmul_x (pg, r, d->poly[0]); + svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2); + svfloat32_t p14 = svmla_x (pg, p12, r2, p34); + svfloat32_t p0 = svmul_lane (r, even_coeffs, 0); svfloat32_t poly = svmla_x (pg, p0, r2, p14); - if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svmla_x (pg, scale, scale, poly), special); - return svmla_x (pg, scale, scale, poly); } + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svbool_t special, const struct data *d) +{ + return sv_call_f32 (exp2f, x, sv_exp2f_inline (x, svptrue_b32 (), d), + special); +} + +/* Single-precision SVE exp2f routine. Implements the same algorithm + as AdvSIMD exp2f. + Worst case error is 1.04 ULPs. + _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1 + want 0x1.ba6a64p-1. */ +svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t special = svacgt (pg, x, d->thres); + if (__glibc_unlikely (svptest_any (special, special))) + return special_case (x, special, d); + return sv_exp2f_inline (x, pg, d); +} diff --git a/sysdeps/aarch64/fpu/exp_advsimd.c b/sysdeps/aarch64/fpu/exp_advsimd.c index 5e3a9a0..bd97012 100644 --- a/sysdeps/aarch64/fpu/exp_advsimd.c +++ b/sysdeps/aarch64/fpu/exp_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) exp function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/exp_sve.c b/sysdeps/aarch64/fpu/exp_sve.c index aabaaa1..dc04948 100644 --- a/sysdeps/aarch64/fpu/exp_sve.c +++ b/sysdeps/aarch64/fpu/exp_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) exp function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,12 +21,15 @@ static const struct data { - double poly[4]; + double c0, c2; + double c1, c3; double ln2_hi, ln2_lo, inv_ln2, shift, thres; + } data = { - .poly = { /* ulp error: 0.53. */ - 0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5, - 0x1.1111266d28935p-7 }, + .c0 = 0x1.fffffffffdbcdp-2, + .c1 = 0x1.555555555444cp-3, + .c2 = 0x1.555573c6a9f7dp-5, + .c3 = 0x1.1111266d28935p-7, .ln2_hi = 0x1.62e42fefa3800p-1, .ln2_lo = 0x1.ef35793c76730p-45, /* 1/ln2. */ @@ -36,7 +39,6 @@ static const struct data .thres = 704.0, }; -#define C(i) sv_f64 (d->poly[i]) #define SpecialOffset 0x6000000000000000 /* 0x1p513. */ /* SpecialBias1 + SpecialBias1 = asuint(1.0). */ #define SpecialBias1 0x7000000000000000 /* 0x1p769. */ @@ -56,20 +58,20 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n) svuint64_t b = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */ - /* Set s1 to generate overflow depending on sign of exponent n. */ - svfloat64_t s1 = svreinterpret_f64 ( - svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b. */ - /* Offset s to avoid overflow in final result if n is below threshold. */ + /* Set s1 to generate overflow depending on sign of exponent n, + ie. s1 = 0x70...0 - b. */ + svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); + /* Offset s to avoid overflow in final result if n is below threshold. + ie. s2 = as_u64 (s) - 0x3010...0 + b. 
*/ svfloat64_t s2 = svreinterpret_f64 ( - svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), - b)); /* as_u64 (s) - 0x3010...0 + b. */ + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); /* |n| > 1280 => 2^(n) overflows. */ svbool_t p_cmp = svacgt (pg, n, 1280.0); - svfloat64_t r1 = svmul_x (pg, s1, s1); + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); svfloat64_t r2 = svmla_x (pg, s2, s2, y); - svfloat64_t r0 = svmul_x (pg, r2, s1); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); return svsel (p_cmp, r1, r0); } @@ -103,16 +105,16 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg) svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); svuint64_t u = svreinterpret_u64 (z); svfloat64_t n = svsub_x (pg, z, d->shift); - + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)]. */ svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); svfloat64_t r = svmls_lane (x, n, ln2, 0); r = svmls_lane (r, n, ln2, 1); /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5. */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t p01 = svmla_x (pg, C (0), C (1), r); - svfloat64_t p23 = svmla_x (pg, C (2), C (3), r); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); svfloat64_t p04 = svmla_x (pg, p01, p23, r2); svfloat64_t y = svmla_x (pg, r, p04, r2); diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c index 99d2e64..f1e93b8 100644 --- a/sysdeps/aarch64/fpu/expf_advsimd.c +++ b/sysdeps/aarch64/fpu/expf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) exp function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -21,21 +21,25 @@ static const struct data { - float32x4_t poly[5]; - float32x4_t shift, inv_ln2, ln2_hi, ln2_lo; - uint32x4_t exponent_bias; + float32x4_t c1, c3, c4, inv_ln2; + float ln2_hi, ln2_lo, c0, c2; + uint32x4_t exponent_bias, special_offset, special_bias; #if !WANT_SIMD_EXCEPT float32x4_t special_bound, scale_thresh; #endif } data = { /* maxerr: 1.45358 +0.5 ulp. */ - .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), - V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, - .shift = V4 (0x1.8p23f), + .c0 = 0x1.0e4020p-7f, + .c1 = V4 (0x1.573e2ep-5f), + .c2 = 0x1.555e66p-3f, + .c3 = V4 (0x1.fffdb6p-2f), + .c4 = V4 (0x1.ffffecp-1f), .inv_ln2 = V4 (0x1.715476p+0f), - .ln2_hi = V4 (0x1.62e4p-1f), - .ln2_lo = V4 (0x1.7f7d1cp-20f), + .ln2_hi = 0x1.62e4p-1f, + .ln2_lo = 0x1.7f7d1cp-20f, .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), #if !WANT_SIMD_EXCEPT .special_bound = V4 (126.0f), .scale_thresh = V4 (192.0f), @@ -60,19 +64,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) #else -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) - static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) { /* 2^n may overflow, break it up into s1*s2. 
*/ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); float32x4_t r2 = vmulq_f32 (s1, s1); + // (s2 + p*s2)*s1 = s2(p+1)s1 float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); /* Similar to r1 but avoids double rounding in the subnormal range. */ float32x4_t r0 = vfmaq_f32 (scale, poly, scale); @@ -85,12 +87,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, scale, p, q, poly, z; - uint32x4_t cmp, e; + float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); #if WANT_SIMD_EXCEPT /* asuint(x) - TinyBound >= BigBound - TinyBound. */ - cmp = vcgeq_u32 ( + uint32x4_t cmp = vcgeq_u32 ( vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), TinyBound), SpecialBound); @@ -104,23 +105,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ - z = vfmaq_f32 (d->shift, x, d->inv_ln2); - n = vsubq_f32 (z, d->shift); - r = vfmsq_f32 (x, n, d->ln2_hi); - r = vfmsq_f32 (r, n, d->ln2_lo); - e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); + float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0); + r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); #if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32 (n, d->special_bound); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); #endif - r2 = vmulq_f32 (r, r); - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); + p = vmulq_f32 (d->c4, r); + float32x4_t poly = vfmaq_f32 (p, q, r2); if (__glibc_unlikely (v_any_u32 (cmp))) #if WANT_SIMD_EXCEPT diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c index 3ba79bc..f9249db 100644 --- a/sysdeps/aarch64/fpu/expf_sve.c +++ b/sysdeps/aarch64/fpu/expf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) exp function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,33 +18,25 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" +#include "sv_expf_inline.h" + +/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled + correctly by FEXPA. 
*/ +#define Thres 0x1.5d5e2ap+6f static const struct data { - float poly[5]; - float inv_ln2, ln2_hi, ln2_lo, shift, thres; + struct sv_expf_data d; + float thres; } data = { - /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for - compatibility with polynomial helpers. */ - .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, - 0x1.0e4020p-7f }, - .inv_ln2 = 0x1.715476p+0f, - .ln2_hi = 0x1.62e4p-1f, - .ln2_lo = 0x1.7f7d1cp-20f, - /* 1.5*2^17 + 127. */ - .shift = 0x1.903f8p17f, - /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled - correctly by FEXPA. */ - .thres = 0x1.5d5e2ap+6f, + .d = SV_EXPF_DATA, + .thres = Thres, }; -#define C(i) sv_f32 (d->poly[i]) -#define ExponentBias 0x3f800000 - static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d) { - return sv_call_f32 (expf, x, y, special); + return sv_call_f32 (expf, x, expf_inline (x, svptrue_b32 (), d), special); } /* Optimised single-precision SVE exp function. @@ -54,36 +46,8 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special) svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - - /* Load some constants in quad-word chunks to minimise memory access (last - lane is wasted). */ - svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2); - - /* n = round(x/(ln2/N)). */ - svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0); - svfloat32_t n = svsub_x (pg, z, d->shift); - - /* r = x - n*ln2/N. */ - svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1); - r = svmls_lane (r, n, invln2_and_ln2, 2); - - /* scale = 2^(n/N). 
*/ svbool_t is_special_case = svacgt (pg, x, d->thres); - svfloat32_t scale = svexpa (svreinterpret_u32 (z)); - - /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ - svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); - svfloat32_t p34 = svmla_x (pg, C (3), C (4), r); - svfloat32_t r2 = svmul_x (pg, r, r); - svfloat32_t p14 = svmla_x (pg, p12, p34, r2); - svfloat32_t p0 = svmul_x (pg, r, C (0)); - svfloat32_t poly = svmla_x (pg, p0, r2, p14); - if (__glibc_unlikely (svptest_any (pg, is_special_case))) - return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case); - - return svmla_x (pg, scale, scale, poly); + return special_case (x, is_special_case, &d->d); + return expf_inline (x, pg, &d->d); } diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c index 3db3b80..01e7292 100644 --- a/sysdeps/aarch64/fpu/expm1_advsimd.c +++ b/sysdeps/aarch64/fpu/expm1_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision AdvSIMD expm1 - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,31 +18,18 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" +#include "v_expm1_inline.h" static const struct data { - float64x2_t poly[11]; - float64x2_t invln2; - double ln2[2]; - float64x2_t shift; - int64x2_t exponent_bias; + struct v_expm1_data d; #if WANT_SIMD_EXCEPT uint64x2_t thresh, tiny_bound; #else float64x2_t oflow_bound; #endif } data = { - /* Generated using fpminimax, with degree=12 in [log(2)/2, log(2)/2]. 
*/ - .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), - V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), - V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), - V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), - V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) }, - .invln2 = V2 (0x1.71547652b82fep0), - .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, - .shift = V2 (0x1.8p52), - .exponent_bias = V2 (0x3ff0000000000000), + .d = V_EXPM1_DATA, #if WANT_SIMD_EXCEPT /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs compare. */ @@ -58,67 +45,36 @@ static const struct data }; static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +special_case (float64x2_t x, uint64x2_t special, const struct data *d) { - return v_call_f64 (expm1, x, y, special); + return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d), + special); } /* Double-precision vector exp(x) - 1 function. - The maximum error observed error is 2.18 ULP: - _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2 - want 0x1.a8b9ea8d66e2p-2. */ + The maximum error observed error is 2.05 ULP: + _ZGVnN2v_expm1(0x1.634902eaff3adp-2) got 0x1.a8b636e2a9388p-2 + want 0x1.a8b636e2a9386p-2. */ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x) { const struct data *d = ptr_barrier (&data); - uint64x2_t ix = vreinterpretq_u64_f64 (x); - #if WANT_SIMD_EXCEPT + uint64x2_t ix = vreinterpretq_u64_f64 (x); /* If fp exceptions are to be triggered correctly, fall back to scalar for |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for shift-left by 1, and compare with thresh which was left-shifted offline - this is effectively an absolute compare. */ uint64x2_t special = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh); - if (__glibc_unlikely (v_any_u64 (special))) - x = v_zerofy_f64 (x, special); #else /* Large input, NaNs and Infs. 
*/ uint64x2_t special = vcageq_f64 (x, d->oflow_bound); #endif - /* Reduce argument to smaller range: - Let i = round(x / ln2) - and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 - where 2^i is exact because i is an integer. */ - float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift); - int64x2_t i = vcvtq_s64_f64 (n); - float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); - float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0); - f = vfmsq_laneq_f64 (f, n, ln2, 1); - - /* Approximate expm1(f) using polynomial. - Taylor expansion for expm1(x) has the form: - x + ax^2 + bx^3 + cx^4 .... - So we calculate the polynomial P(f) = a + bf + cf^2 + ... - and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t f4 = vmulq_f64 (f2, f2); - float64x2_t f8 = vmulq_f64 (f4, f4); - float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly)); - - /* Assemble the result. - expm1(x) ~= 2^i * (p + 1) - 1 - Let t = 2^i. */ - int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias); - float64x2_t t = vreinterpretq_f64_s64 (u); - if (__glibc_unlikely (v_any_u64 (special))) - return special_case (vreinterpretq_f64_u64 (ix), - vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t), - special); + return special_case (x, special, d); /* expm1(x) ~= p * t + (t - 1). */ - return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); + return expm1_inline (x, &d->d); } diff --git a/sysdeps/aarch64/fpu/expm1_sve.c b/sysdeps/aarch64/fpu/expm1_sve.c index c933cf9..d4ba8cc 100644 --- a/sysdeps/aarch64/fpu/expm1_sve.c +++ b/sysdeps/aarch64/fpu/expm1_sve.c @@ -1,6 +1,6 @@ /* Double-precision SVE expm1 - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c index a0616ec..a5712cf 100644 --- a/sysdeps/aarch64/fpu/expm1f_advsimd.c +++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision AdvSIMD expm1 - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,27 +18,18 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f32.h" +#include "v_expm1f_inline.h" static const struct data { - float32x4_t poly[5]; - float invln2_and_ln2[4]; - float32x4_t shift; - int32x4_t exponent_bias; + struct v_expm1f_data d; #if WANT_SIMD_EXCEPT uint32x4_t thresh; #else float32x4_t oflow_bound; #endif } data = { - /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */ - .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), - V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, - /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */ - .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, - .shift = V4 (0x1.8p23f), - .exponent_bias = V4 (0x3f800000), + .d = V_EXPM1F_DATA, #if !WANT_SIMD_EXCEPT /* Value above which expm1f(x) should overflow. Absolute value of the underflow bound is greater than this, so it catches both cases - there is @@ -55,67 +46,38 @@ static const struct data #define TinyBound v_u32 (0x34000000 << 1) static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +special_case (float32x4_t x, uint32x4_t special, const struct data *d) { - return v_call_f32 (expm1f, x, y, special); + return v_call_f32 ( + expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special); } /* Single-precision vector exp(x) - 1 function. 
- The maximum error is 1.51 ULP: - _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2 - want 0x1.e2fb94p-2. */ + The maximum error is 1.62 ULP: + _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2 + want 0x1.da9f44p-2. */ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - uint32x4_t ix = vreinterpretq_u32_f32 (x); #if WANT_SIMD_EXCEPT + uint32x4_t ix = vreinterpretq_u32_f32 (x); /* If fp exceptions are to be triggered correctly, fall back to scalar for |x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for shift-left by 1, and compare with thresh which was left-shifted offline - this is effectively an absolute compare. */ uint32x4_t special = vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh); - if (__glibc_unlikely (v_any_u32 (special))) - x = v_zerofy_f32 (x, special); #else /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */ uint32x4_t special = vcagtq_f32 (x, d->oflow_bound); #endif - /* Reduce argument to smaller range: - Let i = round(x / ln2) - and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 - where 2^i is exact because i is an integer. */ - float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); - float32x4_t j - = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift); - int32x4_t i = vcvtq_s32_f32 (j); - float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1); - f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2); - - /* Approximate expm1(f) using polynomial. - Taylor expansion for expm1(x) has the form: - x + ax^2 + bx^3 + cx^4 .... - So we calculate the polynomial P(f) = a + bf + cf^2 + ... - and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - float32x4_t p = v_horner_4_f32 (f, d->poly); - p = vfmaq_f32 (f, vmulq_f32 (f, f), p); - - /* Assemble the result. - expm1(x) ~= 2^i * (p + 1) - 1 - Let t = 2^i. 
*/ - int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias); - float32x4_t t = vreinterpretq_f32_s32 (u); - if (__glibc_unlikely (v_any_u32 (special))) - return special_case (vreinterpretq_f32_u32 (ix), - vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t), - special); + return special_case (x, special, d); /* expm1(x) ~= p * t + (t - 1). */ - return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t); + return expm1f_inline (x, &d->d); } libmvec_hidden_def (V_NAME_F1 (expm1)) HALF_WIDTH_ALIAS_F1 (expm1) diff --git a/sysdeps/aarch64/fpu/expm1f_sve.c b/sysdeps/aarch64/fpu/expm1f_sve.c index 7c85212..21cb2e1 100644 --- a/sysdeps/aarch64/fpu/expm1f_sve.c +++ b/sysdeps/aarch64/fpu/expm1f_sve.c @@ -1,6 +1,6 @@ /* Single-precision SVE expm1 - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,7 +18,6 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f32.h" /* Largest value of x for which expm1(x) should round to -1. */ #define SpecialBound 0x1.5ebc4p+6f @@ -28,20 +27,17 @@ static const struct data /* These 4 are grouped together so they can be loaded as one quadword, then used with _lane forms of svmla/svmls. */ float c2, c4, ln2_hi, ln2_lo; - float c0, c1, c3, inv_ln2, special_bound, shift; + float c0, inv_ln2, c1, c3, special_bound; } data = { /* Generated using fpminimax. 
*/ .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, - .c4 = 0x1.6b55a2p-10, + .c4 = 0x1.6b55a2p-10, .inv_ln2 = 0x1.715476p+0f, + .special_bound = SpecialBound, .ln2_lo = 0x1.7f7d1cp-20f, + .ln2_hi = 0x1.62e4p-1f, - .special_bound = SpecialBound, .shift = 0x1.8p23f, - .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, - .ln2_lo = 0x1.7f7d1cp-20f, }; -#define C(i) sv_f32 (d->c##i) - static svfloat32_t NOINLINE special_case (svfloat32_t x, svbool_t pg) { @@ -71,9 +67,8 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg) and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 where 2^i is exact because i is an integer. */ - svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2); - j = svsub_x (pg, j, d->shift); - svint32_t i = svcvt_s32_x (pg, j); + svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2); + j = svrinta_x (pg, j); svfloat32_t f = svmls_lane (x, j, lane_constants, 2); f = svmls_lane (f, j, lane_constants, 3); @@ -83,17 +78,17 @@ svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg) x + ax^2 + bx^3 + cx^4 .... So we calculate the polynomial P(f) = a + bf + cf^2 + ... and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0); - svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1); - svfloat32_t f2 = svmul_x (pg, f, f); + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1); + svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f); svfloat32_t p = svmla_x (pg, p12, f2, p34); - p = svmla_x (pg, C (0), f, p); + + p = svmla_x (pg, sv_f32 (d->c0), f, p); p = svmla_x (pg, f, f2, p); /* Assemble the result. expm1(x) ~= 2^i * (p + 1) - 1 Let t = 2^i. 
*/ - svfloat32_t t = svreinterpret_f32 ( - svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000)); - return svmla_x (pg, svsub_x (pg, t, 1), p, t); + svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j)); + return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t); } diff --git a/sysdeps/aarch64/fpu/fclrexcpt.c b/sysdeps/aarch64/fpu/fclrexcpt.c index cbc7afd..841794a 100644 --- a/sysdeps/aarch64/fpu/fclrexcpt.c +++ b/sysdeps/aarch64/fpu/fclrexcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. +/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fedisblxcpt.c b/sysdeps/aarch64/fpu/fedisblxcpt.c index 7f5284a..ee4e2d8 100644 --- a/sysdeps/aarch64/fpu/fedisblxcpt.c +++ b/sysdeps/aarch64/fpu/fedisblxcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2024 Free Software Foundation, Inc. +/* Copyright (C) 2001-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/feenablxcpt.c b/sysdeps/aarch64/fpu/feenablxcpt.c index a03604a..0d214fc 100644 --- a/sysdeps/aarch64/fpu/feenablxcpt.c +++ b/sysdeps/aarch64/fpu/feenablxcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2024 Free Software Foundation, Inc. +/* Copyright (C) 2001-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fegetenv.c b/sysdeps/aarch64/fpu/fegetenv.c index 0b83aea..eb8117c 100644 --- a/sysdeps/aarch64/fpu/fegetenv.c +++ b/sysdeps/aarch64/fpu/fegetenv.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. +/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fegetexcept.c b/sysdeps/aarch64/fpu/fegetexcept.c index 5c3f02a..9fb2cb5 100644 --- a/sysdeps/aarch64/fpu/fegetexcept.c +++ b/sysdeps/aarch64/fpu/fegetexcept.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2024 Free Software Foundation, Inc. 
+/* Copyright (C) 2001-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fegetmode.c b/sysdeps/aarch64/fpu/fegetmode.c index f1d743c..de9454c 100644 --- a/sysdeps/aarch64/fpu/fegetmode.c +++ b/sysdeps/aarch64/fpu/fegetmode.c @@ -1,5 +1,5 @@ /* Store current floating-point control modes. AArch64 version. - Copyright (C) 2016-2024 Free Software Foundation, Inc. + Copyright (C) 2016-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/fegetround.c b/sysdeps/aarch64/fpu/fegetround.c index c7243e4..d6758c9 100644 --- a/sysdeps/aarch64/fpu/fegetround.c +++ b/sysdeps/aarch64/fpu/fegetround.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. +/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/feholdexcpt.c b/sysdeps/aarch64/fpu/feholdexcpt.c index 3948cfd..79ae549 100644 --- a/sysdeps/aarch64/fpu/feholdexcpt.c +++ b/sysdeps/aarch64/fpu/feholdexcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. +/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fenv_private.h b/sysdeps/aarch64/fpu/fenv_private.h index 0052656..eb16572 100644 --- a/sysdeps/aarch64/fpu/fenv_private.h +++ b/sysdeps/aarch64/fpu/fenv_private.h @@ -1,5 +1,5 @@ /* Private floating point rounding and exceptions handling. AArch64 version. - Copyright (C) 2014-2024 Free Software Foundation, Inc. + Copyright (C) 2014-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/fesetenv.c b/sysdeps/aarch64/fpu/fesetenv.c index f51c44f..6a0df36 100644 --- a/sysdeps/aarch64/fpu/fesetenv.c +++ b/sysdeps/aarch64/fpu/fesetenv.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. +/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fesetexcept.c b/sysdeps/aarch64/fpu/fesetexcept.c index c610e74..6913973 100644 --- a/sysdeps/aarch64/fpu/fesetexcept.c +++ b/sysdeps/aarch64/fpu/fesetexcept.c @@ -1,5 +1,5 @@ /* Set given exception flags. AArch64 version. - Copyright (C) 2016-2024 Free Software Foundation, Inc. + Copyright (C) 2016-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/fesetmode.c b/sysdeps/aarch64/fpu/fesetmode.c index 669594e..d6b3682 100644 --- a/sysdeps/aarch64/fpu/fesetmode.c +++ b/sysdeps/aarch64/fpu/fesetmode.c @@ -1,5 +1,5 @@ /* Install given floating-point control modes. AArch64 version. - Copyright (C) 2016-2024 Free Software Foundation, Inc. + Copyright (C) 2016-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/fesetround.c b/sysdeps/aarch64/fpu/fesetround.c index def7872..cd0c3bc 100644 --- a/sysdeps/aarch64/fpu/fesetround.c +++ b/sysdeps/aarch64/fpu/fesetround.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. +/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
diff --git a/sysdeps/aarch64/fpu/feupdateenv.c b/sysdeps/aarch64/fpu/feupdateenv.c index f3f0525..4690e84 100644 --- a/sysdeps/aarch64/fpu/feupdateenv.c +++ b/sysdeps/aarch64/fpu/feupdateenv.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2024 Free Software Foundation, Inc. +/* Copyright (C) 2009-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/fgetexcptflg.c b/sysdeps/aarch64/fpu/fgetexcptflg.c index 932a96f..a65b295 100644 --- a/sysdeps/aarch64/fpu/fgetexcptflg.c +++ b/sysdeps/aarch64/fpu/fgetexcptflg.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2024 Free Software Foundation, Inc. +/* Copyright (C) 2001-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h b/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h index 92e15f0..d753035 100644 --- a/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h +++ b/sysdeps/aarch64/fpu/finclude/math-vector-fortran.h @@ -1,5 +1,5 @@ ! Platform-specific declarations of SIMD math functions for Fortran. -*- f90 -*- -! Copyright (C) 2019-2024 Free Software Foundation, Inc. +! Copyright (C) 2019-2025 Free Software Foundation, Inc. ! This file is part of the GNU C Library. ! ! The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/finite_pow.h b/sysdeps/aarch64/fpu/finite_pow.h index 84c93d4..bbe6d2d 100644 --- a/sysdeps/aarch64/fpu/finite_pow.h +++ b/sysdeps/aarch64/fpu/finite_pow.h @@ -1,6 +1,6 @@ /* Double-precision x^y function. - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/fpu_control.h b/sysdeps/aarch64/fpu/fpu_control.h index 263cf36..a93dbf5 100644 --- a/sysdeps/aarch64/fpu/fpu_control.h +++ b/sysdeps/aarch64/fpu/fpu_control.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2024 Free Software Foundation, Inc. +/* Copyright (C) 1996-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -29,17 +29,31 @@ # define _FPU_GETFPSR(fpsr) (fpsr = __builtin_aarch64_get_fpsr ()) # define _FPU_SETFPSR(fpsr) __builtin_aarch64_set_fpsr (fpsr) #else -# define _FPU_GETCW(fpcr) \ - __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (fpcr)) - -# define _FPU_SETCW(fpcr) \ - __asm__ __volatile__ ("msr fpcr, %0" : : "r" (fpcr)) - -# define _FPU_GETFPSR(fpsr) \ - __asm__ __volatile__ ("mrs %0, fpsr" : "=r" (fpsr)) - -# define _FPU_SETFPSR(fpsr) \ - __asm__ __volatile__ ("msr fpsr, %0" : : "r" (fpsr)) +# define _FPU_GETCW(fpcr) \ + ({ \ + __uint64_t __fpcr; \ + __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (__fpcr)); \ + fpcr = __fpcr; \ + }) + +# define _FPU_SETCW(fpcr) \ + ({ \ + __uint64_t __fpcr = fpcr; \ + __asm__ __volatile__ ("msr fpcr, %0" : : "r" (__fpcr)); \ + }) + +# define _FPU_GETFPSR(fpsr) \ + ({ \ + __uint64_t __fpsr; \ + __asm__ __volatile__ ("mrs %0, fpsr" : "=r" (__fpsr)); \ + fpsr = __fpsr; \ + }) + +# define _FPU_SETFPSR(fpsr) \ + ({ \ + __uint64_t __fpsr = fpsr; \ + __asm__ __volatile__ ("msr fpsr, %0" : : "r" (__fpsr)); \ + }) #endif /* Reserved bits should be preserved when modifying register diff --git a/sysdeps/aarch64/fpu/fraiseexcpt.c b/sysdeps/aarch64/fpu/fraiseexcpt.c index 5abf498..518a6eb 100644 --- a/sysdeps/aarch64/fpu/fraiseexcpt.c +++ b/sysdeps/aarch64/fpu/fraiseexcpt.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. +/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
@@ -19,11 +19,12 @@ #include <fenv.h> #include <fpu_control.h> #include <float.h> +#include <stdint.h> int __feraiseexcept (int excepts) { - int fpsr; + uint64_t fpsr; const float fp_zero = 0.0; const float fp_one = 1.0; const float fp_max = FLT_MAX; diff --git a/sysdeps/aarch64/fpu/fsetexcptflg.c b/sysdeps/aarch64/fpu/fsetexcptflg.c index d4df771..1afb1eb 100644 --- a/sysdeps/aarch64/fpu/fsetexcptflg.c +++ b/sysdeps/aarch64/fpu/fsetexcptflg.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. +/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/ftestexcept.c b/sysdeps/aarch64/fpu/ftestexcept.c index d40148d..1c709e8 100644 --- a/sysdeps/aarch64/fpu/ftestexcept.c +++ b/sysdeps/aarch64/fpu/ftestexcept.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. +/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/get-rounding-mode.h b/sysdeps/aarch64/fpu/get-rounding-mode.h index 5adb830..ae3c11d 100644 --- a/sysdeps/aarch64/fpu/get-rounding-mode.h +++ b/sysdeps/aarch64/fpu/get-rounding-mode.h @@ -1,6 +1,6 @@ /* Determine floating-point rounding mode within libc. AArch64 version. - Copyright (C) 2012-2024 Free Software Foundation, Inc. + Copyright (C) 2012-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/hypot_advsimd.c b/sysdeps/aarch64/fpu/hypot_advsimd.c index e4e279f..9af5402 100644 --- a/sysdeps/aarch64/fpu/hypot_advsimd.c +++ b/sysdeps/aarch64/fpu/hypot_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) hypot function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/hypot_sve.c b/sysdeps/aarch64/fpu/hypot_sve.c index 7441704..1e7803a 100644 --- a/sysdeps/aarch64/fpu/hypot_sve.c +++ b/sysdeps/aarch64/fpu/hypot_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) hypot function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/hypotf_advsimd.c b/sysdeps/aarch64/fpu/hypotf_advsimd.c index 34818b0..e2206f0 100644 --- a/sysdeps/aarch64/fpu/hypotf_advsimd.c +++ b/sysdeps/aarch64/fpu/hypotf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) hypot function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/hypotf_sve.c b/sysdeps/aarch64/fpu/hypotf_sve.c index 3a403de..489db8f 100644 --- a/sysdeps/aarch64/fpu/hypotf_sve.c +++ b/sysdeps/aarch64/fpu/hypotf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) hypot function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/log10_advsimd.c b/sysdeps/aarch64/fpu/log10_advsimd.c index c065aae..faa2b25 100644 --- a/sysdeps/aarch64/fpu/log10_advsimd.c +++ b/sysdeps/aarch64/fpu/log10_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (AdvSIMD) log10 function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -18,36 +18,36 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" - -#define N (1 << V_LOG10_TABLE_BITS) static const struct data { - uint64x2_t min_norm; + uint64x2_t off, sign_exp_mask, offset_lower_bound; uint32x4_t special_bound; - float64x2_t poly[5]; - float64x2_t invln10, log10_2, ln2; - uint64x2_t sign_exp_mask; + double invln10, log10_2; + double c1, c3; + float64x2_t c0, c2, c4; } data = { /* Computed from log coefficients divided by log(10) then rounded to double precision. */ - .poly = { V2 (-0x1.bcb7b1526e506p-3), V2 (0x1.287a7636be1d1p-3), - V2 (-0x1.bcb7b158af938p-4), V2 (0x1.63c78734e6d07p-4), - V2 (-0x1.287461742fee4p-4) }, - .ln2 = V2 (0x1.62e42fefa39efp-1), - .invln10 = V2 (0x1.bcb7b1526e50ep-2), - .log10_2 = V2 (0x1.34413509f79ffp-2), - .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ - .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ + .c0 = V2 (-0x1.bcb7b1526e506p-3), + .c1 = 0x1.287a7636be1d1p-3, + .c2 = V2 (-0x1.bcb7b158af938p-4), + .c3 = 0x1.63c78734e6d07p-4, + .c4 = V2 (-0x1.287461742fee4p-4), + .invln10 = 0x1.bcb7b1526e50ep-2, + .log10_2 = 0x1.34413509f79ffp-2, + .off = V2 (0x3fe6900900000000), .sign_exp_mask = V2 (0xfff0000000000000), + /* Lower bound is 0x0010000000000000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound - offset (which wraps around). */ + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. 
*/ }; -#define Off v_u64 (0x3fe6900900000000) +#define N (1 << V_LOG10_TABLE_BITS) #define IndexMask (N - 1) -#define T(s, i) __v_log10_data.s[i] - struct entry { float64x2_t invc; @@ -70,10 +70,11 @@ lookup (uint64x2_t i) } static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, - uint32x2_t special) +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, + uint32x2_t special, const struct data *d) { - return v_call_f64 (log10, x, vfmaq_f64 (hi, r2, y), vmovl_u32 (special)); + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); + return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); } /* Fast implementation of double-precision vector log10 @@ -85,19 +86,24 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) { const struct data *d = ptr_barrier (&data); - uint64x2_t ix = vreinterpretq_u64_f64 (x); - uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), - vget_low_u32 (d->special_bound)); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint64x2_t u = vreinterpretq_u64_f64 (x); + uint64x2_t u_off = vsubq_u64 (u, d->off); /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. 
*/ - uint64x2_t tmp = vsubq_u64 (ix, Off); - int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); - uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); float64x2_t z = vreinterpretq_f64_u64 (iz); - struct entry e = lookup (tmp); + struct entry e = lookup (u_off); + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), + vget_low_u32 (d->special_bound)); /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); @@ -105,17 +111,22 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x) /* hi = r / log(10) + log10(c) + k*log10(2). Constants in v_log10_data.c are computed (in extended precision) as - e.log10c := e.logc * ivln10. */ - float64x2_t w = vfmaq_f64 (e.log10c, r, d->invln10); + e.log10c := e.logc * invln10. */ + float64x2_t cte = vld1q_f64 (&d->invln10); + float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0); /* y = log10(1+r) + n * log10(2). */ - float64x2_t hi = vfmaq_f64 (w, kd, d->log10_2); + hi = vfmaq_laneq_f64 (hi, kd, cte, 1); /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ float64x2_t r2 = vmulq_f64 (r, r); - float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); + y = vfmaq_f64 (y, d->c4, r2); + y = vfmaq_f64 (p, y, r2); if (__glibc_unlikely (v_any_u32h (special))) - return special_case (x, y, hi, r2, special); - return vfmaq_f64 (hi, r2, y); + return special_case (hi, u_off, y, r2, special, d); + return vfmaq_f64 (hi, y, r2); } diff --git a/sysdeps/aarch64/fpu/log10_sve.c b/sysdeps/aarch64/fpu/log10_sve.c index ab73621..35283a5 100644 --- a/sysdeps/aarch64/fpu/log10_sve.c +++ b/sysdeps/aarch64/fpu/log10_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) log10 function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -23,28 +23,49 @@ #define Min 0x0010000000000000 #define Max 0x7ff0000000000000 #define Thres 0x7fe0000000000000 /* Max - Min. 
*/ -#define Off 0x3fe6900900000000 #define N (1 << V_LOG10_TABLE_BITS) +static const struct data +{ + double c0, c2; + double c1, c3; + double invln10, log10_2; + double c4; + uint64_t off; +} data = { + .c0 = -0x1.bcb7b1526e506p-3, + .c1 = 0x1.287a7636be1d1p-3, + .c2 = -0x1.bcb7b158af938p-4, + .c3 = 0x1.63c78734e6d07p-4, + .c4 = -0x1.287461742fee4p-4, + .invln10 = 0x1.bcb7b1526e50ep-2, + .log10_2 = 0x1.34413509f79ffp-2, + .off = 0x3fe6900900000000, +}; + static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, + svbool_t special, const struct data *d) { - return sv_call_f64 (log10, x, y, special); + svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); + return sv_call_f64 (log10, x, svmla_x (svptrue_b64 (), hi, r2, y), special); } -/* SVE log10 algorithm. +/* Double-precision SVE log10 routine. Maximum measured error is 2.46 ulps. SV_NAME_D1 (log10)(0x1.131956cd4b627p+0) got 0x1.fffbdf6eaa669p-6 want 0x1.fffbdf6eaa667p-6. */ svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + svuint64_t ix = svreinterpret_u64 (x); svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - svuint64_t tmp = svsub_x (pg, ix, Off); + svuint64_t tmp = svsub_x (pg, ix, d->off); svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG10_TABLE_BITS); i = svand_x (pg, i, (N - 1) << 1); svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); @@ -62,15 +83,19 @@ svfloat64_t SV_NAME_D1 (log10) (svfloat64_t x, const svbool_t pg) svfloat64_t r = svmad_x (pg, invc, z, -1.0); /* hi = log(c) + k*log(2). 
*/ - svfloat64_t w = svmla_x (pg, logc, r, __v_log10_data.invln10); - svfloat64_t hi = svmla_x (pg, w, k, __v_log10_data.log10_2); + svfloat64_t invln10_log10_2 = svld1rq_f64 (svptrue_b64 (), &d->invln10); + svfloat64_t w = svmla_lane_f64 (logc, r, invln10_log10_2, 0); + svfloat64_t hi = svmla_lane_f64 (w, k, invln10_log10_2, 1); /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log10_data.poly); + svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); + svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); + y = svmla_x (pg, y, r2, d->c4); + y = svmla_x (pg, p, r2, y); if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), - special); + return special_case (hi, tmp, y, r2, special, d); return svmla_x (pg, hi, r2, y); } diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c index 9347422..092abea 100644 --- a/sysdeps/aarch64/fpu/log10f_advsimd.c +++ b/sysdeps/aarch64/fpu/log10f_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (AdvSIMD) log10 function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,35 +18,43 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f32.h" static const struct data { - uint32x4_t min_norm; + float32x4_t c0, c2, c4, c6, inv_ln10, ln2; + uint32x4_t off, offset_lower_bound; uint16x8_t special_bound; - float32x4_t poly[8]; - float32x4_t inv_ln10, ln2; - uint32x4_t off, mantissa_mask; + uint32x4_t mantissa_mask; + float c1, c3, c5, c7; } data = { /* Use order 9 for log10(1+x), i.e. 
order 8 for log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ - .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f), - V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f), - V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) }, + .c0 = V4 (-0x1.bcb79cp-3f), + .c1 = 0x1.2879c8p-3f, + .c2 = V4 (-0x1.bcd472p-4f), + .c3 = 0x1.6408f8p-4f, + .c4 = V4 (-0x1.246f8p-4f), + .c5 = 0x1.f0e514p-5f, + .c6 = V4 (-0x1.0fc92cp-4f), + .c7 = 0x1.f5f76ap-5f, .ln2 = V4 (0x1.62e43p-1f), .inv_ln10 = V4 (0x1.bcb7b2p-2f), - .min_norm = V4 (0x00800000), - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x00800000 - offset (which wraps around). */ + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ .off = V4 (0x3f2aaaab), /* 0.666667. */ .mantissa_mask = V4 (0x007fffff), }; static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2, - uint16x4_t cmp) +special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2, + uint16x4_t cmp, const struct data *d) { /* Fall back to scalar code. 
*/ - return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); + return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), + vfmaq_f32 (y, p, r2), vmovl_u16 (cmp)); } /* Fast implementation of AdvSIMD log10f, @@ -58,26 +66,41 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2, float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - uint32x4_t u = vreinterpretq_u32_f32 (x); - uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), - vget_low_u16 (d->special_bound)); + float32x4_t c1357 = vld1q_f32 (&d->c1); + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint32x4_t u_off = vreinterpretq_u32_f32 (x); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = vsubq_u32 (u, d->off); + u_off = vsubq_u32 (u_off, d->off); float32x4_t n = vcvtq_f32_s32 ( - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ - u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + + uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); /* y = log10(1+r) + n * log10(2). */ float32x4_t r2 = vmulq_f32 (r, r); - float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly); + + float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); + float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); + float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); + float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); + + float32x4_t p47 = vfmaq_f32 (c45, r2, c67); + float32x4_t p27 = vfmaq_f32 (c23, r2, p47); + float32x4_t poly = vfmaq_f32 (c01, r2, p27); + /* y = Log10(2) * n + poly * InvLn(10). 
*/ float32x4_t y = vfmaq_f32 (r, d->ln2, n); y = vmulq_f32 (y, d->inv_ln10); if (__glibc_unlikely (v_any_u16h (special))) - return special_case (x, y, poly, r2, special); + return special_case (y, u_off, poly, r2, special, d); return vfmaq_f32 (y, poly, r2); } libmvec_hidden_def (V_NAME_F1 (log10)) diff --git a/sysdeps/aarch64/fpu/log10f_sve.c b/sysdeps/aarch64/fpu/log10f_sve.c index bdbb49c..4a8ddc5 100644 --- a/sysdeps/aarch64/fpu/log10f_sve.c +++ b/sysdeps/aarch64/fpu/log10f_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) log10 function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -24,6 +24,7 @@ static const struct data float poly_0246[4]; float poly_1357[4]; float ln2, inv_ln10; + uint32_t off, lower; } data = { .poly_1357 = { /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs @@ -35,18 +36,23 @@ static const struct data -0x1.0fc92cp-4f }, .ln2 = 0x1.62e43p-1f, .inv_ln10 = 0x1.bcb7b2p-2f, + .off = 0x3f2aaaab, + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x00800000 - offset (which wraps around). */ + .lower = 0x00800000 - 0x3f2aaaab }; -#define Min 0x00800000 -#define Max 0x7f800000 -#define Thres 0x7f000000 /* Max - Min. */ -#define Offset 0x3f2aaaab /* 0.666667. */ +#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. 
*/ #define MantissaMask 0x007fffff static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y, + svbool_t cmp) { - return sv_call_f32 (log10f, x, y, special); + return sv_call_f32 ( + log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)), + svmla_x (svptrue_b32 (), p, r2, y), cmp); } /* Optimised implementation of SVE log10f using the same algorithm and @@ -57,23 +63,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special) svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - svuint32_t ix = svreinterpret_u32 (x); - svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres); + + svuint32_t u_off = svreinterpret_u32 (x); + + u_off = svsub_x (pg, u_off, d->off); + svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - ix = svsub_x (pg, ix, Offset); svfloat32_t n = svcvt_f32_x ( - pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */ - ix = svand_x (pg, ix, MantissaMask); - ix = svadd_x (pg, ix, Offset); + pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend. */ + svuint32_t ix = svand_x (pg, u_off, MantissaMask); + ix = svadd_x (pg, ix, d->off); svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f); /* y = log10(1+r) + n*log10(2) log10(1+r) ~ r * InvLn(10) + P(r) where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). 
*/ - svfloat32_t r2 = svmul_x (pg, r, r); - svfloat32_t r4 = svmul_x (pg, r2, r2); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); + svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2); svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0); svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1); @@ -88,7 +96,6 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg) hi = svmul_x (pg, hi, d->inv_ln10); if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y), - special); - return svmla_x (pg, hi, r2, y); + return special_case (u_off, hi, r2, y, special); + return svmla_x (svptrue_b32 (), hi, r2, y); } diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c index ffc418f..44474be 100644 --- a/sysdeps/aarch64/fpu/log1p_advsimd.c +++ b/sysdeps/aarch64/fpu/log1p_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision AdvSIMD log1p - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,43 +17,26 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include "v_math.h" -#include "poly_advsimd_f64.h" +#define WANT_V_LOG1P_K0_SHORTCUT 0 +#include "v_log1p_inline.h" const static struct data { - float64x2_t poly[19], ln2[2]; - uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask, inf, minus_one; - int64x2_t one_top; -} data = { - /* Generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. 
*/ - .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), - V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), - V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), - V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), - V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), - V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), - V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), - V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), - V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), - V2 (-0x1.cfa7385bdb37ep-6) }, - .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, - /* top32(asuint64(sqrt(2)/2)) << 32. */ - .hf_rt2_top = V2 (0x3fe6a09e00000000), - /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */ - .one_m_hf_rt2_top = V2 (0x00095f6200000000), - .umask = V2 (0x000fffff00000000), - .one_top = V2 (0x3ff), - .inf = V2 (0x7ff0000000000000), - .minus_one = V2 (0xbff0000000000000) -}; + struct v_log1p_data d; + uint64x2_t inf, minus_one; +} data = { .d = V_LOG1P_CONSTANTS_TABLE, + .inf = V2 (0x7ff0000000000000), + .minus_one = V2 (0xbff0000000000000) }; #define BottomMask v_u64 (0xffffffff) -static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, uint64x2_t cmp, const struct data *d) { - return v_call_f64 (log1p, x, y, special); + /* Side-step special lanes so fenv exceptions are not triggered + inadvertently. */ + float64x2_t x_nospecial = v_zerofy_f64 (x, cmp); + return v_call_f64 (log1p, x, log1p_inline (x_nospecial, &d->d), cmp); } /* Vector log1p approximation using polynomial on reduced interval. 
Routine is @@ -66,64 +49,14 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x) const struct data *d = ptr_barrier (&data); uint64x2_t ix = vreinterpretq_u64_f64 (x); uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x)); - uint64x2_t special = vcgeq_u64 (ia, d->inf); -#if WANT_SIMD_EXCEPT - special = vorrq_u64 (special, - vcgeq_u64 (ix, vreinterpretq_u64_f64 (v_f64 (-1)))); - if (__glibc_unlikely (v_any_u64 (special))) - x = v_zerofy_f64 (x, special); -#else - special = vorrq_u64 (special, vcleq_f64 (x, v_f64 (-1))); -#endif + uint64x2_t special_cases + = vorrq_u64 (vcgeq_u64 (ia, d->inf), vcgeq_u64 (ix, d->minus_one)); - /* With x + 1 = t * 2^k (where t = f + 1 and k is chosen such that f - is in [sqrt(2)/2, sqrt(2)]): - log1p(x) = k*log(2) + log1p(f). + if (__glibc_unlikely (v_any_u64 (special_cases))) + return special_case (x, special_cases, d); - f may not be representable exactly, so we need a correction term: - let m = round(1 + x), c = (1 + x) - m. - c << m: at very small x, log1p(x) ~ x, hence: - log(1+x) - log(m) ~ c/m. - - We therefore calculate log1p(x) by k*log2 + log1p(f) + c/m. */ - - /* Obtain correctly scaled k by manipulation in the exponent. - The scalar algorithm casts down to 32-bit at this point to calculate k and - u_red. We stay in double-width to obtain f and k, using the same constants - as the scalar algorithm but shifted left by 32. */ - float64x2_t m = vaddq_f64 (x, v_f64 (1)); - uint64x2_t mi = vreinterpretq_u64_f64 (m); - uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); - - int64x2_t ki - = vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top); - float64x2_t k = vcvtq_f64_s64 (ki); - - /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ - uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); - uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); - float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1)); - - /* Correction term c/m. 
*/ - float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m); - - /* Approximate log1p(x) on the reduced input using a polynomial. Because - log1p(0)=0 we choose an approximation of the form: - x + C0*x^2 + C1*x^3 + C2x^4 + ... - Hence approximation has the form f + f^2 * P(f) - where P(x) = C0 + C1*x + C2x^2 + ... - Assembling this all correctly is dealt with at the final step. */ - float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly); - - float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]); - float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]); - float64x2_t y = vaddq_f64 (ylo, yhi); - - if (__glibc_unlikely (v_any_u64 (special))) - return special_case (vreinterpretq_f64_u64 (ix), vfmaq_f64 (y, f2, p), - special); - - return vfmaq_f64 (y, f2, p); + return log1p_inline (x, &d->d); } + +strong_alias (V_NAME_D1 (log1p), V_NAME_D1 (logp1)) diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c index 04f7e57..862c13f 100644 --- a/sysdeps/aarch64/fpu/log1p_sve.c +++ b/sysdeps/aarch64/fpu/log1p_sve.c @@ -1,6 +1,6 @@ /* Double-precision SVE log1p - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -116,3 +116,5 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) return y; } + +strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1)) diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c index dc15334..887c504 100644 --- a/sysdeps/aarch64/fpu/log1pf_advsimd.c +++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision AdvSIMD log1p - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -18,113 +18,81 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f32.h" +#include "v_log1pf_inline.h" + +#if WANT_SIMD_EXCEPT const static struct data { - float32x4_t poly[8], ln2; - uint32x4_t tiny_bound, minus_one, four, thresh; - int32x4_t three_quarters; + uint32x4_t minus_one, thresh; + struct v_log1pf_data d; } data = { - .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients - (1, -0.5) are not stored as they can be generated more - efficiently. */ - V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), - V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), - V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, - .ln2 = V4 (0x1.62e43p-1f), - .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */ - .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */ + .d = V_LOG1PF_CONSTANTS_TABLE, + .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */ .minus_one = V4 (0xbf800000), - .four = V4 (0x40800000), - .three_quarters = V4 (0x3f400000) }; -static inline float32x4_t -eval_poly (float32x4_t m, const float32x4_t *p) -{ - /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */ - float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]); - float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]); - float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]); - float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]); - - float32x4_t m2 = vmulq_f32 (m, m); - float32x4_t p_02 = vfmaq_f32 (m, m2, p_12); - float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56); - float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]); - - float32x4_t m4 = vmulq_f32 (m2, m2); - float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36); - return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79)); -} +/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. 
*/ +# define TinyBound v_u32 (0x34000000) static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +special_case (float32x4_t x, uint32x4_t cmp, const struct data *d) { - return v_call_f32 (log1pf, x, y, special); + /* Side-step special lanes so fenv exceptions are not triggered + inadvertently. */ + float32x4_t x_nospecial = v_zerofy_f32 (x, cmp); + return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp); } -/* Vector log1pf approximation using polynomial on reduced interval. Accuracy - is roughly 2.02 ULP: - log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */ +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case + error is 1.69 ULP: + _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3 + want 0x1.cfcbdcp-3. */ VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - uint32x4_t ix = vreinterpretq_u32_f32 (x); uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); + uint32x4_t special_cases - = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh), + = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh), vcgeq_u32 (ix, d->minus_one)); - float32x4_t special_arg = x; -#if WANT_SIMD_EXCEPT if (__glibc_unlikely (v_any_u32 (special_cases))) - /* Side-step special lanes so fenv exceptions are not triggered - inadvertently. */ - x = v_zerofy_f32 (x, special_cases); -#endif + return special_case (x, special_cases, d); - /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m - is in [-0.25, 0.5]): - log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). - - We approximate log1p(m) with a polynomial, then scale by - k*log(2). Instead of doing this directly, we use an intermediate - scale factor s = 4*k*log(2) to ensure the scale is representable - as a normalised fp32 number. 
*/ + return log1pf_inline (x, &d->d); +} - float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); +#else - /* Choose k to scale x to the range [-1/4, 1/2]. */ - int32x4_t k - = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters), - v_s32 (0xff800000)); - uint32x4_t ku = vreinterpretq_u32_s32 (k); +const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE; - /* Scale x by exponent manipulation. */ - float32x4_t m_scale - = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, uint32x4_t cmp) +{ + return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp); +} - /* Scale up to ensure that the scale factor is representable as normalised - fp32 number, and scale m down accordingly. */ - float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku)); - m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); +/* Vector log1pf approximation using polynomial on reduced interval. Worst-case + error is 1.63 ULP: + _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3 + want 0x1.fdcb16p-3. */ +VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x) +{ + uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)), + vcaleq_f32 (x, v_f32 (0x1p127f))); - /* Evaluate polynomial on the reduced interval. */ - float32x4_t p = eval_poly (m_scale, d->poly); + if (__glibc_unlikely (v_any_u32 (special_cases))) + return special_case (x, special_cases); - /* The scale factor to be applied back at the end - by multiplying float(k) - by 2^-23 we get the unbiased exponent of k. */ - float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23)); + return log1pf_inline (x, ptr_barrier (&data)); +} - /* Apply the scaling back. 
*/ - float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2); +#endif - if (__glibc_unlikely (v_any_u32 (special_cases))) - return special_case (special_arg, y, special_cases); - return y; -} libmvec_hidden_def (V_NAME_F1 (log1p)) HALF_WIDTH_ALIAS_F1 (log1p) +strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1)) +libmvec_hidden_def (V_NAME_F1 (logp1)) +HALF_WIDTH_ALIAS_F1 (logp1) diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c index f645cc9..937115f 100644 --- a/sysdeps/aarch64/fpu/log1pf_sve.c +++ b/sysdeps/aarch64/fpu/log1pf_sve.c @@ -1,6 +1,6 @@ /* Single-precision SVE log1p - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,30 +18,13 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f32.h" - -static const struct data -{ - float poly[8]; - float ln2, exp_bias; - uint32_t four, three_quarters; -} data = {.poly = {/* Do not store first term of polynomial, which is -0.5, as - this can be fmov-ed directly instead of including it in - the main load-and-mla polynomial schedule. */ - 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, - -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, - 0x1.abcb6p-4f, -0x1.6f0d5ep-5f}, - .ln2 = 0x1.62e43p-1f, - .exp_bias = 0x1p-23f, - .four = 0x40800000, - .three_quarters = 0x3f400000}; - -#define SignExponentMask 0xff800000 +#include "sv_log1pf_inline.h" static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +special_case (svfloat32_t x, svbool_t special) { - return sv_call_f32 (log1pf, x, y, special); + return sv_call_f32 (log1pf, x, sv_log1pf_inline (x, svptrue_b32 ()), + special); } /* Vector log1pf approximation using polynomial on reduced interval. Worst-case @@ -50,51 +33,14 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special) want 0x1.9f323ep-2. 
*/ svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg) { - const struct data *d = ptr_barrier (&data); /* x < -1, Inf/Nan. */ svbool_t special = svcmpeq (pg, svreinterpret_u32 (x), 0x7f800000); special = svorn_z (pg, special, svcmpge (pg, x, -1)); - /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m - is in [-0.25, 0.5]): - log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). - - We approximate log1p(m) with a polynomial, then scale by - k*log(2). Instead of doing this directly, we use an intermediate - scale factor s = 4*k*log(2) to ensure the scale is representable - as a normalised fp32 number. */ - svfloat32_t m = svadd_x (pg, x, 1); - - /* Choose k to scale x to the range [-1/4, 1/2]. */ - svint32_t k - = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters), - sv_s32 (SignExponentMask)); - - /* Scale x by exponent manipulation. */ - svfloat32_t m_scale = svreinterpret_f32 ( - svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k))); - - /* Scale up to ensure that the scale factor is representable as normalised - fp32 number, and scale m down accordingly. */ - svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four)); - m_scale = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1), s, 0.25)); - - /* Evaluate polynomial on reduced interval. */ - svfloat32_t ms2 = svmul_x (pg, m_scale, m_scale), - ms4 = svmul_x (pg, ms2, ms2); - svfloat32_t p = sv_estrin_7_f32_x (pg, m_scale, ms2, ms4, d->poly); - p = svmad_x (pg, m_scale, p, -0.5); - p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p)); - - /* The scale factor to be applied back at the end - by multiplying float(k) - by 2^-23 we get the unbiased exponent of k. */ - svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->exp_bias); - - /* Apply the scaling back. 
*/ - svfloat32_t y = svmla_x (pg, p, scale_back, d->ln2); - if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, y, special); + return special_case (x, special); - return y; + return sv_log1pf_inline (x, pg); } + +strong_alias (SV_NAME_F1 (log1p), SV_NAME_F1 (logp1)) diff --git a/sysdeps/aarch64/fpu/log2_advsimd.c b/sysdeps/aarch64/fpu/log2_advsimd.c index 4057c55..1130e47 100644 --- a/sysdeps/aarch64/fpu/log2_advsimd.c +++ b/sysdeps/aarch64/fpu/log2_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (AdvSIMD) log2 function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,31 +18,33 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" - -#define N (1 << V_LOG2_TABLE_BITS) static const struct data { - uint64x2_t min_norm; + uint64x2_t off, sign_exp_mask, offset_lower_bound; uint32x4_t special_bound; - float64x2_t poly[5]; - float64x2_t invln2; - uint64x2_t sign_exp_mask; + float64x2_t c0, c2; + double c1, c3, invln2, c4; } data = { /* Each coefficient was generated to approximate log(r) for |r| < 0x1.fp-9 and N = 128, then scaled by log2(e) in extended precision and rounded back to double precision. */ - .poly = { V2 (-0x1.71547652b83p-1), V2 (0x1.ec709dc340953p-2), - V2 (-0x1.71547651c8f35p-2), V2 (0x1.2777ebe12dda5p-2), - V2 (-0x1.ec738d616fe26p-3) }, - .invln2 = V2 (0x1.71547652b82fep0), - .min_norm = V2 (0x0010000000000000), /* asuint64(0x1p-1022). */ - .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ + .c0 = V2 (-0x1.71547652b8300p-1), + .c1 = 0x1.ec709dc340953p-2, + .c2 = V2 (-0x1.71547651c8f35p-2), + .c3 = 0x1.2777ebe12dda5p-2, + .c4 = -0x1.ec738d616fe26p-3, + .invln2 = 0x1.71547652b82fep0, + .off = V2 (0x3fe6900900000000), .sign_exp_mask = V2 (0xfff0000000000000), + /* Lower bound is 0x0010000000000000. 
For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound - offset (which wraps around). */ + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */ }; -#define Off v_u64 (0x3fe6900900000000) +#define N (1 << V_LOG2_TABLE_BITS) #define IndexMask (N - 1) struct entry @@ -67,10 +69,11 @@ lookup (uint64x2_t i) } static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2, - uint32x2_t special) +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, + uint32x2_t special, const struct data *d) { - return v_call_f64 (log2, x, vfmaq_f64 (w, r2, y), vmovl_u32 (special)); + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); + return v_call_f64 (log2, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); } /* Double-precision vector log2 routine. Implements the same algorithm as @@ -81,31 +84,41 @@ special_case (float64x2_t x, float64x2_t y, float64x2_t w, float64x2_t r2, float64x2_t VPCS_ATTR V_NAME_D1 (log2) (float64x2_t x) { const struct data *d = ptr_barrier (&data); - uint64x2_t ix = vreinterpretq_u64_f64 (x); - uint32x2_t special = vcge_u32 (vsubhn_u64 (ix, d->min_norm), - vget_low_u32 (d->special_bound)); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint64x2_t u = vreinterpretq_u64_f64 (x); + uint64x2_t u_off = vsubq_u64 (u, d->off); /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. 
*/ - uint64x2_t tmp = vsubq_u64 (ix, Off); - int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); - uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); float64x2_t z = vreinterpretq_f64_u64 (iz); - struct entry e = lookup (tmp); + struct entry e = lookup (u_off); - /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), + vget_low_u32 (d->special_bound)); + /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. */ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); float64x2_t kd = vcvtq_f64_s64 (k); - float64x2_t w = vfmaq_f64 (e.log2c, r, d->invln2); + + float64x2_t invln2_and_c4 = vld1q_f64 (&d->invln2); + float64x2_t hi + = vfmaq_laneq_f64 (vaddq_f64 (e.log2c, kd), r, invln2_and_c4, 0); float64x2_t r2 = vmulq_f64 (r, r); - float64x2_t y = v_pw_horner_4_f64 (r, r2, d->poly); - w = vaddq_f64 (kd, w); + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); + y = vfmaq_laneq_f64 (y, r2, invln2_and_c4, 1); + y = vfmaq_f64 (p, r2, y); if (__glibc_unlikely (v_any_u32h (special))) - return special_case (x, y, w, r2, special); - return vfmaq_f64 (w, r2, y); + return special_case (hi, u_off, y, r2, special, d); + return vfmaq_f64 (hi, y, r2); } diff --git a/sysdeps/aarch64/fpu/log2_sve.c b/sysdeps/aarch64/fpu/log2_sve.c index 743fa2a..e05dc4e 100644 --- a/sysdeps/aarch64/fpu/log2_sve.c +++ b/sysdeps/aarch64/fpu/log2_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) log2 function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -21,15 +21,32 @@ #include "poly_sve_f64.h" #define N (1 << V_LOG2_TABLE_BITS) -#define Off 0x3fe6900900000000 #define Max (0x7ff0000000000000) #define Min (0x0010000000000000) #define Thresh (0x7fe0000000000000) /* Max - Min. */ +static const struct data +{ + double c0, c2; + double c1, c3; + double invln2, c4; + uint64_t off; +} data = { + .c0 = -0x1.71547652b83p-1, + .c1 = 0x1.ec709dc340953p-2, + .c2 = -0x1.71547651c8f35p-2, + .c3 = 0x1.2777ebe12dda5p-2, + .c4 = -0x1.ec738d616fe26p-3, + .invln2 = 0x1.71547652b82fep0, + .off = 0x3fe6900900000000, +}; + static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) +special_case (svfloat64_t w, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, + svbool_t special, const struct data *d) { - return sv_call_f64 (log2, x, y, cmp); + svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); + return sv_call_f64 (log2, x, svmla_x (svptrue_b64 (), w, r2, y), special); } /* Double-precision SVE log2 routine. @@ -40,13 +57,15 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) want 0x1.fffb34198d9ddp-5. */ svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + svuint64_t ix = svreinterpret_u64 (x); svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - svuint64_t tmp = svsub_x (pg, ix, Off); + svuint64_t tmp = svsub_x (pg, ix, d->off); svuint64_t i = svlsr_x (pg, tmp, 51 - V_LOG2_TABLE_BITS); i = svand_x (pg, i, (N - 1) << 1); svfloat64_t k = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); @@ -59,15 +78,19 @@ svfloat64_t SV_NAME_D1 (log2) (svfloat64_t x, const svbool_t pg) /* log2(x) = log1p(z/c-1)/log(2) + log2(c) + k. 
*/ + svfloat64_t invln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->invln2); svfloat64_t r = svmad_x (pg, invc, z, -1.0); - svfloat64_t w = svmla_x (pg, log2c, r, __v_log2_data.invln2); - - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t y = sv_pw_horner_4_f64_x (pg, r, r2, __v_log2_data.poly); + svfloat64_t w = svmla_lane_f64 (log2c, r, invln2_and_c4, 0); w = svadd_x (pg, k, w); + svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); + svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); + y = svmla_lane_f64 (y, r2, invln2_and_c4, 1); + y = svmla_x (pg, p, r2, y); + if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svmla_x (svnot_z (pg, special), w, r2, y), - special); + return special_case (w, tmp, y, r2, special, d); return svmla_x (pg, w, r2, y); } diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c index db21836..28f1857 100644 --- a/sysdeps/aarch64/fpu/log2f_advsimd.c +++ b/sysdeps/aarch64/fpu/log2f_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (AdvSIMD) log2 function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,34 +18,43 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f32.h" static const struct data { - uint32x4_t min_norm; + float32x4_t c0, c2, c4, c6, c8; + uint32x4_t off, offset_lower_bound; uint16x8_t special_bound; - uint32x4_t off, mantissa_mask; - float32x4_t poly[9]; + uint32x4_t mantissa_mask; + float c1, c3, c5, c7; } data = { /* Coefficients generated using Remez algorithm approximate log2(1+r)/r for r in [ -1/3, 1/3 ]. rel error: 0x1.c4c4b0cp-26. */ - .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). 
*/ - V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f), - V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f), - V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) }, - .min_norm = V4 (0x00800000), - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ + .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */ + .c1 = -0x1.715458p-1f, + .c2 = V4 (0x1.ec701cp-2f), + .c3 = -0x1.7171a4p-2f, + .c4 = V4 (0x1.27a0b8p-2f), + .c5 = -0x1.e5143ep-3f, + .c6 = V4 (0x1.9d8ecap-3f), + .c7 = -0x1.c675bp-3f, + .c8 = V4 (0x1.9e495p-3f), + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x00800000 - offset (which wraps around). */ + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ .off = V4 (0x3f2aaaab), /* 0.666667. */ .mantissa_mask = V4 (0x007fffff), }; static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, - uint16x4_t cmp) +special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r, + uint16x4_t cmp, const struct data *d) { /* Fall back to scalar code. 
*/ - return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); + return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), + vfmaq_f32 (n, p, r), vmovl_u16 (cmp)); } /* Fast implementation for single precision AdvSIMD log2, @@ -56,24 +65,40 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r, float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - uint32x4_t u = vreinterpretq_u32_f32 (x); - uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm), - vget_low_u16 (d->special_bound)); + + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint32x4_t u_off = vreinterpretq_u32_f32 (x); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = vsubq_u32 (u, d->off); + u_off = vsubq_u32 (u_off, d->off); float32x4_t n = vcvtq_f32_s32 ( - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ - u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off); + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + + uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); /* y = log2(1+r) + n. 
*/ float32x4_t r2 = vmulq_f32 (r, r); - float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly); + + float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); + float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); + float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); + float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); + float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8); + float32x4_t p48 = vfmaq_f32 (c45, r2, p68); + float32x4_t p28 = vfmaq_f32 (c23, r2, p48); + float32x4_t p = vfmaq_f32 (c01, r2, p28); if (__glibc_unlikely (v_any_u16h (special))) - return special_case (x, n, p, r, special); + return special_case (n, u_off, p, r, special, d); return vfmaq_f32 (n, p, r); } + libmvec_hidden_def (V_NAME_F1 (log2)) HALF_WIDTH_ALIAS_F1 (log2) diff --git a/sysdeps/aarch64/fpu/log2f_sve.c b/sysdeps/aarch64/fpu/log2f_sve.c index 5031c42..515eeff 100644 --- a/sysdeps/aarch64/fpu/log2f_sve.c +++ b/sysdeps/aarch64/fpu/log2f_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) log2 function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -23,6 +23,7 @@ static const struct data { float poly_02468[5]; float poly_1357[4]; + uint32_t off, lower; } data = { .poly_1357 = { /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs @@ -32,18 +33,23 @@ static const struct data }, .poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f, 0x1.9d8ecap-3f, 0x1.9e495p-3f }, + .off = 0x3f2aaaab, + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x00800000 - offset (which wraps around). */ + .lower = 0x00800000 - 0x3f2aaaab }; -#define Min (0x00800000) -#define Max (0x7f800000) -#define Thres (0x7f000000) /* Max - Min. 
*/ +#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */ #define MantissaMask (0x007fffff) -#define Off (0x3f2aaaab) /* 0.666667. */ static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) +special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y, + svbool_t cmp) { - return sv_call_f32 (log2f, x, y, cmp); + return sv_call_f32 ( + log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)), + svmla_x (svptrue_b32 (), p, r2, y), cmp); } /* Optimised implementation of SVE log2f, using the same algorithm @@ -55,19 +61,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - svuint32_t u = svreinterpret_u32 (x); - svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres); + svuint32_t u_off = svreinterpret_u32 (x); + + u_off = svsub_x (pg, u_off, d->off); + svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = svsub_x (pg, u, Off); svfloat32_t n = svcvt_f32_x ( - pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */ - u = svand_x (pg, u, MantissaMask); - u = svadd_x (pg, u, Off); + pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */ + svuint32_t u = svand_x (pg, u_off, MantissaMask); + u = svadd_x (pg, u, d->off); svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); /* y = log2(1+r) + n. */ - svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); /* Evaluate polynomial using pairwise Horner scheme. 
*/ svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]); @@ -81,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg) y = svmla_x (pg, q_01, r2, y); if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special); - return svmla_x (pg, n, r, y); + return special_case (u_off, n, r, y, special); + return svmla_x (svptrue_b32 (), n, r, y); } diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c index 015a6da..92c2c1d 100644 --- a/sysdeps/aarch64/fpu/log_advsimd.c +++ b/sysdeps/aarch64/fpu/log_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) log function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,27 +21,29 @@ static const struct data { - uint64x2_t min_norm; + uint64x2_t off, sign_exp_mask, offset_lower_bound; uint32x4_t special_bound; - float64x2_t poly[5]; - float64x2_t ln2; - uint64x2_t sign_exp_mask; + float64x2_t c0, c2; + double c1, c3, ln2, c4; } data = { - /* Worst-case error: 1.17 + 0.5 ulp. - Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */ - .poly = { V2 (-0x1.ffffffffffff7p-2), V2 (0x1.55555555170d4p-2), - V2 (-0x1.0000000399c27p-2), V2 (0x1.999b2e90e94cap-3), - V2 (-0x1.554e550bd501ep-3) }, - .ln2 = V2 (0x1.62e42fefa39efp-1), - .min_norm = V2 (0x0010000000000000), - .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */ - .sign_exp_mask = V2 (0xfff0000000000000) + /* Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. 
*/ + .c0 = V2 (-0x1.ffffffffffff7p-2), + .c1 = 0x1.55555555170d4p-2, + .c2 = V2 (-0x1.0000000399c27p-2), + .c3 = 0x1.999b2e90e94cap-3, + .c4 = -0x1.554e550bd501ep-3, + .ln2 = 0x1.62e42fefa39efp-1, + .sign_exp_mask = V2 (0xfff0000000000000), + .off = V2 (0x3fe6900900000000), + /* Lower bound is 0x0010000000000000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound - offset (which wraps around). */ + .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000), + .special_bound = V4 (0x7fe00000), /* asuint64(inf) - asuint64(0x1p-1022). */ }; -#define A(i) d->poly[i] #define N (1 << V_LOG_TABLE_BITS) #define IndexMask (N - 1) -#define Off v_u64 (0x3fe6900900000000) struct entry { @@ -64,48 +66,56 @@ lookup (uint64x2_t i) } static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2, - uint32x2_t cmp) +special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2, + uint32x2_t special, const struct data *d) { - return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp)); + float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off)); + return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special)); } +/* Double-precision vector log routine. + The maximum observed error is 2.17 ULP: + _ZGVnN2v_log(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 + want 0x1.ffffff1cca045p-2. */ float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x) { const struct data *d = ptr_barrier (&data); - float64x2_t z, r, r2, p, y, kd, hi; - uint64x2_t ix, iz, tmp; - uint32x2_t cmp; - int64x2_t k; - struct entry e; - ix = vreinterpretq_u64_f64 (x); - cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm), - vget_low_u32 (d->special_bound)); + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. 
*/ + uint64x2_t u = vreinterpretq_u64_f64 (x); + uint64x2_t u_off = vsubq_u64 (u, d->off); /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - tmp = vsubq_u64 (ix, Off); - k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ - iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask)); - z = vreinterpretq_f64_u64 (iz); - e = lookup (tmp); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52); + uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask)); + float64x2_t z = vreinterpretq_f64_u64 (iz); + + struct entry e = lookup (u_off); + + uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound), + vget_low_u32 (d->special_bound)); /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); - kd = vcvtq_f64_s64 (k); + float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc); + float64x2_t kd = vcvtq_f64_s64 (k); /* hi = r + log(c) + k*Ln2. */ - hi = vfmaq_f64 (vaddq_f64 (e.logc, r), kd, d->ln2); + float64x2_t ln2_and_c4 = vld1q_f64 (&d->ln2); + float64x2_t hi = vfmaq_laneq_f64 (vaddq_f64 (e.logc, r), kd, ln2_and_c4, 0); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. 
*/ - r2 = vmulq_f64 (r, r); - y = vfmaq_f64 (A (2), A (3), r); - p = vfmaq_f64 (A (0), A (1), r); - y = vfmaq_f64 (y, A (4), r2); - y = vfmaq_f64 (p, y, r2); - - if (__glibc_unlikely (v_any_u32h (cmp))) - return special_case (x, y, hi, r2, cmp); + float64x2_t odd_coeffs = vld1q_f64 (&d->c1); + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1); + float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0); + y = vfmaq_laneq_f64 (y, r2, ln2_and_c4, 1); + y = vfmaq_f64 (p, r2, y); + + if (__glibc_unlikely (v_any_u32h (special))) + return special_case (hi, u_off, y, r2, special, d); return vfmaq_f64 (hi, y, r2); } diff --git a/sysdeps/aarch64/fpu/log_sve.c b/sysdeps/aarch64/fpu/log_sve.c index 9b689f2..dcf84e8 100644 --- a/sysdeps/aarch64/fpu/log_sve.c +++ b/sysdeps/aarch64/fpu/log_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) log function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,39 +19,54 @@ #include "sv_math.h" -#define P(i) sv_f64 (__v_log_data.poly[i]) #define N (1 << V_LOG_TABLE_BITS) -#define Off (0x3fe6900900000000) -#define MaxTop (0x7ff) -#define MinTop (0x001) -#define ThreshTop (0x7fe) /* MaxTop - MinTop. */ +#define Max (0x7ff0000000000000) +#define Min (0x0010000000000000) +#define Thresh (0x7fe0000000000000) /* Max - Min. 
*/ + +static const struct data +{ + double c0, c2; + double c1, c3; + double ln2, c4; + uint64_t off; +} data = { + .c0 = -0x1.ffffffffffff7p-2, + .c1 = 0x1.55555555170d4p-2, + .c2 = -0x1.0000000399c27p-2, + .c3 = 0x1.999b2e90e94cap-3, + .c4 = -0x1.554e550bd501ep-3, + .ln2 = 0x1.62e42fefa39efp-1, + .off = 0x3fe6900900000000, +}; static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) +special_case (svfloat64_t hi, svuint64_t tmp, svfloat64_t y, svfloat64_t r2, + svbool_t special, const struct data *d) { - return sv_call_f64 (log, x, y, cmp); + svfloat64_t x = svreinterpret_f64 (svadd_x (svptrue_b64 (), tmp, d->off)); + return sv_call_f64 (log, x, svmla_x (svptrue_b64 (), hi, r2, y), special); } -/* SVE port of AdvSIMD log algorithm. - Maximum measured error is 2.17 ulp: - SV_NAME_D1 (log)(0x1.a6129884398a3p+0) got 0x1.ffffff1cca043p-2 - want 0x1.ffffff1cca045p-2. */ +/* Double-precision SVE log routine. + Maximum measured error is 2.64 ulp: + SV_NAME_D1 (log)(0x1.95e54bc91a5e2p+184) got 0x1.fffffffe88cacp+6 + want 0x1.fffffffe88cafp+6. */ svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + svuint64_t ix = svreinterpret_u64 (x); - svuint64_t top = svlsr_x (pg, ix, 52); - svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop)); + svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thresh); /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - svuint64_t tmp = svsub_x (pg, ix, Off); + svuint64_t tmp = svsub_x (pg, ix, d->off); /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N. The actual value of i is double this due to table layout. */ svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1); - svint64_t k - = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. 
*/ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52)); svfloat64_t z = svreinterpret_f64 (iz); /* Lookup in 2 global lists (length N). */ @@ -59,18 +74,22 @@ svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg) svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i); /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ - svfloat64_t r = svmad_x (pg, invc, z, -1); - svfloat64_t kd = svcvt_f64_x (pg, k); + svfloat64_t kd = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (tmp), 52)); /* hi = r + log(c) + k*Ln2. */ - svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2); + svfloat64_t ln2_and_c4 = svld1rq_f64 (svptrue_b64 (), &d->ln2); + svfloat64_t r = svmad_x (pg, invc, z, -1); + svfloat64_t hi = svmla_lane_f64 (logc, kd, ln2_and_c4, 0); + hi = svadd_x (pg, r, hi); + /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t y = svmla_x (pg, P (2), r, P (3)); - svfloat64_t p = svmla_x (pg, P (0), r, P (1)); - y = svmla_x (pg, y, r2, P (4)); + svfloat64_t odd_coeffs = svld1rq_f64 (svptrue_b64 (), &d->c1); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t y = svmla_lane_f64 (sv_f64 (d->c2), r, odd_coeffs, 1); + svfloat64_t p = svmla_lane_f64 (sv_f64 (d->c0), r, odd_coeffs, 0); + y = svmla_lane_f64 (y, r2, ln2_and_c4, 1); y = svmla_x (pg, p, r2, y); - if (__glibc_unlikely (svptest_any (pg, cmp))) - return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp); + if (__glibc_unlikely (svptest_any (pg, special))) + return special_case (hi, tmp, y, r2, special, d); return svmla_x (pg, hi, r2, y); } diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c index 3c0d0fc..8a0c9a1 100644 --- a/sysdeps/aarch64/fpu/logf_advsimd.c +++ b/sysdeps/aarch64/fpu/logf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) log function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. 
+ Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,66 +21,71 @@ static const struct data { - uint32x4_t min_norm; + float32x4_t c2, c4, c6, ln2; + uint32x4_t off, offset_lower_bound, mantissa_mask; uint16x8_t special_bound; - float32x4_t poly[7]; - float32x4_t ln2, tiny_bound; - uint32x4_t off, mantissa_mask; + float c1, c3, c5, c0; } data = { /* 3.34 ulp error. */ - .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), - V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), - V4 (-0x1.ffffc8p-2f) }, + .c0 = -0x1.3e737cp-3f, + .c1 = 0x1.5a9aa2p-3f, + .c2 = V4 (-0x1.4f9934p-3f), + .c3 = 0x1.961348p-3f, + .c4 = V4 (-0x1.00187cp-2f), + .c5 = 0x1.555d7cp-2f, + .c6 = V4 (-0x1.ffffc8p-2f), .ln2 = V4 (0x1.62e43p-1f), - .tiny_bound = V4 (0x1p-126), - .min_norm = V4 (0x00800000), - .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */ + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x00800000 - offset (which wraps around). */ + .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab), + .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */ .off = V4 (0x3f2aaaab), /* 0.666667. */ .mantissa_mask = V4 (0x007fffff) }; -#define P(i) d->poly[7 - i] - static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p, - uint16x4_t cmp) +special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, + uint16x4_t cmp, const struct data *d) { /* Fall back to scalar code. 
*/ - return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); + return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)), + vfmaq_f32 (p, y, r2), vmovl_u16 (cmp)); } float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t n, p, q, r, r2, y; - uint32x4_t u; - uint16x4_t cmp; + float32x4_t c1350 = vld1q_f32 (&d->c1); - u = vreinterpretq_u32_f32 (x); - cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm), - vget_low_u16 (d->special_bound)); + /* To avoid having to mov x out of the way, keep u after offset has been + applied, and recover x by adding the offset back in the special-case + handler. */ + uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = vsubq_u32 (u, d->off); - n = vcvtq_f32_s32 ( - vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */ - u = vandq_u32 (u, d->mantissa_mask); - u = vaddq_u32 (u, d->off); - r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + float32x4_t n = vcvtq_f32_s32 ( + vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ + uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); + + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); /* y = log(1+r) + n*ln2. */ - r2 = vmulq_f32 (r, r); + float32x4_t r2 = vmulq_f32 (r, r); /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). 
*/ - p = vfmaq_f32 (P (5), P (6), r); - q = vfmaq_f32 (P (3), P (4), r); - y = vfmaq_f32 (P (1), P (2), r); - p = vfmaq_f32 (p, P (7), r2); + float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0); + float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1); + float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2); + p = vfmaq_laneq_f32 (p, r2, c1350, 3); + q = vfmaq_f32 (q, p, r2); y = vfmaq_f32 (y, q, r2); p = vfmaq_f32 (r, d->ln2, n); if (__glibc_unlikely (v_any_u16h (cmp))) - return special_case (x, y, r2, p, cmp); + return special_case (p, u_off, y, r2, cmp, d); return vfmaq_f32 (p, y, r2); } libmvec_hidden_def (V_NAME_F1 (log)) diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c index d64e810..2e81a93 100644 --- a/sysdeps/aarch64/fpu/logf_sve.c +++ b/sysdeps/aarch64/fpu/logf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) log function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -24,6 +24,7 @@ static const struct data float poly_0135[4]; float poly_246[3]; float ln2; + uint32_t off, lower; } data = { .poly_0135 = { /* Coefficients copied from the AdvSIMD routine in math/, then rearranged so @@ -32,19 +33,24 @@ static const struct data -0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f }, .poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f }, - .ln2 = 0x1.62e43p-1f + .ln2 = 0x1.62e43p-1f, + .off = 0x3f2aaaab, + /* Lower bound is the smallest positive normal float 0x00800000. For + optimised register use subnormals are detected after offset has been + subtracted, so lower bound is 0x00800000 - offset (which wraps around). */ + .lower = 0x00800000 - 0x3f2aaaab }; -#define Min (0x00800000) -#define Max (0x7f800000) -#define Thresh (0x7f000000) /* Max - Min. */ +#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000.
*/ #define Mask (0x007fffff) -#define Off (0x3f2aaaab) /* 0.666667. */ static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp) +special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y, + svbool_t cmp) { - return sv_call_f32 (logf, x, y, cmp); + return sv_call_f32 ( + logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)), + svmla_x (svptrue_b32 (), p, r2, y), cmp); } /* Optimised implementation of SVE logf, using the same algorithm and @@ -55,19 +61,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - svuint32_t u = svreinterpret_u32 (x); - svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh); + svuint32_t u_off = svreinterpret_u32 (x); + + u_off = svsub_x (pg, u_off, d->off); + svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u = svsub_x (pg, u, Off); svfloat32_t n = svcvt_f32_x ( - pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */ - u = svand_x (pg, u, Mask); - u = svadd_x (pg, u, Off); + pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */ + + svuint32_t u = svand_x (pg, u_off, Mask); + u = svadd_x (pg, u, d->off); svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f); /* y = log(1+r) + n*ln2. */ - svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); /* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). 
*/ svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]); svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1); @@ -80,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg) p = svmla_x (pg, r, n, d->ln2); if (__glibc_unlikely (svptest_any (pg, cmp))) - return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp); + return special_case (u_off, p, r2, y, cmp); return svmla_x (pg, p, r2, y); } diff --git a/sysdeps/aarch64/fpu/math-barriers.h b/sysdeps/aarch64/fpu/math-barriers.h index 695251e..bf23350 100644 --- a/sysdeps/aarch64/fpu/math-barriers.h +++ b/sysdeps/aarch64/fpu/math-barriers.h @@ -1,5 +1,5 @@ /* Control when floating-point expressions are evaluated. AArch64 version. - Copyright (C) 2014-2024 Free Software Foundation, Inc. + Copyright (C) 2014-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/math-tests-arch.h b/sysdeps/aarch64/fpu/math-tests-arch.h index 235deec..1062f84 100644 --- a/sysdeps/aarch64/fpu/math-tests-arch.h +++ b/sysdeps/aarch64/fpu/math-tests-arch.h @@ -1,6 +1,6 @@ /* Runtime architecture check for math tests. AArch64 version. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/math_private.h b/sysdeps/aarch64/fpu/math_private.h index bbcb2bd..1030d97 100644 --- a/sysdeps/aarch64/fpu/math_private.h +++ b/sysdeps/aarch64/fpu/math_private.h @@ -1,5 +1,5 @@ /* Configure optimized libm functions. AArch64 version. - Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright (C) 2017-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/poly_advsimd_f32.h b/sysdeps/aarch64/fpu/poly_advsimd_f32.h index 2d11d0c..284ab07 100644 --- a/sysdeps/aarch64/fpu/poly_advsimd_f32.h +++ b/sysdeps/aarch64/fpu/poly_advsimd_f32.h @@ -1,7 +1,7 @@ /* Helpers for evaluating polynomials on single-precision AdvSIMD input, using various schemes. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/poly_advsimd_f64.h b/sysdeps/aarch64/fpu/poly_advsimd_f64.h index 1ea5f85..179c7fd 100644 --- a/sysdeps/aarch64/fpu/poly_advsimd_f64.h +++ b/sysdeps/aarch64/fpu/poly_advsimd_f64.h @@ -1,7 +1,7 @@ /* Helpers for evaluating polynomials on double-precision AdvSIMD input, using various schemes. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/poly_generic.h b/sysdeps/aarch64/fpu/poly_generic.h index 595cda9..562e5bd 100644 --- a/sysdeps/aarch64/fpu/poly_generic.h +++ b/sysdeps/aarch64/fpu/poly_generic.h @@ -1,6 +1,6 @@ /* Generic helpers for evaluating polynomials with various schemes. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/poly_sve_f32.h b/sysdeps/aarch64/fpu/poly_sve_f32.h index ef6db29..f0766e8 100644 --- a/sysdeps/aarch64/fpu/poly_sve_f32.h +++ b/sysdeps/aarch64/fpu/poly_sve_f32.h @@ -1,7 +1,7 @@ /* Helpers for evaluating polynomials on single-precision SVE input, using various schemes. 
- Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/poly_sve_f64.h b/sysdeps/aarch64/fpu/poly_sve_f64.h index f7c8468..b36c164 100644 --- a/sysdeps/aarch64/fpu/poly_sve_f64.h +++ b/sysdeps/aarch64/fpu/poly_sve_f64.h @@ -1,7 +1,7 @@ /* Helpers for evaluating polynomials on double-precision SVE input, using various schemes. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/poly_sve_generic.h b/sysdeps/aarch64/fpu/poly_sve_generic.h index 227760c..6e8efc5 100644 --- a/sysdeps/aarch64/fpu/poly_sve_generic.h +++ b/sysdeps/aarch64/fpu/poly_sve_generic.h @@ -1,7 +1,7 @@ /* Helpers for evaluating polynomials with various schemes - specific to SVE but precision-agnostic. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/pow_advsimd.c b/sysdeps/aarch64/fpu/pow_advsimd.c index 3c91e3e..e5bb4f0 100644 --- a/sysdeps/aarch64/fpu/pow_advsimd.c +++ b/sysdeps/aarch64/fpu/pow_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (AdvSIMD) pow function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,9 +22,6 @@ /* Defines parameters of the approximation and scalar fallback. 
*/ #include "finite_pow.h" -#define VecSmallExp v_u64 (SmallExp) -#define VecThresExp v_u64 (ThresExp) - #define VecSmallPowX v_u64 (SmallPowX) #define VecThresPowX v_u64 (ThresPowX) #define VecSmallPowY v_u64 (SmallPowY) @@ -32,36 +29,48 @@ static const struct data { - float64x2_t log_poly[6]; - float64x2_t exp_poly[3]; - float64x2_t ln2_hi, ln2_lo; - float64x2_t shift, inv_ln2_n, ln2_hi_n, ln2_lo_n, small_powx; uint64x2_t inf; + float64x2_t small_powx; + uint64x2_t offset, mask; + uint64x2_t mask_sub_0, mask_sub_1; + float64x2_t log_c0, log_c2, log_c4, log_c5; + double log_c1, log_c3; + double ln2_lo, ln2_hi; + uint64x2_t small_exp, thres_exp; + double ln2_lo_n, ln2_hi_n; + double inv_ln2_n, exp_c2; + float64x2_t exp_c0, exp_c1; } data = { + /* Power threshold. */ + .inf = V2 (0x7ff0000000000000), + .small_powx = V2 (0x1p-126), + .offset = V2 (Off), + .mask = V2 (0xfffULL << 52), + .mask_sub_0 = V2 (1ULL << 52), + .mask_sub_1 = V2 (52ULL << 52), /* Coefficients copied from v_pow_log_data.c relative error: 0x1.11922ap-70 in [-0x1.6bp-8, 0x1.6bp-8] Coefficients are scaled to match the scaling during evaluation. */ - .log_poly - = { V2 (0x1.555555555556p-2 * -2), V2 (-0x1.0000000000006p-2 * -2), - V2 (0x1.999999959554ep-3 * 4), V2 (-0x1.555555529a47ap-3 * 4), - V2 (0x1.2495b9b4845e9p-3 * -8), V2 (-0x1.0002b8b263fc3p-3 * -8) }, - .ln2_hi = V2 (0x1.62e42fefa3800p-1), - .ln2_lo = V2 (0x1.ef35793c76730p-45), + .log_c0 = V2 (0x1.555555555556p-2 * -2), + .log_c1 = -0x1.0000000000006p-2 * -2, + .log_c2 = V2 (0x1.999999959554ep-3 * 4), + .log_c3 = -0x1.555555529a47ap-3 * 4, + .log_c4 = V2 (0x1.2495b9b4845e9p-3 * -8), + .log_c5 = V2 (-0x1.0002b8b263fc3p-3 * -8), + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, /* Polynomial coefficients: abs error: 1.43*2^-58, ulp error: 0.549 (0.550 without fma) if |x| < ln2/512. 
*/ - .exp_poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6ef9p-3), - V2 (0x1.5555576a5adcep-5) }, - .shift = V2 (0x1.8p52), /* round to nearest int. without intrinsics. */ - .inv_ln2_n = V2 (0x1.71547652b82fep8), /* N/ln2. */ - .ln2_hi_n = V2 (0x1.62e42fefc0000p-9), /* ln2/N. */ - .ln2_lo_n = V2 (-0x1.c610ca86c3899p-45), - .small_powx = V2 (0x1p-126), - .inf = V2 (0x7ff0000000000000) + .exp_c0 = V2 (0x1.fffffffffffd4p-2), + .exp_c1 = V2 (0x1.5555571d6ef9p-3), + .exp_c2 = 0x1.5555576a5adcep-5, + .small_exp = V2 (0x3c90000000000000), + .thres_exp = V2 (0x03f0000000000000), + .inv_ln2_n = 0x1.71547652b82fep8, /* N/ln2. */ + .ln2_hi_n = 0x1.62e42fefc0000p-9, /* ln2/N. */ + .ln2_lo_n = -0x1.c610ca86c3899p-45, }; -#define A(i) data.log_poly[i] -#define C(i) data.exp_poly[i] - /* This version implements an algorithm close to scalar pow but - does not implement the trick in the exp's specialcase subroutine to avoid double-rounding, @@ -91,10 +100,9 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - uint64x2_t tmp = vsubq_u64 (ix, v_u64 (Off)); - int64x2_t k - = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */ - uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, v_u64 (0xfffULL << 52))); + uint64x2_t tmp = vsubq_u64 (ix, d->offset); + int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); + uint64x2_t iz = vsubq_u64 (ix, vandq_u64 (tmp, d->mask)); float64x2_t z = vreinterpretq_f64_u64 (iz); float64x2_t kd = vcvtq_f64_s64 (k); /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */ @@ -105,9 +113,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, invc); /* k*Ln2 + log(c) + r. 
*/ - float64x2_t t1 = vfmaq_f64 (logc, kd, d->ln2_hi); + float64x2_t ln2 = vld1q_f64 (&d->ln2_lo); + float64x2_t t1 = vfmaq_laneq_f64 (logc, kd, ln2, 1); float64x2_t t2 = vaddq_f64 (t1, r); - float64x2_t lo1 = vfmaq_f64 (logctail, kd, d->ln2_lo); + float64x2_t lo1 = vfmaq_laneq_f64 (logctail, kd, ln2, 0); float64x2_t lo2 = vaddq_f64 (vsubq_f64 (t1, t2), r); /* Evaluation is optimized assuming superscalar pipelined execution. */ float64x2_t ar = vmulq_f64 (v_f64 (-0.5), r); @@ -118,9 +127,10 @@ v_log_inline (uint64x2_t ix, float64x2_t *tail, const struct data *d) float64x2_t lo3 = vfmaq_f64 (vnegq_f64 (ar2), ar, r); float64x2_t lo4 = vaddq_f64 (vsubq_f64 (t2, hi), ar2); /* p = log1p(r) - r - A[0]*r*r. */ - float64x2_t a56 = vfmaq_f64 (A (4), r, A (5)); - float64x2_t a34 = vfmaq_f64 (A (2), r, A (3)); - float64x2_t a12 = vfmaq_f64 (A (0), r, A (1)); + float64x2_t odd_coeffs = vld1q_f64 (&d->log_c1); + float64x2_t a56 = vfmaq_f64 (d->log_c4, r, d->log_c5); + float64x2_t a34 = vfmaq_laneq_f64 (d->log_c2, r, odd_coeffs, 1); + float64x2_t a12 = vfmaq_laneq_f64 (d->log_c0, r, odd_coeffs, 0); float64x2_t p = vfmaq_f64 (a34, ar2, a56); p = vfmaq_f64 (a12, ar2, p); p = vmulq_f64 (ar3, p); @@ -140,28 +150,28 @@ exp_special_case (float64x2_t x, float64x2_t xtail) /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. */ static inline float64x2_t -v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d) +v_exp_inline (float64x2_t x, float64x2_t neg_xtail, const struct data *d) { /* Fallback to scalar exp_inline for all lanes if any lane contains value of x s.t. |x| <= 2^-54 or >= 512. 
*/ - uint64x2_t abstop - = vshrq_n_u64 (vandq_u64 (vreinterpretq_u64_f64 (x), d->inf), 52); - uint64x2_t uoflowx - = vcgeq_u64 (vsubq_u64 (abstop, VecSmallExp), VecThresExp); + uint64x2_t uoflowx = vcgeq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (vabsq_f64 (x)), d->small_exp), + d->thres_exp); if (__glibc_unlikely (v_any_u64 (uoflowx))) - return exp_special_case (x, xtail); + return exp_special_case (x, vnegq_f64 (neg_xtail)); /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ /* x = ln2/N*k + r, with k integer and r in [-ln2/2N, ln2/2N]. */ - float64x2_t z = vmulq_f64 (d->inv_ln2_n, x); /* z - kd is in [-1, 1] in non-nearest rounding modes. */ - float64x2_t kd = vaddq_f64 (z, d->shift); - uint64x2_t ki = vreinterpretq_u64_f64 (kd); - kd = vsubq_f64 (kd, d->shift); - float64x2_t r = vfmsq_f64 (x, kd, d->ln2_hi_n); - r = vfmsq_f64 (r, kd, d->ln2_lo_n); + float64x2_t exp_consts = vld1q_f64 (&d->inv_ln2_n); + float64x2_t z = vmulq_laneq_f64 (x, exp_consts, 0); + float64x2_t kd = vrndnq_f64 (z); + uint64x2_t ki = vreinterpretq_u64_s64 (vcvtaq_s64_f64 (z)); + float64x2_t ln2_n = vld1q_f64 (&d->ln2_lo_n); + float64x2_t r = vfmsq_laneq_f64 (x, kd, ln2_n, 1); + r = vfmsq_laneq_f64 (r, kd, ln2_n, 0); /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ - r = vaddq_f64 (r, xtail); + r = vsubq_f64 (r, neg_xtail); /* 2^(k/N) ~= scale. */ uint64x2_t idx = vandq_u64 (ki, v_u64 (N_EXP - 1)); uint64x2_t top = vshlq_n_u64 (ki, 52 - V_POW_EXP_TABLE_BITS); @@ -170,8 +180,8 @@ v_exp_inline (float64x2_t x, float64x2_t xtail, const struct data *d) sbits = vaddq_u64 (sbits, top); /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). 
*/ float64x2_t r2 = vmulq_f64 (r, r); - float64x2_t tmp = vfmaq_f64 (C (1), r, C (2)); - tmp = vfmaq_f64 (C (0), r, tmp); + float64x2_t tmp = vfmaq_laneq_f64 (d->exp_c1, r, exp_consts, 1); + tmp = vfmaq_f64 (d->exp_c0, r, tmp); tmp = vfmaq_f64 (r, r2, tmp); float64x2_t scale = vreinterpretq_f64_u64 (sbits); /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there @@ -230,8 +240,8 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) { /* Normalize subnormal x so exponent becomes negative. */ uint64x2_t vix_norm = vreinterpretq_u64_f64 ( - vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (v_u64 (1ULL << 52))))); - vix_norm = vsubq_u64 (vix_norm, v_u64 (52ULL << 52)); + vabsq_f64 (vmulq_f64 (x, vcvtq_f64_u64 (d->mask_sub_0)))); + vix_norm = vsubq_u64 (vix_norm, d->mask_sub_1); vix = vbslq_u64 (sub_x, vix_norm, vix); } } @@ -242,8 +252,7 @@ float64x2_t VPCS_ATTR V_NAME_D2 (pow) (float64x2_t x, float64x2_t y) /* Vector Exp(y_loghi, y_loglo). */ float64x2_t vehi = vmulq_f64 (y, vhi); - float64x2_t velo = vmulq_f64 (y, vlo); float64x2_t vemi = vfmsq_f64 (vehi, y, vhi); - velo = vsubq_f64 (velo, vemi); - return v_exp_inline (vehi, velo, d); + float64x2_t neg_velo = vfmsq_f64 (vemi, y, vlo); + return v_exp_inline (vehi, neg_velo, d); } diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c index 4c0bf89..b8c1b39 100644 --- a/sysdeps/aarch64/fpu/pow_sve.c +++ b/sysdeps/aarch64/fpu/pow_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) pow function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -44,19 +44,18 @@ /* Data is defined in v_pow_log_data.c. */ #define N_LOG (1 << V_POW_LOG_TABLE_BITS) -#define A __v_pow_log_data.poly #define Off 0x3fe6955500000000 /* Data is defined in v_pow_exp_data.c. 
*/ #define N_EXP (1 << V_POW_EXP_TABLE_BITS) #define SignBias (0x800 << V_POW_EXP_TABLE_BITS) -#define C __v_pow_exp_data.poly #define SmallExp 0x3c9 /* top12(0x1p-54). */ #define BigExp 0x408 /* top12(512.). */ #define ThresExp 0x03f /* BigExp - SmallExp. */ #define HugeExp 0x409 /* top12(1024.). */ /* Constants associated with pow. */ +#define SmallBoundX 0x1p-126 #define SmallPowX 0x001 /* top12(0x1p-126). */ #define BigPowX 0x7ff /* top12(INFINITY). */ #define ThresPowX 0x7fe /* BigPowX - SmallPowX. */ @@ -64,6 +63,31 @@ #define BigPowY 0x43e /* top12(0x1.749p62). */ #define ThresPowY 0x080 /* BigPowY - SmallPowY. */ +static const struct data +{ + double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo; + double log_c1, log_c3, log_c5, off; + double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo; + double exp_c0, exp_c1; +} data = { + .log_c0 = -0x1p-1, + .log_c1 = -0x1.555555555556p-1, + .log_c2 = 0x1.0000000000006p-1, + .log_c3 = 0x1.999999959554ep-1, + .log_c4 = -0x1.555555529a47ap-1, + .log_c5 = -0x1.2495b9b4845e9p0, + .log_c6 = 0x1.0002b8b263fc3p0, + .off = Off, + .exp_c0 = 0x1.fffffffffffd4p-2, + .exp_c1 = 0x1.5555571d6ef9p-3, + .exp_c2 = 0x1.5555576a5adcep-5, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP, + .ln2_over_n_hi = 0x1.62e42fefc0000p-9, + .ln2_over_n_lo = -0x1.c610ca86c3899p-45, +}; + /* Check if x is an integer. 
*/ static inline svbool_t sv_isint (svbool_t pg, svfloat64_t x) @@ -82,7 +106,7 @@ sv_isnotint (svbool_t pg, svfloat64_t x) static inline svbool_t sv_isodd (svbool_t pg, svfloat64_t x) { - svfloat64_t y = svmul_x (pg, x, 0.5); + svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5); return sv_isnotint (pg, y); } @@ -121,7 +145,7 @@ zeroinfnan (uint64_t i) static inline svbool_t sv_zeroinfnan (svbool_t pg, svuint64_t i) { - return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1), + return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1), 2 * asuint64 (INFINITY) - 1); } @@ -174,16 +198,17 @@ sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2, additional 15 bits precision. IX is the bit representation of x, but normalized in the subnormal range using the sign bit for the exponent. */ static inline svfloat64_t -sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail) +sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail, + const struct data *d) { /* x = 2^k z; where z is in range [Off,2*Off) and exact. The range is split into N subintervals. The ith subinterval contains z and c is near its center. */ - svuint64_t tmp = svsub_x (pg, ix, Off); + svuint64_t tmp = svsub_x (pg, ix, d->off); svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS), sv_u64 (N_LOG - 1)); svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52); - svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52))); + svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52)); svfloat64_t z = svreinterpret_f64 (iz); svfloat64_t kd = svcvt_f64_x (pg, k); @@ -199,40 +224,85 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail) |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ svfloat64_t r = svmad_x (pg, z, invc, -1.0); /* k*Ln2 + log(c) + r. 
*/ - svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi); + + svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi); + svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0); svfloat64_t t2 = svadd_x (pg, t1, r); - svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo); + svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1); svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r); /* Evaluation is optimized assuming superscalar pipelined execution. */ - svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5. */ - svfloat64_t ar2 = svmul_x (pg, r, ar); - svfloat64_t ar3 = svmul_x (pg, r, ar2); + + svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0); + svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0); + svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar); + svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2); /* k*Ln2 + log(c) + r + A[0]*r*r. */ svfloat64_t hi = svadd_x (pg, t2, ar2); - svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r); + svfloat64_t lo3 = svmls_x (pg, ar2, ar, r); svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2); /* p = log1p(r) - r - A[0]*r*r. */ /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r * A[6])))). 
*/ - svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]); - svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]); - svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]); + + svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4); + svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1); + svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0); + svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1); svfloat64_t p = svmla_x (pg, a34, ar2, a56); p = svmla_x (pg, a12, ar2, p); - p = svmul_x (pg, ar3, p); + p = svmul_x (svptrue_b64 (), ar3, p); svfloat64_t lo = svadd_x ( - pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); + pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p); svfloat64_t y = svadd_x (pg, hi, lo); *tail = svadd_x (pg, svsub_x (pg, hi, y), lo); return y; } +static inline svfloat64_t +sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail, + svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits, + svuint64_t *ki, const struct data *d) +{ + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2); + svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0); + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + svfloat64_t kd = svrinta_x (pg, z); + *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd)); + + svfloat64_t ln2_over_n_hilo + = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi); + svfloat64_t r = x; + r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0); + r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1); + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r = svadd_x (pg, r, xtail); + /* 2^(k/N) ~= scale. 
*/ + svuint64_t idx = svand_x (pg, *ki, N_EXP - 1); + svuint64_t top + = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); + *sbits = svadd_x (pg, *sbits, top); + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1); + *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp); + *tmp = svmla_x (pg, r, r2, *tmp); + svfloat64_t scale = svreinterpret_f64 (*sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + z = svmla_x (pg, scale, scale, *tmp); + return z; +} + /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */ static inline svfloat64_t sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, - svuint64_t sign_bias) + svuint64_t sign_bias, const struct data *d) { /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow) and other cases of large values of x (scale * (1 + TMP) oflow). */ @@ -240,73 +310,46 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail, /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54). */ svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp); - /* Conditions special, uflow and oflow are all expressed as uoflow && - something, hence do not bother computing anything if no lane in uoflow is - true. */ - svbool_t special = svpfalse_b (); - svbool_t uflow = svpfalse_b (); - svbool_t oflow = svpfalse_b (); + svfloat64_t tmp; + svuint64_t sbits, ki; if (__glibc_unlikely (svptest_any (pg, uoflow))) { + svfloat64_t z + = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d); + /* |x| is tiny (|x| <= 0x1p-54). 
*/ - uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); + svbool_t uflow + = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000); uflow = svand_z (pg, uoflow, uflow); /* |x| is huge (|x| >= 1024). */ - oflow = svcmpge (pg, abstop, HugeExp); + svbool_t oflow = svcmpge (pg, abstop, HugeExp); oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow)); + /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow - or underflow. */ - special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); + or underflow. */ + svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow)); + + /* Update result with special and large cases. */ + z = sv_call_specialcase (tmp, sbits, ki, z, special); + + /* Handle underflow and overflow. */ + svbool_t x_is_neg = svcmplt (pg, x, 0); + svuint64_t sign_mask + = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); + svfloat64_t res_uoflow + = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); + res_uoflow = svreinterpret_f64 ( + svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); + /* Avoid spurious underflow for tiny x. */ + svfloat64_t res_spurious_uflow + = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); + + z = svsel (oflow, res_uoflow, z); + z = svsel (uflow, res_spurious_uflow, z); + return z; } - /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ - /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ - svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2); - /* z - kd is in [-1, 1] in non-nearest rounding modes. */ - svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift); - svfloat64_t kd = svadd_x (pg, z, shift); - svuint64_t ki = svreinterpret_u64 (kd); - kd = svsub_x (pg, kd, shift); - svfloat64_t r = x; - r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi); - r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo); - /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ - r = svadd_x (pg, r, xtail); - /* 2^(k/N) ~= scale. 
*/ - svuint64_t idx = svand_x (pg, ki, N_EXP - 1); - svuint64_t top - = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS); - /* This is only a valid scale when -1023*N < k < 1024*N. */ - svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx); - sbits = svadd_x (pg, sbits, top); - /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]); - tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp); - tmp = svmla_x (pg, r, r2, tmp); - svfloat64_t scale = svreinterpret_f64 (sbits); - /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there - is no spurious underflow here even without fma. */ - z = svmla_x (pg, scale, scale, tmp); - - /* Update result with special and large cases. */ - if (__glibc_unlikely (svptest_any (pg, special))) - z = sv_call_specialcase (tmp, sbits, ki, z, special); - - /* Handle underflow and overflow. */ - svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63); - svbool_t x_is_neg = svcmpne (pg, sign_bit, 0); - svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS); - svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY)); - res_uoflow = svreinterpret_f64 ( - svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask)); - z = svsel (oflow, res_uoflow, z); - /* Avoid spurious underflow for tiny x. */ - svfloat64_t res_spurious_uflow - = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000)); - z = svsel (uflow, res_spurious_uflow, z); - - return z; + return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d); } static inline double @@ -341,47 +384,39 @@ pow_sc (double x, double y) svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + /* This preamble handles special case conditions used in the final scalar fallbacks. 
It also updates ix and sign_bias, that are used in the core computation too, i.e., exp( y * log (x) ). */ svuint64_t vix0 = svreinterpret_u64 (x); svuint64_t viy0 = svreinterpret_u64 (y); - svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52); /* Negative x cases. */ - svuint64_t sign_bit = svlsr_m (pg, vix0, 63); - svbool_t xisneg = svcmpeq (pg, sign_bit, 1); + svbool_t xisneg = svcmplt (pg, x, 0); /* Set sign_bias and ix depending on sign of x and nature of y. */ - svbool_t yisnotint_xisneg = svpfalse_b (); + svbool_t yint_or_xpos = pg; svuint64_t sign_bias = sv_u64 (0); svuint64_t vix = vix0; - svuint64_t vtopx1 = vtopx0; if (__glibc_unlikely (svptest_any (pg, xisneg))) { /* Determine nature of y. */ - yisnotint_xisneg = sv_isnotint (xisneg, y); - svbool_t yisint_xisneg = sv_isint (xisneg, y); + yint_or_xpos = sv_isint (xisneg, y); svbool_t yisodd_xisneg = sv_isodd (xisneg, y); /* ix set to abs(ix) if y is integer. */ - vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff); - vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff); + vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff); /* Set to SignBias if x is negative and y is odd. */ sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0)); } - /* Special cases of x or y: zero, inf and nan. */ - svbool_t xspecial = sv_zeroinfnan (pg, vix0); - svbool_t yspecial = sv_zeroinfnan (pg, viy0); - svbool_t special = svorr_z (pg, xspecial, yspecial); - /* Small cases of x: |x| < 0x1p-126. */ - svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff); - svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX); - if (__glibc_unlikely (svptest_any (pg, xsmall))) + svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX); + if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall))) { /* Normalize subnormal x so exponent becomes negative. 
*/ - svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0); + svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52); + svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0); svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52)); vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff); @@ -391,20 +426,24 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg) /* y_hi = log(ix, &y_lo). */ svfloat64_t vlo; - svfloat64_t vhi = sv_log_inline (pg, vix, &vlo); + svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d); /* z = exp(y_hi, y_lo, sign_bias). */ - svfloat64_t vehi = svmul_x (pg, y, vhi); - svfloat64_t velo = svmul_x (pg, y, vlo); - svfloat64_t vemi = svmls_x (pg, vehi, y, vhi); - velo = svsub_x (pg, velo, vemi); - svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias); + svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi); + svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi); + svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo); + svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d); /* Cases of finite y and finite negative x. */ - vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz); + vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan (""))); + + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0); + svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0); + svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial); /* Cases of zero/inf/nan x or y. 
*/ - if (__glibc_unlikely (svptest_any (pg, special))) + if (__glibc_unlikely (svptest_any (svptrue_b64 (), special))) vz = sv_call2_f64 (pow_sc, x, y, vz, special); return vz; diff --git a/sysdeps/aarch64/fpu/powf_advsimd.c b/sysdeps/aarch64/fpu/powf_advsimd.c index 8232e70..5a4626b 100644 --- a/sysdeps/aarch64/fpu/powf_advsimd.c +++ b/sysdeps/aarch64/fpu/powf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (AdvSIMD) pow function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c index 4f6a142..7046990 100644 --- a/sysdeps/aarch64/fpu/powf_sve.c +++ b/sysdeps/aarch64/fpu/powf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) pow function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -26,7 +26,6 @@ #define Tlogc __v_powf_data.logc #define Texp __v_powf_data.scale #define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11)) -#define Shift 0x1.8p52 #define Norm 0x1p23f /* 0x4b000000. */ /* Overall ULP error bound for pow is 2.6 ulp @@ -36,7 +35,7 @@ static const struct data double log_poly[4]; double exp_poly[3]; float uflow_bound, oflow_bound, small_bound; - uint32_t sign_bias, sign_mask, subnormal_bias, off; + uint32_t sign_bias, subnormal_bias, off; } data = { /* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of V_POWF_EXP2_N. */ @@ -53,7 +52,6 @@ static const struct data .small_bound = 0x1p-126f, .off = 0x3f35d000, .sign_bias = SignBias, - .sign_mask = 0x80000000, .subnormal_bias = 0x0b800000, /* 23 << 23. 
*/ }; @@ -86,7 +84,7 @@ svisodd (svbool_t pg, svfloat32_t x) static inline svbool_t sv_zeroinfnan (svbool_t pg, svuint32_t i) { - return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1), + return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1), 2u * 0x7f800000 - 1); } @@ -150,9 +148,14 @@ powf_specialcase (float x, float y, float z) } /* Scalar fallback for special case routines with custom signature. */ -static inline svfloat32_t -sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp) +static svfloat32_t NOINLINE +sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y) { + /* Special cases of x or y: zero, inf and nan. */ + svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1)); + svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2)); + svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial); + svbool_t p = svpfirst (cmp, svpfalse ()); while (svptest_any (cmp, p)) { @@ -182,30 +185,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k, /* Polynomial to approximate log1p(r)/ln2. */ svfloat64_t logx = A (0); - logx = svmla_x (pg, A (1), r, logx); - logx = svmla_x (pg, A (2), r, logx); - logx = svmla_x (pg, A (3), r, logx); - logx = svmla_x (pg, y0, r, logx); + logx = svmad_x (pg, r, logx, A (1)); + logx = svmad_x (pg, r, logx, A (2)); + logx = svmad_x (pg, r, logx, A (3)); + logx = svmad_x (pg, r, logx, y0); *pylogx = svmul_x (pg, y, logx); /* z - kd is in [-1, 1] in non-nearest rounding modes. */ - svfloat64_t kd = svadd_x (pg, *pylogx, Shift); - svuint64_t ki = svreinterpret_u64 (kd); - kd = svsub_x (pg, kd, Shift); + svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx); + svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd)); r = svsub_x (pg, *pylogx, kd); /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). 
*/ - svuint64_t t - = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1)); - svuint64_t ski = svadd_x (pg, ki, sign_bias); - t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS)); + svuint64_t t = svld1_gather_index ( + svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1)); + svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias); + t = svadd_x (svptrue_b64 (), t, + svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS)); svfloat64_t s = svreinterpret_f64 (t); svfloat64_t p = C (0); p = svmla_x (pg, C (1), p, r); p = svmla_x (pg, C (2), p, r); - p = svmla_x (pg, s, p, svmul_x (pg, s, r)); + p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r)); return p; } @@ -219,19 +222,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, { const svbool_t ptrue = svptrue_b64 (); - /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in - order to perform core computation in double precision. */ + /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two + * in order to perform core computation in double precision. 
*/ const svbool_t pg_lo = svunpklo (pg); const svbool_t pg_hi = svunpkhi (pg); - svfloat64_t y_lo = svcvt_f64_x ( - ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); - svfloat64_t y_hi = svcvt_f64_x ( - ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); - svfloat32_t z = svreinterpret_f32 (iz); - svfloat64_t z_lo = svcvt_f64_x ( - ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z)))); - svfloat64_t z_hi = svcvt_f64_x ( - ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z)))); + svfloat64_t y_lo + = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y)))); + svfloat64_t y_hi + = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y)))); + svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz))); + svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz))); svuint64_t i_lo = svunpklo (i); svuint64_t i_hi = svunpkhi (i); svint64_t k_lo = svunpklo (k); @@ -258,9 +258,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k, /* Implementation of SVE powf. Provides the same accuracy as AdvSIMD powf, since it relies on the same algorithm. The theoretical maximum error is under 2.60 ULPs. - Maximum measured error is 2.56 ULPs: - SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127 - want 0x1.fd4b06p+127. */ + Maximum measured error is 2.57 ULPs: + SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127 + want 0x1.fff862p+127. */ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) { const struct data *d = ptr_barrier (&data); @@ -269,21 +269,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) svuint32_t viy0 = svreinterpret_u32 (y); /* Negative x cases. 
*/ - svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask); - svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask); + svbool_t xisneg = svcmplt (pg, x, sv_f32 (0)); /* Set sign_bias and ix depending on sign of x and nature of y. */ - svbool_t yisnotint_xisneg = svpfalse_b (); + svbool_t yint_or_xpos = pg; svuint32_t sign_bias = sv_u32 (0); svuint32_t vix = vix0; if (__glibc_unlikely (svptest_any (pg, xisneg))) { /* Determine nature of y. */ - yisnotint_xisneg = svisnotint (xisneg, y); - svbool_t yisint_xisneg = svisint (xisneg, y); + yint_or_xpos = svisint (xisneg, y); svbool_t yisodd_xisneg = svisodd (xisneg, y); /* ix set to abs(ix) if y is integer. */ - vix = svand_m (yisint_xisneg, vix0, 0x7fffffff); + vix = svand_m (yint_or_xpos, vix0, 0x7fffffff); /* Set to SignBias if x is negative and y is odd. */ sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0)); } @@ -294,8 +292,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) svbool_t cmp = svorr_z (pg, xspecial, yspecial); /* Small cases of x: |x| < 0x1p-126. */ - svbool_t xsmall = svaclt (pg, x, d->small_bound); - if (__glibc_unlikely (svptest_any (pg, xsmall))) + svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound); + if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall))) { /* Normalize subnormal x so exponent becomes negative. */ svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm)); @@ -304,32 +302,35 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg) vix = svsel (xsmall, vix_norm, vix); } /* Part of core computation carried in working precision. 
*/ - svuint32_t tmp = svsub_x (pg, vix, d->off); - svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)), - V_POWF_LOG2_N - 1); - svuint32_t top = svand_x (pg, tmp, 0xff800000); - svuint32_t iz = svsub_x (pg, vix, top); - svint32_t k - = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS)); - - /* Compute core in extended precision and return intermediate ylogx results to - handle cases of underflow and underflow in exp. */ + svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off); + svuint32_t i = svand_x ( + yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)), + V_POWF_LOG2_N - 1); + svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000); + svuint32_t iz = svsub_x (yint_or_xpos, vix, top); + svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top), + (23 - V_POWF_EXP2_TABLE_BITS)); + + /* Compute core in extended precision and return intermediate ylogx results + * to handle cases of underflow and overflow in exp. */ svfloat32_t ylogx; - svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d); + svfloat32_t ret + = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d); /* Handle exp special cases of underflow and overflow. */ - svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS); + svuint32_t sign + = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS); svfloat32_t ret_oflow - = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY))); + = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY))); svfloat32_t ret_uflow = svreinterpret_f32 (sign); - ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret); - ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret); + ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret); + ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret); /* Cases of finite y and finite negative x.
*/ - ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret); + ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf (""))); - if (__glibc_unlikely (svptest_any (pg, cmp))) - return sv_call_powf_sc (x, y, ret, cmp); + if (__glibc_unlikely (svptest_any (cmp, cmp))) + return sv_call_powf_sc (x, y, ret); return ret; } diff --git a/sysdeps/aarch64/fpu/s_llrint.c b/sysdeps/aarch64/fpu/s_llrint.c index e0e3e1b..3ed519c 100644 --- a/sysdeps/aarch64/fpu/s_llrint.c +++ b/sysdeps/aarch64/fpu/s_llrint.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2024 Free Software Foundation, Inc. +/* Copyright (C) 2011-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/s_llrintf.c b/sysdeps/aarch64/fpu/s_llrintf.c index 2585f91..0f33958 100644 --- a/sysdeps/aarch64/fpu/s_llrintf.c +++ b/sysdeps/aarch64/fpu/s_llrintf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2024 Free Software Foundation, Inc. +/* Copyright (C) 2011-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/s_llround.c b/sysdeps/aarch64/fpu/s_llround.c index 9092089..a1b46e8 100644 --- a/sysdeps/aarch64/fpu/s_llround.c +++ b/sysdeps/aarch64/fpu/s_llround.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2024 Free Software Foundation, Inc. +/* Copyright (C) 2011-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/s_llroundf.c b/sysdeps/aarch64/fpu/s_llroundf.c index 578290a..5269f44 100644 --- a/sysdeps/aarch64/fpu/s_llroundf.c +++ b/sysdeps/aarch64/fpu/s_llroundf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2024 Free Software Foundation, Inc. +/* Copyright (C) 2011-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
diff --git a/sysdeps/aarch64/fpu/s_lrint.c b/sysdeps/aarch64/fpu/s_lrint.c index 6b5242e..904c42c 100644 --- a/sysdeps/aarch64/fpu/s_lrint.c +++ b/sysdeps/aarch64/fpu/s_lrint.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2024 Free Software Foundation, Inc. +/* Copyright (C) 1996-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/s_lrintf.c b/sysdeps/aarch64/fpu/s_lrintf.c index 5933576..cc9d44c 100644 --- a/sysdeps/aarch64/fpu/s_lrintf.c +++ b/sysdeps/aarch64/fpu/s_lrintf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2024 Free Software Foundation, Inc. +/* Copyright (C) 2011-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/s_lround.c b/sysdeps/aarch64/fpu/s_lround.c index 5d7b092..ab5c7ce 100644 --- a/sysdeps/aarch64/fpu/s_lround.c +++ b/sysdeps/aarch64/fpu/s_lround.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1996-2024 Free Software Foundation, Inc. +/* Copyright (C) 1996-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/s_lroundf.c b/sysdeps/aarch64/fpu/s_lroundf.c index 0365e88..236f1a8 100644 --- a/sysdeps/aarch64/fpu/s_lroundf.c +++ b/sysdeps/aarch64/fpu/s_lroundf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2024 Free Software Foundation, Inc. +/* Copyright (C) 2011-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/s_roundeven.c b/sysdeps/aarch64/fpu/s_roundeven.c index d69b352..de9ee56 100644 --- a/sysdeps/aarch64/fpu/s_roundeven.c +++ b/sysdeps/aarch64/fpu/s_roundeven.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2021-2024 Free Software Foundation, Inc. +/* Copyright (C) 2021-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
diff --git a/sysdeps/aarch64/fpu/s_roundevenf.c b/sysdeps/aarch64/fpu/s_roundevenf.c index 2278976..63b8aa7 100644 --- a/sysdeps/aarch64/fpu/s_roundevenf.c +++ b/sysdeps/aarch64/fpu/s_roundevenf.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2021-2024 Free Software Foundation, Inc. +/* Copyright (C) 2021-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py b/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py index 8cc8d4f..7aaa563 100644 --- a/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py +++ b/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py @@ -1,5 +1,5 @@ #!/usr/bin/python3 -# Copyright (C) 2023-2024 Free Software Foundation, Inc. +# Copyright (C) 2023-2025 Free Software Foundation, Inc. # This file is part of the GNU C Library. # # The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py b/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py index 591e4d3..594e2a4 100755 --- a/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py +++ b/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py @@ -1,5 +1,5 @@ #!/usr/bin/python3 -# Copyright (C) 2023-2024 Free Software Foundation, Inc. +# Copyright (C) 2023-2025 Free Software Foundation, Inc. # This file is part of the GNU C Library. # # The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c index a0d9d3b..4e5118d 100644 --- a/sysdeps/aarch64/fpu/sin_advsimd.c +++ b/sysdeps/aarch64/fpu/sin_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) sin function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ static const struct data { float64x2_t poly[7]; - float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; + float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3; } data = { .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), @@ -34,12 +34,13 @@ static const struct data .pi_1 = V2 (0x1.921fb54442d18p+1), .pi_2 = V2 (0x1.1a62633145c06p-53), .pi_3 = V2 (0x1.c1cd129024e09p-106), - .shift = V2 (0x1.8p52), }; #if WANT_SIMD_EXCEPT -# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */ -# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */ +/* asuint64(0x1p-253), below which multiply by inv_pi underflows. */ +# define TinyBound v_u64 (0x3020000000000000) +/* RangeVal - TinyBound. */ +# define Thresh v_u64 (0x1160000000000000) #endif #define C(i) d->poly[i] @@ -72,16 +73,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x) fenv). These lanes will be fixed by special-case handler later. */ uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); - r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x); + r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp)); #else r = x; cmp = vcageq_f64 (x, d->range_val); #endif /* n = rint(|x|/pi). */ - n = vfmaq_f64 (d->shift, d->inv_pi, r); - odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); - n = vsubq_f64 (n, d->shift); + n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi)); + odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ r = vfmsq_f64 (r, d->pi_1, n); diff --git a/sysdeps/aarch64/fpu/sin_sve.c b/sysdeps/aarch64/fpu/sin_sve.c index 123a56b..fee3dc3 100644 --- a/sysdeps/aarch64/fpu/sin_sve.c +++ b/sysdeps/aarch64/fpu/sin_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) sin function.
- Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c index 375dfc3..4cda651 100644 --- a/sysdeps/aarch64/fpu/sinf_advsimd.c +++ b/sysdeps/aarch64/fpu/sinf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) sin function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,7 +22,7 @@ static const struct data { float32x4_t poly[4]; - float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; + float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3; } data = { /* 1.886 ulp error. */ .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), @@ -33,13 +33,14 @@ static const struct data .pi_3 = V4 (-0x1.ee59dap-49f), .inv_pi = V4 (0x1.45f306p-2f), - .shift = V4 (0x1.8p+23f), .range_val = V4 (0x1p20f) }; #if WANT_SIMD_EXCEPT -# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */ -# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */ +/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */ +# define TinyBound v_u32 (0x22000000) +/* RangeVal - TinyBound. */ +# define Thresh v_u32 (0x27800000) #endif #define C(i) d->poly[i] @@ -64,23 +65,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x) /* If fenv exceptions are to be triggered correctly, set any special lanes to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by special-case handler later. 
*/ - r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x); + r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp)); #else r = x; cmp = vcageq_f32 (x, d->range_val); #endif - /* n = rint(|x|/pi) */ - n = vfmaq_f32 (d->shift, d->inv_pi, r); - odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); - n = vsubq_f32 (n, d->shift); + /* n = rint(|x|/pi). */ + n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi)); + odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31); - /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */ + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ r = vfmsq_f32 (r, d->pi_1, n); r = vfmsq_f32 (r, d->pi_2, n); r = vfmsq_f32 (r, d->pi_3, n); - /* y = sin(r) */ + /* y = sin(r). */ r2 = vmulq_f32 (r, r); y = vfmaq_f32 (C (2), C (3), r2); y = vfmaq_f32 (C (1), y, r2); diff --git a/sysdeps/aarch64/fpu/sinf_sve.c b/sysdeps/aarch64/fpu/sinf_sve.c index 0d1ff81..14c4510 100644 --- a/sysdeps/aarch64/fpu/sinf_sve.c +++ b/sysdeps/aarch64/fpu/sinf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) sin function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c index 3e3b76c..0d6a485 100644 --- a/sysdeps/aarch64/fpu/sinh_advsimd.c +++ b/sysdeps/aarch64/fpu/sinh_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) sinh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,72 +18,31 @@ <https://www.gnu.org/licenses/>. 
*/ #include "v_math.h" -#include "poly_advsimd_f64.h" +#include "v_expm1_inline.h" static const struct data { - float64x2_t poly[11], inv_ln2; - double m_ln2[2]; - float64x2_t shift; + struct v_expm1_data d; uint64x2_t halff; - int64x2_t onef; #if WANT_SIMD_EXCEPT uint64x2_t tiny_bound, thresh; #else - uint64x2_t large_bound; + float64x2_t large_bound; #endif } data = { - /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ - .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), - V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), - V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), - V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), - V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), }, - - .inv_ln2 = V2 (0x1.71547652b82fep0), - .m_ln2 = {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56}, - .shift = V2 (0x1.8p52), - + .d = V_EXPM1_DATA, .halff = V2 (0x3fe0000000000000), - .onef = V2 (0x3ff0000000000000), #if WANT_SIMD_EXCEPT /* 2^-26, below which sinh(x) rounds to x. */ .tiny_bound = V2 (0x3e50000000000000), /* asuint(large_bound) - asuint(tiny_bound). */ .thresh = V2 (0x0230000000000000), #else -/* 2^9. expm1 helper overflows for large input. */ - .large_bound = V2 (0x4080000000000000), + /* 2^9. expm1 helper overflows for large input. */ + .large_bound = V2 (0x1p+9), #endif }; -static inline float64x2_t -expm1_inline (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - - /* Reduce argument: - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 - where i = round(x / ln2) - and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ - float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift); - int64x2_t i = vcvtq_s64_f64 (j); - - float64x2_t m_ln2 = vld1q_f64 (d->m_ln2); - float64x2_t f = vfmaq_laneq_f64 (x, j, m_ln2, 0); - f = vfmaq_laneq_f64 (f, j, m_ln2, 1); - /* Approximate expm1(f) using polynomial. 
*/ - float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t f4 = vmulq_f64 (f2, f2); - float64x2_t f8 = vmulq_f64 (f4, f4); - float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly)); - /* t = 2^i. */ - float64x2_t t = vreinterpretq_f64_u64 ( - vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef))); - /* expm1(x) ~= p * t + (t - 1). */ - return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); -} - static float64x2_t NOINLINE VPCS_ATTR special_case (float64x2_t x) { @@ -92,23 +51,23 @@ special_case (float64x2_t x) /* Approximation for vector double-precision sinh(x) using expm1. sinh(x) = (exp(x) - exp(-x)) / 2. - The greatest observed error is 2.57 ULP: - _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2 - want 0x1.ab34e59d678d9p-2. */ + The greatest observed error is 2.52 ULP: + _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2 + want -0x1.ac2f05bb66fc9p-2. */ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x) { const struct data *d = ptr_barrier (&data); float64x2_t ax = vabsq_f64 (x); - uint64x2_t sign - = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax)); - float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff)); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + float64x2_t halfsign = vreinterpretq_f64_u64 ( + vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff)); #if WANT_SIMD_EXCEPT uint64x2_t special = vcgeq_u64 ( vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh); #else - uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound); + uint64x2_t special = vcageq_f64 (x, d->large_bound); #endif /* Fall back to scalar variant for all lanes if any of them are special. */ @@ -118,7 +77,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x) /* Up to the point that expm1 overflows, we can use it to calculate sinh using a slight rearrangement of the definition of sinh. This allows us to retain acceptable accuracy for very small inputs. 
*/ - float64x2_t t = expm1_inline (ax); + float64x2_t t = expm1_inline (ax, &d->d); t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0)))); return vmulq_f64 (t, halfsign); } diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c index df5f6c8..963453f 100644 --- a/sysdeps/aarch64/fpu/sinh_sve.c +++ b/sysdeps/aarch64/fpu/sinh_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) atanh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c index 6bb7482..41d1910 100644 --- a/sysdeps/aarch64/fpu/sinhf_advsimd.c +++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) sinh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -23,15 +23,13 @@ static const struct data { struct v_expm1f_data expm1f_consts; - uint32x4_t halff; #if WANT_SIMD_EXCEPT uint32x4_t tiny_bound, thresh; #else - uint32x4_t oflow_bound; + float32x4_t oflow_bound; #endif } data = { .expm1f_consts = V_EXPM1F_DATA, - .halff = V4 (0x3f000000), #if WANT_SIMD_EXCEPT /* 0x1.6a09e8p-32, below which expm1f underflows. */ .tiny_bound = V4 (0x2fb504f4), @@ -39,14 +37,15 @@ static const struct data .thresh = V4 (0x12fbbbb3), #else /* 0x1.61814ep+6, above which expm1f helper overflows. 
*/ - .oflow_bound = V4 (0x42b0c0a7), + .oflow_bound = V4 (0x1.61814ep+6), #endif }; static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign, + uint32x4_t special) { - return v_call_f32 (sinhf, x, y, special); + return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special); } /* Approximation for vector single-precision sinh(x) using expm1. @@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x) uint32x4_t ix = vreinterpretq_u32_f32 (x); float32x4_t ax = vabsq_f32 (x); - uint32x4_t iax = vreinterpretq_u32_f32 (ax); - uint32x4_t sign = veorq_u32 (ix, iax); - float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff)); + float32x4_t halfsign = vreinterpretq_f32_u32 ( + vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5)))); #if WANT_SIMD_EXCEPT - uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh); + uint32x4_t special = vcgeq_u32 ( + vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh); ax = v_zerofy_f32 (ax, special); #else - uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound); + uint32x4_t special = vcageq_f32 (x, d->oflow_bound); #endif /* Up to the point that expm1f overflows, we can use it to calculate sinhf @@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x) /* Fall back to the scalar variant for any lanes that should trigger an exception. */ if (__glibc_unlikely (v_any_u32 (special))) - return special_case (x, vmulq_f32 (t, halfsign), special); + return special_case (x, t, halfsign, special); return vmulq_f32 (t, halfsign); } diff --git a/sysdeps/aarch64/fpu/sinhf_sve.c b/sysdeps/aarch64/fpu/sinhf_sve.c index 6c204b5..90692ac 100644 --- a/sysdeps/aarch64/fpu/sinhf_sve.c +++ b/sysdeps/aarch64/fpu/sinhf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) sinh function - Copyright (C) 2024 Free Software Foundation, Inc. 
+ Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -63,5 +63,5 @@ svfloat32_t SV_NAME_F1 (sinh) (svfloat32_t x, const svbool_t pg) if (__glibc_unlikely (svptest_any (pg, special))) return special_case (x, svmul_x (pg, t, halfsign), special); - return svmul_x (pg, t, halfsign); + return svmul_x (svptrue_b32 (), t, halfsign); } diff --git a/sysdeps/aarch64/fpu/sinpi_advsimd.c b/sysdeps/aarch64/fpu/sinpi_advsimd.c new file mode 100644 index 0000000..6965644 --- /dev/null +++ b/sysdeps/aarch64/fpu/sinpi_advsimd.c @@ -0,0 +1,87 @@ +/* Double-precision (Advanced SIMD) sinpi function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" +#include "poly_advsimd_f64.h" + +static const struct data +{ + float64x2_t poly[10]; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. 
*/ + .poly = { V2 (0x1.921fb54442d184p1), V2 (-0x1.4abbce625be53p2), + V2 (0x1.466bc6775ab16p1), V2 (-0x1.32d2cce62dc33p-1), + V2 (0x1.507834891188ep-4), V2 (-0x1.e30750a28c88ep-8), + V2 (0x1.e8f48308acda4p-12), V2 (-0x1.6fc0032b3c29fp-16), + V2 (0x1.af86ae521260bp-21), V2 (-0x1.012a9870eeb7dp-25) }, +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u64 (0x3bf0000000000000) /* asuint64(0x1p-64). */ +/* asuint64(0x1p64) - TinyBound. */ +# define Thresh v_u64 (0x07f0000000000000) + +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); + return v_call_f64 (sinpi, x, y, cmp); +} +#endif + +/* Approximation for vector double-precision sinpi(x). + Maximum Error 3.05 ULP: + _ZGVnN2v_sinpi(0x1.d32750db30b4ap-2) got 0x1.fb295878301c7p-1 + want 0x1.fb295878301cap-1. */ +float64x2_t VPCS_ATTR V_NAME_D1 (sinpi) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x)); + uint64x2_t cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh); + + /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0 + to avoid them under/overflowing and throwing exceptions. */ + float64x2_t r = v_zerofy_f64 (x, cmp); +#else + float64x2_t r = x; +#endif + + /* If r is odd, the sign of the result should be inverted. */ + uint64x2_t odd + = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtaq_s64_f64 (r)), 63); + + /* r = x - rint(x). Range reduction to -1/2 .. 1/2. */ + r = vsubq_f64 (r, vrndaq_f64 (r)); + + /* y = sin(r). 
*/ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t r4 = vmulq_f64 (r2, r2); + float64x2_t y = vmulq_f64 (v_pw_horner_9_f64 (r2, r4, d->poly), r); + +#if WANT_SIMD_EXCEPT + if (__glibc_unlikely (v_any_u64 (cmp))) + return special_case (x, y, odd, cmp); +#endif + + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); +} diff --git a/sysdeps/aarch64/fpu/sinpi_sve.c b/sysdeps/aarch64/fpu/sinpi_sve.c new file mode 100644 index 0000000..b9c6257 --- /dev/null +++ b/sysdeps/aarch64/fpu/sinpi_sve.c @@ -0,0 +1,61 @@ +/* Double-precision (SVE) sinpi function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" +#include "poly_sve_f64.h" + +static const struct data +{ + double poly[10], range_val; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see sinpi.sollya for details. */ + .poly = { 0x1.921fb54442d184p1, -0x1.4abbce625be53p2, 0x1.466bc6775ab16p1, + -0x1.32d2cce62dc33p-1, 0x1.507834891188ep-4, -0x1.e30750a28c88ep-8, + 0x1.e8f48308acda4p-12, -0x1.6fc0032b3c29fp-16, + 0x1.af86ae521260bp-21, -0x1.012a9870eeb7dp-25 }, + .range_val = 0x1p63, +}; + +/* A fast SVE implementation of sinpi. 
+ Maximum error 3.10 ULP: + _ZGVsMxv_sinpi(0x1.df1a14f1b235p-2) got 0x1.fd64f541606cp-1 + want 0x1.fd64f541606c3p-1. */ +svfloat64_t SV_NAME_D1 (sinpi) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* range reduction into -1/2 .. 1/2 + with n = rint(x) and r = x - n. */ + svfloat64_t n = svrinta_x (pg, x); + svfloat64_t r = svsub_x (pg, x, n); + + /* Result should be negated based on if n is odd or not. */ + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint64_t intn = svreinterpret_u64 (svcvt_s64_z (pg, n)); + svuint64_t sign = svlsl_z (cmp, intn, 63); + + /* y = sin(r). */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t r4 = svmul_x (pg, r2, r2); + svfloat64_t y = sv_pw_horner_9_f64_x (pg, r2, r4, d->poly); + y = svmul_x (pg, y, r); + + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); +} diff --git a/sysdeps/aarch64/fpu/sinpif_advsimd.c b/sysdeps/aarch64/fpu/sinpif_advsimd.c new file mode 100644 index 0000000..2e77aaa --- /dev/null +++ b/sysdeps/aarch64/fpu/sinpif_advsimd.c @@ -0,0 +1,85 @@ +/* Single-precision (Advanced SIMD) sinpi function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "v_math.h" +#include "poly_advsimd_f32.h" + +static const struct data +{ + float32x4_t poly[6]; +} data = { + /* Taylor series coefficients for sin(pi * x). */ + .poly = { V4 (0x1.921fb6p1f), V4 (-0x1.4abbcep2f), V4 (0x1.466bc6p1f), + V4 (-0x1.32d2ccp-1f), V4 (0x1.50783p-4f), V4 (-0x1.e30750p-8f) }, +}; + +#if WANT_SIMD_EXCEPT +# define TinyBound v_u32 (0x30000000) /* asuint32(0x1p-31f). */ +# define Thresh v_u32 (0x1f000000) /* asuint32(0x1p31f) - TinyBound. */ + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) +{ + /* Fall back to scalar code. */ + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); + return v_call_f32 (sinpif, x, y, cmp); +} +#endif + +/* Approximation for vector single-precision sinpi(x) + Maximum Error 3.03 ULP: + _ZGVnN4v_sinpif(0x1.c597ccp-2) got 0x1.f7cd56p-1 + want 0x1.f7cd5p-1. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinpi) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + +#if WANT_SIMD_EXCEPT + uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x)); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh); + + /* When WANT_SIMD_EXCEPT = 1, special lanes should be set to 0 + to avoid them under/overflowing and throwing exceptions. */ + float32x4_t r = v_zerofy_f32 (x, cmp); +#else + float32x4_t r = x; +#endif + + /* If r is odd, the sign of the result should be inverted. */ + uint32x4_t odd + = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (r)), 31); + + /* r = x - rint(x). Range reduction to -1/2 .. 1/2. */ + r = vsubq_f32 (r, vrndaq_f32 (r)); + + /* Pairwise Horner approximation for y = sin(r * pi). 
*/ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r); + +#if WANT_SIMD_EXCEPT + if (__glibc_unlikely (v_any_u32 (cmp))) + return special_case (x, y, odd, cmp); +#endif + + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); +} + +libmvec_hidden_def (V_NAME_F1 (sinpi)) +HALF_WIDTH_ALIAS_F1 (sinpi) diff --git a/sysdeps/aarch64/fpu/sinpif_sve.c b/sysdeps/aarch64/fpu/sinpif_sve.c new file mode 100644 index 0000000..10ff569 --- /dev/null +++ b/sysdeps/aarch64/fpu/sinpif_sve.c @@ -0,0 +1,57 @@ +/* Single-precision (SVE) sinpi function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" +#include "poly_sve_f32.h" + +static const struct data +{ + float poly[6], range_val; +} data = { + /* Taylor series coefficients for sin(pi * x). */ + .poly = { 0x1.921fb6p1f, -0x1.4abbcep2f, 0x1.466bc6p1f, -0x1.32d2ccp-1f, + 0x1.50783p-4f, -0x1.e30750p-8f }, + .range_val = 0x1p31, +}; + +/* A fast SVE implementation of sinpif. + Maximum error 2.48 ULP: + _ZGVsMxv_sinpif(0x1.d062b6p-2) got 0x1.fa8c06p-1 + want 0x1.fa8c02p-1. 
*/ +svfloat32_t SV_NAME_F1 (sinpi) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* range reduction into -1/2 .. 1/2 + with n = rint(x) and r = x - n. */ + svfloat32_t n = svrinta_x (pg, x); + svfloat32_t r = svsub_x (pg, x, n); + + /* Result should be negated based on if n is odd or not. */ + svbool_t cmp = svaclt (pg, x, d->range_val); + svuint32_t intn = svreinterpret_u32 (svcvt_s32_z (pg, n)); + svuint32_t sign = svlsl_z (cmp, intn, 31); + + /* y = sin(r). */ + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t y = sv_horner_5_f32_x (pg, r2, d->poly); + y = svmul_x (pg, y, r); + + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); +} diff --git a/sysdeps/aarch64/fpu/sv_erf_data.c b/sysdeps/aarch64/fpu/sv_erf_data.c deleted file mode 100644 index a53878f..0000000 --- a/sysdeps/aarch64/fpu/sv_erf_data.c +++ /dev/null @@ -1,1570 +0,0 @@ -/* Table for SVE erf approximation - - Copyright (C) 2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include "vecmath_config.h" - -/* Lookup table used in vector erf. 
- For each possible rounded input r (multiples of 1/128), between - r = 0.0 and r = 6.0 (769 values): - - the first entry __erf_data.tab.erf contains the values of erf(r), - - the second entry __erf_data.tab.scale contains the values of - 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the - algorithm, since lookup is performed only for x >= 1/64-1/512. */ -const struct sv_erf_data __sv_erf_data = { - .erf = { 0x0.0000000000000p+0, - 0x1.20dbf3deb1340p-7, - 0x1.20d77083f17a0p-6, - 0x1.b137e0cf584dcp-6, - 0x1.20c5645dd2538p-5, - 0x1.68e5d3bbc9526p-5, - 0x1.b0fafef135745p-5, - 0x1.f902a77bd3821p-5, - 0x1.207d480e90658p-4, - 0x1.44703e87e8593p-4, - 0x1.68591a1e83b5dp-4, - 0x1.8c36beb8a8d23p-4, - 0x1.b0081148a873ap-4, - 0x1.d3cbf7e70a4b3p-4, - 0x1.f78159ec8bb50p-4, - 0x1.0d939005f65e5p-3, - 0x1.1f5e1a35c3b89p-3, - 0x1.311fc15f56d14p-3, - 0x1.42d7fc2f64959p-3, - 0x1.548642321d7c6p-3, - 0x1.662a0bdf7a89fp-3, - 0x1.77c2d2a765f9ep-3, - 0x1.895010fdbdbfdp-3, - 0x1.9ad142662e14dp-3, - 0x1.ac45e37fe2526p-3, - 0x1.bdad72110a648p-3, - 0x1.cf076d1233237p-3, - 0x1.e05354b96ff36p-3, - 0x1.f190aa85540e2p-3, - 0x1.015f78a3dcf3dp-2, - 0x1.09eed6982b948p-2, - 0x1.127631eb8de32p-2, - 0x1.1af54e232d609p-2, - 0x1.236bef825d9a2p-2, - 0x1.2bd9db0f7827fp-2, - 0x1.343ed6989b7d9p-2, - 0x1.3c9aa8b84bedap-2, - 0x1.44ed18d9f6462p-2, - 0x1.4d35ef3e5372ep-2, - 0x1.5574f4ffac98ep-2, - 0x1.5da9f415ff23fp-2, - 0x1.65d4b75b00471p-2, - 0x1.6df50a8dff772p-2, - 0x1.760aba57a76bfp-2, - 0x1.7e15944d9d3e4p-2, - 0x1.861566f5fd3c0p-2, - 0x1.8e0a01cab516bp-2, - 0x1.95f3353cbb146p-2, - 0x1.9dd0d2b721f39p-2, - 0x1.a5a2aca209394p-2, - 0x1.ad68966569a87p-2, - 0x1.b522646bbda68p-2, - 0x1.bccfec24855b8p-2, - 0x1.c4710406a65fcp-2, - 0x1.cc058392a6d2dp-2, - 0x1.d38d4354c3bd0p-2, - 0x1.db081ce6e2a48p-2, - 0x1.e275eaf25e458p-2, - 0x1.e9d68931ae650p-2, - 0x1.f129d471eabb1p-2, - 0x1.f86faa9428f9dp-2, - 0x1.ffa7ea8eb5fd0p-2, - 0x1.03693a371519cp-1, - 0x1.06f794ab2cae7p-1, - 0x1.0a7ef5c18edd2p-1, - 
0x1.0dff4f247f6c6p-1, - 0x1.1178930ada115p-1, - 0x1.14eab43841b55p-1, - 0x1.1855a5fd3dd50p-1, - 0x1.1bb95c3746199p-1, - 0x1.1f15cb50bc4dep-1, - 0x1.226ae840d4d70p-1, - 0x1.25b8a88b6dd7fp-1, - 0x1.28ff0240d52cdp-1, - 0x1.2c3debfd7d6c1p-1, - 0x1.2f755ce9a21f4p-1, - 0x1.32a54cb8db67bp-1, - 0x1.35cdb3a9a144dp-1, - 0x1.38ee8a84beb71p-1, - 0x1.3c07ca9cb4f9ep-1, - 0x1.3f196dcd0f135p-1, - 0x1.42236e79a5fa6p-1, - 0x1.4525c78dd5966p-1, - 0x1.4820747ba2dc2p-1, - 0x1.4b13713ad3513p-1, - 0x1.4dfeba47f63ccp-1, - 0x1.50e24ca35fd2cp-1, - 0x1.53be25d016a4fp-1, - 0x1.569243d2b3a9bp-1, - 0x1.595ea53035283p-1, - 0x1.5c2348ecc4dc3p-1, - 0x1.5ee02e8a71a53p-1, - 0x1.61955607dd15dp-1, - 0x1.6442bfdedd397p-1, - 0x1.66e86d0312e82p-1, - 0x1.69865ee075011p-1, - 0x1.6c1c9759d0e5fp-1, - 0x1.6eab18c74091bp-1, - 0x1.7131e5f496a5ap-1, - 0x1.73b1021fc0cb8p-1, - 0x1.762870f720c6fp-1, - 0x1.78983697dc96fp-1, - 0x1.7b00578c26037p-1, - 0x1.7d60d8c979f7bp-1, - 0x1.7fb9bfaed8078p-1, - 0x1.820b1202f27fbp-1, - 0x1.8454d5f25760dp-1, - 0x1.8697120d92a4ap-1, - 0x1.88d1cd474a2e0p-1, - 0x1.8b050ef253c37p-1, - 0x1.8d30debfc572ep-1, - 0x1.8f5544bd00c04p-1, - 0x1.91724951b8fc6p-1, - 0x1.9387f53df5238p-1, - 0x1.959651980da31p-1, - 0x1.979d67caa6631p-1, - 0x1.999d4192a5715p-1, - 0x1.9b95e8fd26abap-1, - 0x1.9d8768656cc42p-1, - 0x1.9f71ca72cffb6p-1, - 0x1.a1551a16aaeafp-1, - 0x1.a331628a45b92p-1, - 0x1.a506af4cc00f4p-1, - 0x1.a6d50c20fa293p-1, - 0x1.a89c850b7d54dp-1, - 0x1.aa5d265064366p-1, - 0x1.ac16fc7143263p-1, - 0x1.adca142b10f98p-1, - 0x1.af767a741088bp-1, - 0x1.b11c3c79bb424p-1, - 0x1.b2bb679ead19cp-1, - 0x1.b4540978921eep-1, - 0x1.b5e62fce16095p-1, - 0x1.b771e894d602ep-1, - 0x1.b8f741ef54f83p-1, - 0x1.ba764a2af2b78p-1, - 0x1.bbef0fbde6221p-1, - 0x1.bd61a1453ab44p-1, - 0x1.bece0d82d1a5cp-1, - 0x1.c034635b66e23p-1, - 0x1.c194b1d49a184p-1, - 0x1.c2ef0812fc1bdp-1, - 0x1.c443755820d64p-1, - 0x1.c5920900b5fd1p-1, - 0x1.c6dad2829ec62p-1, - 0x1.c81de16b14cefp-1, - 0x1.c95b455cce69dp-1, - 0x1.ca930e0e2a825p-1, - 
0x1.cbc54b476248dp-1, - 0x1.ccf20ce0c0d27p-1, - 0x1.ce1962c0e0d8bp-1, - 0x1.cf3b5cdaf0c39p-1, - 0x1.d0580b2cfd249p-1, - 0x1.d16f7dbe41ca0p-1, - 0x1.d281c49d818d0p-1, - 0x1.d38eefdf64fddp-1, - 0x1.d4970f9ce00d9p-1, - 0x1.d59a33f19ed42p-1, - 0x1.d6986cfa798e7p-1, - 0x1.d791cad3eff01p-1, - 0x1.d8865d98abe01p-1, - 0x1.d97635600bb89p-1, - 0x1.da61623cb41e0p-1, - 0x1.db47f43b2980dp-1, - 0x1.dc29fb60715afp-1, - 0x1.dd0787a8bb39dp-1, - 0x1.dde0a90611a0dp-1, - 0x1.deb56f5f12d28p-1, - 0x1.df85ea8db188ep-1, - 0x1.e0522a5dfda73p-1, - 0x1.e11a3e8cf4eb8p-1, - 0x1.e1de36c75ba58p-1, - 0x1.e29e22a89d766p-1, - 0x1.e35a11b9b61cep-1, - 0x1.e4121370224ccp-1, - 0x1.e4c6372cd8927p-1, - 0x1.e5768c3b4a3fcp-1, - 0x1.e62321d06c5e0p-1, - 0x1.e6cc0709c8a0dp-1, - 0x1.e7714aec96534p-1, - 0x1.e812fc64db369p-1, - 0x1.e8b12a44944a8p-1, - 0x1.e94be342e6743p-1, - 0x1.e9e335fb56f87p-1, - 0x1.ea7730ed0bbb9p-1, - 0x1.eb07e27a133aap-1, - 0x1.eb9558e6b42cep-1, - 0x1.ec1fa258c4beap-1, - 0x1.eca6ccd709544p-1, - 0x1.ed2ae6489ac1ep-1, - 0x1.edabfc7453e63p-1, - 0x1.ee2a1d004692cp-1, - 0x1.eea5557137ae0p-1, - 0x1.ef1db32a2277cp-1, - 0x1.ef93436bc2daap-1, - 0x1.f006135426b26p-1, - 0x1.f0762fde45ee6p-1, - 0x1.f0e3a5e1a1788p-1, - 0x1.f14e8211e8c55p-1, - 0x1.f1b6d0fea5f4dp-1, - 0x1.f21c9f12f0677p-1, - 0x1.f27ff89525acfp-1, - 0x1.f2e0e9a6a8b09p-1, - 0x1.f33f7e43a706bp-1, - 0x1.f39bc242e43e6p-1, - 0x1.f3f5c1558b19ep-1, - 0x1.f44d870704911p-1, - 0x1.f4a31ebcd47dfp-1, - 0x1.f4f693b67bd77p-1, - 0x1.f547f10d60597p-1, - 0x1.f59741b4b97cfp-1, - 0x1.f5e4907982a07p-1, - 0x1.f62fe80272419p-1, - 0x1.f67952cff6282p-1, - 0x1.f6c0db3c34641p-1, - 0x1.f7068b7b10fd9p-1, - 0x1.f74a6d9a38383p-1, - 0x1.f78c8b812d498p-1, - 0x1.f7cceef15d631p-1, - 0x1.f80ba18636f07p-1, - 0x1.f848acb544e95p-1, - 0x1.f88419ce4e184p-1, - 0x1.f8bdf1fb78370p-1, - 0x1.f8f63e416ebffp-1, - 0x1.f92d077f8d56dp-1, - 0x1.f96256700da8ep-1, - 0x1.f99633a838a57p-1, - 0x1.f9c8a7989af0dp-1, - 0x1.f9f9ba8d3c733p-1, - 0x1.fa2974addae45p-1, - 0x1.fa57ddfe27376p-1, - 
0x1.fa84fe5e05c8dp-1, - 0x1.fab0dd89d1309p-1, - 0x1.fadb831a9f9c3p-1, - 0x1.fb04f6868a944p-1, - 0x1.fb2d3f20f9101p-1, - 0x1.fb54641aebbc9p-1, - 0x1.fb7a6c834b5a2p-1, - 0x1.fb9f5f4739170p-1, - 0x1.fbc3433260ca5p-1, - 0x1.fbe61eef4cf6ap-1, - 0x1.fc07f907bc794p-1, - 0x1.fc28d7e4f9cd0p-1, - 0x1.fc48c1d033c7ap-1, - 0x1.fc67bcf2d7b8fp-1, - 0x1.fc85cf56ecd38p-1, - 0x1.fca2fee770c79p-1, - 0x1.fcbf5170b578bp-1, - 0x1.fcdacca0bfb73p-1, - 0x1.fcf57607a6e7cp-1, - 0x1.fd0f5317f582fp-1, - 0x1.fd2869270a56fp-1, - 0x1.fd40bd6d7a785p-1, - 0x1.fd58550773cb5p-1, - 0x1.fd6f34f52013ap-1, - 0x1.fd85621b0876dp-1, - 0x1.fd9ae142795e3p-1, - 0x1.fdafb719e6a69p-1, - 0x1.fdc3e835500b3p-1, - 0x1.fdd7790ea5bc0p-1, - 0x1.fdea6e062d0c9p-1, - 0x1.fdfccb62e52d3p-1, - 0x1.fe0e9552ebdd6p-1, - 0x1.fe1fcfebe2083p-1, - 0x1.fe307f2b503d0p-1, - 0x1.fe40a6f70af4bp-1, - 0x1.fe504b1d9696cp-1, - 0x1.fe5f6f568b301p-1, - 0x1.fe6e1742f7cf6p-1, - 0x1.fe7c466dc57a1p-1, - 0x1.fe8a004c19ae6p-1, - 0x1.fe97483db8670p-1, - 0x1.fea4218d6594ap-1, - 0x1.feb08f7146046p-1, - 0x1.febc950b3fa75p-1, - 0x1.fec835695932ep-1, - 0x1.fed37386190fbp-1, - 0x1.fede5248e38f4p-1, - 0x1.fee8d486585eep-1, - 0x1.fef2fd00af31ap-1, - 0x1.fefcce6813974p-1, - 0x1.ff064b5afffbep-1, - 0x1.ff0f766697c76p-1, - 0x1.ff18520700971p-1, - 0x1.ff20e0a7ba8c2p-1, - 0x1.ff2924a3f7a83p-1, - 0x1.ff312046f2339p-1, - 0x1.ff38d5cc4227fp-1, - 0x1.ff404760319b4p-1, - 0x1.ff47772010262p-1, - 0x1.ff4e671a85425p-1, - 0x1.ff55194fe19dfp-1, - 0x1.ff5b8fb26f5f6p-1, - 0x1.ff61cc26c1578p-1, - 0x1.ff67d08401202p-1, - 0x1.ff6d9e943c231p-1, - 0x1.ff733814af88cp-1, - 0x1.ff789eb6130c9p-1, - 0x1.ff7dd41ce2b4dp-1, - 0x1.ff82d9e1a76d8p-1, - 0x1.ff87b1913e853p-1, - 0x1.ff8c5cad200a5p-1, - 0x1.ff90dcaba4096p-1, - 0x1.ff9532f846ab0p-1, - 0x1.ff9960f3eb327p-1, - 0x1.ff9d67f51ddbap-1, - 0x1.ffa14948549a7p-1, - 0x1.ffa506302ebaep-1, - 0x1.ffa89fe5b3625p-1, - 0x1.ffac17988ef4bp-1, - 0x1.ffaf6e6f4f5c0p-1, - 0x1.ffb2a5879f35ep-1, - 0x1.ffb5bdf67fe6fp-1, - 0x1.ffb8b8c88295fp-1, - 
0x1.ffbb970200110p-1, - 0x1.ffbe599f4f9d9p-1, - 0x1.ffc10194fcb64p-1, - 0x1.ffc38fcffbb7cp-1, - 0x1.ffc60535dd7f5p-1, - 0x1.ffc862a501fd7p-1, - 0x1.ffcaa8f4c9beap-1, - 0x1.ffccd8f5c66d1p-1, - 0x1.ffcef371ea4d7p-1, - 0x1.ffd0f92cb6ba7p-1, - 0x1.ffd2eae369a07p-1, - 0x1.ffd4c94d29fdbp-1, - 0x1.ffd6951b33686p-1, - 0x1.ffd84ef9009eep-1, - 0x1.ffd9f78c7524ap-1, - 0x1.ffdb8f7605ee7p-1, - 0x1.ffdd1750e1220p-1, - 0x1.ffde8fb314ebfp-1, - 0x1.ffdff92db56e5p-1, - 0x1.ffe1544d01ccbp-1, - 0x1.ffe2a1988857cp-1, - 0x1.ffe3e19349dc7p-1, - 0x1.ffe514bbdc197p-1, - 0x1.ffe63b8c8b5f7p-1, - 0x1.ffe7567b7b5e1p-1, - 0x1.ffe865fac722bp-1, - 0x1.ffe96a78a04a9p-1, - 0x1.ffea645f6d6dap-1, - 0x1.ffeb5415e7c44p-1, - 0x1.ffec39ff380b9p-1, - 0x1.ffed167b12ac2p-1, - 0x1.ffede9e5d3262p-1, - 0x1.ffeeb49896c6dp-1, - 0x1.ffef76e956a9fp-1, - 0x1.fff0312b010b5p-1, - 0x1.fff0e3ad91ec2p-1, - 0x1.fff18ebe2b0e1p-1, - 0x1.fff232a72b48ep-1, - 0x1.fff2cfb0453d9p-1, - 0x1.fff3661e9569dp-1, - 0x1.fff3f634b79f9p-1, - 0x1.fff48032dbe40p-1, - 0x1.fff50456dab8cp-1, - 0x1.fff582dc48d30p-1, - 0x1.fff5fbfc8a439p-1, - 0x1.fff66feee5129p-1, - 0x1.fff6dee89352ep-1, - 0x1.fff7491cd4af6p-1, - 0x1.fff7aebcff755p-1, - 0x1.fff80ff8911fdp-1, - 0x1.fff86cfd3e657p-1, - 0x1.fff8c5f702ccfp-1, - 0x1.fff91b102fca8p-1, - 0x1.fff96c717b695p-1, - 0x1.fff9ba420e834p-1, - 0x1.fffa04a7928b1p-1, - 0x1.fffa4bc63ee9ap-1, - 0x1.fffa8fc0e5f33p-1, - 0x1.fffad0b901755p-1, - 0x1.fffb0ecebee1bp-1, - 0x1.fffb4a210b172p-1, - 0x1.fffb82cd9dcbfp-1, - 0x1.fffbb8f1049c6p-1, - 0x1.fffbeca6adbe9p-1, - 0x1.fffc1e08f25f5p-1, - 0x1.fffc4d3120aa1p-1, - 0x1.fffc7a37857d2p-1, - 0x1.fffca53375ce3p-1, - 0x1.fffcce3b57bffp-1, - 0x1.fffcf564ab6b7p-1, - 0x1.fffd1ac4135f9p-1, - 0x1.fffd3e6d5cd87p-1, - 0x1.fffd607387b07p-1, - 0x1.fffd80e8ce0dap-1, - 0x1.fffd9fdeabccep-1, - 0x1.fffdbd65e5ad0p-1, - 0x1.fffdd98e903b2p-1, - 0x1.fffdf46816833p-1, - 0x1.fffe0e0140857p-1, - 0x1.fffe26683972ap-1, - 0x1.fffe3daa95b18p-1, - 0x1.fffe53d558ae9p-1, - 0x1.fffe68f4fa777p-1, - 
0x1.fffe7d156d244p-1, - 0x1.fffe904222101p-1, - 0x1.fffea2860ee1ep-1, - 0x1.fffeb3ebb267bp-1, - 0x1.fffec47d19457p-1, - 0x1.fffed443e2787p-1, - 0x1.fffee34943b15p-1, - 0x1.fffef1960d85dp-1, - 0x1.fffeff32af7afp-1, - 0x1.ffff0c273bea2p-1, - 0x1.ffff187b6bc0ep-1, - 0x1.ffff2436a21dcp-1, - 0x1.ffff2f5fefcaap-1, - 0x1.ffff39fe16963p-1, - 0x1.ffff44178c8d2p-1, - 0x1.ffff4db27f146p-1, - 0x1.ffff56d4d5e5ep-1, - 0x1.ffff5f8435efcp-1, - 0x1.ffff67c604180p-1, - 0x1.ffff6f9f67e55p-1, - 0x1.ffff77154e0d6p-1, - 0x1.ffff7e2c6aea2p-1, - 0x1.ffff84e93cd75p-1, - 0x1.ffff8b500e77cp-1, - 0x1.ffff9164f8e46p-1, - 0x1.ffff972be5c59p-1, - 0x1.ffff9ca891572p-1, - 0x1.ffffa1de8c582p-1, - 0x1.ffffa6d13de73p-1, - 0x1.ffffab83e54b8p-1, - 0x1.ffffaff99bac4p-1, - 0x1.ffffb43555b5fp-1, - 0x1.ffffb839e52f3p-1, - 0x1.ffffbc09fa7cdp-1, - 0x1.ffffbfa82616bp-1, - 0x1.ffffc316d9ed0p-1, - 0x1.ffffc6586abf6p-1, - 0x1.ffffc96f1165ep-1, - 0x1.ffffcc5cec0c1p-1, - 0x1.ffffcf23ff5fcp-1, - 0x1.ffffd1c637b2bp-1, - 0x1.ffffd4456a10dp-1, - 0x1.ffffd6a3554a1p-1, - 0x1.ffffd8e1a2f22p-1, - 0x1.ffffdb01e8546p-1, - 0x1.ffffdd05a75eap-1, - 0x1.ffffdeee4f810p-1, - 0x1.ffffe0bd3e852p-1, - 0x1.ffffe273c15b7p-1, - 0x1.ffffe41314e06p-1, - 0x1.ffffe59c6698bp-1, - 0x1.ffffe710d565ep-1, - 0x1.ffffe8717232dp-1, - 0x1.ffffe9bf4098cp-1, - 0x1.ffffeafb377d5p-1, - 0x1.ffffec2641a9ep-1, - 0x1.ffffed413e5b7p-1, - 0x1.ffffee4d01cd6p-1, - 0x1.ffffef4a55bd4p-1, - 0x1.fffff039f9e8fp-1, - 0x1.fffff11ca4876p-1, - 0x1.fffff1f302bc1p-1, - 0x1.fffff2bdb904dp-1, - 0x1.fffff37d63a36p-1, - 0x1.fffff43297019p-1, - 0x1.fffff4dde0118p-1, - 0x1.fffff57fc4a95p-1, - 0x1.fffff618c3da6p-1, - 0x1.fffff6a956450p-1, - 0x1.fffff731ee681p-1, - 0x1.fffff7b2f8ed6p-1, - 0x1.fffff82cdcf1bp-1, - 0x1.fffff89ffc4aap-1, - 0x1.fffff90cb3c81p-1, - 0x1.fffff9735b73bp-1, - 0x1.fffff9d446cccp-1, - 0x1.fffffa2fc5015p-1, - 0x1.fffffa8621251p-1, - 0x1.fffffad7a2652p-1, - 0x1.fffffb248c39dp-1, - 0x1.fffffb6d1e95dp-1, - 0x1.fffffbb196132p-1, - 0x1.fffffbf22c1e2p-1, - 
0x1.fffffc2f171e3p-1, - 0x1.fffffc688a9cfp-1, - 0x1.fffffc9eb76acp-1, - 0x1.fffffcd1cbc28p-1, - 0x1.fffffd01f36afp-1, - 0x1.fffffd2f57d68p-1, - 0x1.fffffd5a2041fp-1, - 0x1.fffffd8271d12p-1, - 0x1.fffffda86faa9p-1, - 0x1.fffffdcc3b117p-1, - 0x1.fffffdedf37edp-1, - 0x1.fffffe0db6b91p-1, - 0x1.fffffe2ba0ea5p-1, - 0x1.fffffe47ccb60p-1, - 0x1.fffffe62534d4p-1, - 0x1.fffffe7b4c81ep-1, - 0x1.fffffe92ced93p-1, - 0x1.fffffea8ef9cfp-1, - 0x1.fffffebdc2ec6p-1, - 0x1.fffffed15bcbap-1, - 0x1.fffffee3cc32cp-1, - 0x1.fffffef5251c2p-1, - 0x1.ffffff0576917p-1, - 0x1.ffffff14cfb92p-1, - 0x1.ffffff233ee1dp-1, - 0x1.ffffff30d18e8p-1, - 0x1.ffffff3d9480fp-1, - 0x1.ffffff4993c46p-1, - 0x1.ffffff54dab72p-1, - 0x1.ffffff5f74141p-1, - 0x1.ffffff6969fb8p-1, - 0x1.ffffff72c5fb6p-1, - 0x1.ffffff7b91176p-1, - 0x1.ffffff83d3d07p-1, - 0x1.ffffff8b962bep-1, - 0x1.ffffff92dfba2p-1, - 0x1.ffffff99b79d2p-1, - 0x1.ffffffa0248e8p-1, - 0x1.ffffffa62ce54p-1, - 0x1.ffffffabd69b4p-1, - 0x1.ffffffb127525p-1, - 0x1.ffffffb624592p-1, - 0x1.ffffffbad2affp-1, - 0x1.ffffffbf370cdp-1, - 0x1.ffffffc355dfdp-1, - 0x1.ffffffc733572p-1, - 0x1.ffffffcad3626p-1, - 0x1.ffffffce39b67p-1, - 0x1.ffffffd169d0cp-1, - 0x1.ffffffd466fa5p-1, - 0x1.ffffffd7344aap-1, - 0x1.ffffffd9d4aabp-1, - 0x1.ffffffdc4ad7ap-1, - 0x1.ffffffde9964ep-1, - 0x1.ffffffe0c2bf0p-1, - 0x1.ffffffe2c92dbp-1, - 0x1.ffffffe4aed5ep-1, - 0x1.ffffffe675bbdp-1, - 0x1.ffffffe81fc4ep-1, - 0x1.ffffffe9aeb97p-1, - 0x1.ffffffeb24467p-1, - 0x1.ffffffec81ff2p-1, - 0x1.ffffffedc95e7p-1, - 0x1.ffffffeefbc85p-1, - 0x1.fffffff01a8b6p-1, - 0x1.fffffff126e1ep-1, - 0x1.fffffff221f30p-1, - 0x1.fffffff30cd3fp-1, - 0x1.fffffff3e8892p-1, - 0x1.fffffff4b606fp-1, - 0x1.fffffff57632dp-1, - 0x1.fffffff629e44p-1, - 0x1.fffffff6d1e56p-1, - 0x1.fffffff76ef3fp-1, - 0x1.fffffff801c1fp-1, - 0x1.fffffff88af67p-1, - 0x1.fffffff90b2e3p-1, - 0x1.fffffff982fc1p-1, - 0x1.fffffff9f2e9fp-1, - 0x1.fffffffa5b790p-1, - 0x1.fffffffabd229p-1, - 0x1.fffffffb18582p-1, - 0x1.fffffffb6d844p-1, - 
0x1.fffffffbbd0aap-1, - 0x1.fffffffc0748fp-1, - 0x1.fffffffc4c96cp-1, - 0x1.fffffffc8d462p-1, - 0x1.fffffffcc9a41p-1, - 0x1.fffffffd01f89p-1, - 0x1.fffffffd36871p-1, - 0x1.fffffffd678edp-1, - 0x1.fffffffd954aep-1, - 0x1.fffffffdbff2ap-1, - 0x1.fffffffde7ba0p-1, - 0x1.fffffffe0cd16p-1, - 0x1.fffffffe2f664p-1, - 0x1.fffffffe4fa30p-1, - 0x1.fffffffe6daf7p-1, - 0x1.fffffffe89b0cp-1, - 0x1.fffffffea3c9ap-1, - 0x1.fffffffebc1a9p-1, - 0x1.fffffffed2c21p-1, - 0x1.fffffffee7dc8p-1, - 0x1.fffffffefb847p-1, - 0x1.ffffffff0dd2bp-1, - 0x1.ffffffff1ede9p-1, - 0x1.ffffffff2ebdap-1, - 0x1.ffffffff3d843p-1, - 0x1.ffffffff4b453p-1, - 0x1.ffffffff58126p-1, - 0x1.ffffffff63fc3p-1, - 0x1.ffffffff6f121p-1, - 0x1.ffffffff79626p-1, - 0x1.ffffffff82fabp-1, - 0x1.ffffffff8be77p-1, - 0x1.ffffffff94346p-1, - 0x1.ffffffff9bec8p-1, - 0x1.ffffffffa319fp-1, - 0x1.ffffffffa9c63p-1, - 0x1.ffffffffaffa4p-1, - 0x1.ffffffffb5be5p-1, - 0x1.ffffffffbb1a2p-1, - 0x1.ffffffffc014ep-1, - 0x1.ffffffffc4b56p-1, - 0x1.ffffffffc901cp-1, - 0x1.ffffffffccfffp-1, - 0x1.ffffffffd0b56p-1, - 0x1.ffffffffd4271p-1, - 0x1.ffffffffd759dp-1, - 0x1.ffffffffda520p-1, - 0x1.ffffffffdd13cp-1, - 0x1.ffffffffdfa2dp-1, - 0x1.ffffffffe202dp-1, - 0x1.ffffffffe4371p-1, - 0x1.ffffffffe642ap-1, - 0x1.ffffffffe8286p-1, - 0x1.ffffffffe9eb0p-1, - 0x1.ffffffffeb8d0p-1, - 0x1.ffffffffed10ap-1, - 0x1.ffffffffee782p-1, - 0x1.ffffffffefc57p-1, - 0x1.fffffffff0fa7p-1, - 0x1.fffffffff218fp-1, - 0x1.fffffffff3227p-1, - 0x1.fffffffff4188p-1, - 0x1.fffffffff4fc9p-1, - 0x1.fffffffff5cfdp-1, - 0x1.fffffffff6939p-1, - 0x1.fffffffff748ep-1, - 0x1.fffffffff7f0dp-1, - 0x1.fffffffff88c5p-1, - 0x1.fffffffff91c6p-1, - 0x1.fffffffff9a1bp-1, - 0x1.fffffffffa1d2p-1, - 0x1.fffffffffa8f6p-1, - 0x1.fffffffffaf92p-1, - 0x1.fffffffffb5b0p-1, - 0x1.fffffffffbb58p-1, - 0x1.fffffffffc095p-1, - 0x1.fffffffffc56dp-1, - 0x1.fffffffffc9e8p-1, - 0x1.fffffffffce0dp-1, - 0x1.fffffffffd1e1p-1, - 0x1.fffffffffd56cp-1, - 0x1.fffffffffd8b3p-1, - 0x1.fffffffffdbbap-1, - 
0x1.fffffffffde86p-1, - 0x1.fffffffffe11dp-1, - 0x1.fffffffffe380p-1, - 0x1.fffffffffe5b6p-1, - 0x1.fffffffffe7c0p-1, - 0x1.fffffffffe9a2p-1, - 0x1.fffffffffeb60p-1, - 0x1.fffffffffecfbp-1, - 0x1.fffffffffee77p-1, - 0x1.fffffffffefd6p-1, - 0x1.ffffffffff11ap-1, - 0x1.ffffffffff245p-1, - 0x1.ffffffffff359p-1, - 0x1.ffffffffff457p-1, - 0x1.ffffffffff542p-1, - 0x1.ffffffffff61bp-1, - 0x1.ffffffffff6e3p-1, - 0x1.ffffffffff79bp-1, - 0x1.ffffffffff845p-1, - 0x1.ffffffffff8e2p-1, - 0x1.ffffffffff973p-1, - 0x1.ffffffffff9f8p-1, - 0x1.ffffffffffa73p-1, - 0x1.ffffffffffae4p-1, - 0x1.ffffffffffb4cp-1, - 0x1.ffffffffffbadp-1, - 0x1.ffffffffffc05p-1, - 0x1.ffffffffffc57p-1, - 0x1.ffffffffffca2p-1, - 0x1.ffffffffffce7p-1, - 0x1.ffffffffffd27p-1, - 0x1.ffffffffffd62p-1, - 0x1.ffffffffffd98p-1, - 0x1.ffffffffffdcap-1, - 0x1.ffffffffffdf8p-1, - 0x1.ffffffffffe22p-1, - 0x1.ffffffffffe49p-1, - 0x1.ffffffffffe6cp-1, - 0x1.ffffffffffe8dp-1, - 0x1.ffffffffffeabp-1, - 0x1.ffffffffffec7p-1, - 0x1.ffffffffffee1p-1, - 0x1.ffffffffffef8p-1, - 0x1.fffffffffff0ep-1, - 0x1.fffffffffff22p-1, - 0x1.fffffffffff34p-1, - 0x1.fffffffffff45p-1, - 0x1.fffffffffff54p-1, - 0x1.fffffffffff62p-1, - 0x1.fffffffffff6fp-1, - 0x1.fffffffffff7bp-1, - 0x1.fffffffffff86p-1, - 0x1.fffffffffff90p-1, - 0x1.fffffffffff9ap-1, - 0x1.fffffffffffa2p-1, - 0x1.fffffffffffaap-1, - 0x1.fffffffffffb1p-1, - 0x1.fffffffffffb8p-1, - 0x1.fffffffffffbep-1, - 0x1.fffffffffffc3p-1, - 0x1.fffffffffffc8p-1, - 0x1.fffffffffffcdp-1, - 0x1.fffffffffffd1p-1, - 0x1.fffffffffffd5p-1, - 0x1.fffffffffffd9p-1, - 0x1.fffffffffffdcp-1, - 0x1.fffffffffffdfp-1, - 0x1.fffffffffffe2p-1, - 0x1.fffffffffffe4p-1, - 0x1.fffffffffffe7p-1, - 0x1.fffffffffffe9p-1, - 0x1.fffffffffffebp-1, - 0x1.fffffffffffedp-1, - 0x1.fffffffffffeep-1, - 0x1.ffffffffffff0p-1, - 0x1.ffffffffffff1p-1, - 0x1.ffffffffffff3p-1, - 0x1.ffffffffffff4p-1, - 0x1.ffffffffffff5p-1, - 0x1.ffffffffffff6p-1, - 0x1.ffffffffffff7p-1, - 0x1.ffffffffffff7p-1, - 0x1.ffffffffffff8p-1, - 
0x1.ffffffffffff9p-1, - 0x1.ffffffffffff9p-1, - 0x1.ffffffffffffap-1, - 0x1.ffffffffffffbp-1, - 0x1.ffffffffffffbp-1, - 0x1.ffffffffffffbp-1, - 0x1.ffffffffffffcp-1, - 0x1.ffffffffffffcp-1, - 0x1.ffffffffffffdp-1, - 0x1.ffffffffffffdp-1, - 0x1.ffffffffffffdp-1, - 0x1.ffffffffffffdp-1, - 0x1.ffffffffffffep-1, - 0x1.ffffffffffffep-1, - 0x1.ffffffffffffep-1, - 0x1.ffffffffffffep-1, - 0x1.ffffffffffffep-1, - 0x1.ffffffffffffep-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.fffffffffffffp-1, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - 0x1.0000000000000p+0, - }, - .scale = { 0x1.20dd750429b6dp+0, - 0x1.20d8f1975c85dp+0, - 0x1.20cb67bd452c7p+0, - 0x1.20b4d8bac36c1p+0, - 0x1.209546ad13ccfp+0, - 0x1.206cb4897b148p+0, - 0x1.203b261cd0052p+0, - 0x1.2000a00ae3804p+0, - 0x1.1fbd27cdc72d3p+0, - 0x1.1f70c3b4f2cc7p+0, - 0x1.1f1b7ae44867fp+0, - 0x1.1ebd5552f795bp+0, - 0x1.1e565bca400d4p+0, - 0x1.1de697e413d28p+0, - 0x1.1d6e14099944ap+0, - 0x1.1cecdb718d61cp+0, - 0x1.1c62fa1e869b6p+0, - 0x1.1bd07cdd189acp+0, - 0x1.1b357141d95d5p+0, - 0x1.1a91e5a748165p+0, - 0x1.19e5e92b964abp+0, - 0x1.19318bae53a04p+0, - 0x1.1874ddcdfce24p+0, - 0x1.17aff0e56ec10p+0, - 0x1.16e2d7093cd8cp+0, - 0x1.160da304ed92fp+0, - 0x1.153068581b781p+0, - 0x1.144b3b337c90cp+0, - 0x1.135e3075d076bp+0, - 0x1.12695da8b5bdep+0, - 0x1.116cd8fd67618p+0, - 0x1.1068b94962e5ep+0, - 0x1.0f5d1602f7e41p+0, - 0x1.0e4a073dc1b91p+0, - 0x1.0d2fa5a70c168p+0, - 0x1.0c0e0a8223359p+0, - 0x1.0ae54fa490722p+0, - 0x1.09b58f724416bp+0, - 0x1.087ee4d9ad247p+0, - 0x1.07416b4fbfe7cp+0, - 0x1.05fd3ecbec297p+0, - 0x1.04b27bc403d30p+0, - 
0x1.03613f2812dafp+0, - 0x1.0209a65e29545p+0, - 0x1.00abcf3e187a9p+0, - 0x1.fe8fb01a47307p-1, - 0x1.fbbbbef34b4b2p-1, - 0x1.f8dc092d58ff8p-1, - 0x1.f5f0cdaf15313p-1, - 0x1.f2fa4c16c0019p-1, - 0x1.eff8c4b1375dbp-1, - 0x1.ecec7870ebca7p-1, - 0x1.e9d5a8e4c934ep-1, - 0x1.e6b4982f158b9p-1, - 0x1.e38988fc46e72p-1, - 0x1.e054be79d3042p-1, - 0x1.dd167c4cf9d2ap-1, - 0x1.d9cf06898cdafp-1, - 0x1.d67ea1a8b5368p-1, - 0x1.d325927fb9d89p-1, - 0x1.cfc41e36c7df9p-1, - 0x1.cc5a8a3fbea40p-1, - 0x1.c8e91c4d01368p-1, - 0x1.c5701a484ef9dp-1, - 0x1.c1efca49a5011p-1, - 0x1.be68728e29d5dp-1, - 0x1.bada596f25436p-1, - 0x1.b745c55905bf8p-1, - 0x1.b3aafcc27502ep-1, - 0x1.b00a46237d5bep-1, - 0x1.ac63e7ecc1411p-1, - 0x1.a8b8287ec6a09p-1, - 0x1.a5074e2157620p-1, - 0x1.a1519efaf889ep-1, - 0x1.9d97610879642p-1, - 0x1.99d8da149c13fp-1, - 0x1.96164fafd8de3p-1, - 0x1.925007283d7aap-1, - 0x1.8e86458169af8p-1, - 0x1.8ab94f6caa71dp-1, - 0x1.86e9694134b9ep-1, - 0x1.8316d6f48133dp-1, - 0x1.7f41dc12c9e89p-1, - 0x1.7b6abbb7aaf19p-1, - 0x1.7791b886e7403p-1, - 0x1.73b714a552763p-1, - 0x1.6fdb11b1e0c34p-1, - 0x1.6bfdf0beddaf5p-1, - 0x1.681ff24b4ab04p-1, - 0x1.6441563c665d4p-1, - 0x1.60625bd75d07bp-1, - 0x1.5c8341bb23767p-1, - 0x1.58a445da7c74cp-1, - 0x1.54c5a57629db0p-1, - 0x1.50e79d1749ac9p-1, - 0x1.4d0a6889dfd9fp-1, - 0x1.492e42d78d2c5p-1, - 0x1.4553664273d24p-1, - 0x1.417a0c4049fd0p-1, - 0x1.3da26d759aef5p-1, - 0x1.39ccc1b136d5ap-1, - 0x1.35f93fe7d1b3dp-1, - 0x1.32281e2fd1a92p-1, - 0x1.2e5991bd4cbfcp-1, - 0x1.2a8dcede3673bp-1, - 0x1.26c508f6bd0ffp-1, - 0x1.22ff727dd6f7bp-1, - 0x1.1f3d3cf9ffe5ap-1, - 0x1.1b7e98fe26217p-1, - 0x1.17c3b626c7a11p-1, - 0x1.140cc3173f007p-1, - 0x1.1059ed7740313p-1, - 0x1.0cab61f084b93p-1, - 0x1.09014c2ca74dap-1, - 0x1.055bd6d32e8d7p-1, - 0x1.01bb2b87c6968p-1, - 0x1.fc3ee5d1524b0p-2, - 0x1.f511a91a67d2ap-2, - 0x1.edeeee0959518p-2, - 0x1.e6d6ffaa65a25p-2, - 0x1.dfca26f5bbf88p-2, - 0x1.d8c8aace11e63p-2, - 0x1.d1d2cfff91594p-2, - 0x1.cae8d93f1d7b6p-2, - 0x1.c40b0729ed547p-2, - 
0x1.bd3998457afdap-2, - 0x1.b674c8ffc6283p-2, - 0x1.afbcd3afe8ab6p-2, - 0x1.a911f096fbc26p-2, - 0x1.a27455e14c93cp-2, - 0x1.9be437a7de946p-2, - 0x1.9561c7f23a47bp-2, - 0x1.8eed36b886d93p-2, - 0x1.8886b1e5ecfd1p-2, - 0x1.822e655b417e6p-2, - 0x1.7be47af1f5d89p-2, - 0x1.75a91a7f4d2edp-2, - 0x1.6f7c69d7d3ef8p-2, - 0x1.695e8cd31867ep-2, - 0x1.634fa54fa285fp-2, - 0x1.5d4fd33729015p-2, - 0x1.575f3483021c3p-2, - 0x1.517de540ce2a3p-2, - 0x1.4babff975a04cp-2, - 0x1.45e99bcbb7915p-2, - 0x1.4036d0468a7a2p-2, - 0x1.3a93b1998736cp-2, - 0x1.35005285227f1p-2, - 0x1.2f7cc3fe6f423p-2, - 0x1.2a09153529381p-2, - 0x1.24a55399ea239p-2, - 0x1.1f518ae487dc8p-2, - 0x1.1a0dc51a9934dp-2, - 0x1.14da0a961fd14p-2, - 0x1.0fb6620c550afp-2, - 0x1.0aa2d09497f2bp-2, - 0x1.059f59af7a906p-2, - 0x1.00abff4dec7a3p-2, - 0x1.f79183b101c5bp-3, - 0x1.edeb406d9c824p-3, - 0x1.e4652fadcb6b2p-3, - 0x1.daff4969c0b04p-3, - 0x1.d1b982c501370p-3, - 0x1.c893ce1dcbef7p-3, - 0x1.bf8e1b1ca2279p-3, - 0x1.b6a856c3ed54fp-3, - 0x1.ade26b7fbed95p-3, - 0x1.a53c4135a6526p-3, - 0x1.9cb5bd549b111p-3, - 0x1.944ec2e4f5630p-3, - 0x1.8c07329874652p-3, - 0x1.83deeada4d25ap-3, - 0x1.7bd5c7df3fe9cp-3, - 0x1.73eba3b5b07b7p-3, - 0x1.6c205655be71fp-3, - 0x1.6473b5b15a7a1p-3, - 0x1.5ce595c455b0ap-3, - 0x1.5575c8a468361p-3, - 0x1.4e241e912c305p-3, - 0x1.46f066040a832p-3, - 0x1.3fda6bc016994p-3, - 0x1.38e1fae1d6a9dp-3, - 0x1.3206dceef5f87p-3, - 0x1.2b48d9e5dea1cp-3, - 0x1.24a7b84d38971p-3, - 0x1.1e233d434b813p-3, - 0x1.17bb2c8d41535p-3, - 0x1.116f48a6476ccp-3, - 0x1.0b3f52ce8c383p-3, - 0x1.052b0b1a174eap-3, - 0x1.fe6460fef4680p-4, - 0x1.f2a901ccafb37p-4, - 0x1.e723726b824a9p-4, - 0x1.dbd32ac4c99b0p-4, - 0x1.d0b7a0f921e7cp-4, - 0x1.c5d0497c09e74p-4, - 0x1.bb1c972f23e50p-4, - 0x1.b09bfb7d11a83p-4, - 0x1.a64de673e8837p-4, - 0x1.9c31c6df3b1b8p-4, - 0x1.92470a61b6965p-4, - 0x1.888d1d8e510a3p-4, - 0x1.7f036c0107294p-4, - 0x1.75a96077274bap-4, - 0x1.6c7e64e7281cbp-4, - 0x1.6381e2980956bp-4, - 0x1.5ab342383d177p-4, - 0x1.5211ebf41880bp-4, - 
0x1.499d478bca735p-4, - 0x1.4154bc68d75c3p-4, - 0x1.3937b1b319259p-4, - 0x1.31458e6542847p-4, - 0x1.297db960e4f63p-4, - 0x1.21df9981f8e53p-4, - 0x1.1a6a95b1e786fp-4, - 0x1.131e14fa1625dp-4, - 0x1.0bf97e95f2a64p-4, - 0x1.04fc3a0481321p-4, - 0x1.fc4b5e32d6259p-5, - 0x1.eeea8c1b1db93p-5, - 0x1.e1d4cf1e2450ap-5, - 0x1.d508f9a1ea64ep-5, - 0x1.c885df3451a07p-5, - 0x1.bc4a54a84e834p-5, - 0x1.b055303221015p-5, - 0x1.a4a549829587ep-5, - 0x1.993979e14fffdp-5, - 0x1.8e109c4622913p-5, - 0x1.83298d717210ep-5, - 0x1.78832c03aa2b1p-5, - 0x1.6e1c5893c380bp-5, - 0x1.63f3f5c4de13bp-5, - 0x1.5a08e85af27e0p-5, - 0x1.505a174e9c929p-5, - 0x1.46e66be002240p-5, - 0x1.3dacd1a8d8ccdp-5, - 0x1.34ac36ad8dafep-5, - 0x1.2be38b6d92415p-5, - 0x1.2351c2f2d1449p-5, - 0x1.1af5d2e04f3f6p-5, - 0x1.12ceb37ff9bc3p-5, - 0x1.0adb5fcfa8c75p-5, - 0x1.031ad58d56279p-5, - 0x1.f7182a851bca2p-6, - 0x1.e85c449e377f2p-6, - 0x1.da0005e5f28dfp-6, - 0x1.cc0180af00a8bp-6, - 0x1.be5ecd2fcb5f9p-6, - 0x1.b1160991ff737p-6, - 0x1.a4255a00b9f03p-6, - 0x1.978ae8b55ce1bp-6, - 0x1.8b44e6031383ep-6, - 0x1.7f5188610ddc8p-6, - 0x1.73af0c737bb45p-6, - 0x1.685bb5134ef13p-6, - 0x1.5d55cb54cd53ap-6, - 0x1.529b9e8cf9a1ep-6, - 0x1.482b8455dc491p-6, - 0x1.3e03d891b37dep-6, - 0x1.3422fd6d12e2bp-6, - 0x1.2a875b5ffab56p-6, - 0x1.212f612dee7fbp-6, - 0x1.181983e5133ddp-6, - 0x1.0f443edc5ce49p-6, - 0x1.06ae13b0d3255p-6, - 0x1.fcab1483ea7fcp-7, - 0x1.ec72615a894c4p-7, - 0x1.dcaf3691fc448p-7, - 0x1.cd5ec93c12431p-7, - 0x1.be7e5ac24963bp-7, - 0x1.b00b38d6b3575p-7, - 0x1.a202bd6372dcep-7, - 0x1.94624e78e0fafp-7, - 0x1.87275e3a6869dp-7, - 0x1.7a4f6aca256cbp-7, - 0x1.6dd7fe3358230p-7, - 0x1.61beae53b72b7p-7, - 0x1.56011cc3b036dp-7, - 0x1.4a9cf6bda3f4cp-7, - 0x1.3f8ff5042a88ep-7, - 0x1.34d7dbc76d7e5p-7, - 0x1.2a727a89a3f14p-7, - 0x1.205dac02bd6b9p-7, - 0x1.1697560347b25p-7, - 0x1.0d1d69569b82dp-7, - 0x1.03ede1a45bfeep-7, - 0x1.f60d8aa2a88f2p-8, - 0x1.e4cc4abf7d065p-8, - 0x1.d4143a9dfe965p-8, - 0x1.c3e1a5f5c077cp-8, - 0x1.b430ecf4a83a8p-8, - 
0x1.a4fe83fb9db25p-8, - 0x1.9646f35a76623p-8, - 0x1.8806d70b2fc36p-8, - 0x1.7a3ade6c8b3e4p-8, - 0x1.6cdfcbfc1e263p-8, - 0x1.5ff2750fe7820p-8, - 0x1.536fc18f7ce5cp-8, - 0x1.4754abacdf1dcp-8, - 0x1.3b9e3f9d06e3fp-8, - 0x1.30499b503957fp-8, - 0x1.2553ee2a336bfp-8, - 0x1.1aba78ba3af89p-8, - 0x1.107a8c7323a6ep-8, - 0x1.06918b6355624p-8, - 0x1.f9f9cfd9c3035p-9, - 0x1.e77448fb66bb9p-9, - 0x1.d58da68fd1170p-9, - 0x1.c4412bf4b8f0bp-9, - 0x1.b38a3af2e55b4p-9, - 0x1.a3645330550ffp-9, - 0x1.93cb11a30d765p-9, - 0x1.84ba3004a50d0p-9, - 0x1.762d84469c18fp-9, - 0x1.6821000795a03p-9, - 0x1.5a90b00981d93p-9, - 0x1.4d78bba8ca5fdp-9, - 0x1.40d564548fad7p-9, - 0x1.34a305080681fp-9, - 0x1.28de11c5031ebp-9, - 0x1.1d83170fbf6fbp-9, - 0x1.128eb96be8798p-9, - 0x1.07fdb4dafea5fp-9, - 0x1.fb99b8b8279e1p-10, - 0x1.e7f232d9e2630p-10, - 0x1.d4fed7195d7e8p-10, - 0x1.c2b9cf7f893bfp-10, - 0x1.b11d702b3deb1p-10, - 0x1.a024365f771bdp-10, - 0x1.8fc8c794b03b5p-10, - 0x1.8005f08d6f1efp-10, - 0x1.70d6a46e07ddap-10, - 0x1.6235fbd7a4345p-10, - 0x1.541f340697987p-10, - 0x1.468dadf4080abp-10, - 0x1.397ced7af2b15p-10, - 0x1.2ce898809244ep-10, - 0x1.20cc76202c5fap-10, - 0x1.15246dda49d47p-10, - 0x1.09ec86c75d497p-10, - 0x1.fe41cd9bb4eeep-11, - 0x1.e97ba3b77f306p-11, - 0x1.d57f524723822p-11, - 0x1.c245d4b998479p-11, - 0x1.afc85e0f82e12p-11, - 0x1.9e005769dbc1dp-11, - 0x1.8ce75e9f6f8a0p-11, - 0x1.7c7744d9378f7p-11, - 0x1.6caa0d3582fe9p-11, - 0x1.5d79eb71e893bp-11, - 0x1.4ee1429bf7cc0p-11, - 0x1.40daa3c89f5b6p-11, - 0x1.3360ccd23db3ap-11, - 0x1.266ea71d4f71ap-11, - 0x1.19ff4663ae9dfp-11, - 0x1.0e0de78654d1ep-11, - 0x1.0295ef6591848p-11, - 0x1.ef25d37f49fe1p-12, - 0x1.da01102b5f851p-12, - 0x1.c5b5412dcafadp-12, - 0x1.b23a5a23e4210p-12, - 0x1.9f8893d8fd1c1p-12, - 0x1.8d986a4187285p-12, - 0x1.7c629a822bc9ep-12, - 0x1.6be02102b3520p-12, - 0x1.5c0a378c90bcap-12, - 0x1.4cda5374ea275p-12, - 0x1.3e4a23d1f4702p-12, - 0x1.30538fbb77ecdp-12, - 0x1.22f0b496539bdp-12, - 0x1.161be46ad3b50p-12, - 0x1.09cfa445b00ffp-12, - 
0x1.fc0d55470cf51p-13, - 0x1.e577bbcd49935p-13, - 0x1.cfd4a5adec5bfp-13, - 0x1.bb1a9657ce465p-13, - 0x1.a740684026555p-13, - 0x1.943d4a1d1ed39p-13, - 0x1.8208bc334a6a5p-13, - 0x1.709a8db59f25cp-13, - 0x1.5feada379d8b7p-13, - 0x1.4ff207314a102p-13, - 0x1.40a8c1949f75ep-13, - 0x1.3207fb7420eb9p-13, - 0x1.2408e9ba3327fp-13, - 0x1.16a501f0e42cap-13, - 0x1.09d5f819c9e29p-13, - 0x1.fb2b792b40a22p-14, - 0x1.e3bcf436a1a95p-14, - 0x1.cd55277c18d05p-14, - 0x1.b7e94604479dcp-14, - 0x1.a36eec00926ddp-14, - 0x1.8fdc1b2dcf7b9p-14, - 0x1.7d2737527c3f9p-14, - 0x1.6b4702d7d5849p-14, - 0x1.5a329b7d30748p-14, - 0x1.49e17724f4d41p-14, - 0x1.3a4b60ba9aa4dp-14, - 0x1.2b6875310f785p-14, - 0x1.1d312098e9dbap-14, - 0x1.0f9e1b4dd36dfp-14, - 0x1.02a8673a94691p-14, - 0x1.ec929a665b449p-15, - 0x1.d4f4b4c8e09edp-15, - 0x1.be6abbb10a5aap-15, - 0x1.a8e8cc1fadef6p-15, - 0x1.94637d5bacfdbp-15, - 0x1.80cfdc72220cfp-15, - 0x1.6e2367dc27f95p-15, - 0x1.5c540b4936fd2p-15, - 0x1.4b581b8d170fcp-15, - 0x1.3b2652b06c2b2p-15, - 0x1.2bb5cc22e5db6p-15, - 0x1.1cfe010e2052dp-15, - 0x1.0ef6c4c84a0fep-15, - 0x1.01984165a5f36p-15, - 0x1.e9b5e8d00ce76p-16, - 0x1.d16f5716c6c1ap-16, - 0x1.ba4f035d60e02p-16, - 0x1.a447b7b03f045p-16, - 0x1.8f4ccca7fc90dp-16, - 0x1.7b5223dac7336p-16, - 0x1.684c227fcacefp-16, - 0x1.562fac4329b48p-16, - 0x1.44f21e49054f2p-16, - 0x1.34894a5e24657p-16, - 0x1.24eb7254ccf83p-16, - 0x1.160f438c70913p-16, - 0x1.07ebd2a2d2844p-16, - 0x1.f4f12e9ab070ap-17, - 0x1.db5ad0b27805cp-17, - 0x1.c304efa2c6f4ep-17, - 0x1.abe09e9144b5ep-17, - 0x1.95df988e76644p-17, - 0x1.80f439b4ee04bp-17, - 0x1.6d11788a69c64p-17, - 0x1.5a2adfa0b4bc4p-17, - 0x1.4834877429b8fp-17, - 0x1.37231085c7d9ap-17, - 0x1.26eb9daed6f7ep-17, - 0x1.1783ceac28910p-17, - 0x1.08e1badf0fcedp-17, - 0x1.f5f7d88472604p-18, - 0x1.db92b5212fb8dp-18, - 0x1.c282cd3957edap-18, - 0x1.aab7abace48dcp-18, - 0x1.94219bfcb4928p-18, - 0x1.7eb1a2075864dp-18, - 0x1.6a597219a93d9p-18, - 0x1.570b69502f313p-18, - 0x1.44ba864670882p-18, - 0x1.335a62115bce2p-18, - 
0x1.22df298214423p-18, - 0x1.133d96ae7e0ddp-18, - 0x1.046aeabcfcdecp-18, - 0x1.ecb9cfe1d8642p-19, - 0x1.d21397ead99cbp-19, - 0x1.b8d094c86d374p-19, - 0x1.a0df0f0c626dcp-19, - 0x1.8a2e269750a39p-19, - 0x1.74adc8f4064d3p-19, - 0x1.604ea819f007cp-19, - 0x1.4d0231928c6f9p-19, - 0x1.3aba85fe22e1fp-19, - 0x1.296a70f414053p-19, - 0x1.1905613b3abf2p-19, - 0x1.097f6156f32c5p-19, - 0x1.f59a20caf6695p-20, - 0x1.d9c73698fb1dcp-20, - 0x1.bf716c6168baep-20, - 0x1.a6852c6b58392p-20, - 0x1.8eefd70594a88p-20, - 0x1.789fb715aae95p-20, - 0x1.6383f726a8e04p-20, - 0x1.4f8c96f26a26ap-20, - 0x1.3caa61607f920p-20, - 0x1.2acee2f5ecdb8p-20, - 0x1.19ec60b1242edp-20, - 0x1.09f5cf4dd2877p-20, - 0x1.f5bd95d8730d8p-21, - 0x1.d9371e2ff7c35p-21, - 0x1.be41de54d155ap-21, - 0x1.a4c89e08ef4f3p-21, - 0x1.8cb738399b12cp-21, - 0x1.75fa8dbc84becp-21, - 0x1.608078a70dcbcp-21, - 0x1.4c37c0394d094p-21, - 0x1.39100d5687bfep-21, - 0x1.26f9df8519bd6p-21, - 0x1.15e6827001f18p-21, - 0x1.05c803e4831c1p-21, - 0x1.ed22548cffd35p-22, - 0x1.d06ad6ecdf971p-22, - 0x1.b551c847fbc96p-22, - 0x1.9bc09f112b494p-22, - 0x1.83a1ff0aa239dp-22, - 0x1.6ce1aa3fd7bddp-22, - 0x1.576c72b514859p-22, - 0x1.43302cc4a0da8p-22, - 0x1.301ba221dc9bbp-22, - 0x1.1e1e857adc568p-22, - 0x1.0d2966b1746f7p-22, - 0x1.fa5b4f49cc6b2p-23, - 0x1.dc3ae30b55c16p-23, - 0x1.bfd7555a3bd68p-23, - 0x1.a517d9e61628ap-23, - 0x1.8be4f8f6c951fp-23, - 0x1.74287ded49339p-23, - 0x1.5dcd669f2cd34p-23, - 0x1.48bfd38302870p-23, - 0x1.34ecf8a3c124ap-23, - 0x1.22430f521cbcfp-23, - 0x1.10b1488aeb235p-23, - 0x1.0027c00a263a6p-23, - 0x1.e12ee004efc37p-24, - 0x1.c3e44ae32b16bp-24, - 0x1.a854ea14102a8p-24, - 0x1.8e6761569f45dp-24, - 0x1.7603bac345f65p-24, - 0x1.5f1353cdad001p-24, - 0x1.4980cb3c80949p-24, - 0x1.3537f00b6ad4dp-24, - 0x1.2225b12bffc68p-24, - 0x1.10380e1adb7e9p-24, - 0x1.febc107d5efaap-25, - 0x1.df0f2a0ee6946p-25, - 0x1.c14b2188bcee4p-25, - 0x1.a553644f7f07dp-25, - 0x1.8b0cfce0579dfp-25, - 0x1.725e7c5dd20f7p-25, - 0x1.5b2fe547a1340p-25, - 0x1.456a974e92e93p-25, - 
0x1.30f93c3699078p-25, - 0x1.1dc7b5b978cf8p-25, - 0x1.0bc30c5d52f15p-25, - 0x1.f5b2be65a0c7fp-26, - 0x1.d5f3a8dea7357p-26, - 0x1.b82915b03515bp-26, - 0x1.9c3517e789488p-26, - 0x1.81fb7df06136ep-26, - 0x1.6961b8d641d06p-26, - 0x1.524ec4d916caep-26, - 0x1.3cab1343d18d1p-26, - 0x1.2860757487a01p-26, - 0x1.155a09065d4f7p-26, - 0x1.0384250e4c9fcp-26, - 0x1.e59890b926c78p-27, - 0x1.c642116a8a9e3p-27, - 0x1.a8e405e651ab6p-27, - 0x1.8d5f98114f872p-27, - 0x1.7397c5a66e307p-27, - 0x1.5b71456c5a4c4p-27, - 0x1.44d26de513197p-27, - 0x1.2fa31d6371537p-27, - 0x1.1bcca373b7b43p-27, - 0x1.0939ab853339fp-27, - 0x1.efac5187b2863p-28, - 0x1.cf1e86235d0e6p-28, - 0x1.b0a68a2128babp-28, - 0x1.9423165bc4444p-28, - 0x1.7974e743dea3cp-28, - 0x1.607e9eacd1050p-28, - 0x1.4924a74dec728p-28, - 0x1.334d19e0c2160p-28, - 0x1.1edfa3c5f5ccap-28, - 0x1.0bc56f1b54701p-28, - 0x1.f3d2185e047d9p-29, - 0x1.d26cb87945e87p-29, - 0x1.b334fac4b9f99p-29, - 0x1.96076f7918d1cp-29, - 0x1.7ac2d72fc2c63p-29, - 0x1.614801550319ep-29, - 0x1.4979ac8b28926p-29, - 0x1.333c68e2d0548p-29, - 0x1.1e767bce37dd7p-29, - 0x1.0b0fc5b6d05a0p-29, - 0x1.f1e3523b41d7dp-30, - 0x1.d00de6608effep-30, - 0x1.b0778b7b3301ap-30, - 0x1.92fb04ec0f6cfp-30, - 0x1.77756ec9f78fap-30, - 0x1.5dc61922d5a06p-30, - 0x1.45ce65699ff6dp-30, - 0x1.2f71a5f159970p-30, - 0x1.1a94ff571654fp-30, - 0x1.071f4bbea09ecp-30, - 0x1.e9f1ff8ddd774p-31, - 0x1.c818223a202c7p-31, - 0x1.a887bd2b4404dp-31, - 0x1.8b1a336c5eb6bp-31, - 0x1.6fab63324088ap-31, - 0x1.56197e30205bap-31, - 0x1.3e44e45301b92p-31, - 0x1.281000bfe4c3fp-31, - 0x1.135f28f2d50b4p-31, - 0x1.00187dded5975p-31, - 0x1.dc479de0ef001p-32, - 0x1.bad4fdad3caa1p-32, - 0x1.9baed3ed27ab8p-32, - 0x1.7ead9ce4285bbp-32, - 0x1.63ac6b4edc88ep-32, - 0x1.4a88be2a6390cp-32, - 0x1.332259185f1a0p-32, - 0x1.1d5b1f3793044p-32, - 0x1.0916f04b6e18bp-32, - 0x1.ec77101de6926p-33, - 0x1.c960bf23153e0p-33, - 0x1.a8bd20fc65ef7p-33, - 0x1.8a61745ec7d1dp-33, - 0x1.6e25d0e756261p-33, - 0x1.53e4f7d1666cbp-33, - 0x1.3b7c27a7ddb0ep-33, - 
0x1.24caf2c32af14p-33, - 0x1.0fb3186804d0fp-33, - 0x1.f830c0bb41fd7p-34, - 0x1.d3c0f1a91c846p-34, - 0x1.b1e5acf351d87p-34, - 0x1.92712d259ce66p-34, - 0x1.7538c60a04476p-34, - 0x1.5a14b04b47879p-34, - 0x1.40dfd87456f4cp-34, - 0x1.2977b1172b9d5p-34, - 0x1.13bc07e891491p-34, - 0x1.ff1dbb4300811p-35, - 0x1.d9a880f306bd8p-35, - 0x1.b6e45220b55e0p-35, - 0x1.96a0b33f2c4dap-35, - 0x1.78b07e9e924acp-35, - 0x1.5ce9ab1670dd2p-35, - 0x1.4325167006bb0p-35, - 0x1.2b3e53538ff3fp-35, - 0x1.15137a7f44864p-35, - 0x1.0084ff125639dp-35, - 0x1.daeb0b7311ec7p-36, - 0x1.b7937d1c40c52p-36, - 0x1.96d082f59ab06p-36, - 0x1.7872d9fa10aadp-36, - 0x1.5c4e8e37bc7d0p-36, - 0x1.423ac0df49a40p-36, - 0x1.2a117230ad284p-36, - 0x1.13af4f04f9998p-36, - 0x1.fde703724e560p-37, - 0x1.d77f0c82e7641p-37, - 0x1.b3ee02611d7ddp-37, - 0x1.92ff33023d5bdp-37, - 0x1.7481a9e69f53fp-37, - 0x1.5847eda620959p-37, - 0x1.3e27c1fcc74bdp-37, - 0x1.25f9ee0b923dcp-37, - 0x1.0f9a0686531ffp-37, - 0x1.f5cc7718082afp-38, - 0x1.cf7e53d6a2ca5p-38, - 0x1.ac0f5f3229372p-38, - 0x1.8b498644847eap-38, - 0x1.6cfa9bcca59dcp-38, - 0x1.50f411d4fd2cdp-38, - 0x1.370ab8327af5ep-38, - 0x1.1f167f88c6b6ep-38, - 0x1.08f24085d4597p-38, - 0x1.e8f70e181d619p-39, - 0x1.c324c20e337dcp-39, - 0x1.a03261574b54ep-39, - 0x1.7fe903cdf5855p-39, - 0x1.6215c58da3450p-39, - 0x1.46897d4b69fc6p-39, - 0x1.2d1877d731b7bp-39, - 0x1.159a386b11517p-39, - 0x1.ffd27ae9393cep-40, - 0x1.d7c593130dd0bp-40, - 0x1.b2cd607c79bcfp-40, - 0x1.90ae4d3405651p-40, - 0x1.71312dd1759e2p-40, - 0x1.5422ef5d8949dp-40, - 0x1.39544b0ecc957p-40, - 0x1.20997f73e73ddp-40, - 0x1.09ca0eaacd277p-40, - 0x1.e9810295890ecp-41, - 0x1.c2b45b5aa4a1dp-41, - 0x1.9eee068fa7596p-41, - 0x1.7df2b399c10a8p-41, - 0x1.5f8b87a31bd85p-41, - 0x1.4385c96e9a2d9p-41, - 0x1.29b2933ef4cbcp-41, - 0x1.11e68a6378f8ap-41, - 0x1.f7f338086a86bp-42, - 0x1.cf8d7d9ce040ap-42, - 0x1.aa577251ae484p-42, - 0x1.8811d739efb5ep-42, - 0x1.68823e52970bep-42, - 0x1.4b72ae68e8b4cp-42, - 0x1.30b14dbe876bcp-42, - 0x1.181012ef86610p-42, - 
0x1.01647ba798744p-42, - 0x1.d90e917701675p-43, - 0x1.b2a87e86d0c8ap-43, - 0x1.8f53dcb377293p-43, - 0x1.6ed2f2515e933p-43, - 0x1.50ecc9ed47f19p-43, - 0x1.356cd5ce7799ep-43, - 0x1.1c229a587ab78p-43, - 0x1.04e15ecc7f3f6p-43, - 0x1.deffc7e6a6017p-44, - 0x1.b7b040832f310p-44, - 0x1.938e021f36d76p-44, - 0x1.7258610b3b233p-44, - 0x1.53d3bfc82a909p-44, - 0x1.37c92babdc2fdp-44, - 0x1.1e06010120f6ap-44, - 0x1.065b9616170d4p-44, - 0x1.e13dd96b3753ap-45, - 0x1.b950d32467392p-45, - 0x1.94a72263259a5p-45, - 0x1.72fd93e036cdcp-45, - 0x1.54164576929abp-45, - 0x1.37b83c521fe96p-45, - 0x1.1daf033182e96p-45, - 0x1.05ca50205d26ap-45, - 0x1.dfbb6235639fap-46, - 0x1.b7807e294781fp-46, - 0x1.9298add70a734p-46, - 0x1.70beaf9c7ffb6p-46, - 0x1.51b2cd6709222p-46, - 0x1.353a6cf7f7fffp-46, - 0x1.1b1fa8cbe84a7p-46, - 0x1.0330f0fd69921p-46, - 0x1.da81670f96f9bp-47, - 0x1.b24a16b4d09aap-47, - 0x1.8d6eeb6efdbd6p-47, - 0x1.6ba91ac734785p-47, - 0x1.4cb7966770ab5p-47, - 0x1.305e9721d0981p-47, - 0x1.1667311fff70ap-47, - 0x1.fd3de10d62855p-48, - 0x1.d1aefbcd48d0cp-48, - 0x1.a9cc93c25aca9p-48, - 0x1.85487ee3ea735p-48, - 0x1.63daf8b4b1e0cp-48, - 0x1.45421e69a6ca1p-48, - 0x1.294175802d99ap-48, - 0x1.0fa17bf41068fp-48, - 0x1.f05e82aae2bb9p-49, - 0x1.c578101b29058p-49, - 0x1.9e39dc5dd2f7cp-49, - 0x1.7a553a728bbf2p-49, - 0x1.5982008db1304p-49, - 0x1.3b7e00422e51bp-49, - 0x1.200c898d9ee3ep-49, - 0x1.06f5f7eb65a56p-49, - 0x1.e00e9148a1d25p-50, - 0x1.b623734024e92p-50, - 0x1.8fd4e01891bf8p-50, - 0x1.6cd44c7470d89p-50, - 0x1.4cd9c04158cd7p-50, - 0x1.2fa34bf5c8344p-50, - 0x1.14f4890ff2461p-50, - 0x1.f92c49dfa4df5p-51, - 0x1.ccaaea71ab0dfp-51, - 0x1.a40829f001197p-51, - 0x1.7eef13b59e96cp-51, - 0x1.5d11e1a252bf5p-51, - 0x1.3e296303b2297p-51, - 0x1.21f47009f43cep-51, - 0x1.083768c5e4541p-51, - 0x1.e1777d831265ep-52, - 0x1.b69f10b0191b5p-52, - 0x1.8f8a3a05b5b52p-52, - 0x1.6be573c40c8e7p-52, - 0x1.4b645ba991fdbp-52, - 0x1.2dc119095729fp-52, - }, -}; diff --git a/sysdeps/aarch64/fpu/sv_erff_data.c 
b/sysdeps/aarch64/fpu/sv_erff_data.c deleted file mode 100644 index 6dcd72a..0000000 --- a/sysdeps/aarch64/fpu/sv_erff_data.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* Table for SVE erff approximation - - Copyright (C) 2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include "vecmath_config.h" - -/* Lookup table used in SVE erff. - For each possible rounded input r (multiples of 1/128), between - r = 0.0 and r = 4.0 (513 values): - - __erff_data.erf contains the values of erf(r), - - __erff_data.scale contains the values of 2/sqrt(pi)*exp(-r^2). - Note that indices 0 and 1 are never hit by the algorithm, since lookup is - performed only for x >= 1/64-1/512. 
*/ -const struct sv_erff_data __sv_erff_data = { - .erf = { 0x0.000000p+0, - 0x1.20dbf4p-7, - 0x1.20d770p-6, - 0x1.b137e0p-6, - 0x1.20c564p-5, - 0x1.68e5d4p-5, - 0x1.b0fafep-5, - 0x1.f902a8p-5, - 0x1.207d48p-4, - 0x1.44703ep-4, - 0x1.68591ap-4, - 0x1.8c36bep-4, - 0x1.b00812p-4, - 0x1.d3cbf8p-4, - 0x1.f7815ap-4, - 0x1.0d9390p-3, - 0x1.1f5e1ap-3, - 0x1.311fc2p-3, - 0x1.42d7fcp-3, - 0x1.548642p-3, - 0x1.662a0cp-3, - 0x1.77c2d2p-3, - 0x1.895010p-3, - 0x1.9ad142p-3, - 0x1.ac45e4p-3, - 0x1.bdad72p-3, - 0x1.cf076ep-3, - 0x1.e05354p-3, - 0x1.f190aap-3, - 0x1.015f78p-2, - 0x1.09eed6p-2, - 0x1.127632p-2, - 0x1.1af54ep-2, - 0x1.236bf0p-2, - 0x1.2bd9dcp-2, - 0x1.343ed6p-2, - 0x1.3c9aa8p-2, - 0x1.44ed18p-2, - 0x1.4d35f0p-2, - 0x1.5574f4p-2, - 0x1.5da9f4p-2, - 0x1.65d4b8p-2, - 0x1.6df50ap-2, - 0x1.760abap-2, - 0x1.7e1594p-2, - 0x1.861566p-2, - 0x1.8e0a02p-2, - 0x1.95f336p-2, - 0x1.9dd0d2p-2, - 0x1.a5a2acp-2, - 0x1.ad6896p-2, - 0x1.b52264p-2, - 0x1.bccfecp-2, - 0x1.c47104p-2, - 0x1.cc0584p-2, - 0x1.d38d44p-2, - 0x1.db081cp-2, - 0x1.e275eap-2, - 0x1.e9d68ap-2, - 0x1.f129d4p-2, - 0x1.f86faap-2, - 0x1.ffa7eap-2, - 0x1.03693ap-1, - 0x1.06f794p-1, - 0x1.0a7ef6p-1, - 0x1.0dff50p-1, - 0x1.117894p-1, - 0x1.14eab4p-1, - 0x1.1855a6p-1, - 0x1.1bb95cp-1, - 0x1.1f15ccp-1, - 0x1.226ae8p-1, - 0x1.25b8a8p-1, - 0x1.28ff02p-1, - 0x1.2c3decp-1, - 0x1.2f755cp-1, - 0x1.32a54cp-1, - 0x1.35cdb4p-1, - 0x1.38ee8ap-1, - 0x1.3c07cap-1, - 0x1.3f196ep-1, - 0x1.42236ep-1, - 0x1.4525c8p-1, - 0x1.482074p-1, - 0x1.4b1372p-1, - 0x1.4dfebap-1, - 0x1.50e24cp-1, - 0x1.53be26p-1, - 0x1.569244p-1, - 0x1.595ea6p-1, - 0x1.5c2348p-1, - 0x1.5ee02ep-1, - 0x1.619556p-1, - 0x1.6442c0p-1, - 0x1.66e86ep-1, - 0x1.69865ep-1, - 0x1.6c1c98p-1, - 0x1.6eab18p-1, - 0x1.7131e6p-1, - 0x1.73b102p-1, - 0x1.762870p-1, - 0x1.789836p-1, - 0x1.7b0058p-1, - 0x1.7d60d8p-1, - 0x1.7fb9c0p-1, - 0x1.820b12p-1, - 0x1.8454d6p-1, - 0x1.869712p-1, - 0x1.88d1cep-1, - 0x1.8b050ep-1, - 0x1.8d30dep-1, - 0x1.8f5544p-1, - 0x1.91724ap-1, - 0x1.9387f6p-1, - 
0x1.959652p-1, - 0x1.979d68p-1, - 0x1.999d42p-1, - 0x1.9b95e8p-1, - 0x1.9d8768p-1, - 0x1.9f71cap-1, - 0x1.a1551ap-1, - 0x1.a33162p-1, - 0x1.a506b0p-1, - 0x1.a6d50cp-1, - 0x1.a89c86p-1, - 0x1.aa5d26p-1, - 0x1.ac16fcp-1, - 0x1.adca14p-1, - 0x1.af767ap-1, - 0x1.b11c3cp-1, - 0x1.b2bb68p-1, - 0x1.b4540ap-1, - 0x1.b5e630p-1, - 0x1.b771e8p-1, - 0x1.b8f742p-1, - 0x1.ba764ap-1, - 0x1.bbef10p-1, - 0x1.bd61a2p-1, - 0x1.bece0ep-1, - 0x1.c03464p-1, - 0x1.c194b2p-1, - 0x1.c2ef08p-1, - 0x1.c44376p-1, - 0x1.c5920ap-1, - 0x1.c6dad2p-1, - 0x1.c81de2p-1, - 0x1.c95b46p-1, - 0x1.ca930ep-1, - 0x1.cbc54cp-1, - 0x1.ccf20cp-1, - 0x1.ce1962p-1, - 0x1.cf3b5cp-1, - 0x1.d0580cp-1, - 0x1.d16f7ep-1, - 0x1.d281c4p-1, - 0x1.d38ef0p-1, - 0x1.d49710p-1, - 0x1.d59a34p-1, - 0x1.d6986cp-1, - 0x1.d791cap-1, - 0x1.d8865ep-1, - 0x1.d97636p-1, - 0x1.da6162p-1, - 0x1.db47f4p-1, - 0x1.dc29fcp-1, - 0x1.dd0788p-1, - 0x1.dde0aap-1, - 0x1.deb570p-1, - 0x1.df85eap-1, - 0x1.e0522ap-1, - 0x1.e11a3ep-1, - 0x1.e1de36p-1, - 0x1.e29e22p-1, - 0x1.e35a12p-1, - 0x1.e41214p-1, - 0x1.e4c638p-1, - 0x1.e5768cp-1, - 0x1.e62322p-1, - 0x1.e6cc08p-1, - 0x1.e7714ap-1, - 0x1.e812fcp-1, - 0x1.e8b12ap-1, - 0x1.e94be4p-1, - 0x1.e9e336p-1, - 0x1.ea7730p-1, - 0x1.eb07e2p-1, - 0x1.eb9558p-1, - 0x1.ec1fa2p-1, - 0x1.eca6ccp-1, - 0x1.ed2ae6p-1, - 0x1.edabfcp-1, - 0x1.ee2a1ep-1, - 0x1.eea556p-1, - 0x1.ef1db4p-1, - 0x1.ef9344p-1, - 0x1.f00614p-1, - 0x1.f07630p-1, - 0x1.f0e3a6p-1, - 0x1.f14e82p-1, - 0x1.f1b6d0p-1, - 0x1.f21ca0p-1, - 0x1.f27ff8p-1, - 0x1.f2e0eap-1, - 0x1.f33f7ep-1, - 0x1.f39bc2p-1, - 0x1.f3f5c2p-1, - 0x1.f44d88p-1, - 0x1.f4a31ep-1, - 0x1.f4f694p-1, - 0x1.f547f2p-1, - 0x1.f59742p-1, - 0x1.f5e490p-1, - 0x1.f62fe8p-1, - 0x1.f67952p-1, - 0x1.f6c0dcp-1, - 0x1.f7068cp-1, - 0x1.f74a6ep-1, - 0x1.f78c8cp-1, - 0x1.f7cceep-1, - 0x1.f80ba2p-1, - 0x1.f848acp-1, - 0x1.f8841ap-1, - 0x1.f8bdf2p-1, - 0x1.f8f63ep-1, - 0x1.f92d08p-1, - 0x1.f96256p-1, - 0x1.f99634p-1, - 0x1.f9c8a8p-1, - 0x1.f9f9bap-1, - 0x1.fa2974p-1, - 0x1.fa57dep-1, - 
0x1.fa84fep-1, - 0x1.fab0dep-1, - 0x1.fadb84p-1, - 0x1.fb04f6p-1, - 0x1.fb2d40p-1, - 0x1.fb5464p-1, - 0x1.fb7a6cp-1, - 0x1.fb9f60p-1, - 0x1.fbc344p-1, - 0x1.fbe61ep-1, - 0x1.fc07fap-1, - 0x1.fc28d8p-1, - 0x1.fc48c2p-1, - 0x1.fc67bcp-1, - 0x1.fc85d0p-1, - 0x1.fca2fep-1, - 0x1.fcbf52p-1, - 0x1.fcdaccp-1, - 0x1.fcf576p-1, - 0x1.fd0f54p-1, - 0x1.fd286ap-1, - 0x1.fd40bep-1, - 0x1.fd5856p-1, - 0x1.fd6f34p-1, - 0x1.fd8562p-1, - 0x1.fd9ae2p-1, - 0x1.fdafb8p-1, - 0x1.fdc3e8p-1, - 0x1.fdd77ap-1, - 0x1.fdea6ep-1, - 0x1.fdfcccp-1, - 0x1.fe0e96p-1, - 0x1.fe1fd0p-1, - 0x1.fe3080p-1, - 0x1.fe40a6p-1, - 0x1.fe504cp-1, - 0x1.fe5f70p-1, - 0x1.fe6e18p-1, - 0x1.fe7c46p-1, - 0x1.fe8a00p-1, - 0x1.fe9748p-1, - 0x1.fea422p-1, - 0x1.feb090p-1, - 0x1.febc96p-1, - 0x1.fec836p-1, - 0x1.fed374p-1, - 0x1.fede52p-1, - 0x1.fee8d4p-1, - 0x1.fef2fep-1, - 0x1.fefccep-1, - 0x1.ff064cp-1, - 0x1.ff0f76p-1, - 0x1.ff1852p-1, - 0x1.ff20e0p-1, - 0x1.ff2924p-1, - 0x1.ff3120p-1, - 0x1.ff38d6p-1, - 0x1.ff4048p-1, - 0x1.ff4778p-1, - 0x1.ff4e68p-1, - 0x1.ff551ap-1, - 0x1.ff5b90p-1, - 0x1.ff61ccp-1, - 0x1.ff67d0p-1, - 0x1.ff6d9ep-1, - 0x1.ff7338p-1, - 0x1.ff789ep-1, - 0x1.ff7dd4p-1, - 0x1.ff82dap-1, - 0x1.ff87b2p-1, - 0x1.ff8c5cp-1, - 0x1.ff90dcp-1, - 0x1.ff9532p-1, - 0x1.ff9960p-1, - 0x1.ff9d68p-1, - 0x1.ffa14ap-1, - 0x1.ffa506p-1, - 0x1.ffa8a0p-1, - 0x1.ffac18p-1, - 0x1.ffaf6ep-1, - 0x1.ffb2a6p-1, - 0x1.ffb5bep-1, - 0x1.ffb8b8p-1, - 0x1.ffbb98p-1, - 0x1.ffbe5ap-1, - 0x1.ffc102p-1, - 0x1.ffc390p-1, - 0x1.ffc606p-1, - 0x1.ffc862p-1, - 0x1.ffcaa8p-1, - 0x1.ffccd8p-1, - 0x1.ffcef4p-1, - 0x1.ffd0fap-1, - 0x1.ffd2eap-1, - 0x1.ffd4cap-1, - 0x1.ffd696p-1, - 0x1.ffd84ep-1, - 0x1.ffd9f8p-1, - 0x1.ffdb90p-1, - 0x1.ffdd18p-1, - 0x1.ffde90p-1, - 0x1.ffdffap-1, - 0x1.ffe154p-1, - 0x1.ffe2a2p-1, - 0x1.ffe3e2p-1, - 0x1.ffe514p-1, - 0x1.ffe63cp-1, - 0x1.ffe756p-1, - 0x1.ffe866p-1, - 0x1.ffe96ap-1, - 0x1.ffea64p-1, - 0x1.ffeb54p-1, - 0x1.ffec3ap-1, - 0x1.ffed16p-1, - 0x1.ffedeap-1, - 0x1.ffeeb4p-1, - 0x1.ffef76p-1, - 
0x1.fff032p-1, - 0x1.fff0e4p-1, - 0x1.fff18ep-1, - 0x1.fff232p-1, - 0x1.fff2d0p-1, - 0x1.fff366p-1, - 0x1.fff3f6p-1, - 0x1.fff480p-1, - 0x1.fff504p-1, - 0x1.fff582p-1, - 0x1.fff5fcp-1, - 0x1.fff670p-1, - 0x1.fff6dep-1, - 0x1.fff74ap-1, - 0x1.fff7aep-1, - 0x1.fff810p-1, - 0x1.fff86cp-1, - 0x1.fff8c6p-1, - 0x1.fff91cp-1, - 0x1.fff96cp-1, - 0x1.fff9bap-1, - 0x1.fffa04p-1, - 0x1.fffa4cp-1, - 0x1.fffa90p-1, - 0x1.fffad0p-1, - 0x1.fffb0ep-1, - 0x1.fffb4ap-1, - 0x1.fffb82p-1, - 0x1.fffbb8p-1, - 0x1.fffbecp-1, - 0x1.fffc1ep-1, - 0x1.fffc4ep-1, - 0x1.fffc7ap-1, - 0x1.fffca6p-1, - 0x1.fffccep-1, - 0x1.fffcf6p-1, - 0x1.fffd1ap-1, - 0x1.fffd3ep-1, - 0x1.fffd60p-1, - 0x1.fffd80p-1, - 0x1.fffda0p-1, - 0x1.fffdbep-1, - 0x1.fffddap-1, - 0x1.fffdf4p-1, - 0x1.fffe0ep-1, - 0x1.fffe26p-1, - 0x1.fffe3ep-1, - 0x1.fffe54p-1, - 0x1.fffe68p-1, - 0x1.fffe7ep-1, - 0x1.fffe90p-1, - 0x1.fffea2p-1, - 0x1.fffeb4p-1, - 0x1.fffec4p-1, - 0x1.fffed4p-1, - 0x1.fffee4p-1, - 0x1.fffef2p-1, - 0x1.ffff00p-1, - 0x1.ffff0cp-1, - 0x1.ffff18p-1, - 0x1.ffff24p-1, - 0x1.ffff30p-1, - 0x1.ffff3ap-1, - 0x1.ffff44p-1, - 0x1.ffff4ep-1, - 0x1.ffff56p-1, - 0x1.ffff60p-1, - 0x1.ffff68p-1, - 0x1.ffff70p-1, - 0x1.ffff78p-1, - 0x1.ffff7ep-1, - 0x1.ffff84p-1, - 0x1.ffff8cp-1, - 0x1.ffff92p-1, - 0x1.ffff98p-1, - 0x1.ffff9cp-1, - 0x1.ffffa2p-1, - 0x1.ffffa6p-1, - 0x1.ffffacp-1, - 0x1.ffffb0p-1, - 0x1.ffffb4p-1, - 0x1.ffffb8p-1, - 0x1.ffffbcp-1, - 0x1.ffffc0p-1, - 0x1.ffffc4p-1, - 0x1.ffffc6p-1, - 0x1.ffffcap-1, - 0x1.ffffccp-1, - 0x1.ffffd0p-1, - 0x1.ffffd2p-1, - 0x1.ffffd4p-1, - 0x1.ffffd6p-1, - 0x1.ffffd8p-1, - 0x1.ffffdcp-1, - 0x1.ffffdep-1, - 0x1.ffffdep-1, - 0x1.ffffe0p-1, - 0x1.ffffe2p-1, - 0x1.ffffe4p-1, - 0x1.ffffe6p-1, - 0x1.ffffe8p-1, - 0x1.ffffe8p-1, - 0x1.ffffeap-1, - 0x1.ffffeap-1, - 0x1.ffffecp-1, - 0x1.ffffeep-1, - 0x1.ffffeep-1, - 0x1.fffff0p-1, - 0x1.fffff0p-1, - 0x1.fffff2p-1, - 0x1.fffff2p-1, - 0x1.fffff2p-1, - 0x1.fffff4p-1, - 0x1.fffff4p-1, - 0x1.fffff4p-1, - 0x1.fffff6p-1, - 0x1.fffff6p-1, - 
0x1.fffff6p-1, - 0x1.fffff8p-1, - 0x1.fffff8p-1, - 0x1.fffff8p-1, - 0x1.fffff8p-1, - 0x1.fffffap-1, - 0x1.fffffap-1, - 0x1.fffffap-1, - 0x1.fffffap-1, - 0x1.fffffap-1, - 0x1.fffffap-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffcp-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.fffffep-1, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - 0x1.000000p+0, - }, - .scale = { 0x1.20dd76p+0, - 0x1.20d8f2p+0, - 0x1.20cb68p+0, - 0x1.20b4d8p+0, - 0x1.209546p+0, - 0x1.206cb4p+0, - 0x1.203b26p+0, - 0x1.2000a0p+0, - 0x1.1fbd28p+0, - 0x1.1f70c4p+0, - 0x1.1f1b7ap+0, - 0x1.1ebd56p+0, - 0x1.1e565cp+0, - 0x1.1de698p+0, - 0x1.1d6e14p+0, - 0x1.1cecdcp+0, - 0x1.1c62fap+0, - 0x1.1bd07cp+0, - 0x1.1b3572p+0, - 0x1.1a91e6p+0, - 0x1.19e5eap+0, - 0x1.19318cp+0, - 0x1.1874dep+0, - 0x1.17aff0p+0, - 0x1.16e2d8p+0, - 0x1.160da4p+0, - 0x1.153068p+0, - 0x1.144b3cp+0, - 0x1.135e30p+0, - 0x1.12695ep+0, - 0x1.116cd8p+0, - 0x1.1068bap+0, - 0x1.0f5d16p+0, - 0x1.0e4a08p+0, - 0x1.0d2fa6p+0, - 0x1.0c0e0ap+0, - 0x1.0ae550p+0, - 0x1.09b590p+0, - 0x1.087ee4p+0, - 0x1.07416cp+0, - 0x1.05fd3ep+0, - 0x1.04b27cp+0, - 0x1.036140p+0, - 0x1.0209a6p+0, - 0x1.00abd0p+0, - 0x1.fe8fb0p-1, - 0x1.fbbbbep-1, - 0x1.f8dc0ap-1, - 0x1.f5f0cep-1, - 0x1.f2fa4cp-1, - 0x1.eff8c4p-1, - 0x1.ecec78p-1, - 0x1.e9d5a8p-1, - 0x1.e6b498p-1, - 0x1.e38988p-1, - 0x1.e054bep-1, - 0x1.dd167cp-1, - 0x1.d9cf06p-1, - 0x1.d67ea2p-1, - 0x1.d32592p-1, - 0x1.cfc41ep-1, - 0x1.cc5a8ap-1, - 0x1.c8e91cp-1, - 0x1.c5701ap-1, - 0x1.c1efcap-1, - 0x1.be6872p-1, - 0x1.bada5ap-1, - 0x1.b745c6p-1, - 
0x1.b3aafcp-1, - 0x1.b00a46p-1, - 0x1.ac63e8p-1, - 0x1.a8b828p-1, - 0x1.a5074ep-1, - 0x1.a1519ep-1, - 0x1.9d9762p-1, - 0x1.99d8dap-1, - 0x1.961650p-1, - 0x1.925008p-1, - 0x1.8e8646p-1, - 0x1.8ab950p-1, - 0x1.86e96ap-1, - 0x1.8316d6p-1, - 0x1.7f41dcp-1, - 0x1.7b6abcp-1, - 0x1.7791b8p-1, - 0x1.73b714p-1, - 0x1.6fdb12p-1, - 0x1.6bfdf0p-1, - 0x1.681ff2p-1, - 0x1.644156p-1, - 0x1.60625cp-1, - 0x1.5c8342p-1, - 0x1.58a446p-1, - 0x1.54c5a6p-1, - 0x1.50e79ep-1, - 0x1.4d0a68p-1, - 0x1.492e42p-1, - 0x1.455366p-1, - 0x1.417a0cp-1, - 0x1.3da26ep-1, - 0x1.39ccc2p-1, - 0x1.35f940p-1, - 0x1.32281ep-1, - 0x1.2e5992p-1, - 0x1.2a8dcep-1, - 0x1.26c508p-1, - 0x1.22ff72p-1, - 0x1.1f3d3cp-1, - 0x1.1b7e98p-1, - 0x1.17c3b6p-1, - 0x1.140cc4p-1, - 0x1.1059eep-1, - 0x1.0cab62p-1, - 0x1.09014cp-1, - 0x1.055bd6p-1, - 0x1.01bb2cp-1, - 0x1.fc3ee6p-2, - 0x1.f511aap-2, - 0x1.edeeeep-2, - 0x1.e6d700p-2, - 0x1.dfca26p-2, - 0x1.d8c8aap-2, - 0x1.d1d2d0p-2, - 0x1.cae8dap-2, - 0x1.c40b08p-2, - 0x1.bd3998p-2, - 0x1.b674c8p-2, - 0x1.afbcd4p-2, - 0x1.a911f0p-2, - 0x1.a27456p-2, - 0x1.9be438p-2, - 0x1.9561c8p-2, - 0x1.8eed36p-2, - 0x1.8886b2p-2, - 0x1.822e66p-2, - 0x1.7be47ap-2, - 0x1.75a91ap-2, - 0x1.6f7c6ap-2, - 0x1.695e8cp-2, - 0x1.634fa6p-2, - 0x1.5d4fd4p-2, - 0x1.575f34p-2, - 0x1.517de6p-2, - 0x1.4bac00p-2, - 0x1.45e99cp-2, - 0x1.4036d0p-2, - 0x1.3a93b2p-2, - 0x1.350052p-2, - 0x1.2f7cc4p-2, - 0x1.2a0916p-2, - 0x1.24a554p-2, - 0x1.1f518ap-2, - 0x1.1a0dc6p-2, - 0x1.14da0ap-2, - 0x1.0fb662p-2, - 0x1.0aa2d0p-2, - 0x1.059f5ap-2, - 0x1.00ac00p-2, - 0x1.f79184p-3, - 0x1.edeb40p-3, - 0x1.e46530p-3, - 0x1.daff4ap-3, - 0x1.d1b982p-3, - 0x1.c893cep-3, - 0x1.bf8e1cp-3, - 0x1.b6a856p-3, - 0x1.ade26cp-3, - 0x1.a53c42p-3, - 0x1.9cb5bep-3, - 0x1.944ec2p-3, - 0x1.8c0732p-3, - 0x1.83deeap-3, - 0x1.7bd5c8p-3, - 0x1.73eba4p-3, - 0x1.6c2056p-3, - 0x1.6473b6p-3, - 0x1.5ce596p-3, - 0x1.5575c8p-3, - 0x1.4e241ep-3, - 0x1.46f066p-3, - 0x1.3fda6cp-3, - 0x1.38e1fap-3, - 0x1.3206dcp-3, - 0x1.2b48dap-3, - 0x1.24a7b8p-3, - 
0x1.1e233ep-3, - 0x1.17bb2cp-3, - 0x1.116f48p-3, - 0x1.0b3f52p-3, - 0x1.052b0cp-3, - 0x1.fe6460p-4, - 0x1.f2a902p-4, - 0x1.e72372p-4, - 0x1.dbd32ap-4, - 0x1.d0b7a0p-4, - 0x1.c5d04ap-4, - 0x1.bb1c98p-4, - 0x1.b09bfcp-4, - 0x1.a64de6p-4, - 0x1.9c31c6p-4, - 0x1.92470ap-4, - 0x1.888d1ep-4, - 0x1.7f036cp-4, - 0x1.75a960p-4, - 0x1.6c7e64p-4, - 0x1.6381e2p-4, - 0x1.5ab342p-4, - 0x1.5211ecp-4, - 0x1.499d48p-4, - 0x1.4154bcp-4, - 0x1.3937b2p-4, - 0x1.31458ep-4, - 0x1.297dbap-4, - 0x1.21df9ap-4, - 0x1.1a6a96p-4, - 0x1.131e14p-4, - 0x1.0bf97ep-4, - 0x1.04fc3ap-4, - 0x1.fc4b5ep-5, - 0x1.eeea8cp-5, - 0x1.e1d4d0p-5, - 0x1.d508fap-5, - 0x1.c885e0p-5, - 0x1.bc4a54p-5, - 0x1.b05530p-5, - 0x1.a4a54ap-5, - 0x1.99397ap-5, - 0x1.8e109cp-5, - 0x1.83298ep-5, - 0x1.78832cp-5, - 0x1.6e1c58p-5, - 0x1.63f3f6p-5, - 0x1.5a08e8p-5, - 0x1.505a18p-5, - 0x1.46e66cp-5, - 0x1.3dacd2p-5, - 0x1.34ac36p-5, - 0x1.2be38cp-5, - 0x1.2351c2p-5, - 0x1.1af5d2p-5, - 0x1.12ceb4p-5, - 0x1.0adb60p-5, - 0x1.031ad6p-5, - 0x1.f7182ap-6, - 0x1.e85c44p-6, - 0x1.da0006p-6, - 0x1.cc0180p-6, - 0x1.be5ecep-6, - 0x1.b1160ap-6, - 0x1.a4255ap-6, - 0x1.978ae8p-6, - 0x1.8b44e6p-6, - 0x1.7f5188p-6, - 0x1.73af0cp-6, - 0x1.685bb6p-6, - 0x1.5d55ccp-6, - 0x1.529b9ep-6, - 0x1.482b84p-6, - 0x1.3e03d8p-6, - 0x1.3422fep-6, - 0x1.2a875cp-6, - 0x1.212f62p-6, - 0x1.181984p-6, - 0x1.0f443ep-6, - 0x1.06ae14p-6, - 0x1.fcab14p-7, - 0x1.ec7262p-7, - 0x1.dcaf36p-7, - 0x1.cd5ecap-7, - 0x1.be7e5ap-7, - 0x1.b00b38p-7, - 0x1.a202bep-7, - 0x1.94624ep-7, - 0x1.87275ep-7, - 0x1.7a4f6ap-7, - 0x1.6dd7fep-7, - 0x1.61beaep-7, - 0x1.56011cp-7, - 0x1.4a9cf6p-7, - 0x1.3f8ff6p-7, - 0x1.34d7dcp-7, - 0x1.2a727ap-7, - 0x1.205dacp-7, - 0x1.169756p-7, - 0x1.0d1d6ap-7, - 0x1.03ede2p-7, - 0x1.f60d8ap-8, - 0x1.e4cc4ap-8, - 0x1.d4143ap-8, - 0x1.c3e1a6p-8, - 0x1.b430ecp-8, - 0x1.a4fe84p-8, - 0x1.9646f4p-8, - 0x1.8806d8p-8, - 0x1.7a3adep-8, - 0x1.6cdfccp-8, - 0x1.5ff276p-8, - 0x1.536fc2p-8, - 0x1.4754acp-8, - 0x1.3b9e40p-8, - 0x1.30499cp-8, - 0x1.2553eep-8, - 
0x1.1aba78p-8, - 0x1.107a8cp-8, - 0x1.06918cp-8, - 0x1.f9f9d0p-9, - 0x1.e77448p-9, - 0x1.d58da6p-9, - 0x1.c4412cp-9, - 0x1.b38a3ap-9, - 0x1.a36454p-9, - 0x1.93cb12p-9, - 0x1.84ba30p-9, - 0x1.762d84p-9, - 0x1.682100p-9, - 0x1.5a90b0p-9, - 0x1.4d78bcp-9, - 0x1.40d564p-9, - 0x1.34a306p-9, - 0x1.28de12p-9, - 0x1.1d8318p-9, - 0x1.128ebap-9, - 0x1.07fdb4p-9, - 0x1.fb99b8p-10, - 0x1.e7f232p-10, - 0x1.d4fed8p-10, - 0x1.c2b9d0p-10, - 0x1.b11d70p-10, - 0x1.a02436p-10, - 0x1.8fc8c8p-10, - 0x1.8005f0p-10, - 0x1.70d6a4p-10, - 0x1.6235fcp-10, - 0x1.541f34p-10, - 0x1.468daep-10, - 0x1.397ceep-10, - 0x1.2ce898p-10, - 0x1.20cc76p-10, - 0x1.15246ep-10, - 0x1.09ec86p-10, - 0x1.fe41cep-11, - 0x1.e97ba4p-11, - 0x1.d57f52p-11, - 0x1.c245d4p-11, - 0x1.afc85ep-11, - 0x1.9e0058p-11, - 0x1.8ce75ep-11, - 0x1.7c7744p-11, - 0x1.6caa0ep-11, - 0x1.5d79ecp-11, - 0x1.4ee142p-11, - 0x1.40daa4p-11, - 0x1.3360ccp-11, - 0x1.266ea8p-11, - 0x1.19ff46p-11, - 0x1.0e0de8p-11, - 0x1.0295f0p-11, - 0x1.ef25d4p-12, - 0x1.da0110p-12, - 0x1.c5b542p-12, - 0x1.b23a5ap-12, - 0x1.9f8894p-12, - 0x1.8d986ap-12, - 0x1.7c629ap-12, - 0x1.6be022p-12, - 0x1.5c0a38p-12, - 0x1.4cda54p-12, - 0x1.3e4a24p-12, - 0x1.305390p-12, - 0x1.22f0b4p-12, - 0x1.161be4p-12, - 0x1.09cfa4p-12, - 0x1.fc0d56p-13, - 0x1.e577bcp-13, - 0x1.cfd4a6p-13, - 0x1.bb1a96p-13, - 0x1.a74068p-13, - 0x1.943d4ap-13, - 0x1.8208bcp-13, - 0x1.709a8ep-13, - 0x1.5feadap-13, - 0x1.4ff208p-13, - 0x1.40a8c2p-13, - 0x1.3207fcp-13, - 0x1.2408eap-13, - 0x1.16a502p-13, - 0x1.09d5f8p-13, - 0x1.fb2b7ap-14, - 0x1.e3bcf4p-14, - 0x1.cd5528p-14, - 0x1.b7e946p-14, - 0x1.a36eecp-14, - 0x1.8fdc1cp-14, - 0x1.7d2738p-14, - 0x1.6b4702p-14, - 0x1.5a329cp-14, - 0x1.49e178p-14, - 0x1.3a4b60p-14, - 0x1.2b6876p-14, - 0x1.1d3120p-14, - 0x1.0f9e1cp-14, - 0x1.02a868p-14, - 0x1.ec929ap-15, - 0x1.d4f4b4p-15, - 0x1.be6abcp-15, - 0x1.a8e8ccp-15, - 0x1.94637ep-15, - 0x1.80cfdcp-15, - 0x1.6e2368p-15, - 0x1.5c540cp-15, - 0x1.4b581cp-15, - 0x1.3b2652p-15, - 0x1.2bb5ccp-15, - 0x1.1cfe02p-15, - 
0x1.0ef6c4p-15, - 0x1.019842p-15, - 0x1.e9b5e8p-16, - 0x1.d16f58p-16, - 0x1.ba4f04p-16, - 0x1.a447b8p-16, - 0x1.8f4cccp-16, - 0x1.7b5224p-16, - 0x1.684c22p-16, - 0x1.562facp-16, - 0x1.44f21ep-16, - 0x1.34894ap-16, - 0x1.24eb72p-16, - 0x1.160f44p-16, - 0x1.07ebd2p-16, - 0x1.f4f12ep-17, - 0x1.db5ad0p-17, - 0x1.c304f0p-17, - 0x1.abe09ep-17, - 0x1.95df98p-17, - 0x1.80f43ap-17, - 0x1.6d1178p-17, - 0x1.5a2ae0p-17, - 0x1.483488p-17, - 0x1.372310p-17, - 0x1.26eb9ep-17, - 0x1.1783cep-17, - 0x1.08e1bap-17, - 0x1.f5f7d8p-18, - 0x1.db92b6p-18, - 0x1.c282cep-18, - 0x1.aab7acp-18, - 0x1.94219cp-18, - 0x1.7eb1a2p-18, - 0x1.6a5972p-18, - 0x1.570b6ap-18, - 0x1.44ba86p-18, - 0x1.335a62p-18, - 0x1.22df2ap-18, - 0x1.133d96p-18, - 0x1.046aeap-18, - 0x1.ecb9d0p-19, - 0x1.d21398p-19, - 0x1.b8d094p-19, - 0x1.a0df10p-19, - 0x1.8a2e26p-19, - 0x1.74adc8p-19, - 0x1.604ea8p-19, - 0x1.4d0232p-19, - 0x1.3aba86p-19, - 0x1.296a70p-19, - 0x1.190562p-19, - 0x1.097f62p-19, - 0x1.f59a20p-20, - 0x1.d9c736p-20, - 0x1.bf716cp-20, - 0x1.a6852cp-20, - 0x1.8eefd8p-20, - 0x1.789fb8p-20, - 0x1.6383f8p-20, - 0x1.4f8c96p-20, - 0x1.3caa62p-20, - 0x1.2acee2p-20, - 0x1.19ec60p-20, - 0x1.09f5d0p-20, - 0x1.f5bd96p-21, - 0x1.d9371ep-21, - 0x1.be41dep-21, - 0x1.a4c89ep-21, - 0x1.8cb738p-21, - 0x1.75fa8ep-21, - 0x1.608078p-21, - 0x1.4c37c0p-21, - 0x1.39100ep-21, - 0x1.26f9e0p-21, - 0x1.15e682p-21, - 0x1.05c804p-21, - 0x1.ed2254p-22, - 0x1.d06ad6p-22, - 0x1.b551c8p-22, - 0x1.9bc0a0p-22, - 0x1.83a200p-22, - 0x1.6ce1aap-22, - 0x1.576c72p-22, - 0x1.43302cp-22, - 0x1.301ba2p-22, - 0x1.1e1e86p-22, - 0x1.0d2966p-22, - 0x1.fa5b50p-23, - 0x1.dc3ae4p-23, - 0x1.bfd756p-23, - 0x1.a517dap-23, - 0x1.8be4f8p-23, - 0x1.74287ep-23, - 0x1.5dcd66p-23, - 0x1.48bfd4p-23, - 0x1.34ecf8p-23, - 0x1.224310p-23, - 0x1.10b148p-23, - }, -}; diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h index 23963b5..16b81fc 100644 --- a/sysdeps/aarch64/fpu/sv_expf_inline.h +++ b/sysdeps/aarch64/fpu/sv_expf_inline.h @@ 
-1,6 +1,6 @@ /* SVE helper for single-precision routines which depend on exp - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -24,19 +24,20 @@ struct sv_expf_data { - float poly[5]; - float inv_ln2, ln2_hi, ln2_lo, shift; + float c1, c3, inv_ln2; + float ln2_lo, c0, c2, c4; + float ln2_hi, shift; }; /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */ #define SV_EXPF_DATA \ { \ - .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \ - 0x1.0e4020p-7f }, \ - \ - .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ - .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \ + /* Coefficients copied from the polynomial in AdvSIMD variant. */ \ + .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \ + .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \ + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ + .shift = 0x1.803f8p17f, \ } #define C(i) sv_f32 (d->poly[i]) @@ -47,29 +48,27 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - /* Load some constants in quad-word chunks to minimise memory access. */ - svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]); + svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo); /* n = round(x/(ln2/N)). */ - svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1); + svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift); svfloat32_t n = svsub_x (pg, z, d->shift); /* r = x - n*ln2/N. 
*/ - svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2); - r = svmls_lane (r, n, c4_invln2_and_ln2, 3); + svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x); + r = svmls_lane (r, n, lane_consts, 0); /* scale = 2^(n/N). */ - svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z)); + svfloat32_t scale = svexpa (svreinterpret_u32 (z)); - /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ - svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); - svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0); - svfloat32_t r2 = svmul_f32_x (pg, r, r); + /* poly(r) = exp(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4 + C4 r^5. */ + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); + svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); svfloat32_t p14 = svmla_x (pg, p12, p34, r2); - svfloat32_t p0 = svmul_f32_x (pg, r, C (0)); + svfloat32_t p0 = svmul_lane (r, lane_consts, 1); svfloat32_t poly = svmla_x (pg, p0, r2, p14); return svmla_x (pg, scale, scale, poly); } - #endif diff --git a/sysdeps/aarch64/fpu/sv_expm1f_inline.h b/sysdeps/aarch64/fpu/sv_expm1f_inline.h index 5b72451..ef2b8ea 100644 --- a/sysdeps/aarch64/fpu/sv_expm1f_inline.h +++ b/sysdeps/aarch64/fpu/sv_expm1f_inline.h @@ -1,6 +1,6 @@ /* Single-precision inline helper for vector (SVE) expm1 function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -27,21 +27,18 @@ struct sv_expm1f_data /* These 4 are grouped together so they can be loaded as one quadword, then used with _lane forms of svmla/svmls. */ float32_t c2, c4, ln2_hi, ln2_lo; - float32_t c0, c1, c3, inv_ln2, shift; + float c0, inv_ln2, c1, c3, special_bound; }; /* Coefficients generated using fpminimax. 
*/ #define SV_EXPM1F_DATA \ { \ - .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .c2 = 0x1.555736p-5, \ - .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \ + .c0 = 0x1.fffffep-2, .c1 = 0x1.5554aep-3, .inv_ln2 = 0x1.715476p+0f, \ + .c2 = 0x1.555736p-5, .c3 = 0x1.12287cp-7, \ \ - .shift = 0x1.8p23f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ - .ln2_lo = 0x1.7f7d1cp-20f, \ + .c4 = 0x1.6b55a2p-10, .ln2_lo = 0x1.7f7d1cp-20f, .ln2_hi = 0x1.62e4p-1f, \ } -#define C(i) sv_f32 (d->c##i) - static inline svfloat32_t expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) { @@ -55,9 +52,8 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 where 2^i is exact because i is an integer. */ - svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2); - j = svsub_x (pg, j, d->shift); - svint32_t i = svcvt_s32_x (pg, j); + svfloat32_t j = svmul_x (svptrue_b32 (), x, d->inv_ln2); + j = svrinta_x (pg, j); svfloat32_t f = svmls_lane (x, j, lane_constants, 2); f = svmls_lane (f, j, lane_constants, 3); @@ -67,18 +63,18 @@ expm1f_inline (svfloat32_t x, svbool_t pg, const struct sv_expm1f_data *d) x + ax^2 + bx^3 + cx^4 .... So we calculate the polynomial P(f) = a + bf + cf^2 + ... and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0); - svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1); - svfloat32_t f2 = svmul_x (pg, f, f); + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), f, lane_constants, 0); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), f, lane_constants, 1); + svfloat32_t f2 = svmul_x (svptrue_b32 (), f, f); svfloat32_t p = svmla_x (pg, p12, f2, p34); - p = svmla_x (pg, C (0), f, p); + p = svmla_x (pg, sv_f32 (d->c0), f, p); p = svmla_x (pg, f, f2, p); /* Assemble the result. expm1(x) ~= 2^i * (p + 1) - 1 Let t = 2^i. 
*/ - svfloat32_t t = svscale_x (pg, sv_f32 (1), i); - return svmla_x (pg, svsub_x (pg, t, 1), p, t); + svfloat32_t t = svscale_x (pg, sv_f32 (1.0f), svcvt_s32_x (pg, j)); + return svmla_x (pg, svsub_x (pg, t, 1.0f), p, t); } #endif diff --git a/sysdeps/aarch64/fpu/sv_log1p_inline.h b/sysdeps/aarch64/fpu/sv_log1p_inline.h index da01967..71f88e0 100644 --- a/sysdeps/aarch64/fpu/sv_log1p_inline.h +++ b/sysdeps/aarch64/fpu/sv_log1p_inline.h @@ -1,6 +1,6 @@ /* Helper for double-precision SVE routines which depend on log1p - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/sv_log1pf_inline.h b/sysdeps/aarch64/fpu/sv_log1pf_inline.h index b94b2da..59cbf6c 100644 --- a/sysdeps/aarch64/fpu/sv_log1pf_inline.h +++ b/sysdeps/aarch64/fpu/sv_log1pf_inline.h @@ -1,6 +1,6 @@ /* Helper for single-precision SVE routines which depend on log1p - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,55 +22,76 @@ #include "sv_math.h" #include "vecmath_config.h" -#include "poly_sve_f32.h" + +#define SignExponentMask 0xff800000 static const struct sv_log1pf_data { - float32_t poly[9]; - float32_t ln2; - float32_t scale_back; + float c0, c2, c4, c6; + float c1, c3, c5, c7; + float ln2, exp_bias, quarter; + uint32_t four, three_quarters; } sv_log1pf_data = { - /* Polynomial generated using FPMinimax in [-0.25, 0.5]. 
*/ - .poly = { -0x1p-1f, 0x1.5555aap-2f, -0x1.000038p-2f, 0x1.99675cp-3f, - -0x1.54ef78p-3f, 0x1.28a1f4p-3f, -0x1.0da91p-3f, 0x1.abcb6p-4f, - -0x1.6f0d5ep-5f }, - .scale_back = 0x1.0p-23f, - .ln2 = 0x1.62e43p-1f, + /* Do not store first term of polynomial, which is -0.5, as + this can be fmov-ed directly instead of including it in + the main load-and-mla polynomial schedule. */ + .c0 = 0x1.5555aap-2f, .c1 = -0x1.000038p-2f, .c2 = 0x1.99675cp-3f, + .c3 = -0x1.54ef78p-3f, .c4 = 0x1.28a1f4p-3f, .c5 = -0x1.0da91p-3f, + .c6 = 0x1.abcb6p-4f, .c7 = -0x1.6f0d5ep-5f, .ln2 = 0x1.62e43p-1f, + .exp_bias = 0x1p-23f, .quarter = 0x1p-2f, .four = 0x40800000, + .three_quarters = 0x3f400000, }; static inline svfloat32_t -eval_poly (svfloat32_t m, const float32_t *c, svbool_t pg) -{ - svfloat32_t p_12 = svmla_x (pg, sv_f32 (c[0]), m, sv_f32 (c[1])); - svfloat32_t m2 = svmul_x (pg, m, m); - svfloat32_t q = svmla_x (pg, m, m2, p_12); - svfloat32_t p = sv_pw_horner_6_f32_x (pg, m, m2, c + 2); - p = svmul_x (pg, m2, p); - - return svmla_x (pg, q, m2, p); -} - -static inline svfloat32_t sv_log1pf_inline (svfloat32_t x, svbool_t pg) { const struct sv_log1pf_data *d = ptr_barrier (&sv_log1pf_data); - svfloat32_t m = svadd_x (pg, x, 1.0f); - - svint32_t ks = svsub_x (pg, svreinterpret_s32 (m), - svreinterpret_s32 (svdup_f32 (0.75f))); - ks = svand_x (pg, ks, 0xff800000); - svuint32_t k = svreinterpret_u32 (ks); - svfloat32_t s = svreinterpret_f32 ( - svsub_x (pg, svreinterpret_u32 (svdup_f32 (4.0f)), k)); - - svfloat32_t m_scale - = svreinterpret_f32 (svsub_x (pg, svreinterpret_u32 (x), k)); - m_scale - = svadd_x (pg, m_scale, svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s)); - svfloat32_t p = eval_poly (m_scale, d->poly, pg); - svfloat32_t scale_back = svmul_x (pg, svcvt_f32_x (pg, k), d->scale_back); - return svmla_x (pg, p, scale_back, d->ln2); + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + 
k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ + svfloat32_t m = svadd_x (pg, x, 1); + + /* Choose k to scale x to the range [-1/4, 1/2]. */ + svint32_t k + = svand_x (pg, svsub_x (pg, svreinterpret_s32 (m), d->three_quarters), + sv_s32 (SignExponentMask)); + + /* Scale x by exponent manipulation. */ + svfloat32_t m_scale = svreinterpret_f32 ( + svsub_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (k))); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number, and scale m down accordingly. */ + svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four)); + svfloat32_t fconst = svld1rq_f32 (svptrue_b32 (), &d->ln2); + m_scale = svadd_x (pg, m_scale, svmla_lane_f32 (sv_f32 (-1), s, fconst, 2)); + + /* Evaluate polynomial on reduced interval. */ + svfloat32_t ms2 = svmul_x (svptrue_b32 (), m_scale, m_scale); + + svfloat32_t c1357 = svld1rq_f32 (svptrue_b32 (), &d->c1); + svfloat32_t p01 = svmla_lane_f32 (sv_f32 (d->c0), m_scale, c1357, 0); + svfloat32_t p23 = svmla_lane_f32 (sv_f32 (d->c2), m_scale, c1357, 1); + svfloat32_t p45 = svmla_lane_f32 (sv_f32 (d->c4), m_scale, c1357, 2); + svfloat32_t p67 = svmla_lane_f32 (sv_f32 (d->c6), m_scale, c1357, 3); + + svfloat32_t p = svmla_x (pg, p45, p67, ms2); + p = svmla_x (pg, p23, p, ms2); + p = svmla_x (pg, p01, p, ms2); + + p = svmad_x (pg, m_scale, p, -0.5); + p = svmla_x (pg, m_scale, m_scale, svmul_x (pg, m_scale, p)); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. 
*/ + svfloat32_t scale_back = svmul_lane_f32 (svcvt_f32_x (pg, k), fconst, 1); + return svmla_lane_f32 (p, scale_back, fconst, 0); } #endif diff --git a/sysdeps/aarch64/fpu/sv_math.h b/sysdeps/aarch64/fpu/sv_math.h index 41a2013..3d576df 100644 --- a/sysdeps/aarch64/fpu/sv_math.h +++ b/sysdeps/aarch64/fpu/sv_math.h @@ -1,5 +1,5 @@ /* Utilities for SVE libmvec routines. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c index d56a102..825c975 100644 --- a/sysdeps/aarch64/fpu/tan_advsimd.c +++ b/sysdeps/aarch64/fpu/tan_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) tan function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/tan_sve.c b/sysdeps/aarch64/fpu/tan_sve.c index b2e4447..6cbd4f2 100644 --- a/sysdeps/aarch64/fpu/tan_sve.c +++ b/sysdeps/aarch64/fpu/tan_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) tan function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -22,24 +22,38 @@ static const struct data { - double poly[9]; - double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift; + double c2, c4, c6, c8; + double poly_1357[4]; + double c0, inv_half_pi; + double half_pi_hi, half_pi_lo, range_val; } data = { /* Polynomial generated with FPMinimax. 
*/ - .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5, - 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9, - 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, - 0x1.4e4fd14147622p-12, }, + .c2 = 0x1.ba1ba1bb46414p-5, + .c4 = 0x1.226e5e5ecdfa3p-7, + .c6 = 0x1.7ea75d05b583ep-10, + .c8 = 0x1.4e4fd14147622p-12, + .poly_1357 = { 0x1.1111111110a63p-3, 0x1.664f47e5b5445p-6, + 0x1.d6c7ddbf87047p-9, 0x1.289f22964a03cp-11 }, + .c0 = 0x1.5555555555556p-2, + .inv_half_pi = 0x1.45f306dc9c883p-1, .half_pi_hi = 0x1.921fb54442d18p0, .half_pi_lo = 0x1.1a62633145c07p-54, - .inv_half_pi = 0x1.45f306dc9c883p-1, .range_val = 0x1p23, - .shift = 0x1.8p52, }; static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +special_case (svfloat64_t x, svfloat64_t p, svfloat64_t q, svbool_t pg, + svbool_t special) { + svbool_t use_recip = svcmpeq ( + pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); + + svfloat64_t n = svmad_x (pg, p, p, -1); + svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); + svfloat64_t swap = n; + n = svneg_m (n, use_recip, d); + d = svsel (use_recip, swap, d); + svfloat64_t y = svdiv_x (svnot_z (pg, special), n, d); return sv_call_f64 (tan, x, y, special); } @@ -50,15 +64,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) { const struct data *dat = ptr_barrier (&data); - - /* Invert condition to catch NaNs and Infs as well as large values. */ - svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val)); - + svfloat64_t half_pi_c0 = svld1rq (svptrue_b64 (), &dat->c0); /* q = nearest integer to 2 * x / pi. 
*/ - svfloat64_t shift = sv_f64 (dat->shift); - svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi); - q = svsub_x (pg, q, shift); - svint64_t qi = svcvt_s64_x (pg, q); + svfloat64_t q = svmul_lane (x, half_pi_c0, 1); + q = svrinta_x (pg, q); /* Use q to reduce x to r in [-pi/4, pi/4], by: r = x - q * pi/2, in extended precision. */ @@ -68,7 +77,7 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) r = svmls_lane (r, q, half_pi, 1); /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle formula. */ - r = svmul_x (pg, r, 0.5); + r = svmul_x (svptrue_b64 (), r, 0.5); /* Approximate tan(r) using order 8 polynomial. tan(x) is odd, so polynomial has the form: @@ -76,29 +85,51 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... Then compute the approximation by: tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t r4 = svmul_x (pg, r2, r2); - svfloat64_t r8 = svmul_x (pg, r4, r4); + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t r4 = svmul_x (svptrue_b64 (), r2, r2); + svfloat64_t r8 = svmul_x (svptrue_b64 (), r4, r4); /* Use offset version coeff array by 1 to evaluate from C1 onwards. */ - svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1); - p = svmad_x (pg, p, r2, dat->poly[0]); - p = svmla_x (pg, r, r2, svmul_x (pg, p, r)); + svfloat64_t C_24 = svld1rq (svptrue_b64 (), &dat->c2); + svfloat64_t C_68 = svld1rq (svptrue_b64 (), &dat->c6); + + /* Use offset version coeff array by 1 to evaluate from C1 onwards. 
*/ + svfloat64_t p01 = svmla_lane (sv_f64 (dat->poly_1357[0]), r2, C_24, 0); + svfloat64_t p23 = svmla_lane_f64 (sv_f64 (dat->poly_1357[1]), r2, C_24, 1); + svfloat64_t p03 = svmla_x (pg, p01, p23, r4); + + svfloat64_t p45 = svmla_lane (sv_f64 (dat->poly_1357[2]), r2, C_68, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (dat->poly_1357[3]), r2, C_68, 1); + svfloat64_t p47 = svmla_x (pg, p45, p67, r4); + + svfloat64_t p = svmla_x (pg, p03, p47, r8); + + svfloat64_t z = svmul_x (svptrue_b64 (), p, r); + z = svmul_x (svptrue_b64 (), r2, z); + z = svmla_lane (z, r, half_pi_c0, 0); + p = svmla_x (pg, r, r2, z); /* Recombination uses double-angle formula: tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) and reciprocity around pi/2: tan(x) = 1 / (tan(pi/2 - x)) to assemble result using change-of-sign and conditional selection of - numerator/denominator dependent on odd/even-ness of q (hence quadrant). */ - svbool_t use_recip - = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0); + numerator/denominator dependent on odd/even-ness of q (quadrant). */ + + /* Invert condition to catch NaNs and Infs as well as large values. 
*/ + svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val)); + + if (__glibc_unlikely (svptest_any (pg, special))) + { + return special_case (x, p, q, pg, special); + } + svbool_t use_recip = svcmpeq ( + pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); svfloat64_t n = svmad_x (pg, p, p, -1); - svfloat64_t d = svmul_x (pg, p, 2); + svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); svfloat64_t swap = n; n = svneg_m (n, use_recip, d); d = svsel (use_recip, swap, d); - if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special); return svdiv_x (pg, n, d); } diff --git a/sysdeps/aarch64/fpu/tanf_advsimd.c b/sysdeps/aarch64/fpu/tanf_advsimd.c index 705586f..a8adf16 100644 --- a/sysdeps/aarch64/fpu/tanf_advsimd.c +++ b/sysdeps/aarch64/fpu/tanf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) tan function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/tanf_sve.c b/sysdeps/aarch64/fpu/tanf_sve.c index f342583..8bd5440 100644 --- a/sysdeps/aarch64/fpu/tanf_sve.c +++ b/sysdeps/aarch64/fpu/tanf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) tan function - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -60,21 +60,16 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - /* Determine whether input is too large to perform fast regression. */ - svbool_t cmp = svacge (pg, x, d->range_val); - svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1); svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1); /* n = rint(x/(pi/2)). 
*/ - svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3); - svfloat32_t n = svsub_x (pg, q, d->shift); + svfloat32_t n = svrintn_x (pg, svmul_lane (x, pi_vals, 3)); /* n is already a signed integer, simply convert it. */ svint32_t in = svcvt_s32_x (pg, n); /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ svint32_t alt = svand_x (pg, in, 1); svbool_t pred_alt = svcmpne (pg, alt, 0); - /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */ svfloat32_t r; r = svmls_lane (x, n, pi_vals, 0); @@ -93,7 +88,7 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4], using Estrin on z^2. */ - svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z2 = svmul_x (svptrue_b32 (), r, r); svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); @@ -106,13 +101,14 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2)); - /* Transform result back, if necessary. */ - svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); - /* No need to pass pg to specialcase here since cmp is a strict subset, guaranteed by the cmpge above. */ + + /* Determine whether input is too large to perform fast regression. 
*/ + svbool_t cmp = svacge (pg, x, d->range_val); if (__glibc_unlikely (svptest_any (pg, cmp))) - return special_case (x, svsel (pred_alt, inv_y, y), cmp); + return special_case (x, svdivr_x (pg, y, 1.0f), cmp); + svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); return svsel (pred_alt, inv_y, y); } diff --git a/sysdeps/aarch64/fpu/tanh_advsimd.c b/sysdeps/aarch64/fpu/tanh_advsimd.c index 1da1dfa..40acefd 100644 --- a/sysdeps/aarch64/fpu/tanh_advsimd.c +++ b/sysdeps/aarch64/fpu/tanh_advsimd.c @@ -1,6 +1,6 @@ /* Double-precision vector (Advanced SIMD) tanh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,68 +18,30 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" +#include "v_expm1_inline.h" static const struct data { - float64x2_t poly[11]; - float64x2_t inv_ln2, ln2_hi, ln2_lo, shift; - uint64x2_t onef; + struct v_expm1_data d; uint64x2_t thresh, tiny_bound; } data = { - /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ - .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), - V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), - V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), - V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), - V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), }, - - .inv_ln2 = V2 (0x1.71547652b82fep0), - .ln2_hi = V2 (-0x1.62e42fefa39efp-1), - .ln2_lo = V2 (-0x1.abc9e3b39803fp-56), - .shift = V2 (0x1.8p52), - - .onef = V2 (0x3ff0000000000000), + .d = V_EXPM1_DATA, .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */ /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ .thresh = V2 (0x01f241bf835f9d5f), }; -static inline float64x2_t -expm1_inline (float64x2_t x, const struct data *d) -{ - /* Helper routine for calculating exp(x) - 1. 
Vector port of the helper from - the scalar variant of tanh. */ - - /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ - float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift); - int64x2_t i = vcvtq_s64_f64 (j); - float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi); - f = vfmaq_f64 (f, j, d->ln2_lo); - - /* Approximate expm1(f) using polynomial. */ - float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t f4 = vmulq_f64 (f2, f2); - float64x2_t p = vfmaq_f64 ( - f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly)); - - /* t = 2 ^ i. */ - float64x2_t t = vreinterpretq_f64_u64 ( - vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef)); - /* expm1(x) = p * t + (t - 1). */ - return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t); -} - static float64x2_t NOINLINE VPCS_ATTR -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +special_case (float64x2_t x, float64x2_t q, float64x2_t qp2, + uint64x2_t special) { - return v_call_f64 (tanh, x, y, special); + return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special); } /* Vector approximation for double-precision tanh(x), using a simplified - version of expm1. The greatest observed error is 2.77 ULP: - _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 - want -0x1.bd6a21a163624p-3. */ + version of expm1. The greatest observed error is 2.70 ULP: + _ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3 + want -0x1.be5452a6459fbp-3. */ float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -100,10 +62,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x) u = vaddq_f64 (u, u); /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
*/ - float64x2_t q = expm1_inline (u, d); - float64x2_t qp2 = vaddq_f64 (q, v_f64 (2)); + float64x2_t q = expm1_inline (u, &d->d); + float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0)); if (__glibc_unlikely (v_any_u64 (special))) - return special_case (x, vdivq_f64 (q, qp2), special); + return special_case (x, q, qp2, special); return vdivq_f64 (q, qp2); } diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c index d25e011..789cc68 100644 --- a/sysdeps/aarch64/fpu/tanh_sve.c +++ b/sysdeps/aarch64/fpu/tanh_sve.c @@ -1,6 +1,6 @@ /* Double-precision vector (SVE) tanh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c index 50defd6..f61a456 100644 --- a/sysdeps/aarch64/fpu/tanhf_advsimd.c +++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c @@ -1,6 +1,6 @@ /* Single-precision vector (Advanced SIMD) tanh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -28,13 +28,16 @@ static const struct data /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). 
*/ .boring_bound = V4 (0x41102cb3), .large_bound = V4 (0x7f800000), - .onef = V4 (0x3f800000), }; static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring, + float32x4_t q, uint32x4_t special) { - return v_call_f32 (tanhf, x, y, special); + return v_call_f32 ( + tanhf, x, + vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))), + special); } /* Approximation for single-precision vector tanh(x), using a simplified @@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x) uint32x4_t iax = vreinterpretq_u32_f32 (ax); uint32x4_t sign = veorq_u32 (ix, iax); uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound); - float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef)); + /* expm1 exponent bias is 1.0f reinterpreted to int. */ + float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 ( + sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias))); #if WANT_SIMD_EXCEPT /* If fp exceptions are to be triggered properly, set all special and boring @@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x) /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
*/ float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts); - float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0))); + if (__glibc_unlikely (v_any_u32 (special))) - return special_case (vreinterpretq_f32_u32 (ix), - vbslq_f32 (is_boring, boring, y), special); + return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q, + special); + + float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0))); return vbslq_f32 (is_boring, boring, y); } libmvec_hidden_def (V_NAME_F1 (tanh)) diff --git a/sysdeps/aarch64/fpu/tanhf_sve.c b/sysdeps/aarch64/fpu/tanhf_sve.c index 0b94523..e12f86d 100644 --- a/sysdeps/aarch64/fpu/tanhf_sve.c +++ b/sysdeps/aarch64/fpu/tanhf_sve.c @@ -1,6 +1,6 @@ /* Single-precision vector (SVE) tanh function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,20 +19,27 @@ #include "sv_expm1f_inline.h" +/* Largest value of x for which tanhf(x) rounds to 1 (or -1 for negative). */ +#define BoringBound 0x1.205966p+3f + static const struct data { struct sv_expm1f_data expm1f_consts; - uint32_t boring_bound, onef; + uint32_t onef, special_bound; + float boring_bound; } data = { .expm1f_consts = SV_EXPM1F_DATA, - /* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). 
*/ - .boring_bound = 0x41102cb3, .onef = 0x3f800000, + .special_bound = 0x7f800000, + .boring_bound = BoringBound, }; static svfloat32_t NOINLINE -special_case (svfloat32_t x, svfloat32_t y, svbool_t special) +special_case (svfloat32_t x, svbool_t pg, svbool_t is_boring, + svfloat32_t boring, svfloat32_t q, svbool_t special) { + svfloat32_t y + = svsel_f32 (is_boring, boring, svdiv_x (pg, q, svadd_x (pg, q, 2.0))); return sv_call_f32 (tanhf, x, y, special); } @@ -47,15 +54,16 @@ svfloat32_t SV_NAME_F1 (tanh) (svfloat32_t x, const svbool_t pg) svfloat32_t ax = svabs_x (pg, x); svuint32_t iax = svreinterpret_u32 (ax); svuint32_t sign = sveor_x (pg, svreinterpret_u32 (x), iax); - svbool_t is_boring = svcmpgt (pg, iax, d->boring_bound); svfloat32_t boring = svreinterpret_f32 (svorr_x (pg, sign, d->onef)); - - svbool_t special = svcmpgt (pg, iax, 0x7f800000); + svbool_t special = svcmpgt (pg, iax, d->special_bound); + svbool_t is_boring = svacgt (pg, x, d->boring_bound); /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ - svfloat32_t q = expm1f_inline (svmul_x (pg, x, 2.0), pg, &d->expm1f_consts); - svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0)); + svfloat32_t q = expm1f_inline (svmul_x (svptrue_b32 (), x, 2.0), pg, + &d->expm1f_consts); + if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svsel_f32 (is_boring, boring, y), special); + return special_case (x, pg, is_boring, boring, q, special); + svfloat32_t y = svdiv_x (pg, q, svadd_x (pg, q, 2.0)); return svsel_f32 (is_boring, boring, y); } diff --git a/sysdeps/aarch64/fpu/tanpi_advsimd.c b/sysdeps/aarch64/fpu/tanpi_advsimd.c new file mode 100644 index 0000000..0a93bee --- /dev/null +++ b/sysdeps/aarch64/fpu/tanpi_advsimd.c @@ -0,0 +1,88 @@ +/* Double-precision (Advanced SIMD) tanpi function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +const static struct v_tanpi_data +{ + float64x2_t c0, c2, c4, c6, c8, c10, c12; + double c1, c3, c5, c7, c9, c11, c13, c14; +} tanpi_data = { + /* Coefficients for tan(pi * x) computed with fpminimax + on [ 0x1p-1022 0x1p-2 ] + approx rel error: 0x1.7eap-55 + approx abs error: 0x1.7eap-55. */ + .c0 = V2 (0x1.921fb54442d18p1), /* pi. */ + .c1 = 0x1.4abbce625be52p3, .c2 = V2 (0x1.466bc6775b0f9p5), + .c3 = 0x1.45fff9b426f5ep7, .c4 = V2 (0x1.45f4730dbca5cp9), + .c5 = 0x1.45f3265994f85p11, .c6 = V2 (0x1.45f4234b330cap13), + .c7 = 0x1.45dca11be79ebp15, .c8 = V2 (0x1.47283fc5eea69p17), + .c9 = 0x1.3a6d958cdefaep19, .c10 = V2 (0x1.927896baee627p21), + .c11 = -0x1.89333f6acd922p19, .c12 = V2 (0x1.5d4e912bb8456p27), + .c13 = -0x1.a854d53ab6874p29, .c14 = 0x1.1b76de7681424p32, +}; + +/* Approximation for double-precision vector tanpi(x) + The maximum error is 3.06 ULP: + _ZGVnN2v_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3 + want -0x1.fa30112702c95p+3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (tanpi) (float64x2_t x) +{ + const struct v_tanpi_data *d = ptr_barrier (&tanpi_data); + + float64x2_t n = vrndnq_f64 (x); + + /* inf produces nan that propagates. 
*/ + float64x2_t xr = vsubq_f64 (x, n); + float64x2_t ar = vabdq_f64 (x, n); + uint64x2_t flip = vcgtq_f64 (ar, v_f64 (0.25)); + float64x2_t r = vbslq_f64 (flip, vsubq_f64 (v_f64 (0.5), ar), ar); + + /* Order-14 pairwise Horner. */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t r4 = vmulq_f64 (r2, r2); + + float64x2_t c_1_3 = vld1q_f64 (&d->c1); + float64x2_t c_5_7 = vld1q_f64 (&d->c5); + float64x2_t c_9_11 = vld1q_f64 (&d->c9); + float64x2_t c_13_14 = vld1q_f64 (&d->c13); + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, r2, c_1_3, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, r2, c_1_3, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, r2, c_5_7, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, r2, c_5_7, 1); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, r2, c_9_11, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, r2, c_9_11, 1); + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, r2, c_13_14, 0); + + float64x2_t p = vfmaq_laneq_f64 (p1213, r4, c_13_14, 1); + p = vfmaq_f64 (p1011, r4, p); + p = vfmaq_f64 (p89, r4, p); + p = vfmaq_f64 (p67, r4, p); + p = vfmaq_f64 (p45, r4, p); + p = vfmaq_f64 (p23, r4, p); + p = vfmaq_f64 (p01, r4, p); + p = vmulq_f64 (r, p); + + float64x2_t p_recip = vdivq_f64 (v_f64 (1.0), p); + float64x2_t y = vbslq_f64 (flip, p_recip, p); + + uint64x2_t sign + = veorq_u64 (vreinterpretq_u64_f64 (xr), vreinterpretq_u64_f64 (ar)); + return vreinterpretq_f64_u64 (vorrq_u64 (vreinterpretq_u64_f64 (y), sign)); +} diff --git a/sysdeps/aarch64/fpu/tanpi_sve.c b/sysdeps/aarch64/fpu/tanpi_sve.c new file mode 100644 index 0000000..57c643a --- /dev/null +++ b/sysdeps/aarch64/fpu/tanpi_sve.c @@ -0,0 +1,88 @@ +/* Double-precision (SVE) tanpi function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +const static struct v_tanpi_data +{ + double c0, c2, c4, c6, c8, c10, c12; + double c1, c3, c5, c7, c9, c11, c13, c14; +} tanpi_data = { + /* Coefficients for tan(pi * x) computed with fpminimax + on [ 0x1p-1022 0x1p-2 ] + approx rel error: 0x1.7eap-55 + approx abs error: 0x1.7eap-55. */ + .c0 = 0x1.921fb54442d18p1, /* pi. */ + .c1 = 0x1.4abbce625be52p3, .c2 = 0x1.466bc6775b0f9p5, + .c3 = 0x1.45fff9b426f5ep7, .c4 = 0x1.45f4730dbca5cp9, + .c5 = 0x1.45f3265994f85p11, .c6 = 0x1.45f4234b330cap13, + .c7 = 0x1.45dca11be79ebp15, .c8 = 0x1.47283fc5eea69p17, + .c9 = 0x1.3a6d958cdefaep19, .c10 = 0x1.927896baee627p21, + .c11 = -0x1.89333f6acd922p19, .c12 = 0x1.5d4e912bb8456p27, + .c13 = -0x1.a854d53ab6874p29, .c14 = 0x1.1b76de7681424p32, +}; + +/* Approximation for double-precision vector tanpi(x) + The maximum error is 3.06 ULP: + _ZGVsMxv_tanpi(0x1.0a4a07dfcca3ep-1) got -0x1.fa30112702c98p+3 + want -0x1.fa30112702c95p+3. */ +svfloat64_t SV_NAME_D1 (tanpi) (svfloat64_t x, const svbool_t pg) +{ + const struct v_tanpi_data *d = ptr_barrier (&tanpi_data); + + svfloat64_t n = svrintn_x (pg, x); + + /* inf produces nan that propagates. 
*/ + svfloat64_t xr = svsub_x (pg, x, n); + svfloat64_t ar = svabd_x (pg, x, n); + svbool_t flip = svcmpgt (pg, ar, 0.25); + svfloat64_t r = svsel (flip, svsubr_x (pg, ar, 0.5), ar); + + /* Order-14 pairwise Horner. */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t r4 = svmul_x (pg, r2, r2); + + svfloat64_t c_1_3 = svld1rq (pg, &d->c1); + svfloat64_t c_5_7 = svld1rq (pg, &d->c5); + svfloat64_t c_9_11 = svld1rq (pg, &d->c9); + svfloat64_t c_13_14 = svld1rq (pg, &d->c13); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r2, c_1_3, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r2, c_1_3, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r2, c_5_7, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), r2, c_5_7, 1); + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), r2, c_9_11, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), r2, c_9_11, 1); + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), r2, c_13_14, 0); + + svfloat64_t p = svmla_lane (p1213, r4, c_13_14, 1); + p = svmad_x (pg, p, r4, p1011); + p = svmad_x (pg, p, r4, p89); + p = svmad_x (pg, p, r4, p67); + p = svmad_x (pg, p, r4, p45); + p = svmad_x (pg, p, r4, p23); + p = svmad_x (pg, p, r4, p01); + p = svmul_x (pg, r, p); + + svfloat64_t p_recip = svdivr_x (pg, p, 1.0); + svfloat64_t y = svsel (flip, p_recip, p); + + svuint64_t sign + = sveor_x (pg, svreinterpret_u64 (xr), svreinterpret_u64 (ar)); + return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); +} diff --git a/sysdeps/aarch64/fpu/tanpif_advsimd.c b/sysdeps/aarch64/fpu/tanpif_advsimd.c new file mode 100644 index 0000000..248cb0f --- /dev/null +++ b/sysdeps/aarch64/fpu/tanpif_advsimd.c @@ -0,0 +1,72 @@ +/* Single-precision (Advanced SIMD) tanpi function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +const static struct v_tanpif_data +{ + float32x4_t c0, c2, c4, c6; + float c1, c3, c5, c7; +} tanpif_data = { + /* Coefficients for tan(pi * x). */ + .c0 = V4 (0x1.921fb4p1f), .c1 = 0x1.4abbcep3f, .c2 = V4 (0x1.466b8p5f), + .c3 = 0x1.461c72p7f, .c4 = V4 (0x1.42e9d4p9f), .c5 = 0x1.69e2c4p11f, + .c6 = V4 (0x1.e85558p11f), .c7 = 0x1.a52e08p16f, +}; + +/* Approximation for single-precision vector tanpi(x) + The maximum error is 3.34 ULP: + _ZGVnN4v_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2 + want 0x1.f70aa6p+2. */ +float32x4_t VPCS_ATTR V_NAME_F1 (tanpi) (float32x4_t x) +{ + const struct v_tanpif_data *d = ptr_barrier (&tanpif_data); + + float32x4_t n = vrndnq_f32 (x); + + /* inf produces nan that propagates. */ + float32x4_t xr = vsubq_f32 (x, n); + float32x4_t ar = vabdq_f32 (x, n); + uint32x4_t flip = vcgtq_f32 (ar, v_f32 (0.25f)); + float32x4_t r = vbslq_f32 (flip, vsubq_f32 (v_f32 (0.5f), ar), ar); + + /* Order-7 pairwise Horner polynomial evaluation scheme. 
*/ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t r4 = vmulq_f32 (r2, r2); + + float32x4_t odd_coeffs = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, r2, odd_coeffs, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, r2, odd_coeffs, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, r2, odd_coeffs, 2); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, r2, odd_coeffs, 3); + float32x4_t p = vfmaq_f32 (p45, r4, p67); + p = vfmaq_f32 (p23, r4, p); + p = vfmaq_f32 (p01, r4, p); + + p = vmulq_f32 (r, p); + float32x4_t p_recip = vdivq_f32 (v_f32 (1.0f), p); + float32x4_t y = vbslq_f32 (flip, p_recip, p); + + uint32x4_t sign + = veorq_u32 (vreinterpretq_u32_f32 (xr), vreinterpretq_u32_f32 (ar)); + return vreinterpretq_f32_u32 (vorrq_u32 (vreinterpretq_u32_f32 (y), sign)); +} + +libmvec_hidden_def (V_NAME_F1 (tanpi)) +HALF_WIDTH_ALIAS_F1 (tanpi) diff --git a/sysdeps/aarch64/fpu/tanpif_sve.c b/sysdeps/aarch64/fpu/tanpif_sve.c new file mode 100644 index 0000000..0285f56 --- /dev/null +++ b/sysdeps/aarch64/fpu/tanpif_sve.c @@ -0,0 +1,68 @@ +/* Single-precision (SVE) tanpi function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "sv_math.h" + +const static struct v_tanpif_data +{ + float c0, c2, c4, c6; + float c1, c3, c5, c7; +} tanpif_data = { + /* Coefficients for tan(pi * x). */ + .c0 = 0x1.921fb4p1f, .c1 = 0x1.4abbcep3f, .c2 = 0x1.466b8p5f, + .c3 = 0x1.461c72p7f, .c4 = 0x1.42e9d4p9f, .c5 = 0x1.69e2c4p11f, + .c6 = 0x1.e85558p11f, .c7 = 0x1.a52e08p16f, +}; + +/* Approximation for single-precision vector tanpif(x) + The maximum error is 3.34 ULP: + _ZGVsMxv_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2 + want 0x1.f70aa6p+2. */ +svfloat32_t SV_NAME_F1 (tanpi) (svfloat32_t x, const svbool_t pg) +{ + const struct v_tanpif_data *d = ptr_barrier (&tanpif_data); + svfloat32_t odd_coeffs = svld1rq (pg, &d->c1); + svfloat32_t n = svrintn_x (pg, x); + + /* inf produces nan that propagates. */ + svfloat32_t xr = svsub_x (pg, x, n); + svfloat32_t ar = svabd_x (pg, x, n); + svbool_t flip = svcmpgt (pg, ar, 0.25f); + svfloat32_t r = svsel (flip, svsub_x (pg, sv_f32 (0.5f), ar), ar); + + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t r4 = svmul_x (pg, r2, r2); + + /* Order-7 Pairwise Horner. 
*/ + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), r2, odd_coeffs, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), r2, odd_coeffs, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), r2, odd_coeffs, 2); + svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), r2, odd_coeffs, 3); + svfloat32_t p = svmad_x (pg, p67, r4, p45); + p = svmad_x (pg, p, r4, p23); + p = svmad_x (pg, p, r4, p01); + svfloat32_t poly = svmul_x (pg, r, p); + + svfloat32_t poly_recip = svdiv_x (pg, sv_f32 (1.0), poly); + svfloat32_t y = svsel (flip, poly_recip, poly); + + svuint32_t sign + = sveor_x (pg, svreinterpret_u32 (xr), svreinterpret_u32 (ar)); + return svreinterpret_f32 (svorr_x (pg, svreinterpret_u32 (y), sign)); +} diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c index 8c98161..07133eb 100644 --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c @@ -1,6 +1,6 @@ /* Scalar wrappers for double-precision Advanced SIMD vector math functions. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -33,6 +33,7 @@ VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2) VPCS_VECTOR_WRAPPER (cbrt_advsimd, _ZGVnN2v_cbrt) VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos) VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh) +VPCS_VECTOR_WRAPPER (cospi_advsimd, _ZGVnN2v_cospi) VPCS_VECTOR_WRAPPER (erf_advsimd, _ZGVnN2v_erf) VPCS_VECTOR_WRAPPER (erfc_advsimd, _ZGVnN2v_erfc) VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp) @@ -47,5 +48,7 @@ VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2) VPCS_VECTOR_WRAPPER_ff (pow_advsimd, _ZGVnN2vv_pow) VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin) VPCS_VECTOR_WRAPPER (sinh_advsimd, _ZGVnN2v_sinh) +VPCS_VECTOR_WRAPPER (sinpi_advsimd, _ZGVnN2v_sinpi) VPCS_VECTOR_WRAPPER (tan_advsimd, _ZGVnN2v_tan) VPCS_VECTOR_WRAPPER (tanh_advsimd, _ZGVnN2v_tanh) +VPCS_VECTOR_WRAPPER (tanpi_advsimd, _ZGVnN2v_tanpi) diff --git a/sysdeps/aarch64/fpu/test-double-advsimd.h b/sysdeps/aarch64/fpu/test-double-advsimd.h index ae2764e..f2ae992 100644 --- a/sysdeps/aarch64/fpu/test-double-advsimd.h +++ b/sysdeps/aarch64/fpu/test-double-advsimd.h @@ -1,6 +1,6 @@ /* Test declarations for double-precision Advanced SIMD vector math functions. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c index 2583428..02953cb 100644 --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c +++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c @@ -1,6 +1,6 @@ /* Scalar wrappers for double-precision SVE vector math functions. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -52,6 +52,7 @@ SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2) SVE_VECTOR_WRAPPER (cbrt_sve, _ZGVsMxv_cbrt) SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos) SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh) +SVE_VECTOR_WRAPPER (cospi_sve, _ZGVsMxv_cospi) SVE_VECTOR_WRAPPER (erf_sve, _ZGVsMxv_erf) SVE_VECTOR_WRAPPER (erfc_sve, _ZGVsMxv_erfc) SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp) @@ -66,5 +67,7 @@ SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2) SVE_VECTOR_WRAPPER_ff (pow_sve, _ZGVsMxvv_pow) SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin) SVE_VECTOR_WRAPPER (sinh_sve, _ZGVsMxv_sinh) +SVE_VECTOR_WRAPPER (sinpi_sve, _ZGVsMxv_sinpi) SVE_VECTOR_WRAPPER (tan_sve, _ZGVsMxv_tan) SVE_VECTOR_WRAPPER (tanh_sve, _ZGVsMxv_tanh) +SVE_VECTOR_WRAPPER (tanpi_sve, _ZGVsMxv_tanpi) diff --git a/sysdeps/aarch64/fpu/test-double-sve.h b/sysdeps/aarch64/fpu/test-double-sve.h index 9738967..0219a6a 100644 --- a/sysdeps/aarch64/fpu/test-double-sve.h +++ b/sysdeps/aarch64/fpu/test-double-sve.h @@ -1,6 +1,6 @@ /* Test declarations for double-precision SVE vector math functions. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c index 2667901..118bbb0 100644 --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c @@ -1,6 +1,6 @@ /* Scalar wrappers for single-precision Advanced SIMD vector math functions. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -33,6 +33,7 @@ VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f) VPCS_VECTOR_WRAPPER (cbrtf_advsimd, _ZGVnN4v_cbrtf) VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf) VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf) +VPCS_VECTOR_WRAPPER (cospif_advsimd, _ZGVnN4v_cospif) VPCS_VECTOR_WRAPPER (erff_advsimd, _ZGVnN4v_erff) VPCS_VECTOR_WRAPPER (erfcf_advsimd, _ZGVnN4v_erfcf) VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf) @@ -47,5 +48,7 @@ VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f) VPCS_VECTOR_WRAPPER_ff (powf_advsimd, _ZGVnN4vv_powf) VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf) VPCS_VECTOR_WRAPPER (sinhf_advsimd, _ZGVnN4v_sinhf) +VPCS_VECTOR_WRAPPER (sinpif_advsimd, _ZGVnN4v_sinpif) VPCS_VECTOR_WRAPPER (tanf_advsimd, _ZGVnN4v_tanf) VPCS_VECTOR_WRAPPER (tanhf_advsimd, _ZGVnN4v_tanhf) +VPCS_VECTOR_WRAPPER (tanpif_advsimd, _ZGVnN4v_tanpif) diff --git a/sysdeps/aarch64/fpu/test-float-advsimd.h b/sysdeps/aarch64/fpu/test-float-advsimd.h index e5502bc..11c057d 100644 --- a/sysdeps/aarch64/fpu/test-float-advsimd.h +++ b/sysdeps/aarch64/fpu/test-float-advsimd.h @@ -1,6 +1,6 @@ /* Test declarations for singlex-precision Advanced SIMD vector math functions. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c index 0f972b7..f5e7c8c 100644 --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c @@ -1,6 +1,6 @@ /* Scalar wrappers for single-precision SVE vector math functions. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -52,6 +52,7 @@ SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f) SVE_VECTOR_WRAPPER (cbrtf_sve, _ZGVsMxv_cbrtf) SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf) SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf) +SVE_VECTOR_WRAPPER (cospif_sve, _ZGVsMxv_cospif) SVE_VECTOR_WRAPPER (erff_sve, _ZGVsMxv_erff) SVE_VECTOR_WRAPPER (erfcf_sve, _ZGVsMxv_erfcf) SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf) @@ -66,5 +67,7 @@ SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f) SVE_VECTOR_WRAPPER_ff (powf_sve, _ZGVsMxvv_powf) SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf) SVE_VECTOR_WRAPPER (sinhf_sve, _ZGVsMxv_sinhf) +SVE_VECTOR_WRAPPER (sinpif_sve, _ZGVsMxv_sinpif) SVE_VECTOR_WRAPPER (tanf_sve, _ZGVsMxv_tanf) SVE_VECTOR_WRAPPER (tanhf_sve, _ZGVsMxv_tanhf) +SVE_VECTOR_WRAPPER (tanpif_sve, _ZGVsMxv_tanpif) diff --git a/sysdeps/aarch64/fpu/test-float-sve.h b/sysdeps/aarch64/fpu/test-float-sve.h index 058af48..368a1dd 100644 --- a/sysdeps/aarch64/fpu/test-float-sve.h +++ b/sysdeps/aarch64/fpu/test-float-sve.h @@ -1,6 +1,6 @@ /* Test declarations for single-precision SVE vector math functions. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/test-vpcs-vector-wrapper.h b/sysdeps/aarch64/fpu/test-vpcs-vector-wrapper.h index 421e500..75c9619 100644 --- a/sysdeps/aarch64/fpu/test-vpcs-vector-wrapper.h +++ b/sysdeps/aarch64/fpu/test-vpcs-vector-wrapper.h @@ -1,6 +1,6 @@ /* Scalar wrapper for vpcs-enabled Advanced SIMD vector math functions. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/v_exp_data.c b/sysdeps/aarch64/fpu/v_exp_data.c index 6bc54e3..d9fc7b5 100644 --- a/sysdeps/aarch64/fpu/v_exp_data.c +++ b/sysdeps/aarch64/fpu/v_exp_data.c @@ -1,6 +1,6 @@ /* Scale values for vector exp and exp2 - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/v_exp_tail_data.c b/sysdeps/aarch64/fpu/v_exp_tail_data.c index 151e97c..ffd21b0 100644 --- a/sysdeps/aarch64/fpu/v_exp_tail_data.c +++ b/sysdeps/aarch64/fpu/v_exp_tail_data.c @@ -1,6 +1,6 @@ /* Lookup table for high-precision exp(x, tail) function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h index 08b06e0..3d5cba8 100644 --- a/sysdeps/aarch64/fpu/v_expf_inline.h +++ b/sysdeps/aarch64/fpu/v_expf_inline.h @@ -1,6 +1,6 @@ /* Helper for single-precision AdvSIMD routines which depend on exp - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -24,50 +24,45 @@ struct v_expf_data { - float32x4_t poly[5]; - float32x4_t shift; - float invln2_and_ln2[4]; + float ln2_hi, ln2_lo, c0, c2; + float32x4_t inv_ln2, c1, c3, c4; + /* asuint(1.0f). */ + uint32x4_t exponent_bias; }; /* maxerr: 1.45358 +0.5 ulp. 
*/ #define V_EXPF_DATA \ { \ - .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \ - V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \ - .shift = V4 (0x1.8p23f), \ - .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ + .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \ + .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \ + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ + .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \ } -#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */ -#define C(i) d->poly[i] - static inline float32x4_t v_expf_inline (float32x4_t x, const struct v_expf_data *d) { - /* Helper routine for calculating exp(x). + /* Helper routine for calculating exp(ax). Copied from v_expf.c, with all special-case handling removed - the calling routine should handle special values if required. */ - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - float32x4_t n, r, z; - float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); - z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0); - n = vsubq_f32 (z, d->shift); - r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1); - r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2); - uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); - float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); + /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + ax = ln2*n + r, with r in [-ln2/2, ln2/2]. 
*/ + float32x4_t ax = vabsq_f32 (x); + float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); + float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2)); + float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0); + r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); /* Custom order-4 Estrin avoids building high order monomial. */ float32x4_t r2 = vmulq_f32 (r, r); - float32x4_t p, q, poly; - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); + p = vmulq_f32 (d->c4, r); + float32x4_t poly = vfmaq_f32 (p, q, r2); return vfmaq_f32 (scale, poly, scale); } - #endif diff --git a/sysdeps/aarch64/fpu/v_expm1_inline.h b/sysdeps/aarch64/fpu/v_expm1_inline.h new file mode 100644 index 0000000..46f6092 --- /dev/null +++ b/sysdeps/aarch64/fpu/v_expm1_inline.h @@ -0,0 +1,97 @@ +/* Double-precision inline helper for vector (Advanced SIMD) expm1 function + + Copyright (C) 2024-2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#ifndef AARCH64_FPU_V_EXPM1_INLINE_H +#define AARCH64_FPU_V_EXPM1_INLINE_H + +#include "v_math.h" + +struct v_expm1_data +{ + float64x2_t c2, c4, c6, c8; + float64x2_t invln2; + int64x2_t exponent_bias; + double c1, c3, c5, c7, c9, c10; + double ln2[2]; +}; + +/* Generated using fpminimax, with degree=12 in [-log(2)/2, log(2)/2]. */ +#define V_EXPM1_DATA \ + { \ + .c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \ + .c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \ + .c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \ + .c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \ + .c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \ + .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \ + .invln2 = V2 (0x1.71547652b82fep0), \ + .exponent_bias = V2 (0x3ff0000000000000), \ + } + +static inline float64x2_t +expm1_inline (float64x2_t x, const struct v_expm1_data *d) +{ + /* Helper routine for calculating exp(x) - 1. */ + + float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2)); + int64x2_t i = vcvtq_s64_f64 (n); + float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0); + f = vfmsq_laneq_f64 (f, n, ln2, 1); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). 
*/ + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t f4 = vmulq_f64 (f2, f2); + float64x2_t lane_consts_13 = vld1q_f64 (&d->c1); + float64x2_t lane_consts_57 = vld1q_f64 (&d->c5); + float64x2_t lane_consts_910 = vld1q_f64 (&d->c9); + float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1); + float64x2_t p03 = vfmaq_f64 (p01, f2, p23); + float64x2_t p47 = vfmaq_f64 (p45, f2, p67); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0); + float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1); + p = vfmaq_f64 (p47, f4, p); + p = vfmaq_f64 (p03, f4, p); + + p = vfmaq_f64 (f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias); + float64x2_t t = vreinterpretq_f64_s64 (u); + + /* expm1(x) ~= p * t + (t - 1). */ + return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); +} + +#endif diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h index 59b552d..bfad1f0 100644 --- a/sysdeps/aarch64/fpu/v_expm1f_inline.h +++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h @@ -1,6 +1,6 @@ /* Single-precision inline helper for vector (Advanced SIMD) expm1 function - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -21,48 +21,46 @@ #define AARCH64_FPU_V_EXPM1F_INLINE_H #include "v_math.h" -#include "poly_advsimd_f32.h" struct v_expm1f_data { - float32x4_t poly[5]; - float invln2_and_ln2[4]; - float32x4_t shift; + float32x4_t c0, c2; int32x4_t exponent_bias; + float c1, c3, inv_ln2, c4; + float ln2_hi, ln2_lo; }; /* Coefficients generated using fpminimax with degree=5 in [-log(2)/2, - log(2)/2]. Exponent bias is asuint(1.0f). - invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */ + log(2)/2]. Exponent bias is asuint(1.0f). */ #define V_EXPM1F_DATA \ { \ - .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \ - V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \ - .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \ - .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ + .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \ + .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \ + .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \ + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ } static inline float32x4_t expm1f_inline (float32x4_t x, const struct v_expm1f_data *d) { - /* Helper routine for calculating exp(x) - 1. - Copied from v_expm1f_1u6.c, with all special-case handling removed - the - calling routine should handle special values if required. */ + /* Helper routine for calculating exp(x) - 1. */ + + float32x2_t ln2 = vld1_f32 (&d->ln2_hi); + float32x4_t lane_consts = vld1q_f32 (&d->c1); /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. 
*/ - float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); - float32x4_t j - = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift); + float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2)); int32x4_t i = vcvtq_s32_f32 (j); - float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1); - f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2); + float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0); + f = vfmsq_lane_f32 (f, j, ln2, 1); - /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). - Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses - Horner. */ + /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */ float32x4_t f2 = vmulq_f32 (f, f); float32x4_t f4 = vmulq_f32 (f2, f2); - float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1); + float32x4_t p = vfmaq_f32 (p01, f2, p23); + p = vfmaq_laneq_f32 (p, f4, lane_consts, 3); p = vfmaq_f32 (f, f2, p); /* t = 2^i. */ diff --git a/sysdeps/aarch64/fpu/v_log10_data.c b/sysdeps/aarch64/fpu/v_log10_data.c index 2d6a1d9..becd40b 100644 --- a/sysdeps/aarch64/fpu/v_log10_data.c +++ b/sysdeps/aarch64/fpu/v_log10_data.c @@ -1,6 +1,6 @@ /* Lookup table for double-precision log10(x) vector function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/v_log1p_inline.h b/sysdeps/aarch64/fpu/v_log1p_inline.h index 242e43b..d38f8fb 100644 --- a/sysdeps/aarch64/fpu/v_log1p_inline.h +++ b/sysdeps/aarch64/fpu/v_log1p_inline.h @@ -1,6 +1,6 @@ /* Helper for double-precision Advanced SIMD routines which depend on log1p - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. 
This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,29 +21,30 @@ #define AARCH64_FPU_V_LOG1P_INLINE_H #include "v_math.h" -#include "poly_advsimd_f64.h" struct v_log1p_data { - float64x2_t poly[19], ln2[2]; + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16; uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask; int64x2_t one_top; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; + double ln2[2]; }; /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */ #define V_LOG1P_CONSTANTS_TABLE \ { \ - .poly = { V2 (-0x1.ffffffffffffbp-2), V2 (0x1.55555555551a9p-2), \ - V2 (-0x1.00000000008e3p-2), V2 (0x1.9999999a32797p-3), \ - V2 (-0x1.555555552fecfp-3), V2 (0x1.249248e071e5ap-3), \ - V2 (-0x1.ffffff8bf8482p-4), V2 (0x1.c71c8f07da57ap-4), \ - V2 (-0x1.9999ca4ccb617p-4), V2 (0x1.7459ad2e1dfa3p-4), \ - V2 (-0x1.554d2680a3ff2p-4), V2 (0x1.3b4c54d487455p-4), \ - V2 (-0x1.2548a9ffe80e6p-4), V2 (0x1.0f389a24b2e07p-4), \ - V2 (-0x1.eee4db15db335p-5), V2 (0x1.e95b494d4a5ddp-5), \ - V2 (-0x1.15fdf07cb7c73p-4), V2 (0x1.0310b70800fcfp-4), \ - V2 (-0x1.cfa7385bdb37ep-6) }, \ - .ln2 = { V2 (0x1.62e42fefa3800p-1), V2 (0x1.ef35793c76730p-45) }, \ + .c0 = V2 (-0x1.ffffffffffffbp-2), .c1 = 0x1.55555555551a9p-2, \ + .c2 = V2 (-0x1.00000000008e3p-2), .c3 = 0x1.9999999a32797p-3, \ + .c4 = V2 (-0x1.555555552fecfp-3), .c5 = 0x1.249248e071e5ap-3, \ + .c6 = V2 (-0x1.ffffff8bf8482p-4), .c7 = 0x1.c71c8f07da57ap-4, \ + .c8 = V2 (-0x1.9999ca4ccb617p-4), .c9 = 0x1.7459ad2e1dfa3p-4, \ + .c10 = V2 (-0x1.554d2680a3ff2p-4), .c11 = 0x1.3b4c54d487455p-4, \ + .c12 = V2 (-0x1.2548a9ffe80e6p-4), .c13 = 0x1.0f389a24b2e07p-4, \ + .c14 = V2 (-0x1.eee4db15db335p-5), .c15 = 0x1.e95b494d4a5ddp-5, \ + .c16 = V2 (-0x1.15fdf07cb7c73p-4), .c17 = 0x1.0310b70800fcfp-4, \ + .c18 = -0x1.cfa7385bdb37ep-6, \ + .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, \ .hf_rt2_top = V2 (0x3fe6a09e00000000), \ .one_m_hf_rt2_top = V2 
(0x00095f6200000000), \ .umask = V2 (0x000fffff00000000), .one_top = V2 (0x3ff) \ @@ -52,18 +53,44 @@ struct v_log1p_data #define BottomMask v_u64 (0xffffffff) static inline float64x2_t +eval_poly (float64x2_t m, float64x2_t m2, const struct v_log1p_data *d) +{ + /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1718 = vld1q_f64 (&d->c17); + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, m, c1718, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, m, c1315, 1); + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, m, c1315, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, m, c911, 1); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, m, c911, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, m, c57, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, m, c57, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, m, c13, 1); + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, m, c13, 0); + float64x2_t p = vfmaq_laneq_f64 (p1617, m2, c1718, 1); + p = vfmaq_f64 (p1415, m2, p); + p = vfmaq_f64 (p1213, m2, p); + p = vfmaq_f64 (p1011, m2, p); + p = vfmaq_f64 (p89, m2, p); + p = vfmaq_f64 (p67, m2, p); + p = vfmaq_f64 (p45, m2, p); + p = vfmaq_f64 (p23, m2, p); + return vfmaq_f64 (p01, m2, p); +} + +static inline float64x2_t log1p_inline (float64x2_t x, const struct v_log1p_data *d) { - /* Helper for calculating log(x + 1). Copied from v_log1p_2u5.c, with several - modifications: + /* Helper for calculating log(x + 1): - No special-case handling - this should be dealt with by the caller. - - Pairwise Horner polynomial evaluation for improved accuracy. - Optionally simulate the shortcut for k=0, used in the scalar routine, - using v_sel, for improved accuracy when the argument to log1p is close to - 0. 
This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 in - the source of the caller before including this file. - See v_log1pf_2u1.c for details of the algorithm. */ - float64x2_t m = vaddq_f64 (x, v_f64 (1)); + using v_sel, for improved accuracy when the argument to log1p is close + to 0. This feature is enabled by defining WANT_V_LOG1P_K0_SHORTCUT as 1 + in the source of the caller before including this file. */ + float64x2_t m = vaddq_f64 (x, v_f64 (1.0)); uint64x2_t mi = vreinterpretq_u64_f64 (m); uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top); @@ -74,14 +101,14 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d) /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */ uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top); uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask)); - float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1)); + float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0)); /* Correction term c/m. */ - float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1))), m); + float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m); #ifndef WANT_V_LOG1P_K0_SHORTCUT -#error \ - "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" +# error \ + "Cannot use v_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" #elif WANT_V_LOG1P_K0_SHORTCUT /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is that the approximation is solely the polynomial. */ @@ -92,11 +119,12 @@ log1p_inline (float64x2_t x, const struct v_log1p_data *d) /* Approximate log1p(f) on the reduced input using a polynomial. */ float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t p = v_pw_horner_18_f64 (f, f2, d->poly); + float64x2_t p = eval_poly (f, f2, d); /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. 
*/ - float64x2_t ylo = vfmaq_f64 (cm, k, d->ln2[1]); - float64x2_t yhi = vfmaq_f64 (f, k, d->ln2[0]); + float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); + float64x2_t ylo = vfmaq_laneq_f64 (cm, k, ln2, 1); + float64x2_t yhi = vfmaq_laneq_f64 (f, k, ln2, 0); return vfmaq_f64 (vaddq_f64 (ylo, yhi), f2, p); } diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h index 643a6cd..ad84d5c 100644 --- a/sysdeps/aarch64/fpu/v_log1pf_inline.h +++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h @@ -1,6 +1,6 @@ /* Helper for single-precision Advanced SIMD routines which depend on log1p - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -25,54 +25,81 @@ struct v_log1pf_data { - float32x4_t poly[8], ln2; uint32x4_t four; int32x4_t three_quarters; + float c0, c3, c5, c7; + float32x4_t c4, c6, c1, c2, ln2; }; /* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients (1, -0.5) are not stored as they can be generated more efficiently. 
*/ #define V_LOG1PF_CONSTANTS_TABLE \ { \ - .poly \ - = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \ - V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \ - V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \ - .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \ - .three_quarters = V4 (0x3f400000) \ + .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \ + .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \ + .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \ + .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \ + .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \ + .three_quarters = V4 (0x3f400000) \ } static inline float32x4_t -eval_poly (float32x4_t m, const float32x4_t *c) +eval_poly (float32x4_t m, const struct v_log1pf_data *d) { - /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine - uses split Estrin, but this way reduces register pressure in the calling - routine). */ - float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]); + /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */ + float32x4_t c0357 = vld1q_f32 (&d->c0); + float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0); float32x4_t m2 = vmulq_f32 (m, m); - q = vfmaq_f32 (m, m2, q); - float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1); + float32x4_t p = vfmaq_f32 (p45, m2, p67); + p = vfmaq_f32 (p23, m2, p); + p = vfmaq_f32 (d->c1, m, p); p = vmulq_f32 (m2, p); - return vfmaq_f32 (q, m2, p); + p = vfmaq_f32 (m, m2, p); + return vfmaq_f32 (p, m2, q); } static inline float32x4_t -log1pf_inline (float32x4_t x, const struct v_log1pf_data d) +log1pf_inline (float32x4_t x, const struct v_log1pf_data *d) { - /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no - special-case handling. See that file for details of the algorithm. 
*/ + /* Helper for calculating log(x + 1). */ + + /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m + is in [-0.25, 0.5]): + log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2). + + We approximate log1p(m) with a polynomial, then scale by + k*log(2). Instead of doing this directly, we use an intermediate + scale factor s = 4*k*log(2) to ensure the scale is representable + as a normalised fp32 number. */ float32x4_t m = vaddq_f32 (x, v_f32 (1.0f)); + + /* Choose k to scale x to the range [-1/4, 1/2]. */ int32x4_t k - = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters), + = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters), v_s32 (0xff800000)); uint32x4_t ku = vreinterpretq_u32_s32 (k); - float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku)); + + /* Scale up to ensure that the scale factor is representable as normalised + fp32 number, and scale m down accordingly. */ + float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku)); + + /* Scale x by exponent manipulation. */ float32x4_t m_scale = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku)); m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s)); - float32x4_t p = eval_poly (m_scale, d.poly); + + /* Evaluate polynomial on the reduced interval. */ + float32x4_t p = eval_poly (m_scale, d); + + /* The scale factor to be applied back at the end - by multiplying float(k) + by 2^-23 we get the unbiased exponent of k. */ float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f)); - return vfmaq_f32 (p, scale_back, d.ln2); + + /* Apply the scaling back. 
*/ + return vfmaq_f32 (p, scale_back, d->ln2); } #endif diff --git a/sysdeps/aarch64/fpu/v_log2_data.c b/sysdeps/aarch64/fpu/v_log2_data.c index 41bc6ca..4a61a5b 100644 --- a/sysdeps/aarch64/fpu/v_log2_data.c +++ b/sysdeps/aarch64/fpu/v_log2_data.c @@ -1,6 +1,6 @@ /* Coefficients and table entries for vector log2 - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/v_log_data.c b/sysdeps/aarch64/fpu/v_log_data.c index 510b3e7..8f26cbd 100644 --- a/sysdeps/aarch64/fpu/v_log_data.c +++ b/sysdeps/aarch64/fpu/v_log_data.c @@ -1,6 +1,6 @@ /* Lookup table for double-precision log(x) vector function. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/v_math.h b/sysdeps/aarch64/fpu/v_math.h index 12824fc..35d1b56 100644 --- a/sysdeps/aarch64/fpu/v_math.h +++ b/sysdeps/aarch64/fpu/v_math.h @@ -1,5 +1,5 @@ /* Utilities for Advanced SIMD libmvec routines. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/v_pow_exp_data.c b/sysdeps/aarch64/fpu/v_pow_exp_data.c index 8b7fb83..0f24368 100644 --- a/sysdeps/aarch64/fpu/v_pow_exp_data.c +++ b/sysdeps/aarch64/fpu/v_pow_exp_data.c @@ -1,6 +1,6 @@ /* Shared data between exp, exp2 and pow. - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/v_pow_log_data.c b/sysdeps/aarch64/fpu/v_pow_log_data.c index 0242fff..01bf6f4 100644 --- a/sysdeps/aarch64/fpu/v_pow_log_data.c +++ b/sysdeps/aarch64/fpu/v_pow_log_data.c @@ -1,6 +1,6 @@ /* Data for the log part of pow. - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/v_powf_data.c b/sysdeps/aarch64/fpu/v_powf_data.c index f789b84..b727136 100644 --- a/sysdeps/aarch64/fpu/v_powf_data.c +++ b/sysdeps/aarch64/fpu/v_powf_data.c @@ -1,6 +1,6 @@ /* Coefficients for single-precision SVE pow(x) function. - Copyright (C) 2024 Free Software Foundation, Inc. + Copyright (C) 2024-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h index 7f0a8aa..3af4852 100644 --- a/sysdeps/aarch64/fpu/vecmath_config.h +++ b/sysdeps/aarch64/fpu/vecmath_config.h @@ -1,5 +1,5 @@ /* Configuration for libmvec routines. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -75,49 +75,37 @@ extern const struct v_log10_data } table[1 << V_LOG10_TABLE_BITS]; } __v_log10_data attribute_hidden; -extern const struct erff_data +extern const struct v_erff_data { struct { float erf, scale; } tab[513]; -} __erff_data attribute_hidden; +} __v_erff_data attribute_hidden; -extern const struct sv_erff_data -{ - float erf[513]; - float scale[513]; -} __sv_erff_data attribute_hidden; - -extern const struct erf_data +extern const struct v_erf_data { struct { double erf, scale; } tab[769]; -} __erf_data attribute_hidden; - -extern const struct sv_erf_data -{ - double erf[769]; - double scale[769]; -} __sv_erf_data attribute_hidden; +} __v_erf_data attribute_hidden; -extern const struct erfc_data +extern const struct v_erfc_data { struct { double erfc, scale; } tab[3488]; -} __erfc_data attribute_hidden; +} __v_erfc_data attribute_hidden; -extern const struct erfcf_data +extern const struct v_erfcf_data { struct { float erfc, scale; } tab[645]; -} __erfcf_data attribute_hidden; +} __v_erfcf_data attribute_hidden; /* Some data for AdvSIMD and SVE pow's internal exp and log. */ #define V_POW_EXP_TABLE_BITS 8 diff --git a/sysdeps/aarch64/hp-timing.h b/sysdeps/aarch64/hp-timing.h index 1e6dccd..d0620df 100644 --- a/sysdeps/aarch64/hp-timing.h +++ b/sysdeps/aarch64/hp-timing.h @@ -1,5 +1,5 @@ /* High precision, low overhead timing functions. AArch64 version. - Copyright (C) 2021-2024 Free Software Foundation, Inc. + Copyright (C) 2021-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/jmpbuf-offsets.h b/sysdeps/aarch64/jmpbuf-offsets.h index 632328c..2720526 100644 --- a/sysdeps/aarch64/jmpbuf-offsets.h +++ b/sysdeps/aarch64/jmpbuf-offsets.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2006-2024 Free Software Foundation, Inc. 
+/* Copyright (C) 2006-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -39,6 +39,68 @@ #define JB_D14 20 #define JB_D15 21 +/* The target specific part of jmp_buf has no space for expansion but + the public jmp_buf ABI type has. Unfortunately there is another type + that is used with setjmp APIs and exposed by thread cancellation (in + binaries built with -fno-exceptions) which complicates the situation. + + // Internal layout of the public jmp_buf type on AArch64. + // This is passed to setjmp, longjmp, sigsetjmp, siglongjmp. + struct + { + uint64_t jmpbuf[22]; // Target specific part. + uint32_t mask_was_saved; // savemask bool used by sigsetjmp/siglongjmp. + uint32_t pad; + uint64_t saved_mask; // sigset_t bits used on linux. + uint64_t unused[15]; // sigset_t bits not used on linux. + }; + + // Internal layout of the public __pthread_unwind_buf_t type. + // This is passed to sigsetjmp with !savemask and to the internal + // __libc_longjmp (currently alias of longjmp on AArch64). + struct + { + uint64_t jmpbuf[22]; // Must match jmp_buf. + uint32_t mask_was_saved; // Must match jmp_buf, always 0. + uint32_t pad; + void *prev; // List for unwinding. + void *cleanup; // Cleanup handlers. + uint32_t canceltype; // 1 bit cancellation type. + uint32_t pad2; + void *pad3; + }; + + Ideally only the target specific part of jmp_buf (A) is accessed by + __setjmp and __longjmp. But that is always embedded into one of the + two types above so the bits that are unused in those types (B) may be + reused for target specific purposes. Setjmp can't distinguish between + jmp_buf and __pthread_unwind_buf_t, but longjmp can: only an internal + longjmp call uses the latter, so state that is not needed for cancel + cleanups can go to fields (C). If generic code is refactored then the + usage of additional fields can be optimized (D). And some fields are + only accessible in the savedmask case (E). 
Reusability of jmp_buf + fields on AArch64 for target purposes: + + struct + { + uint64_t A[22]; // 0 .. 176 + uint32_t D; // 176 .. 180 + uint32_t B; // 180 .. 184 + uint64_t D; // 184 .. 192 + uint64_t C; // 192 .. 200 + uint32_t C; // 200 .. 204 + uint32_t B; // 204 .. 208 + uint64_t B; // 208 .. 216 + uint64_t E[12]; // 216 .. 312 + } + + The B fields can be used with minimal glibc code changes. We need a + 64 bit field for the Guarded Control Stack pointer (GCSPR_EL0) which + can use a C field too as cancellation cleanup does not execute RET + for a previous BL of the cancelled thread, but that would require a + custom __libc_longjmp. This layout can change in the future. */ +#define JB_GCSPR 208 + #ifndef __ASSEMBLER__ #include <setjmp.h> #include <stdint.h> diff --git a/sysdeps/aarch64/jmpbuf-unwind.h b/sysdeps/aarch64/jmpbuf-unwind.h index 469e853..a04d55a 100644 --- a/sysdeps/aarch64/jmpbuf-unwind.h +++ b/sysdeps/aarch64/jmpbuf-unwind.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2024 Free Software Foundation, Inc. +/* Copyright (C) 2005-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/ldsodefs.h b/sysdeps/aarch64/ldsodefs.h index 5999daa..b66ae23 100644 --- a/sysdeps/aarch64/ldsodefs.h +++ b/sysdeps/aarch64/ldsodefs.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2024 Free Software Foundation, Inc. +/* Copyright (C) 2005-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/libc-mtag.h b/sysdeps/aarch64/libc-mtag.h index 7b7e65b..e41dc47 100644 --- a/sysdeps/aarch64/libc-mtag.h +++ b/sysdeps/aarch64/libc-mtag.h @@ -1,5 +1,5 @@ /* libc-internal interface for tagged (colored) memory support. - Copyright (C) 2020-2024 Free Software Foundation, Inc. + Copyright (C) 2020-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/libc-start.c b/sysdeps/aarch64/libc-start.c index a293849..53fe801 100644 --- a/sysdeps/aarch64/libc-start.c +++ b/sysdeps/aarch64/libc-start.c @@ -1,5 +1,5 @@ /* Override csu/libc-start.c on AArch64. - Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright (C) 2017-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/libc-tls.c b/sysdeps/aarch64/libc-tls.c index 182283d..fdba205 100644 --- a/sysdeps/aarch64/libc-tls.c +++ b/sysdeps/aarch64/libc-tls.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2024 Free Software Foundation, Inc. +/* Copyright (C) 2005-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps index 846fb2c..3bcd0e5 100644 --- a/sysdeps/aarch64/libm-test-ulps +++ b/sysdeps/aarch64/libm-test-ulps @@ -3,7 +3,6 @@ # Maximal error of functions: Function: "acos": double: 1 -float: 1 ldouble: 1 Function: "acos_advsimd": @@ -12,7 +11,6 @@ float: 1 Function: "acos_downward": double: 1 -float: 1 ldouble: 1 Function: "acos_sve": @@ -21,17 +19,14 @@ float: 1 Function: "acos_towardzero": double: 1 -float: 1 ldouble: 1 Function: "acos_upward": double: 1 -float: 1 ldouble: 1 Function: "acosh": double: 2 -float: 2 ldouble: 4 Function: "acosh_advsimd": @@ -40,7 +35,6 @@ float: 2 Function: "acosh_downward": double: 2 -float: 2 ldouble: 3 Function: "acosh_sve": @@ -49,17 +43,30 @@ float: 2 Function: "acosh_towardzero": double: 2 -float: 2 ldouble: 2 Function: "acosh_upward": double: 2 -float: 2 ldouble: 3 +Function: "acospi": +double: 2 +ldouble: 2 + +Function: "acospi_downward": +double: 1 +ldouble: 1 + +Function: "acospi_towardzero": +double: 1 +ldouble: 1 + +Function: "acospi_upward": +double: 2 +ldouble: 2 + Function: "asin": double: 1 -float: 1 
ldouble: 1 Function: "asin_advsimd": @@ -68,7 +75,6 @@ float: 2 Function: "asin_downward": double: 1 -float: 1 ldouble: 2 Function: "asin_sve": @@ -77,17 +83,14 @@ float: 2 Function: "asin_towardzero": double: 1 -float: 1 ldouble: 1 Function: "asin_upward": double: 2 -float: 1 ldouble: 2 Function: "asinh": double: 2 -float: 2 ldouble: 4 Function: "asinh_advsimd": @@ -96,7 +99,6 @@ float: 2 Function: "asinh_downward": double: 3 -float: 3 ldouble: 4 Function: "asinh_sve": @@ -105,21 +107,33 @@ float: 2 Function: "asinh_towardzero": double: 2 -float: 2 ldouble: 2 Function: "asinh_upward": double: 3 -float: 3 ldouble: 4 +Function: "asinpi": +double: 1 +ldouble: 2 + +Function: "asinpi_downward": +double: 1 +ldouble: 2 + +Function: "asinpi_towardzero": +double: 1 +ldouble: 1 + +Function: "asinpi_upward": +double: 2 +ldouble: 2 + Function: "atan": double: 1 -float: 1 ldouble: 1 Function: "atan2": -float: 1 ldouble: 2 Function: "atan2_advsimd": @@ -128,7 +142,6 @@ float: 2 Function: "atan2_downward": double: 1 -float: 2 ldouble: 2 Function: "atan2_sve": @@ -137,12 +150,26 @@ float: 2 Function: "atan2_towardzero": double: 1 -float: 2 ldouble: 3 Function: "atan2_upward": double: 1 -float: 1 +ldouble: 2 + +Function: "atan2pi": +double: 1 +ldouble: 3 + +Function: "atan2pi_downward": +double: 1 +ldouble: 2 + +Function: "atan2pi_towardzero": +double: 1 +ldouble: 2 + +Function: "atan2pi_upward": +double: 1 ldouble: 2 Function: "atan_advsimd": @@ -151,7 +178,6 @@ float: 1 Function: "atan_downward": double: 1 -float: 2 ldouble: 2 Function: "atan_sve": @@ -160,17 +186,14 @@ float: 1 Function: "atan_towardzero": double: 1 -float: 1 ldouble: 1 Function: "atan_upward": double: 1 -float: 2 ldouble: 2 Function: "atanh": double: 2 -float: 2 ldouble: 4 Function: "atanh_advsimd": @@ -179,7 +202,6 @@ float: 1 Function: "atanh_downward": double: 3 -float: 3 ldouble: 4 Function: "atanh_sve": @@ -188,14 +210,28 @@ float: 1 Function: "atanh_towardzero": double: 2 -float: 2 ldouble: 2 Function: 
"atanh_upward": double: 3 -float: 3 ldouble: 4 +Function: "atanpi": +double: 2 +ldouble: 2 + +Function: "atanpi_downward": +double: 1 +ldouble: 1 + +Function: "atanpi_towardzero": +double: 1 +ldouble: 1 + +Function: "atanpi_upward": +double: 1 +ldouble: 2 + Function: "cabs": double: 1 ldouble: 1 @@ -234,7 +270,7 @@ ldouble: 6 Function: Real part of "cacos_towardzero": double: 3 -float: 2 +float: 3 ldouble: 3 Function: Imaginary part of "cacos_towardzero": @@ -279,7 +315,7 @@ ldouble: 5 Function: Imaginary part of "cacosh_towardzero": double: 3 -float: 2 +float: 3 ldouble: 3 Function: Real part of "cacosh_upward": @@ -294,22 +330,18 @@ ldouble: 4 Function: "carg": double: 1 -float: 1 ldouble: 2 Function: "carg_downward": double: 1 -float: 2 ldouble: 2 Function: "carg_towardzero": double: 1 -float: 2 ldouble: 3 Function: "carg_upward": double: 1 -float: 1 ldouble: 2 Function: Real part of "casin": @@ -474,7 +506,6 @@ ldouble: 2 Function: "cbrt": double: 4 -float: 1 ldouble: 1 Function: "cbrt_advsimd": @@ -483,7 +514,6 @@ float: 1 Function: "cbrt_downward": double: 4 -float: 1 ldouble: 1 Function: "cbrt_sve": @@ -492,12 +522,10 @@ float: 1 Function: "cbrt_towardzero": double: 3 -float: 1 ldouble: 1 Function: "cbrt_upward": double: 5 -float: 1 ldouble: 1 Function: Real part of "ccos": @@ -730,7 +758,6 @@ ldouble: 2 Function: "cosh": double: 2 -float: 2 ldouble: 2 Function: "cosh_advsimd": @@ -739,7 +766,6 @@ float: 2 Function: "cosh_downward": double: 3 -float: 1 ldouble: 3 Function: "cosh_sve": @@ -748,14 +774,36 @@ float: 2 Function: "cosh_towardzero": double: 3 -float: 1 ldouble: 3 Function: "cosh_upward": double: 2 -float: 2 ldouble: 3 +Function: "cospi": +double: 2 +ldouble: 2 + +Function: "cospi_advsimd": +double: 2 +float: 1 + +Function: "cospi_downward": +double: 1 +ldouble: 2 + +Function: "cospi_sve": +double: 2 +float: 1 + +Function: "cospi_towardzero": +double: 1 +ldouble: 2 + +Function: "cospi_upward": +double: 1 +ldouble: 2 + Function: Real part of "cpow": 
double: 2 float: 5 @@ -801,6 +849,7 @@ float: 1 ldouble: 1 Function: Imaginary part of "csin": +float: 1 ldouble: 1 Function: Real part of "csin_downward": @@ -994,7 +1043,6 @@ ldouble: 5 Function: "erf": double: 1 -float: 1 ldouble: 1 Function: "erf_advsimd": @@ -1083,22 +1131,18 @@ ldouble: 3 Function: "exp10m1": double: 4 -float: 2 ldouble: 3 Function: "exp10m1_downward": double: 3 -float: 3 ldouble: 6 Function: "exp10m1_towardzero": double: 2 -float: 3 ldouble: 6 Function: "exp10m1_upward": double: 5 -float: 3 ldouble: 6 Function: "exp2": @@ -1131,22 +1175,18 @@ ldouble: 2 Function: "exp2m1": double: 2 -float: 2 ldouble: 2 Function: "exp2m1_downward": double: 3 -float: 3 ldouble: 3 Function: "exp2m1_towardzero": double: 3 -float: 2 ldouble: 4 Function: "exp2m1_upward": double: 3 -float: 3 ldouble: 5 Function: "exp_advsimd": @@ -1171,7 +1211,6 @@ float: 1 Function: "expm1": double: 1 -float: 1 ldouble: 2 Function: "expm1_advsimd": @@ -1180,7 +1219,6 @@ float: 1 Function: "expm1_downward": double: 1 -float: 1 ldouble: 2 Function: "expm1_sve": @@ -1189,12 +1227,10 @@ float: 1 Function: "expm1_towardzero": double: 1 -float: 2 ldouble: 4 Function: "expm1_upward": double: 1 -float: 1 ldouble: 3 Function: "gamma": @@ -1304,22 +1340,18 @@ ldouble: 7 Function: "lgamma": double: 3 -float: 4 ldouble: 5 Function: "lgamma_downward": double: 4 -float: 4 ldouble: 8 Function: "lgamma_towardzero": double: 4 -float: 3 ldouble: 5 Function: "lgamma_upward": double: 4 -float: 5 ldouble: 8 Function: "log": @@ -1357,27 +1389,22 @@ ldouble: 1 Function: "log10p1": double: 2 -float: 2 ldouble: 3 Function: "log10p1_downward": double: 2 -float: 3 ldouble: 4 Function: "log10p1_towardzero": double: 3 -float: 2 ldouble: 3 Function: "log10p1_upward": double: 2 -float: 3 ldouble: 4 Function: "log1p": double: 1 -float: 1 ldouble: 3 Function: "log1p_advsimd": @@ -1386,7 +1413,6 @@ float: 1 Function: "log1p_downward": double: 1 -float: 2 ldouble: 3 Function: "log1p_sve": @@ -1395,12 +1421,10 @@ 
float: 1 Function: "log1p_towardzero": double: 2 -float: 2 ldouble: 3 Function: "log1p_upward": double: 2 -float: 2 ldouble: 2 Function: "log2": @@ -1433,22 +1457,18 @@ ldouble: 1 Function: "log2p1": double: 2 -float: 2 ldouble: 3 Function: "log2p1_downward": double: 2 -float: 2 ldouble: 3 Function: "log2p1_towardzero": double: 2 -float: 2 ldouble: 2 Function: "log2p1_upward": double: 2 -float: 2 ldouble: 3 Function: "log_advsimd": @@ -1460,7 +1480,7 @@ float: 2 ldouble: 1 Function: "log_sve": -double: 1 +double: 2 float: 3 Function: "log_towardzero": @@ -1474,22 +1494,18 @@ ldouble: 1 Function: "logp1": double: 1 -float: 1 ldouble: 3 Function: "logp1_downward": double: 1 -float: 2 ldouble: 3 Function: "logp1_towardzero": double: 2 -float: 2 ldouble: 3 Function: "logp1_upward": double: 2 -float: 2 ldouble: 2 Function: "pow": @@ -1570,7 +1586,6 @@ ldouble: 3 Function: "sinh": double: 2 -float: 2 ldouble: 2 Function: "sinh_advsimd": @@ -1579,7 +1594,6 @@ float: 1 Function: "sinh_downward": double: 3 -float: 3 ldouble: 3 Function: "sinh_sve": @@ -1588,16 +1602,37 @@ float: 1 Function: "sinh_towardzero": double: 3 -float: 2 ldouble: 3 Function: "sinh_upward": double: 3 -float: 3 ldouble: 4 +Function: "sinpi": +double: 2 +ldouble: 2 + +Function: "sinpi_advsimd": +double: 2 +float: 2 + +Function: "sinpi_downward": +double: 2 +ldouble: 2 + +Function: "sinpi_sve": +double: 2 +float: 2 + +Function: "sinpi_towardzero": +double: 2 +ldouble: 2 + +Function: "sinpi_upward": +double: 2 +ldouble: 2 + Function: "tan": -float: 1 ldouble: 1 Function: "tan_advsimd": @@ -1606,7 +1641,6 @@ float: 2 Function: "tan_downward": double: 1 -float: 2 ldouble: 1 Function: "tan_sve": @@ -1615,17 +1649,14 @@ float: 2 Function: "tan_towardzero": double: 1 -float: 1 ldouble: 1 Function: "tan_upward": double: 1 -float: 1 ldouble: 1 Function: "tanh": double: 2 -float: 2 ldouble: 2 Function: "tanh_advsimd": @@ -1634,7 +1665,6 @@ float: 2 Function: "tanh_downward": double: 3 -float: 3 ldouble: 4 
Function: "tanh_sve": @@ -1643,32 +1673,50 @@ float: 2 Function: "tanh_towardzero": double: 2 -float: 2 ldouble: 3 Function: "tanh_upward": double: 3 -float: 3 ldouble: 3 +Function: "tanpi": +double: 3 +ldouble: 3 + +Function: "tanpi_advsimd": +double: 2 +float: 2 + +Function: "tanpi_downward": +double: 2 +ldouble: 4 + +Function: "tanpi_sve": +double: 2 +float: 2 + +Function: "tanpi_towardzero": +double: 2 +ldouble: 4 + +Function: "tanpi_upward": +double: 2 +ldouble: 4 + Function: "tgamma": double: 9 -float: 8 ldouble: 4 Function: "tgamma_downward": double: 9 -float: 7 ldouble: 5 Function: "tgamma_towardzero": double: 9 -float: 7 ldouble: 5 Function: "tgamma_upward": double: 9 -float: 8 ldouble: 4 Function: "y0": diff --git a/sysdeps/aarch64/linkmap.h b/sysdeps/aarch64/linkmap.h index 56a63fc..e56c890 100644 --- a/sysdeps/aarch64/linkmap.h +++ b/sysdeps/aarch64/linkmap.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2009-2024 Free Software Foundation, Inc. +/* Copyright (C) 2009-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -23,4 +23,5 @@ struct link_map_machine ElfW(Addr) plt; /* Address of .plt */ void *tlsdesc_table; /* Address of TLS descriptor hash table. */ bool bti_fail; /* Failed to enable Branch Target Identification. */ + bool gcs; /* Guarded Control Stack marking. */ }; diff --git a/sysdeps/aarch64/machine-gmon.h b/sysdeps/aarch64/machine-gmon.h index 6890b99..eba7c24 100644 --- a/sysdeps/aarch64/machine-gmon.h +++ b/sysdeps/aarch64/machine-gmon.h @@ -1,5 +1,5 @@ /* AArch64 definitions for profiling support. - Copyright (C) 1996-2024 Free Software Foundation, Inc. + Copyright (C) 1996-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/math-tests-trap.h b/sysdeps/aarch64/math-tests-trap.h index 72a58ca..543d545 100644 --- a/sysdeps/aarch64/math-tests-trap.h +++ b/sysdeps/aarch64/math-tests-trap.h @@ -1,6 +1,6 @@ /* Configuration for math tests: support for enabling exception traps. AArch64 version. - Copyright (C) 2014-2024 Free Software Foundation, Inc. + Copyright (C) 2014-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/mcount.c b/sysdeps/aarch64/mcount.c index 1651ab9..0a26c6c 100644 --- a/sysdeps/aarch64/mcount.c +++ b/sysdeps/aarch64/mcount.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2024 Free Software Foundation, Inc. +/* Copyright (C) 2013-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S index a9fa405..e67c359 100644 --- a/sysdeps/aarch64/memchr.S +++ b/sysdeps/aarch64/memchr.S @@ -1,6 +1,6 @@ /* memchr - find a character in a memory zone - Copyright (C) 2015-2024 Free Software Foundation, Inc. + Copyright (C) 2015-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S index 5afa794..471dc82 100644 --- a/sysdeps/aarch64/memcmp.S +++ b/sysdeps/aarch64/memcmp.S @@ -1,6 +1,6 @@ /* memcmp - compare memory - Copyright (C) 2013-2024 Free Software Foundation, Inc. + Copyright (C) 2013-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S index f21c21d..725705c 100644 --- a/sysdeps/aarch64/memcpy.S +++ b/sysdeps/aarch64/memcpy.S @@ -1,5 +1,5 @@ /* Generic optimized memcpy using SIMD. - Copyright (C) 2012-2024 Free Software Foundation, Inc. + Copyright (C) 2012-2025 Free Software Foundation, Inc. 
This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S index c5274f5..229a8a0 100644 --- a/sysdeps/aarch64/memrchr.S +++ b/sysdeps/aarch64/memrchr.S @@ -1,6 +1,6 @@ /* memrchr - find the last occurrence of a byte in a memory block - Copyright (C) 2015-2024 Free Software Foundation, Inc. + Copyright (C) 2015-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S index 7ef77ee..9a5a89e 100644 --- a/sysdeps/aarch64/memset.S +++ b/sysdeps/aarch64/memset.S @@ -1,4 +1,5 @@ -/* Copyright (C) 2012-2024 Free Software Foundation, Inc. +/* Generic optimized memset using SIMD. + Copyright (C) 2012-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,7 +18,6 @@ <https://www.gnu.org/licenses/>. */ #include <sysdep.h> -#include "memset-reg.h" #ifndef MEMSET # define MEMSET memset @@ -25,167 +25,116 @@ /* Assumptions: * - * ARMv8-a, AArch64, unaligned accesses + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. * */ -ENTRY (MEMSET) +#define dstin x0 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define zva_val x5 +#define off x3 +#define dstend2 x5 +ENTRY (MEMSET) PTR_ARG (0) SIZE_ARG (2) dup v0.16B, valw + cmp count, 16 + b.lo L(set_small) + add dstend, dstin, count + cmp count, 64 + b.hs L(set_128) - cmp count, 96 - b.hi L(set_long) - cmp count, 16 - b.hs L(set_medium) - mov val, v0.D[0] + /* Set 16..63 bytes. */ + mov off, 16 + and off, off, count, lsr 1 + sub dstend2, dstend, off + str q0, [dstin] + str q0, [dstin, off] + str q0, [dstend2, -16] + str q0, [dstend, -16] + ret + .p2align 4 /* Set 0..15 bytes. 
*/ - tbz count, 3, 1f - str val, [dstin] - str val, [dstend, -8] - ret - nop -1: tbz count, 2, 2f - str valw, [dstin] - str valw, [dstend, -4] +L(set_small): + add dstend, dstin, count + cmp count, 4 + b.lo 2f + lsr off, count, 3 + sub dstend2, dstend, off, lsl 2 + str s0, [dstin] + str s0, [dstin, off, lsl 2] + str s0, [dstend2, -4] + str s0, [dstend, -4] ret + + /* Set 0..3 bytes. */ 2: cbz count, 3f + lsr off, count, 1 strb valw, [dstin] - tbz count, 1, 3f - strh valw, [dstend, -2] + strb valw, [dstin, off] + strb valw, [dstend, -1] 3: ret - /* Set 17..96 bytes. */ -L(set_medium): - str q0, [dstin] - tbnz count, 6, L(set96) - str q0, [dstend, -16] - tbz count, 5, 1f - str q0, [dstin, 16] - str q0, [dstend, -32] -1: ret - .p2align 4 - /* Set 64..96 bytes. Write 64 bytes from the start and - 32 bytes from the end. */ -L(set96): - str q0, [dstin, 16] +L(set_128): + bic dst, dstin, 15 + cmp count, 128 + b.hi L(set_long) + stp q0, q0, [dstin] stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret - .p2align 3 - nop + .p2align 4 L(set_long): - and valw, valw, 255 - bic dst, dstin, 15 str q0, [dstin] - cmp count, 256 - ccmp valw, 0, 0, cs - b.eq L(try_zva) -L(no_zva): - sub count, dstend, dst /* Count is 16 too large. */ - sub dst, dst, 16 /* Dst is biased by -32. */ - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ -1: stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64]! -L(tail64): - subs count, count, 64 - b.hi 1b -2: stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret - -L(try_zva): + str q0, [dst, 16] + tst valw, 255 + b.ne L(no_zva) #ifndef ZVA64_ONLY - .p2align 3 - mrs tmp1, dczid_el0 - tbnz tmp1w, 4, L(no_zva) - and tmp1w, tmp1w, 15 - cmp tmp1w, 4 /* ZVA size is 64 bytes. */ - b.ne L(zva_128) - nop + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) #endif - /* Write the first and last 64 byte aligned block using stp rather - than using DC ZVA. 
This is faster on some cores. - */ - .p2align 4 -L(zva_64): - str q0, [dst, 16] - stp q0, q0, [dst, 32] - bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ - add dst, dst, 128 -1: dc zva, dst - add dst, dst, 64 - subs count, count, 64 - b.hi 1b - stp q0, q0, [dst, 0] stp q0, q0, [dst, 32] + bic dst, dstin, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 64 + 64 /* Adjust count and bias for loop. */ + + /* Write last bytes before ZVA loop. */ stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] + + .p2align 4 +L(zva64_loop): + add dst, dst, 64 + dc zva, dst + subs count, count, 64 + b.hi L(zva64_loop) ret -#ifndef ZVA64_ONLY .p2align 3 -L(zva_128): - cmp tmp1w, 5 /* ZVA size is 128 bytes. */ - b.ne L(zva_other) - - str q0, [dst, 16] +L(no_zva): + sub count, dstend, dst /* Count is 32 too large. */ + sub count, count, 64 + 32 /* Adjust count and bias for loop. */ +L(no_zva_loop): stp q0, q0, [dst, 32] stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - bic dst, dst, 127 - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+128 /* Adjust count and bias for loop. */ - add dst, dst, 128 -1: dc zva, dst - add dst, dst, 128 - subs count, count, 128 - b.hi 1b - stp q0, q0, [dstend, -128] - stp q0, q0, [dstend, -96] + add dst, dst, 64 + subs count, count, 64 + b.hi L(no_zva_loop) stp q0, q0, [dstend, -64] stp q0, q0, [dstend, -32] ret -L(zva_other): - mov tmp2w, 4 - lsl zva_lenw, tmp2w, tmp1w - add tmp1, zva_len, 64 /* Max alignment bytes written. */ - cmp count, tmp1 - blo L(no_zva) - - sub tmp2, zva_len, 1 - add tmp1, dst, zva_len - add dst, dst, 16 - subs count, tmp1, dst /* Actual alignment bytes to write. */ - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. 
*/ - beq 2f -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] - subs count, count, 64 - b.hi 1b -2: mov dst, tmp1 - sub count, dstend, tmp1 /* Remaining bytes to write. */ - subs count, count, zva_len - b.lo 4f -3: dc zva, dst - add dst, dst, zva_len - subs count, count, zva_len - b.hs 3b -4: add count, count, zva_len - sub dst, dst, 32 /* Bias dst for tail loop. */ - b L(tail64) -#endif - END (MEMSET) libc_hidden_builtin_def (MEMSET) diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index 3e251cc..772b16a 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -7,8 +7,6 @@ sysdep_routines += \ memcpy_mops \ memcpy_oryon1 \ memcpy_sve \ - memcpy_thunderx \ - memcpy_thunderx2 \ memmove_mops \ memset_a64fx \ memset_emag \ diff --git a/sysdeps/aarch64/multiarch/dl-symbol-redir-ifunc.h b/sysdeps/aarch64/multiarch/dl-symbol-redir-ifunc.h index 70e5a90..63ac28f 100644 --- a/sysdeps/aarch64/multiarch/dl-symbol-redir-ifunc.h +++ b/sysdeps/aarch64/multiarch/dl-symbol-redir-ifunc.h @@ -1,5 +1,5 @@ /* Symbol rediretion for loader/static initialization code. - Copyright (C) 2022-2024 Free Software Foundation, Inc. + Copyright (C) 2022-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index b2fda54..0481e45 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -1,5 +1,5 @@ /* Enumerate available IFUNC implementations of a function. AARCH64 version. - Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright (C) 2017-2025 Free Software Foundation, Inc. Copyright The GNU Toolchain Authors. This file is part of the GNU C Library. 
@@ -35,9 +35,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/aarch64/multiarch/memcpy.c, memmove.c and memset.c. */ IFUNC_IMPL (i, name, memcpy, - IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_oryon1) - IFUNC_IMPL_ADD (array, i, memcpy, !bti, __memcpy_thunderx2) #if HAVE_AARCH64_SVE_ASM IFUNC_IMPL_ADD (array, i, memcpy, sve && !bti, __memcpy_a64fx) IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve) @@ -45,9 +43,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, mops, __memcpy_mops) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic)) IFUNC_IMPL (i, name, memmove, - IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_oryon1) - IFUNC_IMPL_ADD (array, i, memmove, !bti, __memmove_thunderx2) #if HAVE_AARCH64_SVE_ASM IFUNC_IMPL_ADD (array, i, memmove, sve && !bti, __memmove_a64fx) IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve) diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h index 61dc400..63c24e7 100644 --- a/sysdeps/aarch64/multiarch/init-arch.h +++ b/sysdeps/aarch64/multiarch/init-arch.h @@ -1,6 +1,6 @@ /* Define INIT_ARCH so that midr is initialized before use by IFUNCs. This file is part of the GNU C Library. - Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright (C) 2017-2025 Free Software Foundation, Inc. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public diff --git a/sysdeps/aarch64/multiarch/memchr.c b/sysdeps/aarch64/multiarch/memchr.c index 5069e76..299508a 100644 --- a/sysdeps/aarch64/multiarch/memchr.c +++ b/sysdeps/aarch64/multiarch/memchr.c @@ -1,5 +1,5 @@ /* Multiple versions of memchr. AARCH64 version. - Copyright (C) 2018-2024 Free Software Foundation, Inc. 
+ Copyright (C) 2018-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/multiarch/memchr_generic.S b/sysdeps/aarch64/multiarch/memchr_generic.S index 8d55427..5c9d5a4 100644 --- a/sysdeps/aarch64/multiarch/memchr_generic.S +++ b/sysdeps/aarch64/multiarch/memchr_generic.S @@ -1,5 +1,5 @@ /* Memchr for aarch64, default version for internal use. - Copyright (C) 2018-2024 Free Software Foundation, Inc. + Copyright (C) 2018-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/memchr_nosimd.S b/sysdeps/aarch64/multiarch/memchr_nosimd.S index 0a65139..03e2852 100644 --- a/sysdeps/aarch64/multiarch/memchr_nosimd.S +++ b/sysdeps/aarch64/multiarch/memchr_nosimd.S @@ -1,6 +1,6 @@ /* memchr - find a character in a memory zone using base integer registers - Copyright (C) 2018-2024 Free Software Foundation, Inc. + Copyright (C) 2018-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c index 15c9547..0e33d19 100644 --- a/sysdeps/aarch64/multiarch/memcpy.c +++ b/sysdeps/aarch64/multiarch/memcpy.c @@ -1,5 +1,5 @@ /* Multiple versions of memcpy. AARCH64 version. - Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright (C) 2017-2025 Free Software Foundation, Inc. Copyright The GNU Toolchain Authors. This file is part of the GNU C Library. 
@@ -30,8 +30,6 @@ extern __typeof (__redirect_memcpy) __libc_memcpy; extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden; -extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden; -extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_mops attribute_hidden; @@ -55,12 +53,6 @@ select_memcpy_ifunc (void) if (IS_ORYON1 (midr)) return __memcpy_oryon1; - if (IS_THUNDERX (midr)) - return __memcpy_thunderx; - - if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)) - return __memcpy_thunderx2; - return __memcpy_generic; } diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S index d826aaf..0be269c 100644 --- a/sysdeps/aarch64/multiarch/memcpy_a64fx.S +++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S @@ -1,5 +1,5 @@ /* Optimized memcpy for Fujitsu A64FX processor. - Copyright (C) 2021-2024 Free Software Foundation, Inc. + Copyright (C) 2021-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/memcpy_generic.S b/sysdeps/aarch64/multiarch/memcpy_generic.S index 577c359..f044ebc 100644 --- a/sysdeps/aarch64/multiarch/memcpy_generic.S +++ b/sysdeps/aarch64/multiarch/memcpy_generic.S @@ -1,5 +1,5 @@ /* A Generic Optimized memcpy implementation for AARCH64. - Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright (C) 2017-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/multiarch/memcpy_mops.S b/sysdeps/aarch64/multiarch/memcpy_mops.S index b094af3..85a0633 100644 --- a/sysdeps/aarch64/multiarch/memcpy_mops.S +++ b/sysdeps/aarch64/multiarch/memcpy_mops.S @@ -1,5 +1,5 @@ /* Optimized memcpy for MOPS. 
- Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/memcpy_oryon1.S b/sysdeps/aarch64/multiarch/memcpy_oryon1.S index 4efc43d..bda5ed0 100644 --- a/sysdeps/aarch64/multiarch/memcpy_oryon1.S +++ b/sysdeps/aarch64/multiarch/memcpy_oryon1.S @@ -1,5 +1,5 @@ /* A oryon-1 core Optimized memcpy implementation for AARCH64. - Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright (C) 2017-2025 Free Software Foundation, Inc. Copyright The GNU Toolchain Authors. This file is part of the GNU C Library. @@ -160,46 +160,6 @@ L(copy96): .p2align 6 L(copy_long): - /* On oryon1 cores, large memcpy's are helped by using ldnp/stnp. - This loop is identical to the one below it but using ldnp/stnp - instructions. For loops that are less than 32768 bytes, - the ldnp/stnp instructions will not help and will cause a slow - down so only use the ldnp/stnp loop for the largest sizes. */ - - cmp count, #32768 - b.lo L(copy_long_without_nontemp) - and tmp1, dstin, 15 - bic dst, dstin, 15 - ldnp D_l, D_h, [src] - sub src, src, tmp1 - add count, count, tmp1 /* Count is now 16 too large. */ - ldnp A_l, A_h, [src, 16] - stnp D_l, D_h, [dstin] - ldnp B_l, B_h, [src, 32] - ldnp C_l, C_h, [src, 48] - ldnp D_l, D_h, [src, 64] - add src, src, #64 - subs count, count, 128 + 16 /* Test and readjust count. 
*/ - -L(nontemp_loop64): - tbz src, #6, 1f -1: - stnp A_l, A_h, [dst, 16] - ldnp A_l, A_h, [src, 16] - stnp B_l, B_h, [dst, 32] - ldnp B_l, B_h, [src, 32] - stnp C_l, C_h, [dst, 48] - ldnp C_l, C_h, [src, 48] - stnp D_l, D_h, [dst, 64] - ldnp D_l, D_h, [src, 64] - add src, src, #64 - add dst, dst, #64 - subs count, count, 64 - b.hi L(nontemp_loop64) - b L(last64) - -L(copy_long_without_nontemp): - and tmp1, dstin, 15 bic dst, dstin, 15 ldp D_l, D_h, [src] diff --git a/sysdeps/aarch64/multiarch/memcpy_sve.S b/sysdeps/aarch64/multiarch/memcpy_sve.S index 3ce49d7..f36248c 100644 --- a/sysdeps/aarch64/multiarch/memcpy_sve.S +++ b/sysdeps/aarch64/multiarch/memcpy_sve.S @@ -1,5 +1,5 @@ /* Optimized memcpy for SVE. - Copyright (C) 2021-2024 Free Software Foundation, Inc. + Copyright (C) 2021-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S deleted file mode 100644 index 5d8438a..0000000 --- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S +++ /dev/null @@ -1,305 +0,0 @@ -/* A Thunderx Optimized memcpy implementation for AARCH64. - Copyright (C) 2017-2024 Free Software Foundation, Inc. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -/* The actual code in this memcpy and memmove should be identical to the - generic version except for the code under '#ifdef THUNDERX'. This is - to make is easier to keep this version and the generic version in sync - for changes that are not specific to thunderx. */ - -#include <sysdep.h> - -/* Assumptions: - * - * ARMv8-a, AArch64, unaligned accesses. - * - */ - -#define dstin x0 -#define src x1 -#define count x2 -#define dst x3 -#define srcend x4 -#define dstend x5 -#define A_l x6 -#define A_lw w6 -#define A_h x7 -#define A_hw w7 -#define B_l x8 -#define B_lw w8 -#define B_h x9 -#define C_l x10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l src -#define E_h count -#define F_l srcend -#define F_h dst -#define G_l count -#define G_h dst -#define tmp1 x14 - -/* Copies are split into 3 main cases: small copies of up to 16 bytes, - medium copies of 17..96 bytes which are fully unrolled. Large copies - of more than 96 bytes align the destination and use an unrolled loop - processing 64 bytes per iteration. - In order to share code with memmove, small and medium copies read all - data before writing, allowing any kind of overlap. So small, medium - and large backwards memmoves are handled by falling through into memcpy. - Overlapping large forward memmoves use a loop that copies backwards. -*/ - -ENTRY (__memmove_thunderx) - - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - - sub tmp1, dstin, src - cmp count, 96 - ccmp tmp1, count, 2, hi - b.lo L(move_long) - - /* Common case falls through into memcpy. */ -END (__memmove_thunderx) - -ENTRY (__memcpy_thunderx) - - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - - prfm PLDL1KEEP, [src] - add srcend, src, count - add dstend, dstin, count - cmp count, 16 - b.ls L(copy16) - cmp count, 96 - b.hi L(copy_long) - - /* Medium copies: 17..96 bytes. 
*/ - sub tmp1, count, 1 - ldp A_l, A_h, [src] - tbnz tmp1, 6, L(copy96) - ldp D_l, D_h, [srcend, -16] - tbz tmp1, 5, 1f - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] -1: - stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] - ret - - .p2align 4 - /* Small copies: 0..16 bytes. */ -L(copy16): - cmp count, 8 - b.lo 1f - ldr A_l, [src] - ldr A_h, [srcend, -8] - str A_l, [dstin] - str A_h, [dstend, -8] - ret - .p2align 4 -1: - tbz count, 2, 1f - ldr A_lw, [src] - ldr A_hw, [srcend, -4] - str A_lw, [dstin] - str A_hw, [dstend, -4] - ret - - /* Copy 0..3 bytes. Use a branchless sequence that copies the same - byte 3 times if count==1, or the 2nd byte twice if count==2. */ -1: - cbz count, 2f - lsr tmp1, count, 1 - ldrb A_lw, [src] - ldrb A_hw, [srcend, -1] - ldrb B_lw, [src, tmp1] - strb A_lw, [dstin] - strb B_lw, [dstin, tmp1] - strb A_hw, [dstend, -1] -2: ret - - .p2align 4 - /* Copy 64..96 bytes. Copy 64 bytes from the start and - 32 bytes from the end. */ -L(copy96): - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [src, 32] - ldp D_l, D_h, [src, 48] - ldp E_l, E_h, [srcend, -32] - ldp F_l, F_h, [srcend, -16] - stp A_l, A_h, [dstin] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstin, 32] - stp D_l, D_h, [dstin, 48] - stp E_l, E_h, [dstend, -32] - stp F_l, F_h, [dstend, -16] - ret - - /* Align DST to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 96 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ - - .p2align 4 -L(copy_long): - - /* On thunderx, large memcpy's are helped by software prefetching. - This loop is identical to the one below it but with prefetching - instructions included. For loops that are less than 32768 bytes, - the prefetching does not help and slow the code down so we only - use the prefetching loop for the largest memcpys. 
*/ - - cmp count, #32768 - b.lo L(copy_long_without_prefetch) - and tmp1, dstin, 15 - bic dst, dstin, 15 - ldp D_l, D_h, [src] - sub src, src, tmp1 - prfm pldl1strm, [src, 384] - add count, count, tmp1 /* Count is now 16 too large. */ - ldp A_l, A_h, [src, 16] - stp D_l, D_h, [dstin] - ldp B_l, B_h, [src, 32] - ldp C_l, C_h, [src, 48] - ldp D_l, D_h, [src, 64]! - subs count, count, 128 + 16 /* Test and readjust count. */ - -L(prefetch_loop64): - tbz src, #6, 1f - prfm pldl1strm, [src, 512] -1: - stp A_l, A_h, [dst, 16] - ldp A_l, A_h, [src, 16] - stp B_l, B_h, [dst, 32] - ldp B_l, B_h, [src, 32] - stp C_l, C_h, [dst, 48] - ldp C_l, C_h, [src, 48] - stp D_l, D_h, [dst, 64]! - ldp D_l, D_h, [src, 64]! - subs count, count, 64 - b.hi L(prefetch_loop64) - b L(last64) - -L(copy_long_without_prefetch): - - and tmp1, dstin, 15 - bic dst, dstin, 15 - ldp D_l, D_h, [src] - sub src, src, tmp1 - add count, count, tmp1 /* Count is now 16 too large. */ - ldp A_l, A_h, [src, 16] - stp D_l, D_h, [dstin] - ldp B_l, B_h, [src, 32] - ldp C_l, C_h, [src, 48] - ldp D_l, D_h, [src, 64]! - subs count, count, 128 + 16 /* Test and readjust count. */ - b.ls L(last64) -L(loop64): - stp A_l, A_h, [dst, 16] - ldp A_l, A_h, [src, 16] - stp B_l, B_h, [dst, 32] - ldp B_l, B_h, [src, 32] - stp C_l, C_h, [dst, 48] - ldp C_l, C_h, [src, 48] - stp D_l, D_h, [dst, 64]! - ldp D_l, D_h, [src, 64]! - subs count, count, 64 - b.hi L(loop64) - - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the end even if - there is just 1 byte left. 
*/ -L(last64): - ldp E_l, E_h, [srcend, -64] - stp A_l, A_h, [dst, 16] - ldp A_l, A_h, [srcend, -48] - stp B_l, B_h, [dst, 32] - ldp B_l, B_h, [srcend, -32] - stp C_l, C_h, [dst, 48] - ldp C_l, C_h, [srcend, -16] - stp D_l, D_h, [dst, 64] - stp E_l, E_h, [dstend, -64] - stp A_l, A_h, [dstend, -48] - stp B_l, B_h, [dstend, -32] - stp C_l, C_h, [dstend, -16] - ret - - .p2align 4 -L(move_long): - cbz tmp1, 3f - - add srcend, src, count - add dstend, dstin, count - - /* Align dstend to 16 byte alignment so that we don't cross cache line - boundaries on both loads and stores. There are at least 96 bytes - to copy, so copy 16 bytes unaligned and then align. The loop - copies 64 bytes per iteration and prefetches one iteration ahead. */ - - and tmp1, dstend, 15 - ldp D_l, D_h, [srcend, -16] - sub srcend, srcend, tmp1 - sub count, count, tmp1 - ldp A_l, A_h, [srcend, -16] - stp D_l, D_h, [dstend, -16] - ldp B_l, B_h, [srcend, -32] - ldp C_l, C_h, [srcend, -48] - ldp D_l, D_h, [srcend, -64]! - sub dstend, dstend, tmp1 - subs count, count, 128 - b.ls 2f - - nop -1: - stp A_l, A_h, [dstend, -16] - ldp A_l, A_h, [srcend, -16] - stp B_l, B_h, [dstend, -32] - ldp B_l, B_h, [srcend, -32] - stp C_l, C_h, [dstend, -48] - ldp C_l, C_h, [srcend, -48] - stp D_l, D_h, [dstend, -64]! - ldp D_l, D_h, [srcend, -64]! - subs count, count, 64 - b.hi 1b - - /* Write the last full set of 64 bytes. The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the start even if - there is just 1 byte left. 
*/ -2: - ldp G_l, G_h, [src, 48] - stp A_l, A_h, [dstend, -16] - ldp A_l, A_h, [src, 32] - stp B_l, B_h, [dstend, -32] - ldp B_l, B_h, [src, 16] - stp C_l, C_h, [dstend, -48] - ldp C_l, C_h, [src] - stp D_l, D_h, [dstend, -64] - stp G_l, G_h, [dstin, 48] - stp A_l, A_h, [dstin, 32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstin] -3: ret - -END (__memcpy_thunderx) diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S deleted file mode 100644 index a3d79aa..0000000 --- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S +++ /dev/null @@ -1,457 +0,0 @@ -/* A Thunderx2 Optimized memcpy implementation for AARCH64. - Copyright (C) 2018-2024 Free Software Foundation, Inc. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -/* Assumptions: - * - * ARMv8-a, AArch64, unaligned accesses. 
- * - */ - -#define dstin x0 -#define src x1 -#define count x2 -#define dst x3 -#define srcend x4 -#define dstend x5 -#define tmp2 x6 -#define tmp3 x7 -#define tmp3w w7 -#define A_l x6 -#define A_lw w6 -#define A_h x7 -#define A_hw w7 -#define B_l x8 -#define B_lw w8 -#define B_h x9 -#define C_l x10 -#define C_h x11 -#define D_l x12 -#define D_h x13 -#define E_l src -#define E_h count -#define F_l srcend -#define F_h dst -#define G_l count -#define G_h dst -#define tmp1 x14 - -#define A_q q0 -#define B_q q1 -#define C_q q2 -#define D_q q3 -#define E_q q4 -#define F_q q5 -#define G_q q6 -#define H_q q7 -#define I_q q16 -#define J_q q17 - -#define A_v v0 -#define B_v v1 -#define C_v v2 -#define D_v v3 -#define E_v v4 -#define F_v v5 -#define G_v v6 -#define H_v v7 -#define I_v v16 -#define J_v v17 - -/* Overlapping large forward memmoves use a loop that copies backwards. - Otherwise memcpy is used. Small moves branch to memcopy16 directly. - The longer memcpy cases fall through to the memcpy head. -*/ - -ENTRY (__memmove_thunderx2) - - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - - add srcend, src, count - cmp count, 16 - b.ls L(memcopy16) - sub tmp1, dstin, src - cmp count, 96 - ccmp tmp1, count, 2, hi - b.lo L(move_long) - -END (__memmove_thunderx2) - - -/* Copies are split into 3 main cases: small copies of up to 16 bytes, - medium copies of 17..96 bytes which are fully unrolled. Large copies - of more than 96 bytes align the destination and use load-and-merge - approach in the case src and dst addresses are unaligned not evenly, - so that, actual loads and stores are always aligned. - Large copies use the loops processing 64 bytes per iteration for - unaligned case and 128 bytes per iteration for aligned ones. 
-*/ - -#define MEMCPY_PREFETCH_LDR 640 - -ENTRY (__memcpy_thunderx2) - - PTR_ARG (0) - PTR_ARG (1) - SIZE_ARG (2) - - add srcend, src, count - cmp count, 16 - b.ls L(memcopy16) - ldr A_q, [src], #16 - add dstend, dstin, count - and tmp1, src, 15 - cmp count, 96 - b.hi L(memcopy_long) - - /* Medium copies: 17..96 bytes. */ - ldr E_q, [srcend, -16] - cmp count, 64 - b.gt L(memcpy_copy96) - cmp count, 48 - b.le L(bytes_17_to_48) - /* 49..64 bytes */ - ldp B_q, C_q, [src] - str E_q, [dstend, -16] - stp A_q, B_q, [dstin] - str C_q, [dstin, 32] - ret - -L(bytes_17_to_48): - /* 17..48 bytes*/ - cmp count, 32 - b.gt L(bytes_32_to_48) - /* 17..32 bytes*/ - str A_q, [dstin] - str E_q, [dstend, -16] - ret - -L(bytes_32_to_48): - /* 32..48 */ - ldr B_q, [src] - str A_q, [dstin] - str E_q, [dstend, -16] - str B_q, [dstin, 16] - ret - - .p2align 4 - /* Small copies: 0..16 bytes. */ -L(memcopy16): - cmp count, 8 - b.lo L(bytes_0_to_8) - ldr A_l, [src] - ldr A_h, [srcend, -8] - add dstend, dstin, count - str A_l, [dstin] - str A_h, [dstend, -8] - ret - .p2align 4 - -L(bytes_0_to_8): - tbz count, 2, L(bytes_0_to_3) - ldr A_lw, [src] - ldr A_hw, [srcend, -4] - add dstend, dstin, count - str A_lw, [dstin] - str A_hw, [dstend, -4] - ret - - /* Copy 0..3 bytes. Use a branchless sequence that copies the same - byte 3 times if count==1, or the 2nd byte twice if count==2. */ -L(bytes_0_to_3): - cbz count, 1f - lsr tmp1, count, 1 - ldrb A_lw, [src] - ldrb A_hw, [srcend, -1] - add dstend, dstin, count - ldrb B_lw, [src, tmp1] - strb B_lw, [dstin, tmp1] - strb A_hw, [dstend, -1] - strb A_lw, [dstin] -1: - ret - - .p2align 4 - -L(memcpy_copy96): - /* Copying 65..96 bytes. A_q (first 16 bytes) and - E_q(last 16 bytes) are already loaded. The size - is large enough to benefit from aligned loads */ - bic src, src, 15 - ldp B_q, C_q, [src] - /* Loaded 64 bytes, second 16-bytes chunk can be - overlapping with the first chunk by tmp1 bytes. - Stored 16 bytes. 
*/ - sub dst, dstin, tmp1 - add count, count, tmp1 - /* The range of count being [65..96] becomes [65..111] - after tmp [0..15] gets added to it, - count now is <bytes-left-to-load>+48 */ - cmp count, 80 - b.gt L(copy96_medium) - ldr D_q, [src, 32] - stp B_q, C_q, [dst, 16] - str D_q, [dst, 48] - str A_q, [dstin] - str E_q, [dstend, -16] - ret - - .p2align 4 -L(copy96_medium): - ldp D_q, G_q, [src, 32] - cmp count, 96 - b.gt L(copy96_large) - stp B_q, C_q, [dst, 16] - stp D_q, G_q, [dst, 48] - str A_q, [dstin] - str E_q, [dstend, -16] - ret - -L(copy96_large): - ldr F_q, [src, 64] - str B_q, [dst, 16] - stp C_q, D_q, [dst, 32] - stp G_q, F_q, [dst, 64] - str A_q, [dstin] - str E_q, [dstend, -16] - ret - - .p2align 4 -L(memcopy_long): - bic src, src, 15 - ldp B_q, C_q, [src], #32 - sub dst, dstin, tmp1 - add count, count, tmp1 - add dst, dst, 16 - and tmp1, dst, 15 - ldp D_q, E_q, [src], #32 - str A_q, [dstin] - - /* Already loaded 64+16 bytes. Check if at - least 64 more bytes left */ - subs count, count, 64+64+16 - b.lt L(loop128_exit0) - cmp count, MEMCPY_PREFETCH_LDR + 64 + 32 - b.lt L(loop128) - cbnz tmp1, L(dst_unaligned) - sub count, count, MEMCPY_PREFETCH_LDR + 64 + 32 - - .p2align 4 - -L(loop128_prefetch): - prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] - ldp F_q, G_q, [src], #32 - stp B_q, C_q, [dst], #32 - ldp H_q, I_q, [src], #32 - prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR] - ldp B_q, C_q, [src], #32 - stp D_q, E_q, [dst], #32 - ldp D_q, E_q, [src], #32 - stp F_q, G_q, [dst], #32 - stp H_q, I_q, [dst], #32 - subs count, count, 128 - b.ge L(loop128_prefetch) - - add count, count, MEMCPY_PREFETCH_LDR + 64 + 32 - .p2align 4 -L(loop128): - ldp F_q, G_q, [src], #32 - ldp H_q, I_q, [src], #32 - stp B_q, C_q, [dst], #32 - stp D_q, E_q, [dst], #32 - subs count, count, 64 - b.lt L(loop128_exit1) - ldp B_q, C_q, [src], #32 - ldp D_q, E_q, [src], #32 - stp F_q, G_q, [dst], #32 - stp H_q, I_q, [dst], #32 - subs count, count, 64 - b.ge L(loop128) -L(loop128_exit0): - 
ldp F_q, G_q, [srcend, -64] - ldp H_q, I_q, [srcend, -32] - stp B_q, C_q, [dst], #32 - stp D_q, E_q, [dst] - stp F_q, G_q, [dstend, -64] - stp H_q, I_q, [dstend, -32] - ret -L(loop128_exit1): - ldp B_q, C_q, [srcend, -64] - ldp D_q, E_q, [srcend, -32] - stp F_q, G_q, [dst], #32 - stp H_q, I_q, [dst] - stp B_q, C_q, [dstend, -64] - stp D_q, E_q, [dstend, -32] - ret - -L(dst_unaligned_tail): - ldp C_q, D_q, [srcend, -64] - ldp E_q, F_q, [srcend, -32] - stp A_q, B_q, [dst], #32 - stp H_q, I_q, [dst], #16 - str G_q, [dst, tmp1] - stp C_q, D_q, [dstend, -64] - stp E_q, F_q, [dstend, -32] - ret - -L(dst_unaligned): - /* For the unaligned store case the code loads two - aligned chunks and then merges them using ext - instruction. This can be up to 30% faster than - the the simple unaligned store access. - - Current state: tmp1 = dst % 16; C_q, D_q, E_q - contains data yet to be stored. src and dst points - to next-to-be-processed data. A_q, B_q contains - data already stored before, count = bytes left to - be load decremented by 64. - - The control is passed here if at least 64 bytes left - to be loaded. The code does two aligned loads and then - extracts (16-tmp1) bytes from the first register and - tmp1 bytes from the next register forming the value - for the aligned store. - - As ext instruction can only have it's index encoded - as immediate. 15 code chunks process each possible - index value. Computed goto is used to reach the - required code. 
*/ - - /* Store the 16 bytes to dst and align dst for further - operations, several bytes will be stored at this - address once more */ - - ldp F_q, G_q, [src], #32 - stp B_q, C_q, [dst], #32 - bic dst, dst, 15 - sub count, count, 32 - adrp tmp2, L(ext_table) - add tmp2, tmp2, :lo12:L(ext_table) - add tmp2, tmp2, tmp1, LSL #2 - ldr tmp3w, [tmp2] - add tmp2, tmp2, tmp3w, SXTW - br tmp2 - -.p2align 4 - /* to make the loop in each chunk 16-bytes aligned */ - nop -#define EXT_CHUNK(shft) \ -L(ext_size_ ## shft):;\ - ext A_v.16b, C_v.16b, D_v.16b, 16-shft;\ - ext B_v.16b, D_v.16b, E_v.16b, 16-shft;\ - ext H_v.16b, E_v.16b, F_v.16b, 16-shft;\ -1:;\ - stp A_q, B_q, [dst], #32;\ - prfm pldl1strm, [src, MEMCPY_PREFETCH_LDR];\ - ldp C_q, D_q, [src], #32;\ - ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\ - stp H_q, I_q, [dst], #32;\ - ext A_v.16b, G_v.16b, C_v.16b, 16-shft;\ - ext B_v.16b, C_v.16b, D_v.16b, 16-shft;\ - ldp F_q, G_q, [src], #32;\ - ext H_v.16b, D_v.16b, F_v.16b, 16-shft;\ - subs count, count, 64;\ - b.ge 1b;\ -2:;\ - ext I_v.16b, F_v.16b, G_v.16b, 16-shft;\ - b L(dst_unaligned_tail); - -EXT_CHUNK(1) -EXT_CHUNK(2) -EXT_CHUNK(3) -EXT_CHUNK(4) -EXT_CHUNK(5) -EXT_CHUNK(6) -EXT_CHUNK(7) -EXT_CHUNK(8) -EXT_CHUNK(9) -EXT_CHUNK(10) -EXT_CHUNK(11) -EXT_CHUNK(12) -EXT_CHUNK(13) -EXT_CHUNK(14) -EXT_CHUNK(15) - -L(move_long): - .p2align 4 -1: - cbz tmp1, 3f - - add srcend, src, count - add dstend, dstin, count - - and tmp1, srcend, 15 - ldr D_q, [srcend, -16] - sub srcend, srcend, tmp1 - sub count, count, tmp1 - ldp A_q, B_q, [srcend, -32] - str D_q, [dstend, -16] - ldp C_q, D_q, [srcend, -64]! - sub dstend, dstend, tmp1 - subs count, count, 128 - b.ls 2f - - .p2align 4 -1: - subs count, count, 64 - stp A_q, B_q, [dstend, -32] - ldp A_q, B_q, [srcend, -32] - stp C_q, D_q, [dstend, -64]! - ldp C_q, D_q, [srcend, -64]! - b.hi 1b - - /* Write the last full set of 64 bytes. 
The remainder is at most 64 - bytes, so it is safe to always copy 64 bytes from the start even if - there is just 1 byte left. */ -2: - ldp E_q, F_q, [src, 32] - ldp G_q, H_q, [src] - stp A_q, B_q, [dstend, -32] - stp C_q, D_q, [dstend, -64] - stp E_q, F_q, [dstin, 32] - stp G_q, H_q, [dstin] -3: ret - - -END (__memcpy_thunderx2) - .section .rodata - .p2align 4 - -L(ext_table): - /* The first entry is for the alignment of 0 and is never - actually used (could be any value). */ - .word 0 - .word L(ext_size_1) -. - .word L(ext_size_2) -. - .word L(ext_size_3) -. - .word L(ext_size_4) -. - .word L(ext_size_5) -. - .word L(ext_size_6) -. - .word L(ext_size_7) -. - .word L(ext_size_8) -. - .word L(ext_size_9) -. - .word L(ext_size_10) -. - .word L(ext_size_11) -. - .word L(ext_size_12) -. - .word L(ext_size_13) -. - .word L(ext_size_14) -. - .word L(ext_size_15) -. diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c index fe95037..47b7268 100644 --- a/sysdeps/aarch64/multiarch/memmove.c +++ b/sysdeps/aarch64/multiarch/memmove.c @@ -1,5 +1,5 @@ /* Multiple versions of memmove. AARCH64 version. - Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright (C) 2017-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -29,8 +29,6 @@ extern __typeof (__redirect_memmove) __libc_memmove; extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden; -extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden; -extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden; extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden; extern __typeof (__redirect_memmove) __memmove_sve attribute_hidden; extern __typeof (__redirect_memmove) __memmove_mops attribute_hidden; @@ -50,12 +48,6 @@ select_memmove_ifunc (void) return prefer_sve_ifuncs ? 
__memmove_sve : __memmove_generic; } - if (IS_THUNDERX (midr)) - return __memmove_thunderx; - - if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)) - return __memmove_thunderx2; - return __memmove_generic; } diff --git a/sysdeps/aarch64/multiarch/memmove_mops.S b/sysdeps/aarch64/multiarch/memmove_mops.S index 7df0d22..2768096 100644 --- a/sysdeps/aarch64/multiarch/memmove_mops.S +++ b/sysdeps/aarch64/multiarch/memmove_mops.S @@ -1,5 +1,5 @@ /* Optimized memmove for MOPS. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c index bd063c1..f6194e4 100644 --- a/sysdeps/aarch64/multiarch/memset.c +++ b/sysdeps/aarch64/multiarch/memset.c @@ -1,5 +1,5 @@ /* Multiple versions of memset. AARCH64 version. - Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright (C) 2017-2025 Free Software Foundation, Inc. Copyright The GNU Toolchain Authors. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S index 2e6d882..7f36997 100644 --- a/sysdeps/aarch64/multiarch/memset_a64fx.S +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S @@ -1,5 +1,5 @@ /* Optimized memset for Fujitsu A64FX processor. - Copyright (C) 2021-2024 Free Software Foundation, Inc. + Copyright (C) 2021-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -18,7 +18,6 @@ <https://www.gnu.org/licenses/>. 
*/ #include <sysdep.h> -#include <sysdeps/aarch64/memset-reg.h> /* Assumptions: * @@ -36,6 +35,14 @@ .arch armv8.2-a+sve +#define dstin x0 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define tmp1 x5 +#define tmp2 x6 + .macro st1b_unroll first=0, last=7 st1b z0.b, p0, [dst, \first, mul vl] .if \last-\first diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S index 6d714ed..d1d9be6 100644 --- a/sysdeps/aarch64/multiarch/memset_emag.S +++ b/sysdeps/aarch64/multiarch/memset_emag.S @@ -1,5 +1,5 @@ /* Optimized memset for AmpereComputing emag processor. - Copyright (C) 2018-2024 Free Software Foundation, Inc. + Copyright (C) 2018-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -18,7 +18,6 @@ <https://www.gnu.org/licenses/>. */ #include <sysdep.h> -#include "memset-reg.h" /* Assumptions: * @@ -26,6 +25,13 @@ * */ +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 + ENTRY (__memset_emag) PTR_ARG (0) diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S index e125a5e..115a480 100644 --- a/sysdeps/aarch64/multiarch/memset_generic.S +++ b/sysdeps/aarch64/multiarch/memset_generic.S @@ -1,5 +1,5 @@ /* Memset for aarch64, default version for internal use. - Copyright (C) 2017-2024 Free Software Foundation, Inc. + Copyright (C) 2017-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S index 7b21550..fb3202b 100644 --- a/sysdeps/aarch64/multiarch/memset_kunpeng.S +++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S @@ -1,5 +1,5 @@ /* Optimized memset for Huawei Kunpeng processor. - Copyright (C) 2012-2024 Free Software Foundation, Inc. + Copyright (C) 2012-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
@@ -18,7 +18,6 @@ <https://www.gnu.org/licenses/>. */ #include <sysdep.h> -#include <sysdeps/aarch64/memset-reg.h> /* Assumptions: * @@ -26,6 +25,12 @@ * */ +#define dstin x0 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 + ENTRY (__memset_kunpeng) PTR_ARG (0) diff --git a/sysdeps/aarch64/multiarch/memset_mops.S b/sysdeps/aarch64/multiarch/memset_mops.S index e879c81..83cf378 100644 --- a/sysdeps/aarch64/multiarch/memset_mops.S +++ b/sysdeps/aarch64/multiarch/memset_mops.S @@ -1,5 +1,5 @@ /* Optimized memset for MOPS. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S index b43a43b..8e032f1 100644 --- a/sysdeps/aarch64/multiarch/memset_oryon1.S +++ b/sysdeps/aarch64/multiarch/memset_oryon1.S @@ -1,5 +1,5 @@ /* Optimized memset for Qualcomm's oyron-1 core. - Copyright (C) 2018-2024 Free Software Foundation, Inc. + Copyright (C) 2018-2025 Free Software Foundation, Inc. Copyright The GNU Toolchain Authors. This file is part of the GNU C Library. @@ -19,12 +19,18 @@ <https://www.gnu.org/licenses/>. */ #include <sysdep.h> -#include "memset-reg.h" /* Assumptions: ARMv8-a, AArch64, unaligned accesses */ +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 + ENTRY (__memset_oryon1) PTR_ARG (0) @@ -87,8 +93,6 @@ L(set_long): cmp count, 256 ccmp valw, 0, 0, cs b.eq L(try_zva) - cmp count, #32768 - b.hi L(set_long_with_nontemp) /* Small-size or non-zero memset does not use DC ZVA. */ sub count, dstend, dst @@ -111,30 +115,6 @@ L(set_long): stp val, val, [dstend, -16] ret -L(set_long_with_nontemp): - /* Small-size or non-zero memset does not use DC ZVA. */ - sub count, dstend, dst - - /* Adjust count and bias for loop. 
By subtracting extra 1 from count, - it is easy to use tbz instruction to check whether loop tailing - count is less than 33 bytes, so as to bypass 2 unnecessary stps. */ - sub count, count, 64+16+1 - -1: stnp val, val, [dst, 16] - stnp val, val, [dst, 32] - stnp val, val, [dst, 48] - stnp val, val, [dst, 64] - add dst, dst, #64 - subs count, count, 64 - b.hs 1b - - tbz count, 5, 1f /* Remaining count is less than 33 bytes? */ - stnp val, val, [dst, 16] - stnp val, val, [dst, 32] -1: stnp val, val, [dstend, -32] - stnp val, val, [dstend, -16] - ret - L(try_zva): /* Write the first and last 64 byte aligned block using stp rather than using DC ZVA as it is faster. */ diff --git a/sysdeps/aarch64/multiarch/memset_zva64.S b/sysdeps/aarch64/multiarch/memset_zva64.S index fb67cb1..91640b7 100644 --- a/sysdeps/aarch64/multiarch/memset_zva64.S +++ b/sysdeps/aarch64/multiarch/memset_zva64.S @@ -1,5 +1,5 @@ /* Optimized memset for zva size = 64. - Copyright (C) 2023-2024 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c index f78592c..c7ee383 100644 --- a/sysdeps/aarch64/multiarch/strlen.c +++ b/sysdeps/aarch64/multiarch/strlen.c @@ -1,5 +1,5 @@ /* Multiple versions of strlen. AARCH64 version. - Copyright (C) 2018-2024 Free Software Foundation, Inc. + Copyright (C) 2018-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S index 67dcc94..4eb91c9 100644 --- a/sysdeps/aarch64/multiarch/strlen_asimd.S +++ b/sysdeps/aarch64/multiarch/strlen_asimd.S @@ -1,5 +1,5 @@ /* Optimized strlen implementation using SIMD. - Copyright (C) 2018-2024 Free Software Foundation, Inc. + Copyright (C) 2018-2025 Free Software Foundation, Inc. 
This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/multiarch/strlen_generic.S b/sysdeps/aarch64/multiarch/strlen_generic.S index ceeafe9..270586c 100644 --- a/sysdeps/aarch64/multiarch/strlen_generic.S +++ b/sysdeps/aarch64/multiarch/strlen_generic.S @@ -1,5 +1,5 @@ /* A Generic Optimized strlen implementation for AARCH64. - Copyright (C) 2018-2024 Free Software Foundation, Inc. + Copyright (C) 2018-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h b/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h index 5fe9577..7f11e82 100644 --- a/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h +++ b/sysdeps/aarch64/nptl/bits/pthreadtypes-arch.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2024 Free Software Foundation, Inc. +/* Copyright (C) 2002-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/nptl/bits/semaphore.h b/sysdeps/aarch64/nptl/bits/semaphore.h index 216ee3d..45aabcb 100644 --- a/sysdeps/aarch64/nptl/bits/semaphore.h +++ b/sysdeps/aarch64/nptl/bits/semaphore.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2024 Free Software Foundation, Inc. +/* Copyright (C) 2002-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/nptl/bits/struct_rwlock.h b/sysdeps/aarch64/nptl/bits/struct_rwlock.h index 84bdc0f..d15d053 100644 --- a/sysdeps/aarch64/nptl/bits/struct_rwlock.h +++ b/sysdeps/aarch64/nptl/bits/struct_rwlock.h @@ -1,5 +1,5 @@ /* AArch64 internal rwlock struct definitions. - Copyright (C) 2019-2024 Free Software Foundation, Inc. + Copyright (C) 2019-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
diff --git a/sysdeps/aarch64/nptl/pthreaddef.h b/sysdeps/aarch64/nptl/pthreaddef.h index 88f8602..c03593d 100644 --- a/sysdeps/aarch64/nptl/pthreaddef.h +++ b/sysdeps/aarch64/nptl/pthreaddef.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2024 Free Software Foundation, Inc. +/* Copyright (C) 2002-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/nptl/tls.h b/sysdeps/aarch64/nptl/tls.h index fc9776d..ede7c0d 100644 --- a/sysdeps/aarch64/nptl/tls.h +++ b/sysdeps/aarch64/nptl/tls.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2005-2024 Free Software Foundation, Inc. +/* Copyright (C) 2005-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/rawmemchr.S b/sysdeps/aarch64/rawmemchr.S index cdfc6fc..6b8fab1 100644 --- a/sysdeps/aarch64/rawmemchr.S +++ b/sysdeps/aarch64/rawmemchr.S @@ -1,6 +1,6 @@ /* rawmemchr - find a character in a memory zone - Copyright (C) 2015-2024 Free Software Foundation, Inc. + Copyright (C) 2015-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/rtld-global-offsets.sym b/sysdeps/aarch64/rtld-global-offsets.sym index 23cdaf7..6c0690b 100644 --- a/sysdeps/aarch64/rtld-global-offsets.sym +++ b/sysdeps/aarch64/rtld-global-offsets.sym @@ -3,8 +3,13 @@ #include <ldsodefs.h> #define GLRO_offsetof(name) offsetof (struct rtld_global_ro, _##name) +#define GL_offsetof(name) offsetof (struct rtld_global, _##name) -- Offsets of _rtld_global_ro in libc.so GLRO_DL_HWCAP_OFFSET GLRO_offsetof (dl_hwcap) GLRO_DL_HWCAP2_OFFSET GLRO_offsetof (dl_hwcap2) + +-- Offsets of _rtld_global in libc.so + +GL_DL_AARCH64_GCS_OFFSET GL_offsetof (dl_aarch64_gcs) diff --git a/sysdeps/aarch64/setjmp.S b/sysdeps/aarch64/setjmp.S index 43fdb1b..b630ca0 100644 --- a/sysdeps/aarch64/setjmp.S +++ b/sysdeps/aarch64/setjmp.S @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. 
+/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -57,6 +57,16 @@ ENTRY (__sigsetjmp) stp d10, d11, [x0, #JB_D10<<3] stp d12, d13, [x0, #JB_D12<<3] stp d14, d15, [x0, #JB_D14<<3] + + /* GCS support. */ + mov x16, 1 + CHKFEAT_X16 + tbnz x16, 0, L(gcs_done) + MRS_GCSPR (x2) + add x2, x2, 8 /* GCS state right after setjmp returns. */ + str x2, [x0, #JB_GCSPR] +L(gcs_done): + #ifdef PTR_MANGLE mov x4, sp PTR_MANGLE (5, 4, 3, 2) diff --git a/sysdeps/aarch64/sfp-machine.h b/sysdeps/aarch64/sfp-machine.h index a9ecdbf..b41a946 100644 --- a/sysdeps/aarch64/sfp-machine.h +++ b/sysdeps/aarch64/sfp-machine.h @@ -74,7 +74,7 @@ do { \ const float fp_1e32 = 1.0e32f; \ const float fp_zero = 0.0; \ const float fp_one = 1.0; \ - unsigned fpsr; \ + uint64_t fpsr; \ if (_fex & FP_EX_INVALID) \ { \ __asm__ __volatile__ ("fdiv\ts0, %s0, %s0" \ diff --git a/sysdeps/aarch64/sotruss-lib.c b/sysdeps/aarch64/sotruss-lib.c index 0c99fd7..b57ad71 100644 --- a/sysdeps/aarch64/sotruss-lib.c +++ b/sysdeps/aarch64/sotruss-lib.c @@ -1,5 +1,5 @@ /* Override generic sotruss-lib.c to define actual functions for AArch64. - Copyright (C) 2012-2024 Free Software Foundation, Inc. + Copyright (C) 2012-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/stackinfo.h b/sysdeps/aarch64/stackinfo.h deleted file mode 100644 index 5a4dce7..0000000 --- a/sysdeps/aarch64/stackinfo.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (C) 2001-2024 Free Software Foundation, Inc. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation; either version 2.1 of the - License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -/* This file contains a bit of information about the stack allocation - of the processor. */ - -#ifndef _STACKINFO_H -#define _STACKINFO_H 1 - -#include <elf.h> - -/* On AArch64 the stack grows down. */ -#define _STACK_GROWS_DOWN 1 - -/* Default to a non-executable stack. */ -#define DEFAULT_STACK_PERMS (PF_R|PF_W) - -#endif /* stackinfo.h */ diff --git a/sysdeps/aarch64/start.S b/sysdeps/aarch64/start.S index 5d187f3..ef6b5ad 100644 --- a/sysdeps/aarch64/start.S +++ b/sysdeps/aarch64/start.S @@ -1,4 +1,4 @@ -/* Copyright (C) 1995-2024 Free Software Foundation, Inc. +/* Copyright (C) 1995-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/stpcpy.S b/sysdeps/aarch64/stpcpy.S index daf5fbb..8c84cef 100644 --- a/sysdeps/aarch64/stpcpy.S +++ b/sysdeps/aarch64/stpcpy.S @@ -1,5 +1,5 @@ /* stpcpy - copy a string returning pointer to end. - Copyright (C) 2015-2024 Free Software Foundation, Inc. + Copyright (C) 2015-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S index ca4c99e..8fb7c33 100644 --- a/sysdeps/aarch64/strchr.S +++ b/sysdeps/aarch64/strchr.S @@ -1,6 +1,6 @@ /* strchr - find a character in a string - Copyright (C) 2014-2024 Free Software Foundation, Inc. + Copyright (C) 2014-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S index e1a1c7e..7862cdf 100644 --- a/sysdeps/aarch64/strchrnul.S +++ b/sysdeps/aarch64/strchrnul.S @@ -1,6 +1,6 @@ /* strchrnul - find a character or nul in a string - Copyright (C) 2014-2024 Free Software Foundation, Inc. + Copyright (C) 2014-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S index 47f6fb1..d724586 100644 --- a/sysdeps/aarch64/strcmp.S +++ b/sysdeps/aarch64/strcmp.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2012-2024 Free Software Foundation, Inc. +/* Copyright (C) 2012-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S index 7053540..5477597 100644 --- a/sysdeps/aarch64/strcpy.S +++ b/sysdeps/aarch64/strcpy.S @@ -1,5 +1,5 @@ /* strcpy/stpcpy - copy a string returning pointer to start/end. - Copyright (C) 2013-2024 Free Software Foundation, Inc. + Copyright (C) 2013-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S index 352fb40..a7df056 100644 --- a/sysdeps/aarch64/strlen.S +++ b/sysdeps/aarch64/strlen.S @@ -1,5 +1,5 @@ /* Generic optimized strlen using SIMD. - Copyright (C) 2012-2024 Free Software Foundation, Inc. + Copyright (C) 2012-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S index 5da88d1..da7545f 100644 --- a/sysdeps/aarch64/strncmp.S +++ b/sysdeps/aarch64/strncmp.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2013-2024 Free Software Foundation, Inc. +/* Copyright (C) 2013-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S index e4fb350..9c40639 100644 --- a/sysdeps/aarch64/strnlen.S +++ b/sysdeps/aarch64/strnlen.S @@ -1,6 +1,6 @@ /* strnlen - calculate the length of a string with limit. - Copyright (C) 2013-2024 Free Software Foundation, Inc. + Copyright (C) 2013-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S index e52c9b2..869b1cd 100644 --- a/sysdeps/aarch64/strrchr.S +++ b/sysdeps/aarch64/strrchr.S @@ -1,6 +1,6 @@ /* strrchr: find the last instance of a character in a string. - Copyright (C) 2014-2024 Free Software Foundation, Inc. + Copyright (C) 2014-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/sys/ifunc.h b/sysdeps/aarch64/sys/ifunc.h index eeebae1..7781b37 100644 --- a/sysdeps/aarch64/sys/ifunc.h +++ b/sysdeps/aarch64/sys/ifunc.h @@ -1,5 +1,5 @@ /* Definitions used by AArch64 indirect function resolvers. - Copyright (C) 2019-2024 Free Software Foundation, Inc. + Copyright (C) 2019-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h index 464674e..036eb12 100644 --- a/sysdeps/aarch64/sysdep.h +++ b/sysdeps/aarch64/sysdep.h @@ -1,4 +1,4 @@ -/* Copyright (C) 1997-2024 Free Software Foundation, Inc. +/* Copyright (C) 1997-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -74,10 +74,18 @@ strip_pac (void *p) #define PACIASP hint 25 #define AUTIASP hint 29 +/* Guarded Control Stack support. */ +#define CHKFEAT_X16 hint 40 +#define MRS_GCSPR(x) mrs x, s3_3_c2_c5_1 +#define GCSPOPM(x) sysl x, #3, c7, c7, #1 +#define GCSSS1(x) sys #3, c7, c7, #2, x +#define GCSSS2(x) sysl x, #3, c7, c7, #3 + /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code. 
*/ #define FEATURE_1_AND 0xc0000000 #define FEATURE_1_BTI 1 #define FEATURE_1_PAC 2 +#define FEATURE_1_GCS 4 /* Add a NT_GNU_PROPERTY_TYPE_0 note. */ #define GNU_PROPERTY(type, value) \ @@ -96,9 +104,9 @@ strip_pac (void *p) /* Add GNU property note with the supported features to all asm code where sysdep.h is included. */ #if HAVE_AARCH64_BTI && HAVE_AARCH64_PAC_RET -GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC|FEATURE_1_GCS) #elif HAVE_AARCH64_BTI -GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI) +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_GCS) #endif /* Define an entry point visible from C. */ diff --git a/sysdeps/aarch64/tlsdesc.c b/sysdeps/aarch64/tlsdesc.c index 105e4cf..5dc9ed4 100644 --- a/sysdeps/aarch64/tlsdesc.c +++ b/sysdeps/aarch64/tlsdesc.c @@ -1,6 +1,6 @@ /* Manage TLS descriptors. AArch64 version. - Copyright (C) 2011-2024 Free Software Foundation, Inc. + Copyright (C) 2011-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/tst-audit.h b/sysdeps/aarch64/tst-audit.h index 95e034a..cf05ba5 100644 --- a/sysdeps/aarch64/tst-audit.h +++ b/sysdeps/aarch64/tst-audit.h @@ -1,6 +1,6 @@ /* Definitions for testing PLT entry/exit auditing. AArch64 version. - Copyright (C) 2005-2024 Free Software Foundation, Inc. + Copyright (C) 2005-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. diff --git a/sysdeps/aarch64/tst-audit26.c b/sysdeps/aarch64/tst-audit26.c index cf4026b..8e00cb0 100644 --- a/sysdeps/aarch64/tst-audit26.c +++ b/sysdeps/aarch64/tst-audit26.c @@ -1,5 +1,5 @@ /* Check LD_AUDIT for aarch64 ABI specifics. - Copyright (C) 2022-2024 Free Software Foundation, Inc. + Copyright (C) 2022-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/tst-audit26mod.c b/sysdeps/aarch64/tst-audit26mod.c index 67f8531..c5d0693 100644 --- a/sysdeps/aarch64/tst-audit26mod.c +++ b/sysdeps/aarch64/tst-audit26mod.c @@ -1,5 +1,5 @@ /* Check LD_AUDIT for aarch64 ABI specifics. - Copyright (C) 2022-2024 Free Software Foundation, Inc. + Copyright (C) 2022-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/tst-audit26mod.h b/sysdeps/aarch64/tst-audit26mod.h index 4a3f956..6c8ed7f 100644 --- a/sysdeps/aarch64/tst-audit26mod.h +++ b/sysdeps/aarch64/tst-audit26mod.h @@ -1,5 +1,5 @@ /* Check LD_AUDIT for aarch64 specific ABI. - Copyright (C) 2022-2024 Free Software Foundation, Inc. + Copyright (C) 2022-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/tst-audit27.c b/sysdeps/aarch64/tst-audit27.c index 4f34581..b7778aa 100644 --- a/sysdeps/aarch64/tst-audit27.c +++ b/sysdeps/aarch64/tst-audit27.c @@ -1,5 +1,5 @@ /* Check LD_AUDIT for aarch64 ABI specifics. - Copyright (C) 2022-2024 Free Software Foundation, Inc. + Copyright (C) 2022-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/tst-audit27mod.c b/sysdeps/aarch64/tst-audit27mod.c index a853c28..03c5628 100644 --- a/sysdeps/aarch64/tst-audit27mod.c +++ b/sysdeps/aarch64/tst-audit27mod.c @@ -1,5 +1,5 @@ /* Check LD_AUDIT for aarch64 ABI specifics. - Copyright (C) 2022-2024 Free Software Foundation, Inc. + Copyright (C) 2022-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/tst-audit27mod.h b/sysdeps/aarch64/tst-audit27mod.h index 008f5a6..5ef272f 100644 --- a/sysdeps/aarch64/tst-audit27mod.h +++ b/sysdeps/aarch64/tst-audit27mod.h @@ -1,5 +1,5 @@ /* Check LD_AUDIT for aarch64 specific ABI. - Copyright (C) 2022-2024 Free Software Foundation, Inc. + Copyright (C) 2022-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/tst-auditmod26.c b/sysdeps/aarch64/tst-auditmod26.c index aecb302..1dd1412 100644 --- a/sysdeps/aarch64/tst-auditmod26.c +++ b/sysdeps/aarch64/tst-auditmod26.c @@ -1,5 +1,5 @@ /* Check LD_AUDIT for aarch64 specific ABI. - Copyright (C) 2022-2024 Free Software Foundation, Inc. + Copyright (C) 2022-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/tst-auditmod27.c b/sysdeps/aarch64/tst-auditmod27.c index 547ca9f..820acad 100644 --- a/sysdeps/aarch64/tst-auditmod27.c +++ b/sysdeps/aarch64/tst-auditmod27.c @@ -1,5 +1,5 @@ /* Check LD_AUDIT for aarch64 specific ABI. - Copyright (C) 2022-2024 Free Software Foundation, Inc. + Copyright (C) 2022-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/tst-ifunc-arg-1.c b/sysdeps/aarch64/tst-ifunc-arg-1.c index d8c3c14..b90c836 100644 --- a/sysdeps/aarch64/tst-ifunc-arg-1.c +++ b/sysdeps/aarch64/tst-ifunc-arg-1.c @@ -1,5 +1,5 @@ /* Test STT_GNU_IFUNC resolver with second argument. - Copyright (C) 2019-2024 Free Software Foundation, Inc. + Copyright (C) 2019-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/tst-ifunc-arg-2.c b/sysdeps/aarch64/tst-ifunc-arg-2.c index a3bca92..dac144d 100644 --- a/sysdeps/aarch64/tst-ifunc-arg-2.c +++ b/sysdeps/aarch64/tst-ifunc-arg-2.c @@ -1,5 +1,5 @@ /* Test R_*_IRELATIVE resolver with second argument. - Copyright (C) 2019-2024 Free Software Foundation, Inc. + Copyright (C) 2019-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/tst-sme-jmp.c b/sysdeps/aarch64/tst-sme-jmp.c index 3576934..62c419f 100644 --- a/sysdeps/aarch64/tst-sme-jmp.c +++ b/sysdeps/aarch64/tst-sme-jmp.c @@ -1,5 +1,5 @@ /* Test for SME longjmp. - Copyright (C) 2023 Free Software Foundation, Inc. + Copyright (C) 2023-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or diff --git a/sysdeps/aarch64/tst-vpcs-mod.S b/sysdeps/aarch64/tst-vpcs-mod.S index 19b01c3..613a4d1 100644 --- a/sysdeps/aarch64/tst-vpcs-mod.S +++ b/sysdeps/aarch64/tst-vpcs-mod.S @@ -1,5 +1,5 @@ /* Record the register state before and after a variant PCS call. - Copyright (C) 2020-2024 Free Software Foundation, Inc. + Copyright (C) 2020-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,6 +17,8 @@ License along with the GNU C Library. If not, see <https://www.gnu.org/licenses/>. */ +#include "tst-asm-helper.h" + .variant_pcs vpcs_call .global vpcs_call .type vpcs_call, %function @@ -121,7 +123,7 @@ vpcs_call_regs: /* Emulate a BL using B, but save x30 before the branch. */ adr x30, .L_return_addr stp x30, x29, [x1, 240] - b vpcs_call + bl vpcs_call .L_return_addr: /* Restore callee-saved registers. 
*/ diff --git a/sysdeps/aarch64/tst-vpcs.c b/sysdeps/aarch64/tst-vpcs.c index 4fd44bc..5ac42b7 100644 --- a/sysdeps/aarch64/tst-vpcs.c +++ b/sysdeps/aarch64/tst-vpcs.c @@ -1,5 +1,5 @@ /* Test that variant PCS calls don't clobber registers with lazy binding. - Copyright (C) 2020-2024 Free Software Foundation, Inc. + Copyright (C) 2020-2025 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or |