Diffstat (limited to 'sysdeps/aarch64')
82 files changed, 3796 insertions, 1361 deletions
diff --git a/sysdeps/aarch64/Makefile b/sysdeps/aarch64/Makefile index 4b7f8a5..bb97d31 100644 --- a/sysdeps/aarch64/Makefile +++ b/sysdeps/aarch64/Makefile @@ -41,15 +41,18 @@ gen-as-const-headers += \ dl-link.sym \ rtld-global-offsets.sym -tests-internal += tst-ifunc-arg-1 tst-ifunc-arg-2 +tests-internal += \ + tst-ifunc-arg-1 \ + tst-ifunc-arg-2 \ + tst-ifunc-arg-3 \ + tst-ifunc-arg-4 \ + # tests-internal -ifeq (yes,$(aarch64-variant-pcs)) tests += tst-vpcs modules-names += tst-vpcs-mod LDFLAGS-tst-vpcs-mod.so = -Wl,-z,lazy $(objpfx)tst-vpcs: $(objpfx)tst-vpcs-mod.so endif -endif ifeq ($(subdir),csu) gen-as-const-headers += \ @@ -75,7 +78,9 @@ sysdep_routines += \ __alloc_gcs tests += \ - tst-sme-jmp + tst-sme-jmp \ + tst-sme-za-state \ + # tests endif ifeq ($(subdir),malloc) diff --git a/sysdeps/aarch64/__alloc_gcs.c b/sysdeps/aarch64/__alloc_gcs.c index e70b459..b98e5fc 100644 --- a/sysdeps/aarch64/__alloc_gcs.c +++ b/sysdeps/aarch64/__alloc_gcs.c @@ -15,6 +15,8 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ +#include "aarch64-gcs.h" + #include <sysdep.h> #include <unistd.h> #include <sys/mman.h> @@ -34,7 +36,7 @@ map_shadow_stack (void *addr, size_t size, unsigned long flags) #define GCS_ALTSTACK_RESERVE 160 void * -__alloc_gcs (size_t stack_size, void **ss_base, size_t *ss_size) +__alloc_gcs (size_t stack_size, struct gcs_record *gcs) { size_t size = (stack_size / 2 + GCS_ALTSTACK_RESERVE) & -8UL; if (size > GCS_MAX_SIZE) @@ -45,9 +47,6 @@ __alloc_gcs (size_t stack_size, void **ss_base, size_t *ss_size) if (base == MAP_FAILED) return NULL; - *ss_base = base; - *ss_size = size; - uint64_t *gcsp = (uint64_t *) ((char *) base + size); /* Skip end of GCS token. */ gcsp--; @@ -58,6 +57,14 @@ __alloc_gcs (size_t stack_size, void **ss_base, size_t *ss_size) __munmap (base, size); return NULL; } + + if (gcs != NULL) + { + gcs->gcs_base = base; + gcs->gcs_token = gcsp; + gcs->gcs_size = size; + } + /* Return the target GCS pointer for context switch. */ return gcsp + 1; } diff --git a/sysdeps/aarch64/__arm_za_disable.S b/sysdeps/aarch64/__arm_za_disable.S index 6290803..92f4814 100644 --- a/sysdeps/aarch64/__arm_za_disable.S +++ b/sysdeps/aarch64/__arm_za_disable.S @@ -88,10 +88,8 @@ L(save_loop): L(end): ret L(fail): -#if HAVE_AARCH64_PAC_RET - PACIASP - cfi_window_save -#endif + paciasp + cfi_negate_ra_state stp x29, x30, [sp, -32]! cfi_adjust_cfa_offset (32) cfi_rel_offset (x29, 0) diff --git a/sysdeps/aarch64/__longjmp.S b/sysdeps/aarch64/__longjmp.S index 981bf80..70ac02c 100644 --- a/sysdeps/aarch64/__longjmp.S +++ b/sysdeps/aarch64/__longjmp.S @@ -24,51 +24,43 @@ /* __longjmp(jmpbuf, val) */ ENTRY (__longjmp) - cfi_def_cfa(x0, 0) - cfi_offset(x19, JB_X19<<3) - cfi_offset(x20, JB_X20<<3) - cfi_offset(x21, JB_X21<<3) - cfi_offset(x22, JB_X22<<3) - cfi_offset(x23, JB_X23<<3) - cfi_offset(x24, JB_X24<<3) - cfi_offset(x25, JB_X25<<3) - cfi_offset(x26, JB_X26<<3) - cfi_offset(x27, JB_X27<<3) - cfi_offset(x28, JB_X28<<3) - cfi_offset(x29, JB_X29<<3) - cfi_offset(x30, JB_LR<<3) - - cfi_offset( d8, JB_D8<<3) - cfi_offset( d9, JB_D9<<3) - cfi_offset(d10, JB_D10<<3) - cfi_offset(d11, JB_D11<<3) - cfi_offset(d12, JB_D12<<3) - cfi_offset(d13, JB_D13<<3) - cfi_offset(d14, JB_D14<<3) - cfi_offset(d15, JB_D15<<3) #if IS_IN(libc) - /* Disable ZA state of SME in libc.a and libc.so, but not in ld.so. */ -# if HAVE_AARCH64_PAC_RET - PACIASP - cfi_window_save -# endif - stp x29, x30, [sp, -16]! 
- cfi_adjust_cfa_offset (16) - cfi_rel_offset (x29, 0) - cfi_rel_offset (x30, 8) - mov x29, sp + /* Disable ZA state of SME in libc.a and libc.so, but not in ld.so. + The calling convention of __libc_arm_za_disable allows to do + this thus allowing to avoid saving to and reading from stack. + As a result we also don't need to sign the return address and + check it after returning because it is not stored to stack. */ + mov x13, x30 + cfi_register (x30, x13) bl __libc_arm_za_disable - ldp x29, x30, [sp], 16 - cfi_adjust_cfa_offset (-16) - cfi_restore (x29) - cfi_restore (x30) -# if HAVE_AARCH64_PAC_RET - AUTIASP - cfi_window_save -# endif + mov x30, x13 + cfi_register (x13, x30) #endif + cfi_def_cfa (x0, 0) + cfi_offset (x19, JB_X19<<3) + cfi_offset (x20, JB_X20<<3) + cfi_offset (x21, JB_X21<<3) + cfi_offset (x22, JB_X22<<3) + cfi_offset (x23, JB_X23<<3) + cfi_offset (x24, JB_X24<<3) + cfi_offset (x25, JB_X25<<3) + cfi_offset (x26, JB_X26<<3) + cfi_offset (x27, JB_X27<<3) + cfi_offset (x28, JB_X28<<3) + cfi_offset (x29, JB_X29<<3) + cfi_offset (x30, JB_LR<<3) + + cfi_offset ( d8, JB_D8<<3) + cfi_offset ( d9, JB_D9<<3) + cfi_offset (d10, JB_D10<<3) + cfi_offset (d11, JB_D11<<3) + cfi_offset (d12, JB_D12<<3) + cfi_offset (d13, JB_D13<<3) + cfi_offset (d14, JB_D14<<3) + cfi_offset (d15, JB_D15<<3) + ldp x19, x20, [x0, #JB_X19<<3] ldp x21, x22, [x0, #JB_X21<<3] ldp x23, x24, [x0, #JB_X23<<3] diff --git a/sysdeps/aarch64/aarch64-gcs.h b/sysdeps/aarch64/aarch64-gcs.h index 162ef18..8e253ed 100644 --- a/sysdeps/aarch64/aarch64-gcs.h +++ b/sysdeps/aarch64/aarch64-gcs.h @@ -23,6 +23,21 @@ #include <stddef.h> #include <stdbool.h> -void *__alloc_gcs (size_t, void **, size_t *) attribute_hidden; +struct gcs_record +{ + void *gcs_base; + void *gcs_token; + size_t gcs_size; +}; + +void *__alloc_gcs (size_t, struct gcs_record *) attribute_hidden; + +static inline bool +has_gcs (void) +{ + register unsigned long x16 asm ("x16") = 1; + asm ("hint 40" /* chkfeat x16 */ : "+r" (x16)); + return x16 == 0; +} #endif diff --git a/sysdeps/aarch64/configure b/sysdeps/aarch64/configure index 4bd5496..f364e65 100755 --- a/sysdeps/aarch64/configure +++ b/sysdeps/aarch64/configure @@ -185,219 +185,14 @@ else default-abi = lp64" fi -# Only consider BTI supported if -mbranch-protection=bti is -# on by default in the compiler and the linker produces -# binaries with GNU property notes in PT_GNU_PROPERTY segment. -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for BTI support" >&5 -printf %s "checking for BTI support... " >&6; } -if test ${libc_cv_aarch64_bti+y} -then : - printf %s "(cached) " >&6 -else case e in #( - e) cat > conftest.c <<EOF -void foo (void) { } -EOF - libc_cv_aarch64_bti=no - if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -nostdlib -nostartfiles $no_ssp -shared -fPIC -o conftest.so conftest.c' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 - (eval $ac_try) 2>&5 - ac_status=$? - printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; } \ - && { ac_try='$READELF -lW conftest.so | grep -q GNU_PROPERTY' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 - (eval $ac_try) 2>&5 - ac_status=$? - printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; } \ - && { ac_try='$READELF -nW conftest.so | grep -q "NT_GNU_PROPERTY_TYPE_0.*AArch64 feature:.* BTI"' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 - (eval $ac_try) 2>&5 - ac_status=$? 
- printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; } - then - libc_cv_aarch64_bti=yes - fi - rm -rf conftest.* ;; -esac -fi -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_aarch64_bti" >&5 -printf "%s\n" "$libc_cv_aarch64_bti" >&6; } -config_vars="$config_vars -aarch64-bti = $libc_cv_aarch64_bti" -if test $libc_cv_aarch64_bti = yes; then - printf "%s\n" "#define HAVE_AARCH64_BTI 1" >>confdefs.h - -fi - -# Check if glibc is built with return address signing, i.e. -# if -mbranch-protection=pac-ret is on. We need this because -# pac-ret relies on unwinder support so it's not safe to use -# it in assembly code unconditionally, but there is no -# feature test macro for it in gcc. -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking if pac-ret is enabled" >&5 -printf %s "checking if pac-ret is enabled... " >&6; } -if test ${libc_cv_aarch64_pac_ret+y} -then : - printf %s "(cached) " >&6 -else case e in #( - e) cat > conftest.c <<EOF -int bar (void); -int foo (void) { return bar () + 1; } -EOF - libc_cv_aarch64_pac_ret=no - if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -S -o conftest.s conftest.c' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 - (eval $ac_try) 2>&5 - ac_status=$? - printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; } \ - && { ac_try='grep -q -E '\''(hint( | )+25|paciasp)'\'' conftest.s' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 - (eval $ac_try) 2>&5 - ac_status=$? - printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; } - then - libc_cv_aarch64_pac_ret=yes - fi - rm -rf conftest.* ;; -esac -fi -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_aarch64_pac_ret" >&5 -printf "%s\n" "$libc_cv_aarch64_pac_ret" >&6; } -if test $libc_cv_aarch64_pac_ret = yes; then - printf "%s\n" "#define HAVE_AARCH64_PAC_RET 1" >>confdefs.h - -fi - -# Check if binutils supports variant PCS symbols. -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for variant PCS support" >&5 -printf %s "checking for variant PCS support... " >&6; } -if test ${libc_cv_aarch64_variant_pcs+y} -then : - printf %s "(cached) " >&6 -else case e in #( - e) cat > conftest.S <<EOF -.global foo -.type foo, %function -.variant_pcs foo -foo: - ret -.global bar -.type bar, %function -bar: - b foo -EOF - libc_cv_aarch64_variant_pcs=no - if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -nostdlib -nostartfiles $no_ssp -shared -fPIC -o conftest.so conftest.S' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 - (eval $ac_try) 2>&5 - ac_status=$? - printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; } \ - && { ac_try='$READELF -dW conftest.so | grep -q AARCH64_VARIANT_PCS' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 - (eval $ac_try) 2>&5 - ac_status=$? - printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; } - then - libc_cv_aarch64_variant_pcs=yes - fi - rm -rf conftest.* ;; -esac -fi -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_aarch64_variant_pcs" >&5 -printf "%s\n" "$libc_cv_aarch64_variant_pcs" >&6; } -config_vars="$config_vars -aarch64-variant-pcs = $libc_cv_aarch64_variant_pcs" - -# Check if asm support armv8.2-a+sve -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for SVE support in assembler" >&5 -printf %s "checking for SVE support in assembler... 
" >&6; } -if test ${libc_cv_aarch64_sve_asm+y} -then : - printf %s "(cached) " >&6 -else case e in #( - e) cat > conftest.s <<\EOF - .arch armv8.2-a+sve - ptrue p0.b -EOF -if { ac_try='${CC-cc} -c conftest.s 1>&5' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 - (eval $ac_try) 2>&5 - ac_status=$? - printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; }; then - libc_cv_aarch64_sve_asm=yes -else - libc_cv_aarch64_sve_asm=no -fi -rm -f conftest* ;; -esac -fi -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_aarch64_sve_asm" >&5 -printf "%s\n" "$libc_cv_aarch64_sve_asm" >&6; } -if test $libc_cv_aarch64_sve_asm = yes; then - printf "%s\n" "#define HAVE_AARCH64_SVE_ASM 1" >>confdefs.h - -fi - if test x"$build_mathvec" = xnotset; then build_mathvec=yes fi -# Check if compiler supports SVE ACLE. -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for availability of SVE ACLE" >&5 -printf %s "checking for availability of SVE ACLE... " >&6; } -if test ${libc_cv_aarch64_sve_acle+y} -then : - printf %s "(cached) " >&6 -else case e in #( - e) cat > conftest.c <<EOF -#include <arm_sve.h> -EOF - if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS -fsyntax-only -ffreestanding conftest.c' - { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 - (eval $ac_try) 2>&5 - ac_status=$? - printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; }; then - libc_cv_aarch64_sve_acle=yes - else - libc_cv_aarch64_sve_acle=no - fi - rm conftest.c ;; -esac -fi -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_aarch64_sve_acle" >&5 -printf "%s\n" "$libc_cv_aarch64_sve_acle" >&6; } - -# Check if compiler is sufficient to build mathvec -if test $build_mathvec = yes; then - fail=no - if test $libc_cv_aarch64_variant_pcs = no; then - fail=yes - { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: mathvec is enabled but linker does not support variant PCS." >&5 -printf "%s\n" "$as_me: WARNING: mathvec is enabled but linker does not support variant PCS." >&2;} - fi - if test $libc_cv_aarch64_sve_asm = no; then - fail=yes - { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: mathvec is enabled but assembler does not support SVE." >&5 -printf "%s\n" "$as_me: WARNING: mathvec is enabled but assembler does not support SVE." >&2;} - fi - if test $libc_cv_aarch64_sve_acle = no; then - fail=yes - { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: mathvec is enabled but compiler does not have SVE ACLE." >&5 -printf "%s\n" "$as_me: WARNING: mathvec is enabled but compiler does not have SVE ACLE." >&2;} - fi - if test $fail = yes; then - as_fn_error $? "use a compatible toolchain or configure with --disable-mathvec (this results in incomplete ABI)." "$LINENO" 5 - fi -else +if test $build_mathvec = no; then { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: mathvec is disabled, this results in incomplete ABI." >&5 printf "%s\n" "$as_me: WARNING: mathvec is disabled, this results in incomplete ABI." >&2;} fi +libc_cv_support_sframe=yes + diff --git a/sysdeps/aarch64/configure.ac b/sysdeps/aarch64/configure.ac index 56d12d6..a9a1b74 100644 --- a/sysdeps/aarch64/configure.ac +++ b/sysdeps/aarch64/configure.ac @@ -24,119 +24,12 @@ else LIBC_CONFIG_VAR([default-abi], [lp64]) fi -# Only consider BTI supported if -mbranch-protection=bti is -# on by default in the compiler and the linker produces -# binaries with GNU property notes in PT_GNU_PROPERTY segment. 
-AC_CACHE_CHECK([for BTI support], [libc_cv_aarch64_bti], [dnl - cat > conftest.c <<EOF -void foo (void) { } -EOF - libc_cv_aarch64_bti=no - if AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -nostdlib -nostartfiles $no_ssp -shared -fPIC -o conftest.so conftest.c]) \ - && AC_TRY_COMMAND([$READELF -lW conftest.so | grep -q GNU_PROPERTY]) \ - && AC_TRY_COMMAND([$READELF -nW conftest.so | grep -q "NT_GNU_PROPERTY_TYPE_0.*AArch64 feature:.* BTI"]) - then - libc_cv_aarch64_bti=yes - fi - rm -rf conftest.*]) -LIBC_CONFIG_VAR([aarch64-bti], [$libc_cv_aarch64_bti]) -if test $libc_cv_aarch64_bti = yes; then - AC_DEFINE(HAVE_AARCH64_BTI) -fi - -# Check if glibc is built with return address signing, i.e. -# if -mbranch-protection=pac-ret is on. We need this because -# pac-ret relies on unwinder support so it's not safe to use -# it in assembly code unconditionally, but there is no -# feature test macro for it in gcc. -AC_CACHE_CHECK([if pac-ret is enabled], [libc_cv_aarch64_pac_ret], [dnl - cat > conftest.c <<EOF -int bar (void); -int foo (void) { return bar () + 1; } -EOF - libc_cv_aarch64_pac_ret=no - if AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS -S -o conftest.s conftest.c]) \ - && AC_TRY_COMMAND([grep -q -E '\''(hint( | )+25|paciasp)'\'' conftest.s]) - then - libc_cv_aarch64_pac_ret=yes - fi - rm -rf conftest.*]) -if test $libc_cv_aarch64_pac_ret = yes; then - AC_DEFINE(HAVE_AARCH64_PAC_RET) -fi - -# Check if binutils supports variant PCS symbols. -AC_CACHE_CHECK([for variant PCS support], [libc_cv_aarch64_variant_pcs], [dnl - cat > conftest.S <<EOF -.global foo -.type foo, %function -.variant_pcs foo -foo: - ret -.global bar -.type bar, %function -bar: - b foo -EOF - libc_cv_aarch64_variant_pcs=no - if AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -nostdlib -nostartfiles $no_ssp -shared -fPIC -o conftest.so conftest.S]) \ - && AC_TRY_COMMAND([$READELF -dW conftest.so | grep -q AARCH64_VARIANT_PCS]) - then - libc_cv_aarch64_variant_pcs=yes - fi - rm -rf conftest.*]) -LIBC_CONFIG_VAR([aarch64-variant-pcs], [$libc_cv_aarch64_variant_pcs]) - -# Check if asm support armv8.2-a+sve -AC_CACHE_CHECK([for SVE support in assembler], [libc_cv_aarch64_sve_asm], [dnl -cat > conftest.s <<\EOF - .arch armv8.2-a+sve - ptrue p0.b -EOF -if AC_TRY_COMMAND(${CC-cc} -c conftest.s 1>&AS_MESSAGE_LOG_FD); then - libc_cv_aarch64_sve_asm=yes -else - libc_cv_aarch64_sve_asm=no -fi -rm -f conftest*]) -if test $libc_cv_aarch64_sve_asm = yes; then - AC_DEFINE(HAVE_AARCH64_SVE_ASM) -fi - if test x"$build_mathvec" = xnotset; then build_mathvec=yes fi -# Check if compiler supports SVE ACLE. 
-AC_CACHE_CHECK(for availability of SVE ACLE, libc_cv_aarch64_sve_acle, [dnl - cat > conftest.c <<EOF -#include <arm_sve.h> -EOF - if AC_TRY_COMMAND([${CC-cc} $CFLAGS $CPPFLAGS -fsyntax-only -ffreestanding conftest.c]); then - libc_cv_aarch64_sve_acle=yes - else - libc_cv_aarch64_sve_acle=no - fi - rm conftest.c]) - -# Check if compiler is sufficient to build mathvec -if test $build_mathvec = yes; then - fail=no - if test $libc_cv_aarch64_variant_pcs = no; then - fail=yes - AC_MSG_WARN([mathvec is enabled but linker does not support variant PCS.]) - fi - if test $libc_cv_aarch64_sve_asm = no; then - fail=yes - AC_MSG_WARN([mathvec is enabled but assembler does not support SVE.]) - fi - if test $libc_cv_aarch64_sve_acle = no; then - fail=yes - AC_MSG_WARN([mathvec is enabled but compiler does not have SVE ACLE.]) - fi - if test $fail = yes; then - AC_MSG_ERROR([use a compatible toolchain or configure with --disable-mathvec (this results in incomplete ABI).]) - fi -else +if test $build_mathvec = no; then AC_MSG_WARN([mathvec is disabled, this results in incomplete ABI.]) fi + +libc_cv_support_sframe=yes diff --git a/sysdeps/aarch64/crti.S b/sysdeps/aarch64/crti.S index 0c3ee40..e9e530c 100644 --- a/sysdeps/aarch64/crti.S +++ b/sysdeps/aarch64/crti.S @@ -65,7 +65,7 @@ call_weak_fn: cbz x0, 1f b PREINIT_FUNCTION 1: - RET + ret .size call_weak_fn, .-call_weak_fn #endif @@ -75,11 +75,7 @@ call_weak_fn: .hidden _init .type _init, %function _init: -#if HAVE_AARCH64_PAC_RET - PACIASP -#else - BTI_C -#endif + paciasp stp x29, x30, [sp, -16]! mov x29, sp #if PREINIT_FUNCTION_WEAK @@ -94,10 +90,6 @@ _init: .hidden _fini .type _fini, %function _fini: -#if HAVE_AARCH64_PAC_RET - PACIASP -#else - BTI_C -#endif + paciasp stp x29, x30, [sp, -16]! mov x29, sp diff --git a/sysdeps/aarch64/crtn.S b/sysdeps/aarch64/crtn.S index b52b10e..653a548 100644 --- a/sysdeps/aarch64/crtn.S +++ b/sysdeps/aarch64/crtn.S @@ -41,14 +41,10 @@ .section .init,"ax",%progbits ldp x29, x30, [sp], 16 -#if HAVE_AARCH64_PAC_RET - AUTIASP -#endif - RET + autiasp + ret .section .fini,"ax",%progbits ldp x29, x30, [sp], 16 -#if HAVE_AARCH64_PAC_RET - AUTIASP -#endif - RET + autiasp + ret diff --git a/sysdeps/aarch64/dl-irel.h b/sysdeps/aarch64/dl-irel.h index ae402bc..7bae3c3 100644 --- a/sysdeps/aarch64/dl-irel.h +++ b/sysdeps/aarch64/dl-irel.h @@ -21,11 +21,26 @@ #define _DL_IREL_H #include <stdio.h> -#include <unistd.h> #include <ldsodefs.h> -#include <sysdep.h> #include <sys/ifunc.h> +#define _IFUNC_ARG_SIZE_VER0 24 /* sizeof 1st published __ifunc_arg_t */ +#define _IFUNC_ARG_SIZE_VER1 40 /* sizeof 2nd published __ifunc_arg_t */ + +#define sizeof_field(TYPE, MEMBER) sizeof ((((TYPE *)0)->MEMBER)) +#define offsetofend(TYPE, MEMBER) \ + (offsetof (TYPE, MEMBER) + sizeof_field (TYPE, MEMBER)) + +_Static_assert (sizeof (__ifunc_arg_t) == _IFUNC_ARG_SIZE_VER1, + "sizeof (__ifunc_arg_t) != _IFUNC_ARG_SIZE_VER1"); + +_Static_assert (_IFUNC_ARG_SIZE_VER1 + == (_IFUNC_HWCAP_MAX + 1) * sizeof (unsigned long), + "_IFUNC_ARG_SIZE_VER1 and _IFUNC_HWCAP_MAX mismatch"); + +#undef offsetofend +#undef sizeof_field + #define ELF_MACHINE_IRELA 1 static inline ElfW(Addr) @@ -37,6 +52,8 @@ elf_ifunc_invoke (ElfW(Addr) addr) arg._size = sizeof (arg); arg._hwcap = GLRO(dl_hwcap); arg._hwcap2 = GLRO(dl_hwcap2); + arg._hwcap3 = GLRO(dl_hwcap3); + arg._hwcap4 = GLRO(dl_hwcap4); return ((ElfW(Addr) (*) (uint64_t, const __ifunc_arg_t *)) (addr)) (GLRO(dl_hwcap) | _IFUNC_ARG_HWCAP, &arg); } diff --git a/sysdeps/aarch64/dl-tlsdesc.S 
b/sysdeps/aarch64/dl-tlsdesc.S index fc40d66..2ff8d95 100644 --- a/sysdeps/aarch64/dl-tlsdesc.S +++ b/sysdeps/aarch64/dl-tlsdesc.S @@ -74,9 +74,9 @@ cfi_startproc .align 2 _dl_tlsdesc_return: - BTI_C + bti c ldr x0, [x0, 8] - RET + ret cfi_endproc .size _dl_tlsdesc_return, .-_dl_tlsdesc_return @@ -95,7 +95,7 @@ _dl_tlsdesc_return: cfi_startproc .align 2 _dl_tlsdesc_undefweak: - BTI_C + bti c str x1, [sp, #-16]! cfi_adjust_cfa_offset (16) ldr x0, [x0, 8] @@ -103,7 +103,7 @@ _dl_tlsdesc_undefweak: sub x0, x0, x1 ldr x1, [sp], #16 cfi_adjust_cfa_offset (-16) - RET + ret cfi_endproc .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak @@ -141,12 +141,8 @@ _dl_tlsdesc_undefweak: cfi_startproc .align 2 _dl_tlsdesc_dynamic: -# if HAVE_AARCH64_PAC_RET - PACIASP - cfi_window_save -# else - BTI_C -# endif + paciasp + cfi_negate_ra_state /* Save just enough registers to support fast path, if we fall into slow path we will save additional registers. */ @@ -177,12 +173,10 @@ _dl_tlsdesc_dynamic: 1: ldp x3, x4, [sp, #16] ldp x1, x2, [sp], #32 -# if HAVE_AARCH64_PAC_RET - AUTIASP - cfi_window_save -# endif + autiasp + cfi_negate_ra_state cfi_adjust_cfa_offset (-32) - RET + ret 2: /* This is the slow path. We need to call __tls_get_addr() which means we need to save and restore all the register that the diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S index d6bed96..d628b01 100644 --- a/sysdeps/aarch64/dl-trampoline.S +++ b/sysdeps/aarch64/dl-trampoline.S @@ -34,7 +34,7 @@ cfi_startproc .align 2 _dl_runtime_resolve: - BTI_C + bti c /* AArch64 we get called with: ip0 &PLTGOT[2] ip1 temp(dl resolver entry point) @@ -127,12 +127,8 @@ _dl_runtime_resolve: cfi_startproc .align 2 _dl_runtime_profile: -# if HAVE_AARCH64_PAC_RET - PACIASP - cfi_window_save -# else - BTI_C -# endif + paciasp + cfi_negate_ra_state /* AArch64 we get called with: ip0 &PLTGOT[2] ip1 temp(dl resolver entry point) @@ -251,17 +247,12 @@ _dl_runtime_profile: cfi_restore(x29) cfi_restore(x30) -# if HAVE_AARCH64_PAC_RET add sp, sp, SF_SIZE cfi_adjust_cfa_offset (-SF_SIZE) - AUTIASP - cfi_window_save + autiasp + cfi_negate_ra_state add sp, sp, 16 cfi_adjust_cfa_offset (-16) -# else - add sp, sp, SF_SIZE + 16 - cfi_adjust_cfa_offset (- SF_SIZE - 16) -# endif /* Jump to the newly found address. */ br ip0 @@ -321,10 +312,8 @@ _dl_runtime_profile: /* LR from within La_aarch64_reg */ ldr lr, [x29, #OFFSET_RG + DL_OFFSET_RG_LR] cfi_restore(lr) -# if HAVE_AARCH64_PAC_RET /* Note: LR restored from La_aarch64_reg has no PAC. */ - cfi_window_save -# endif + cfi_negate_ra_state mov sp, x29 cfi_def_cfa_register (sp) ldr x29, [x29, #0] diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile index aadedf1..068c11c 100644 --- a/sysdeps/aarch64/fpu/Makefile +++ b/sysdeps/aarch64/fpu/Makefile @@ -1,10 +1,14 @@ libmvec-supported-funcs = acos \ acosh \ + acospi \ asin \ asinh \ + asinpi \ atan \ atanh \ + atanpi \ atan2 \ + atan2pi \ cbrt \ cos \ cosh \ @@ -52,8 +56,11 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \ v_powf_data endif -sve-cflags = -march=armv8-a+sve +# Enable SVE for building libmvec. Since CFLAGS may contain a -mcpu or -march, +# add a generic -mcpu and -march with SVE enabled. Also use a tune for a modern +# SVE core. 
+sve-cflags = -mcpu=generic+sve -march=armv8-a+sve -mtune=neoverse-v1 ifeq ($(build-mathvec),yes) bench-libmvec = $(addprefix float-advsimd-,$(float-advsimd-funcs)) \ diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions index 0f9503f..2980cb7 100644 --- a/sysdeps/aarch64/fpu/Versions +++ b/sysdeps/aarch64/fpu/Versions @@ -157,4 +157,26 @@ libmvec { _ZGVsMxv_tanpi; _ZGVsMxv_tanpif; } + GLIBC_2.42 { + _ZGVnN2v_acospi; + _ZGVnN2v_acospif; + _ZGVnN4v_acospif; + _ZGVsMxv_acospi; + _ZGVsMxv_acospif; + _ZGVnN2v_asinpi; + _ZGVnN2v_asinpif; + _ZGVnN4v_asinpif; + _ZGVsMxv_asinpi; + _ZGVsMxv_asinpif; + _ZGVnN2v_atanpi; + _ZGVnN2v_atanpif; + _ZGVnN4v_atanpif; + _ZGVsMxv_atanpi; + _ZGVsMxv_atanpif; + _ZGVnN2vv_atan2pi; + _ZGVnN2vv_atan2pif; + _ZGVnN4vv_atan2pif; + _ZGVsMxvv_atan2pi; + _ZGVsMxvv_atan2pif; + } } diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c index 7709b54..453f780 100644 --- a/sysdeps/aarch64/fpu/acos_advsimd.c +++ b/sysdeps/aarch64/fpu/acos_advsimd.c @@ -18,24 +18,23 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" static const struct data { - float64x2_t poly[12]; - float64x2_t pi, pi_over_2; + double c1, c3, c5, c7, c9, c11; + float64x2_t c0, c2, c4, c6, c8, c10; uint64x2_t abs_mask; + float64x2_t pi, pi_over_2; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ - .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), - V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), - V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), - V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), - V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), - V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, - .pi = V2 (0x1.921fb54442d18p+1), - .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, + .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, + .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, + .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, + .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, + .pi = V2 (0x1.921fb54442d18p+1), .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), }; @@ -63,7 +62,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) acos(x) ~ pi/2 - (x + x^3 P(x^2)). - The largest observed error in this region is 1.18 ulps, + The largest observed error in this region is 1.18 ulp: _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0 want 0x1.0d54d1985c069p+0. @@ -71,9 +70,9 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). - The largest observed error in this region is 1.52 ulps, - _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1 - want 0x1.edbbedf8a7d6cp-1. */ + The largest observed error in this region is 1.50 ulp: + _ZGVnN2v_acos (0x1.252a2cf3fb9acp-1) got 0x1.ec1a46aa82901p-1 + want 0x1.ec1a46aa829p-1. */ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -99,13 +98,32 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x) float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2)); /* Use a single polynomial approximation P for both intervals. 
*/ + float64x2_t z3 = vmulq_f64 (z2, z); float64x2_t z4 = vmulq_f64 (z2, z2); float64x2_t z8 = vmulq_f64 (z4, z4); - float64x2_t z16 = vmulq_f64 (z8, z8); - float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); - /* Finalize polynomial: z + z * z2 * P(z2). */ - p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); + /* Order-11 Estrin. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p411 = vfmaq_f64 (p47, z8, p811); + float64x2_t p = vfmaq_f64 (p03, z8, p411); + + /* Finalize polynomial: z + z3 * P(z2). */ + p = vfmaq_f64 (z, z3, p); /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 = 2 Q(|x|) , for 0.5 < x < 1.0 diff --git a/sysdeps/aarch64/fpu/acos_sve.c b/sysdeps/aarch64/fpu/acos_sve.c index 74e2f7d..104f0d7 100644 --- a/sysdeps/aarch64/fpu/acos_sve.c +++ b/sysdeps/aarch64/fpu/acos_sve.c @@ -18,20 +18,21 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[12]; - float64_t pi, pi_over_2; + float64_t c1, c3, c5, c7, c9, c11; + float64_t c0, c2, c4, c6, c8, c10; + float64_t pi_over_2; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ - .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5, - 0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, - 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8, - 0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, - .pi = 0x1.921fb54442d18p+1, + .c0 = 0x1.555555555554ep-3, .c1 = 0x1.3333333337233p-4, + .c2 = 0x1.6db6db67f6d9fp-5, .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = 0x1.6e8b264d467d6p-6, .c5 = 0x1.1c5997c357e9dp-6, + .c6 = 0x1.c86a22cd9389dp-7, .c7 = 0x1.856073c22ebbep-7, + .c8 = 0x1.fd1151acb6bedp-8, .c9 = 0x1.087182f799c1dp-6, + .c10 = -0x1.6602748120927p-7, .c11 = 0x1.cfa0dd1f9478p-6, .pi_over_2 = 0x1.921fb54442d18p+0, }; @@ -42,20 +43,21 @@ static const struct data acos(x) ~ pi/2 - (x + x^3 P(x^2)). - The largest observed error in this region is 1.18 ulps, - _ZGVsMxv_acos (0x1.fbc5fe28ee9e3p-2) got 0x1.0d4d0f55667f6p+0 - want 0x1.0d4d0f55667f7p+0. + The largest observed error in this region is 1.18 ulp: + _ZGVsMxv_acos (0x1.fbb7c9079b429p-2) got 0x1.0d51266607582p+0 + want 0x1.0d51266607583p+0. For |x| in [0.5, 1.0], use same approximation with a change of variable acos(x) = y + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). - The largest observed error in this region is 1.52 ulps, - _ZGVsMxv_acos (0x1.24024271a500ap-1) got 0x1.ed82df4243f0dp-1 - want 0x1.ed82df4243f0bp-1. */ + The largest observed error in this region is 1.50 ulp: + _ZGVsMxv_acos (0x1.252a2cf3fb9acp-1) got 0x1.ec1a46aa82901p-1 + want 0x1.ec1a46aa829p-1. 
*/ svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b64 (); svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); svfloat64_t ax = svabs_x (pg, x); @@ -70,24 +72,41 @@ svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg) svfloat64_t z = svsqrt_m (ax, a_gt_half, z2); /* Use a single polynomial approximation P for both intervals. */ - svfloat64_t z4 = svmul_x (pg, z2, z2); - svfloat64_t z8 = svmul_x (pg, z4, z4); - svfloat64_t z16 = svmul_x (pg, z8, z8); - svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); + svfloat64_t z3 = svmul_x (ptrue, z2, z); + svfloat64_t z4 = svmul_x (ptrue, z2, z2); + svfloat64_t z8 = svmul_x (ptrue, z4, z4); + + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + + svfloat64_t p411 = svmla_x (pg, p47, z8, p811); + svfloat64_t p = svmad_x (pg, p411, z8, p03); /* Finalize polynomial: z + z * z2 * P(z2). */ - p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + p = svmad_x (pg, p, z3, z); /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for |x| < 0.5 = 2 Q(|x|) , for 0.5 < x < 1.0 = pi - 2 Q(|x|) , for -1.0 < x < -0.5. */ - svfloat64_t y - = svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (p), sign)); - - svbool_t is_neg = svcmplt (pg, x, 0.0); - svfloat64_t off = svdup_f64_z (is_neg, d->pi); - svfloat64_t mul = svsel (a_gt_half, sv_f64 (2.0), sv_f64 (-1.0)); - svfloat64_t add = svsel (a_gt_half, off, sv_f64 (d->pi_over_2)); - - return svmla_x (pg, add, mul, y); + svfloat64_t mul = svreinterpret_f64 ( + svlsl_m (a_gt_half, svreinterpret_u64 (sv_f64 (1.0)), 10)); + mul = svreinterpret_f64 (sveor_x (ptrue, svreinterpret_u64 (mul), sign)); + svfloat64_t add = svreinterpret_f64 ( + svorr_x (ptrue, sign, svreinterpret_u64 (sv_f64 (d->pi_over_2)))); + add = svsub_m (a_gt_half, sv_f64 (d->pi_over_2), add); + + return svmsb_x (pg, p, mul, add); } diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c index 326b2cc..3a84959 100644 --- a/sysdeps/aarch64/fpu/acosh_sve.c +++ b/sysdeps/aarch64/fpu/acosh_sve.c @@ -30,10 +30,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) } /* SVE approximation for double-precision acosh, based on log1p. - The largest observed error is 3.19 ULP in the region where the + The largest observed error is 3.14 ULP in the region where the argument to log1p falls in the k=0 interval, i.e. x close to 1: - SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2 - want 0x1.ed23399f51373p-2. */ + SV_NAME_D1 (acosh)(0x1.1e80ed12f0ad1p+0) got 0x1.ef0cee7c33ce1p-2 + want 0x1.ef0cee7c33ce4p-2. */ svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg) { /* (ix - One) >= (BigBound - One). 
*/ diff --git a/sysdeps/aarch64/fpu/acospi_advsimd.c b/sysdeps/aarch64/fpu/acospi_advsimd.c new file mode 100644 index 0000000..bb6c209 --- /dev/null +++ b/sysdeps/aarch64/fpu/acospi_advsimd.c @@ -0,0 +1,118 @@ +/* Double-Precision vector (Advanced SIMD) inverse cospi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +static const struct data +{ + float64x2_t c0, c2, c4, c6, c8, c10; + uint64x2_t abs_mask; + float64x2_t one, inv_pi; + double c1, c3, c5, c7, c9, c11; +} data = { + /* Coefficients of polynomial P such that asin(x)/pi~ x/pi + x^3 * poly(x^2) + on [ 0x1p-126 0x1p-2 ]. rel error: 0x1.ef9f94b1p-33. Generated using + iterative approach for minimisation of relative error in asinpif Sollya + file. */ + .c0 = V2 (0x1.b2995e7b7b5fbp-5), .c1 = 0x1.8723a1d58d83p-6, + .c2 = V2 (0x1.d1a452eacf2fep-7), .c3 = 0x1.3ce52c4d75582p-7, + .c4 = V2 (0x1.d2b2a0aea27d5p-8), .c5 = 0x1.6a0b9b92cad8bp-8, + .c6 = V2 (0x1.2290c84438caep-8), .c7 = 0x1.efba896580d02p-9, + .c8 = V2 (0x1.44446707af38p-9), .c9 = 0x1.5070b3e7aa03ep-8, + .c10 = V2 (-0x1.c70015d0ebdafp-9), .c11 = 0x1.27029c383fed9p-7, + .abs_mask = V2 (0x7fffffffffffffff), .one = V2 (1.0), + .inv_pi = V2 (0x1.45f306dc9c883p-2), +}; + +/* Double-precision implementation of vector acospi(x). + + For |x| in [0, 0.5], use order-11 polynomial P to approximate asinpi + such that the final approximation of acospi is an odd polynomial: + + acospi(x) ~ 1/2 - (x/pi + x^3 P(x^2)). + + The largest observed error in this region is 1.35 ulp: + _ZGVnN2v_acospi (0x1.fb16ed35a6d64p-2) got 0x1.5722a3dbcafb4p-2 + want 0x1.5722a3dbcafb5p-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acospi(x) = y/pi + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.55 ulp: + _ZGVnN2v_acospi (0x1.d90d50357410cp-1) got 0x1.ffd43d5dd3a9ep-4 + want 0x1.ffd43d5dd3a9bp-4. */ +float64x2_t VPCS_ATTR NOINLINE V_NAME_D1 (acospi) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t ia = vandq_u64 (ix, d->abs_mask); + + float64x2_t ax = vreinterpretq_f64_u64 (ia); + uint64x2_t a_le_half = vcaltq_f64 (x, v_f64 (0.5)); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + float64x2_t z2 = vbslq_f64 (a_le_half, vmulq_f64 (x, x), + vfmsq_n_f64 (v_f64 (0.5), ax, 0.5)); + float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + + /* Order-11 Estrin. 
*/ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p411 = vfmaq_f64 (p47, z8, p811); + float64x2_t p = vfmaq_f64 (p03, z8, p411); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f64 (d->inv_pi, z2, p); + p = vmulq_f64 (p, z); + + /* acospi(|x|) + = 1/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = 1 - 2 Q(|x|) , for -1.0 < x < -0.5. */ + float64x2_t y = vbslq_f64 (d->abs_mask, p, x); + uint64x2_t is_neg = vcltzq_f64 (x); + float64x2_t off = vreinterpretq_f64_u64 ( + vandq_u64 (is_neg, vreinterpretq_u64_f64 (d->one))); + float64x2_t mul = vbslq_f64 (a_le_half, d->one, v_f64 (-2.0)); + float64x2_t add = vbslq_f64 (a_le_half, v_f64 (0.5), off); + + return vfmsq_f64 (add, mul, y); +} diff --git a/sysdeps/aarch64/fpu/acospi_sve.c b/sysdeps/aarch64/fpu/acospi_sve.c new file mode 100644 index 0000000..e41eaad --- /dev/null +++ b/sysdeps/aarch64/fpu/acospi_sve.c @@ -0,0 +1,112 @@ +/* Double-Precision vector (SVE) inverse cospi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +static const struct data +{ + float64_t c1, c3, c5, c7, c9, c11; + float64_t c0, c2, c4, c6, c8, c10; + float64_t inv_pi, half; +} data = { + /* Coefficients of polynomial P such that asin(x)/pi~ x/pi + x^3 * poly(x^2) + on [ 0x1p-126 0x1p-2 ]. rel error: 0x1.ef9f94b1p-33. Generated using + iterative approach for minimisation of relative error in asinpif Sollya + file. */ + .c0 = 0x1.b2995e7b7b5fbp-5, .c1 = 0x1.8723a1d58d83p-6, + .c2 = 0x1.d1a452eacf2fep-7, .c3 = 0x1.3ce52c4d75582p-7, + .c4 = 0x1.d2b2a0aea27d5p-8, .c5 = 0x1.6a0b9b92cad8bp-8, + .c6 = 0x1.2290c84438caep-8, .c7 = 0x1.efba896580d02p-9, + .c8 = 0x1.44446707af38p-9, .c9 = 0x1.5070b3e7aa03ep-8, + .c10 = -0x1.c70015d0ebdafp-9, .c11 = 0x1.27029c383fed9p-7, + .inv_pi = 0x1.45f306dc9c883p-2, .half = 0.5, +}; + +/* Double-precision SVE implementation of vector acospi(x). + + For |x| in [0, 0.5], use order 11 polynomial P to approximate asinpi + such that the final approximation of acospi is: + + acospi(x) ~ 1/2 - (x/pi + x^3 P(x^2)). + + The largest observed error in this region is 1.35 ulp: + _ZGVsMxv_acospi (0x1.fb014996aea18p-2) got 0x1.572a91755bbf6p-2 + want 0x1.572a91755bbf7p-2. 
+ + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acospi(x) = y/pi + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.55 ulp: + _ZGVsMxv_acospi(0x1.d90d50357410cp-1) got 0x1.ffd43d5dd3a9ep-4 + want 0x1.ffd43d5dd3a9bp-4. */ +svfloat64_t SV_NAME_D1 (acospi) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b64 (); + + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t ax = svabs_x (pg, x); + svbool_t a_gt_half = svacgt (pg, x, 0.5f); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + svfloat64_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5), + svmul_x (ptrue, x, x)); + svfloat64_t z = svsqrt_m (ax, a_gt_half, z2); + + /* Order-11 Estrin. */ + svfloat64_t z4 = svmul_x (ptrue, z2, z2); + svfloat64_t z8 = svmul_x (ptrue, z4, z4); + + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + + svfloat64_t p411 = svmla_x (pg, p47, z8, p811); + svfloat64_t p = svmla_x (pg, p03, z8, p411); + + p = svmla_x (pg, sv_f64 (d->inv_pi), z2, p); + p = svmul_x (ptrue, p, z); + + /* acospi(|x|) = 1/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = 1 - 2 Q(|x|) , for -1.0 < x < -0.5. */ + svfloat64_t mul = svreinterpret_f64 ( + svlsl_m (a_gt_half, svreinterpret_u64 (sv_f64 (1.0)), 10)); + mul = svreinterpret_f64 (sveor_x (ptrue, svreinterpret_u64 (mul), sign)); + svfloat64_t add = svreinterpret_f64 ( + svorr_x (ptrue, sign, svreinterpret_u64 (sv_f64 (d->half)))); + add = svsub_m (a_gt_half, sv_f64 (d->half), add); + + return svmsb_x (pg, p, mul, add); +} diff --git a/sysdeps/aarch64/fpu/acospif_advsimd.c b/sysdeps/aarch64/fpu/acospif_advsimd.c new file mode 100644 index 0000000..8486b62 --- /dev/null +++ b/sysdeps/aarch64/fpu/acospif_advsimd.c @@ -0,0 +1,106 @@ +/* Single-Precision vector (Advanced SIMD) inverse cospi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "v_math.h" + +static const struct data +{ + float32x4_t c0, c2, c4, inv_pi; + float c1, c3, c5, null; +} data = { + /* Coefficients of polynomial P such that asin(x)/pi~ x/pi + x^3 * poly(x^2) + on [ 0x1p-126 0x1p-2 ]. rel error: 0x1.ef9f94b1p-33. Generated using + iterative approach for minimisation of relative error in asinpif Sollya + file. */ + .c0 = V4 (0x1.b2995ep-5f), .c1 = 0x1.8724ep-6f, + .c2 = V4 (0x1.d1301ep-7f), .c3 = 0x1.446d3cp-7f, + .c4 = V4 (0x1.654848p-8f), .c5 = 0x1.5fdaa8p-7f, + .inv_pi = V4 (0x1.45f306p-2f), +}; + +#define AbsMask 0x7fffffff + +/* Single-precision implementation of vector acospi(x). + + For |x| in [0, 0.5], use order 5 polynomial P to approximate asinpi + such that the final approximation of acospi is an odd polynomial: + + acospi(x) ~ 1/2 - (x/pi + x^3 P(x^2)). + + The largest observed error in this region is 1.23 ulps, + _ZGVnN4v_acospif (0x1.fee13ep-2) got 0x1.55beb4p-2 want 0x1.55beb2p-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acospi(x) = y/pi + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.53 ulps, + _ZGVnN4v_acospif (0x1.6ad644p-1) got 0x1.fe8f96p-3 + want 0x1.fe8f9cp-3. */ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acospi) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask)); + + float32x4_t ax = vreinterpretq_f32_u32 (ia); + uint32x4_t a_le_half = vcaltq_f32 (x, v_f32 (0.5f)); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + + float32x4_t z2 = vbslq_f32 (a_le_half, vmulq_f32 (x, x), + vfmsq_n_f32 (v_f32 (0.5f), ax, 0.5f)); + float32x4_t z = vbslq_f32 (a_le_half, ax, vsqrtq_f32 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + + /* Order-5 Estrin evaluation scheme. */ + float32x4_t z4 = vmulq_f32 (z2, z2); + float32x4_t z8 = vmulq_f32 (z4, z4); + float32x4_t c135 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c135, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c135, 1); + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c135, 2); + float32x4_t p = vfmaq_f32 (p03, z8, p45); + /* Add 1/pi as final coeff. */ + p = vfmaq_f32 (d->inv_pi, z2, p); + + /* Finalize polynomial: z * P(z^2). */ + p = vmulq_f32 (z, p); + + /* acospi(|x|) + = 1/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = 1 - 2 Q(|x|) , for -1.0 < x < -0.5. */ + + float32x4_t y = vbslq_f32 (v_u32 (AbsMask), p, x); + uint32x4_t is_neg = vcltzq_f32 (x); + float32x4_t off = vreinterpretq_f32_u32 ( + vandq_u32 (vreinterpretq_u32_f32 (v_f32 (1.0f)), is_neg)); + float32x4_t mul = vbslq_f32 (a_le_half, v_f32 (1.0f), v_f32 (-2.0f)); + float32x4_t add = vbslq_f32 (a_le_half, v_f32 (0.5f), off); + + return vfmsq_f32 (add, mul, y); +} +libmvec_hidden_def (V_NAME_F1 (acospi)) +HALF_WIDTH_ALIAS_F1 (acospi) diff --git a/sysdeps/aarch64/fpu/acospif_sve.c b/sysdeps/aarch64/fpu/acospif_sve.c new file mode 100644 index 0000000..ea4fc4a --- /dev/null +++ b/sysdeps/aarch64/fpu/acospif_sve.c @@ -0,0 +1,91 @@ +/* Single-Precision vector (SVE) inverse cospi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +static const struct data +{ + float32_t c0, c1, c2, c3, c4, inv_pi, half; +} data = { + /* Coefficients of polynomial P such that asin(x)/pi~ x/pi + x^3 * poly(x^2) + on [ 0x1p-126 0x1p-2 ]. rel error: 0x1.ef9f94b1p-33. Generated using + iterative approach for minimisation of relative error. */ + .c0 = 0x1.b29968p-5f, .c1 = 0x1.871424p-6f, .c2 = 0x1.d56e44p-7f, + .c3 = 0x1.149bb8p-7f, .c4 = 0x1.8e07fep-7f, .inv_pi = 0x1.45f306p-2f, + .half = 0.5f, +}; + +/* Single-precision SVE implementation of vector acospi(x). + + For |x| in [0, 0.5], use order 5 polynomial P to approximate asinpi + such that the final approximation of acospi is: + + acospi(x) ~ 1/2 - (x/pi + x^3 P(x^2)). + + The largest observed error in this region is 1.3 ulps, + _ZGVsMxv_acospif(0x1.ffa9d2p-2) got 0x1.557504p-2 + want 0x1.557502p-2. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + acospi(x) = y/pi + y * z * P(z), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 2.61 ulps, + _ZGVsMxv_acospif (0x1.6b232ep-1) got 0x1.fe04bap-3 + want 0x1.fe04cp-3. */ +svfloat32_t SV_NAME_F1 (acospi) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svbool_t ptrue = svptrue_b32 (); + + svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000); + svfloat32_t ax = svabs_x (pg, x); + svbool_t a_gt_half = svacgt (pg, x, 0.5f); + + /* Evaluate polynomial Q(x) = z + z * z2 * P(z2) with + z2 = x ^ 2 and z = |x| , if |x| < 0.5 + z2 = (1 - |x|) / 2 and z = sqrt(z2), if |x| >= 0.5. */ + svfloat32_t z2 = svsel (a_gt_half, svmls_x (pg, sv_f32 (0.5f), ax, 0.5f), + svmul_x (ptrue, x, x)); + svfloat32_t z = svsqrt_m (ax, a_gt_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ + svfloat32_t p = svmla_x (pg, sv_f32 (d->c3), z2, d->c4); + p = svmad_x (pg, z2, p, d->c2); + p = svmad_x (pg, z2, p, d->c1); + p = svmad_x (pg, z2, p, d->c0); + /* Add 1/pi as final coeff. */ + p = svmla_x (pg, sv_f32 (d->inv_pi), z2, p); + /* Finalize polynomial: z * P(z^2). */ + p = svmul_x (ptrue, z, p); + + /* acospi(|x|) + = 1/2 - sign(x) * Q(|x|), for |x| < 0.5 + = 2 Q(|x|) , for 0.5 < x < 1.0 + = 1 - 2 Q(|x|) , for -1.0 < x < -0.5. 
*/ + svfloat32_t y + = svreinterpret_f32 (svorr_x (ptrue, svreinterpret_u32 (p), sign)); + svfloat32_t mul = svsel (a_gt_half, sv_f32 (2.0f), sv_f32 (-1.0f)); + svfloat32_t add = svreinterpret_f32 ( + svorr_x (ptrue, sign, svreinterpret_u32 (sv_f32 (d->half)))); + add = svsub_m (a_gt_half, sv_f32 (d->half), add); + + return svmad_x (pg, y, mul, add); +} diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h index 38681a4..c202bda 100644 --- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h +++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h @@ -19,10 +19,13 @@ libmvec_hidden_proto (V_NAME_F1(acos)); libmvec_hidden_proto (V_NAME_F1(acosh)); +libmvec_hidden_proto (V_NAME_F1(acospi)); libmvec_hidden_proto (V_NAME_F1(asin)); libmvec_hidden_proto (V_NAME_F1(asinh)); +libmvec_hidden_proto (V_NAME_F1(asinpi)); libmvec_hidden_proto (V_NAME_F1(atan)); libmvec_hidden_proto (V_NAME_F1(atanh)); +libmvec_hidden_proto (V_NAME_F1(atanpi)); libmvec_hidden_proto (V_NAME_F1(cbrt)); libmvec_hidden_proto (V_NAME_F1(cos)); libmvec_hidden_proto (V_NAME_F1(cosh)); @@ -47,3 +50,4 @@ libmvec_hidden_proto (V_NAME_F1(tan)); libmvec_hidden_proto (V_NAME_F1(tanh)); libmvec_hidden_proto (V_NAME_F1(tanpi)); libmvec_hidden_proto (V_NAME_F2(atan2)); +libmvec_hidden_proto (V_NAME_F2(atan2pi)); diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c index 4142116..f74141c 100644 --- a/sysdeps/aarch64/fpu/asin_advsimd.c +++ b/sysdeps/aarch64/fpu/asin_advsimd.c @@ -18,24 +18,23 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" static const struct data { - float64x2_t poly[12]; + float64x2_t c0, c2, c4, c6, c8, c10; float64x2_t pi_over_2; uint64x2_t abs_mask; + double c1, c3, c5, c7, c9, c11; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ - .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4), - V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6), - V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6), - V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7), - V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6), - V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), }, - .pi_over_2 = V2 (0x1.921fb54442d18p+0), - .abs_mask = V2 (0x7fffffffffffffff), + .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, + .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, + .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, + .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, + .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), }; #define AllMask v_u64 (0xffffffffffffffff) @@ -68,8 +67,8 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special) asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). The largest observed error in this region is 2.69 ulps, - _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 - want 0x1.110d7e85fdd53p-1. */ + _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1 + want 0x1.1111dd54ddf99p-1. 
*/ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -86,7 +85,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) return special_case (x, x, AllMask); #endif - uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5)); + uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5)); /* Evaluate polynomial Q(x) = y + y * z * P(z) with z = x ^ 2 and y = |x| , if |x| < 0.5 @@ -99,7 +98,26 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x) float64x2_t z4 = vmulq_f64 (z2, z2); float64x2_t z8 = vmulq_f64 (z4, z4); float64x2_t z16 = vmulq_f64 (z8, z8); - float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly); + + /* order-11 estrin. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); + float64x2_t p = vfmaq_f64 (p07, z16, p811); /* Finalize polynomial: z + z * z2 * P(z2). */ p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); diff --git a/sysdeps/aarch64/fpu/asin_sve.c b/sysdeps/aarch64/fpu/asin_sve.c index 9314466..975f408 100644 --- a/sysdeps/aarch64/fpu/asin_sve.c +++ b/sysdeps/aarch64/fpu/asin_sve.c @@ -18,45 +18,43 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[12]; - float64_t pi_over_2f; + float64_t c1, c3, c5, c7, c9, c11; + float64_t c0, c2, c4, c6, c8, c10; + float64_t pi_over_2; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ - .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, - 0x1.6db6db67f6d9fp-5, 0x1.f1c71fbd29fbbp-6, - 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6, - 0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, - 0x1.fd1151acb6bedp-8, 0x1.087182f799c1dp-6, - -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, }, - .pi_over_2f = 0x1.921fb54442d18p+0, + .c0 = 0x1.555555555554ep-3, .c1 = 0x1.3333333337233p-4, + .c2 = 0x1.6db6db67f6d9fp-5, .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = 0x1.6e8b264d467d6p-6, .c5 = 0x1.1c5997c357e9dp-6, + .c6 = 0x1.c86a22cd9389dp-7, .c7 = 0x1.856073c22ebbep-7, + .c8 = 0x1.fd1151acb6bedp-8, .c9 = 0x1.087182f799c1dp-6, + .c10 = -0x1.6602748120927p-7, .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = 0x1.921fb54442d18p+0, }; -#define P(i) sv_f64 (d->poly[i]) - /* Double-precision SVE implementation of vector asin(x). For |x| in [0, 0.5], use an order 11 polynomial P such that the final approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). - The largest observed error in this region is 0.52 ulps, - _ZGVsMxv_asin(0x1.d95ae04998b6cp-2) got 0x1.ec13757305f27p-2 - want 0x1.ec13757305f26p-2. - - For |x| in [0.5, 1.0], use same approximation with a change of variable + The largest observed error in this region is 0.98 ulp: + _ZGVsMxv_asin (0x1.d98f6a748ed8ap-2) got 0x1.ec4eb661a73d3p-2 + want 0x1.ec4eb661a73d2p-2. - asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). 
+ For |x| in [0.5, 1.0], use same approximation with a change of variable: + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). - The largest observed error in this region is 2.69 ulps, - _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1 - want 0x1.110d7e85fdd53p-1. */ + The largest observed error in this region is 2.66 ulp: + _ZGVsMxv_asin (0x1.04024f6e2a2fbp-1) got 0x1.10b9586f087a8p-1 + want 0x1.10b9586f087abp-1. */ svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b64 (); svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); svfloat64_t ax = svabs_x (pg, x); @@ -70,17 +68,37 @@ svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg) svfloat64_t z = svsqrt_m (ax, a_ge_half, z2); /* Use a single polynomial approximation P for both intervals. */ + svfloat64_t z3 = svmul_x (pg, z2, z); svfloat64_t z4 = svmul_x (pg, z2, z2); svfloat64_t z8 = svmul_x (pg, z4, z4); - svfloat64_t z16 = svmul_x (pg, z8, z8); - svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly); + + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + + /* Order-11 Estrin scheme. */ + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + + svfloat64_t p411 = svmla_x (pg, p47, z8, p811); + svfloat64_t p = svmla_x (pg, p03, z8, p411); + /* Finalize polynomial: z + z * z2 * P(z2). */ - p = svmla_x (pg, z, svmul_x (pg, z, z2), p); + p = svmla_x (pg, z, z3, p); - /* asin(|x|) = Q(|x|) , for |x| < 0.5 - = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ - svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2f); + /* asin(|x|) = Q(|x|), for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2); - /* Copy sign. */ + /* Reinsert the sign from the argument. */ return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign)); } diff --git a/sysdeps/aarch64/fpu/asinf_advsimd.c b/sysdeps/aarch64/fpu/asinf_advsimd.c index 52c7c0e..013936c 100644 --- a/sysdeps/aarch64/fpu/asinf_advsimd.c +++ b/sysdeps/aarch64/fpu/asinf_advsimd.c @@ -18,22 +18,21 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f32.h" static const struct data { - float32x4_t poly[5]; + float32x4_t c0, c2, c4; + float c1, c3; float32x4_t pi_over_2f; } data = { /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ - .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5), - V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) }, - .pi_over_2f = V4 (0x1.921fb6p+0f), + .c0 = V4 (0x1.55555ep-3f), .c1 = 0x1.33261ap-4f, + .c2 = V4 (0x1.70d7dcp-5f), .c3 = 0x1.b059dp-6f, + .c4 = V4 (0x1.3af7d8p-5f), .pi_over_2f = V4 (0x1.921fb6p+0f), }; #define AbsMask 0x7fffffff -#define Half 0x3f000000 #define One 0x3f800000 #define Small 0x39800000 /* 2^-12. 
*/ @@ -47,11 +46,8 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) /* Single-precision implementation of vector asin(x). - For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct - rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the - following approximation. - For |x| in [Small, 0.5], use order 4 polynomial P such that the final + For |x| <0.5, use order 4 polynomial P such that the final approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). The largest observed error in this region is 0.83 ulps, @@ -80,24 +76,31 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x) #endif float32x4_t ax = vreinterpretq_f32_u32 (ia); - uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half)); + uint32x4_t a_lt_half = vcaltq_f32 (x, v_f32 (0.5f)); /* Evaluate polynomial Q(x) = y + y * z * P(z) with z = x ^ 2 and y = |x| , if |x| < 0.5 z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x), - vfmsq_n_f32 (v_f32 (0.5), ax, 0.5)); + vfmsq_n_f32 (v_f32 (0.5f), ax, 0.5f)); float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2)); /* Use a single polynomial approximation P for both intervals. */ - float32x4_t p = v_horner_4_f32 (z2, d->poly); + + /* PW Horner 3 evaluation scheme. */ + float32x4_t z4 = vmulq_f32 (z2, z2); + float32x4_t c13 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c13, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c13, 1); + float32x4_t p = vfmaq_f32 (p23, d->c4, z4); + p = vfmaq_f32 (p01, p, z4); /* Finalize polynomial: z + z * z2 * P(z2). */ p = vfmaq_f32 (z, vmulq_f32 (z, z2), p); /* asin(|x|) = Q(|x|) , for |x| < 0.5 = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ float32x4_t y - = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0)); + = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0f)); /* Copy sign. */ return vbslq_f32 (v_u32 (AbsMask), y, x); diff --git a/sysdeps/aarch64/fpu/asinpi_advsimd.c b/sysdeps/aarch64/fpu/asinpi_advsimd.c new file mode 100644 index 0000000..b11f98b --- /dev/null +++ b/sysdeps/aarch64/fpu/asinpi_advsimd.c @@ -0,0 +1,109 @@ +/* Double-Precision vector (Advanced SIMD) inverse sinpi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +static const struct data +{ + float64x2_t c0, c2, c4, c6, c8, c10; + float64x2_t pi_over_2, inv_pi; + uint64x2_t abs_mask; + double c1, c3, c5, c7, c9, c11; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. 
*/ + .c0 = V2 (0x1.555555555554ep-3), .c1 = 0x1.3333333337233p-4, + .c2 = V2 (0x1.6db6db67f6d9fp-5), .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = V2 (0x1.6e8b264d467d6p-6), .c5 = 0x1.1c5997c357e9dp-6, + .c6 = V2 (0x1.c86a22cd9389dp-7), .c7 = 0x1.856073c22ebbep-7, + .c8 = V2 (0x1.fd1151acb6bedp-8), .c9 = 0x1.087182f799c1dp-6, + .c10 = V2 (-0x1.6602748120927p-7), .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff), + .inv_pi = V2 (0x1.45f306dc9c883p-2), +}; + +/* Double-precision implementation of vector asinpi(x). + + For |x| in [0, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + asinpi(x) = asin(x) * 1/pi. + + The largest observed error in this region is 1.63 ulps, + _ZGVnN2v_asinpi (0x1.9125919fa617p-19) got 0x1.fec183497ea53p-21 + want 0x1.fec183497ea51p-21. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 3.04 ulps, + _ZGVnN2v_asinpi (0x1.0479b7bd98553p-1) got 0x1.5beebec797326p-3 + want 0x1.5beebec797329p-3. */ + +float64x2_t VPCS_ATTR V_NAME_D1 (asinpi) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + float64x2_t ax = vabsq_f64 (x); + + uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5)); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + float64x2_t z2 = vbslq_f64 (a_lt_half, vmulq_f64 (x, x), + vfmsq_n_f64 (v_f64 (0.5), ax, 0.5)); + float64x2_t z = vbslq_f64 (a_lt_half, ax, vsqrtq_f64 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); + + /* order-11 Estrin. */ + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); + + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); + + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); + float64x2_t p = vfmaq_f64 (p07, z16, p811); + + /* Finalize polynomial: z + z * z2 * P(z2). */ + p = vfmaq_f64 (z, vmulq_f64 (z, z2), p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + float64x2_t y = vbslq_f64 (a_lt_half, p, vfmsq_n_f64 (d->pi_over_2, p, 2.0)); + /* asinpi(|x|) = asin(|x|) /pi. */ + y = vmulq_f64 (y, d->inv_pi); + + /* Copy sign. */ + return vbslq_f64 (d->abs_mask, y, x); +} diff --git a/sysdeps/aarch64/fpu/asinpi_sve.c b/sysdeps/aarch64/fpu/asinpi_sve.c new file mode 100644 index 0000000..71ef8ce --- /dev/null +++ b/sysdeps/aarch64/fpu/asinpi_sve.c @@ -0,0 +1,107 @@ +/* Double-Precision vector (SVE) inverse sinpi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +static const struct data +{ + float64_t c1, c3, c5, c7, c9, c11; + float64_t c0, c2, c4, c6, c8, c10; + float64_t pi_over_2, inv_pi; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) + on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57. */ + .c0 = 0x1.555555555554ep-3, .c1 = 0x1.3333333337233p-4, + .c2 = 0x1.6db6db67f6d9fp-5, .c3 = 0x1.f1c71fbd29fbbp-6, + .c4 = 0x1.6e8b264d467d6p-6, .c5 = 0x1.1c5997c357e9dp-6, + .c6 = 0x1.c86a22cd9389dp-7, .c7 = 0x1.856073c22ebbep-7, + .c8 = 0x1.fd1151acb6bedp-8, .c9 = 0x1.087182f799c1dp-6, + .c10 = -0x1.6602748120927p-7, .c11 = 0x1.cfa0dd1f9478p-6, + .pi_over_2 = 0x1.921fb54442d18p+0, .inv_pi = 0x1.45f306dc9c883p-2, +}; + +/* Double-precision SVE implementation of vector asinpi(x). + + For |x| in [0, 0.5], use an order 11 polynomial P such that the final + approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2). + + The largest observed error in this region is 1.32 ulp: + _ZGVsMxv_asinpi (0x1.fc12356dbdefbp-2) got 0x1.5272e9658ba66p-3 + want 0x1.5272e9658ba64p-3 + + For |x| in [0.5, 1.0], use same approximation with a change of variable: + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 3.48 ulp: + _ZGVsMxv_asinpi (0x1.03da0c2295424p-1) got 0x1.5b02b3dcafaefp-3 + want 0x1.5b02b3dcafaf2p-3. */ +svfloat64_t SV_NAME_D1 (asinpi) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b64 (); + + svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000); + svfloat64_t ax = svabs_x (pg, x); + svbool_t a_ge_half = svacge (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + svfloat64_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f64 (0.5), ax, 0.5), + svmul_x (ptrue, x, x)); + svfloat64_t z = svsqrt_m (ax, a_ge_half, z2); + + /* Use a single polynomial approximation P for both intervals. */ + svfloat64_t z3 = svmul_x (pg, z2, z); + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); + + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + + /* Order-11 Estrin scheme. 
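For reference, the pairwise grouping behind this scheme can be written out in scalar form; estrin_11 below is a hypothetical helper with c[0..11] the coefficients from the data table. Pairs are combined with z2, pairs of pairs with z4, and the three quartets with z8/z16, keeping the FMA dependency chains short; in the vector code svld1rq loads two adjacent odd-index coefficients into one 128-bit register and svmla_lane selects them by lane, so only the even-index coefficients need separate broadcasts.

#include <math.h>

/* Scalar sketch of an order-11 Estrin evaluation of P(z2).  */
static double
estrin_11 (double z2, const double c[12])
{
  double z4 = z2 * z2;
  double z8 = z4 * z4;

  /* Degree-1 pairs.  */
  double p01 = fma (z2, c[1], c[0]);
  double p23 = fma (z2, c[3], c[2]);
  double p45 = fma (z2, c[5], c[4]);
  double p67 = fma (z2, c[7], c[6]);
  double p89 = fma (z2, c[9], c[8]);
  double p1011 = fma (z2, c[11], c[10]);

  /* Degree-3 quartets.  */
  double p03 = fma (z4, p23, p01);
  double p47 = fma (z4, p67, p45);
  double p811 = fma (z4, p1011, p89);

  /* Final combination: p03 + z8 * p47 + z16 * p811.  */
  double p411 = fma (z8, p811, p47);
  return fma (z8, p411, p03);
}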
*/ + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + + svfloat64_t p411 = svmla_x (pg, p47, z8, p811); + svfloat64_t p = svmla_x (pg, p03, z8, p411); + + /* Finalize polynomial: z + z3 * P(z2). */ + p = svmla_x (pg, z, z3, p); + + /* asin(|x|) = Q(|x|) , for |x| < 0.5 + = pi/2 - 2 Q(|x|), for |x| >= 0.5. */ + svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2); + + /* Reinsert the sign from the argument. */ + svfloat64_t inv_pi = svreinterpret_f64 ( + svorr_x (pg, svreinterpret_u64 (sv_f64 (d->inv_pi)), sign)); + + return svmul_x (pg, y, inv_pi); +} diff --git a/sysdeps/aarch64/fpu/asinpif_advsimd.c b/sysdeps/aarch64/fpu/asinpif_advsimd.c new file mode 100644 index 0000000..1483ea8 --- /dev/null +++ b/sysdeps/aarch64/fpu/asinpif_advsimd.c @@ -0,0 +1,95 @@ +/* Single-Precision vector (Advanced SIMD) inverse sinpi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +static const struct data +{ + float32x4_t c0, c2, c4, inv_pi; + float c1, c3, c5, null; +} data = { + /* Coefficients of polynomial P such that asin(x)/pi~ x/pi + x^3 * poly(x^2) + on [ 0x1p-126 0x1p-2 ]. rel error: 0x1.ef9f94b1p-33. Generated using + iterative approach for minimisation of relative error in Sollya file. */ + .c0 = V4 (0x1.b2995ep-5f), .c1 = 0x1.8724ep-6f, + .c2 = V4 (0x1.d1301ep-7f), .c3 = 0x1.446d3cp-7f, + .c4 = V4 (0x1.654848p-8f), .c5 = 0x1.5fdaa8p-7f, + .inv_pi = V4 (0x1.45f306p-2f), +}; + +#define AbsMask 0x7fffffff + +/* Single-precision implementation of vector asinpi(x). + + For |x| < 0.5, use order 5 polynomial P such that the final + approximation is an odd polynomial: asinpif(x) ~ x/pi + x^3 P(x^2). + + The largest observed error in this region is 1.68 ulps, + _ZGVnN4v_asinpif (0x1.86e514p-2) got 0x1.fea8c8p-4 want 0x1.fea8ccp-4. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asin(x) = pi/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). + + The largest observed error in this region is 3.49 ulps, + _ZGVnN4v_asinpif(0x1.0d93fep-1) got 0x1.697aap-3 want 0x1.697a9ap-3. 
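A scalar model of this folding (illustrative names only, not part of this change): instead of computing asin and scaling by 1/pi afterwards, 1/pi is added as the final polynomial step, so Q already approximates asinpi directly.

#include <math.h>

/* Order-5 polynomial from the data table, Horner form for clarity.  */
static float
poly_P5 (float z)
{
  static const float c[] = { 0x1.b2995ep-5f, 0x1.8724ep-6f, 0x1.d1301ep-7f,
                             0x1.446d3cp-7f, 0x1.654848p-8f, 0x1.5fdaa8p-7f };
  float p = c[5];
  for (int i = 4; i >= 0; i--)
    p = fmaf (z, p, c[i]);
  return p;
}

/* Scalar model of asinpif: Q = y/pi + y*z*P(z), then Q or 1/2 - 2*Q.  */
static float
asinpif_ref (float x)
{
  float ax = fabsf (x);
  float z = ax < 0.5f ? ax * ax : (1.0f - ax) * 0.5f;
  float y = ax < 0.5f ? ax : sqrtf (z);
  float p = fmaf (z, poly_P5 (z), 0x1.45f306p-2f); /* 1/pi + z * P(z).  */
  float q = y * p;
  return copysignf (ax < 0.5f ? q : 0.5f - 2.0f * q, x);
}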
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinpi) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t ia = vandq_u32 (ix, v_u32 (AbsMask)); + + float32x4_t ax = vreinterpretq_f32_u32 (ia); + uint32x4_t a_lt_half = vcaltq_f32 (x, v_f32 (0.5f)); + + /* Evaluate polynomial Q(x) = y/pi + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x), + vfmsq_n_f32 (v_f32 (0.5f), ax, 0.5f)); + float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2)); + + /* Use a single polynomial approximation P for both intervals. */ + + /* Order-5 Estrin evaluation scheme. */ + float32x4_t z4 = vmulq_f32 (z2, z2); + float32x4_t z8 = vmulq_f32 (z4, z4); + float32x4_t c135 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c135, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c135, 1); + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c135, 2); + float32x4_t p = vfmaq_f32 (p03, z8, p45); + /* Add 1/pi as final coeff. */ + p = vfmaq_f32 (d->inv_pi, z2, p); + + /* Finalize polynomial: z * P(z2). */ + p = vmulq_f32 (z, p); + + /* asinpi(|x|) = Q(|x|), for |x| < 0.5 + = 1/2 - 2 Q(|x|), for |x| >= 0.5. */ + float32x4_t y + = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (v_f32 (0.5f), p, 2.0f)); + + /* Copy sign. */ + return vbslq_f32 (v_u32 (AbsMask), y, x); +} +libmvec_hidden_def (V_NAME_F1 (asinpi)) +HALF_WIDTH_ALIAS_F1 (asinpi) diff --git a/sysdeps/aarch64/fpu/asinpif_sve.c b/sysdeps/aarch64/fpu/asinpif_sve.c new file mode 100644 index 0000000..046b258 --- /dev/null +++ b/sysdeps/aarch64/fpu/asinpif_sve.c @@ -0,0 +1,88 @@ +/* Single-Precision vector (SVE) inverse sinpi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +static const struct data +{ + float32_t c1, c3, c5; + float32_t c0, c2, c4, inv_pi; +} data = { + /* Polynomial approximation of (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x)) on + [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 . */ + .c0 = 0x1.b2995ep-5f, .c1 = 0x1.8724ep-6f, .c2 = 0x1.d1301ep-7f, + .c3 = 0x1.446d3cp-7f, .c4 = 0x1.654848p-8f, .c5 = 0x1.5fdaa8p-7f, + .inv_pi = 0x1.45f306p-2f, +}; + +/* Single-precision SVE implementation of vector asin(x). + + For |x| in [0, 0.5], use order 5 polynomial P such that the final + approximation is an odd polynomial: asinpi(x) ~ x/pi + x^3 P(x^2). + + The largest observed error in this region is 1.96 ulps: + _ZGVsMxv_asinpif (0x1.8e534ep-3) got 0x1.fe6ab4p-5 + want 0x1.fe6ab8p-5. + + For |x| in [0.5, 1.0], use same approximation with a change of variable + + asinpi(x) = 1/2 - (y + y * z * P(z)), with z = (1-x)/2 and y = sqrt(z). 
+ + The largest observed error in this region is 3.46 ulps: + _ZGVsMxv_asinpif (0x1.0df892p-1) got 0x1.6a114cp-3 + want 0x1.6a1146p-3. */ +svfloat32_t SV_NAME_F1 (asinpi) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b32 (); + + svuint32_t sign = svand_x (pg, svreinterpret_u32 (x), 0x80000000); + + svfloat32_t ax = svabs_x (pg, x); + svbool_t a_ge_half = svacge (pg, x, 0.5); + + /* Evaluate polynomial Q(x) = y + y * z * P(z) with + z = x ^ 2 and y = |x| , if |x| < 0.5 + z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5. */ + svfloat32_t z2 = svsel (a_ge_half, svmls_x (pg, sv_f32 (0.5), ax, 0.5), + svmul_x (pg, x, x)); + svfloat32_t z = svsqrt_m (ax, a_ge_half, z2); + + svfloat32_t z4 = svmul_x (ptrue, z2, z2); + svfloat32_t c135_two = svld1rq (ptrue, &d->c1); + + /* Order-5 Pairwise Horner evaluation scheme. */ + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, c135_two, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, c135_two, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, c135_two, 2); + + svfloat32_t p25 = svmla_x (pg, p23, z4, p45); + svfloat32_t p = svmla_x (pg, p01, z4, p25); + + /* Add 1/pi as final coeff. */ + p = svmla_x (pg, sv_f32 (d->inv_pi), z2, p); + p = svmul_x (pg, p, z); + + /* asinpi(|x|) = Q(|x|), for |x| < 0.5 + = 1/2 - 2 Q(|x|), for |x| >= 0.5. */ + svfloat32_t y = svmsb_m (a_ge_half, p, sv_f32 (2.0), 0.5); + + /* Reinsert sign from argument. */ + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); +} diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c index 00b4a4f..a31d52f 100644 --- a/sysdeps/aarch64/fpu/atan2_advsimd.c +++ b/sysdeps/aarch64/fpu/atan2_advsimd.c @@ -19,40 +19,38 @@ #include "math_config.h" #include "v_math.h" -#include "poly_advsimd_f64.h" static const struct data { + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; float64x2_t pi_over_2; - double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; - uint64x2_t zeroinfnan, minustwo; + uint64x2_t zeroinfnan; } data = { - /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on - [2**-1022, 1.0]. */ - .c0 = V2 (-0x1.5555555555555p-2), - .c1 = 0x1.99999999996c1p-3, - .c2 = V2 (-0x1.2492492478f88p-3), - .c3 = 0x1.c71c71bc3951cp-4, - .c4 = V2 (-0x1.745d160a7e368p-4), - .c5 = 0x1.3b139b6a88ba1p-4, - .c6 = V2 (-0x1.11100ee084227p-4), - .c7 = 0x1.e1d0f9696f63bp-5, - .c8 = V2 (-0x1.aebfe7b418581p-5), - .c9 = 0x1.842dbe9b0d916p-5, - .c10 = V2 (-0x1.5d30140ae5e99p-5), - .c11 = 0x1.338e31eb2fbbcp-5, - .c12 = V2 (-0x1.00e6eece7de8p-5), - .c13 = 0x1.860897b29e5efp-6, - .c14 = V2 (-0x1.0051381722a59p-6), - .c15 = 0x1.14e9dc19a4a4ep-7, - .c16 = V2 (-0x1.d0062b42fe3bfp-9), - .c17 = 0x1.17739e210171ap-10, - .c18 = V2 (-0x1.ab24da7be7402p-13), - .c19 = 0x1.358851160a528p-16, + /* Coefficients of polynomial P such that + atan(x)~x+x*P(x^2) on [2^-1022, 1.0]. 
*/ + .c0 = V2 (-0x1.555555555552ap-2), + .c1 = 0x1.9999999995aebp-3, + .c2 = V2 (-0x1.24924923923f6p-3), + .c3 = 0x1.c71c7184288a2p-4, + .c4 = V2 (-0x1.745d11fb3d32bp-4), + .c5 = 0x1.3b136a18051b9p-4, + .c6 = V2 (-0x1.110e6d985f496p-4), + .c7 = 0x1.e1bcf7f08801dp-5, + .c8 = V2 (-0x1.ae644e28058c3p-5), + .c9 = 0x1.82eeb1fed85c6p-5, + .c10 = V2 (-0x1.59d7f901566cbp-5), + .c11 = 0x1.2c982855ab069p-5, + .c12 = V2 (-0x1.eb49592998177p-6), + .c13 = 0x1.69d8b396e3d38p-6, + .c14 = V2 (-0x1.ca980345c4204p-7), + .c15 = 0x1.dc050eafde0b3p-8, + .c16 = V2 (-0x1.7ea70755b8eccp-9), + .c17 = 0x1.ba3da3de903e8p-11, + .c18 = V2 (-0x1.44a4b059b6f67p-13), + .c19 = 0x1.c4a45029e5a91p-17, .pi_over_2 = V2 (0x1.921fb54442d18p+0), .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), - .minustwo = V2 (0xc000000000000000), }; #define SignMask v_u64 (0x8000000000000000) @@ -77,10 +75,9 @@ zeroinfnan (uint64x2_t i, const struct data *d) } /* Fast implementation of vector atan2. - Maximum observed error is 2.8 ulps: - _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5) - got 0x1.92d628ab678ccp-1 - want 0x1.92d628ab678cfp-1. */ + Maximum observed error is 1.97 ulps: + _ZGVnN2vv_atan2 (0x1.42337dba73768p+5, 0x1.422d748cd3e29p+5) + got 0x1.9224810264efcp-1 want 0x1.9224810264efep-1. */ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -101,26 +98,29 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) uint64x2_t pred_xlt0 = vcltzq_f64 (x); uint64x2_t pred_aygtax = vcagtq_f64 (y, x); - /* Set up z for call to atan. */ - float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); - float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); - float64x2_t z = vdivq_f64 (n, q); - - /* Work out the correct shift. */ - float64x2_t shift - = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); - shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); - shift = vmulq_f64 (shift, d->pi_over_2); - - /* Calculate the polynomial approximation. - Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of - full scheme to avoid underflow in x^16. - The order 19 polynomial P approximates - (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + /* Set up z for evaluation of atan. */ + float64x2_t num = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); + float64x2_t den = vbslq_f64 (pred_aygtax, ay, ax); + float64x2_t z = vdivq_f64 (num, den); + + /* Work out the correct shift for atan2: + Multiplication by pi is done later. + -pi when x < 0 and ax < ay + -pi/2 when x < 0 and ax > ay + 0 when x >= 0 and ax < ay + pi/2 when x >= 0 and ax > ay. */ + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); + float64x2_t shift2 = vreinterpretq_f64_u64 ( + vandq_u64 (pred_aygtax, vreinterpretq_u64_f64 (v_f64 (1.0)))); + shift = vaddq_f64 (shift, shift2); + + /* Calculate the polynomial approximation. */ float64x2_t z2 = vmulq_f64 (z, z); - float64x2_t x2 = vmulq_f64 (z2, z2); - float64x2_t x4 = vmulq_f64 (x2, x2); - float64x2_t x8 = vmulq_f64 (x4, x4); + float64x2_t z3 = vmulq_f64 (z2, z); + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); float64x2_t c13 = vld1q_f64 (&d->c1); float64x2_t c57 = vld1q_f64 (&d->c5); @@ -128,45 +128,43 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) float64x2_t c1315 = vld1q_f64 (&d->c13); float64x2_t c1719 = vld1q_f64 (&d->c17); - /* estrin_7. */ + /* Order-7 Estrin. 
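The reduction and shift set up above can be modelled with scalar code (hypothetical names; the library atan stands in for the polynomial z + z^3 * P(z^2)): the quadrant offset is kept as a small multiple of pi/2 and only applied in the final fma, and the combined sign of x and y is xored back onto the result afterwards, as in the vector code.

#include <math.h>

/* Scalar model of the atan2 reduction on ax = |x|, ay = |y|; the caller
   re-applies the sign of x^y to the result.  */
static double
atan2_reduced (double ay, double ax, int x_is_negative)
{
  int ay_gt_ax = ay > ax;

  /* Reduced argument, always in [-1, 1].  */
  double z = ay_gt_ax ? -ax / ay : ay / ax;

  /* Quarter-turn count: -2 or -1 when x < 0, 0 or 1 when x >= 0.  */
  double shift = (x_is_negative ? -2.0 : 0.0) + (ay_gt_ax ? 1.0 : 0.0);

  /* atan (z) stands in for z + z^3 * P(z^2).  */
  return shift * 0x1.921fb54442d18p+0 + atan (z);
}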
*/ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); - float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); - float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); - float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); - /* estrin_11. */ + /* Order-11 Estrin. */ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); - float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); - float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + float64x2_t p1215 = vfmaq_f64 (p1213, z4, p1415); float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); - float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + float64x2_t p1619 = vfmaq_f64 (p1617, z4, p1819); - float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); - float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + float64x2_t p815 = vfmaq_f64 (p811, z8, p1215); + float64x2_t p819 = vfmaq_f64 (p815, z16, p1619); - float64x2_t ret = vfmaq_f64 (p07, p819, x8); + float64x2_t poly = vfmaq_f64 (p07, p819, z16); /* Finalize. y = shift + z + z^3 * P(z^2). */ - ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); - ret = vaddq_f64 (ret, shift); + float64x2_t ret = vfmaq_f64 (z, shift, d->pi_over_2); + ret = vfmaq_f64 (ret, z3, poly); if (__glibc_unlikely (v_any_u64 (special_cases))) return special_case (y, x, ret, sign_xy, special_cases); /* Account for the sign of x and y. */ - ret = vreinterpretq_f64_u64 ( + return vreinterpretq_f64_u64 ( veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); - - return ret; } diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c index 163f613..9e2dd24 100644 --- a/sysdeps/aarch64/fpu/atan2_sve.c +++ b/sysdeps/aarch64/fpu/atan2_sve.c @@ -19,25 +19,25 @@ #include "math_config.h" #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[20]; - float64_t pi_over_2; + float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-1022, 1.0]. 
*/ - .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, - 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, - -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, - 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, - -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, - 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, - -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, - .pi_over_2 = 0x1.921fb54442d18p+0, + .c0 = -0x1.555555555552ap-2, .c1 = 0x1.9999999995aebp-3, + .c2 = -0x1.24924923923f6p-3, .c3 = 0x1.c71c7184288a2p-4, + .c4 = -0x1.745d11fb3d32bp-4, .c5 = 0x1.3b136a18051b9p-4, + .c6 = -0x1.110e6d985f496p-4, .c7 = 0x1.e1bcf7f08801dp-5, + .c8 = -0x1.ae644e28058c3p-5, .c9 = 0x1.82eeb1fed85c6p-5, + .c10 = -0x1.59d7f901566cbp-5, .c11 = 0x1.2c982855ab069p-5, + .c12 = -0x1.eb49592998177p-6, .c13 = 0x1.69d8b396e3d38p-6, + .c14 = -0x1.ca980345c4204p-7, .c15 = 0x1.dc050eafde0b3p-8, + .c16 = -0x1.7ea70755b8eccp-9, .c17 = 0x1.ba3da3de903e8p-11, + .c18 = -0x1.44a4b059b6f67p-13, .c19 = 0x1.c4a45029e5a91p-17, }; - /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ static svfloat64_t NOINLINE special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret, @@ -56,15 +56,17 @@ zeroinfnan (svuint64_t i, const svbool_t pg) } /* Fast implementation of SVE atan2. Errors are greatest when y and - x are reasonably close together. The greatest observed error is 2.28 ULP: - _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732) - got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1. */ -svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) + x are reasonably close together. The greatest observed error is 1.94 ULP: + _ZGVsMxvv_atan2 (0x1.8a4bf7167228ap+5, 0x1.84971226bb57bp+5) + got 0x1.95db19dfef9ccp-1 want 0x1.95db19dfef9cep-1. */ +svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, + const svbool_t pg) { - const struct data *data_ptr = ptr_barrier (&data); + const struct data *d = ptr_barrier (&data); svuint64_t ix = svreinterpret_u64 (x); svuint64_t iy = svreinterpret_u64 (y); + svbool_t ptrue = svptrue_b64 (); svbool_t cmp_x = zeroinfnan (ix, pg); svbool_t cmp_y = zeroinfnan (iy, pg); @@ -81,32 +83,67 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) svbool_t pred_aygtax = svcmpgt (pg, ay, ax); - /* Set up z for call to atan. */ - svfloat64_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); - svfloat64_t d = svsel (pred_aygtax, ay, ax); - svfloat64_t z = svdiv_x (pg, n, d); - - /* Work out the correct shift. */ + /* Set up z for evaluation of atan. */ + svfloat64_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat64_t den = svsel (pred_aygtax, ay, ax); + svfloat64_t z = svdiv_x (pg, num, den); + + /* Work out the correct shift for atan2: + Multiplication by pi is done later. + -pi when x < 0 and ax < ay + -pi/2 when x < 0 and ax > ay + 0 when x >= 0 and ax < ay + pi/2 when x >= 0 and ax > ay. */ svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1)); + svfloat64_t shift_mul = svreinterpret_f64 ( + svorr_x (pg, sign_x, svreinterpret_u64 (sv_f64 (0x1.921fb54442d18p+0)))); shift = svsel (pred_aygtax, sv_f64 (1.0), shift); - shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift))); - shift = svmul_x (pg, shift, data_ptr->pi_over_2); + shift = svmla_x (pg, z, shift, shift_mul); /* Use split Estrin scheme for P(z^2) with deg(P)=19. 
*/ svfloat64_t z2 = svmul_x (pg, z, z); - svfloat64_t x2 = svmul_x (pg, z2, z2); - svfloat64_t x4 = svmul_x (pg, x2, x2); - svfloat64_t x8 = svmul_x (pg, x4, x4); + svfloat64_t z3 = svmul_x (pg, z2, z); + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); + svfloat64_t z16 = svmul_x (pg, z8, z8); - svfloat64_t ret = svmla_x ( - pg, sv_estrin_7_f64_x (pg, z2, x2, x4, data_ptr->poly), - sv_estrin_11_f64_x (pg, z2, x2, x4, x8, data_ptr->poly + 8), x8); + /* Order-7 Estrin. */ + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); - /* y = shift + z + z^3 * P(z^2). */ - svfloat64_t z3 = svmul_x (pg, z2, z); - ret = svmla_x (pg, z, z3, ret); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + svfloat64_t p07 = svmla_x (pg, p03, z8, p47); + + /* Order-11 Estrin. */ + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + svfloat64_t c1315 = svld1rq (ptrue, &d->c13); + svfloat64_t c1719 = svld1rq (ptrue, &d->c17); - ret = svadd_m (pg, ret, shift); + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); + + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), z2, c1315, 0); + svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), z2, c1315, 1); + svfloat64_t p1215 = svmla_x (pg, p1213, z4, p1415); + + svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), z2, c1719, 0); + svfloat64_t p1819 = svmla_lane (sv_f64 (d->c18), z2, c1719, 1); + svfloat64_t p1619 = svmla_x (pg, p1617, z4, p1819); + + svfloat64_t p815 = svmla_x (pg, p811, z8, p1215); + svfloat64_t p819 = svmla_x (pg, p815, z16, p1619); + + svfloat64_t poly = svmla_x (pg, p07, z16, p819); + + /* y = shift + z + z^3 * P(z^2). */ + svfloat64_t ret = svmla_x (pg, shift, z3, poly); /* Account for the sign of x and y. */ if (__glibc_unlikely (svptest_any (pg, cmp_xy))) diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c index e65406f..75d8738 100644 --- a/sysdeps/aarch64/fpu/atan2f_advsimd.c +++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c @@ -18,22 +18,22 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f32.h" static const struct data { - float32x4_t c0, pi_over_2, c4, c6, c2; + float32x4_t c0, c4, c6, c2; float c1, c3, c5, c7; uint32x4_t comp_const; + float32x4_t pi; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. Generated using fpminimax between FLT_MIN and 1. */ - .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, - .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, - .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, - .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, - .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), + .c0 = V4 (-0x1.5554dcp-2), .c1 = 0x1.9978ecp-3, + .c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4, + .c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5, + .c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9, + .pi = V4 (0x1.921fb6p+1f), .comp_const = V4 (2 * 0x7f800000lu - 1), }; #define SignMask v_u32 (0x80000000) @@ -54,13 +54,13 @@ static inline uint32x4_t zeroinfnan (uint32x4_t i, const struct data *d) { /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. 
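This single unsigned compare classifies zero, infinity and NaN at once: shifting the raw bits left by one discards the sign, the unsigned wrap of 0 - 1 maps both zeros to UINT32_MAX, and any value with the maximum exponent lands at or above 2 * 0x7f800000 - 1. A scalar equivalent (illustrative helper name):

#include <stdint.h>

/* True for +/-0, +/-inf and NaN, given the raw float bits.  */
static int
is_zero_inf_nan (uint32_t bits)
{
  return (uint32_t) (2 * bits - 1) >= 2 * 0x7f800000u - 1;
}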
*/ - return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); + return vcgeq_u32 (vsubq_u32 (vshlq_n_u32 (i, 1), v_u32 (1)), d->comp_const); } /* Fast implementation of vector atan2f. Maximum observed error is - 2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: - _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 - want 0x1.967f00p-1. */ + 2.13 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]: + _ZGVnN4vv_atan2f (0x1.14a9d4p-87, 0x1.0eb886p-87) got 0x1.97aea2p-1 + want 0x1.97ae9ep-1. */ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) { const struct data *d = ptr_barrier (&data); @@ -81,28 +81,31 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) uint32x4_t pred_xlt0 = vcltzq_f32 (x); uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax); - /* Set up z for call to atanf. */ - float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); - float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); - float32x4_t z = vdivq_f32 (n, q); - - /* Work out the correct shift. */ + /* Set up z for evaluation of atanf. */ + float32x4_t num = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); + float32x4_t den = vbslq_f32 (pred_aygtax, ay, ax); + float32x4_t z = vdivq_f32 (num, den); + + /* Work out the correct shift for atan2: + Multiplication by pi is done later. + -pi when x < 0 and ax < ay + -pi/2 when x < 0 and ax > ay + 0 when x >= 0 and ax < ay + pi/2 when x >= 0 and ax > ay. */ float32x4_t shift = vreinterpretq_f32_u32 ( - vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); - shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); - shift = vmulq_f32 (shift, d->pi_over_2); - - /* Calculate the polynomial approximation. - Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, - a standard implementation using z8 creates spurious underflow - in the very last fma (when z^8 is small enough). - Therefore, we split the last fma into a mul and an fma. - Horner and single-level Estrin have higher errors that exceed - threshold. */ + vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-1.0f)))); + float32x4_t shift2 = vreinterpretq_f32_u32 ( + vandq_u32 (pred_aygtax, vreinterpretq_u32_f32 (v_f32 (0.5f)))); + shift = vaddq_f32 (shift, shift2); + + /* Calculate the polynomial approximation. */ float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z3 = vmulq_f32 (z2, z); float32x4_t z4 = vmulq_f32 (z2, z2); + float32x4_t z8 = vmulq_f32 (z4, z4); float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); @@ -110,10 +113,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) float32x4_t p03 = vfmaq_f32 (p01, z4, p23); float32x4_t p47 = vfmaq_f32 (p45, z4, p67); - float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); + float32x4_t poly = vfmaq_f32 (p03, z8, p47); /* y = shift + z * P(z^2). */ - ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); + float32x4_t ret = vfmaq_f32 (z, shift, d->pi); + ret = vfmaq_f32 (ret, z3, poly); if (__glibc_unlikely (v_any_u32 (special_cases))) { diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c index 5f26e2a..4d93419 100644 --- a/sysdeps/aarch64/fpu/atan2f_sve.c +++ b/sysdeps/aarch64/fpu/atan2f_sve.c @@ -18,18 +18,18 @@ <https://www.gnu.org/licenses/>. 
*/ #include "sv_math.h" -#include "poly_sve_f32.h" static const struct data { - float32_t poly[8]; + float32_t c0, c2, c4, c6; + float32_t c1, c3, c5, c7; float32_t pi_over_2; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. */ - .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, - -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, - .pi_over_2 = 0x1.921fb6p+0f, + .c0 = -0x1.5554dcp-2, .c1 = 0x1.9978ecp-3, .c2 = -0x1.230a94p-3, + .c3 = 0x1.b4debp-4, .c4 = -0x1.3550dap-4, .c5 = 0x1.61eebp-5, + .c6 = -0x1.0c17d4p-6, .c7 = 0x1.7ea694p-9, .pi_over_2 = 0x1.921fb6p+0f, }; /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ @@ -51,12 +51,14 @@ zeroinfnan (svuint32_t i, const svbool_t pg) /* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum - observed error is 2.95 ULP: - _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1 - want 0x1.967f00p-1. */ -svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) + observed error is 2.21 ULP: + _ZGVnN4vv_atan2f (0x1.a04aa8p+6, 0x1.9a274p+6) got 0x1.95ed3ap-1 + want 0x1.95ed36p-1. */ +svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, + const svbool_t pg) { - const struct data *data_ptr = ptr_barrier (&data); + const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b32 (); svuint32_t ix = svreinterpret_u32 (x); svuint32_t iy = svreinterpret_u32 (y); @@ -76,29 +78,42 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg) svbool_t pred_aygtax = svcmpgt (pg, ay, ax); - /* Set up z for call to atan. */ - svfloat32_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay); - svfloat32_t d = svsel (pred_aygtax, ay, ax); - svfloat32_t z = svdiv_x (pg, n, d); - - /* Work out the correct shift. */ + /* Set up z for evaluation of atanf. */ + svfloat32_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat32_t den = svsel (pred_aygtax, ay, ax); + svfloat32_t z = svdiv_x (ptrue, num, den); + + /* Work out the correct shift for atan2: + Multiplication by pi is done later. + -pi when x < 0 and ax < ay + -pi/2 when x < 0 and ax > ay + 0 when x >= 0 and ax < ay + pi/2 when x >= 0 and ax > ay. */ svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1)); shift = svsel (pred_aygtax, sv_f32 (1.0), shift); shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift))); - shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2)); /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */ - svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z2 = svmul_x (ptrue, z, z); + svfloat32_t z3 = svmul_x (pg, z2, z); svfloat32_t z4 = svmul_x (pg, z2, z2); svfloat32_t z8 = svmul_x (pg, z4, z4); - svfloat32_t ret = sv_estrin_7_f32_x (pg, z2, z4, z8, data_ptr->poly); + svfloat32_t odd_coeffs = svld1rq (ptrue, &d->c1); - /* ret = shift + z + z^3 * P(z^2). 
*/ - svfloat32_t z3 = svmul_x (pg, z2, z); - ret = svmla_x (pg, z, z3, ret); + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); + svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), z2, odd_coeffs, 3); - ret = svadd_m (pg, ret, shift); + svfloat32_t p03 = svmla_x (pg, p01, z4, p23); + svfloat32_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat32_t poly = svmla_x (pg, p03, z8, p47); + + /* ret = shift + z + z^3 * P(z^2). */ + svfloat32_t ret = svmla_x (pg, z, shift, sv_f32 (d->pi_over_2)); + ret = svmla_x (pg, ret, z3, poly); /* Account for the sign of x and y. */ diff --git a/sysdeps/aarch64/fpu/atan2pi_advsimd.c b/sysdeps/aarch64/fpu/atan2pi_advsimd.c new file mode 100644 index 0000000..3cf231b --- /dev/null +++ b/sysdeps/aarch64/fpu/atan2pi_advsimd.c @@ -0,0 +1,175 @@ +/* Double-Precision vector (Advanced SIMD) inverse tan2pi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +static const struct data +{ + float64_t c2, c4, c6, c8, c10, c12, c14, c16, c18, c20; + float64x2_t c0; + uint64x2_t zeroinfnan; + float64x2_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-1022, 1.0]. */ + .c0 = V2 (0x1.45f306dc9c883p-2), + .c1 = V2 (-0x1.b2995e7b7ba4ap-4), + .c2 = 0x1.04c26be3d2c1p-4, + .c3 = V2 (-0x1.7483759c17ea1p-5), + .c4 = 0x1.21bb95c315d57p-5, + .c5 = V2 (-0x1.da1bdc3d453f3p-6), + .c6 = 0x1.912d20459b4bfp-6, + .c7 = V2 (-0x1.5bbd4545cad1fp-6), + .c8 = 0x1.331b83bec30a1p-6, + .c9 = V2 (-0x1.13d6457f44de3p-6), + .c10 = 0x1.f8e802974db94p-7, + .c11 = V2 (-0x1.d7e173ab04a1ap-7), + .c12 = 0x1.bdfa47d6a4f28p-7, + .c13 = V2 (-0x1.9ba78f3232ceep-7), + .c14 = 0x1.5e6044590ab4fp-7, + .c15 = V2 (-0x1.01ccfdeb9f77fp-7), + .c16 = 0x1.345cf0d4eb1c1p-8, + .c17 = V2 (-0x1.19e5f00f67e3ap-9), + .c18 = 0x1.6d3035ac7625bp-11, + .c19 = V2 (-0x1.286bb9ae4ed79p-13), + .c20 = 0x1.c37ec36da0e1ap-17, + .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), +}; + +#define SignMask v_u64 (0x8000000000000000) +#define OneOverPi v_f64 (0x1.45f306dc9c883p-2) + +/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ +static float64x2_t VPCS_ATTR NOINLINE +special_case (float64x2_t y, float64x2_t x, float64x2_t ret, + uint64x2_t sign_xy, uint64x2_t cmp) +{ + /* Account for the sign of x and y. */ + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); + + /* Since we have no scalar fallback for atan2pi, + we can instead make a call to atan2f and divide by pi. */ + ret = v_call2_f64 (atan2, y, x, ret, cmp); + + /* Only divide the special cases by pi, and leave the rest unchanged. 
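Per lane, the intent is simply atan2pi (y, x) == atan2 (y, x) / pi: special lanes are recomputed via the scalar atan2 and scaled by 1/pi, while ordinary lanes keep the fast result, which already has 1/pi folded in as the leading coefficient c0. A scalar sketch of the selection (hypothetical helper name):

#include <math.h>

static double
atan2pi_lane (double y, double x, double fast_result, int is_special)
{
  if (is_special)
    return atan2 (y, x) * 0x1.45f306dc9c883p-2; /* atan2 (y, x) / pi.  */
  return fast_result;
}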
*/ + return vbslq_f64 (cmp, vmulq_f64 (ret, OneOverPi), ret); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline uint64x2_t +zeroinfnan (uint64x2_t i, const struct data *d) +{ + /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */ + return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan); +} + +/* Fast implementation of vector atan2pi. + Maximum observed error is 3.04 ulps: + _ZGVnN2vv_atan2pi (0x1.1e0733532ce28p+5, 0x1.2d803379cca1fp+5) + got 0x1.eed60c1e89317p-3 want 0x1.eed60c1e89314p-3. */ +float64x2_t VPCS_ATTR V_NAME_D2 (atan2pi) (float64x2_t y, float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t iy = vreinterpretq_u64_f64 (y); + + uint64x2_t special_cases + = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d)); + + uint64x2_t sign_x = vandq_u64 (ix, SignMask); + uint64x2_t sign_y = vandq_u64 (iy, SignMask); + uint64x2_t sign_xy = veorq_u64 (sign_x, sign_y); + + float64x2_t ax = vabsq_f64 (x); + float64x2_t ay = vabsq_f64 (y); + + uint64x2_t pred_xlt0 = vcltzq_f64 (x); + uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax); + + /* Set up z for evaluation of atanpi. */ + float64x2_t num = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); + float64x2_t den = vbslq_f64 (pred_aygtax, ay, ax); + float64x2_t z = vdivq_f64 (num, den); + + /* Work out the correct shift for atan2pi: + -1.0 when x < 0 and ax < ay + -0.5 when x < 0 and ax > ay + 0 when x >= 0 and ax < ay + 0.5 when x >= 0 and ax > ay. */ + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-1.0)))); + float64x2_t shift2 = vreinterpretq_f64_u64 ( + vandq_u64 (pred_aygtax, vreinterpretq_u64_f64 (v_f64 (0.5)))); + shift = vaddq_f64 (shift, shift2); + + /* Calculate the polynomial approximation. */ + float64x2_t z2 = vmulq_f64 (z, z); + float64x2_t z3 = vmulq_f64 (z2, z); + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); + + float64x2_t c24 = vld1q_f64 (&d->c2); + float64x2_t c68 = vld1q_f64 (&d->c6); + + /* Order-7 Estrin. */ + float64x2_t p12 = vfmaq_laneq_f64 (d->c1, z2, c24, 0); + float64x2_t p34 = vfmaq_laneq_f64 (d->c3, z2, c24, 1); + float64x2_t p56 = vfmaq_laneq_f64 (d->c5, z2, c68, 0); + float64x2_t p78 = vfmaq_laneq_f64 (d->c7, z2, c68, 1); + + float64x2_t p14 = vfmaq_f64 (p12, z4, p34); + float64x2_t p58 = vfmaq_f64 (p56, z4, p78); + float64x2_t p18 = vfmaq_f64 (p14, z8, p58); + + /* Order-11 Estrin. */ + float64x2_t c1012 = vld1q_f64 (&d->c10); + float64x2_t c1416 = vld1q_f64 (&d->c14); + float64x2_t c1820 = vld1q_f64 (&d->c18); + + float64x2_t p910 = vfmaq_laneq_f64 (d->c9, z2, c1012, 0); + float64x2_t p1112 = vfmaq_laneq_f64 (d->c11, z2, c1012, 1); + float64x2_t p912 = vfmaq_f64 (p910, z4, p1112); + + float64x2_t p1314 = vfmaq_laneq_f64 (d->c13, z2, c1416, 0); + float64x2_t p1516 = vfmaq_laneq_f64 (d->c15, z2, c1416, 1); + float64x2_t p1316 = vfmaq_f64 (p1314, z4, p1516); + + float64x2_t p1718 = vfmaq_laneq_f64 (d->c17, z2, c1820, 0); + float64x2_t p1920 = vfmaq_laneq_f64 (d->c19, z2, c1820, 1); + float64x2_t p1720 = vfmaq_f64 (p1718, z4, p1920); + + float64x2_t p916 = vfmaq_f64 (p912, z8, p1316); + float64x2_t p920 = vfmaq_f64 (p916, z16, p1720); + + float64x2_t poly = vfmaq_f64 (p18, z16, p920); + + /* y = shift + z * P(z^2). 
*/ + float64x2_t ret = vfmaq_f64 (shift, z, d->c0); + ret = vfmaq_f64 (ret, z3, poly); + + if (__glibc_unlikely (v_any_u64 (special_cases))) + return special_case (y, x, ret, sign_xy, special_cases); + + /* Account for the sign of x and y. */ + return vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); +} diff --git a/sysdeps/aarch64/fpu/atan2pi_sve.c b/sysdeps/aarch64/fpu/atan2pi_sve.c new file mode 100644 index 0000000..f1d1f1c --- /dev/null +++ b/sysdeps/aarch64/fpu/atan2pi_sve.c @@ -0,0 +1,159 @@ +/* Double-Precision vector (SVE) inverse tan2pi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "math_config.h" +#include "sv_math.h" + +static const struct data +{ + float64_t c2, c4, c6, c8, c10, c12, c14, c16, c18, c20; + float64_t c0, c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + float64_t shift_val; +} data = { + /* Coefficients of polnomial P such that atan(x)~x+x*P(x^2) on + [2^-1022, 1.0]. */ + .c0 = 0x1.45f306dc9c883p-2, .c1 = -0x1.b2995e7b7ba4ap-4, + .c2 = 0x1.04c26be3d2c1p-4, .c3 = -0x1.7483759c17ea1p-5, + .c4 = 0x1.21bb95c315d57p-5, .c5 = -0x1.da1bdc3d453f3p-6, + .c6 = 0x1.912d20459b4bfp-6, .c7 = -0x1.5bbd4545cad1fp-6, + .c8 = 0x1.331b83bec30a1p-6, .c9 = -0x1.13d6457f44de3p-6, + .c10 = 0x1.f8e802974db94p-7, .c11 = -0x1.d7e173ab04a1ap-7, + .c12 = 0x1.bdfa47d6a4f28p-7, .c13 = -0x1.9ba78f3232ceep-7, + .c14 = 0x1.5e6044590ab4fp-7, .c15 = -0x1.01ccfdeb9f77fp-7, + .c16 = 0x1.345cf0d4eb1c1p-8, .c17 = -0x1.19e5f00f67e3ap-9, + .c18 = 0x1.6d3035ac7625bp-11, .c19 = -0x1.286bb9ae4ed79p-13, + .c20 = 0x1.c37ec36da0e1ap-17, .shift_val = 0.5, +}; + +#define OneOverPi sv_f64 (0x1.45f306dc9c883p-2) + +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ +static svfloat64_t NOINLINE +special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret, + const svbool_t cmp) +{ + ret = sv_call2_f64 (atan2, y, x, ret, cmp); + return svmul_f64_m (cmp, ret, OneOverPi); +} + +/* Returns a predicate indicating true if the input is the bit representation + of 0, infinity or nan. */ +static inline svbool_t +zeroinfnan (svuint64_t i, const svbool_t pg) +{ + return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1), + sv_u64 (2 * asuint64 (INFINITY) - 1)); +} + +/* Fast implementation of SVE atan2pi. + Maximum observed error is 3.11 ulps: + _ZGVsMxvv_atan2pi (0x1.ef284a877f6b5p+6, 0x1.03fdde8242b17p+7) + got 0x1.f00f800163079p-3 want 0x1.f00f800163076p-3. 
*/ +svfloat64_t SV_NAME_D2 (atan2pi) (svfloat64_t y, svfloat64_t x, + const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b64 (); + + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t iy = svreinterpret_u64 (y); + + svbool_t cmp_x = zeroinfnan (ix, pg); + svbool_t cmp_y = zeroinfnan (iy, pg); + svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y); + + svfloat64_t ax = svabs_x (pg, x); + svfloat64_t ay = svabs_x (pg, y); + svuint64_t iax = svreinterpret_u64 (ax); + svuint64_t iay = svreinterpret_u64 (ay); + + svuint64_t sign_x = sveor_x (pg, ix, iax); + svuint64_t sign_y = sveor_x (pg, iy, iay); + svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y); + + svbool_t pred_aygtax = svcmpgt (pg, ay, ax); + + /* Set up z for evaluation of atanpi. */ + svfloat64_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat64_t den = svsel (pred_aygtax, ay, ax); + svfloat64_t z = svdiv_x (pg, num, den); + + /* Work out the correct shift for atan2pi: + -1.0 when x < 0 and ax < ay + -0.5 when x < 0 and ax > ay + 0 when x >= 0 and ax < ay + 0.5 when x >= 0 and ax > ay. */ + svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1)); + shift = svmul_x (ptrue, shift, sv_f64 (d->shift_val)); + shift = svsel (pred_aygtax, sv_f64 (d->shift_val), shift); + shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift))); + + /* Use split Estrin scheme for P(z^2) with deg(P)=19. */ + svfloat64_t z2 = svmul_x (pg, z, z); + svfloat64_t z3 = svmul_x (pg, z2, z); + svfloat64_t z4 = svmul_x (pg, z2, z2); + svfloat64_t z8 = svmul_x (pg, z4, z4); + svfloat64_t z16 = svmul_x (pg, z8, z8); + + /* Order-7 Estrin. */ + svfloat64_t c24 = svld1rq (ptrue, &d->c2); + svfloat64_t c68 = svld1rq (ptrue, &d->c6); + + svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), z2, c24, 0); + svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), z2, c24, 1); + svfloat64_t p56 = svmla_lane (sv_f64 (d->c5), z2, c68, 0); + svfloat64_t p78 = svmla_lane (sv_f64 (d->c7), z2, c68, 1); + + svfloat64_t p14 = svmla_x (pg, p12, z4, p34); + svfloat64_t p58 = svmla_x (pg, p56, z4, p78); + svfloat64_t p18 = svmla_x (pg, p14, z8, p58); + + /* Order-11 Estrin. */ + svfloat64_t c1012 = svld1rq (ptrue, &d->c10); + svfloat64_t c1416 = svld1rq (ptrue, &d->c14); + svfloat64_t c1820 = svld1rq (ptrue, &d->c18); + + svfloat64_t p910 = svmla_lane (sv_f64 (d->c9), z2, c1012, 0); + svfloat64_t p1112 = svmla_lane (sv_f64 (d->c11), z2, c1012, 1); + svfloat64_t p912 = svmla_x (pg, p910, z4, p1112); + + svfloat64_t p1314 = svmla_lane (sv_f64 (d->c13), z2, c1416, 0); + svfloat64_t p1516 = svmla_lane (sv_f64 (d->c15), z2, c1416, 1); + svfloat64_t p1316 = svmla_x (pg, p1314, z4, p1516); + + svfloat64_t p1718 = svmla_lane (sv_f64 (d->c17), z2, c1820, 0); + svfloat64_t p1920 = svmla_lane (sv_f64 (d->c19), z2, c1820, 1); + svfloat64_t p1720 = svmla_x (pg, p1718, z4, p1920); + + svfloat64_t p916 = svmla_x (pg, p912, z8, p1316); + svfloat64_t p920 = svmla_x (pg, p916, z16, p1720); + + svfloat64_t poly = svmla_x (pg, p18, z16, p920); + + svfloat64_t ret = svmla_x (pg, shift, z, sv_f64 (d->c0)); + ret = svmla_x (pg, ret, z3, poly); + + /* Account for the sign of x and y. 
*/ + if (__glibc_unlikely (svptest_any (pg, cmp_xy))) + return special_case ( + y, x, + svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)), + cmp_xy); + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)); +} diff --git a/sysdeps/aarch64/fpu/atan2pif_advsimd.c b/sysdeps/aarch64/fpu/atan2pif_advsimd.c new file mode 100644 index 0000000..f1f542b --- /dev/null +++ b/sysdeps/aarch64/fpu/atan2pif_advsimd.c @@ -0,0 +1,138 @@ +/* Single-Precision vector (Advanced SIMD) inverse tan2pi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +static const struct data +{ + float32x4_t c1, c3, c5, c7; + float c2, c4, c6, c8; + float32x4_t c0; + uint32x4_t comp_const; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2^-128, 1.0]. + Generated using fpminimax between FLT_MIN and 1. */ + .c0 = V4 (0x1.45f306p-2), .c1 = V4 (-0x1.b2975ep-4), + .c2 = 0x1.0490e4p-4, .c3 = V4 (-0x1.70c272p-5), + .c4 = 0x1.0eef52p-5, .c5 = V4 (-0x1.6abbbap-6), + .c6 = 0x1.78157p-7, .c7 = V4 (-0x1.f0b406p-9), + .c8 = 0x1.2ae7fep-11, .comp_const = V4 (2 * 0x7f800000lu - 1), +}; + +#define SignMask v_u32 (0x80000000) +#define OneOverPi v_f32 (0x1.45f307p-2) + +/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t y, float32x4_t x, float32x4_t ret, + uint32x4_t sign_xy, uint32x4_t cmp) +{ + /* Account for the sign of y. */ + ret = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); + + /* Since we have no scalar fallback for atan2pif, + we can instead make a call to atan2f and divide by pi. */ + ret = v_call2_f32 (atan2f, y, x, ret, cmp); + + /* Only divide the special cases by pi, and leave the rest unchanged. */ + return vbslq_f32 (cmp, vmulq_f32 (ret, OneOverPi), ret); +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. */ +static inline uint32x4_t +zeroinfnan (uint32x4_t i, const struct data *d) +{ + /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ + return vcgeq_u32 (vsubq_u32 (vshlq_n_u32 (i, 1), v_u32 (1)), d->comp_const); +} + +/* Fast implementation of vector atan2f. Maximum observed error is 2.89 ULP: + _ZGVnN4vv_atan2pif (0x1.bd397p+54, 0x1.e79a4ap+54) got 0x1.e2678ep-3 + want 0x1.e26794p-3. 
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2pi) (float32x4_t y, + float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t iy = vreinterpretq_u32_f32 (y); + + uint32x4_t special_cases + = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d)); + + uint32x4_t sign_x = vandq_u32 (ix, SignMask); + uint32x4_t sign_y = vandq_u32 (iy, SignMask); + uint32x4_t sign_xy = veorq_u32 (sign_x, sign_y); + + float32x4_t ax = vabsq_f32 (x); + float32x4_t ay = vabsq_f32 (y); + + uint32x4_t pred_xlt0 = vcltzq_f32 (x); + uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax); + + /* Set up z for evaluation of atanpif. */ + float32x4_t num = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); + float32x4_t den = vbslq_f32 (pred_aygtax, ay, ax); + float32x4_t z = vdivq_f32 (num, den); + + /* Work out the correct shift for atan2pi: + -1.0 when x < 0 and ax < ay + -0.5 when x < 0 and ax > ay + 0 when x >= 0 and ax < ay + 0.5 when x >= 0 and ax > ay. */ + float32x4_t shift = vreinterpretq_f32_u32 ( + vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-1.0f)))); + float32x4_t shift2 = vreinterpretq_f32_u32 ( + vandq_u32 (pred_aygtax, vreinterpretq_u32_f32 (v_f32 (0.5f)))); + shift = vaddq_f32 (shift, shift2); + + /* Calculate the polynomial approximation. */ + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z3 = vmulq_f32 (z2, z); + float32x4_t z4 = vmulq_f32 (z2, z2); + float32x4_t z8 = vmulq_f32 (z4, z4); + + float32x4_t c2468 = vld1q_f32 (&d->c2); + + float32x4_t p12 = vfmaq_laneq_f32 (d->c1, z2, c2468, 0); + float32x4_t p34 = vfmaq_laneq_f32 (d->c3, z2, c2468, 1); + float32x4_t p56 = vfmaq_laneq_f32 (d->c5, z2, c2468, 2); + float32x4_t p78 = vfmaq_laneq_f32 (d->c7, z2, c2468, 3); + float32x4_t p14 = vfmaq_f32 (p12, z4, p34); + float32x4_t p58 = vfmaq_f32 (p56, z4, p78); + + float32x4_t poly = vfmaq_f32 (p14, z8, p58); + + /* y = shift + z * P(z^2). */ + float32x4_t ret = vfmaq_f32 (shift, z, d->c0); + ret = vfmaq_f32 (ret, z3, poly); + + if (__glibc_unlikely (v_any_u32 (special_cases))) + { + return special_case (y, x, ret, sign_xy, special_cases); + } + + /* Account for the sign of y. */ + return vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); +} +libmvec_hidden_def (V_NAME_F2 (atan2pi)) +HALF_WIDTH_ALIAS_F2 (atan2pi) diff --git a/sysdeps/aarch64/fpu/atan2pif_sve.c b/sysdeps/aarch64/fpu/atan2pif_sve.c new file mode 100644 index 0000000..d5ac4b7 --- /dev/null +++ b/sysdeps/aarch64/fpu/atan2pif_sve.c @@ -0,0 +1,137 @@ +/* Single-Precision vector (SVE) inverse tan2pi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "sv_math.h" + +static const struct data +{ + float32_t c0, c1, c3, c5, c7; + float32_t c2, c4, c6, c8; + float32_t shift_val; + uint32_t comp_const; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. */ + .c0 = 0x1.45f306p-2, + .c1 = -0x1.b2975ep-4, + .c2 = 0x1.0490e4p-4, + .c3 = -0x1.70c272p-5, + .c4 = 0x1.0eef52p-5, + .c5 = -0x1.6abbbap-6, + .c6 = 0x1.78157p-7, + .c7 = -0x1.f0b406p-9, + .c8 = 0x1.2ae7fep-11, + .shift_val = 0.5f, + .comp_const = 2 * 0x7f800000lu - 1, +}; + +#define OneOverPi sv_f32 (0x1.45f307p-2) + +/* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ +static svfloat32_t NOINLINE +special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret, + const svbool_t cmp) +{ + ret = sv_call2_f32 (atan2f, y, x, ret, cmp); + return svmul_f32_x (cmp, ret, OneOverPi); +} + +/* Returns a predicate indicating true if the input is the bit representation + of 0, infinity or nan. */ +static inline svbool_t +zeroinfnan (svuint32_t i, const svbool_t pg, const struct data *d) +{ + return svcmpge (pg, svsub_x (pg, svlsl_x (pg, i, 1), 1), + sv_u32 (d->comp_const)); +} + +/* Fast implementation of SVE atan2pif based on atan(x) ~ shift + z + z^3 * + P(z^2) with reduction to [0,1] using z=1/x and shift = 1/2. Maximum + observed error is 2.90 ULP: + _ZGVsMxvv_atan2pif (0x1.a28542p+5, 0x1.adb7c6p+5) got 0x1.f76524p-3 + want 0x1.f7651ep-3. */ +svfloat32_t SV_NAME_F2 (atan2pi) (svfloat32_t y, svfloat32_t x, + const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b32 (); + + svuint32_t ix = svreinterpret_u32 (x); + svuint32_t iy = svreinterpret_u32 (y); + + svbool_t cmp_x = zeroinfnan (ix, pg, d); + svbool_t cmp_y = zeroinfnan (iy, pg, d); + svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y); + + svfloat32_t ax = svabs_x (pg, x); + svfloat32_t ay = svabs_x (pg, y); + svuint32_t iax = svreinterpret_u32 (ax); + svuint32_t iay = svreinterpret_u32 (ay); + + svuint32_t sign_x = sveor_x (pg, ix, iax); + svuint32_t sign_y = sveor_x (pg, iy, iay); + svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y); + + svbool_t pred_aygtax = svcmpgt (pg, ay, ax); + + /* Set up z for evaluation of atanpif. */ + svfloat32_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay); + svfloat32_t den = svsel (pred_aygtax, ay, ax); + svfloat32_t z = svdiv_x (ptrue, num, den); + + /* Work out the correct shift for atan2pi: + -1.0 when x < 0 and ax < ay + -0.5 when x < 0 and ax > ay + 0 when x >= 0 and ax < ay + 0.5 when x >= 0 and ax > ay. */ + svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1)); + shift = svmul_x (ptrue, shift, sv_f32 (d->shift_val)); + shift = svsel (pred_aygtax, sv_f32 (d->shift_val), shift); + shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift))); + + /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */ + svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z4 = svmul_x (pg, z2, z2); + svfloat32_t z8 = svmul_x (pg, z4, z4); + + svfloat32_t even_coeffs = svld1rq (ptrue, &d->c2); + + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), z2, even_coeffs, 0); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), z2, even_coeffs, 1); + svfloat32_t p56 = svmla_lane (sv_f32 (d->c5), z2, even_coeffs, 2); + svfloat32_t p78 = svmla_lane (sv_f32 (d->c7), z2, even_coeffs, 3); + + svfloat32_t p14 = svmad_x (pg, z4, p34, p12); + svfloat32_t p58 = svmad_x (pg, z4, p78, p56); + + svfloat32_t p18 = svmad_x (pg, z8, p58, p14); + + /* ret = shift + z + z^3 * P(z^2). 
*/ + svfloat32_t poly = svmad_x (pg, z2, p18, d->c0); + svfloat32_t ret = svmad_x (pg, poly, z, shift); + + if (__glibc_unlikely (svptest_any (pg, cmp_xy))) + return special_case ( + y, x, + svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)), + cmp_xy); + + /* Account for the sign of x and y. */ + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)); +} diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c index f024fd1..da0d371 100644 --- a/sysdeps/aarch64/fpu/atan_advsimd.c +++ b/sysdeps/aarch64/fpu/atan_advsimd.c @@ -18,7 +18,6 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" static const struct data { @@ -28,16 +27,16 @@ static const struct data } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-1022, 1.0]. */ - .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, - .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, - .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, - .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, - .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, - .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, - .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, - .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, - .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10, - .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16, + .c0 = V2 (-0x1.555555555552ap-2), .c1 = 0x1.9999999995aebp-3, + .c2 = V2 (-0x1.24924923923f6p-3), .c3 = 0x1.c71c7184288a2p-4, + .c4 = V2 (-0x1.745d11fb3d32bp-4), .c5 = 0x1.3b136a18051b9p-4, + .c6 = V2 (-0x1.110e6d985f496p-4), .c7 = 0x1.e1bcf7f08801dp-5, + .c8 = V2 (-0x1.ae644e28058c3p-5), .c9 = 0x1.82eeb1fed85c6p-5, + .c10 = V2 (-0x1.59d7f901566cbp-5), .c11 = 0x1.2c982855ab069p-5, + .c12 = V2 (-0x1.eb49592998177p-6), .c13 = 0x1.69d8b396e3d38p-6, + .c14 = V2 (-0x1.ca980345c4204p-7), .c15 = 0x1.dc050eafde0b3p-8, + .c16 = V2 (-0x1.7ea70755b8eccp-9), .c17 = 0x1.ba3da3de903e8p-11, + .c18 = V2 (-0x1.44a4b059b6f67p-13), .c19 = 0x1.c4a45029e5a91p-17, .pi_over_2 = V2 (0x1.921fb54442d18p+0), }; @@ -47,9 +46,9 @@ static const struct data /* Fast implementation of vector atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using - z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps: - _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 - want 0x1.9225645bdd7c3p-1. */ + z=1/x and shift = pi/2. Maximum observed error is 2.45 ulps: + _ZGVnN2v_atan (0x1.0008d737eb3e6p+0) got 0x1.92288c551a4c1p-1 + want 0x1.92288c551a4c3p-1. */ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -78,59 +77,53 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0)); + uint64x2_t red = vcagtq_f64 (x, v_f64 (-1.0)); /* Avoid dependency in abs(x) in division (and comparison). */ - float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x); + float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (-1.0), x), x); + float64x2_t shift = vreinterpretq_f64_u64 ( vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2))); - /* Use absolute value only when needed (odd powers of z). 
*/ - float64x2_t az = vbslq_f64 ( - SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z); - - /* Calculate the polynomial approximation. - Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of - full scheme to avoid underflow in x^16. - The order 19 polynomial P approximates - (atan(sqrt(x))-sqrt(x))/x^(3/2). */ + + /* Reinsert sign bit from argument into the shift value. */ + shift = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (shift), sign)); + + /* Calculate polynomial approximation P(z^2) with deg(P)=19. */ float64x2_t z2 = vmulq_f64 (z, z); - float64x2_t x2 = vmulq_f64 (z2, z2); - float64x2_t x4 = vmulq_f64 (x2, x2); - float64x2_t x8 = vmulq_f64 (x4, x4); + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); - /* estrin_7. */ + /* Order-7 Estrin. */ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); - float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + float64x2_t p03 = vfmaq_f64 (p01, z4, p23); float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); - float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + float64x2_t p47 = vfmaq_f64 (p45, z4, p67); - float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + float64x2_t p07 = vfmaq_f64 (p03, z8, p47); - /* estrin_11. */ + /* Order-11 Estrin. */ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); - float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + float64x2_t p811 = vfmaq_f64 (p89, z4, p1011); float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); - float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + float64x2_t p1215 = vfmaq_f64 (p1213, z4, p1415); float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); - float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + float64x2_t p1619 = vfmaq_f64 (p1617, z4, p1819); - float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); - float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + float64x2_t p815 = vfmaq_f64 (p811, z8, p1215); + float64x2_t p819 = vfmaq_f64 (p815, z16, p1619); - float64x2_t y = vfmaq_f64 (p07, p819, x8); + float64x2_t y = vfmaq_f64 (p07, p819, z16); /* Finalize. y = shift + z + z^3 * P(z^2). */ - y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); - y = vaddq_f64 (y, shift); - - /* y = atan(x) if x>0, -atan(-x) otherwise. */ - y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign)); - return y; + y = vfmsq_f64 (v_f64 (-1.0), z2, y); + return vfmsq_f64 (shift, z, y); } diff --git a/sysdeps/aarch64/fpu/atan_sve.c b/sysdeps/aarch64/fpu/atan_sve.c index 3880ced..a6b0489 100644 --- a/sysdeps/aarch64/fpu/atan_sve.c +++ b/sysdeps/aarch64/fpu/atan_sve.c @@ -18,23 +18,26 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[20]; - float64_t pi_over_2; + float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; + float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + float64_t shift_val, neg_one; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-1022, 1.0]. 
*/ - .poly = { -0x1.5555555555555p-2, 0x1.99999999996c1p-3, -0x1.2492492478f88p-3, - 0x1.c71c71bc3951cp-4, -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4, - -0x1.11100ee084227p-4, 0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5, - 0x1.842dbe9b0d916p-5, -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5, - -0x1.00e6eece7de8p-5, 0x1.860897b29e5efp-6, -0x1.0051381722a59p-6, - 0x1.14e9dc19a4a4ep-7, -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10, - -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, }, - .pi_over_2 = 0x1.921fb54442d18p+0, + .c0 = -0x1.555555555552ap-2, .c1 = 0x1.9999999995aebp-3, + .c2 = -0x1.24924923923f6p-3, .c3 = 0x1.c71c7184288a2p-4, + .c4 = -0x1.745d11fb3d32bp-4, .c5 = 0x1.3b136a18051b9p-4, + .c6 = -0x1.110e6d985f496p-4, .c7 = 0x1.e1bcf7f08801dp-5, + .c8 = -0x1.ae644e28058c3p-5, .c9 = 0x1.82eeb1fed85c6p-5, + .c10 = -0x1.59d7f901566cbp-5, .c11 = 0x1.2c982855ab069p-5, + .c12 = -0x1.eb49592998177p-6, .c13 = 0x1.69d8b396e3d38p-6, + .c14 = -0x1.ca980345c4204p-7, .c15 = 0x1.dc050eafde0b3p-8, + .c16 = -0x1.7ea70755b8eccp-9, .c17 = 0x1.ba3da3de903e8p-11, + .c18 = -0x1.44a4b059b6f67p-13, .c19 = 0x1.c4a45029e5a91p-17, + .shift_val = 0x1.490fdaa22168cp+1, .neg_one = -1, }; /* Useful constants. */ @@ -43,15 +46,14 @@ static const struct data /* Fast implementation of SVE atan. Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed - error is 2.27 ulps: - _ZGVsMxv_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1 - want 0x1.9225645bdd7c3p-1. */ + error is 2.08 ulps: + _ZGVsMxv_atan (0x1.000a7c56975e8p+0) got 0x1.922a3163e15c2p-1 + want 0x1.922a3163e15c4p-1. */ svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - /* No need to trigger special case. Small cases, infs and nans - are supported by our approximation technique. */ + svbool_t ptrue = svptrue_b64 (); svuint64_t ix = svreinterpret_u64 (x); svuint64_t sign = svand_x (pg, ix, SignMask); @@ -59,32 +61,60 @@ svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg) y := arctan(x) for x < 1 y := pi/2 + arctan(-1/x) for x > 1 Hence, use z=-1/a if x>=1, otherwise z=a. */ - svbool_t red = svacgt (pg, x, 1.0); - /* Avoid dependency in abs(x) in division (and comparison). */ - svfloat64_t z = svsel (red, svdivr_x (pg, x, 1.0), x); - /* Use absolute value only when needed (odd powers of z). */ - svfloat64_t az = svabs_x (pg, z); - az = svneg_m (az, red, az); + svbool_t red = svacgt (pg, x, d->neg_one); + svfloat64_t z = svsel (red, svdiv_x (pg, sv_f64 (d->neg_one), x), x); + + /* Reuse of -1.0f to reduce constant loads, + We need a shift value of 1/2, which is created via -1 + (1 + 1/2). */ + svfloat64_t shift + = svadd_z (red, sv_f64 (d->neg_one), sv_f64 (d->shift_val)); + + /* Reinserts the sign bit of the argument to handle the case of x < -1. */ + shift = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (shift), sign)); /* Use split Estrin scheme for P(z^2) with deg(P)=19. */ - svfloat64_t z2 = svmul_x (pg, z, z); - svfloat64_t x2 = svmul_x (pg, z2, z2); - svfloat64_t x4 = svmul_x (pg, x2, x2); - svfloat64_t x8 = svmul_x (pg, x4, x4); + svfloat64_t z2 = svmul_x (ptrue, z, z); + svfloat64_t z4 = svmul_x (ptrue, z2, z2); + svfloat64_t z8 = svmul_x (ptrue, z4, z4); + svfloat64_t z16 = svmul_x (ptrue, z8, z8); - svfloat64_t y - = svmla_x (pg, sv_estrin_7_f64_x (pg, z2, x2, x4, d->poly), - sv_estrin_11_f64_x (pg, z2, x2, x4, x8, d->poly + 8), x8); + /* Order-7 Estrin. 
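+     This evaluates p07 = c0 + c1 z^2 + ... + c7 z^14 pairwise:
+       p01 = c0 + z2*c1, p23 = c2 + z2*c3, p03 = p01 + z4*p23, ...
+     which keeps the fma dependency chains shorter than Horner's scheme.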
*/ + svfloat64_t c13 = svld1rq (ptrue, &d->c1); + svfloat64_t c57 = svld1rq (ptrue, &d->c5); - /* y = shift + z + z^3 * P(z^2). */ - svfloat64_t z3 = svmul_x (pg, z2, az); - y = svmla_x (pg, az, z3, y); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1); + + svfloat64_t p03 = svmla_x (pg, p01, z4, p23); + svfloat64_t p47 = svmla_x (pg, p45, z4, p67); + svfloat64_t p07 = svmla_x (pg, p03, z8, p47); + + /* Order-11 Estrin. */ + svfloat64_t c911 = svld1rq (ptrue, &d->c9); + svfloat64_t c1315 = svld1rq (ptrue, &d->c13); + svfloat64_t c1719 = svld1rq (ptrue, &d->c17); - /* Apply shift as indicated by `red` predicate. */ - y = svadd_m (red, y, d->pi_over_2); + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1); + svfloat64_t p811 = svmla_x (pg, p89, z4, p1011); - /* y = atan(x) if x>0, -atan(-x) otherwise. */ - y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), z2, c1315, 0); + svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), z2, c1315, 1); + svfloat64_t p1215 = svmla_x (pg, p1213, z4, p1415); - return y; + svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), z2, c1719, 0); + svfloat64_t p1819 = svmla_lane (sv_f64 (d->c18), z2, c1719, 1); + svfloat64_t p1619 = svmla_x (pg, p1617, z4, p1819); + + svfloat64_t p815 = svmla_x (pg, p811, z8, p1215); + svfloat64_t p819 = svmla_x (pg, p815, z16, p1619); + + svfloat64_t y = svmla_x (pg, p07, z16, p819); + + /* y = shift + z + z^3 * P(z^2). */ + shift = svadd_m (red, z, shift); + y = svmul_x (pg, z2, y); + return svmla_x (pg, shift, z, y); } diff --git a/sysdeps/aarch64/fpu/atanf_advsimd.c b/sysdeps/aarch64/fpu/atanf_advsimd.c index 472865e..817a47e 100644 --- a/sysdeps/aarch64/fpu/atanf_advsimd.c +++ b/sysdeps/aarch64/fpu/atanf_advsimd.c @@ -22,26 +22,35 @@ static const struct data { + uint32x4_t sign_mask, pi_over_2; + float32x4_t neg_one; +#if WANT_SIMD_EXCEPT float32x4_t poly[8]; - float32x4_t pi_over_2; +} data = { + .poly = { V4 (-0x1.5554dcp-2), V4 (0x1.9978ecp-3), V4 (-0x1.230a94p-3), + V4 (0x1.b4debp-4), V4 (-0x1.3550dap-4), V4 (0x1.61eebp-5), + V4 (-0x1.0c17d4p-6), V4 (0x1.7ea694p-9) }, +#else + float32x4_t c0, c2, c4, c6; + float c1, c3, c5, c7; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. Generated using fpminimax between FLT_MIN and 1. */ - .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), - V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), - V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, - .pi_over_2 = V4 (0x1.921fb6p+0f), + .c0 = V4 (-0x1.5554dcp-2), .c1 = 0x1.9978ecp-3, + .c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4, + .c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5, + .c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9, +#endif + .pi_over_2 = V4 (0x3fc90fdb), + .neg_one = V4 (-1.0f), + .sign_mask = V4 (0x80000000), }; -#define SignMask v_u32 (0x80000000) - -#define P(i) d->poly[i] - +#if WANT_SIMD_EXCEPT #define TinyBound 0x30800000 /* asuint(0x1p-30). */ #define BigBound 0x4e800000 /* asuint(0x1p30). 
*/ -#if WANT_SIMD_EXCEPT static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t x, float32x4_t y, uint32x4_t special) { @@ -51,19 +60,20 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special) /* Fast implementation of vector atanf based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] - using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps: - _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1. */ + using z=-1/x and shift = pi/2. Maximum observed error is 2.02 ulps: + _ZGVnN4v_atanf (0x1.03d4cep+0) got 0x1.95ed3ap-1 + want 0x1.95ed36p-1. */ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - /* Small cases, infs and nans are supported by our approximation technique, - but do not set fenv flags correctly. Only trigger special case if we need - fenv. */ uint32x4_t ix = vreinterpretq_u32_f32 (x); - uint32x4_t sign = vandq_u32 (ix, SignMask); + uint32x4_t sign = vandq_u32 (ix, d->sign_mask); #if WANT_SIMD_EXCEPT + /* Small cases, infs and nans are supported by our approximation technique, + but do not set fenv flags correctly. Only trigger special case if we need + fenv. */ uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000)); uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)), v_u32 (BigBound - TinyBound)); @@ -71,41 +81,52 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) if (__glibc_unlikely (v_any_u32 (special))) return special_case (x, x, v_u32 (-1)); #endif - /* Argument reduction: - y := arctan(x) for x < 1 - y := pi/2 + arctan(-1/x) for x > 1 - Hence, use z=-1/a if x>=1, otherwise z=a. */ - uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0)); - /* Avoid dependency in abs(x) in division (and comparison). */ - float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x); + y := arctan(x) for |x| < 1 + y := arctan(-1/x) + pi/2 for x > +1 + y := arctan(-1/x) - pi/2 for x < -1 + Hence, use z=-1/a if x>=|-1|, otherwise z=a. */ + uint32x4_t red = vcagtq_f32 (x, d->neg_one); + + float32x4_t z = vbslq_f32 (red, vdivq_f32 (d->neg_one, x), x); + + /* Shift is calculated as +-pi/2 or 0, depending on the argument case. */ float32x4_t shift = vreinterpretq_f32_u32 ( - vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2))); - /* Use absolute value only when needed (odd powers of z). */ - float32x4_t az = vbslq_f32 ( - SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z); + vandq_u32 (red, veorq_u32 (d->pi_over_2, sign))); + + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z3 = vmulq_f32 (z, z2); + float32x4_t z4 = vmulq_f32 (z2, z2); +#if WANT_SIMD_EXCEPT /* Calculate the polynomial approximation. Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, a standard implementation using z8 creates spurious underflow in the very last fma (when z^8 is small enough). - Therefore, we split the last fma into a mul and an fma. - Horner and single-level Estrin have higher errors that exceed - threshold. */ - float32x4_t z2 = vmulq_f32 (z, z); - float32x4_t z4 = vmulq_f32 (z2, z2); - + Therefore, we split the last fma into a mul and an fma. */ float32x4_t y = vfmaq_f32 ( v_pairwise_poly_3_f32 (z2, z4, d->poly), z4, vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4))); - /* y = shift + z * P(z^2). */ - y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift); +#else + float32x4_t z8 = vmulq_f32 (z4, z4); + + /* Uses an Estrin scheme for polynomial approximation. 
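+     The odd coefficients c1/c3/c5/c7 are laid out contiguously in the data
+     struct and loaded as one quadword, so each fma below selects its
+     coefficient by lane instead of needing a separate vector register.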
*/ + float32x4_t odd_coeffs = vld1q_f32 (&d->c1); + + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, odd_coeffs, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, odd_coeffs, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, odd_coeffs, 2); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, odd_coeffs, 3); - /* y = atan(x) if x>0, -atan(-x) otherwise. */ - y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign)); + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); + float32x4_t p47 = vfmaq_f32 (p45, z4, p67); - return y; + float32x4_t y = vfmaq_f32 (p03, z8, p47); +#endif + + /* y = shift + z * P(z^2). */ + return vfmaq_f32 (vaddq_f32 (shift, z), z3, y); } libmvec_hidden_def (V_NAME_F1 (atan)) HALF_WIDTH_ALIAS_F1 (atan) diff --git a/sysdeps/aarch64/fpu/atanf_sve.c b/sysdeps/aarch64/fpu/atanf_sve.c index 3a98d70..6558223 100644 --- a/sysdeps/aarch64/fpu/atanf_sve.c +++ b/sysdeps/aarch64/fpu/atanf_sve.c @@ -18,18 +18,26 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f32.h" static const struct data { - float32_t poly[8]; - float32_t pi_over_2; + float32_t c1, c3, c5, c7; + float32_t c0, c2, c4, c6; + float32_t shift_val, neg_one; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. */ - .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f, - -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f }, - .pi_over_2 = 0x1.921fb6p+0f, + .c0 = -0x1.5554dcp-2, + .c1 = 0x1.9978ecp-3, + .c2 = -0x1.230a94p-3, + .c3 = 0x1.b4debp-4, + .c4 = -0x1.3550dap-4, + .c5 = 0x1.61eebp-5, + .c6 = -0x1.0c17d4p-6, + .c7 = 0x1.7ea694p-9, + /* pi/2, used as a shift value after reduction. */ + .shift_val = 0x1.921fb54442d18p+0, + .neg_one = -1.0f, }; #define SignMask (0x80000000) @@ -37,43 +45,49 @@ static const struct data /* Fast implementation of SVE atanf based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using z=-1/x and shift = pi/2. - Largest observed error is 2.9 ULP, close to +/-1.0: - _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1 - want -0x1.967fp-1. */ + Largest observed error is 2.12 ULP: + _ZGVsMxv_atanf (0x1.03d4cep+0) got 0x1.95ed3ap-1 + want 0x1.95ed36p-1. */ svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b32 (); /* No need to trigger special case. Small cases, infs and nans are supported by our approximation technique. */ svuint32_t ix = svreinterpret_u32 (x); - svuint32_t sign = svand_x (pg, ix, SignMask); + svuint32_t sign = svand_x (ptrue, ix, SignMask); /* Argument reduction: y := arctan(x) for x < 1 - y := pi/2 + arctan(-1/x) for x > 1 - Hence, use z=-1/a if x>=1, otherwise z=a. */ - svbool_t red = svacgt (pg, x, 1.0f); - /* Avoid dependency in abs(x) in division (and comparison). */ - svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (1.0f), x), x); - /* Use absolute value only when needed (odd powers of z). */ - svfloat32_t az = svabs_x (pg, z); - az = svneg_m (az, red, az); - - /* Use split Estrin scheme for P(z^2) with deg(P)=7. */ - svfloat32_t z2 = svmul_x (pg, z, z); - svfloat32_t z4 = svmul_x (pg, z2, z2); - svfloat32_t z8 = svmul_x (pg, z4, z4); - - svfloat32_t y = sv_estrin_7_f32_x (pg, z2, z4, z8, d->poly); - - /* y = shift + z + z^3 * P(z^2). */ - svfloat32_t z3 = svmul_x (pg, z2, az); - y = svmla_x (pg, az, z3, y); - - /* Apply shift as indicated by 'red' predicate. */ - y = svadd_m (red, y, sv_f32 (d->pi_over_2)); - - /* y = atan(x) if x>0, -atan(-x) otherwise. 
*/ - return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); + y := arctan(-1/x) + pi/2 for x > +1 + y := arctan(-1/x) - pi/2 for x < -1 + Hence, use z=-1/a if |x|>=|-1|, otherwise z=a. */ + svbool_t red = svacgt (pg, x, d->neg_one); + svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (d->neg_one), x), x); + + /* Reinserts the sign bit of the argument to handle the case of x < -1. */ + svfloat32_t shift = svreinterpret_f32 ( + sveor_x (red, svreinterpret_u32 (sv_f32 (d->shift_val)), sign)); + + svfloat32_t z2 = svmul_x (ptrue, z, z); + svfloat32_t z3 = svmul_x (ptrue, z2, z); + svfloat32_t z4 = svmul_x (ptrue, z2, z2); + svfloat32_t z8 = svmul_x (ptrue, z4, z4); + + svfloat32_t odd_coeffs = svld1rq (ptrue, &d->c1); + + svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); + svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); + svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); + svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), z2, odd_coeffs, 3); + + svfloat32_t p03 = svmla_x (pg, p01, z4, p23); + svfloat32_t p47 = svmla_x (pg, p45, z4, p67); + + svfloat32_t y = svmla_x (pg, p03, z8, p47); + + /* shift + z + z^3 * P(z^2). */ + shift = svadd_m (red, z, shift); + return svmla_x (pg, shift, z3, y); } diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c index 16a7cf6..958d69a 100644 --- a/sysdeps/aarch64/fpu/atanh_sve.c +++ b/sysdeps/aarch64/fpu/atanh_sve.c @@ -30,7 +30,7 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) } /* SVE approximation for double-precision atanh, based on log1p. - The greatest observed error is 2.81 ULP: + The greatest observed error is 3.3 ULP: _ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 want 0x1.ffd8ff31b501cp-6. */ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg) @@ -42,7 +42,6 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg) svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half)); /* It is special if iax >= 1. */ -// svbool_t special = svcmpge (pg, iax, One); svbool_t special = svacge (pg, x, 1.0); /* Computation is performed based on the following sequence of equality: diff --git a/sysdeps/aarch64/fpu/atanpi_advsimd.c b/sysdeps/aarch64/fpu/atanpi_advsimd.c new file mode 100644 index 0000000..9101419 --- /dev/null +++ b/sysdeps/aarch64/fpu/atanpi_advsimd.c @@ -0,0 +1,117 @@ +/* Double-Precision vector (Advanced SIMD) inverse tanpi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +static const struct data +{ + double c2, c4, c6, c8, c10, c12, c14, c16, c18, c20; + float64x2_t c0, c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; +} data = { + /* Coefficients of polynomial P such that atanpi(x)~x*P(x^2) on + [2^-1022, 1.0]. 
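+     The leading coefficient c0 is 1/pi (0x1.45f306dc9c883p-2), so for
+     small x the approximation reduces to atanpi (x) ~ x/pi.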
*/ + .c0 = V2 (0x1.45f306dc9c883p-2), .c1 = V2 (-0x1.b2995e7b7ba4ap-4), + .c2 = 0x1.04c26be3d2c1p-4, .c3 = V2 (-0x1.7483759c17ea1p-5), + .c4 = 0x1.21bb95c315d57p-5, .c5 = V2 (-0x1.da1bdc3d453f3p-6), + .c6 = 0x1.912d20459b4bfp-6, .c7 = V2 (-0x1.5bbd4545cad1fp-6), + .c8 = 0x1.331b83bec30a1p-6, .c9 = V2 (-0x1.13d6457f44de3p-6), + .c10 = 0x1.f8e802974db94p-7, .c11 = V2 (-0x1.d7e173ab04a1ap-7), + .c12 = 0x1.bdfa47d6a4f28p-7, .c13 = V2 (-0x1.9ba78f3232ceep-7), + .c14 = 0x1.5e6044590ab4fp-7, .c15 = V2 (-0x1.01ccfdeb9f77fp-7), + .c16 = 0x1.345cf0d4eb1c1p-8, .c17 = V2 (-0x1.19e5f00f67e3ap-9), + .c18 = 0x1.6d3035ac7625bp-11, .c19 = V2 (-0x1.286bb9ae4ed79p-13), + .c20 = 0x1.c37ec36da0e1ap-17, +}; + +#define SignMask v_u64 (0x8000000000000000) + +/* Fast implementation of vector atanpi. + atanpi(x) ~ shift + z * P(z^2) with reduction to [0,1] using + z=1/x and shift = +-1/2. Maximum observed error is 2.76 ulps: + _ZGVnN2v_atanpi(0x1.fa2d6912cd64fp-1) got 0x1.fc45a51bd497fp-3 + want 0x1.fc45a51bd497cp-3. */ +float64x2_t VPCS_ATTR V_NAME_D1 (atanpi) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint64x2_t ix = vreinterpretq_u64_f64 (x); + uint64x2_t sign = vandq_u64 (ix, SignMask); + + /* Argument Reduction: + y := arctanpi(x) for |x| < 1 + y := arctanpi(-1/x) + 1/2 for x > 1 + y := arctanpi(-1/x) - 1/2 for x < -1 + Hence, use z=-1/a if |x|>=|-1|, otherwise z=a. */ + uint64x2_t red = vcagtq_f64 (x, v_f64 (-1.0)); + float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (-1.0), x), x); + + /* Shift is calculated as +1/2 or 0, depending on the argument case. */ + float64x2_t shift = vreinterpretq_f64_u64 ( + vandq_u64 (red, vreinterpretq_u64_f64 (v_f64 (0.5)))); + + /* Reinsert sign bit from argument into the shift value. */ + shift = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (shift), sign)); + + /* Calculate polynomial approximation P(z^2) with deg(P)=19. */ + float64x2_t z2 = vmulq_f64 (z, z); + float64x2_t z4 = vmulq_f64 (z2, z2); + float64x2_t z8 = vmulq_f64 (z4, z4); + float64x2_t z16 = vmulq_f64 (z8, z8); + + float64x2_t c24 = vld1q_f64 (&d->c2); + float64x2_t c68 = vld1q_f64 (&d->c6); + + /* Order-7 Estrin. */ + float64x2_t p12 = vfmaq_laneq_f64 (d->c1, z2, c24, 0); + float64x2_t p34 = vfmaq_laneq_f64 (d->c3, z2, c24, 1); + float64x2_t p56 = vfmaq_laneq_f64 (d->c5, z2, c68, 0); + float64x2_t p78 = vfmaq_laneq_f64 (d->c7, z2, c68, 1); + + float64x2_t p14 = vfmaq_f64 (p12, z4, p34); + float64x2_t p58 = vfmaq_f64 (p56, z4, p78); + float64x2_t p18 = vfmaq_f64 (p14, z8, p58); + + /* Order-11 Estrin. */ + float64x2_t c1012 = vld1q_f64 (&d->c10); + float64x2_t c1416 = vld1q_f64 (&d->c14); + float64x2_t c1820 = vld1q_f64 (&d->c18); + + float64x2_t p910 = vfmaq_laneq_f64 (d->c9, z2, c1012, 0); + float64x2_t p1112 = vfmaq_laneq_f64 (d->c11, z2, c1012, 1); + float64x2_t p912 = vfmaq_f64 (p910, z4, p1112); + + float64x2_t p1314 = vfmaq_laneq_f64 (d->c13, z2, c1416, 0); + float64x2_t p1516 = vfmaq_laneq_f64 (d->c15, z2, c1416, 1); + float64x2_t p1316 = vfmaq_f64 (p1314, z4, p1516); + + float64x2_t p1718 = vfmaq_laneq_f64 (d->c17, z2, c1820, 0); + float64x2_t p1920 = vfmaq_laneq_f64 (d->c19, z2, c1820, 1); + float64x2_t p1720 = vfmaq_f64 (p1718, z4, p1920); + + float64x2_t p916 = vfmaq_f64 (p912, z8, p1316); + float64x2_t p920 = vfmaq_f64 (p916, z16, p1720); + + float64x2_t y = vfmaq_f64 (p18, p920, z16); + + y = vfmaq_f64 (d->c0, z2, y); + + /* y = shift + z * p(z^2). 
*/ + return vfmaq_f64 (shift, z, y); +} diff --git a/sysdeps/aarch64/fpu/atanpi_sve.c b/sysdeps/aarch64/fpu/atanpi_sve.c new file mode 100644 index 0000000..3f8f277 --- /dev/null +++ b/sysdeps/aarch64/fpu/atanpi_sve.c @@ -0,0 +1,127 @@ +/* Double-Precision vector (SVE) inverse tanpi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +static const struct data +{ + float64_t c2, c4, c6, c8, c10, c12, c14, c16, c18, c20; + float64_t c0, c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + float64_t shift_val, neg_one; +} data = { + /* Coefficients of polnomial P such that atan(x)~x+x*P(x^2) on + [2^-1022, 1.0]. */ + .c0 = 0x1.45f306dc9c883p-2, + .c1 = -0x1.b2995e7b7ba4ap-4, + .c2 = 0x1.04c26be3d2c1p-4, + .c3 = -0x1.7483759c17ea1p-5, + .c4 = 0x1.21bb95c315d57p-5, + .c5 = -0x1.da1bdc3d453f3p-6, + .c6 = 0x1.912d20459b4bfp-6, + .c7 = -0x1.5bbd4545cad1fp-6, + .c8 = 0x1.331b83bec30a1p-6, + .c9 = -0x1.13d6457f44de3p-6, + .c10 = 0x1.f8e802974db94p-7, + .c11 = -0x1.d7e173ab04a1ap-7, + .c12 = 0x1.bdfa47d6a4f28p-7, + .c13 = -0x1.9ba78f3232ceep-7, + .c14 = 0x1.5e6044590ab4fp-7, + .c15 = -0x1.01ccfdeb9f77fp-7, + .c16 = 0x1.345cf0d4eb1c1p-8, + .c17 = -0x1.19e5f00f67e3ap-9, + .c18 = 0x1.6d3035ac7625bp-11, + .c19 = -0x1.286bb9ae4ed79p-13, + .c20 = 0x1.c37ec36da0e1ap-17, + .shift_val = 1.5, + .neg_one = -1, +}; + +/* Fast implementation of SVE atan. + Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to 0,1 using + z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed + error is 2.80 ulps: + _ZGVsMxv_atanpi(0x1.f19587d63c76fp-1) got 0x1.f6b1304817d02p-3 + want 0x1.f6b1304817d05p-3. */ +svfloat64_t SV_NAME_D1 (atanpi) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svbool_t ptrue = svptrue_b64 (); + svuint64_t ix = svreinterpret_u64 (x); + svuint64_t sign = svand_x (pg, ix, 0x8000000000000000); + + /* Argument reduction: + y := arctan(x) for x < 1 + y := pi/2 + arctan(-1/x) for x > 1 + Hence, use z=-1/a if x>=1, otherwise z=a. */ + svbool_t red = svacgt (pg, x, d->neg_one); + svfloat64_t z = svsel (red, svdiv_x (pg, sv_f64 (d->neg_one), x), x); + + /* Reuse of -1.0f to reduce constant loads, + We need a shift value of 1/2, which is created via -1 + (1 + 1/2). */ + svfloat64_t shift + = svadd_z (red, sv_f64 (d->neg_one), sv_f64 (d->shift_val)); + + /* Reinserts the sign bit of the argument to handle the case of x < -1. */ + shift = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (shift), sign)); + + /* Use split Estrin scheme for P(z^2) with deg(P)=19. */ + svfloat64_t z2 = svmul_x (ptrue, z, z); + svfloat64_t z4 = svmul_x (ptrue, z2, z2); + svfloat64_t z8 = svmul_x (ptrue, z4, z4); + svfloat64_t z16 = svmul_x (ptrue, z8, z8); + + /* Order-7 Estrin. 
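+     svld1rq broadcasts the pair {c2, c4} (and {c6, c8}) to every 128-bit
+     segment, letting svmla_lane address each coefficient by index and
+     reducing the number of constant loads.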
*/ + svfloat64_t c24 = svld1rq (ptrue, &d->c2); + svfloat64_t c68 = svld1rq (ptrue, &d->c6); + + svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), z2, c24, 0); + svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), z2, c24, 1); + svfloat64_t p56 = svmla_lane (sv_f64 (d->c5), z2, c68, 0); + svfloat64_t p78 = svmla_lane (sv_f64 (d->c7), z2, c68, 1); + + svfloat64_t p14 = svmla_x (pg, p12, z4, p34); + svfloat64_t p58 = svmla_x (pg, p56, z4, p78); + svfloat64_t p18 = svmla_x (pg, p14, z8, p58); + + /* Order-11 Estrin. */ + svfloat64_t c1012 = svld1rq (ptrue, &d->c10); + svfloat64_t c1416 = svld1rq (ptrue, &d->c14); + svfloat64_t c1820 = svld1rq (ptrue, &d->c18); + + svfloat64_t p910 = svmla_lane (sv_f64 (d->c9), z2, c1012, 0); + svfloat64_t p1112 = svmla_lane (sv_f64 (d->c11), z2, c1012, 1); + svfloat64_t p912 = svmla_x (pg, p910, z4, p1112); + + svfloat64_t p1314 = svmla_lane (sv_f64 (d->c13), z2, c1416, 0); + svfloat64_t p1516 = svmla_lane (sv_f64 (d->c15), z2, c1416, 1); + svfloat64_t p1316 = svmla_x (pg, p1314, z4, p1516); + + svfloat64_t p1718 = svmla_lane (sv_f64 (d->c17), z2, c1820, 0); + svfloat64_t p1920 = svmla_lane (sv_f64 (d->c19), z2, c1820, 1); + svfloat64_t p1720 = svmla_x (pg, p1718, z4, p1920); + + svfloat64_t p916 = svmla_x (pg, p912, z8, p1316); + svfloat64_t p920 = svmla_x (pg, p916, z16, p1720); + + svfloat64_t y = svmla_x (pg, p18, z16, p920); + + y = svmla_x (pg, sv_f64 (d->c0), z2, y); + return svmla_x (pg, shift, z, y); +} diff --git a/sysdeps/aarch64/fpu/atanpif_advsimd.c b/sysdeps/aarch64/fpu/atanpif_advsimd.c new file mode 100644 index 0000000..9295156 --- /dev/null +++ b/sysdeps/aarch64/fpu/atanpif_advsimd.c @@ -0,0 +1,92 @@ +/* Single-Precision vector (Advanced SIMD) inverse tanpi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +static const struct data +{ + uint32x4_t half; + float32x4_t neg_one; + float32x4_t c0, c1, c3, c5, c7; + float c2, c4, c6, c8; +} data = { + /* Polynomial coefficients generated using Remez algorithm, + see atanpi.sollya for details. */ + .c0 = V4 (0x1.45f306p-2), .c1 = V4 (-0x1.b2975ep-4), + .c2 = 0x1.0490e4p-4, .c3 = V4 (-0x1.70c272p-5), + .c4 = 0x1.0eef52p-5, .c5 = V4 (-0x1.6abbbap-6), + .c6 = 0x1.78157p-7, .c7 = V4 (-0x1.f0b406p-9), + .c8 = 0x1.2ae7fep-11, .half = V4 (0x3f000000), + .neg_one = V4 (-1.0f), +}; + +#define SignMask v_u32 (0x80000000) + +/* Fast implementation of vector atanpif based on + atanpi(x) ~ shift + z * P(z^2) with reduction to [0,1] + using z=-1/x and shift = +-1/2. + Maximum observed error is 2.59ulps: + _ZGVnN4v_atanpif (0x1.f2a89cp-1) got 0x1.f76524p-3 + want 0x1.f7651ep-3. 
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atanpi) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + uint32x4_t ix = vreinterpretq_u32_f32 (x); + uint32x4_t sign = vandq_u32 (ix, SignMask); + + /* Argument Reduction: + y := arctanpi(x) for |x| < 1 + y := arctanpi(-1/x) + 1/2 for x > 1 + y := arctanpi(-1/x) - 1/2 for x < -1 + Hence, use z=-1/a if |x|>=|-1|, otherwise z=a. */ + uint32x4_t red = vcagtq_f32 (x, d->neg_one); + + float32x4_t z = vbslq_f32 (red, vdivq_f32 (d->neg_one, x), x); + + /* Shift is calculated as +1/2 or 0, depending on the argument case. */ + float32x4_t shift = vreinterpretq_f32_u32 (vandq_u32 (red, d->half)); + + /* Reinsert sign bit from argument into the shift value. */ + shift = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (shift), sign)); + + /* Uses an Estrin scheme for polynomial approximation. */ + float32x4_t z2 = vmulq_f32 (z, z); + float32x4_t z4 = vmulq_f32 (z2, z2); + float32x4_t z8 = vmulq_f32 (z4, z4); + + float32x4_t even_coeffs = vld1q_f32 (&d->c2); + + float32x4_t p12 = vfmaq_laneq_f32 (d->c1, z2, even_coeffs, 0); + float32x4_t p34 = vfmaq_laneq_f32 (d->c3, z2, even_coeffs, 1); + float32x4_t p56 = vfmaq_laneq_f32 (d->c5, z2, even_coeffs, 2); + float32x4_t p78 = vfmaq_laneq_f32 (d->c7, z2, even_coeffs, 3); + + float32x4_t p14 = vfmaq_f32 (p12, z4, p34); + float32x4_t p58 = vfmaq_f32 (p56, z4, p78); + + float32x4_t y = vfmaq_f32 (p14, z8, p58); + y = vfmaq_f32 (d->c0, z2, y); + + /* y = shift + z * P(z^2). */ + return vfmaq_f32 (shift, z, y); +} +libmvec_hidden_def (V_NAME_F1 (atanpi)) +HALF_WIDTH_ALIAS_F1 (atanpi) diff --git a/sysdeps/aarch64/fpu/atanpif_sve.c b/sysdeps/aarch64/fpu/atanpif_sve.c new file mode 100644 index 0000000..2abd788 --- /dev/null +++ b/sysdeps/aarch64/fpu/atanpif_sve.c @@ -0,0 +1,89 @@ +/* Single-Precision vector (SVE) inverse tanpi function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +static const struct data +{ + float32_t c2, c4, c6, c8; + float32_t c0, c1, c3, c5, c7; + float32_t shift_val, neg_one; +} data = { + /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on + [2**-128, 1.0]. */ + .c0 = 0x1.45f306p-2, .c1 = -0x1.b2975ep-4, .c2 = 0x1.0490e4p-4, + .c3 = -0x1.70c272p-5, .c4 = 0x1.0eef52p-5, .c5 = -0x1.6abbbap-6, + .c6 = 0x1.78157p-7, .c7 = -0x1.f0b406p-9, .c8 = 0x1.2ae7fep-11, + .shift_val = 1.5f, .neg_one = -1.0f, +}; + +#define SignMask (0x80000000) + +/* Fast implementation of SVE atanpif based on + atan(x) ~ shift + z * P(z^2) with reduction to [0,1] using + z=-1/x and shift = 1/2. + Largest observed error is 2.59 ULP, close to +/-1.0: + _ZGVsMxv_atanpif(0x1.f2a89cp-1) got 0x1.f76524p-3 + want 0x1.f7651ep-3. 
*/ +svfloat32_t SV_NAME_F1 (atanpi) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t ptrue = svptrue_b32 (); + + /* No need to trigger special case. Small cases, infs and nans + are supported by our approximation technique. */ + svuint32_t ix = svreinterpret_u32 (x); + svuint32_t sign = svand_x (pg, ix, SignMask); + + /* Argument reduction: + y := arctan(x) for x < 1 + y := arctan(-1/x) + 1/2 for x > +1 + y := arctan(-1/x) - 1/2 for x < -1 + Hence, use z=-1/a if |x|>=|-1|, otherwise z=a. */ + svbool_t red = svacgt (pg, x, d->neg_one); + svfloat32_t z = svsel (red, svdiv_x (ptrue, sv_f32 (d->neg_one), x), x); + + /* Reuse of -1.0f to reduce constant loads, + We need a shift value of 1/2, which is created via -1 + (1 + 1/2). */ + svfloat32_t shift + = svadd_z (red, sv_f32 (d->neg_one), sv_f32 (d->shift_val)); + + /* Reinserts the sign bit of the argument to handle the case of x < -1. */ + shift = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (shift), sign)); + + svfloat32_t z2 = svmul_x (ptrue, z, z); + svfloat32_t z4 = svmul_x (ptrue, z2, z2); + svfloat32_t z8 = svmul_x (ptrue, z4, z4); + + svfloat32_t even_coeffs = svld1rq (ptrue, &d->c2); + + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), z2, even_coeffs, 0); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), z2, even_coeffs, 1); + svfloat32_t p56 = svmla_lane (sv_f32 (d->c5), z2, even_coeffs, 2); + svfloat32_t p78 = svmla_lane (sv_f32 (d->c7), z2, even_coeffs, 3); + + svfloat32_t p14 = svmad_x (pg, z4, p34, p12); + svfloat32_t p58 = svmad_x (pg, z4, p78, p56); + + svfloat32_t p18 = svmad_x (pg, z8, p58, p14); + svfloat32_t y = svmad_x (pg, z2, p18, d->c0); + + /* shift + z * P(z^2). */ + return svmad_x (pg, y, z, shift); +} diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h index 5152c0d..77ae10d 100644 --- a/sysdeps/aarch64/fpu/bits/math-vector.h +++ b/sysdeps/aarch64/fpu/bits/math-vector.h @@ -37,6 +37,10 @@ # define __DECL_SIMD_acosh __DECL_SIMD_aarch64 # undef __DECL_SIMD_acoshf # define __DECL_SIMD_acoshf __DECL_SIMD_aarch64 +# undef __DECL_SIMD_acospi +# define __DECL_SIMD_acospi __DECL_SIMD_aarch64 +# undef __DECL_SIMD_acospif +# define __DECL_SIMD_acospif __DECL_SIMD_aarch64 # undef __DECL_SIMD_asin # define __DECL_SIMD_asin __DECL_SIMD_aarch64 # undef __DECL_SIMD_asinf @@ -45,6 +49,10 @@ # define __DECL_SIMD_asinh __DECL_SIMD_aarch64 # undef __DECL_SIMD_asinhf # define __DECL_SIMD_asinhf __DECL_SIMD_aarch64 +# undef __DECL_SIMD_asinpi +# define __DECL_SIMD_asinpi __DECL_SIMD_aarch64 +# undef __DECL_SIMD_asinpif +# define __DECL_SIMD_asinpif __DECL_SIMD_aarch64 # undef __DECL_SIMD_atan # define __DECL_SIMD_atan __DECL_SIMD_aarch64 # undef __DECL_SIMD_atanf @@ -53,10 +61,18 @@ # define __DECL_SIMD_atanh __DECL_SIMD_aarch64 # undef __DECL_SIMD_atanhf # define __DECL_SIMD_atanhf __DECL_SIMD_aarch64 +# undef __DECL_SIMD_atanpi +# define __DECL_SIMD_atanpi __DECL_SIMD_aarch64 +# undef __DECL_SIMD_atanpif +# define __DECL_SIMD_atanpif __DECL_SIMD_aarch64 # undef __DECL_SIMD_atan2 # define __DECL_SIMD_atan2 __DECL_SIMD_aarch64 # undef __DECL_SIMD_atan2f # define __DECL_SIMD_atan2f __DECL_SIMD_aarch64 +# undef __DECL_SIMD_atan2pi +# define __DECL_SIMD_atan2pi __DECL_SIMD_aarch64 +# undef __DECL_SIMD_atan2pif +# define __DECL_SIMD_atan2pif __DECL_SIMD_aarch64 # undef __DECL_SIMD_cbrt # define __DECL_SIMD_cbrt __DECL_SIMD_aarch64 # undef __DECL_SIMD_cbrtf @@ -176,12 +192,16 @@ typedef __SVBool_t __sv_bool_t; # define __vpcs __attribute__ 
((__aarch64_vector_pcs__)) __vpcs __f32x4_t _ZGVnN4vv_atan2f (__f32x4_t, __f32x4_t); +__vpcs __f32x4_t _ZGVnN4vv_atan2pif (__f32x4_t, __f32x4_t); __vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_acoshf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_acospif (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_asinhf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_asinpif (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_atanhf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_atanpif (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cbrtf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); @@ -207,12 +227,16 @@ __vpcs __f32x4_t _ZGVnN4v_tanhf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_tanpif (__f32x4_t); __vpcs __f64x2_t _ZGVnN2vv_atan2 (__f64x2_t, __f64x2_t); +__vpcs __f64x2_t _ZGVnN2vv_atan2pi (__f64x2_t, __f64x2_t); __vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_acosh (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_acospi (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_asinh (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_asinpi (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_atanh (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_atanpi (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cbrt (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); @@ -243,12 +267,16 @@ __vpcs __f64x2_t _ZGVnN2v_tanpi (__f64x2_t); #ifdef __SVE_VEC_MATH_SUPPORTED __sv_f32_t _ZGVsMxvv_atan2f (__sv_f32_t, __sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxvv_atan2pif (__sv_f32_t, __sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_acoshf (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_acospif (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_asinhf (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_asinpif (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_atanhf (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_atanpif (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_cbrtf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t); @@ -274,12 +302,16 @@ __sv_f32_t _ZGVsMxv_tanhf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_tanpif (__sv_f32_t, __sv_bool_t); __sv_f64_t _ZGVsMxvv_atan2 (__sv_f64_t, __sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxvv_atan2pi (__sv_f64_t, __sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_acosh (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_acospi (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_asinh (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_asinpi (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_atanh (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_atanpi (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_cbrt (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t); diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c index 77e58e1..f5a163b 100644 --- a/sysdeps/aarch64/fpu/cosh_sve.c +++ b/sysdeps/aarch64/fpu/cosh_sve.c @@ -21,71 +21,99 @@ static const struct data { - float64_t poly[3]; - float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; + 
double c0, c2; + double c1, c3; + float64_t inv_ln2, ln2_hi, ln2_lo, shift; uint64_t special_bound; } data = { - .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, - 0x1.5555576a59599p-5, }, - - .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */ - /* -ln2/N. */ - .ln2_hi = -0x1.62e42fefa39efp-9, - .ln2_lo = -0x1.abc9e3b39803f3p-64, - .shift = 0x1.8p+52, - .thres = 704.0, - - /* 0x1.6p9, above which exp overflows. */ - .special_bound = 0x4086000000000000, + /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ + .c0 = 0x1.fffffffffdbcdp-2, + .c1 = 0x1.555555555444cp-3, + .c2 = 0x1.555573c6a9f7dp-5, + .c3 = 0x1.1111266d28935p-7, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + /* 1/ln2. */ + .inv_ln2 = 0x1.71547652b82fep+0, + .shift = 0x1.800000000ff80p+46, /* 1.5*2^46+1022. */ + + /* asuint(ln(2^(1024 - 1/128))), the value above which exp overflows. */ + .special_bound = 0x40862e37e7d8ba72, }; -static svfloat64_t NOINLINE -special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special) -{ - svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); - svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); - svfloat64_t y = svadd_x (pg, half_t, half_over_t); - return sv_call_f64 (cosh, x, y, special); -} - -/* Helper for approximating exp(x). Copied from sv_exp_tail, with no - special-case handling or tail. */ +/* Helper for approximating exp(x)/2. + Functionally identical to FEXPA exp(x), but an adjustment in + the shift value which leads to a reduction in the exponent of scale by 1, + thus halving the result at no cost. */ static inline svfloat64_t -exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) +exp_over_two_inline (const svbool_t pg, svfloat64_t x, const struct data *d) { /* Calculate exp(x). */ svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); + svuint64_t u = svreinterpret_u64 (z); svfloat64_t n = svsub_x (pg, z, d->shift); - svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi); - r = svmla_x (pg, r, n, d->ln2_lo); + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); - svuint64_t u = svreinterpret_u64 (z); - svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); - svuint64_t i = svand_x (svptrue_b64 (), u, 0xff); + svfloat64_t r = x; + r = svmls_lane (r, n, ln2, 0); + r = svmls_lane (r, n, ln2, 1); - svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); - y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); - y = svmla_x (pg, sv_f64 (1.0), r, y); - y = svmul_x (svptrue_b64 (), r, y); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); + svfloat64_t p04 = svmla_x (pg, p01, p23, r2); + svfloat64_t p = svmla_x (pg, r, p04, r2); - /* s = 2^(n/N). */ - u = svld1_gather_index (pg, __v_exp_tail_data, i); - svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e)); + svfloat64_t scale = svexpa (u); - return svmla_x (pg, s, s, y); + return svmla_x (pg, scale, scale, p); +} + +/* Vectorised special case to handle values past where exp_inline overflows. + Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double + the valid range of inputs, and returns inf for anything past that. */ +static svfloat64_t NOINLINE +special_case (svbool_t pg, svbool_t special, svfloat64_t ax, svfloat64_t t, + const struct data *d) +{ + /* Finish fast path to compute values for non-special cases. 
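+     On entry t is exp(|x|)/2, so 0.25/t = 1/(2*exp(|x|)) and
+     y = exp(|x|)/2 + 1/(2*exp(|x|)) = cosh(x); as a scalar model,
+     assuming t = exp (fabs (x)) / 2, this step is just y = t + 0.25 / t.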
*/ + svfloat64_t inv_twoexp = svdivr_x (pg, t, 0.25); + svfloat64_t y = svadd_x (pg, t, inv_twoexp); + + /* Halves input value, and then check if any cases + are still going to overflow. */ + ax = svmul_x (special, ax, 0.5); + svbool_t is_safe + = svcmplt (special, svreinterpret_u64 (ax), d->special_bound); + + /* Computes exp(x/2), and sets any overflowing lanes to inf. */ + svfloat64_t half_exp = exp_over_two_inline (special, ax, d); + half_exp = svsel (is_safe, half_exp, sv_f64 (INFINITY)); + + /* Construct special case cosh(x) = (exp(x/2)^2)/2. */ + svfloat64_t exp = svmul_x (svptrue_b64 (), half_exp, 2); + svfloat64_t special_y = svmul_x (special, exp, half_exp); + + /* Select correct return values for special and non-special cases. */ + special_y = svsel (special, special_y, y); + + /* Ensure an input of nan is correctly propagated. */ + svbool_t is_nan + = svcmpgt (special, svreinterpret_u64 (ax), sv_u64 (0x7ff0000000000000)); + return svsel (is_nan, ax, svsel (special, special_y, y)); } /* Approximation for SVE double-precision cosh(x) using exp_inline. cosh(x) = (exp(x) + exp(-x)) / 2. - The greatest observed error is in the scalar fall-back region, so is the - same as the scalar routine, 1.93 ULP: - _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021 - want 0x1.fd774e958236fp+1021. - - The greatest observed error in the non-special region is 1.54 ULP: - _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8 - want 0x1.f5e2bb8d5c991p+8. */ + The greatest observed error in special case region is 2.66 + 0.5 ULP: + _ZGVsMxv_cosh (0x1.633b532ffbc1ap+9) got 0x1.f9b2d3d22399ep+1023 + want 0x1.f9b2d3d22399bp+1023 + + The greatest observed error in the non-special region is 1.01 + 0.5 ULP: + _ZGVsMxv_cosh (0x1.998ecbb3c1f81p+1) got 0x1.890b225657f84p+3 + want 0x1.890b225657f82p+3. */ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); @@ -94,14 +122,13 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound); /* Up to the point that exp overflows, we can use it to calculate cosh by - exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ - svfloat64_t t = exp_inline (ax, pg, d); + (exp(|x|)/2 + 1) / (2 * exp(|x|)). */ + svfloat64_t half_exp = exp_over_two_inline (pg, ax, d); - /* Fall back to scalar for any special cases. */ + /* Falls back to entirely standalone vectorized special case. */ if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, pg, t, special); + return special_case (pg, special, ax, half_exp, d); - svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5); - svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); - return svadd_x (pg, half_t, half_over_t); + svfloat64_t inv_twoexp = svdivr_x (pg, half_exp, 0.25); + return svadd_x (pg, half_exp, inv_twoexp); } diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c index 1a74db2..f3e7f8b 100644 --- a/sysdeps/aarch64/fpu/exp10f_sve.c +++ b/sysdeps/aarch64/fpu/exp10f_sve.c @@ -19,26 +19,19 @@ #include "sv_math.h" -/* For x < -Thres, the result is subnormal and not handled correctly by - FEXPA. */ -#define Thres 37.9 +/* For x < -Thres (-log10(2^126)), the result is subnormal and not handled + correctly by FEXPA. 
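+   (0x1.2f702p+5 is approximately 37.93, i.e. 126 * log10 (2).)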
*/ +#define Thres 0x1.2f702p+5 static const struct data { - float log2_10_lo, c0, c2, c4; - float c1, c3, log10_2; - float shift, log2_10_hi, thres; + float log10_2, log2_10_hi, log2_10_lo, c1; + float c0, shift, thres; } data = { /* Coefficients generated using Remez algorithm with minimisation of relative - error. - rel error: 0x1.89dafa3p-24 - abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] - maxerr: 0.52 +0.5 ulp. */ - .c0 = 0x1.26bb16p+1f, - .c1 = 0x1.5350d2p+1f, - .c2 = 0x1.04744ap+1f, - .c3 = 0x1.2d8176p+0f, - .c4 = 0x1.12b41ap-1f, + error. */ + .c0 = 0x1.26bb62p1, + .c1 = 0x1.53524cp1, /* 1.5*2^17 + 127, a shift value suitable for FEXPA. */ .shift = 0x1.803f8p17f, .log10_2 = 0x1.a934fp+1, @@ -53,28 +46,23 @@ sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)), with poly(r) in [1/sqrt(2), sqrt(2)] and x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */ - - svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo); + svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log10_2); /* n = round(x/(log10(2)/N)). */ svfloat32_t shift = sv_f32 (d->shift); - svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift); - svfloat32_t n = svsub_x (svptrue_b32 (), z, shift); + svfloat32_t z = svmla_lane (shift, x, lane_consts, 0); + svfloat32_t n = svsub_x (pg, z, shift); /* r = x - n*log10(2)/N. */ - svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x); - r = svmls_lane (r, n, lane_consts, 0); + svfloat32_t r = x; + r = svmls_lane (r, n, lane_consts, 1); + r = svmls_lane (r, n, lane_consts, 2); svfloat32_t scale = svexpa (svreinterpret_u32 (z)); /* Polynomial evaluation: poly(r) ~ exp10(r)-1. */ - svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2); - svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3); - svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); - svfloat32_t p14 = svmla_x (pg, p12, p34, r2); - svfloat32_t p0 = svmul_lane (r, lane_consts, 1); - svfloat32_t poly = svmla_x (pg, p0, r2, p14); - + svfloat32_t poly = svmla_lane (sv_f32 (d->c0), r, lane_consts, 3); + poly = svmul_x (pg, poly, r); return svmla_x (pg, scale, scale, poly); } @@ -85,11 +73,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d) special); } -/* Single-precision SVE exp10f routine. Implements the same algorithm - as AdvSIMD exp10f. - Worst case error is 1.02 ULPs. - _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1 - want 0x1.ba5f9cp-1. */ +/* Single-precision SVE exp10f routine. Based on the FEXPA instruction. + Worst case error is 1.10 ULP. + _ZGVsMxv_exp10f (0x1.cc76dep+3) got 0x1.be0172p+47 + want 0x1.be017p+47. */ svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c index 6db8526..c135852 100644 --- a/sysdeps/aarch64/fpu/exp2_sve.c +++ b/sysdeps/aarch64/fpu/exp2_sve.c @@ -19,23 +19,21 @@ #include "sv_math.h" -#define N (1 << V_EXP_TABLE_BITS) - #define BigBound 1022 #define UOFlowBound 1280 static const struct data { - double c0, c2; - double c1, c3; + double c2, c4; + double c0, c1, c3; double shift, big_bound, uoflow_bound; } data = { /* Coefficients are computed using Remez algorithm with minimisation of the absolute error. 
*/ - .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3, - .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7, - .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound, - .big_bound = BigBound, + .c0 = 0x1.62e42fefa39efp-1, .c1 = 0x1.ebfbdff82a31bp-3, + .c2 = 0x1.c6b08d706c8a5p-5, .c3 = 0x1.3b2ad2ff7d2f3p-7, + .c4 = 0x1.5d8761184beb3p-10, .shift = 0x1.800000000ffc0p+46, + .uoflow_bound = UOFlowBound, .big_bound = BigBound, }; #define SpecialOffset 0x6000000000000000 /* 0x1p513. */ @@ -64,50 +62,52 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n, svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); /* |n| > 1280 => 2^(n) overflows. */ - svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound); + svbool_t p_cmp = svacle (pg, n, d->uoflow_bound); svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); svfloat64_t r2 = svmla_x (pg, s2, s2, y); svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); - return svsel (p_cmp, r1, r0); + return svsel (p_cmp, r0, r1); } /* Fast vector implementation of exp2. - Maximum measured error is 1.65 ulp. - _ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1 - want 0x1.f8db0d4df721dp-1. */ + Maximum measured error is 0.52 + 0.5 ulp. + _ZGVsMxv_exp2 (0x1.3b72ad5b701bfp-1) got 0x1.8861641b49e08p+0 + want 0x1.8861641b49e07p+0. */ svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg) { const struct data *d = ptr_barrier (&data); - svbool_t no_big_scale = svacle (pg, x, d->big_bound); - svbool_t special = svnot_z (pg, no_big_scale); - - /* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N]. */ - svfloat64_t shift = sv_f64 (d->shift); - svfloat64_t kd = svadd_x (pg, x, shift); - svuint64_t ki = svreinterpret_u64 (kd); - /* kd = k/N. */ - kd = svsub_x (pg, kd, shift); - svfloat64_t r = svsub_x (pg, x, kd); - - /* scale ~= 2^(k/N). */ - svuint64_t idx = svand_x (pg, ki, N - 1); - svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx); - /* This is only a valid scale when -1023*N < k < 1024*N. */ - svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS); - svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top)); - - svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); - /* Approximate exp2(r) using polynomial. */ - /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */ + svbool_t special = svacge (pg, x, d->big_bound); + + svfloat64_t z = svadd_x (svptrue_b64 (), x, d->shift); + svfloat64_t n = svsub_x (svptrue_b64 (), z, d->shift); + svfloat64_t r = svsub_x (svptrue_b64 (), x, n); + + svfloat64_t scale = svexpa (svreinterpret_u64 (z)); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); - svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0); - svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1); - svfloat64_t p = svmla_x (pg, p01, p23, r2); + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); + + /* Approximate exp2(r) using polynomial. */ + /* y = exp2(r) - 1 ~= r * (C0 + C1 r + C2 r^2 + C3 r^3 + C4 r^4). */ + svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); + svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); + svfloat64_t p = svmla_x (pg, p12, p34, r2); + p = svmad_x (pg, p, r, d->c0); svfloat64_t y = svmul_x (svptrue_b64 (), r, p); + /* Assemble exp2(x) = exp2(r) * scale. */ if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (pg, scale, y, kd, d); + { + /* FEXPA zeroes the sign bit, however the sign is meaningful to the + special case function so needs to be copied. + e = sign bit of u << 46. 
*/ + svuint64_t e = svand_x (pg, svlsl_x (pg, svreinterpret_u64 (z), 46), + 0x8000000000000000); + scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale))); + return special_case (pg, scale, y, n, d); + } + return svmla_x (pg, scale, scale, y); } diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c index fcd7830..989cefb 100644 --- a/sysdeps/aarch64/fpu/exp2f_sve.c +++ b/sysdeps/aarch64/fpu/exp2f_sve.c @@ -18,21 +18,17 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f32.h" #define Thres 0x1.5d5e2ap+6f static const struct data { - float c0, c2, c4, c1, c3; - float shift, thres; + float c0, c1, shift, thres; } data = { - /* Coefficients copied from the polynomial in AdvSIMD variant. */ - .c0 = 0x1.62e422p-1f, - .c1 = 0x1.ebf9bcp-3f, - .c2 = 0x1.c6bd32p-5f, - .c3 = 0x1.3ce9e4p-7f, - .c4 = 0x1.59977ap-10f, + /* Coefficients generated using Remez algorithm with minimisation of relative + error. */ + .c0 = 0x1.62e485p-1, + .c1 = 0x1.ebfbe0p-3, /* 1.5*2^17 + 127. */ .shift = 0x1.803f8p17f, /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled @@ -51,16 +47,8 @@ sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d) svfloat32_t scale = svexpa (svreinterpret_u32 (z)); - /* Polynomial evaluation: poly(r) ~ exp2(r)-1. - Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for - coefficients 1 to 4, and apply most significant coefficient directly. */ - svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0); - svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r); - svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1); - svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2); - svfloat32_t p14 = svmla_x (pg, p12, r2, p34); - svfloat32_t p0 = svmul_lane (r, even_coeffs, 0); - svfloat32_t poly = svmla_x (pg, p0, r2, p14); + svfloat32_t poly = svmla_x (pg, sv_f32 (d->c0), r, sv_f32 (d->c1)); + poly = svmul_x (svptrue_b32 (), poly, r); return svmla_x (pg, scale, scale, poly); } @@ -72,11 +60,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d) special); } -/* Single-precision SVE exp2f routine. Implements the same algorithm - as AdvSIMD exp2f. - Worst case error is 1.04 ULPs. - _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1 - want 0x1.ba6a64p-1. */ +/* Single-precision SVE exp2f routine, based on the FEXPA instruction. + Worst case error is 1.09 ULPs. + _ZGVsMxv_exp2f (0x1.9a2a94p-1) got 0x1.be1054p+0 + want 0x1.be1052p+0. */ svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); diff --git a/sysdeps/aarch64/fpu/expm1_sve.c b/sysdeps/aarch64/fpu/expm1_sve.c index d4ba8cc..b1d940b 100644 --- a/sysdeps/aarch64/fpu/expm1_sve.c +++ b/sysdeps/aarch64/fpu/expm1_sve.c @@ -18,82 +18,164 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f64.h" -#define SpecialBound 0x1.62b7d369a5aa9p+9 -#define ExponentBias 0x3ff0000000000000 +#define FexpaBound 0x1.4cb5ecef28adap-3 /* 15*ln2/64. */ +#define SpecialBound 0x1.628c2855bfaddp+9 /* ln(2^(1023 + 1/128)). */ static const struct data { - double poly[11]; - double shift, inv_ln2, special_bound; - /* To be loaded in one quad-word. */ + double c2, c4; + double inv_ln2; double ln2_hi, ln2_lo; + double c0, c1, c3; + double shift, thres; + uint64_t expm1_data[32]; } data = { - /* Generated using fpminimax. 
*/ - .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, - 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13, - 0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, - 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, - - .special_bound = SpecialBound, - .inv_ln2 = 0x1.71547652b82fep0, - .ln2_hi = 0x1.62e42fefa39efp-1, - .ln2_lo = 0x1.abc9e3b39803fp-56, - .shift = 0x1.8p52, + /* Table emulating FEXPA - 1, for values of FEXPA close to 1. + The table holds values of 2^(i/64) - 1, computed in arbitrary precision. + The first half of the table stores values associated to i from 0 to 15. + The second half of the table stores values associated to i from 0 to -15. */ + .expm1_data = { + 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901, + 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, + 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, + 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a, + 0x0000000000000000, 0xbfc331751ec3a814, 0xbfc20224341286e4, 0xbfc0cf85bed0f8b7, + 0xbfbf332113d56b1f, 0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424, + 0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a, 0xbfaafd11874c009e, + 0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89, 0xbf95f134923757f3, 0xbf860f9f985bc9f4, + }, + + /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ + .c0 = 0x1p-1, + .c1 = 0x1.55555555548f9p-3, + .c2 = 0x1.5555555554c22p-5, + .c3 = 0x1.111123aaa2fb2p-7, + .c4 = 0x1.6c16d77d98e5bp-10, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .inv_ln2 = 0x1.71547652b82fep+0, + .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */ + .thres = SpecialBound, }; -static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t pg) +#define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +#define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */ + +static NOINLINE svfloat64_t +special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p, + svfloat64_t n) { - return sv_call_f64 (expm1, x, y, pg); + /* s=2^n may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + + /* If n<=0 then set b to 0x6, 0 otherwise. */ + svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */ + svuint64_t b + = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */ + + /* Set s1 to generate overflow depending on sign of exponent n, + ie. s1 = 0x70...0 - b. */ + svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); + /* Offset s to avoid overflow in final result if n is below threshold. + ie. s2 = as_u64 (s) - 0x3010...0 + b. */ + svfloat64_t s2 = svreinterpret_f64 ( + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); + + /* |n| > 1280 => 2^(n) overflows. */ + svbool_t p_cmp = svacgt (pg, n, 1280.0); + + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); + svfloat64_t r2 = svmla_x (pg, s2, s2, p); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); + + svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes. */ + return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0)); } -/* Double-precision vector exp(x) - 1 function. 
- The maximum error observed error is 2.18 ULP: - _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2 - want 0x1.a8b9ea8d66e2p-2. */ +/* FEXPA based SVE expm1 algorithm. + Maximum measured error is 2.81 + 0.5 ULP: + _ZGVsMxv_expm1 (0x1.974060e619bfp-3) got 0x1.c290e5858bb53p-3 + want 0x1.c290e5858bb5p-3. */ svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg) { const struct data *d = ptr_barrier (&data); - /* Large, Nan/Inf. */ - svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound)); - - /* Reduce argument to smaller range: - Let i = round(x / ln2) - and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 - where 2^i is exact because i is an integer. */ - svfloat64_t shift = sv_f64 (d->shift); - svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift); - svint64_t i = svcvt_s64_x (pg, n); - svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); - svfloat64_t f = svmls_lane (x, n, ln2, 0); - f = svmls_lane (f, n, ln2, 1); - - /* Approximate expm1(f) using polynomial. - Taylor expansion for expm1(x) has the form: - x + ax^2 + bx^3 + cx^4 .... - So we calculate the polynomial P(f) = a + bf + cf^2 + ... - and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - svfloat64_t f2 = svmul_x (pg, f, f); - svfloat64_t f4 = svmul_x (pg, f2, f2); - svfloat64_t f8 = svmul_x (pg, f4, f4); - svfloat64_t p - = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly)); - - /* Assemble the result. - expm1(x) ~= 2^i * (p + 1) - 1 - Let t = 2^i. */ - svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias); - svfloat64_t t = svreinterpret_f64 (u); - - /* expm1(x) ~= p * t + (t - 1). */ - svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t); + svbool_t special = svacgt (pg, x, d->thres); - if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, y, special); + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t n = svsub_x (pg, z, d->shift); + /* r = x - n * ln2, r is in [-ln2/128, ln2/128]. */ + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); + svfloat64_t r = x; + r = svmls_lane (r, n, ln2, 0); + r = svmls_lane (r, n, ln2, 1); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); + + svfloat64_t p; + svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); + svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); + p = svmad_x (pg, c34, r2, c12); + p = svmad_x (pg, p, r, sv_f64 (d->c0)); + p = svmad_x (pg, p, r2, r); + + svfloat64_t scale = svexpa (u); + svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0)); + + /* We want to construct expm1(x) = (scale - 1) + scale * poly. + However, for values of scale close to 1, scale-1 causes large ULP errors + due to cancellation. + + This can be circumvented by using a small lookup for scale-1 + when our input is below a certain bound, otherwise we can use FEXPA. + + This bound is based upon the table size: + Bound = (TableSize-1/64) * ln2. + The current bound is based upon a table size of 16. */ + svbool_t is_small = svaclt (pg, x, FexpaBound); + + if (svptest_any (pg, is_small)) + { + /* Index via the input of FEXPA, but we only care about the lower 4 bits. + */ + svuint64_t base_idx = svand_x (pg, u, 0xf); + + /* We can use the sign of x as a fifth bit to account for the asymmetry + of e^x around 0. 
*/ + svuint64_t signBit + = svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4); + svuint64_t idx = svorr_x (pg, base_idx, signBit); + + /* Lookup values for scale - 1 for small x. */ + svfloat64_t lookup = svreinterpret_f64 ( + svld1_gather_index (is_small, d->expm1_data, idx)); + + /* Select the appropriate scale - 1 value based on x. */ + scalem1 = svsel (is_small, lookup, scalem1); + } + + svfloat64_t y = svmla_x (pg, scalem1, scale, p); + + /* FEXPA returns nan for large inputs so we special case those. */ + if (__glibc_unlikely (svptest_any (pg, special))) + { + /* FEXPA zeroes the sign bit, however the sign is meaningful to the + special case function so needs to be copied. + e = sign bit of u << 46. */ + svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000); + /* Copy sign to s. */ + scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale))); + return special_case (pg, y, scale, p, n); + } + + /* return expm1 = (scale - 1) + (scale * poly). */ return y; } diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c index 862c13f..821c078 100644 --- a/sysdeps/aarch64/fpu/log1p_sve.c +++ b/sysdeps/aarch64/fpu/log1p_sve.c @@ -22,19 +22,33 @@ static const struct data { - double poly[19]; + float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16; + float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; double ln2_hi, ln2_lo; uint64_t hfrt2_top, onemhfrt2_top, inf, mone; } data = { /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1]. Order 20 - polynomial, however first 2 coefficients are 0 and 1 so are not stored. */ - .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, - 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, - -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, - 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, - -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, - 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, - -0x1.cfa7385bdb37ep-6, }, + polynomial, however first 2 coefficients are 0 and 1 so are not + stored. */ + .c0 = -0x1.ffffffffffffbp-2, + .c1 = 0x1.55555555551a9p-2, + .c2 = -0x1.00000000008e3p-2, + .c3 = 0x1.9999999a32797p-3, + .c4 = -0x1.555555552fecfp-3, + .c5 = 0x1.249248e071e5ap-3, + .c6 = -0x1.ffffff8bf8482p-4, + .c7 = 0x1.c71c8f07da57ap-4, + .c8 = -0x1.9999ca4ccb617p-4, + .c9 = 0x1.7459ad2e1dfa3p-4, + .c10 = -0x1.554d2680a3ff2p-4, + .c11 = 0x1.3b4c54d487455p-4, + .c12 = -0x1.2548a9ffe80e6p-4, + .c13 = 0x1.0f389a24b2e07p-4, + .c14 = -0x1.eee4db15db335p-5, + .c15 = 0x1.e95b494d4a5ddp-5, + .c16 = -0x1.15fdf07cb7c73p-4, + .c17 = 0x1.0310b70800fcfp-4, + .c18 = -0x1.cfa7385bdb37ep-6, .ln2_hi = 0x1.62e42fefa3800p-1, .ln2_lo = 0x1.ef35793c76730p-45, /* top32(asuint64(sqrt(2)/2)) << 32. */ @@ -49,7 +63,7 @@ static const struct data #define BottomMask 0xffffffff static svfloat64_t NOINLINE -special_case (svbool_t special, svfloat64_t x, svfloat64_t y) +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) { return sv_call_f64 (log1p, x, y, special); } @@ -91,8 +105,9 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) /* Reduce x to f in [sqrt(2)/2, sqrt(2)]. 
*/ svuint64_t utop = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hfrt2_top); - svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask)); - svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1); + svuint64_t u_red + = svorr_x (pg, utop, svand_x (svptrue_b64 (), mi, BottomMask)); + svfloat64_t f = svsub_x (svptrue_b64 (), svreinterpret_f64 (u_red), 1); /* Correction term c/m. */ svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, 1)), m); @@ -103,18 +118,49 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg) Hence approximation has the form f + f^2 * P(f) where P(x) = C0 + C1*x + C2x^2 + ... Assembling this all correctly is dealt with at the final step. */ - svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2), - f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8); - svfloat64_t p = sv_estrin_18_f64_x (pg, f, f2, f4, f8, f16, d->poly); + svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f), + f4 = svmul_x (svptrue_b64 (), f2, f2), + f8 = svmul_x (svptrue_b64 (), f4, f4), + f16 = svmul_x (svptrue_b64 (), f8, f8); + + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); + svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); + svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); + svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17); + + /* Order-18 Estrin scheme. */ + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1); + + svfloat64_t p03 = svmla_x (pg, p01, f2, p23); + svfloat64_t p47 = svmla_x (pg, p45, f2, p67); + svfloat64_t p07 = svmla_x (pg, p03, f4, p47); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1); + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0); + svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1); + + svfloat64_t p811 = svmla_x (pg, p89, f2, p1011); + svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415); + svfloat64_t p815 = svmla_x (pg, p811, f4, p1215); + + svfloat64_t p015 = svmla_x (pg, p07, f8, p815); + svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0); + svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1); + svfloat64_t p = svmla_x (pg, p015, f16, p1618); svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo); svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi); - svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (special, x, y); - - return y; + return special_case ( + x, svmla_x (svptrue_b64 (), svadd_x (svptrue_b64 (), ylo, yhi), f2, p), + special); + return svmla_x (svptrue_b64 (), svadd_x (svptrue_b64 (), ylo, yhi), f2, p); } strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1)) diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c index 963453f..072ba8f 100644 --- a/sysdeps/aarch64/fpu/sinh_sve.c +++ b/sysdeps/aarch64/fpu/sinh_sve.c @@ -18,90 +18,153 @@ <https://www.gnu.org/licenses/>. 
*/ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[11]; - float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift; uint64_t halff; - int64_t onef; - uint64_t large_bound; + double c2, c4; + double inv_ln2; + double ln2_hi, ln2_lo; + double c0, c1, c3; + double shift, special_bound, bound; + uint64_t expm1_data[20]; } data = { - /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ - .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, - 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, - 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, - 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, - 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, - - .inv_ln2 = 0x1.71547652b82fep0, - .m_ln2_hi = -0x1.62e42fefa39efp-1, - .m_ln2_lo = -0x1.abc9e3b39803fp-56, - .shift = 0x1.8p52, - + /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */ + .expm1_data = { + 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901, + 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, + 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, + 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a, + 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7, + }, + + /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ + .c0 = 0x1p-1, + .c1 = 0x1.55555555548f9p-3, + .c2 = 0x1.5555555554c22p-5, + .c3 = 0x1.111123aaa2fb2p-7, + .c4 = 0x1.6c16d77d98e5bp-10, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .inv_ln2 = 0x1.71547652b82fep+0, + .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */ .halff = 0x3fe0000000000000, - .onef = 0x3ff0000000000000, - /* 2^9. expm1 helper overflows for large input. */ - .large_bound = 0x4080000000000000, + .special_bound = 0x1.62e37e7d8ba72p+9, /* ln(2^(1024 - 1/128)). */ + .bound = 0x1.a56ef8ec924ccp-3 /* 19*ln2/64. */ }; +/* A specialised FEXPA expm1 that is only valid for positive inputs and + has no special cases. Based off the full FEXPA expm1 implementated for + _ZGVsMxv_expm1, with a slightly modified file to keep sinh under 3.5ULP. */ static inline svfloat64_t -expm1_inline (svfloat64_t x, svbool_t pg) +expm1_inline (svbool_t pg, svfloat64_t x) { const struct data *d = ptr_barrier (&data); - /* Reduce argument: - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 - where i = round(x / ln2) - and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ - svfloat64_t j - = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); - svint64_t i = svcvt_s64_x (pg, j); - svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi); - f = svmla_x (pg, f, j, d->m_ln2_lo); - /* Approximate expm1(f) using polynomial. */ - svfloat64_t f2 = svmul_x (pg, f, f); - svfloat64_t f4 = svmul_x (pg, f2, f2); - svfloat64_t f8 = svmul_x (pg, f4, f4); - svfloat64_t p - = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly)); - /* t = 2^i. */ - svfloat64_t t = svscale_x (pg, sv_f64 (1), i); - /* expm1(x) ~= p * t + (t - 1). 
*/ - return svmla_x (pg, svsub_x (pg, t, 1.0), p, t); + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t n = svsub_x (pg, z, d->shift); + + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); + + svfloat64_t r = x; + r = svmls_lane (r, n, ln2, 0); + r = svmls_lane (r, n, ln2, 1); + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + + svfloat64_t p; + svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); + svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); + p = svmad_x (pg, c34, r2, c12); + p = svmad_x (pg, p, r, sv_f64 (d->c0)); + p = svmad_x (pg, p, r2, r); + + svfloat64_t scale = svexpa (u); + + /* We want to construct expm1(x) = (scale - 1) + scale * poly. + However, for values of scale close to 1, scale-1 causes large ULP errors + due to cancellation. + + This can be circumvented by using a small lookup for scale-1 + when our input is below a certain bound, otherwise we can use FEXPA. */ + svbool_t is_small = svaclt (pg, x, d->bound); + + /* Index via the input of FEXPA, but we only care about the lower 5 bits. */ + svuint64_t base_idx = svand_x (pg, u, 0x1f); + + /* Compute scale - 1 from FEXPA, and lookup values where this fails. */ + svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0)); + svuint64_t scalem1_lookup + = svld1_gather_index (is_small, d->expm1_data, base_idx); + + /* Select the appropriate scale - 1 value based on x. */ + svfloat64_t scalem1 + = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate); + + /* return expm1 = scale - 1 + (scale * poly). */ + return svmla_x (pg, scalem1, scale, p); } +/* Vectorised special case to handle values past where exp_inline overflows. + Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double + the valid range of inputs, and returns inf for anything past that. */ static svfloat64_t NOINLINE -special_case (svfloat64_t x, svbool_t pg) +special_case (svbool_t pg, svbool_t special, svfloat64_t ax, + svfloat64_t halfsign, const struct data *d) { - return sv_call_f64 (sinh, x, x, pg); + /* Halves input value, and then check if any cases + are still going to overflow. */ + ax = svmul_x (special, ax, 0.5); + svbool_t is_safe = svaclt (special, ax, d->special_bound); + + svfloat64_t t = expm1_inline (pg, ax); + + /* Finish fastpass to compute values for non-special cases. */ + svfloat64_t y = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0))); + y = svmul_x (pg, y, halfsign); + + /* Computes special lane, and set remaining overflow lanes to inf. */ + svfloat64_t half_special_y = svmul_x (svptrue_b64 (), t, halfsign); + svfloat64_t special_y = svmul_x (svptrue_b64 (), half_special_y, t); + + svuint64_t signed_inf + = svorr_x (svptrue_b64 (), svreinterpret_u64 (halfsign), + sv_u64 (0x7ff0000000000000)); + special_y = svsel (is_safe, special_y, svreinterpret_f64 (signed_inf)); + + /* Join resulting vectors together and return. */ + return svsel (special, special_y, y); } -/* Approximation for SVE double-precision sinh(x) using expm1. - sinh(x) = (exp(x) - exp(-x)) / 2. - The greatest observed error is 2.57 ULP: - _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2 - want 0x1.ab929fc64bd63p-2. */ +/* Approximation for SVE double-precision sinh(x) using FEXPA expm1. + Uses sinh(x) = e^2x - 1 / 2e^x, rewritten for accuracy. 
+ The greatest observed error in the non-special region is 2.63 + 0.5 ULP: + _ZGVsMxv_sinh (0x1.b5e0e13ba88aep-2) got 0x1.c3587faf97b0cp-2 + want 0x1.c3587faf97b09p-2 + + The greatest observed error in the special region is 2.65 + 0.5 ULP: + _ZGVsMxv_sinh (0x1.633ce847dab1ap+9) got 0x1.fffd30eea0066p+1023 + want 0x1.fffd30eea0063p+1023. */ svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg) { const struct data *d = ptr_barrier (&data); + svbool_t special = svacge (pg, x, d->special_bound); svfloat64_t ax = svabs_x (pg, x); svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax)); svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff)); - svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound); - /* Fall back to scalar variant for all lanes if any are special. */ if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, pg); + return special_case (pg, special, ax, halfsign, d); /* Up to the point that expm1 overflows, we can use it to calculate sinh using a slight rearrangement of the definition of sinh. This allows us to retain acceptable accuracy for very small inputs. */ - svfloat64_t t = expm1_inline (ax, pg); + svfloat64_t t = expm1_inline (pg, ax); t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0))); return svmul_x (pg, t, halfsign); } diff --git a/sysdeps/aarch64/fpu/sv_log1p_inline.h b/sysdeps/aarch64/fpu/sv_log1p_inline.h index 71f88e0..c2b196f 100644 --- a/sysdeps/aarch64/fpu/sv_log1p_inline.h +++ b/sysdeps/aarch64/fpu/sv_log1p_inline.h @@ -21,11 +21,12 @@ #define AARCH64_FPU_SV_LOG1P_INLINE_H #include "sv_math.h" -#include "poly_sve_f64.h" static const struct sv_log1p_data { - double poly[19], ln2[2]; + double c0, c2, c4, c6, c8, c10, c12, c14, c16; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18; + double ln2_lo, ln2_hi; uint64_t hf_rt2_top; uint64_t one_m_hf_rt2_top; uint32_t bottom_mask; @@ -33,15 +34,30 @@ static const struct sv_log1p_data } sv_log1p_data = { /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1]. */ - .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2, - 0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3, - -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4, - 0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4, - -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5, - 0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4, - -0x1.cfa7385bdb37ep-6 }, - .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 }, + .c0 = -0x1.ffffffffffffbp-2, + .c1 = 0x1.55555555551a9p-2, + .c2 = -0x1.00000000008e3p-2, + .c3 = 0x1.9999999a32797p-3, + .c4 = -0x1.555555552fecfp-3, + .c5 = 0x1.249248e071e5ap-3, + .c6 = -0x1.ffffff8bf8482p-4, + .c7 = 0x1.c71c8f07da57ap-4, + .c8 = -0x1.9999ca4ccb617p-4, + .c9 = 0x1.7459ad2e1dfa3p-4, + .c10 = -0x1.554d2680a3ff2p-4, + .c11 = 0x1.3b4c54d487455p-4, + .c12 = -0x1.2548a9ffe80e6p-4, + .c13 = 0x1.0f389a24b2e07p-4, + .c14 = -0x1.eee4db15db335p-5, + .c15 = 0x1.e95b494d4a5ddp-5, + .c16 = -0x1.15fdf07cb7c73p-4, + .c17 = 0x1.0310b70800fcfp-4, + .c18 = -0x1.cfa7385bdb37ep-6, + .ln2_lo = 0x1.62e42fefa3800p-1, + .ln2_hi = 0x1.ef35793c76730p-45, + /* top32(asuint64(sqrt(2)/2)) << 32. */ .hf_rt2_top = 0x3fe6a09e00000000, + /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. 
*/ .one_m_hf_rt2_top = 0x00095f6200000000, .bottom_mask = 0xffffffff, .one_top = 0x3ff @@ -51,14 +67,14 @@ static inline svfloat64_t sv_log1p_inline (svfloat64_t x, const svbool_t pg) { /* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which - differs from v_log1p_2u5.c by: + differs from advsimd/log1p.c by: - No special-case handling - this should be dealt with by the caller. - Pairwise Horner polynomial evaluation for improved accuracy. - Optionally simulate the shortcut for k=0, used in the scalar routine, using svsel, for improved accuracy when the argument to log1p is close to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1 in the source of the caller before including this file. - See sv_log1p_2u1.c for details of the algorithm. */ + See sve/log1p.c for details of the algorithm. */ const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data); svfloat64_t m = svadd_x (pg, x, 1); svuint64_t mi = svreinterpret_u64 (m); @@ -79,7 +95,7 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg) svfloat64_t cm; #ifndef WANT_SV_LOG1P_K0_SHORTCUT -#error \ +#error \ "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0" #elif WANT_SV_LOG1P_K0_SHORTCUT /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is @@ -96,14 +112,46 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg) #endif /* Approximate log1p(f) on the reduced input using a polynomial. */ - svfloat64_t f2 = svmul_x (pg, f, f); - svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly); + svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f), + f4 = svmul_x (svptrue_b64 (), f2, f2), + f8 = svmul_x (svptrue_b64 (), f4, f4), + f16 = svmul_x (svptrue_b64 (), f8, f8); + + svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1); + svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5); + svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9); + svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13); + svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17); + + /* Order-18 Estrin scheme. */ + svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1); + + svfloat64_t p03 = svmla_x (pg, p01, f2, p23); + svfloat64_t p47 = svmla_x (pg, p45, f2, p67); + svfloat64_t p07 = svmla_x (pg, p03, f4, p47); + + svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0); + svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1); + svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0); + svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1); + + svfloat64_t p811 = svmla_x (pg, p89, f2, p1011); + svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415); + svfloat64_t p815 = svmla_x (pg, p811, f4, p1215); + + svfloat64_t p015 = svmla_x (pg, p07, f8, p815); + svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0); + svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1); + svfloat64_t p = svmla_x (pg, p015, f16, p1618); /* Assemble log1p(x) = k * log2 + log1p(f) + c/m. 
*/ - svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]); - svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]); + svfloat64_t ln2_lo_hi = svld1rq (svptrue_b64 (), &d->ln2_lo); + svfloat64_t ylo = svmla_lane (cm, k, ln2_lo_hi, 0); + svfloat64_t yhi = svmla_lane (f, k, ln2_lo_hi, 1); - return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p); + return svmad_x (pg, p, f2, svadd_x (pg, ylo, yhi)); } - #endif diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c index 789cc68..5869419 100644 --- a/sysdeps/aarch64/fpu/tanh_sve.c +++ b/sysdeps/aarch64/fpu/tanh_sve.c @@ -18,83 +18,117 @@ <https://www.gnu.org/licenses/>. */ #include "sv_math.h" -#include "poly_sve_f64.h" static const struct data { - float64_t poly[11]; - float64_t inv_ln2, ln2_hi, ln2_lo, shift; - uint64_t thresh, tiny_bound; + double ln2_hi, ln2_lo; + double c2, c4; + double c0, c1, c3; + double two_over_ln2, shift; + uint64_t tiny_bound; + double large_bound, fexpa_bound; + uint64_t e2xm1_data[20]; } data = { - /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ - .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5, - 0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, - 0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16, - 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22, - 0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, }, - - .inv_ln2 = 0x1.71547652b82fep0, - .ln2_hi = -0x1.62e42fefa39efp-1, - .ln2_lo = -0x1.abc9e3b39803fp-56, - .shift = 0x1.8p52, - + /* Generated using Remez, in [-log(2)/128, log(2)/128]. */ + .c0 = 0x1p-1, + .c1 = 0x1.55555555548f9p-3, + .c2 = 0x1.5555555554c22p-5, + .c3 = 0x1.111123aaa2fb2p-7, + .c4 = 0x1.6c16d77d98e5bp-10, + .ln2_hi = 0x1.62e42fefa3800p-1, + .ln2_lo = 0x1.ef35793c76730p-45, + .two_over_ln2 = 0x1.71547652b82fep+1, + .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */ .tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */ - /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ - .thresh = 0x01f241bf835f9d5f, + .large_bound = 0x1.30fc1931f09cap+4, /* arctanh(1 - 2^-54). */ + .fexpa_bound = 0x1.a56ef8ec924ccp-4, /* 19/64 * ln2/2. */ + /* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */ + .e2xm1_data = { + 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901, + 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, + 0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, + 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a, + 0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7, + }, }; +/* An expm1 inspired, FEXPA based helper function that returns an + accurate estimate for e^2x - 1. With no special case or support for + negative inputs of x. */ static inline svfloat64_t -expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d) -{ - /* Helper routine for calculating exp(x) - 1. Vector port of the helper from - the scalar variant of tanh. */ - - /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ - svfloat64_t j - = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift); - svint64_t i = svcvt_s64_x (pg, j); - svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi); - f = svmla_x (pg, f, j, d->ln2_lo); - - /* Approximate expm1(f) using polynomial. */ - svfloat64_t f2 = svmul_x (pg, f, f); - svfloat64_t f4 = svmul_x (pg, f2, f2); - svfloat64_t p = svmla_x ( - pg, f, f2, - sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly)); - - /* t = 2 ^ i. 
*/ - svfloat64_t t = svscale_x (pg, sv_f64 (1), i); - /* expm1(x) = p * t + (t - 1). */ - return svmla_x (pg, svsub_x (pg, t, 1), p, t); -} - -static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +e2xm1_inline (const svbool_t pg, svfloat64_t x, const struct data *d) { - return sv_call_f64 (tanh, x, y, special); + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->two_over_ln2); + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t n = svsub_x (pg, z, d->shift); + + /* r = x - n * ln2/2, r is in [-ln2/(2N), ln2/(2N)]. */ + svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi); + svfloat64_t r = svadd_x (pg, x, x); + r = svmls_lane (r, n, ln2, 0); + r = svmls_lane (r, n, ln2, 1); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2); + + svfloat64_t p; + svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0); + svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1); + p = svmad_x (pg, c34, r2, c12); + p = svmad_x (pg, p, r, sv_f64 (d->c0)); + p = svmad_x (pg, p, r2, r); + + svfloat64_t scale = svexpa (u); + + /* We want to construct e2xm1(x) = (scale - 1) + scale * poly. + However, for values of scale close to 1, scale-1 causes large ULP errors + due to cancellation. + + This can be circumvented by using a small lookup for scale-1 + when our input is below a certain bound, otherwise we can use FEXPA. */ + svbool_t is_small = svaclt (pg, x, d->fexpa_bound); + + /* Index via the input of FEXPA, but we only care about the lower 5 bits. */ + svuint64_t base_idx = svand_x (pg, u, 0x1f); + + /* Compute scale - 1 from FEXPA, and lookup values where this fails. */ + svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0)); + svuint64_t scalem1_lookup + = svld1_gather_index (is_small, d->e2xm1_data, base_idx); + + /* Select the appropriate scale - 1 value based on x. */ + svfloat64_t scalem1 + = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate); + return svmla_x (pg, scalem1, scale, p); } -/* SVE approximation for double-precision tanh(x), using a simplified - version of expm1. The greatest observed error is 2.77 ULP: - _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 - want -0x1.bd6a21a163624p-3. */ +/* SVE approximation for double-precision tanh(x), using a modified version of + FEXPA expm1 to calculate e^2x - 1. + The greatest observed error is 2.79 + 0.5 ULP: + _ZGVsMxv_tanh (0x1.fff868eb3c223p-9) got 0x1.fff7be486cae6p-9 + want 0x1.fff7be486cae9p-9. */ svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg) { const struct data *d = ptr_barrier (&data); - svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x)); + svbool_t large = svacge (pg, x, d->large_bound); - /* Trigger special-cases for tiny, boring and infinity/NaN. */ - svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh); + /* We can use tanh(x) = (e^2x - 1) / (e^2x + 1) to approximate tanh. + As an additional optimisation, we can ensure more accurate values of e^x + by only using positive inputs. So we calculate tanh(|x|), and restore the + sign of the input before returning. */ + svfloat64_t ax = svabs_x (pg, x); + svuint64_t sign_bit + = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax)); - svfloat64_t u = svadd_x (pg, x, x); + svfloat64_t p = e2xm1_inline (pg, ax, d); + svfloat64_t q = svadd_x (pg, p, 2); - /* tanh(x) = (e^2x - 1) / (e^2x + 1). 
*/ - svfloat64_t q = expm1_inline (u, pg, d); - svfloat64_t qp2 = svadd_x (pg, q, 2); + /* For sufficiently high inputs, the result of tanh(|x|) is 1 when correctly + rounded, at this point we can return 1 directly, with sign correction. + This will also act as a guard against our approximation overflowing. */ + svfloat64_t y = svsel (large, sv_f64 (1.0), svdiv_x (pg, p, q)); - if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svdiv_x (pg, q, qp2), special); - return svdiv_x (pg, q, qp2); + return svreinterpret_f64 (svorr_x (pg, sign_bit, svreinterpret_u64 (y))); } diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c index 07133eb..a3fef22 100644 --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c @@ -25,11 +25,15 @@ VPCS_VECTOR_WRAPPER (acos_advsimd, _ZGVnN2v_acos) VPCS_VECTOR_WRAPPER (acosh_advsimd, _ZGVnN2v_acosh) +VPCS_VECTOR_WRAPPER (acospi_advsimd, _ZGVnN2v_acospi) VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin) VPCS_VECTOR_WRAPPER (asinh_advsimd, _ZGVnN2v_asinh) +VPCS_VECTOR_WRAPPER (asinpi_advsimd, _ZGVnN2v_asinpi) VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan) VPCS_VECTOR_WRAPPER (atanh_advsimd, _ZGVnN2v_atanh) +VPCS_VECTOR_WRAPPER (atanpi_advsimd, _ZGVnN2v_atanpi) VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2) +VPCS_VECTOR_WRAPPER_ff (atan2pi_advsimd, _ZGVnN2vv_atan2pi) VPCS_VECTOR_WRAPPER (cbrt_advsimd, _ZGVnN2v_cbrt) VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos) VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh) diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c index 02953cb..f4a5ae8 100644 --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c +++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c @@ -44,11 +44,15 @@ SVE_VECTOR_WRAPPER (acos_sve, _ZGVsMxv_acos) SVE_VECTOR_WRAPPER (acosh_sve, _ZGVsMxv_acosh) +SVE_VECTOR_WRAPPER (acospi_sve, _ZGVsMxv_acospi) SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin) SVE_VECTOR_WRAPPER (asinh_sve, _ZGVsMxv_asinh) +SVE_VECTOR_WRAPPER (asinpi_sve, _ZGVsMxv_asinpi) SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan) SVE_VECTOR_WRAPPER (atanh_sve, _ZGVsMxv_atanh) +SVE_VECTOR_WRAPPER (atanpi_sve, _ZGVsMxv_atanpi) SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2) +SVE_VECTOR_WRAPPER_ff (atan2pi_sve, _ZGVsMxvv_atan2pi) SVE_VECTOR_WRAPPER (cbrt_sve, _ZGVsMxv_cbrt) SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos) SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh) diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c index 118bbb0..bc22956 100644 --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c @@ -25,11 +25,15 @@ VPCS_VECTOR_WRAPPER (acosf_advsimd, _ZGVnN4v_acosf) VPCS_VECTOR_WRAPPER (acoshf_advsimd, _ZGVnN4v_acoshf) +VPCS_VECTOR_WRAPPER (acospif_advsimd, _ZGVnN4v_acospif) VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf) VPCS_VECTOR_WRAPPER (asinhf_advsimd, _ZGVnN4v_asinhf) +VPCS_VECTOR_WRAPPER (asinpif_advsimd, _ZGVnN4v_asinpif) VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf) VPCS_VECTOR_WRAPPER (atanhf_advsimd, _ZGVnN4v_atanhf) +VPCS_VECTOR_WRAPPER (atanpif_advsimd, _ZGVnN4v_atanpif) VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f) +VPCS_VECTOR_WRAPPER_ff (atan2pif_advsimd, _ZGVnN4vv_atan2pif) VPCS_VECTOR_WRAPPER (cbrtf_advsimd, _ZGVnN4v_cbrtf) VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf) VPCS_VECTOR_WRAPPER 
(coshf_advsimd, _ZGVnN4v_coshf) diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c index f5e7c8c..ad0d6ad 100644 --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c @@ -44,11 +44,15 @@ SVE_VECTOR_WRAPPER (acosf_sve, _ZGVsMxv_acosf) SVE_VECTOR_WRAPPER (acoshf_sve, _ZGVsMxv_acoshf) +SVE_VECTOR_WRAPPER (acospif_sve, _ZGVsMxv_acospif) SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf) SVE_VECTOR_WRAPPER (asinhf_sve, _ZGVsMxv_asinhf) +SVE_VECTOR_WRAPPER (asinpif_sve, _ZGVsMxv_asinpif) SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf) SVE_VECTOR_WRAPPER (atanhf_sve, _ZGVsMxv_atanhf) +SVE_VECTOR_WRAPPER (atanpif_sve, _ZGVsMxv_atanpif) SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f) +SVE_VECTOR_WRAPPER_ff (atan2pif_sve, _ZGVsMxvv_atan2pif) SVE_VECTOR_WRAPPER (cbrtf_sve, _ZGVsMxv_cbrtf) SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf) SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf) diff --git a/sysdeps/aarch64/machine-gmon.h b/sysdeps/aarch64/machine-gmon.h index eba7c24..05323c9 100644 --- a/sysdeps/aarch64/machine-gmon.h +++ b/sysdeps/aarch64/machine-gmon.h @@ -27,9 +27,8 @@ static void mcount_internal (u_long frompc, u_long selfpc); #define _MCOUNT_DECL(frompc, selfpc) \ static inline void mcount_internal (u_long frompc, u_long selfpc) -/* Note: strip_pac is needed for frompc because of gcc PR target/94791. */ #define MCOUNT \ void __mcount (void *frompc) \ { \ - mcount_internal ((u_long) strip_pac (frompc), (u_long) RETURN_ADDRESS (0)); \ + mcount_internal ((u_long) frompc, (u_long) RETURN_ADDRESS (0)); \ } diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index 8dc314b..0e26171 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -36,18 +36,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/aarch64/multiarch/memcpy.c, memmove.c and memset.c. 
*/ IFUNC_IMPL (i, name, memcpy, IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_oryon1) -#if HAVE_AARCH64_SVE_ASM - IFUNC_IMPL_ADD (array, i, memcpy, sve && !bti, __memcpy_a64fx) + IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx) IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve) -#endif IFUNC_IMPL_ADD (array, i, memcpy, mops, __memcpy_mops) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic)) IFUNC_IMPL (i, name, memmove, IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_oryon1) -#if HAVE_AARCH64_SVE_ASM - IFUNC_IMPL_ADD (array, i, memmove, sve && !bti, __memmove_a64fx) + IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx) IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve) -#endif IFUNC_IMPL_ADD (array, i, memmove, mops, __memmove_mops) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic)) IFUNC_IMPL (i, name, memset, @@ -55,10 +51,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_oryon1) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng) -#if HAVE_AARCH64_SVE_ASM - IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx) + IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 256, __memset_a64fx) IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64) -#endif IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) IFUNC_IMPL (i, name, memchr, diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h index 63c24e7..75b3e08 100644 --- a/sysdeps/aarch64/multiarch/init-arch.h +++ b/sysdeps/aarch64/multiarch/init-arch.h @@ -31,7 +31,7 @@ unsigned __attribute__((unused)) zva_size = \ GLRO(dl_aarch64_cpu_features).zva_size; \ bool __attribute__((unused)) bti = \ - HAVE_AARCH64_BTI && GLRO(dl_aarch64_cpu_features).bti; \ + GLRO(dl_aarch64_cpu_features).bti; \ bool __attribute__((unused)) mte = \ MTE_ENABLED (); \ bool __attribute__((unused)) sve = \ diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c index 0e33d19..894dabe 100644 --- a/sysdeps/aarch64/multiarch/memcpy.c +++ b/sysdeps/aarch64/multiarch/memcpy.c @@ -43,7 +43,7 @@ select_memcpy_ifunc (void) if (mops) return __memcpy_mops; - if (sve && HAVE_AARCH64_SVE_ASM) + if (sve) { if (IS_A64FX (midr)) return __memcpy_a64fx; diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S index ed18682..acad6e8 100644 --- a/sysdeps/aarch64/multiarch/memcpy_a64fx.S +++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S @@ -19,9 +19,6 @@ #include <sysdep.h> -#undef BTI_C -#define BTI_C - /* Assumptions: * * ARMv8.2-a, AArch64, unaligned accesses, sve @@ -38,8 +35,6 @@ #define vlen x7 #define vlen8 x8 -#if HAVE_AARCH64_SVE_ASM - .arch armv8.2-a+sve .macro ld1b_unroll8 @@ -91,9 +86,6 @@ st1b z7.b, p0, [dst, 7, mul vl] .endm -#undef BTI_C -#define BTI_C - ENTRY (__memcpy_a64fx) cntb vlen @@ -296,4 +288,3 @@ L(full_overlap): b L(last_bytes) END (__memmove_a64fx) -#endif /* HAVE_AARCH64_SVE_ASM */ diff --git a/sysdeps/aarch64/multiarch/memcpy_sve.S b/sysdeps/aarch64/multiarch/memcpy_sve.S index 26d4890..0ba6358 100644 --- a/sysdeps/aarch64/multiarch/memcpy_sve.S +++ b/sysdeps/aarch64/multiarch/memcpy_sve.S @@ -56,8 +56,6 @@ The loop tail is handled by always copying 64 bytes from the end. 
*/ -#if HAVE_AARCH64_SVE_ASM - .arch armv8.2-a+sve ENTRY (__memcpy_sve) @@ -199,4 +197,3 @@ L(return): ret END (__memmove_sve) -#endif diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c index 47b7268..6b0d0ce 100644 --- a/sysdeps/aarch64/multiarch/memmove.c +++ b/sysdeps/aarch64/multiarch/memmove.c @@ -41,7 +41,7 @@ select_memmove_ifunc (void) if (mops) return __memmove_mops; - if (sve && HAVE_AARCH64_SVE_ASM) + if (sve) { if (IS_A64FX (midr)) return __memmove_a64fx; diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c index 872f39f..2b0a58b 100644 --- a/sysdeps/aarch64/multiarch/memset.c +++ b/sysdeps/aarch64/multiarch/memset.c @@ -46,7 +46,7 @@ select_memset_ifunc (void) if (mops) return __memset_mops; - if (sve && HAVE_AARCH64_SVE_ASM) + if (sve) { if (IS_A64FX (midr) && zva_size == 256) return __memset_a64fx; diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S index ea60b78..e921240 100644 --- a/sysdeps/aarch64/multiarch/memset_a64fx.S +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S @@ -31,8 +31,6 @@ #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1 #define vector_length x9 -#if HAVE_AARCH64_SVE_ASM - .arch armv8.2-a+sve #define dstin x0 @@ -50,10 +48,6 @@ .endif .endm - -#undef BTI_C -#define BTI_C - ENTRY (__memset_a64fx) cntb vector_length @@ -170,5 +164,3 @@ L(L2): b L(last) END (__memset_a64fx) - -#endif /* HAVE_AARCH64_SVE_ASM */ diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S index 7fb40fd..c385e1a 100644 --- a/sysdeps/aarch64/multiarch/memset_sve_zva64.S +++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S @@ -25,8 +25,6 @@ * ZVA size is 64. */ -#if HAVE_AARCH64_SVE_ASM - .arch armv8.2-a+sve #define dstin x0 @@ -120,4 +118,3 @@ L(no_zva_loop): ret END (__memset_sve_zva64) -#endif diff --git a/sysdeps/aarch64/preconfigure b/sysdeps/aarch64/preconfigure index 19657b6..e1b772c 100644 --- a/sysdeps/aarch64/preconfigure +++ b/sysdeps/aarch64/preconfigure @@ -3,5 +3,6 @@ aarch64*) base_machine=aarch64 machine=aarch64 mtls_descriptor=desc + mtls_traditional=trad ;; esac diff --git a/sysdeps/aarch64/setjmp.S b/sysdeps/aarch64/setjmp.S index d82d62c..53c5e7d 100644 --- a/sysdeps/aarch64/setjmp.S +++ b/sysdeps/aarch64/setjmp.S @@ -35,6 +35,20 @@ libc_hidden_def (_setjmp) ENTRY_ALIGN (__sigsetjmp, 2) 1: + +#if IS_IN(libc) + /* Disable ZA state of SME in libc.a and libc.so, but not in ld.so. + The calling convention of __libc_arm_za_disable allows to do + this thus allowing to avoid saving to and reading from stack. + As a result we also don't need to sign the return address and + check it after returning because it is not stored to stack. */ + mov x13, x30 + cfi_register (x30, x13) + bl __libc_arm_za_disable + mov x30, x13 + cfi_register (x13, x30) +#endif + stp x19, x20, [x0, #JB_X19<<3] stp x21, x22, [x0, #JB_X21<<3] stp x23, x24, [x0, #JB_X23<<3] @@ -73,7 +87,7 @@ L(gcs_done): #if IS_IN (rtld) /* In ld.so we never save the signal mask */ mov w0, #0 - RET + ret #else b C_SYMBOL_NAME(__sigjmp_save) #endif diff --git a/sysdeps/aarch64/start.S b/sysdeps/aarch64/start.S index 544e397..694c338 100644 --- a/sysdeps/aarch64/start.S +++ b/sysdeps/aarch64/start.S @@ -108,7 +108,7 @@ ENTRY(_start) because crt1.o and rcrt1.o share code and the later must avoid the use of GOT relocations before __libc_start_main is called. 
*/ __wrap_main: - BTI_C + bti c b main #endif END(_start) diff --git a/sysdeps/aarch64/sys/ifunc.h b/sysdeps/aarch64/sys/ifunc.h index 7781b37..a3322a9 100644 --- a/sysdeps/aarch64/sys/ifunc.h +++ b/sysdeps/aarch64/sys/ifunc.h @@ -19,24 +19,77 @@ #ifndef _SYS_IFUNC_H #define _SYS_IFUNC_H +#include <sys/cdefs.h> + /* A second argument is passed to the ifunc resolver. */ #define _IFUNC_ARG_HWCAP (1ULL << 62) -/* The prototype of a gnu indirect function resolver on AArch64 is +/* Maximum number of HWCAP elements that are currently supported. */ +#define _IFUNC_HWCAP_MAX 4 + +/* The prototype of a GNU indirect function resolver on AArch64 is + + ElfW(Addr) ifunc_resolver (uint64_t, const uint64_t *); + + The following prototype is also compatible: ElfW(Addr) ifunc_resolver (uint64_t, const __ifunc_arg_t *); - the first argument should have the _IFUNC_ARG_HWCAP bit set and - the remaining bits should match the AT_HWCAP settings. */ + The first argument might have the _IFUNC_ARG_HWCAP bit set and + the remaining bits should match the AT_HWCAP settings. + + If the _IFUNC_ARG_HWCAP bit is set in the first argument, then + the second argument is passed to the resolver function. In + this case, the second argument is a const pointer to a buffer + that allows to access all available HWCAP elements. + + This buffer has its size in bytes at offset 0. The HWCAP elements + are available at offsets 8, 16, 24, 32... respectively for AT_HWCAP, + AT_HWCAP2, AT_HWCAP3, AT_HWCAP4... (these offsets are multiples of + sizeof (unsigned long)). + + Indirect function resolvers must check availability of HWCAP + elements at runtime before accessing them using the size of the + buffer. */ -/* Second argument to an ifunc resolver. */ struct __ifunc_arg_t { - unsigned long _size; /* Size of the struct, so it can grow. */ + unsigned long _size; /* Size of the struct, so it can grow. */ unsigned long _hwcap; - unsigned long _hwcap2; + unsigned long _hwcap2; /* End of 1st published struct. */ + unsigned long _hwcap3; + unsigned long _hwcap4; /* End of 2nd published struct. */ }; typedef struct __ifunc_arg_t __ifunc_arg_t; +/* Constants for IDs of HWCAP elements to be used with the + __ifunc_hwcap function below. */ +enum +{ + _IFUNC_ARG_AT_HWCAP = 1, + _IFUNC_ARG_AT_HWCAP2 = 2, + _IFUNC_ARG_AT_HWCAP3 = 3, + _IFUNC_ARG_AT_HWCAP4 = 4, +}; + +/* A helper function to obtain HWCAP element by its ID from the + parameters ARG0 and ARG1 passed to the ifunc resolver. Note that + ID 1 corresponds to AT_HWCAP, ID 2 corresponds to AT_HWCAP2, etc. + If there is no element available for the requested ID then 0 is + returned. If ID doesn't much any supported AT_HWCAP{,2,...} value, + then 0 is also returned. */ +static __inline unsigned long __attribute__ ((unused, always_inline)) +__ifunc_hwcap (unsigned long __id, + unsigned long __arg0, const unsigned long *__arg1) +{ + if (__glibc_likely (__arg0 & _IFUNC_ARG_HWCAP)) + { + const unsigned long size = __arg1[0]; + const unsigned long offset = __id * sizeof (unsigned long); + return offset < size && __id > 0 ? __arg1[__id] : 0; + } + return __id == 1 ? __arg0 : 0; +} + #endif diff --git a/sysdeps/aarch64/sysdep.h b/sysdeps/aarch64/sysdep.h index 9424115..f5e28cb 100644 --- a/sysdeps/aarch64/sysdep.h +++ b/sysdeps/aarch64/sysdep.h @@ -21,43 +21,15 @@ #include <sysdeps/generic/sysdep.h> -#ifndef __ASSEMBLER__ -/* Strip pointer authentication code from pointer p. 
*/ -static inline void * -strip_pac (void *p) -{ - register void *ra asm ("x30") = (p); - asm ("hint 7 // xpaclri" : "+r"(ra)); - return ra; -} - -/* This is needed when glibc is built with -mbranch-protection=pac-ret - with a gcc that is affected by PR target/94891. */ -# if HAVE_AARCH64_PAC_RET -# undef RETURN_ADDRESS -# define RETURN_ADDRESS(n) strip_pac (__builtin_return_address (n)) -# endif -#endif - #ifdef __ASSEMBLER__ +/* CFI directive for return address. */ +#define cfi_negate_ra_state .cfi_negate_ra_state + /* Syntactic details of assembler. */ #define ASM_SIZE_DIRECTIVE(name) .size name,.-name -/* Branch Target Identitication support. */ -#if HAVE_AARCH64_BTI -# define BTI_C hint 34 -# define BTI_J hint 36 -#else -# define BTI_C nop -# define BTI_J nop -#endif - -/* Return address signing support (pac-ret). */ -#define PACIASP hint 25 -#define AUTIASP hint 29 - /* Guarded Control Stack support. */ #define CHKFEAT_X16 hint 40 #define MRS_GCSPR(x) mrs x, s3_3_c2_c5_1 @@ -87,11 +59,7 @@ strip_pac (void *p) /* Add GNU property note with the supported features to all asm code where sysdep.h is included. */ -#if HAVE_AARCH64_BTI && HAVE_AARCH64_PAC_RET GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC|FEATURE_1_GCS) -#elif HAVE_AARCH64_BTI -GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_GCS) -#endif /* Define an entry point visible from C. */ #define ENTRY(name) \ @@ -100,7 +68,7 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_GCS) .p2align 6; \ C_LABEL(name) \ cfi_startproc; \ - BTI_C; \ + bti c; \ CALL_MCOUNT /* Define an entry point visible from C. */ @@ -110,7 +78,7 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_GCS) .p2align align; \ C_LABEL(name) \ cfi_startproc; \ - BTI_C; \ + bti c; \ CALL_MCOUNT /* Define an entry point visible from C with a specified alignment and @@ -127,7 +95,7 @@ GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_GCS) .endr; \ C_LABEL(name) \ cfi_startproc; \ - BTI_C; \ + bti c; \ CALL_MCOUNT #undef END diff --git a/sysdeps/aarch64/tst-ifunc-arg-1.c b/sysdeps/aarch64/tst-ifunc-arg-1.c index b90c836..292c5ae 100644 --- a/sysdeps/aarch64/tst-ifunc-arg-1.c +++ b/sysdeps/aarch64/tst-ifunc-arg-1.c @@ -57,6 +57,21 @@ do_test (void) TEST_COMPARE (saved_arg2._size, sizeof (__ifunc_arg_t)); TEST_COMPARE (saved_arg2._hwcap, getauxval (AT_HWCAP)); TEST_COMPARE (saved_arg2._hwcap2, getauxval (AT_HWCAP2)); + TEST_COMPARE (saved_arg2._hwcap3, getauxval (AT_HWCAP3)); + TEST_COMPARE (saved_arg2._hwcap4, getauxval (AT_HWCAP4)); + + const unsigned long *saved_arg2_ptr = (const unsigned long *)&saved_arg2; + + TEST_COMPARE (__ifunc_hwcap (1, saved_arg1, saved_arg2_ptr), + getauxval (AT_HWCAP)); + TEST_COMPARE (__ifunc_hwcap (2, saved_arg1, saved_arg2_ptr), + getauxval (AT_HWCAP2)); + TEST_COMPARE (__ifunc_hwcap (3, saved_arg1, saved_arg2_ptr), + getauxval (AT_HWCAP3)); + TEST_COMPARE (__ifunc_hwcap (4, saved_arg1, saved_arg2_ptr), + getauxval (AT_HWCAP4)); + + return 0; } diff --git a/sysdeps/aarch64/tst-ifunc-arg-2.c b/sysdeps/aarch64/tst-ifunc-arg-2.c index dac144d..c05129a 100644 --- a/sysdeps/aarch64/tst-ifunc-arg-2.c +++ b/sysdeps/aarch64/tst-ifunc-arg-2.c @@ -60,6 +60,20 @@ do_test (void) TEST_COMPARE (saved_arg2._size, sizeof (__ifunc_arg_t)); TEST_COMPARE (saved_arg2._hwcap, getauxval (AT_HWCAP)); TEST_COMPARE (saved_arg2._hwcap2, getauxval (AT_HWCAP2)); + TEST_COMPARE (saved_arg2._hwcap3, getauxval (AT_HWCAP3)); + TEST_COMPARE (saved_arg2._hwcap4, getauxval (AT_HWCAP4)); + + const unsigned long *saved_arg2_ptr = (const unsigned long 
*)&saved_arg2; + + TEST_COMPARE (__ifunc_hwcap (1, saved_arg1, saved_arg2_ptr), + getauxval (AT_HWCAP)); + TEST_COMPARE (__ifunc_hwcap (2, saved_arg1, saved_arg2_ptr), + getauxval (AT_HWCAP2)); + TEST_COMPARE (__ifunc_hwcap (3, saved_arg1, saved_arg2_ptr), + getauxval (AT_HWCAP3)); + TEST_COMPARE (__ifunc_hwcap (4, saved_arg1, saved_arg2_ptr), + getauxval (AT_HWCAP4)); + return 0; } diff --git a/sysdeps/aarch64/tst-ifunc-arg-3.c b/sysdeps/aarch64/tst-ifunc-arg-3.c new file mode 100644 index 0000000..49d8866 --- /dev/null +++ b/sysdeps/aarch64/tst-ifunc-arg-3.c @@ -0,0 +1,97 @@ +/* Tests for __ifunc_hwcap helper function. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <stdint.h> +#include <sys/ifunc.h> +#include <support/check.h> + +#define CHECK_VALUES_WITH_ARG(p1, p2, p3, p4) \ + ({ \ + TEST_COMPARE (__ifunc_hwcap (0, _IFUNC_ARG_HWCAP, arg), 0); \ + TEST_COMPARE (__ifunc_hwcap (_IFUNC_ARG_AT_HWCAP, _IFUNC_ARG_HWCAP, arg), p1); \ + TEST_COMPARE (__ifunc_hwcap (_IFUNC_ARG_AT_HWCAP2, _IFUNC_ARG_HWCAP, arg), p2); \ + TEST_COMPARE (__ifunc_hwcap (_IFUNC_ARG_AT_HWCAP3, _IFUNC_ARG_HWCAP, arg), p3); \ + TEST_COMPARE (__ifunc_hwcap (_IFUNC_ARG_AT_HWCAP4, _IFUNC_ARG_HWCAP, arg), p4); \ + TEST_COMPARE (__ifunc_hwcap (5, _IFUNC_ARG_HWCAP, arg), 0); \ + }) + +#define CHECK_VALUES_WITHOUT_ARG(p1) \ + ({ \ + TEST_COMPARE (__ifunc_hwcap (0, p1, arg), 0); \ + TEST_COMPARE (__ifunc_hwcap (_IFUNC_ARG_AT_HWCAP, p1, arg), p1); \ + TEST_COMPARE (__ifunc_hwcap (_IFUNC_ARG_AT_HWCAP2, p1, arg), 0); \ + TEST_COMPARE (__ifunc_hwcap (_IFUNC_ARG_AT_HWCAP3, p1, arg), 0); \ + TEST_COMPARE (__ifunc_hwcap (_IFUNC_ARG_AT_HWCAP4, p1, arg), 0); \ + TEST_COMPARE (__ifunc_hwcap (5, p1, arg), 0); \ + }) + +static void +test_one (const unsigned long *arg) +{ + uint64_t size = arg[0] / sizeof (uint64_t); + + switch (size) + { + case 1: + CHECK_VALUES_WITH_ARG (0, 0, 0, 0); + CHECK_VALUES_WITHOUT_ARG (0); + break; + case 2: + CHECK_VALUES_WITH_ARG (1, 0, 0, 0); + CHECK_VALUES_WITHOUT_ARG (1); + break; + case 3: + CHECK_VALUES_WITH_ARG (1, 2, 0, 0); + CHECK_VALUES_WITHOUT_ARG (1); + break; + case 4: + CHECK_VALUES_WITH_ARG (1, 2, 3, 0); + CHECK_VALUES_WITHOUT_ARG (1); + break; + case 5: + CHECK_VALUES_WITH_ARG (1, 2, 3, 4); + CHECK_VALUES_WITHOUT_ARG (1); + break; + default: + TEST_VERIFY (0); // unexpected size + break; + } +} + +static int +do_test (void) +{ + uint64_t arg[_IFUNC_HWCAP_MAX + 1] = { + 0, /* Placeholder for size */ + _IFUNC_ARG_AT_HWCAP, /* AT_HWCAP */ + _IFUNC_ARG_AT_HWCAP2, /* AT_HWCAP2 */ + _IFUNC_ARG_AT_HWCAP3, /* AT_HWCAP3 */ + _IFUNC_ARG_AT_HWCAP4, /* AT_HWCAP4 */ + }; + + for (int k = 0; k <= _IFUNC_HWCAP_MAX; k++) + { + /* Update size */ + arg[0] = (k + 1) * sizeof (uint64_t); + test_one (arg); + } + + return 0; +} + +#include <support/test-driver.c> diff --git 
a/sysdeps/aarch64/tst-ifunc-arg-4.c b/sysdeps/aarch64/tst-ifunc-arg-4.c new file mode 100644 index 0000000..c95ef9e --- /dev/null +++ b/sysdeps/aarch64/tst-ifunc-arg-4.c @@ -0,0 +1,67 @@ +/* Test for ifunc resolver that uses __ifunc_hwcap helper function. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <stdint.h> +#include <sys/auxv.h> +#include <sys/ifunc.h> +#include <support/check.h> + +static int +one (void) +{ + return 1; +} + +static int +two (void) +{ + return 2; +} + +/* Resolver function. */ +static void * +resolver (uint64_t arg0, const uint64_t arg1[]) +{ + uint64_t hwcap2 = __ifunc_hwcap (_IFUNC_ARG_AT_HWCAP2, arg0, arg1); + if (hwcap2 & HWCAP2_POE) + return (void *)one; + else + return (void *)two; +} + +/* An extern visible ifunc symbol. */ +int fun (void) __attribute__((ifunc ("resolver"))); + +static int +do_test (void) +{ + if (getauxval (AT_HWCAP2) & HWCAP2_POE) + { + printf ("using 1st implementation\n"); + TEST_VERIFY (fun () == 1); + } + else + { + printf ("using 2nd implementation\n"); + TEST_VERIFY (fun () == 2); + } + return 0; +} + +#include <support/test-driver.c> diff --git a/sysdeps/aarch64/tst-sme-helper.h b/sysdeps/aarch64/tst-sme-helper.h new file mode 100644 index 0000000..f049416 --- /dev/null +++ b/sysdeps/aarch64/tst-sme-helper.h @@ -0,0 +1,97 @@ +/* Utility functions for SME tests. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Streaming SVE vector register size. */ +static unsigned long svl; + +struct blk { + void *za_save_buffer; + uint16_t num_za_save_slices; + char __reserved[6]; +}; + +/* Read SVCR to get SM (bit0) and ZA (bit1) state. */ +static unsigned long +get_svcr (void) +{ + register unsigned long x0 asm ("x0"); + asm volatile ( + ".inst 0xd53b4240 /* mrs x0, svcr */\n" + : "=r" (x0)); + return x0; +} + +/* Returns tpidr2. */ +static void * +get_tpidr2 (void) +{ + register unsigned long x0 asm ("x0"); + asm volatile ( + ".inst 0xd53bd0a0 /* mrs x0, tpidr2_el0 */\n" + : "=r"(x0) :: "memory"); + return (void *) x0; +} + +/* Obtains current streaming SVE vector register size. 
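For comparison with the array-style resolver in tst-ifunc-arg-4.c just above, here is a hedged sketch of the same selection written against the struct prototype that sys/ifunc.h documents as equally valid; it assumes the includes and the one/two functions from that test. _hwcap2 belongs to the first published layout, so once _IFUNC_ARG_HWCAP is set no additional size check is required:

static void *
resolver_struct_form (uint64_t arg0, const __ifunc_arg_t *arg)
{
  uint64_t hwcap2 = (arg0 & _IFUNC_ARG_HWCAP) ? arg->_hwcap2 : 0;
  return (hwcap2 & HWCAP2_POE) ? (void *) one : (void *) two;
}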
*/ +static unsigned long +get_svl (void) +{ + register unsigned long x0 asm ("x0"); + asm volatile ( + ".inst 0x04bf5820 /* rdsvl x0, 1 */\n" + : "=r" (x0)); + return x0; +} + +/* PSTATE.ZA = 1, set ZA state to active. */ +static void +start_za (void) +{ + asm volatile ( + ".inst 0xd503457f /* smstart za */"); +} + +/* Load data into ZA byte by byte from p. */ +static void __attribute__ ((noinline)) +load_za (const void *p) +{ + register unsigned long x15 asm ("x15") = 0; + register unsigned long x16 asm ("x16") = (unsigned long)p; + register unsigned long x17 asm ("x17") = svl; + + asm volatile ( + ".inst 0xd503437f /* smstart sm */\n" + ".L_ldr_loop:\n" + ".inst 0xe1006200 /* ldr za[w15, 0], [x16] */\n" + "add w15, w15, 1\n" + ".inst 0x04305030 /* addvl x16, x16, 1 */\n" + "cmp w15, w17\n" + "bne .L_ldr_loop\n" + ".inst 0xd503427f /* smstop sm */\n" + : "+r"(x15), "+r"(x16), "+r"(x17)); +} + +/* Set tpidr2 to BLK. */ +static void +set_tpidr2 (struct blk *blk) +{ + register unsigned long x0 asm ("x0") = (unsigned long)blk; + asm volatile ( + ".inst 0xd51bd0a0 /* msr tpidr2_el0, x0 */\n" + :: "r"(x0) : "memory"); +} diff --git a/sysdeps/aarch64/tst-sme-jmp.c b/sysdeps/aarch64/tst-sme-jmp.c index 62c419f..103897a 100644 --- a/sysdeps/aarch64/tst-sme-jmp.c +++ b/sysdeps/aarch64/tst-sme-jmp.c @@ -27,87 +27,12 @@ #include <support/support.h> #include <support/test-driver.h> -struct blk { - void *za_save_buffer; - uint16_t num_za_save_slices; - char __reserved[6]; -}; +#include "tst-sme-helper.h" -static unsigned long svl; static uint8_t *za_orig; static uint8_t *za_dump; static uint8_t *za_save; -static unsigned long -get_svl (void) -{ - register unsigned long x0 asm ("x0"); - asm volatile ( - ".inst 0x04bf5820 /* rdsvl x0, 1 */\n" - : "=r" (x0)); - return x0; -} - -/* PSTATE.ZA = 1, set ZA state to active. */ -static void -start_za (void) -{ - asm volatile ( - ".inst 0xd503457f /* smstart za */"); -} - -/* Read SVCR to get SM (bit0) and ZA (bit1) state. */ -static unsigned long -get_svcr (void) -{ - register unsigned long x0 asm ("x0"); - asm volatile ( - ".inst 0xd53b4240 /* mrs x0, svcr */\n" - : "=r" (x0)); - return x0; -} - -/* Load data into ZA byte by byte from p. */ -static void __attribute__ ((noinline)) -load_za (const void *p) -{ - register unsigned long x15 asm ("x15") = 0; - register unsigned long x16 asm ("x16") = (unsigned long)p; - register unsigned long x17 asm ("x17") = svl; - - asm volatile ( - ".inst 0xd503437f /* smstart sm */\n" - ".L_ldr_loop:\n" - ".inst 0xe1006200 /* ldr za[w15, 0], [x16] */\n" - "add w15, w15, 1\n" - ".inst 0x04305030 /* addvl x16, x16, 1 */\n" - "cmp w15, w17\n" - "bne .L_ldr_loop\n" - ".inst 0xd503427f /* smstop sm */\n" - : "+r"(x15), "+r"(x16), "+r"(x17)); -} - -/* Set tpidr2 to BLK. */ -static void -set_tpidr2 (struct blk *blk) -{ - register unsigned long x0 asm ("x0") = (unsigned long)blk; - asm volatile ( - ".inst 0xd51bd0a0 /* msr tpidr2_el0, x0 */\n" - :: "r"(x0) : "memory"); -} - -/* Returns tpidr2. 
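The helpers collected in tst-sme-helper.h implement the SME lazy-save protocol used by the tests: TPIDR2_EL0 points at a struct blk describing a save buffer of svl x svl bytes, after which ZA is enabled and filled. A sketch of how they combine, mirroring the new tst-sme-za-state.c further down (make_za_state_live is a made-up wrapper name, and svl, struct blk and the helpers are assumed from tst-sme-helper.h):

static void
make_za_state_live (uint8_t *save_buffer, struct blk *b)
{
  b->za_save_buffer = save_buffer;
  b->num_za_save_slices = svl;   /* One save slice per streaming vector.  */
  set_tpidr2 (b);                /* Publish the lazy-save block.  */
  start_za ();                   /* PSTATE.ZA = 1.  */
  load_za (save_buffer);         /* Give ZA observable contents.  */
}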
*/ -static void * -get_tpidr2 (void) -{ - register unsigned long x0 asm ("x0"); - asm volatile ( - ".inst 0xd53bd0a0 /* mrs x0, tpidr2_el0 */\n" - : "=r"(x0) :: "memory"); - return (void *) x0; -} - static void print_data(const char *msg, void *p) { @@ -168,8 +93,8 @@ longjmp_test (void) { p = get_tpidr2 (); printf ("before longjmp: tp2 = %p\n", p); - if (p != &blk) - FAIL_EXIT1 ("tpidr2 is clobbered"); + if (p != NULL) + FAIL_EXIT1 ("tpidr2 has not been reset to null"); do_longjmp (env); FAIL_EXIT1 ("longjmp returned"); } diff --git a/sysdeps/aarch64/tst-sme-za-state.c b/sysdeps/aarch64/tst-sme-za-state.c new file mode 100644 index 0000000..63f6eeb --- /dev/null +++ b/sysdeps/aarch64/tst-sme-za-state.c @@ -0,0 +1,119 @@ +/* Test for SME ZA state being cleared on setjmp and longjmp. + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <setjmp.h> +#include <stdlib.h> +#include <string.h> +#include <sys/auxv.h> + +#include <support/check.h> +#include <support/support.h> +#include <support/test-driver.h> + +#include "tst-sme-helper.h" + +static uint8_t *state; + +static void +enable_sme_za_state (struct blk *ptr) +{ + set_tpidr2 (ptr); + start_za (); + load_za (state); +} + +static void +check_sme_za_state (const char msg[], bool clear) +{ + unsigned long svcr = get_svcr (); + void *tpidr2 = get_tpidr2 (); + printf ("[%s]\n", msg); + printf ("svcr = %016lx\n", svcr); + printf ("tpidr2 = %016lx\n", (unsigned long)tpidr2); + if (clear) + { + TEST_VERIFY (svcr == 0); + TEST_VERIFY (tpidr2 == NULL); + } + else + { + TEST_VERIFY (svcr != 0); + TEST_VERIFY (tpidr2 != NULL); + } +} + +static void +run (struct blk *ptr) +{ + jmp_buf buf; + int ret; + + check_sme_za_state ("initial state", /* Clear. */ true); + + /* Enabled ZA state so that effect of disabling be observable. */ + enable_sme_za_state (ptr); + check_sme_za_state ("before setjmp", /* Clear. */ false); + + if ((ret = setjmp (buf)) == 0) + { + check_sme_za_state ("after setjmp", /* Clear. */ true); + + /* Enabled ZA state so that effect of disabling be observable. */ + enable_sme_za_state (ptr); + check_sme_za_state ("before longjmp", /* Clear. */ false); + + longjmp (buf, 42); + + /* Unreachable. */ + TEST_VERIFY (false); + __builtin_unreachable (); + } + + TEST_COMPARE (ret, 42); + check_sme_za_state ("after longjmp", /* Clear. */ true); +} + +static int +do_test (void) +{ + unsigned long hwcap2 = getauxval (AT_HWCAP2); + if ((hwcap2 & HWCAP2_SME) == 0) + return EXIT_UNSUPPORTED; + + /* Get current streaming SVE vector register size. */ + svl = get_svl (); + printf ("svl: %lu\n", svl); + TEST_VERIFY_EXIT (!(svl < 16 || svl % 16 != 0 || svl >= (1 << 16))); + + /* Initialise buffer for ZA state of SME. 
*/ + state = xmalloc (svl * svl); + memset (state, 1, svl * svl); + struct blk blk = { + .za_save_buffer = state, + .num_za_save_slices = svl, + .__reserved = {0}, + }; + + run (&blk); + + free (state); + return 0; +} + +#include <support/test-driver.c>
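Taken together, the ifunc-argument extension and the SME work meet in resolvers that gate SME-aware code on HWCAP2_SME. A closing sketch, not part of the patch, using the new __ifunc_hwcap helper; do_work and its two implementations are hypothetical:

#include <stdint.h>
#include <sys/auxv.h>
#include <sys/ifunc.h>

static int do_work_plain (void) { return 0; }
static int do_work_sme (void) { return 1; }  /* Would use ZA/streaming mode.  */

static void *
do_work_resolver (uint64_t arg0, const uint64_t *arg1)
{
  /* __ifunc_hwcap returns 0 when AT_HWCAP2 is not in the buffer.  */
  uint64_t hwcap2 = __ifunc_hwcap (_IFUNC_ARG_AT_HWCAP2, arg0, arg1);
  return (hwcap2 & HWCAP2_SME) ? (void *) do_work_sme
                               : (void *) do_work_plain;
}

int do_work (void) __attribute__ ((ifunc ("do_work_resolver")));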