diff options
author | Hasaan Khan <Hasaan.Khan@arm.com> | 2025-08-20 11:27:23 +0000 |
---|---|---|
committer | Wilco Dijkstra <wilco.dijkstra@arm.com> | 2025-09-02 16:50:24 +0000 |
commit | 8ced7815fbff7bec9af2b7611a3478af27b57d13 (patch) | |
tree | 0d48a47ef256d42915e80d73f0ac01db00dfda0d /sysdeps/aarch64 | |
parent | 54bd776f991f1a228a6bb6d76bf542edd915a0e3 (diff) | |
download | glibc-master.zip glibc-master.tar.gz glibc-master.tar.bz2 |
Vector variants of the new C23 exp2m1 & exp10m1 routines.
Note: Benchmark inputs for exp2m1 & exp10m1 are identical to exp2 & exp10
respectively, this also includes the floating point variations.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Diffstat (limited to 'sysdeps/aarch64')
-rw-r--r-- | sysdeps/aarch64/fpu/Makefile | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/Versions | 12 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/advsimd_f32_protos.h | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/bits/math-vector.h | 16 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp10m1_advsimd.c | 202 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp10m1_sve.c | 184 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp10m1f_advsimd.c | 120 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp10m1f_sve.c | 122 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp2m1_advsimd.c | 194 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp2m1_sve.c | 197 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp2m1f_advsimd.c | 106 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp2m1f_sve.c | 108 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/test-double-sve-wrappers.c | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 2 |
16 files changed, 1273 insertions, 0 deletions
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile index 068c11c..1ba0459 100644 --- a/sysdeps/aarch64/fpu/Makefile +++ b/sysdeps/aarch64/fpu/Makefile @@ -19,6 +19,8 @@ libmvec-supported-funcs = acos \ exp10 \ exp2 \ expm1 \ + exp2m1 \ + exp10m1 \ hypot \ log \ log10 \ diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions index 2980cb7..21a29f9 100644 --- a/sysdeps/aarch64/fpu/Versions +++ b/sysdeps/aarch64/fpu/Versions @@ -179,4 +179,16 @@ libmvec { _ZGVsMxvv_atan2pi; _ZGVsMxvv_atan2pif; } + GLIBC_2.43 { + _ZGVnN2v_exp2m1; + _ZGVnN2v_exp2m1f; + _ZGVnN4v_exp2m1f; + _ZGVsMxv_exp2m1; + _ZGVsMxv_exp2m1f; + _ZGVnN2v_exp10m1; + _ZGVnN2v_exp10m1f; + _ZGVnN4v_exp10m1f; + _ZGVsMxv_exp10m1; + _ZGVsMxv_exp10m1f; + } } diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h index c202bda..1c3df31 100644 --- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h +++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h @@ -36,6 +36,8 @@ libmvec_hidden_proto (V_NAME_F1(exp10)); libmvec_hidden_proto (V_NAME_F1(exp2)); libmvec_hidden_proto (V_NAME_F1(exp)); libmvec_hidden_proto (V_NAME_F1(expm1)); +libmvec_hidden_proto (V_NAME_F1(exp2m1)); +libmvec_hidden_proto (V_NAME_F1(exp10m1)); libmvec_hidden_proto (V_NAME_F2(hypot)); libmvec_hidden_proto (V_NAME_F1(log10)); libmvec_hidden_proto (V_NAME_F1(log1p)); diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h index 77ae10d..2983640 100644 --- a/sysdeps/aarch64/fpu/bits/math-vector.h +++ b/sysdeps/aarch64/fpu/bits/math-vector.h @@ -113,6 +113,14 @@ # define __DECL_SIMD_expm1 __DECL_SIMD_aarch64 # undef __DECL_SIMD_expm1f # define __DECL_SIMD_expm1f __DECL_SIMD_aarch64 +# undef __DECL_SIMD_exp2m1 +# define __DECL_SIMD_exp2m1 __DECL_SIMD_aarch64 +# undef __DECL_SIMD_exp2m1f +# define __DECL_SIMD_exp2m1f __DECL_SIMD_aarch64 +# undef __DECL_SIMD_exp10m1 +# define __DECL_SIMD_exp10m1 __DECL_SIMD_aarch64 +# undef __DECL_SIMD_exp10m1f +# define 
__DECL_SIMD_exp10m1f __DECL_SIMD_aarch64 # undef __DECL_SIMD_hypot # define __DECL_SIMD_hypot __DECL_SIMD_aarch64 # undef __DECL_SIMD_hypotf @@ -212,6 +220,8 @@ __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_exp2m1f (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_exp10m1f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); @@ -247,6 +257,8 @@ __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_exp2m1 (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_exp10m1 (__f64x2_t); __vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); @@ -287,6 +299,8 @@ __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_expm1f (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_exp2m1f (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_exp10m1f (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxvv_hypotf (__sv_f32_t, __sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t); @@ -322,6 +336,8 @@ __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_expm1 (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_exp2m1 (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_exp10m1 (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxvv_hypot (__sv_f64_t, __sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t); 
__sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t); diff --git a/sysdeps/aarch64/fpu/exp10m1_advsimd.c b/sysdeps/aarch64/fpu/exp10m1_advsimd.c new file mode 100644 index 0000000..765774a --- /dev/null +++ b/sysdeps/aarch64/fpu/exp10m1_advsimd.c @@ -0,0 +1,202 @@ +/* Double-precision (Advanced SIMD) exp10m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include "v_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */ + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 163840.0 /* 1280.0 * N. */ + +/* Value of |x| below which scale - 1 contributes produces large error. */ +#define TableBound 0x1.a308a4198c9d7p-4 /* log10(2) * 87/256. */ + +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) + +const static struct data +{ + double c6, c0; + double c2, c4; + double log2_10_hi, log2_10_lo; + float64x2_t c1, c3, c5; + float64x2_t log10_2, shift; + float64x2_t special_bound, scale_thresh; + uint64x2_t sm1_tbl_off, sm1_tbl_mask; + float64x2_t rnd2zero; + uint64_t scalem1[88]; +} data = { + /* Coefficients generated using Remez algorithm. 
*/ + .c0 = 0x1.26bb1bbb55516p1, + .c1 = V2 (0x1.53524c73cea69p1), + .c2 = 0x1.0470591b30b6bp1, + .c3 = V2 (0x1.2bd7609b57e25p0), + .c4 = 0x1.1517d41bddcbep-1, + .c5 = V2 (0x1.a9a12d6f0755cp-3), + .c6 = -0x1.76a8d3abd7025p9, + + /* Values of x which should round to the zeroth index of the exp10m1 table. */ + .rnd2zero = V2 (-0x1.34413509f79ffp-10), /* (2^-8)/log2(10). */ + .sm1_tbl_off = V2 (24), + .sm1_tbl_mask = V2 (0x3f), + + .log10_2 = V2 (0x1.a934f0979a371p8), /* N/log2(10). */ + .log2_10_hi = 0x1.34413509f79ffp-9, /* log2(10)/N. */ + .log2_10_lo = -0x1.9dc1da994fd21p-66, + .shift = V2 (0x1.8p+52), + .scale_thresh = V2 (ScaleBound), + .special_bound = V2 (SpecialBound), + + /* Table containing 2^x - 1, for 2^x values close to 1. + The table holds values of 2^(i/128) - 1, computed in + arbitrary precision. + The 1st half contains values associated to i=0..43. + The 2nd half contains values associated to i=-44..-1. */ + .scalem1 = { + 0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077, + 0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772, + 0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec, + 0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e, + 0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb, + 0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c, + 0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8, + 0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1, + 0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9, + 0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80, + 0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9, + 0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86, + 0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7, + 0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd, + 0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c, + 0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119, + 0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 
0xbfc7ce6612886a6d, + 0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e, + 0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b, + 0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a, + 0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7, + 0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d, + 0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7, + 0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074, + 0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4, + 0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76, + 0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268, + 0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16, + 0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4, + 0xbf761eea3847077b, + } +}; + +#define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +#define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ +#define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */ + +static inline float64x2_t VPCS_ATTR +special_case (float64x2_t s, float64x2_t y, float64x2_t n, + const struct data *d) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); + uint64x2_t cmp = vcagtq_f64 (n, d->scale_thresh); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); + return vsubq_f64 (vbslq_f64 (cmp, r1, r0), v_f64 (1.0)); +} + +static inline uint64x2_t +lookup_sbits (uint64x2_t i) +{ + return (uint64x2_t){ __v_exp_data[i[0] & IndexMask], + __v_exp_data[i[1] & IndexMask] }; +} + +static inline float64x2_t +lookup_sm1bits (float64x2_t x, uint64x2_t u, const struct data *d) +{ + /* Extract sign bit and use as offset into table. 
*/ + uint64x2_t is_neg = vcltq_f64 (x, d->rnd2zero); + uint64x2_t offset = vandq_u64 (is_neg, d->sm1_tbl_off); + uint64x2_t base_idx = vandq_u64 (u, d->sm1_tbl_mask); + uint64x2_t idx = vaddq_u64 (base_idx, offset); + + uint64x2_t sm1 = { d->scalem1[idx[0]], d->scalem1[idx[1]] }; + return vreinterpretq_f64_u64 (sm1); +} + +/* Fast vector implementation of exp10m1. + Maximum measured error is 2.53 + 0.5 ulp. + _ZGVnN2v_exp10m1 (0x1.347007ffed62cp-10) got 0x1.63955944d1d83p-9 + want 0x1.63955944d1d80p-9. */ +float64x2_t VPCS_ATTR V_NAME_D1 (exp10m1) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t cmp = vcageq_f64 (x, d->special_bound); + + /* n = round(x/(log10(2)/N)). */ + float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + float64x2_t c24 = vld1q_f64 (&d->c2); + float64x2_t c60 = vld1q_f64 (&d->c6); + float64x2_t log_2_10 = vld1q_f64 (&d->log2_10_hi); + + /* r = x - n*log10(2)/N. */ + float64x2_t r = x; + r = vfmsq_laneq_f64 (r, n, log_2_10, 0); + r = vfmsq_laneq_f64 (r, n, log_2_10, 1); + + /* y = exp10(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t p12 = vfmaq_laneq_f64 (d->c1, r, c24, 0); + float64x2_t p34 = vfmaq_laneq_f64 (d->c3, r, c24, 1); + float64x2_t p56 = vfmaq_laneq_f64 (d->c5, r, c60, 0); + + float64x2_t p36 = vfmaq_f64 (p34, r2, p56); + float64x2_t p16 = vfmaq_f64 (p12, r2, p36); + + float64x2_t p0 = vmulq_laneq_f64 (r, c60, 1); + float64x2_t p = vfmaq_f64 (p0, r2, p16); + + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + + /* scale = 2^(n/N). */ + uint64x2_t scale_bits = lookup_sbits (u); + float64x2_t scale = vreinterpretq_f64_u64 (vaddq_u64 (scale_bits, e)); + float64x2_t scalem1 = vsubq_f64 (scale, v_f64 (1.0)); + + /* Use table to gather scalem1 for small values of x. 
*/ + uint64x2_t is_small = vcaltq_f64 (x, v_f64 (TableBound)); + if (v_any_u64 (is_small)) + scalem1 = vbslq_f64 (is_small, lookup_sm1bits (x, u, d), scalem1); + + /* Construct exp10m1 = (scale - 1) + scale * poly. */ + float64x2_t y = vfmaq_f64 (scalem1, scale, p); + + /* Fallback to special case for lanes with overflow. */ + if (__glibc_unlikely (v_any_u64 (cmp))) + return vbslq_f64 (cmp, special_case (scale, p, n, d), y); + + return y; +} diff --git a/sysdeps/aarch64/fpu/exp10m1_sve.c b/sysdeps/aarch64/fpu/exp10m1_sve.c new file mode 100644 index 0000000..f77bce5 --- /dev/null +++ b/sysdeps/aarch64/fpu/exp10m1_sve.c @@ -0,0 +1,184 @@ +/* Double-precision (SVE) exp10m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 0x1.33f4bedd4fa70p+8 /* log10(2^(1023 + 1/128)). */ + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 1280.0 + +/* Value of |x| below which scale - 1 contributes produces large error. */ +#define FexpaBound 0x1.2a9f2b61a7e2p-4 /* 31*(log10(2)/128). 
*/ + +static const struct data +{ + double log2_10_hi, log2_10_lo; + double c3, c5; + double c0, c1, c2, c4; + double shift, log10_2, special_bound; + uint64_t scalem1[32]; +} data = { + /* Coefficients generated using Remez algorithm. */ + .c0 = 0x1.26bb1bbb55516p1, + .c1 = 0x1.53524c73cea6ap1, + .c2 = 0x1.0470591d8bd2ep1, + .c3 = 0x1.2bd7609dfea43p0, + .c4 = 0x1.142d0d89058f1p-1, + .c5 = 0x1.a80cf2ddd513p-3, + + /* 1.5*2^46+1023. This value is further explained below. */ + .shift = 0x1.800000000ffc0p+46, + .log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */ + .log2_10_hi = 0x1.34413509f79ffp-2, /* log2(10). */ + .log2_10_lo = -0x1.9dc1da994fd21p-59, + .special_bound = SpecialBound, + + /* Table emulating FEXPA - 1, for values of FEXPA close to 1. + The table holds values of 2^(i/64) - 1, computed in arbitrary precision. + The 1st half contains values associated to i=0..+15. + The 2nd half contains values associated to i=0..-15. */ + .scalem1 = { + 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, + 0x3fa0e8a30eb37901, 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, + 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, 0x3fb72b83c7d517ae, + 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, + 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, + 0x3fc6942d3720185a, 0x0000000000000000, 0xbfc331751ec3a814, + 0xbfc20224341286e4, 0xbfc0cf85bed0f8b7, 0xbfbf332113d56b1f, + 0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424, + 0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a, + 0xbfaafd11874c009e, 0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89, + 0xbf95f134923757f3, 0xbf860f9f985bc9f4, + }, +}; + +#define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +#define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +#define SpecialBias2 0x3010000000000000 /* 0x1p-254. 
*/ + +static NOINLINE svfloat64_t +special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p, + svfloat64_t n) +{ + /* s=2^n may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + + /* If n<=0 then set b to 0x6, 0 otherwise. */ + svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */ + svuint64_t b + = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */ + + /* Set s1 to generate overflow depending on sign of exponent n, + ie. s1 = 0x70...0 - b. */ + svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); + /* Offset s to avoid overflow in final result if n is below threshold. + ie. s2 = as_u64 (s) - 0x3010...0 + b. */ + svfloat64_t s2 = svreinterpret_f64 ( + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); + + /* |n| > 1280 => 2^(n) overflows. */ + svbool_t p_cmp = svacgt (pg, n, ScaleBound); + + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); + svfloat64_t r2 = svmla_x (pg, s2, s2, p); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); + + svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes. */ + return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0)); +} + +/* FEXPA based SVE exp10m1 algorithm. + Maximum measured error is 2.87 + 0.5 ULP: + _ZGVsMxv_exp10m1(0x1.64645f11e94c6p-4) got 0x1.c64d54eb7658dp-3 + want 0x1.c64d54eb7658ap-3. */ +svfloat64_t SV_NAME_D1 (exp10m1) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t special = svacgt (pg, x, d->special_bound); + + /* n = round(x/(log10(2)/N)). */ + svfloat64_t shift = sv_f64 (d->shift); + svfloat64_t z = svmla_x (pg, shift, x, d->log10_2); + svfloat64_t n = svsub_x (pg, z, shift); + + /* r = x - n*log10(2)/N. 
*/ + svfloat64_t log2_10 = svld1rq (svptrue_b64 (), &d->log2_10_hi); + svfloat64_t r = x; + r = svmls_lane (r, n, log2_10, 0); + r = svmls_lane (r, n, log2_10, 1); + + /* scale = 2^(n/N), computed using FEXPA. FEXPA does not propagate NaNs, so + for consistent NaN handling we have to manually propagate them. This + comes at significant performance cost. */ + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t scale = svexpa (u); + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c3); + /* Approximate exp10(r) using polynomial. */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p01 = svmla_x (pg, sv_f64 (d->c0), r, sv_f64 (d->c1)); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c24, 0); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r, c24, 1); + svfloat64_t p25 = svmla_x (pg, p23, p45, r2); + svfloat64_t p05 = svmla_x (pg, p01, p25, r2); + + svfloat64_t p = svmul_x (pg, p05, r); + + svfloat64_t scalem1 = svsub_x (pg, scale, 1.0); + + /* For small values, use a lookup table for a more accurate scalem1. */ + svbool_t is_small = svaclt (pg, x, FexpaBound); + if (svptest_any (pg, is_small)) + { + /* Use the low 4 bits of the input of FEXPA as index. */ + svuint64_t base_idx = svand_x (pg, u, 0xf); + + /* We can use the sign of x as a fifth bit to account for the asymmetry + of e^x around 0. */ + svuint64_t signBit + = svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4); + svuint64_t idx = svorr_x (pg, base_idx, signBit); + + /* Lookup values for scale - 1 for small x. */ + svfloat64_t lookup + = svreinterpret_f64 (svld1_gather_index (is_small, d->scalem1, idx)); + + /* Select the appropriate scale - 1 value based on x. */ + scalem1 = svsel (is_small, lookup, scalem1); + } + + svfloat64_t y = svmla_x (pg, scalem1, scale, p); + + /* FEXPA returns nan for large inputs so we special case those. 
*/ + if (__glibc_unlikely (svptest_any (pg, special))) + { + /* FEXPA zeroes the sign bit, however the sign is meaningful to the + special case function so needs to be copied. + e = sign bit of u << 46. */ + svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000); + /* Copy sign to scale. */ + scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale))); + return special_case (pg, y, scale, p, n); + } + + return y; +} diff --git a/sysdeps/aarch64/fpu/exp10m1f_advsimd.c b/sysdeps/aarch64/fpu/exp10m1f_advsimd.c new file mode 100644 index 0000000..db1333b --- /dev/null +++ b/sysdeps/aarch64/fpu/exp10m1f_advsimd.c @@ -0,0 +1,120 @@ +/* Single-precision (Advanced SIMD) exp10m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ + +/* Value of n above which scale overflows even with special treatment. 
*/ +#define ScaleBound 192.0f + +static const struct data +{ + float log10_2_high, log10_2_low; + float log10_lo, c2, c4, c6; + float32x4_t log10_hi, c1, c3, c5, c7, c8; + float32x4_t inv_log10_2, special_bound; + uint32x4_t exponent_bias, special_offset, special_bias; + float32x4_t scale_thresh; +} data = { + /* Coefficients generated using Remez algorithm with minimisation of relative + error. */ + .log10_hi = V4 (0x1.26bb1b8000000p+1), + .log10_lo = 0x1.daaa8b0000000p-26, + .c1 = V4 (0x1.53524ep1), + .c2 = 0x1.046fc8p1, + .c3 = V4 (0x1.2bd376p0), + .c4 = 0x1.156f8p-1, + .c5 = V4 (0x1.b28c0ep-3), + .c6 = -0x1.05e38ep-4, + .c7 = V4 (-0x1.c79f4ap-4), + .c8 = V4 (0x1.2d6f34p1), + .inv_log10_2 = V4 (0x1.a934fp+1), + .log10_2_high = 0x1.344136p-2, + .log10_2_low = 0x1.ec10cp-27, + .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), + .scale_thresh = V4 (ScaleBound), + .special_bound = V4 (SpecialBound), +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vsubq_f32 (vbslq_f32 (cmp2, r2, r), v_f32 (1.0f)); +} + +/* Fast vector implementation of single-precision exp10m1. + Algorithm is accurate to 1.70 + 0.5 ULP. + _ZGVnN4v_exp10m1f(0x1.36f94cp-3) got 0x1.ac96acp-2 + want 0x1.ac96bp-2. 
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10m1) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */ + float32x4_t log10_2 = vld1q_f32 (&d->log10_2_high); + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2)); + float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2, 0); + r = vfmaq_laneq_f32 (r, n, log10_2, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23); + + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); + + /* Pairwise Horner scheme. */ + float32x4_t log10lo_c246 = vld1q_f32 (&d->log10_lo); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p78 = vfmaq_f32 (d->c7, r, d->c8); + float32x4_t p56 = vfmaq_laneq_f32 (d->c5, r, log10lo_c246, 3); + float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10lo_c246, 2); + float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10lo_c246, 1); + float32x4_t p58 = vfmaq_f32 (p56, r2, p78); + float32x4_t p36 = vfmaq_f32 (p34, r2, p58); + float32x4_t p16 = vfmaq_f32 (p12, r2, p36); + + float32x4_t poly + = vfmaq_laneq_f32 (vmulq_f32 (d->log10_hi, r), r, log10lo_c246, 0); + poly = vfmaq_f32 (poly, p16, r2); + + float32x4_t y = vfmaq_f32 (vsubq_f32 (scale, v_f32 (1.0f)), poly, scale); + + /* Fallback to special case for lanes with overflow. */ + if (__glibc_unlikely (v_any_u32 (cmp))) + return vbslq_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y); + + return y; +} +libmvec_hidden_def (V_NAME_F1 (exp10m1)) +HALF_WIDTH_ALIAS_F1 (exp10m1) diff --git a/sysdeps/aarch64/fpu/exp10m1f_sve.c b/sysdeps/aarch64/fpu/exp10m1f_sve.c new file mode 100644 index 0000000..afbd034 --- /dev/null +++ b/sysdeps/aarch64/fpu/exp10m1f_sve.c @@ -0,0 +1,122 @@ +/* Single-precision (SVE) exp10m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 192.0f + +static const struct data +{ + float log10_2_high, log10_2_low; + float log10_lo, c2, c4, c6, c8; + float32_t log10_hi, c1, c3, c5, c7; + float32_t inv_log10_2, special_bound; + uint32_t exponent_bias, special_offset, special_bias; + float32_t scale_thresh; +} data = { + /* Coefficients generated using Remez algorithm with minimisation of relative + error. 
*/ + .log10_hi = 0x1.26bb1b8000000p+1, + .log10_lo = 0x1.daaa8b0000000p-26, + .c1 = 0x1.53524ep1, + .c2 = 0x1.046fc8p1, + .c3 = 0x1.2bd376p0, + .c4 = 0x1.156f8p-1, + .c5 = 0x1.b28c0ep-3, + .c6 = -0x1.05e38ep-4, + .c7 = -0x1.c79f4ap-4, + .c8 = 0x1.2d6f34p1, + .inv_log10_2 = 0x1.a934fp+1, + .log10_2_high = 0x1.344136p-2, + .log10_2_low = 0x1.ec10cp-27, + .exponent_bias = 0x3f800000, + .special_offset = 0x82000000, + .special_bias = 0x7f000000, + .scale_thresh = ScaleBound, + .special_bound = SpecialBound, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t poly, svfloat32_t n, svuint32_t e, svbool_t cmp1, + svfloat32_t scale, const struct data *d) +{ + svbool_t b = svcmple (svptrue_b32 (), n, 0.0f); + svfloat32_t s1 = svreinterpret_f32 ( + svsel (b, sv_u32 (d->special_offset + d->special_bias), + sv_u32 (d->special_bias))); + svfloat32_t s2 + = svreinterpret_f32 (svsub_m (b, e, sv_u32 (d->special_offset))); + svbool_t cmp2 = svacgt (svptrue_b32 (), n, d->scale_thresh); + svfloat32_t r2 = svmul_x (svptrue_b32 (), s1, s1); + svfloat32_t r1 + = svmul_x (svptrue_b32 (), svmla_x (svptrue_b32 (), s2, poly, s2), s1); + svfloat32_t r0 = svmla_x (svptrue_b32 (), scale, poly, scale); + svfloat32_t r = svsel (cmp1, r1, r0); + return svsub_x (svptrue_b32 (), svsel (cmp2, r2, r), 1.0f); +} + +/* Fast vector implementation of single-precision exp10. + Algorithm is accurate to 1.68 + 0.5 ULP. + _ZGVnN4v_exp10m1f(0x1.3aeffep-3) got 0x1.b3139p-2 + want 0x1.b3138cp-2. */ +svfloat32_t SV_NAME_F1 (exp10m1) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. 
*/ + svfloat32_t log10_2 = svld1rq (svptrue_b32 (), &d->log10_2_high); + svfloat32_t n = svrinta_x (pg, svmul_x (pg, x, d->inv_log10_2)); + svfloat32_t r = svmls_lane_f32 (x, n, log10_2, 0); + r = svmla_lane_f32 (r, n, log10_2, 1); + + svuint32_t e = svlsl_x (pg, svreinterpret_u32 (svcvt_s32_x (pg, n)), 23); + + svfloat32_t scale + = svreinterpret_f32 (svadd_n_u32_x (pg, e, d->exponent_bias)); + svbool_t cmp = svacgt_n_f32 (pg, n, d->special_bound); + + /* Pairwise Horner scheme. */ + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t c2468 = svld1rq (svptrue_b32 (), &d->c2); + svfloat32_t p78 = svmla_lane (sv_f32 (d->c7), r, c2468, 3); + svfloat32_t p56 = svmla_lane (sv_f32 (d->c5), r, c2468, 2); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, c2468, 1); + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, c2468, 0); + svfloat32_t p58 = svmla_x (pg, p56, r2, p78); + svfloat32_t p36 = svmla_x (pg, p34, r2, p58); + svfloat32_t p16 = svmla_x (pg, p12, r2, p36); + + svfloat32_t poly = svmla_n_f32_x (pg, svmul_x (pg, r, sv_f32 (d->log10_hi)), + r, d->log10_lo); + poly = svmla_x (pg, poly, p16, r2); + + svfloat32_t y = svmla_x (pg, svsub_x (pg, scale, 1.0f), poly, scale); + + /* Fallback to special case for lanes with overflow. */ + if (__glibc_unlikely (svptest_any (pg, cmp))) + return svsel_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y); + + return y; +} diff --git a/sysdeps/aarch64/fpu/exp2m1_advsimd.c b/sysdeps/aarch64/fpu/exp2m1_advsimd.c new file mode 100644 index 0000000..1eb6042 --- /dev/null +++ b/sysdeps/aarch64/fpu/exp2m1_advsimd.c @@ -0,0 +1,194 @@ +/* Double-precision (Advanced SIMD) exp2m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 1022.0 + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 1280.0 + +/* 87/256, value of x under which table lookup is used for 2^x-1. */ +#define TableBound 0x1.5bfffffffffffp-2 + +/* Number of bits for each value in the table. */ +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) + +static const struct data +{ + uint64x2_t exponent_bias, special_offset, special_bias, special_bias2, + sm1_tbl_off, sm1_tbl_mask; + float64x2_t scale_thresh, special_bound, shift, rnd2zero; + float64x2_t log2_hi, c1, c3, c5; + double log2_lo, c2, c4, c6; + uint64_t scalem1[88]; +} data = { + /* Coefficients generated using remez's algorithm for exp2m1(x). */ + .log2_hi = V2 (0x1.62e42fefa39efp-1), + .log2_lo = 0x1.abc9e3b39803f3p-56, + .c1 = V2 (0x1.ebfbdff82c58ep-3), + .c2 = 0x1.c6b08d71f5804p-5, + .c3 = V2 (0x1.3b2ab6fee7509p-7), + .c4 = 0x1.5d1d37eb33b15p-10, + .c5 = V2 (0x1.423f35f371d9ap-13), + .c6 = 0x1.e7d57ad9a5f93p-5, + .exponent_bias = V2 (0x3ff0000000000000), + .special_offset = V2 (0x6000000000000000), /* 0x1p513. */ + .special_bias = V2 (0x7000000000000000), /* 0x1p769. */ + .special_bias2 = V2 (0x3010000000000000), /* 0x1p-254. 
*/ + .scale_thresh = V2 (ScaleBound), + .special_bound = V2 (SpecialBound), + .shift = V2 (0x1.8p52 / N), + .rnd2zero = V2 (-0x1p-8), + .sm1_tbl_off = V2 (24), + .sm1_tbl_mask = V2 (0x3f), + + /* Table containing 2^x - 1, for 2^x values close to 1. + The table holds values of 2^(i/128) - 1, computed in + arbitrary precision. + The 1st half contains values associated to i=0..43. + The 2nd half contains values associated to i=-44..-1. */ + .scalem1 = { + 0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077, + 0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772, + 0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec, + 0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e, + 0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb, + 0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c, + 0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8, + 0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1, + 0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9, + 0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80, + 0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9, + 0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86, + 0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7, + 0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd, + 0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c, + 0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119, + 0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d, + 0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e, + 0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b, + 0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a, + 0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7, + 0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d, + 0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7, + 0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074, + 0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 
0xbfb2b0cfe1266bd4, + 0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76, + 0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268, + 0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16, + 0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4, + 0xbf761eea3847077b, + } +}; + +static inline uint64x2_t +lookup_sbits (uint64x2_t i) +{ + return (uint64x2_t){ __v_exp_data[i[0] & IndexMask], + __v_exp_data[i[1] & IndexMask] }; +} + +static inline float64x2_t +lookup_sm1bits (float64x2_t x, uint64x2_t u, const struct data *d) +{ + /* Extract sign bit and use as offset into table. */ + uint64x2_t is_neg = vcltq_f64 (x, d->rnd2zero); + uint64x2_t offset = vandq_u64 (is_neg, d->sm1_tbl_off); + uint64x2_t base_idx = vandq_u64 (u, d->sm1_tbl_mask); + uint64x2_t idx = vaddq_u64 (base_idx, offset); + + /* Fall back to table lookup for 2^x - 1, when x is close to zero to + avoid large errors. */ + uint64x2_t sm1 = { d->scalem1[idx[0]], d->scalem1[idx[1]] }; + return vreinterpretq_f64_u64 (sm1); +} + +static inline VPCS_ATTR float64x2_t +special_case (float64x2_t poly, float64x2_t n, uint64x2_t e, float64x2_t scale, + const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vclezq_f64 (n), d->special_offset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (d->special_bias, b)); + float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (scale), d->special_bias2), b)); + uint64x2_t cmp2 = vcagtq_f64 (n, d->scale_thresh); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r2 = vmulq_f64 (vfmaq_f64 (s2, poly, s2), s1); + /* Select r1 when |n| > scale_thresh (2^n overflows even when split), + else the s1*s2-based r2, then subtract 1 to form exp2m1. */ + return vsubq_f64 (vbslq_f64 (cmp2, r1, r2), v_f64 (1.0f)); +} + +/* Double-precision vector exp2(x) - 1 function. + The maximum error is 2.55 + 0.5 ULP. + _ZGVnN2v_exp2m1 (0x1.1113e87a035ap-8) got 0x1.7b1d06f0a7d36p-9 + want 0x1.7b1d06f0a7d33p-9. 
*/ +VPCS_ATTR float64x2_t V_NAME_D1 (exp2m1) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* exp2(x) = 2^n (1 + poly(r)) + x = n + r, with r in [-1/2N, 1/2N]. + n is a floating point number, multiple of 1/N. */ + float64x2_t z = vaddq_f64 (d->shift, x); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + /* Calculate scale, 2^n. */ + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + uint64x2_t scale_bits = lookup_sbits (u); + float64x2_t scale = vreinterpretq_f64_u64 (vaddq_u64 (scale_bits, e)); + + uint64x2_t cmp = vcagtq_f64 (x, d->special_bound); + + /* Pairwise Horner scheme. */ + float64x2_t r = vsubq_f64 (x, n); + float64x2_t r2 = vmulq_f64 (r, r); + + float64x2_t log2lo_c2 = vld1q_f64 (&d->log2_lo); + float64x2_t c4c6 = vld1q_f64 (&d->c4); + + float64x2_t p56 = vfmaq_laneq_f64 (d->c5, r, c4c6, 1); + float64x2_t p34 = vfmaq_laneq_f64 (d->c3, r, c4c6, 0); + float64x2_t p12 = vfmaq_laneq_f64 (d->c1, r, log2lo_c2, 1); + float64x2_t p36 = vfmaq_f64 (p34, r2, p56); + float64x2_t p16 = vfmaq_f64 (p12, r2, p36); + float64x2_t poly + = vfmaq_laneq_f64 (vmulq_f64 (d->log2_hi, r), r, log2lo_c2, 0); + poly = vfmaq_f64 (poly, p16, r2); + + float64x2_t scalem1 = vsubq_f64 (scale, v_f64 (1.0)); + + /* Use table to gather scalem1 for small values of x. */ + uint64x2_t is_small = vcaltq_f64 (x, v_f64 (TableBound)); + if (v_any_u64 (is_small)) + scalem1 = vbslq_f64 (is_small, lookup_sm1bits (x, u, d), scalem1); + + /* Construct exp2m1 = (scale - 1) + scale * poly. */ + float64x2_t y = vfmaq_f64 (scalem1, poly, scale); + + /* Fallback to special case for lanes with overflow. 
*/ + if (__glibc_unlikely (v_any_u64 (cmp))) + return vbslq_f64 (cmp, special_case (poly, n, e, scale, d), y); + + return y; +} diff --git a/sysdeps/aarch64/fpu/exp2m1_sve.c b/sysdeps/aarch64/fpu/exp2m1_sve.c new file mode 100644 index 0000000..24e00f8 --- /dev/null +++ b/sysdeps/aarch64/fpu/exp2m1_sve.c @@ -0,0 +1,197 @@ +/* Double-precision (SVE) exp2m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 1022.0 + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 1280.0 + +/* 87/256, value of x under which table lookup is used for 2^x-1. */ +#define TableBound 0x1.5bfffffffffffp-2 + +/* Number of entries in the exponent lookup table. */ +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) + +static const struct data +{ + double shift, rnd2zero; + double c1, c3, c5; + double c0, c2, c4; + uint64_t sm1_tbl_mask, sm1_tbl_off; + uint64_t scalem1[88]; +} data = { + /* Generated using fpminimax. 
*/ + .c0 = 0x1.62e42fefa39efp-1, + .c1 = 0x1.ebfbdff82c58ep-3, + .c2 = 0x1.c6b08d707e662p-5, + .c3 = 0x1.3b2ab6fc45f33p-7, + .c4 = 0x1.5d86c0ff8618dp-10, + .c5 = 0x1.4301374d5d2f5p-13, + .shift = 0x1.8p52 / N, + .sm1_tbl_mask = 0x3f, + .sm1_tbl_off = 24, + .rnd2zero = -0x1p-8, + /* Table containing 2^x - 1, for 2^x values close to 1. + The table holds values of 2^(i/128) - 1, computed in + arbitrary precision. + The 1st half contains values associated to i=0..43. + The 2nd half contains values associated to i=-44..-1. */ + .scalem1= { + 0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077, + 0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772, + 0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec, + 0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e, + 0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb, + 0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c, + 0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8, + 0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1, + 0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9, + 0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80, + 0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9, + 0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86, + 0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7, + 0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd, + 0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c, + 0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119, + 0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d, + 0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e, + 0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b, + 0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a, + 0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7, + 0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d, + 0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7, + 0xbfb907a55511e032, 
0xbfb7c695afc3b424, 0xbfb683c7bf93b074, + 0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4, + 0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76, + 0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268, + 0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16, + 0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4, + 0xbf761eea3847077b, + } +}; + +static inline svuint64_t +lookup_sbits (svbool_t pg, svuint64_t indices) +{ + /* Mask indices to valid range. */ + svuint64_t masked = svand_z (pg, indices, svdup_n_u64 (IndexMask)); + return svld1_gather_index (pg, __v_exp_data, masked); +} + +static inline svfloat64_t +lookup_sm1bits (svbool_t pg, svfloat64_t x, svuint64_t u, const struct data *d) +{ + /* Extract sign bit and use as offset into table. */ + svbool_t signBit = svcmplt_f64 (pg, x, sv_f64 (d->rnd2zero)); + svuint64_t base_idx = svand_x (pg, u, d->sm1_tbl_mask); + svuint64_t idx = svadd_m (signBit, base_idx, sv_u64 (d->sm1_tbl_off)); + + /* Fall back to table lookup for 2^x - 1, when x is close to zero to + avoid large errors. */ + svuint64_t sm1 = svld1_gather_index (svptrue_b64 (), d->scalem1, idx); + return svreinterpret_f64 (sm1); +} + +#define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +/* SpecialBias1 - SpecialBias2 = asuint(1.0). */ +#define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */ + +static inline svfloat64_t +special_case (svfloat64_t s, svfloat64_t y, svfloat64_t n, + const struct data *d) +{ + /* s=2^n may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + svbool_t p_sign = svcmple (svptrue_b64 (), n, 0.0); /* n <= 0. */ + svuint64_t b = svdup_u64_z (p_sign, SpecialOffset); + + /* Set s1 to generate overflow depending on sign of exponent n. 
*/ + svfloat64_t s1 + = svreinterpret_f64 (svsubr_x (svptrue_b64 (), b, SpecialBias1)); + /* Offset s to avoid overflow in final result if n is below threshold. */ + svfloat64_t s2 = svreinterpret_f64 (svadd_x ( + svptrue_b64 (), + svsub_x (svptrue_b64 (), svreinterpret_u64 (s), SpecialBias2), b)); + + /* |n| > 1280 => 2^(n) overflows. */ + svbool_t p_cmp = svacle (svptrue_b64 (), n, ScaleBound); + + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); + svfloat64_t r2 = svmla_x (svptrue_b64 (), s2, s2, y); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); + + return svsub_x (svptrue_b64 (), svsel (p_cmp, r0, r1), 1.0); +} + +/* Double-precision SVE exp2(x) - 1. + Maximum error is 2.58 + 0.5 ULP. + _ZGVsMxv_exp2m1(0x1.0284a345c99bfp-8) got 0x1.66df630cd2965p-9 + want 0x1.66df630cd2962p-9. */ +svfloat64_t SV_NAME_D1 (exp2m1) (svfloat64_t x, svbool_t pg) +{ + /* exp2(x) = 2^n (1 + poly(r)) + x = n + r, with r in [-1/2N, 1/2N]. + n is a floating point number, multiple of 1/N. */ + const struct data *d = ptr_barrier (&data); + svbool_t special = svacge (pg, x, SpecialBound); + + svfloat64_t z = svadd_x (pg, x, d->shift); + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t n = svsub_x (pg, z, d->shift); + + svfloat64_t r = svsub_x (svptrue_b64 (), x, n); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + + /* Look up table to calculate 2^n. */ + svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TABLE_BITS); + svuint64_t scale_bits = lookup_sbits (pg, u); + svfloat64_t scale = svreinterpret_f64_u64 (svadd_x (pg, scale_bits, e)); + + /* Pairwise Horner scheme. 
*/ + svfloat64_t c35 = svld1rq (svptrue_b64 (), &d->c3); + + svfloat64_t p01 = svmla_x (pg, sv_f64 (d->c0), r, d->c1); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c35, 0); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r, c35, 1); + + svfloat64_t p25 = svmla_x (pg, p23, r2, p45); + svfloat64_t p05 = svmla_x (pg, p01, r2, p25); + svfloat64_t poly = svmul_x (pg, p05, r); + + /* Use table to gather scalem1 for small values of x. */ + svbool_t is_small = svaclt (pg, x, TableBound); + svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0)); + if (svptest_any (pg, is_small)) + scalem1 = svsel_f64 (is_small, lookup_sm1bits (pg, x, u, d), scalem1); + + /* Construct exp2m1 = (scale - 1) + scale * poly. */ + svfloat64_t y = svmla_x (pg, scalem1, scale, poly); + + /* Fallback to special case for lanes with overflow. */ + if (__glibc_unlikely (svptest_any (pg, special))) + return svsel_f64 (special, special_case (scale, poly, n, d), y); + + return y; +} diff --git a/sysdeps/aarch64/fpu/exp2m1f_advsimd.c b/sysdeps/aarch64/fpu/exp2m1f_advsimd.c new file mode 100644 index 0000000..b42f13e --- /dev/null +++ b/sysdeps/aarch64/fpu/exp2m1f_advsimd.c @@ -0,0 +1,106 @@ +/* Single-precision (Advanced SIMD) exp2m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "v_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 192.0f + +static const struct data +{ + uint32x4_t exponent_bias, special_offset, special_bias; + float32x4_t scale_thresh, special_bound; + float32x4_t log2_hi, c1, c3, c5; + float log2_lo, c2, c4, c6; +} data = { + /* Coefficients generated using remez's algorithm for exp2m1f(x). */ + .log2_hi = V4 (0x1.62e43p-1), + .log2_lo = -0x1.05c610p-29, + .c1 = V4 (0x1.ebfbep-3), + .c2 = 0x1.c6b06ep-5, + .c3 = V4 (0x1.3b2a5cp-7), + .c4 = 0x1.5da59ep-10, + .c5 = V4 (0x1.440dccp-13), + .c6 = 0x1.e081d6p-17, + .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), + .scale_thresh = V4 (ScaleBound), + .special_bound = V4 (SpecialBound), +}; + +static float32x4_t VPCS_ATTR +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vsubq_f32 (vbslq_f32 (cmp2, r2, r), v_f32 (1.0f)); +} + +/* Single-precision vector exp2(x) - 1 function. + The maximum error is 1.76 + 0.5 ULP. + _ZGVnN4v_exp2m1f (0x1.018af8p-1) got 0x1.ab2ebcp-2 + want 0x1.ab2ecp-2. 
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2m1) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ + float32x4_t n = vrndaq_f32 (x); + float32x4_t r = vsubq_f32 (x, n); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); + + float32x4_t log2lo_c246 = vld1q_f32 (&d->log2_lo); + float32x4_t r2 = vmulq_f32 (r, r); + + /* Pairwise horner scheme. */ + float32x4_t p56 = vfmaq_laneq_f32 (d->c5, r, log2lo_c246, 3); + float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log2lo_c246, 2); + float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log2lo_c246, 1); + float32x4_t p36 = vfmaq_f32 (p34, r2, p56); + float32x4_t p16 = vfmaq_f32 (p12, r2, p36); + float32x4_t poly + = vfmaq_laneq_f32 (vmulq_f32 (d->log2_hi, r), r, log2lo_c246, 0); + poly = vfmaq_f32 (poly, p16, r2); + + float32x4_t y = vfmaq_f32 (vsubq_f32 (scale, v_f32 (1.0f)), poly, scale); + + if (__glibc_unlikely (v_any_u32 (cmp))) + return vbslq_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y); + + return y; +} +libmvec_hidden_def (V_NAME_F1 (exp2m1)) +HALF_WIDTH_ALIAS_F1 (exp2m1) diff --git a/sysdeps/aarch64/fpu/exp2m1f_sve.c b/sysdeps/aarch64/fpu/exp2m1f_sve.c new file mode 100644 index 0000000..e6473f6 --- /dev/null +++ b/sysdeps/aarch64/fpu/exp2m1f_sve.c @@ -0,0 +1,108 @@ +/* Single-precision (SVE) exp2m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 192.0f + +static const struct data +{ + uint32_t exponent_bias, special_offset, special_bias; + float32_t scale_thresh, special_bound; + float log2_lo, c2, c4, c6; + float log2_hi, c1, c3, c5, shift; +} data = { + /* Coefficients generated using remez's algorithm for exp2m1f(x). */ + .log2_hi = 0x1.62e43p-1, + .log2_lo = -0x1.05c610p-29, + .c1 = 0x1.ebfbep-3, + .c2 = 0x1.c6b06ep-5, + .c3 = 0x1.3b2a5cp-7, + .c4 = 0x1.5da59ep-10, + .c5 = 0x1.440dccp-13, + .c6 = 0x1.e081d6p-17, + .exponent_bias = 0x3f800000, + .special_offset = 0x82000000, + .special_bias = 0x7f000000, + .scale_thresh = ScaleBound, + .special_bound = SpecialBound, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t poly, svfloat32_t n, svuint32_t e, svbool_t cmp1, + svfloat32_t scale, const struct data *d) +{ + svbool_t b = svcmple (svptrue_b32 (), n, 0.0f); + svfloat32_t s1 = svreinterpret_f32 ( + svsel (b, sv_u32 (d->special_offset + d->special_bias), + sv_u32 (d->special_bias))); + svfloat32_t s2 + = svreinterpret_f32 (svsub_m (b, e, sv_u32 (d->special_offset))); + svbool_t cmp2 = svacgt (svptrue_b32 (), n, d->scale_thresh); + svfloat32_t r2 = svmul_x (svptrue_b32 (), s1, s1); + svfloat32_t r1 + = svmul_x (svptrue_b32 (), svmla_x (svptrue_b32 (), s2, poly, s2), s1); + svfloat32_t r0 = svmla_x (svptrue_b32 (), scale, 
poly, scale); + svfloat32_t r = svsel (cmp1, r1, r0); + return svsub_x (svptrue_b32 (), svsel (cmp2, r2, r), 1.0f); +} + +/* Single-precision vector exp2(x) - 1 function. + The maximum error is 1.76 + 0.5 ULP. + _ZGVsMxv_exp2m1f (0x1.018af8p-1) got 0x1.ab2ebcp-2 + want 0x1.ab2ecp-2. */ +svfloat32_t SV_NAME_F1 (exp2m1) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t n = svrinta_x (pg, x); + svfloat32_t r = svsub_x (pg, x, n); + + svuint32_t e = svlsl_x (pg, svreinterpret_u32 (svcvt_s32_x (pg, n)), 23); + svfloat32_t scale + = svreinterpret_f32 (svadd_n_u32_x (pg, e, d->exponent_bias)); + + svbool_t cmp = svacgt_n_f32 (pg, n, d->special_bound); + + svfloat32_t r2 = svmul_x (pg, r, r); + + svfloat32_t log2lo_c246 = svld1rq (svptrue_b32 (), &d->log2_lo); + svfloat32_t p56 = svmla_lane (sv_f32 (d->c5), r, log2lo_c246, 3); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, log2lo_c246, 2); + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, log2lo_c246, 1); + + svfloat32_t p36 = svmla_x (pg, p34, p56, r2); + svfloat32_t p16 = svmla_x (pg, p12, p36, r2); + + svfloat32_t poly = svmla_lane ( + svmul_x (svptrue_b32 (), r, sv_f32 (d->log2_hi)), r, log2lo_c246, 0); + poly = svmla_x (pg, poly, p16, r2); + + svfloat32_t y = svmla_x (pg, svsub_x (pg, scale, 1.0f), poly, scale); + + /* Fallback to special case for lanes with overflow. 
*/ + if (__glibc_unlikely (svptest_any (pg, cmp))) + return svsel_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y); + + return y; +} diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c index a3fef22..b8b4822 100644 --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c @@ -44,6 +44,8 @@ VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp) VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10) VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2) VPCS_VECTOR_WRAPPER (expm1_advsimd, _ZGVnN2v_expm1) +VPCS_VECTOR_WRAPPER (exp2m1_advsimd, _ZGVnN2v_exp2m1) +VPCS_VECTOR_WRAPPER (exp10m1_advsimd, _ZGVnN2v_exp10m1) VPCS_VECTOR_WRAPPER_ff (hypot_advsimd, _ZGVnN2vv_hypot) VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log) VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10) diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c index f4a5ae8..168b3f7 100644 --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c +++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c @@ -63,6 +63,8 @@ SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp) SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10) SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2) SVE_VECTOR_WRAPPER (expm1_sve, _ZGVsMxv_expm1) +SVE_VECTOR_WRAPPER (exp2m1_sve, _ZGVsMxv_exp2m1) +SVE_VECTOR_WRAPPER (exp10m1_sve, _ZGVsMxv_exp10m1) SVE_VECTOR_WRAPPER_ff (hypot_sve, _ZGVsMxvv_hypot) SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log) SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10) diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c index bc22956..c290073 100644 --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c @@ -44,6 +44,8 @@ VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf) VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f) VPCS_VECTOR_WRAPPER (exp2f_advsimd, 
_ZGVnN4v_exp2f) VPCS_VECTOR_WRAPPER (expm1f_advsimd, _ZGVnN4v_expm1f) +VPCS_VECTOR_WRAPPER (exp2m1f_advsimd, _ZGVnN4v_exp2m1f) +VPCS_VECTOR_WRAPPER (exp10m1f_advsimd, _ZGVnN4v_exp10m1f) VPCS_VECTOR_WRAPPER_ff (hypotf_advsimd, _ZGVnN4vv_hypotf) VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf) VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f) diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c index ad0d6ad..fd01791 100644 --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c @@ -63,6 +63,8 @@ SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf) SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f) SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f) SVE_VECTOR_WRAPPER (expm1f_sve, _ZGVsMxv_expm1f) +SVE_VECTOR_WRAPPER (exp2m1f_sve, _ZGVsMxv_exp2m1f) +SVE_VECTOR_WRAPPER (exp10m1f_sve, _ZGVsMxv_exp10m1f) SVE_VECTOR_WRAPPER_ff (hypotf_sve, _ZGVsMxvv_hypotf) SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf) SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f) |