aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/aarch64
diff options
context:
space:
mode:
authorHasaan Khan <Hasaan.Khan@arm.com>2025-08-20 11:27:23 +0000
committerWilco Dijkstra <wilco.dijkstra@arm.com>2025-09-02 16:50:24 +0000
commit8ced7815fbff7bec9af2b7611a3478af27b57d13 (patch)
tree0d48a47ef256d42915e80d73f0ac01db00dfda0d /sysdeps/aarch64
parent54bd776f991f1a228a6bb6d76bf542edd915a0e3 (diff)
downloadglibc-master.zip
glibc-master.tar.gz
glibc-master.tar.bz2
AArch64: Implement exp2m1 and exp10m1 routinesHEADmaster
Vector variants of the new C23 exp2m1 & exp10m1 routines. Note: Benchmark inputs for exp2m1 & exp10m1 are identical to exp2 & exp10 respectively; this also includes the floating point variations. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Diffstat (limited to 'sysdeps/aarch64')
-rw-r--r--sysdeps/aarch64/fpu/Makefile2
-rw-r--r--sysdeps/aarch64/fpu/Versions12
-rw-r--r--sysdeps/aarch64/fpu/advsimd_f32_protos.h2
-rw-r--r--sysdeps/aarch64/fpu/bits/math-vector.h16
-rw-r--r--sysdeps/aarch64/fpu/exp10m1_advsimd.c202
-rw-r--r--sysdeps/aarch64/fpu/exp10m1_sve.c184
-rw-r--r--sysdeps/aarch64/fpu/exp10m1f_advsimd.c120
-rw-r--r--sysdeps/aarch64/fpu/exp10m1f_sve.c122
-rw-r--r--sysdeps/aarch64/fpu/exp2m1_advsimd.c194
-rw-r--r--sysdeps/aarch64/fpu/exp2m1_sve.c197
-rw-r--r--sysdeps/aarch64/fpu/exp2m1f_advsimd.c106
-rw-r--r--sysdeps/aarch64/fpu/exp2m1f_sve.c108
-rw-r--r--sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c2
-rw-r--r--sysdeps/aarch64/fpu/test-double-sve-wrappers.c2
-rw-r--r--sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c2
-rw-r--r--sysdeps/aarch64/fpu/test-float-sve-wrappers.c2
16 files changed, 1273 insertions, 0 deletions
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 068c11c..1ba0459 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -19,6 +19,8 @@ libmvec-supported-funcs = acos \
exp10 \
exp2 \
expm1 \
+ exp2m1 \
+ exp10m1 \
hypot \
log \
log10 \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 2980cb7..21a29f9 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -179,4 +179,16 @@ libmvec {
_ZGVsMxvv_atan2pi;
_ZGVsMxvv_atan2pif;
}
+ GLIBC_2.43 {
+ _ZGVnN2v_exp2m1;
+ _ZGVnN2v_exp2m1f;
+ _ZGVnN4v_exp2m1f;
+ _ZGVsMxv_exp2m1;
+ _ZGVsMxv_exp2m1f;
+ _ZGVnN2v_exp10m1;
+ _ZGVnN2v_exp10m1f;
+ _ZGVnN4v_exp10m1f;
+ _ZGVsMxv_exp10m1;
+ _ZGVsMxv_exp10m1f;
+ }
}
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index c202bda..1c3df31 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -36,6 +36,8 @@ libmvec_hidden_proto (V_NAME_F1(exp10));
libmvec_hidden_proto (V_NAME_F1(exp2));
libmvec_hidden_proto (V_NAME_F1(exp));
libmvec_hidden_proto (V_NAME_F1(expm1));
+libmvec_hidden_proto (V_NAME_F1(exp2m1));
+libmvec_hidden_proto (V_NAME_F1(exp10m1));
libmvec_hidden_proto (V_NAME_F2(hypot));
libmvec_hidden_proto (V_NAME_F1(log10));
libmvec_hidden_proto (V_NAME_F1(log1p));
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 77ae10d..2983640 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -113,6 +113,14 @@
# define __DECL_SIMD_expm1 __DECL_SIMD_aarch64
# undef __DECL_SIMD_expm1f
# define __DECL_SIMD_expm1f __DECL_SIMD_aarch64
+# undef __DECL_SIMD_exp2m1
+# define __DECL_SIMD_exp2m1 __DECL_SIMD_aarch64
+# undef __DECL_SIMD_exp2m1f
+# define __DECL_SIMD_exp2m1f __DECL_SIMD_aarch64
+# undef __DECL_SIMD_exp10m1
+# define __DECL_SIMD_exp10m1 __DECL_SIMD_aarch64
+# undef __DECL_SIMD_exp10m1f
+# define __DECL_SIMD_exp10m1f __DECL_SIMD_aarch64
# undef __DECL_SIMD_hypot
# define __DECL_SIMD_hypot __DECL_SIMD_aarch64
# undef __DECL_SIMD_hypotf
@@ -212,6 +220,8 @@ __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_exp2m1f (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_exp10m1f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
@@ -247,6 +257,8 @@ __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_exp2m1 (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_exp10m1 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
@@ -287,6 +299,8 @@ __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_expm1f (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_exp2m1f (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_exp10m1f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxvv_hypotf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
@@ -322,6 +336,8 @@ __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_expm1 (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_exp2m1 (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_exp10m1 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxvv_hypot (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/exp10m1_advsimd.c b/sysdeps/aarch64/fpu/exp10m1_advsimd.c
new file mode 100644
index 0000000..765774a
--- /dev/null
+++ b/sysdeps/aarch64/fpu/exp10m1_advsimd.c
@@ -0,0 +1,202 @@
+/* Double-precision (Advanced SIMD) exp10m1 function
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include "v_math.h"
+
+/* Value of |x| above which scale overflows without special treatment. */
+#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */
+
+/* Value of n above which scale overflows even with special treatment. */
+#define ScaleBound 163840.0 /* 1280.0 * N. */
+
+/* Value of |x| below which scale - 1 produces large error. */
+#define TableBound 0x1.a308a4198c9d7p-4 /* log10(2) * 87/256. */
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+const static struct data
+{
+ double c6, c0;
+ double c2, c4;
+ double log2_10_hi, log2_10_lo;
+ float64x2_t c1, c3, c5;
+ float64x2_t log10_2, shift;
+ float64x2_t special_bound, scale_thresh;
+ uint64x2_t sm1_tbl_off, sm1_tbl_mask;
+ float64x2_t rnd2zero;
+ uint64_t scalem1[88];
+} data = {
+ /* Coefficients generated using Remez algorithm. */
+ .c0 = 0x1.26bb1bbb55516p1,
+ .c1 = V2 (0x1.53524c73cea69p1),
+ .c2 = 0x1.0470591b30b6bp1,
+ .c3 = V2 (0x1.2bd7609b57e25p0),
+ .c4 = 0x1.1517d41bddcbep-1,
+ .c5 = V2 (0x1.a9a12d6f0755cp-3),
+ .c6 = -0x1.76a8d3abd7025p9,
+
+ /* Values of x which should round to the zeroth index of the exp10m1 table. */
+ .rnd2zero = V2 (-0x1.34413509f79ffp-10), /* (2^-8)/log2(10). */
+ .sm1_tbl_off = V2 (24),
+ .sm1_tbl_mask = V2 (0x3f),
+
+  .log10_2 = V2 (0x1.a934f0979a371p8), /* N/log10(2). */
+  .log2_10_hi = 0x1.34413509f79ffp-9, /* log10(2)/N. */
+ .log2_10_lo = -0x1.9dc1da994fd21p-66,
+ .shift = V2 (0x1.8p+52),
+ .scale_thresh = V2 (ScaleBound),
+ .special_bound = V2 (SpecialBound),
+
+ /* Table containing 2^x - 1, for 2^x values close to 1.
+ The table holds values of 2^(i/128) - 1, computed in
+ arbitrary precision.
+ The 1st half contains values associated to i=0..43.
+ The 2nd half contains values associated to i=-44..-1. */
+ .scalem1 = {
+ 0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077,
+ 0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772,
+ 0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec,
+ 0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e,
+ 0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb,
+ 0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c,
+ 0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8,
+ 0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1,
+ 0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9,
+ 0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80,
+ 0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9,
+ 0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86,
+ 0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7,
+ 0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd,
+ 0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c,
+ 0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119,
+ 0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d,
+ 0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e,
+ 0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b,
+ 0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a,
+ 0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7,
+ 0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d,
+ 0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7,
+ 0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074,
+ 0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4,
+ 0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76,
+ 0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268,
+ 0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16,
+ 0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4,
+ 0xbf761eea3847077b,
+ }
+};
+
+#define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
+/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
+#define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
+#define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t s, float64x2_t y, float64x2_t n,
+ const struct data *d)
+{
+ /* 2^(n/N) may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (
+ vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+ uint64x2_t cmp = vcagtq_f64 (n, d->scale_thresh);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+ return vsubq_f64 (vbslq_f64 (cmp, r1, r0), v_f64 (1.0));
+}
+
+static inline uint64x2_t
+lookup_sbits (uint64x2_t i)
+{
+ return (uint64x2_t){ __v_exp_data[i[0] & IndexMask],
+ __v_exp_data[i[1] & IndexMask] };
+}
+
+static inline float64x2_t
+lookup_sm1bits (float64x2_t x, uint64x2_t u, const struct data *d)
+{
+ /* Extract sign bit and use as offset into table. */
+ uint64x2_t is_neg = vcltq_f64 (x, d->rnd2zero);
+ uint64x2_t offset = vandq_u64 (is_neg, d->sm1_tbl_off);
+ uint64x2_t base_idx = vandq_u64 (u, d->sm1_tbl_mask);
+ uint64x2_t idx = vaddq_u64 (base_idx, offset);
+
+ uint64x2_t sm1 = { d->scalem1[idx[0]], d->scalem1[idx[1]] };
+ return vreinterpretq_f64_u64 (sm1);
+}
+
+/* Fast vector implementation of exp10m1.
+ Maximum measured error is 2.53 + 0.5 ulp.
+ _ZGVnN2v_exp10m1 (0x1.347007ffed62cp-10) got 0x1.63955944d1d83p-9
+ want 0x1.63955944d1d80p-9. */
+float64x2_t VPCS_ATTR V_NAME_D1 (exp10m1) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+ uint64x2_t cmp = vcageq_f64 (x, d->special_bound);
+
+ /* n = round(x/(log10(2)/N)). */
+ float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ float64x2_t c24 = vld1q_f64 (&d->c2);
+ float64x2_t c60 = vld1q_f64 (&d->c6);
+ float64x2_t log_2_10 = vld1q_f64 (&d->log2_10_hi);
+
+ /* r = x - n*log10(2)/N. */
+ float64x2_t r = x;
+ r = vfmsq_laneq_f64 (r, n, log_2_10, 0);
+ r = vfmsq_laneq_f64 (r, n, log_2_10, 1);
+
+  /* y = exp10(r) - 1 ~= C0 r + C1 r^2 + ... + C6 r^7. */
+ float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t p12 = vfmaq_laneq_f64 (d->c1, r, c24, 0);
+ float64x2_t p34 = vfmaq_laneq_f64 (d->c3, r, c24, 1);
+ float64x2_t p56 = vfmaq_laneq_f64 (d->c5, r, c60, 0);
+
+ float64x2_t p36 = vfmaq_f64 (p34, r2, p56);
+ float64x2_t p16 = vfmaq_f64 (p12, r2, p36);
+
+ float64x2_t p0 = vmulq_laneq_f64 (r, c60, 1);
+ float64x2_t p = vfmaq_f64 (p0, r2, p16);
+
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+
+ /* scale = 2^(n/N). */
+ uint64x2_t scale_bits = lookup_sbits (u);
+ float64x2_t scale = vreinterpretq_f64_u64 (vaddq_u64 (scale_bits, e));
+ float64x2_t scalem1 = vsubq_f64 (scale, v_f64 (1.0));
+
+ /* Use table to gather scalem1 for small values of x. */
+ uint64x2_t is_small = vcaltq_f64 (x, v_f64 (TableBound));
+ if (v_any_u64 (is_small))
+ scalem1 = vbslq_f64 (is_small, lookup_sm1bits (x, u, d), scalem1);
+
+ /* Construct exp10m1 = (scale - 1) + scale * poly. */
+ float64x2_t y = vfmaq_f64 (scalem1, scale, p);
+
+ /* Fallback to special case for lanes with overflow. */
+ if (__glibc_unlikely (v_any_u64 (cmp)))
+ return vbslq_f64 (cmp, special_case (scale, p, n, d), y);
+
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/exp10m1_sve.c b/sysdeps/aarch64/fpu/exp10m1_sve.c
new file mode 100644
index 0000000..f77bce5
--- /dev/null
+++ b/sysdeps/aarch64/fpu/exp10m1_sve.c
@@ -0,0 +1,184 @@
+/* Double-precision (SVE) exp10m1 function
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+
+/* Value of |x| above which scale overflows without special treatment. */
+#define SpecialBound 0x1.33f4bedd4fa70p+8 /* log10(2^(1023 + 1/128)). */
+
+/* Value of n above which scale overflows even with special treatment. */
+#define ScaleBound 1280.0
+
+/* Value of |x| below which scale - 1 produces large error. */
+#define FexpaBound 0x1.2a9f2b61a7e2p-4 /* 31*(log10(2)/128). */
+
+static const struct data
+{
+ double log2_10_hi, log2_10_lo;
+ double c3, c5;
+ double c0, c1, c2, c4;
+ double shift, log10_2, special_bound;
+ uint64_t scalem1[32];
+} data = {
+ /* Coefficients generated using Remez algorithm. */
+ .c0 = 0x1.26bb1bbb55516p1,
+ .c1 = 0x1.53524c73cea6ap1,
+ .c2 = 0x1.0470591d8bd2ep1,
+ .c3 = 0x1.2bd7609dfea43p0,
+ .c4 = 0x1.142d0d89058f1p-1,
+ .c5 = 0x1.a80cf2ddd513p-3,
+
+ /* 1.5*2^46+1023. This value is further explained below. */
+ .shift = 0x1.800000000ffc0p+46,
+  .log10_2 = 0x1.a934f0979a371p1, /* 1/log10(2). */
+  .log2_10_hi = 0x1.34413509f79ffp-2, /* log10(2). */
+ .log2_10_lo = -0x1.9dc1da994fd21p-59,
+ .special_bound = SpecialBound,
+
+ /* Table emulating FEXPA - 1, for values of FEXPA close to 1.
+ The table holds values of 2^(i/64) - 1, computed in arbitrary precision.
+ The 1st half contains values associated to i=0..+15.
+ The 2nd half contains values associated to i=0..-15. */
+ .scalem1 = {
+ 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f,
+ 0x3fa0e8a30eb37901, 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440,
+ 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, 0x3fb72b83c7d517ae,
+ 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+ 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709,
+ 0x3fc6942d3720185a, 0x0000000000000000, 0xbfc331751ec3a814,
+ 0xbfc20224341286e4, 0xbfc0cf85bed0f8b7, 0xbfbf332113d56b1f,
+ 0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424,
+ 0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a,
+ 0xbfaafd11874c009e, 0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89,
+ 0xbf95f134923757f3, 0xbf860f9f985bc9f4,
+ },
+};
+
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+static NOINLINE svfloat64_t
+special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p,
+ svfloat64_t n)
+{
+ /* s=2^n may overflow, break it up into s=s1*s2,
+ such that exp = s + s*y can be computed as s1*(s2+s2*y)
+ and s1*s1 overflows only if n>0. */
+
+ /* If n<=0 then set b to 0x6, 0 otherwise. */
+ svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
+ svuint64_t b
+ = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
+
+ /* Set s1 to generate overflow depending on sign of exponent n,
+ ie. s1 = 0x70...0 - b. */
+ svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold.
+ ie. s2 = as_u64 (s) - 0x3010...0 + b. */
+ svfloat64_t s2 = svreinterpret_f64 (
+ svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
+
+ /* |n| > 1280 => 2^(n) overflows. */
+ svbool_t p_cmp = svacgt (pg, n, ScaleBound);
+
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
+ svfloat64_t r2 = svmla_x (pg, s2, s2, p);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
+
+ svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes. */
+ return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0));
+}
+
+/* FEXPA based SVE exp10m1 algorithm.
+ Maximum measured error is 2.87 + 0.5 ULP:
+ _ZGVsMxv_exp10m1(0x1.64645f11e94c6p-4) got 0x1.c64d54eb7658dp-3
+ want 0x1.c64d54eb7658ap-3. */
+svfloat64_t SV_NAME_D1 (exp10m1) (svfloat64_t x, svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacgt (pg, x, d->special_bound);
+
+ /* n = round(x/(log10(2)/N)). */
+ svfloat64_t shift = sv_f64 (d->shift);
+ svfloat64_t z = svmla_x (pg, shift, x, d->log10_2);
+ svfloat64_t n = svsub_x (pg, z, shift);
+
+ /* r = x - n*log10(2)/N. */
+ svfloat64_t log2_10 = svld1rq (svptrue_b64 (), &d->log2_10_hi);
+ svfloat64_t r = x;
+ r = svmls_lane (r, n, log2_10, 0);
+ r = svmls_lane (r, n, log2_10, 1);
+
+ /* scale = 2^(n/N), computed using FEXPA. FEXPA does not propagate NaNs, so
+ for consistent NaN handling we have to manually propagate them. This
+ comes at significant performance cost. */
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t scale = svexpa (u);
+ svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c3);
+ /* Approximate exp10(r) using polynomial. */
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+ svfloat64_t p01 = svmla_x (pg, sv_f64 (d->c0), r, sv_f64 (d->c1));
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c24, 0);
+ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r, c24, 1);
+ svfloat64_t p25 = svmla_x (pg, p23, p45, r2);
+ svfloat64_t p05 = svmla_x (pg, p01, p25, r2);
+
+ svfloat64_t p = svmul_x (pg, p05, r);
+
+ svfloat64_t scalem1 = svsub_x (pg, scale, 1.0);
+
+ /* For small values, use a lookup table for a more accurate scalem1. */
+ svbool_t is_small = svaclt (pg, x, FexpaBound);
+ if (svptest_any (pg, is_small))
+ {
+ /* Use the low 4 bits of the input of FEXPA as index. */
+ svuint64_t base_idx = svand_x (pg, u, 0xf);
+
+ /* We can use the sign of x as a fifth bit to account for the asymmetry
+ of e^x around 0. */
+ svuint64_t signBit
+ = svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4);
+ svuint64_t idx = svorr_x (pg, base_idx, signBit);
+
+ /* Lookup values for scale - 1 for small x. */
+ svfloat64_t lookup
+ = svreinterpret_f64 (svld1_gather_index (is_small, d->scalem1, idx));
+
+ /* Select the appropriate scale - 1 value based on x. */
+ scalem1 = svsel (is_small, lookup, scalem1);
+ }
+
+ svfloat64_t y = svmla_x (pg, scalem1, scale, p);
+
+ /* FEXPA returns nan for large inputs so we special case those. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ {
+ /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+ special case function so needs to be copied.
+ e = sign bit of u << 46. */
+ svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
+ /* Copy sign to scale. */
+ scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
+ return special_case (pg, y, scale, p, n);
+ }
+
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/exp10m1f_advsimd.c b/sysdeps/aarch64/fpu/exp10m1f_advsimd.c
new file mode 100644
index 0000000..db1333b
--- /dev/null
+++ b/sysdeps/aarch64/fpu/exp10m1f_advsimd.c
@@ -0,0 +1,120 @@
+/* Single-precision (Advanced SIMD) exp10m1 function
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+
+/* Value of |x| above which scale overflows without special treatment. */
+#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+
+/* Value of n above which scale overflows even with special treatment. */
+#define ScaleBound 192.0f
+
+static const struct data
+{
+ float log10_2_high, log10_2_low;
+ float log10_lo, c2, c4, c6;
+ float32x4_t log10_hi, c1, c3, c5, c7, c8;
+ float32x4_t inv_log10_2, special_bound;
+ uint32x4_t exponent_bias, special_offset, special_bias;
+ float32x4_t scale_thresh;
+} data = {
+ /* Coefficients generated using Remez algorithm with minimisation of relative
+ error. */
+ .log10_hi = V4 (0x1.26bb1b8000000p+1),
+ .log10_lo = 0x1.daaa8b0000000p-26,
+ .c1 = V4 (0x1.53524ep1),
+ .c2 = 0x1.046fc8p1,
+ .c3 = V4 (0x1.2bd376p0),
+ .c4 = 0x1.156f8p-1,
+ .c5 = V4 (0x1.b28c0ep-3),
+ .c6 = -0x1.05e38ep-4,
+ .c7 = V4 (-0x1.c79f4ap-4),
+ .c8 = V4 (0x1.2d6f34p1),
+ .inv_log10_2 = V4 (0x1.a934fp+1),
+ .log10_2_high = 0x1.344136p-2,
+ .log10_2_low = 0x1.ec10cp-27,
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
+ .scale_thresh = V4 (ScaleBound),
+ .special_bound = V4 (SpecialBound),
+};
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ return vsubq_f32 (vbslq_f32 (cmp2, r2, r), v_f32 (1.0f));
+}
+
+/* Fast vector implementation of single-precision exp10m1.
+ Algorithm is accurate to 1.70 + 0.5 ULP.
+ _ZGVnN4v_exp10m1f(0x1.36f94cp-3) got 0x1.ac96acp-2
+ want 0x1.ac96bp-2. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10m1) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
+ with poly(r) in [1/sqrt(2), sqrt(2)] and
+ x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
+ float32x4_t log10_2 = vld1q_f32 (&d->log10_2_high);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2, 0);
+ r = vfmaq_laneq_f32 (r, n, log10_2, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
+
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+
+ /* Pairwise Horner scheme. */
+ float32x4_t log10lo_c246 = vld1q_f32 (&d->log10_lo);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p78 = vfmaq_f32 (d->c7, r, d->c8);
+ float32x4_t p56 = vfmaq_laneq_f32 (d->c5, r, log10lo_c246, 3);
+ float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10lo_c246, 2);
+ float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10lo_c246, 1);
+ float32x4_t p58 = vfmaq_f32 (p56, r2, p78);
+ float32x4_t p36 = vfmaq_f32 (p34, r2, p58);
+ float32x4_t p16 = vfmaq_f32 (p12, r2, p36);
+
+ float32x4_t poly
+ = vfmaq_laneq_f32 (vmulq_f32 (d->log10_hi, r), r, log10lo_c246, 0);
+ poly = vfmaq_f32 (poly, p16, r2);
+
+ float32x4_t y = vfmaq_f32 (vsubq_f32 (scale, v_f32 (1.0f)), poly, scale);
+
+ /* Fallback to special case for lanes with overflow. */
+ if (__glibc_unlikely (v_any_u32 (cmp)))
+ return vbslq_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y);
+
+ return y;
+}
+libmvec_hidden_def (V_NAME_F1 (exp10m1))
+HALF_WIDTH_ALIAS_F1 (exp10m1)
diff --git a/sysdeps/aarch64/fpu/exp10m1f_sve.c b/sysdeps/aarch64/fpu/exp10m1f_sve.c
new file mode 100644
index 0000000..afbd034
--- /dev/null
+++ b/sysdeps/aarch64/fpu/exp10m1f_sve.c
@@ -0,0 +1,122 @@
+/* Single-precision (SVE) exp10m1 function
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+
+/* Value of |x| above which scale overflows without special treatment. */
+#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+
+/* Value of n above which scale overflows even with special treatment. */
+#define ScaleBound 192.0f
+
+static const struct data
+{
+ float log10_2_high, log10_2_low;
+ float log10_lo, c2, c4, c6, c8;
+ float32_t log10_hi, c1, c3, c5, c7;
+ float32_t inv_log10_2, special_bound;
+ uint32_t exponent_bias, special_offset, special_bias;
+ float32_t scale_thresh;
+} data = {
+ /* Coefficients generated using Remez algorithm with minimisation of relative
+ error. */
+ .log10_hi = 0x1.26bb1b8000000p+1,
+ .log10_lo = 0x1.daaa8b0000000p-26,
+ .c1 = 0x1.53524ep1,
+ .c2 = 0x1.046fc8p1,
+ .c3 = 0x1.2bd376p0,
+ .c4 = 0x1.156f8p-1,
+ .c5 = 0x1.b28c0ep-3,
+ .c6 = -0x1.05e38ep-4,
+ .c7 = -0x1.c79f4ap-4,
+ .c8 = 0x1.2d6f34p1,
+ .inv_log10_2 = 0x1.a934fp+1,
+ .log10_2_high = 0x1.344136p-2,
+ .log10_2_low = 0x1.ec10cp-27,
+ .exponent_bias = 0x3f800000,
+ .special_offset = 0x82000000,
+ .special_bias = 0x7f000000,
+ .scale_thresh = ScaleBound,
+ .special_bound = SpecialBound,
+};
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t poly, svfloat32_t n, svuint32_t e, svbool_t cmp1,
+ svfloat32_t scale, const struct data *d)
+{
+ svbool_t b = svcmple (svptrue_b32 (), n, 0.0f);
+ svfloat32_t s1 = svreinterpret_f32 (
+ svsel (b, sv_u32 (d->special_offset + d->special_bias),
+ sv_u32 (d->special_bias)));
+ svfloat32_t s2
+ = svreinterpret_f32 (svsub_m (b, e, sv_u32 (d->special_offset)));
+ svbool_t cmp2 = svacgt (svptrue_b32 (), n, d->scale_thresh);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), s1, s1);
+ svfloat32_t r1
+ = svmul_x (svptrue_b32 (), svmla_x (svptrue_b32 (), s2, poly, s2), s1);
+ svfloat32_t r0 = svmla_x (svptrue_b32 (), scale, poly, scale);
+ svfloat32_t r = svsel (cmp1, r1, r0);
+ return svsub_x (svptrue_b32 (), svsel (cmp2, r2, r), 1.0f);
+}
+
+/* Fast vector implementation of single-precision exp10m1.
+ Algorithm is accurate to 1.68 + 0.5 ULP.
+ _ZGVnN4v_exp10m1f(0x1.3aeffep-3) got 0x1.b3139p-2
+ want 0x1.b3138cp-2. */
+svfloat32_t SV_NAME_F1 (exp10m1) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
+ with poly(r) in [1/sqrt(2), sqrt(2)] and
+ x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
+ svfloat32_t log10_2 = svld1rq (svptrue_b32 (), &d->log10_2_high);
+ svfloat32_t n = svrinta_x (pg, svmul_x (pg, x, d->inv_log10_2));
+ svfloat32_t r = svmls_lane_f32 (x, n, log10_2, 0);
+ r = svmla_lane_f32 (r, n, log10_2, 1);
+
+ svuint32_t e = svlsl_x (pg, svreinterpret_u32 (svcvt_s32_x (pg, n)), 23);
+
+ svfloat32_t scale
+ = svreinterpret_f32 (svadd_n_u32_x (pg, e, d->exponent_bias));
+ svbool_t cmp = svacgt_n_f32 (pg, n, d->special_bound);
+
+ /* Pairwise Horner scheme. */
+ svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t c2468 = svld1rq (svptrue_b32 (), &d->c2);
+ svfloat32_t p78 = svmla_lane (sv_f32 (d->c7), r, c2468, 3);
+ svfloat32_t p56 = svmla_lane (sv_f32 (d->c5), r, c2468, 2);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, c2468, 1);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, c2468, 0);
+ svfloat32_t p58 = svmla_x (pg, p56, r2, p78);
+ svfloat32_t p36 = svmla_x (pg, p34, r2, p58);
+ svfloat32_t p16 = svmla_x (pg, p12, r2, p36);
+
+ svfloat32_t poly = svmla_n_f32_x (pg, svmul_x (pg, r, sv_f32 (d->log10_hi)),
+ r, d->log10_lo);
+ poly = svmla_x (pg, poly, p16, r2);
+
+ svfloat32_t y = svmla_x (pg, svsub_x (pg, scale, 1.0f), poly, scale);
+
+ /* Fallback to special case for lanes with overflow. */
+ if (__glibc_unlikely (svptest_any (pg, cmp)))
+ return svsel_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y);
+
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/exp2m1_advsimd.c b/sysdeps/aarch64/fpu/exp2m1_advsimd.c
new file mode 100644
index 0000000..1eb6042
--- /dev/null
+++ b/sysdeps/aarch64/fpu/exp2m1_advsimd.c
@@ -0,0 +1,194 @@
+/* Double-precision (Advanced SIMD) exp2m1 function
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+
+/* Value of |x| above which scale overflows without special treatment. */
+#define SpecialBound 1022.0
+
+/* Value of n above which scale overflows even with special treatment. */
+#define ScaleBound 1280.0
+
+/* 87/256, value of x under which table lookup is used for 2^x-1. */
+#define TableBound 0x1.5bfffffffffffp-2
+
+/* Number of bits for each value in the table. */
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+static const struct data
+{
+ uint64x2_t exponent_bias, special_offset, special_bias, special_bias2,
+ sm1_tbl_off, sm1_tbl_mask;
+ float64x2_t scale_thresh, special_bound, shift, rnd2zero;
+ /* log2_lo/c2 and c4/c6 are laid out pairwise after the vector fields so
+ vld1q_f64 (&d->log2_lo) and vld1q_f64 (&d->c4) can load them as lane
+ pairs in the Horner evaluation. */
+ float64x2_t log2_hi, c1, c3, c5;
+ double log2_lo, c2, c4, c6;
+ uint64_t scalem1[88];
+} data = {
+ /* Coefficients generated using remez's algorithm for exp2m1(x). */
+ .log2_hi = V2 (0x1.62e42fefa39efp-1),
+ .log2_lo = 0x1.abc9e3b39803f3p-56,
+ .c1 = V2 (0x1.ebfbdff82c58ep-3),
+ .c2 = 0x1.c6b08d71f5804p-5,
+ .c3 = V2 (0x1.3b2ab6fee7509p-7),
+ .c4 = 0x1.5d1d37eb33b15p-10,
+ /* Degree-7 term must be ~ln2^7/7! ~= 1.5e-5; the exponent was mistyped
+ as p-5, which would destroy the quoted 2.55 ULP error bound. */
+ .c5 = V2 (0x1.423f35f371d9ap-13),
+ .c6 = 0x1.e7d57ad9a5f93p-17,
+ .exponent_bias = V2 (0x3ff0000000000000),
+ .special_offset = V2 (0x6000000000000000), /* 0x1p513. */
+ .special_bias = V2 (0x7000000000000000), /* 0x1p769. */
+ .special_bias2 = V2 (0x3010000000000000), /* 0x1p-254. */
+ .scale_thresh = V2 (ScaleBound),
+ .special_bound = V2 (SpecialBound),
+ .shift = V2 (0x1.8p52 / N),
+ .rnd2zero = V2 (-0x1p-8),
+ .sm1_tbl_off = V2 (24),
+ .sm1_tbl_mask = V2 (0x3f),
+
+ /* Table containing 2^x - 1, for 2^x values close to 1.
+ The table holds values of 2^(i/128) - 1, computed in
+ arbitrary precision.
+ The 1st half contains values associated to i=0..43.
+ The 2nd half contains values associated to i=-44..-1. */
+ .scalem1 = {
+ 0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077,
+ 0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772,
+ 0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec,
+ 0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e,
+ 0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb,
+ 0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c,
+ 0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8,
+ 0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1,
+ 0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9,
+ 0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80,
+ 0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9,
+ 0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86,
+ 0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7,
+ 0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd,
+ 0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c,
+ 0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119,
+ 0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d,
+ 0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e,
+ 0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b,
+ 0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a,
+ 0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7,
+ 0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d,
+ 0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7,
+ 0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074,
+ 0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4,
+ 0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76,
+ 0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268,
+ 0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16,
+ 0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4,
+ 0xbf761eea3847077b,
+ }
+};
+
+/* Gather, per lane, one entry of the shared __v_exp_data table (bits of
+ 2^(i/N)), masking each lane's value to the table size first. */
+static inline uint64x2_t
+lookup_sbits (uint64x2_t i)
+{
+ return (uint64x2_t){ __v_exp_data[i[0] & IndexMask],
+ __v_exp_data[i[1] & IndexMask] };
+}
+
+/* Gather 2^x - 1 from the local scalem1 table. Used instead of computing
+ scale - 1 when |x| < TableBound, to avoid large cancellation errors. */
+static inline float64x2_t
+lookup_sm1bits (float64x2_t x, uint64x2_t u, const struct data *d)
+{
+ /* Lanes with x below rnd2zero (clearly negative x) index the second half
+ of the table (entries for i=-44..-1), via the sm1_tbl_off offset. */
+ uint64x2_t is_neg = vcltq_f64 (x, d->rnd2zero);
+ uint64x2_t offset = vandq_u64 (is_neg, d->sm1_tbl_off);
+ uint64x2_t base_idx = vandq_u64 (u, d->sm1_tbl_mask);
+ uint64x2_t idx = vaddq_u64 (base_idx, offset);
+
+ /* Fall back to table lookup for 2^x - 1, when x is close to zero to
+ avoid large errors. */
+ uint64x2_t sm1 = { d->scalem1[idx[0]], d->scalem1[idx[1]] };
+ return vreinterpretq_f64_u64 (sm1);
+}
+
+/* Handle lanes where 2^n would overflow/underflow the scale computation.
+ NOTE(review): parameter e is unused here; the signature appears to mirror
+ the single-precision variant, which does use it. */
+static inline VPCS_ATTR float64x2_t
+special_case (float64x2_t poly, float64x2_t n, uint64x2_t e, float64x2_t scale,
+ const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint64x2_t b = vandq_u64 (vclezq_f64 (n), d->special_offset);
+ float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (d->special_bias, b));
+ float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 (
+ vsubq_u64 (vreinterpretq_u64_f64 (scale), d->special_bias2), b));
+ /* |n| > ScaleBound: even the split cannot avoid over/underflow, so use
+ s1*s1 to force the correctly-signed infinity or zero. */
+ uint64x2_t cmp2 = vcagtq_f64 (n, d->scale_thresh);
+ float64x2_t r1 = vmulq_f64 (s1, s1);
+ /* Otherwise reconstruct scale*(1 + poly) as s1*(s2 + poly*s2). */
+ float64x2_t r2 = vmulq_f64 (vfmaq_f64 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ /* Subtract 1 for the m1 variant. */
+ return vsubq_f64 (vbslq_f64 (cmp2, r1, r2), v_f64 (1.0f));
+}
+
+/* Double-precision vector exp2(x) - 1 function.
+ The maximum error is 2.55 + 0.5 ULP.
+ _ZGVnN2v_exp2m1 (0x1.1113e87a035ap-8) got 0x1.7b1d06f0a7d36p-9
+ want 0x1.7b1d06f0a7d33p-9. */
+VPCS_ATTR float64x2_t V_NAME_D1 (exp2m1) (float64x2_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* exp2(x) = 2^n (1 + poly(r))
+ x = n + r, with r in [-1/2N, 1/2N].
+ n is a floating point number, multiple of 1/N.
+ The round-to-nearest shift trick leaves round(x*N) in the low bits
+ of u below. */
+ float64x2_t z = vaddq_f64 (d->shift, x);
+ uint64x2_t u = vreinterpretq_u64_f64 (z);
+ float64x2_t n = vsubq_f64 (z, d->shift);
+
+ /* Calculate scale, 2^n. */
+ uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+ uint64x2_t scale_bits = lookup_sbits (u);
+ float64x2_t scale = vreinterpretq_f64_u64 (vaddq_u64 (scale_bits, e));
+
+ /* Lanes with |x| > SpecialBound need the overflow-safe path. */
+ uint64x2_t cmp = vcagtq_f64 (x, d->special_bound);
+
+ /* Pairwise Horner scheme. */
+ float64x2_t r = vsubq_f64 (x, n);
+ float64x2_t r2 = vmulq_f64 (r, r);
+
+ /* Load {log2_lo, c2} and {c4, c6} as lane pairs (struct layout). */
+ float64x2_t log2lo_c2 = vld1q_f64 (&d->log2_lo);
+ float64x2_t c4c6 = vld1q_f64 (&d->c4);
+
+ float64x2_t p56 = vfmaq_laneq_f64 (d->c5, r, c4c6, 1);
+ float64x2_t p34 = vfmaq_laneq_f64 (d->c3, r, c4c6, 0);
+ float64x2_t p12 = vfmaq_laneq_f64 (d->c1, r, log2lo_c2, 1);
+ float64x2_t p36 = vfmaq_f64 (p34, r2, p56);
+ float64x2_t p16 = vfmaq_f64 (p12, r2, p36);
+ float64x2_t poly
+ = vfmaq_laneq_f64 (vmulq_f64 (d->log2_hi, r), r, log2lo_c2, 0);
+ poly = vfmaq_f64 (poly, p16, r2);
+
+ float64x2_t scalem1 = vsubq_f64 (scale, v_f64 (1.0));
+
+ /* Use table to gather scalem1 for small values of x. */
+ uint64x2_t is_small = vcaltq_f64 (x, v_f64 (TableBound));
+ if (v_any_u64 (is_small))
+ scalem1 = vbslq_f64 (is_small, lookup_sm1bits (x, u, d), scalem1);
+
+ /* Construct exp2m1 = (scale - 1) + scale * poly. */
+ float64x2_t y = vfmaq_f64 (scalem1, poly, scale);
+
+ /* Fallback to special case for lanes with overflow. */
+ if (__glibc_unlikely (v_any_u64 (cmp)))
+ return vbslq_f64 (cmp, special_case (poly, n, e, scale, d), y);
+
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/exp2m1_sve.c b/sysdeps/aarch64/fpu/exp2m1_sve.c
new file mode 100644
index 0000000..24e00f8
--- /dev/null
+++ b/sysdeps/aarch64/fpu/exp2m1_sve.c
@@ -0,0 +1,197 @@
+/* Double-precision (SVE) exp2m1 function
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+
+/* Value of |x| above which scale overflows without special treatment. */
+#define SpecialBound 1022.0
+
+/* Value of n above which scale overflows even with special treatment. */
+#define ScaleBound 1280.0
+
+/* 87/256, value of x under which table lookup is used for 2^x-1. */
+#define TableBound 0x1.5bfffffffffffp-2
+
+/* Number of bits for each value in the table. */
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+static const struct data
+{
+ double shift, rnd2zero;
+ /* c1/c3/c5 are contiguous so svld1rq (&c3) can load {c3, c5} as a
+ quadword for lane-indexed MLAs. */
+ double c1, c3, c5;
+ double c0, c2, c4;
+ uint64_t sm1_tbl_mask, sm1_tbl_off;
+ uint64_t scalem1[88];
+} data = {
+ /* Generated using fpminimax. 2^r - 1 ~ r * poly(r); c0 = ln2. */
+ .c0 = 0x1.62e42fefa39efp-1,
+ .c1 = 0x1.ebfbdff82c58ep-3,
+ .c2 = 0x1.c6b08d707e662p-5,
+ .c3 = 0x1.3b2ab6fc45f33p-7,
+ .c4 = 0x1.5d86c0ff8618dp-10,
+ .c5 = 0x1.4301374d5d2f5p-13,
+ .shift = 0x1.8p52 / N,
+ .sm1_tbl_mask = 0x3f,
+ .sm1_tbl_off = 24,
+ .rnd2zero = -0x1p-8,
+ /* Table containing 2^x - 1, for 2^x values close to 1.
+ The table holds values of 2^(i/128) - 1, computed in
+ arbitrary precision.
+ The 1st half contains values associated to i=0..43.
+ The 2nd half contains values associated to i=-44..-1. */
+ .scalem1= {
+ 0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077,
+ 0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772,
+ 0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec,
+ 0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e,
+ 0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb,
+ 0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c,
+ 0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8,
+ 0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1,
+ 0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9,
+ 0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80,
+ 0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9,
+ 0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86,
+ 0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7,
+ 0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd,
+ 0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c,
+ 0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119,
+ 0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d,
+ 0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e,
+ 0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b,
+ 0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a,
+ 0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7,
+ 0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d,
+ 0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7,
+ 0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074,
+ 0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4,
+ 0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76,
+ 0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268,
+ 0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16,
+ 0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4,
+ 0xbf761eea3847077b,
+ }
+};
+
+/* Gather entries of the shared __v_exp_data table (bits of 2^(i/N)),
+ masking each lane's index to the table size first. */
+static inline svuint64_t
+lookup_sbits (svbool_t pg, svuint64_t indices)
+{
+ /* Mask indices to valid range. */
+ svuint64_t masked = svand_z (pg, indices, svdup_n_u64 (IndexMask));
+ return svld1_gather_index (pg, __v_exp_data, masked);
+}
+
+/* Gather 2^x - 1 from the local scalem1 table. Used instead of computing
+ scale - 1 when |x| < TableBound, to avoid large cancellation errors. */
+static inline svfloat64_t
+lookup_sm1bits (svbool_t pg, svfloat64_t x, svuint64_t u, const struct data *d)
+{
+ /* Lanes with x below rnd2zero (clearly negative x) index the second half
+ of the table (entries for i=-44..-1), via the sm1_tbl_off offset. */
+ svbool_t signBit = svcmplt_f64 (pg, x, sv_f64 (d->rnd2zero));
+ svuint64_t base_idx = svand_x (pg, u, d->sm1_tbl_mask);
+ svuint64_t idx = svadd_m (signBit, base_idx, sv_u64 (d->sm1_tbl_off));
+
+ /* Fall back to table lookup for 2^x - 1, when x is close to zero to
+ avoid large errors. */
+ svuint64_t sm1 = svld1_gather_index (svptrue_b64 (), d->scalem1, idx);
+ return svreinterpret_f64 (sm1);
+}
+
+#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
+/* SpecialBias1 - SpecialBias2 = asuint (1.0). */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
+
+/* Handle lanes where 2^n would overflow/underflow the scale computation.
+ NOTE(review): parameter d is unused here; the constants come from the
+ macros above. */
+static inline svfloat64_t
+special_case (svfloat64_t s, svfloat64_t y, svfloat64_t n,
+ const struct data *d)
+{
+ /* s=2^n may overflow, break it up into s=s1*s2,
+ such that exp = s + s*y can be computed as s1*(s2+s2*y)
+ and s1*s1 overflows only if n>0. */
+ svbool_t p_sign = svcmple (svptrue_b64 (), n, 0.0); /* n <= 0. */
+ svuint64_t b = svdup_u64_z (p_sign, SpecialOffset);
+
+ /* Set s1 to generate overflow depending on sign of exponent n. */
+ svfloat64_t s1
+ = svreinterpret_f64 (svsubr_x (svptrue_b64 (), b, SpecialBias1));
+ /* Offset s to avoid overflow in final result if n is below threshold. */
+ svfloat64_t s2 = svreinterpret_f64 (svadd_x (
+ svptrue_b64 (),
+ svsub_x (svptrue_b64 (), svreinterpret_u64 (s), SpecialBias2), b));
+
+ /* |n| > 1280 => 2^(n) overflows. */
+ svbool_t p_cmp = svacle (svptrue_b64 (), n, ScaleBound);
+
+ svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
+ svfloat64_t r2 = svmla_x (svptrue_b64 (), s2, s2, y);
+ svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
+
+ /* Subtract 1 for the m1 variant. */
+ return svsub_x (svptrue_b64 (), svsel (p_cmp, r0, r1), 1.0);
+}
+
+/* Double-precision SVE exp2(x) - 1.
+ Maximum error is 2.58 + 0.5 ULP.
+ _ZGVsMxv_exp2m1(0x1.0284a345c99bfp-8) got 0x1.66df630cd2965p-9
+ want 0x1.66df630cd2962p-9. */
+svfloat64_t SV_NAME_D1 (exp2m1) (svfloat64_t x, svbool_t pg)
+{
+ /* exp2(x) = 2^n (1 + poly(r))
+ x = n + r, with r in [-1/2N, 1/2N].
+ n is a floating point number, multiple of 1/N. */
+ const struct data *d = ptr_barrier (&data);
+ /* Lanes with |x| >= SpecialBound need the overflow-safe path. */
+ svbool_t special = svacge (pg, x, SpecialBound);
+
+ /* Round-to-nearest shift trick: round(x*N) lands in the low bits of u. */
+ svfloat64_t z = svadd_x (pg, x, d->shift);
+ svuint64_t u = svreinterpret_u64 (z);
+ svfloat64_t n = svsub_x (pg, z, d->shift);
+
+ svfloat64_t r = svsub_x (svptrue_b64 (), x, n);
+ svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+
+ /* Look up table to calculate 2^n. */
+ svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TABLE_BITS);
+ svuint64_t scale_bits = lookup_sbits (pg, u);
+ svfloat64_t scale = svreinterpret_f64_u64 (svadd_x (pg, scale_bits, e));
+
+ /* Pairwise Horner scheme. */
+ svfloat64_t c35 = svld1rq (svptrue_b64 (), &d->c3);
+
+ svfloat64_t p01 = svmla_x (pg, sv_f64 (d->c0), r, d->c1);
+ svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c35, 0);
+ svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r, c35, 1);
+
+ svfloat64_t p25 = svmla_x (pg, p23, r2, p45);
+ svfloat64_t p05 = svmla_x (pg, p01, r2, p25);
+ svfloat64_t poly = svmul_x (pg, p05, r);
+
+ /* Use table to gather scalem1 for small values of x. */
+ svbool_t is_small = svaclt (pg, x, TableBound);
+ svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0));
+ if (svptest_any (pg, is_small))
+ scalem1 = svsel_f64 (is_small, lookup_sm1bits (pg, x, u, d), scalem1);
+
+ /* Construct exp2m1 = (scale - 1) + scale * poly. */
+ svfloat64_t y = svmla_x (pg, scalem1, scale, poly);
+
+ /* Fallback to special case for lanes with overflow. */
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return svsel_f64 (special, special_case (scale, poly, n, d), y);
+
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/exp2m1f_advsimd.c b/sysdeps/aarch64/fpu/exp2m1f_advsimd.c
new file mode 100644
index 0000000..b42f13e
--- /dev/null
+++ b/sysdeps/aarch64/fpu/exp2m1f_advsimd.c
@@ -0,0 +1,106 @@
+/* Single-precision (Advanced SIMD) exp2m1 function
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "v_math.h"
+
+/* Value of |x| above which scale overflows without special treatment. */
+#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+
+/* Value of n above which scale overflows even with special treatment. */
+#define ScaleBound 192.0f
+
+static const struct data
+{
+ uint32x4_t exponent_bias, special_offset, special_bias;
+ float32x4_t scale_thresh, special_bound;
+ /* log2_lo/c2/c4/c6 are contiguous so vld1q_f32 (&log2_lo) loads them as
+ one quadword for lane-indexed FMLAs. */
+ float32x4_t log2_hi, c1, c3, c5;
+ float log2_lo, c2, c4, c6;
+} data = {
+ /* Coefficients generated using remez's algorithm for exp2m1f(x). */
+ .log2_hi = V4 (0x1.62e43p-1),
+ .log2_lo = -0x1.05c610p-29,
+ .c1 = V4 (0x1.ebfbep-3),
+ .c2 = 0x1.c6b06ep-5,
+ .c3 = V4 (0x1.3b2a5cp-7),
+ .c4 = 0x1.5da59ep-10,
+ .c5 = V4 (0x1.440dccp-13),
+ .c6 = 0x1.e081d6p-17,
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
+ .scale_thresh = V4 (ScaleBound),
+ .special_bound = V4 (SpecialBound),
+};
+
+/* Handle lanes where 2^n would overflow/underflow the scale computation. */
+static float32x4_t VPCS_ATTR
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+ float32x4_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2. */
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
+ float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+ /* |n| > ScaleBound: s1*s1 forces the correctly-signed inf/zero. */
+ uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
+ float32x4_t r2 = vmulq_f32 (s1, s1);
+ float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+ float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+ /* Subtract 1 for the m1 variant. */
+ return vsubq_f32 (vbslq_f32 (cmp2, r2, r), v_f32 (1.0f));
+}
+
+/* Single-precision vector exp2(x) - 1 function.
+ The maximum error is 1.76 + 0.5 ULP.
+ _ZGVnN4v_exp2m1f (0x1.018af8p-1) got 0x1.ab2ebcp-2
+ want 0x1.ab2ecp-2. */
+float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2m1) (float32x4_t x)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ float32x4_t n = vrndaq_f32 (x);
+ float32x4_t r = vsubq_f32 (x, n);
+ /* vcvtaq rounds ties-away like vrndaq, so this is (int) n shifted into
+ the exponent field. */
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
+
+ /* Loads {log2_lo, c2, c4, c6} as one quadword (struct layout). */
+ float32x4_t log2lo_c246 = vld1q_f32 (&d->log2_lo);
+ float32x4_t r2 = vmulq_f32 (r, r);
+
+ /* Pairwise horner scheme. */
+ float32x4_t p56 = vfmaq_laneq_f32 (d->c5, r, log2lo_c246, 3);
+ float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log2lo_c246, 2);
+ float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log2lo_c246, 1);
+ float32x4_t p36 = vfmaq_f32 (p34, r2, p56);
+ float32x4_t p16 = vfmaq_f32 (p12, r2, p36);
+ float32x4_t poly
+ = vfmaq_laneq_f32 (vmulq_f32 (d->log2_hi, r), r, log2lo_c246, 0);
+ poly = vfmaq_f32 (poly, p16, r2);
+
+ /* exp2m1(x) = (scale - 1) + scale * poly. */
+ float32x4_t y = vfmaq_f32 (vsubq_f32 (scale, v_f32 (1.0f)), poly, scale);
+
+ if (__glibc_unlikely (v_any_u32 (cmp)))
+ return vbslq_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y);
+
+ return y;
+}
+libmvec_hidden_def (V_NAME_F1 (exp2m1))
+HALF_WIDTH_ALIAS_F1 (exp2m1)
diff --git a/sysdeps/aarch64/fpu/exp2m1f_sve.c b/sysdeps/aarch64/fpu/exp2m1f_sve.c
new file mode 100644
index 0000000..e6473f6
--- /dev/null
+++ b/sysdeps/aarch64/fpu/exp2m1f_sve.c
@@ -0,0 +1,108 @@
+/* Single-precision (SVE) exp2m1 function
+
+ Copyright (C) 2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "sv_math.h"
+
+/* Value of |x| above which scale overflows without special treatment. */
+#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+
+/* Value of n above which scale overflows even with special treatment. */
+#define ScaleBound 192.0f
+
+static const struct data
+{
+ uint32_t exponent_bias, special_offset, special_bias;
+ float32_t scale_thresh, special_bound;
+ /* log2_lo/c2/c4/c6 are contiguous so svld1rq (&log2_lo) loads them as
+ one quadword for lane-indexed MLAs. */
+ float log2_lo, c2, c4, c6;
+ float log2_hi, c1, c3, c5, shift;
+} data = {
+ /* Coefficients generated using remez's algorithm for exp2m1f(x). */
+ .log2_hi = 0x1.62e43p-1,
+ .log2_lo = -0x1.05c610p-29,
+ .c1 = 0x1.ebfbep-3,
+ .c2 = 0x1.c6b06ep-5,
+ .c3 = 0x1.3b2a5cp-7,
+ .c4 = 0x1.5da59ep-10,
+ .c5 = 0x1.440dccp-13,
+ .c6 = 0x1.e081d6p-17,
+ .exponent_bias = 0x3f800000,
+ .special_offset = 0x82000000,
+ .special_bias = 0x7f000000,
+ .scale_thresh = ScaleBound,
+ .special_bound = SpecialBound,
+};
+
+/* Handle lanes where 2^n would overflow/underflow the scale computation. */
+static svfloat32_t NOINLINE
+special_case (svfloat32_t poly, svfloat32_t n, svuint32_t e, svbool_t cmp1,
+ svfloat32_t scale, const struct data *d)
+{
+ /* 2^n may overflow, break it up into s1*s2, with s1 chosen per the sign
+ of n (the u32 sum special_offset + special_bias wraps intentionally). */
+ svbool_t b = svcmple (svptrue_b32 (), n, 0.0f);
+ svfloat32_t s1 = svreinterpret_f32 (
+ svsel (b, sv_u32 (d->special_offset + d->special_bias),
+ sv_u32 (d->special_bias)));
+ svfloat32_t s2
+ = svreinterpret_f32 (svsub_m (b, e, sv_u32 (d->special_offset)));
+ /* |n| > ScaleBound: s1*s1 forces the correctly-signed inf/zero. */
+ svbool_t cmp2 = svacgt (svptrue_b32 (), n, d->scale_thresh);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), s1, s1);
+ svfloat32_t r1
+ = svmul_x (svptrue_b32 (), svmla_x (svptrue_b32 (), s2, poly, s2), s1);
+ /* Similar to r1 but avoids double rounding in the subnormal range. */
+ svfloat32_t r0 = svmla_x (svptrue_b32 (), scale, poly, scale);
+ svfloat32_t r = svsel (cmp1, r1, r0);
+ /* Subtract 1 for the m1 variant. */
+ return svsub_x (svptrue_b32 (), svsel (cmp2, r2, r), 1.0f);
+}
+
+/* Single-precision vector exp2(x) - 1 function.
+ The maximum error is 1.76 + 0.5 ULP.
+ _ZGVsMxv_exp2m1f (0x1.018af8p-1) got 0x1.ab2ebcp-2
+ want 0x1.ab2ecp-2. */
+svfloat32_t SV_NAME_F1 (exp2m1) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+
+ /* exp2(x) = 2^n (1 + poly(r)), x = n + r with r in [-1/2, 1/2]. */
+ svfloat32_t n = svrinta_x (pg, x);
+ svfloat32_t r = svsub_x (pg, x, n);
+
+ /* scale = 2^n, built by shifting (int) n into the exponent field. */
+ svuint32_t e = svlsl_x (pg, svreinterpret_u32 (svcvt_s32_x (pg, n)), 23);
+ svfloat32_t scale
+ = svreinterpret_f32 (svadd_n_u32_x (pg, e, d->exponent_bias));
+
+ svbool_t cmp = svacgt_n_f32 (pg, n, d->special_bound);
+
+ svfloat32_t r2 = svmul_x (pg, r, r);
+
+ /* Loads {log2_lo, c2, c4, c6} as one quadword (struct layout). */
+ svfloat32_t log2lo_c246 = svld1rq (svptrue_b32 (), &d->log2_lo);
+ /* Pairwise Horner scheme. */
+ svfloat32_t p56 = svmla_lane (sv_f32 (d->c5), r, log2lo_c246, 3);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, log2lo_c246, 2);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, log2lo_c246, 1);
+
+ svfloat32_t p36 = svmla_x (pg, p34, p56, r2);
+ svfloat32_t p16 = svmla_x (pg, p12, p36, r2);
+
+ svfloat32_t poly = svmla_lane (
+ svmul_x (svptrue_b32 (), r, sv_f32 (d->log2_hi)), r, log2lo_c246, 0);
+ poly = svmla_x (pg, poly, p16, r2);
+
+ /* exp2m1(x) = (scale - 1) + scale * poly. */
+ svfloat32_t y = svmla_x (pg, svsub_x (pg, scale, 1.0f), poly, scale);
+
+ /* Fallback to special case for lanes with overflow. */
+ if (__glibc_unlikely (svptest_any (pg, cmp)))
+ return svsel_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y);
+
+ return y;
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index a3fef22..b8b4822 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -44,6 +44,8 @@ VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2)
VPCS_VECTOR_WRAPPER (expm1_advsimd, _ZGVnN2v_expm1)
+VPCS_VECTOR_WRAPPER (exp2m1_advsimd, _ZGVnN2v_exp2m1)
+VPCS_VECTOR_WRAPPER (exp10m1_advsimd, _ZGVnN2v_exp10m1)
VPCS_VECTOR_WRAPPER_ff (hypot_advsimd, _ZGVnN2vv_hypot)
VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index f4a5ae8..168b3f7 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -63,6 +63,8 @@ SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2)
SVE_VECTOR_WRAPPER (expm1_sve, _ZGVsMxv_expm1)
+SVE_VECTOR_WRAPPER (exp2m1_sve, _ZGVsMxv_exp2m1)
+SVE_VECTOR_WRAPPER (exp10m1_sve, _ZGVsMxv_exp10m1)
SVE_VECTOR_WRAPPER_ff (hypot_sve, _ZGVsMxvv_hypot)
SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index bc22956..c290073 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -44,6 +44,8 @@ VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f)
VPCS_VECTOR_WRAPPER (expm1f_advsimd, _ZGVnN4v_expm1f)
+VPCS_VECTOR_WRAPPER (exp2m1f_advsimd, _ZGVnN4v_exp2m1f)
+VPCS_VECTOR_WRAPPER (exp10m1f_advsimd, _ZGVnN4v_exp10m1f)
VPCS_VECTOR_WRAPPER_ff (hypotf_advsimd, _ZGVnN4vv_hypotf)
VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index ad0d6ad..fd01791 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -63,6 +63,8 @@ SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f)
SVE_VECTOR_WRAPPER (expm1f_sve, _ZGVsMxv_expm1f)
+SVE_VECTOR_WRAPPER (exp2m1f_sve, _ZGVsMxv_exp2m1f)
+SVE_VECTOR_WRAPPER (exp10m1f_sve, _ZGVsMxv_exp10m1f)
SVE_VECTOR_WRAPPER_ff (hypotf_sve, _ZGVsMxvv_hypotf)
SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)