diff options
author | Hasaan Khan <Hasaan.Khan@arm.com> | 2025-08-20 11:27:23 +0000 |
---|---|---|
committer | Wilco Dijkstra <wilco.dijkstra@arm.com> | 2025-09-02 16:50:24 +0000 |
commit | 8ced7815fbff7bec9af2b7611a3478af27b57d13 (patch) | |
tree | 0d48a47ef256d42915e80d73f0ac01db00dfda0d /sysdeps/aarch64 | |
parent | 54bd776f991f1a228a6bb6d76bf542edd915a0e3 (diff) | |
download | glibc-master.zip glibc-master.tar.gz glibc-master.tar.bz2 |
Vector variants of the new C23 exp2m1 & exp10m1 routines.
Note: Benchmark inputs for exp2m1 & exp10m1 are identical to exp2 & exp10
respectively, this also includes the floating point variations.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Diffstat (limited to 'sysdeps/aarch64')
-rw-r--r-- | sysdeps/aarch64/fpu/Makefile | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/Versions | 12 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/advsimd_f32_protos.h | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/bits/math-vector.h | 16 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp10m1_advsimd.c | 202 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp10m1_sve.c | 184 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp10m1f_advsimd.c | 120 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp10m1f_sve.c | 122 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp2m1_advsimd.c | 194 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp2m1_sve.c | 197 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp2m1f_advsimd.c | 106 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/exp2m1f_sve.c | 108 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/test-double-sve-wrappers.c | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 2 |
16 files changed, 1273 insertions, 0 deletions
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile index 068c11c..1ba0459 100644 --- a/sysdeps/aarch64/fpu/Makefile +++ b/sysdeps/aarch64/fpu/Makefile @@ -19,6 +19,8 @@ libmvec-supported-funcs = acos \ exp10 \ exp2 \ expm1 \ + exp2m1 \ + exp10m1 \ hypot \ log \ log10 \ diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions index 2980cb7..21a29f9 100644 --- a/sysdeps/aarch64/fpu/Versions +++ b/sysdeps/aarch64/fpu/Versions @@ -179,4 +179,16 @@ libmvec { _ZGVsMxvv_atan2pi; _ZGVsMxvv_atan2pif; } + GLIBC_2.43 { + _ZGVnN2v_exp2m1; + _ZGVnN2v_exp2m1f; + _ZGVnN4v_exp2m1f; + _ZGVsMxv_exp2m1; + _ZGVsMxv_exp2m1f; + _ZGVnN2v_exp10m1; + _ZGVnN2v_exp10m1f; + _ZGVnN4v_exp10m1f; + _ZGVsMxv_exp10m1; + _ZGVsMxv_exp10m1f; + } } diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h index c202bda..1c3df31 100644 --- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h +++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h @@ -36,6 +36,8 @@ libmvec_hidden_proto (V_NAME_F1(exp10)); libmvec_hidden_proto (V_NAME_F1(exp2)); libmvec_hidden_proto (V_NAME_F1(exp)); libmvec_hidden_proto (V_NAME_F1(expm1)); +libmvec_hidden_proto (V_NAME_F1(exp2m1)); +libmvec_hidden_proto (V_NAME_F1(exp10m1)); libmvec_hidden_proto (V_NAME_F2(hypot)); libmvec_hidden_proto (V_NAME_F1(log10)); libmvec_hidden_proto (V_NAME_F1(log1p)); diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h index 77ae10d..2983640 100644 --- a/sysdeps/aarch64/fpu/bits/math-vector.h +++ b/sysdeps/aarch64/fpu/bits/math-vector.h @@ -113,6 +113,14 @@ # define __DECL_SIMD_expm1 __DECL_SIMD_aarch64 # undef __DECL_SIMD_expm1f # define __DECL_SIMD_expm1f __DECL_SIMD_aarch64 +# undef __DECL_SIMD_exp2m1 +# define __DECL_SIMD_exp2m1 __DECL_SIMD_aarch64 +# undef __DECL_SIMD_exp2m1f +# define __DECL_SIMD_exp2m1f __DECL_SIMD_aarch64 +# undef __DECL_SIMD_exp10m1 +# define __DECL_SIMD_exp10m1 __DECL_SIMD_aarch64 +# undef __DECL_SIMD_exp10m1f +# define 
__DECL_SIMD_exp10m1f __DECL_SIMD_aarch64 # undef __DECL_SIMD_hypot # define __DECL_SIMD_hypot __DECL_SIMD_aarch64 # undef __DECL_SIMD_hypotf @@ -212,6 +220,8 @@ __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_exp2m1f (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_exp10m1f (__f32x4_t); __vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t); @@ -247,6 +257,8 @@ __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_exp2m1 (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_exp10m1 (__f64x2_t); __vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t); @@ -287,6 +299,8 @@ __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_expm1f (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_exp2m1f (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_exp10m1f (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxvv_hypotf (__sv_f32_t, __sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t); @@ -322,6 +336,8 @@ __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_expm1 (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_exp2m1 (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_exp10m1 (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxvv_hypot (__sv_f64_t, __sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t); 
__sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t); diff --git a/sysdeps/aarch64/fpu/exp10m1_advsimd.c b/sysdeps/aarch64/fpu/exp10m1_advsimd.c new file mode 100644 index 0000000..765774a --- /dev/null +++ b/sysdeps/aarch64/fpu/exp10m1_advsimd.c @@ -0,0 +1,202 @@ +/* Double-precision (Advanced SIMD) exp10m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include "v_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */ + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 163840.0 /* 1280.0 * N. */ + +/* Value of |x| below which scale - 1 contributes produces large error. */ +#define TableBound 0x1.a308a4198c9d7p-4 /* log10(2) * 87/256. */ + +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) + +const static struct data +{ + double c6, c0; + double c2, c4; + double log2_10_hi, log2_10_lo; + float64x2_t c1, c3, c5; + float64x2_t log10_2, shift; + float64x2_t special_bound, scale_thresh; + uint64x2_t sm1_tbl_off, sm1_tbl_mask; + float64x2_t rnd2zero; + uint64_t scalem1[88]; +} data = { + /* Coefficients generated using Remez algorithm. 
*/ + .c0 = 0x1.26bb1bbb55516p1, + .c1 = V2 (0x1.53524c73cea69p1), + .c2 = 0x1.0470591b30b6bp1, + .c3 = V2 (0x1.2bd7609b57e25p0), + .c4 = 0x1.1517d41bddcbep-1, + .c5 = V2 (0x1.a9a12d6f0755cp-3), + .c6 = -0x1.76a8d3abd7025p9, + + /* Values of x which should round to the zeroth index of the exp10m1 table. */ + .rnd2zero = V2 (-0x1.34413509f79ffp-10), /* (2^-8)/log2(10). */ + .sm1_tbl_off = V2 (24), + .sm1_tbl_mask = V2 (0x3f), + + .log10_2 = V2 (0x1.a934f0979a371p8), /* N/log2(10). */ + .log2_10_hi = 0x1.34413509f79ffp-9, /* log2(10)/N. */ + .log2_10_lo = -0x1.9dc1da994fd21p-66, + .shift = V2 (0x1.8p+52), + .scale_thresh = V2 (ScaleBound), + .special_bound = V2 (SpecialBound), + + /* Table containing 2^x - 1, for 2^x values close to 1. + The table holds values of 2^(i/128) - 1, computed in + arbitrary precision. + The 1st half contains values associated to i=0..43. + The 2nd half contains values associated to i=-44..-1. */ + .scalem1 = { + 0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077, + 0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772, + 0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec, + 0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e, + 0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb, + 0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c, + 0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8, + 0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1, + 0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9, + 0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80, + 0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9, + 0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86, + 0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7, + 0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd, + 0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c, + 0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119, + 0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 
0xbfc7ce6612886a6d, + 0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e, + 0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b, + 0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a, + 0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7, + 0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d, + 0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7, + 0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074, + 0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4, + 0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76, + 0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268, + 0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16, + 0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4, + 0xbf761eea3847077b, + } +}; + +#define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +#define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */ +#define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */ + +static inline float64x2_t VPCS_ATTR +special_case (float64x2_t s, float64x2_t y, float64x2_t n, + const struct data *d) +{ + /* 2^(n/N) may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b)); + float64x2_t s2 = vreinterpretq_f64_u64 ( + vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b)); + uint64x2_t cmp = vcagtq_f64 (n, d->scale_thresh); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1); + return vsubq_f64 (vbslq_f64 (cmp, r1, r0), v_f64 (1.0)); +} + +static inline uint64x2_t +lookup_sbits (uint64x2_t i) +{ + return (uint64x2_t){ __v_exp_data[i[0] & IndexMask], + __v_exp_data[i[1] & IndexMask] }; +} + +static inline float64x2_t +lookup_sm1bits (float64x2_t x, uint64x2_t u, const struct data *d) +{ + /* Extract sign bit and use as offset into table. 
*/ + uint64x2_t is_neg = vcltq_f64 (x, d->rnd2zero); + uint64x2_t offset = vandq_u64 (is_neg, d->sm1_tbl_off); + uint64x2_t base_idx = vandq_u64 (u, d->sm1_tbl_mask); + uint64x2_t idx = vaddq_u64 (base_idx, offset); + + uint64x2_t sm1 = { d->scalem1[idx[0]], d->scalem1[idx[1]] }; + return vreinterpretq_f64_u64 (sm1); +} + +/* Fast vector implementation of exp10m1. + Maximum measured error is 2.53 + 0.5 ulp. + _ZGVnN2v_exp10m1 (0x1.347007ffed62cp-10) got 0x1.63955944d1d83p-9 + want 0x1.63955944d1d80p-9. */ +float64x2_t VPCS_ATTR V_NAME_D1 (exp10m1) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + uint64x2_t cmp = vcageq_f64 (x, d->special_bound); + + /* n = round(x/(log10(2)/N)). */ + float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + float64x2_t c24 = vld1q_f64 (&d->c2); + float64x2_t c60 = vld1q_f64 (&d->c6); + float64x2_t log_2_10 = vld1q_f64 (&d->log2_10_hi); + + /* r = x - n*log10(2)/N. */ + float64x2_t r = x; + r = vfmsq_laneq_f64 (r, n, log_2_10, 0); + r = vfmsq_laneq_f64 (r, n, log_2_10, 1); + + /* y = exp10(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */ + float64x2_t r2 = vmulq_f64 (r, r); + float64x2_t p12 = vfmaq_laneq_f64 (d->c1, r, c24, 0); + float64x2_t p34 = vfmaq_laneq_f64 (d->c3, r, c24, 1); + float64x2_t p56 = vfmaq_laneq_f64 (d->c5, r, c60, 0); + + float64x2_t p36 = vfmaq_f64 (p34, r2, p56); + float64x2_t p16 = vfmaq_f64 (p12, r2, p36); + + float64x2_t p0 = vmulq_laneq_f64 (r, c60, 1); + float64x2_t p = vfmaq_f64 (p0, r2, p16); + + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + + /* scale = 2^(n/N). */ + uint64x2_t scale_bits = lookup_sbits (u); + float64x2_t scale = vreinterpretq_f64_u64 (vaddq_u64 (scale_bits, e)); + float64x2_t scalem1 = vsubq_f64 (scale, v_f64 (1.0)); + + /* Use table to gather scalem1 for small values of x. 
*/ + uint64x2_t is_small = vcaltq_f64 (x, v_f64 (TableBound)); + if (v_any_u64 (is_small)) + scalem1 = vbslq_f64 (is_small, lookup_sm1bits (x, u, d), scalem1); + + /* Construct exp10m1 = (scale - 1) + scale * poly. */ + float64x2_t y = vfmaq_f64 (scalem1, scale, p); + + /* Fallback to special case for lanes with overflow. */ + if (__glibc_unlikely (v_any_u64 (cmp))) + return vbslq_f64 (cmp, special_case (scale, p, n, d), y); + + return y; +} diff --git a/sysdeps/aarch64/fpu/exp10m1_sve.c b/sysdeps/aarch64/fpu/exp10m1_sve.c new file mode 100644 index 0000000..f77bce5 --- /dev/null +++ b/sysdeps/aarch64/fpu/exp10m1_sve.c @@ -0,0 +1,184 @@ +/* Double-precision (SVE) exp10m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 0x1.33f4bedd4fa70p+8 /* log10(2^(1023 + 1/128)). */ + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 1280.0 + +/* Value of |x| below which scale - 1 contributes produces large error. */ +#define FexpaBound 0x1.2a9f2b61a7e2p-4 /* 31*(log10(2)/128). 
*/ + +static const struct data +{ + double log2_10_hi, log2_10_lo; + double c3, c5; + double c0, c1, c2, c4; + double shift, log10_2, special_bound; + uint64_t scalem1[32]; +} data = { + /* Coefficients generated using Remez algorithm. */ + .c0 = 0x1.26bb1bbb55516p1, + .c1 = 0x1.53524c73cea6ap1, + .c2 = 0x1.0470591d8bd2ep1, + .c3 = 0x1.2bd7609dfea43p0, + .c4 = 0x1.142d0d89058f1p-1, + .c5 = 0x1.a80cf2ddd513p-3, + + /* 1.5*2^46+1023. This value is further explained below. */ + .shift = 0x1.800000000ffc0p+46, + .log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */ + .log2_10_hi = 0x1.34413509f79ffp-2, /* log2(10). */ + .log2_10_lo = -0x1.9dc1da994fd21p-59, + .special_bound = SpecialBound, + + /* Table emulating FEXPA - 1, for values of FEXPA close to 1. + The table holds values of 2^(i/64) - 1, computed in arbitrary precision. + The 1st half contains values associated to i=0..+15. + The 2nd half contains values associated to i=0..-15. */ + .scalem1 = { + 0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, + 0x3fa0e8a30eb37901, 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, + 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, 0x3fb72b83c7d517ae, + 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2, + 0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, + 0x3fc6942d3720185a, 0x0000000000000000, 0xbfc331751ec3a814, + 0xbfc20224341286e4, 0xbfc0cf85bed0f8b7, 0xbfbf332113d56b1f, + 0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424, + 0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a, + 0xbfaafd11874c009e, 0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89, + 0xbf95f134923757f3, 0xbf860f9f985bc9f4, + }, +}; + +#define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +/* SpecialBias1 + SpecialBias1 = asuint(1.0). */ +#define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +#define SpecialBias2 0x3010000000000000 /* 0x1p-254. 
*/ + +static NOINLINE svfloat64_t +special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p, + svfloat64_t n) +{ + /* s=2^n may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + + /* If n<=0 then set b to 0x6, 0 otherwise. */ + svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */ + svuint64_t b + = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */ + + /* Set s1 to generate overflow depending on sign of exponent n, + ie. s1 = 0x70...0 - b. */ + svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1)); + /* Offset s to avoid overflow in final result if n is below threshold. + ie. s2 = as_u64 (s) - 0x3010...0 + b. */ + svfloat64_t s2 = svreinterpret_f64 ( + svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b)); + + /* |n| > 1280 => 2^(n) overflows. */ + svbool_t p_cmp = svacgt (pg, n, ScaleBound); + + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); + svfloat64_t r2 = svmla_x (pg, s2, s2, p); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); + + svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes. */ + return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0)); +} + +/* FEXPA based SVE exp10m1 algorithm. + Maximum measured error is 2.87 + 0.5 ULP: + _ZGVsMxv_exp10m1(0x1.64645f11e94c6p-4) got 0x1.c64d54eb7658dp-3 + want 0x1.c64d54eb7658ap-3. */ +svfloat64_t SV_NAME_D1 (exp10m1) (svfloat64_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + svbool_t special = svacgt (pg, x, d->special_bound); + + /* n = round(x/(log10(2)/N)). */ + svfloat64_t shift = sv_f64 (d->shift); + svfloat64_t z = svmla_x (pg, shift, x, d->log10_2); + svfloat64_t n = svsub_x (pg, z, shift); + + /* r = x - n*log10(2)/N. 
*/ + svfloat64_t log2_10 = svld1rq (svptrue_b64 (), &d->log2_10_hi); + svfloat64_t r = x; + r = svmls_lane (r, n, log2_10, 0); + r = svmls_lane (r, n, log2_10, 1); + + /* scale = 2^(n/N), computed using FEXPA. FEXPA does not propagate NaNs, so + for consistent NaN handling we have to manually propagate them. This + comes at significant performance cost. */ + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t scale = svexpa (u); + svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c3); + /* Approximate exp10(r) using polynomial. */ + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t p01 = svmla_x (pg, sv_f64 (d->c0), r, sv_f64 (d->c1)); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c24, 0); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r, c24, 1); + svfloat64_t p25 = svmla_x (pg, p23, p45, r2); + svfloat64_t p05 = svmla_x (pg, p01, p25, r2); + + svfloat64_t p = svmul_x (pg, p05, r); + + svfloat64_t scalem1 = svsub_x (pg, scale, 1.0); + + /* For small values, use a lookup table for a more accurate scalem1. */ + svbool_t is_small = svaclt (pg, x, FexpaBound); + if (svptest_any (pg, is_small)) + { + /* Use the low 4 bits of the input of FEXPA as index. */ + svuint64_t base_idx = svand_x (pg, u, 0xf); + + /* We can use the sign of x as a fifth bit to account for the asymmetry + of e^x around 0. */ + svuint64_t signBit + = svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4); + svuint64_t idx = svorr_x (pg, base_idx, signBit); + + /* Lookup values for scale - 1 for small x. */ + svfloat64_t lookup + = svreinterpret_f64 (svld1_gather_index (is_small, d->scalem1, idx)); + + /* Select the appropriate scale - 1 value based on x. */ + scalem1 = svsel (is_small, lookup, scalem1); + } + + svfloat64_t y = svmla_x (pg, scalem1, scale, p); + + /* FEXPA returns nan for large inputs so we special case those. 
*/ + if (__glibc_unlikely (svptest_any (pg, special))) + { + /* FEXPA zeroes the sign bit, however the sign is meaningful to the + special case function so needs to be copied. + e = sign bit of u << 46. */ + svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000); + /* Copy sign to scale. */ + scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale))); + return special_case (pg, y, scale, p, n); + } + + return y; +} diff --git a/sysdeps/aarch64/fpu/exp10m1f_advsimd.c b/sysdeps/aarch64/fpu/exp10m1f_advsimd.c new file mode 100644 index 0000000..db1333b --- /dev/null +++ b/sysdeps/aarch64/fpu/exp10m1f_advsimd.c @@ -0,0 +1,120 @@ +/* Single-precision (Advanced SIMD) exp10m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ + +/* Value of n above which scale overflows even with special treatment. 
*/ +#define ScaleBound 192.0f + +static const struct data +{ + float log10_2_high, log10_2_low; + float log10_lo, c2, c4, c6; + float32x4_t log10_hi, c1, c3, c5, c7, c8; + float32x4_t inv_log10_2, special_bound; + uint32x4_t exponent_bias, special_offset, special_bias; + float32x4_t scale_thresh; +} data = { + /* Coefficients generated using Remez algorithm with minimisation of relative + error. */ + .log10_hi = V4 (0x1.26bb1b8000000p+1), + .log10_lo = 0x1.daaa8b0000000p-26, + .c1 = V4 (0x1.53524ep1), + .c2 = 0x1.046fc8p1, + .c3 = V4 (0x1.2bd376p0), + .c4 = 0x1.156f8p-1, + .c5 = V4 (0x1.b28c0ep-3), + .c6 = -0x1.05e38ep-4, + .c7 = V4 (-0x1.c79f4ap-4), + .c8 = V4 (0x1.2d6f34p1), + .inv_log10_2 = V4 (0x1.a934fp+1), + .log10_2_high = 0x1.344136p-2, + .log10_2_low = 0x1.ec10cp-27, + .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), + .scale_thresh = V4 (ScaleBound), + .special_bound = V4 (SpecialBound), +}; + +static float32x4_t VPCS_ATTR NOINLINE +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vsubq_f32 (vbslq_f32 (cmp2, r2, r), v_f32 (1.0f)); +} + +/* Fast vector implementation of single-precision exp10m1. + Algorithm is accurate to 1.70 + 0.5 ULP. + _ZGVnN4v_exp10m1f(0x1.36f94cp-3) got 0x1.ac96acp-2 + want 0x1.ac96bp-2. 
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10m1) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */ + float32x4_t log10_2 = vld1q_f32 (&d->log10_2_high); + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2)); + float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2, 0); + r = vfmaq_laneq_f32 (r, n, log10_2, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23); + + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); + + /* Pairwise Horner scheme. */ + float32x4_t log10lo_c246 = vld1q_f32 (&d->log10_lo); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p78 = vfmaq_f32 (d->c7, r, d->c8); + float32x4_t p56 = vfmaq_laneq_f32 (d->c5, r, log10lo_c246, 3); + float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10lo_c246, 2); + float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10lo_c246, 1); + float32x4_t p58 = vfmaq_f32 (p56, r2, p78); + float32x4_t p36 = vfmaq_f32 (p34, r2, p58); + float32x4_t p16 = vfmaq_f32 (p12, r2, p36); + + float32x4_t poly + = vfmaq_laneq_f32 (vmulq_f32 (d->log10_hi, r), r, log10lo_c246, 0); + poly = vfmaq_f32 (poly, p16, r2); + + float32x4_t y = vfmaq_f32 (vsubq_f32 (scale, v_f32 (1.0f)), poly, scale); + + /* Fallback to special case for lanes with overflow. */ + if (__glibc_unlikely (v_any_u32 (cmp))) + return vbslq_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y); + + return y; +} +libmvec_hidden_def (V_NAME_F1 (exp10m1)) +HALF_WIDTH_ALIAS_F1 (exp10m1) diff --git a/sysdeps/aarch64/fpu/exp10m1f_sve.c b/sysdeps/aarch64/fpu/exp10m1f_sve.c new file mode 100644 index 0000000..afbd034 --- /dev/null +++ b/sysdeps/aarch64/fpu/exp10m1f_sve.c @@ -0,0 +1,122 @@ +/* Single-precision (SVE) exp10m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 192.0f + +static const struct data +{ + float log10_2_high, log10_2_low; + float log10_lo, c2, c4, c6, c8; + float32_t log10_hi, c1, c3, c5, c7; + float32_t inv_log10_2, special_bound; + uint32_t exponent_bias, special_offset, special_bias; + float32_t scale_thresh; +} data = { + /* Coefficients generated using Remez algorithm with minimisation of relative + error. 
*/ + .log10_hi = 0x1.26bb1b8000000p+1, + .log10_lo = 0x1.daaa8b0000000p-26, + .c1 = 0x1.53524ep1, + .c2 = 0x1.046fc8p1, + .c3 = 0x1.2bd376p0, + .c4 = 0x1.156f8p-1, + .c5 = 0x1.b28c0ep-3, + .c6 = -0x1.05e38ep-4, + .c7 = -0x1.c79f4ap-4, + .c8 = 0x1.2d6f34p1, + .inv_log10_2 = 0x1.a934fp+1, + .log10_2_high = 0x1.344136p-2, + .log10_2_low = 0x1.ec10cp-27, + .exponent_bias = 0x3f800000, + .special_offset = 0x82000000, + .special_bias = 0x7f000000, + .scale_thresh = ScaleBound, + .special_bound = SpecialBound, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t poly, svfloat32_t n, svuint32_t e, svbool_t cmp1, + svfloat32_t scale, const struct data *d) +{ + svbool_t b = svcmple (svptrue_b32 (), n, 0.0f); + svfloat32_t s1 = svreinterpret_f32 ( + svsel (b, sv_u32 (d->special_offset + d->special_bias), + sv_u32 (d->special_bias))); + svfloat32_t s2 + = svreinterpret_f32 (svsub_m (b, e, sv_u32 (d->special_offset))); + svbool_t cmp2 = svacgt (svptrue_b32 (), n, d->scale_thresh); + svfloat32_t r2 = svmul_x (svptrue_b32 (), s1, s1); + svfloat32_t r1 + = svmul_x (svptrue_b32 (), svmla_x (svptrue_b32 (), s2, poly, s2), s1); + svfloat32_t r0 = svmla_x (svptrue_b32 (), scale, poly, scale); + svfloat32_t r = svsel (cmp1, r1, r0); + return svsub_x (svptrue_b32 (), svsel (cmp2, r2, r), 1.0f); +} + +/* Fast vector implementation of single-precision exp10. + Algorithm is accurate to 1.68 + 0.5 ULP. + _ZGVnN4v_exp10m1f(0x1.3aeffep-3) got 0x1.b3139p-2 + want 0x1.b3138cp-2. */ +svfloat32_t SV_NAME_F1 (exp10m1) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), + with poly(r) in [1/sqrt(2), sqrt(2)] and + x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. 
*/ + svfloat32_t log10_2 = svld1rq (svptrue_b32 (), &d->log10_2_high); + svfloat32_t n = svrinta_x (pg, svmul_x (pg, x, d->inv_log10_2)); + svfloat32_t r = svmls_lane_f32 (x, n, log10_2, 0); + r = svmla_lane_f32 (r, n, log10_2, 1); + + svuint32_t e = svlsl_x (pg, svreinterpret_u32 (svcvt_s32_x (pg, n)), 23); + + svfloat32_t scale + = svreinterpret_f32 (svadd_n_u32_x (pg, e, d->exponent_bias)); + svbool_t cmp = svacgt_n_f32 (pg, n, d->special_bound); + + /* Pairwise Horner scheme. */ + svfloat32_t r2 = svmul_x (pg, r, r); + svfloat32_t c2468 = svld1rq (svptrue_b32 (), &d->c2); + svfloat32_t p78 = svmla_lane (sv_f32 (d->c7), r, c2468, 3); + svfloat32_t p56 = svmla_lane (sv_f32 (d->c5), r, c2468, 2); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, c2468, 1); + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, c2468, 0); + svfloat32_t p58 = svmla_x (pg, p56, r2, p78); + svfloat32_t p36 = svmla_x (pg, p34, r2, p58); + svfloat32_t p16 = svmla_x (pg, p12, r2, p36); + + svfloat32_t poly = svmla_n_f32_x (pg, svmul_x (pg, r, sv_f32 (d->log10_hi)), + r, d->log10_lo); + poly = svmla_x (pg, poly, p16, r2); + + svfloat32_t y = svmla_x (pg, svsub_x (pg, scale, 1.0f), poly, scale); + + /* Fallback to special case for lanes with overflow. */ + if (__glibc_unlikely (svptest_any (pg, cmp))) + return svsel_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y); + + return y; +} diff --git a/sysdeps/aarch64/fpu/exp2m1_advsimd.c b/sysdeps/aarch64/fpu/exp2m1_advsimd.c new file mode 100644 index 0000000..1eb6042 --- /dev/null +++ b/sysdeps/aarch64/fpu/exp2m1_advsimd.c @@ -0,0 +1,194 @@ +/* Double-precision (Advanced SIMD) exp2m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 1022.0 + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 1280.0 + +/* 87/256, value of x under which table lookup is used for 2^x-1. */ +#define TableBound 0x1.5bfffffffffffp-2 + +/* Number of bits for each value in the table. */ +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) + +static const struct data +{ + uint64x2_t exponent_bias, special_offset, special_bias, special_bias2, + sm1_tbl_off, sm1_tbl_mask; + float64x2_t scale_thresh, special_bound, shift, rnd2zero; + float64x2_t log2_hi, c1, c3, c5; + double log2_lo, c2, c4, c6; + uint64_t scalem1[88]; +} data = { + /* Coefficients generated using remez's algorithm for exp2m1(x). */ + .log2_hi = V2 (0x1.62e42fefa39efp-1), + .log2_lo = 0x1.abc9e3b39803f3p-56, + .c1 = V2 (0x1.ebfbdff82c58ep-3), + .c2 = 0x1.c6b08d71f5804p-5, + .c3 = V2 (0x1.3b2ab6fee7509p-7), + .c4 = 0x1.5d1d37eb33b15p-10, + .c5 = V2 (0x1.423f35f371d9ap-13), + .c6 = 0x1.e7d57ad9a5f93p-5, + .exponent_bias = V2 (0x3ff0000000000000), + .special_offset = V2 (0x6000000000000000), /* 0x1p513. */ + .special_bias = V2 (0x7000000000000000), /* 0x1p769. */ + .special_bias2 = V2 (0x3010000000000000), /* 0x1p-254. 
*/ + .scale_thresh = V2 (ScaleBound), + .special_bound = V2 (SpecialBound), + .shift = V2 (0x1.8p52 / N), + .rnd2zero = V2 (-0x1p-8), + .sm1_tbl_off = V2 (24), + .sm1_tbl_mask = V2 (0x3f), + + /* Table containing 2^x - 1, for 2^x values close to 1. + The table holds values of 2^(i/128) - 1, computed in + arbitrary precision. + The 1st half contains values associated to i=0..43. + The 2nd half contains values associated to i=-44..-1. */ + .scalem1 = { + 0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077, + 0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772, + 0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec, + 0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e, + 0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb, + 0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c, + 0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8, + 0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1, + 0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9, + 0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80, + 0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9, + 0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86, + 0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7, + 0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd, + 0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c, + 0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119, + 0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d, + 0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e, + 0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b, + 0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a, + 0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7, + 0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d, + 0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7, + 0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074, + 0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 
0xbfb2b0cfe1266bd4, + 0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76, + 0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268, + 0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16, + 0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4, + 0xbf761eea3847077b, + } +}; + +static inline uint64x2_t +lookup_sbits (uint64x2_t i) +{ + return (uint64x2_t){ __v_exp_data[i[0] & IndexMask], + __v_exp_data[i[1] & IndexMask] }; +} + +static inline float64x2_t +lookup_sm1bits (float64x2_t x, uint64x2_t u, const struct data *d) +{ + /* Extract sign bit and use as offset into table. */ + uint64x2_t is_neg = vcltq_f64 (x, d->rnd2zero); + uint64x2_t offset = vandq_u64 (is_neg, d->sm1_tbl_off); + uint64x2_t base_idx = vandq_u64 (u, d->sm1_tbl_mask); + uint64x2_t idx = vaddq_u64 (base_idx, offset); + + /* Fall back to table lookup for 2^x - 1, when x is close to zero to + avoid large errors. */ + uint64x2_t sm1 = { d->scalem1[idx[0]], d->scalem1[idx[1]] }; + return vreinterpretq_f64_u64 (sm1); +} + +static inline VPCS_ATTR float64x2_t +special_case (float64x2_t poly, float64x2_t n, uint64x2_t e, float64x2_t scale, + const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint64x2_t b = vandq_u64 (vclezq_f64 (n), d->special_offset); + float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (d->special_bias, b)); + float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 ( + vsubq_u64 (vreinterpretq_u64_f64 (scale), d->special_bias2), b)); + uint64x2_t cmp2 = vcagtq_f64 (n, d->scale_thresh); + float64x2_t r1 = vmulq_f64 (s1, s1); + float64x2_t r2 = vmulq_f64 (vfmaq_f64 (s2, poly, s2), s1); + /* Select r1 when |n| > scale_thresh (2^n overflows even when split), + else the s1*s2-based r2, then subtract 1 to form exp2m1. */ + return vsubq_f64 (vbslq_f64 (cmp2, r1, r2), v_f64 (1.0f)); +} + +/* Double-precision vector exp2(x) - 1 function. + The maximum error is 2.55 + 0.5 ULP. + _ZGVnN2v_exp2m1 (0x1.1113e87a035ap-8) got 0x1.7b1d06f0a7d36p-9 + want 0x1.7b1d06f0a7d33p-9. 
*/ +VPCS_ATTR float64x2_t V_NAME_D1 (exp2m1) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* exp2(x) = 2^n (1 + poly(r)) + x = n + r, with r in [-1/2N, 1/2N]. + n is a floating point number, multiple of 1/N. */ + float64x2_t z = vaddq_f64 (d->shift, x); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + /* Calculate scale, 2^n. */ + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS); + uint64x2_t scale_bits = lookup_sbits (u); + float64x2_t scale = vreinterpretq_f64_u64 (vaddq_u64 (scale_bits, e)); + + uint64x2_t cmp = vcagtq_f64 (x, d->special_bound); + + /* Pairwise Horner scheme. */ + float64x2_t r = vsubq_f64 (x, n); + float64x2_t r2 = vmulq_f64 (r, r); + + float64x2_t log2lo_c2 = vld1q_f64 (&d->log2_lo); + float64x2_t c4c6 = vld1q_f64 (&d->c4); + + float64x2_t p56 = vfmaq_laneq_f64 (d->c5, r, c4c6, 1); + float64x2_t p34 = vfmaq_laneq_f64 (d->c3, r, c4c6, 0); + float64x2_t p12 = vfmaq_laneq_f64 (d->c1, r, log2lo_c2, 1); + float64x2_t p36 = vfmaq_f64 (p34, r2, p56); + float64x2_t p16 = vfmaq_f64 (p12, r2, p36); + float64x2_t poly + = vfmaq_laneq_f64 (vmulq_f64 (d->log2_hi, r), r, log2lo_c2, 0); + poly = vfmaq_f64 (poly, p16, r2); + + float64x2_t scalem1 = vsubq_f64 (scale, v_f64 (1.0)); + + /* Use table to gather scalem1 for small values of x. */ + uint64x2_t is_small = vcaltq_f64 (x, v_f64 (TableBound)); + if (v_any_u64 (is_small)) + scalem1 = vbslq_f64 (is_small, lookup_sm1bits (x, u, d), scalem1); + + /* Construct exp2m1 = (scale - 1) + scale * poly. */ + float64x2_t y = vfmaq_f64 (scalem1, poly, scale); + + /* Fallback to special case for lanes with overflow. 
*/ + if (__glibc_unlikely (v_any_u64 (cmp))) + return vbslq_f64 (cmp, special_case (poly, n, e, scale, d), y); + + return y; +} diff --git a/sysdeps/aarch64/fpu/exp2m1_sve.c b/sysdeps/aarch64/fpu/exp2m1_sve.c new file mode 100644 index 0000000..24e00f8 --- /dev/null +++ b/sysdeps/aarch64/fpu/exp2m1_sve.c @@ -0,0 +1,197 @@ +/* Double-precision (SVE) exp2m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 1022.0 + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 1280.0 + +/* 87/256, value of x under which table lookup is used for 2^x-1. */ +#define TableBound 0x1.5bfffffffffffp-2 + +/* Number of entries in the exponent lookup table. */ +#define N (1 << V_EXP_TABLE_BITS) +#define IndexMask (N - 1) + +static const struct data +{ + double shift, rnd2zero; + double c1, c3, c5; + double c0, c2, c4; + uint64_t sm1_tbl_mask, sm1_tbl_off; + uint64_t scalem1[88]; +} data = { + /* Generated using fpminimax. 
*/ + .c0 = 0x1.62e42fefa39efp-1, + .c1 = 0x1.ebfbdff82c58ep-3, + .c2 = 0x1.c6b08d707e662p-5, + .c3 = 0x1.3b2ab6fc45f33p-7, + .c4 = 0x1.5d86c0ff8618dp-10, + .c5 = 0x1.4301374d5d2f5p-13, + .shift = 0x1.8p52 / N, + .sm1_tbl_mask = 0x3f, + .sm1_tbl_off = 24, + .rnd2zero = -0x1p-8, + /* Table containing 2^x - 1, for 2^x values close to 1. + The table holds values of 2^(i/128) - 1, computed in + arbitrary precision. + The 1st half contains values associated to i=0..43. + The 2nd half contains values associated to i=-44..-1. */ + .scalem1= { + 0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077, + 0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772, + 0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec, + 0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e, + 0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb, + 0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c, + 0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8, + 0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1, + 0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9, + 0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80, + 0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9, + 0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86, + 0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7, + 0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd, + 0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c, + 0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119, + 0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d, + 0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e, + 0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b, + 0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a, + 0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7, + 0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d, + 0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7, + 0xbfb907a55511e032, 
0xbfb7c695afc3b424, 0xbfb683c7bf93b074, + 0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4, + 0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76, + 0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268, + 0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16, + 0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4, + 0xbf761eea3847077b, + } +}; + +static inline svuint64_t +lookup_sbits (svbool_t pg, svuint64_t indices) +{ + /* Mask indices to valid range. */ + svuint64_t masked = svand_z (pg, indices, svdup_n_u64 (IndexMask)); + return svld1_gather_index (pg, __v_exp_data, masked); +} + +static inline svfloat64_t +lookup_sm1bits (svbool_t pg, svfloat64_t x, svuint64_t u, const struct data *d) +{ + /* Extract sign bit and use as offset into table. */ + svbool_t signBit = svcmplt_f64 (pg, x, sv_f64 (d->rnd2zero)); + svuint64_t base_idx = svand_x (pg, u, d->sm1_tbl_mask); + svuint64_t idx = svadd_m (signBit, base_idx, sv_u64 (d->sm1_tbl_off)); + + /* Fall back to table lookup for 2^x - 1, when x is close to zero to + avoid large errors. */ + svuint64_t sm1 = svld1_gather_index (svptrue_b64 (), d->scalem1, idx); + return svreinterpret_f64 (sm1); +} + +#define SpecialOffset 0x6000000000000000 /* 0x1p513. */ +/* SpecialBias1 - SpecialBias2 = asuint(1.0). */ +#define SpecialBias1 0x7000000000000000 /* 0x1p769. */ +#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */ + +static inline svfloat64_t +special_case (svfloat64_t s, svfloat64_t y, svfloat64_t n, + const struct data *d) +{ + /* s=2^n may overflow, break it up into s=s1*s2, + such that exp = s + s*y can be computed as s1*(s2+s2*y) + and s1*s1 overflows only if n>0. */ + svbool_t p_sign = svcmple (svptrue_b64 (), n, 0.0); /* n <= 0. */ + svuint64_t b = svdup_u64_z (p_sign, SpecialOffset); + + /* Set s1 to generate overflow depending on sign of exponent n. 
*/ + svfloat64_t s1 + = svreinterpret_f64 (svsubr_x (svptrue_b64 (), b, SpecialBias1)); + /* Offset s to avoid overflow in final result if n is below threshold. */ + svfloat64_t s2 = svreinterpret_f64 (svadd_x ( + svptrue_b64 (), + svsub_x (svptrue_b64 (), svreinterpret_u64 (s), SpecialBias2), b)); + + /* |n| > 1280 => 2^(n) overflows. */ + svbool_t p_cmp = svacle (svptrue_b64 (), n, ScaleBound); + + svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1); + svfloat64_t r2 = svmla_x (svptrue_b64 (), s2, s2, y); + svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1); + + return svsub_x (svptrue_b64 (), svsel (p_cmp, r0, r1), 1.0); +} + +/* Double-precision SVE exp2(x) - 1. + Maximum error is 2.58 + 0.5 ULP. + _ZGVsMxv_exp2m1(0x1.0284a345c99bfp-8) got 0x1.66df630cd2965p-9 + want 0x1.66df630cd2962p-9. */ +svfloat64_t SV_NAME_D1 (exp2m1) (svfloat64_t x, svbool_t pg) +{ + /* exp2(x) = 2^n (1 + poly(r)) + x = n + r, with r in [-1/2N, 1/2N]. + n is a floating point number, multiple of 1/N. */ + const struct data *d = ptr_barrier (&data); + svbool_t special = svacge (pg, x, SpecialBound); + + svfloat64_t z = svadd_x (pg, x, d->shift); + svuint64_t u = svreinterpret_u64 (z); + svfloat64_t n = svsub_x (pg, z, d->shift); + + svfloat64_t r = svsub_x (svptrue_b64 (), x, n); + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + + /* Look up table to calculate 2^n. */ + svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TABLE_BITS); + svuint64_t scale_bits = lookup_sbits (pg, u); + svfloat64_t scale = svreinterpret_f64_u64 (svadd_x (pg, scale_bits, e)); + + /* Pairwise Horner scheme. 
*/ + svfloat64_t c35 = svld1rq (svptrue_b64 (), &d->c3); + + svfloat64_t p01 = svmla_x (pg, sv_f64 (d->c0), r, d->c1); + svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c35, 0); + svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r, c35, 1); + + svfloat64_t p25 = svmla_x (pg, p23, r2, p45); + svfloat64_t p05 = svmla_x (pg, p01, r2, p25); + svfloat64_t poly = svmul_x (pg, p05, r); + + /* Use table to gather scalem1 for small values of x. */ + svbool_t is_small = svaclt (pg, x, TableBound); + svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0)); + if (svptest_any (pg, is_small)) + scalem1 = svsel_f64 (is_small, lookup_sm1bits (pg, x, u, d), scalem1); + + /* Construct exp2m1 = (scale - 1) + scale * poly. */ + svfloat64_t y = svmla_x (pg, scalem1, scale, poly); + + /* Fallback to special case for lanes with overflow. */ + if (__glibc_unlikely (svptest_any (pg, special))) + return svsel_f64 (special, special_case (scale, poly, n, d), y); + + return y; +} diff --git a/sysdeps/aarch64/fpu/exp2m1f_advsimd.c b/sysdeps/aarch64/fpu/exp2m1f_advsimd.c new file mode 100644 index 0000000..b42f13e --- /dev/null +++ b/sysdeps/aarch64/fpu/exp2m1f_advsimd.c @@ -0,0 +1,106 @@ +/* Single-precision (Advanced SIMD) exp2m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "v_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 192.0f + +static const struct data +{ + uint32x4_t exponent_bias, special_offset, special_bias; + float32x4_t scale_thresh, special_bound; + float32x4_t log2_hi, c1, c3, c5; + float log2_lo, c2, c4, c6; +} data = { + /* Coefficients generated using remez's algorithm for exp2m1f(x). */ + .log2_hi = V4 (0x1.62e43p-1), + .log2_lo = -0x1.05c610p-29, + .c1 = V4 (0x1.ebfbep-3), + .c2 = 0x1.c6b06ep-5, + .c3 = V4 (0x1.3b2a5cp-7), + .c4 = 0x1.5da59ep-10, + .c5 = V4 (0x1.440dccp-13), + .c6 = 0x1.e081d6p-17, + .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), + .scale_thresh = V4 (ScaleBound), + .special_bound = V4 (SpecialBound), +}; + +static float32x4_t VPCS_ATTR +special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, + float32x4_t scale, const struct data *d) +{ + /* 2^n may overflow, break it up into s1*s2. */ + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); + float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); + uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); + float32x4_t r2 = vmulq_f32 (s1, s1); + float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); + /* Similar to r1 but avoids double rounding in the subnormal range. */ + float32x4_t r0 = vfmaq_f32 (scale, poly, scale); + float32x4_t r = vbslq_f32 (cmp1, r1, r0); + return vsubq_f32 (vbslq_f32 (cmp2, r2, r), v_f32 (1.0f)); +} + +/* Single-precision vector exp2(x) - 1 function. + The maximum error is 1.76 + 0.5 ULP. + _ZGVnN4v_exp2m1f (0x1.018af8p-1) got 0x1.ab2ebcp-2 + want 0x1.ab2ecp-2. 
*/ +float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2m1) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ + float32x4_t n = vrndaq_f32 (x); + float32x4_t r = vsubq_f32 (x, n); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); + + float32x4_t log2lo_c246 = vld1q_f32 (&d->log2_lo); + float32x4_t r2 = vmulq_f32 (r, r); + + /* Pairwise horner scheme. */ + float32x4_t p56 = vfmaq_laneq_f32 (d->c5, r, log2lo_c246, 3); + float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log2lo_c246, 2); + float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log2lo_c246, 1); + float32x4_t p36 = vfmaq_f32 (p34, r2, p56); + float32x4_t p16 = vfmaq_f32 (p12, r2, p36); + float32x4_t poly + = vfmaq_laneq_f32 (vmulq_f32 (d->log2_hi, r), r, log2lo_c246, 0); + poly = vfmaq_f32 (poly, p16, r2); + + float32x4_t y = vfmaq_f32 (vsubq_f32 (scale, v_f32 (1.0f)), poly, scale); + + if (__glibc_unlikely (v_any_u32 (cmp))) + return vbslq_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y); + + return y; +} +libmvec_hidden_def (V_NAME_F1 (exp2m1)) +HALF_WIDTH_ALIAS_F1 (exp2m1) diff --git a/sysdeps/aarch64/fpu/exp2m1f_sve.c b/sysdeps/aarch64/fpu/exp2m1f_sve.c new file mode 100644 index 0000000..e6473f6 --- /dev/null +++ b/sysdeps/aarch64/fpu/exp2m1f_sve.c @@ -0,0 +1,108 @@ +/* Single-precision (SVE) exp2m1 function + + Copyright (C) 2025 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +/* Value of |x| above which scale overflows without special treatment. */ +#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ + +/* Value of n above which scale overflows even with special treatment. */ +#define ScaleBound 192.0f + +static const struct data +{ + uint32_t exponent_bias, special_offset, special_bias; + float32_t scale_thresh, special_bound; + float log2_lo, c2, c4, c6; + float log2_hi, c1, c3, c5, shift; +} data = { + /* Coefficients generated using remez's algorithm for exp2m1f(x). */ + .log2_hi = 0x1.62e43p-1, + .log2_lo = -0x1.05c610p-29, + .c1 = 0x1.ebfbep-3, + .c2 = 0x1.c6b06ep-5, + .c3 = 0x1.3b2a5cp-7, + .c4 = 0x1.5da59ep-10, + .c5 = 0x1.440dccp-13, + .c6 = 0x1.e081d6p-17, + .exponent_bias = 0x3f800000, + .special_offset = 0x82000000, + .special_bias = 0x7f000000, + .scale_thresh = ScaleBound, + .special_bound = SpecialBound, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t poly, svfloat32_t n, svuint32_t e, svbool_t cmp1, + svfloat32_t scale, const struct data *d) +{ + svbool_t b = svcmple (svptrue_b32 (), n, 0.0f); + svfloat32_t s1 = svreinterpret_f32 ( + svsel (b, sv_u32 (d->special_offset + d->special_bias), + sv_u32 (d->special_bias))); + svfloat32_t s2 + = svreinterpret_f32 (svsub_m (b, e, sv_u32 (d->special_offset))); + svbool_t cmp2 = svacgt (svptrue_b32 (), n, d->scale_thresh); + svfloat32_t r2 = svmul_x (svptrue_b32 (), s1, s1); + svfloat32_t r1 + = svmul_x (svptrue_b32 (), svmla_x (svptrue_b32 (), s2, poly, s2), s1); + svfloat32_t r0 = svmla_x (svptrue_b32 (), scale, 
poly, scale); + svfloat32_t r = svsel (cmp1, r1, r0); + return svsub_x (svptrue_b32 (), svsel (cmp2, r2, r), 1.0f); +} + +/* Single-precision vector exp2(x) - 1 function. + The maximum error is 1.76 + 0.5 ULP. + _ZGVsMxv_exp2m1f (0x1.018af8p-1) got 0x1.ab2ebcp-2 + want 0x1.ab2ecp-2. */ +svfloat32_t SV_NAME_F1 (exp2m1) (svfloat32_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t n = svrinta_x (pg, x); + svfloat32_t r = svsub_x (pg, x, n); + + svuint32_t e = svlsl_x (pg, svreinterpret_u32 (svcvt_s32_x (pg, n)), 23); + svfloat32_t scale + = svreinterpret_f32 (svadd_n_u32_x (pg, e, d->exponent_bias)); + + svbool_t cmp = svacgt_n_f32 (pg, n, d->special_bound); + + svfloat32_t r2 = svmul_x (pg, r, r); + + svfloat32_t log2lo_c246 = svld1rq (svptrue_b32 (), &d->log2_lo); + svfloat32_t p56 = svmla_lane (sv_f32 (d->c5), r, log2lo_c246, 3); + svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, log2lo_c246, 2); + svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, log2lo_c246, 1); + + svfloat32_t p36 = svmla_x (pg, p34, p56, r2); + svfloat32_t p16 = svmla_x (pg, p12, p36, r2); + + svfloat32_t poly = svmla_lane ( + svmul_x (svptrue_b32 (), r, sv_f32 (d->log2_hi)), r, log2lo_c246, 0); + poly = svmla_x (pg, poly, p16, r2); + + svfloat32_t y = svmla_x (pg, svsub_x (pg, scale, 1.0f), poly, scale); + + /* Fallback to special case for lanes with overflow. 
*/ + if (__glibc_unlikely (svptest_any (pg, cmp))) + return svsel_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y); + + return y; +} diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c index a3fef22..b8b4822 100644 --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c @@ -44,6 +44,8 @@ VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp) VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10) VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2) VPCS_VECTOR_WRAPPER (expm1_advsimd, _ZGVnN2v_expm1) +VPCS_VECTOR_WRAPPER (exp2m1_advsimd, _ZGVnN2v_exp2m1) +VPCS_VECTOR_WRAPPER (exp10m1_advsimd, _ZGVnN2v_exp10m1) VPCS_VECTOR_WRAPPER_ff (hypot_advsimd, _ZGVnN2vv_hypot) VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log) VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10) diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c index f4a5ae8..168b3f7 100644 --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c +++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c @@ -63,6 +63,8 @@ SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp) SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10) SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2) SVE_VECTOR_WRAPPER (expm1_sve, _ZGVsMxv_expm1) +SVE_VECTOR_WRAPPER (exp2m1_sve, _ZGVsMxv_exp2m1) +SVE_VECTOR_WRAPPER (exp10m1_sve, _ZGVsMxv_exp10m1) SVE_VECTOR_WRAPPER_ff (hypot_sve, _ZGVsMxvv_hypot) SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log) SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10) diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c index bc22956..c290073 100644 --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c @@ -44,6 +44,8 @@ VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf) VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f) VPCS_VECTOR_WRAPPER (exp2f_advsimd, 
_ZGVnN4v_exp2f) VPCS_VECTOR_WRAPPER (expm1f_advsimd, _ZGVnN4v_expm1f) +VPCS_VECTOR_WRAPPER (exp2m1f_advsimd, _ZGVnN4v_exp2m1f) +VPCS_VECTOR_WRAPPER (exp10m1f_advsimd, _ZGVnN4v_exp10m1f) VPCS_VECTOR_WRAPPER_ff (hypotf_advsimd, _ZGVnN4vv_hypotf) VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf) VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f) diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c index ad0d6ad..fd01791 100644 --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c @@ -63,6 +63,8 @@ SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf) SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f) SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f) SVE_VECTOR_WRAPPER (expm1f_sve, _ZGVsMxv_expm1f) +SVE_VECTOR_WRAPPER (exp2m1f_sve, _ZGVsMxv_exp2m1f) +SVE_VECTOR_WRAPPER (exp10m1f_sve, _ZGVsMxv_exp10m1f) SVE_VECTOR_WRAPPER_ff (hypotf_sve, _ZGVsMxvv_hypotf) SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf) SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f) |