diff options
-rw-r--r-- | sysdeps/aarch64/fpu/Makefile | 4 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/Versions | 5 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/advsimd_f32_protos.h | 1 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/bits/math-vector.h | 8 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/cosh_advsimd.c | 108 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/cosh_sve.c | 105 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/coshf_advsimd.c | 84 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/coshf_sve.c | 59 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/sv_expf_inline.h | 75 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c | 1 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/test-double-sve-wrappers.c | 1 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c | 1 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/v_exp_tail_data.c | 110 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/v_expf_inline.h | 71 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/vecmath_config.h | 2 | ||||
-rw-r--r-- | sysdeps/aarch64/libm-test-ulps | 8 | ||||
-rw-r--r-- | sysdeps/unix/sysv/linux/aarch64/libmvec.abilist | 5 |
18 files changed, 648 insertions, 1 deletions
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile index 320b6ed..019c3a5 100644 --- a/sysdeps/aarch64/fpu/Makefile +++ b/sysdeps/aarch64/fpu/Makefile @@ -3,6 +3,7 @@ libmvec-supported-funcs = acos \ atan \ atan2 \ cos \ + cosh \ erf \ exp \ exp10 \ @@ -32,7 +33,8 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \ erf_data \ erff_data \ sv_erf_data \ - sv_erff_data + sv_erff_data \ + v_exp_tail_data endif sve-cflags = -march=armv8-a+sve diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions index d7b1e87..884b4b5 100644 --- a/sysdeps/aarch64/fpu/Versions +++ b/sysdeps/aarch64/fpu/Versions @@ -79,6 +79,11 @@ libmvec { _ZGVsMxv_tan; } GLIBC_2.40 { + _ZGVnN2v_cosh; + _ZGVnN2v_coshf; + _ZGVnN4v_coshf; + _ZGVsMxv_cosh; + _ZGVsMxv_coshf; _ZGVnN2v_erf; _ZGVnN2v_erff; _ZGVnN4v_erff; diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h index d8d88de..c63b294 100644 --- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h +++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h @@ -21,6 +21,7 @@ libmvec_hidden_proto (V_NAME_F1(acos)); libmvec_hidden_proto (V_NAME_F1(asin)); libmvec_hidden_proto (V_NAME_F1(atan)); libmvec_hidden_proto (V_NAME_F1(cos)); +libmvec_hidden_proto (V_NAME_F1(cosh)); libmvec_hidden_proto (V_NAME_F1(erf)); libmvec_hidden_proto (V_NAME_F1(exp10)); libmvec_hidden_proto (V_NAME_F1(exp2)); diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h index 71f5336..8ca5509 100644 --- a/sysdeps/aarch64/fpu/bits/math-vector.h +++ b/sysdeps/aarch64/fpu/bits/math-vector.h @@ -49,6 +49,10 @@ # define __DECL_SIMD_cos __DECL_SIMD_aarch64 # undef __DECL_SIMD_cosf # define __DECL_SIMD_cosf __DECL_SIMD_aarch64 +# undef __DECL_SIMD_cosh +# define __DECL_SIMD_cosh __DECL_SIMD_aarch64 +# undef __DECL_SIMD_coshf +# define __DECL_SIMD_coshf __DECL_SIMD_aarch64 # undef __DECL_SIMD_erf # define __DECL_SIMD_erf __DECL_SIMD_aarch64 # undef __DECL_SIMD_erff @@ -124,6 +128,7 @@ __vpcs __f32x4_t _ZGVnN4v_acosf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_asinf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_atanf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_coshf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_erff (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t); @@ -141,6 +146,7 @@ __vpcs __f64x2_t _ZGVnN2v_acos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_asin (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_atan (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_cosh (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_erf (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t); @@ -163,6 +169,7 @@ __sv_f32_t _ZGVsMxv_acosf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_asinf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_atanf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_coshf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_erff (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t); @@ -180,6 +187,7 @@ __sv_f64_t _ZGVsMxv_acos (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_asin (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_atan (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_cosh (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_erf (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t); diff --git a/sysdeps/aarch64/fpu/cosh_advsimd.c b/sysdeps/aarch64/fpu/cosh_advsimd.c new file mode 100644 index 0000000..ec7b596 --- /dev/null +++ b/sysdeps/aarch64/fpu/cosh_advsimd.c @@ -0,0 +1,108 @@ +/* Double-precision vector (AdvSIMD) cosh function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_math.h" + +static const struct data +{ + float64x2_t poly[3]; + float64x2_t inv_ln2, ln2, shift, thres; + uint64x2_t index_mask, special_bound; +} data = { + .poly = { V2 (0x1.fffffffffffd4p-2), V2 (0x1.5555571d6b68cp-3), + V2 (0x1.5555576a59599p-5), }, + + .inv_ln2 = V2 (0x1.71547652b82fep8), /* N/ln2. */ + /* -ln2/N. */ + .ln2 = {-0x1.62e42fefa39efp-9, -0x1.abc9e3b39803f3p-64}, + .shift = V2 (0x1.8p+52), + .thres = V2 (704.0), + + .index_mask = V2 (0xff), + /* 0x1.6p9, above which exp overflows. */ + .special_bound = V2 (0x4086000000000000), +}; + +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +{ + return v_call_f64 (cosh, x, y, special); +} + +/* Helper for approximating exp(x). Copied from v_exp_tail, with no + special-case handling or tail. */ +static inline float64x2_t +exp_inline (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + /* n = round(x/(ln2/N)). */ + float64x2_t z = vfmaq_f64 (d->shift, x, d->inv_ln2); + uint64x2_t u = vreinterpretq_u64_f64 (z); + float64x2_t n = vsubq_f64 (z, d->shift); + + /* r = x - n*ln2/N. */ + float64x2_t r = vfmaq_laneq_f64 (x, n, d->ln2, 0); + r = vfmaq_laneq_f64 (r, n, d->ln2, 1); + + uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TAIL_TABLE_BITS); + uint64x2_t i = vandq_u64 (u, d->index_mask); + + /* y = tail + exp(r) - 1 ~= r + C1 r^2 + C2 r^3 + C3 r^4. */ + float64x2_t y = vfmaq_f64 (d->poly[1], d->poly[2], r); + y = vfmaq_f64 (d->poly[0], y, r); + y = vmulq_f64 (vfmaq_f64 (v_f64 (1), y, r), r); + + /* s = 2^(n/N). */ + u = v_lookup_u64 (__v_exp_tail_data, i); + float64x2_t s = vreinterpretq_f64_u64 (vaddq_u64 (u, e)); + + return vfmaq_f64 (s, y, s); +} + +/* Approximation for vector double-precision cosh(x) using exp_inline. + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the scalar fall-back region, so is the + same as the scalar routine, 1.93 ULP: + _ZGVnN2v_cosh (0x1.628af341989dap+9) got 0x1.fdf28623ef921p+1021 + want 0x1.fdf28623ef923p+1021. + + The greatest observed error in the non-special region is 1.54 ULP: + _ZGVnN2v_cosh (0x1.8e205b6ecacf7p+2) got 0x1.f711dcb0c77afp+7 + want 0x1.f711dcb0c77b1p+7. */ +float64x2_t VPCS_ATTR V_NAME_D1 (cosh) (float64x2_t x) +{ + const struct data *d = ptr_barrier (&data); + + float64x2_t ax = vabsq_f64 (x); + uint64x2_t special + = vcgtq_u64 (vreinterpretq_u64_f64 (ax), d->special_bound); + + /* Up to the point that exp overflows, we can use it to calculate cosh by + exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ + float64x2_t t = exp_inline (ax); + float64x2_t half_t = vmulq_n_f64 (t, 0.5); + float64x2_t half_over_t = vdivq_f64 (v_f64 (0.5), t); + + /* Fall back to scalar for any special cases. */ + if (__glibc_unlikely (v_any_u64 (special))) + return special_case (x, vaddq_f64 (half_t, half_over_t), special); + + return vaddq_f64 (half_t, half_over_t); +} diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c new file mode 100644 index 0000000..919f346 --- /dev/null +++ b/sysdeps/aarch64/fpu/cosh_sve.c @@ -0,0 +1,105 @@ +/* Double-precision vector (SVE) cosh function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" + +static const struct data +{ + float64_t poly[3]; + float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres; + uint64_t index_mask, special_bound; +} data = { + .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3, + 0x1.5555576a59599p-5, }, + + .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */ + /* -ln2/N. */ + .ln2_hi = -0x1.62e42fefa39efp-9, + .ln2_lo = -0x1.abc9e3b39803f3p-64, + .shift = 0x1.8p+52, + .thres = 704.0, + + .index_mask = 0xff, + /* 0x1.6p9, above which exp overflows. */ + .special_bound = 0x4086000000000000, +}; + +static svfloat64_t NOINLINE +special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +{ + return sv_call_f64 (cosh, x, y, special); +} + +/* Helper for approximating exp(x). Copied from sv_exp_tail, with no + special-case handling or tail. */ +static inline svfloat64_t +exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d) +{ + /* Calculate exp(x). */ + svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2); + svfloat64_t n = svsub_x (pg, z, d->shift); + + svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi); + r = svmla_x (pg, r, n, d->ln2_lo); + + svuint64_t u = svreinterpret_u64 (z); + svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS); + svuint64_t i = svand_x (pg, u, d->index_mask); + + svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]); + y = svmla_x (pg, sv_f64 (d->poly[0]), r, y); + y = svmla_x (pg, sv_f64 (1.0), r, y); + y = svmul_x (pg, r, y); + + /* s = 2^(n/N). */ + u = svld1_gather_index (pg, __v_exp_tail_data, i); + svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e)); + + return svmla_x (pg, s, s, y); +} + +/* Approximation for SVE double-precision cosh(x) using exp_inline. + cosh(x) = (exp(x) + exp(-x)) / 2. + The greatest observed error is in the scalar fall-back region, so is the + same as the scalar routine, 1.93 ULP: + _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021 + want 0x1.fd774e958236fp+1021. + + The greatest observed error in the non-special region is 1.54 ULP: + _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8 + want 0x1.f5e2bb8d5c991p+8. */ +svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat64_t ax = svabs_x (pg, x); + svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound); + + /* Up to the point that exp overflows, we can use it to calculate cosh by + exp(|x|) / 2 + 1 / (2 * exp(|x|)). */ + svfloat64_t t = exp_inline (ax, pg, d); + svfloat64_t half_t = svmul_x (pg, t, 0.5); + svfloat64_t half_over_t = svdivr_x (pg, t, 0.5); + + /* Fall back to scalar for any special cases. */ + if (__glibc_unlikely (svptest_any (pg, special))) + return special_case (x, svadd_x (pg, half_t, half_over_t), special); + + return svadd_x (pg, half_t, half_over_t); +} diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c new file mode 100644 index 0000000..c1ab492 --- /dev/null +++ b/sysdeps/aarch64/fpu/coshf_advsimd.c @@ -0,0 +1,84 @@ +/* Single-precision vector (AdvSIMD) cosh function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "v_expf_inline.h" +#include "v_math.h" + +static const struct data +{ + struct v_expf_data expf_consts; + uint32x4_t tiny_bound, special_bound; +} data = { + .expf_consts = V_EXPF_DATA, + .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */ + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ + .special_bound = V4 (0x42ad496c), +}; + +#if !WANT_SIMD_EXCEPT +static float32x4_t NOINLINE VPCS_ATTR +special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +{ + return v_call_f32 (coshf, x, y, special); +} +#endif + +/* Single-precision vector cosh, using vector expf. + Maximum error is 2.38 ULP: + _ZGVnN4v_coshf (0x1.e8001ep+1) got 0x1.6a491ep+4 + want 0x1.6a4922p+4. */ +float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) +{ + const struct data *d = ptr_barrier (&data); + + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t special = vcgeq_u32 (iax, d->special_bound); + +#if WANT_SIMD_EXCEPT + /* If fp exceptions are to be triggered correctly, fall back to the scalar + variant for all inputs if any input is a special value or above the bound + at which expf overflows. */ + if (__glibc_unlikely (v_any_u32 (special))) + return v_call_f32 (coshf, x, x, v_u32 (-1)); + + uint32x4_t tiny = vcleq_u32 (iax, d->tiny_bound); + /* If any input is tiny, avoid underflow exception by fixing tiny lanes of + input to 0, which will generate no exceptions. */ + if (__glibc_unlikely (v_any_u32 (tiny))) + ax = v_zerofy_f32 (ax, tiny); +#endif + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */ + float32x4_t t = v_expf_inline (ax, &d->expf_consts); + float32x4_t half_t = vmulq_n_f32 (t, 0.5); + float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t); + +#if WANT_SIMD_EXCEPT + if (__glibc_unlikely (v_any_u32 (tiny))) + return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t)); +#else + if (__glibc_unlikely (v_any_u32 (special))) + return special_case (x, vaddq_f32 (half_t, half_over_t), special); +#endif + + return vaddq_f32 (half_t, half_over_t); +} +libmvec_hidden_def (V_NAME_F1 (cosh)) +HALF_WIDTH_ALIAS_F1 (cosh) diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c new file mode 100644 index 0000000..e5d8a29 --- /dev/null +++ b/sysdeps/aarch64/fpu/coshf_sve.c @@ -0,0 +1,59 @@ +/* Single-precision vector (SVE) cosh function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "sv_math.h" +#include "sv_expf_inline.h" + +static const struct data +{ + struct sv_expf_data expf_consts; + uint32_t special_bound; +} data = { + .expf_consts = SV_EXPF_DATA, + /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ + .special_bound = 0x42ad496c, +}; + +static svfloat32_t NOINLINE +special_case (svfloat32_t x, svfloat32_t y, svbool_t pg) +{ + return sv_call_f32 (coshf, x, y, pg); +} + +/* Single-precision vector cosh, using vector expf. + Maximum error is 1.89 ULP: + _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127 + want 0x1.f00adcp+127. */ +svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg) +{ + const struct data *d = ptr_barrier (&data); + + svfloat32_t ax = svabs_x (pg, x); + svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound); + + /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */ + svfloat32_t t = expf_inline (ax, pg, &d->expf_consts); + svfloat32_t half_t = svmul_x (pg, t, 0.5); + svfloat32_t half_over_t = svdivr_x (pg, t, 0.5); + + if (__glibc_unlikely (svptest_any (pg, special))) + return special_case (x, svadd_x (pg, half_t, half_over_t), special); + + return svadd_x (pg, half_t, half_over_t); +} diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h new file mode 100644 index 0000000..23963b5 --- /dev/null +++ b/sysdeps/aarch64/fpu/sv_expf_inline.h @@ -0,0 +1,75 @@ +/* SVE helper for single-precision routines which depend on exp + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef AARCH64_FPU_SV_EXPF_INLINE_H +#define AARCH64_FPU_SV_EXPF_INLINE_H + +#include "sv_math.h" + +struct sv_expf_data +{ + float poly[5]; + float inv_ln2, ln2_hi, ln2_lo, shift; +}; + +/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for + compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */ +#define SV_EXPF_DATA \ + { \ + .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \ + 0x1.0e4020p-7f }, \ + \ + .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \ + .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \ + } + +#define C(i) sv_f32 (d->poly[i]) + +static inline svfloat32_t +expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d) +{ + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + + /* Load some constants in quad-word chunks to minimise memory access. */ + svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]); + + /* n = round(x/(ln2/N)). */ + svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1); + svfloat32_t n = svsub_x (pg, z, d->shift); + + /* r = x - n*ln2/N. */ + svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2); + r = svmls_lane (r, n, c4_invln2_and_ln2, 3); + + /* scale = 2^(n/N). */ + svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z)); + + /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */ + svfloat32_t p12 = svmla_x (pg, C (1), C (2), r); + svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0); + svfloat32_t r2 = svmul_f32_x (pg, r, r); + svfloat32_t p14 = svmla_x (pg, p12, p34, r2); + svfloat32_t p0 = svmul_f32_x (pg, r, C (0)); + svfloat32_t poly = svmla_x (pg, p0, r2, p14); + + return svmla_x (pg, scale, scale, poly); +} + +#endif diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c index 41fdb92..b37cb7d 100644 --- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c +++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c @@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (asin_advsimd, _ZGVnN2v_asin) VPCS_VECTOR_WRAPPER (atan_advsimd, _ZGVnN2v_atan) VPCS_VECTOR_WRAPPER_ff (atan2_advsimd, _ZGVnN2vv_atan2) VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos) +VPCS_VECTOR_WRAPPER (cosh_advsimd, _ZGVnN2v_cosh) VPCS_VECTOR_WRAPPER (erf_advsimd, _ZGVnN2v_erf) VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp) VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10) diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c index 8e3d64d..011f07d 100644 --- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c +++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c @@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (asin_sve, _ZGVsMxv_asin) SVE_VECTOR_WRAPPER (atan_sve, _ZGVsMxv_atan) SVE_VECTOR_WRAPPER_ff (atan2_sve, _ZGVsMxvv_atan2) SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos) +SVE_VECTOR_WRAPPER (cosh_sve, _ZGVsMxv_cosh) SVE_VECTOR_WRAPPER (erf_sve, _ZGVsMxv_erf) SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp) SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10) diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c index 33ae928..3545299 100644 --- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c +++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c @@ -28,6 +28,7 @@ VPCS_VECTOR_WRAPPER (asinf_advsimd, _ZGVnN4v_asinf) VPCS_VECTOR_WRAPPER (atanf_advsimd, _ZGVnN4v_atanf) VPCS_VECTOR_WRAPPER_ff (atan2f_advsimd, _ZGVnN4vv_atan2f) VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf) +VPCS_VECTOR_WRAPPER (coshf_advsimd, _ZGVnN4v_coshf) VPCS_VECTOR_WRAPPER (erff_advsimd, _ZGVnN4v_erff) VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf) VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f) diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c index ac0464f..bbc74ed 100644 --- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c +++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c @@ -47,6 +47,7 @@ SVE_VECTOR_WRAPPER (asinf_sve, _ZGVsMxv_asinf) SVE_VECTOR_WRAPPER (atanf_sve, _ZGVsMxv_atanf) SVE_VECTOR_WRAPPER_ff (atan2f_sve, _ZGVsMxvv_atan2f) SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf) +SVE_VECTOR_WRAPPER (coshf_sve, _ZGVsMxv_coshf) SVE_VECTOR_WRAPPER (erff_sve, _ZGVsMxv_erff) SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf) SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f) diff --git a/sysdeps/aarch64/fpu/v_exp_tail_data.c b/sysdeps/aarch64/fpu/v_exp_tail_data.c new file mode 100644 index 0000000..151e97c --- /dev/null +++ b/sysdeps/aarch64/fpu/v_exp_tail_data.c @@ -0,0 +1,110 @@ +/* Lookup table for high-precision exp(x, tail) function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "vecmath_config.h" + +/* 2^(j/N), j=0..N, N=2^8=256. */ +const uint64_t __v_exp_tail_data[] = { + 0x3ff0000000000000, 0x3feffb1afa5abcbf, 0x3feff63da9fb3335, + 0x3feff168143b0281, 0x3fefec9a3e778061, 0x3fefe7d42e11bbcc, + 0x3fefe315e86e7f85, 0x3fefde5f72f654b1, 0x3fefd9b0d3158574, + 0x3fefd50a0e3c1f89, 0x3fefd06b29ddf6de, 0x3fefcbd42b72a836, + 0x3fefc74518759bc8, 0x3fefc2bdf66607e0, 0x3fefbe3ecac6f383, + 0x3fefb9c79b1f3919, 0x3fefb5586cf9890f, 0x3fefb0f145e46c85, + 0x3fefac922b7247f7, 0x3fefa83b23395dec, 0x3fefa3ec32d3d1a2, + 0x3fef9fa55fdfa9c5, 0x3fef9b66affed31b, 0x3fef973028d7233e, + 0x3fef9301d0125b51, 0x3fef8edbab5e2ab6, 0x3fef8abdc06c31cc, + 0x3fef86a814f204ab, 0x3fef829aaea92de0, 0x3fef7e95934f312e, + 0x3fef7a98c8a58e51, 0x3fef76a45471c3c2, 0x3fef72b83c7d517b, + 0x3fef6ed48695bbc0, 0x3fef6af9388c8dea, 0x3fef672658375d2f, + 0x3fef635beb6fcb75, 0x3fef5f99f8138a1c, 0x3fef5be084045cd4, + 0x3fef582f95281c6b, 0x3fef54873168b9aa, 0x3fef50e75eb44027, + 0x3fef4d5022fcd91d, 0x3fef49c18438ce4d, 0x3fef463b88628cd6, + 0x3fef42be3578a819, 0x3fef3f49917ddc96, 0x3fef3bdda27912d1, + 0x3fef387a6e756238, 0x3fef351ffb82140a, 0x3fef31ce4fb2a63f, + 0x3fef2e85711ece75, 0x3fef2b4565e27cdd, 0x3fef280e341ddf29, + 0x3fef24dfe1f56381, 0x3fef21ba7591bb70, 0x3fef1e9df51fdee1, + 0x3fef1b8a66d10f13, 0x3fef187fd0dad990, 0x3fef157e39771b2f, + 0x3fef1285a6e4030b, 0x3fef0f961f641589, 0x3fef0cafa93e2f56, + 0x3fef09d24abd886b, 0x3fef06fe0a31b715, 0x3fef0432edeeb2fd, + 0x3fef0170fc4cd831, 0x3feefeb83ba8ea32, 0x3feefc08b26416ff, + 0x3feef96266e3fa2d, 0x3feef6c55f929ff1, 0x3feef431a2de883b, + 0x3feef1a7373aa9cb, 0x3feeef26231e754a, 0x3feeecae6d05d866, + 0x3feeea401b7140ef, 0x3feee7db34e59ff7, 0x3feee57fbfec6cf4, + 0x3feee32dc313a8e5, 0x3feee0e544ede173, 0x3feedea64c123422, + 0x3feedc70df1c5175, 0x3feeda4504ac801c, 0x3feed822c367a024, + 0x3feed60a21f72e2a, 0x3feed3fb2709468a, 0x3feed1f5d950a897, + 0x3feecffa3f84b9d4, 0x3feece086061892d, 0x3feecc2042a7d232, + 0x3feeca41ed1d0057, 0x3feec86d668b3237, 0x3feec6a2b5c13cd0, + 0x3feec4e1e192aed2, 0x3feec32af0d7d3de, 0x3feec17dea6db7d7, + 0x3feebfdad5362a27, 0x3feebe41b817c114, 0x3feebcb299fddd0d, + 0x3feebb2d81d8abff, 0x3feeb9b2769d2ca7, 0x3feeb8417f4531ee, + 0x3feeb6daa2cf6642, 0x3feeb57de83f4eef, 0x3feeb42b569d4f82, + 0x3feeb2e2f4f6ad27, 0x3feeb1a4ca5d920f, 0x3feeb070dde910d2, + 0x3feeaf4736b527da, 0x3feeae27dbe2c4cf, 0x3feead12d497c7fd, + 0x3feeac0827ff07cc, 0x3feeab07dd485429, 0x3feeaa11fba87a03, + 0x3feea9268a5946b7, 0x3feea84590998b93, 0x3feea76f15ad2148, + 0x3feea6a320dceb71, 0x3feea5e1b976dc09, 0x3feea52ae6cdf6f4, + 0x3feea47eb03a5585, 0x3feea3dd1d1929fd, 0x3feea34634ccc320, + 0x3feea2b9febc8fb7, 0x3feea23882552225, 0x3feea1c1c70833f6, + 0x3feea155d44ca973, 0x3feea0f4b19e9538, 0x3feea09e667f3bcd, + 0x3feea052fa75173e, 0x3feea012750bdabf, 0x3fee9fdcddd47645, + 0x3fee9fb23c651a2f, 0x3fee9f9298593ae5, 0x3fee9f7df9519484, + 0x3fee9f7466f42e87, 0x3fee9f75e8ec5f74, 0x3fee9f8286ead08a, + 0x3fee9f9a48a58174, 0x3fee9fbd35d7cbfd, 0x3fee9feb564267c9, + 0x3feea024b1ab6e09, 0x3feea0694fde5d3f, 0x3feea0b938ac1cf6, + 0x3feea11473eb0187, 0x3feea17b0976cfdb, 0x3feea1ed0130c132, + 0x3feea26a62ff86f0, 0x3feea2f336cf4e62, 0x3feea3878491c491, + 0x3feea427543e1a12, 0x3feea4d2add106d9, 0x3feea589994cce13, + 0x3feea64c1eb941f7, 0x3feea71a4623c7ad, 0x3feea7f4179f5b21, + 0x3feea8d99b4492ed, 0x3feea9cad931a436, 0x3feeaac7d98a6699, + 0x3feeabd0a478580f, 0x3feeace5422aa0db, 0x3feeae05bad61778, + 0x3feeaf3216b5448c, 0x3feeb06a5e0866d9, 0x3feeb1ae99157736, + 0x3feeb2fed0282c8a, 0x3feeb45b0b91ffc6, 0x3feeb5c353aa2fe2, + 0x3feeb737b0cdc5e5, 0x3feeb8b82b5f98e5, 0x3feeba44cbc8520f, + 0x3feebbdd9a7670b3, 0x3feebd829fde4e50, 0x3feebf33e47a22a2, + 0x3feec0f170ca07ba, 0x3feec2bb4d53fe0d, 0x3feec49182a3f090, + 0x3feec674194bb8d5, 0x3feec86319e32323, 0x3feeca5e8d07f29e, + 0x3feecc667b5de565, 0x3feece7aed8eb8bb, 0x3feed09bec4a2d33, + 0x3feed2c980460ad8, 0x3feed503b23e255d, 0x3feed74a8af46052, + 0x3feed99e1330b358, 0x3feedbfe53c12e59, 0x3feede6b5579fdbf, + 0x3feee0e521356eba, 0x3feee36bbfd3f37a, 0x3feee5ff3a3c2774, + 0x3feee89f995ad3ad, 0x3feeeb4ce622f2ff, 0x3feeee07298db666, + 0x3feef0ce6c9a8952, 0x3feef3a2b84f15fb, 0x3feef68415b749b1, + 0x3feef9728de5593a, 0x3feefc6e29f1c52a, 0x3feeff76f2fb5e47, + 0x3fef028cf22749e4, 0x3fef05b030a1064a, 0x3fef08e0b79a6f1f, + 0x3fef0c1e904bc1d2, 0x3fef0f69c3f3a207, 0x3fef12c25bd71e09, + 0x3fef16286141b33d, 0x3fef199bdd85529c, 0x3fef1d1cd9fa652c, + 0x3fef20ab5fffd07a, 0x3fef244778fafb22, 0x3fef27f12e57d14b, + 0x3fef2ba88988c933, 0x3fef2f6d9406e7b5, 0x3fef33405751c4db, + 0x3fef3720dcef9069, 0x3fef3b0f2e6d1675, 0x3fef3f0b555dc3fa, + 0x3fef43155b5bab74, 0x3fef472d4a07897c, 0x3fef4b532b08c968, + 0x3fef4f87080d89f2, 0x3fef53c8eacaa1d6, 0x3fef5818dcfba487, + 0x3fef5c76e862e6d3, 0x3fef60e316c98398, 0x3fef655d71ff6075, + 0x3fef69e603db3285, 0x3fef6e7cd63a8315, 0x3fef7321f301b460, + 0x3fef77d5641c0658, 0x3fef7c97337b9b5f, 0x3fef81676b197d17, + 0x3fef864614f5a129, 0x3fef8b333b16ee12, 0x3fef902ee78b3ff6, + 0x3fef953924676d76, 0x3fef9a51fbc74c83, 0x3fef9f7977cdb740, + 0x3fefa4afa2a490da, 0x3fefa9f4867cca6e, 0x3fefaf482d8e67f1, + 0x3fefb4aaa2188510, 0x3fefba1bee615a27, 0x3fefbf9c1cb6412a, + 0x3fefc52b376bba97, 0x3fefcac948dd7274, 0x3fefd0765b6e4540, + 0x3fefd632798844f8, 0x3fefdbfdad9cbe14, 0x3fefe1d802243c89, + 0x3fefe7c1819e90d8, 0x3fefedba3692d514, 0x3feff3c22b8f71f1, + 0x3feff9d96b2a23d9, +}; diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h new file mode 100644 index 0000000..a3b0e32 --- /dev/null +++ b/sysdeps/aarch64/fpu/v_expf_inline.h @@ -0,0 +1,71 @@ +/* Helper for single-precision AdvSIMD routines which depend on exp + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef AARCH64_FPU_V_EXPF_INLINE_H +#define AARCH64_FPU_V_EXPF_INLINE_H + +#include "v_math.h" + +struct v_expf_data +{ + float32x4_t poly[5]; + float32x4_t shift, invln2_and_ln2; +}; + +/* maxerr: 1.45358 +0.5 ulp. */ +#define V_EXPF_DATA \ + { \ + .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \ + V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \ + .shift = V4 (0x1.8p23f), \ + .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ + } + +#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */ +#define C(i) d->poly[i] + +static inline float32x4_t +v_expf_inline (float32x4_t x, const struct v_expf_data *d) +{ + /* Helper routine for calculating exp(x). + Copied from v_expf.c, with all special-case handling removed - the + calling routine should handle special values if required. */ + + /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + float32x4_t n, r, z; + z = vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0); + n = vsubq_f32 (z, d->shift); + r = vfmsq_laneq_f32 (x, n, d->invln2_and_ln2, 1); + r = vfmsq_laneq_f32 (r, n, d->invln2_and_ln2, 2); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); + + /* Custom order-4 Estrin avoids building high order monomial. */ + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p, q, poly; + p = vfmaq_f32 (C (1), C (0), r); + q = vfmaq_f32 (C (3), C (2), r); + q = vfmaq_f32 (q, p, r2); + p = vmulq_f32 (C (4), r); + poly = vfmaq_f32 (p, q, r2); + return vfmaq_f32 (scale, poly, scale); +} + +#endif diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h index 409c0c9..3f0b5f4 100644 --- a/sysdeps/aarch64/fpu/vecmath_config.h +++ b/sysdeps/aarch64/fpu/vecmath_config.h @@ -59,6 +59,8 @@ extern const struct v_log_data } table[1 << V_LOG_TABLE_BITS]; } __v_log_data attribute_hidden; +#define V_EXP_TAIL_TABLE_BITS 8 +extern const uint64_t __v_exp_tail_data[1 << V_EXP_TAIL_TABLE_BITS] attribute_hidden; #define V_EXP_TABLE_BITS 7 extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] attribute_hidden; diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps index ed4791e..97f4523 100644 --- a/sysdeps/aarch64/libm-test-ulps +++ b/sysdeps/aarch64/libm-test-ulps @@ -701,11 +701,19 @@ double: 2 float: 2 ldouble: 2 +Function: "cosh_advsimd": +double: 2 +float: 2 + Function: "cosh_downward": double: 3 float: 1 ldouble: 3 +Function: "cosh_sve": +double: 2 +float: 2 + Function: "cosh_towardzero": double: 3 float: 1 diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist index 6193518..f66da42 100644 --- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist +++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist @@ -73,8 +73,13 @@ GLIBC_2.39 _ZGVsMxv_tan F GLIBC_2.39 _ZGVsMxv_tanf F GLIBC_2.39 _ZGVsMxvv_atan2 F GLIBC_2.39 _ZGVsMxvv_atan2f F +GLIBC_2.40 _ZGVnN2v_cosh F +GLIBC_2.40 _ZGVnN2v_coshf F GLIBC_2.40 _ZGVnN2v_erf F GLIBC_2.40 _ZGVnN2v_erff F +GLIBC_2.40 _ZGVnN4v_coshf F GLIBC_2.40 _ZGVnN4v_erff F +GLIBC_2.40 _ZGVsMxv_cosh F +GLIBC_2.40 _ZGVsMxv_coshf F GLIBC_2.40 _ZGVsMxv_erf F GLIBC_2.40 _ZGVsMxv_erff F |