diff options
author | Pierre Blanchard <pierre.blanchard@arm.com> | 2024-12-09 15:58:47 +0000 |
---|---|---|
committer | Wilco Dijkstra <wilco.dijkstra@arm.com> | 2024-12-09 16:20:34 +0000 |
commit | 13a7ef5999de56add448a24fefb0250236271a06 (patch) | |
tree | 0edc92054042abe4034ab64b5b4ecedf534124ed /sysdeps | |
parent | ca0c0d0f26fbf75b9cacc65122b457e8fdec40b8 (diff) | |
download | glibc-13a7ef5999de56add448a24fefb0250236271a06.zip glibc-13a7ef5999de56add448a24fefb0250236271a06.tar.gz glibc-13a7ef5999de56add448a24fefb0250236271a06.tar.bz2 |
AArch64: Improve codegen in users of ADVSIMD expm1 helper
Add inline helper for expm1 and rearrange operations so MOV
is not necessary in reduction or around the special-case handler.
Reduce memory access by using more indexed MLAs in polynomial.
Speedup on Neoverse V1 for expm1 (19%), sinh (8.5%), and tanh (7.5%).
Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/aarch64/fpu/expm1_advsimd.c | 68 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/sinh_advsimd.c | 69 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/tanh_advsimd.c | 62 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/v_expm1_inline.h | 97 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/v_expm1f_inline.h | 1 |
5 files changed, 135 insertions, 162 deletions
diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c index 3db3b80..f2042db 100644 --- a/sysdeps/aarch64/fpu/expm1_advsimd.c +++ b/sysdeps/aarch64/fpu/expm1_advsimd.c @@ -18,31 +18,18 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" +#include "v_expm1_inline.h" static const struct data { - float64x2_t poly[11]; - float64x2_t invln2; - double ln2[2]; - float64x2_t shift; - int64x2_t exponent_bias; + struct v_expm1_data d; #if WANT_SIMD_EXCEPT uint64x2_t thresh, tiny_bound; #else float64x2_t oflow_bound; #endif } data = { - /* Generated using fpminimax, with degree=12 in [log(2)/2, log(2)/2]. */ - .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), - V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), - V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), - V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), - V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) }, - .invln2 = V2 (0x1.71547652b82fep0), - .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, - .shift = V2 (0x1.8p52), - .exponent_bias = V2 (0x3ff0000000000000), + .d = V_EXPM1_DATA, #if WANT_SIMD_EXCEPT /* asuint64(oflow_bound) - asuint64(0x1p-51), shifted left by 1 for abs compare. */ @@ -58,67 +45,36 @@ static const struct data }; static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +special_case (float64x2_t x, uint64x2_t special, const struct data *d) { - return v_call_f64 (expm1, x, y, special); + return v_call_f64 (expm1, x, expm1_inline (v_zerofy_f64 (x, special), &d->d), + special); } /* Double-precision vector exp(x) - 1 function. - The maximum error observed error is 2.18 ULP: - _ZGVnN2v_expm1 (0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2 - want 0x1.a8b9ea8d66e2p-2. */ + The maximum error observed error is 2.05 ULP: + _ZGVnN2v_expm1(0x1.634902eaff3adp-2) got 0x1.a8b636e2a9388p-2 + want 0x1.a8b636e2a9386p-2. */ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x) { const struct data *d = ptr_barrier (&data); - uint64x2_t ix = vreinterpretq_u64_f64 (x); - #if WANT_SIMD_EXCEPT + uint64x2_t ix = vreinterpretq_u64_f64 (x); /* If fp exceptions are to be triggered correctly, fall back to scalar for |x| < 2^-51, |x| > oflow_bound, Inf & NaN. Add ix to itself for shift-left by 1, and compare with thresh which was left-shifted offline - this is effectively an absolute compare. */ uint64x2_t special = vcgeq_u64 (vsubq_u64 (vaddq_u64 (ix, ix), d->tiny_bound), d->thresh); - if (__glibc_unlikely (v_any_u64 (special))) - x = v_zerofy_f64 (x, special); #else /* Large input, NaNs and Infs. */ uint64x2_t special = vcageq_f64 (x, d->oflow_bound); #endif - /* Reduce argument to smaller range: - Let i = round(x / ln2) - and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 - where 2^i is exact because i is an integer. */ - float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift); - int64x2_t i = vcvtq_s64_f64 (n); - float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); - float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0); - f = vfmsq_laneq_f64 (f, n, ln2, 1); - - /* Approximate expm1(f) using polynomial. - Taylor expansion for expm1(x) has the form: - x + ax^2 + bx^3 + cx^4 .... - So we calculate the polynomial P(f) = a + bf + cf^2 + ... - and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ - float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t f4 = vmulq_f64 (f2, f2); - float64x2_t f8 = vmulq_f64 (f4, f4); - float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly)); - - /* Assemble the result. - expm1(x) ~= 2^i * (p + 1) - 1 - Let t = 2^i. */ - int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias); - float64x2_t t = vreinterpretq_f64_s64 (u); - if (__glibc_unlikely (v_any_u64 (special))) - return special_case (vreinterpretq_f64_u64 (ix), - vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t), - special); + return special_case (x, special, d); /* expm1(x) ~= p * t + (t - 1). */ - return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); + return expm1_inline (x, &d->d); } diff --git a/sysdeps/aarch64/fpu/sinh_advsimd.c b/sysdeps/aarch64/fpu/sinh_advsimd.c index 3e3b76c..7adf771 100644 --- a/sysdeps/aarch64/fpu/sinh_advsimd.c +++ b/sysdeps/aarch64/fpu/sinh_advsimd.c @@ -18,72 +18,31 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" +#include "v_expm1_inline.h" static const struct data { - float64x2_t poly[11], inv_ln2; - double m_ln2[2]; - float64x2_t shift; + struct v_expm1_data d; uint64x2_t halff; - int64x2_t onef; #if WANT_SIMD_EXCEPT uint64x2_t tiny_bound, thresh; #else - uint64x2_t large_bound; + float64x2_t large_bound; #endif } data = { - /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ - .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), - V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), - V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), - V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), - V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), }, - - .inv_ln2 = V2 (0x1.71547652b82fep0), - .m_ln2 = {-0x1.62e42fefa39efp-1, -0x1.abc9e3b39803fp-56}, - .shift = V2 (0x1.8p52), - + .d = V_EXPM1_DATA, .halff = V2 (0x3fe0000000000000), - .onef = V2 (0x3ff0000000000000), #if WANT_SIMD_EXCEPT /* 2^-26, below which sinh(x) rounds to x. */ .tiny_bound = V2 (0x3e50000000000000), /* asuint(large_bound) - asuint(tiny_bound). */ .thresh = V2 (0x0230000000000000), #else -/* 2^9. expm1 helper overflows for large input. */ - .large_bound = V2 (0x4080000000000000), + /* 2^9. expm1 helper overflows for large input. */ + .large_bound = V2 (0x1p+9), #endif }; -static inline float64x2_t -expm1_inline (float64x2_t x) -{ - const struct data *d = ptr_barrier (&data); - - /* Reduce argument: - exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 - where i = round(x / ln2) - and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */ - float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift); - int64x2_t i = vcvtq_s64_f64 (j); - - float64x2_t m_ln2 = vld1q_f64 (d->m_ln2); - float64x2_t f = vfmaq_laneq_f64 (x, j, m_ln2, 0); - f = vfmaq_laneq_f64 (f, j, m_ln2, 1); - /* Approximate expm1(f) using polynomial. */ - float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t f4 = vmulq_f64 (f2, f2); - float64x2_t f8 = vmulq_f64 (f4, f4); - float64x2_t p = vfmaq_f64 (f, f2, v_estrin_10_f64 (f, f2, f4, f8, d->poly)); - /* t = 2^i. */ - float64x2_t t = vreinterpretq_f64_u64 ( - vreinterpretq_u64_s64 (vaddq_s64 (vshlq_n_s64 (i, 52), d->onef))); - /* expm1(x) ~= p * t + (t - 1). */ - return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); -} - static float64x2_t NOINLINE VPCS_ATTR special_case (float64x2_t x) { @@ -92,23 +51,23 @@ special_case (float64x2_t x) /* Approximation for vector double-precision sinh(x) using expm1. sinh(x) = (exp(x) - exp(-x)) / 2. - The greatest observed error is 2.57 ULP: - _ZGVnN2v_sinh (0x1.9fb1d49d1d58bp-2) got 0x1.ab34e59d678dcp-2 - want 0x1.ab34e59d678d9p-2. */ + The greatest observed error is 2.52 ULP: + _ZGVnN2v_sinh(-0x1.a098a2177a2b9p-2) got -0x1.ac2f05bb66fccp-2 + want -0x1.ac2f05bb66fc9p-2. */ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x) { const struct data *d = ptr_barrier (&data); float64x2_t ax = vabsq_f64 (x); - uint64x2_t sign - = veorq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (ax)); - float64x2_t halfsign = vreinterpretq_f64_u64 (vorrq_u64 (sign, d->halff)); + uint64x2_t ix = vreinterpretq_u64_f64 (x); + float64x2_t halfsign = vreinterpretq_f64_u64 ( + vbslq_u64 (v_u64 (0x8000000000000000), ix, d->halff)); #if WANT_SIMD_EXCEPT uint64x2_t special = vcgeq_u64 ( vsubq_u64 (vreinterpretq_u64_f64 (ax), d->tiny_bound), d->thresh); #else - uint64x2_t special = vcgeq_u64 (vreinterpretq_u64_f64 (ax), d->large_bound); + uint64x2_t special = vcageq_f64 (x, d->large_bound); #endif /* Fall back to scalar variant for all lanes if any of them are special. */ @@ -118,7 +77,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sinh) (float64x2_t x) /* Up to the point that expm1 overflows, we can use it to calculate sinh using a slight rearrangement of the definition of sinh. This allows us to retain acceptable accuracy for very small inputs. */ - float64x2_t t = expm1_inline (ax); + float64x2_t t = expm1_inline (ax, &d->d); t = vaddq_f64 (t, vdivq_f64 (t, vaddq_f64 (t, v_f64 (1.0)))); return vmulq_f64 (t, halfsign); } diff --git a/sysdeps/aarch64/fpu/tanh_advsimd.c b/sysdeps/aarch64/fpu/tanh_advsimd.c index 1da1dfa..402ba9d 100644 --- a/sysdeps/aarch64/fpu/tanh_advsimd.c +++ b/sysdeps/aarch64/fpu/tanh_advsimd.c @@ -18,68 +18,30 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f64.h" +#include "v_expm1_inline.h" static const struct data { - float64x2_t poly[11]; - float64x2_t inv_ln2, ln2_hi, ln2_lo, shift; - uint64x2_t onef; + struct v_expm1_data d; uint64x2_t thresh, tiny_bound; } data = { - /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */ - .poly = { V2 (0x1p-1), V2 (0x1.5555555555559p-3), V2 (0x1.555555555554bp-5), - V2 (0x1.111111110f663p-7), V2 (0x1.6c16c16c1b5f3p-10), - V2 (0x1.a01a01affa35dp-13), V2 (0x1.a01a018b4ecbbp-16), - V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22), - V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29), }, - - .inv_ln2 = V2 (0x1.71547652b82fep0), - .ln2_hi = V2 (-0x1.62e42fefa39efp-1), - .ln2_lo = V2 (-0x1.abc9e3b39803fp-56), - .shift = V2 (0x1.8p52), - - .onef = V2 (0x3ff0000000000000), + .d = V_EXPM1_DATA, .tiny_bound = V2 (0x3e40000000000000), /* asuint64 (0x1p-27). */ /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */ .thresh = V2 (0x01f241bf835f9d5f), }; -static inline float64x2_t -expm1_inline (float64x2_t x, const struct data *d) -{ - /* Helper routine for calculating exp(x) - 1. Vector port of the helper from - the scalar variant of tanh. */ - - /* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */ - float64x2_t j = vsubq_f64 (vfmaq_f64 (d->shift, d->inv_ln2, x), d->shift); - int64x2_t i = vcvtq_s64_f64 (j); - float64x2_t f = vfmaq_f64 (x, j, d->ln2_hi); - f = vfmaq_f64 (f, j, d->ln2_lo); - - /* Approximate expm1(f) using polynomial. */ - float64x2_t f2 = vmulq_f64 (f, f); - float64x2_t f4 = vmulq_f64 (f2, f2); - float64x2_t p = vfmaq_f64 ( - f, f2, v_estrin_10_f64 (f, f2, f4, vmulq_f64 (f4, f4), d->poly)); - - /* t = 2 ^ i. */ - float64x2_t t = vreinterpretq_f64_u64 ( - vaddq_u64 (vreinterpretq_u64_s64 (i << 52), d->onef)); - /* expm1(x) = p * t + (t - 1). */ - return vfmaq_f64 (vsubq_f64 (t, v_f64 (1)), p, t); -} - static float64x2_t NOINLINE VPCS_ATTR -special_case (float64x2_t x, float64x2_t y, uint64x2_t special) +special_case (float64x2_t x, float64x2_t q, float64x2_t qp2, + uint64x2_t special) { - return v_call_f64 (tanh, x, y, special); + return v_call_f64 (tanh, x, vdivq_f64 (q, qp2), special); } /* Vector approximation for double-precision tanh(x), using a simplified - version of expm1. The greatest observed error is 2.77 ULP: - _ZGVnN2v_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3 - want -0x1.bd6a21a163624p-3. */ + version of expm1. The greatest observed error is 2.70 ULP: + _ZGVnN2v_tanh(-0x1.c59aa220cb177p-3) got -0x1.be5452a6459fep-3 + want -0x1.be5452a6459fbp-3. */ float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x) { const struct data *d = ptr_barrier (&data); @@ -100,10 +62,10 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tanh) (float64x2_t x) u = vaddq_f64 (u, u); /* tanh(x) = (e^2x - 1) / (e^2x + 1). */ - float64x2_t q = expm1_inline (u, d); - float64x2_t qp2 = vaddq_f64 (q, v_f64 (2)); + float64x2_t q = expm1_inline (u, &d->d); + float64x2_t qp2 = vaddq_f64 (q, v_f64 (2.0)); if (__glibc_unlikely (v_any_u64 (special))) - return special_case (x, vdivq_f64 (q, qp2), special); + return special_case (x, q, qp2, special); return vdivq_f64 (q, qp2); } diff --git a/sysdeps/aarch64/fpu/v_expm1_inline.h b/sysdeps/aarch64/fpu/v_expm1_inline.h new file mode 100644 index 0000000..a925183 --- /dev/null +++ b/sysdeps/aarch64/fpu/v_expm1_inline.h @@ -0,0 +1,97 @@ +/* Double-precision inline helper for vector (Advanced SIMD) expm1 function + + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef AARCH64_FPU_V_EXPM1_INLINE_H +#define AARCH64_FPU_V_EXPM1_INLINE_H + +#include "v_math.h" + +struct v_expm1_data +{ + float64x2_t c2, c4, c6, c8; + float64x2_t invln2; + int64x2_t exponent_bias; + double c1, c3, c5, c7, c9, c10; + double ln2[2]; +}; + +/* Generated using fpminimax, with degree=12 in [log(2)/2, log(2)/2]. */ +#define V_EXPM1_DATA \ + { \ + .c1 = 0x1.5555555555559p-3, .c2 = V2 (0x1.555555555554bp-5), \ + .c3 = 0x1.111111110f663p-7, .c4 = V2 (0x1.6c16c16c1b5f3p-10), \ + .c5 = 0x1.a01a01affa35dp-13, .c6 = V2 (0x1.a01a018b4ecbbp-16), \ + .c7 = 0x1.71ddf82db5bb4p-19, .c8 = V2 (0x1.27e517fc0d54bp-22), \ + .c9 = 0x1.af5eedae67435p-26, .c10 = 0x1.1f143d060a28ap-29, \ + .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 }, \ + .invln2 = V2 (0x1.71547652b82fep0), \ + .exponent_bias = V2 (0x3ff0000000000000), \ + } + +static inline float64x2_t +expm1_inline (float64x2_t x, const struct v_expm1_data *d) +{ + /* Helper routine for calculating exp(x) - 1. */ + + float64x2_t ln2 = vld1q_f64 (&d->ln2[0]); + + /* Reduce argument to smaller range: + Let i = round(x / ln2) + and f = x - i * ln2, then f is in [-ln2/2, ln2/2]. + exp(x) - 1 = 2^i * (expm1(f) + 1) - 1 + where 2^i is exact because i is an integer. */ + float64x2_t n = vrndaq_f64 (vmulq_f64 (x, d->invln2)); + int64x2_t i = vcvtq_s64_f64 (n); + float64x2_t f = vfmsq_laneq_f64 (x, n, ln2, 0); + f = vfmsq_laneq_f64 (f, n, ln2, 1); + + /* Approximate expm1(f) using polynomial. + Taylor expansion for expm1(x) has the form: + x + ax^2 + bx^3 + cx^4 .... + So we calculate the polynomial P(f) = a + bf + cf^2 + ... + and assemble the approximation expm1(f) ~= f + f^2 * P(f). */ + float64x2_t f2 = vmulq_f64 (f, f); + float64x2_t f4 = vmulq_f64 (f2, f2); + float64x2_t lane_consts_13 = vld1q_f64 (&d->c1); + float64x2_t lane_consts_57 = vld1q_f64 (&d->c5); + float64x2_t lane_consts_910 = vld1q_f64 (&d->c9); + float64x2_t p01 = vfmaq_laneq_f64 (v_f64 (0.5), f, lane_consts_13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, lane_consts_13, 1); + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, lane_consts_57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, lane_consts_57, 1); + float64x2_t p03 = vfmaq_f64 (p01, f2, p23); + float64x2_t p47 = vfmaq_f64 (p45, f2, p67); + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, lane_consts_910, 0); + float64x2_t p = vfmaq_laneq_f64 (p89, f2, lane_consts_910, 1); + p = vfmaq_f64 (p47, f4, p); + p = vfmaq_f64 (p03, f4, p); + + p = vfmaq_f64 (f, f2, p); + + /* Assemble the result. + expm1(x) ~= 2^i * (p + 1) - 1 + Let t = 2^i. */ + int64x2_t u = vaddq_s64 (vshlq_n_s64 (i, 52), d->exponent_bias); + float64x2_t t = vreinterpretq_f64_s64 (u); + + /* expm1(x) ~= p * t + (t - 1). */ + return vfmaq_f64 (vsubq_f64 (t, v_f64 (1.0)), p, t); +} + +#endif diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h index 1daedfd..c1fb88b 100644 --- a/sysdeps/aarch64/fpu/v_expm1f_inline.h +++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h @@ -21,7 +21,6 @@ #define AARCH64_FPU_V_EXPM1F_INLINE_H #include "v_math.h" -#include "math_config.h" struct v_expm1f_data { |