aboutsummaryrefslogtreecommitdiff
path: root/sysdeps
diff options
context:
space:
mode:
authorJoe Ramsay <Joe.Ramsay@arm.com>2023-12-01 09:49:45 +0000
committerSzabolcs Nagy <szabolcs.nagy@arm.com>2023-12-04 15:52:11 +0000
commit63d0a35d5f223a3f4b68190567b7d4d44545bce5 (patch)
tree69b006288d047188fcd0b81906647f9168a9eb65 /sysdeps
parent8e755f5bc8f57d0034c12f34c11ce8a83103dc65 (diff)
downloadglibc-63d0a35d5f223a3f4b68190567b7d4d44545bce5.zip
glibc-63d0a35d5f223a3f4b68190567b7d4d44545bce5.tar.gz
glibc-63d0a35d5f223a3f4b68190567b7d4d44545bce5.tar.bz2
math: Add new exp10 implementation
New implementation is based on the existing exp/exp2, with different reduction constants and polynomial. Worst-case error in round-to- nearest is 0.513 ULP. The exp/exp2 shared table is reused for exp10 - .rodata size of e_exp_data increases by 64 bytes. As for exp/exp2, targets with single-instruction rounding/conversion intrinsics can use them by toggling TOINT_INTRINSICS=1 and adding the necessary code to their math_private.h. Improvements on Neoverse V1 compared to current GLIBC master: exp10 thruput: 3.3x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8] exp10 latency: 1.8x in [-0x1.439b746e36b52p+8 0x1.34413509f79ffp+8] Tested on: aarch64-linux-gnu (TOINT_INTRINSICS, fma contraction) and x86_64-linux-gnu (!TOINT_INTRINSICS, no fma contraction) Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
Diffstat (limited to 'sysdeps')
-rw-r--r--sysdeps/ieee754/dbl-64/e_exp10.c144
-rw-r--r--sysdeps/ieee754/dbl-64/e_exp_data.c11
-rw-r--r--sysdeps/ieee754/dbl-64/math_config.h4
3 files changed, 135 insertions, 24 deletions
diff --git a/sysdeps/ieee754/dbl-64/e_exp10.c b/sysdeps/ieee754/dbl-64/e_exp10.c
index fa47f4f..0806914 100644
--- a/sysdeps/ieee754/dbl-64/e_exp10.c
+++ b/sysdeps/ieee754/dbl-64/e_exp10.c
@@ -16,36 +16,132 @@
<https://www.gnu.org/licenses/>. */
#include <math.h>
+#include <math-barriers.h>
+#include <math-narrow-eval.h>
#include <math_private.h>
#include <float.h>
#include <libm-alias-finite.h>
+#include "math_config.h"
-static const double log10_high = 0x2.4d7637p0;
-static const double log10_low = 0x7.6aaa2b05ba95cp-28;
+#define N (1 << EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+#define OFlowBound 0x1.34413509f79ffp8 /* log10(DBL_MAX). */
+#define UFlowBound -0x1.5ep+8 /* -350. */
+#define SmallTop 0x3c6 /* top12(0x1p-57). */
+#define BigTop 0x407 /* top12(0x1p8). */
+#define Thresh 0x41 /* BigTop - SmallTop. */
+#define Shift __exp_data.shift
+#define C(i) __exp_data.exp10_poly[i]
+static double
+special_case (uint64_t sbits, double_t tmp, uint64_t ki)
+{
+ double_t scale, y;
+
+ if (ki - (1ull << 16) < 0x80000000)
+ {
+ /* The exponent of scale might have overflowed by 1. */
+ sbits -= 1ull << 52;
+ scale = asdouble (sbits);
+ y = 2 * (scale + scale * tmp);
+ return check_oflow (y);
+ }
+
+ /* n < 0, need special care in the subnormal range. */
+ sbits += 1022ull << 52;
+ scale = asdouble (sbits);
+ y = scale + scale * tmp;
+
+ if (y < 1.0)
+ {
+ /* Round y to the right precision before scaling it into the subnormal
+ range to avoid double rounding that can cause 0.5+E/2 ulp error where
+ E is the worst-case ulp error outside the subnormal range. So this
+ is only useful if the goal is better than 1 ulp worst-case error. */
+ double_t lo = scale - y + scale * tmp;
+ double_t hi = 1.0 + y;
+ lo = 1.0 - hi + y + lo;
+ y = math_narrow_eval (hi + lo) - 1.0;
+ /* Avoid -0.0 with downward rounding. */
+ if (WANT_ROUNDING && y == 0.0)
+ y = 0.0;
+ /* The underflow exception needs to be signaled explicitly. */
+ math_force_eval (math_opt_barrier (0x1p-1022) * 0x1p-1022);
+ }
+ y = 0x1p-1022 * y;
+
+ return check_uflow (y);
+}
+
+/* Double-precision 10^x approximation. Largest observed error is ~0.513 ULP. */
double
-__ieee754_exp10 (double arg)
+__ieee754_exp10 (double x)
{
- int32_t lx;
- double arg_high, arg_low;
- double exp_high, exp_low;
-
- if (!isfinite (arg))
- return __ieee754_exp (arg);
- if (arg < DBL_MIN_10_EXP - DBL_DIG - 10)
- return DBL_MIN * DBL_MIN;
- else if (arg > DBL_MAX_10_EXP + 1)
- return DBL_MAX * DBL_MAX;
- else if (fabs (arg) < 0x1p-56)
- return 1.0;
-
- GET_LOW_WORD (lx, arg);
- lx &= 0xf8000000;
- arg_high = arg;
- SET_LOW_WORD (arg_high, lx);
- arg_low = arg - arg_high;
- exp_high = arg_high * log10_high;
- exp_low = arg_high * log10_low + arg_low * M_LN10;
- return __ieee754_exp (exp_high) * __ieee754_exp (exp_low);
+ uint64_t ix = asuint64 (x);
+ uint32_t abstop = (ix >> 52) & 0x7ff;
+
+ if (__glibc_unlikely (abstop - SmallTop >= Thresh))
+ {
+ if (abstop - SmallTop >= 0x80000000)
+ /* Avoid spurious underflow for tiny x.
+ Note: 0 is common input. */
+ return x + 1;
+ if (abstop == 0x7ff)
+ return ix == asuint64 (-INFINITY) ? 0.0 : x + 1.0;
+ if (x >= OFlowBound)
+ return __math_oflow (0);
+ if (x < UFlowBound)
+ return __math_uflow (0);
+
+ /* Large x is special-cased below. */
+ abstop = 0;
+ }
+
+ /* Reduce x: z = x * N / log10(2), k = round(z). */
+ double_t z = __exp_data.invlog10_2N * x;
+ double_t kd;
+ int64_t ki;
+#if TOINT_INTRINSICS
+ kd = roundtoint (z);
+ ki = converttoint (z);
+#else
+ kd = math_narrow_eval (z + Shift);
+ kd -= Shift;
+ ki = kd;
+#endif
+
+ /* r = x - k * log10(2), r in [-0.5, 0.5]. */
+ double_t r = x;
+ r = __exp_data.neglog10_2hiN * kd + r;
+ r = __exp_data.neglog10_2loN * kd + r;
+
+ /* exp10(x) = 2^(k/N) * 2^(r/N).
+ Approximate the two components separately. */
+
+ /* s = 2^(k/N), using lookup table. */
+ uint64_t e = ki << (52 - EXP_TABLE_BITS);
+ uint64_t i = (ki & IndexMask) * 2;
+ uint64_t u = __exp_data.tab[i + 1];
+ uint64_t sbits = u + e;
+
+ double_t tail = asdouble (__exp_data.tab[i]);
+
+ /* 2^(r/N) ~= 1 + r * Poly(r). */
+ double_t r2 = r * r;
+ double_t p = C (0) + r * C (1);
+ double_t y = C (2) + r * C (3);
+ y = y + r2 * C (4);
+ y = p + r2 * y;
+ y = tail + y * r;
+
+ if (__glibc_unlikely (abstop == 0))
+ return special_case (sbits, y, ki);
+
+ /* Assemble components:
+ y = 2^(r/N) * 2^(k/N)
+ ~= (y + 1) * s. */
+ double_t s = asdouble (sbits);
+ return s * y + s;
}
+
libm_alias_finite (__ieee754_exp10, __exp10)
diff --git a/sysdeps/ieee754/dbl-64/e_exp_data.c b/sysdeps/ieee754/dbl-64/e_exp_data.c
index d498b8b..5c774af 100644
--- a/sysdeps/ieee754/dbl-64/e_exp_data.c
+++ b/sysdeps/ieee754/dbl-64/e_exp_data.c
@@ -58,6 +58,17 @@ const struct exp_data __exp_data = {
0x1.5d7e09b4e3a84p-10,
#endif
},
+.invlog10_2N = 0x1.a934f0979a371p1 * N,
+.neglog10_2hiN = -0x1.3441350ap-2 / N,
+.neglog10_2loN = 0x1.0c0219dc1da99p-39 / N,
+.exp10_poly = {
+/* Coeffs generated using Remez in [-log10(2)/256, log10(2)/256 ]. */
+0x1.26bb1bbb55516p1,
+0x1.53524c73ce9fep1,
+0x1.0470591ce4b26p1,
+0x1.2bd76577fe684p0,
+0x1.1446eeccd0efbp-1
+},
// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N)
// tab[2*k] = asuint64(T[k])
// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
index 19af33f..d617add 100644
--- a/sysdeps/ieee754/dbl-64/math_config.h
+++ b/sysdeps/ieee754/dbl-64/math_config.h
@@ -201,6 +201,10 @@ extern const struct exp_data
double poly[4]; /* Last four coefficients. */
double exp2_shift;
double exp2_poly[EXP2_POLY_ORDER];
+ double invlog10_2N;
+ double neglog10_2hiN;
+ double neglog10_2loN;
+ double exp10_poly[5];
uint64_t tab[2*(1 << EXP_TABLE_BITS)];
} __exp_data attribute_hidden;