aboutsummaryrefslogtreecommitdiff
path: root/sysdeps
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps')
-rw-r--r--sysdeps/aarch64/fpu/acosh_advsimd.c13
-rw-r--r--sysdeps/aarch64/fpu/acosh_sve.c32
-rw-r--r--sysdeps/aarch64/fpu/acoshf_advsimd.c73
-rw-r--r--sysdeps/aarch64/fpu/asinh_advsimd.c68
-rw-r--r--sysdeps/aarch64/fpu/asinh_sve.c48
-rw-r--r--sysdeps/aarch64/fpu/asinhf_advsimd.c63
-rw-r--r--sysdeps/aarch64/fpu/atanh_advsimd.c27
-rw-r--r--sysdeps/aarch64/fpu/atanh_sve.c24
8 files changed, 224 insertions, 124 deletions
diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c
index ac0db3e..b5efe92 100644
--- a/sysdeps/aarch64/fpu/acosh_advsimd.c
+++ b/sysdeps/aarch64/fpu/acosh_advsimd.c
@@ -34,7 +34,18 @@ static float64x2_t NOINLINE VPCS_ATTR
special_case (float64x2_t x, float64x2_t y, uint64x2_t special,
const struct v_log1p_data *d)
{
- return v_call_f64 (acosh, x, log1p_inline (y, d), special);
+  /* Acosh is defined on [1, inf). Its formula can be re-written knowing that 1
+ becomes negligible when x is a very large number. So for special numbers,
+ where x >= 2^511, acosh ~= ln(2x). But, ln(2x) = ln(2) + ln(x) and below we
+ calculate ln(x) and then add ln(2) to the result.
+
+     Right before returning we check if x is infinity or if x is less than 1,
+ in which case we return infinity or NaN. */
+ float64x2_t one = v_f64 (1.0);
+ float64x2_t t = log1p_inline (vbslq_f64 (special, x, y), d);
+ t = vbslq_f64 (special, vaddq_f64 (t, v_f64 (d->ln2[0])), t);
+ t = vbslq_f64 (vceqq_f64 (x, v_f64 (INFINITY)), v_f64 (INFINITY), t);
+ return vbslq_f64 (vcltq_f64 (x, one), v_f64 (NAN), t);
}
/* Vector approximation for double-precision acosh, based on log1p.
diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c
index 1f6b8fc..6d996e3 100644
--- a/sysdeps/aarch64/fpu/acosh_sve.c
+++ b/sysdeps/aarch64/fpu/acosh_sve.c
@@ -20,13 +20,33 @@
#define WANT_SV_LOG1P_K0_SHORTCUT 1
#include "sv_log1p_inline.h"
-#define One (0x3ff0000000000000)
-#define Thres (0x1ff0000000000000) /* asuint64 (0x1p511) - One. */
+#define One (0x3ff0000000000000ULL)
+#define Thres (0x1ff0000000000000ULL)
+const static struct data
+{
+ double ln2;
+} data = { .ln2 = 0x1.62e42fefa39efp-1 };
+
+/* Acosh is defined on [1, inf). Its formula can be re-written knowing that 1
+ becomes negligible when x is a very large number. So for special numbers,
+ where x >= 2^511, acosh ~= ln(2x). But, ln(2x) = ln(2) + ln(x) and below we
+ calculate ln(x) and then add ln(2) to the result.
+
+   Right before returning we check if x is infinity or if x is less than 1,
+ in which case we return infinity or NaN. */
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+special_case (svfloat64_t x, svfloat64_t xm1, svfloat64_t y, svbool_t special,
+ svbool_t pg, const struct data *d)
{
- return sv_call_f64 (acosh, x, y, special);
+ svfloat64_t inf = sv_f64 (INFINITY);
+ svfloat64_t nan = sv_f64 (NAN);
+ svfloat64_t log = sv_log1p_inline (svsel (special, xm1, y), pg);
+ svfloat64_t result = svadd_m (special, log, sv_f64 (d->ln2));
+ svbool_t is_inf_nan = svorr_b_z (special, svcmpge (special, x, inf),
+ svcmplt (special, x, sv_f64 (1.0)));
+ svfloat64_t inf_or_nan = svsel (svcmpge (special, x, inf), inf, nan);
+ return svsel (is_inf_nan, inf_or_nan, result);
}
/* SVE approximation for double-precision acosh, based on log1p.
@@ -36,6 +56,8 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
want 0x1.ef0cee7c33ce4p-2. */
svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
{
+ const struct data *d = ptr_barrier (&data);
+
/* (ix - One) >= (BigBound - One). */
svuint64_t ix = svreinterpret_u64 (x);
svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres);
@@ -46,6 +68,6 @@ svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
/* Fall back to scalar routine for special lanes. */
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, sv_log1p_inline (y, pg), special);
+ return special_case (x, xm1, y, special, pg, d);
return sv_log1p_inline (y, pg);
}
diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c
index 6071190..275de77 100644
--- a/sysdeps/aarch64/fpu/acoshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c
@@ -24,15 +24,13 @@ const static struct data
struct v_log1pf_data log1pf_consts;
uint32x4_t one;
uint16x4_t special_bound_u16;
- uint32x4_t special_bound_u32;
- float32x4_t pinf, nan;
+ float32x4_t inf, nan;
} data = {
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.one = V4 (0x3f800000),
- .special_bound_u16 = V4 (0x2000),
/* asuint(sqrt(FLT_MAX)) - asuint(1). */
- .special_bound_u32 = V4 (0x20000000),
- .pinf = V4 (INFINITY),
+ .special_bound_u16 = V4 (0x2000),
+ .inf = V4 (INFINITY),
.nan = V4 (NAN),
};
@@ -51,34 +49,42 @@ inline_acoshf (float32x4_t x, const struct data *d)
}
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, const struct data *d)
+special_case (float32x4_t x, uint16x4_t special_u16, const struct data *d)
{
- uint32x4_t special = vcgeq_u32 (
- vsubq_u32 (vreinterpretq_u32_f32 (x), d->one), d->special_bound_u32);
-
- /* To avoid the overflow in x^2 (so the x < sqrt(FLT_MAX) constraint), we
- reduce the input of acosh to a narrower interval by relying on the identity
- acosh(t) = 1/2acosh(2t^2 - 1) for t>=1.
- If we set t=sqrt((x+1)/2), since x>=1 then t>=sqrt(2/2)=1, and therefore
- acosh(x) = 2acosh(sqrt((x+1)/2)). */
- float32x4_t r = vaddq_f32 (x, vreinterpretq_f32_u32 (d->one));
- r = vmulq_f32 (r, v_f32 (0.5f));
- r = vbslq_f32 (special, vsqrtq_f32 (r), x);
-
- float32x4_t y = inline_acoshf (r, d);
-
- y = vbslq_f32 (special, vmulq_f32 (y, v_f32 (2.0f)), y);
-
- /* Check whether x is less than 1, or x is inf or nan. */
- uint32x4_t inf_minus_one
- = vsubq_u32 (vreinterpretq_u32_f32 (d->pinf), d->one);
- uint32x4_t is_infnan = vcgeq_u32 (
- vsubq_u32 (vreinterpretq_u32_f32 (x), d->one), inf_minus_one);
-
- y = vbslq_f32 (is_infnan, d->nan, y);
- uint32x4_t ret_pinf = vceqq_f32 (x, d->pinf);
- y = vbslq_f32 (ret_pinf, d->pinf, y);
- return y;
+ float32x4_t one = v_f32 (1.0);
+ float32x4_t ln2 = d->log1pf_consts.ln2;
+
+ /* acosh(x) = ln(x + sqrt[x^2 -1]).
+ So acosh(x) = log1p (x + sqrt[x^2 - 1] - 1). */
+ float32x4_t xm1 = vsubq_f32 (x, one);
+ float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, one));
+
+ float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
+
+  /* special_u16 will be 0x0000ffff for true lanes, which doesn't work with bit
+     selects; therefore, to make it a 32-bit predicate, we add special
+     << 16. */
+ uint32x4_t special = vmovl_u16 (special_u16);
+ special = vaddq_u32 (special, vshlq_n_u32 (special, 16));
+ float32x4_t xy = vbslq_f32 (special, x, y);
+
+ /* For large inputs, acosh(x) ≈ log(x) + ln(2).
+ We use log1pf-inline log implementation and add ln(2). */
+ float32x4_t log_xy = log1pf_inline (xy, &d->log1pf_consts);
+
+  /* For acoshf there are three special cases that need considering: infinity
+     and NaN, which are returned unchanged, and x < 1, for which we return
+     NaN since acoshf is defined on [1, inf). */
+ uint32x4_t is_lower_one = vcltq_f32 (x, one);
+ uint32x4_t is_finite = vcltq_f32 (x, d->inf);
+
+  /* This dependency chain of selects can run in parallel with y and log_xy
+ being calculated before the last addition. */
+ float32x4_t ln2_inf_nan = vbslq_f32 (is_finite, ln2, x);
+ ln2_inf_nan = vbslq_f32 (is_lower_one, d->nan, ln2_inf_nan);
+ float32x4_t ln2_inf_nan_zero = vbslq_f32 (special, ln2_inf_nan, v_f32 (0));
+
+ return vaddq_f32 (log_xy, ln2_inf_nan_zero);
}
/* Vector approximation for single-precision acosh, based on log1p.
@@ -97,8 +103,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
= vcge_u16 (vsubhn_u32 (ix, d->one), d->special_bound_u16);
if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (x, d);
+ return special_case (x, special, d);
return inline_acoshf (x, d);
}
+
libmvec_hidden_def (V_NAME_F1 (acosh))
HALF_WIDTH_ALIAS_F1 (acosh)
diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c
index 5168917..650ed46 100644
--- a/sysdeps/aarch64/fpu/asinh_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinh_advsimd.c
@@ -27,36 +27,43 @@ const static struct data
double lc1, lc3, ln2, lc4;
float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17;
double c1, c3, c5, c7, c9, c11, c13, c15;
+ float64x2_t inf;
} data = {
/* Even terms of polynomial s.t. asinh(x) is approximated by
asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...).
Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */
- .c0 = V2 (-0x1.55555555554a7p-3), .c1 = 0x1.3333333326c7p-4,
- .c2 = V2 (-0x1.6db6db68332e6p-5), .c3 = 0x1.f1c71b26fb40dp-6,
- .c4 = V2 (-0x1.6e8b8b654a621p-6), .c5 = 0x1.1c4daa9e67871p-6,
- .c6 = V2 (-0x1.c9871d10885afp-7), .c7 = 0x1.7a16e8d9d2ecfp-7,
- .c8 = V2 (-0x1.3ddca533e9f54p-7), .c9 = 0x1.0becef748dafcp-7,
- .c10 = V2 (-0x1.b90c7099dd397p-8), .c11 = 0x1.541f2bb1ffe51p-8,
- .c12 = V2 (-0x1.d217026a669ecp-9), .c13 = 0x1.0b5c7977aaf7p-9,
- .c14 = V2 (-0x1.e0f37daef9127p-11), .c15 = 0x1.388b5fe542a6p-12,
- .c16 = V2 (-0x1.021a48685e287p-14), .c17 = V2 (0x1.93d4ba83d34dap-18),
- .lc0 = V2 (-0x1.ffffffffffff7p-2), .lc1 = 0x1.55555555170d4p-2,
- .lc2 = V2 (-0x1.0000000399c27p-2), .lc3 = 0x1.999b2e90e94cap-3,
- .lc4 = -0x1.554e550bd501ep-3, .ln2 = 0x1.62e42fefa39efp-1,
- .off = V2 (0x3fe6900900000000), .huge_bound = V2 (0x5fe0000000000000),
- .abs_mask = V2 (0x7fffffffffffffff), .mask = V2 (0xfffULL << 52),
+ .c0 = V2 (-0x1.55555555554a7p-3),
+ .c1 = 0x1.3333333326c7p-4,
+ .c2 = V2 (-0x1.6db6db68332e6p-5),
+ .c3 = 0x1.f1c71b26fb40dp-6,
+ .c4 = V2 (-0x1.6e8b8b654a621p-6),
+ .c5 = 0x1.1c4daa9e67871p-6,
+ .c6 = V2 (-0x1.c9871d10885afp-7),
+ .c7 = 0x1.7a16e8d9d2ecfp-7,
+ .c8 = V2 (-0x1.3ddca533e9f54p-7),
+ .c9 = 0x1.0becef748dafcp-7,
+ .c10 = V2 (-0x1.b90c7099dd397p-8),
+ .c11 = 0x1.541f2bb1ffe51p-8,
+ .c12 = V2 (-0x1.d217026a669ecp-9),
+ .c13 = 0x1.0b5c7977aaf7p-9,
+ .c14 = V2 (-0x1.e0f37daef9127p-11),
+ .c15 = 0x1.388b5fe542a6p-12,
+ .c16 = V2 (-0x1.021a48685e287p-14),
+ .c17 = V2 (0x1.93d4ba83d34dap-18),
+ .lc0 = V2 (-0x1.ffffffffffff7p-2),
+ .lc1 = 0x1.55555555170d4p-2,
+ .lc2 = V2 (-0x1.0000000399c27p-2),
+ .lc3 = 0x1.999b2e90e94cap-3,
+ .lc4 = -0x1.554e550bd501ep-3,
+ .ln2 = 0x1.62e42fefa39efp-1,
+ .off = V2 (0x3fe6900900000000),
+ .huge_bound = V2 (0x5fe0000000000000),
+ .abs_mask = V2 (0x7fffffffffffffff),
+ .mask = V2 (0xfffULL << 52),
+ .inf = V2 (INFINITY)
};
-static float64x2_t NOINLINE VPCS_ATTR
-special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask,
- uint64x2_t special)
-{
- /* Copy sign. */
- y = vbslq_f64 (abs_mask, y, x);
- return v_call_f64 (asinh, x, y, special);
-}
-
#define N (1 << V_LOG_TABLE_BITS)
#define IndexMask (N - 1)
@@ -110,6 +117,19 @@ log_inline (float64x2_t ax, const struct data *d)
return vfmaq_f64 (hi, y, r2);
}
+static float64x2_t NOINLINE VPCS_ATTR
+special_case (float64x2_t x, uint64x2_t special, float64x2_t y, float64x2_t ax,
+ const struct data *d)
+{
+ /* For very large inputs (x > 2^511), asinh(x) ≈ ln(2x).
+ In this range the +sqrt(x^2+1) term is negligible, so we compute
+ asinh(x) as ln(x) + ln(2). */
+ float64x2_t res_asinh = vaddq_f64 (log_inline (ax, d), v_f64 (d->ln2));
+ res_asinh = vbslq_f64 (vceqq_f64 (ax, d->inf), d->inf, res_asinh);
+ res_asinh = vbslq_f64 (special, res_asinh, y);
+ return vbslq_f64 (d->abs_mask, res_asinh, x);
+}
+
/* Double-precision implementation of vector asinh(x).
asinh is very sensitive around 1, so it is impractical to devise a single
low-cost algorithm which is sufficiently accurate on a wide range of input.
@@ -179,7 +199,7 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x)
/* Choose the right option for each lane. */
float64x2_t y = vbslq_f64 (gt1, option_1, option_2);
if (__glibc_unlikely (v_any_u64 (special)))
- return special_case (x, y, d->abs_mask, special);
+ return special_case (x, special, y, ax, d);
/* Copy sign. */
return vbslq_f64 (d->abs_mask, y, x);
diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c
index bf9dad9..cb32b60 100644
--- a/sysdeps/aarch64/fpu/asinh_sve.c
+++ b/sysdeps/aarch64/fpu/asinh_sve.c
@@ -29,7 +29,7 @@ static const struct data
double even_coeffs[9];
double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17;
uint64_t off, mask;
-
+ double inf;
} data = {
/* Polynomial generated using Remez on [2^-26, 1]. */
.even_coeffs ={
@@ -61,14 +61,9 @@ static const struct data
.p4 = -0x1.554e550bd501ep-3,
.off = 0x3fe6900900000000,
.mask = 0xfffULL << 52,
+ .inf = INFINITY
};
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
-{
- return sv_call_f64 (asinh, x, y, special);
-}
-
static inline svfloat64_t
__sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
{
@@ -104,12 +99,38 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
return y;
}
+static svfloat64_t NOINLINE
+special_case (svfloat64_t ax, svfloat64_t y, svuint64_t sign, svbool_t special,
+ svbool_t pg, const struct data *d)
+{
+ /* For very large inputs (x > 2^511), asinh(x) ≈ ln(2x).
+ In this range the +sqrt(x^2+1) term is negligible, so we compute
+ asinh(x) as ln(x) + ln(2) later in this function. */
+ svfloat64_t log_ax = __sv_log_inline (ax, d, special);
+
+ /* The only special cases that need considering are infinity and NaNs since
+ 0 will be handled by other calculations. */
+ svfloat64_t inf = sv_f64 (d->inf);
+ svbool_t is_inf = svcmpeq (special, ax, inf);
+ svbool_t is_nan = svcmpne (special, ax, ax);
+ svfloat64_t inf_ln2 = svsel (is_inf, inf, sv_f64 (d->ln2));
+ svfloat64_t inf_nan_ln2 = svsel (is_nan, sv_f64 (NAN), inf_ln2);
+ svfloat64_t asinh_x_res = svadd_x (special, log_ax, inf_nan_ln2);
+
+ /* Now select (based on special) between x and y to change the type and,
+ return either the positive or negative value, considering the input and
+ its sign. */
+ svfloat64_t result = svsel (special, asinh_x_res, y);
+ svuint64_t result_uint = svreinterpret_u64 (result);
+ return svreinterpret_f64 (sveor_m (pg, result_uint, sign));
+}
+
/* Double-precision implementation of SVE asinh(x).
asinh is very sensitive around 1, so it is impractical to devise a single
low-cost algorithm which is sufficiently accurate on a wide range of input.
Instead we use two different algorithms:
asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1
- = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
+ = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise
where log(x) is an optimized log approximation, and P(x) is a polynomial
shared with the scalar routine. The greatest observed error 2.51 ULP, in
|x| >= 1:
@@ -180,14 +201,11 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax));
}
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (
- x,
- svreinterpret_f64 (sveor_x (
- pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)),
- special);
-
/* Choose the right option for each lane. */
svfloat64_t y = svsel (ge1, option_1, option_2);
+
+ if (__glibc_unlikely (svptest_any (pg, special)))
+ return special_case (ax, y, sign, special, pg, d);
+
return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
}
diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c
index f1b1dd7..3454e46 100644
--- a/sysdeps/aarch64/fpu/asinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c
@@ -25,12 +25,12 @@ const static struct data
struct v_log1pf_data log1pf_consts;
float32x4_t one;
uint32x4_t square_lim;
- float32x4_t pinf, nan;
+ float32x4_t inf, nan;
} data = {
.one = V4 (1),
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.square_lim = V4 (0x5f800000), /* asuint(sqrt(FLT_MAX)). */
- .pinf = V4 (INFINITY),
+ .inf = V4 (INFINITY),
.nan = V4 (NAN),
};
@@ -51,31 +51,39 @@ static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t ax, uint32x4_t sign, uint32x4_t special,
const struct data *d)
{
- /* To avoid overflow in x^2 (so the x < sqrt(FLT_MAX) constraint), we
- reduce the input of asinh to a narrower interval by relying on the
- identity: 2asinh(t) = +-acosh(2t^2 + 1)
- If we set t=sqrt((x-1)/2), then
- 2asinh(sqrt((x-1)/2)) = acosh(x).
- Found that, for a high input x, asinh(x) very closely approximates
- acosh(x), so implemented it with this function instead. */
- float32x4_t r = vsubq_f32 (ax, d->one);
- r = vmulq_f32 (r, v_f32 (0.5f));
- r = vbslq_f32 (special, vsqrtq_f32 (r), ax);
-
- float32x4_t y = inline_asinhf (r, sign, d);
-
- y = vbslq_f32 (special, vmulq_f32 (y, v_f32 (2.0f)), y);
-
- /* Check whether x is inf or nan. */
- uint32x4_t ret_inf = vceqq_f32 (ax, d->pinf);
- uint32x4_t ret_nan = vmvnq_u32 (vcleq_f32 (ax, d->pinf));
- y = vbslq_f32 (ret_inf, d->pinf, y);
- y = vbslq_f32 (ret_nan, d->nan, y);
- /* Put sign back in for minf, as it doesn't happen in log1pf_inline call. */
- y = vbslq_f32 (
- ret_inf,
- vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign)), y);
- return y;
+ float32x4_t t
+ = vaddq_f32 (v_f32 (1.0f), vsqrtq_f32 (vfmaq_f32 (d->one, ax, ax)));
+ float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), t));
+
+ /* For large inputs (x > 2^64), asinh(x) ≈ ln(2x).
+ 1 becomes negligible in sqrt(x^2+1), so we compute
+ asinh(x) as ln(x) + ln(2). */
+ float32x4_t xy = vbslq_f32 (special, ax, y);
+ float32x4_t log_xy = log1pf_inline (xy, &d->log1pf_consts);
+
+ /* Infinity and NaNs are the only other special cases that need checking
+ before we return the values. 0 is handled by inline_asinhf as it returns
+     0. Below we implement the logic that returns infinity when infinity is
+ passed and NaNs when NaNs are passed. Sometimes due to intrinsics like
+ vdivq_f32, infinity can change into NaNs, so we want to make sure the
+ right result is returned.
+
+ Since these steps run in parallel with the log, we select between the
+     results we'll want to add to log_xy in time, since addition with infinity
+     and NaNs doesn't make a difference. When we get to the addition step,
+ everything is already in the right place. */
+ float32x4_t ln2 = d->log1pf_consts.ln2;
+ uint32x4_t is_finite = vcltq_f32 (ax, d->inf);
+ float32x4_t ln2_inf_nan = vbslq_f32 (is_finite, ln2, ax);
+
+  /* Before returning the result, the right sign will be assigned to the
+     absolute result. This is because we pass an absolute x to the function.
+ */
+
+ float32x4_t asinhf
+ = vbslq_f32 (special, vaddq_f32 (log_xy, ln2_inf_nan), log_xy);
+ return vreinterpretq_f32_u32 (
+ veorq_u32 (sign, vreinterpretq_u32_f32 (asinhf)));
}
/* Single-precision implementation of vector asinh(x), using vector log1p.
@@ -99,5 +107,6 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
return special_case (ax, sign, special, d);
return inline_asinhf (ax, sign, d);
}
+
libmvec_hidden_def (V_NAME_F1 (asinh))
HALF_WIDTH_ALIAS_F1 (asinh)
diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c
index 711a234..a3798ef 100644
--- a/sysdeps/aarch64/fpu/atanh_advsimd.c
+++ b/sysdeps/aarch64/fpu/atanh_advsimd.c
@@ -23,38 +23,43 @@
const static struct data
{
struct v_log1p_data log1p_consts;
- uint64x2_t one;
+ float64x2_t one;
uint64x2_t sign_mask;
} data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE,
- .one = V2 (0x3ff0000000000000),
+ .one = V2 (1.0),
.sign_mask = V2 (0x8000000000000000) };
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y,
+special_case (float64x2_t ax, float64x2_t halfsign, float64x2_t y,
uint64x2_t special, const struct data *d)
{
y = log1p_inline (y, &d->log1p_consts);
- return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x),
- vmulq_f64 (halfsign, y), special);
+ /* Copy the sign bit from the input to inf. */
+ uint64x2_t is_one = vceqq_f64 (ax, d->one);
+ float64x2_t signed_inf
+ = vbslq_f64 (d->sign_mask, halfsign, v_f64 (INFINITY));
+ /* Here we check for all the rest of the cases where |x| > 1 will return a
+ NaN, including if x = NaN. */
+ float64x2_t res = vbslq_f64 (special, v_f64 (NAN), vmulq_f64 (halfsign, y));
+
+ return vbslq_f64 (is_one, signed_inf, res);
}
/* Approximation for vector double-precision atanh(x) using modified log1p.
- The greatest observed error is 3.31 ULP:
+ The greatest observed error is 2.81 + 0.5 ULP:
_ZGVnN2v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
want 0x1.ffd8ff31b501cp-6. */
-VPCS_ATTR
-float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
+VPCS_ATTR float64x2_t V_NAME_D1 (atanh) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5));
float64x2_t ax = vabsq_f64 (x);
- uint64x2_t ia = vreinterpretq_u64_f64 (ax);
- uint64x2_t special = vcgeq_u64 (ia, d->one);
+ uint64x2_t special = vcageq_f64 (x, d->one);
float64x2_t y;
y = vaddq_f64 (ax, ax);
- y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax));
+ y = vdivq_f64 (y, vsubq_f64 (d->one, ax));
if (__glibc_unlikely (v_any_u64 (special)))
return special_case (ax, halfsign, y, special, d);
diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c
index fddf4c8..5589849 100644
--- a/sysdeps/aarch64/fpu/atanh_sve.c
+++ b/sysdeps/aarch64/fpu/atanh_sve.c
@@ -20,13 +20,20 @@
#define WANT_SV_LOG1P_K0_SHORTCUT 0
#include "sv_log1p_inline.h"
-#define One (0x3ff0000000000000)
-#define Half (0x3fe0000000000000)
+static const struct data
+{
+ uint64_t half;
+ double inf;
+ double nan;
+} data = { .half = 0x3fe0000000000000, .inf = INFINITY, .nan = NAN };
static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+special_case (svfloat64_t ax, svfloat64_t y, svbool_t pg, svbool_t special,
+ svfloat64_t halfsign, const struct data *d)
{
- return sv_call_f64 (atanh, x, y, special);
+ svfloat64_t res = svsel (special, sv_f64 (d->nan), y);
+ res = svsel (svcmpeq (special, ax, sv_f64 (1.0)), sv_f64 (d->inf), res);
+ return svmul_x (pg, res, halfsign);
}
/* SVE approximation for double-precision atanh, based on log1p.
@@ -35,24 +42,25 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
want 0x1.ffd8ff31b501cp-6. */
svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
{
+ const struct data *d = ptr_barrier (&data);
svfloat64_t ax = svabs_x (pg, x);
svuint64_t iax = svreinterpret_u64 (ax);
svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax);
- svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
+ svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->half));
/* It is special if iax >= 1. */
- svbool_t special = svacge (pg, x, 1.0);
+ svbool_t special = svacge (pg, ax, 1.0);
/* Computation is performed based on the following sequence of equality:
(1+x)/(1-x) = 1 + 2x/(1-x). */
svfloat64_t y;
y = svadd_x (pg, ax, ax);
- y = svdiv_x (pg, y, svsub_x (pg, sv_f64 (1), ax));
+ y = svdiv_x (pg, y, svsub_x (pg, sv_f64 (1.0), ax));
/* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y). */
y = sv_log1p_inline (y, pg);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmul_x (pg, halfsign, y), special);
+ return special_case (ax, y, pg, special, halfsign, d);
return svmul_x (pg, halfsign, y);
}