diff options
Diffstat (limited to 'sysdeps')
| -rw-r--r-- | sysdeps/aarch64/fpu/acosh_advsimd.c | 13 | ||||
| -rw-r--r-- | sysdeps/aarch64/fpu/acosh_sve.c | 32 | ||||
| -rw-r--r-- | sysdeps/aarch64/fpu/acoshf_advsimd.c | 73 | ||||
| -rw-r--r-- | sysdeps/aarch64/fpu/asinh_advsimd.c | 68 | ||||
| -rw-r--r-- | sysdeps/aarch64/fpu/asinh_sve.c | 48 | ||||
| -rw-r--r-- | sysdeps/aarch64/fpu/asinhf_advsimd.c | 63 | ||||
| -rw-r--r-- | sysdeps/aarch64/fpu/atanh_advsimd.c | 27 | ||||
| -rw-r--r-- | sysdeps/aarch64/fpu/atanh_sve.c | 24 |
8 files changed, 224 insertions, 124 deletions
diff --git a/sysdeps/aarch64/fpu/acosh_advsimd.c b/sysdeps/aarch64/fpu/acosh_advsimd.c index ac0db3e..b5efe92 100644 --- a/sysdeps/aarch64/fpu/acosh_advsimd.c +++ b/sysdeps/aarch64/fpu/acosh_advsimd.c @@ -34,7 +34,18 @@ static float64x2_t NOINLINE VPCS_ATTR special_case (float64x2_t x, float64x2_t y, uint64x2_t special, const struct v_log1p_data *d) { - return v_call_f64 (acosh, x, log1p_inline (y, d), special); + /* Acosh is defined on [1, inf). Its formula can be re-written knowing that 1 + becomes negligible when x is a very large number. So for special numbers, + where x >= 2^511, acosh ~= ln(2x). But, ln(2x) = ln(2) + ln(x) and below we + calculate ln(x) and then add ln(2) to the result. + + Right before returning we check if x is infinity or if x is lower than 1, + in which case we return infinity or NaN. */ + float64x2_t one = v_f64 (1.0); + float64x2_t t = log1p_inline (vbslq_f64 (special, x, y), d); + t = vbslq_f64 (special, vaddq_f64 (t, v_f64 (d->ln2[0])), t); + t = vbslq_f64 (vceqq_f64 (x, v_f64 (INFINITY)), v_f64 (INFINITY), t); + return vbslq_f64 (vcltq_f64 (x, one), v_f64 (NAN), t); } /* Vector approximation for double-precision acosh, based on log1p. diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c index 1f6b8fc..6d996e3 100644 --- a/sysdeps/aarch64/fpu/acosh_sve.c +++ b/sysdeps/aarch64/fpu/acosh_sve.c @@ -20,13 +20,33 @@ #define WANT_SV_LOG1P_K0_SHORTCUT 1 #include "sv_log1p_inline.h" -#define One (0x3ff0000000000000) -#define Thres (0x1ff0000000000000) /* asuint64 (0x1p511) - One. */ +#define One (0x3ff0000000000000ULL) +#define Thres (0x1ff0000000000000ULL) +const static struct data +{ + double ln2; +} data = { .ln2 = 0x1.62e42fefa39efp-1 }; + +/* Acosh is defined on [1, inf). Its formula can be re-written knowing that 1 + becomes negligible when x is a very large number. So for special numbers, + where x >= 2^511, acosh ~= ln(2x). 
But, ln(2x) = ln(2) + ln(x) and below we + calculate ln(x) and then add ln(2) to the result. + + Right before returning we check if x is infinity or if x is lower than 1, + in which case we return infinity or NaN. */ static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +special_case (svfloat64_t x, svfloat64_t xm1, svfloat64_t y, svbool_t special, + svbool_t pg, const struct data *d) { - return sv_call_f64 (acosh, x, y, special); + svfloat64_t inf = sv_f64 (INFINITY); + svfloat64_t nan = sv_f64 (NAN); + svfloat64_t log = sv_log1p_inline (svsel (special, xm1, y), pg); + svfloat64_t result = svadd_m (special, log, sv_f64 (d->ln2)); + svbool_t is_inf_nan = svorr_b_z (special, svcmpge (special, x, inf), + svcmplt (special, x, sv_f64 (1.0))); + svfloat64_t inf_or_nan = svsel (svcmpge (special, x, inf), inf, nan); + return svsel (is_inf_nan, inf_or_nan, result); } /* SVE approximation for double-precision acosh, based on log1p. @@ -36,6 +56,8 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) want 0x1.ef0cee7c33ce4p-2. */ svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); + /* (ix - One) >= (BigBound - One). */ svuint64_t ix = svreinterpret_u64 (x); svbool_t special = svcmpge (pg, svsub_x (pg, ix, One), Thres); @@ -46,6 +68,6 @@ svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg) /* Fall back to scalar routine for special lanes. 
*/ if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, sv_log1p_inline (y, pg), special); + return special_case (x, xm1, y, special, pg, d); return sv_log1p_inline (y, pg); } diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c index 6071190..275de77 100644 --- a/sysdeps/aarch64/fpu/acoshf_advsimd.c +++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c @@ -24,15 +24,13 @@ const static struct data struct v_log1pf_data log1pf_consts; uint32x4_t one; uint16x4_t special_bound_u16; - uint32x4_t special_bound_u32; - float32x4_t pinf, nan; + float32x4_t inf, nan; } data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000), - .special_bound_u16 = V4 (0x2000), /* asuint(sqrt(FLT_MAX)) - asuint(1). */ - .special_bound_u32 = V4 (0x20000000), - .pinf = V4 (INFINITY), + .special_bound_u16 = V4 (0x2000), + .inf = V4 (INFINITY), .nan = V4 (NAN), }; @@ -51,34 +49,42 @@ inline_acoshf (float32x4_t x, const struct data *d) } static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t x, const struct data *d) +special_case (float32x4_t x, uint16x4_t special_u16, const struct data *d) { - uint32x4_t special = vcgeq_u32 ( - vsubq_u32 (vreinterpretq_u32_f32 (x), d->one), d->special_bound_u32); - - /* To avoid the overflow in x^2 (so the x < sqrt(FLT_MAX) constraint), we - reduce the input of acosh to a narrower interval by relying on the identity - acosh(t) = 1/2acosh(2t^2 - 1) for t>=1. - If we set t=sqrt((x+1)/2), since x>=1 then t>=sqrt(2/2)=1, and therefore - acosh(x) = 2acosh(sqrt((x+1)/2)). */ - float32x4_t r = vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)); - r = vmulq_f32 (r, v_f32 (0.5f)); - r = vbslq_f32 (special, vsqrtq_f32 (r), x); - - float32x4_t y = inline_acoshf (r, d); - - y = vbslq_f32 (special, vmulq_f32 (y, v_f32 (2.0f)), y); - - /* Check whether x is less than 1, or x is inf or nan. 
*/ - uint32x4_t inf_minus_one - = vsubq_u32 (vreinterpretq_u32_f32 (d->pinf), d->one); - uint32x4_t is_infnan = vcgeq_u32 ( - vsubq_u32 (vreinterpretq_u32_f32 (x), d->one), inf_minus_one); - - y = vbslq_f32 (is_infnan, d->nan, y); - uint32x4_t ret_pinf = vceqq_f32 (x, d->pinf); - y = vbslq_f32 (ret_pinf, d->pinf, y); - return y; + float32x4_t one = v_f32 (1.0); + float32x4_t ln2 = d->log1pf_consts.ln2; + + /* acosh(x) = ln(x + sqrt[x^2 -1]). + So acosh(x) = log1p (x + sqrt[x^2 - 1] - 1). */ + float32x4_t xm1 = vsubq_f32 (x, one); + float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, one)); + + float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u)); + + /* special_u16 will be 0x0000ffff for true lanes, which doesn't work with bit + selects therefore to make it a 32 bit predicate, we have to add special + << 16. */ + uint32x4_t special = vmovl_u16 (special_u16); + special = vaddq_u32 (special, vshlq_n_u32 (special, 16)); + float32x4_t xy = vbslq_f32 (special, x, y); + + /* For large inputs, acosh(x) ≈ log(x) + ln(2). + We use log1pf-inline log implementation and add ln(2). */ + float32x4_t log_xy = log1pf_inline (xy, &d->log1pf_consts); + + /* For acoshf there are three special cases that need considering. Infinity + and NaNs, which are also returned unchanged and for cases of x < 1 we'll + return all NaNs since acoshf is defined on (1, inf). */ + uint32x4_t is_lower_one = vcltq_f32 (x, one); + uint32x4_t is_finite = vcltq_f32 (x, d->inf); + + /* This dependency chain of selects can run in parallel with y and log_x + being calculated before the last addition. */ + float32x4_t ln2_inf_nan = vbslq_f32 (is_finite, ln2, x); + ln2_inf_nan = vbslq_f32 (is_lower_one, d->nan, ln2_inf_nan); + float32x4_t ln2_inf_nan_zero = vbslq_f32 (special, ln2_inf_nan, v_f32 (0)); + + return vaddq_f32 (log_xy, ln2_inf_nan_zero); } /* Vector approximation for single-precision acosh, based on log1p. 
@@ -97,8 +103,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acosh) (float32x4_t x) = vcge_u16 (vsubhn_u32 (ix, d->one), d->special_bound_u16); if (__glibc_unlikely (v_any_u16h (special))) - return special_case (x, d); + return special_case (x, special, d); return inline_acoshf (x, d); } + libmvec_hidden_def (V_NAME_F1 (acosh)) HALF_WIDTH_ALIAS_F1 (acosh) diff --git a/sysdeps/aarch64/fpu/asinh_advsimd.c b/sysdeps/aarch64/fpu/asinh_advsimd.c index 5168917..650ed46 100644 --- a/sysdeps/aarch64/fpu/asinh_advsimd.c +++ b/sysdeps/aarch64/fpu/asinh_advsimd.c @@ -27,36 +27,43 @@ const static struct data double lc1, lc3, ln2, lc4; float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c17; double c1, c3, c5, c7, c9, c11, c13, c15; + float64x2_t inf; } data = { /* Even terms of polynomial s.t. asinh(x) is approximated by asinh(x) ~= x + x^3 * (C0 + C1 * x + C2 * x^2 + C3 * x^3 + ...). Generated using Remez, f = (asinh(sqrt(x)) - sqrt(x))/x^(3/2). */ - .c0 = V2 (-0x1.55555555554a7p-3), .c1 = 0x1.3333333326c7p-4, - .c2 = V2 (-0x1.6db6db68332e6p-5), .c3 = 0x1.f1c71b26fb40dp-6, - .c4 = V2 (-0x1.6e8b8b654a621p-6), .c5 = 0x1.1c4daa9e67871p-6, - .c6 = V2 (-0x1.c9871d10885afp-7), .c7 = 0x1.7a16e8d9d2ecfp-7, - .c8 = V2 (-0x1.3ddca533e9f54p-7), .c9 = 0x1.0becef748dafcp-7, - .c10 = V2 (-0x1.b90c7099dd397p-8), .c11 = 0x1.541f2bb1ffe51p-8, - .c12 = V2 (-0x1.d217026a669ecp-9), .c13 = 0x1.0b5c7977aaf7p-9, - .c14 = V2 (-0x1.e0f37daef9127p-11), .c15 = 0x1.388b5fe542a6p-12, - .c16 = V2 (-0x1.021a48685e287p-14), .c17 = V2 (0x1.93d4ba83d34dap-18), - .lc0 = V2 (-0x1.ffffffffffff7p-2), .lc1 = 0x1.55555555170d4p-2, - .lc2 = V2 (-0x1.0000000399c27p-2), .lc3 = 0x1.999b2e90e94cap-3, - .lc4 = -0x1.554e550bd501ep-3, .ln2 = 0x1.62e42fefa39efp-1, - .off = V2 (0x3fe6900900000000), .huge_bound = V2 (0x5fe0000000000000), - .abs_mask = V2 (0x7fffffffffffffff), .mask = V2 (0xfffULL << 52), + .c0 = V2 (-0x1.55555555554a7p-3), + .c1 = 0x1.3333333326c7p-4, + .c2 = V2 (-0x1.6db6db68332e6p-5), + .c3 = 
0x1.f1c71b26fb40dp-6, + .c4 = V2 (-0x1.6e8b8b654a621p-6), + .c5 = 0x1.1c4daa9e67871p-6, + .c6 = V2 (-0x1.c9871d10885afp-7), + .c7 = 0x1.7a16e8d9d2ecfp-7, + .c8 = V2 (-0x1.3ddca533e9f54p-7), + .c9 = 0x1.0becef748dafcp-7, + .c10 = V2 (-0x1.b90c7099dd397p-8), + .c11 = 0x1.541f2bb1ffe51p-8, + .c12 = V2 (-0x1.d217026a669ecp-9), + .c13 = 0x1.0b5c7977aaf7p-9, + .c14 = V2 (-0x1.e0f37daef9127p-11), + .c15 = 0x1.388b5fe542a6p-12, + .c16 = V2 (-0x1.021a48685e287p-14), + .c17 = V2 (0x1.93d4ba83d34dap-18), + .lc0 = V2 (-0x1.ffffffffffff7p-2), + .lc1 = 0x1.55555555170d4p-2, + .lc2 = V2 (-0x1.0000000399c27p-2), + .lc3 = 0x1.999b2e90e94cap-3, + .lc4 = -0x1.554e550bd501ep-3, + .ln2 = 0x1.62e42fefa39efp-1, + .off = V2 (0x3fe6900900000000), + .huge_bound = V2 (0x5fe0000000000000), + .abs_mask = V2 (0x7fffffffffffffff), + .mask = V2 (0xfffULL << 52), + .inf = V2 (INFINITY) }; -static float64x2_t NOINLINE VPCS_ATTR -special_case (float64x2_t x, float64x2_t y, uint64x2_t abs_mask, - uint64x2_t special) -{ - /* Copy sign. */ - y = vbslq_f64 (abs_mask, y, x); - return v_call_f64 (asinh, x, y, special); -} - #define N (1 << V_LOG_TABLE_BITS) #define IndexMask (N - 1) @@ -110,6 +117,19 @@ log_inline (float64x2_t ax, const struct data *d) return vfmaq_f64 (hi, y, r2); } +static float64x2_t NOINLINE VPCS_ATTR +special_case (float64x2_t x, uint64x2_t special, float64x2_t y, float64x2_t ax, + const struct data *d) +{ + /* For very large inputs (x > 2^511), asinh(x) ≈ ln(2x). + In this range the +sqrt(x^2+1) term is negligible, so we compute + asinh(x) as ln(x) + ln(2). */ + float64x2_t res_asinh = vaddq_f64 (log_inline (ax, d), v_f64 (d->ln2)); + res_asinh = vbslq_f64 (vceqq_f64 (ax, d->inf), d->inf, res_asinh); + res_asinh = vbslq_f64 (special, res_asinh, y); + return vbslq_f64 (d->abs_mask, res_asinh, x); +} + /* Double-precision implementation of vector asinh(x). 
asinh is very sensitive around 1, so it is impractical to devise a single low-cost algorithm which is sufficiently accurate on a wide range of input. @@ -179,7 +199,7 @@ VPCS_ATTR float64x2_t V_NAME_D1 (asinh) (float64x2_t x) /* Choose the right option for each lane. */ float64x2_t y = vbslq_f64 (gt1, option_1, option_2); if (__glibc_unlikely (v_any_u64 (special))) - return special_case (x, y, d->abs_mask, special); + return special_case (x, special, y, ax, d); /* Copy sign. */ return vbslq_f64 (d->abs_mask, y, x); diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c index bf9dad9..cb32b60 100644 --- a/sysdeps/aarch64/fpu/asinh_sve.c +++ b/sysdeps/aarch64/fpu/asinh_sve.c @@ -29,7 +29,7 @@ static const struct data double even_coeffs[9]; double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17; uint64_t off, mask; - + double inf; } data = { /* Polynomial generated using Remez on [2^-26, 1]. */ .even_coeffs ={ @@ -61,14 +61,9 @@ static const struct data .p4 = -0x1.554e550bd501ep-3, .off = 0x3fe6900900000000, .mask = 0xfffULL << 52, + .inf = INFINITY }; -static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) -{ - return sv_call_f64 (asinh, x, y, special); -} - static inline svfloat64_t __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg) { @@ -104,12 +99,38 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg) return y; } +static svfloat64_t NOINLINE +special_case (svfloat64_t ax, svfloat64_t y, svuint64_t sign, svbool_t special, + svbool_t pg, const struct data *d) +{ + /* For very large inputs (x > 2^511), asinh(x) ≈ ln(2x). + In this range the +sqrt(x^2+1) term is negligible, so we compute + asinh(x) as ln(x) + ln(2) later in this function. */ + svfloat64_t log_ax = __sv_log_inline (ax, d, special); + + /* The only special cases that need considering are infinity and NaNs since + 0 will be handled by other calculations. 
*/ + svfloat64_t inf = sv_f64 (d->inf); + svbool_t is_inf = svcmpeq (special, ax, inf); + svbool_t is_nan = svcmpne (special, ax, ax); + svfloat64_t inf_ln2 = svsel (is_inf, inf, sv_f64 (d->ln2)); + svfloat64_t inf_nan_ln2 = svsel (is_nan, sv_f64 (NAN), inf_ln2); + svfloat64_t asinh_x_res = svadd_x (special, log_ax, inf_nan_ln2); + + /* Now select (based on special) between x and y to change the type, and + return either the positive or negative value, considering the input and + its sign. */ + svfloat64_t result = svsel (special, asinh_x_res, y); + svuint64_t result_uint = svreinterpret_u64 (result); + return svreinterpret_f64 (sveor_m (pg, result_uint, sign)); +} + /* Double-precision implementation of SVE asinh(x). asinh is very sensitive around 1, so it is impractical to devise a single low-cost algorithm which is sufficiently accurate on a wide range of input. Instead we use two different algorithms: asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1) if |x| >= 1 - = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise + = sign(x) * (|x| + |x|^3 * P(x^2)) otherwise where log(x) is an optimized log approximation, and P(x) is a polynomial shared with the scalar routine. The greatest observed error 2.51 ULP, in |x| >= 1: @@ -180,14 +201,11 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg) option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax)); } - if (__glibc_unlikely (svptest_any (pg, special))) - return special_case ( - x, - svreinterpret_f64 (sveor_x ( - pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)), - special); - /* Choose the right option for each lane. 
*/ svfloat64_t y = svsel (ge1, option_1, option_2); + + if (__glibc_unlikely (svptest_any (pg, special))) + return special_case (ax, y, sign, special, pg, d); + return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)); } diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c index f1b1dd7..3454e46 100644 --- a/sysdeps/aarch64/fpu/asinhf_advsimd.c +++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c @@ -25,12 +25,12 @@ const static struct data struct v_log1pf_data log1pf_consts; float32x4_t one; uint32x4_t square_lim; - float32x4_t pinf, nan; + float32x4_t inf, nan; } data = { .one = V4 (1), .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .square_lim = V4 (0x5f800000), /* asuint(sqrt(FLT_MAX)). */ - .pinf = V4 (INFINITY), + .inf = V4 (INFINITY), .nan = V4 (NAN), }; @@ -51,31 +51,39 @@ static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t ax, uint32x4_t sign, uint32x4_t special, const struct data *d) { - /* To avoid overflow in x^2 (so the x < sqrt(FLT_MAX) constraint), we - reduce the input of asinh to a narrower interval by relying on the - identity: 2asinh(t) = +-acosh(2t^2 + 1) - If we set t=sqrt((x-1)/2), then - 2asinh(sqrt((x-1)/2)) = acosh(x). - Found that, for a high input x, asinh(x) very closely approximates - acosh(x), so implemented it with this function instead. */ - float32x4_t r = vsubq_f32 (ax, d->one); - r = vmulq_f32 (r, v_f32 (0.5f)); - r = vbslq_f32 (special, vsqrtq_f32 (r), ax); - - float32x4_t y = inline_asinhf (r, sign, d); - - y = vbslq_f32 (special, vmulq_f32 (y, v_f32 (2.0f)), y); - - /* Check whether x is inf or nan. */ - uint32x4_t ret_inf = vceqq_f32 (ax, d->pinf); - uint32x4_t ret_nan = vmvnq_u32 (vcleq_f32 (ax, d->pinf)); - y = vbslq_f32 (ret_inf, d->pinf, y); - y = vbslq_f32 (ret_nan, d->nan, y); - /* Put sign back in for minf, as it doesn't happen in log1pf_inline call. 
*/ - y = vbslq_f32 ( - ret_inf, - vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign)), y); - return y; + float32x4_t t + = vaddq_f32 (v_f32 (1.0f), vsqrtq_f32 (vfmaq_f32 (d->one, ax, ax))); + float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), t)); + + /* For large inputs (x > 2^64), asinh(x) ≈ ln(2x). + 1 becomes negligible in sqrt(x^2+1), so we compute + asinh(x) as ln(x) + ln(2). */ + float32x4_t xy = vbslq_f32 (special, ax, y); + float32x4_t log_xy = log1pf_inline (xy, &d->log1pf_consts); + + /* Infinity and NaNs are the only other special cases that need checking + before we return the values. 0 is handled by inline_asinhf as it returns + 0. Below we implement the logic that returns infinity when infinity is + passed and NaNs when NaNs are passed. Sometimes due to intrinsics like + vdivq_f32, infinity can change into NaNs, so we want to make sure the + right result is returned. + + Since these steps run in parallel with the log, we select between the + results we'll want to add to log_x in time, since addition with infinity + and NaNs doesn't make a difference. When we get to the addition step, + everything is already in the right place. */ + float32x4_t ln2 = d->log1pf_consts.ln2; + uint32x4_t is_finite = vcltq_f32 (ax, d->inf); + float32x4_t ln2_inf_nan = vbslq_f32 (is_finite, ln2, ax); + + /* Before returning the result, the right sign will be assigned to the + absolute result. This is because we pass an absolute x to the function. + */ + + float32x4_t asinhf + = vbslq_f32 (special, vaddq_f32 (log_xy, ln2_inf_nan), log_xy); + return vreinterpretq_f32_u32 ( + veorq_u32 (sign, vreinterpretq_u32_f32 (asinhf))); } /* Single-precision implementation of vector asinh(x), using vector log1p. 
@@ -99,5 +107,6 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asinh) (float32x4_t x) return special_case (ax, sign, special, d); return inline_asinhf (ax, sign, d); } + libmvec_hidden_def (V_NAME_F1 (asinh)) HALF_WIDTH_ALIAS_F1 (asinh) diff --git a/sysdeps/aarch64/fpu/atanh_advsimd.c b/sysdeps/aarch64/fpu/atanh_advsimd.c index 711a234..a3798ef 100644 --- a/sysdeps/aarch64/fpu/atanh_advsimd.c +++ b/sysdeps/aarch64/fpu/atanh_advsimd.c @@ -23,38 +23,43 @@ const static struct data { struct v_log1p_data log1p_consts; - uint64x2_t one; + float64x2_t one; uint64x2_t sign_mask; } data = { .log1p_consts = V_LOG1P_CONSTANTS_TABLE, - .one = V2 (0x3ff0000000000000), + .one = V2 (1.0), .sign_mask = V2 (0x8000000000000000) }; static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t x, float64x2_t halfsign, float64x2_t y, +special_case (float64x2_t ax, float64x2_t halfsign, float64x2_t y, uint64x2_t special, const struct data *d) { y = log1p_inline (y, &d->log1p_consts); - return v_call_f64 (atanh, vbslq_f64 (d->sign_mask, halfsign, x), - vmulq_f64 (halfsign, y), special); + /* Copy the sign bit from the input to inf. */ + uint64x2_t is_one = vceqq_f64 (ax, d->one); + float64x2_t signed_inf + = vbslq_f64 (d->sign_mask, halfsign, v_f64 (INFINITY)); + /* Here we check for all the rest of the cases where |x| > 1 will return a + NaN, including if x = NaN. */ + float64x2_t res = vbslq_f64 (special, v_f64 (NAN), vmulq_f64 (halfsign, y)); + + return vbslq_f64 (is_one, signed_inf, res); } /* Approximation for vector double-precision atanh(x) using modified log1p. - The greatest observed error is 3.31 ULP: + The greatest observed error is 2.81 + 0.5 ULP: _ZGVnN2v_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6 want 0x1.ffd8ff31b501cp-6. 
*/ -VPCS_ATTR -float64x2_t V_NAME_D1 (atanh) (float64x2_t x) +VPCS_ATTR float64x2_t V_NAME_D1 (atanh) (float64x2_t x) { const struct data *d = ptr_barrier (&data); float64x2_t halfsign = vbslq_f64 (d->sign_mask, x, v_f64 (0.5)); float64x2_t ax = vabsq_f64 (x); - uint64x2_t ia = vreinterpretq_u64_f64 (ax); - uint64x2_t special = vcgeq_u64 (ia, d->one); + uint64x2_t special = vcageq_f64 (x, d->one); float64x2_t y; y = vaddq_f64 (ax, ax); - y = vdivq_f64 (y, vsubq_f64 (vreinterpretq_f64_u64 (d->one), ax)); + y = vdivq_f64 (y, vsubq_f64 (d->one, ax)); if (__glibc_unlikely (v_any_u64 (special))) return special_case (ax, halfsign, y, special, d); diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c index fddf4c8..5589849 100644 --- a/sysdeps/aarch64/fpu/atanh_sve.c +++ b/sysdeps/aarch64/fpu/atanh_sve.c @@ -20,13 +20,20 @@ #define WANT_SV_LOG1P_K0_SHORTCUT 0 #include "sv_log1p_inline.h" -#define One (0x3ff0000000000000) -#define Half (0x3fe0000000000000) +static const struct data +{ + uint64_t half; + double inf; + double nan; +} data = { .half = 0x3fe0000000000000, .inf = INFINITY, .nan = NAN }; static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +special_case (svfloat64_t ax, svfloat64_t y, svbool_t pg, svbool_t special, + svfloat64_t halfsign, const struct data *d) { - return sv_call_f64 (atanh, x, y, special); + svfloat64_t res = svsel (special, sv_f64 (d->nan), y); + res = svsel (svcmpeq (special, ax, sv_f64 (1.0)), sv_f64 (d->inf), res); + return svmul_x (pg, res, halfsign); } /* SVE approximation for double-precision atanh, based on log1p. @@ -35,24 +42,25 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) want 0x1.ffd8ff31b501cp-6. 
*/ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg) { + const struct data *d = ptr_barrier (&data); svfloat64_t ax = svabs_x (pg, x); svuint64_t iax = svreinterpret_u64 (ax); svuint64_t sign = sveor_x (pg, svreinterpret_u64 (x), iax); - svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half)); + svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->half)); /* It is special if iax >= 1. */ - svbool_t special = svacge (pg, x, 1.0); + svbool_t special = svacge (pg, ax, 1.0); /* Computation is performed based on the following sequence of equality: (1+x)/(1-x) = 1 + 2x/(1-x). */ svfloat64_t y; y = svadd_x (pg, ax, ax); - y = svdiv_x (pg, y, svsub_x (pg, sv_f64 (1), ax)); + y = svdiv_x (pg, y, svsub_x (pg, sv_f64 (1.0), ax)); /* ln((1+x)/(1-x)) = ln(1+2x/(1-x)) = ln(1 + y). */ y = sv_log1p_inline (y, pg); if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svmul_x (pg, halfsign, y), special); + return special_case (ax, y, pg, special, halfsign, d); return svmul_x (pg, halfsign, y); } |
