about | summary | refs | log | tree | commit | diff
path: root/sysdeps/aarch64
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/aarch64')
-rw-r--r-- sysdeps/aarch64/fpu/Makefile 2
-rw-r--r-- sysdeps/aarch64/fpu/Versions 7
-rw-r--r-- sysdeps/aarch64/fpu/acoshf_advsimd.c 34
-rw-r--r-- sysdeps/aarch64/fpu/advsimd_f32_protos.h 1
-rw-r--r-- sysdeps/aarch64/fpu/asinhf_advsimd.c 33
-rw-r--r-- sysdeps/aarch64/fpu/atanhf_advsimd.c 26
-rw-r--r-- sysdeps/aarch64/fpu/bits/math-vector.h 8
-rw-r--r-- sysdeps/aarch64/fpu/cos_advsimd.c 11
-rw-r--r-- sysdeps/aarch64/fpu/cosf_advsimd.c 9
-rw-r--r-- sysdeps/aarch64/fpu/coshf_sve.c 35
-rw-r--r-- sysdeps/aarch64/fpu/erf_advsimd.c 29
-rw-r--r-- sysdeps/aarch64/fpu/erf_data.c 8
-rw-r--r-- sysdeps/aarch64/fpu/erf_sve.c 10
-rw-r--r-- sysdeps/aarch64/fpu/erfc_advsimd.c 17
-rw-r--r-- sysdeps/aarch64/fpu/erfc_data.c 8
-rw-r--r-- sysdeps/aarch64/fpu/erfc_sve.c 2
-rw-r--r-- sysdeps/aarch64/fpu/erfcf_advsimd.c 8
-rw-r--r-- sysdeps/aarch64/fpu/erfcf_data.c 8
-rw-r--r-- sysdeps/aarch64/fpu/erfcf_sve.c 2
-rw-r--r-- sysdeps/aarch64/fpu/erff_advsimd.c 8
-rw-r--r-- sysdeps/aarch64/fpu/erff_data.c 8
-rw-r--r-- sysdeps/aarch64/fpu/erff_sve.c 13
-rw-r--r-- sysdeps/aarch64/fpu/exp10f_sve.c 83
-rw-r--r-- sysdeps/aarch64/fpu/exp2f_sve.c 70
-rw-r--r-- sysdeps/aarch64/fpu/expf_advsimd.c 10
-rw-r--r-- sysdeps/aarch64/fpu/expf_sve.c 62
-rw-r--r-- sysdeps/aarch64/fpu/expm1f_advsimd.c 62
-rw-r--r-- sysdeps/aarch64/fpu/log10f_advsimd.c 38
-rw-r--r-- sysdeps/aarch64/fpu/log10f_sve.c 41
-rw-r--r-- sysdeps/aarch64/fpu/log1p_advsimd.c 2
-rw-r--r-- sysdeps/aarch64/fpu/log1p_sve.c 2
-rw-r--r-- sysdeps/aarch64/fpu/log1pf_advsimd.c 124
-rw-r--r-- sysdeps/aarch64/fpu/log1pf_sve.c 2
-rw-r--r-- sysdeps/aarch64/fpu/log2f_advsimd.c 38
-rw-r--r-- sysdeps/aarch64/fpu/log2f_sve.c 37
-rw-r--r-- sysdeps/aarch64/fpu/logf_advsimd.c 41
-rw-r--r-- sysdeps/aarch64/fpu/logf_sve.c 38
-rw-r--r-- sysdeps/aarch64/fpu/sin_advsimd.c 16
-rw-r--r-- sysdeps/aarch64/fpu/sinf_advsimd.c 22
-rw-r--r-- sysdeps/aarch64/fpu/sinhf_advsimd.c 23
-rw-r--r-- sysdeps/aarch64/fpu/sv_erf_data.c 1570
-rw-r--r-- sysdeps/aarch64/fpu/sv_erff_data.c 1058
-rw-r--r-- sysdeps/aarch64/fpu/sv_expf_inline.h 34
-rw-r--r-- sysdeps/aarch64/fpu/tanhf_advsimd.c 21
-rw-r--r-- sysdeps/aarch64/fpu/v_expm1f_inline.h 43
-rw-r--r-- sysdeps/aarch64/fpu/v_log1pf_inline.h 71
-rw-r--r-- sysdeps/aarch64/fpu/vecmath_config.h 28
-rw-r--r-- sysdeps/aarch64/libm-test-ulps 33
-rw-r--r-- sysdeps/aarch64/memset-reg.h 30
-rw-r--r-- sysdeps/aarch64/memset.S 194
-rw-r--r-- sysdeps/aarch64/multiarch/memset_a64fx.S 9
-rw-r--r-- sysdeps/aarch64/multiarch/memset_emag.S 8
-rw-r--r-- sysdeps/aarch64/multiarch/memset_kunpeng.S 7
-rw-r--r-- sysdeps/aarch64/multiarch/memset_oryon1.S 8
54 files changed, 721 insertions, 3391 deletions
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 234a6c4..be8541f 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -41,8 +41,6 @@ libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
v_log10_data \
erf_data \
erff_data \
- sv_erf_data \
- sv_erff_data \
v_exp_tail_data \
erfc_data \
erfcf_data \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index cc15ce2..015211f 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -135,4 +135,11 @@ libmvec {
_ZGVsMxv_tanh;
_ZGVsMxv_tanhf;
}
+ GLIBC_2.41 {
+ _ZGVnN2v_logp1;
+ _ZGVnN2v_logp1f;
+ _ZGVnN4v_logp1f;
+ _ZGVsMxv_logp1;
+ _ZGVsMxv_logp1f;
+ }
}
diff --git a/sysdeps/aarch64/fpu/acoshf_advsimd.c b/sysdeps/aarch64/fpu/acoshf_advsimd.c
index 8916dcbf4..004474a 100644
--- a/sysdeps/aarch64/fpu/acoshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/acoshf_advsimd.c
@@ -25,35 +25,32 @@ const static struct data
{
struct v_log1pf_data log1pf_consts;
uint32x4_t one;
- uint16x4_t thresh;
-} data = {
- .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
- .one = V4 (0x3f800000),
- .thresh = V4 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
-};
+} data = { .log1pf_consts = V_LOG1PF_CONSTANTS_TABLE, .one = V4 (0x3f800000) };
+
+#define Thresh vdup_n_u16 (0x2000) /* top(asuint(SquareLim) - asuint(1)). */
static float32x4_t NOINLINE VPCS_ATTR
special_case (float32x4_t x, float32x4_t y, uint16x4_t special,
- const struct v_log1pf_data d)
+ const struct v_log1pf_data *d)
{
return v_call_f32 (acoshf, x, log1pf_inline (y, d), vmovl_u16 (special));
}
/* Vector approximation for single-precision acosh, based on log1p. Maximum
error depends on WANT_SIMD_EXCEPT. With SIMD fp exceptions enabled, it
- is 2.78 ULP:
- __v_acoshf(0x1.07887p+0) got 0x1.ef9e9cp-3
- want 0x1.ef9ea2p-3.
+ is 3.00 ULP:
+ _ZGVnN4v_acoshf(0x1.01df3ap+0) got 0x1.ef0a82p-4
+ want 0x1.ef0a7cp-4.
With exceptions disabled, we can compute u with a shorter dependency chain,
- which gives maximum error of 3.07 ULP:
- __v_acoshf(0x1.01f83ep+0) got 0x1.fbc7fap-4
- want 0x1.fbc7f4p-4. */
+ which gives maximum error of 3.22 ULP:
+ _ZGVnN4v_acoshf(0x1.007ef2p+0) got 0x1.fdcdccp-5
+ want 0x1.fdcdd2p-5. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
- uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), d->thresh);
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (ix, d->one), Thresh);
#if WANT_SIMD_EXCEPT
/* Mask special lanes with 1 to side-step spurious invalid or overflow. Use
@@ -64,15 +61,16 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (acosh) (float32x4_t x)
float32x4_t xm1 = v_zerofy_f32 (vsubq_f32 (x, v_f32 (1)), p);
float32x4_t u = vfmaq_f32 (vaddq_f32 (xm1, xm1), xm1, xm1);
#else
- float32x4_t xm1 = vsubq_f32 (x, v_f32 (1));
- float32x4_t u = vmulq_f32 (xm1, vaddq_f32 (x, v_f32 (1.0f)));
+ float32x4_t xm1 = vsubq_f32 (x, vreinterpretq_f32_u32 (d->one));
+ float32x4_t u
+ = vmulq_f32 (xm1, vaddq_f32 (x, vreinterpretq_f32_u32 (d->one)));
#endif
float32x4_t y = vaddq_f32 (xm1, vsqrtq_f32 (u));
if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (x, y, special, d->log1pf_consts);
- return log1pf_inline (y, d->log1pf_consts);
+ return special_case (x, y, special, &d->log1pf_consts);
+ return log1pf_inline (y, &d->log1pf_consts);
}
libmvec_hidden_def (V_NAME_F1 (acosh))
HALF_WIDTH_ALIAS_F1 (acosh)
diff --git a/sysdeps/aarch64/fpu/advsimd_f32_protos.h b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
index 097d403..5909bb4 100644
--- a/sysdeps/aarch64/fpu/advsimd_f32_protos.h
+++ b/sysdeps/aarch64/fpu/advsimd_f32_protos.h
@@ -36,6 +36,7 @@ libmvec_hidden_proto (V_NAME_F2(hypot));
libmvec_hidden_proto (V_NAME_F1(log10));
libmvec_hidden_proto (V_NAME_F1(log1p));
libmvec_hidden_proto (V_NAME_F1(log2));
+libmvec_hidden_proto (V_NAME_F1(logp1));
libmvec_hidden_proto (V_NAME_F1(log));
libmvec_hidden_proto (V_NAME_F2(pow));
libmvec_hidden_proto (V_NAME_F1(sin));
diff --git a/sysdeps/aarch64/fpu/asinhf_advsimd.c b/sysdeps/aarch64/fpu/asinhf_advsimd.c
index 09fd8a6..eb789b9 100644
--- a/sysdeps/aarch64/fpu/asinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinhf_advsimd.c
@@ -20,16 +20,16 @@
#include "v_math.h"
#include "v_log1pf_inline.h"
-#define SignMask v_u32 (0x80000000)
-
const static struct data
{
struct v_log1pf_data log1pf_consts;
+ float32x4_t one;
uint32x4_t big_bound;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound;
#endif
} data = {
+ .one = V4 (1),
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
.big_bound = V4 (0x5f800000), /* asuint(0x1p64). */
#if WANT_SIMD_EXCEPT
@@ -38,20 +38,27 @@ const static struct data
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t sign, float32x4_t y,
+ uint32x4_t special, const struct data *d)
{
- return v_call_f32 (asinhf, x, y, special);
+ return v_call_f32 (
+ asinhf, x,
+ vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &d->log1pf_consts)))),
+ special);
}
/* Single-precision implementation of vector asinh(x), using vector log1p.
- Worst-case error is 2.66 ULP, at roughly +/-0.25:
- __v_asinhf(0x1.01b04p-2) got 0x1.fe163ep-3 want 0x1.fe1638p-3. */
+ Worst-case error is 2.59 ULP:
+ _ZGVnN4v_asinhf(0x1.d86124p-3) got 0x1.d449bep-3
+ want 0x1.d449c4p-3. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
{
const struct data *dat = ptr_barrier (&data);
- uint32x4_t iax = vbicq_u32 (vreinterpretq_u32_f32 (x), SignMask);
- float32x4_t ax = vreinterpretq_f32_u32 (iax);
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t special = vcgeq_u32 (iax, dat->big_bound);
+ uint32x4_t sign = veorq_u32 (vreinterpretq_u32_f32 (x), iax);
float32x4_t special_arg = x;
#if WANT_SIMD_EXCEPT
@@ -68,13 +75,13 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (asinh) (float32x4_t x)
/* asinh(x) = log(x + sqrt(x * x + 1)).
For positive x, asinh(x) = log1p(x + x * x / (1 + sqrt(x * x + 1))). */
float32x4_t d
- = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (v_f32 (1), x, x)));
- float32x4_t y = log1pf_inline (
- vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d)), dat->log1pf_consts);
+ = vaddq_f32 (v_f32 (1), vsqrtq_f32 (vfmaq_f32 (dat->one, ax, ax)));
+ float32x4_t y = vaddq_f32 (ax, vdivq_f32 (vmulq_f32 (ax, ax), d));
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (special_arg, vbslq_f32 (SignMask, x, y), special);
- return vbslq_f32 (SignMask, x, y);
+ return special_case (special_arg, sign, y, special, dat);
+ return vreinterpretq_f32_u32 (veorq_u32 (
+ sign, vreinterpretq_u32_f32 (log1pf_inline (y, &dat->log1pf_consts))));
}
libmvec_hidden_def (V_NAME_F1 (asinh))
HALF_WIDTH_ALIAS_F1 (asinh)
diff --git a/sysdeps/aarch64/fpu/atanhf_advsimd.c b/sysdeps/aarch64/fpu/atanhf_advsimd.c
index ae488f7..818b6c9 100644
--- a/sysdeps/aarch64/fpu/atanhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/atanhf_advsimd.c
@@ -40,15 +40,17 @@ const static struct data
#define Half v_u32 (0x3f000000)
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
+ uint32x4_t special)
{
- return v_call_f32 (atanhf, x, y, special);
+ return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
+ vmulq_f32 (halfsign, y), special);
}
/* Approximation for vector single-precision atanh(x) using modified log1p.
- The maximum error is 3.08 ULP:
- __v_atanhf(0x1.ff215p-5) got 0x1.ffcb7cp-5
- want 0x1.ffcb82p-5. */
+ The maximum error is 2.93 ULP:
+ _ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
+ want 0x1.f4dcf8p-5. */
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
@@ -68,11 +70,19 @@ VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
uint32x4_t special = vcgeq_u32 (iax, d->one);
#endif
- float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax), vsubq_f32 (v_f32 (1), ax));
- y = log1pf_inline (y, d->log1pf_consts);
+ float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
+ vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
+ y = log1pf_inline (y, &d->log1pf_consts);
+ /* If exceptions not required, pass ax to special-case for shorter dependency
+ chain. If exceptions are required ax will have been zerofied, so have to
+ pass x. */
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vmulq_f32 (halfsign, y), special);
+#if WANT_SIMD_EXCEPT
+ return special_case (x, halfsign, y, special);
+#else
+ return special_case (ax, halfsign, y, special);
+#endif
return vmulq_f32 (halfsign, y);
}
libmvec_hidden_def (V_NAME_F1 (atanh))
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 7484150..f295fe1 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -113,6 +113,10 @@
# define __DECL_SIMD_log2 __DECL_SIMD_aarch64
# undef __DECL_SIMD_log2f
# define __DECL_SIMD_log2f __DECL_SIMD_aarch64
+# undef __DECL_SIMD_logp1
+# define __DECL_SIMD_logp1 __DECL_SIMD_aarch64
+# undef __DECL_SIMD_logp1f
+# define __DECL_SIMD_logp1f __DECL_SIMD_aarch64
# undef __DECL_SIMD_pow
# define __DECL_SIMD_pow __DECL_SIMD_aarch64
# undef __DECL_SIMD_powf
@@ -180,6 +184,7 @@ __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_logp1f (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinhf (__f32x4_t);
@@ -207,6 +212,7 @@ __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_logp1 (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_sinh (__f64x2_t);
@@ -239,6 +245,7 @@ __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_logp1f (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxvv_powf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinhf (__sv_f32_t, __sv_bool_t);
@@ -266,6 +273,7 @@ __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_logp1 (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxvv_pow (__sv_f64_t, __sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_sinh (__sv_f64_t, __sv_bool_t);
diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c
index 3924c9c..11a89b1 100644
--- a/sysdeps/aarch64/fpu/cos_advsimd.c
+++ b/sysdeps/aarch64/fpu/cos_advsimd.c
@@ -22,7 +22,7 @@
static const struct data
{
float64x2_t poly[7];
- float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3;
+ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
/* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
@@ -30,11 +30,9 @@ static const struct data
V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
V2 (-0x1.9e9540300a1p-41) },
.inv_pi = V2 (0x1.45f306dc9c883p-2),
- .half_pi = V2 (0x1.921fb54442d18p+0),
.pi_1 = V2 (0x1.921fb54442d18p+1),
.pi_2 = V2 (0x1.1a62633145c06p-53),
.pi_3 = V2 (0x1.c1cd129024e09p-106),
- .shift = V2 (0x1.8p52),
.range_val = V2 (0x1p23)
};
@@ -68,10 +66,9 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
#endif
/* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi));
- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
- n = vsubq_f64 (n, d->shift);
- n = vsubq_f64 (n, v_f64 (0.5));
+ n = vrndaq_f64 (vfmaq_f64 (v_f64 (0.5), r, d->inv_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
+ n = vsubq_f64 (n, v_f64 (0.5f));
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f64 (r, d->pi_1, n);
diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
index d0c285b..85a1b37 100644
--- a/sysdeps/aarch64/fpu/cosf_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
@@ -22,7 +22,7 @@
static const struct data
{
float32x4_t poly[4];
- float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3;
+ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
/* 1.886 ulp error. */
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
@@ -33,8 +33,6 @@ static const struct data
.pi_3 = V4 (-0x1.ee59dap-49f),
.inv_pi = V4 (0x1.45f306p-2f),
- .shift = V4 (0x1.8p+23f),
- .half_pi = V4 (0x1.921fb6p0f),
.range_val = V4 (0x1p20f)
};
@@ -69,9 +67,8 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
#endif
/* n = rint((|x|+pi/2)/pi) - 0.5. */
- n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi));
- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
- n = vsubq_f32 (n, d->shift);
+ n = vrndaq_f32 (vfmaq_f32 (v_f32 (0.5), r, d->inv_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
n = vsubq_f32 (n, v_f32 (0.5f));
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
index e5d8a29..7ad6efa 100644
--- a/sysdeps/aarch64/fpu/coshf_sve.c
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@@ -23,37 +23,42 @@
static const struct data
{
struct sv_expf_data expf_consts;
- uint32_t special_bound;
+ float special_bound;
} data = {
.expf_consts = SV_EXPF_DATA,
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
- .special_bound = 0x42ad496c,
+ .special_bound = 0x1.5a92d8p+6,
};
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t pg)
+special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
+ svbool_t pg)
{
- return sv_call_f32 (coshf, x, y, pg);
+ return sv_call_f32 (coshf, x, svadd_x (svptrue_b32 (), half_e, half_over_e),
+ pg);
}
/* Single-precision vector cosh, using vector expf.
- Maximum error is 1.89 ULP:
- _ZGVsMxv_coshf (-0x1.65898cp+6) got 0x1.f00aep+127
- want 0x1.f00adcp+127. */
+ Maximum error is 2.77 ULP:
+ _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2
+ want 0x1.e4594cp+2. */
svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svfloat32_t ax = svabs_x (pg, x);
- svbool_t special = svcmpge (pg, svreinterpret_u32 (ax), d->special_bound);
+ svbool_t special = svacge (pg, x, d->special_bound);
- /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
- svfloat32_t t = expf_inline (ax, pg, &d->expf_consts);
- svfloat32_t half_t = svmul_x (pg, t, 0.5);
- svfloat32_t half_over_t = svdivr_x (pg, t, 0.5);
+ /* Calculate cosh by exp(x) / 2 + exp(-x) / 2.
+ Note that x is passed to exp here, rather than |x|. This is to avoid using
+ destructive unary ABS for better register usage. However it means the
+ routine is not exactly symmetrical, as the exp helper is slightly less
+ accurate in the negative range. */
+ svfloat32_t e = expf_inline (x, pg, &d->expf_consts);
+ svfloat32_t half_e = svmul_x (svptrue_b32 (), e, 0.5);
+ svfloat32_t half_over_e = svdivr_x (pg, e, 0.5);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+ return special_case (x, half_e, half_over_e, special);
- return svadd_x (pg, half_t, half_over_t);
+ return svadd_x (svptrue_b32 (), half_e, half_over_e);
}
diff --git a/sysdeps/aarch64/fpu/erf_advsimd.c b/sysdeps/aarch64/fpu/erf_advsimd.c
index 19cbb7d..a48092e 100644
--- a/sysdeps/aarch64/fpu/erf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erf_advsimd.c
@@ -22,19 +22,21 @@
static const struct data
{
float64x2_t third;
- float64x2_t tenth, two_over_five, two_over_fifteen;
- float64x2_t two_over_nine, two_over_fortyfive;
+ float64x2_t tenth, two_over_five, two_over_nine;
+ double two_over_fifteen, two_over_fortyfive;
float64x2_t max, shift;
+ uint64x2_t max_idx;
#if WANT_SIMD_EXCEPT
float64x2_t tiny_bound, huge_bound, scale_minus_one;
#endif
} data = {
+ .max_idx = V2 (768),
.third = V2 (0x1.5555555555556p-2), /* used to compute 2/3 and 1/6 too. */
- .two_over_fifteen = V2 (0x1.1111111111111p-3),
+ .two_over_fifteen = 0x1.1111111111111p-3,
.tenth = V2 (-0x1.999999999999ap-4),
.two_over_five = V2 (-0x1.999999999999ap-2),
.two_over_nine = V2 (-0x1.c71c71c71c71cp-3),
- .two_over_fortyfive = V2 (0x1.6c16c16c16c17p-5),
+ .two_over_fortyfive = 0x1.6c16c16c16c17p-5,
.max = V2 (5.9921875), /* 6 - 1/128. */
.shift = V2 (0x1p45),
#if WANT_SIMD_EXCEPT
@@ -56,8 +58,8 @@ static inline struct entry
lookup (uint64x2_t i)
{
struct entry e;
- float64x2_t e1 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
- e2 = vld1q_f64 (&__erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
+ float64x2_t e1 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 0)].erf),
+ e2 = vld1q_f64 (&__v_erf_data.tab[vgetq_lane_u64 (i, 1)].erf);
e.erf = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2);
return e;
@@ -87,8 +89,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t a = vabsq_f64 (x);
/* Reciprocal conditions that do not catch NaNs so they can be used in BSLs
to return expected results. */
- uint64x2_t a_le_max = vcleq_f64 (a, dat->max);
- uint64x2_t a_gt_max = vcgtq_f64 (a, dat->max);
+ uint64x2_t a_le_max = vcaleq_f64 (x, dat->max);
+ uint64x2_t a_gt_max = vcagtq_f64 (x, dat->max);
#if WANT_SIMD_EXCEPT
/* |x| huge or tiny. */
@@ -115,7 +117,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
segfault. */
uint64x2_t i
= vsubq_u64 (vreinterpretq_u64_f64 (z), vreinterpretq_u64_f64 (shift));
- i = vbslq_u64 (a_le_max, i, v_u64 (768));
+ i = vbslq_u64 (a_le_max, i, dat->max_idx);
struct entry e = lookup (i);
float64x2_t r = vsubq_f64 (z, shift);
@@ -125,14 +127,19 @@ float64x2_t VPCS_ATTR V_NAME_D1 (erf) (float64x2_t x)
float64x2_t d2 = vmulq_f64 (d, d);
float64x2_t r2 = vmulq_f64 (r, r);
+ float64x2_t two_over_fifteen_and_fortyfive
+ = vld1q_f64 (&dat->two_over_fifteen);
+
/* poly (d, r) = 1 + p1(r) * d + p2(r) * d^2 + ... + p5(r) * d^5. */
float64x2_t p1 = r;
float64x2_t p2
= vfmsq_f64 (dat->third, r2, vaddq_f64 (dat->third, dat->third));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->third));
- float64x2_t p4 = vfmaq_f64 (dat->two_over_five, r2, dat->two_over_fifteen);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->two_over_five, r2,
+ two_over_fifteen_and_fortyfive, 0);
p4 = vfmsq_f64 (dat->tenth, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->two_over_nine, r2, dat->two_over_fortyfive);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->two_over_nine, r2,
+ two_over_fifteen_and_fortyfive, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->third), r2, p5));
float64x2_t p34 = vfmaq_f64 (p3, d, p4);
diff --git a/sysdeps/aarch64/fpu/erf_data.c b/sysdeps/aarch64/fpu/erf_data.c
index 6d2dcd2..ea01fad 100644
--- a/sysdeps/aarch64/fpu/erf_data.c
+++ b/sysdeps/aarch64/fpu/erf_data.c
@@ -19,14 +19,14 @@
#include "vecmath_config.h"
-/* Lookup table used in erf.
+/* Lookup table used in vector erf.
For each possible rounded input r (multiples of 1/128), between
r = 0.0 and r = 6.0 (769 values):
- - the first entry __erff_data.tab.erf contains the values of erf(r),
- - the second entry __erff_data.tab.scale contains the values of
+ - the first entry __v_erf_data.tab.erf contains the values of erf(r),
+ - the second entry __v_erf_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
algorithm, since lookup is performed only for x >= 1/64-1/512. */
-const struct erf_data __erf_data = {
+const struct v_erf_data __v_erf_data = {
.tab = { { 0x0.0000000000000p+0, 0x1.20dd750429b6dp+0 },
{ 0x1.20dbf3deb1340p-7, 0x1.20d8f1975c85dp+0 },
{ 0x1.20d77083f17a0p-6, 0x1.20cb67bd452c7p+0 },
diff --git a/sysdeps/aarch64/fpu/erf_sve.c b/sysdeps/aarch64/fpu/erf_sve.c
index 7d51417..671d55a 100644
--- a/sysdeps/aarch64/fpu/erf_sve.c
+++ b/sysdeps/aarch64/fpu/erf_sve.c
@@ -67,14 +67,16 @@ svfloat64_t SV_NAME_D1 (erf) (svfloat64_t x, const svbool_t pg)
svfloat64_t a = svabs_x (pg, x);
svfloat64_t shift = sv_f64 (dat->shift);
svfloat64_t z = svadd_x (pg, a, shift);
- svuint64_t i
- = svsub_x (pg, svreinterpret_u64 (z), svreinterpret_u64 (shift));
+ svuint64_t i = svand_x (pg, svreinterpret_u64 (z), 0xfff);
+ i = svadd_x (pg, i, i);
/* Lookup without shortcut for small values but with predicate to avoid
segfault for large values and NaNs. */
svfloat64_t r = svsub_x (pg, z, shift);
- svfloat64_t erfr = svld1_gather_index (a_lt_max, __sv_erf_data.erf, i);
- svfloat64_t scale = svld1_gather_index (a_lt_max, __sv_erf_data.scale, i);
+ svfloat64_t erfr
+ = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].erf, i);
+ svfloat64_t scale
+ = svld1_gather_index (a_lt_max, &__v_erf_data.tab[0].scale, i);
/* erf(x) ~ erf(r) + scale * d * poly (r, d). */
svfloat64_t d = svsub_x (pg, a, r);
diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c
index f1b3bfe..d05eac6 100644
--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
@@ -24,8 +24,8 @@ static const struct data
{
uint64x2_t offset, table_scale;
float64x2_t max, shift;
- float64x2_t p20, p40, p41, p42;
- float64x2_t p51, p52;
+ float64x2_t p20, p40, p41, p51;
+ double p42, p52;
double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
#if WANT_SIMD_EXCEPT
float64x2_t uflow_bound;
@@ -41,9 +41,9 @@ static const struct data
.p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
.p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
.p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
- .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */
+ .p42 = 0x1.1111111111111p-3, /* 2/15. */
.p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
- .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */
+ .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
/* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
.qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
.qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
@@ -69,9 +69,9 @@ lookup (uint64x2_t i)
{
struct entry e;
float64x2_t e1
- = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
+ = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 0) - Off].erfc);
float64x2_t e2
- = vld1q_f64 (&__erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
+ = vld1q_f64 (&__v_erfc_data.tab[vgetq_lane_u64 (i, 1) - Off].erfc);
e.erfc = vuzp1q_f64 (e1, e2);
e.scale = vuzp2q_f64 (e1, e2);
return e;
@@ -157,9 +157,10 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
float64x2_t p1 = r;
float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
- float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
+ float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
p4 = vfmsq_f64 (dat->p40, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
/* Compute p_i using recurrence relation:
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */
diff --git a/sysdeps/aarch64/fpu/erfc_data.c b/sysdeps/aarch64/fpu/erfc_data.c
index 76a94e4..8dc6a8c 100644
--- a/sysdeps/aarch64/fpu/erfc_data.c
+++ b/sysdeps/aarch64/fpu/erfc_data.c
@@ -19,14 +19,14 @@
#include "vecmath_config.h"
-/* Lookup table used in erfc.
+/* Lookup table used in vector erfc.
For each possible rounded input r (multiples of 1/128), between
r = 0.0 and r = ~27.0 (3488 values):
- - the first entry __erfc_data.tab.erfc contains the values of erfc(r),
- - the second entry __erfc_data.tab.scale contains the values of
+ - the first entry __v_erfc_data.tab.erfc contains the values of erfc(r),
+ - the second entry __v_erfc_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
they are scaled by a large enough value 2^128 (fits in 8bit). */
-const struct erfc_data __erfc_data = {
+const struct v_erfc_data __v_erfc_data = {
.tab = { { 0x1p128, 0x1.20dd750429b6dp128 },
{ 0x1.fb7c9030853b3p127, 0x1.20d8f1975c85dp128 },
{ 0x1.f6f9447be0743p127, 0x1.20cb67bd452c7p128 },
diff --git a/sysdeps/aarch64/fpu/erfc_sve.c b/sysdeps/aarch64/fpu/erfc_sve.c
index c17d3e4..703926e 100644
--- a/sysdeps/aarch64/fpu/erfc_sve.c
+++ b/sysdeps/aarch64/fpu/erfc_sve.c
@@ -104,7 +104,7 @@ svfloat64_t SV_NAME_D1 (erfc) (svfloat64_t x, const svbool_t pg)
/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
i = svadd_x (pg, i, i);
- const float64_t *p = &__erfc_data.tab[0].erfc - 2 * dat->off_arr;
+ const float64_t *p = &__v_erfc_data.tab[0].erfc - 2 * dat->off_arr;
svfloat64_t erfcr = svld1_gather_index (pg, p, i);
svfloat64_t scale = svld1_gather_index (pg, p + 1, i);
diff --git a/sysdeps/aarch64/fpu/erfcf_advsimd.c b/sysdeps/aarch64/fpu/erfcf_advsimd.c
index ca5bc3a..59b0b0d 100644
--- a/sysdeps/aarch64/fpu/erfcf_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfcf_advsimd.c
@@ -62,13 +62,13 @@ lookup (uint32x4_t i)
{
struct entry e;
float32x2_t t0
- = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 0) - Off].erfc);
float32x2_t t1
- = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 1) - Off].erfc);
float32x2_t t2
- = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 2) - Off].erfc);
float32x2_t t3
- = vld1_f32 (&__erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
+ = vld1_f32 (&__v_erfcf_data.tab[vgetq_lane_u32 (i, 3) - Off].erfc);
float32x4_t e1 = vcombine_f32 (t0, t1);
float32x4_t e2 = vcombine_f32 (t2, t3);
e.erfc = vuzp1q_f32 (e1, e2);
diff --git a/sysdeps/aarch64/fpu/erfcf_data.c b/sysdeps/aarch64/fpu/erfcf_data.c
index 77fb889..d45087b 100644
--- a/sysdeps/aarch64/fpu/erfcf_data.c
+++ b/sysdeps/aarch64/fpu/erfcf_data.c
@@ -19,14 +19,14 @@
#include "vecmath_config.h"
-/* Lookup table used in erfcf.
+/* Lookup table used in vector erfcf.
For each possible rounded input r (multiples of 1/64), between
r = 0.0 and r = 10.0625 (645 values):
- - the first entry __erfcf_data.tab.erfc contains the values of erfc(r),
- - the second entry __erfcf_data.tab.scale contains the values of
+ - the first entry __v_erfcf_data.tab.erfc contains the values of erfc(r),
+ - the second entry __v_erfcf_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Both values may go into subnormal range, therefore
they are scaled by a large enough value 2^47 (fits in 8 bits). */
-const struct erfcf_data __erfcf_data = {
+const struct v_erfcf_data __v_erfcf_data = {
.tab = { { 0x1p47, 0x1.20dd76p47 },
{ 0x1.f6f944p46, 0x1.20cb68p47 },
{ 0x1.edf3aap46, 0x1.209546p47 },
diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c
index 48d1677..ecacb93 100644
--- a/sysdeps/aarch64/fpu/erfcf_sve.c
+++ b/sysdeps/aarch64/fpu/erfcf_sve.c
@@ -77,7 +77,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
/* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables. */
i = svmul_x (pg, i, 2);
- const float32_t *p = &__erfcf_data.tab[0].erfc - 2 * dat->off_arr;
+ const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
svfloat32_t erfcr = svld1_gather_index (pg, p, i);
svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
diff --git a/sysdeps/aarch64/fpu/erff_advsimd.c b/sysdeps/aarch64/fpu/erff_advsimd.c
index f2fe6ff..db39e78 100644
--- a/sysdeps/aarch64/fpu/erff_advsimd.c
+++ b/sysdeps/aarch64/fpu/erff_advsimd.c
@@ -47,10 +47,10 @@ static inline struct entry
lookup (uint32x4_t i)
{
struct entry e;
- float32x2_t t0 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
- float32x2_t t1 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
- float32x2_t t2 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
- float32x2_t t3 = vld1_f32 (&__erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
+ float32x2_t t0 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 0)].erf);
+ float32x2_t t1 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 1)].erf);
+ float32x2_t t2 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 2)].erf);
+ float32x2_t t3 = vld1_f32 (&__v_erff_data.tab[vgetq_lane_u32 (i, 3)].erf);
float32x4_t e1 = vcombine_f32 (t0, t1);
float32x4_t e2 = vcombine_f32 (t2, t3);
e.erf = vuzp1q_f32 (e1, e2);
diff --git a/sysdeps/aarch64/fpu/erff_data.c b/sysdeps/aarch64/fpu/erff_data.c
index 9a32940..da38aed 100644
--- a/sysdeps/aarch64/fpu/erff_data.c
+++ b/sysdeps/aarch64/fpu/erff_data.c
@@ -19,14 +19,14 @@
#include "vecmath_config.h"
-/* Lookup table used in erff.
+/* Lookup table used in vector erff.
For each possible rounded input r (multiples of 1/128), between
r = 0.0 and r = 4.0 (513 values):
- - the first entry __erff_data.tab.erf contains the values of erf(r),
- - the second entry __erff_data.tab.scale contains the values of
+ - the first entry __v_erff_data.tab.erf contains the values of erf(r),
+ - the second entry __v_erff_data.tab.scale contains the values of
2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
algorithm, since lookup is performed only for x >= 1/64-1/512. */
-const struct erff_data __erff_data = {
+const struct v_erff_data __v_erff_data = {
.tab = { { 0x0.000000p+0, 0x1.20dd76p+0 },
{ 0x1.20dbf4p-7, 0x1.20d8f2p+0 },
{ 0x1.20d770p-6, 0x1.20cb68p+0 },
diff --git a/sysdeps/aarch64/fpu/erff_sve.c b/sysdeps/aarch64/fpu/erff_sve.c
index 38f00db..0e382eb 100644
--- a/sysdeps/aarch64/fpu/erff_sve.c
+++ b/sysdeps/aarch64/fpu/erff_sve.c
@@ -62,18 +62,17 @@ svfloat32_t SV_NAME_F1 (erf) (svfloat32_t x, const svbool_t pg)
svfloat32_t shift = sv_f32 (dat->shift);
svfloat32_t z = svadd_x (pg, a, shift);
- svuint32_t i
- = svsub_x (pg, svreinterpret_u32 (z), svreinterpret_u32 (shift));
-
- /* Saturate lookup index. */
- i = svsel (a_ge_max, sv_u32 (512), i);
+ svuint32_t i = svand_x (pg, svreinterpret_u32 (z), 0xfff);
+ i = svadd_x (pg, i, i);
/* r and erf(r) set to 0 for |x| below min. */
svfloat32_t r = svsub_z (a_gt_min, z, shift);
- svfloat32_t erfr = svld1_gather_index (a_gt_min, __sv_erff_data.erf, i);
+ svfloat32_t erfr
+ = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].erf, i);
/* scale set to 2/sqrt(pi) for |x| below min. */
- svfloat32_t scale = svld1_gather_index (a_gt_min, __sv_erff_data.scale, i);
+ svfloat32_t scale
+ = svld1_gather_index (a_gt_min, &__v_erff_data.tab[0].scale, i);
scale = svsel (a_gt_min, scale, sv_f32 (dat->scale));
/* erf(x) ~ erf(r) + scale * d * (1 - r * d + 1/3 * d^2). */
diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c
index e09b2f3..8aa3fa9 100644
--- a/sysdeps/aarch64/fpu/exp10f_sve.c
+++ b/sysdeps/aarch64/fpu/exp10f_sve.c
@@ -18,74 +18,83 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
-#include "poly_sve_f32.h"
-/* For x < -SpecialBound, the result is subnormal and not handled correctly by
+/* For x < -Thres, the result is subnormal and not handled correctly by
FEXPA. */
-#define SpecialBound 37.9
+#define Thres 37.9
static const struct data
{
- float poly[5];
- float shift, log10_2, log2_10_hi, log2_10_lo, special_bound;
+ float log2_10_lo, c0, c2, c4;
+ float c1, c3, log10_2;
+ float shift, log2_10_hi, thres;
} data = {
/* Coefficients generated using Remez algorithm with minimisation of relative
error.
rel error: 0x1.89dafa3p-24
abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
maxerr: 0.52 +0.5 ulp. */
- .poly = { 0x1.26bb16p+1f, 0x1.5350d2p+1f, 0x1.04744ap+1f, 0x1.2d8176p+0f,
- 0x1.12b41ap-1f },
+ .c0 = 0x1.26bb16p+1f,
+ .c1 = 0x1.5350d2p+1f,
+ .c2 = 0x1.04744ap+1f,
+ .c3 = 0x1.2d8176p+0f,
+ .c4 = 0x1.12b41ap-1f,
/* 1.5*2^17 + 127, a shift value suitable for FEXPA. */
- .shift = 0x1.903f8p17f,
+ .shift = 0x1.803f8p17f,
.log10_2 = 0x1.a934fp+1,
.log2_10_hi = 0x1.344136p-2,
.log2_10_lo = -0x1.ec10cp-27,
- .special_bound = SpecialBound,
+ .thres = Thres,
};
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+static inline svfloat32_t
+sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
{
- return sv_call_f32 (exp10f, x, y, special);
-}
-
-/* Single-precision SVE exp10f routine. Implements the same algorithm
- as AdvSIMD exp10f.
- Worst case error is 1.02 ULPs.
- _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
- want 0x1.ba5f9cp-1. */
-svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
-{
- const struct data *d = ptr_barrier (&data);
/* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N]. */
- /* Load some constants in quad-word chunks to minimise memory access (last
- lane is wasted). */
- svfloat32_t log10_2_and_inv = svld1rq (svptrue_b32 (), &d->log10_2);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo);
/* n = round(x/(log10(2)/N)). */
svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svmla_lane (shift, x, log10_2_and_inv, 0);
- svfloat32_t n = svsub_x (pg, z, shift);
+ svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift);
+ svfloat32_t n = svsub_x (svptrue_b32 (), z, shift);
/* r = x - n*log10(2)/N. */
- svfloat32_t r = svmls_lane (x, n, log10_2_and_inv, 1);
- r = svmls_lane (r, n, log10_2_and_inv, 2);
+ svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x);
+ r = svmls_lane (r, n, lane_consts, 0);
- svbool_t special = svacgt (pg, x, d->special_bound);
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
/* Polynomial evaluation: poly(r) ~ exp10(r)-1. */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t poly
- = svmla_x (pg, svmul_x (pg, r, d->poly[0]),
- sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1), r2);
-
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (pg, scale, scale, poly), special);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
+ svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
+ svfloat32_t poly = svmla_x (pg, p0, r2, p14);
return svmla_x (pg, scale, scale, poly);
}
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+ return sv_call_f32 (exp10f, x, sv_exp10f_inline (x, svptrue_b32 (), d),
+ special);
+}
+
+/* Single-precision SVE exp10f routine. Implements the same algorithm
+ as AdvSIMD exp10f.
+ Worst case error is 1.02 ULPs.
+ _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
+ want 0x1.ba5f9cp-1. */
+svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacgt (pg, x, d->thres);
+ if (__glibc_unlikely (svptest_any (special, special)))
+ return special_case (x, special, d);
+ return sv_exp10f_inline (x, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
index 8a686e3..c6216be 100644
--- a/sysdeps/aarch64/fpu/exp2f_sve.c
+++ b/sysdeps/aarch64/fpu/exp2f_sve.c
@@ -24,54 +24,64 @@
static const struct data
{
- float poly[5];
+ float c0, c2, c4, c1, c3;
float shift, thres;
} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. */
- .poly = { 0x1.62e422p-1f, 0x1.ebf9bcp-3f, 0x1.c6bd32p-5f, 0x1.3ce9e4p-7f,
- 0x1.59977ap-10f },
+ /* Coefficients copied from the polynomial in AdvSIMD variant. */
+ .c0 = 0x1.62e422p-1f,
+ .c1 = 0x1.ebf9bcp-3f,
+ .c2 = 0x1.c6bd32p-5f,
+ .c3 = 0x1.3ce9e4p-7f,
+ .c4 = 0x1.59977ap-10f,
/* 1.5*2^17 + 127. */
- .shift = 0x1.903f8p17f,
+ .shift = 0x1.803f8p17f,
/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
correctly by FEXPA. */
.thres = Thres,
};
-static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
-{
- return sv_call_f32 (exp2f, x, y, special);
-}
-
-/* Single-precision SVE exp2f routine. Implements the same algorithm
- as AdvSIMD exp2f.
- Worst case error is 1.04 ULPs.
- SV_NAME_F1 (exp2)(0x1.943b9p-1) got 0x1.ba7eb2p+0
- want 0x1.ba7ebp+0. */
-svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+static inline svfloat32_t
+sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
{
- const struct data *d = ptr_barrier (&data);
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = n + r, with r in [-1/2, 1/2]. */
- svfloat32_t shift = sv_f32 (d->shift);
- svfloat32_t z = svadd_x (pg, x, shift);
- svfloat32_t n = svsub_x (pg, z, shift);
- svfloat32_t r = svsub_x (pg, x, n);
+ svfloat32_t z = svadd_x (svptrue_b32 (), x, d->shift);
+ svfloat32_t n = svsub_x (svptrue_b32 (), z, d->shift);
+ svfloat32_t r = svsub_x (svptrue_b32 (), x, n);
- svbool_t special = svacgt (pg, x, d->thres);
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
/* Polynomial evaluation: poly(r) ~ exp2(r)-1.
Evaluate polynomial using a hybrid scheme - offset ESTRIN by 1 for
coefficients 1 to 4, and apply most significant coefficient directly. */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t p14 = sv_pairwise_poly_3_f32_x (pg, r, r2, d->poly + 1);
- svfloat32_t p0 = svmul_x (pg, r, d->poly[0]);
+ svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2);
+ svfloat32_t p14 = svmla_x (pg, p12, r2, p34);
+ svfloat32_t p0 = svmul_lane (r, even_coeffs, 0);
svfloat32_t poly = svmla_x (pg, p0, r2, p14);
- if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (pg, scale, scale, poly), special);
-
return svmla_x (pg, scale, scale, poly);
}
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svbool_t special, const struct data *d)
+{
+ return sv_call_f32 (exp2f, x, sv_exp2f_inline (x, svptrue_b32 (), d),
+ special);
+}
+
+/* Single-precision SVE exp2f routine. Implements the same algorithm
+ as AdvSIMD exp2f.
+ Worst case error is 1.04 ULPs.
+ _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1
+ want 0x1.ba6a64p-1. */
+svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
+{
+ const struct data *d = ptr_barrier (&data);
+ svbool_t special = svacgt (pg, x, d->thres);
+ if (__glibc_unlikely (svptest_any (special, special)))
+ return special_case (x, special, d);
+ return sv_exp2f_inline (x, pg, d);
+}
diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
index 99d2e64..5c9cb72 100644
--- a/sysdeps/aarch64/fpu/expf_advsimd.c
+++ b/sysdeps/aarch64/fpu/expf_advsimd.c
@@ -22,7 +22,7 @@
static const struct data
{
float32x4_t poly[5];
- float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
+ float32x4_t inv_ln2, ln2_hi, ln2_lo;
uint32x4_t exponent_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
@@ -31,7 +31,6 @@ static const struct data
/* maxerr: 1.45358 +0.5 ulp. */
.poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
- .shift = V4 (0x1.8p23f),
.inv_ln2 = V4 (0x1.715476p+0f),
.ln2_hi = V4 (0x1.62e4p-1f),
.ln2_lo = V4 (0x1.7f7d1cp-20f),
@@ -85,7 +84,7 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly, z;
+ float32x4_t n, r, r2, scale, p, q, poly;
uint32x4_t cmp, e;
#if WANT_SIMD_EXCEPT
@@ -104,11 +103,10 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- z = vfmaq_f32 (d->shift, x, d->inv_ln2);
- n = vsubq_f32 (z, d->shift);
+ n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
r = vfmsq_f32 (x, n, d->ln2_hi);
r = vfmsq_f32 (r, n, d->ln2_lo);
- e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+ e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c
index 3ba79bc..da93e01 100644
--- a/sysdeps/aarch64/fpu/expf_sve.c
+++ b/sysdeps/aarch64/fpu/expf_sve.c
@@ -18,33 +18,25 @@
<https://www.gnu.org/licenses/>. */
#include "sv_math.h"
+#include "sv_expf_inline.h"
+
+/* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
+ correctly by FEXPA. */
+#define Thres 0x1.5d5e2ap+6f
static const struct data
{
- float poly[5];
- float inv_ln2, ln2_hi, ln2_lo, shift, thres;
+ struct sv_expf_data d;
+ float thres;
} data = {
- /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
- compatibility with polynomial helpers. */
- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f,
- 0x1.0e4020p-7f },
- .inv_ln2 = 0x1.715476p+0f,
- .ln2_hi = 0x1.62e4p-1f,
- .ln2_lo = 0x1.7f7d1cp-20f,
- /* 1.5*2^17 + 127. */
- .shift = 0x1.903f8p17f,
- /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
- correctly by FEXPA. */
- .thres = 0x1.5d5e2ap+6f,
+ .d = SV_EXPF_DATA,
+ .thres = Thres,
};
-#define C(i) sv_f32 (d->poly[i])
-#define ExponentBias 0x3f800000
-
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d)
{
- return sv_call_f32 (expf, x, y, special);
+ return sv_call_f32 (expf, x, expf_inline (x, svptrue_b32 (), d), special);
}
/* Optimised single-precision SVE exp function.
@@ -54,36 +46,8 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
-
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
-
- /* Load some constants in quad-word chunks to minimise memory access (last
- lane is wasted). */
- svfloat32_t invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->inv_ln2);
-
- /* n = round(x/(ln2/N)). */
- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, invln2_and_ln2, 0);
- svfloat32_t n = svsub_x (pg, z, d->shift);
-
- /* r = x - n*ln2/N. */
- svfloat32_t r = svmls_lane (x, n, invln2_and_ln2, 1);
- r = svmls_lane (r, n, invln2_and_ln2, 2);
-
- /* scale = 2^(n/N). */
svbool_t is_special_case = svacgt (pg, x, d->thres);
- svfloat32_t scale = svexpa (svreinterpret_u32 (z));
-
- /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
- svfloat32_t p34 = svmla_x (pg, C (3), C (4), r);
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_x (pg, r, C (0));
- svfloat32_t poly = svmla_x (pg, p0, r2, p14);
-
if (__glibc_unlikely (svptest_any (pg, is_special_case)))
- return special_case (x, svmla_x (pg, scale, scale, poly), is_special_case);
-
- return svmla_x (pg, scale, scale, poly);
+ return special_case (x, is_special_case, &d->d);
+ return expf_inline (x, pg, &d->d);
}
diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
index a0616ec..8303ca2 100644
--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
@@ -18,27 +18,18 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "v_expm1f_inline.h"
static const struct data
{
- float32x4_t poly[5];
- float invln2_and_ln2[4];
- float32x4_t shift;
- int32x4_t exponent_bias;
+ struct v_expm1f_data d;
#if WANT_SIMD_EXCEPT
uint32x4_t thresh;
#else
float32x4_t oflow_bound;
#endif
} data = {
- /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
- /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
- .shift = V4 (0x1.8p23f),
- .exponent_bias = V4 (0x3f800000),
+ .d = V_EXPM1F_DATA,
#if !WANT_SIMD_EXCEPT
/* Value above which expm1f(x) should overflow. Absolute value of the
underflow bound is greater than this, so it catches both cases - there is
@@ -55,67 +46,38 @@ static const struct data
#define TinyBound v_u32 (0x34000000 << 1)
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t special, const struct data *d)
{
- return v_call_f32 (expm1f, x, y, special);
+ return v_call_f32 (
+ expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
}
/* Single-precision vector exp(x) - 1 function.
- The maximum error is 1.51 ULP:
- _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
- want 0x1.e2fb94p-2. */
+ The maximum error is 1.62 ULP:
+ _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
+ want 0x1.da9f44p-2. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
#if WANT_SIMD_EXCEPT
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
shift-left by 1, and compare with thresh which was left-shifted offline -
this is effectively an absolute compare. */
uint32x4_t special
= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
- if (__glibc_unlikely (v_any_u32 (special)))
- x = v_zerofy_f32 (x, special);
#else
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
#endif
- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- float32x4_t j
- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
- int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float32x4_t p = v_horner_4_f32 (f, d->poly);
- p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
- float32x4_t t = vreinterpretq_f32_s32 (u);
-
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix),
- vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
- special);
+ return special_case (x, special, d);
/* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+ return expm1f_inline (x, &d->d);
}
libmvec_hidden_def (V_NAME_F1 (expm1))
HALF_WIDTH_ALIAS_F1 (expm1)
diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c
index 9347422..82228b5 100644
--- a/sysdeps/aarch64/fpu/log10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/log10f_advsimd.c
@@ -22,11 +22,11 @@
static const struct data
{
- uint32x4_t min_norm;
+ uint32x4_t off, offset_lower_bound;
uint16x8_t special_bound;
+ uint32x4_t mantissa_mask;
float32x4_t poly[8];
float32x4_t inv_ln10, ln2;
- uint32x4_t off, mantissa_mask;
} data = {
/* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
[-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
@@ -35,18 +35,22 @@ static const struct data
V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
.ln2 = V4 (0x1.62e43p-1f),
.inv_ln10 = V4 (0x1.bcb7b2p-2f),
- .min_norm = V4 (0x00800000),
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
.off = V4 (0x3f2aaaab), /* 0.666667. */
.mantissa_mask = V4 (0x007fffff),
};
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
- uint16x4_t cmp)
+special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
+ uint16x4_t cmp, const struct data *d)
{
/* Fall back to scalar code. */
- return v_call_f32 (log10f, x, vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
+ return v_call_f32 (log10f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (y, p, r2), vmovl_u16 (cmp));
}
/* Fast implementation of AdvSIMD log10f,
@@ -58,15 +62,21 @@ special_case (float32x4_t x, float32x4_t y, float32x4_t p, float32x4_t r2,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- uint32x4_t u = vreinterpretq_u32_f32 (x);
- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
- vget_low_u16 (d->special_bound));
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = vsubq_u32 (u, d->off);
+ u_off = vsubq_u32 (u_off, d->off);
float32x4_t n = vcvtq_f32_s32 (
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
/* y = log10(1+r) + n * log10(2). */
@@ -77,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
y = vmulq_f32 (y, d->inv_ln10);
if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (x, y, poly, r2, special);
+ return special_case (y, u_off, poly, r2, special, d);
return vfmaq_f32 (y, poly, r2);
}
libmvec_hidden_def (V_NAME_F1 (log10))
diff --git a/sysdeps/aarch64/fpu/log10f_sve.c b/sysdeps/aarch64/fpu/log10f_sve.c
index bdbb49c..7913679 100644
--- a/sysdeps/aarch64/fpu/log10f_sve.c
+++ b/sysdeps/aarch64/fpu/log10f_sve.c
@@ -24,6 +24,7 @@ static const struct data
float poly_0246[4];
float poly_1357[4];
float ln2, inv_ln10;
+ uint32_t off, lower;
} data = {
.poly_1357 = {
/* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -35,18 +36,23 @@ static const struct data
-0x1.0fc92cp-4f },
.ln2 = 0x1.62e43p-1f,
.inv_ln10 = 0x1.bcb7b2p-2f,
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};
-#define Min 0x00800000
-#define Max 0x7f800000
-#define Thres 0x7f000000 /* Max - Min. */
-#define Offset 0x3f2aaaab /* 0.666667. */
+#define Thres 0x7f000000 /* asuint32(inf) - 0x00800000. */
#define MantissaMask 0x007fffff
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (log10f, x, y, special);
+ return sv_call_f32 (
+ log10f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}
/* Optimised implementation of SVE log10f using the same algorithm and
@@ -57,23 +63,25 @@ special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t ix = svreinterpret_u32 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
+
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thres);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- ix = svsub_x (pg, ix, Offset);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend. */
- ix = svand_x (pg, ix, MantissaMask);
- ix = svadd_x (pg, ix, Offset);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* signextend. */
+ svuint32_t ix = svand_x (pg, u_off, MantissaMask);
+ ix = svadd_x (pg, ix, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f);
/* y = log10(1+r) + n*log10(2)
log10(1+r) ~ r * InvLn(10) + P(r)
where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). */
- svfloat32_t r2 = svmul_x (pg, r, r);
- svfloat32_t r4 = svmul_x (pg, r2, r2);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
+ svfloat32_t r4 = svmul_x (svptrue_b32 (), r2, r2);
svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0);
svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1);
@@ -88,7 +96,6 @@ svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
hi = svmul_x (pg, hi, d->inv_ln10);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
- special);
- return svmla_x (pg, hi, r2, y);
+ return special_case (u_off, hi, r2, y, special);
+ return svmla_x (svptrue_b32 (), hi, r2, y);
}
diff --git a/sysdeps/aarch64/fpu/log1p_advsimd.c b/sysdeps/aarch64/fpu/log1p_advsimd.c
index ffc418f..114064c 100644
--- a/sysdeps/aarch64/fpu/log1p_advsimd.c
+++ b/sysdeps/aarch64/fpu/log1p_advsimd.c
@@ -127,3 +127,5 @@ VPCS_ATTR float64x2_t V_NAME_D1 (log1p) (float64x2_t x)
return vfmaq_f64 (y, f2, p);
}
+
+strong_alias (V_NAME_D1 (log1p), V_NAME_D1 (logp1))
diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c
index 04f7e57..b21cfb2 100644
--- a/sysdeps/aarch64/fpu/log1p_sve.c
+++ b/sysdeps/aarch64/fpu/log1p_sve.c
@@ -116,3 +116,5 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg)
return y;
}
+
+strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1))
diff --git a/sysdeps/aarch64/fpu/log1pf_advsimd.c b/sysdeps/aarch64/fpu/log1pf_advsimd.c
index dc15334..00006fc 100644
--- a/sysdeps/aarch64/fpu/log1pf_advsimd.c
+++ b/sysdeps/aarch64/fpu/log1pf_advsimd.c
@@ -18,113 +18,81 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "v_log1pf_inline.h"
+
+#if WANT_SIMD_EXCEPT
const static struct data
{
- float32x4_t poly[8], ln2;
- uint32x4_t tiny_bound, minus_one, four, thresh;
- int32x4_t three_quarters;
+ uint32x4_t minus_one, thresh;
+ struct v_log1pf_data d;
} data = {
- .poly = { /* Generated using FPMinimax in [-0.25, 0.5]. First two coefficients
- (1, -0.5) are not stored as they can be generated more
- efficiently. */
- V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f),
- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f),
- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) },
- .ln2 = V4 (0x1.62e43p-1f),
- .tiny_bound = V4 (0x34000000), /* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
- .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - tiny_bound. */
+ .d = V_LOG1PF_CONSTANTS_TABLE,
+ .thresh = V4 (0x4b800000), /* asuint32(INFINITY) - TinyBound. */
.minus_one = V4 (0xbf800000),
- .four = V4 (0x40800000),
- .three_quarters = V4 (0x3f400000)
};
-static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *p)
-{
- /* Approximate log(1+m) on [-0.25, 0.5] using split Estrin scheme. */
- float32x4_t p_12 = vfmaq_f32 (v_f32 (-0.5), m, p[0]);
- float32x4_t p_34 = vfmaq_f32 (p[1], m, p[2]);
- float32x4_t p_56 = vfmaq_f32 (p[3], m, p[4]);
- float32x4_t p_78 = vfmaq_f32 (p[5], m, p[6]);
-
- float32x4_t m2 = vmulq_f32 (m, m);
- float32x4_t p_02 = vfmaq_f32 (m, m2, p_12);
- float32x4_t p_36 = vfmaq_f32 (p_34, m2, p_56);
- float32x4_t p_79 = vfmaq_f32 (p_78, m2, p[7]);
-
- float32x4_t m4 = vmulq_f32 (m2, m2);
- float32x4_t p_06 = vfmaq_f32 (p_02, m4, p_36);
- return vfmaq_f32 (p_06, m4, vmulq_f32 (m4, p_79));
-}
+/* asuint32(0x1p-23). ulp=0.5 at 0x1p-23. */
+# define TinyBound v_u32 (0x34000000)
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t cmp, const struct data *d)
{
- return v_call_f32 (log1pf, x, y, special);
+ /* Side-step special lanes so fenv exceptions are not triggered
+ inadvertently. */
+ float32x4_t x_nospecial = v_zerofy_f32 (x, cmp);
+ return v_call_f32 (log1pf, x, log1pf_inline (x_nospecial, &d->d), cmp);
}
-/* Vector log1pf approximation using polynomial on reduced interval. Accuracy
- is roughly 2.02 ULP:
- log1pf(0x1.21e13ap-2) got 0x1.fe8028p-3 want 0x1.fe802cp-3. */
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.69 ULP:
+ _ZGVnN4v_log1pf(0x1.04418ap-2) got 0x1.cfcbd8p-3
+ want 0x1.cfcbdcp-3. */
VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
+
uint32x4_t special_cases
- = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, d->tiny_bound), d->thresh),
+ = vorrq_u32 (vcgeq_u32 (vsubq_u32 (ia, TinyBound), d->thresh),
vcgeq_u32 (ix, d->minus_one));
- float32x4_t special_arg = x;
-#if WANT_SIMD_EXCEPT
if (__glibc_unlikely (v_any_u32 (special_cases)))
- /* Side-step special lanes so fenv exceptions are not triggered
- inadvertently. */
- x = v_zerofy_f32 (x, special_cases);
-#endif
+ return special_case (x, special_cases, d);
- /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
- is in [-0.25, 0.5]):
- log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
-
- We approximate log1p(m) with a polynomial, then scale by
- k*log(2). Instead of doing this directly, we use an intermediate
- scale factor s = 4*k*log(2) to ensure the scale is representable
- as a normalised fp32 number. */
+ return log1pf_inline (x, &d->d);
+}
- float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+#else
- /* Choose k to scale x to the range [-1/4, 1/2]. */
- int32x4_t k
- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
- v_s32 (0xff800000));
- uint32x4_t ku = vreinterpretq_u32_s32 (k);
+const static struct v_log1pf_data data = V_LOG1PF_CONSTANTS_TABLE;
- /* Scale x by exponent manipulation. */
- float32x4_t m_scale
- = vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
+static float32x4_t NOINLINE VPCS_ATTR
+special_case (float32x4_t x, uint32x4_t cmp)
+{
+ return v_call_f32 (log1pf, x, log1pf_inline (x, ptr_barrier (&data)), cmp);
+}
- /* Scale up to ensure that the scale factor is representable as normalised
- fp32 number, and scale m down accordingly. */
- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
- m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
+/* Vector log1pf approximation using polynomial on reduced interval. Worst-case
+ error is 1.63 ULP:
+ _ZGVnN4v_log1pf(0x1.216d12p-2) got 0x1.fdcb12p-3
+ want 0x1.fdcb16p-3. */
+VPCS_ATTR float32x4_t V_NAME_F1 (log1p) (float32x4_t x)
+{
+ uint32x4_t special_cases = vornq_u32 (vcleq_f32 (x, v_f32 (-1)),
+ vcaleq_f32 (x, v_f32 (0x1p127f)));
- /* Evaluate polynomial on the reduced interval. */
- float32x4_t p = eval_poly (m_scale, d->poly);
+ if (__glibc_unlikely (v_any_u32 (special_cases)))
+ return special_case (x, special_cases);
- /* The scale factor to be applied back at the end - by multiplying float(k)
- by 2^-23 we get the unbiased exponent of k. */
- float32x4_t scale_back = vcvtq_f32_s32 (vshrq_n_s32 (k, 23));
+ return log1pf_inline (x, ptr_barrier (&data));
+}
- /* Apply the scaling back. */
- float32x4_t y = vfmaq_f32 (p, scale_back, d->ln2);
+#endif
- if (__glibc_unlikely (v_any_u32 (special_cases)))
- return special_case (special_arg, y, special_cases);
- return y;
-}
libmvec_hidden_def (V_NAME_F1 (log1p))
HALF_WIDTH_ALIAS_F1 (log1p)
+strong_alias (V_NAME_F1 (log1p), V_NAME_F1 (logp1))
+libmvec_hidden_def (V_NAME_F1 (logp1))
+HALF_WIDTH_ALIAS_F1 (logp1)
diff --git a/sysdeps/aarch64/fpu/log1pf_sve.c b/sysdeps/aarch64/fpu/log1pf_sve.c
index f645cc9..5256d5e 100644
--- a/sysdeps/aarch64/fpu/log1pf_sve.c
+++ b/sysdeps/aarch64/fpu/log1pf_sve.c
@@ -98,3 +98,5 @@ svfloat32_t SV_NAME_F1 (log1p) (svfloat32_t x, svbool_t pg)
return y;
}
+
+strong_alias (SV_NAME_F1 (log1p), SV_NAME_F1 (logp1))
diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c
index db21836..84effe4 100644
--- a/sysdeps/aarch64/fpu/log2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/log2f_advsimd.c
@@ -22,9 +22,9 @@
static const struct data
{
- uint32x4_t min_norm;
+ uint32x4_t off, offset_lower_bound;
uint16x8_t special_bound;
- uint32x4_t off, mantissa_mask;
+ uint32x4_t mantissa_mask;
float32x4_t poly[9];
} data = {
/* Coefficients generated using Remez algorithm approximate
@@ -34,18 +34,22 @@ static const struct data
V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
- .min_norm = V4 (0x00800000),
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
.off = V4 (0x3f2aaaab), /* 0.666667. */
.mantissa_mask = V4 (0x007fffff),
};
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
- uint16x4_t cmp)
+special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r,
+ uint16x4_t cmp, const struct data *d)
{
/* Fall back to scalar code. */
- return v_call_f32 (log2f, x, vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
+ return v_call_f32 (log2f, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (n, p, r), vmovl_u16 (cmp));
}
/* Fast implementation for single precision AdvSIMD log2,
@@ -56,15 +60,21 @@ special_case (float32x4_t x, float32x4_t n, float32x4_t p, float32x4_t r,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- uint32x4_t u = vreinterpretq_u32_f32 (x);
- uint16x4_t special = vcge_u16 (vsubhn_u32 (u, d->min_norm),
- vget_low_u16 (d->special_bound));
+
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ uint32x4_t u_off = vreinterpretq_u32_f32 (x);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = vsubq_u32 (u, d->off);
+ u_off = vsubq_u32 (u_off, d->off);
float32x4_t n = vcvtq_f32_s32 (
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
- u = vaddq_u32 (vandq_u32 (u, d->mantissa_mask), d->off);
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+
+ uint16x4_t special = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
/* y = log2(1+r) + n. */
@@ -72,7 +82,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
if (__glibc_unlikely (v_any_u16h (special)))
- return special_case (x, n, p, r, special);
+ return special_case (n, u_off, p, r, special, d);
return vfmaq_f32 (n, p, r);
}
libmvec_hidden_def (V_NAME_F1 (log2))
diff --git a/sysdeps/aarch64/fpu/log2f_sve.c b/sysdeps/aarch64/fpu/log2f_sve.c
index 5031c42..939d89b 100644
--- a/sysdeps/aarch64/fpu/log2f_sve.c
+++ b/sysdeps/aarch64/fpu/log2f_sve.c
@@ -23,6 +23,7 @@ static const struct data
{
float poly_02468[5];
float poly_1357[4];
+ uint32_t off, lower;
} data = {
.poly_1357 = {
/* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
@@ -32,18 +33,23 @@ static const struct data
},
.poly_02468 = { 0x1.715476p0f, 0x1.ec701cp-2f, 0x1.27a0b8p-2f,
0x1.9d8ecap-3f, 0x1.9e495p-3f },
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};
-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thres (0x7f000000) /* Max - Min. */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
#define MantissaMask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667. */
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (log2f, x, y, cmp);
+ return sv_call_f32 (
+ log2f, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}
/* Optimised implementation of SVE log2f, using the same algorithm
@@ -55,19 +61,20 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t u = svreinterpret_u32 (x);
- svbool_t special = svcmpge (pg, svsub_x (pg, u, Min), Thres);
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t special = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_x (pg, u, Off);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
- u = svand_x (pg, u, MantissaMask);
- u = svadd_x (pg, u, Off);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+ svuint32_t u = svand_x (pg, u_off, MantissaMask);
+ u = svadd_x (pg, u, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
/* y = log2(1+r) + n. */
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
/* Evaluate polynomial using pairwise Horner scheme. */
svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
@@ -81,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log2) (svfloat32_t x, const svbool_t pg)
y = svmla_x (pg, q_01, r2, y);
if (__glibc_unlikely (svptest_any (pg, special)))
- return special_case (x, svmla_x (svnot_z (pg, special), n, r, y), special);
- return svmla_x (pg, n, r, y);
+ return special_case (u_off, n, r, y, special);
+ return svmla_x (svptrue_b32 (), n, r, y);
}
diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
index 3c0d0fc..c20dbfd 100644
--- a/sysdeps/aarch64/fpu/logf_advsimd.c
+++ b/sysdeps/aarch64/fpu/logf_advsimd.c
@@ -21,20 +21,22 @@
static const struct data
{
- uint32x4_t min_norm;
+ uint32x4_t off, offset_lower_bound;
uint16x8_t special_bound;
+ uint32x4_t mantissa_mask;
float32x4_t poly[7];
- float32x4_t ln2, tiny_bound;
- uint32x4_t off, mantissa_mask;
+ float32x4_t ln2;
} data = {
/* 3.34 ulp error. */
.poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
V4 (-0x1.ffffc8p-2f) },
.ln2 = V4 (0x1.62e43p-1f),
- .tiny_bound = V4 (0x1p-126),
- .min_norm = V4 (0x00800000),
- .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .offset_lower_bound = V4 (0x00800000 - 0x3f2aaaab),
+ .special_bound = V8 (0x7f00), /* top16(asuint32(inf) - 0x00800000). */
.off = V4 (0x3f2aaaab), /* 0.666667. */
.mantissa_mask = V4 (0x007fffff)
};
@@ -42,32 +44,37 @@ static const struct data
#define P(i) d->poly[7 - i]
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
- uint16x4_t cmp)
+special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
+ uint16x4_t cmp, const struct data *d)
{
/* Fall back to scalar code. */
- return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
+ return v_call_f32 (logf, vreinterpretq_f32_u32 (vaddq_u32 (u_off, d->off)),
+ vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
}
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, p, q, r, r2, y;
- uint32x4_t u;
+ uint32x4_t u, u_off;
uint16x4_t cmp;
- u = vreinterpretq_u32_f32 (x);
- cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
- vget_low_u16 (d->special_bound));
+ /* To avoid having to mov x out of the way, keep u after offset has been
+ applied, and recover x by adding the offset back in the special-case
+ handler. */
+ u_off = vreinterpretq_u32_f32 (x);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = vsubq_u32 (u, d->off);
+ u_off = vsubq_u32 (u_off, d->off);
n = vcvtq_f32_s32 (
- vshrq_n_s32 (vreinterpretq_s32_u32 (u), 23)); /* signextend. */
- u = vandq_u32 (u, d->mantissa_mask);
+ vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
+ u = vandq_u32 (u_off, d->mantissa_mask);
u = vaddq_u32 (u, d->off);
r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+ cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
+
/* y = log(1+r) + n*ln2. */
r2 = vmulq_f32 (r, r);
/* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
@@ -80,7 +87,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
p = vfmaq_f32 (r, d->ln2, n);
if (__glibc_unlikely (v_any_u16h (cmp)))
- return special_case (x, y, r2, p, cmp);
+ return special_case (p, u_off, y, r2, cmp, d);
return vfmaq_f32 (p, y, r2);
}
libmvec_hidden_def (V_NAME_F1 (log))
diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c
index d64e810..5b93246 100644
--- a/sysdeps/aarch64/fpu/logf_sve.c
+++ b/sysdeps/aarch64/fpu/logf_sve.c
@@ -24,6 +24,7 @@ static const struct data
float poly_0135[4];
float poly_246[3];
float ln2;
+ uint32_t off, lower;
} data = {
.poly_0135 = {
/* Coefficients copied from the AdvSIMD routine in math/, then rearranged so
@@ -32,19 +33,24 @@ static const struct data
-0x1.3e737cp-3f, 0x1.5a9aa2p-3f, 0x1.961348p-3f, 0x1.555d7cp-2f
},
.poly_246 = { -0x1.4f9934p-3f, -0x1.00187cp-2f, -0x1.ffffc8p-2f },
- .ln2 = 0x1.62e43p-1f
+ .ln2 = 0x1.62e43p-1f,
+ .off = 0x3f2aaaab,
+ /* Lower bound is the smallest positive normal float 0x00800000. For
+ optimised register use subnormals are detected after offset has been
+ subtracted, so lower bound is 0x00800000 - offset (which wraps around). */
+ .lower = 0x00800000 - 0x3f2aaaab
};
-#define Min (0x00800000)
-#define Max (0x7f800000)
-#define Thresh (0x7f000000) /* Max - Min. */
+#define Thresh (0x7f000000) /* asuint32(inf) - 0x00800000. */
#define Mask (0x007fffff)
-#define Off (0x3f2aaaab) /* 0.666667. */
static svfloat32_t NOINLINE
-special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+special_case (svuint32_t u_off, svfloat32_t p, svfloat32_t r2, svfloat32_t y,
+ svbool_t cmp)
{
- return sv_call_f32 (logf, x, y, cmp);
+ return sv_call_f32 (
+ logf, svreinterpret_f32 (svadd_x (svptrue_b32 (), u_off, data.off)),
+ svmla_x (svptrue_b32 (), p, r2, y), cmp);
}
/* Optimised implementation of SVE logf, using the same algorithm and
@@ -55,19 +61,21 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t u = svreinterpret_u32 (x);
- svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh);
+ svuint32_t u_off = svreinterpret_u32 (x);
+
+ u_off = svsub_x (pg, u_off, d->off);
+ svbool_t cmp = svcmpge (pg, svsub_x (pg, u_off, d->lower), Thresh);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_x (pg, u, Off);
svfloat32_t n = svcvt_f32_x (
- pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
- u = svand_x (pg, u, Mask);
- u = svadd_x (pg, u, Off);
+ pg, svasr_x (pg, svreinterpret_s32 (u_off), 23)); /* Sign-extend. */
+
+ svuint32_t u = svand_x (pg, u_off, Mask);
+ u = svadd_x (pg, u, d->off);
svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
/* y = log(1+r) + n*ln2. */
- svfloat32_t r2 = svmul_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
/* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). */
svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
@@ -80,6 +88,6 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
p = svmla_x (pg, r, n, d->ln2);
if (__glibc_unlikely (svptest_any (pg, cmp)))
- return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp);
+ return special_case (u_off, p, r2, y, cmp);
return svmla_x (pg, p, r2, y);
}
diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c
index a0d9d3b..718125c 100644
--- a/sysdeps/aarch64/fpu/sin_advsimd.c
+++ b/sysdeps/aarch64/fpu/sin_advsimd.c
@@ -22,7 +22,7 @@
static const struct data
{
float64x2_t poly[7];
- float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+ float64x2_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
.poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
@@ -34,12 +34,13 @@ static const struct data
.pi_1 = V2 (0x1.921fb54442d18p+1),
.pi_2 = V2 (0x1.1a62633145c06p-53),
.pi_3 = V2 (0x1.c1cd129024e09p-106),
- .shift = V2 (0x1.8p52),
};
#if WANT_SIMD_EXCEPT
-# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255). */
-# define Thresh v_u64 (0x1160000000000000) /* RangeVal - TinyBound. */
+/* asuint64(0x1p-253), below which multiply by inv_pi underflows. */
+# define TinyBound v_u64 (0x3020000000000000)
+/* RangeVal - TinyBound. */
+# define Thresh v_u64 (0x1160000000000000)
#endif
#define C(i) d->poly[i]
@@ -72,16 +73,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
fenv). These lanes will be fixed by special-case handler later. */
uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
- r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
+ r = vreinterpretq_f64_u64 (vbicq_u64 (vreinterpretq_u64_f64 (x), cmp));
#else
r = x;
cmp = vcageq_f64 (x, d->range_val);
#endif
/* n = rint(|x|/pi). */
- n = vfmaq_f64 (d->shift, d->inv_pi, r);
- odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
- n = vsubq_f64 (n, d->shift);
+ n = vrndaq_f64 (vmulq_f64 (r, d->inv_pi));
+ odd = vshlq_n_u64 (vreinterpretq_u64_s64 (vcvtq_s64_f64 (n)), 63);
/* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f64 (r, d->pi_1, n);
diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c
index 375dfc3..6ee9a23 100644
--- a/sysdeps/aarch64/fpu/sinf_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinf_advsimd.c
@@ -22,7 +22,7 @@
static const struct data
{
float32x4_t poly[4];
- float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+ float32x4_t range_val, inv_pi, pi_1, pi_2, pi_3;
} data = {
/* 1.886 ulp error. */
.poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
@@ -33,13 +33,14 @@ static const struct data
.pi_3 = V4 (-0x1.ee59dap-49f),
.inv_pi = V4 (0x1.45f306p-2f),
- .shift = V4 (0x1.8p+23f),
.range_val = V4 (0x1p20f)
};
#if WANT_SIMD_EXCEPT
-# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f). */
-# define Thresh v_u32 (0x28800000) /* RangeVal - TinyBound. */
+/* asuint32(0x1p-59f), below which multiply by inv_pi underflows. */
+# define TinyBound v_u32 (0x22000000)
+/* RangeVal - TinyBound. */
+# define Thresh v_u32 (0x27800000)
#endif
#define C(i) d->poly[i]
@@ -64,23 +65,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
/* If fenv exceptions are to be triggered correctly, set any special lanes
to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
special-case handler later. */
- r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
+ r = vreinterpretq_f32_u32 (vbicq_u32 (vreinterpretq_u32_f32 (x), cmp));
#else
r = x;
cmp = vcageq_f32 (x, d->range_val);
#endif
- /* n = rint(|x|/pi) */
- n = vfmaq_f32 (d->shift, d->inv_pi, r);
- odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
- n = vsubq_f32 (n, d->shift);
+ /* n = rint(|x|/pi). */
+ n = vrndaq_f32 (vmulq_f32 (r, d->inv_pi));
+ odd = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 31);
- /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2) */
+ /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
r = vfmsq_f32 (r, d->pi_1, n);
r = vfmsq_f32 (r, d->pi_2, n);
r = vfmsq_f32 (r, d->pi_3, n);
- /* y = sin(r) */
+ /* y = sin(r). */
r2 = vmulq_f32 (r, r);
y = vfmaq_f32 (C (2), C (3), r2);
y = vfmaq_f32 (C (1), y, r2);
diff --git a/sysdeps/aarch64/fpu/sinhf_advsimd.c b/sysdeps/aarch64/fpu/sinhf_advsimd.c
index 6bb7482..c6ed759 100644
--- a/sysdeps/aarch64/fpu/sinhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinhf_advsimd.c
@@ -23,15 +23,13 @@
static const struct data
{
struct v_expm1f_data expm1f_consts;
- uint32x4_t halff;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound, thresh;
#else
- uint32x4_t oflow_bound;
+ float32x4_t oflow_bound;
#endif
} data = {
.expm1f_consts = V_EXPM1F_DATA,
- .halff = V4 (0x3f000000),
#if WANT_SIMD_EXCEPT
/* 0x1.6a09e8p-32, below which expm1f underflows. */
.tiny_bound = V4 (0x2fb504f4),
@@ -39,14 +37,15 @@ static const struct data
.thresh = V4 (0x12fbbbb3),
#else
/* 0x1.61814ep+6, above which expm1f helper overflows. */
- .oflow_bound = V4 (0x42b0c0a7),
+ .oflow_bound = V4 (0x1.61814ep+6),
#endif
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
+ uint32x4_t special)
{
- return v_call_f32 (sinhf, x, y, special);
+ return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
}
/* Approximation for vector single-precision sinh(x) using expm1.
@@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
uint32x4_t ix = vreinterpretq_u32_f32 (x);
float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t sign = veorq_u32 (ix, iax);
- float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
+ float32x4_t halfsign = vreinterpretq_f32_u32 (
+ vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
#if WANT_SIMD_EXCEPT
- uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+ uint32x4_t special = vcgeq_u32 (
+ vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
ax = v_zerofy_f32 (ax, special);
#else
- uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
+ uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
#endif
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
@@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
/* Fall back to the scalar variant for any lanes that should trigger an
exception. */
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vmulq_f32 (t, halfsign), special);
+ return special_case (x, t, halfsign, special);
return vmulq_f32 (t, halfsign);
}
diff --git a/sysdeps/aarch64/fpu/sv_erf_data.c b/sysdeps/aarch64/fpu/sv_erf_data.c
deleted file mode 100644
index a53878f..0000000
--- a/sysdeps/aarch64/fpu/sv_erf_data.c
+++ /dev/null
@@ -1,1570 +0,0 @@
-/* Table for SVE erf approximation
-
- Copyright (C) 2024 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include "vecmath_config.h"
-
-/* Lookup table used in vector erf.
- For each possible rounded input r (multiples of 1/128), between
- r = 0.0 and r = 6.0 (769 values):
- - the first entry __erf_data.tab.erf contains the values of erf(r),
- - the second entry __erf_data.tab.scale contains the values of
- 2/sqrt(pi)*exp(-r^2). Note that indices 0 and 1 are never hit by the
- algorithm, since lookup is performed only for x >= 1/64-1/512. */
-const struct sv_erf_data __sv_erf_data = {
- .erf = { 0x0.0000000000000p+0,
- 0x1.20dbf3deb1340p-7,
- 0x1.20d77083f17a0p-6,
- 0x1.b137e0cf584dcp-6,
- 0x1.20c5645dd2538p-5,
- 0x1.68e5d3bbc9526p-5,
- 0x1.b0fafef135745p-5,
- 0x1.f902a77bd3821p-5,
- 0x1.207d480e90658p-4,
- 0x1.44703e87e8593p-4,
- 0x1.68591a1e83b5dp-4,
- 0x1.8c36beb8a8d23p-4,
- 0x1.b0081148a873ap-4,
- 0x1.d3cbf7e70a4b3p-4,
- 0x1.f78159ec8bb50p-4,
- 0x1.0d939005f65e5p-3,
- 0x1.1f5e1a35c3b89p-3,
- 0x1.311fc15f56d14p-3,
- 0x1.42d7fc2f64959p-3,
- 0x1.548642321d7c6p-3,
- 0x1.662a0bdf7a89fp-3,
- 0x1.77c2d2a765f9ep-3,
- 0x1.895010fdbdbfdp-3,
- 0x1.9ad142662e14dp-3,
- 0x1.ac45e37fe2526p-3,
- 0x1.bdad72110a648p-3,
- 0x1.cf076d1233237p-3,
- 0x1.e05354b96ff36p-3,
- 0x1.f190aa85540e2p-3,
- 0x1.015f78a3dcf3dp-2,
- 0x1.09eed6982b948p-2,
- 0x1.127631eb8de32p-2,
- 0x1.1af54e232d609p-2,
- 0x1.236bef825d9a2p-2,
- 0x1.2bd9db0f7827fp-2,
- 0x1.343ed6989b7d9p-2,
- 0x1.3c9aa8b84bedap-2,
- 0x1.44ed18d9f6462p-2,
- 0x1.4d35ef3e5372ep-2,
- 0x1.5574f4ffac98ep-2,
- 0x1.5da9f415ff23fp-2,
- 0x1.65d4b75b00471p-2,
- 0x1.6df50a8dff772p-2,
- 0x1.760aba57a76bfp-2,
- 0x1.7e15944d9d3e4p-2,
- 0x1.861566f5fd3c0p-2,
- 0x1.8e0a01cab516bp-2,
- 0x1.95f3353cbb146p-2,
- 0x1.9dd0d2b721f39p-2,
- 0x1.a5a2aca209394p-2,
- 0x1.ad68966569a87p-2,
- 0x1.b522646bbda68p-2,
- 0x1.bccfec24855b8p-2,
- 0x1.c4710406a65fcp-2,
- 0x1.cc058392a6d2dp-2,
- 0x1.d38d4354c3bd0p-2,
- 0x1.db081ce6e2a48p-2,
- 0x1.e275eaf25e458p-2,
- 0x1.e9d68931ae650p-2,
- 0x1.f129d471eabb1p-2,
- 0x1.f86faa9428f9dp-2,
- 0x1.ffa7ea8eb5fd0p-2,
- 0x1.03693a371519cp-1,
- 0x1.06f794ab2cae7p-1,
- 0x1.0a7ef5c18edd2p-1,
- 0x1.0dff4f247f6c6p-1,
- 0x1.1178930ada115p-1,
- 0x1.14eab43841b55p-1,
- 0x1.1855a5fd3dd50p-1,
- 0x1.1bb95c3746199p-1,
- 0x1.1f15cb50bc4dep-1,
- 0x1.226ae840d4d70p-1,
- 0x1.25b8a88b6dd7fp-1,
- 0x1.28ff0240d52cdp-1,
- 0x1.2c3debfd7d6c1p-1,
- 0x1.2f755ce9a21f4p-1,
- 0x1.32a54cb8db67bp-1,
- 0x1.35cdb3a9a144dp-1,
- 0x1.38ee8a84beb71p-1,
- 0x1.3c07ca9cb4f9ep-1,
- 0x1.3f196dcd0f135p-1,
- 0x1.42236e79a5fa6p-1,
- 0x1.4525c78dd5966p-1,
- 0x1.4820747ba2dc2p-1,
- 0x1.4b13713ad3513p-1,
- 0x1.4dfeba47f63ccp-1,
- 0x1.50e24ca35fd2cp-1,
- 0x1.53be25d016a4fp-1,
- 0x1.569243d2b3a9bp-1,
- 0x1.595ea53035283p-1,
- 0x1.5c2348ecc4dc3p-1,
- 0x1.5ee02e8a71a53p-1,
- 0x1.61955607dd15dp-1,
- 0x1.6442bfdedd397p-1,
- 0x1.66e86d0312e82p-1,
- 0x1.69865ee075011p-1,
- 0x1.6c1c9759d0e5fp-1,
- 0x1.6eab18c74091bp-1,
- 0x1.7131e5f496a5ap-1,
- 0x1.73b1021fc0cb8p-1,
- 0x1.762870f720c6fp-1,
- 0x1.78983697dc96fp-1,
- 0x1.7b00578c26037p-1,
- 0x1.7d60d8c979f7bp-1,
- 0x1.7fb9bfaed8078p-1,
- 0x1.820b1202f27fbp-1,
- 0x1.8454d5f25760dp-1,
- 0x1.8697120d92a4ap-1,
- 0x1.88d1cd474a2e0p-1,
- 0x1.8b050ef253c37p-1,
- 0x1.8d30debfc572ep-1,
- 0x1.8f5544bd00c04p-1,
- 0x1.91724951b8fc6p-1,
- 0x1.9387f53df5238p-1,
- 0x1.959651980da31p-1,
- 0x1.979d67caa6631p-1,
- 0x1.999d4192a5715p-1,
- 0x1.9b95e8fd26abap-1,
- 0x1.9d8768656cc42p-1,
- 0x1.9f71ca72cffb6p-1,
- 0x1.a1551a16aaeafp-1,
- 0x1.a331628a45b92p-1,
- 0x1.a506af4cc00f4p-1,
- 0x1.a6d50c20fa293p-1,
- 0x1.a89c850b7d54dp-1,
- 0x1.aa5d265064366p-1,
- 0x1.ac16fc7143263p-1,
- 0x1.adca142b10f98p-1,
- 0x1.af767a741088bp-1,
- 0x1.b11c3c79bb424p-1,
- 0x1.b2bb679ead19cp-1,
- 0x1.b4540978921eep-1,
- 0x1.b5e62fce16095p-1,
- 0x1.b771e894d602ep-1,
- 0x1.b8f741ef54f83p-1,
- 0x1.ba764a2af2b78p-1,
- 0x1.bbef0fbde6221p-1,
- 0x1.bd61a1453ab44p-1,
- 0x1.bece0d82d1a5cp-1,
- 0x1.c034635b66e23p-1,
- 0x1.c194b1d49a184p-1,
- 0x1.c2ef0812fc1bdp-1,
- 0x1.c443755820d64p-1,
- 0x1.c5920900b5fd1p-1,
- 0x1.c6dad2829ec62p-1,
- 0x1.c81de16b14cefp-1,
- 0x1.c95b455cce69dp-1,
- 0x1.ca930e0e2a825p-1,
- 0x1.cbc54b476248dp-1,
- 0x1.ccf20ce0c0d27p-1,
- 0x1.ce1962c0e0d8bp-1,
- 0x1.cf3b5cdaf0c39p-1,
- 0x1.d0580b2cfd249p-1,
- 0x1.d16f7dbe41ca0p-1,
- 0x1.d281c49d818d0p-1,
- 0x1.d38eefdf64fddp-1,
- 0x1.d4970f9ce00d9p-1,
- 0x1.d59a33f19ed42p-1,
- 0x1.d6986cfa798e7p-1,
- 0x1.d791cad3eff01p-1,
- 0x1.d8865d98abe01p-1,
- 0x1.d97635600bb89p-1,
- 0x1.da61623cb41e0p-1,
- 0x1.db47f43b2980dp-1,
- 0x1.dc29fb60715afp-1,
- 0x1.dd0787a8bb39dp-1,
- 0x1.dde0a90611a0dp-1,
- 0x1.deb56f5f12d28p-1,
- 0x1.df85ea8db188ep-1,
- 0x1.e0522a5dfda73p-1,
- 0x1.e11a3e8cf4eb8p-1,
- 0x1.e1de36c75ba58p-1,
- 0x1.e29e22a89d766p-1,
- 0x1.e35a11b9b61cep-1,
- 0x1.e4121370224ccp-1,
- 0x1.e4c6372cd8927p-1,
- 0x1.e5768c3b4a3fcp-1,
- 0x1.e62321d06c5e0p-1,
- 0x1.e6cc0709c8a0dp-1,
- 0x1.e7714aec96534p-1,
- 0x1.e812fc64db369p-1,
- 0x1.e8b12a44944a8p-1,
- 0x1.e94be342e6743p-1,
- 0x1.e9e335fb56f87p-1,
- 0x1.ea7730ed0bbb9p-1,
- 0x1.eb07e27a133aap-1,
- 0x1.eb9558e6b42cep-1,
- 0x1.ec1fa258c4beap-1,
- 0x1.eca6ccd709544p-1,
- 0x1.ed2ae6489ac1ep-1,
- 0x1.edabfc7453e63p-1,
- 0x1.ee2a1d004692cp-1,
- 0x1.eea5557137ae0p-1,
- 0x1.ef1db32a2277cp-1,
- 0x1.ef93436bc2daap-1,
- 0x1.f006135426b26p-1,
- 0x1.f0762fde45ee6p-1,
- 0x1.f0e3a5e1a1788p-1,
- 0x1.f14e8211e8c55p-1,
- 0x1.f1b6d0fea5f4dp-1,
- 0x1.f21c9f12f0677p-1,
- 0x1.f27ff89525acfp-1,
- 0x1.f2e0e9a6a8b09p-1,
- 0x1.f33f7e43a706bp-1,
- 0x1.f39bc242e43e6p-1,
- 0x1.f3f5c1558b19ep-1,
- 0x1.f44d870704911p-1,
- 0x1.f4a31ebcd47dfp-1,
- 0x1.f4f693b67bd77p-1,
- 0x1.f547f10d60597p-1,
- 0x1.f59741b4b97cfp-1,
- 0x1.f5e4907982a07p-1,
- 0x1.f62fe80272419p-1,
- 0x1.f67952cff6282p-1,
- 0x1.f6c0db3c34641p-1,
- 0x1.f7068b7b10fd9p-1,
- 0x1.f74a6d9a38383p-1,
- 0x1.f78c8b812d498p-1,
- 0x1.f7cceef15d631p-1,
- 0x1.f80ba18636f07p-1,
- 0x1.f848acb544e95p-1,
- 0x1.f88419ce4e184p-1,
- 0x1.f8bdf1fb78370p-1,
- 0x1.f8f63e416ebffp-1,
- 0x1.f92d077f8d56dp-1,
- 0x1.f96256700da8ep-1,
- 0x1.f99633a838a57p-1,
- 0x1.f9c8a7989af0dp-1,
- 0x1.f9f9ba8d3c733p-1,
- 0x1.fa2974addae45p-1,
- 0x1.fa57ddfe27376p-1,
- 0x1.fa84fe5e05c8dp-1,
- 0x1.fab0dd89d1309p-1,
- 0x1.fadb831a9f9c3p-1,
- 0x1.fb04f6868a944p-1,
- 0x1.fb2d3f20f9101p-1,
- 0x1.fb54641aebbc9p-1,
- 0x1.fb7a6c834b5a2p-1,
- 0x1.fb9f5f4739170p-1,
- 0x1.fbc3433260ca5p-1,
- 0x1.fbe61eef4cf6ap-1,
- 0x1.fc07f907bc794p-1,
- 0x1.fc28d7e4f9cd0p-1,
- 0x1.fc48c1d033c7ap-1,
- 0x1.fc67bcf2d7b8fp-1,
- 0x1.fc85cf56ecd38p-1,
- 0x1.fca2fee770c79p-1,
- 0x1.fcbf5170b578bp-1,
- 0x1.fcdacca0bfb73p-1,
- 0x1.fcf57607a6e7cp-1,
- 0x1.fd0f5317f582fp-1,
- 0x1.fd2869270a56fp-1,
- 0x1.fd40bd6d7a785p-1,
- 0x1.fd58550773cb5p-1,
- 0x1.fd6f34f52013ap-1,
- 0x1.fd85621b0876dp-1,
- 0x1.fd9ae142795e3p-1,
- 0x1.fdafb719e6a69p-1,
- 0x1.fdc3e835500b3p-1,
- 0x1.fdd7790ea5bc0p-1,
- 0x1.fdea6e062d0c9p-1,
- 0x1.fdfccb62e52d3p-1,
- 0x1.fe0e9552ebdd6p-1,
- 0x1.fe1fcfebe2083p-1,
- 0x1.fe307f2b503d0p-1,
- 0x1.fe40a6f70af4bp-1,
- 0x1.fe504b1d9696cp-1,
- 0x1.fe5f6f568b301p-1,
- 0x1.fe6e1742f7cf6p-1,
- 0x1.fe7c466dc57a1p-1,
- 0x1.fe8a004c19ae6p-1,
- 0x1.fe97483db8670p-1,
- 0x1.fea4218d6594ap-1,
- 0x1.feb08f7146046p-1,
- 0x1.febc950b3fa75p-1,
- 0x1.fec835695932ep-1,
- 0x1.fed37386190fbp-1,
- 0x1.fede5248e38f4p-1,
- 0x1.fee8d486585eep-1,
- 0x1.fef2fd00af31ap-1,
- 0x1.fefcce6813974p-1,
- 0x1.ff064b5afffbep-1,
- 0x1.ff0f766697c76p-1,
- 0x1.ff18520700971p-1,
- 0x1.ff20e0a7ba8c2p-1,
- 0x1.ff2924a3f7a83p-1,
- 0x1.ff312046f2339p-1,
- 0x1.ff38d5cc4227fp-1,
- 0x1.ff404760319b4p-1,
- 0x1.ff47772010262p-1,
- 0x1.ff4e671a85425p-1,
- 0x1.ff55194fe19dfp-1,
- 0x1.ff5b8fb26f5f6p-1,
- 0x1.ff61cc26c1578p-1,
- 0x1.ff67d08401202p-1,
- 0x1.ff6d9e943c231p-1,
- 0x1.ff733814af88cp-1,
- 0x1.ff789eb6130c9p-1,
- 0x1.ff7dd41ce2b4dp-1,
- 0x1.ff82d9e1a76d8p-1,
- 0x1.ff87b1913e853p-1,
- 0x1.ff8c5cad200a5p-1,
- 0x1.ff90dcaba4096p-1,
- 0x1.ff9532f846ab0p-1,
- 0x1.ff9960f3eb327p-1,
- 0x1.ff9d67f51ddbap-1,
- 0x1.ffa14948549a7p-1,
- 0x1.ffa506302ebaep-1,
- 0x1.ffa89fe5b3625p-1,
- 0x1.ffac17988ef4bp-1,
- 0x1.ffaf6e6f4f5c0p-1,
- 0x1.ffb2a5879f35ep-1,
- 0x1.ffb5bdf67fe6fp-1,
- 0x1.ffb8b8c88295fp-1,
- 0x1.ffbb970200110p-1,
- 0x1.ffbe599f4f9d9p-1,
- 0x1.ffc10194fcb64p-1,
- 0x1.ffc38fcffbb7cp-1,
- 0x1.ffc60535dd7f5p-1,
- 0x1.ffc862a501fd7p-1,
- 0x1.ffcaa8f4c9beap-1,
- 0x1.ffccd8f5c66d1p-1,
- 0x1.ffcef371ea4d7p-1,
- 0x1.ffd0f92cb6ba7p-1,
- 0x1.ffd2eae369a07p-1,
- 0x1.ffd4c94d29fdbp-1,
- 0x1.ffd6951b33686p-1,
- 0x1.ffd84ef9009eep-1,
- 0x1.ffd9f78c7524ap-1,
- 0x1.ffdb8f7605ee7p-1,
- 0x1.ffdd1750e1220p-1,
- 0x1.ffde8fb314ebfp-1,
- 0x1.ffdff92db56e5p-1,
- 0x1.ffe1544d01ccbp-1,
- 0x1.ffe2a1988857cp-1,
- 0x1.ffe3e19349dc7p-1,
- 0x1.ffe514bbdc197p-1,
- 0x1.ffe63b8c8b5f7p-1,
- 0x1.ffe7567b7b5e1p-1,
- 0x1.ffe865fac722bp-1,
- 0x1.ffe96a78a04a9p-1,
- 0x1.ffea645f6d6dap-1,
- 0x1.ffeb5415e7c44p-1,
- 0x1.ffec39ff380b9p-1,
- 0x1.ffed167b12ac2p-1,
- 0x1.ffede9e5d3262p-1,
- 0x1.ffeeb49896c6dp-1,
- 0x1.ffef76e956a9fp-1,
- 0x1.fff0312b010b5p-1,
- 0x1.fff0e3ad91ec2p-1,
- 0x1.fff18ebe2b0e1p-1,
- 0x1.fff232a72b48ep-1,
- 0x1.fff2cfb0453d9p-1,
- 0x1.fff3661e9569dp-1,
- 0x1.fff3f634b79f9p-1,
- 0x1.fff48032dbe40p-1,
- 0x1.fff50456dab8cp-1,
- 0x1.fff582dc48d30p-1,
- 0x1.fff5fbfc8a439p-1,
- 0x1.fff66feee5129p-1,
- 0x1.fff6dee89352ep-1,
- 0x1.fff7491cd4af6p-1,
- 0x1.fff7aebcff755p-1,
- 0x1.fff80ff8911fdp-1,
- 0x1.fff86cfd3e657p-1,
- 0x1.fff8c5f702ccfp-1,
- 0x1.fff91b102fca8p-1,
- 0x1.fff96c717b695p-1,
- 0x1.fff9ba420e834p-1,
- 0x1.fffa04a7928b1p-1,
- 0x1.fffa4bc63ee9ap-1,
- 0x1.fffa8fc0e5f33p-1,
- 0x1.fffad0b901755p-1,
- 0x1.fffb0ecebee1bp-1,
- 0x1.fffb4a210b172p-1,
- 0x1.fffb82cd9dcbfp-1,
- 0x1.fffbb8f1049c6p-1,
- 0x1.fffbeca6adbe9p-1,
- 0x1.fffc1e08f25f5p-1,
- 0x1.fffc4d3120aa1p-1,
- 0x1.fffc7a37857d2p-1,
- 0x1.fffca53375ce3p-1,
- 0x1.fffcce3b57bffp-1,
- 0x1.fffcf564ab6b7p-1,
- 0x1.fffd1ac4135f9p-1,
- 0x1.fffd3e6d5cd87p-1,
- 0x1.fffd607387b07p-1,
- 0x1.fffd80e8ce0dap-1,
- 0x1.fffd9fdeabccep-1,
- 0x1.fffdbd65e5ad0p-1,
- 0x1.fffdd98e903b2p-1,
- 0x1.fffdf46816833p-1,
- 0x1.fffe0e0140857p-1,
- 0x1.fffe26683972ap-1,
- 0x1.fffe3daa95b18p-1,
- 0x1.fffe53d558ae9p-1,
- 0x1.fffe68f4fa777p-1,
- 0x1.fffe7d156d244p-1,
- 0x1.fffe904222101p-1,
- 0x1.fffea2860ee1ep-1,
- 0x1.fffeb3ebb267bp-1,
- 0x1.fffec47d19457p-1,
- 0x1.fffed443e2787p-1,
- 0x1.fffee34943b15p-1,
- 0x1.fffef1960d85dp-1,
- 0x1.fffeff32af7afp-1,
- 0x1.ffff0c273bea2p-1,
- 0x1.ffff187b6bc0ep-1,
- 0x1.ffff2436a21dcp-1,
- 0x1.ffff2f5fefcaap-1,
- 0x1.ffff39fe16963p-1,
- 0x1.ffff44178c8d2p-1,
- 0x1.ffff4db27f146p-1,
- 0x1.ffff56d4d5e5ep-1,
- 0x1.ffff5f8435efcp-1,
- 0x1.ffff67c604180p-1,
- 0x1.ffff6f9f67e55p-1,
- 0x1.ffff77154e0d6p-1,
- 0x1.ffff7e2c6aea2p-1,
- 0x1.ffff84e93cd75p-1,
- 0x1.ffff8b500e77cp-1,
- 0x1.ffff9164f8e46p-1,
- 0x1.ffff972be5c59p-1,
- 0x1.ffff9ca891572p-1,
- 0x1.ffffa1de8c582p-1,
- 0x1.ffffa6d13de73p-1,
- 0x1.ffffab83e54b8p-1,
- 0x1.ffffaff99bac4p-1,
- 0x1.ffffb43555b5fp-1,
- 0x1.ffffb839e52f3p-1,
- 0x1.ffffbc09fa7cdp-1,
- 0x1.ffffbfa82616bp-1,
- 0x1.ffffc316d9ed0p-1,
- 0x1.ffffc6586abf6p-1,
- 0x1.ffffc96f1165ep-1,
- 0x1.ffffcc5cec0c1p-1,
- 0x1.ffffcf23ff5fcp-1,
- 0x1.ffffd1c637b2bp-1,
- 0x1.ffffd4456a10dp-1,
- 0x1.ffffd6a3554a1p-1,
- 0x1.ffffd8e1a2f22p-1,
- 0x1.ffffdb01e8546p-1,
- 0x1.ffffdd05a75eap-1,
- 0x1.ffffdeee4f810p-1,
- 0x1.ffffe0bd3e852p-1,
- 0x1.ffffe273c15b7p-1,
- 0x1.ffffe41314e06p-1,
- 0x1.ffffe59c6698bp-1,
- 0x1.ffffe710d565ep-1,
- 0x1.ffffe8717232dp-1,
- 0x1.ffffe9bf4098cp-1,
- 0x1.ffffeafb377d5p-1,
- 0x1.ffffec2641a9ep-1,
- 0x1.ffffed413e5b7p-1,
- 0x1.ffffee4d01cd6p-1,
- 0x1.ffffef4a55bd4p-1,
- 0x1.fffff039f9e8fp-1,
- 0x1.fffff11ca4876p-1,
- 0x1.fffff1f302bc1p-1,
- 0x1.fffff2bdb904dp-1,
- 0x1.fffff37d63a36p-1,
- 0x1.fffff43297019p-1,
- 0x1.fffff4dde0118p-1,
- 0x1.fffff57fc4a95p-1,
- 0x1.fffff618c3da6p-1,
- 0x1.fffff6a956450p-1,
- 0x1.fffff731ee681p-1,
- 0x1.fffff7b2f8ed6p-1,
- 0x1.fffff82cdcf1bp-1,
- 0x1.fffff89ffc4aap-1,
- 0x1.fffff90cb3c81p-1,
- 0x1.fffff9735b73bp-1,
- 0x1.fffff9d446cccp-1,
- 0x1.fffffa2fc5015p-1,
- 0x1.fffffa8621251p-1,
- 0x1.fffffad7a2652p-1,
- 0x1.fffffb248c39dp-1,
- 0x1.fffffb6d1e95dp-1,
- 0x1.fffffbb196132p-1,
- 0x1.fffffbf22c1e2p-1,
- 0x1.fffffc2f171e3p-1,
- 0x1.fffffc688a9cfp-1,
- 0x1.fffffc9eb76acp-1,
- 0x1.fffffcd1cbc28p-1,
- 0x1.fffffd01f36afp-1,
- 0x1.fffffd2f57d68p-1,
- 0x1.fffffd5a2041fp-1,
- 0x1.fffffd8271d12p-1,
- 0x1.fffffda86faa9p-1,
- 0x1.fffffdcc3b117p-1,
- 0x1.fffffdedf37edp-1,
- 0x1.fffffe0db6b91p-1,
- 0x1.fffffe2ba0ea5p-1,
- 0x1.fffffe47ccb60p-1,
- 0x1.fffffe62534d4p-1,
- 0x1.fffffe7b4c81ep-1,
- 0x1.fffffe92ced93p-1,
- 0x1.fffffea8ef9cfp-1,
- 0x1.fffffebdc2ec6p-1,
- 0x1.fffffed15bcbap-1,
- 0x1.fffffee3cc32cp-1,
- 0x1.fffffef5251c2p-1,
- 0x1.ffffff0576917p-1,
- 0x1.ffffff14cfb92p-1,
- 0x1.ffffff233ee1dp-1,
- 0x1.ffffff30d18e8p-1,
- 0x1.ffffff3d9480fp-1,
- 0x1.ffffff4993c46p-1,
- 0x1.ffffff54dab72p-1,
- 0x1.ffffff5f74141p-1,
- 0x1.ffffff6969fb8p-1,
- 0x1.ffffff72c5fb6p-1,
- 0x1.ffffff7b91176p-1,
- 0x1.ffffff83d3d07p-1,
- 0x1.ffffff8b962bep-1,
- 0x1.ffffff92dfba2p-1,
- 0x1.ffffff99b79d2p-1,
- 0x1.ffffffa0248e8p-1,
- 0x1.ffffffa62ce54p-1,
- 0x1.ffffffabd69b4p-1,
- 0x1.ffffffb127525p-1,
- 0x1.ffffffb624592p-1,
- 0x1.ffffffbad2affp-1,
- 0x1.ffffffbf370cdp-1,
- 0x1.ffffffc355dfdp-1,
- 0x1.ffffffc733572p-1,
- 0x1.ffffffcad3626p-1,
- 0x1.ffffffce39b67p-1,
- 0x1.ffffffd169d0cp-1,
- 0x1.ffffffd466fa5p-1,
- 0x1.ffffffd7344aap-1,
- 0x1.ffffffd9d4aabp-1,
- 0x1.ffffffdc4ad7ap-1,
- 0x1.ffffffde9964ep-1,
- 0x1.ffffffe0c2bf0p-1,
- 0x1.ffffffe2c92dbp-1,
- 0x1.ffffffe4aed5ep-1,
- 0x1.ffffffe675bbdp-1,
- 0x1.ffffffe81fc4ep-1,
- 0x1.ffffffe9aeb97p-1,
- 0x1.ffffffeb24467p-1,
- 0x1.ffffffec81ff2p-1,
- 0x1.ffffffedc95e7p-1,
- 0x1.ffffffeefbc85p-1,
- 0x1.fffffff01a8b6p-1,
- 0x1.fffffff126e1ep-1,
- 0x1.fffffff221f30p-1,
- 0x1.fffffff30cd3fp-1,
- 0x1.fffffff3e8892p-1,
- 0x1.fffffff4b606fp-1,
- 0x1.fffffff57632dp-1,
- 0x1.fffffff629e44p-1,
- 0x1.fffffff6d1e56p-1,
- 0x1.fffffff76ef3fp-1,
- 0x1.fffffff801c1fp-1,
- 0x1.fffffff88af67p-1,
- 0x1.fffffff90b2e3p-1,
- 0x1.fffffff982fc1p-1,
- 0x1.fffffff9f2e9fp-1,
- 0x1.fffffffa5b790p-1,
- 0x1.fffffffabd229p-1,
- 0x1.fffffffb18582p-1,
- 0x1.fffffffb6d844p-1,
- 0x1.fffffffbbd0aap-1,
- 0x1.fffffffc0748fp-1,
- 0x1.fffffffc4c96cp-1,
- 0x1.fffffffc8d462p-1,
- 0x1.fffffffcc9a41p-1,
- 0x1.fffffffd01f89p-1,
- 0x1.fffffffd36871p-1,
- 0x1.fffffffd678edp-1,
- 0x1.fffffffd954aep-1,
- 0x1.fffffffdbff2ap-1,
- 0x1.fffffffde7ba0p-1,
- 0x1.fffffffe0cd16p-1,
- 0x1.fffffffe2f664p-1,
- 0x1.fffffffe4fa30p-1,
- 0x1.fffffffe6daf7p-1,
- 0x1.fffffffe89b0cp-1,
- 0x1.fffffffea3c9ap-1,
- 0x1.fffffffebc1a9p-1,
- 0x1.fffffffed2c21p-1,
- 0x1.fffffffee7dc8p-1,
- 0x1.fffffffefb847p-1,
- 0x1.ffffffff0dd2bp-1,
- 0x1.ffffffff1ede9p-1,
- 0x1.ffffffff2ebdap-1,
- 0x1.ffffffff3d843p-1,
- 0x1.ffffffff4b453p-1,
- 0x1.ffffffff58126p-1,
- 0x1.ffffffff63fc3p-1,
- 0x1.ffffffff6f121p-1,
- 0x1.ffffffff79626p-1,
- 0x1.ffffffff82fabp-1,
- 0x1.ffffffff8be77p-1,
- 0x1.ffffffff94346p-1,
- 0x1.ffffffff9bec8p-1,
- 0x1.ffffffffa319fp-1,
- 0x1.ffffffffa9c63p-1,
- 0x1.ffffffffaffa4p-1,
- 0x1.ffffffffb5be5p-1,
- 0x1.ffffffffbb1a2p-1,
- 0x1.ffffffffc014ep-1,
- 0x1.ffffffffc4b56p-1,
- 0x1.ffffffffc901cp-1,
- 0x1.ffffffffccfffp-1,
- 0x1.ffffffffd0b56p-1,
- 0x1.ffffffffd4271p-1,
- 0x1.ffffffffd759dp-1,
- 0x1.ffffffffda520p-1,
- 0x1.ffffffffdd13cp-1,
- 0x1.ffffffffdfa2dp-1,
- 0x1.ffffffffe202dp-1,
- 0x1.ffffffffe4371p-1,
- 0x1.ffffffffe642ap-1,
- 0x1.ffffffffe8286p-1,
- 0x1.ffffffffe9eb0p-1,
- 0x1.ffffffffeb8d0p-1,
- 0x1.ffffffffed10ap-1,
- 0x1.ffffffffee782p-1,
- 0x1.ffffffffefc57p-1,
- 0x1.fffffffff0fa7p-1,
- 0x1.fffffffff218fp-1,
- 0x1.fffffffff3227p-1,
- 0x1.fffffffff4188p-1,
- 0x1.fffffffff4fc9p-1,
- 0x1.fffffffff5cfdp-1,
- 0x1.fffffffff6939p-1,
- 0x1.fffffffff748ep-1,
- 0x1.fffffffff7f0dp-1,
- 0x1.fffffffff88c5p-1,
- 0x1.fffffffff91c6p-1,
- 0x1.fffffffff9a1bp-1,
- 0x1.fffffffffa1d2p-1,
- 0x1.fffffffffa8f6p-1,
- 0x1.fffffffffaf92p-1,
- 0x1.fffffffffb5b0p-1,
- 0x1.fffffffffbb58p-1,
- 0x1.fffffffffc095p-1,
- 0x1.fffffffffc56dp-1,
- 0x1.fffffffffc9e8p-1,
- 0x1.fffffffffce0dp-1,
- 0x1.fffffffffd1e1p-1,
- 0x1.fffffffffd56cp-1,
- 0x1.fffffffffd8b3p-1,
- 0x1.fffffffffdbbap-1,
- 0x1.fffffffffde86p-1,
- 0x1.fffffffffe11dp-1,
- 0x1.fffffffffe380p-1,
- 0x1.fffffffffe5b6p-1,
- 0x1.fffffffffe7c0p-1,
- 0x1.fffffffffe9a2p-1,
- 0x1.fffffffffeb60p-1,
- 0x1.fffffffffecfbp-1,
- 0x1.fffffffffee77p-1,
- 0x1.fffffffffefd6p-1,
- 0x1.ffffffffff11ap-1,
- 0x1.ffffffffff245p-1,
- 0x1.ffffffffff359p-1,
- 0x1.ffffffffff457p-1,
- 0x1.ffffffffff542p-1,
- 0x1.ffffffffff61bp-1,
- 0x1.ffffffffff6e3p-1,
- 0x1.ffffffffff79bp-1,
- 0x1.ffffffffff845p-1,
- 0x1.ffffffffff8e2p-1,
- 0x1.ffffffffff973p-1,
- 0x1.ffffffffff9f8p-1,
- 0x1.ffffffffffa73p-1,
- 0x1.ffffffffffae4p-1,
- 0x1.ffffffffffb4cp-1,
- 0x1.ffffffffffbadp-1,
- 0x1.ffffffffffc05p-1,
- 0x1.ffffffffffc57p-1,
- 0x1.ffffffffffca2p-1,
- 0x1.ffffffffffce7p-1,
- 0x1.ffffffffffd27p-1,
- 0x1.ffffffffffd62p-1,
- 0x1.ffffffffffd98p-1,
- 0x1.ffffffffffdcap-1,
- 0x1.ffffffffffdf8p-1,
- 0x1.ffffffffffe22p-1,
- 0x1.ffffffffffe49p-1,
- 0x1.ffffffffffe6cp-1,
- 0x1.ffffffffffe8dp-1,
- 0x1.ffffffffffeabp-1,
- 0x1.ffffffffffec7p-1,
- 0x1.ffffffffffee1p-1,
- 0x1.ffffffffffef8p-1,
- 0x1.fffffffffff0ep-1,
- 0x1.fffffffffff22p-1,
- 0x1.fffffffffff34p-1,
- 0x1.fffffffffff45p-1,
- 0x1.fffffffffff54p-1,
- 0x1.fffffffffff62p-1,
- 0x1.fffffffffff6fp-1,
- 0x1.fffffffffff7bp-1,
- 0x1.fffffffffff86p-1,
- 0x1.fffffffffff90p-1,
- 0x1.fffffffffff9ap-1,
- 0x1.fffffffffffa2p-1,
- 0x1.fffffffffffaap-1,
- 0x1.fffffffffffb1p-1,
- 0x1.fffffffffffb8p-1,
- 0x1.fffffffffffbep-1,
- 0x1.fffffffffffc3p-1,
- 0x1.fffffffffffc8p-1,
- 0x1.fffffffffffcdp-1,
- 0x1.fffffffffffd1p-1,
- 0x1.fffffffffffd5p-1,
- 0x1.fffffffffffd9p-1,
- 0x1.fffffffffffdcp-1,
- 0x1.fffffffffffdfp-1,
- 0x1.fffffffffffe2p-1,
- 0x1.fffffffffffe4p-1,
- 0x1.fffffffffffe7p-1,
- 0x1.fffffffffffe9p-1,
- 0x1.fffffffffffebp-1,
- 0x1.fffffffffffedp-1,
- 0x1.fffffffffffeep-1,
- 0x1.ffffffffffff0p-1,
- 0x1.ffffffffffff1p-1,
- 0x1.ffffffffffff3p-1,
- 0x1.ffffffffffff4p-1,
- 0x1.ffffffffffff5p-1,
- 0x1.ffffffffffff6p-1,
- 0x1.ffffffffffff7p-1,
- 0x1.ffffffffffff7p-1,
- 0x1.ffffffffffff8p-1,
- 0x1.ffffffffffff9p-1,
- 0x1.ffffffffffff9p-1,
- 0x1.ffffffffffffap-1,
- 0x1.ffffffffffffbp-1,
- 0x1.ffffffffffffbp-1,
- 0x1.ffffffffffffbp-1,
- 0x1.ffffffffffffcp-1,
- 0x1.ffffffffffffcp-1,
- 0x1.ffffffffffffdp-1,
- 0x1.ffffffffffffdp-1,
- 0x1.ffffffffffffdp-1,
- 0x1.ffffffffffffdp-1,
- 0x1.ffffffffffffep-1,
- 0x1.ffffffffffffep-1,
- 0x1.ffffffffffffep-1,
- 0x1.ffffffffffffep-1,
- 0x1.ffffffffffffep-1,
- 0x1.ffffffffffffep-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.fffffffffffffp-1,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- 0x1.0000000000000p+0,
- },
- .scale = { 0x1.20dd750429b6dp+0,
- 0x1.20d8f1975c85dp+0,
- 0x1.20cb67bd452c7p+0,
- 0x1.20b4d8bac36c1p+0,
- 0x1.209546ad13ccfp+0,
- 0x1.206cb4897b148p+0,
- 0x1.203b261cd0052p+0,
- 0x1.2000a00ae3804p+0,
- 0x1.1fbd27cdc72d3p+0,
- 0x1.1f70c3b4f2cc7p+0,
- 0x1.1f1b7ae44867fp+0,
- 0x1.1ebd5552f795bp+0,
- 0x1.1e565bca400d4p+0,
- 0x1.1de697e413d28p+0,
- 0x1.1d6e14099944ap+0,
- 0x1.1cecdb718d61cp+0,
- 0x1.1c62fa1e869b6p+0,
- 0x1.1bd07cdd189acp+0,
- 0x1.1b357141d95d5p+0,
- 0x1.1a91e5a748165p+0,
- 0x1.19e5e92b964abp+0,
- 0x1.19318bae53a04p+0,
- 0x1.1874ddcdfce24p+0,
- 0x1.17aff0e56ec10p+0,
- 0x1.16e2d7093cd8cp+0,
- 0x1.160da304ed92fp+0,
- 0x1.153068581b781p+0,
- 0x1.144b3b337c90cp+0,
- 0x1.135e3075d076bp+0,
- 0x1.12695da8b5bdep+0,
- 0x1.116cd8fd67618p+0,
- 0x1.1068b94962e5ep+0,
- 0x1.0f5d1602f7e41p+0,
- 0x1.0e4a073dc1b91p+0,
- 0x1.0d2fa5a70c168p+0,
- 0x1.0c0e0a8223359p+0,
- 0x1.0ae54fa490722p+0,
- 0x1.09b58f724416bp+0,
- 0x1.087ee4d9ad247p+0,
- 0x1.07416b4fbfe7cp+0,
- 0x1.05fd3ecbec297p+0,
- 0x1.04b27bc403d30p+0,
- 0x1.03613f2812dafp+0,
- 0x1.0209a65e29545p+0,
- 0x1.00abcf3e187a9p+0,
- 0x1.fe8fb01a47307p-1,
- 0x1.fbbbbef34b4b2p-1,
- 0x1.f8dc092d58ff8p-1,
- 0x1.f5f0cdaf15313p-1,
- 0x1.f2fa4c16c0019p-1,
- 0x1.eff8c4b1375dbp-1,
- 0x1.ecec7870ebca7p-1,
- 0x1.e9d5a8e4c934ep-1,
- 0x1.e6b4982f158b9p-1,
- 0x1.e38988fc46e72p-1,
- 0x1.e054be79d3042p-1,
- 0x1.dd167c4cf9d2ap-1,
- 0x1.d9cf06898cdafp-1,
- 0x1.d67ea1a8b5368p-1,
- 0x1.d325927fb9d89p-1,
- 0x1.cfc41e36c7df9p-1,
- 0x1.cc5a8a3fbea40p-1,
- 0x1.c8e91c4d01368p-1,
- 0x1.c5701a484ef9dp-1,
- 0x1.c1efca49a5011p-1,
- 0x1.be68728e29d5dp-1,
- 0x1.bada596f25436p-1,
- 0x1.b745c55905bf8p-1,
- 0x1.b3aafcc27502ep-1,
- 0x1.b00a46237d5bep-1,
- 0x1.ac63e7ecc1411p-1,
- 0x1.a8b8287ec6a09p-1,
- 0x1.a5074e2157620p-1,
- 0x1.a1519efaf889ep-1,
- 0x1.9d97610879642p-1,
- 0x1.99d8da149c13fp-1,
- 0x1.96164fafd8de3p-1,
- 0x1.925007283d7aap-1,
- 0x1.8e86458169af8p-1,
- 0x1.8ab94f6caa71dp-1,
- 0x1.86e9694134b9ep-1,
- 0x1.8316d6f48133dp-1,
- 0x1.7f41dc12c9e89p-1,
- 0x1.7b6abbb7aaf19p-1,
- 0x1.7791b886e7403p-1,
- 0x1.73b714a552763p-1,
- 0x1.6fdb11b1e0c34p-1,
- 0x1.6bfdf0beddaf5p-1,
- 0x1.681ff24b4ab04p-1,
- 0x1.6441563c665d4p-1,
- 0x1.60625bd75d07bp-1,
- 0x1.5c8341bb23767p-1,
- 0x1.58a445da7c74cp-1,
- 0x1.54c5a57629db0p-1,
- 0x1.50e79d1749ac9p-1,
- 0x1.4d0a6889dfd9fp-1,
- 0x1.492e42d78d2c5p-1,
- 0x1.4553664273d24p-1,
- 0x1.417a0c4049fd0p-1,
- 0x1.3da26d759aef5p-1,
- 0x1.39ccc1b136d5ap-1,
- 0x1.35f93fe7d1b3dp-1,
- 0x1.32281e2fd1a92p-1,
- 0x1.2e5991bd4cbfcp-1,
- 0x1.2a8dcede3673bp-1,
- 0x1.26c508f6bd0ffp-1,
- 0x1.22ff727dd6f7bp-1,
- 0x1.1f3d3cf9ffe5ap-1,
- 0x1.1b7e98fe26217p-1,
- 0x1.17c3b626c7a11p-1,
- 0x1.140cc3173f007p-1,
- 0x1.1059ed7740313p-1,
- 0x1.0cab61f084b93p-1,
- 0x1.09014c2ca74dap-1,
- 0x1.055bd6d32e8d7p-1,
- 0x1.01bb2b87c6968p-1,
- 0x1.fc3ee5d1524b0p-2,
- 0x1.f511a91a67d2ap-2,
- 0x1.edeeee0959518p-2,
- 0x1.e6d6ffaa65a25p-2,
- 0x1.dfca26f5bbf88p-2,
- 0x1.d8c8aace11e63p-2,
- 0x1.d1d2cfff91594p-2,
- 0x1.cae8d93f1d7b6p-2,
- 0x1.c40b0729ed547p-2,
- 0x1.bd3998457afdap-2,
- 0x1.b674c8ffc6283p-2,
- 0x1.afbcd3afe8ab6p-2,
- 0x1.a911f096fbc26p-2,
- 0x1.a27455e14c93cp-2,
- 0x1.9be437a7de946p-2,
- 0x1.9561c7f23a47bp-2,
- 0x1.8eed36b886d93p-2,
- 0x1.8886b1e5ecfd1p-2,
- 0x1.822e655b417e6p-2,
- 0x1.7be47af1f5d89p-2,
- 0x1.75a91a7f4d2edp-2,
- 0x1.6f7c69d7d3ef8p-2,
- 0x1.695e8cd31867ep-2,
- 0x1.634fa54fa285fp-2,
- 0x1.5d4fd33729015p-2,
- 0x1.575f3483021c3p-2,
- 0x1.517de540ce2a3p-2,
- 0x1.4babff975a04cp-2,
- 0x1.45e99bcbb7915p-2,
- 0x1.4036d0468a7a2p-2,
- 0x1.3a93b1998736cp-2,
- 0x1.35005285227f1p-2,
- 0x1.2f7cc3fe6f423p-2,
- 0x1.2a09153529381p-2,
- 0x1.24a55399ea239p-2,
- 0x1.1f518ae487dc8p-2,
- 0x1.1a0dc51a9934dp-2,
- 0x1.14da0a961fd14p-2,
- 0x1.0fb6620c550afp-2,
- 0x1.0aa2d09497f2bp-2,
- 0x1.059f59af7a906p-2,
- 0x1.00abff4dec7a3p-2,
- 0x1.f79183b101c5bp-3,
- 0x1.edeb406d9c824p-3,
- 0x1.e4652fadcb6b2p-3,
- 0x1.daff4969c0b04p-3,
- 0x1.d1b982c501370p-3,
- 0x1.c893ce1dcbef7p-3,
- 0x1.bf8e1b1ca2279p-3,
- 0x1.b6a856c3ed54fp-3,
- 0x1.ade26b7fbed95p-3,
- 0x1.a53c4135a6526p-3,
- 0x1.9cb5bd549b111p-3,
- 0x1.944ec2e4f5630p-3,
- 0x1.8c07329874652p-3,
- 0x1.83deeada4d25ap-3,
- 0x1.7bd5c7df3fe9cp-3,
- 0x1.73eba3b5b07b7p-3,
- 0x1.6c205655be71fp-3,
- 0x1.6473b5b15a7a1p-3,
- 0x1.5ce595c455b0ap-3,
- 0x1.5575c8a468361p-3,
- 0x1.4e241e912c305p-3,
- 0x1.46f066040a832p-3,
- 0x1.3fda6bc016994p-3,
- 0x1.38e1fae1d6a9dp-3,
- 0x1.3206dceef5f87p-3,
- 0x1.2b48d9e5dea1cp-3,
- 0x1.24a7b84d38971p-3,
- 0x1.1e233d434b813p-3,
- 0x1.17bb2c8d41535p-3,
- 0x1.116f48a6476ccp-3,
- 0x1.0b3f52ce8c383p-3,
- 0x1.052b0b1a174eap-3,
- 0x1.fe6460fef4680p-4,
- 0x1.f2a901ccafb37p-4,
- 0x1.e723726b824a9p-4,
- 0x1.dbd32ac4c99b0p-4,
- 0x1.d0b7a0f921e7cp-4,
- 0x1.c5d0497c09e74p-4,
- 0x1.bb1c972f23e50p-4,
- 0x1.b09bfb7d11a83p-4,
- 0x1.a64de673e8837p-4,
- 0x1.9c31c6df3b1b8p-4,
- 0x1.92470a61b6965p-4,
- 0x1.888d1d8e510a3p-4,
- 0x1.7f036c0107294p-4,
- 0x1.75a96077274bap-4,
- 0x1.6c7e64e7281cbp-4,
- 0x1.6381e2980956bp-4,
- 0x1.5ab342383d177p-4,
- 0x1.5211ebf41880bp-4,
- 0x1.499d478bca735p-4,
- 0x1.4154bc68d75c3p-4,
- 0x1.3937b1b319259p-4,
- 0x1.31458e6542847p-4,
- 0x1.297db960e4f63p-4,
- 0x1.21df9981f8e53p-4,
- 0x1.1a6a95b1e786fp-4,
- 0x1.131e14fa1625dp-4,
- 0x1.0bf97e95f2a64p-4,
- 0x1.04fc3a0481321p-4,
- 0x1.fc4b5e32d6259p-5,
- 0x1.eeea8c1b1db93p-5,
- 0x1.e1d4cf1e2450ap-5,
- 0x1.d508f9a1ea64ep-5,
- 0x1.c885df3451a07p-5,
- 0x1.bc4a54a84e834p-5,
- 0x1.b055303221015p-5,
- 0x1.a4a549829587ep-5,
- 0x1.993979e14fffdp-5,
- 0x1.8e109c4622913p-5,
- 0x1.83298d717210ep-5,
- 0x1.78832c03aa2b1p-5,
- 0x1.6e1c5893c380bp-5,
- 0x1.63f3f5c4de13bp-5,
- 0x1.5a08e85af27e0p-5,
- 0x1.505a174e9c929p-5,
- 0x1.46e66be002240p-5,
- 0x1.3dacd1a8d8ccdp-5,
- 0x1.34ac36ad8dafep-5,
- 0x1.2be38b6d92415p-5,
- 0x1.2351c2f2d1449p-5,
- 0x1.1af5d2e04f3f6p-5,
- 0x1.12ceb37ff9bc3p-5,
- 0x1.0adb5fcfa8c75p-5,
- 0x1.031ad58d56279p-5,
- 0x1.f7182a851bca2p-6,
- 0x1.e85c449e377f2p-6,
- 0x1.da0005e5f28dfp-6,
- 0x1.cc0180af00a8bp-6,
- 0x1.be5ecd2fcb5f9p-6,
- 0x1.b1160991ff737p-6,
- 0x1.a4255a00b9f03p-6,
- 0x1.978ae8b55ce1bp-6,
- 0x1.8b44e6031383ep-6,
- 0x1.7f5188610ddc8p-6,
- 0x1.73af0c737bb45p-6,
- 0x1.685bb5134ef13p-6,
- 0x1.5d55cb54cd53ap-6,
- 0x1.529b9e8cf9a1ep-6,
- 0x1.482b8455dc491p-6,
- 0x1.3e03d891b37dep-6,
- 0x1.3422fd6d12e2bp-6,
- 0x1.2a875b5ffab56p-6,
- 0x1.212f612dee7fbp-6,
- 0x1.181983e5133ddp-6,
- 0x1.0f443edc5ce49p-6,
- 0x1.06ae13b0d3255p-6,
- 0x1.fcab1483ea7fcp-7,
- 0x1.ec72615a894c4p-7,
- 0x1.dcaf3691fc448p-7,
- 0x1.cd5ec93c12431p-7,
- 0x1.be7e5ac24963bp-7,
- 0x1.b00b38d6b3575p-7,
- 0x1.a202bd6372dcep-7,
- 0x1.94624e78e0fafp-7,
- 0x1.87275e3a6869dp-7,
- 0x1.7a4f6aca256cbp-7,
- 0x1.6dd7fe3358230p-7,
- 0x1.61beae53b72b7p-7,
- 0x1.56011cc3b036dp-7,
- 0x1.4a9cf6bda3f4cp-7,
- 0x1.3f8ff5042a88ep-7,
- 0x1.34d7dbc76d7e5p-7,
- 0x1.2a727a89a3f14p-7,
- 0x1.205dac02bd6b9p-7,
- 0x1.1697560347b25p-7,
- 0x1.0d1d69569b82dp-7,
- 0x1.03ede1a45bfeep-7,
- 0x1.f60d8aa2a88f2p-8,
- 0x1.e4cc4abf7d065p-8,
- 0x1.d4143a9dfe965p-8,
- 0x1.c3e1a5f5c077cp-8,
- 0x1.b430ecf4a83a8p-8,
- 0x1.a4fe83fb9db25p-8,
- 0x1.9646f35a76623p-8,
- 0x1.8806d70b2fc36p-8,
- 0x1.7a3ade6c8b3e4p-8,
- 0x1.6cdfcbfc1e263p-8,
- 0x1.5ff2750fe7820p-8,
- 0x1.536fc18f7ce5cp-8,
- 0x1.4754abacdf1dcp-8,
- 0x1.3b9e3f9d06e3fp-8,
- 0x1.30499b503957fp-8,
- 0x1.2553ee2a336bfp-8,
- 0x1.1aba78ba3af89p-8,
- 0x1.107a8c7323a6ep-8,
- 0x1.06918b6355624p-8,
- 0x1.f9f9cfd9c3035p-9,
- 0x1.e77448fb66bb9p-9,
- 0x1.d58da68fd1170p-9,
- 0x1.c4412bf4b8f0bp-9,
- 0x1.b38a3af2e55b4p-9,
- 0x1.a3645330550ffp-9,
- 0x1.93cb11a30d765p-9,
- 0x1.84ba3004a50d0p-9,
- 0x1.762d84469c18fp-9,
- 0x1.6821000795a03p-9,
- 0x1.5a90b00981d93p-9,
- 0x1.4d78bba8ca5fdp-9,
- 0x1.40d564548fad7p-9,
- 0x1.34a305080681fp-9,
- 0x1.28de11c5031ebp-9,
- 0x1.1d83170fbf6fbp-9,
- 0x1.128eb96be8798p-9,
- 0x1.07fdb4dafea5fp-9,
- 0x1.fb99b8b8279e1p-10,
- 0x1.e7f232d9e2630p-10,
- 0x1.d4fed7195d7e8p-10,
- 0x1.c2b9cf7f893bfp-10,
- 0x1.b11d702b3deb1p-10,
- 0x1.a024365f771bdp-10,
- 0x1.8fc8c794b03b5p-10,
- 0x1.8005f08d6f1efp-10,
- 0x1.70d6a46e07ddap-10,
- 0x1.6235fbd7a4345p-10,
- 0x1.541f340697987p-10,
- 0x1.468dadf4080abp-10,
- 0x1.397ced7af2b15p-10,
- 0x1.2ce898809244ep-10,
- 0x1.20cc76202c5fap-10,
- 0x1.15246dda49d47p-10,
- 0x1.09ec86c75d497p-10,
- 0x1.fe41cd9bb4eeep-11,
- 0x1.e97ba3b77f306p-11,
- 0x1.d57f524723822p-11,
- 0x1.c245d4b998479p-11,
- 0x1.afc85e0f82e12p-11,
- 0x1.9e005769dbc1dp-11,
- 0x1.8ce75e9f6f8a0p-11,
- 0x1.7c7744d9378f7p-11,
- 0x1.6caa0d3582fe9p-11,
- 0x1.5d79eb71e893bp-11,
- 0x1.4ee1429bf7cc0p-11,
- 0x1.40daa3c89f5b6p-11,
- 0x1.3360ccd23db3ap-11,
- 0x1.266ea71d4f71ap-11,
- 0x1.19ff4663ae9dfp-11,
- 0x1.0e0de78654d1ep-11,
- 0x1.0295ef6591848p-11,
- 0x1.ef25d37f49fe1p-12,
- 0x1.da01102b5f851p-12,
- 0x1.c5b5412dcafadp-12,
- 0x1.b23a5a23e4210p-12,
- 0x1.9f8893d8fd1c1p-12,
- 0x1.8d986a4187285p-12,
- 0x1.7c629a822bc9ep-12,
- 0x1.6be02102b3520p-12,
- 0x1.5c0a378c90bcap-12,
- 0x1.4cda5374ea275p-12,
- 0x1.3e4a23d1f4702p-12,
- 0x1.30538fbb77ecdp-12,
- 0x1.22f0b496539bdp-12,
- 0x1.161be46ad3b50p-12,
- 0x1.09cfa445b00ffp-12,
- 0x1.fc0d55470cf51p-13,
- 0x1.e577bbcd49935p-13,
- 0x1.cfd4a5adec5bfp-13,
- 0x1.bb1a9657ce465p-13,
- 0x1.a740684026555p-13,
- 0x1.943d4a1d1ed39p-13,
- 0x1.8208bc334a6a5p-13,
- 0x1.709a8db59f25cp-13,
- 0x1.5feada379d8b7p-13,
- 0x1.4ff207314a102p-13,
- 0x1.40a8c1949f75ep-13,
- 0x1.3207fb7420eb9p-13,
- 0x1.2408e9ba3327fp-13,
- 0x1.16a501f0e42cap-13,
- 0x1.09d5f819c9e29p-13,
- 0x1.fb2b792b40a22p-14,
- 0x1.e3bcf436a1a95p-14,
- 0x1.cd55277c18d05p-14,
- 0x1.b7e94604479dcp-14,
- 0x1.a36eec00926ddp-14,
- 0x1.8fdc1b2dcf7b9p-14,
- 0x1.7d2737527c3f9p-14,
- 0x1.6b4702d7d5849p-14,
- 0x1.5a329b7d30748p-14,
- 0x1.49e17724f4d41p-14,
- 0x1.3a4b60ba9aa4dp-14,
- 0x1.2b6875310f785p-14,
- 0x1.1d312098e9dbap-14,
- 0x1.0f9e1b4dd36dfp-14,
- 0x1.02a8673a94691p-14,
- 0x1.ec929a665b449p-15,
- 0x1.d4f4b4c8e09edp-15,
- 0x1.be6abbb10a5aap-15,
- 0x1.a8e8cc1fadef6p-15,
- 0x1.94637d5bacfdbp-15,
- 0x1.80cfdc72220cfp-15,
- 0x1.6e2367dc27f95p-15,
- 0x1.5c540b4936fd2p-15,
- 0x1.4b581b8d170fcp-15,
- 0x1.3b2652b06c2b2p-15,
- 0x1.2bb5cc22e5db6p-15,
- 0x1.1cfe010e2052dp-15,
- 0x1.0ef6c4c84a0fep-15,
- 0x1.01984165a5f36p-15,
- 0x1.e9b5e8d00ce76p-16,
- 0x1.d16f5716c6c1ap-16,
- 0x1.ba4f035d60e02p-16,
- 0x1.a447b7b03f045p-16,
- 0x1.8f4ccca7fc90dp-16,
- 0x1.7b5223dac7336p-16,
- 0x1.684c227fcacefp-16,
- 0x1.562fac4329b48p-16,
- 0x1.44f21e49054f2p-16,
- 0x1.34894a5e24657p-16,
- 0x1.24eb7254ccf83p-16,
- 0x1.160f438c70913p-16,
- 0x1.07ebd2a2d2844p-16,
- 0x1.f4f12e9ab070ap-17,
- 0x1.db5ad0b27805cp-17,
- 0x1.c304efa2c6f4ep-17,
- 0x1.abe09e9144b5ep-17,
- 0x1.95df988e76644p-17,
- 0x1.80f439b4ee04bp-17,
- 0x1.6d11788a69c64p-17,
- 0x1.5a2adfa0b4bc4p-17,
- 0x1.4834877429b8fp-17,
- 0x1.37231085c7d9ap-17,
- 0x1.26eb9daed6f7ep-17,
- 0x1.1783ceac28910p-17,
- 0x1.08e1badf0fcedp-17,
- 0x1.f5f7d88472604p-18,
- 0x1.db92b5212fb8dp-18,
- 0x1.c282cd3957edap-18,
- 0x1.aab7abace48dcp-18,
- 0x1.94219bfcb4928p-18,
- 0x1.7eb1a2075864dp-18,
- 0x1.6a597219a93d9p-18,
- 0x1.570b69502f313p-18,
- 0x1.44ba864670882p-18,
- 0x1.335a62115bce2p-18,
- 0x1.22df298214423p-18,
- 0x1.133d96ae7e0ddp-18,
- 0x1.046aeabcfcdecp-18,
- 0x1.ecb9cfe1d8642p-19,
- 0x1.d21397ead99cbp-19,
- 0x1.b8d094c86d374p-19,
- 0x1.a0df0f0c626dcp-19,
- 0x1.8a2e269750a39p-19,
- 0x1.74adc8f4064d3p-19,
- 0x1.604ea819f007cp-19,
- 0x1.4d0231928c6f9p-19,
- 0x1.3aba85fe22e1fp-19,
- 0x1.296a70f414053p-19,
- 0x1.1905613b3abf2p-19,
- 0x1.097f6156f32c5p-19,
- 0x1.f59a20caf6695p-20,
- 0x1.d9c73698fb1dcp-20,
- 0x1.bf716c6168baep-20,
- 0x1.a6852c6b58392p-20,
- 0x1.8eefd70594a88p-20,
- 0x1.789fb715aae95p-20,
- 0x1.6383f726a8e04p-20,
- 0x1.4f8c96f26a26ap-20,
- 0x1.3caa61607f920p-20,
- 0x1.2acee2f5ecdb8p-20,
- 0x1.19ec60b1242edp-20,
- 0x1.09f5cf4dd2877p-20,
- 0x1.f5bd95d8730d8p-21,
- 0x1.d9371e2ff7c35p-21,
- 0x1.be41de54d155ap-21,
- 0x1.a4c89e08ef4f3p-21,
- 0x1.8cb738399b12cp-21,
- 0x1.75fa8dbc84becp-21,
- 0x1.608078a70dcbcp-21,
- 0x1.4c37c0394d094p-21,
- 0x1.39100d5687bfep-21,
- 0x1.26f9df8519bd6p-21,
- 0x1.15e6827001f18p-21,
- 0x1.05c803e4831c1p-21,
- 0x1.ed22548cffd35p-22,
- 0x1.d06ad6ecdf971p-22,
- 0x1.b551c847fbc96p-22,
- 0x1.9bc09f112b494p-22,
- 0x1.83a1ff0aa239dp-22,
- 0x1.6ce1aa3fd7bddp-22,
- 0x1.576c72b514859p-22,
- 0x1.43302cc4a0da8p-22,
- 0x1.301ba221dc9bbp-22,
- 0x1.1e1e857adc568p-22,
- 0x1.0d2966b1746f7p-22,
- 0x1.fa5b4f49cc6b2p-23,
- 0x1.dc3ae30b55c16p-23,
- 0x1.bfd7555a3bd68p-23,
- 0x1.a517d9e61628ap-23,
- 0x1.8be4f8f6c951fp-23,
- 0x1.74287ded49339p-23,
- 0x1.5dcd669f2cd34p-23,
- 0x1.48bfd38302870p-23,
- 0x1.34ecf8a3c124ap-23,
- 0x1.22430f521cbcfp-23,
- 0x1.10b1488aeb235p-23,
- 0x1.0027c00a263a6p-23,
- 0x1.e12ee004efc37p-24,
- 0x1.c3e44ae32b16bp-24,
- 0x1.a854ea14102a8p-24,
- 0x1.8e6761569f45dp-24,
- 0x1.7603bac345f65p-24,
- 0x1.5f1353cdad001p-24,
- 0x1.4980cb3c80949p-24,
- 0x1.3537f00b6ad4dp-24,
- 0x1.2225b12bffc68p-24,
- 0x1.10380e1adb7e9p-24,
- 0x1.febc107d5efaap-25,
- 0x1.df0f2a0ee6946p-25,
- 0x1.c14b2188bcee4p-25,
- 0x1.a553644f7f07dp-25,
- 0x1.8b0cfce0579dfp-25,
- 0x1.725e7c5dd20f7p-25,
- 0x1.5b2fe547a1340p-25,
- 0x1.456a974e92e93p-25,
- 0x1.30f93c3699078p-25,
- 0x1.1dc7b5b978cf8p-25,
- 0x1.0bc30c5d52f15p-25,
- 0x1.f5b2be65a0c7fp-26,
- 0x1.d5f3a8dea7357p-26,
- 0x1.b82915b03515bp-26,
- 0x1.9c3517e789488p-26,
- 0x1.81fb7df06136ep-26,
- 0x1.6961b8d641d06p-26,
- 0x1.524ec4d916caep-26,
- 0x1.3cab1343d18d1p-26,
- 0x1.2860757487a01p-26,
- 0x1.155a09065d4f7p-26,
- 0x1.0384250e4c9fcp-26,
- 0x1.e59890b926c78p-27,
- 0x1.c642116a8a9e3p-27,
- 0x1.a8e405e651ab6p-27,
- 0x1.8d5f98114f872p-27,
- 0x1.7397c5a66e307p-27,
- 0x1.5b71456c5a4c4p-27,
- 0x1.44d26de513197p-27,
- 0x1.2fa31d6371537p-27,
- 0x1.1bcca373b7b43p-27,
- 0x1.0939ab853339fp-27,
- 0x1.efac5187b2863p-28,
- 0x1.cf1e86235d0e6p-28,
- 0x1.b0a68a2128babp-28,
- 0x1.9423165bc4444p-28,
- 0x1.7974e743dea3cp-28,
- 0x1.607e9eacd1050p-28,
- 0x1.4924a74dec728p-28,
- 0x1.334d19e0c2160p-28,
- 0x1.1edfa3c5f5ccap-28,
- 0x1.0bc56f1b54701p-28,
- 0x1.f3d2185e047d9p-29,
- 0x1.d26cb87945e87p-29,
- 0x1.b334fac4b9f99p-29,
- 0x1.96076f7918d1cp-29,
- 0x1.7ac2d72fc2c63p-29,
- 0x1.614801550319ep-29,
- 0x1.4979ac8b28926p-29,
- 0x1.333c68e2d0548p-29,
- 0x1.1e767bce37dd7p-29,
- 0x1.0b0fc5b6d05a0p-29,
- 0x1.f1e3523b41d7dp-30,
- 0x1.d00de6608effep-30,
- 0x1.b0778b7b3301ap-30,
- 0x1.92fb04ec0f6cfp-30,
- 0x1.77756ec9f78fap-30,
- 0x1.5dc61922d5a06p-30,
- 0x1.45ce65699ff6dp-30,
- 0x1.2f71a5f159970p-30,
- 0x1.1a94ff571654fp-30,
- 0x1.071f4bbea09ecp-30,
- 0x1.e9f1ff8ddd774p-31,
- 0x1.c818223a202c7p-31,
- 0x1.a887bd2b4404dp-31,
- 0x1.8b1a336c5eb6bp-31,
- 0x1.6fab63324088ap-31,
- 0x1.56197e30205bap-31,
- 0x1.3e44e45301b92p-31,
- 0x1.281000bfe4c3fp-31,
- 0x1.135f28f2d50b4p-31,
- 0x1.00187dded5975p-31,
- 0x1.dc479de0ef001p-32,
- 0x1.bad4fdad3caa1p-32,
- 0x1.9baed3ed27ab8p-32,
- 0x1.7ead9ce4285bbp-32,
- 0x1.63ac6b4edc88ep-32,
- 0x1.4a88be2a6390cp-32,
- 0x1.332259185f1a0p-32,
- 0x1.1d5b1f3793044p-32,
- 0x1.0916f04b6e18bp-32,
- 0x1.ec77101de6926p-33,
- 0x1.c960bf23153e0p-33,
- 0x1.a8bd20fc65ef7p-33,
- 0x1.8a61745ec7d1dp-33,
- 0x1.6e25d0e756261p-33,
- 0x1.53e4f7d1666cbp-33,
- 0x1.3b7c27a7ddb0ep-33,
- 0x1.24caf2c32af14p-33,
- 0x1.0fb3186804d0fp-33,
- 0x1.f830c0bb41fd7p-34,
- 0x1.d3c0f1a91c846p-34,
- 0x1.b1e5acf351d87p-34,
- 0x1.92712d259ce66p-34,
- 0x1.7538c60a04476p-34,
- 0x1.5a14b04b47879p-34,
- 0x1.40dfd87456f4cp-34,
- 0x1.2977b1172b9d5p-34,
- 0x1.13bc07e891491p-34,
- 0x1.ff1dbb4300811p-35,
- 0x1.d9a880f306bd8p-35,
- 0x1.b6e45220b55e0p-35,
- 0x1.96a0b33f2c4dap-35,
- 0x1.78b07e9e924acp-35,
- 0x1.5ce9ab1670dd2p-35,
- 0x1.4325167006bb0p-35,
- 0x1.2b3e53538ff3fp-35,
- 0x1.15137a7f44864p-35,
- 0x1.0084ff125639dp-35,
- 0x1.daeb0b7311ec7p-36,
- 0x1.b7937d1c40c52p-36,
- 0x1.96d082f59ab06p-36,
- 0x1.7872d9fa10aadp-36,
- 0x1.5c4e8e37bc7d0p-36,
- 0x1.423ac0df49a40p-36,
- 0x1.2a117230ad284p-36,
- 0x1.13af4f04f9998p-36,
- 0x1.fde703724e560p-37,
- 0x1.d77f0c82e7641p-37,
- 0x1.b3ee02611d7ddp-37,
- 0x1.92ff33023d5bdp-37,
- 0x1.7481a9e69f53fp-37,
- 0x1.5847eda620959p-37,
- 0x1.3e27c1fcc74bdp-37,
- 0x1.25f9ee0b923dcp-37,
- 0x1.0f9a0686531ffp-37,
- 0x1.f5cc7718082afp-38,
- 0x1.cf7e53d6a2ca5p-38,
- 0x1.ac0f5f3229372p-38,
- 0x1.8b498644847eap-38,
- 0x1.6cfa9bcca59dcp-38,
- 0x1.50f411d4fd2cdp-38,
- 0x1.370ab8327af5ep-38,
- 0x1.1f167f88c6b6ep-38,
- 0x1.08f24085d4597p-38,
- 0x1.e8f70e181d619p-39,
- 0x1.c324c20e337dcp-39,
- 0x1.a03261574b54ep-39,
- 0x1.7fe903cdf5855p-39,
- 0x1.6215c58da3450p-39,
- 0x1.46897d4b69fc6p-39,
- 0x1.2d1877d731b7bp-39,
- 0x1.159a386b11517p-39,
- 0x1.ffd27ae9393cep-40,
- 0x1.d7c593130dd0bp-40,
- 0x1.b2cd607c79bcfp-40,
- 0x1.90ae4d3405651p-40,
- 0x1.71312dd1759e2p-40,
- 0x1.5422ef5d8949dp-40,
- 0x1.39544b0ecc957p-40,
- 0x1.20997f73e73ddp-40,
- 0x1.09ca0eaacd277p-40,
- 0x1.e9810295890ecp-41,
- 0x1.c2b45b5aa4a1dp-41,
- 0x1.9eee068fa7596p-41,
- 0x1.7df2b399c10a8p-41,
- 0x1.5f8b87a31bd85p-41,
- 0x1.4385c96e9a2d9p-41,
- 0x1.29b2933ef4cbcp-41,
- 0x1.11e68a6378f8ap-41,
- 0x1.f7f338086a86bp-42,
- 0x1.cf8d7d9ce040ap-42,
- 0x1.aa577251ae484p-42,
- 0x1.8811d739efb5ep-42,
- 0x1.68823e52970bep-42,
- 0x1.4b72ae68e8b4cp-42,
- 0x1.30b14dbe876bcp-42,
- 0x1.181012ef86610p-42,
- 0x1.01647ba798744p-42,
- 0x1.d90e917701675p-43,
- 0x1.b2a87e86d0c8ap-43,
- 0x1.8f53dcb377293p-43,
- 0x1.6ed2f2515e933p-43,
- 0x1.50ecc9ed47f19p-43,
- 0x1.356cd5ce7799ep-43,
- 0x1.1c229a587ab78p-43,
- 0x1.04e15ecc7f3f6p-43,
- 0x1.deffc7e6a6017p-44,
- 0x1.b7b040832f310p-44,
- 0x1.938e021f36d76p-44,
- 0x1.7258610b3b233p-44,
- 0x1.53d3bfc82a909p-44,
- 0x1.37c92babdc2fdp-44,
- 0x1.1e06010120f6ap-44,
- 0x1.065b9616170d4p-44,
- 0x1.e13dd96b3753ap-45,
- 0x1.b950d32467392p-45,
- 0x1.94a72263259a5p-45,
- 0x1.72fd93e036cdcp-45,
- 0x1.54164576929abp-45,
- 0x1.37b83c521fe96p-45,
- 0x1.1daf033182e96p-45,
- 0x1.05ca50205d26ap-45,
- 0x1.dfbb6235639fap-46,
- 0x1.b7807e294781fp-46,
- 0x1.9298add70a734p-46,
- 0x1.70beaf9c7ffb6p-46,
- 0x1.51b2cd6709222p-46,
- 0x1.353a6cf7f7fffp-46,
- 0x1.1b1fa8cbe84a7p-46,
- 0x1.0330f0fd69921p-46,
- 0x1.da81670f96f9bp-47,
- 0x1.b24a16b4d09aap-47,
- 0x1.8d6eeb6efdbd6p-47,
- 0x1.6ba91ac734785p-47,
- 0x1.4cb7966770ab5p-47,
- 0x1.305e9721d0981p-47,
- 0x1.1667311fff70ap-47,
- 0x1.fd3de10d62855p-48,
- 0x1.d1aefbcd48d0cp-48,
- 0x1.a9cc93c25aca9p-48,
- 0x1.85487ee3ea735p-48,
- 0x1.63daf8b4b1e0cp-48,
- 0x1.45421e69a6ca1p-48,
- 0x1.294175802d99ap-48,
- 0x1.0fa17bf41068fp-48,
- 0x1.f05e82aae2bb9p-49,
- 0x1.c578101b29058p-49,
- 0x1.9e39dc5dd2f7cp-49,
- 0x1.7a553a728bbf2p-49,
- 0x1.5982008db1304p-49,
- 0x1.3b7e00422e51bp-49,
- 0x1.200c898d9ee3ep-49,
- 0x1.06f5f7eb65a56p-49,
- 0x1.e00e9148a1d25p-50,
- 0x1.b623734024e92p-50,
- 0x1.8fd4e01891bf8p-50,
- 0x1.6cd44c7470d89p-50,
- 0x1.4cd9c04158cd7p-50,
- 0x1.2fa34bf5c8344p-50,
- 0x1.14f4890ff2461p-50,
- 0x1.f92c49dfa4df5p-51,
- 0x1.ccaaea71ab0dfp-51,
- 0x1.a40829f001197p-51,
- 0x1.7eef13b59e96cp-51,
- 0x1.5d11e1a252bf5p-51,
- 0x1.3e296303b2297p-51,
- 0x1.21f47009f43cep-51,
- 0x1.083768c5e4541p-51,
- 0x1.e1777d831265ep-52,
- 0x1.b69f10b0191b5p-52,
- 0x1.8f8a3a05b5b52p-52,
- 0x1.6be573c40c8e7p-52,
- 0x1.4b645ba991fdbp-52,
- 0x1.2dc119095729fp-52,
- },
-};
diff --git a/sysdeps/aarch64/fpu/sv_erff_data.c b/sysdeps/aarch64/fpu/sv_erff_data.c
deleted file mode 100644
index 6dcd72a..0000000
--- a/sysdeps/aarch64/fpu/sv_erff_data.c
+++ /dev/null
@@ -1,1058 +0,0 @@
-/* Table for SVE erff approximation
-
- Copyright (C) 2024 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include "vecmath_config.h"
-
-/* Lookup table used in SVE erff.
- For each possible rounded input r (multiples of 1/128), between
- r = 0.0 and r = 4.0 (513 values):
- - __erff_data.erf contains the values of erf(r),
- - __erff_data.scale contains the values of 2/sqrt(pi)*exp(-r^2).
- Note that indices 0 and 1 are never hit by the algorithm, since lookup is
- performed only for x >= 1/64-1/512. */
-const struct sv_erff_data __sv_erff_data = {
- .erf = { 0x0.000000p+0,
- 0x1.20dbf4p-7,
- 0x1.20d770p-6,
- 0x1.b137e0p-6,
- 0x1.20c564p-5,
- 0x1.68e5d4p-5,
- 0x1.b0fafep-5,
- 0x1.f902a8p-5,
- 0x1.207d48p-4,
- 0x1.44703ep-4,
- 0x1.68591ap-4,
- 0x1.8c36bep-4,
- 0x1.b00812p-4,
- 0x1.d3cbf8p-4,
- 0x1.f7815ap-4,
- 0x1.0d9390p-3,
- 0x1.1f5e1ap-3,
- 0x1.311fc2p-3,
- 0x1.42d7fcp-3,
- 0x1.548642p-3,
- 0x1.662a0cp-3,
- 0x1.77c2d2p-3,
- 0x1.895010p-3,
- 0x1.9ad142p-3,
- 0x1.ac45e4p-3,
- 0x1.bdad72p-3,
- 0x1.cf076ep-3,
- 0x1.e05354p-3,
- 0x1.f190aap-3,
- 0x1.015f78p-2,
- 0x1.09eed6p-2,
- 0x1.127632p-2,
- 0x1.1af54ep-2,
- 0x1.236bf0p-2,
- 0x1.2bd9dcp-2,
- 0x1.343ed6p-2,
- 0x1.3c9aa8p-2,
- 0x1.44ed18p-2,
- 0x1.4d35f0p-2,
- 0x1.5574f4p-2,
- 0x1.5da9f4p-2,
- 0x1.65d4b8p-2,
- 0x1.6df50ap-2,
- 0x1.760abap-2,
- 0x1.7e1594p-2,
- 0x1.861566p-2,
- 0x1.8e0a02p-2,
- 0x1.95f336p-2,
- 0x1.9dd0d2p-2,
- 0x1.a5a2acp-2,
- 0x1.ad6896p-2,
- 0x1.b52264p-2,
- 0x1.bccfecp-2,
- 0x1.c47104p-2,
- 0x1.cc0584p-2,
- 0x1.d38d44p-2,
- 0x1.db081cp-2,
- 0x1.e275eap-2,
- 0x1.e9d68ap-2,
- 0x1.f129d4p-2,
- 0x1.f86faap-2,
- 0x1.ffa7eap-2,
- 0x1.03693ap-1,
- 0x1.06f794p-1,
- 0x1.0a7ef6p-1,
- 0x1.0dff50p-1,
- 0x1.117894p-1,
- 0x1.14eab4p-1,
- 0x1.1855a6p-1,
- 0x1.1bb95cp-1,
- 0x1.1f15ccp-1,
- 0x1.226ae8p-1,
- 0x1.25b8a8p-1,
- 0x1.28ff02p-1,
- 0x1.2c3decp-1,
- 0x1.2f755cp-1,
- 0x1.32a54cp-1,
- 0x1.35cdb4p-1,
- 0x1.38ee8ap-1,
- 0x1.3c07cap-1,
- 0x1.3f196ep-1,
- 0x1.42236ep-1,
- 0x1.4525c8p-1,
- 0x1.482074p-1,
- 0x1.4b1372p-1,
- 0x1.4dfebap-1,
- 0x1.50e24cp-1,
- 0x1.53be26p-1,
- 0x1.569244p-1,
- 0x1.595ea6p-1,
- 0x1.5c2348p-1,
- 0x1.5ee02ep-1,
- 0x1.619556p-1,
- 0x1.6442c0p-1,
- 0x1.66e86ep-1,
- 0x1.69865ep-1,
- 0x1.6c1c98p-1,
- 0x1.6eab18p-1,
- 0x1.7131e6p-1,
- 0x1.73b102p-1,
- 0x1.762870p-1,
- 0x1.789836p-1,
- 0x1.7b0058p-1,
- 0x1.7d60d8p-1,
- 0x1.7fb9c0p-1,
- 0x1.820b12p-1,
- 0x1.8454d6p-1,
- 0x1.869712p-1,
- 0x1.88d1cep-1,
- 0x1.8b050ep-1,
- 0x1.8d30dep-1,
- 0x1.8f5544p-1,
- 0x1.91724ap-1,
- 0x1.9387f6p-1,
- 0x1.959652p-1,
- 0x1.979d68p-1,
- 0x1.999d42p-1,
- 0x1.9b95e8p-1,
- 0x1.9d8768p-1,
- 0x1.9f71cap-1,
- 0x1.a1551ap-1,
- 0x1.a33162p-1,
- 0x1.a506b0p-1,
- 0x1.a6d50cp-1,
- 0x1.a89c86p-1,
- 0x1.aa5d26p-1,
- 0x1.ac16fcp-1,
- 0x1.adca14p-1,
- 0x1.af767ap-1,
- 0x1.b11c3cp-1,
- 0x1.b2bb68p-1,
- 0x1.b4540ap-1,
- 0x1.b5e630p-1,
- 0x1.b771e8p-1,
- 0x1.b8f742p-1,
- 0x1.ba764ap-1,
- 0x1.bbef10p-1,
- 0x1.bd61a2p-1,
- 0x1.bece0ep-1,
- 0x1.c03464p-1,
- 0x1.c194b2p-1,
- 0x1.c2ef08p-1,
- 0x1.c44376p-1,
- 0x1.c5920ap-1,
- 0x1.c6dad2p-1,
- 0x1.c81de2p-1,
- 0x1.c95b46p-1,
- 0x1.ca930ep-1,
- 0x1.cbc54cp-1,
- 0x1.ccf20cp-1,
- 0x1.ce1962p-1,
- 0x1.cf3b5cp-1,
- 0x1.d0580cp-1,
- 0x1.d16f7ep-1,
- 0x1.d281c4p-1,
- 0x1.d38ef0p-1,
- 0x1.d49710p-1,
- 0x1.d59a34p-1,
- 0x1.d6986cp-1,
- 0x1.d791cap-1,
- 0x1.d8865ep-1,
- 0x1.d97636p-1,
- 0x1.da6162p-1,
- 0x1.db47f4p-1,
- 0x1.dc29fcp-1,
- 0x1.dd0788p-1,
- 0x1.dde0aap-1,
- 0x1.deb570p-1,
- 0x1.df85eap-1,
- 0x1.e0522ap-1,
- 0x1.e11a3ep-1,
- 0x1.e1de36p-1,
- 0x1.e29e22p-1,
- 0x1.e35a12p-1,
- 0x1.e41214p-1,
- 0x1.e4c638p-1,
- 0x1.e5768cp-1,
- 0x1.e62322p-1,
- 0x1.e6cc08p-1,
- 0x1.e7714ap-1,
- 0x1.e812fcp-1,
- 0x1.e8b12ap-1,
- 0x1.e94be4p-1,
- 0x1.e9e336p-1,
- 0x1.ea7730p-1,
- 0x1.eb07e2p-1,
- 0x1.eb9558p-1,
- 0x1.ec1fa2p-1,
- 0x1.eca6ccp-1,
- 0x1.ed2ae6p-1,
- 0x1.edabfcp-1,
- 0x1.ee2a1ep-1,
- 0x1.eea556p-1,
- 0x1.ef1db4p-1,
- 0x1.ef9344p-1,
- 0x1.f00614p-1,
- 0x1.f07630p-1,
- 0x1.f0e3a6p-1,
- 0x1.f14e82p-1,
- 0x1.f1b6d0p-1,
- 0x1.f21ca0p-1,
- 0x1.f27ff8p-1,
- 0x1.f2e0eap-1,
- 0x1.f33f7ep-1,
- 0x1.f39bc2p-1,
- 0x1.f3f5c2p-1,
- 0x1.f44d88p-1,
- 0x1.f4a31ep-1,
- 0x1.f4f694p-1,
- 0x1.f547f2p-1,
- 0x1.f59742p-1,
- 0x1.f5e490p-1,
- 0x1.f62fe8p-1,
- 0x1.f67952p-1,
- 0x1.f6c0dcp-1,
- 0x1.f7068cp-1,
- 0x1.f74a6ep-1,
- 0x1.f78c8cp-1,
- 0x1.f7cceep-1,
- 0x1.f80ba2p-1,
- 0x1.f848acp-1,
- 0x1.f8841ap-1,
- 0x1.f8bdf2p-1,
- 0x1.f8f63ep-1,
- 0x1.f92d08p-1,
- 0x1.f96256p-1,
- 0x1.f99634p-1,
- 0x1.f9c8a8p-1,
- 0x1.f9f9bap-1,
- 0x1.fa2974p-1,
- 0x1.fa57dep-1,
- 0x1.fa84fep-1,
- 0x1.fab0dep-1,
- 0x1.fadb84p-1,
- 0x1.fb04f6p-1,
- 0x1.fb2d40p-1,
- 0x1.fb5464p-1,
- 0x1.fb7a6cp-1,
- 0x1.fb9f60p-1,
- 0x1.fbc344p-1,
- 0x1.fbe61ep-1,
- 0x1.fc07fap-1,
- 0x1.fc28d8p-1,
- 0x1.fc48c2p-1,
- 0x1.fc67bcp-1,
- 0x1.fc85d0p-1,
- 0x1.fca2fep-1,
- 0x1.fcbf52p-1,
- 0x1.fcdaccp-1,
- 0x1.fcf576p-1,
- 0x1.fd0f54p-1,
- 0x1.fd286ap-1,
- 0x1.fd40bep-1,
- 0x1.fd5856p-1,
- 0x1.fd6f34p-1,
- 0x1.fd8562p-1,
- 0x1.fd9ae2p-1,
- 0x1.fdafb8p-1,
- 0x1.fdc3e8p-1,
- 0x1.fdd77ap-1,
- 0x1.fdea6ep-1,
- 0x1.fdfcccp-1,
- 0x1.fe0e96p-1,
- 0x1.fe1fd0p-1,
- 0x1.fe3080p-1,
- 0x1.fe40a6p-1,
- 0x1.fe504cp-1,
- 0x1.fe5f70p-1,
- 0x1.fe6e18p-1,
- 0x1.fe7c46p-1,
- 0x1.fe8a00p-1,
- 0x1.fe9748p-1,
- 0x1.fea422p-1,
- 0x1.feb090p-1,
- 0x1.febc96p-1,
- 0x1.fec836p-1,
- 0x1.fed374p-1,
- 0x1.fede52p-1,
- 0x1.fee8d4p-1,
- 0x1.fef2fep-1,
- 0x1.fefccep-1,
- 0x1.ff064cp-1,
- 0x1.ff0f76p-1,
- 0x1.ff1852p-1,
- 0x1.ff20e0p-1,
- 0x1.ff2924p-1,
- 0x1.ff3120p-1,
- 0x1.ff38d6p-1,
- 0x1.ff4048p-1,
- 0x1.ff4778p-1,
- 0x1.ff4e68p-1,
- 0x1.ff551ap-1,
- 0x1.ff5b90p-1,
- 0x1.ff61ccp-1,
- 0x1.ff67d0p-1,
- 0x1.ff6d9ep-1,
- 0x1.ff7338p-1,
- 0x1.ff789ep-1,
- 0x1.ff7dd4p-1,
- 0x1.ff82dap-1,
- 0x1.ff87b2p-1,
- 0x1.ff8c5cp-1,
- 0x1.ff90dcp-1,
- 0x1.ff9532p-1,
- 0x1.ff9960p-1,
- 0x1.ff9d68p-1,
- 0x1.ffa14ap-1,
- 0x1.ffa506p-1,
- 0x1.ffa8a0p-1,
- 0x1.ffac18p-1,
- 0x1.ffaf6ep-1,
- 0x1.ffb2a6p-1,
- 0x1.ffb5bep-1,
- 0x1.ffb8b8p-1,
- 0x1.ffbb98p-1,
- 0x1.ffbe5ap-1,
- 0x1.ffc102p-1,
- 0x1.ffc390p-1,
- 0x1.ffc606p-1,
- 0x1.ffc862p-1,
- 0x1.ffcaa8p-1,
- 0x1.ffccd8p-1,
- 0x1.ffcef4p-1,
- 0x1.ffd0fap-1,
- 0x1.ffd2eap-1,
- 0x1.ffd4cap-1,
- 0x1.ffd696p-1,
- 0x1.ffd84ep-1,
- 0x1.ffd9f8p-1,
- 0x1.ffdb90p-1,
- 0x1.ffdd18p-1,
- 0x1.ffde90p-1,
- 0x1.ffdffap-1,
- 0x1.ffe154p-1,
- 0x1.ffe2a2p-1,
- 0x1.ffe3e2p-1,
- 0x1.ffe514p-1,
- 0x1.ffe63cp-1,
- 0x1.ffe756p-1,
- 0x1.ffe866p-1,
- 0x1.ffe96ap-1,
- 0x1.ffea64p-1,
- 0x1.ffeb54p-1,
- 0x1.ffec3ap-1,
- 0x1.ffed16p-1,
- 0x1.ffedeap-1,
- 0x1.ffeeb4p-1,
- 0x1.ffef76p-1,
- 0x1.fff032p-1,
- 0x1.fff0e4p-1,
- 0x1.fff18ep-1,
- 0x1.fff232p-1,
- 0x1.fff2d0p-1,
- 0x1.fff366p-1,
- 0x1.fff3f6p-1,
- 0x1.fff480p-1,
- 0x1.fff504p-1,
- 0x1.fff582p-1,
- 0x1.fff5fcp-1,
- 0x1.fff670p-1,
- 0x1.fff6dep-1,
- 0x1.fff74ap-1,
- 0x1.fff7aep-1,
- 0x1.fff810p-1,
- 0x1.fff86cp-1,
- 0x1.fff8c6p-1,
- 0x1.fff91cp-1,
- 0x1.fff96cp-1,
- 0x1.fff9bap-1,
- 0x1.fffa04p-1,
- 0x1.fffa4cp-1,
- 0x1.fffa90p-1,
- 0x1.fffad0p-1,
- 0x1.fffb0ep-1,
- 0x1.fffb4ap-1,
- 0x1.fffb82p-1,
- 0x1.fffbb8p-1,
- 0x1.fffbecp-1,
- 0x1.fffc1ep-1,
- 0x1.fffc4ep-1,
- 0x1.fffc7ap-1,
- 0x1.fffca6p-1,
- 0x1.fffccep-1,
- 0x1.fffcf6p-1,
- 0x1.fffd1ap-1,
- 0x1.fffd3ep-1,
- 0x1.fffd60p-1,
- 0x1.fffd80p-1,
- 0x1.fffda0p-1,
- 0x1.fffdbep-1,
- 0x1.fffddap-1,
- 0x1.fffdf4p-1,
- 0x1.fffe0ep-1,
- 0x1.fffe26p-1,
- 0x1.fffe3ep-1,
- 0x1.fffe54p-1,
- 0x1.fffe68p-1,
- 0x1.fffe7ep-1,
- 0x1.fffe90p-1,
- 0x1.fffea2p-1,
- 0x1.fffeb4p-1,
- 0x1.fffec4p-1,
- 0x1.fffed4p-1,
- 0x1.fffee4p-1,
- 0x1.fffef2p-1,
- 0x1.ffff00p-1,
- 0x1.ffff0cp-1,
- 0x1.ffff18p-1,
- 0x1.ffff24p-1,
- 0x1.ffff30p-1,
- 0x1.ffff3ap-1,
- 0x1.ffff44p-1,
- 0x1.ffff4ep-1,
- 0x1.ffff56p-1,
- 0x1.ffff60p-1,
- 0x1.ffff68p-1,
- 0x1.ffff70p-1,
- 0x1.ffff78p-1,
- 0x1.ffff7ep-1,
- 0x1.ffff84p-1,
- 0x1.ffff8cp-1,
- 0x1.ffff92p-1,
- 0x1.ffff98p-1,
- 0x1.ffff9cp-1,
- 0x1.ffffa2p-1,
- 0x1.ffffa6p-1,
- 0x1.ffffacp-1,
- 0x1.ffffb0p-1,
- 0x1.ffffb4p-1,
- 0x1.ffffb8p-1,
- 0x1.ffffbcp-1,
- 0x1.ffffc0p-1,
- 0x1.ffffc4p-1,
- 0x1.ffffc6p-1,
- 0x1.ffffcap-1,
- 0x1.ffffccp-1,
- 0x1.ffffd0p-1,
- 0x1.ffffd2p-1,
- 0x1.ffffd4p-1,
- 0x1.ffffd6p-1,
- 0x1.ffffd8p-1,
- 0x1.ffffdcp-1,
- 0x1.ffffdep-1,
- 0x1.ffffdep-1,
- 0x1.ffffe0p-1,
- 0x1.ffffe2p-1,
- 0x1.ffffe4p-1,
- 0x1.ffffe6p-1,
- 0x1.ffffe8p-1,
- 0x1.ffffe8p-1,
- 0x1.ffffeap-1,
- 0x1.ffffeap-1,
- 0x1.ffffecp-1,
- 0x1.ffffeep-1,
- 0x1.ffffeep-1,
- 0x1.fffff0p-1,
- 0x1.fffff0p-1,
- 0x1.fffff2p-1,
- 0x1.fffff2p-1,
- 0x1.fffff2p-1,
- 0x1.fffff4p-1,
- 0x1.fffff4p-1,
- 0x1.fffff4p-1,
- 0x1.fffff6p-1,
- 0x1.fffff6p-1,
- 0x1.fffff6p-1,
- 0x1.fffff8p-1,
- 0x1.fffff8p-1,
- 0x1.fffff8p-1,
- 0x1.fffff8p-1,
- 0x1.fffffap-1,
- 0x1.fffffap-1,
- 0x1.fffffap-1,
- 0x1.fffffap-1,
- 0x1.fffffap-1,
- 0x1.fffffap-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffcp-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.fffffep-1,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- 0x1.000000p+0,
- },
- .scale = { 0x1.20dd76p+0,
- 0x1.20d8f2p+0,
- 0x1.20cb68p+0,
- 0x1.20b4d8p+0,
- 0x1.209546p+0,
- 0x1.206cb4p+0,
- 0x1.203b26p+0,
- 0x1.2000a0p+0,
- 0x1.1fbd28p+0,
- 0x1.1f70c4p+0,
- 0x1.1f1b7ap+0,
- 0x1.1ebd56p+0,
- 0x1.1e565cp+0,
- 0x1.1de698p+0,
- 0x1.1d6e14p+0,
- 0x1.1cecdcp+0,
- 0x1.1c62fap+0,
- 0x1.1bd07cp+0,
- 0x1.1b3572p+0,
- 0x1.1a91e6p+0,
- 0x1.19e5eap+0,
- 0x1.19318cp+0,
- 0x1.1874dep+0,
- 0x1.17aff0p+0,
- 0x1.16e2d8p+0,
- 0x1.160da4p+0,
- 0x1.153068p+0,
- 0x1.144b3cp+0,
- 0x1.135e30p+0,
- 0x1.12695ep+0,
- 0x1.116cd8p+0,
- 0x1.1068bap+0,
- 0x1.0f5d16p+0,
- 0x1.0e4a08p+0,
- 0x1.0d2fa6p+0,
- 0x1.0c0e0ap+0,
- 0x1.0ae550p+0,
- 0x1.09b590p+0,
- 0x1.087ee4p+0,
- 0x1.07416cp+0,
- 0x1.05fd3ep+0,
- 0x1.04b27cp+0,
- 0x1.036140p+0,
- 0x1.0209a6p+0,
- 0x1.00abd0p+0,
- 0x1.fe8fb0p-1,
- 0x1.fbbbbep-1,
- 0x1.f8dc0ap-1,
- 0x1.f5f0cep-1,
- 0x1.f2fa4cp-1,
- 0x1.eff8c4p-1,
- 0x1.ecec78p-1,
- 0x1.e9d5a8p-1,
- 0x1.e6b498p-1,
- 0x1.e38988p-1,
- 0x1.e054bep-1,
- 0x1.dd167cp-1,
- 0x1.d9cf06p-1,
- 0x1.d67ea2p-1,
- 0x1.d32592p-1,
- 0x1.cfc41ep-1,
- 0x1.cc5a8ap-1,
- 0x1.c8e91cp-1,
- 0x1.c5701ap-1,
- 0x1.c1efcap-1,
- 0x1.be6872p-1,
- 0x1.bada5ap-1,
- 0x1.b745c6p-1,
- 0x1.b3aafcp-1,
- 0x1.b00a46p-1,
- 0x1.ac63e8p-1,
- 0x1.a8b828p-1,
- 0x1.a5074ep-1,
- 0x1.a1519ep-1,
- 0x1.9d9762p-1,
- 0x1.99d8dap-1,
- 0x1.961650p-1,
- 0x1.925008p-1,
- 0x1.8e8646p-1,
- 0x1.8ab950p-1,
- 0x1.86e96ap-1,
- 0x1.8316d6p-1,
- 0x1.7f41dcp-1,
- 0x1.7b6abcp-1,
- 0x1.7791b8p-1,
- 0x1.73b714p-1,
- 0x1.6fdb12p-1,
- 0x1.6bfdf0p-1,
- 0x1.681ff2p-1,
- 0x1.644156p-1,
- 0x1.60625cp-1,
- 0x1.5c8342p-1,
- 0x1.58a446p-1,
- 0x1.54c5a6p-1,
- 0x1.50e79ep-1,
- 0x1.4d0a68p-1,
- 0x1.492e42p-1,
- 0x1.455366p-1,
- 0x1.417a0cp-1,
- 0x1.3da26ep-1,
- 0x1.39ccc2p-1,
- 0x1.35f940p-1,
- 0x1.32281ep-1,
- 0x1.2e5992p-1,
- 0x1.2a8dcep-1,
- 0x1.26c508p-1,
- 0x1.22ff72p-1,
- 0x1.1f3d3cp-1,
- 0x1.1b7e98p-1,
- 0x1.17c3b6p-1,
- 0x1.140cc4p-1,
- 0x1.1059eep-1,
- 0x1.0cab62p-1,
- 0x1.09014cp-1,
- 0x1.055bd6p-1,
- 0x1.01bb2cp-1,
- 0x1.fc3ee6p-2,
- 0x1.f511aap-2,
- 0x1.edeeeep-2,
- 0x1.e6d700p-2,
- 0x1.dfca26p-2,
- 0x1.d8c8aap-2,
- 0x1.d1d2d0p-2,
- 0x1.cae8dap-2,
- 0x1.c40b08p-2,
- 0x1.bd3998p-2,
- 0x1.b674c8p-2,
- 0x1.afbcd4p-2,
- 0x1.a911f0p-2,
- 0x1.a27456p-2,
- 0x1.9be438p-2,
- 0x1.9561c8p-2,
- 0x1.8eed36p-2,
- 0x1.8886b2p-2,
- 0x1.822e66p-2,
- 0x1.7be47ap-2,
- 0x1.75a91ap-2,
- 0x1.6f7c6ap-2,
- 0x1.695e8cp-2,
- 0x1.634fa6p-2,
- 0x1.5d4fd4p-2,
- 0x1.575f34p-2,
- 0x1.517de6p-2,
- 0x1.4bac00p-2,
- 0x1.45e99cp-2,
- 0x1.4036d0p-2,
- 0x1.3a93b2p-2,
- 0x1.350052p-2,
- 0x1.2f7cc4p-2,
- 0x1.2a0916p-2,
- 0x1.24a554p-2,
- 0x1.1f518ap-2,
- 0x1.1a0dc6p-2,
- 0x1.14da0ap-2,
- 0x1.0fb662p-2,
- 0x1.0aa2d0p-2,
- 0x1.059f5ap-2,
- 0x1.00ac00p-2,
- 0x1.f79184p-3,
- 0x1.edeb40p-3,
- 0x1.e46530p-3,
- 0x1.daff4ap-3,
- 0x1.d1b982p-3,
- 0x1.c893cep-3,
- 0x1.bf8e1cp-3,
- 0x1.b6a856p-3,
- 0x1.ade26cp-3,
- 0x1.a53c42p-3,
- 0x1.9cb5bep-3,
- 0x1.944ec2p-3,
- 0x1.8c0732p-3,
- 0x1.83deeap-3,
- 0x1.7bd5c8p-3,
- 0x1.73eba4p-3,
- 0x1.6c2056p-3,
- 0x1.6473b6p-3,
- 0x1.5ce596p-3,
- 0x1.5575c8p-3,
- 0x1.4e241ep-3,
- 0x1.46f066p-3,
- 0x1.3fda6cp-3,
- 0x1.38e1fap-3,
- 0x1.3206dcp-3,
- 0x1.2b48dap-3,
- 0x1.24a7b8p-3,
- 0x1.1e233ep-3,
- 0x1.17bb2cp-3,
- 0x1.116f48p-3,
- 0x1.0b3f52p-3,
- 0x1.052b0cp-3,
- 0x1.fe6460p-4,
- 0x1.f2a902p-4,
- 0x1.e72372p-4,
- 0x1.dbd32ap-4,
- 0x1.d0b7a0p-4,
- 0x1.c5d04ap-4,
- 0x1.bb1c98p-4,
- 0x1.b09bfcp-4,
- 0x1.a64de6p-4,
- 0x1.9c31c6p-4,
- 0x1.92470ap-4,
- 0x1.888d1ep-4,
- 0x1.7f036cp-4,
- 0x1.75a960p-4,
- 0x1.6c7e64p-4,
- 0x1.6381e2p-4,
- 0x1.5ab342p-4,
- 0x1.5211ecp-4,
- 0x1.499d48p-4,
- 0x1.4154bcp-4,
- 0x1.3937b2p-4,
- 0x1.31458ep-4,
- 0x1.297dbap-4,
- 0x1.21df9ap-4,
- 0x1.1a6a96p-4,
- 0x1.131e14p-4,
- 0x1.0bf97ep-4,
- 0x1.04fc3ap-4,
- 0x1.fc4b5ep-5,
- 0x1.eeea8cp-5,
- 0x1.e1d4d0p-5,
- 0x1.d508fap-5,
- 0x1.c885e0p-5,
- 0x1.bc4a54p-5,
- 0x1.b05530p-5,
- 0x1.a4a54ap-5,
- 0x1.99397ap-5,
- 0x1.8e109cp-5,
- 0x1.83298ep-5,
- 0x1.78832cp-5,
- 0x1.6e1c58p-5,
- 0x1.63f3f6p-5,
- 0x1.5a08e8p-5,
- 0x1.505a18p-5,
- 0x1.46e66cp-5,
- 0x1.3dacd2p-5,
- 0x1.34ac36p-5,
- 0x1.2be38cp-5,
- 0x1.2351c2p-5,
- 0x1.1af5d2p-5,
- 0x1.12ceb4p-5,
- 0x1.0adb60p-5,
- 0x1.031ad6p-5,
- 0x1.f7182ap-6,
- 0x1.e85c44p-6,
- 0x1.da0006p-6,
- 0x1.cc0180p-6,
- 0x1.be5ecep-6,
- 0x1.b1160ap-6,
- 0x1.a4255ap-6,
- 0x1.978ae8p-6,
- 0x1.8b44e6p-6,
- 0x1.7f5188p-6,
- 0x1.73af0cp-6,
- 0x1.685bb6p-6,
- 0x1.5d55ccp-6,
- 0x1.529b9ep-6,
- 0x1.482b84p-6,
- 0x1.3e03d8p-6,
- 0x1.3422fep-6,
- 0x1.2a875cp-6,
- 0x1.212f62p-6,
- 0x1.181984p-6,
- 0x1.0f443ep-6,
- 0x1.06ae14p-6,
- 0x1.fcab14p-7,
- 0x1.ec7262p-7,
- 0x1.dcaf36p-7,
- 0x1.cd5ecap-7,
- 0x1.be7e5ap-7,
- 0x1.b00b38p-7,
- 0x1.a202bep-7,
- 0x1.94624ep-7,
- 0x1.87275ep-7,
- 0x1.7a4f6ap-7,
- 0x1.6dd7fep-7,
- 0x1.61beaep-7,
- 0x1.56011cp-7,
- 0x1.4a9cf6p-7,
- 0x1.3f8ff6p-7,
- 0x1.34d7dcp-7,
- 0x1.2a727ap-7,
- 0x1.205dacp-7,
- 0x1.169756p-7,
- 0x1.0d1d6ap-7,
- 0x1.03ede2p-7,
- 0x1.f60d8ap-8,
- 0x1.e4cc4ap-8,
- 0x1.d4143ap-8,
- 0x1.c3e1a6p-8,
- 0x1.b430ecp-8,
- 0x1.a4fe84p-8,
- 0x1.9646f4p-8,
- 0x1.8806d8p-8,
- 0x1.7a3adep-8,
- 0x1.6cdfccp-8,
- 0x1.5ff276p-8,
- 0x1.536fc2p-8,
- 0x1.4754acp-8,
- 0x1.3b9e40p-8,
- 0x1.30499cp-8,
- 0x1.2553eep-8,
- 0x1.1aba78p-8,
- 0x1.107a8cp-8,
- 0x1.06918cp-8,
- 0x1.f9f9d0p-9,
- 0x1.e77448p-9,
- 0x1.d58da6p-9,
- 0x1.c4412cp-9,
- 0x1.b38a3ap-9,
- 0x1.a36454p-9,
- 0x1.93cb12p-9,
- 0x1.84ba30p-9,
- 0x1.762d84p-9,
- 0x1.682100p-9,
- 0x1.5a90b0p-9,
- 0x1.4d78bcp-9,
- 0x1.40d564p-9,
- 0x1.34a306p-9,
- 0x1.28de12p-9,
- 0x1.1d8318p-9,
- 0x1.128ebap-9,
- 0x1.07fdb4p-9,
- 0x1.fb99b8p-10,
- 0x1.e7f232p-10,
- 0x1.d4fed8p-10,
- 0x1.c2b9d0p-10,
- 0x1.b11d70p-10,
- 0x1.a02436p-10,
- 0x1.8fc8c8p-10,
- 0x1.8005f0p-10,
- 0x1.70d6a4p-10,
- 0x1.6235fcp-10,
- 0x1.541f34p-10,
- 0x1.468daep-10,
- 0x1.397ceep-10,
- 0x1.2ce898p-10,
- 0x1.20cc76p-10,
- 0x1.15246ep-10,
- 0x1.09ec86p-10,
- 0x1.fe41cep-11,
- 0x1.e97ba4p-11,
- 0x1.d57f52p-11,
- 0x1.c245d4p-11,
- 0x1.afc85ep-11,
- 0x1.9e0058p-11,
- 0x1.8ce75ep-11,
- 0x1.7c7744p-11,
- 0x1.6caa0ep-11,
- 0x1.5d79ecp-11,
- 0x1.4ee142p-11,
- 0x1.40daa4p-11,
- 0x1.3360ccp-11,
- 0x1.266ea8p-11,
- 0x1.19ff46p-11,
- 0x1.0e0de8p-11,
- 0x1.0295f0p-11,
- 0x1.ef25d4p-12,
- 0x1.da0110p-12,
- 0x1.c5b542p-12,
- 0x1.b23a5ap-12,
- 0x1.9f8894p-12,
- 0x1.8d986ap-12,
- 0x1.7c629ap-12,
- 0x1.6be022p-12,
- 0x1.5c0a38p-12,
- 0x1.4cda54p-12,
- 0x1.3e4a24p-12,
- 0x1.305390p-12,
- 0x1.22f0b4p-12,
- 0x1.161be4p-12,
- 0x1.09cfa4p-12,
- 0x1.fc0d56p-13,
- 0x1.e577bcp-13,
- 0x1.cfd4a6p-13,
- 0x1.bb1a96p-13,
- 0x1.a74068p-13,
- 0x1.943d4ap-13,
- 0x1.8208bcp-13,
- 0x1.709a8ep-13,
- 0x1.5feadap-13,
- 0x1.4ff208p-13,
- 0x1.40a8c2p-13,
- 0x1.3207fcp-13,
- 0x1.2408eap-13,
- 0x1.16a502p-13,
- 0x1.09d5f8p-13,
- 0x1.fb2b7ap-14,
- 0x1.e3bcf4p-14,
- 0x1.cd5528p-14,
- 0x1.b7e946p-14,
- 0x1.a36eecp-14,
- 0x1.8fdc1cp-14,
- 0x1.7d2738p-14,
- 0x1.6b4702p-14,
- 0x1.5a329cp-14,
- 0x1.49e178p-14,
- 0x1.3a4b60p-14,
- 0x1.2b6876p-14,
- 0x1.1d3120p-14,
- 0x1.0f9e1cp-14,
- 0x1.02a868p-14,
- 0x1.ec929ap-15,
- 0x1.d4f4b4p-15,
- 0x1.be6abcp-15,
- 0x1.a8e8ccp-15,
- 0x1.94637ep-15,
- 0x1.80cfdcp-15,
- 0x1.6e2368p-15,
- 0x1.5c540cp-15,
- 0x1.4b581cp-15,
- 0x1.3b2652p-15,
- 0x1.2bb5ccp-15,
- 0x1.1cfe02p-15,
- 0x1.0ef6c4p-15,
- 0x1.019842p-15,
- 0x1.e9b5e8p-16,
- 0x1.d16f58p-16,
- 0x1.ba4f04p-16,
- 0x1.a447b8p-16,
- 0x1.8f4cccp-16,
- 0x1.7b5224p-16,
- 0x1.684c22p-16,
- 0x1.562facp-16,
- 0x1.44f21ep-16,
- 0x1.34894ap-16,
- 0x1.24eb72p-16,
- 0x1.160f44p-16,
- 0x1.07ebd2p-16,
- 0x1.f4f12ep-17,
- 0x1.db5ad0p-17,
- 0x1.c304f0p-17,
- 0x1.abe09ep-17,
- 0x1.95df98p-17,
- 0x1.80f43ap-17,
- 0x1.6d1178p-17,
- 0x1.5a2ae0p-17,
- 0x1.483488p-17,
- 0x1.372310p-17,
- 0x1.26eb9ep-17,
- 0x1.1783cep-17,
- 0x1.08e1bap-17,
- 0x1.f5f7d8p-18,
- 0x1.db92b6p-18,
- 0x1.c282cep-18,
- 0x1.aab7acp-18,
- 0x1.94219cp-18,
- 0x1.7eb1a2p-18,
- 0x1.6a5972p-18,
- 0x1.570b6ap-18,
- 0x1.44ba86p-18,
- 0x1.335a62p-18,
- 0x1.22df2ap-18,
- 0x1.133d96p-18,
- 0x1.046aeap-18,
- 0x1.ecb9d0p-19,
- 0x1.d21398p-19,
- 0x1.b8d094p-19,
- 0x1.a0df10p-19,
- 0x1.8a2e26p-19,
- 0x1.74adc8p-19,
- 0x1.604ea8p-19,
- 0x1.4d0232p-19,
- 0x1.3aba86p-19,
- 0x1.296a70p-19,
- 0x1.190562p-19,
- 0x1.097f62p-19,
- 0x1.f59a20p-20,
- 0x1.d9c736p-20,
- 0x1.bf716cp-20,
- 0x1.a6852cp-20,
- 0x1.8eefd8p-20,
- 0x1.789fb8p-20,
- 0x1.6383f8p-20,
- 0x1.4f8c96p-20,
- 0x1.3caa62p-20,
- 0x1.2acee2p-20,
- 0x1.19ec60p-20,
- 0x1.09f5d0p-20,
- 0x1.f5bd96p-21,
- 0x1.d9371ep-21,
- 0x1.be41dep-21,
- 0x1.a4c89ep-21,
- 0x1.8cb738p-21,
- 0x1.75fa8ep-21,
- 0x1.608078p-21,
- 0x1.4c37c0p-21,
- 0x1.39100ep-21,
- 0x1.26f9e0p-21,
- 0x1.15e682p-21,
- 0x1.05c804p-21,
- 0x1.ed2254p-22,
- 0x1.d06ad6p-22,
- 0x1.b551c8p-22,
- 0x1.9bc0a0p-22,
- 0x1.83a200p-22,
- 0x1.6ce1aap-22,
- 0x1.576c72p-22,
- 0x1.43302cp-22,
- 0x1.301ba2p-22,
- 0x1.1e1e86p-22,
- 0x1.0d2966p-22,
- 0x1.fa5b50p-23,
- 0x1.dc3ae4p-23,
- 0x1.bfd756p-23,
- 0x1.a517dap-23,
- 0x1.8be4f8p-23,
- 0x1.74287ep-23,
- 0x1.5dcd66p-23,
- 0x1.48bfd4p-23,
- 0x1.34ecf8p-23,
- 0x1.224310p-23,
- 0x1.10b148p-23,
- },
-};
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
index 23963b5..6166df6 100644
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -24,19 +24,20 @@
struct sv_expf_data
{
- float poly[5];
- float inv_ln2, ln2_hi, ln2_lo, shift;
+ float c1, c3, inv_ln2;
+ float ln2_lo, c0, c2, c4;
+ float ln2_hi, shift;
};
/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
compatibility with polynomial helpers. Shift is 1.5*2^17 + 127. */
#define SV_EXPF_DATA \
{ \
- .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f, \
- 0x1.0e4020p-7f }, \
- \
- .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f, \
- .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f, \
+ /* Coefficients copied from the polynomial in AdvSIMD variant. */ \
+ .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f, \
+ .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ .shift = 0x1.803f8p17f, \
}
#define C(i) sv_f32 (d->poly[i])
@@ -47,26 +48,25 @@ expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- /* Load some constants in quad-word chunks to minimise memory access. */
- svfloat32_t c4_invln2_and_ln2 = svld1rq (svptrue_b32 (), &d->poly[4]);
+ svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo);
/* n = round(x/(ln2/N)). */
- svfloat32_t z = svmla_lane (sv_f32 (d->shift), x, c4_invln2_and_ln2, 1);
+ svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift);
svfloat32_t n = svsub_x (pg, z, d->shift);
/* r = x - n*ln2/N. */
- svfloat32_t r = svmls_lane (x, n, c4_invln2_and_ln2, 2);
- r = svmls_lane (r, n, c4_invln2_and_ln2, 3);
+ svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x);
+ r = svmls_lane (r, n, lane_consts, 0);
/* scale = 2^(n/N). */
- svfloat32_t scale = svexpa (svreinterpret_u32_f32 (z));
+ svfloat32_t scale = svexpa (svreinterpret_u32 (z));
/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
- svfloat32_t p12 = svmla_x (pg, C (1), C (2), r);
- svfloat32_t p34 = svmla_lane (C (3), r, c4_invln2_and_ln2, 0);
- svfloat32_t r2 = svmul_f32_x (pg, r, r);
+ svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
+ svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+ svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
- svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
+ svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
svfloat32_t poly = svmla_x (pg, p0, r2, p14);
return svmla_x (pg, scale, scale, poly);
diff --git a/sysdeps/aarch64/fpu/tanhf_advsimd.c b/sysdeps/aarch64/fpu/tanhf_advsimd.c
index 50defd6..3ced9b7 100644
--- a/sysdeps/aarch64/fpu/tanhf_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanhf_advsimd.c
@@ -28,13 +28,16 @@ static const struct data
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
.boring_bound = V4 (0x41102cb3),
.large_bound = V4 (0x7f800000),
- .onef = V4 (0x3f800000),
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
+ float32x4_t q, uint32x4_t special)
{
- return v_call_f32 (tanhf, x, y, special);
+ return v_call_f32 (
+ tanhf, x,
+ vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
+ special);
}
/* Approximation for single-precision vector tanh(x), using a simplified
@@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t sign = veorq_u32 (ix, iax);
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
- float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
+ /* expm1 exponent bias is 1.0f reinterpreted to int. */
+ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
+ sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered properly, set all special and boring
@@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
- float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix),
- vbslq_f32 (is_boring, boring, y), special);
+ return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
+ special);
+
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
return vbslq_f32 (is_boring, boring, y);
}
libmvec_hidden_def (V_NAME_F1 (tanh))
diff --git a/sysdeps/aarch64/fpu/v_expm1f_inline.h b/sysdeps/aarch64/fpu/v_expm1f_inline.h
index 59b552d..1daedfd 100644
--- a/sysdeps/aarch64/fpu/v_expm1f_inline.h
+++ b/sysdeps/aarch64/fpu/v_expm1f_inline.h
@@ -21,48 +21,47 @@
#define AARCH64_FPU_V_EXPM1F_INLINE_H
#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "math_config.h"
struct v_expm1f_data
{
- float32x4_t poly[5];
- float invln2_and_ln2[4];
- float32x4_t shift;
+ float32x4_t c0, c2;
int32x4_t exponent_bias;
+ float c1, c3, inv_ln2, c4;
+ float ln2_hi, ln2_lo;
};
/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
- log(2)/2]. Exponent bias is asuint(1.0f).
- invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */
+ log(2)/2]. Exponent bias is asuint(1.0f). */
#define V_EXPM1F_DATA \
{ \
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
- .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
+ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
}
static inline float32x4_t
expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
{
- /* Helper routine for calculating exp(x) - 1.
- Copied from v_expm1f_1u6.c, with all special-case handling removed - the
- calling routine should handle special values if required. */
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
+ float32x4_t lane_consts = vld1q_f32 (&d->c1);
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- float32x4_t j
- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
+ float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
+ float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
+ f = vfmsq_lane_f32 (f, j, ln2, 1);
- /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
- Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
- Horner. */
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
float32x4_t f2 = vmulq_f32 (f, f);
float32x4_t f4 = vmulq_f32 (f2, f2);
- float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
+ float32x4_t p = vfmaq_f32 (p01, f2, p23);
+ p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
p = vfmaq_f32 (f, f2, p);
/* t = 2^i. */
diff --git a/sysdeps/aarch64/fpu/v_log1pf_inline.h b/sysdeps/aarch64/fpu/v_log1pf_inline.h
index 643a6cd..73e45a9 100644
--- a/sysdeps/aarch64/fpu/v_log1pf_inline.h
+++ b/sysdeps/aarch64/fpu/v_log1pf_inline.h
@@ -25,54 +25,81 @@
struct v_log1pf_data
{
- float32x4_t poly[8], ln2;
uint32x4_t four;
int32x4_t three_quarters;
+ float c0, c3, c5, c7;
+ float32x4_t c4, c6, c1, c2, ln2;
};
/* Polynomial generated using FPMinimax in [-0.25, 0.5]. First two coefficients
(1, -0.5) are not stored as they can be generated more efficiently. */
#define V_LOG1PF_CONSTANTS_TABLE \
{ \
- .poly \
- = { V4 (0x1.5555aap-2f), V4 (-0x1.000038p-2f), V4 (0x1.99675cp-3f), \
- V4 (-0x1.54ef78p-3f), V4 (0x1.28a1f4p-3f), V4 (-0x1.0da91p-3f), \
- V4 (0x1.abcb6p-4f), V4 (-0x1.6f0d5ep-5f) }, \
- .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
- .three_quarters = V4 (0x3f400000) \
+ .c0 = 0x1.5555aap-2f, .c1 = V4 (-0x1.000038p-2f), \
+ .c2 = V4 (0x1.99675cp-3f), .c3 = -0x1.54ef78p-3f, \
+ .c4 = V4 (0x1.28a1f4p-3f), .c5 = -0x1.0da91p-3f, \
+ .c6 = V4 (0x1.abcb6p-4f), .c7 = -0x1.6f0d5ep-5f, \
+ .ln2 = V4 (0x1.62e43p-1f), .four = V4 (0x40800000), \
+ .three_quarters = V4 (0x3f400000) \
}
static inline float32x4_t
-eval_poly (float32x4_t m, const float32x4_t *c)
+eval_poly (float32x4_t m, const struct v_log1pf_data *d)
{
- /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner (main routine
- uses split Estrin, but this way reduces register pressure in the calling
- routine). */
- float32x4_t q = vfmaq_f32 (v_f32 (-0.5), m, c[0]);
+ /* Approximate log(1+m) on [-0.25, 0.5] using pairwise Horner. */
+ float32x4_t c0357 = vld1q_f32 (&d->c0);
+ float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0);
float32x4_t m2 = vmulq_f32 (m, m);
- q = vfmaq_f32 (m, m2, q);
- float32x4_t p = v_pw_horner_6_f32 (m, m2, c + 1);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1);
+ float32x4_t p = vfmaq_f32 (p45, m2, p67);
+ p = vfmaq_f32 (p23, m2, p);
+ p = vfmaq_f32 (d->c1, m, p);
p = vmulq_f32 (m2, p);
- return vfmaq_f32 (q, m2, p);
+ p = vfmaq_f32 (m, m2, p);
+ return vfmaq_f32 (p, m2, q);
}
static inline float32x4_t
-log1pf_inline (float32x4_t x, const struct v_log1pf_data d)
+log1pf_inline (float32x4_t x, const struct v_log1pf_data *d)
{
- /* Helper for calculating log(x + 1). Copied from log1pf_2u1.c, with no
- special-case handling. See that file for details of the algorithm. */
+ /* Helper for calculating log(x + 1). */
+
+ /* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
+ is in [-0.25, 0.5]):
+ log1p(x) = log(t) + log(2^k) = log1p(m) + k*log(2).
+
+ We approximate log1p(m) with a polynomial, then add k*log(2).
+ Instead of scaling by 2^-k directly, we use an intermediate
+ scale factor s = 4 * 2^-k to ensure the scale is representable
+ as a normalised fp32 number. */
float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
+
+ /* Choose k to scale x to the range [-1/4, 1/2]. */
int32x4_t k
- = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d.three_quarters),
+ = vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
v_s32 (0xff800000));
uint32x4_t ku = vreinterpretq_u32_s32 (k);
- float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d.four, ku));
+
+ /* Scale up to ensure that the scale factor is representable as a normalised
+ fp32 number, and scale m down accordingly. */
+ float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
+
+ /* Scale x by exponent manipulation. */
float32x4_t m_scale
= vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
m_scale = vaddq_f32 (m_scale, vfmaq_f32 (v_f32 (-1.0f), v_f32 (0.25f), s));
- float32x4_t p = eval_poly (m_scale, d.poly);
+
+ /* Evaluate polynomial on the reduced interval. */
+ float32x4_t p = eval_poly (m_scale, d);
+
+ /* The scale factor to be applied back at the end - by multiplying float(k)
+ by 2^-23 we get the unbiased exponent of k. */
float32x4_t scale_back = vmulq_f32 (vcvtq_f32_s32 (k), v_f32 (0x1.0p-23f));
- return vfmaq_f32 (p, scale_back, d.ln2);
+
+ /* Apply the scaling back. */
+ return vfmaq_f32 (p, scale_back, d->ln2);
}
#endif
diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
index 7f0a8aa..862eefa 100644
--- a/sysdeps/aarch64/fpu/vecmath_config.h
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@@ -75,49 +75,37 @@ extern const struct v_log10_data
} table[1 << V_LOG10_TABLE_BITS];
} __v_log10_data attribute_hidden;
-extern const struct erff_data
+extern const struct v_erff_data
{
struct
{
float erf, scale;
} tab[513];
-} __erff_data attribute_hidden;
+} __v_erff_data attribute_hidden;
-extern const struct sv_erff_data
-{
- float erf[513];
- float scale[513];
-} __sv_erff_data attribute_hidden;
-
-extern const struct erf_data
+extern const struct v_erf_data
{
struct
{
double erf, scale;
} tab[769];
-} __erf_data attribute_hidden;
-
-extern const struct sv_erf_data
-{
- double erf[769];
- double scale[769];
-} __sv_erf_data attribute_hidden;
+} __v_erf_data attribute_hidden;
-extern const struct erfc_data
+extern const struct v_erfc_data
{
struct
{
double erfc, scale;
} tab[3488];
-} __erfc_data attribute_hidden;
+} __v_erfc_data attribute_hidden;
-extern const struct erfcf_data
+extern const struct v_erfcf_data
{
struct
{
float erfc, scale;
} tab[645];
-} __erfcf_data attribute_hidden;
+} __v_erfcf_data attribute_hidden;
/* Some data for AdvSIMD and SVE pow's internal exp and log. */
#define V_POW_EXP_TABLE_BITS 8
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 846fb2c..c523d45 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -801,6 +801,7 @@ float: 1
ldouble: 1
Function: Imaginary part of "csin":
+float: 1
ldouble: 1
Function: Real part of "csin_downward":
@@ -1083,22 +1084,18 @@ ldouble: 3
Function: "exp10m1":
double: 4
-float: 2
ldouble: 3
Function: "exp10m1_downward":
double: 3
-float: 3
ldouble: 6
Function: "exp10m1_towardzero":
double: 2
-float: 3
ldouble: 6
Function: "exp10m1_upward":
double: 5
-float: 3
ldouble: 6
Function: "exp2":
@@ -1131,22 +1128,18 @@ ldouble: 2
Function: "exp2m1":
double: 2
-float: 2
ldouble: 2
Function: "exp2m1_downward":
double: 3
-float: 3
ldouble: 3
Function: "exp2m1_towardzero":
double: 3
-float: 2
ldouble: 4
Function: "exp2m1_upward":
double: 3
-float: 3
ldouble: 5
Function: "exp_advsimd":
@@ -1171,7 +1164,6 @@ float: 1
Function: "expm1":
double: 1
-float: 1
ldouble: 2
Function: "expm1_advsimd":
@@ -1180,7 +1172,6 @@ float: 1
Function: "expm1_downward":
double: 1
-float: 1
ldouble: 2
Function: "expm1_sve":
@@ -1189,12 +1180,10 @@ float: 1
Function: "expm1_towardzero":
double: 1
-float: 2
ldouble: 4
Function: "expm1_upward":
double: 1
-float: 1
ldouble: 3
Function: "gamma":
@@ -1357,27 +1346,22 @@ ldouble: 1
Function: "log10p1":
double: 2
-float: 2
ldouble: 3
Function: "log10p1_downward":
double: 2
-float: 3
ldouble: 4
Function: "log10p1_towardzero":
double: 3
-float: 2
ldouble: 3
Function: "log10p1_upward":
double: 2
-float: 3
ldouble: 4
Function: "log1p":
double: 1
-float: 1
ldouble: 3
Function: "log1p_advsimd":
@@ -1386,7 +1370,6 @@ float: 1
Function: "log1p_downward":
double: 1
-float: 2
ldouble: 3
Function: "log1p_sve":
@@ -1395,12 +1378,10 @@ float: 1
Function: "log1p_towardzero":
double: 2
-float: 2
ldouble: 3
Function: "log1p_upward":
double: 2
-float: 2
ldouble: 2
Function: "log2":
@@ -1433,22 +1414,18 @@ ldouble: 1
Function: "log2p1":
double: 2
-float: 2
ldouble: 3
Function: "log2p1_downward":
double: 2
-float: 2
ldouble: 3
Function: "log2p1_towardzero":
double: 2
-float: 2
ldouble: 2
Function: "log2p1_upward":
double: 2
-float: 2
ldouble: 3
Function: "log_advsimd":
@@ -1474,22 +1451,18 @@ ldouble: 1
Function: "logp1":
double: 1
-float: 1
ldouble: 3
Function: "logp1_downward":
double: 1
-float: 2
ldouble: 3
Function: "logp1_towardzero":
double: 2
-float: 2
ldouble: 3
Function: "logp1_upward":
double: 2
-float: 2
ldouble: 2
Function: "pow":
@@ -1653,22 +1626,18 @@ ldouble: 3
Function: "tgamma":
double: 9
-float: 8
ldouble: 4
Function: "tgamma_downward":
double: 9
-float: 7
ldouble: 5
Function: "tgamma_towardzero":
double: 9
-float: 7
ldouble: 5
Function: "tgamma_upward":
double: 9
-float: 8
ldouble: 4
Function: "y0":
diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
deleted file mode 100644
index 6c7f60b..0000000
--- a/sysdeps/aarch64/memset-reg.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Register aliases for memset to be used across implementations.
- Copyright (C) 2017-2024 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#define dstin x0
-#define val x1
-#define valw w1
-#define count x2
-#define dst x3
-#define dstend x4
-#define tmp1 x5
-#define tmp1w w5
-#define tmp2 x6
-#define tmp2w w6
-#define zva_len x7
-#define zva_lenw w7
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 7ef77ee..b76dde1 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
+/* Generic optimized memset using SIMD.
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include "memset-reg.h"
#ifndef MEMSET
# define MEMSET memset
@@ -25,130 +25,131 @@
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
-ENTRY (MEMSET)
+#define dstin x0
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+#define off x3
+#define dstend2 x5
+ENTRY (MEMSET)
PTR_ARG (0)
SIZE_ARG (2)
dup v0.16B, valw
+ cmp count, 16
+ b.lo L(set_small)
+
add dstend, dstin, count
+ cmp count, 64
+ b.hs L(set_128)
- cmp count, 96
- b.hi L(set_long)
- cmp count, 16
- b.hs L(set_medium)
- mov val, v0.D[0]
+ /* Set 16..63 bytes. */
+ mov off, 16
+ and off, off, count, lsr 1
+ sub dstend2, dstend, off
+ str q0, [dstin]
+ str q0, [dstin, off]
+ str q0, [dstend2, -16]
+ str q0, [dstend, -16]
+ ret
+ .p2align 4
/* Set 0..15 bytes. */
- tbz count, 3, 1f
- str val, [dstin]
- str val, [dstend, -8]
- ret
- nop
-1: tbz count, 2, 2f
- str valw, [dstin]
- str valw, [dstend, -4]
+L(set_small):
+ add dstend, dstin, count
+ cmp count, 4
+ b.lo 2f
+ lsr off, count, 3
+ sub dstend2, dstend, off, lsl 2
+ str s0, [dstin]
+ str s0, [dstin, off, lsl 2]
+ str s0, [dstend2, -4]
+ str s0, [dstend, -4]
ret
+
+ /* Set 0..3 bytes. */
2: cbz count, 3f
+ lsr off, count, 1
strb valw, [dstin]
- tbz count, 1, 3f
- strh valw, [dstend, -2]
+ strb valw, [dstin, off]
+ strb valw, [dstend, -1]
3: ret
- /* Set 17..96 bytes. */
-L(set_medium):
- str q0, [dstin]
- tbnz count, 6, L(set96)
- str q0, [dstend, -16]
- tbz count, 5, 1f
- str q0, [dstin, 16]
- str q0, [dstend, -32]
-1: ret
-
.p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-L(set96):
- str q0, [dstin, 16]
+L(set_128):
+ bic dst, dstin, 15
+ cmp count, 128
+ b.hi L(set_long)
+ stp q0, q0, [dstin]
stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
- .p2align 3
- nop
+ .p2align 4
L(set_long):
- and valw, valw, 255
- bic dst, dstin, 15
str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
+ str q0, [dst, 16]
+ tst valw, 255
+ b.ne L(no_zva)
+#ifndef ZVA64_ONLY
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+#endif
+ stp q0, q0, [dst, 32]
+ bic dst, dstin, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 64 + 64 /* Adjust count and bias for loop. */
+
+ /* Write last bytes before ZVA loop. */
+ stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
+
+ .p2align 4
+L(zva64_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva64_loop)
ret
-L(try_zva):
-#ifndef ZVA64_ONLY
.p2align 3
- mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
- and tmp1w, tmp1w, 15
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
- nop
-#endif
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
- .p2align 4
-L(zva_64):
- str q0, [dst, 16]
+L(no_zva):
+ sub count, dstend, dst /* Count is 32 too large. */
+ sub count, count, 64 + 32 /* Adjust count and bias for loop. */
+L(no_zva_loop):
stp q0, q0, [dst, 32]
- bic dst, dst, 63
stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
add dst, dst, 64
subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
+ b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
#ifndef ZVA64_ONLY
- .p2align 3
+ .p2align 4
L(zva_128):
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
- b.ne L(zva_other)
+ cmp zva_val, 5 /* ZVA size is 128 bytes. */
+ b.ne L(no_zva)
- str q0, [dst, 16]
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]
stp q0, q0, [dst, 96]
bic dst, dst, 127
sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
+ sub count, count, 128 + 128 /* Adjust count and bias for loop. */
+1: add dst, dst, 128
+ dc zva, dst
subs count, count, 128
b.hi 1b
stp q0, q0, [dstend, -128]
@@ -156,35 +157,6 @@ L(zva_128):
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-
-L(zva_other):
- mov tmp2w, 4
- lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
-
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write. */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- sub dst, dst, 32 /* Bias dst for tail loop. */
- b L(tail64)
#endif
END (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index 2e6d882..f665b5a 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -18,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include <sysdeps/aarch64/memset-reg.h>
/* Assumptions:
*
@@ -36,6 +35,14 @@
.arch armv8.2-a+sve
+#define dstin x0
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define tmp1 x5
+#define tmp2 x6
+
.macro st1b_unroll first=0, last=7
st1b z0.b, p0, [dst, \first, mul vl]
.if \last-\first
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 6d714ed..cf1b25f 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -18,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include "memset-reg.h"
/* Assumptions:
*
@@ -26,6 +25,13 @@
*
*/
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+
ENTRY (__memset_emag)
PTR_ARG (0)
diff --git a/sysdeps/aarch64/multiarch/memset_kunpeng.S b/sysdeps/aarch64/multiarch/memset_kunpeng.S
index 7b21550..f815c20 100644
--- a/sysdeps/aarch64/multiarch/memset_kunpeng.S
+++ b/sysdeps/aarch64/multiarch/memset_kunpeng.S
@@ -18,7 +18,6 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include <sysdeps/aarch64/memset-reg.h>
/* Assumptions:
*
@@ -26,6 +25,12 @@
*
*/
+#define dstin x0
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+
ENTRY (__memset_kunpeng)
PTR_ARG (0)
diff --git a/sysdeps/aarch64/multiarch/memset_oryon1.S b/sysdeps/aarch64/multiarch/memset_oryon1.S
index b43a43b..6fa28a9 100644
--- a/sysdeps/aarch64/multiarch/memset_oryon1.S
+++ b/sysdeps/aarch64/multiarch/memset_oryon1.S
@@ -19,12 +19,18 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include "memset-reg.h"
/* Assumptions:
ARMv8-a, AArch64, unaligned accesses
*/
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+
ENTRY (__memset_oryon1)
PTR_ARG (0)