aboutsummaryrefslogtreecommitdiff
path: root/sysdeps
diff options
context:
space:
mode:
authorJoe Ramsay <Joe.Ramsay@arm.com>2023-10-04 10:38:57 +0100
committerSzabolcs Nagy <szabolcs.nagy@arm.com>2023-10-05 16:54:16 +0100
commit5a4b6f8e4b7e2a76c71b713200a80181d745c93d (patch)
tree4d5e092d4e36c23497c9174531394c89342d32c4 /sysdeps
parent480a0dfe1acab3dbf1bdcfa37fdca992eb9c54a5 (diff)
downloadglibc-5a4b6f8e4b7e2a76c71b713200a80181d745c93d.zip
glibc-5a4b6f8e4b7e2a76c71b713200a80181d745c93d.tar.gz
glibc-5a4b6f8e4b7e2a76c71b713200a80181d745c93d.tar.bz2
aarch64: Optimise vecmath logs
* Transpose table layout for improved memory access * Use half-vector special comparisons for AdvSIMD * Improve register use near special-case branches - Due to the presence of a function call, return value would get mov-d out of x0 in order to facilitate PCS. By moving the final computation after the branch this can be avoided Also change SVE routines to use overloaded intrinsics for readability.
Diffstat (limited to 'sysdeps')
-rw-r--r--sysdeps/aarch64/fpu/log_advsimd.c36
-rw-r--r--sysdeps/aarch64/fpu/log_sve.c52
-rw-r--r--sysdeps/aarch64/fpu/logf_advsimd.c26
-rw-r--r--sysdeps/aarch64/fpu/logf_sve.c40
-rw-r--r--sysdeps/aarch64/fpu/v_log_data.c260
-rw-r--r--sysdeps/aarch64/fpu/v_math.h21
-rw-r--r--sysdeps/aarch64/fpu/vecmath_config.h6
7 files changed, 226 insertions, 215 deletions
diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c
index 434737f..8b32d1c 100644
--- a/sysdeps/aarch64/fpu/log_advsimd.c
+++ b/sysdeps/aarch64/fpu/log_advsimd.c
@@ -21,9 +21,11 @@
static const struct data
{
+ uint64x2_t min_norm;
+ uint32x4_t special_bound;
float64x2_t poly[5];
float64x2_t ln2;
- uint64x2_t min_norm, special_bound, sign_exp_mask;
+ uint64x2_t sign_exp_mask;
} data = {
/* Worst-case error: 1.17 + 0.5 ulp.
Rel error: 0x1.6272e588p-56 in [ -0x1.fc1p-9 0x1.009p-8 ]. */
@@ -32,7 +34,7 @@ static const struct data
V2 (-0x1.554e550bd501ep-3) },
.ln2 = V2 (0x1.62e42fefa39efp-1),
.min_norm = V2 (0x0010000000000000),
- .special_bound = V2 (0x7fe0000000000000), /* asuint64(inf) - min_norm. */
+ .special_bound = V4 (0x7fe00000), /* asuint64(inf) - min_norm. */
.sign_exp_mask = V2 (0xfff0000000000000)
};
@@ -52,29 +54,34 @@ lookup (uint64x2_t i)
{
/* Since N is a power of 2, n % N = n & (N - 1). */
struct entry e;
- e.invc[0] = __v_log_data.invc[i[0] & IndexMask];
- e.logc[0] = __v_log_data.logc[i[0] & IndexMask];
- e.invc[1] = __v_log_data.invc[i[1] & IndexMask];
- e.logc[1] = __v_log_data.logc[i[1] & IndexMask];
+ uint64_t i0 = (i[0] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
+ float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
+ float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+ e.invc = vuzp1q_f64 (e0, e1);
+ e.logc = vuzp2q_f64 (e0, e1);
return e;
}
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+special_case (float64x2_t x, float64x2_t y, float64x2_t hi, float64x2_t r2,
+ uint32x2_t cmp)
{
- return v_call_f64 (log, x, y, cmp);
+ return v_call_f64 (log, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (cmp));
}
float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
float64x2_t z, r, r2, p, y, kd, hi;
- uint64x2_t ix, iz, tmp, cmp;
+ uint64x2_t ix, iz, tmp;
+ uint32x2_t cmp;
int64x2_t k;
struct entry e;
ix = vreinterpretq_u64_f64 (x);
- cmp = vcgeq_u64 (vsubq_u64 (ix, d->min_norm), d->special_bound);
+ cmp = vcge_u32 (vsubhn_u64 (ix, d->min_norm),
+ vget_low_u32 (d->special_bound));
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
@@ -83,7 +90,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
k = vshrq_n_s64 (vreinterpretq_s64_u64 (tmp), 52); /* arithmetic shift. */
iz = vsubq_u64 (ix, vandq_u64 (tmp, d->sign_exp_mask));
z = vreinterpretq_f64_u64 (iz);
- e = lookup (vshrq_n_u64 (tmp, 52 - V_LOG_TABLE_BITS));
+ e = lookup (tmp);
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
@@ -97,9 +104,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log) (float64x2_t x)
p = vfmaq_f64 (A (0), A (1), r);
y = vfmaq_f64 (y, A (4), r2);
y = vfmaq_f64 (p, y, r2);
- y = vfmaq_f64 (hi, y, r2);
- if (__glibc_unlikely (v_any_u64 (cmp)))
- return special_case (x, y, cmp);
- return y;
+ if (__glibc_unlikely (v_any_u32h (cmp)))
+ return special_case (x, y, hi, r2, cmp);
+ return vfmaq_f64 (hi, y, r2);
}
diff --git a/sysdeps/aarch64/fpu/log_sve.c b/sysdeps/aarch64/fpu/log_sve.c
index 93c4f1c..0c171a4 100644
--- a/sysdeps/aarch64/fpu/log_sve.c
+++ b/sysdeps/aarch64/fpu/log_sve.c
@@ -38,43 +38,39 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
want 0x1.ffffff1cca045p-2. */
svfloat64_t SV_NAME_D1 (log) (svfloat64_t x, const svbool_t pg)
{
- svuint64_t ix = svreinterpret_u64_f64 (x);
- svuint64_t top = svlsr_n_u64_x (pg, ix, 52);
- svbool_t cmp
- = svcmpge_u64 (pg, svsub_n_u64_x (pg, top, MinTop), sv_u64 (ThreshTop));
+ svuint64_t ix = svreinterpret_u64 (x);
+ svuint64_t top = svlsr_x (pg, ix, 52);
+ svbool_t cmp = svcmpge (pg, svsub_x (pg, top, MinTop), sv_u64 (ThreshTop));
/* x = 2^k z; where z is in range [Off,2*Off) and exact.
The range is split into N subintervals.
The ith subinterval contains z and c is near its center. */
- svuint64_t tmp = svsub_n_u64_x (pg, ix, Off);
- /* Equivalent to (tmp >> (52 - V_LOG_TABLE_BITS)) % N, since N is a power
- of 2. */
- svuint64_t i = svand_n_u64_x (
- pg, svlsr_n_u64_x (pg, tmp, (52 - V_LOG_TABLE_BITS)), N - 1);
- svint64_t k = svasr_n_s64_x (pg, svreinterpret_s64_u64 (tmp),
- 52); /* Arithmetic shift. */
- svuint64_t iz
- = svsub_u64_x (pg, ix, svand_n_u64_x (pg, tmp, 0xfffULL << 52));
- svfloat64_t z = svreinterpret_f64_u64 (iz);
+ svuint64_t tmp = svsub_x (pg, ix, Off);
+ /* Calculate table index = (tmp >> (52 - V_LOG_TABLE_BITS)) % N.
+ The actual value of i is double this due to table layout. */
+ svuint64_t i
+ = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)), (N - 1) << 1);
+ svint64_t k
+ = svasr_x (pg, svreinterpret_s64 (tmp), 52); /* Arithmetic shift. */
+ svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+ svfloat64_t z = svreinterpret_f64 (iz);
/* Lookup in 2 global lists (length N). */
- svfloat64_t invc = svld1_gather_u64index_f64 (pg, __v_log_data.invc, i);
- svfloat64_t logc = svld1_gather_u64index_f64 (pg, __v_log_data.logc, i);
+ svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
+ svfloat64_t logc = svld1_gather_index (pg, &__v_log_data.table[0].logc, i);
/* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */
- svfloat64_t r = svmad_n_f64_x (pg, invc, z, -1);
- svfloat64_t kd = svcvt_f64_s64_x (pg, k);
+ svfloat64_t r = svmad_x (pg, invc, z, -1);
+ svfloat64_t kd = svcvt_f64_x (pg, k);
/* hi = r + log(c) + k*Ln2. */
- svfloat64_t hi
- = svmla_n_f64_x (pg, svadd_f64_x (pg, logc, r), kd, __v_log_data.ln2);
+ svfloat64_t hi = svmla_x (pg, svadd_x (pg, logc, r), kd, __v_log_data.ln2);
/* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi. */
- svfloat64_t r2 = svmul_f64_x (pg, r, r);
- svfloat64_t y = svmla_f64_x (pg, P (2), r, P (3));
- svfloat64_t p = svmla_f64_x (pg, P (0), r, P (1));
- y = svmla_f64_x (pg, y, r2, P (4));
- y = svmla_f64_x (pg, p, r2, y);
- y = svmla_f64_x (pg, hi, r2, y);
+ svfloat64_t r2 = svmul_x (pg, r, r);
+ svfloat64_t y = svmla_x (pg, P (2), r, P (3));
+ svfloat64_t p = svmla_x (pg, P (0), r, P (1));
+ y = svmla_x (pg, y, r2, P (4));
+ y = svmla_x (pg, p, r2, y);
if (__glibc_unlikely (svptest_any (pg, cmp)))
- return special_case (x, y, cmp);
- return y;
+ return special_case (x, svmla_x (svnot_z (pg, cmp), hi, r2, y), cmp);
+ return svmla_x (pg, hi, r2, y);
}
diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
index 375ad28..93903c7 100644
--- a/sysdeps/aarch64/fpu/logf_advsimd.c
+++ b/sysdeps/aarch64/fpu/logf_advsimd.c
@@ -21,9 +21,11 @@
static const struct data
{
+ uint32x4_t min_norm;
+ uint16x8_t special_bound;
float32x4_t poly[7];
float32x4_t ln2, tiny_bound;
- uint32x4_t min_norm, special_bound, off, mantissa_mask;
+ uint32x4_t off, mantissa_mask;
} data = {
/* 3.34 ulp error. */
.poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
@@ -32,28 +34,31 @@ static const struct data
.ln2 = V4 (0x1.62e43p-1f),
.tiny_bound = V4 (0x1p-126),
.min_norm = V4 (0x00800000),
- .special_bound = V4 (0x7f000000), /* asuint32(inf) - min_norm. */
- .off = V4 (0x3f2aaaab), /* 0.666667. */
+ .special_bound = V8 (0x7f00), /* asuint32(inf) - min_norm. */
+ .off = V4 (0x3f2aaaab), /* 0.666667. */
.mantissa_mask = V4 (0x007fffff)
};
#define P(i) d->poly[7 - i]
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+special_case (float32x4_t x, float32x4_t y, float32x4_t r2, float32x4_t p,
+ uint16x4_t cmp)
{
/* Fall back to scalar code. */
- return v_call_f32 (logf, x, y, cmp);
+ return v_call_f32 (logf, x, vfmaq_f32 (p, y, r2), vmovl_u16 (cmp));
}
float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
float32x4_t n, p, q, r, r2, y;
- uint32x4_t u, cmp;
+ uint32x4_t u;
+ uint16x4_t cmp;
u = vreinterpretq_u32_f32 (x);
- cmp = vcgeq_u32 (vsubq_u32 (u, d->min_norm), d->special_bound);
+ cmp = vcge_u16 (vsubhn_u32 (u, d->min_norm),
+ vget_low_u16 (d->special_bound));
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
u = vsubq_u32 (u, d->off);
@@ -73,9 +78,8 @@ float32x4_t VPCS_ATTR V_NAME_F1 (log) (float32x4_t x)
q = vfmaq_f32 (q, p, r2);
y = vfmaq_f32 (y, q, r2);
p = vfmaq_f32 (r, d->ln2, n);
- y = vfmaq_f32 (p, y, r2);
- if (__glibc_unlikely (v_any_u32 (cmp)))
- return special_case (x, y, cmp);
- return y;
+ if (__glibc_unlikely (v_any_u16h (cmp)))
+ return special_case (x, y, r2, p, cmp);
+ return vfmaq_f32 (p, y, r2);
}
diff --git a/sysdeps/aarch64/fpu/logf_sve.c b/sysdeps/aarch64/fpu/logf_sve.c
index 46c6e7c..c027611 100644
--- a/sysdeps/aarch64/fpu/logf_sve.c
+++ b/sysdeps/aarch64/fpu/logf_sve.c
@@ -55,33 +55,31 @@ svfloat32_t SV_NAME_F1 (log) (svfloat32_t x, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
- svuint32_t u = svreinterpret_u32_f32 (x);
- svbool_t cmp = svcmpge_n_u32 (pg, svsub_n_u32_x (pg, u, Min), Thresh);
+ svuint32_t u = svreinterpret_u32 (x);
+ svbool_t cmp = svcmpge (pg, svsub_x (pg, u, Min), Thresh);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u = svsub_n_u32_x (pg, u, Off);
- svfloat32_t n
- = svcvt_f32_s32_x (pg, svasr_n_s32_x (pg, svreinterpret_s32_u32 (u),
- 23)); /* Sign-extend. */
- u = svand_n_u32_x (pg, u, Mask);
- u = svadd_n_u32_x (pg, u, Off);
- svfloat32_t r = svsub_n_f32_x (pg, svreinterpret_f32_u32 (u), 1.0f);
+ u = svsub_x (pg, u, Off);
+ svfloat32_t n = svcvt_f32_x (
+ pg, svasr_x (pg, svreinterpret_s32 (u), 23)); /* Sign-extend. */
+ u = svand_x (pg, u, Mask);
+ u = svadd_x (pg, u, Off);
+ svfloat32_t r = svsub_x (pg, svreinterpret_f32 (u), 1.0f);
/* y = log(1+r) + n*ln2. */
- svfloat32_t r2 = svmul_f32_x (pg, r, r);
+ svfloat32_t r2 = svmul_x (pg, r, r);
/* n*ln2 + r + r2*(P6 + r*P5 + r2*(P4 + r*P3 + r2*(P2 + r*P1 + r2*P0))). */
- svfloat32_t p_0135 = svld1rq_f32 (svptrue_b32 (), &d->poly_0135[0]);
- svfloat32_t p = svmla_lane_f32 (sv_f32 (d->poly_246[0]), r, p_0135, 1);
- svfloat32_t q = svmla_lane_f32 (sv_f32 (d->poly_246[1]), r, p_0135, 2);
- svfloat32_t y = svmla_lane_f32 (sv_f32 (d->poly_246[2]), r, p_0135, 3);
- p = svmla_lane_f32 (p, r2, p_0135, 0);
+ svfloat32_t p_0135 = svld1rq (svptrue_b32 (), &d->poly_0135[0]);
+ svfloat32_t p = svmla_lane (sv_f32 (d->poly_246[0]), r, p_0135, 1);
+ svfloat32_t q = svmla_lane (sv_f32 (d->poly_246[1]), r, p_0135, 2);
+ svfloat32_t y = svmla_lane (sv_f32 (d->poly_246[2]), r, p_0135, 3);
+ p = svmla_lane (p, r2, p_0135, 0);
- q = svmla_f32_x (pg, q, r2, p);
- y = svmla_f32_x (pg, y, r2, q);
- p = svmla_n_f32_x (pg, r, n, d->ln2);
- y = svmla_f32_x (pg, p, r2, y);
+ q = svmla_x (pg, q, r2, p);
+ y = svmla_x (pg, y, r2, q);
+ p = svmla_x (pg, r, n, d->ln2);
if (__glibc_unlikely (svptest_any (pg, cmp)))
- return special_case (x, y, cmp);
- return y;
+ return special_case (x, svmla_x (svnot_z (pg, cmp), p, r2, y), cmp);
+ return svmla_x (pg, p, r2, y);
}
diff --git a/sysdeps/aarch64/fpu/v_log_data.c b/sysdeps/aarch64/fpu/v_log_data.c
index 6fd6f43..99506e3 100644
--- a/sysdeps/aarch64/fpu/v_log_data.c
+++ b/sysdeps/aarch64/fpu/v_log_data.c
@@ -34,140 +34,140 @@ const struct v_log_data __v_log_data = {
N=128) and log(c) and 1/c for the ith subinterval comes from two lookup
tables:
- invc[i] = 1/c
- logc[i] = (double)log(c)
+ table[i].invc = 1/c
+ table[i].logc = (double)log(c)
where c is near the center of the subinterval and is chosen by trying
several floating point invc candidates around 1/center and selecting one
for which the error in (double)log(c) is minimized (< 0x1p-74), except the
subinterval that contains 1 and the previous one got tweaked to avoid
cancellation. */
- .invc = { 0x1.6a133d0dec120p+0, 0x1.6815f2f3e42edp+0,
- 0x1.661e39be1ac9ep+0, 0x1.642bfa30ac371p+0,
- 0x1.623f1d916f323p+0, 0x1.60578da220f65p+0,
- 0x1.5e75349dea571p+0, 0x1.5c97fd387a75ap+0,
- 0x1.5abfd2981f200p+0, 0x1.58eca051dc99cp+0,
- 0x1.571e526d9df12p+0, 0x1.5554d555b3fcbp+0,
- 0x1.539015e2a20cdp+0, 0x1.51d0014ee0164p+0,
- 0x1.50148538cd9eep+0, 0x1.4e5d8f9f698a1p+0,
- 0x1.4cab0edca66bep+0, 0x1.4afcf1a9db874p+0,
- 0x1.495327136e16fp+0, 0x1.47ad9e84af28fp+0,
- 0x1.460c47b39ae15p+0, 0x1.446f12b278001p+0,
- 0x1.42d5efdd720ecp+0, 0x1.4140cfe001a0fp+0,
- 0x1.3fafa3b421f69p+0, 0x1.3e225c9c8ece5p+0,
- 0x1.3c98ec29a211ap+0, 0x1.3b13442a413fep+0,
- 0x1.399156baa3c54p+0, 0x1.38131639b4cdbp+0,
- 0x1.36987540fbf53p+0, 0x1.352166b648f61p+0,
- 0x1.33adddb3eb575p+0, 0x1.323dcd99fc1d3p+0,
- 0x1.30d129fefc7d2p+0, 0x1.2f67e6b72fe7dp+0,
- 0x1.2e01f7cf8b187p+0, 0x1.2c9f518ddc86ep+0,
- 0x1.2b3fe86e5f413p+0, 0x1.29e3b1211b25cp+0,
- 0x1.288aa08b373cfp+0, 0x1.2734abcaa8467p+0,
- 0x1.25e1c82459b81p+0, 0x1.2491eb1ad59c5p+0,
- 0x1.23450a54048b5p+0, 0x1.21fb1bb09e578p+0,
- 0x1.20b415346d8f7p+0, 0x1.1f6fed179a1acp+0,
- 0x1.1e2e99b93c7b3p+0, 0x1.1cf011a7a882ap+0,
- 0x1.1bb44b97dba5ap+0, 0x1.1a7b3e66cdd4fp+0,
- 0x1.1944e11dc56cdp+0, 0x1.18112aebb1a6ep+0,
- 0x1.16e013231b7e9p+0, 0x1.15b1913f156cfp+0,
- 0x1.14859cdedde13p+0, 0x1.135c2dc68cfa4p+0,
- 0x1.12353bdb01684p+0, 0x1.1110bf25b85b4p+0,
- 0x1.0feeafd2f8577p+0, 0x1.0ecf062c51c3bp+0,
- 0x1.0db1baa076c8bp+0, 0x1.0c96c5bb3048ep+0,
- 0x1.0b7e20263e070p+0, 0x1.0a67c2acd0ce3p+0,
- 0x1.0953a6391e982p+0, 0x1.0841c3caea380p+0,
- 0x1.07321489b13eap+0, 0x1.062491aee9904p+0,
- 0x1.05193497a7cc5p+0, 0x1.040ff6b5f5e9fp+0,
- 0x1.0308d19aa6127p+0, 0x1.0203beedb0c67p+0,
- 0x1.010037d38bcc2p+0, 1.0,
- 0x1.fc06d493cca10p-1, 0x1.f81e6ac3b918fp-1,
- 0x1.f44546ef18996p-1, 0x1.f07b10382c84bp-1,
- 0x1.ecbf7070e59d4p-1, 0x1.e91213f715939p-1,
- 0x1.e572a9a75f7b7p-1, 0x1.e1e0e2c530207p-1,
- 0x1.de5c72d8a8be3p-1, 0x1.dae50fa5658ccp-1,
- 0x1.d77a71145a2dap-1, 0x1.d41c51166623ep-1,
- 0x1.d0ca6ba0bb29fp-1, 0x1.cd847e8e59681p-1,
- 0x1.ca4a499693e00p-1, 0x1.c71b8e399e821p-1,
- 0x1.c3f80faf19077p-1, 0x1.c0df92dc2b0ecp-1,
- 0x1.bdd1de3cbb542p-1, 0x1.baceb9e1007a3p-1,
- 0x1.b7d5ef543e55ep-1, 0x1.b4e749977d953p-1,
- 0x1.b20295155478ep-1, 0x1.af279f8e82be2p-1,
- 0x1.ac5638197fdf3p-1, 0x1.a98e2f102e087p-1,
- 0x1.a6cf5606d05c1p-1, 0x1.a4197fc04d746p-1,
- 0x1.a16c80293dc01p-1, 0x1.9ec82c4dc5bc9p-1,
- 0x1.9c2c5a491f534p-1, 0x1.9998e1480b618p-1,
- 0x1.970d9977c6c2dp-1, 0x1.948a5c023d212p-1,
- 0x1.920f0303d6809p-1, 0x1.8f9b698a98b45p-1,
- 0x1.8d2f6b81726f6p-1, 0x1.8acae5bb55badp-1,
- 0x1.886db5d9275b8p-1, 0x1.8617ba567c13cp-1,
- 0x1.83c8d27487800p-1, 0x1.8180de3c5dbe7p-1,
- 0x1.7f3fbe71cdb71p-1, 0x1.7d055498071c1p-1,
- 0x1.7ad182e54f65ap-1, 0x1.78a42c3c90125p-1,
- 0x1.767d342f76944p-1, 0x1.745c7ef26b00ap-1,
- 0x1.7241f15769d0fp-1, 0x1.702d70d396e41p-1,
- 0x1.6e1ee3700cd11p-1, 0x1.6c162fc9cbe02p-1 },
- .logc = { -0x1.62fe995eb963ap-2, -0x1.5d5a48dad6b67p-2,
- -0x1.57bde257d2769p-2, -0x1.52294fbf2af55p-2,
- -0x1.4c9c7b598aa38p-2, -0x1.47174fc5ff560p-2,
- -0x1.4199b7fa7b5cap-2, -0x1.3c239f48cfb99p-2,
- -0x1.36b4f154d2aebp-2, -0x1.314d9a0ff32fbp-2,
- -0x1.2bed85cca3cffp-2, -0x1.2694a11421af9p-2,
- -0x1.2142d8d014fb2p-2, -0x1.1bf81a2c77776p-2,
- -0x1.16b452a39c6a4p-2, -0x1.11776ffa6c67ep-2,
- -0x1.0c416035020e0p-2, -0x1.071211aa10fdap-2,
- -0x1.01e972e293b1bp-2, -0x1.f98ee587fd434p-3,
- -0x1.ef5800ad716fbp-3, -0x1.e52e160484698p-3,
- -0x1.db1104b19352ep-3, -0x1.d100ac59e0bd6p-3,
- -0x1.c6fced287c3bdp-3, -0x1.bd05a7b317c29p-3,
- -0x1.b31abd229164fp-3, -0x1.a93c0edadb0a3p-3,
- -0x1.9f697ee30d7ddp-3, -0x1.95a2efa9aa40ap-3,
- -0x1.8be843d796044p-3, -0x1.82395ecc477edp-3,
- -0x1.7896240966422p-3, -0x1.6efe77aca8c55p-3,
- -0x1.65723e117ec5cp-3, -0x1.5bf15c0955706p-3,
- -0x1.527bb6c111da1p-3, -0x1.491133c939f8fp-3,
- -0x1.3fb1b90c7fc58p-3, -0x1.365d2cc485f8dp-3,
- -0x1.2d13758970de7p-3, -0x1.23d47a721fd47p-3,
- -0x1.1aa0229f25ec2p-3, -0x1.117655ddebc3bp-3,
- -0x1.0856fbf83ab6bp-3, -0x1.fe83fabbaa106p-4,
- -0x1.ec6e8507a56cdp-4, -0x1.da6d68c7cc2eap-4,
- -0x1.c88078462be0cp-4, -0x1.b6a786a423565p-4,
- -0x1.a4e2676ac7f85p-4, -0x1.9330eea777e76p-4,
- -0x1.8192f134d5ad9p-4, -0x1.70084464f0538p-4,
- -0x1.5e90bdec5cb1fp-4, -0x1.4d2c3433c5536p-4,
- -0x1.3bda7e219879ap-4, -0x1.2a9b732d27194p-4,
- -0x1.196eeb2b10807p-4, -0x1.0854be8ef8a7ep-4,
- -0x1.ee998cb277432p-5, -0x1.ccadb79919fb9p-5,
- -0x1.aae5b1d8618b0p-5, -0x1.89413015d7442p-5,
- -0x1.67bfe7bf158dep-5, -0x1.46618f83941bep-5,
- -0x1.2525df1b0618ap-5, -0x1.040c8e2f77c6ap-5,
- -0x1.c62aad39f738ap-6, -0x1.847fe3bdead9cp-6,
- -0x1.43183683400acp-6, -0x1.01f31c4e1d544p-6,
- -0x1.82201d1e6b69ap-7, -0x1.00dd0f3e1bfd6p-7,
- -0x1.ff6fe1feb4e53p-9, 0.0,
- 0x1.fe91885ec8e20p-8, 0x1.fc516f716296dp-7,
- 0x1.7bb4dd70a015bp-6, 0x1.f84c99b34b674p-6,
- 0x1.39f9ce4fb2d71p-5, 0x1.7756c0fd22e78p-5,
- 0x1.b43ee82db8f3ap-5, 0x1.f0b3fced60034p-5,
- 0x1.165bd78d4878ep-4, 0x1.3425d2715ebe6p-4,
- 0x1.51b8bd91b7915p-4, 0x1.6f15632c76a47p-4,
- 0x1.8c3c88ecbe503p-4, 0x1.a92ef077625dap-4,
- 0x1.c5ed5745fa006p-4, 0x1.e27876de1c993p-4,
- 0x1.fed104fce4cdcp-4, 0x1.0d7bd9c17d78bp-3,
- 0x1.1b76986cef97bp-3, 0x1.295913d24f750p-3,
- 0x1.37239fa295d17p-3, 0x1.44d68dd78714bp-3,
- 0x1.52722ebe5d780p-3, 0x1.5ff6d12671f98p-3,
- 0x1.6d64c2389484bp-3, 0x1.7abc4da40fddap-3,
- 0x1.87fdbda1e8452p-3, 0x1.95295b06a5f37p-3,
- 0x1.a23f6d34abbc5p-3, 0x1.af403a28e04f2p-3,
- 0x1.bc2c06a85721ap-3, 0x1.c903161240163p-3,
- 0x1.d5c5aa93287ebp-3, 0x1.e274051823fa9p-3,
- 0x1.ef0e656300c16p-3, 0x1.fb9509f05aa2ap-3,
- 0x1.04041821f37afp-2, 0x1.0a340a49b3029p-2,
- 0x1.105a7918a126dp-2, 0x1.1677819812b84p-2,
- 0x1.1c8b405b40c0ep-2, 0x1.2295d16cfa6b1p-2,
- 0x1.28975066318a2p-2, 0x1.2e8fd855d86fcp-2,
- 0x1.347f83d605e59p-2, 0x1.3a666d1244588p-2,
- 0x1.4044adb6f8ec4p-2, 0x1.461a5f077558cp-2,
- 0x1.4be799e20b9c8p-2, 0x1.51ac76a6b79dfp-2,
- 0x1.57690d5744a45p-2, 0x1.5d1d758e45217p-2 }
+ .table = { { 0x1.6a133d0dec120p+0, -0x1.62fe995eb963ap-2 },
+ { 0x1.6815f2f3e42edp+0, -0x1.5d5a48dad6b67p-2 },
+ { 0x1.661e39be1ac9ep+0, -0x1.57bde257d2769p-2 },
+ { 0x1.642bfa30ac371p+0, -0x1.52294fbf2af55p-2 },
+ { 0x1.623f1d916f323p+0, -0x1.4c9c7b598aa38p-2 },
+ { 0x1.60578da220f65p+0, -0x1.47174fc5ff560p-2 },
+ { 0x1.5e75349dea571p+0, -0x1.4199b7fa7b5cap-2 },
+ { 0x1.5c97fd387a75ap+0, -0x1.3c239f48cfb99p-2 },
+ { 0x1.5abfd2981f200p+0, -0x1.36b4f154d2aebp-2 },
+ { 0x1.58eca051dc99cp+0, -0x1.314d9a0ff32fbp-2 },
+ { 0x1.571e526d9df12p+0, -0x1.2bed85cca3cffp-2 },
+ { 0x1.5554d555b3fcbp+0, -0x1.2694a11421af9p-2 },
+ { 0x1.539015e2a20cdp+0, -0x1.2142d8d014fb2p-2 },
+ { 0x1.51d0014ee0164p+0, -0x1.1bf81a2c77776p-2 },
+ { 0x1.50148538cd9eep+0, -0x1.16b452a39c6a4p-2 },
+ { 0x1.4e5d8f9f698a1p+0, -0x1.11776ffa6c67ep-2 },
+ { 0x1.4cab0edca66bep+0, -0x1.0c416035020e0p-2 },
+ { 0x1.4afcf1a9db874p+0, -0x1.071211aa10fdap-2 },
+ { 0x1.495327136e16fp+0, -0x1.01e972e293b1bp-2 },
+ { 0x1.47ad9e84af28fp+0, -0x1.f98ee587fd434p-3 },
+ { 0x1.460c47b39ae15p+0, -0x1.ef5800ad716fbp-3 },
+ { 0x1.446f12b278001p+0, -0x1.e52e160484698p-3 },
+ { 0x1.42d5efdd720ecp+0, -0x1.db1104b19352ep-3 },
+ { 0x1.4140cfe001a0fp+0, -0x1.d100ac59e0bd6p-3 },
+ { 0x1.3fafa3b421f69p+0, -0x1.c6fced287c3bdp-3 },
+ { 0x1.3e225c9c8ece5p+0, -0x1.bd05a7b317c29p-3 },
+ { 0x1.3c98ec29a211ap+0, -0x1.b31abd229164fp-3 },
+ { 0x1.3b13442a413fep+0, -0x1.a93c0edadb0a3p-3 },
+ { 0x1.399156baa3c54p+0, -0x1.9f697ee30d7ddp-3 },
+ { 0x1.38131639b4cdbp+0, -0x1.95a2efa9aa40ap-3 },
+ { 0x1.36987540fbf53p+0, -0x1.8be843d796044p-3 },
+ { 0x1.352166b648f61p+0, -0x1.82395ecc477edp-3 },
+ { 0x1.33adddb3eb575p+0, -0x1.7896240966422p-3 },
+ { 0x1.323dcd99fc1d3p+0, -0x1.6efe77aca8c55p-3 },
+ { 0x1.30d129fefc7d2p+0, -0x1.65723e117ec5cp-3 },
+ { 0x1.2f67e6b72fe7dp+0, -0x1.5bf15c0955706p-3 },
+ { 0x1.2e01f7cf8b187p+0, -0x1.527bb6c111da1p-3 },
+ { 0x1.2c9f518ddc86ep+0, -0x1.491133c939f8fp-3 },
+ { 0x1.2b3fe86e5f413p+0, -0x1.3fb1b90c7fc58p-3 },
+ { 0x1.29e3b1211b25cp+0, -0x1.365d2cc485f8dp-3 },
+ { 0x1.288aa08b373cfp+0, -0x1.2d13758970de7p-3 },
+ { 0x1.2734abcaa8467p+0, -0x1.23d47a721fd47p-3 },
+ { 0x1.25e1c82459b81p+0, -0x1.1aa0229f25ec2p-3 },
+ { 0x1.2491eb1ad59c5p+0, -0x1.117655ddebc3bp-3 },
+ { 0x1.23450a54048b5p+0, -0x1.0856fbf83ab6bp-3 },
+ { 0x1.21fb1bb09e578p+0, -0x1.fe83fabbaa106p-4 },
+ { 0x1.20b415346d8f7p+0, -0x1.ec6e8507a56cdp-4 },
+ { 0x1.1f6fed179a1acp+0, -0x1.da6d68c7cc2eap-4 },
+ { 0x1.1e2e99b93c7b3p+0, -0x1.c88078462be0cp-4 },
+ { 0x1.1cf011a7a882ap+0, -0x1.b6a786a423565p-4 },
+ { 0x1.1bb44b97dba5ap+0, -0x1.a4e2676ac7f85p-4 },
+ { 0x1.1a7b3e66cdd4fp+0, -0x1.9330eea777e76p-4 },
+ { 0x1.1944e11dc56cdp+0, -0x1.8192f134d5ad9p-4 },
+ { 0x1.18112aebb1a6ep+0, -0x1.70084464f0538p-4 },
+ { 0x1.16e013231b7e9p+0, -0x1.5e90bdec5cb1fp-4 },
+ { 0x1.15b1913f156cfp+0, -0x1.4d2c3433c5536p-4 },
+ { 0x1.14859cdedde13p+0, -0x1.3bda7e219879ap-4 },
+ { 0x1.135c2dc68cfa4p+0, -0x1.2a9b732d27194p-4 },
+ { 0x1.12353bdb01684p+0, -0x1.196eeb2b10807p-4 },
+ { 0x1.1110bf25b85b4p+0, -0x1.0854be8ef8a7ep-4 },
+ { 0x1.0feeafd2f8577p+0, -0x1.ee998cb277432p-5 },
+ { 0x1.0ecf062c51c3bp+0, -0x1.ccadb79919fb9p-5 },
+ { 0x1.0db1baa076c8bp+0, -0x1.aae5b1d8618b0p-5 },
+ { 0x1.0c96c5bb3048ep+0, -0x1.89413015d7442p-5 },
+ { 0x1.0b7e20263e070p+0, -0x1.67bfe7bf158dep-5 },
+ { 0x1.0a67c2acd0ce3p+0, -0x1.46618f83941bep-5 },
+ { 0x1.0953a6391e982p+0, -0x1.2525df1b0618ap-5 },
+ { 0x1.0841c3caea380p+0, -0x1.040c8e2f77c6ap-5 },
+ { 0x1.07321489b13eap+0, -0x1.c62aad39f738ap-6 },
+ { 0x1.062491aee9904p+0, -0x1.847fe3bdead9cp-6 },
+ { 0x1.05193497a7cc5p+0, -0x1.43183683400acp-6 },
+ { 0x1.040ff6b5f5e9fp+0, -0x1.01f31c4e1d544p-6 },
+ { 0x1.0308d19aa6127p+0, -0x1.82201d1e6b69ap-7 },
+ { 0x1.0203beedb0c67p+0, -0x1.00dd0f3e1bfd6p-7 },
+ { 0x1.010037d38bcc2p+0, -0x1.ff6fe1feb4e53p-9 },
+ { 1.0, 0.0 },
+ { 0x1.fc06d493cca10p-1, 0x1.fe91885ec8e20p-8 },
+ { 0x1.f81e6ac3b918fp-1, 0x1.fc516f716296dp-7 },
+ { 0x1.f44546ef18996p-1, 0x1.7bb4dd70a015bp-6 },
+ { 0x1.f07b10382c84bp-1, 0x1.f84c99b34b674p-6 },
+ { 0x1.ecbf7070e59d4p-1, 0x1.39f9ce4fb2d71p-5 },
+ { 0x1.e91213f715939p-1, 0x1.7756c0fd22e78p-5 },
+ { 0x1.e572a9a75f7b7p-1, 0x1.b43ee82db8f3ap-5 },
+ { 0x1.e1e0e2c530207p-1, 0x1.f0b3fced60034p-5 },
+ { 0x1.de5c72d8a8be3p-1, 0x1.165bd78d4878ep-4 },
+ { 0x1.dae50fa5658ccp-1, 0x1.3425d2715ebe6p-4 },
+ { 0x1.d77a71145a2dap-1, 0x1.51b8bd91b7915p-4 },
+ { 0x1.d41c51166623ep-1, 0x1.6f15632c76a47p-4 },
+ { 0x1.d0ca6ba0bb29fp-1, 0x1.8c3c88ecbe503p-4 },
+ { 0x1.cd847e8e59681p-1, 0x1.a92ef077625dap-4 },
+ { 0x1.ca4a499693e00p-1, 0x1.c5ed5745fa006p-4 },
+ { 0x1.c71b8e399e821p-1, 0x1.e27876de1c993p-4 },
+ { 0x1.c3f80faf19077p-1, 0x1.fed104fce4cdcp-4 },
+ { 0x1.c0df92dc2b0ecp-1, 0x1.0d7bd9c17d78bp-3 },
+ { 0x1.bdd1de3cbb542p-1, 0x1.1b76986cef97bp-3 },
+ { 0x1.baceb9e1007a3p-1, 0x1.295913d24f750p-3 },
+ { 0x1.b7d5ef543e55ep-1, 0x1.37239fa295d17p-3 },
+ { 0x1.b4e749977d953p-1, 0x1.44d68dd78714bp-3 },
+ { 0x1.b20295155478ep-1, 0x1.52722ebe5d780p-3 },
+ { 0x1.af279f8e82be2p-1, 0x1.5ff6d12671f98p-3 },
+ { 0x1.ac5638197fdf3p-1, 0x1.6d64c2389484bp-3 },
+ { 0x1.a98e2f102e087p-1, 0x1.7abc4da40fddap-3 },
+ { 0x1.a6cf5606d05c1p-1, 0x1.87fdbda1e8452p-3 },
+ { 0x1.a4197fc04d746p-1, 0x1.95295b06a5f37p-3 },
+ { 0x1.a16c80293dc01p-1, 0x1.a23f6d34abbc5p-3 },
+ { 0x1.9ec82c4dc5bc9p-1, 0x1.af403a28e04f2p-3 },
+ { 0x1.9c2c5a491f534p-1, 0x1.bc2c06a85721ap-3 },
+ { 0x1.9998e1480b618p-1, 0x1.c903161240163p-3 },
+ { 0x1.970d9977c6c2dp-1, 0x1.d5c5aa93287ebp-3 },
+ { 0x1.948a5c023d212p-1, 0x1.e274051823fa9p-3 },
+ { 0x1.920f0303d6809p-1, 0x1.ef0e656300c16p-3 },
+ { 0x1.8f9b698a98b45p-1, 0x1.fb9509f05aa2ap-3 },
+ { 0x1.8d2f6b81726f6p-1, 0x1.04041821f37afp-2 },
+ { 0x1.8acae5bb55badp-1, 0x1.0a340a49b3029p-2 },
+ { 0x1.886db5d9275b8p-1, 0x1.105a7918a126dp-2 },
+ { 0x1.8617ba567c13cp-1, 0x1.1677819812b84p-2 },
+ { 0x1.83c8d27487800p-1, 0x1.1c8b405b40c0ep-2 },
+ { 0x1.8180de3c5dbe7p-1, 0x1.2295d16cfa6b1p-2 },
+ { 0x1.7f3fbe71cdb71p-1, 0x1.28975066318a2p-2 },
+ { 0x1.7d055498071c1p-1, 0x1.2e8fd855d86fcp-2 },
+ { 0x1.7ad182e54f65ap-1, 0x1.347f83d605e59p-2 },
+ { 0x1.78a42c3c90125p-1, 0x1.3a666d1244588p-2 },
+ { 0x1.767d342f76944p-1, 0x1.4044adb6f8ec4p-2 },
+ { 0x1.745c7ef26b00ap-1, 0x1.461a5f077558cp-2 },
+ { 0x1.7241f15769d0fp-1, 0x1.4be799e20b9c8p-2 },
+ { 0x1.702d70d396e41p-1, 0x1.51ac76a6b79dfp-2 },
+ { 0x1.6e1ee3700cd11p-1, 0x1.57690d5744a45p-2 },
+ { 0x1.6c162fc9cbe02p-1, 0x1.5d1d758e45217p-2 } }
};
diff --git a/sysdeps/aarch64/fpu/v_math.h b/sysdeps/aarch64/fpu/v_math.h
index 43efd8f..cfc87f8 100644
--- a/sysdeps/aarch64/fpu/v_math.h
+++ b/sysdeps/aarch64/fpu/v_math.h
@@ -30,15 +30,15 @@
#define V_NAME_D2(fun) _ZGVnN2vv_##fun
/* Shorthand helpers for declaring constants. */
-#define V2(x) \
- { \
- x, x \
- }
+#define V2(X) { X, X }
+#define V4(X) { X, X, X, X }
+#define V8(X) { X, X, X, X, X, X, X, X }
-#define V4(x) \
- { \
- x, x, x, x \
- }
+static inline int
+v_any_u16h (uint16x4_t x)
+{
+ return vget_lane_u64 (vreinterpret_u64_u16 (x), 0) != 0;
+}
static inline float32x4_t
v_f32 (float x)
@@ -63,6 +63,11 @@ v_any_u32 (uint32x4_t x)
/* assume elements in x are either 0 or -1u. */
return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0;
}
+static inline int
+v_any_u32h (uint32x2_t x)
+{
+ return vget_lane_u64 (vreinterpret_u64_u32 (x), 0) != 0;
+}
static inline float32x4_t
v_lookup_f32 (const float *tab, uint32x4_t idx)
{
diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
index e7d30b4..0abfd8b 100644
--- a/sysdeps/aarch64/fpu/vecmath_config.h
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@@ -42,8 +42,10 @@ extern const struct v_log_data
/* Shared data for vector log and log-derived routines (e.g. asinh). */
double poly[V_LOG_POLY_ORDER - 1];
double ln2;
- double invc[1 << V_LOG_TABLE_BITS];
- double logc[1 << V_LOG_TABLE_BITS];
+ struct
+ {
+ double invc, logc;
+ } table[1 << V_LOG_TABLE_BITS];
} __v_log_data attribute_hidden;
#define V_EXP_TABLE_BITS 7