aboutsummaryrefslogtreecommitdiff
path: root/sysdeps
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps')
-rw-r--r--sysdeps/aarch64/fpu/atan2_advsimd.c110
-rw-r--r--sysdeps/aarch64/fpu/atan2f_advsimd.c58
-rw-r--r--sysdeps/aarch64/fpu/atan_advsimd.c60
-rw-r--r--sysdeps/aarch64/fpu/coshf_advsimd.c28
-rw-r--r--sysdeps/aarch64/fpu/exp10f_advsimd.c60
-rw-r--r--sysdeps/aarch64/fpu/exp2f_advsimd.c54
-rw-r--r--sysdeps/aarch64/fpu/expf_advsimd.c54
-rw-r--r--sysdeps/aarch64/fpu/log10f_advsimd.c29
-rw-r--r--sysdeps/aarch64/fpu/log2f_advsimd.c29
-rw-r--r--sysdeps/aarch64/fpu/logf_advsimd.c48
-rw-r--r--sysdeps/aarch64/fpu/v_expf_inline.h49
-rw-r--r--sysdeps/arm/libm-test-ulps112
-rw-r--r--sysdeps/i386/fpu/libm-test-ulps168
-rw-r--r--sysdeps/ieee754/ldbl-96/test-totalorderl-ldbl-96.c4
-rw-r--r--sysdeps/unix/sysv/linux/syscall-names.list4
-rw-r--r--sysdeps/unix/sysv/linux/tst-clone3-internal.c4
-rw-r--r--sysdeps/unix/sysv/linux/tst-clone3.c4
-rw-r--r--sysdeps/unix/sysv/linux/x86/bits/platform/features.h2
-rw-r--r--sysdeps/x86/bits/platform/features.h2
-rw-r--r--sysdeps/x86/dl-cacheinfo.h4
-rw-r--r--sysdeps/x86/sys/platform/x86.h4
-rw-r--r--sysdeps/x86_64/Makefile4
-rw-r--r--sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c2
-rw-r--r--sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c2
-rw-r--r--sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c2
-rw-r--r--sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c2
-rw-r--r--sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c2
-rw-r--r--sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c2
28 files changed, 655 insertions, 248 deletions
diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c
index b1e7a9b..1a8f021 100644
--- a/sysdeps/aarch64/fpu/atan2_advsimd.c
+++ b/sysdeps/aarch64/fpu/atan2_advsimd.c
@@ -23,40 +23,57 @@
static const struct data
{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
float64x2_t pi_over_2;
- float64x2_t poly[20];
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
+ uint64x2_t zeroinfnan, minustwo;
} data = {
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
- the interval [2**-1022, 1.0]. */
- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
+ [2**-1022, 1.0]. */
+ .c0 = V2 (-0x1.5555555555555p-2),
+ .c1 = 0x1.99999999996c1p-3,
+ .c2 = V2 (-0x1.2492492478f88p-3),
+ .c3 = 0x1.c71c71bc3951cp-4,
+ .c4 = V2 (-0x1.745d160a7e368p-4),
+ .c5 = 0x1.3b139b6a88ba1p-4,
+ .c6 = V2 (-0x1.11100ee084227p-4),
+ .c7 = 0x1.e1d0f9696f63bp-5,
+ .c8 = V2 (-0x1.aebfe7b418581p-5),
+ .c9 = 0x1.842dbe9b0d916p-5,
+ .c10 = V2 (-0x1.5d30140ae5e99p-5),
+ .c11 = 0x1.338e31eb2fbbcp-5,
+ .c12 = V2 (-0x1.00e6eece7de8p-5),
+ .c13 = 0x1.860897b29e5efp-6,
+ .c14 = V2 (-0x1.0051381722a59p-6),
+ .c15 = 0x1.14e9dc19a4a4ep-7,
+ .c16 = V2 (-0x1.d0062b42fe3bfp-9),
+ .c17 = 0x1.17739e210171ap-10,
+ .c18 = V2 (-0x1.ab24da7be7402p-13),
+ .c19 = 0x1.358851160a528p-16,
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
+ .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1),
+ .minustwo = V2 (0xc000000000000000),
};
#define SignMask v_u64 (0x8000000000000000)
/* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */
static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp)
+special_case (float64x2_t y, float64x2_t x, float64x2_t ret,
+ uint64x2_t sign_xy, uint64x2_t cmp)
{
+ /* Account for the sign of x and y. */
+ ret = vreinterpretq_f64_u64 (
+ veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
return v_call2_f64 (atan2, y, x, ret, cmp);
}
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline uint64x2_t
-zeroinfnan (uint64x2_t i)
+zeroinfnan (uint64x2_t i, const struct data *d)
{
/* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */
- return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)),
- v_u64 (2 * asuint64 (INFINITY) - 1));
+ return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan);
}
/* Fast implementation of vector atan2.
@@ -66,12 +83,13 @@ zeroinfnan (uint64x2_t i)
want 0x1.92d628ab678cfp-1. */
float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
{
- const struct data *data_ptr = ptr_barrier (&data);
+ const struct data *d = ptr_barrier (&data);
uint64x2_t ix = vreinterpretq_u64_f64 (x);
uint64x2_t iy = vreinterpretq_u64_f64 (y);
- uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy));
+ uint64x2_t special_cases
+ = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d));
uint64x2_t sign_x = vandq_u64 (ix, SignMask);
uint64x2_t sign_y = vandq_u64 (iy, SignMask);
@@ -81,18 +99,18 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
float64x2_t ay = vabsq_f64 (y);
uint64x2_t pred_xlt0 = vcltzq_f64 (x);
- uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax);
+ uint64x2_t pred_aygtax = vcagtq_f64 (y, x);
/* Set up z for call to atan. */
float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
- float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax);
- float64x2_t z = vdivq_f64 (n, d);
+ float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax);
+ float64x2_t z = vdivq_f64 (n, q);
/* Work out the correct shift. */
- float64x2_t shift = vreinterpretq_f64_u64 (
- vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0))));
+ float64x2_t shift
+ = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo));
shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
- shift = vmulq_f64 (shift, data_ptr->pi_over_2);
+ shift = vmulq_f64 (shift, d->pi_over_2);
/* Calculate the polynomial approximation.
Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
@@ -103,20 +121,52 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
float64x2_t x2 = vmulq_f64 (z2, z2);
float64x2_t x4 = vmulq_f64 (x2, x2);
float64x2_t x8 = vmulq_f64 (x4, x4);
- float64x2_t ret
- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly),
- v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8);
+
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1719 = vld1q_f64 (&d->c17);
+
+ /* estrin_7. */
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+
+ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+ /* estrin_11. */
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+ float64x2_t ret = vfmaq_f64 (p07, p819, x8);
/* Finalize. y = shift + z + z^3 * P(z^2). */
ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
ret = vaddq_f64 (ret, shift);
+ if (__glibc_unlikely (v_any_u64 (special_cases)))
+ return special_case (y, x, ret, sign_xy, special_cases);
+
/* Account for the sign of x and y. */
ret = vreinterpretq_f64_u64 (
veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
- if (__glibc_unlikely (v_any_u64 (special_cases)))
- return special_case (y, x, ret, special_cases);
-
return ret;
}
diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c
index 56e610c..88daacd 100644
--- a/sysdeps/aarch64/fpu/atan2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c
@@ -22,34 +22,39 @@
static const struct data
{
- float32x4_t poly[8];
- float32x4_t pi_over_2;
+ float32x4_t c0, pi_over_2, c4, c6, c2;
+ float c1, c3, c5, c7;
+ uint32x4_t comp_const;
} data = {
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
[2**-128, 1.0].
Generated using fpminimax between FLT_MIN and 1. */
- .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
- V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
- V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
- .pi_over_2 = V4 (0x1.921fb6p+0f),
+ .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f,
+ .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f,
+ .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f,
+ .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f,
+ .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1),
};
#define SignMask v_u32 (0x80000000)
/* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp)
+special_case (float32x4_t y, float32x4_t x, float32x4_t ret,
+ uint32x4_t sign_xy, uint32x4_t cmp)
{
+ /* Account for the sign of y. */
+ ret = vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
return v_call2_f32 (atan2f, y, x, ret, cmp);
}
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline uint32x4_t
-zeroinfnan (uint32x4_t i)
+zeroinfnan (uint32x4_t i, const struct data *d)
{
/* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */
- return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)),
- v_u32 (2 * 0x7f800000lu - 1));
+ return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const);
}
/* Fast implementation of vector atan2f. Maximum observed error is
@@ -58,12 +63,13 @@ zeroinfnan (uint32x4_t i)
want 0x1.967f00p-1. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
{
- const struct data *data_ptr = ptr_barrier (&data);
+ const struct data *d = ptr_barrier (&data);
uint32x4_t ix = vreinterpretq_u32_f32 (x);
uint32x4_t iy = vreinterpretq_u32_f32 (y);
- uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy));
+ uint32x4_t special_cases
+ = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d));
uint32x4_t sign_x = vandq_u32 (ix, SignMask);
uint32x4_t sign_y = vandq_u32 (iy, SignMask);
@@ -77,14 +83,14 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
/* Set up z for call to atanf. */
float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
- float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax);
- float32x4_t z = vdivq_f32 (n, d);
+ float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax);
+ float32x4_t z = vdivq_f32 (n, q);
/* Work out the correct shift. */
float32x4_t shift = vreinterpretq_f32_u32 (
vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
- shift = vmulq_f32 (shift, data_ptr->pi_over_2);
+ shift = vmulq_f32 (shift, d->pi_over_2);
/* Calculate the polynomial approximation.
Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
@@ -96,23 +102,27 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
float32x4_t z2 = vmulq_f32 (z, z);
float32x4_t z4 = vmulq_f32 (z2, z2);
- float32x4_t ret = vfmaq_f32 (
- v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4,
- vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4)));
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1);
+ float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2);
+ float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3);
+ float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
+ float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
+
+ float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47));
/* y = shift + z * P(z^2). */
ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
- /* Account for the sign of y. */
- ret = vreinterpretq_f32_u32 (
- veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
-
if (__glibc_unlikely (v_any_u32 (special_cases)))
{
- return special_case (y, x, ret, special_cases);
+ return special_case (y, x, ret, sign_xy, special_cases);
}
- return ret;
+ /* Account for the sign of y. */
+ return vreinterpretq_f32_u32 (
+ veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy));
}
libmvec_hidden_def (V_NAME_F2 (atan2))
HALF_WIDTH_ALIAS_F2(atan2)
diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c
index a962be0..14f1809 100644
--- a/sysdeps/aarch64/fpu/atan_advsimd.c
+++ b/sysdeps/aarch64/fpu/atan_advsimd.c
@@ -22,21 +22,22 @@
static const struct data
{
+ float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
float64x2_t pi_over_2;
- float64x2_t poly[20];
+ double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
} data = {
/* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
[2**-1022, 1.0]. */
- .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3),
- V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4),
- V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4),
- V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5),
- V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5),
- V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5),
- V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6),
- V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7),
- V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10),
- V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), },
+ .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3,
+ .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4,
+ .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4,
+ .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5,
+ .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5,
+ .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5,
+ .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6,
+ .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7,
+ .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10,
+ .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16,
.pi_over_2 = V2 (0x1.921fb54442d18p+0),
};
@@ -52,6 +53,11 @@ static const struct data
float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
{
const struct data *d = ptr_barrier (&data);
+ float64x2_t c13 = vld1q_f64 (&d->c1);
+ float64x2_t c57 = vld1q_f64 (&d->c5);
+ float64x2_t c911 = vld1q_f64 (&d->c9);
+ float64x2_t c1315 = vld1q_f64 (&d->c13);
+ float64x2_t c1719 = vld1q_f64 (&d->c17);
/* Small cases, infs and nans are supported by our approximation technique,
but do not set fenv flags correctly. Only trigger special case if we need
@@ -90,9 +96,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
float64x2_t x2 = vmulq_f64 (z2, z2);
float64x2_t x4 = vmulq_f64 (x2, x2);
float64x2_t x8 = vmulq_f64 (x4, x4);
- float64x2_t y
- = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly),
- v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8);
+
+ /* estrin_7. */
+ float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+ float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+ float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+
+ float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+ float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+ float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+
+ float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+
+ /* estrin_11. */
+ float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+ float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+ float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+
+ float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
+ float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
+ float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+
+ float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
+ float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
+ float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+
+ float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
+ float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+
+ float64x2_t y = vfmaq_f64 (p07, p819, x8);
/* Finalize. y = shift + z + z^3 * P(z^2). */
y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c
index c1ab492..cd5c866 100644
--- a/sysdeps/aarch64/fpu/coshf_advsimd.c
+++ b/sysdeps/aarch64/fpu/coshf_advsimd.c
@@ -23,19 +23,27 @@
static const struct data
{
struct v_expf_data expf_consts;
- uint32x4_t tiny_bound, special_bound;
+ uint32x4_t tiny_bound;
+ float32x4_t bound;
+#if WANT_SIMD_EXCEPT
+ uint32x4_t special_bound;
+#endif
} data = {
.expf_consts = V_EXPF_DATA,
.tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */
/* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */
+ .bound = V4 (0x1.5a92d8p+6),
+#if WANT_SIMD_EXCEPT
.special_bound = V4 (0x42ad496c),
+#endif
};
#if !WANT_SIMD_EXCEPT
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t,
+ uint32x4_t special)
{
- return v_call_f32 (coshf, x, y, special);
+ return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special);
}
#endif
@@ -47,14 +55,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
-
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered correctly, fall back to the scalar
variant for all inputs if any input is a special value or above the bound
at which expf overflows. */
+ float32x4_t ax = vabsq_f32 (x);
+ uint32x4_t iax = vreinterpretq_u32_f32 (ax);
+ uint32x4_t special = vcgeq_u32 (iax, d->special_bound);
if (__glibc_unlikely (v_any_u32 (special)))
return v_call_f32 (coshf, x, x, v_u32 (-1));
@@ -63,10 +70,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
input to 0, which will generate no exceptions. */
if (__glibc_unlikely (v_any_u32 (tiny)))
ax = v_zerofy_f32 (ax, tiny);
+ float32x4_t t = v_expf_inline (ax, &d->expf_consts);
+#else
+ uint32x4_t special = vcageq_f32 (x, d->bound);
+ float32x4_t t = v_expf_inline (x, &d->expf_consts);
#endif
/* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */
- float32x4_t t = v_expf_inline (ax, &d->expf_consts);
float32x4_t half_t = vmulq_n_f32 (t, 0.5);
float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t);
@@ -75,7 +85,7 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x)
return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t));
#else
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vaddq_f32 (half_t, half_over_t), special);
+ return special_case (x, half_t, half_over_t, special);
#endif
return vaddq_f32 (half_t, half_over_t);
diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
index cf53e73..55d9cd8 100644
--- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
@@ -18,16 +18,15 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
#define ScaleBound 192.0f
static const struct data
{
- float32x4_t poly[5];
- float log10_2_and_inv[4];
- float32x4_t shift;
-
+ float32x4_t c0, c1, c3;
+ float log10_2_high, log10_2_low, c2, c4;
+ float32x4_t inv_log10_2, special_bound;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t scale_thresh;
#endif
@@ -37,19 +36,24 @@ static const struct data
rel error: 0x1.89dafa3p-24
abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
maxerr: 1.85943 +0.5 ulp. */
- .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
- V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
- .shift = V4 (0x1.8p23f),
-
- /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
- .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
+ .c0 = V4 (0x1.26bb16p+1f),
+ .c1 = V4 (0x1.5350d2p+1f),
+ .c2 = 0x1.04744ap+1f,
+ .c3 = V4 (0x1.2d8176p+0f),
+ .c4 = 0x1.12b41ap-1f,
+ .inv_log10_2 = V4 (0x1.a934fp+1),
+ .log10_2_high = 0x1.344136p-2,
+ .log10_2_low = 0x1.ec10cp-27,
+ /* rint (log2 (2^127 / (1 + sqrt (2)))). */
+ .special_bound = V4 (126.0f),
+ .exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.scale_thresh = V4 (ScaleBound)
#endif
};
-#define ExponentBias v_u32 (0x3f800000)
-
#if WANT_SIMD_EXCEPT
# define SpecialBound 38.0f /* rint(log10(2^127)). */
@@ -67,17 +71,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
-# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
+# define SpecialBound 126.0f
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
@@ -112,23 +114,23 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
with poly(r) in [1/sqrt(2), sqrt(2)] and
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
- float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv);
- float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0);
- float32x4_t n = vsubq_f32 (z, d->shift);
- float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1);
- r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2);
- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+ float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0);
+ r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
- uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound));
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t poly
- = vfmaq_f32 (vmulq_f32 (r, d->poly[0]),
- v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2);
+ float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2);
+ float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3);
+ float32x4_t p14 = vfmaq_f32 (p12, r2, p34);
+ float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/exp2f_advsimd.c b/sysdeps/aarch64/fpu/exp2f_advsimd.c
index 69e0b19..a4220da 100644
--- a/sysdeps/aarch64/fpu/exp2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp2f_advsimd.c
@@ -21,24 +21,28 @@
static const struct data
{
- float32x4_t poly[5];
- uint32x4_t exponent_bias;
+ float32x4_t c1, c3;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
- float32x4_t special_bound, scale_thresh;
+ float32x4_t scale_thresh, special_bound;
#endif
+ float c0, c2, c4, zero;
} data = {
/* maxerr: 1.962 ulp. */
- .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f),
- V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) },
+ .c0 = 0x1.59977ap-10f,
+ .c1 = V4 (0x1.3ce9e4p-7f),
+ .c2 = 0x1.c6bd32p-5f,
+ .c3 = V4 (0x1.ebf9bcp-3f),
+ .c4 = 0x1.62e422p-1f,
.exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
#endif
};
-#define C(i) d->poly[i]
-
#if WANT_SIMD_EXCEPT
# define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */
@@ -55,16 +59,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
@@ -80,13 +81,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly;
- uint32x4_t cmp, e;
#if WANT_SIMD_EXCEPT
/* asuint(|x|) - TinyBound >= BigBound - TinyBound. */
uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x));
- cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
+ uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound);
float32x4_t xm = x;
/* If any lanes are special, mask them with 1 and retain a copy of x to allow
special_case to fix special lanes later. This is only necessary if fenv
@@ -95,23 +94,24 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x)
x = vbslq_f32 (cmp, v_f32 (1), x);
#endif
- /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = n + r, with r in [-1/2, 1/2]. */
- n = vrndaq_f32 (x);
- r = vsubq_f32 (x, n);
- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+ /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ x = n + r, with r in [-1/2, 1/2]. */
+ float32x4_t n = vrndaq_f32 (x);
+ float32x4_t r = vsubq_f32 (x, n);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
- cmp = vcagtq_f32 (n, d->special_bound);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
- r2 = vmulq_f32 (r, r);
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t c024 = vld1q_f32 (&d->c0);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_laneq_f32 (r, c024, 2);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
index 5c9cb72..70f137e 100644
--- a/sysdeps/aarch64/fpu/expf_advsimd.c
+++ b/sysdeps/aarch64/fpu/expf_advsimd.c
@@ -21,20 +21,25 @@
static const struct data
{
- float32x4_t poly[5];
- float32x4_t inv_ln2, ln2_hi, ln2_lo;
- uint32x4_t exponent_bias;
+ float32x4_t c1, c3, c4, inv_ln2;
+ float ln2_hi, ln2_lo, c0, c2;
+ uint32x4_t exponent_bias, special_offset, special_bias;
#if !WANT_SIMD_EXCEPT
float32x4_t special_bound, scale_thresh;
#endif
} data = {
/* maxerr: 1.45358 +0.5 ulp. */
- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
+ .c0 = 0x1.0e4020p-7f,
+ .c1 = V4 (0x1.573e2ep-5f),
+ .c2 = 0x1.555e66p-3f,
+ .c3 = V4 (0x1.fffdb6p-2f),
+ .c4 = V4 (0x1.ffffecp-1f),
.inv_ln2 = V4 (0x1.715476p+0f),
- .ln2_hi = V4 (0x1.62e4p-1f),
- .ln2_lo = V4 (0x1.7f7d1cp-20f),
+ .ln2_hi = 0x1.62e4p-1f,
+ .ln2_lo = 0x1.7f7d1cp-20f,
.exponent_bias = V4 (0x3f800000),
+ .special_offset = V4 (0x82000000),
+ .special_bias = V4 (0x7f000000),
#if !WANT_SIMD_EXCEPT
.special_bound = V4 (126.0f),
.scale_thresh = V4 (192.0f),
@@ -59,19 +64,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
#else
-# define SpecialOffset v_u32 (0x82000000)
-# define SpecialBias v_u32 (0x7f000000)
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t scale, const struct data *d)
{
/* 2^n may overflow, break it up into s1*s2. */
- uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
- float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+ uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
+ float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
float32x4_t r2 = vmulq_f32 (s1, s1);
+ // (s2 + p*s2)*s1 = s2(p+1)s1
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
/* Similar to r1 but avoids double rounding in the subnormal range. */
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
@@ -84,12 +87,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, r, r2, scale, p, q, poly;
- uint32x4_t cmp, e;
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
#if WANT_SIMD_EXCEPT
/* asuint(x) - TinyBound >= BigBound - TinyBound. */
- cmp = vcgeq_u32 (
+ uint32x4_t cmp = vcgeq_u32 (
vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
TinyBound),
SpecialBound);
@@ -103,22 +105,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x)
/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
- r = vfmsq_f32 (x, n, d->ln2_hi);
- r = vfmsq_f32 (r, n, d->ln2_lo);
- e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
- scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
#if !WANT_SIMD_EXCEPT
- cmp = vcagtq_f32 (n, d->special_bound);
+ uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
#endif
- r2 = vmulq_f32 (r, r);
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t r2 = vmulq_f32 (r, r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
if (__glibc_unlikely (v_any_u32 (cmp)))
#if WANT_SIMD_EXCEPT
diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c
index 82228b5..0d792c3 100644
--- a/sysdeps/aarch64/fpu/log10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/log10f_advsimd.c
@@ -18,21 +18,25 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
static const struct data
{
+ float32x4_t c0, c2, c4, c6, inv_ln10, ln2;
uint32x4_t off, offset_lower_bound;
uint16x8_t special_bound;
uint32x4_t mantissa_mask;
- float32x4_t poly[8];
- float32x4_t inv_ln10, ln2;
+ float c1, c3, c5, c7;
} data = {
/* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in
[-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */
- .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f),
- V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f),
- V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) },
+ .c0 = V4 (-0x1.bcb79cp-3f),
+ .c1 = 0x1.2879c8p-3f,
+ .c2 = V4 (-0x1.bcd472p-4f),
+ .c3 = 0x1.6408f8p-4f,
+ .c4 = V4 (-0x1.246f8p-4f),
+ .c5 = 0x1.f0e514p-5f,
+ .c6 = V4 (-0x1.0fc92cp-4f),
+ .c7 = 0x1.f5f76ap-5f,
.ln2 = V4 (0x1.62e43p-1f),
.inv_ln10 = V4 (0x1.bcb7b2p-2f),
/* Lower bound is the smallest positive normal float 0x00800000. For
@@ -62,7 +66,7 @@ special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
-
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
/* To avoid having to mov x out of the way, keep u after offset has been
applied, and recover x by adding the offset back in the special-case
handler. */
@@ -81,7 +85,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x)
/* y = log10(1+r) + n * log10(2). */
float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly);
+
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+
+ float32x4_t p47 = vfmaq_f32 (c45, r2, c67);
+ float32x4_t p27 = vfmaq_f32 (c23, r2, p47);
+ float32x4_t poly = vfmaq_f32 (c01, r2, p27);
+
/* y = Log10(2) * n + poly * InvLn(10). */
float32x4_t y = vfmaq_f32 (r, d->ln2, n);
y = vmulq_f32 (y, d->inv_ln10);
diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c
index 84effe4..116c36c 100644
--- a/sysdeps/aarch64/fpu/log2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/log2f_advsimd.c
@@ -18,22 +18,27 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
static const struct data
{
+ float32x4_t c0, c2, c4, c6, c8;
uint32x4_t off, offset_lower_bound;
uint16x8_t special_bound;
uint32x4_t mantissa_mask;
- float32x4_t poly[9];
+ float c1, c3, c5, c7;
} data = {
/* Coefficients generated using Remez algorithm approximate
log2(1+r)/r for r in [ -1/3, 1/3 ].
rel error: 0x1.c4c4b0cp-26. */
- .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
- V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f),
- V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f),
- V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) },
+ .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */
+ .c1 = -0x1.715458p-1f,
+ .c2 = V4 (0x1.ec701cp-2f),
+ .c3 = -0x1.7171a4p-2f,
+ .c4 = V4 (0x1.27a0b8p-2f),
+ .c5 = -0x1.e5143ep-3f,
+ .c6 = V4 (0x1.9d8ecap-3f),
+ .c7 = -0x1.c675bp-3f,
+ .c8 = V4 (0x1.9e495p-3f),
/* Lower bound is the smallest positive normal float 0x00800000. For
optimised register use subnormals are detected after offset has been
subtracted, so lower bound is 0x0080000 - offset (which wraps around). */
@@ -79,11 +84,21 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x)
/* y = log2(1+r) + n. */
float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly);
+
+ float32x4_t c1357 = vld1q_f32 (&d->c1);
+ float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0);
+ float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1);
+ float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2);
+ float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3);
+ float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8);
+ float32x4_t p48 = vfmaq_f32 (c45, r2, p68);
+ float32x4_t p28 = vfmaq_f32 (c23, r2, p48);
+ float32x4_t p = vfmaq_f32 (c01, r2, p28);
if (__glibc_unlikely (v_any_u16h (special)))
return special_case (n, u_off, p, r, special, d);
return vfmaq_f32 (n, p, r);
}
+
libmvec_hidden_def (V_NAME_F1 (log2))
HALF_WIDTH_ALIAS_F1 (log2)
diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c
index c20dbfd..d9e64c7 100644
--- a/sysdeps/aarch64/fpu/logf_advsimd.c
+++ b/sysdeps/aarch64/fpu/logf_advsimd.c
@@ -21,16 +21,19 @@
static const struct data
{
- uint32x4_t off, offset_lower_bound;
+ float32x4_t c2, c4, c6, ln2;
+ uint32x4_t off, offset_lower_bound, mantissa_mask;
uint16x8_t special_bound;
- uint32x4_t mantissa_mask;
- float32x4_t poly[7];
- float32x4_t ln2;
+ float c1, c3, c5, c0;
} data = {
/* 3.34 ulp error. */
- .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f),
- V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f),
- V4 (-0x1.ffffc8p-2f) },
+ .c0 = -0x1.3e737cp-3f,
+ .c1 = 0x1.5a9aa2p-3f,
+ .c2 = V4 (-0x1.4f9934p-3f),
+ .c3 = 0x1.961348p-3f,
+ .c4 = V4 (-0x1.00187cp-2f),
+ .c5 = 0x1.555d7cp-2f,
+ .c6 = V4 (-0x1.ffffc8p-2f),
.ln2 = V4 (0x1.62e43p-1f),
/* Lower bound is the smallest positive normal float 0x00800000. For
optimised register use subnormals are detected after offset has been
@@ -41,8 +44,6 @@ static const struct data
.mantissa_mask = V4 (0x007fffff)
};
-#define P(i) d->poly[7 - i]
-
static float32x4_t VPCS_ATTR NOINLINE
special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
uint16x4_t cmp, const struct data *d)
@@ -55,33 +56,30 @@ special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2,
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- float32x4_t n, p, q, r, r2, y;
- uint32x4_t u, u_off;
- uint16x4_t cmp;
+ float32x4_t c1350 = vld1q_f32 (&d->c1);
/* To avoid having to mov x out of the way, keep u after offset has been
applied, and recover x by adding the offset back in the special-case
handler. */
- u_off = vreinterpretq_u32_f32 (x);
+ uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off);
/* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */
- u_off = vsubq_u32 (u_off, d->off);
- n = vcvtq_f32_s32 (
+ float32x4_t n = vcvtq_f32_s32 (
vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */
- u = vandq_u32 (u_off, d->mantissa_mask);
- u = vaddq_u32 (u, d->off);
- r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
+ uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
+ vget_low_u16 (d->special_bound));
- cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound),
- vget_low_u16 (d->special_bound));
+ uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off);
+ float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f));
/* y = log(1+r) + n*ln2. */
- r2 = vmulq_f32 (r, r);
+ float32x4_t r2 = vmulq_f32 (r, r);
/* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */
- p = vfmaq_f32 (P (5), P (6), r);
- q = vfmaq_f32 (P (3), P (4), r);
- y = vfmaq_f32 (P (1), P (2), r);
- p = vfmaq_f32 (p, P (7), r2);
+ float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0);
+ float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1);
+ float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2);
+ p = vfmaq_laneq_f32 (p, r2, c1350, 3);
+
q = vfmaq_f32 (q, p, r2);
y = vfmaq_f32 (y, q, r2);
p = vfmaq_f32 (r, d->ln2, n);
diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h
index 08b06e0..eacd2af 100644
--- a/sysdeps/aarch64/fpu/v_expf_inline.h
+++ b/sysdeps/aarch64/fpu/v_expf_inline.h
@@ -24,50 +24,45 @@
struct v_expf_data
{
- float32x4_t poly[5];
- float32x4_t shift;
- float invln2_and_ln2[4];
+ float ln2_hi, ln2_lo, c0, c2;
+ float32x4_t inv_ln2, c1, c3, c4;
+ /* asuint(1.0f). */
+ uint32x4_t exponent_bias;
};
/* maxerr: 1.45358 +0.5 ulp. */
#define V_EXPF_DATA \
{ \
- .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \
- V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \
- .shift = V4 (0x1.8p23f), \
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \
+ .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
+ .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \
}
-#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */
-#define C(i) d->poly[i]
-
static inline float32x4_t
v_expf_inline (float32x4_t x, const struct v_expf_data *d)
{
- /* Helper routine for calculating exp(x).
+ /* Helper routine for calculating exp(ax).
Copied from v_expf.c, with all special-case handling removed - the
calling routine should handle special values if required. */
- /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
- x = ln2*n + r, with r in [-ln2/2, ln2/2]. */
- float32x4_t n, r, z;
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0);
- n = vsubq_f32 (z, d->shift);
- r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1);
- r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2);
- uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
- float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
+ /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+ ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */
+ float32x4_t ax = vabsq_f32 (x);
+ float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi);
+ float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2));
+ float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0);
+ r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
+ uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
+ float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
/* Custom order-4 Estrin avoids building high order monomial. */
float32x4_t r2 = vmulq_f32 (r, r);
- float32x4_t p, q, poly;
- p = vfmaq_f32 (C (1), C (0), r);
- q = vfmaq_f32 (C (3), C (2), r);
+ float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2);
+ float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
- p = vmulq_f32 (C (4), r);
- poly = vfmaq_f32 (p, q, r2);
+ p = vmulq_f32 (d->c4, r);
+ float32x4_t poly = vfmaq_f32 (p, q, r2);
return vfmaq_f32 (scale, poly, scale);
}
-
#endif
diff --git a/sysdeps/arm/libm-test-ulps b/sysdeps/arm/libm-test-ulps
index c80122d..38bae47 100644
--- a/sysdeps/arm/libm-test-ulps
+++ b/sysdeps/arm/libm-test-ulps
@@ -33,6 +33,22 @@ Function: "acosh_upward":
double: 2
float: 2
+Function: "acospi":
+double: 1
+float: 1
+
+Function: "acospi_downward":
+double: 1
+float: 2
+
+Function: "acospi_towardzero":
+double: 1
+float: 2
+
+Function: "acospi_upward":
+double: 2
+float: 1
+
Function: "asin":
double: 1
float: 1
@@ -65,6 +81,22 @@ Function: "asinh_upward":
double: 3
float: 3
+Function: "asinpi":
+double: 1
+float: 1
+
+Function: "asinpi_downward":
+double: 1
+float: 1
+
+Function: "asinpi_towardzero":
+double: 1
+float: 1
+
+Function: "asinpi_upward":
+double: 2
+float: 1
+
Function: "atan":
double: 1
float: 1
@@ -84,6 +116,22 @@ Function: "atan2_upward":
double: 1
float: 2
+Function: "atan2pi":
+double: 1
+float: 1
+
+Function: "atan2pi_downward":
+double: 1
+float: 3
+
+Function: "atan2pi_towardzero":
+double: 1
+float: 2
+
+Function: "atan2pi_upward":
+double: 1
+float: 3
+
Function: "atan_downward":
double: 1
float: 2
@@ -112,6 +160,22 @@ Function: "atanh_upward":
double: 3
float: 3
+Function: "atanpi":
+double: 1
+float: 1
+
+Function: "atanpi_downward":
+double: 1
+float: 2
+
+Function: "atanpi_towardzero":
+double: 1
+float: 1
+
+Function: "atanpi_upward":
+double: 1
+float: 1
+
Function: "cabs":
double: 1
@@ -535,6 +599,22 @@ Function: "cosh_upward":
double: 2
float: 2
+Function: "cospi":
+double: 1
+float: 1
+
+Function: "cospi_downward":
+double: 1
+float: 1
+
+Function: "cospi_towardzero":
+double: 1
+float: 1
+
+Function: "cospi_upward":
+double: 1
+float: 1
+
Function: Real part of "cpow":
double: 2
float: 5
@@ -1078,6 +1158,22 @@ Function: "sinh_upward":
double: 3
float: 3
+Function: "sinpi":
+double: 1
+float: 1
+
+Function: "sinpi_downward":
+double: 2
+float: 2
+
+Function: "sinpi_towardzero":
+double: 2
+float: 1
+
+Function: "sinpi_upward":
+double: 2
+float: 2
+
Function: "tan_downward":
double: 1
@@ -1103,6 +1199,22 @@ Function: "tanh_upward":
double: 3
float: 3
+Function: "tanpi":
+double: 2
+float: 2
+
+Function: "tanpi_downward":
+double: 2
+float: 3
+
+Function: "tanpi_towardzero":
+double: 2
+float: 3
+
+Function: "tanpi_upward":
+double: 2
+float: 4
+
Function: "tgamma":
double: 9
diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
index c06da68..020c928 100644
--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
@@ -39,6 +39,30 @@ double: 1
float128: 3
ldouble: 3
+Function: "acospi":
+double: 1
+float: 1
+float128: 1
+ldouble: 1
+
+Function: "acospi_downward":
+double: 1
+float: 1
+float128: 1
+ldouble: 3
+
+Function: "acospi_towardzero":
+double: 1
+float: 1
+float128: 1
+ldouble: 3
+
+Function: "acospi_upward":
+double: 1
+float: 1
+float128: 2
+ldouble: 2
+
Function: "asin":
double: 1
float128: 1
@@ -83,6 +107,30 @@ float: 1
float128: 4
ldouble: 5
+Function: "asinpi":
+double: 1
+float: 1
+float128: 1
+ldouble: 2
+
+Function: "asinpi_downward":
+double: 1
+float: 1
+float128: 1
+ldouble: 2
+
+Function: "asinpi_towardzero":
+double: 1
+float: 1
+float128: 1
+ldouble: 2
+
+Function: "asinpi_upward":
+double: 1
+float: 1
+float128: 2
+ldouble: 2
+
Function: "atan":
double: 1
float128: 1
@@ -111,6 +159,30 @@ float: 1
float128: 2
ldouble: 1
+Function: "atan2pi":
+double: 1
+float: 1
+float128: 2
+ldouble: 1
+
+Function: "atan2pi_downward":
+double: 1
+float: 1
+float128: 2
+ldouble: 2
+
+Function: "atan2pi_towardzero":
+double: 1
+float: 1
+float128: 2
+ldouble: 2
+
+Function: "atan2pi_upward":
+double: 1
+float: 1
+float128: 2
+ldouble: 2
+
Function: "atan_downward":
double: 1
float: 1
@@ -152,6 +224,30 @@ float: 1
float128: 4
ldouble: 5
+Function: "atanpi":
+double: 1
+float: 1
+float128: 1
+ldouble: 1
+
+Function: "atanpi_downward":
+double: 1
+float: 1
+float128: 1
+ldouble: 1
+
+Function: "atanpi_towardzero":
+double: 1
+float: 1
+float128: 1
+ldouble: 1
+
+Function: "atanpi_upward":
+double: 1
+float: 1
+float128: 2
+ldouble: 1
+
Function: "cabs":
double: 1
float128: 1
@@ -792,6 +888,30 @@ float: 2
float128: 3
ldouble: 3
+Function: "cospi":
+double: 1
+float: 1
+float128: 1
+ldouble: 1
+
+Function: "cospi_downward":
+double: 1
+float: 1
+float128: 2
+ldouble: 1
+
+Function: "cospi_towardzero":
+double: 1
+float: 1
+float128: 2
+ldouble: 1
+
+Function: "cospi_upward":
+double: 1
+float: 1
+float128: 1
+ldouble: 1
+
Function: Real part of "cpow":
double: 2
float: 5
@@ -1613,6 +1733,30 @@ float: 3
float128: 4
ldouble: 5
+Function: "sinpi":
+double: 1
+float: 1
+float128: 1
+ldouble: 1
+
+Function: "sinpi_downward":
+double: 2
+float: 2
+float128: 2
+ldouble: 2
+
+Function: "sinpi_towardzero":
+double: 2
+float: 1
+float128: 1
+ldouble: 1
+
+Function: "sinpi_upward":
+double: 2
+float: 2
+float128: 2
+ldouble: 2
+
Function: "tan":
float128: 1
ldouble: 2
@@ -1656,6 +1800,30 @@ float: 3
float128: 3
ldouble: 4
+Function: "tanpi":
+double: 2
+float: 2
+float128: 2
+ldouble: 2
+
+Function: "tanpi_downward":
+double: 2
+float: 3
+float128: 4
+ldouble: 4
+
+Function: "tanpi_towardzero":
+double: 2
+float: 3
+float128: 4
+ldouble: 4
+
+Function: "tanpi_upward":
+double: 2
+float: 4
+float128: 3
+ldouble: 4
+
Function: "tgamma":
double: 9
float128: 4
diff --git a/sysdeps/ieee754/ldbl-96/test-totalorderl-ldbl-96.c b/sysdeps/ieee754/ldbl-96/test-totalorderl-ldbl-96.c
index 5d64da8..aa88ba6 100644
--- a/sysdeps/ieee754/ldbl-96/test-totalorderl-ldbl-96.c
+++ b/sysdeps/ieee754/ldbl-96/test-totalorderl-ldbl-96.c
@@ -46,9 +46,9 @@ do_test (void)
SET_LDOUBLE_WORDS (ldy, 0x7fff,
(tests[i] >> 32) | 0x80000000,
tests[i] & 0xffffffffULL);
- SET_LDOUBLE_WORDS (ldnx, 0xffff,
+ SET_LDOUBLE_WORDS (ldnx, -1,
tests[i] >> 32, tests[i] & 0xffffffffULL);
- SET_LDOUBLE_WORDS (ldny, 0xffff,
+ SET_LDOUBLE_WORDS (ldny, -1,
(tests[i] >> 32) | 0x80000000,
tests[i] & 0xffffffffULL);
bool to1 = totalorderl (&ldx, &ldy);
diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
index aa5b479..d31938a 100644
--- a/sysdeps/unix/sysv/linux/syscall-names.list
+++ b/sysdeps/unix/sysv/linux/syscall-names.list
@@ -21,8 +21,8 @@
# This file can list all potential system calls. The names are only
# used if the installed kernel headers also provide them.
-# The list of system calls is current as of Linux 6.11.
-kernel 6.11
+# The list of system calls is current as of Linux 6.12.
+kernel 6.12
FAST_atomic_update
FAST_cmpxchg
diff --git a/sysdeps/unix/sysv/linux/tst-clone3-internal.c b/sysdeps/unix/sysv/linux/tst-clone3-internal.c
index 2f0b200..387f673 100644
--- a/sysdeps/unix/sysv/linux/tst-clone3-internal.c
+++ b/sysdeps/unix/sysv/linux/tst-clone3-internal.c
@@ -54,7 +54,7 @@ f (void *a)
} while (0)
static inline int
-futex_wait (int *futexp, int val)
+futex_wait (_Atomic int *futexp, int val)
{
#ifdef __NR_futex
return syscall (__NR_futex, futexp, FUTEX_WAIT, val);
@@ -75,7 +75,7 @@ do_test (void)
/* Initialize with a known value. ctid is set to zero by the kernel after the
cloned thread has exited. */
#define CTID_INIT_VAL 1
- pid_t ctid = CTID_INIT_VAL;
+ _Atomic pid_t ctid = CTID_INIT_VAL;
pid_t tid;
struct clone_args clone_args =
diff --git a/sysdeps/unix/sysv/linux/tst-clone3.c b/sysdeps/unix/sysv/linux/tst-clone3.c
index 77b8731..a32d649 100644
--- a/sysdeps/unix/sysv/linux/tst-clone3.c
+++ b/sysdeps/unix/sysv/linux/tst-clone3.c
@@ -54,7 +54,7 @@ f (void *a)
} while (0)
static inline int
-futex_wait (int *futexp, int val)
+futex_wait (_Atomic int *futexp, int val)
{
#ifdef __NR_futex
return syscall (__NR_futex, futexp, FUTEX_WAIT, val);
@@ -75,7 +75,7 @@ do_test (void)
/* Initialize with a known value. ctid is set to zero by the kernel after the
cloned thread has exited. */
#define CTID_INIT_VAL 1
- pid_t ctid = CTID_INIT_VAL;
+ _Atomic pid_t ctid = CTID_INIT_VAL;
pid_t tid;
#if _STACK_GROWS_DOWN
diff --git a/sysdeps/unix/sysv/linux/x86/bits/platform/features.h b/sysdeps/unix/sysv/linux/x86/bits/platform/features.h
index 7704feb..1e63743 100644
--- a/sysdeps/unix/sysv/linux/x86/bits/platform/features.h
+++ b/sysdeps/unix/sysv/linux/x86/bits/platform/features.h
@@ -28,7 +28,7 @@ enum
x86_feature_1_shstk = 1U << 1
};
-static __inline__ _Bool
+static __inline__ bool
x86_cpu_cet_active (unsigned int __index)
{
#ifdef __x86_64__
diff --git a/sysdeps/x86/bits/platform/features.h b/sysdeps/x86/bits/platform/features.h
index f024892..676ad00 100644
--- a/sysdeps/x86/bits/platform/features.h
+++ b/sysdeps/x86/bits/platform/features.h
@@ -20,7 +20,7 @@
# error "Never include <bits/platform/features.h> directly; use <sys/platform/x86.h> instead."
#endif
-static __inline__ _Bool
+static __inline__ bool
x86_cpu_cet_active (unsigned int __index)
{
return false;
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index e957950..6a0a30b 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1021,11 +1021,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
non_temporal_threshold = maximum_non_temporal_threshold;
/* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
- unsigned int minimum_rep_movsb_threshold;
+ unsigned long int minimum_rep_movsb_threshold;
/* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
threshold is 2048 * (VEC_SIZE / 16). */
- unsigned int rep_movsb_threshold;
+ unsigned long int rep_movsb_threshold;
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
&& !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
{
diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h
index b8066cc..1f53780 100644
--- a/sysdeps/x86/sys/platform/x86.h
+++ b/sysdeps/x86/sys/platform/x86.h
@@ -30,7 +30,7 @@ __BEGIN_DECLS
extern const struct cpuid_feature *__x86_get_cpuid_feature_leaf (unsigned int)
__attribute__ ((pure));
-static __inline__ _Bool
+static __inline__ bool
x86_cpu_present (unsigned int __index)
{
const struct cpuid_feature *__ptr = __x86_get_cpuid_feature_leaf
@@ -43,7 +43,7 @@ x86_cpu_present (unsigned int __index)
return __ptr->cpuid_array[__reg] & (1 << __bit);
}
-static __inline__ _Bool
+static __inline__ bool
x86_cpu_active (unsigned int __index)
{
if (__index == x86_cpu_IBT || __index == x86_cpu_SHSTK)
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index ce949db..9e19cf7 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -354,8 +354,8 @@ CFLAGS-tst-cet-legacy-mod-6c.c += -fcf-protection
CFLAGS-tst-cet-legacy-7.c += -fcf-protection=none
CFLAGS-tst-cet-legacy-10.c += -mshstk
CFLAGS-tst-cet-legacy-10-static.c += -mshstk
-CFLAGS-tst-cet-legacy-10a.c += -fcf-protection=none
-CFLAGS-tst-cet-legacy-10a-static.c += -fcf-protection=none
+CFLAGS-tst-cet-legacy-10a.c += -fcf-protection=none -mshstk
+CFLAGS-tst-cet-legacy-10a-static.c += -fcf-protection=none -mshstk
tst-cet-legacy-4-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=SHSTK
tst-cet-legacy-6-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=SHSTK
diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
index c7ca36e..d030d09 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
@@ -16,7 +16,7 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "test-double-vlen4.h"
+#include <test-double-vlen4.h>
#include "test-math-vector-sincos.h"
#include <immintrin.h>
diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
index 4f89ccb..aa5c882 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
@@ -16,7 +16,7 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "test-double-vlen4.h"
+#include <test-double-vlen4.h>
#include "test-math-vector-sincos.h"
#include <immintrin.h>
diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
index 6c18286..9478349 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
@@ -16,7 +16,7 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "test-double-vlen8.h"
+#include <test-double-vlen8.h>
#include "test-math-vector-sincos.h"
#include <immintrin.h>
diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
index 241857b..da17bcc 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
@@ -16,7 +16,7 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "test-float-vlen16.h"
+#include <test-float-vlen16.h>
#include "test-math-vector-sincos.h"
#include <immintrin.h>
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
index 043830b..a6ef2b4 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
@@ -16,7 +16,7 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "test-float-vlen8.h"
+#include <test-float-vlen8.h>
#include "test-math-vector-sincos.h"
#include <immintrin.h>
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
index e71faeb..7fd1ef0 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
@@ -16,7 +16,7 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "test-float-vlen8.h"
+#include <test-float-vlen8.h>
#include "test-math-vector-sincos.h"
#include <immintrin.h>