diff options
Diffstat (limited to 'sysdeps')
28 files changed, 655 insertions, 248 deletions
diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c index b1e7a9b..1a8f021 100644 --- a/sysdeps/aarch64/fpu/atan2_advsimd.c +++ b/sysdeps/aarch64/fpu/atan2_advsimd.c @@ -23,40 +23,57 @@ static const struct data { + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; float64x2_t pi_over_2; - float64x2_t poly[20]; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; + uint64x2_t zeroinfnan, minustwo; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on - the interval [2**-1022, 1.0]. */ - .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), - V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), - V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), - V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), - V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), - V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), - V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), - V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), - V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), - V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, + [2**-1022, 1.0]. */ + .c0 = V2 (-0x1.5555555555555p-2), + .c1 = 0x1.99999999996c1p-3, + .c2 = V2 (-0x1.2492492478f88p-3), + .c3 = 0x1.c71c71bc3951cp-4, + .c4 = V2 (-0x1.745d160a7e368p-4), + .c5 = 0x1.3b139b6a88ba1p-4, + .c6 = V2 (-0x1.11100ee084227p-4), + .c7 = 0x1.e1d0f9696f63bp-5, + .c8 = V2 (-0x1.aebfe7b418581p-5), + .c9 = 0x1.842dbe9b0d916p-5, + .c10 = V2 (-0x1.5d30140ae5e99p-5), + .c11 = 0x1.338e31eb2fbbcp-5, + .c12 = V2 (-0x1.00e6eece7de8p-5), + .c13 = 0x1.860897b29e5efp-6, + .c14 = V2 (-0x1.0051381722a59p-6), + .c15 = 0x1.14e9dc19a4a4ep-7, + .c16 = V2 (-0x1.d0062b42fe3bfp-9), + .c17 = 0x1.17739e210171ap-10, + .c18 = V2 (-0x1.ab24da7be7402p-13), + .c19 = 0x1.358851160a528p-16, .pi_over_2 = V2 (0x1.921fb54442d18p+0), + .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1), + .minustwo = V2 (0xc000000000000000), }; #define SignMask v_u64 (0x8000000000000000) /* Special cases i.e. 0, infinity, NaN (fall back to scalar calls). */ static float64x2_t VPCS_ATTR NOINLINE -special_case (float64x2_t y, float64x2_t x, float64x2_t ret, uint64x2_t cmp) +special_case (float64x2_t y, float64x2_t x, float64x2_t ret, + uint64x2_t sign_xy, uint64x2_t cmp) { + /* Account for the sign of x and y. */ + ret = vreinterpretq_f64_u64 ( + veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); return v_call2_f64 (atan2, y, x, ret, cmp); } /* Returns 1 if input is the bit representation of 0, infinity or nan. */ static inline uint64x2_t -zeroinfnan (uint64x2_t i) +zeroinfnan (uint64x2_t i, const struct data *d) { /* (2 * i - 1) >= (2 * asuint64 (INFINITY) - 1). */ - return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), - v_u64 (2 * asuint64 (INFINITY) - 1)); + return vcgeq_u64 (vsubq_u64 (vaddq_u64 (i, i), v_u64 (1)), d->zeroinfnan); } /* Fast implementation of vector atan2. @@ -66,12 +83,13 @@ zeroinfnan (uint64x2_t i) want 0x1.92d628ab678cfp-1. */ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) { - const struct data *data_ptr = ptr_barrier (&data); + const struct data *d = ptr_barrier (&data); uint64x2_t ix = vreinterpretq_u64_f64 (x); uint64x2_t iy = vreinterpretq_u64_f64 (y); - uint64x2_t special_cases = vorrq_u64 (zeroinfnan (ix), zeroinfnan (iy)); + uint64x2_t special_cases + = vorrq_u64 (zeroinfnan (ix, d), zeroinfnan (iy, d)); uint64x2_t sign_x = vandq_u64 (ix, SignMask); uint64x2_t sign_y = vandq_u64 (iy, SignMask); @@ -81,18 +99,18 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) float64x2_t ay = vabsq_f64 (y); uint64x2_t pred_xlt0 = vcltzq_f64 (x); - uint64x2_t pred_aygtax = vcgtq_f64 (ay, ax); + uint64x2_t pred_aygtax = vcagtq_f64 (y, x); /* Set up z for call to atan. */ float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay); - float64x2_t d = vbslq_f64 (pred_aygtax, ay, ax); - float64x2_t z = vdivq_f64 (n, d); + float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax); + float64x2_t z = vdivq_f64 (n, q); /* Work out the correct shift. */ - float64x2_t shift = vreinterpretq_f64_u64 ( - vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0)))); + float64x2_t shift + = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo)); shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift); - shift = vmulq_f64 (shift, data_ptr->pi_over_2); + shift = vmulq_f64 (shift, d->pi_over_2); /* Calculate the polynomial approximation. Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of @@ -103,20 +121,52 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x) float64x2_t x2 = vmulq_f64 (z2, z2); float64x2_t x4 = vmulq_f64 (x2, x2); float64x2_t x8 = vmulq_f64 (x4, x4); - float64x2_t ret - = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, data_ptr->poly), - v_estrin_11_f64 (z2, x2, x4, x8, data_ptr->poly + 8), x8); + + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1719 = vld1q_f64 (&d->c17); + + /* estrin_7. */ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + + float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + + /* estrin_11. */ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); + float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); + float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + + float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); + float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + + float64x2_t ret = vfmaq_f64 (p07, p819, x8); /* Finalize. y = shift + z + z^3 * P(z^2). */ ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z)); ret = vaddq_f64 (ret, shift); + if (__glibc_unlikely (v_any_u64 (special_cases))) + return special_case (y, x, ret, sign_xy, special_cases); + /* Account for the sign of x and y. */ ret = vreinterpretq_f64_u64 ( veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy)); - if (__glibc_unlikely (v_any_u64 (special_cases))) - return special_case (y, x, ret, special_cases); - return ret; } diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c index 56e610c..88daacd 100644 --- a/sysdeps/aarch64/fpu/atan2f_advsimd.c +++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c @@ -22,34 +22,39 @@ static const struct data { - float32x4_t poly[8]; - float32x4_t pi_over_2; + float32x4_t c0, pi_over_2, c4, c6, c2; + float c1, c3, c5, c7; + uint32x4_t comp_const; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-128, 1.0]. Generated using fpminimax between FLT_MIN and 1. */ - .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f), - V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f), - V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) }, - .pi_over_2 = V4 (0x1.921fb6p+0f), + .c0 = V4 (-0x1.55555p-2f), .c1 = 0x1.99935ep-3f, + .c2 = V4 (-0x1.24051ep-3f), .c3 = 0x1.bd7368p-4f, + .c4 = V4 (-0x1.491f0ep-4f), .c5 = 0x1.93a2c0p-5f, + .c6 = V4 (-0x1.4c3c60p-6f), .c7 = 0x1.01fd88p-8f, + .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1), }; #define SignMask v_u32 (0x80000000) /* Special cases i.e. 0, infinity and nan (fall back to scalar calls). */ static float32x4_t VPCS_ATTR NOINLINE -special_case (float32x4_t y, float32x4_t x, float32x4_t ret, uint32x4_t cmp) +special_case (float32x4_t y, float32x4_t x, float32x4_t ret, + uint32x4_t sign_xy, uint32x4_t cmp) { + /* Account for the sign of y. */ + ret = vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); return v_call2_f32 (atan2f, y, x, ret, cmp); } /* Returns 1 if input is the bit representation of 0, infinity or nan. */ static inline uint32x4_t -zeroinfnan (uint32x4_t i) +zeroinfnan (uint32x4_t i, const struct data *d) { /* 2 * i - 1 >= 2 * 0x7f800000lu - 1. */ - return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), - v_u32 (2 * 0x7f800000lu - 1)); + return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const); } /* Fast implementation of vector atan2f. Maximum observed error is @@ -58,12 +63,13 @@ zeroinfnan (uint32x4_t i) want 0x1.967f00p-1. */ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) { - const struct data *data_ptr = ptr_barrier (&data); + const struct data *d = ptr_barrier (&data); uint32x4_t ix = vreinterpretq_u32_f32 (x); uint32x4_t iy = vreinterpretq_u32_f32 (y); - uint32x4_t special_cases = vorrq_u32 (zeroinfnan (ix), zeroinfnan (iy)); + uint32x4_t special_cases + = vorrq_u32 (zeroinfnan (ix, d), zeroinfnan (iy, d)); uint32x4_t sign_x = vandq_u32 (ix, SignMask); uint32x4_t sign_y = vandq_u32 (iy, SignMask); @@ -77,14 +83,14 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) /* Set up z for call to atanf. */ float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); - float32x4_t d = vbslq_f32 (pred_aygtax, ay, ax); - float32x4_t z = vdivq_f32 (n, d); + float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); + float32x4_t z = vdivq_f32 (n, q); /* Work out the correct shift. */ float32x4_t shift = vreinterpretq_f32_u32 ( vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f)))); shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift); - shift = vmulq_f32 (shift, data_ptr->pi_over_2); + shift = vmulq_f32 (shift, d->pi_over_2); /* Calculate the polynomial approximation. Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However, @@ -96,23 +102,27 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) float32x4_t z2 = vmulq_f32 (z, z); float32x4_t z4 = vmulq_f32 (z2, z2); - float32x4_t ret = vfmaq_f32 ( - v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly), z4, - vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, data_ptr->poly + 4))); + float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0); + float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1); + float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2); + float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, c1357, 3); + float32x4_t p03 = vfmaq_f32 (p01, z4, p23); + float32x4_t p47 = vfmaq_f32 (p45, z4, p67); + + float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47)); /* y = shift + z * P(z^2). */ ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift); - /* Account for the sign of y. */ - ret = vreinterpretq_f32_u32 ( - veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); - if (__glibc_unlikely (v_any_u32 (special_cases))) { - return special_case (y, x, ret, special_cases); + return special_case (y, x, ret, sign_xy, special_cases); } - return ret; + /* Account for the sign of y. */ + return vreinterpretq_f32_u32 ( + veorq_u32 (vreinterpretq_u32_f32 (ret), sign_xy)); } libmvec_hidden_def (V_NAME_F2 (atan2)) HALF_WIDTH_ALIAS_F2(atan2) diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c index a962be0..14f1809 100644 --- a/sysdeps/aarch64/fpu/atan_advsimd.c +++ b/sysdeps/aarch64/fpu/atan_advsimd.c @@ -22,21 +22,22 @@ static const struct data { + float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18; float64x2_t pi_over_2; - float64x2_t poly[20]; + double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19; } data = { /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on [2**-1022, 1.0]. */ - .poly = { V2 (-0x1.5555555555555p-2), V2 (0x1.99999999996c1p-3), - V2 (-0x1.2492492478f88p-3), V2 (0x1.c71c71bc3951cp-4), - V2 (-0x1.745d160a7e368p-4), V2 (0x1.3b139b6a88ba1p-4), - V2 (-0x1.11100ee084227p-4), V2 (0x1.e1d0f9696f63bp-5), - V2 (-0x1.aebfe7b418581p-5), V2 (0x1.842dbe9b0d916p-5), - V2 (-0x1.5d30140ae5e99p-5), V2 (0x1.338e31eb2fbbcp-5), - V2 (-0x1.00e6eece7de8p-5), V2 (0x1.860897b29e5efp-6), - V2 (-0x1.0051381722a59p-6), V2 (0x1.14e9dc19a4a4ep-7), - V2 (-0x1.d0062b42fe3bfp-9), V2 (0x1.17739e210171ap-10), - V2 (-0x1.ab24da7be7402p-13), V2 (0x1.358851160a528p-16), }, + .c0 = V2 (-0x1.5555555555555p-2), .c1 = 0x1.99999999996c1p-3, + .c2 = V2 (-0x1.2492492478f88p-3), .c3 = 0x1.c71c71bc3951cp-4, + .c4 = V2 (-0x1.745d160a7e368p-4), .c5 = 0x1.3b139b6a88ba1p-4, + .c6 = V2 (-0x1.11100ee084227p-4), .c7 = 0x1.e1d0f9696f63bp-5, + .c8 = V2 (-0x1.aebfe7b418581p-5), .c9 = 0x1.842dbe9b0d916p-5, + .c10 = V2 (-0x1.5d30140ae5e99p-5), .c11 = 0x1.338e31eb2fbbcp-5, + .c12 = V2 (-0x1.00e6eece7de8p-5), .c13 = 0x1.860897b29e5efp-6, + .c14 = V2 (-0x1.0051381722a59p-6), .c15 = 0x1.14e9dc19a4a4ep-7, + .c16 = V2 (-0x1.d0062b42fe3bfp-9), .c17 = 0x1.17739e210171ap-10, + .c18 = V2 (-0x1.ab24da7be7402p-13), .c19 = 0x1.358851160a528p-16, .pi_over_2 = V2 (0x1.921fb54442d18p+0), }; @@ -52,6 +53,11 @@ static const struct data float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) { const struct data *d = ptr_barrier (&data); + float64x2_t c13 = vld1q_f64 (&d->c1); + float64x2_t c57 = vld1q_f64 (&d->c5); + float64x2_t c911 = vld1q_f64 (&d->c9); + float64x2_t c1315 = vld1q_f64 (&d->c13); + float64x2_t c1719 = vld1q_f64 (&d->c17); /* Small cases, infs and nans are supported by our approximation technique, but do not set fenv flags correctly. Only trigger special case if we need @@ -90,9 +96,35 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x) float64x2_t x2 = vmulq_f64 (z2, z2); float64x2_t x4 = vmulq_f64 (x2, x2); float64x2_t x8 = vmulq_f64 (x4, x4); - float64x2_t y - = vfmaq_f64 (v_estrin_7_f64 (z2, x2, x4, d->poly), - v_estrin_11_f64 (z2, x2, x4, x8, d->poly + 8), x8); + + /* estrin_7. */ + float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0); + float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1); + float64x2_t p03 = vfmaq_f64 (p01, x2, p23); + + float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0); + float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1); + float64x2_t p47 = vfmaq_f64 (p45, x2, p67); + + float64x2_t p07 = vfmaq_f64 (p03, x4, p47); + + /* estrin_11. */ + float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0); + float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1); + float64x2_t p811 = vfmaq_f64 (p89, x2, p1011); + + float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0); + float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1); + float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415); + + float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0); + float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1); + float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819); + + float64x2_t p815 = vfmaq_f64 (p811, x4, p1215); + float64x2_t p819 = vfmaq_f64 (p815, x8, p1619); + + float64x2_t y = vfmaq_f64 (p07, p819, x8); /* Finalize. y = shift + z + z^3 * P(z^2). */ y = vfmaq_f64 (az, y, vmulq_f64 (z2, az)); diff --git a/sysdeps/aarch64/fpu/coshf_advsimd.c b/sysdeps/aarch64/fpu/coshf_advsimd.c index c1ab492..cd5c866 100644 --- a/sysdeps/aarch64/fpu/coshf_advsimd.c +++ b/sysdeps/aarch64/fpu/coshf_advsimd.c @@ -23,19 +23,27 @@ static const struct data { struct v_expf_data expf_consts; - uint32x4_t tiny_bound, special_bound; + uint32x4_t tiny_bound; + float32x4_t bound; +#if WANT_SIMD_EXCEPT + uint32x4_t special_bound; +#endif } data = { .expf_consts = V_EXPF_DATA, .tiny_bound = V4 (0x20000000), /* 0x1p-63: Round to 1 below this. */ /* 0x1.5a92d8p+6: expf overflows above this, so have to use special case. */ + .bound = V4 (0x1.5a92d8p+6), +#if WANT_SIMD_EXCEPT .special_bound = V4 (0x42ad496c), +#endif }; #if !WANT_SIMD_EXCEPT static float32x4_t NOINLINE VPCS_ATTR -special_case (float32x4_t x, float32x4_t y, uint32x4_t special) +special_case (float32x4_t x, float32x4_t half_t, float32x4_t half_over_t, + uint32x4_t special) { - return v_call_f32 (coshf, x, y, special); + return v_call_f32 (coshf, x, vaddq_f32 (half_t, half_over_t), special); } #endif @@ -47,14 +55,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t ax = vabsq_f32 (x); - uint32x4_t iax = vreinterpretq_u32_f32 (ax); - uint32x4_t special = vcgeq_u32 (iax, d->special_bound); - #if WANT_SIMD_EXCEPT /* If fp exceptions are to be triggered correctly, fall back to the scalar variant for all inputs if any input is a special value or above the bound at which expf overflows. */ + float32x4_t ax = vabsq_f32 (x); + uint32x4_t iax = vreinterpretq_u32_f32 (ax); + uint32x4_t special = vcgeq_u32 (iax, d->special_bound); if (__glibc_unlikely (v_any_u32 (special))) return v_call_f32 (coshf, x, x, v_u32 (-1)); @@ -63,10 +70,13 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) input to 0, which will generate no exceptions. */ if (__glibc_unlikely (v_any_u32 (tiny))) ax = v_zerofy_f32 (ax, tiny); + float32x4_t t = v_expf_inline (ax, &d->expf_consts); +#else + uint32x4_t special = vcageq_f32 (x, d->bound); + float32x4_t t = v_expf_inline (x, &d->expf_consts); #endif /* Calculate cosh by exp(x) / 2 + exp(-x) / 2. */ - float32x4_t t = v_expf_inline (ax, &d->expf_consts); float32x4_t half_t = vmulq_n_f32 (t, 0.5); float32x4_t half_over_t = vdivq_f32 (v_f32 (0.5), t); @@ -75,7 +85,7 @@ float32x4_t VPCS_ATTR V_NAME_F1 (cosh) (float32x4_t x) return vbslq_f32 (tiny, v_f32 (1), vaddq_f32 (half_t, half_over_t)); #else if (__glibc_unlikely (v_any_u32 (special))) - return special_case (x, vaddq_f32 (half_t, half_over_t), special); + return special_case (x, half_t, half_over_t, special); #endif return vaddq_f32 (half_t, half_over_t); diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c index cf53e73..55d9cd8 100644 --- a/sysdeps/aarch64/fpu/exp10f_advsimd.c +++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c @@ -18,16 +18,15 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f32.h" #define ScaleBound 192.0f static const struct data { - float32x4_t poly[5]; - float log10_2_and_inv[4]; - float32x4_t shift; - + float32x4_t c0, c1, c3; + float log10_2_high, log10_2_low, c2, c4; + float32x4_t inv_log10_2, special_bound; + uint32x4_t exponent_bias, special_offset, special_bias; #if !WANT_SIMD_EXCEPT float32x4_t scale_thresh; #endif @@ -37,19 +36,24 @@ static const struct data rel error: 0x1.89dafa3p-24 abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2] maxerr: 1.85943 +0.5 ulp. */ - .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f), - V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) }, - .shift = V4 (0x1.8p23f), - - /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */ - .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 }, + .c0 = V4 (0x1.26bb16p+1f), + .c1 = V4 (0x1.5350d2p+1f), + .c2 = 0x1.04744ap+1f, + .c3 = V4 (0x1.2d8176p+0f), + .c4 = 0x1.12b41ap-1f, + .inv_log10_2 = V4 (0x1.a934fp+1), + .log10_2_high = 0x1.344136p-2, + .log10_2_low = 0x1.ec10cp-27, + /* rint (log2 (2^127 / (1 + sqrt (2)))). */ + .special_bound = V4 (126.0f), + .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), #if !WANT_SIMD_EXCEPT .scale_thresh = V4 (ScaleBound) #endif }; -#define ExponentBias v_u32 (0x3f800000) - #if WANT_SIMD_EXCEPT # define SpecialBound 38.0f /* rint(log10(2^127)). */ @@ -67,17 +71,15 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) #else -# define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */ -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) +# define SpecialBound 126.0f static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) { /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); float32x4_t r2 = vmulq_f32 (s1, s1); @@ -112,23 +114,23 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x) /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)), with poly(r) in [1/sqrt(2), sqrt(2)] and x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */ - float32x4_t log10_2_and_inv = vld1q_f32 (d->log10_2_and_inv); - float32x4_t z = vfmaq_laneq_f32 (d->shift, x, log10_2_and_inv, 0); - float32x4_t n = vsubq_f32 (z, d->shift); - float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_and_inv, 1); - r = vfmsq_laneq_f32 (r, n, log10_2_and_inv, 2); - uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); + float32x4_t log10_2_c24 = vld1q_f32 (&d->log10_2_high); + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2)); + float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2_c24, 0); + r = vfmaq_laneq_f32 (r, n, log10_2_c24, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23); - float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); #if !WANT_SIMD_EXCEPT - uint32x4_t cmp = vcagtq_f32 (n, v_f32 (SpecialBound)); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); #endif float32x4_t r2 = vmulq_f32 (r, r); - float32x4_t poly - = vfmaq_f32 (vmulq_f32 (r, d->poly[0]), - v_pairwise_poly_3_f32 (r, r2, d->poly + 1), r2); + float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10_2_c24, 2); + float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10_2_c24, 3); + float32x4_t p14 = vfmaq_f32 (p12, r2, p34); + float32x4_t poly = vfmaq_f32 (vmulq_f32 (r, d->c0), p14, r2); if (__glibc_unlikely (v_any_u32 (cmp))) #if WANT_SIMD_EXCEPT diff --git a/sysdeps/aarch64/fpu/exp2f_advsimd.c b/sysdeps/aarch64/fpu/exp2f_advsimd.c index 69e0b19..a4220da 100644 --- a/sysdeps/aarch64/fpu/exp2f_advsimd.c +++ b/sysdeps/aarch64/fpu/exp2f_advsimd.c @@ -21,24 +21,28 @@ static const struct data { - float32x4_t poly[5]; - uint32x4_t exponent_bias; + float32x4_t c1, c3; + uint32x4_t exponent_bias, special_offset, special_bias; #if !WANT_SIMD_EXCEPT - float32x4_t special_bound, scale_thresh; + float32x4_t scale_thresh, special_bound; #endif + float c0, c2, c4, zero; } data = { /* maxerr: 1.962 ulp. */ - .poly = { V4 (0x1.59977ap-10f), V4 (0x1.3ce9e4p-7f), V4 (0x1.c6bd32p-5f), - V4 (0x1.ebf9bcp-3f), V4 (0x1.62e422p-1f) }, + .c0 = 0x1.59977ap-10f, + .c1 = V4 (0x1.3ce9e4p-7f), + .c2 = 0x1.c6bd32p-5f, + .c3 = V4 (0x1.ebf9bcp-3f), + .c4 = 0x1.62e422p-1f, .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), #if !WANT_SIMD_EXCEPT .special_bound = V4 (126.0f), .scale_thresh = V4 (192.0f), #endif }; -#define C(i) d->poly[i] - #if WANT_SIMD_EXCEPT # define TinyBound v_u32 (0x20000000) /* asuint (0x1p-63). */ @@ -55,16 +59,13 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) #else -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) - static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) { /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); float32x4_t r2 = vmulq_f32 (s1, s1); @@ -80,13 +81,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, scale, p, q, poly; - uint32x4_t cmp, e; #if WANT_SIMD_EXCEPT /* asuint(|x|) - TinyBound >= BigBound - TinyBound. */ uint32x4_t ia = vreinterpretq_u32_f32 (vabsq_f32 (x)); - cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); + uint32x4_t cmp = vcgeq_u32 (vsubq_u32 (ia, TinyBound), SpecialBound); float32x4_t xm = x; /* If any lanes are special, mask them with 1 and retain a copy of x to allow special_case to fix special lanes later. This is only necessary if fenv @@ -95,23 +94,24 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2) (float32x4_t x) x = vbslq_f32 (cmp, v_f32 (1), x); #endif - /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = n + r, with r in [-1/2, 1/2]. */ - n = vrndaq_f32 (x); - r = vsubq_f32 (x, n); - e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + /* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + x = n + r, with r in [-1/2, 1/2]. */ + float32x4_t n = vrndaq_f32 (x); + float32x4_t r = vsubq_f32 (x, n); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); #if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32 (n, d->special_bound); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); #endif - r2 = vmulq_f32 (r, r); - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); + float32x4_t c024 = vld1q_f32 (&d->c0); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, c024, 0); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, c024, 1); q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); + p = vmulq_laneq_f32 (r, c024, 2); + float32x4_t poly = vfmaq_f32 (p, q, r2); if (__glibc_unlikely (v_any_u32 (cmp))) #if WANT_SIMD_EXCEPT diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c index 5c9cb72..70f137e 100644 --- a/sysdeps/aarch64/fpu/expf_advsimd.c +++ b/sysdeps/aarch64/fpu/expf_advsimd.c @@ -21,20 +21,25 @@ static const struct data { - float32x4_t poly[5]; - float32x4_t inv_ln2, ln2_hi, ln2_lo; - uint32x4_t exponent_bias; + float32x4_t c1, c3, c4, inv_ln2; + float ln2_hi, ln2_lo, c0, c2; + uint32x4_t exponent_bias, special_offset, special_bias; #if !WANT_SIMD_EXCEPT float32x4_t special_bound, scale_thresh; #endif } data = { /* maxerr: 1.45358 +0.5 ulp. */ - .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), - V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, + .c0 = 0x1.0e4020p-7f, + .c1 = V4 (0x1.573e2ep-5f), + .c2 = 0x1.555e66p-3f, + .c3 = V4 (0x1.fffdb6p-2f), + .c4 = V4 (0x1.ffffecp-1f), .inv_ln2 = V4 (0x1.715476p+0f), - .ln2_hi = V4 (0x1.62e4p-1f), - .ln2_lo = V4 (0x1.7f7d1cp-20f), + .ln2_hi = 0x1.62e4p-1f, + .ln2_lo = 0x1.7f7d1cp-20f, .exponent_bias = V4 (0x3f800000), + .special_offset = V4 (0x82000000), + .special_bias = V4 (0x7f000000), #if !WANT_SIMD_EXCEPT .special_bound = V4 (126.0f), .scale_thresh = V4 (192.0f), @@ -59,19 +64,17 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) #else -# define SpecialOffset v_u32 (0x82000000) -# define SpecialBias v_u32 (0x7f000000) - static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t scale, const struct data *d) { /* 2^n may overflow, break it up into s1*s2. */ - uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset); - float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias)); + uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset); + float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh); float32x4_t r2 = vmulq_f32 (s1, s1); + // (s2 + p*s2)*s1 = s2(p+1)s1 float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1); /* Similar to r1 but avoids double rounding in the subnormal range. */ float32x4_t r0 = vfmaq_f32 (scale, poly, scale); @@ -84,12 +87,11 @@ special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t n, r, r2, scale, p, q, poly; - uint32x4_t cmp, e; + float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); #if WANT_SIMD_EXCEPT /* asuint(x) - TinyBound >= BigBound - TinyBound. */ - cmp = vcgeq_u32 ( + uint32x4_t cmp = vcgeq_u32 ( vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)), TinyBound), SpecialBound); @@ -103,22 +105,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp) (float32x4_t x) /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); - r = vfmsq_f32 (x, n, d->ln2_hi); - r = vfmsq_f32 (r, n, d->ln2_lo); - e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); - scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); + float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_ln2)); + float32x4_t r = vfmsq_laneq_f32 (x, n, ln2_c02, 0); + r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); #if !WANT_SIMD_EXCEPT - cmp = vcagtq_f32 (n, d->special_bound); + uint32x4_t cmp = vcagtq_f32 (n, d->special_bound); #endif - r2 = vmulq_f32 (r, r); - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); + float32x4_t r2 = vmulq_f32 (r, r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); + p = vmulq_f32 (d->c4, r); + float32x4_t poly = vfmaq_f32 (p, q, r2); if (__glibc_unlikely (v_any_u32 (cmp))) #if WANT_SIMD_EXCEPT diff --git a/sysdeps/aarch64/fpu/log10f_advsimd.c b/sysdeps/aarch64/fpu/log10f_advsimd.c index 82228b5..0d792c3 100644 --- a/sysdeps/aarch64/fpu/log10f_advsimd.c +++ b/sysdeps/aarch64/fpu/log10f_advsimd.c @@ -18,21 +18,25 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f32.h" static const struct data { + float32x4_t c0, c2, c4, c6, inv_ln10, ln2; uint32x4_t off, offset_lower_bound; uint16x8_t special_bound; uint32x4_t mantissa_mask; - float32x4_t poly[8]; - float32x4_t inv_ln10, ln2; + float c1, c3, c5, c7; } data = { /* Use order 9 for log10(1+x), i.e. order 8 for log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3). Max. relative error: 0x1.068ee468p-25. */ - .poly = { V4 (-0x1.bcb79cp-3f), V4 (0x1.2879c8p-3f), V4 (-0x1.bcd472p-4f), - V4 (0x1.6408f8p-4f), V4 (-0x1.246f8p-4f), V4 (0x1.f0e514p-5f), - V4 (-0x1.0fc92cp-4f), V4 (0x1.f5f76ap-5f) }, + .c0 = V4 (-0x1.bcb79cp-3f), + .c1 = 0x1.2879c8p-3f, + .c2 = V4 (-0x1.bcd472p-4f), + .c3 = 0x1.6408f8p-4f, + .c4 = V4 (-0x1.246f8p-4f), + .c5 = 0x1.f0e514p-5f, + .c6 = V4 (-0x1.0fc92cp-4f), + .c7 = 0x1.f5f76ap-5f, .ln2 = V4 (0x1.62e43p-1f), .inv_ln10 = V4 (0x1.bcb7b2p-2f), /* Lower bound is the smallest positive normal float 0x00800000. For @@ -62,7 +66,7 @@ special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2, float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - + float32x4_t c1357 = vld1q_f32 (&d->c1); /* To avoid having to mov x out of the way, keep u after offset has been applied, and recover x by adding the offset back in the special-case handler. */ @@ -81,7 +85,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) /* y = log10(1+r) + n * log10(2). */ float32x4_t r2 = vmulq_f32 (r, r); - float32x4_t poly = v_pw_horner_7_f32 (r, r2, d->poly); + + float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); + float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); + float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); + float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); + + float32x4_t p47 = vfmaq_f32 (c45, r2, c67); + float32x4_t p27 = vfmaq_f32 (c23, r2, p47); + float32x4_t poly = vfmaq_f32 (c01, r2, p27); + /* y = Log10(2) * n + poly * InvLn(10). */ float32x4_t y = vfmaq_f32 (r, d->ln2, n); y = vmulq_f32 (y, d->inv_ln10); diff --git a/sysdeps/aarch64/fpu/log2f_advsimd.c b/sysdeps/aarch64/fpu/log2f_advsimd.c index 84effe4..116c36c 100644 --- a/sysdeps/aarch64/fpu/log2f_advsimd.c +++ b/sysdeps/aarch64/fpu/log2f_advsimd.c @@ -18,22 +18,27 @@ <https://www.gnu.org/licenses/>. */ #include "v_math.h" -#include "poly_advsimd_f32.h" static const struct data { + float32x4_t c0, c2, c4, c6, c8; uint32x4_t off, offset_lower_bound; uint16x8_t special_bound; uint32x4_t mantissa_mask; - float32x4_t poly[9]; + float c1, c3, c5, c7; } data = { /* Coefficients generated using Remez algorithm approximate log2(1+r)/r for r in [ -1/3, 1/3 ]. rel error: 0x1.c4c4b0cp-26. */ - .poly = { V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */ - V4 (-0x1.715458p-1f), V4 (0x1.ec701cp-2f), V4 (-0x1.7171a4p-2f), - V4 (0x1.27a0b8p-2f), V4 (-0x1.e5143ep-3f), V4 (0x1.9d8ecap-3f), - V4 (-0x1.c675bp-3f), V4 (0x1.9e495p-3f) }, + .c0 = V4 (0x1.715476p0f), /* (float)(1 / ln(2)). */ + .c1 = -0x1.715458p-1f, + .c2 = V4 (0x1.ec701cp-2f), + .c3 = -0x1.7171a4p-2f, + .c4 = V4 (0x1.27a0b8p-2f), + .c5 = -0x1.e5143ep-3f, + .c6 = V4 (0x1.9d8ecap-3f), + .c7 = -0x1.c675bp-3f, + .c8 = V4 (0x1.9e495p-3f), /* Lower bound is the smallest positive normal float 0x00800000. For optimised register use subnormals are detected after offset has been subtracted, so lower bound is 0x0080000 - offset (which wraps around). */ @@ -79,11 +84,21 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) /* y = log2(1+r) + n. */ float32x4_t r2 = vmulq_f32 (r, r); - float32x4_t p = v_pw_horner_8_f32 (r, r2, d->poly); + + float32x4_t c1357 = vld1q_f32 (&d->c1); + float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); + float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); + float32x4_t c45 = vfmaq_laneq_f32 (d->c4, r, c1357, 2); + float32x4_t c67 = vfmaq_laneq_f32 (d->c6, r, c1357, 3); + float32x4_t p68 = vfmaq_f32 (c67, r2, d->c8); + float32x4_t p48 = vfmaq_f32 (c45, r2, p68); + float32x4_t p28 = vfmaq_f32 (c23, r2, p48); + float32x4_t p = vfmaq_f32 (c01, r2, p28); if (__glibc_unlikely (v_any_u16h (special))) return special_case (n, u_off, p, r, special, d); return vfmaq_f32 (n, p, r); } + libmvec_hidden_def (V_NAME_F1 (log2)) HALF_WIDTH_ALIAS_F1 (log2) diff --git a/sysdeps/aarch64/fpu/logf_advsimd.c b/sysdeps/aarch64/fpu/logf_advsimd.c index c20dbfd..d9e64c7 100644 --- a/sysdeps/aarch64/fpu/logf_advsimd.c +++ b/sysdeps/aarch64/fpu/logf_advsimd.c @@ -21,16 +21,19 @@ static const struct data { - uint32x4_t off, offset_lower_bound; + float32x4_t c2, c4, c6, ln2; + uint32x4_t off, offset_lower_bound, mantissa_mask; uint16x8_t special_bound; - uint32x4_t mantissa_mask; - float32x4_t poly[7]; - float32x4_t ln2; + float c1, c3, c5, c0; } data = { /* 3.34 ulp error. */ - .poly = { V4 (-0x1.3e737cp-3f), V4 (0x1.5a9aa2p-3f), V4 (-0x1.4f9934p-3f), - V4 (0x1.961348p-3f), V4 (-0x1.00187cp-2f), V4 (0x1.555d7cp-2f), - V4 (-0x1.ffffc8p-2f) }, + .c0 = -0x1.3e737cp-3f, + .c1 = 0x1.5a9aa2p-3f, + .c2 = V4 (-0x1.4f9934p-3f), + .c3 = 0x1.961348p-3f, + .c4 = V4 (-0x1.00187cp-2f), + .c5 = 0x1.555d7cp-2f, + .c6 = V4 (-0x1.ffffc8p-2f), .ln2 = V4 (0x1.62e43p-1f), /* Lower bound is the smallest positive normal float 0x00800000. For optimised register use subnormals are detected after offset has been @@ -41,8 +44,6 @@ static const struct data .mantissa_mask = V4 (0x007fffff) }; -#define P(i) d->poly[7 - i] - static float32x4_t VPCS_ATTR NOINLINE special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, uint16x4_t cmp, const struct data *d) @@ -55,33 +56,30 @@ special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) { const struct data *d = ptr_barrier (&data); - float32x4_t n, p, q, r, r2, y; - uint32x4_t u, u_off; - uint16x4_t cmp; + float32x4_t c1350 = vld1q_f32 (&d->c1); /* To avoid having to mov x out of the way, keep u after offset has been applied, and recover x by adding the offset back in the special-case handler. */ - u_off = vreinterpretq_u32_f32 (x); + uint32x4_t u_off = vsubq_u32 (vreinterpretq_u32_f32 (x), d->off); /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3. */ - u_off = vsubq_u32 (u_off, d->off); - n = vcvtq_f32_s32 ( + float32x4_t n = vcvtq_f32_s32 ( vshrq_n_s32 (vreinterpretq_s32_u32 (u_off), 23)); /* signextend. */ - u = vandq_u32 (u_off, d->mantissa_mask); - u = vaddq_u32 (u, d->off); - r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); + uint16x4_t cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), + vget_low_u16 (d->special_bound)); - cmp = vcge_u16 (vsubhn_u32 (u_off, d->offset_lower_bound), - vget_low_u16 (d->special_bound)); + uint32x4_t u = vaddq_u32 (vandq_u32 (u_off, d->mantissa_mask), d->off); + float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); /* y = log(1+r) + n*ln2. */ - r2 = vmulq_f32 (r, r); + float32x4_t r2 = vmulq_f32 (r, r); /* n*ln2 + r + r2*(P1 + r*P2 + r2*(P3 + r*P4 + r2*(P5 + r*P6 + r2*P7))). */ - p = vfmaq_f32 (P (5), P (6), r); - q = vfmaq_f32 (P (3), P (4), r); - y = vfmaq_f32 (P (1), P (2), r); - p = vfmaq_f32 (p, P (7), r2); + float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0); + float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1); + float32x4_t y = vfmaq_laneq_f32 (d->c6, r, c1350, 2); + p = vfmaq_laneq_f32 (p, r2, c1350, 3); + q = vfmaq_f32 (q, p, r2); y = vfmaq_f32 (y, q, r2); p = vfmaq_f32 (r, d->ln2, n); diff --git a/sysdeps/aarch64/fpu/v_expf_inline.h b/sysdeps/aarch64/fpu/v_expf_inline.h index 08b06e0..eacd2af 100644 --- a/sysdeps/aarch64/fpu/v_expf_inline.h +++ b/sysdeps/aarch64/fpu/v_expf_inline.h @@ -24,50 +24,45 @@ struct v_expf_data { - float32x4_t poly[5]; - float32x4_t shift; - float invln2_and_ln2[4]; + float ln2_hi, ln2_lo, c0, c2; + float32x4_t inv_ln2, c1, c3, c4; + /* asuint(1.0f). */ + uint32x4_t exponent_bias; }; /* maxerr: 1.45358 +0.5 ulp. */ #define V_EXPF_DATA \ { \ - .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f), \ - V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) }, \ - .shift = V4 (0x1.8p23f), \ - .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \ + .c0 = 0x1.0e4020p-7f, .c1 = V4 (0x1.573e2ep-5f), .c2 = 0x1.555e66p-3f, \ + .c3 = V4 (0x1.fffdb6p-2f), .c4 = V4 (0x1.ffffecp-1f), \ + .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \ + .inv_ln2 = V4 (0x1.715476p+0f), .exponent_bias = V4 (0x3f800000), \ } -#define ExponentBias v_u32 (0x3f800000) /* asuint(1.0f). */ -#define C(i) d->poly[i] - static inline float32x4_t v_expf_inline (float32x4_t x, const struct v_expf_data *d) { - /* Helper routine for calculating exp(x). + /* Helper routine for calculating exp(ax). Copied from v_expf.c, with all special-case handling removed - the calling routine should handle special values if required. */ - /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] - x = ln2*n + r, with r in [-ln2/2, ln2/2]. */ - float32x4_t n, r, z; - float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2); - z = vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0); - n = vsubq_f32 (z, d->shift); - r = vfmsq_laneq_f32 (x, n, invln2_and_ln2, 1); - r = vfmsq_laneq_f32 (r, n, invln2_and_ln2, 2); - uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23); - float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias)); + /* exp(ax) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)] + ax = ln2*n + r, with r in [-ln2/2, ln2/2]. */ + float32x4_t ax = vabsq_f32 (x); + float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); + float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2)); + float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0); + r = vfmsq_laneq_f32 (r, n, ln2_c02, 1); + uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23); + float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); /* Custom order-4 Estrin avoids building high order monomial. */ float32x4_t r2 = vmulq_f32 (r, r); - float32x4_t p, q, poly; - p = vfmaq_f32 (C (1), C (0), r); - q = vfmaq_f32 (C (3), C (2), r); + float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); + float32x4_t q = vfmaq_laneq_f32 (d->c3, r, ln2_c02, 3); q = vfmaq_f32 (q, p, r2); - p = vmulq_f32 (C (4), r); - poly = vfmaq_f32 (p, q, r2); + p = vmulq_f32 (d->c4, r); + float32x4_t poly = vfmaq_f32 (p, q, r2); return vfmaq_f32 (scale, poly, scale); } - #endif diff --git a/sysdeps/arm/libm-test-ulps b/sysdeps/arm/libm-test-ulps index c80122d..38bae47 100644 --- a/sysdeps/arm/libm-test-ulps +++ b/sysdeps/arm/libm-test-ulps @@ -33,6 +33,22 @@ Function: "acosh_upward": double: 2 float: 2 +Function: "acospi": +double: 1 +float: 1 + +Function: "acospi_downward": +double: 1 +float: 2 + +Function: "acospi_towardzero": +double: 1 +float: 2 + +Function: "acospi_upward": +double: 2 +float: 1 + Function: "asin": double: 1 float: 1 @@ -65,6 +81,22 @@ Function: "asinh_upward": double: 3 float: 3 +Function: "asinpi": +double: 1 +float: 1 + +Function: "asinpi_downward": +double: 1 +float: 1 + +Function: "asinpi_towardzero": +double: 1 +float: 1 + +Function: "asinpi_upward": +double: 2 +float: 1 + Function: "atan": double: 1 float: 1 @@ -84,6 +116,22 @@ Function: "atan2_upward": double: 1 float: 2 +Function: "atan2pi": +double: 1 +float: 1 + +Function: "atan2pi_downward": +double: 1 +float: 3 + +Function: "atan2pi_towardzero": +double: 1 +float: 2 + +Function: "atan2pi_upward": +double: 1 +float: 3 + Function: "atan_downward": double: 1 float: 2 @@ -112,6 +160,22 @@ Function: "atanh_upward": double: 3 float: 3 +Function: "atanpi": +double: 1 +float: 1 + +Function: "atanpi_downward": +double: 1 +float: 2 + +Function: "atanpi_towardzero": +double: 1 +float: 1 + +Function: "atanpi_upward": +double: 1 +float: 1 + Function: "cabs": double: 1 @@ -535,6 +599,22 @@ Function: "cosh_upward": double: 2 float: 2 +Function: "cospi": +double: 1 +float: 1 + +Function: "cospi_downward": +double: 1 +float: 1 + +Function: "cospi_towardzero": +double: 1 +float: 1 + +Function: "cospi_upward": +double: 1 +float: 1 + Function: Real part of "cpow": double: 2 float: 5 @@ -1078,6 +1158,22 @@ Function: "sinh_upward": double: 3 float: 3 +Function: "sinpi": +double: 1 +float: 1 + +Function: "sinpi_downward": +double: 2 +float: 2 + +Function: "sinpi_towardzero": +double: 2 +float: 1 + +Function: "sinpi_upward": +double: 2 +float: 2 + Function: "tan_downward": double: 1 @@ -1103,6 +1199,22 @@ Function: "tanh_upward": double: 3 float: 3 +Function: "tanpi": +double: 2 +float: 2 + +Function: "tanpi_downward": +double: 2 +float: 3 + +Function: "tanpi_towardzero": +double: 2 +float: 3 + +Function: "tanpi_upward": +double: 2 +float: 4 + Function: "tgamma": double: 9 diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps index c06da68..020c928 100644 --- a/sysdeps/i386/fpu/libm-test-ulps +++ b/sysdeps/i386/fpu/libm-test-ulps @@ -39,6 +39,30 @@ double: 1 float128: 3 ldouble: 3 +Function: "acospi": +double: 1 +float: 1 +float128: 1 +ldouble: 1 + +Function: "acospi_downward": +double: 1 +float: 1 +float128: 1 +ldouble: 3 + +Function: "acospi_towardzero": +double: 1 +float: 1 +float128: 1 +ldouble: 3 + +Function: "acospi_upward": +double: 1 +float: 1 +float128: 2 +ldouble: 2 + Function: "asin": double: 1 float128: 1 @@ -83,6 +107,30 @@ float: 1 float128: 4 ldouble: 5 +Function: "asinpi": +double: 1 +float: 1 +float128: 1 +ldouble: 2 + +Function: "asinpi_downward": +double: 1 +float: 1 +float128: 1 +ldouble: 2 + +Function: "asinpi_towardzero": +double: 1 +float: 1 +float128: 1 +ldouble: 2 + +Function: "asinpi_upward": +double: 1 +float: 1 +float128: 2 +ldouble: 2 + Function: "atan": double: 1 float128: 1 @@ -111,6 +159,30 @@ float: 1 float128: 2 ldouble: 1 +Function: "atan2pi": +double: 1 +float: 1 +float128: 2 +ldouble: 1 + +Function: "atan2pi_downward": +double: 1 +float: 1 +float128: 2 +ldouble: 2 + +Function: "atan2pi_towardzero": +double: 1 +float: 1 +float128: 2 +ldouble: 2 + +Function: "atan2pi_upward": +double: 1 +float: 1 +float128: 2 +ldouble: 2 + Function: "atan_downward": double: 1 float: 1 @@ -152,6 +224,30 @@ float: 1 float128: 4 ldouble: 5 +Function: "atanpi": +double: 1 +float: 1 +float128: 1 +ldouble: 1 + +Function: "atanpi_downward": +double: 1 +float: 1 +float128: 1 +ldouble: 1 + +Function: "atanpi_towardzero": +double: 1 +float: 1 +float128: 1 +ldouble: 1 + +Function: "atanpi_upward": +double: 1 +float: 1 +float128: 2 +ldouble: 1 + Function: "cabs": double: 1 float128: 1 @@ -792,6 +888,30 @@ float: 2 float128: 3 ldouble: 3 +Function: "cospi": +double: 1 +float: 1 +float128: 1 +ldouble: 1 + +Function: "cospi_downward": +double: 1 +float: 1 +float128: 2 +ldouble: 1 + +Function: "cospi_towardzero": +double: 1 +float: 1 +float128: 2 +ldouble: 1 + +Function: "cospi_upward": +double: 1 +float: 1 +float128: 1 +ldouble: 1 + Function: Real part of "cpow": double: 2 float: 5 @@ -1613,6 +1733,30 @@ float: 3 float128: 4 ldouble: 5 +Function: "sinpi": +double: 1 +float: 1 +float128: 1 +ldouble: 1 + +Function: "sinpi_downward": +double: 2 +float: 2 +float128: 2 +ldouble: 2 + +Function: "sinpi_towardzero": +double: 2 +float: 1 +float128: 1 +ldouble: 1 + +Function: "sinpi_upward": +double: 2 +float: 2 +float128: 2 +ldouble: 2 + Function: "tan": float128: 1 ldouble: 2 @@ -1656,6 +1800,30 @@ float: 3 float128: 3 ldouble: 4 +Function: "tanpi": +double: 2 +float: 2 +float128: 2 +ldouble: 2 + +Function: "tanpi_downward": +double: 2 +float: 3 +float128: 4 +ldouble: 4 + +Function: "tanpi_towardzero": +double: 2 +float: 3 +float128: 4 +ldouble: 4 + +Function: "tanpi_upward": +double: 2 +float: 4 +float128: 3 +ldouble: 4 + Function: "tgamma": double: 9 float128: 4 diff --git a/sysdeps/ieee754/ldbl-96/test-totalorderl-ldbl-96.c b/sysdeps/ieee754/ldbl-96/test-totalorderl-ldbl-96.c index 5d64da8..aa88ba6 100644 --- a/sysdeps/ieee754/ldbl-96/test-totalorderl-ldbl-96.c +++ b/sysdeps/ieee754/ldbl-96/test-totalorderl-ldbl-96.c @@ -46,9 +46,9 @@ do_test (void) SET_LDOUBLE_WORDS (ldy, 0x7fff, (tests[i] >> 32) | 0x80000000, tests[i] & 0xffffffffULL); - SET_LDOUBLE_WORDS (ldnx, 0xffff, + SET_LDOUBLE_WORDS (ldnx, -1, tests[i] >> 32, tests[i] & 0xffffffffULL); - SET_LDOUBLE_WORDS (ldny, 0xffff, + SET_LDOUBLE_WORDS (ldny, -1, (tests[i] >> 32) | 0x80000000, tests[i] & 0xffffffffULL); bool to1 = totalorderl (&ldx, &ldy); diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list index aa5b479..d31938a 100644 --- a/sysdeps/unix/sysv/linux/syscall-names.list +++ b/sysdeps/unix/sysv/linux/syscall-names.list @@ -21,8 +21,8 @@ # This file can list all potential system calls. The names are only # used if the installed kernel headers also provide them. -# The list of system calls is current as of Linux 6.11. -kernel 6.11 +# The list of system calls is current as of Linux 6.12. +kernel 6.12 FAST_atomic_update FAST_cmpxchg diff --git a/sysdeps/unix/sysv/linux/tst-clone3-internal.c b/sysdeps/unix/sysv/linux/tst-clone3-internal.c index 2f0b200..387f673 100644 --- a/sysdeps/unix/sysv/linux/tst-clone3-internal.c +++ b/sysdeps/unix/sysv/linux/tst-clone3-internal.c @@ -54,7 +54,7 @@ f (void *a) } while (0) static inline int -futex_wait (int *futexp, int val) +futex_wait (_Atomic int *futexp, int val) { #ifdef __NR_futex return syscall (__NR_futex, futexp, FUTEX_WAIT, val); @@ -75,7 +75,7 @@ do_test (void) /* Initialize with a known value. ctid is set to zero by the kernel after the cloned thread has exited. */ #define CTID_INIT_VAL 1 - pid_t ctid = CTID_INIT_VAL; + _Atomic pid_t ctid = CTID_INIT_VAL; pid_t tid; struct clone_args clone_args = diff --git a/sysdeps/unix/sysv/linux/tst-clone3.c b/sysdeps/unix/sysv/linux/tst-clone3.c index 77b8731..a32d649 100644 --- a/sysdeps/unix/sysv/linux/tst-clone3.c +++ b/sysdeps/unix/sysv/linux/tst-clone3.c @@ -54,7 +54,7 @@ f (void *a) } while (0) static inline int -futex_wait (int *futexp, int val) +futex_wait (_Atomic int *futexp, int val) { #ifdef __NR_futex return syscall (__NR_futex, futexp, FUTEX_WAIT, val); @@ -75,7 +75,7 @@ do_test (void) /* Initialize with a known value. ctid is set to zero by the kernel after the cloned thread has exited. */ #define CTID_INIT_VAL 1 - pid_t ctid = CTID_INIT_VAL; + _Atomic pid_t ctid = CTID_INIT_VAL; pid_t tid; #if _STACK_GROWS_DOWN diff --git a/sysdeps/unix/sysv/linux/x86/bits/platform/features.h b/sysdeps/unix/sysv/linux/x86/bits/platform/features.h index 7704feb..1e63743 100644 --- a/sysdeps/unix/sysv/linux/x86/bits/platform/features.h +++ b/sysdeps/unix/sysv/linux/x86/bits/platform/features.h @@ -28,7 +28,7 @@ enum x86_feature_1_shstk = 1U << 1 }; -static __inline__ _Bool +static __inline__ bool x86_cpu_cet_active (unsigned int __index) { #ifdef __x86_64__ diff --git a/sysdeps/x86/bits/platform/features.h b/sysdeps/x86/bits/platform/features.h index f024892..676ad00 100644 --- a/sysdeps/x86/bits/platform/features.h +++ b/sysdeps/x86/bits/platform/features.h @@ -20,7 +20,7 @@ # error "Never include <bits/platform/features.h> directly; use <sys/platform/x86.h> instead." #endif -static __inline__ _Bool +static __inline__ bool x86_cpu_cet_active (unsigned int __index) { return false; diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h index e957950..6a0a30b 100644 --- a/sysdeps/x86/dl-cacheinfo.h +++ b/sysdeps/x86/dl-cacheinfo.h @@ -1021,11 +1021,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) non_temporal_threshold = maximum_non_temporal_threshold; /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */ - unsigned int minimum_rep_movsb_threshold; + unsigned long int minimum_rep_movsb_threshold; /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */ - unsigned int rep_movsb_threshold; + unsigned long int rep_movsb_threshold; if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512)) { diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h index b8066cc..1f53780 100644 --- a/sysdeps/x86/sys/platform/x86.h +++ b/sysdeps/x86/sys/platform/x86.h @@ -30,7 +30,7 @@ __BEGIN_DECLS extern const struct cpuid_feature *__x86_get_cpuid_feature_leaf (unsigned int) __attribute__ ((pure)); -static __inline__ _Bool +static __inline__ bool x86_cpu_present (unsigned int __index) { const struct cpuid_feature *__ptr = __x86_get_cpuid_feature_leaf @@ -43,7 +43,7 @@ x86_cpu_present (unsigned int __index) return __ptr->cpuid_array[__reg] & (1 << __bit); } -static __inline__ _Bool +static __inline__ bool x86_cpu_active (unsigned int __index) { if (__index == x86_cpu_IBT || __index == x86_cpu_SHSTK) diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index ce949db..9e19cf7 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -354,8 +354,8 @@ CFLAGS-tst-cet-legacy-mod-6c.c += -fcf-protection CFLAGS-tst-cet-legacy-7.c += -fcf-protection=none CFLAGS-tst-cet-legacy-10.c += -mshstk CFLAGS-tst-cet-legacy-10-static.c += -mshstk -CFLAGS-tst-cet-legacy-10a.c += -fcf-protection=none -CFLAGS-tst-cet-legacy-10a-static.c += -fcf-protection=none +CFLAGS-tst-cet-legacy-10a.c += -fcf-protection=none -mshstk +CFLAGS-tst-cet-legacy-10a-static.c += -fcf-protection=none -mshstk tst-cet-legacy-4-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=SHSTK tst-cet-legacy-6-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=SHSTK diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c index c7ca36e..d030d09 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c @@ -16,7 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include "test-double-vlen4.h" +#include <test-double-vlen4.h> #include "test-math-vector-sincos.h" #include <immintrin.h> diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c index 4f89ccb..aa5c882 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c @@ -16,7 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include "test-double-vlen4.h" +#include <test-double-vlen4.h> #include "test-math-vector-sincos.h" #include <immintrin.h> diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c index 6c18286..9478349 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c @@ -16,7 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include "test-double-vlen8.h" +#include <test-double-vlen8.h> #include "test-math-vector-sincos.h" #include <immintrin.h> diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c index 241857b..da17bcc 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c +++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c @@ -16,7 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include "test-float-vlen16.h" +#include <test-float-vlen16.h> #include "test-math-vector-sincos.h" #include <immintrin.h> diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c index 043830b..a6ef2b4 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c +++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c @@ -16,7 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include "test-float-vlen8.h" +#include <test-float-vlen8.h> #include "test-math-vector-sincos.h" #include <immintrin.h> diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c index e71faeb..7fd1ef0 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c +++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c @@ -16,7 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include "test-float-vlen8.h" +#include <test-float-vlen8.h> #include "test-math-vector-sincos.h" #include <immintrin.h> |