diff options
author | Luna Lamb <luna.lamb@arm.com> | 2025-01-03 19:02:52 +0000 |
---|---|---|
committer | Wilco Dijkstra <wilco.dijkstra@arm.com> | 2025-01-03 21:39:56 +0000 |
commit | aa6609feb20ebf8653db639dabe2a6afc77b02cc (patch) | |
tree | 27a60d7fe17f9cbdfdbe3770b0119fa74b717d2a | |
parent | 140b985e5a2071000122b3cb63ebfe88cf21dd29 (diff) | |
download | glibc-aa6609feb20ebf8653db639dabe2a6afc77b02cc.zip glibc-aa6609feb20ebf8653db639dabe2a6afc77b02cc.tar.gz glibc-aa6609feb20ebf8653db639dabe2a6afc77b02cc.tar.bz2 |
AArch64: Improve codegen in SVE tans
Improves memory access.
Tan: MOVPRFX 7 -> 2, LD1RD 12 -> 5, move MOV away from return.
Tanf: MOV 2 -> 1, MOVPRFX 6 -> 3, LD1RW 5 -> 4, move MOV away from return.
-rw-r--r-- | sysdeps/aarch64/fpu/tan_sve.c | 91 | ||||
-rw-r--r-- | sysdeps/aarch64/fpu/tanf_sve.c | 18 |
2 files changed, 68 insertions, 41 deletions
diff --git a/sysdeps/aarch64/fpu/tan_sve.c b/sysdeps/aarch64/fpu/tan_sve.c index 7854741..6cbd4f2 100644 --- a/sysdeps/aarch64/fpu/tan_sve.c +++ b/sysdeps/aarch64/fpu/tan_sve.c @@ -22,24 +22,38 @@ static const struct data { - double poly[9]; - double half_pi_hi, half_pi_lo, inv_half_pi, range_val, shift; + double c2, c4, c6, c8; + double poly_1357[4]; + double c0, inv_half_pi; + double half_pi_hi, half_pi_lo, range_val; } data = { /* Polynomial generated with FPMinimax. */ - .poly = { 0x1.5555555555556p-2, 0x1.1111111110a63p-3, 0x1.ba1ba1bb46414p-5, - 0x1.664f47e5b5445p-6, 0x1.226e5e5ecdfa3p-7, 0x1.d6c7ddbf87047p-9, - 0x1.7ea75d05b583ep-10, 0x1.289f22964a03cp-11, - 0x1.4e4fd14147622p-12, }, + .c2 = 0x1.ba1ba1bb46414p-5, + .c4 = 0x1.226e5e5ecdfa3p-7, + .c6 = 0x1.7ea75d05b583ep-10, + .c8 = 0x1.4e4fd14147622p-12, + .poly_1357 = { 0x1.1111111110a63p-3, 0x1.664f47e5b5445p-6, + 0x1.d6c7ddbf87047p-9, 0x1.289f22964a03cp-11 }, + .c0 = 0x1.5555555555556p-2, + .inv_half_pi = 0x1.45f306dc9c883p-1, .half_pi_hi = 0x1.921fb54442d18p0, .half_pi_lo = 0x1.1a62633145c07p-54, - .inv_half_pi = 0x1.45f306dc9c883p-1, .range_val = 0x1p23, - .shift = 0x1.8p52, }; static svfloat64_t NOINLINE -special_case (svfloat64_t x, svfloat64_t y, svbool_t special) +special_case (svfloat64_t x, svfloat64_t p, svfloat64_t q, svbool_t pg, + svbool_t special) { + svbool_t use_recip = svcmpeq ( + pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); + + svfloat64_t n = svmad_x (pg, p, p, -1); + svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); + svfloat64_t swap = n; + n = svneg_m (n, use_recip, d); + d = svsel (use_recip, swap, d); + svfloat64_t y = svdiv_x (svnot_z (pg, special), n, d); return sv_call_f64 (tan, x, y, special); } @@ -50,15 +64,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special) svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) { const struct data *dat = ptr_barrier (&data); - - /* Invert condition to catch NaNs and Infs as well as large values. 
*/ - svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val)); - + svfloat64_t half_pi_c0 = svld1rq (svptrue_b64 (), &dat->c0); /* q = nearest integer to 2 * x / pi. */ - svfloat64_t shift = sv_f64 (dat->shift); - svfloat64_t q = svmla_x (pg, shift, x, dat->inv_half_pi); - q = svsub_x (pg, q, shift); - svint64_t qi = svcvt_s64_x (pg, q); + svfloat64_t q = svmul_lane (x, half_pi_c0, 1); + q = svrinta_x (pg, q); /* Use q to reduce x to r in [-pi/4, pi/4], by: r = x - q * pi/2, in extended precision. */ @@ -68,7 +77,7 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) r = svmls_lane (r, q, half_pi, 1); /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle formula. */ - r = svmul_x (pg, r, 0.5); + r = svmul_x (svptrue_b64 (), r, 0.5); /* Approximate tan(r) using order 8 polynomial. tan(x) is odd, so polynomial has the form: @@ -76,29 +85,51 @@ svfloat64_t SV_NAME_D1 (tan) (svfloat64_t x, svbool_t pg) Hence we first approximate P(r) = C1 + C2 * r^2 + C3 * r^4 + ... Then compute the approximation by: tan(r) ~= r + r^3 * (C0 + r^2 * P(r)). */ - svfloat64_t r2 = svmul_x (pg, r, r); - svfloat64_t r4 = svmul_x (pg, r2, r2); - svfloat64_t r8 = svmul_x (pg, r4, r4); + + svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r); + svfloat64_t r4 = svmul_x (svptrue_b64 (), r2, r2); + svfloat64_t r8 = svmul_x (svptrue_b64 (), r4, r4); /* Use offset version coeff array by 1 to evaluate from C1 onwards. */ - svfloat64_t p = sv_estrin_7_f64_x (pg, r2, r4, r8, dat->poly + 1); - p = svmad_x (pg, p, r2, dat->poly[0]); - p = svmla_x (pg, r, r2, svmul_x (pg, p, r)); + svfloat64_t C_24 = svld1rq (svptrue_b64 (), &dat->c2); + svfloat64_t C_68 = svld1rq (svptrue_b64 (), &dat->c6); + + /* Use offset version coeff array by 1 to evaluate from C1 onwards. 
*/ + svfloat64_t p01 = svmla_lane (sv_f64 (dat->poly_1357[0]), r2, C_24, 0); + svfloat64_t p23 = svmla_lane_f64 (sv_f64 (dat->poly_1357[1]), r2, C_24, 1); + svfloat64_t p03 = svmla_x (pg, p01, p23, r4); + + svfloat64_t p45 = svmla_lane (sv_f64 (dat->poly_1357[2]), r2, C_68, 0); + svfloat64_t p67 = svmla_lane (sv_f64 (dat->poly_1357[3]), r2, C_68, 1); + svfloat64_t p47 = svmla_x (pg, p45, p67, r4); + + svfloat64_t p = svmla_x (pg, p03, p47, r8); + + svfloat64_t z = svmul_x (svptrue_b64 (), p, r); + z = svmul_x (svptrue_b64 (), r2, z); + z = svmla_lane (z, r, half_pi_c0, 0); + p = svmla_x (pg, r, r2, z); /* Recombination uses double-angle formula: tan(2x) = 2 * tan(x) / (1 - (tan(x))^2) and reciprocity around pi/2: tan(x) = 1 / (tan(pi/2 - x)) to assemble result using change-of-sign and conditional selection of - numerator/denominator dependent on odd/even-ness of q (hence quadrant). */ - svbool_t use_recip - = svcmpeq (pg, svand_x (pg, svreinterpret_u64 (qi), 1), 0); + numerator/denominator dependent on odd/even-ness of q (quadrant). */ + + /* Invert condition to catch NaNs and Infs as well as large values. 
*/ + svbool_t special = svnot_z (pg, svaclt (pg, x, dat->range_val)); + + if (__glibc_unlikely (svptest_any (pg, special))) + { + return special_case (x, p, q, pg, special); + } + svbool_t use_recip = svcmpeq ( + pg, svand_x (pg, svreinterpret_u64 (svcvt_s64_x (pg, q)), 1), 0); svfloat64_t n = svmad_x (pg, p, p, -1); - svfloat64_t d = svmul_x (pg, p, 2); + svfloat64_t d = svmul_x (svptrue_b64 (), p, 2); svfloat64_t swap = n; n = svneg_m (n, use_recip, d); d = svsel (use_recip, swap, d); - if (__glibc_unlikely (svptest_any (pg, special))) - return special_case (x, svdiv_x (svnot_z (pg, special), n, d), special); return svdiv_x (pg, n, d); } diff --git a/sysdeps/aarch64/fpu/tanf_sve.c b/sysdeps/aarch64/fpu/tanf_sve.c index 9ce91b5..8bd5440 100644 --- a/sysdeps/aarch64/fpu/tanf_sve.c +++ b/sysdeps/aarch64/fpu/tanf_sve.c @@ -60,21 +60,16 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - /* Determine whether input is too large to perform fast regression. */ - svbool_t cmp = svacge (pg, x, d->range_val); - svfloat32_t odd_coeffs = svld1rq (svptrue_b32 (), &d->c1); svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->pio2_1); /* n = rint(x/(pi/2)). */ - svfloat32_t q = svmla_lane (sv_f32 (d->shift), x, pi_vals, 3); - svfloat32_t n = svsub_x (pg, q, d->shift); + svfloat32_t n = svrintn_x (pg, svmul_lane (x, pi_vals, 3)); /* n is already a signed integer, simply convert it. */ svint32_t in = svcvt_s32_x (pg, n); /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */ svint32_t alt = svand_x (pg, in, 1); svbool_t pred_alt = svcmpne (pg, alt, 0); - /* r = x - n * (pi/2) (range reduction into 0 .. pi/4). */ svfloat32_t r; r = svmls_lane (x, n, pi_vals, 0); @@ -93,7 +88,7 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) /* Evaluate polynomial approximation of tangent on [-pi/4, pi/4], using Estrin on z^2. 
*/ - svfloat32_t z2 = svmul_x (pg, z, z); + svfloat32_t z2 = svmul_x (svptrue_b32 (), r, r); svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0); svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1); svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2); @@ -106,13 +101,14 @@ svfloat32_t SV_NAME_F1 (tan) (svfloat32_t x, const svbool_t pg) svfloat32_t y = svmla_x (pg, z, p, svmul_x (pg, z, z2)); - /* Transform result back, if necessary. */ - svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); - /* No need to pass pg to specialcase here since cmp is a strict subset, guaranteed by the cmpge above. */ + + /* Determine whether input is too large to perform fast regression. */ + svbool_t cmp = svacge (pg, x, d->range_val); if (__glibc_unlikely (svptest_any (pg, cmp))) - return special_case (x, svsel (pred_alt, inv_y, y), cmp); + return special_case (x, svdivr_x (pg, y, 1.0f), cmp); + svfloat32_t inv_y = svdivr_x (pg, y, 1.0f); return svsel (pred_alt, inv_y, y); } |