commit     afbcca0ea41f109c2a5eae308d636dfcf98eb82b
tree       0ef2983a4d66f50a533364bd92edbebf9505d163
parent     5f1de4d3ce025fcb70b94ebe35241130d990d889
parent     ca4c34e07d1388df8e396520b5e7d60883cd3690
Author:    Stefan Hajnoczi <stefanha@redhat.com>  2025-02-12 08:48:33 -0500
Committer: Stefan Hajnoczi <stefanha@redhat.com>  2025-02-12 08:48:33 -0500
Merge tag 'pull-target-arm-20250211' of https://git.linaro.org/people/pmaydell/qemu-arm into staging
target-arm queue:
* target/alpha: Don't corrupt error_code with unknown softfloat flags
* target/arm: Implement FEAT_AFP and FEAT_RPRES
# -----BEGIN PGP SIGNATURE-----
#
# iQJNBAABCAA3FiEE4aXFk81BneKOgxXPPCUl7RQ2DN4FAmereaQZHHBldGVyLm1h
# eWRlbGxAbGluYXJvLm9yZwAKCRA8JSXtFDYM3gyLEACglOM4E0j1hRl/JZlWD384
# nZL01Hayp9xwSNn28hkXaajCxkErTWLuCZax1g1fBvt/Yqn+E3oFan8gIybMEVgK
# 9ei6/m45fuICSQQhifvYTtYhAMd5uclr0anjRp9gN7FH6aaNPan/ZQYcKYxFq6cp
# RDTF5qiHIgTeXAlU+WiioxravL3A/D+jcQMYLEI5L+Vt5nYNM589PSNFWNLQ6W9e
# Gtmvp0uzrRSZgWxR3nOvhsn1NS/xXK90Zil+GPBo4jf82QVumqKYMsAcireOlxfk
# zTlHXH3PuonGj/ZPLxmiVKYhLb1RglQ9kIs/FHVel18QTz4dJ3DaJp8QXCNHbrKz
# 3aUwSiIh5Y/s3Q/X2Qy3jUHQ5tSjayhIhGFbn6zPdZ+2JZbIEu1Czeparddu/Zlq
# OR0CMVo2Lj/C6OakEU1/YRTKBKiNBaN1eVHi7gjzTDBdbMMC7ZlNuimpFAbthmSC
# szHzkgX8LXHzJqe4vip27yOMFBRPxvst/CXcEoPnjsLEQhLlKjOeFiHuEI+DUvaI
# 24AJ5b0FDdSOEcaFkxFD6gxW8E77MiNtBncfxDxTMKHs/4yFGiDihSPnOCANn3Kk
# zpQIwl0KJAPTA6Cldck9lY7MsKgGPTUNhEThadZlInbp4Uc6T1bvNDtB9b7osDfy
# FeposcM1+GBeuSde0yD6oQ==
# =P3wv
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 11 Feb 2025 11:24:04 EST
# gpg: using RSA key E1A5C593CD419DE28E8315CF3C2525ED14360CDE
# gpg: issuer "peter.maydell@linaro.org"
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>" [full]
# gpg: aka "Peter Maydell <pmaydell@gmail.com>" [full]
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>" [full]
# gpg: aka "Peter Maydell <peter@archaic.org.uk>" [unknown]
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* tag 'pull-target-arm-20250211' of https://git.linaro.org/people/pmaydell/qemu-arm: (68 commits)
target/arm: Sink fp_status and fpcr access into do_fmlal*
target/arm: Read fz16 from env->vfp.fpcr
target/arm: Simplify DO_VFP_cmp in vfp_helper.c
target/arm: Simplify fp_status indexing in mve_helper.c
target/arm: Remove fp_status_a32
target/arm: Remove fp_status_a64
target/arm: Remove fp_status_f16_a32
target/arm: Remove fp_status_f16_a64
target/arm: Remove ah_fp_status
target/arm: Remove ah_fp_status_f16
target/arm: Remove standard_fp_status
target/arm: Remove standard_fp_status_f16
target/arm: Introduce CPUARMState.vfp.fp_status[]
target/arm: Enable FEAT_RPRES for -cpu max
target/arm: Implement increased precision FRSQRTE
target/arm: Implement increased precision FRECPE
target/arm: Plumb FEAT_RPRES frecpe and frsqrte through to new helper
target/arm: Enable FEAT_AFP for '-cpu max'
target/arm: Handle FPCR.AH in SVE FMLSLB, FMLSLT (vectors)
target/arm: Handle FPCR.AH in SVE FMLSL (indexed)
...
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
37 files changed, 2330 insertions(+), 714 deletions(-)
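The centrepiece of the softfloat side of this series is the new FloatFTZDetection control in float_status (see include/fpu/softfloat-types.h in the diff below), which lets a target choose whether flush-to-zero detection of denormal results happens before or after rounding. A minimal sketch of how a target would opt in, using only the setter this series adds to include/fpu/softfloat-helpers.h — the function name my_target_configure_fpu is hypothetical, not part of the patch:

    #include "qemu/osdep.h"
    #include "fpu/softfloat-helpers.h"

    /*
     * Hypothetical example (not from this patch): a target whose
     * hardware flushes denormal results before rounding, the MIPS MSA
     * style mentioned in the FloatFTZDetection comment below.
     */
    static void my_target_configure_fpu(float_status *s)
    {
        set_flush_to_zero(true, s);
        set_float_ftz_detection(float_ftz_before_rounding, s);
    }

Targets that never call set_float_ftz_detection() keep float_ftz_after_rounding, which the enum deliberately makes the zero value so that the default matches the default tininess-after-rounding setting.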
diff --git a/docs/system/arm/emulation.rst b/docs/system/arm/emulation.rst
index 60176d0..78c2fd2 100644
--- a/docs/system/arm/emulation.rst
+++ b/docs/system/arm/emulation.rst
@@ -20,6 +20,7 @@ the following architecture extensions:
 - FEAT_AA64EL3 (Support for AArch64 at EL3)
 - FEAT_AdvSIMD (Advanced SIMD Extension)
 - FEAT_AES (AESD and AESE instructions)
+- FEAT_AFP (Alternate floating-point behavior)
 - FEAT_Armv9_Crypto (Armv9 Cryptographic Extension)
 - FEAT_ASID16 (16 bit ASID)
 - FEAT_BBM at level 2 (Translation table break-before-make levels)
@@ -117,6 +118,7 @@ the following architecture extensions:
 - FEAT_RDM (Advanced SIMD rounding double multiply accumulate instructions)
 - FEAT_RME (Realm Management Extension) (NB: support status in QEMU is experimental)
 - FEAT_RNG (Random number generator)
+- FEAT_RPRES (Increased precision of FRECPE and FRSQRTE)
 - FEAT_S2FWB (Stage 2 forced Write-Back)
 - FEAT_SB (Speculation Barrier)
 - FEAT_SEL2 (Secure EL2)
diff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc
index 73621f4..1d09f06 100644
--- a/fpu/softfloat-parts.c.inc
+++ b/fpu/softfloat-parts.c.inc
@@ -204,7 +204,7 @@ static void partsN(canonicalize)(FloatPartsN *p, float_status *status,
             frac_clear(p);
         } else {
             int shift = frac_normalize(p);
-            p->cls = float_class_normal;
+            p->cls = float_class_denormal;
             p->exp = fmt->frac_shift - fmt->exp_bias - shift +
                      !fmt->m68k_denormal;
         }
@@ -334,7 +334,8 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
             p->frac_lo &= ~round_mask;
         }
         frac_shr(p, frac_shift);
-    } else if (s->flush_to_zero) {
+    } else if (s->flush_to_zero &&
+               s->ftz_detection == float_ftz_before_rounding) {
         flags |= float_flag_output_denormal_flushed;
         p->cls = float_class_zero;
         exp = 0;
@@ -381,11 +382,19 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
         exp = (p->frac_hi & DECOMPOSED_IMPLICIT_BIT) && !fmt->m68k_denormal;
         frac_shr(p, frac_shift);
 
-        if (is_tiny && (flags & float_flag_inexact)) {
-            flags |= float_flag_underflow;
-        }
-        if (exp == 0 && frac_eqz(p)) {
-            p->cls = float_class_zero;
+        if (is_tiny) {
+            if (s->flush_to_zero) {
+                assert(s->ftz_detection == float_ftz_after_rounding);
+                flags |= float_flag_output_denormal_flushed;
+                p->cls = float_class_zero;
+                exp = 0;
+                frac_clear(p);
+            } else if (flags & float_flag_inexact) {
+                flags |= float_flag_underflow;
+            }
+            if (exp == 0 && frac_eqz(p)) {
+                p->cls = float_class_zero;
+            }
         }
     }
     p->exp = exp;
@@ -395,7 +404,7 @@ static void partsN(uncanon_normal)(FloatPartsN *p, float_status *s,
 static void partsN(uncanon)(FloatPartsN *p, float_status *s,
                             const FloatFmt *fmt)
 {
-    if (likely(p->cls == float_class_normal)) {
+    if (likely(is_anynorm(p->cls))) {
         parts_uncanon_normal(p, s, fmt);
     } else {
         switch (p->cls) {
@@ -433,9 +442,18 @@ static FloatPartsN *partsN(addsub)(FloatPartsN *a, FloatPartsN *b,
     bool b_sign = b->sign ^ subtract;
     int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
 
+    /*
+     * For addition and subtraction, we will consume an
+     * input denormal unless the other input is a NaN.
+     */
+    if ((ab_mask & (float_cmask_denormal | float_cmask_anynan)) ==
+        float_cmask_denormal) {
+        float_raise(float_flag_input_denormal_used, s);
+    }
+
     if (a->sign != b_sign) {
         /* Subtraction */
-        if (likely(ab_mask == float_cmask_normal)) {
+        if (likely(cmask_is_only_normals(ab_mask))) {
             if (parts_sub_normal(a, b)) {
                 return a;
             }
@@ -468,7 +486,7 @@ static FloatPartsN *partsN(addsub)(FloatPartsN *a, FloatPartsN *b,
         }
     } else {
         /* Addition */
-        if (likely(ab_mask == float_cmask_normal)) {
+        if (likely(cmask_is_only_normals(ab_mask))) {
             parts_add_normal(a, b);
             return a;
         }
@@ -488,12 +506,12 @@ static FloatPartsN *partsN(addsub)(FloatPartsN *a, FloatPartsN *b,
     }
 
     if (b->cls == float_class_zero) {
-        g_assert(a->cls == float_class_normal);
+        g_assert(is_anynorm(a->cls));
         return a;
     }
 
     g_assert(a->cls == float_class_zero);
-    g_assert(b->cls == float_class_normal);
+    g_assert(is_anynorm(b->cls));
  return_b:
     b->sign = b_sign;
     return b;
@@ -513,9 +531,13 @@ static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b,
     int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
     bool sign = a->sign ^ b->sign;
 
-    if (likely(ab_mask == float_cmask_normal)) {
+    if (likely(cmask_is_only_normals(ab_mask))) {
         FloatPartsW tmp;
 
+        if (ab_mask & float_cmask_denormal) {
+            float_raise(float_flag_input_denormal_used, s);
+        }
+
         frac_mulw(&tmp, a, b);
         frac_truncjam(a, &tmp);
 
@@ -541,6 +563,10 @@ static FloatPartsN *partsN(mul)(FloatPartsN *a, FloatPartsN *b,
     }
 
     /* Multiply by 0 or Inf */
+    if (ab_mask & float_cmask_denormal) {
+        float_raise(float_flag_input_denormal_used, s);
+    }
+
     if (ab_mask & float_cmask_inf) {
         a->cls = float_class_inf;
         a->sign = sign;
@@ -596,7 +622,7 @@ static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
         a->sign ^= 1;
     }
 
-    if (unlikely(ab_mask != float_cmask_normal)) {
+    if (unlikely(!cmask_is_only_normals(ab_mask))) {
         if (unlikely(ab_mask == float_cmask_infzero)) {
             float_raise(float_flag_invalid | float_flag_invalid_imz, s);
             goto d_nan;
@@ -611,7 +637,7 @@ static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
         }
 
         g_assert(ab_mask & float_cmask_zero);
-        if (c->cls == float_class_normal) {
+        if (is_anynorm(c->cls)) {
             *a = *c;
             goto return_normal;
         }
@@ -664,6 +690,16 @@ static FloatPartsN *partsN(muladd_scalbn)(FloatPartsN *a, FloatPartsN *b,
     if (flags & float_muladd_negate_result) {
         a->sign ^= 1;
     }
+
+    /*
+     * All result types except for "return the default NaN
+     * because this is an Invalid Operation" go through here;
+     * this matches the set of cases where we consumed a
+     * denormal input.
+     */
+    if (abc_mask & float_cmask_denormal) {
+        float_raise(float_flag_input_denormal_used, s);
+    }
     return a;
 
 return_sub_zero:
@@ -692,7 +728,10 @@ static FloatPartsN *partsN(div)(FloatPartsN *a, FloatPartsN *b,
     int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
     bool sign = a->sign ^ b->sign;
 
-    if (likely(ab_mask == float_cmask_normal)) {
+    if (likely(cmask_is_only_normals(ab_mask))) {
+        if (ab_mask & float_cmask_denormal) {
+            float_raise(float_flag_input_denormal_used, s);
+        }
         a->sign = sign;
         a->exp -= b->exp + frac_div(a, b);
         return a;
@@ -713,6 +752,10 @@ static FloatPartsN *partsN(div)(FloatPartsN *a, FloatPartsN *b,
         return parts_pick_nan(a, b, s);
     }
 
+    if ((ab_mask & float_cmask_denormal) && b->cls != float_class_zero) {
+        float_raise(float_flag_input_denormal_used, s);
+    }
+
     a->sign = sign;
 
     /* Inf / X */
@@ -750,7 +793,10 @@ static FloatPartsN *partsN(modrem)(FloatPartsN *a, FloatPartsN *b,
 {
     int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
 
-    if (likely(ab_mask == float_cmask_normal)) {
+    if (likely(cmask_is_only_normals(ab_mask))) {
+        if (ab_mask & float_cmask_denormal) {
+            float_raise(float_flag_input_denormal_used, s);
+        }
         frac_modrem(a, b, mod_quot);
         return a;
     }
@@ -771,6 +817,10 @@ static FloatPartsN *partsN(modrem)(FloatPartsN *a, FloatPartsN *b,
         return a;
     }
 
+    if (ab_mask & float_cmask_denormal) {
+        float_raise(float_flag_input_denormal_used, s);
+    }
+
     /* N % Inf; 0 % N */
     g_assert(b->cls == float_class_inf || a->cls == float_class_zero);
     return a;
@@ -800,6 +850,12 @@ static void partsN(sqrt)(FloatPartsN *a, float_status *status,
 
     if (unlikely(a->cls != float_class_normal)) {
         switch (a->cls) {
+        case float_class_denormal:
+            if (!a->sign) {
+                /* -ve denormal will be InvalidOperation */
+                float_raise(float_flag_input_denormal_used, status);
+            }
+            break;
         case float_class_snan:
         case float_class_qnan:
             parts_return_nan(a, status);
@@ -1130,6 +1186,7 @@ static void partsN(round_to_int)(FloatPartsN *a, FloatRoundMode rmode,
     case float_class_inf:
         break;
     case float_class_normal:
+    case float_class_denormal:
         if (parts_round_to_int_normal(a, rmode, scale, fmt->frac_size)) {
             float_raise(float_flag_inexact, s);
         }
@@ -1174,6 +1231,7 @@ static int64_t partsN(float_to_sint)(FloatPartsN *p, FloatRoundMode rmode,
         return 0;
 
     case float_class_normal:
+    case float_class_denormal:
         /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
         if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
             flags = float_flag_inexact;
@@ -1241,6 +1299,7 @@ static uint64_t partsN(float_to_uint)(FloatPartsN *p, FloatRoundMode rmode,
         return 0;
 
     case float_class_normal:
+    case float_class_denormal:
         /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
         if (parts_round_to_int_normal(p, rmode, scale, N - 2)) {
             flags = float_flag_inexact;
@@ -1304,6 +1363,7 @@ static int64_t partsN(float_to_sint_modulo)(FloatPartsN *p,
         return 0;
 
     case float_class_normal:
+    case float_class_denormal:
         /* TODO: N - 2 is frac_size for rounding; could use input fmt. */
         if (parts_round_to_int_normal(p, rmode, 0, N - 2)) {
             flags = float_flag_inexact;
@@ -1425,6 +1485,9 @@ static FloatPartsN *partsN(minmax)(FloatPartsN *a, FloatPartsN *b,
         if ((flags & (minmax_isnum | minmax_isnumber))
             && !(ab_mask & float_cmask_snan)
             && (ab_mask & ~float_cmask_qnan)) {
+            if (ab_mask & float_cmask_denormal) {
+                float_raise(float_flag_input_denormal_used, s);
+            }
             return is_nan(a->cls) ? b : a;
         }
 
@@ -1449,12 +1512,17 @@ static FloatPartsN *partsN(minmax)(FloatPartsN *a, FloatPartsN *b,
         return parts_pick_nan(a, b, s);
     }
 
+    if (ab_mask & float_cmask_denormal) {
+        float_raise(float_flag_input_denormal_used, s);
+    }
+
     a_exp = a->exp;
     b_exp = b->exp;
 
-    if (unlikely(ab_mask != float_cmask_normal)) {
+    if (unlikely(!cmask_is_only_normals(ab_mask))) {
         switch (a->cls) {
         case float_class_normal:
+        case float_class_denormal:
             break;
         case float_class_inf:
             a_exp = INT16_MAX;
@@ -1467,6 +1535,7 @@ static FloatPartsN *partsN(minmax)(FloatPartsN *a, FloatPartsN *b,
         }
         switch (b->cls) {
         case float_class_normal:
+        case float_class_denormal:
             break;
         case float_class_inf:
             b_exp = INT16_MAX;
@@ -1513,9 +1582,13 @@ static FloatRelation partsN(compare)(FloatPartsN *a, FloatPartsN *b,
 {
     int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
 
-    if (likely(ab_mask == float_cmask_normal)) {
+    if (likely(cmask_is_only_normals(ab_mask))) {
         FloatRelation cmp;
 
+        if (ab_mask & float_cmask_denormal) {
+            float_raise(float_flag_input_denormal_used, s);
+        }
+
         if (a->sign != b->sign) {
             goto a_sign;
         }
@@ -1541,6 +1614,10 @@ static FloatRelation partsN(compare)(FloatPartsN *a, FloatPartsN *b,
         return float_relation_unordered;
     }
 
+    if (ab_mask & float_cmask_denormal) {
+        float_raise(float_flag_input_denormal_used, s);
+    }
+
     if (ab_mask & float_cmask_zero) {
         if (ab_mask == float_cmask_zero) {
             return float_relation_equal;
@@ -1580,6 +1657,9 @@ static void partsN(scalbn)(FloatPartsN *a, int n, float_status *s)
     case float_class_zero:
    case float_class_inf:
         break;
+    case float_class_denormal:
+        float_raise(float_flag_input_denormal_used, s);
+        /* fall through */
     case float_class_normal:
         a->exp += MIN(MAX(n, -0x10000), 0x10000);
         break;
@@ -1599,6 +1679,12 @@ static void partsN(log2)(FloatPartsN *a, float_status *s, const FloatFmt *fmt)
 
     if (unlikely(a->cls != float_class_normal)) {
         switch (a->cls) {
+        case float_class_denormal:
+            if (!a->sign) {
+                /* -ve denormal will be InvalidOperation */
+                float_raise(float_flag_input_denormal_used, s);
+            }
+            break;
         case float_class_snan:
         case float_class_qnan:
             parts_return_nan(a, s);
@@ -1615,9 +1701,8 @@ static void partsN(log2)(FloatPartsN *a, float_status *s, const FloatFmt *fmt)
             }
             return;
         default:
-            break;
+            g_assert_not_reached();
         }
-        g_assert_not_reached();
     }
     if (unlikely(a->sign)) {
         goto d_nan;
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 26f3a8d..f4fed9b 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -404,12 +404,16 @@ float64_gen2(float64 xa, float64 xb, float_status *s,
 
 /*
  * Classify a floating point number. Everything above float_class_qnan
  * is a NaN so cls >= float_class_qnan is any NaN.
+ *
+ * Note that we canonicalize denormals, so most code should treat
+ * class_normal and class_denormal identically.
  */
 typedef enum __attribute__ ((__packed__)) {
     float_class_unclassified,
     float_class_zero,
     float_class_normal,
+    float_class_denormal, /* input was a non-squashed denormal */
     float_class_inf,
     float_class_qnan,  /* all NaNs from here */
     float_class_snan,
@@ -420,12 +424,14 @@ typedef enum __attribute__ ((__packed__)) {
 enum {
     float_cmask_zero = float_cmask(float_class_zero),
     float_cmask_normal = float_cmask(float_class_normal),
+    float_cmask_denormal = float_cmask(float_class_denormal),
     float_cmask_inf = float_cmask(float_class_inf),
     float_cmask_qnan = float_cmask(float_class_qnan),
     float_cmask_snan = float_cmask(float_class_snan),
 
     float_cmask_infzero = float_cmask_zero | float_cmask_inf,
     float_cmask_anynan = float_cmask_qnan | float_cmask_snan,
+    float_cmask_anynorm = float_cmask_normal | float_cmask_denormal,
 };
 
 /* Flags for parts_minmax. */
@@ -460,6 +466,20 @@ static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 }
 
 /*
+ * Return true if the float_cmask has only normals in it
+ * (including input denormals that were canonicalized)
+ */
+static inline bool cmask_is_only_normals(int cmask)
+{
+    return !(cmask & ~float_cmask_anynorm);
+}
+
+static inline bool is_anynorm(FloatClass c)
+{
+    return float_cmask(c) & float_cmask_anynorm;
+}
+
+/*
  * Structure holding all of the decomposed parts of a float.
  * The exponent is unbiased and the fraction is normalized.
 *
@@ -1729,6 +1749,7 @@ static float64 float64r32_round_pack_canonical(FloatParts64 *p,
      */
     switch (p->cls) {
    case float_class_normal:
+    case float_class_denormal:
         if (unlikely(p->exp == 0)) {
             /*
              * The result is denormal for float32, but can be represented
@@ -1817,6 +1838,7 @@ static floatx80 floatx80_round_pack_canonical(FloatParts128 *p,
 
     switch (p->cls) {
     case float_class_normal:
+    case float_class_denormal:
         if (s->floatx80_rounding_precision == floatx80_precision_x) {
             parts_uncanon_normal(p, s, fmt);
             frac = p->frac_hi;
@@ -2696,6 +2718,9 @@ static void parts_float_to_ahp(FloatParts64 *a, float_status *s)
                            float16_params_ahp.frac_size + 1);
         break;
 
+    case float_class_denormal:
+        float_raise(float_flag_input_denormal_used, s);
+        break;
     case float_class_normal:
     case float_class_zero:
         break;
@@ -2710,6 +2735,9 @@ static void parts64_float_to_float(FloatParts64 *a, float_status *s)
     if (is_nan(a->cls)) {
         parts_return_nan(a, s);
     }
+    if (a->cls == float_class_denormal) {
+        float_raise(float_flag_input_denormal_used, s);
+    }
 }
 
 static void parts128_float_to_float(FloatParts128 *a, float_status *s)
@@ -2717,6 +2745,9 @@ static void parts128_float_to_float(FloatParts128 *a, float_status *s)
     if (is_nan(a->cls)) {
         parts_return_nan(a, s);
     }
+    if (a->cls == float_class_denormal) {
+        float_raise(float_flag_input_denormal_used, s);
+    }
 }
 
 #define parts_float_to_float(P, S) \
@@ -2729,12 +2760,21 @@ static void parts_float_to_float_narrow(FloatParts64 *a, FloatParts128 *b,
     a->sign = b->sign;
     a->exp = b->exp;
 
-    if (a->cls == float_class_normal) {
+    switch (a->cls) {
+    case float_class_denormal:
+        float_raise(float_flag_input_denormal_used, s);
+        /* fall through */
+    case float_class_normal:
         frac_truncjam(a, b);
-    } else if (is_nan(a->cls)) {
+        break;
+    case float_class_snan:
+    case float_class_qnan:
         /* Discard the low bits of the NaN. */
         a->frac = b->frac_hi;
         parts_return_nan(a, s);
+        break;
+    default:
+        break;
     }
 }
 
@@ -2749,6 +2789,9 @@ static void parts_float_to_float_widen(FloatParts128 *a, FloatParts64 *b,
     if (is_nan(a->cls)) {
         parts_return_nan(a, s);
     }
+    if (a->cls == float_class_denormal) {
+        float_raise(float_flag_input_denormal_used, s);
+    }
 }
 
 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
@@ -3218,6 +3261,7 @@ static Int128 float128_to_int128_scalbn(float128 a, FloatRoundMode rmode,
         return int128_zero();
 
     case float_class_normal:
+    case float_class_denormal:
         if (parts_round_to_int_normal(&p, rmode, scale, 128 - 2)) {
             flags = float_flag_inexact;
         }
@@ -3645,6 +3689,7 @@ static Int128 float128_to_uint128_scalbn(float128 a, FloatRoundMode rmode,
         return int128_zero();
 
     case float_class_normal:
+    case float_class_denormal:
         if (parts_round_to_int_normal(&p, rmode, scale, 128 - 2)) {
             flags = float_flag_inexact;
             if (p.cls == float_class_zero) {
@@ -4386,7 +4431,11 @@ float32_hs_compare(float32 xa, float32 xb, float_status *s, bool is_quiet)
         goto soft;
     }
 
-    float32_input_flush2(&ua.s, &ub.s, s);
+    if (unlikely(float32_is_denormal(ua.s) || float32_is_denormal(ub.s))) {
+        /* We may need to set the input_denormal_used flag */
+        goto soft;
+    }
+
     if (isgreaterequal(ua.h, ub.h)) {
         if (isgreater(ua.h, ub.h)) {
             return float_relation_greater;
@@ -4436,7 +4485,11 @@ float64_hs_compare(float64 xa, float64 xb, float_status *s, bool is_quiet)
         goto soft;
     }
 
-    float64_input_flush2(&ua.s, &ub.s, s);
+    if (unlikely(float64_is_denormal(ua.s) || float64_is_denormal(ub.s))) {
+        /* We may need to set the input_denormal_used flag */
+        goto soft;
+    }
+
     if (isgreaterequal(ua.h, ub.h)) {
         if (isgreater(ua.h, ub.h)) {
             return float_relation_greater;
@@ -5231,6 +5284,8 @@ float32 float32_exp2(float32 a, float_status *status)
     float32_unpack_canonical(&xp, a, status);
     if (unlikely(xp.cls != float_class_normal)) {
         switch (xp.cls) {
+        case float_class_denormal:
+            break;
         case float_class_snan:
         case float_class_qnan:
             parts_return_nan(&xp, status);
@@ -5240,9 +5295,8 @@ float32 float32_exp2(float32 a, float_status *status)
         case float_class_zero:
             return float32_one;
         default:
-            break;
+            g_assert_not_reached();
         }
-        g_assert_not_reached();
     }
 
     float_raise(float_flag_inexact, status);
diff --git a/include/fpu/softfloat-helpers.h b/include/fpu/softfloat-helpers.h
index 4cb30a4..8983c27 100644
--- a/include/fpu/softfloat-helpers.h
+++ b/include/fpu/softfloat-helpers.h
@@ -109,6 +109,12 @@ static inline void set_flush_inputs_to_zero(bool val, float_status *status)
     status->flush_inputs_to_zero = val;
 }
 
+static inline void set_float_ftz_detection(FloatFTZDetection d,
+                                           float_status *status)
+{
+    status->ftz_detection = d;
+}
+
 static inline void set_default_nan_mode(bool val, float_status *status)
 {
     status->default_nan_mode = val;
@@ -183,4 +189,9 @@ static inline bool get_default_nan_mode(const float_status *status)
     return status->default_nan_mode;
 }
 
+static inline FloatFTZDetection get_float_ftz_detection(const float_status *status)
+{
+    return status->ftz_detection;
+}
+
 #endif /* SOFTFLOAT_HELPERS_H */
diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h
index 2e43d1d..53d5eb8 100644
--- a/include/fpu/softfloat-types.h
+++ b/include/fpu/softfloat-types.h
@@ -165,6 +165,13 @@ enum {
     float_flag_invalid_sqrt  = 0x0800,  /* sqrt(-x) */
     float_flag_invalid_cvti  = 0x1000,  /* non-nan to integer */
     float_flag_invalid_snan  = 0x2000,  /* any operand was snan */
+    /*
+     * An input was denormal and we used it (without flushing it to zero).
+     * Not set if we do not actually use the denormal input (e.g.
+     * because some other input was a NaN, or because the operation
+     * wasn't actually carried out (divide-by-zero; invalid))
+     */
+    float_flag_input_denormal_used = 0x4000,
 };
 
 /*
@@ -298,6 +305,22 @@ typedef enum __attribute__((__packed__)) {
 } FloatInfZeroNaNRule;
 
 /*
+ * When flush_to_zero is set, should we detect denormal results to
+ * be flushed before or after rounding? For most architectures this
+ * should be set to match the tininess_before_rounding setting,
+ * but a few architectures, e.g. MIPS MSA, detect FTZ before
+ * rounding but tininess after rounding.
+ *
+ * This enum is arranged so that the default if the target doesn't
+ * configure it matches the default for tininess_before_rounding
+ * (i.e. "after rounding").
+ */
+typedef enum __attribute__((__packed__)) {
+    float_ftz_after_rounding = 0,
+    float_ftz_before_rounding = 1,
+} FloatFTZDetection;
+
+/*
  * Floating Point Status. Individual architectures may maintain
  * several versions of float_status for different functions. The
  * correct status for the operation is then passed by reference to
@@ -314,6 +337,8 @@ typedef struct float_status {
     bool tininess_before_rounding;
     /* should denormalised results go to zero and set output_denormal_flushed? */
     bool flush_to_zero;
+    /* do we detect and flush denormal results before or after rounding? */
+    FloatFTZDetection ftz_detection;
     /* should denormalised inputs go to zero and set input_denormal_flushed? */
     bool flush_inputs_to_zero;
     bool default_nan_mode;
diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index e1b898e..f5dd744 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -202,6 +202,13 @@ static void alpha_cpu_initfn(Object *obj)
     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->fp_status);
     /* Default NaN: sign bit clear, msb frac bit set */
     set_float_default_nan_pattern(0b01000000, &env->fp_status);
+    /*
+     * TODO: this is incorrect. The Alpha Architecture Handbook version 4
+     * section 4.7.7.11 says that we flush to zero for underflow cases, so
+     * this should be float_ftz_after_rounding to match the
+     * tininess_after_rounding (which is specified in section 4.7.5).
+     */
+    set_float_ftz_detection(float_ftz_before_rounding, &env->fp_status);
 #if defined(CONFIG_USER_ONLY)
     env->flags = ENV_FLAG_PS_USER | ENV_FLAG_FEN;
     cpu_alpha_store_fpcr(env, (uint64_t)(FPCR_INVD | FPCR_DZED | FPCR_OVFD
diff --git a/target/alpha/fpu_helper.c b/target/alpha/fpu_helper.c
index 63d9e9c..f810a9b 100644
--- a/target/alpha/fpu_helper.c
+++ b/target/alpha/fpu_helper.c
@@ -476,6 +476,8 @@ static uint64_t do_cvttq(CPUAlphaState *env, uint64_t a, int roundmode)
             exc = FPCR_INV;
         } else if (exc & float_flag_inexact) {
             exc = FPCR_INE;
+        } else {
+            exc = 0;
         }
     }
     env->error_code = exc;
diff --git a/target/arm/cpu-features.h b/target/arm/cpu-features.h
index 30302d6..525e4ce 100644
--- a/target/arm/cpu-features.h
+++ b/target/arm/cpu-features.h
@@ -597,6 +597,11 @@ static inline bool isar_feature_aa64_mops(const ARMISARegisters *id)
     return FIELD_EX64(id->id_aa64isar2, ID_AA64ISAR2, MOPS);
 }
 
+static inline bool isar_feature_aa64_rpres(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->id_aa64isar2, ID_AA64ISAR2, RPRES);
+}
+
 static inline bool isar_feature_aa64_fp_simd(const ARMISARegisters *id)
 {
     /* We always set the AdvSIMD and FP fields identically.  */
@@ -802,6 +807,11 @@ static inline bool isar_feature_aa64_hcx(const ARMISARegisters *id)
     return FIELD_EX64(id->id_aa64mmfr1, ID_AA64MMFR1, HCX) != 0;
 }
 
+static inline bool isar_feature_aa64_afp(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->id_aa64mmfr1, ID_AA64MMFR1, AFP) != 0;
+}
+
 static inline bool isar_feature_aa64_tidcp1(const ARMISARegisters *id)
 {
     return FIELD_EX64(id->id_aa64mmfr1, ID_AA64MMFR1, TIDCP1) != 0;
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 32dc7c1..656070a 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -169,28 +169,6 @@ void arm_register_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook,
     QLIST_INSERT_HEAD(&cpu->el_change_hooks, entry, node);
 }
 
-/*
- * Set the float_status behaviour to match the Arm defaults:
- *  * tininess-before-rounding
- *  * 2-input NaN propagation prefers SNaN over QNaN, and then
- *    operand A over operand B (see FPProcessNaNs() pseudocode)
- *  * 3-input NaN propagation prefers SNaN over QNaN, and then
- *    operand C over A over B (see FPProcessNaNs3() pseudocode,
- *    but note that for QEMU muladd is a * b + c, whereas for
- *    the pseudocode function the arguments are in the order c, a, b.
- *  * 0 * Inf + NaN returns the default NaN if the input NaN is quiet,
- *    and the input NaN if it is signalling
- *  * Default NaN has sign bit clear, msb frac bit set
- */
-static void arm_set_default_fp_behaviours(float_status *s)
-{
-    set_float_detect_tininess(float_tininess_before_rounding, s);
-    set_float_2nan_prop_rule(float_2nan_prop_s_ab, s);
-    set_float_3nan_prop_rule(float_3nan_prop_s_cab, s);
-    set_float_infzeronan_rule(float_infzeronan_dnan_if_qnan, s);
-    set_float_default_nan_pattern(0b01000000, s);
-}
-
 static void cp_reg_reset(gpointer key, gpointer value, gpointer opaque)
 {
     /* Reset a single ARMCPRegInfo register */
@@ -568,16 +546,20 @@ static void arm_cpu_reset_hold(Object *obj, ResetType type)
         env->sau.ctrl = 0;
     }
 
-    set_flush_to_zero(1, &env->vfp.standard_fp_status);
-    set_flush_inputs_to_zero(1, &env->vfp.standard_fp_status);
-    set_default_nan_mode(1, &env->vfp.standard_fp_status);
-    set_default_nan_mode(1, &env->vfp.standard_fp_status_f16);
-    arm_set_default_fp_behaviours(&env->vfp.fp_status_a32);
-    arm_set_default_fp_behaviours(&env->vfp.fp_status_a64);
-    arm_set_default_fp_behaviours(&env->vfp.standard_fp_status);
-    arm_set_default_fp_behaviours(&env->vfp.fp_status_f16_a32);
-    arm_set_default_fp_behaviours(&env->vfp.fp_status_f16_a64);
-    arm_set_default_fp_behaviours(&env->vfp.standard_fp_status_f16);
+    set_flush_to_zero(1, &env->vfp.fp_status[FPST_STD]);
+    set_flush_inputs_to_zero(1, &env->vfp.fp_status[FPST_STD]);
+    set_default_nan_mode(1, &env->vfp.fp_status[FPST_STD]);
+    set_default_nan_mode(1, &env->vfp.fp_status[FPST_STD_F16]);
+    arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A32]);
+    arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64]);
+    arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_STD]);
+    arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A32_F16]);
+    arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64_F16]);
+    arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_STD_F16]);
+    arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_AH]);
+    set_flush_to_zero(1, &env->vfp.fp_status[FPST_AH]);
+    set_flush_inputs_to_zero(1, &env->vfp.fp_status[FPST_AH]);
+    arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_AH_F16]);
 
 #ifndef CONFIG_USER_ONLY
     if (kvm_enabled()) {
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index c2d2d99..6f6cf5c 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -202,6 +202,61 @@ typedef struct ARMMMUFaultInfo ARMMMUFaultInfo;
 
 typedef struct NVICState NVICState;
 
+/*
+ * Enum for indexing vfp.fp_status[].
+ *
+ * FPST_A32: is the "normal" fp status for AArch32 insns
+ * FPST_A64: is the "normal" fp status for AArch64 insns
+ * FPST_A32_F16: used for AArch32 half-precision calculations
+ * FPST_A64_F16: used for AArch64 half-precision calculations
+ * FPST_STD: the ARM "Standard FPSCR Value"
+ * FPST_STD_F16: used for half-precision
+ *       calculations with the ARM "Standard FPSCR Value"
+ * FPST_AH: used for the A64 insns which change behaviour
+ *       when FPCR.AH == 1 (bfloat16 conversions and multiplies,
+ *       and the reciprocal and square root estimate/step insns)
+ * FPST_AH_F16: used for the A64 insns which change behaviour
+ *       when FPCR.AH == 1 (bfloat16 conversions and multiplies,
+ *       and the reciprocal and square root estimate/step insns);
+ *       for half-precision
+ *
+ * Half-precision operations are governed by a separate
+ * flush-to-zero control bit in FPSCR:FZ16. We pass a separate
+ * status structure to control this.
+ *
+ * The "Standard FPSCR", ie default-NaN, flush-to-zero,
+ * round-to-nearest and is used by any operations (generally
+ * Neon) which the architecture defines as controlled by the
+ * standard FPSCR value rather than the FPSCR.
+ *
+ * The "standard FPSCR but for fp16 ops" is needed because
+ * the "standard FPSCR" tracks the FPSCR.FZ16 bit rather than
+ * using a fixed value for it.
+ *
+ * FPST_AH is needed because some insns have different
+ * behaviour when FPCR.AH == 1: they don't update cumulative
+ * exception flags, they act like FPCR.{FZ,FIZ} = {1,1} and
+ * they ignore FPCR.RMode. But they don't ignore FPCR.FZ16,
+ * which means we need an FPST_AH_F16 as well.
+ *
+ * To avoid having to transfer exception bits around, we simply
+ * say that the FPSCR cumulative exception flags are the logical
+ * OR of the flags in the four fp statuses. This relies on the
+ * only thing which needs to read the exception flags being
+ * an explicit FPSCR read.
+ */
+typedef enum ARMFPStatusFlavour {
+    FPST_A32,
+    FPST_A64,
+    FPST_A32_F16,
+    FPST_A64_F16,
+    FPST_AH,
+    FPST_AH_F16,
+    FPST_STD,
+    FPST_STD_F16,
+} ARMFPStatusFlavour;
+#define FPST_COUNT 8
+
 typedef struct CPUArchState {
     /* Regs for current mode.  */
     uint32_t regs[16];
@@ -631,41 +686,8 @@ typedef struct CPUArchState {
         /* Scratch space for aa32 neon expansion.  */
         uint32_t scratch[8];
 
-        /* There are a number of distinct float control structures:
-         *
-         *  fp_status_a32: is the "normal" fp status for AArch32 insns
-         *  fp_status_a64: is the "normal" fp status for AArch64 insns
-         *  fp_status_fp16_a32: used for AArch32 half-precision calculations
-         *  fp_status_fp16_a64: used for AArch64 half-precision calculations
-         *  standard_fp_status : the ARM "Standard FPSCR Value"
-         *  standard_fp_status_fp16 : used for half-precision
-         *       calculations with the ARM "Standard FPSCR Value"
-         *
-         * Half-precision operations are governed by a separate
-         * flush-to-zero control bit in FPSCR:FZ16. We pass a separate
-         * status structure to control this.
-         *
-         * The "Standard FPSCR", ie default-NaN, flush-to-zero,
-         * round-to-nearest and is used by any operations (generally
-         * Neon) which the architecture defines as controlled by the
-         * standard FPSCR value rather than the FPSCR.
-         *
-         * The "standard FPSCR but for fp16 ops" is needed because
-         * the "standard FPSCR" tracks the FPSCR.FZ16 bit rather than
-         * using a fixed value for it.
-         *
-         * To avoid having to transfer exception bits around, we simply
-         * say that the FPSCR cumulative exception flags are the logical
-         * OR of the flags in the four fp statuses. This relies on the
-         * only thing which needs to read the exception flags being
-         * an explicit FPSCR read.
-         */
-        float_status fp_status_a32;
-        float_status fp_status_a64;
-        float_status fp_status_f16_a32;
-        float_status fp_status_f16_a64;
-        float_status standard_fp_status;
-        float_status standard_fp_status_f16;
+        /* There are a number of distinct float control structures. */
+        float_status fp_status[FPST_COUNT];
 
         uint64_t zcr_el[4];   /* ZCR_EL[1-3] */
         uint64_t smcr_el[4];  /* SMCR_EL[1-3] */
@@ -1714,6 +1736,9 @@ void vfp_set_fpscr(CPUARMState *env, uint32_t val);
  */
 
 /* FPCR bits */
+#define FPCR_FIZ    (1 << 0)    /* Flush Inputs to Zero (FEAT_AFP) */
+#define FPCR_AH     (1 << 1)    /* Alternate Handling (FEAT_AFP) */
+#define FPCR_NEP    (1 << 2)    /* SIMD scalar ops preserve elts (FEAT_AFP) */
 #define FPCR_IOE    (1 << 8)    /* Invalid Operation exception trap enable */
 #define FPCR_DZE    (1 << 9)    /* Divide by Zero exception trap enable */
 #define FPCR_OFE    (1 << 10)   /* Overflow exception trap enable */
@@ -3195,6 +3220,8 @@ FIELD(TBFLAG_A64, NV2, 34, 1)
 FIELD(TBFLAG_A64, NV2_MEM_E20, 35, 1)
 /* Set if FEAT_NV2 RAM accesses are big-endian */
 FIELD(TBFLAG_A64, NV2_MEM_BE, 36, 1)
+FIELD(TBFLAG_A64, AH, 37, 1)   /* FPCR.AH */
+FIELD(TBFLAG_A64, NEP, 38, 1)  /* FPCR.NEP */
 
 /*
  * Helpers for using the above. Note that only the A64 accessors use
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 40bdfc8..7d95eae 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -4848,7 +4848,7 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
       .writefn = aa64_daif_write, .resetfn = arm_cp_reset_ignore },
     { .name = "FPCR", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 3, .opc2 = 0, .crn = 4, .crm = 4,
-      .access = PL0_RW, .type = ARM_CP_FPU | ARM_CP_SUPPRESS_TB_END,
+      .access = PL0_RW, .type = ARM_CP_FPU,
       .readfn = aa64_fpcr_read, .writefn = aa64_fpcr_write },
     { .name = "FPSR", .state = ARM_CP_STATE_AA64,
       .opc0 = 3, .opc1 = 3, .opc2 = 1, .crn = 4, .crm = 4,
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 15bad07..0907505 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -245,9 +245,11 @@ DEF_HELPER_4(vfp_muladdh, f16, f16, f16, f16, fpst)
 
 DEF_HELPER_FLAGS_2(recpe_f16, TCG_CALL_NO_RWG, f16, f16, fpst)
 DEF_HELPER_FLAGS_2(recpe_f32, TCG_CALL_NO_RWG, f32, f32, fpst)
+DEF_HELPER_FLAGS_2(recpe_rpres_f32, TCG_CALL_NO_RWG, f32, f32, fpst)
 DEF_HELPER_FLAGS_2(recpe_f64, TCG_CALL_NO_RWG, f64, f64, fpst)
 DEF_HELPER_FLAGS_2(rsqrte_f16, TCG_CALL_NO_RWG, f16, f16, fpst)
 DEF_HELPER_FLAGS_2(rsqrte_f32, TCG_CALL_NO_RWG, f32, f32, fpst)
+DEF_HELPER_FLAGS_2(rsqrte_rpres_f32, TCG_CALL_NO_RWG, f32, f32, fpst)
 DEF_HELPER_FLAGS_2(rsqrte_f64, TCG_CALL_NO_RWG, f64, f64, fpst)
 DEF_HELPER_FLAGS_1(recpe_u32, TCG_CALL_NO_RWG, i32, i32)
 DEF_HELPER_FLAGS_1(rsqrte_u32, TCG_CALL_NO_RWG, i32, i32)
@@ -680,10 +682,12 @@
 DEF_HELPER_FLAGS_4(gvec_vrintx_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
 
 DEF_HELPER_FLAGS_4(gvec_frecpe_h, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(gvec_frecpe_rpres_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_4(gvec_frecpe_d, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_4(gvec_frsqrte_h, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_4(gvec_frsqrte_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(gvec_frsqrte_rpres_s, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_4(gvec_frsqrte_d, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
 
 DEF_HELPER_FLAGS_4(gvec_fcgt0_h, TCG_CALL_NO_RWG, void, ptr, ptr, fpst, i32)
@@ -722,6 +726,10 @@ DEF_HELPER_FLAGS_5(gvec_fabd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_5(gvec_fabd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_5(gvec_fabd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
 
+DEF_HELPER_FLAGS_5(gvec_ah_fabd_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_fabd_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_fabd_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+
 DEF_HELPER_FLAGS_5(gvec_fceq_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_5(gvec_fceq_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_5(gvec_fceq_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
@@ -778,6 +786,10 @@ DEF_HELPER_FLAGS_5(gvec_vfms_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_5(gvec_vfms_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_5(gvec_vfms_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
 
+DEF_HELPER_FLAGS_5(gvec_ah_vfms_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_vfms_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_vfms_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, fpst, i32)
+
 DEF_HELPER_FLAGS_5(gvec_ftsmul_h, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_5(gvec_ftsmul_s, TCG_CALL_NO_RWG,
@@ -809,6 +821,20 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, fpst, i32)
 
+DEF_HELPER_FLAGS_6(gvec_fmls_idx_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_fmls_idx_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_fmls_idx_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_6(gvec_ah_fmls_idx_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_ah_fmls_idx_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_ah_fmls_idx_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+
 DEF_HELPER_FLAGS_5(gvec_uqadd_b, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_uqadd_h, TCG_CALL_NO_RWG,
diff --git a/target/arm/internals.h b/target/arm/internals.h
index 863a84e..b318734 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -1828,4 +1828,10 @@ uint64_t gt_virt_cnt_offset(CPUARMState *env);
 * all EL1" scope; this covers stage 1 and stage 2.
  */
 int alle1_tlbmask(CPUARMState *env);
+
+/* Set the float_status behaviour to match the Arm defaults */
+void arm_set_default_fp_behaviours(float_status *s);
+/* Set the float_status behaviour to match Arm FPCR.AH=1 behaviour */
+void arm_set_ah_fp_behaviours(float_status *s);
+
 #endif
diff --git a/target/arm/tcg/cpu64.c b/target/arm/tcg/cpu64.c
index 93573ce..29ab0ac 100644
--- a/target/arm/tcg/cpu64.c
+++ b/target/arm/tcg/cpu64.c
@@ -1167,6 +1167,7 @@ void aarch64_max_tcg_initfn(Object *obj)
     cpu->isar.id_aa64isar1 = t;
 
     t = cpu->isar.id_aa64isar2;
+    t = FIELD_DP64(t, ID_AA64ISAR2, RPRES, 1);    /* FEAT_RPRES */
     t = FIELD_DP64(t, ID_AA64ISAR2, MOPS, 1);     /* FEAT_MOPS */
     t = FIELD_DP64(t, ID_AA64ISAR2, BC, 1);       /* FEAT_HBC */
     t = FIELD_DP64(t, ID_AA64ISAR2, WFXT, 2);     /* FEAT_WFxT */
@@ -1218,6 +1219,7 @@ void aarch64_max_tcg_initfn(Object *obj)
     t = FIELD_DP64(t, ID_AA64MMFR1, XNX, 1);      /* FEAT_XNX */
     t = FIELD_DP64(t, ID_AA64MMFR1, ETS, 2);      /* FEAT_ETS2 */
     t = FIELD_DP64(t, ID_AA64MMFR1, HCX, 1);      /* FEAT_HCX */
+    t = FIELD_DP64(t, ID_AA64MMFR1, AFP, 1);      /* FEAT_AFP */
     t = FIELD_DP64(t, ID_AA64MMFR1, TIDCP1, 1);   /* FEAT_TIDCP1 */
     t = FIELD_DP64(t, ID_AA64MMFR1, CMOW, 1);     /* FEAT_CMOW */
     cpu->isar.id_aa64mmfr1 = t;
diff --git a/target/arm/tcg/helper-a64.c b/target/arm/tcg/helper-a64.c
index 0503608..32f0647 100644
--- a/target/arm/tcg/helper-a64.c
+++ b/target/arm/tcg/helper-a64.c
@@ -38,6 +38,7 @@
 #ifdef CONFIG_USER_ONLY
 #include "user/page-protection.h"
 #endif
+#include "vec_internal.h"
 
 /* C2.4.7 Multiply and divide */
 /* special cases for 0 and LLONG_MIN are mandated by the standard */
@@ -208,88 +209,52 @@ uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, float_status *fpst)
     return -float64_lt(b, a, fpst);
 }
 
-/* Reciprocal step and sqrt step. Note that unlike the A32/T32
+/*
+ * Reciprocal step and sqrt step. Note that unlike the A32/T32
  * versions, these do a fully fused multiply-add or
  * multiply-add-and-halve.
+ * The FPCR.AH == 1 versions need to avoid flipping the sign of NaN.
 */
-
-uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, float_status *fpst)
-{
-    a = float16_squash_input_denormal(a, fpst);
-    b = float16_squash_input_denormal(b, fpst);
-
-    a = float16_chs(a);
-    if ((float16_is_infinity(a) && float16_is_zero(b)) ||
-        (float16_is_infinity(b) && float16_is_zero(a))) {
-        return float16_two;
-    }
-    return float16_muladd(a, b, float16_two, 0, fpst);
-}
-
-float32 HELPER(recpsf_f32)(float32 a, float32 b, float_status *fpst)
-{
-    a = float32_squash_input_denormal(a, fpst);
-    b = float32_squash_input_denormal(b, fpst);
-
-    a = float32_chs(a);
-    if ((float32_is_infinity(a) && float32_is_zero(b)) ||
-        (float32_is_infinity(b) && float32_is_zero(a))) {
-        return float32_two;
-    }
-    return float32_muladd(a, b, float32_two, 0, fpst);
-}
-
-float64 HELPER(recpsf_f64)(float64 a, float64 b, float_status *fpst)
-{
-    a = float64_squash_input_denormal(a, fpst);
-    b = float64_squash_input_denormal(b, fpst);
-
-    a = float64_chs(a);
-    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
-        (float64_is_infinity(b) && float64_is_zero(a))) {
-        return float64_two;
-    }
-    return float64_muladd(a, b, float64_two, 0, fpst);
-}
-
-uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, float_status *fpst)
-{
-    a = float16_squash_input_denormal(a, fpst);
-    b = float16_squash_input_denormal(b, fpst);
-
-    a = float16_chs(a);
-    if ((float16_is_infinity(a) && float16_is_zero(b)) ||
-        (float16_is_infinity(b) && float16_is_zero(a))) {
-        return float16_one_point_five;
-    }
-    return float16_muladd_scalbn(a, b, float16_three, -1, 0, fpst);
-}
-
-float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, float_status *fpst)
-{
-    a = float32_squash_input_denormal(a, fpst);
-    b = float32_squash_input_denormal(b, fpst);
-
-    a = float32_chs(a);
-    if ((float32_is_infinity(a) && float32_is_zero(b)) ||
-        (float32_is_infinity(b) && float32_is_zero(a))) {
-        return float32_one_point_five;
-    }
-    return float32_muladd_scalbn(a, b, float32_three, -1, 0, fpst);
-}
-
-float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, float_status *fpst)
-{
-    a = float64_squash_input_denormal(a, fpst);
-    b = float64_squash_input_denormal(b, fpst);
-
-    a = float64_chs(a);
-    if ((float64_is_infinity(a) && float64_is_zero(b)) ||
-        (float64_is_infinity(b) && float64_is_zero(a))) {
-        return float64_one_point_five;
-    }
-    return float64_muladd_scalbn(a, b, float64_three, -1, 0, fpst);
-}
+#define DO_RECPS(NAME, CTYPE, FLOATTYPE, CHSFN)                         \
+    CTYPE HELPER(NAME)(CTYPE a, CTYPE b, float_status *fpst)            \
+    {                                                                   \
+        a = FLOATTYPE ## _squash_input_denormal(a, fpst);               \
+        b = FLOATTYPE ## _squash_input_denormal(b, fpst);               \
+        a = FLOATTYPE ## _ ## CHSFN(a);                                 \
+        if ((FLOATTYPE ## _is_infinity(a) && FLOATTYPE ## _is_zero(b)) || \
+            (FLOATTYPE ## _is_infinity(b) && FLOATTYPE ## _is_zero(a))) { \
+            return FLOATTYPE ## _two;                                   \
+        }                                                               \
+        return FLOATTYPE ## _muladd(a, b, FLOATTYPE ## _two, 0, fpst);  \
+    }
+
+DO_RECPS(recpsf_f16, uint32_t, float16, chs)
+DO_RECPS(recpsf_f32, float32, float32, chs)
+DO_RECPS(recpsf_f64, float64, float64, chs)
+DO_RECPS(recpsf_ah_f16, uint32_t, float16, ah_chs)
+DO_RECPS(recpsf_ah_f32, float32, float32, ah_chs)
+DO_RECPS(recpsf_ah_f64, float64, float64, ah_chs)
+
+#define DO_RSQRTSF(NAME, CTYPE, FLOATTYPE, CHSFN)                       \
+    CTYPE HELPER(NAME)(CTYPE a, CTYPE b, float_status *fpst)            \
+    {                                                                   \
+        a = FLOATTYPE ## _squash_input_denormal(a, fpst);               \
+        b = FLOATTYPE ## _squash_input_denormal(b, fpst);               \
+        a = FLOATTYPE ## _ ## CHSFN(a);                                 \
+        if ((FLOATTYPE ## _is_infinity(a) && FLOATTYPE ## _is_zero(b)) || \
+            (FLOATTYPE ## _is_infinity(b) && FLOATTYPE ## _is_zero(a))) { \
+            return FLOATTYPE ## _one_point_five;                        \
+        }                                                               \
+        return FLOATTYPE ## _muladd_scalbn(a, b, FLOATTYPE ## _three,   \
+                                           -1, 0, fpst);                \
+    }                                                                   \
+
+DO_RSQRTSF(rsqrtsf_f16, uint32_t, float16, chs)
+DO_RSQRTSF(rsqrtsf_f32, float32, float32, chs)
+DO_RSQRTSF(rsqrtsf_f64, float64, float64, chs)
+DO_RSQRTSF(rsqrtsf_ah_f16, uint32_t, float16, ah_chs)
+DO_RSQRTSF(rsqrtsf_ah_f32, float32, float32, ah_chs)
+DO_RSQRTSF(rsqrtsf_ah_f64, float64, float64, ah_chs)
 
 /* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */
 uint32_t HELPER(frecpx_f16)(uint32_t a, float_status *fpst)
@@ -399,6 +364,42 @@ float32 HELPER(fcvtx_f64_to_f32)(float64 a, float_status *fpst)
     return r;
 }
 
+/*
+ * AH=1 min/max have some odd special cases:
+ * comparing two zeroes (regardless of sign), (NaN, anything),
+ * or (anything, NaN) should return the second argument (possibly
+ * squashed to zero).
+ * Also, denormal outputs are not squashed to zero regardless of FZ or FZ16.
+ */
+#define AH_MINMAX_HELPER(NAME, CTYPE, FLOATTYPE, MINMAX)                \
+    CTYPE HELPER(NAME)(CTYPE a, CTYPE b, float_status *fpst)            \
+    {                                                                   \
+        bool save;                                                      \
+        CTYPE r;                                                        \
+        a = FLOATTYPE ## _squash_input_denormal(a, fpst);               \
+        b = FLOATTYPE ## _squash_input_denormal(b, fpst);               \
+        if (FLOATTYPE ## _is_zero(a) && FLOATTYPE ## _is_zero(b)) {     \
+            return b;                                                   \
+        }                                                               \
+        if (FLOATTYPE ## _is_any_nan(a) ||                              \
+            FLOATTYPE ## _is_any_nan(b)) {                              \
+            float_raise(float_flag_invalid, fpst);                      \
+            return b;                                                   \
+        }                                                               \
+        save = get_flush_to_zero(fpst);                                 \
+        set_flush_to_zero(false, fpst);                                 \
+        r = FLOATTYPE ## _ ## MINMAX(a, b, fpst);                       \
+        set_flush_to_zero(save, fpst);                                  \
+        return r;                                                       \
+    }
+
+AH_MINMAX_HELPER(vfp_ah_minh, dh_ctype_f16, float16, min)
+AH_MINMAX_HELPER(vfp_ah_mins, float32, float32, min)
+AH_MINMAX_HELPER(vfp_ah_mind, float64, float64, min)
+AH_MINMAX_HELPER(vfp_ah_maxh, dh_ctype_f16, float16, max)
+AH_MINMAX_HELPER(vfp_ah_maxs, float32, float32, max)
+AH_MINMAX_HELPER(vfp_ah_maxd, float64, float64, max)
+
 /* 64-bit versions of the CRC helpers. Note that although the operation
  * (and the prototypes of crc32c() and crc32() mean that only the bottom
  * 32 bits of the accumulator and result are used, we pass and return
diff --git a/target/arm/tcg/helper-a64.h b/target/arm/tcg/helper-a64.h
index bac12fb..8502346 100644
--- a/target/arm/tcg/helper-a64.h
+++ b/target/arm/tcg/helper-a64.h
@@ -38,9 +38,15 @@ DEF_HELPER_FLAGS_3(neon_cgt_f64, TCG_CALL_NO_RWG, i64, i64, i64, fpst)
 DEF_HELPER_FLAGS_3(recpsf_f16, TCG_CALL_NO_RWG, f16, f16, f16, fpst)
 DEF_HELPER_FLAGS_3(recpsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, fpst)
 DEF_HELPER_FLAGS_3(recpsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, fpst)
+DEF_HELPER_FLAGS_3(recpsf_ah_f16, TCG_CALL_NO_RWG, f16, f16, f16, fpst)
+DEF_HELPER_FLAGS_3(recpsf_ah_f32, TCG_CALL_NO_RWG, f32, f32, f32, fpst)
+DEF_HELPER_FLAGS_3(recpsf_ah_f64, TCG_CALL_NO_RWG, f64, f64, f64, fpst)
 DEF_HELPER_FLAGS_3(rsqrtsf_f16, TCG_CALL_NO_RWG, f16, f16, f16, fpst)
 DEF_HELPER_FLAGS_3(rsqrtsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, fpst)
 DEF_HELPER_FLAGS_3(rsqrtsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, fpst)
+DEF_HELPER_FLAGS_3(rsqrtsf_ah_f16, TCG_CALL_NO_RWG, f16, f16, f16, fpst)
+DEF_HELPER_FLAGS_3(rsqrtsf_ah_f32, TCG_CALL_NO_RWG, f32, f32, f32, fpst)
+DEF_HELPER_FLAGS_3(rsqrtsf_ah_f64, TCG_CALL_NO_RWG, f64, f64, f64, fpst)
 DEF_HELPER_FLAGS_2(frecpx_f64, TCG_CALL_NO_RWG, f64, f64, fpst)
 DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, fpst)
 DEF_HELPER_FLAGS_2(frecpx_f16, TCG_CALL_NO_RWG, f16, f16, fpst)
@@ -67,6 +73,13 @@ DEF_HELPER_4(advsimd_muladd2h, i32, i32, i32, i32, fpst)
 DEF_HELPER_2(advsimd_rinth_exact, f16, f16, fpst)
 DEF_HELPER_2(advsimd_rinth, f16, f16, fpst)
 
+DEF_HELPER_3(vfp_ah_minh, f16, f16, f16, fpst)
+DEF_HELPER_3(vfp_ah_mins, f32, f32, f32, fpst)
+DEF_HELPER_3(vfp_ah_mind, f64, f64, f64, fpst)
+DEF_HELPER_3(vfp_ah_maxh, f16, f16, f16, fpst)
+DEF_HELPER_3(vfp_ah_maxs, f32, f32, f32, fpst)
+DEF_HELPER_3(vfp_ah_maxd, f64, f64, f64, fpst)
+
 DEF_HELPER_2(exception_return, void, env, i64)
 
 DEF_HELPER_FLAGS_2(dc_zva, TCG_CALL_NO_WG, void, env, i64)
diff --git a/target/arm/tcg/helper-sve.h b/target/arm/tcg/helper-sve.h
index fea43b3..0b1b588 100644
--- a/target/arm/tcg/helper-sve.h
+++ b/target/arm/tcg/helper-sve.h
@@ -541,10 +541,18 @@ DEF_HELPER_FLAGS_4(sve_fabs_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_fabs_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_fabs_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_ah_fabs_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_ah_fabs_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_ah_fabs_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(sve_fneg_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_fneg_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_fneg_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_4(sve_ah_fneg_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_ah_fneg_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sve_ah_fneg_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
 DEF_HELPER_FLAGS_4(sve_not_zpz_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_not_zpz_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(sve_not_zpz_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
@@ -972,6 +980,48 @@ DEF_HELPER_FLAGS_5(gvec_rsqrts_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_5(gvec_rsqrts_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, fpst, i32)
 
+DEF_HELPER_FLAGS_5(gvec_ah_recps_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_recps_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_recps_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_5(gvec_ah_rsqrts_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_rsqrts_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_rsqrts_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_5(gvec_ah_fmax_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_fmax_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_fmax_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_5(gvec_ah_fmin_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_fmin_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_fmin_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_5(gvec_ah_fmaxp_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_fmaxp_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_fmaxp_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_5(gvec_ah_fminp_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_fminp_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_5(gvec_ah_fminp_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, fpst, i32)
+
 DEF_HELPER_FLAGS_4(sve_faddv_h, TCG_CALL_NO_RWG,
                    i64, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_4(sve_faddv_s, TCG_CALL_NO_RWG,
@@ -1007,6 +1057,20 @@ DEF_HELPER_FLAGS_4(sve_fminv_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_4(sve_fminv_d, TCG_CALL_NO_RWG,
                    i64, ptr, ptr, fpst, i32)
 
+DEF_HELPER_FLAGS_4(sve_ah_fmaxv_h, TCG_CALL_NO_RWG,
+                   i64, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(sve_ah_fmaxv_s, TCG_CALL_NO_RWG,
+                   i64, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(sve_ah_fmaxv_d, TCG_CALL_NO_RWG,
+                   i64, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_4(sve_ah_fminv_h, TCG_CALL_NO_RWG,
+                   i64, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(sve_ah_fminv_s, TCG_CALL_NO_RWG,
+                   i64, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_4(sve_ah_fminv_d, TCG_CALL_NO_RWG,
+                   i64, ptr, ptr, fpst, i32)
+
 DEF_HELPER_FLAGS_5(sve_fadda_h, TCG_CALL_NO_RWG,
                    i64, i64, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_5(sve_fadda_s, TCG_CALL_NO_RWG,
@@ -1098,6 +1162,20 @@ DEF_HELPER_FLAGS_6(sve_fmax_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_6(sve_fmax_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, fpst, i32)
 
+DEF_HELPER_FLAGS_6(sve_ah_fmin_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(sve_ah_fmin_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(sve_ah_fmin_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_6(sve_ah_fmax_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(sve_ah_fmax_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(sve_ah_fmax_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+
 DEF_HELPER_FLAGS_6(sve_fminnum_h, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_6(sve_fminnum_s, TCG_CALL_NO_RWG,
@@ -1119,6 +1197,13 @@ DEF_HELPER_FLAGS_6(sve_fabd_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_6(sve_fabd_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, fpst, i32)
 
+DEF_HELPER_FLAGS_6(sve_ah_fabd_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(sve_ah_fabd_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(sve_ah_fabd_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, fpst, i32)
+
 DEF_HELPER_FLAGS_6(sve_fscalbn_h, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_6(sve_fscalbn_s, TCG_CALL_NO_RWG,
@@ -1189,6 +1274,20 @@ DEF_HELPER_FLAGS_6(sve_fmins_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_6(sve_fmins_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, i64, fpst, i32)
 
+DEF_HELPER_FLAGS_6(sve_ah_fmaxs_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, i64, fpst, i32)
+DEF_HELPER_FLAGS_6(sve_ah_fmaxs_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, i64, fpst, i32)
+DEF_HELPER_FLAGS_6(sve_ah_fmaxs_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, i64, fpst, i32)
+
+DEF_HELPER_FLAGS_6(sve_ah_fmins_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, i64, fpst, i32)
+DEF_HELPER_FLAGS_6(sve_ah_fmins_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, i64, fpst, i32)
+DEF_HELPER_FLAGS_6(sve_ah_fmins_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, i64, fpst, i32)
+
 DEF_HELPER_FLAGS_5(sve_fcvt_sh, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_5(sve_fcvt_dh, TCG_CALL_NO_RWG,
@@ -1376,6 +1475,27 @@ DEF_HELPER_FLAGS_7(sve_fnmls_zpzzz_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_7(sve_fnmls_zpzzz_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
 
+DEF_HELPER_FLAGS_7(sve_ah_fmls_zpzzz_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sve_ah_fmls_zpzzz_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sve_ah_fmls_zpzzz_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_7(sve_ah_fnmla_zpzzz_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sve_ah_fnmla_zpzzz_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sve_ah_fnmla_zpzzz_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+
+DEF_HELPER_FLAGS_7(sve_ah_fnmls_zpzzz_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sve_ah_fnmls_zpzzz_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_7(sve_ah_fnmls_zpzzz_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
+
 DEF_HELPER_FLAGS_7(sve_fcmla_zpzzz_h, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, ptr, fpst, i32)
 DEF_HELPER_FLAGS_7(sve_fcmla_zpzzz_s, TCG_CALL_NO_RWG,
diff --git a/target/arm/tcg/hflags.c b/target/arm/tcg/hflags.c
index f03977b..9e6a186 100644
--- a/target/arm/tcg/hflags.c
+++ b/target/arm/tcg/hflags.c
@@ -404,6 +404,19 @@ static CPUARMTBFlags rebuild_hflags_a64(CPUARMState *env, int el, int fp_el,
         DP_TBFLAG_A64(flags, TCMA, aa64_va_parameter_tcma(tcr, mmu_idx));
     }
 
+    if (env->vfp.fpcr & FPCR_AH) {
+        DP_TBFLAG_A64(flags, AH, 1);
+    }
+    if (env->vfp.fpcr & FPCR_NEP) {
+        /*
+         * In streaming-SVE without FA64, NEP behaves as if zero;
+         * compare pseudocode IsMerging()
+         */
+        if (!(EX_TBFLAG_A64(flags, PSTATE_SM) && !sme_fa64(env, el))) {
+            DP_TBFLAG_A64(flags, NEP, 1);
+        }
+    }
+
     return rebuild_hflags_common(env, fp_el, mmu_idx, flags);
 }
 
diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c
index 03ebef5..274003e 100644
--- a/target/arm/tcg/mve_helper.c
+++ b/target/arm/tcg/mve_helper.c
@@ -2814,8 +2814,7 @@ DO_VMAXMINA(vminaw, 4, int32_t, uint32_t, DO_MIN)
             if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {            \
                 continue;                                             \
             }                                                         \
-            fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 :  \
-                &env->vfp.standard_fp_status;                         \
+            fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
             if (!(mask & 1)) {                                        \
                 /* We need the result but without updating flags */   \
                 scratch_fpst = *fpst;                                 \
@@ -2888,8 +2887,7 @@ DO_2OP_FP_ALL(vminnma, minnuma)
                 r[e] = 0;                                             \
                 continue;                                             \
             }                                                         \
-            fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 :  \
-                &env->vfp.standard_fp_status;                         \
+            fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
             if (!(tm & 1)) {                                          \
                 /* We need the result but without updating flags */   \
                 scratch_fpst = *fpst;                                 \
@@ -2926,8 +2924,7 @@ DO_VCADD_FP(vfcadd270s, 4, float32, float32_add, float32_sub)
             if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {            \
                 continue;                                             \
             }                                                         \
-            fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 :  \
-                &env->vfp.standard_fp_status;                         \
+            fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
             if (!(mask & 1)) {                                        \
                 /* We need the result but without updating flags */   \
                 scratch_fpst = *fpst;                                 \
@@ -2964,8 +2961,7 @@ DO_VFMA(vfmss, 4, float32, true)
             if ((mask & MAKE_64BIT_MASK(0, ESIZE * 2)) == 0) {        \
                 continue;                                             \
             }                                                         \
-            fpst0 = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
-                &env->vfp.standard_fp_status;                         \
+            fpst0 = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
             fpst1 = fpst0;                                            \
             if (!(mask & 1)) {                                        \
                 scratch_fpst = *fpst0;                                \
@@ -3049,8 +3045,7 @@ DO_VCMLA(vcmla270s, 4, float32, 3, DO_VCMLAS)
             if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {            \
                 continue;                                             \
             }                                                         \
-            fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 :  \
-                &env->vfp.standard_fp_status;                         \
+            fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
             if (!(mask & 1)) {                                        \
                 /* We need the result but without updating flags */   \
                 scratch_fpst = *fpst;                                 \
@@ -3084,8 +3079,7 @@ DO_2OP_FP_SCALAR_ALL(vfmul_scalar, mul)
             if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {            \
                 continue;                                             \
             }                                                         \
-            fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 :  \
-                &env->vfp.standard_fp_status;                         \
+            fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
             if (!(mask & 1)) {                                        \
                 /* We need the result but without updating flags */   \
                 scratch_fpst = *fpst;                                 \
@@ -3116,9 +3110,8 @@ DO_2OP_FP_ACC_SCALAR(vfmas_scalars, 4, float32, DO_VFMAS_SCALARS)
         unsigned e;                                                   \
         TYPE *m = vm;                                                 \
         TYPE ra = (TYPE)ra_in;                                        \
-        float_status *fpst = (ESIZE == 2) ?                           \
-            &env->vfp.standard_fp_status_f16 :                        \
-            &env->vfp.standard_fp_status;                             \
+        float_status *fpst =                                          \
+            &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {            \
             if (mask & 1) {                                           \
                 TYPE v = m[H##ESIZE(e)];                              \
@@ -3168,8 +3161,7 @@ DO_FP_VMAXMINV(vminnmavs, 4, float32, true, float32_minnum)
             if ((mask & emask) == 0) {                                \
                 continue;                                             \
             }                                                         \
-            fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 :  \
-                &env->vfp.standard_fp_status;                         \
+            fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
             if (!(mask & (1 << (e * ESIZE)))) {                       \
                 /* We need the result but without updating flags */   \
                 scratch_fpst = *fpst;                                 \
@@ -3202,8 +3194,7 @@ DO_FP_VMAXMINV(vminnmavs, 4, float32, true, float32_minnum)
             if ((mask & emask) == 0) {                                \
                 continue;                                             \
             }                                                         \
-            fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 :  \
-                &env->vfp.standard_fp_status;                         \
+            fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
             if (!(mask & (1 << (e * ESIZE)))) {                       \
                 /* We need the result but without updating flags */   \
                 scratch_fpst = *fpst;                                 \
@@ -3267,8 +3258,7 @@ DO_VCMP_FP_BOTH(vfcmples, vfcmple_scalars, 4, float32, !DO_GT32)
             if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {            \
                 continue;                                             \
             }                                                         \
-            fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 :  \
-                &env->vfp.standard_fp_status;                         \
+            fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
             if (!(mask & 1)) {                                        \
                 /* We need the result but without updating flags */   \
                 scratch_fpst = *fpst;                                 \
@@ -3300,9 +3290,8 @@ DO_VCVT_FIXED(vcvt_fu, 4, uint32_t, helper_vfp_touls_round_to_zero)
         unsigned e;                                                   \
         float_status *fpst;                                           \
         float_status scratch_fpst;                                    \
-        float_status *base_fpst = (ESIZE == 2) ?                      \
-            &env->vfp.standard_fp_status_f16 :                        \
-            &env->vfp.standard_fp_status;                             \
+        float_status *base_fpst =                                     \
+            &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
         uint32_t prev_rmode = get_float_rounding_mode(base_fpst);     \
         set_float_rounding_mode(rmode, base_fpst);                    \
         for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {            \
@@ -3347,7 +3336,7 @@ static void do_vcvt_sh(CPUARMState *env, void *vd, void *vm, int top)
     unsigned e;
     float_status *fpst;
     float_status scratch_fpst;
-    float_status *base_fpst = &env->vfp.standard_fp_status;
+    float_status *base_fpst = &env->vfp.fp_status[FPST_STD];
     bool old_fz = get_flush_to_zero(base_fpst);
     set_flush_to_zero(false, base_fpst);
     for (e = 0; e < 16 / 4; e++, mask >>= 4) {
@@ -3377,7 +3366,7 @@ static void do_vcvt_hs(CPUARMState *env, void *vd, void *vm, int top)
     unsigned e;
     float_status *fpst;
     float_status scratch_fpst;
-    float_status *base_fpst = &env->vfp.standard_fp_status;
+    float_status *base_fpst = &env->vfp.fp_status[FPST_STD];
     bool old_fiz = get_flush_inputs_to_zero(base_fpst);
     set_flush_inputs_to_zero(false, base_fpst);
     for (e = 0; e < 16 / 4; e++, mask >>= 4) {
@@ -3427,8 +3416,7 @@ void HELPER(mve_vcvtt_hs)(CPUARMState *env, void *vd, void *vm)
         if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) {                \
             continue;                                                 \
         }                                                             \
-        fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 :      \
-            &env->vfp.standard_fp_status;                             \
+        fpst = &env->vfp.fp_status[ESIZE == 2 ? FPST_STD_F16 : FPST_STD]; \
         if (!(mask & 1)) {                                            \
             /* We need the result but without updating flags */       \
             scratch_fpst = *fpst;                                     \
diff --git a/target/arm/tcg/sme_helper.c b/target/arm/tcg/sme_helper.c
index 727c085..dcc48e4 100644
--- a/target/arm/tcg/sme_helper.c
+++ b/target/arm/tcg/sme_helper.c
@@ -1043,8 +1043,8 @@ void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
      * produces default NaNs. We also need a second copy of fp_status with
      * round-to-odd -- see above.
      */
-    fpst_f16 = env->vfp.fp_status_f16_a64;
-    fpst_std = env->vfp.fp_status_a64;
+    fpst_f16 = env->vfp.fp_status[FPST_A64_F16];
+    fpst_std = env->vfp.fp_status[FPST_A64];
     set_default_nan_mode(true, &fpst_std);
     set_default_nan_mode(true, &fpst_f16);
     fpst_odd = fpst_std;
diff --git a/target/arm/tcg/sve_helper.c b/target/arm/tcg/sve_helper.c
index 9837c5bc..c206ca6 100644
--- a/target/arm/tcg/sve_helper.c
+++ b/target/arm/tcg/sve_helper.c
@@ -879,12 +879,28 @@ DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
 
+#define DO_AH_FABS_H(N) (float16_is_any_nan(N) ? (N) : DO_FABS(N))
+#define DO_AH_FABS_S(N) (float32_is_any_nan(N) ? (N) : DO_FABS(N))
+#define DO_AH_FABS_D(N) (float64_is_any_nan(N) ? (N) : DO_FABS(N))
+
+DO_ZPZ(sve_ah_fabs_h, uint16_t, H1_2, DO_AH_FABS_H)
+DO_ZPZ(sve_ah_fabs_s, uint32_t, H1_4, DO_AH_FABS_S)
+DO_ZPZ_D(sve_ah_fabs_d, uint64_t, DO_AH_FABS_D)
+
 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
 
 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
 
+#define DO_AH_FNEG_H(N) (float16_is_any_nan(N) ? (N) : DO_FNEG(N))
+#define DO_AH_FNEG_S(N) (float32_is_any_nan(N) ? (N) : DO_FNEG(N))
+#define DO_AH_FNEG_D(N) (float64_is_any_nan(N) ? (N) : DO_FNEG(N))
+
+DO_ZPZ(sve_ah_fneg_h, uint16_t, H1_2, DO_AH_FNEG_H)
+DO_ZPZ(sve_ah_fneg_s, uint32_t, H1_4, DO_AH_FNEG_S)
+DO_ZPZ_D(sve_ah_fneg_d, uint64_t, DO_AH_FNEG_D)
+
 #define DO_NOT(N) (~N)
 
 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
@@ -2539,6 +2555,7 @@ void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     intptr_t i, opr_sz = simd_oprsz(desc) / 2;
+    bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
     uint16_t *d = vd, *n = vn, *m = vm;
     for (i = 0; i < opr_sz; i += 1) {
         uint16_t nn = n[i];
@@ -2546,13 +2563,17 @@ void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
         if (mm & 1) {
             nn = float16_one;
         }
-        d[i] = nn ^ (mm & 2) << 14;
+        if (mm & 2) {
+            nn = float16_maybe_ah_chs(nn, fpcr_ah);
+        }
+        d[i] = nn;
     }
 }
 
 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     intptr_t i, opr_sz = simd_oprsz(desc) / 4;
+    bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
     uint32_t *d = vd, *n = vn, *m = vm;
     for (i = 0; i < opr_sz; i += 1) {
         uint32_t nn = n[i];
@@ -2560,13 +2581,17 @@ void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
         if (mm & 1) {
             nn = float32_one;
         }
-        d[i] = nn ^ (mm & 2) << 30;
+        if (mm & 2) {
+            nn = float32_maybe_ah_chs(nn, fpcr_ah);
+        }
+        d[i] = nn;
     }
 }
 
 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+    bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT, 1);
     uint64_t *d = vd, *n = vn, *m = vm;
     for (i = 0; i < opr_sz; i += 1) {
         uint64_t nn = n[i];
@@ -2574,7 +2599,10 @@ void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
         if (mm & 1) {
             nn = float64_one;
         }
-        d[i] = nn ^ (mm & 2) << 62;
+        if (mm & 2) {
+            nn = float64_maybe_ah_chs(nn, fpcr_ah);
+        }
+        d[i] = nn;
     }
 }
 
@@ -4190,7 +4218,7 @@ static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
         uintptr_t half = n / 2;                                       \
         TYPE lo = NAME##_reduce(data, status, half);                  \
         TYPE hi = NAME##_reduce(data + half, status, half);           \
-        return TYPE##_##FUNC(lo, hi, status);                         \
+        return FUNC(lo, hi, status);                                  \
    }                                                                  \
 }                                                                     \
 uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
@@ -4211,26 +4239,37 @@ uint64_t HELPER(NAME)(void *vn, void *vg, float_status *s, uint32_t desc) \
     return NAME##_reduce(data, s, maxsz / sizeof(TYPE));              \
 }
 
-DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
-DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
-DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
+DO_REDUCE(sve_faddv_h, float16, H1_2, float16_add, float16_zero)
+DO_REDUCE(sve_faddv_s, float32, H1_4, float32_add, float32_zero)
+DO_REDUCE(sve_faddv_d, float64, H1_8, float64_add, float64_zero)
 
 /* Identity is floatN_default_nan, without the function call.
*/ -DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00) -DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000) -DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL) +DO_REDUCE(sve_fminnmv_h, float16, H1_2, float16_minnum, 0x7E00) +DO_REDUCE(sve_fminnmv_s, float32, H1_4, float32_minnum, 0x7FC00000) +DO_REDUCE(sve_fminnmv_d, float64, H1_8, float64_minnum, 0x7FF8000000000000ULL) + +DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, float16_maxnum, 0x7E00) +DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, float32_maxnum, 0x7FC00000) +DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, float64_maxnum, 0x7FF8000000000000ULL) -DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00) -DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000) -DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL) +DO_REDUCE(sve_fminv_h, float16, H1_2, float16_min, float16_infinity) +DO_REDUCE(sve_fminv_s, float32, H1_4, float32_min, float32_infinity) +DO_REDUCE(sve_fminv_d, float64, H1_8, float64_min, float64_infinity) -DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity) -DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity) -DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity) +DO_REDUCE(sve_fmaxv_h, float16, H1_2, float16_max, float16_chs(float16_infinity)) +DO_REDUCE(sve_fmaxv_s, float32, H1_4, float32_max, float32_chs(float32_infinity)) +DO_REDUCE(sve_fmaxv_d, float64, H1_8, float64_max, float64_chs(float64_infinity)) -DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity)) -DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity)) -DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity)) +DO_REDUCE(sve_ah_fminv_h, float16, H1_2, helper_vfp_ah_minh, float16_infinity) +DO_REDUCE(sve_ah_fminv_s, float32, H1_4, helper_vfp_ah_mins, float32_infinity) +DO_REDUCE(sve_ah_fminv_d, float64, H1_8, helper_vfp_ah_mind, float64_infinity) + +DO_REDUCE(sve_ah_fmaxv_h, float16, H1_2, helper_vfp_ah_maxh, + float16_chs(float16_infinity)) +DO_REDUCE(sve_ah_fmaxv_s, float32, H1_4, helper_vfp_ah_maxs, + float32_chs(float32_infinity)) +DO_REDUCE(sve_ah_fmaxv_d, float64, H1_8, helper_vfp_ah_maxd, + float64_chs(float64_infinity)) #undef DO_REDUCE @@ -4336,6 +4375,14 @@ DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max) DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max) DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max) +DO_ZPZZ_FP(sve_ah_fmin_h, uint16_t, H1_2, helper_vfp_ah_minh) +DO_ZPZZ_FP(sve_ah_fmin_s, uint32_t, H1_4, helper_vfp_ah_mins) +DO_ZPZZ_FP(sve_ah_fmin_d, uint64_t, H1_8, helper_vfp_ah_mind) + +DO_ZPZZ_FP(sve_ah_fmax_h, uint16_t, H1_2, helper_vfp_ah_maxh) +DO_ZPZZ_FP(sve_ah_fmax_s, uint32_t, H1_4, helper_vfp_ah_maxs) +DO_ZPZZ_FP(sve_ah_fmax_d, uint64_t, H1_8, helper_vfp_ah_maxd) + DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum) DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum) DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum) @@ -4359,9 +4406,31 @@ static inline float64 abd_d(float64 a, float64 b, float_status *s) return float64_abs(float64_sub(a, b, s)); } +/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */ +static float16 ah_abd_h(float16 op1, float16 op2, float_status *stat) +{ + float16 r = float16_sub(op1, op2, stat); + return float16_is_any_nan(r) ? r : float16_abs(r); +} + +static float32 ah_abd_s(float32 op1, float32 op2, float_status *stat) +{ + float32 r = float32_sub(op1, op2, stat); + return float32_is_any_nan(r) ? 
r : float32_abs(r); +} + +static float64 ah_abd_d(float64 op1, float64 op2, float_status *stat) +{ + float64 r = float64_sub(op1, op2, stat); + return float64_is_any_nan(r) ? r : float64_abs(r); +} + DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h) DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s) DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d) +DO_ZPZZ_FP(sve_ah_fabd_h, uint16_t, H1_2, ah_abd_h) +DO_ZPZZ_FP(sve_ah_fabd_s, uint32_t, H1_4, ah_abd_s) +DO_ZPZZ_FP(sve_ah_fabd_d, uint64_t, H1_8, ah_abd_d) static inline float64 scalbn_d(float64 a, int64_t b, float_status *s) { @@ -4448,6 +4517,14 @@ DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min) DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min) DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min) +DO_ZPZS_FP(sve_ah_fmaxs_h, float16, H1_2, helper_vfp_ah_maxh) +DO_ZPZS_FP(sve_ah_fmaxs_s, float32, H1_4, helper_vfp_ah_maxs) +DO_ZPZS_FP(sve_ah_fmaxs_d, float64, H1_8, helper_vfp_ah_maxd) + +DO_ZPZS_FP(sve_ah_fmins_h, float16, H1_2, helper_vfp_ah_minh) +DO_ZPZS_FP(sve_ah_fmins_s, float32, H1_4, helper_vfp_ah_mins) +DO_ZPZS_FP(sve_ah_fmins_d, float64, H1_8, helper_vfp_ah_mind) + /* Fully general two-operand expander, controlled by a predicate, * With the extra float_status parameter. */ @@ -4737,7 +4814,7 @@ DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int) static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc, - uint16_t neg1, uint16_t neg3) + uint16_t neg1, uint16_t neg3, int flags) { intptr_t i = simd_oprsz(desc); uint64_t *g = vg; @@ -4752,7 +4829,7 @@ static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1; e2 = *(uint16_t *)(vm + H1_2(i)); e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3; - r = float16_muladd(e1, e2, e3, 0, status); + r = float16_muladd(e1, e2, e3, flags, status); *(uint16_t *)(vd + H1_2(i)) = r; } } while (i & 63); @@ -4762,30 +4839,51 @@ static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg, void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { - do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0); + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, 0); } void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { - do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0); + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0, 0); } void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { - do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000); + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000, 0); } void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { - do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000); + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000, 0); +} + +void HELPER(sve_ah_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, + float_muladd_negate_product); +} + +void HELPER(sve_ah_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, + float_muladd_negate_product | float_muladd_negate_c); +} + +void HELPER(sve_ah_fnmls_zpzzz_h)(void *vd, 
void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0, + float_muladd_negate_c); } static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc, - uint32_t neg1, uint32_t neg3) + uint32_t neg1, uint32_t neg3, int flags) { intptr_t i = simd_oprsz(desc); uint64_t *g = vg; @@ -4800,7 +4898,7 @@ static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1; e2 = *(uint32_t *)(vm + H1_4(i)); e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3; - r = float32_muladd(e1, e2, e3, 0, status); + r = float32_muladd(e1, e2, e3, flags, status); *(uint32_t *)(vd + H1_4(i)) = r; } } while (i & 63); @@ -4810,30 +4908,51 @@ static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg, void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { - do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0); + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, 0); } void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { - do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0); + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0, 0); } void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { - do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000); + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000, 0); } void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { - do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000); + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000, 0); +} + +void HELPER(sve_ah_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, + float_muladd_negate_product); +} + +void HELPER(sve_ah_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, + float_muladd_negate_product | float_muladd_negate_c); +} + +void HELPER(sve_ah_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0, + float_muladd_negate_c); } static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc, - uint64_t neg1, uint64_t neg3) + uint64_t neg1, uint64_t neg3, int flags) { intptr_t i = simd_oprsz(desc); uint64_t *g = vg; @@ -4848,7 +4967,7 @@ static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, e1 = *(uint64_t *)(vn + i) ^ neg1; e2 = *(uint64_t *)(vm + i); e3 = *(uint64_t *)(va + i) ^ neg3; - r = float64_muladd(e1, e2, e3, 0, status); + r = float64_muladd(e1, e2, e3, flags, status); *(uint64_t *)(vd + i) = r; } } while (i & 63); @@ -4858,25 +4977,46 @@ static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg, void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { - do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0); + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, 0); } void 
HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { - do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0); + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0, 0); } void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { - do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN); + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN, 0); } void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { - do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN); + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN, 0); +} + +void HELPER(sve_ah_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, + float_muladd_negate_product); +} + +void HELPER(sve_ah_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, + float_muladd_negate_product | float_muladd_negate_c); +} + +void HELPER(sve_ah_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va, + void *vg, float_status *status, uint32_t desc) +{ + do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0, + float_muladd_negate_c); } /* Two operand floating-point comparison controlled by a predicate. @@ -4994,16 +5134,24 @@ void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, }; intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16); - intptr_t x = simd_data(desc); + intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); + bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); float16 *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i++) { float16 mm = m[i]; intptr_t xx = x; + int flags = 0; + if (float16_is_neg(mm)) { - mm = float16_abs(mm); + if (fpcr_ah) { + flags = float_muladd_negate_product; + } else { + mm = float16_abs(mm); + } xx += 8; } - d[i] = float16_muladd(n[i], mm, coeff[xx], 0, s); + d[i] = float16_muladd(n[i], mm, coeff[xx], flags, s); } } @@ -5017,16 +5165,24 @@ void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000, }; intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32); - intptr_t x = simd_data(desc); + intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); + bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); float32 *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i++) { float32 mm = m[i]; intptr_t xx = x; + int flags = 0; + if (float32_is_neg(mm)) { - mm = float32_abs(mm); + if (fpcr_ah) { + flags = float_muladd_negate_product; + } else { + mm = float32_abs(mm); + } xx += 8; } - d[i] = float32_muladd(n[i], mm, coeff[xx], 0, s); + d[i] = float32_muladd(n[i], mm, coeff[xx], flags, s); } } @@ -5044,16 +5200,24 @@ void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull, }; intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64); - intptr_t x = simd_data(desc); + intptr_t x = extract32(desc, SIMD_DATA_SHIFT, 3); + bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 3, 1); float64 *d = vd, *n = vn, *m = vm; + for (i = 0; i < opr_sz; i++) { float64 mm = m[i]; intptr_t xx = x; + int flags = 0; + if (float64_is_neg(mm)) { - mm = float64_abs(mm); + if (fpcr_ah) { + flags = float_muladd_negate_product; + } 
else { + mm = float64_abs(mm); + } xx += 8; } - d[i] = float64_muladd(n[i], mm, coeff[xx], 0, s); + d[i] = float64_muladd(n[i], mm, coeff[xx], flags, s); } } @@ -5066,8 +5230,8 @@ void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, { intptr_t j, i = simd_oprsz(desc); uint64_t *g = vg; - float16 neg_imag = float16_set_sign(0, simd_data(desc)); - float16 neg_real = float16_chs(neg_imag); + bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); + bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); do { uint64_t pg = g[(i - 1) >> 6]; @@ -5079,9 +5243,15 @@ void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg, i -= 2 * sizeof(float16); e0 = *(float16 *)(vn + H1_2(i)); - e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real; + e1 = *(float16 *)(vm + H1_2(j)); e2 = *(float16 *)(vn + H1_2(j)); - e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag; + e3 = *(float16 *)(vm + H1_2(i)); + + if (rot) { + e3 = float16_maybe_ah_chs(e3, fpcr_ah); + } else { + e1 = float16_maybe_ah_chs(e1, fpcr_ah); + } if (likely((pg >> (i & 63)) & 1)) { *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, s); @@ -5098,8 +5268,8 @@ void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, { intptr_t j, i = simd_oprsz(desc); uint64_t *g = vg; - float32 neg_imag = float32_set_sign(0, simd_data(desc)); - float32 neg_real = float32_chs(neg_imag); + bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); + bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); do { uint64_t pg = g[(i - 1) >> 6]; @@ -5111,9 +5281,15 @@ void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg, i -= 2 * sizeof(float32); e0 = *(float32 *)(vn + H1_2(i)); - e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real; + e1 = *(float32 *)(vm + H1_2(j)); e2 = *(float32 *)(vn + H1_2(j)); - e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag; + e3 = *(float32 *)(vm + H1_2(i)); + + if (rot) { + e3 = float32_maybe_ah_chs(e3, fpcr_ah); + } else { + e1 = float32_maybe_ah_chs(e1, fpcr_ah); + } if (likely((pg >> (i & 63)) & 1)) { *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, s); @@ -5130,8 +5306,8 @@ void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, { intptr_t j, i = simd_oprsz(desc); uint64_t *g = vg; - float64 neg_imag = float64_set_sign(0, simd_data(desc)); - float64 neg_real = float64_chs(neg_imag); + bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); + bool fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 1, 1); do { uint64_t pg = g[(i - 1) >> 6]; @@ -5143,9 +5319,15 @@ void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg, i -= 2 * sizeof(float64); e0 = *(float64 *)(vn + H1_2(i)); - e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real; + e1 = *(float64 *)(vm + H1_2(j)); e2 = *(float64 *)(vn + H1_2(j)); - e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag; + e3 = *(float64 *)(vm + H1_2(i)); + + if (rot) { + e3 = float64_maybe_ah_chs(e3, fpcr_ah); + } else { + e1 = float64_maybe_ah_chs(e1, fpcr_ah); + } if (likely((pg >> (i & 63)) & 1)) { *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, s); @@ -5165,13 +5347,18 @@ void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { intptr_t j, i = simd_oprsz(desc); - unsigned rot = simd_data(desc); - bool flip = rot & 1; - float16 neg_imag, neg_real; + bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); + uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t negf_real = flip ^ negf_imag; + float16 negx_imag, negx_real; uint64_t *g = vg; - neg_imag = float16_set_sign(0, (rot & 2) != 0); - neg_real = 
float16_set_sign(0, rot == 1 || rot == 2); + /* With AH=0, use negx; with AH=1 use negf. */ + negx_real = (negf_real & ~fpcr_ah) << 15; + negx_imag = (negf_imag & ~fpcr_ah) << 15; + negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); + negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); do { uint64_t pg = g[(i - 1) >> 6]; @@ -5188,18 +5375,18 @@ void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va, mi = *(float16 *)(vm + H1_2(j)); e2 = (flip ? ni : nr); - e1 = (flip ? mi : mr) ^ neg_real; + e1 = (flip ? mi : mr) ^ negx_real; e4 = e2; - e3 = (flip ? mr : mi) ^ neg_imag; + e3 = (flip ? mr : mi) ^ negx_imag; if (likely((pg >> (i & 63)) & 1)) { d = *(float16 *)(va + H1_2(i)); - d = float16_muladd(e2, e1, d, 0, status); + d = float16_muladd(e2, e1, d, negf_real, status); *(float16 *)(vd + H1_2(i)) = d; } if (likely((pg >> (j & 63)) & 1)) { d = *(float16 *)(va + H1_2(j)); - d = float16_muladd(e4, e3, d, 0, status); + d = float16_muladd(e4, e3, d, negf_imag, status); *(float16 *)(vd + H1_2(j)) = d; } } while (i & 63); @@ -5210,13 +5397,18 @@ void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { intptr_t j, i = simd_oprsz(desc); - unsigned rot = simd_data(desc); - bool flip = rot & 1; - float32 neg_imag, neg_real; + bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); + uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t negf_real = flip ^ negf_imag; + float32 negx_imag, negx_real; uint64_t *g = vg; - neg_imag = float32_set_sign(0, (rot & 2) != 0); - neg_real = float32_set_sign(0, rot == 1 || rot == 2); + /* With AH=0, use negx; with AH=1 use negf. */ + negx_real = (negf_real & ~fpcr_ah) << 31; + negx_imag = (negf_imag & ~fpcr_ah) << 31; + negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); + negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); do { uint64_t pg = g[(i - 1) >> 6]; @@ -5233,18 +5425,18 @@ void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va, mi = *(float32 *)(vm + H1_2(j)); e2 = (flip ? ni : nr); - e1 = (flip ? mi : mr) ^ neg_real; + e1 = (flip ? mi : mr) ^ negx_real; e4 = e2; - e3 = (flip ? mr : mi) ^ neg_imag; + e3 = (flip ? mr : mi) ^ negx_imag; if (likely((pg >> (i & 63)) & 1)) { d = *(float32 *)(va + H1_2(i)); - d = float32_muladd(e2, e1, d, 0, status); + d = float32_muladd(e2, e1, d, negf_real, status); *(float32 *)(vd + H1_2(i)) = d; } if (likely((pg >> (j & 63)) & 1)) { d = *(float32 *)(va + H1_2(j)); - d = float32_muladd(e4, e3, d, 0, status); + d = float32_muladd(e4, e3, d, negf_imag, status); *(float32 *)(vd + H1_2(j)) = d; } } while (i & 63); @@ -5255,13 +5447,18 @@ void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, void *vg, float_status *status, uint32_t desc) { intptr_t j, i = simd_oprsz(desc); - unsigned rot = simd_data(desc); - bool flip = rot & 1; - float64 neg_imag, neg_real; + bool flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); + uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t negf_real = flip ^ negf_imag; + float64 negx_imag, negx_real; uint64_t *g = vg; - neg_imag = float64_set_sign(0, (rot & 2) != 0); - neg_real = float64_set_sign(0, rot == 1 || rot == 2); + /* With AH=0, use negx; with AH=1 use negf. 
*/ + negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63; + negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63; + negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); + negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); do { uint64_t pg = g[(i - 1) >> 6]; @@ -5278,18 +5475,18 @@ void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va, mi = *(float64 *)(vm + H1_2(j)); e2 = (flip ? ni : nr); - e1 = (flip ? mi : mr) ^ neg_real; + e1 = (flip ? mi : mr) ^ negx_real; e4 = e2; - e3 = (flip ? mr : mi) ^ neg_imag; + e3 = (flip ? mr : mi) ^ negx_imag; if (likely((pg >> (i & 63)) & 1)) { d = *(float64 *)(va + H1_2(i)); - d = float64_muladd(e2, e1, d, 0, status); + d = float64_muladd(e2, e1, d, negf_real, status); *(float64 *)(vd + H1_2(i)) = d; } if (likely((pg >> (j & 63)) & 1)) { d = *(float64 *)(va + H1_2(j)); - d = float64_muladd(e4, e3, d, 0, status); + d = float64_muladd(e4, e3, d, negf_imag, status); *(float64 *)(vd + H1_2(j)) = d; } } while (i & 63); diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index d6ac2ed..8bef391 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -654,6 +654,68 @@ static void write_fp_sreg(DisasContext *s, int reg, TCGv_i32 v) write_fp_dreg(s, reg, tmp); } +/* + * Write a double result to 128 bit vector register reg, honouring FPCR.NEP: + * - if FPCR.NEP == 0, clear the high elements of reg + * - if FPCR.NEP == 1, set the high elements of reg from mergereg + * (i.e. merge the result with those high elements) + * In either case, SVE register bits above 128 are zeroed (per R_WKYLB). + */ +static void write_fp_dreg_merging(DisasContext *s, int reg, int mergereg, + TCGv_i64 v) +{ + if (!s->fpcr_nep) { + write_fp_dreg(s, reg, v); + return; + } + + /* + * Move from mergereg to reg; this sets the high elements and + * clears the bits above 128 as a side effect. + */ + tcg_gen_gvec_mov(MO_64, vec_full_reg_offset(s, reg), + vec_full_reg_offset(s, mergereg), + 16, vec_full_reg_size(s)); + tcg_gen_st_i64(v, tcg_env, vec_full_reg_offset(s, reg)); +} + +/* + * Write a single-prec result, but only clear the higher elements + * of the destination register if FPCR.NEP is 0; otherwise preserve them. + */ +static void write_fp_sreg_merging(DisasContext *s, int reg, int mergereg, + TCGv_i32 v) +{ + if (!s->fpcr_nep) { + write_fp_sreg(s, reg, v); + return; + } + + tcg_gen_gvec_mov(MO_64, vec_full_reg_offset(s, reg), + vec_full_reg_offset(s, mergereg), + 16, vec_full_reg_size(s)); + tcg_gen_st_i32(v, tcg_env, fp_reg_offset(s, reg, MO_32)); +} + +/* + * Write a half-prec result, but only clear the higher elements + * of the destination register if FPCR.NEP is 0; otherwise preserve them. + * The caller must ensure that the top 16 bits of v are zero. + */ +static void write_fp_hreg_merging(DisasContext *s, int reg, int mergereg, + TCGv_i32 v) +{ + if (!s->fpcr_nep) { + write_fp_sreg(s, reg, v); + return; + } + + tcg_gen_gvec_mov(MO_64, vec_full_reg_offset(s, reg), + vec_full_reg_offset(s, mergereg), + 16, vec_full_reg_size(s)); + tcg_gen_st16_i32(v, tcg_env, fp_reg_offset(s, reg, MO_16)); +} + /* Expand a 2-operand AdvSIMD vector operation using an expander function. */ static void gen_gvec_fn2(DisasContext *s, bool is_q, int rd, int rn, GVecGen2Fn *gvec_fn, int vece) @@ -712,10 +774,10 @@ static void gen_gvec_op3_ool(DisasContext *s, bool is_q, int rd, * an out-of-line helper. 
*/ static void gen_gvec_op3_fpst(DisasContext *s, bool is_q, int rd, int rn, - int rm, bool is_fp16, int data, + int rm, ARMFPStatusFlavour fpsttype, int data, gen_helper_gvec_3_ptr *fn) { - TCGv_ptr fpst = fpstatus_ptr(is_fp16 ? FPST_A64_F16 : FPST_A64); + TCGv_ptr fpst = fpstatus_ptr(fpsttype); tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), vec_full_reg_offset(s, rm), fpst, @@ -754,10 +816,11 @@ static void gen_gvec_op4_env(DisasContext *s, bool is_q, int rd, int rn, * an out-of-line helper. */ static void gen_gvec_op4_fpst(DisasContext *s, bool is_q, int rd, int rn, - int rm, int ra, bool is_fp16, int data, + int rm, int ra, ARMFPStatusFlavour fpsttype, + int data, gen_helper_gvec_4_ptr *fn) { - TCGv_ptr fpst = fpstatus_ptr(is_fp16 ? FPST_A64_F16 : FPST_A64); + TCGv_ptr fpst = fpstatus_ptr(fpsttype); tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), vec_full_reg_offset(s, rm), @@ -765,6 +828,111 @@ static void gen_gvec_op4_fpst(DisasContext *s, bool is_q, int rd, int rn, is_q ? 16 : 8, vec_full_reg_size(s), data, fn); } +/* + * When FPCR.AH == 1, NEG and ABS do not flip the sign bit of a NaN. + * These functions implement + * d = floatN_is_any_nan(s) ? s : floatN_chs(s) + * which for float32 is + * d = (s & ~(1 << 31)) > 0x7f800000UL ? s : (s ^ (1 << 31)) + * and similarly for the other float sizes. + */ +static void gen_vfp_ah_negh(TCGv_i32 d, TCGv_i32 s) +{ + TCGv_i32 abs_s = tcg_temp_new_i32(), chs_s = tcg_temp_new_i32(); + + gen_vfp_negh(chs_s, s); + gen_vfp_absh(abs_s, s); + tcg_gen_movcond_i32(TCG_COND_GTU, d, + abs_s, tcg_constant_i32(0x7c00), + s, chs_s); +} + +static void gen_vfp_ah_negs(TCGv_i32 d, TCGv_i32 s) +{ + TCGv_i32 abs_s = tcg_temp_new_i32(), chs_s = tcg_temp_new_i32(); + + gen_vfp_negs(chs_s, s); + gen_vfp_abss(abs_s, s); + tcg_gen_movcond_i32(TCG_COND_GTU, d, + abs_s, tcg_constant_i32(0x7f800000UL), + s, chs_s); +} + +static void gen_vfp_ah_negd(TCGv_i64 d, TCGv_i64 s) +{ + TCGv_i64 abs_s = tcg_temp_new_i64(), chs_s = tcg_temp_new_i64(); + + gen_vfp_negd(chs_s, s); + gen_vfp_absd(abs_s, s); + tcg_gen_movcond_i64(TCG_COND_GTU, d, + abs_s, tcg_constant_i64(0x7ff0000000000000ULL), + s, chs_s); +} + +/* + * These functions implement + * d = floatN_is_any_nan(s) ? s : floatN_abs(s) + * which for float32 is + * d = (s & ~(1 << 31)) > 0x7f800000UL ? s : (s & ~(1 << 31)) + * and similarly for the other float sizes.
+ */ +static void gen_vfp_ah_absh(TCGv_i32 d, TCGv_i32 s) +{ + TCGv_i32 abs_s = tcg_temp_new_i32(); + + gen_vfp_absh(abs_s, s); + tcg_gen_movcond_i32(TCG_COND_GTU, d, + abs_s, tcg_constant_i32(0x7c00), + s, abs_s); +} + +static void gen_vfp_ah_abss(TCGv_i32 d, TCGv_i32 s) +{ + TCGv_i32 abs_s = tcg_temp_new_i32(); + + gen_vfp_abss(abs_s, s); + tcg_gen_movcond_i32(TCG_COND_GTU, d, + abs_s, tcg_constant_i32(0x7f800000UL), + s, abs_s); +} + +static void gen_vfp_ah_absd(TCGv_i64 d, TCGv_i64 s) +{ + TCGv_i64 abs_s = tcg_temp_new_i64(); + + gen_vfp_absd(abs_s, s); + tcg_gen_movcond_i64(TCG_COND_GTU, d, + abs_s, tcg_constant_i64(0x7ff0000000000000ULL), + s, abs_s); +} + +static void gen_vfp_maybe_ah_negh(DisasContext *dc, TCGv_i32 d, TCGv_i32 s) +{ + if (dc->fpcr_ah) { + gen_vfp_ah_negh(d, s); + } else { + gen_vfp_negh(d, s); + } +} + +static void gen_vfp_maybe_ah_negs(DisasContext *dc, TCGv_i32 d, TCGv_i32 s) +{ + if (dc->fpcr_ah) { + gen_vfp_ah_negs(d, s); + } else { + gen_vfp_negs(d, s); + } +} + +static void gen_vfp_maybe_ah_negd(DisasContext *dc, TCGv_i64 d, TCGv_i64 s) +{ + if (dc->fpcr_ah) { + gen_vfp_ah_negd(d, s); + } else { + gen_vfp_negd(d, s); + } +} + /* Set ZF and NF based on a 64 bit result. This is alas fiddlier * than the 32 bit equivalent. */ @@ -5025,23 +5193,25 @@ typedef struct FPScalar { void (*gen_d)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr); } FPScalar; -static bool do_fp3_scalar(DisasContext *s, arg_rrr_e *a, const FPScalar *f) +static bool do_fp3_scalar_with_fpsttype(DisasContext *s, arg_rrr_e *a, + const FPScalar *f, int mergereg, + ARMFPStatusFlavour fpsttype) { switch (a->esz) { case MO_64: if (fp_access_check(s)) { TCGv_i64 t0 = read_fp_dreg(s, a->rn); TCGv_i64 t1 = read_fp_dreg(s, a->rm); - f->gen_d(t0, t0, t1, fpstatus_ptr(FPST_A64)); - write_fp_dreg(s, a->rd, t0); + f->gen_d(t0, t0, t1, fpstatus_ptr(fpsttype)); + write_fp_dreg_merging(s, a->rd, mergereg, t0); } break; case MO_32: if (fp_access_check(s)) { TCGv_i32 t0 = read_fp_sreg(s, a->rn); TCGv_i32 t1 = read_fp_sreg(s, a->rm); - f->gen_s(t0, t0, t1, fpstatus_ptr(FPST_A64)); - write_fp_sreg(s, a->rd, t0); + f->gen_s(t0, t0, t1, fpstatus_ptr(fpsttype)); + write_fp_sreg_merging(s, a->rd, mergereg, t0); } break; case MO_16: @@ -5051,8 +5221,8 @@ static bool do_fp3_scalar(DisasContext *s, arg_rrr_e *a, const FPScalar *f) if (fp_access_check(s)) { TCGv_i32 t0 = read_fp_hreg(s, a->rn); TCGv_i32 t1 = read_fp_hreg(s, a->rm); - f->gen_h(t0, t0, t1, fpstatus_ptr(FPST_A64_F16)); - write_fp_sreg(s, a->rd, t0); + f->gen_h(t0, t0, t1, fpstatus_ptr(fpsttype)); + write_fp_hreg_merging(s, a->rd, mergereg, t0); } break; default: @@ -5061,68 +5231,103 @@ static bool do_fp3_scalar(DisasContext *s, arg_rrr_e *a, const FPScalar *f) return true; } +static bool do_fp3_scalar(DisasContext *s, arg_rrr_e *a, const FPScalar *f, + int mergereg) +{ + return do_fp3_scalar_with_fpsttype(s, a, f, mergereg, + a->esz == MO_16 ? + FPST_A64_F16 : FPST_A64); +} + +static bool do_fp3_scalar_ah_2fn(DisasContext *s, arg_rrr_e *a, + const FPScalar *fnormal, const FPScalar *fah, + int mergereg) +{ + return do_fp3_scalar_with_fpsttype(s, a, s->fpcr_ah ? fah : fnormal, + mergereg, select_ah_fpst(s, a->esz)); +} + +/* Some insns need to call different helpers when FPCR.AH == 1 */ +static bool do_fp3_scalar_2fn(DisasContext *s, arg_rrr_e *a, + const FPScalar *fnormal, + const FPScalar *fah, + int mergereg) +{ + return do_fp3_scalar(s, a, s->fpcr_ah ? 
fah : fnormal, mergereg); +} + static const FPScalar f_scalar_fadd = { gen_helper_vfp_addh, gen_helper_vfp_adds, gen_helper_vfp_addd, }; -TRANS(FADD_s, do_fp3_scalar, a, &f_scalar_fadd) +TRANS(FADD_s, do_fp3_scalar, a, &f_scalar_fadd, a->rn) static const FPScalar f_scalar_fsub = { gen_helper_vfp_subh, gen_helper_vfp_subs, gen_helper_vfp_subd, }; -TRANS(FSUB_s, do_fp3_scalar, a, &f_scalar_fsub) +TRANS(FSUB_s, do_fp3_scalar, a, &f_scalar_fsub, a->rn) static const FPScalar f_scalar_fdiv = { gen_helper_vfp_divh, gen_helper_vfp_divs, gen_helper_vfp_divd, }; -TRANS(FDIV_s, do_fp3_scalar, a, &f_scalar_fdiv) +TRANS(FDIV_s, do_fp3_scalar, a, &f_scalar_fdiv, a->rn) static const FPScalar f_scalar_fmul = { gen_helper_vfp_mulh, gen_helper_vfp_muls, gen_helper_vfp_muld, }; -TRANS(FMUL_s, do_fp3_scalar, a, &f_scalar_fmul) +TRANS(FMUL_s, do_fp3_scalar, a, &f_scalar_fmul, a->rn) static const FPScalar f_scalar_fmax = { gen_helper_vfp_maxh, gen_helper_vfp_maxs, gen_helper_vfp_maxd, }; -TRANS(FMAX_s, do_fp3_scalar, a, &f_scalar_fmax) +static const FPScalar f_scalar_fmax_ah = { + gen_helper_vfp_ah_maxh, + gen_helper_vfp_ah_maxs, + gen_helper_vfp_ah_maxd, +}; +TRANS(FMAX_s, do_fp3_scalar_2fn, a, &f_scalar_fmax, &f_scalar_fmax_ah, a->rn) static const FPScalar f_scalar_fmin = { gen_helper_vfp_minh, gen_helper_vfp_mins, gen_helper_vfp_mind, }; -TRANS(FMIN_s, do_fp3_scalar, a, &f_scalar_fmin) +static const FPScalar f_scalar_fmin_ah = { + gen_helper_vfp_ah_minh, + gen_helper_vfp_ah_mins, + gen_helper_vfp_ah_mind, +}; +TRANS(FMIN_s, do_fp3_scalar_2fn, a, &f_scalar_fmin, &f_scalar_fmin_ah, a->rn) static const FPScalar f_scalar_fmaxnm = { gen_helper_vfp_maxnumh, gen_helper_vfp_maxnums, gen_helper_vfp_maxnumd, }; -TRANS(FMAXNM_s, do_fp3_scalar, a, &f_scalar_fmaxnm) +TRANS(FMAXNM_s, do_fp3_scalar, a, &f_scalar_fmaxnm, a->rn) static const FPScalar f_scalar_fminnm = { gen_helper_vfp_minnumh, gen_helper_vfp_minnums, gen_helper_vfp_minnumd, }; -TRANS(FMINNM_s, do_fp3_scalar, a, &f_scalar_fminnm) +TRANS(FMINNM_s, do_fp3_scalar, a, &f_scalar_fminnm, a->rn) static const FPScalar f_scalar_fmulx = { gen_helper_advsimd_mulxh, gen_helper_vfp_mulxs, gen_helper_vfp_mulxd, }; -TRANS(FMULX_s, do_fp3_scalar, a, &f_scalar_fmulx) +TRANS(FMULX_s, do_fp3_scalar, a, &f_scalar_fmulx, a->rn) static void gen_fnmul_h(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_ptr s) { @@ -5142,47 +5347,70 @@ static void gen_fnmul_d(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_ptr s) gen_vfp_negd(d, d); } +static void gen_fnmul_ah_h(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_ptr s) +{ + gen_helper_vfp_mulh(d, n, m, s); + gen_vfp_ah_negh(d, d); +} + +static void gen_fnmul_ah_s(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_ptr s) +{ + gen_helper_vfp_muls(d, n, m, s); + gen_vfp_ah_negs(d, d); +} + +static void gen_fnmul_ah_d(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_ptr s) +{ + gen_helper_vfp_muld(d, n, m, s); + gen_vfp_ah_negd(d, d); +} + static const FPScalar f_scalar_fnmul = { gen_fnmul_h, gen_fnmul_s, gen_fnmul_d, }; -TRANS(FNMUL_s, do_fp3_scalar, a, &f_scalar_fnmul) +static const FPScalar f_scalar_ah_fnmul = { + gen_fnmul_ah_h, + gen_fnmul_ah_s, + gen_fnmul_ah_d, +}; +TRANS(FNMUL_s, do_fp3_scalar_2fn, a, &f_scalar_fnmul, &f_scalar_ah_fnmul, a->rn) static const FPScalar f_scalar_fcmeq = { gen_helper_advsimd_ceq_f16, gen_helper_neon_ceq_f32, gen_helper_neon_ceq_f64, }; -TRANS(FCMEQ_s, do_fp3_scalar, a, &f_scalar_fcmeq) +TRANS(FCMEQ_s, do_fp3_scalar, a, &f_scalar_fcmeq, a->rm) static const FPScalar f_scalar_fcmge = { gen_helper_advsimd_cge_f16, gen_helper_neon_cge_f32, 
gen_helper_neon_cge_f64, }; -TRANS(FCMGE_s, do_fp3_scalar, a, &f_scalar_fcmge) +TRANS(FCMGE_s, do_fp3_scalar, a, &f_scalar_fcmge, a->rm) static const FPScalar f_scalar_fcmgt = { gen_helper_advsimd_cgt_f16, gen_helper_neon_cgt_f32, gen_helper_neon_cgt_f64, }; -TRANS(FCMGT_s, do_fp3_scalar, a, &f_scalar_fcmgt) +TRANS(FCMGT_s, do_fp3_scalar, a, &f_scalar_fcmgt, a->rm) static const FPScalar f_scalar_facge = { gen_helper_advsimd_acge_f16, gen_helper_neon_acge_f32, gen_helper_neon_acge_f64, }; -TRANS(FACGE_s, do_fp3_scalar, a, &f_scalar_facge) +TRANS(FACGE_s, do_fp3_scalar, a, &f_scalar_facge, a->rm) static const FPScalar f_scalar_facgt = { gen_helper_advsimd_acgt_f16, gen_helper_neon_acgt_f32, gen_helper_neon_acgt_f64, }; -TRANS(FACGT_s, do_fp3_scalar, a, &f_scalar_facgt) +TRANS(FACGT_s, do_fp3_scalar, a, &f_scalar_facgt, a->rm) static void gen_fabd_h(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_ptr s) { @@ -5202,26 +5430,61 @@ static void gen_fabd_d(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_ptr s) gen_vfp_absd(d, d); } +static void gen_fabd_ah_h(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_ptr s) +{ + gen_helper_vfp_subh(d, n, m, s); + gen_vfp_ah_absh(d, d); +} + +static void gen_fabd_ah_s(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m, TCGv_ptr s) +{ + gen_helper_vfp_subs(d, n, m, s); + gen_vfp_ah_abss(d, d); +} + +static void gen_fabd_ah_d(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m, TCGv_ptr s) +{ + gen_helper_vfp_subd(d, n, m, s); + gen_vfp_ah_absd(d, d); +} + static const FPScalar f_scalar_fabd = { gen_fabd_h, gen_fabd_s, gen_fabd_d, }; -TRANS(FABD_s, do_fp3_scalar, a, &f_scalar_fabd) +static const FPScalar f_scalar_ah_fabd = { + gen_fabd_ah_h, + gen_fabd_ah_s, + gen_fabd_ah_d, +}; +TRANS(FABD_s, do_fp3_scalar_2fn, a, &f_scalar_fabd, &f_scalar_ah_fabd, a->rn) static const FPScalar f_scalar_frecps = { gen_helper_recpsf_f16, gen_helper_recpsf_f32, gen_helper_recpsf_f64, }; -TRANS(FRECPS_s, do_fp3_scalar, a, &f_scalar_frecps) +static const FPScalar f_scalar_ah_frecps = { + gen_helper_recpsf_ah_f16, + gen_helper_recpsf_ah_f32, + gen_helper_recpsf_ah_f64, +}; +TRANS(FRECPS_s, do_fp3_scalar_ah_2fn, a, + &f_scalar_frecps, &f_scalar_ah_frecps, a->rn) static const FPScalar f_scalar_frsqrts = { gen_helper_rsqrtsf_f16, gen_helper_rsqrtsf_f32, gen_helper_rsqrtsf_f64, }; -TRANS(FRSQRTS_s, do_fp3_scalar, a, &f_scalar_frsqrts) +static const FPScalar f_scalar_ah_frsqrts = { + gen_helper_rsqrtsf_ah_f16, + gen_helper_rsqrtsf_ah_f32, + gen_helper_rsqrtsf_ah_f64, +}; +TRANS(FRSQRTS_s, do_fp3_scalar_ah_2fn, a, + &f_scalar_frsqrts, &f_scalar_ah_frsqrts, a->rn) static bool do_fcmp0_s(DisasContext *s, arg_rr_e *a, const FPScalar *f, bool swap) @@ -5472,8 +5735,10 @@ TRANS(CMHS_s, do_cmop_d, a, TCG_COND_GEU) TRANS(CMEQ_s, do_cmop_d, a, TCG_COND_EQ) TRANS(CMTST_s, do_cmop_d, a, TCG_COND_TSTNE) -static bool do_fp3_vector(DisasContext *s, arg_qrrr_e *a, int data, - gen_helper_gvec_3_ptr * const fns[3]) +static bool do_fp3_vector_with_fpsttype(DisasContext *s, arg_qrrr_e *a, + int data, + gen_helper_gvec_3_ptr * const fns[3], + ARMFPStatusFlavour fpsttype) { MemOp esz = a->esz; int check = fp_access_check_vector_hsd(s, a->q, esz); @@ -5482,11 +5747,34 @@ static bool do_fp3_vector(DisasContext *s, arg_qrrr_e *a, int data, return check == 0; } - gen_gvec_op3_fpst(s, a->q, a->rd, a->rn, a->rm, - esz == MO_16, data, fns[esz - 1]); + gen_gvec_op3_fpst(s, a->q, a->rd, a->rn, a->rm, fpsttype, + data, fns[esz - 1]); return true; } +static bool do_fp3_vector(DisasContext *s, arg_qrrr_e *a, int data, + gen_helper_gvec_3_ptr * const fns[3]) +{ + return 
do_fp3_vector_with_fpsttype(s, a, data, fns, + a->esz == MO_16 ? + FPST_A64_F16 : FPST_A64); +} + +static bool do_fp3_vector_2fn(DisasContext *s, arg_qrrr_e *a, int data, + gen_helper_gvec_3_ptr * const fnormal[3], + gen_helper_gvec_3_ptr * const fah[3]) +{ + return do_fp3_vector(s, a, data, s->fpcr_ah ? fah : fnormal); +} + +static bool do_fp3_vector_ah_2fn(DisasContext *s, arg_qrrr_e *a, int data, + gen_helper_gvec_3_ptr * const fnormal[3], + gen_helper_gvec_3_ptr * const fah[3]) +{ + return do_fp3_vector_with_fpsttype(s, a, data, s->fpcr_ah ? fah : fnormal, + select_ah_fpst(s, a->esz)); +} + static gen_helper_gvec_3_ptr * const f_vector_fadd[3] = { gen_helper_gvec_fadd_h, gen_helper_gvec_fadd_s, @@ -5520,14 +5808,24 @@ static gen_helper_gvec_3_ptr * const f_vector_fmax[3] = { gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_d, }; -TRANS(FMAX_v, do_fp3_vector, a, 0, f_vector_fmax) +static gen_helper_gvec_3_ptr * const f_vector_fmax_ah[3] = { + gen_helper_gvec_ah_fmax_h, + gen_helper_gvec_ah_fmax_s, + gen_helper_gvec_ah_fmax_d, +}; +TRANS(FMAX_v, do_fp3_vector_2fn, a, 0, f_vector_fmax, f_vector_fmax_ah) static gen_helper_gvec_3_ptr * const f_vector_fmin[3] = { gen_helper_gvec_fmin_h, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_d, }; -TRANS(FMIN_v, do_fp3_vector, a, 0, f_vector_fmin) +static gen_helper_gvec_3_ptr * const f_vector_fmin_ah[3] = { + gen_helper_gvec_ah_fmin_h, + gen_helper_gvec_ah_fmin_s, + gen_helper_gvec_ah_fmin_d, +}; +TRANS(FMIN_v, do_fp3_vector_2fn, a, 0, f_vector_fmin, f_vector_fmin_ah) static gen_helper_gvec_3_ptr * const f_vector_fmaxnm[3] = { gen_helper_gvec_fmaxnum_h, @@ -5562,7 +5860,12 @@ static gen_helper_gvec_3_ptr * const f_vector_fmls[3] = { gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_d, }; -TRANS(FMLS_v, do_fp3_vector, a, 0, f_vector_fmls) +static gen_helper_gvec_3_ptr * const f_vector_fmls_ah[3] = { + gen_helper_gvec_ah_vfms_h, + gen_helper_gvec_ah_vfms_s, + gen_helper_gvec_ah_vfms_d, +}; +TRANS(FMLS_v, do_fp3_vector_2fn, a, 0, f_vector_fmls, f_vector_fmls_ah) static gen_helper_gvec_3_ptr * const f_vector_fcmeq[3] = { gen_helper_gvec_fceq_h, @@ -5604,21 +5907,36 @@ static gen_helper_gvec_3_ptr * const f_vector_fabd[3] = { gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_d, }; -TRANS(FABD_v, do_fp3_vector, a, 0, f_vector_fabd) +static gen_helper_gvec_3_ptr * const f_vector_ah_fabd[3] = { + gen_helper_gvec_ah_fabd_h, + gen_helper_gvec_ah_fabd_s, + gen_helper_gvec_ah_fabd_d, +}; +TRANS(FABD_v, do_fp3_vector_2fn, a, 0, f_vector_fabd, f_vector_ah_fabd) static gen_helper_gvec_3_ptr * const f_vector_frecps[3] = { gen_helper_gvec_recps_h, gen_helper_gvec_recps_s, gen_helper_gvec_recps_d, }; -TRANS(FRECPS_v, do_fp3_vector, a, 0, f_vector_frecps) +static gen_helper_gvec_3_ptr * const f_vector_ah_frecps[3] = { + gen_helper_gvec_ah_recps_h, + gen_helper_gvec_ah_recps_s, + gen_helper_gvec_ah_recps_d, +}; +TRANS(FRECPS_v, do_fp3_vector_ah_2fn, a, 0, f_vector_frecps, f_vector_ah_frecps) static gen_helper_gvec_3_ptr * const f_vector_frsqrts[3] = { gen_helper_gvec_rsqrts_h, gen_helper_gvec_rsqrts_s, gen_helper_gvec_rsqrts_d, }; -TRANS(FRSQRTS_v, do_fp3_vector, a, 0, f_vector_frsqrts) +static gen_helper_gvec_3_ptr * const f_vector_ah_frsqrts[3] = { + gen_helper_gvec_ah_rsqrts_h, + gen_helper_gvec_ah_rsqrts_s, + gen_helper_gvec_ah_rsqrts_d, +}; +TRANS(FRSQRTS_v, do_fp3_vector_ah_2fn, a, 0, f_vector_frsqrts, f_vector_ah_frsqrts) static gen_helper_gvec_3_ptr * const f_vector_faddp[3] = { gen_helper_gvec_faddp_h, @@ -5632,14 +5950,24 @@ static gen_helper_gvec_3_ptr * const 
f_vector_fmaxp[3] = { gen_helper_gvec_fmaxp_s, gen_helper_gvec_fmaxp_d, }; -TRANS(FMAXP_v, do_fp3_vector, a, 0, f_vector_fmaxp) +static gen_helper_gvec_3_ptr * const f_vector_ah_fmaxp[3] = { + gen_helper_gvec_ah_fmaxp_h, + gen_helper_gvec_ah_fmaxp_s, + gen_helper_gvec_ah_fmaxp_d, +}; +TRANS(FMAXP_v, do_fp3_vector_2fn, a, 0, f_vector_fmaxp, f_vector_ah_fmaxp) static gen_helper_gvec_3_ptr * const f_vector_fminp[3] = { gen_helper_gvec_fminp_h, gen_helper_gvec_fminp_s, gen_helper_gvec_fminp_d, }; -TRANS(FMINP_v, do_fp3_vector, a, 0, f_vector_fminp) +static gen_helper_gvec_3_ptr * const f_vector_ah_fminp[3] = { + gen_helper_gvec_ah_fminp_h, + gen_helper_gvec_ah_fminp_s, + gen_helper_gvec_ah_fminp_d, +}; +TRANS(FMINP_v, do_fp3_vector_2fn, a, 0, f_vector_fminp, f_vector_ah_fminp) static gen_helper_gvec_3_ptr * const f_vector_fmaxnmp[3] = { gen_helper_gvec_fmaxnump_h, @@ -5795,7 +6123,8 @@ static bool trans_BFMLAL_v(DisasContext *s, arg_qrrr_e *a) } if (fp_access_check(s)) { /* Q bit selects BFMLALB vs BFMLALT. */ - gen_gvec_op4_fpst(s, true, a->rd, a->rn, a->rm, a->rd, false, a->q, + gen_gvec_op4_fpst(s, true, a->rd, a->rn, a->rm, a->rd, + s->fpcr_ah ? FPST_AH : FPST_A64, a->q, gen_helper_gvec_bfmlal); } return true; @@ -5806,8 +6135,14 @@ static gen_helper_gvec_3_ptr * const f_vector_fcadd[3] = { gen_helper_gvec_fcadds, gen_helper_gvec_fcaddd, }; -TRANS_FEAT(FCADD_90, aa64_fcma, do_fp3_vector, a, 0, f_vector_fcadd) -TRANS_FEAT(FCADD_270, aa64_fcma, do_fp3_vector, a, 1, f_vector_fcadd) +/* + * Encode FPCR.AH into the data so the helper knows whether the + * negations it does should avoid flipping the sign bit on a NaN + */ +TRANS_FEAT(FCADD_90, aa64_fcma, do_fp3_vector, a, 0 | (s->fpcr_ah << 1), + f_vector_fcadd) +TRANS_FEAT(FCADD_270, aa64_fcma, do_fp3_vector, a, 1 | (s->fpcr_ah << 1), + f_vector_fcadd) static bool trans_FCMLA_v(DisasContext *s, arg_FCMLA_v *a) { @@ -5828,7 +6163,8 @@ static bool trans_FCMLA_v(DisasContext *s, arg_FCMLA_v *a) } gen_gvec_op4_fpst(s, a->q, a->rd, a->rn, a->rm, a->rd, - a->esz == MO_16, a->rot, fn[a->esz]); + a->esz == MO_16 ? 
FPST_A64_F16 : FPST_A64, + a->rot | (s->fpcr_ah << 2), fn[a->esz]); return true; } @@ -6197,7 +6533,7 @@ static bool do_fp3_scalar_idx(DisasContext *s, arg_rrx_e *a, const FPScalar *f) read_vec_element(s, t1, a->rm, a->idx, MO_64); f->gen_d(t0, t0, t1, fpstatus_ptr(FPST_A64)); - write_fp_dreg(s, a->rd, t0); + write_fp_dreg_merging(s, a->rd, a->rn, t0); } break; case MO_32: @@ -6207,7 +6543,7 @@ static bool do_fp3_scalar_idx(DisasContext *s, arg_rrx_e *a, const FPScalar *f) read_vec_element_i32(s, t1, a->rm, a->idx, MO_32); f->gen_s(t0, t0, t1, fpstatus_ptr(FPST_A64)); - write_fp_sreg(s, a->rd, t0); + write_fp_sreg_merging(s, a->rd, a->rn, t0); } break; case MO_16: @@ -6220,7 +6556,7 @@ static bool do_fp3_scalar_idx(DisasContext *s, arg_rrx_e *a, const FPScalar *f) read_vec_element_i32(s, t1, a->rm, a->idx, MO_16); f->gen_h(t0, t0, t1, fpstatus_ptr(FPST_A64_F16)); - write_fp_sreg(s, a->rd, t0); + write_fp_hreg_merging(s, a->rd, a->rn, t0); } break; default: @@ -6243,10 +6579,10 @@ static bool do_fmla_scalar_idx(DisasContext *s, arg_rrx_e *a, bool neg) read_vec_element(s, t2, a->rm, a->idx, MO_64); if (neg) { - gen_vfp_negd(t1, t1); + gen_vfp_maybe_ah_negd(s, t1, t1); } gen_helper_vfp_muladdd(t0, t1, t2, t0, fpstatus_ptr(FPST_A64)); - write_fp_dreg(s, a->rd, t0); + write_fp_dreg_merging(s, a->rd, a->rd, t0); } break; case MO_32: @@ -6257,10 +6593,10 @@ static bool do_fmla_scalar_idx(DisasContext *s, arg_rrx_e *a, bool neg) read_vec_element_i32(s, t2, a->rm, a->idx, MO_32); if (neg) { - gen_vfp_negs(t1, t1); + gen_vfp_maybe_ah_negs(s, t1, t1); } gen_helper_vfp_muladds(t0, t1, t2, t0, fpstatus_ptr(FPST_A64)); - write_fp_sreg(s, a->rd, t0); + write_fp_sreg_merging(s, a->rd, a->rd, t0); } break; case MO_16: @@ -6274,11 +6610,11 @@ static bool do_fmla_scalar_idx(DisasContext *s, arg_rrx_e *a, bool neg) read_vec_element_i32(s, t2, a->rm, a->idx, MO_16); if (neg) { - gen_vfp_negh(t1, t1); + gen_vfp_maybe_ah_negh(s, t1, t1); } gen_helper_advsimd_muladdh(t0, t1, t2, t0, fpstatus_ptr(FPST_A64_F16)); - write_fp_sreg(s, a->rd, t0); + write_fp_hreg_merging(s, a->rd, a->rd, t0); } break; default: @@ -6374,7 +6710,8 @@ static bool do_fp3_vector_idx(DisasContext *s, arg_qrrx_e *a, } gen_gvec_op3_fpst(s, a->q, a->rd, a->rn, a->rm, - esz == MO_16, a->idx, fns[esz - 1]); + esz == MO_16 ? FPST_A64_F16 : FPST_A64, + a->idx, fns[esz - 1]); return true; } @@ -6394,10 +6731,16 @@ TRANS(FMULX_vi, do_fp3_vector_idx, a, f_vector_idx_fmulx) static bool do_fmla_vector_idx(DisasContext *s, arg_qrrx_e *a, bool neg) { - static gen_helper_gvec_4_ptr * const fns[3] = { - gen_helper_gvec_fmla_idx_h, - gen_helper_gvec_fmla_idx_s, - gen_helper_gvec_fmla_idx_d, + static gen_helper_gvec_4_ptr * const fns[3][3] = { + { gen_helper_gvec_fmla_idx_h, + gen_helper_gvec_fmla_idx_s, + gen_helper_gvec_fmla_idx_d }, + { gen_helper_gvec_fmls_idx_h, + gen_helper_gvec_fmls_idx_s, + gen_helper_gvec_fmls_idx_d }, + { gen_helper_gvec_ah_fmls_idx_h, + gen_helper_gvec_ah_fmls_idx_s, + gen_helper_gvec_ah_fmls_idx_d }, }; MemOp esz = a->esz; int check = fp_access_check_vector_hsd(s, a->q, esz); @@ -6407,8 +6750,8 @@ static bool do_fmla_vector_idx(DisasContext *s, arg_qrrx_e *a, bool neg) } gen_gvec_op4_fpst(s, a->q, a->rd, a->rn, a->rm, a->rd, - esz == MO_16, (a->idx << 1) | neg, - fns[esz - 1]); + esz == MO_16 ? FPST_A64_F16 : FPST_A64, + a->idx, fns[neg ? 1 + s->fpcr_ah : 0][esz - 1]); return true; } @@ -6542,7 +6885,8 @@ static bool trans_BFMLAL_vi(DisasContext *s, arg_qrrx_e *a) } if (fp_access_check(s)) { /* Q bit selects BFMLALB vs BFMLALT. 
*/ - gen_gvec_op4_fpst(s, true, a->rd, a->rn, a->rm, a->rd, 0, + gen_gvec_op4_fpst(s, true, a->rd, a->rn, a->rm, a->rd, + s->fpcr_ah ? FPST_AH : FPST_A64, (a->idx << 1) | a->q, gen_helper_gvec_bfmlal_idx); } @@ -6571,7 +6915,8 @@ static bool trans_FCMLA_vi(DisasContext *s, arg_FCMLA_vi *a) } if (fp_access_check(s)) { gen_gvec_op4_fpst(s, a->q, a->rd, a->rn, a->rm, a->rd, - a->esz == MO_16, (a->idx << 2) | a->rot, fn); + a->esz == MO_16 ? FPST_A64_F16 : FPST_A64, + (s->fpcr_ah << 4) | (a->idx << 2) | a->rot, fn); } return true; } @@ -6625,9 +6970,16 @@ static bool do_fp3_scalar_pair(DisasContext *s, arg_rr_e *a, const FPScalar *f) return true; } +static bool do_fp3_scalar_pair_2fn(DisasContext *s, arg_rr_e *a, + const FPScalar *fnormal, + const FPScalar *fah) +{ + return do_fp3_scalar_pair(s, a, s->fpcr_ah ? fah : fnormal); +} + TRANS(FADDP_s, do_fp3_scalar_pair, a, &f_scalar_fadd) -TRANS(FMAXP_s, do_fp3_scalar_pair, a, &f_scalar_fmax) -TRANS(FMINP_s, do_fp3_scalar_pair, a, &f_scalar_fmin) +TRANS(FMAXP_s, do_fp3_scalar_pair_2fn, a, &f_scalar_fmax, &f_scalar_fmax_ah) +TRANS(FMINP_s, do_fp3_scalar_pair_2fn, a, &f_scalar_fmin, &f_scalar_fmin_ah) TRANS(FMAXNMP_s, do_fp3_scalar_pair, a, &f_scalar_fmaxnm) TRANS(FMINNMP_s, do_fp3_scalar_pair, a, &f_scalar_fminnm) @@ -6746,14 +7098,14 @@ static bool do_fmadd(DisasContext *s, arg_rrrr_e *a, bool neg_a, bool neg_n) TCGv_i64 ta = read_fp_dreg(s, a->ra); if (neg_a) { - gen_vfp_negd(ta, ta); + gen_vfp_maybe_ah_negd(s, ta, ta); } if (neg_n) { - gen_vfp_negd(tn, tn); + gen_vfp_maybe_ah_negd(s, tn, tn); } fpst = fpstatus_ptr(FPST_A64); gen_helper_vfp_muladdd(ta, tn, tm, ta, fpst); - write_fp_dreg(s, a->rd, ta); + write_fp_dreg_merging(s, a->rd, a->ra, ta); } break; @@ -6764,14 +7116,14 @@ static bool do_fmadd(DisasContext *s, arg_rrrr_e *a, bool neg_a, bool neg_n) TCGv_i32 ta = read_fp_sreg(s, a->ra); if (neg_a) { - gen_vfp_negs(ta, ta); + gen_vfp_maybe_ah_negs(s, ta, ta); } if (neg_n) { - gen_vfp_negs(tn, tn); + gen_vfp_maybe_ah_negs(s, tn, tn); } fpst = fpstatus_ptr(FPST_A64); gen_helper_vfp_muladds(ta, tn, tm, ta, fpst); - write_fp_sreg(s, a->rd, ta); + write_fp_sreg_merging(s, a->rd, a->ra, ta); } break; @@ -6785,14 +7137,14 @@ static bool do_fmadd(DisasContext *s, arg_rrrr_e *a, bool neg_a, bool neg_n) TCGv_i32 ta = read_fp_hreg(s, a->ra); if (neg_a) { - gen_vfp_negh(ta, ta); + gen_vfp_maybe_ah_negh(s, ta, ta); } if (neg_n) { - gen_vfp_negh(tn, tn); + gen_vfp_maybe_ah_negh(s, tn, tn); } fpst = fpstatus_ptr(FPST_A64_F16); gen_helper_advsimd_muladdh(ta, tn, tm, ta, fpst); - write_fp_sreg(s, a->rd, ta); + write_fp_hreg_merging(s, a->rd, a->ra, ta); } break; @@ -6879,27 +7231,35 @@ static TCGv_i32 do_reduction_op(DisasContext *s, int rn, MemOp esz, } static bool do_fp_reduction(DisasContext *s, arg_qrr_e *a, - NeonGenTwoSingleOpFn *fn) + NeonGenTwoSingleOpFn *fnormal, + NeonGenTwoSingleOpFn *fah) { if (fp_access_check(s)) { MemOp esz = a->esz; int elts = (a->q ? 16 : 8) >> esz; TCGv_ptr fpst = fpstatus_ptr(esz == MO_16 ? FPST_A64_F16 : FPST_A64); - TCGv_i32 res = do_reduction_op(s, a->rn, esz, 0, elts, fpst, fn); + TCGv_i32 res = do_reduction_op(s, a->rn, esz, 0, elts, fpst, + s->fpcr_ah ? 
fah : fnormal); write_fp_sreg(s, a->rd, res); } return true; } -TRANS_FEAT(FMAXNMV_h, aa64_fp16, do_fp_reduction, a, gen_helper_vfp_maxnumh) -TRANS_FEAT(FMINNMV_h, aa64_fp16, do_fp_reduction, a, gen_helper_vfp_minnumh) -TRANS_FEAT(FMAXV_h, aa64_fp16, do_fp_reduction, a, gen_helper_vfp_maxh) -TRANS_FEAT(FMINV_h, aa64_fp16, do_fp_reduction, a, gen_helper_vfp_minh) +TRANS_FEAT(FMAXNMV_h, aa64_fp16, do_fp_reduction, a, + gen_helper_vfp_maxnumh, gen_helper_vfp_maxnumh) +TRANS_FEAT(FMINNMV_h, aa64_fp16, do_fp_reduction, a, + gen_helper_vfp_minnumh, gen_helper_vfp_minnumh) +TRANS_FEAT(FMAXV_h, aa64_fp16, do_fp_reduction, a, + gen_helper_vfp_maxh, gen_helper_vfp_ah_maxh) +TRANS_FEAT(FMINV_h, aa64_fp16, do_fp_reduction, a, + gen_helper_vfp_minh, gen_helper_vfp_ah_minh) -TRANS(FMAXNMV_s, do_fp_reduction, a, gen_helper_vfp_maxnums) -TRANS(FMINNMV_s, do_fp_reduction, a, gen_helper_vfp_minnums) -TRANS(FMAXV_s, do_fp_reduction, a, gen_helper_vfp_maxs) -TRANS(FMINV_s, do_fp_reduction, a, gen_helper_vfp_mins) +TRANS(FMAXNMV_s, do_fp_reduction, a, + gen_helper_vfp_maxnums, gen_helper_vfp_maxnums) +TRANS(FMINNMV_s, do_fp_reduction, a, + gen_helper_vfp_minnums, gen_helper_vfp_minnums) +TRANS(FMAXV_s, do_fp_reduction, a, gen_helper_vfp_maxs, gen_helper_vfp_ah_maxs) +TRANS(FMINV_s, do_fp_reduction, a, gen_helper_vfp_mins, gen_helper_vfp_ah_mins) /* * Floating-point Immediate @@ -8323,21 +8683,30 @@ typedef struct FPScalar1Int { } FPScalar1Int; static bool do_fp1_scalar_int(DisasContext *s, arg_rr_e *a, - const FPScalar1Int *f) + const FPScalar1Int *f, + bool merging) { switch (a->esz) { case MO_64: if (fp_access_check(s)) { TCGv_i64 t = read_fp_dreg(s, a->rn); f->gen_d(t, t); - write_fp_dreg(s, a->rd, t); + if (merging) { + write_fp_dreg_merging(s, a->rd, a->rd, t); + } else { + write_fp_dreg(s, a->rd, t); + } } break; case MO_32: if (fp_access_check(s)) { TCGv_i32 t = read_fp_sreg(s, a->rn); f->gen_s(t, t); - write_fp_sreg(s, a->rd, t); + if (merging) { + write_fp_sreg_merging(s, a->rd, a->rd, t); + } else { + write_fp_sreg(s, a->rd, t); + } } break; case MO_16: @@ -8347,7 +8716,11 @@ static bool do_fp1_scalar_int(DisasContext *s, arg_rr_e *a, if (fp_access_check(s)) { TCGv_i32 t = read_fp_hreg(s, a->rn); f->gen_h(t, t); - write_fp_sreg(s, a->rd, t); + if (merging) { + write_fp_hreg_merging(s, a->rd, a->rd, t); + } else { + write_fp_sreg(s, a->rd, t); + } } break; default: @@ -8356,26 +8729,43 @@ static bool do_fp1_scalar_int(DisasContext *s, arg_rr_e *a, return true; } +static bool do_fp1_scalar_int_2fn(DisasContext *s, arg_rr_e *a, + const FPScalar1Int *fnormal, + const FPScalar1Int *fah) +{ + return do_fp1_scalar_int(s, a, s->fpcr_ah ? 
fah : fnormal, true); +} + static const FPScalar1Int f_scalar_fmov = { tcg_gen_mov_i32, tcg_gen_mov_i32, tcg_gen_mov_i64, }; -TRANS(FMOV_s, do_fp1_scalar_int, a, &f_scalar_fmov) +TRANS(FMOV_s, do_fp1_scalar_int, a, &f_scalar_fmov, false) static const FPScalar1Int f_scalar_fabs = { gen_vfp_absh, gen_vfp_abss, gen_vfp_absd, }; -TRANS(FABS_s, do_fp1_scalar_int, a, &f_scalar_fabs) +static const FPScalar1Int f_scalar_ah_fabs = { + gen_vfp_ah_absh, + gen_vfp_ah_abss, + gen_vfp_ah_absd, +}; +TRANS(FABS_s, do_fp1_scalar_int_2fn, a, &f_scalar_fabs, &f_scalar_ah_fabs) static const FPScalar1Int f_scalar_fneg = { gen_vfp_negh, gen_vfp_negs, gen_vfp_negd, }; -TRANS(FNEG_s, do_fp1_scalar_int, a, &f_scalar_fneg) +static const FPScalar1Int f_scalar_ah_fneg = { + gen_vfp_ah_negh, + gen_vfp_ah_negs, + gen_vfp_ah_negd, +}; +TRANS(FNEG_s, do_fp1_scalar_int_2fn, a, &f_scalar_fneg, &f_scalar_ah_fneg) typedef struct FPScalar1 { void (*gen_h)(TCGv_i32, TCGv_i32, TCGv_ptr); @@ -8383,8 +8773,9 @@ typedef struct FPScalar1 { void (*gen_d)(TCGv_i64, TCGv_i64, TCGv_ptr); } FPScalar1; -static bool do_fp1_scalar(DisasContext *s, arg_rr_e *a, - const FPScalar1 *f, int rmode) +static bool do_fp1_scalar_with_fpsttype(DisasContext *s, arg_rr_e *a, + const FPScalar1 *f, int rmode, + ARMFPStatusFlavour fpsttype) { TCGv_i32 tcg_rmode = NULL; TCGv_ptr fpst; @@ -8396,7 +8787,7 @@ static bool do_fp1_scalar(DisasContext *s, arg_rr_e *a, return check == 0; } - fpst = fpstatus_ptr(a->esz == MO_16 ? FPST_A64_F16 : FPST_A64); + fpst = fpstatus_ptr(fpsttype); if (rmode >= 0) { tcg_rmode = gen_set_rmode(rmode, fpst); } @@ -8405,17 +8796,17 @@ static bool do_fp1_scalar(DisasContext *s, arg_rr_e *a, case MO_64: t64 = read_fp_dreg(s, a->rn); f->gen_d(t64, t64, fpst); - write_fp_dreg(s, a->rd, t64); + write_fp_dreg_merging(s, a->rd, a->rd, t64); break; case MO_32: t32 = read_fp_sreg(s, a->rn); f->gen_s(t32, t32, fpst); - write_fp_sreg(s, a->rd, t32); + write_fp_sreg_merging(s, a->rd, a->rd, t32); break; case MO_16: t32 = read_fp_hreg(s, a->rn); f->gen_h(t32, t32, fpst); - write_fp_sreg(s, a->rd, t32); + write_fp_hreg_merging(s, a->rd, a->rd, t32); break; default: g_assert_not_reached(); @@ -8427,6 +8818,20 @@ static bool do_fp1_scalar(DisasContext *s, arg_rr_e *a, return true; } +static bool do_fp1_scalar(DisasContext *s, arg_rr_e *a, + const FPScalar1 *f, int rmode) +{ + return do_fp1_scalar_with_fpsttype(s, a, f, rmode, + a->esz == MO_16 ? + FPST_A64_F16 : FPST_A64); +} + +static bool do_fp1_scalar_ah(DisasContext *s, arg_rr_e *a, + const FPScalar1 *f, int rmode) +{ + return do_fp1_scalar_with_fpsttype(s, a, f, rmode, select_ah_fpst(s, a->esz)); +} + static const FPScalar1 f_scalar_fsqrt = { gen_helper_vfp_sqrth, gen_helper_vfp_sqrts, @@ -8453,10 +8858,27 @@ static const FPScalar1 f_scalar_frintx = { }; TRANS(FRINTX_s, do_fp1_scalar, a, &f_scalar_frintx, -1) -static const FPScalar1 f_scalar_bfcvt = { - .gen_s = gen_helper_bfcvt, -}; -TRANS_FEAT(BFCVT_s, aa64_bf16, do_fp1_scalar, a, &f_scalar_bfcvt, -1) +static bool trans_BFCVT_s(DisasContext *s, arg_rr_e *a) +{ + ARMFPStatusFlavour fpsttype = s->fpcr_ah ? 
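+        /* FPST_AH carries the FPCR.AH=1 behaviours; its exception flags
+         * are deliberately never merged back into FPSR (see
+         * vfp_get_fpsr_from_host below). */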
FPST_AH : FPST_A64; + TCGv_i32 t32; + int check; + + if (!dc_isar_feature(aa64_bf16, s)) { + return false; + } + + check = fp_access_check_scalar_hsd(s, a->esz); + + if (check <= 0) { + return check == 0; + } + + t32 = read_fp_sreg(s, a->rn); + gen_helper_bfcvt(t32, t32, fpstatus_ptr(fpsttype)); + write_fp_hreg_merging(s, a->rd, a->rd, t32); + return true; +} static const FPScalar1 f_scalar_frint32 = { NULL, @@ -8481,21 +8903,35 @@ static const FPScalar1 f_scalar_frecpe = { gen_helper_recpe_f32, gen_helper_recpe_f64, }; -TRANS(FRECPE_s, do_fp1_scalar, a, &f_scalar_frecpe, -1) +static const FPScalar1 f_scalar_frecpe_rpres = { + gen_helper_recpe_f16, + gen_helper_recpe_rpres_f32, + gen_helper_recpe_f64, +}; +TRANS(FRECPE_s, do_fp1_scalar_ah, a, + s->fpcr_ah && dc_isar_feature(aa64_rpres, s) ? + &f_scalar_frecpe_rpres : &f_scalar_frecpe, -1) static const FPScalar1 f_scalar_frecpx = { gen_helper_frecpx_f16, gen_helper_frecpx_f32, gen_helper_frecpx_f64, }; -TRANS(FRECPX_s, do_fp1_scalar, a, &f_scalar_frecpx, -1) +TRANS(FRECPX_s, do_fp1_scalar_ah, a, &f_scalar_frecpx, -1) static const FPScalar1 f_scalar_frsqrte = { gen_helper_rsqrte_f16, gen_helper_rsqrte_f32, gen_helper_rsqrte_f64, }; -TRANS(FRSQRTE_s, do_fp1_scalar, a, &f_scalar_frsqrte, -1) +static const FPScalar1 f_scalar_frsqrte_rpres = { + gen_helper_rsqrte_f16, + gen_helper_rsqrte_rpres_f32, + gen_helper_rsqrte_f64, +}; +TRANS(FRSQRTE_s, do_fp1_scalar_ah, a, + s->fpcr_ah && dc_isar_feature(aa64_rpres, s) ? + &f_scalar_frsqrte_rpres : &f_scalar_frsqrte, -1) static bool trans_FCVT_s_ds(DisasContext *s, arg_rr *a) { @@ -8505,7 +8941,7 @@ static bool trans_FCVT_s_ds(DisasContext *s, arg_rr *a) TCGv_ptr fpst = fpstatus_ptr(FPST_A64); gen_helper_vfp_fcvtds(tcg_rd, tcg_rn, fpst); - write_fp_dreg(s, a->rd, tcg_rd); + write_fp_dreg_merging(s, a->rd, a->rd, tcg_rd); } return true; } @@ -8518,8 +8954,8 @@ static bool trans_FCVT_s_hs(DisasContext *s, arg_rr *a) TCGv_ptr fpst = fpstatus_ptr(FPST_A64); gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp); - /* write_fp_sreg is OK here because top half of result is zero */ - write_fp_sreg(s, a->rd, tmp); + /* write_fp_hreg_merging is OK here because top half of result is zero */ + write_fp_hreg_merging(s, a->rd, a->rd, tmp); } return true; } @@ -8532,7 +8968,7 @@ static bool trans_FCVT_s_sd(DisasContext *s, arg_rr *a) TCGv_ptr fpst = fpstatus_ptr(FPST_A64); gen_helper_vfp_fcvtsd(tcg_rd, tcg_rn, fpst); - write_fp_sreg(s, a->rd, tcg_rd); + write_fp_sreg_merging(s, a->rd, a->rd, tcg_rd); } return true; } @@ -8546,8 +8982,8 @@ static bool trans_FCVT_s_hd(DisasContext *s, arg_rr *a) TCGv_ptr fpst = fpstatus_ptr(FPST_A64); gen_helper_vfp_fcvt_f64_to_f16(tcg_rd, tcg_rn, fpst, ahp); - /* write_fp_sreg is OK here because top half of tcg_rd is zero */ - write_fp_sreg(s, a->rd, tcg_rd); + /* write_fp_hreg_merging is OK here because top half of tcg_rd is zero */ + write_fp_hreg_merging(s, a->rd, a->rd, tcg_rd); } return true; } @@ -8561,7 +8997,7 @@ static bool trans_FCVT_s_sh(DisasContext *s, arg_rr *a) TCGv_i32 tcg_ahp = get_ahp_flag(); gen_helper_vfp_fcvt_f16_to_f32(tcg_rd, tcg_rn, tcg_fpst, tcg_ahp); - write_fp_sreg(s, a->rd, tcg_rd); + write_fp_sreg_merging(s, a->rd, a->rd, tcg_rd); } return true; } @@ -8575,7 +9011,7 @@ static bool trans_FCVT_s_dh(DisasContext *s, arg_rr *a) TCGv_i32 tcg_ahp = get_ahp_flag(); gen_helper_vfp_fcvt_f16_to_f64(tcg_rd, tcg_rn, tcg_fpst, tcg_ahp); - write_fp_dreg(s, a->rd, tcg_rd); + write_fp_dreg_merging(s, a->rd, a->rd, tcg_rd); } return true; } @@ -8598,7 +9034,7 @@ static bool 
do_cvtf_scalar(DisasContext *s, MemOp esz, int rd, int shift, } else { gen_helper_vfp_uqtod(tcg_double, tcg_int, tcg_shift, tcg_fpstatus); } - write_fp_dreg(s, rd, tcg_double); + write_fp_dreg_merging(s, rd, rd, tcg_double); break; case MO_32: @@ -8608,7 +9044,7 @@ static bool do_cvtf_scalar(DisasContext *s, MemOp esz, int rd, int shift, } else { gen_helper_vfp_uqtos(tcg_single, tcg_int, tcg_shift, tcg_fpstatus); } - write_fp_sreg(s, rd, tcg_single); + write_fp_sreg_merging(s, rd, rd, tcg_single); break; case MO_16: @@ -8618,7 +9054,7 @@ static bool do_cvtf_scalar(DisasContext *s, MemOp esz, int rd, int shift, } else { gen_helper_vfp_uqtoh(tcg_single, tcg_int, tcg_shift, tcg_fpstatus); } - write_fp_sreg(s, rd, tcg_single); + write_fp_hreg_merging(s, rd, rd, tcg_single); break; default: @@ -8823,7 +9259,9 @@ static bool do_fcvt_f(DisasContext *s, arg_fcvt *a, do_fcvt_scalar(s, a->esz | (is_signed ? MO_SIGN : 0), a->esz, tcg_int, a->shift, a->rn, rmode); - clear_vec(s, a->rd); + if (!s->fpcr_nep) { + clear_vec(s, a->rd); + } write_vec_element(s, tcg_int, a->rd, 0, a->esz); return true; } @@ -9097,24 +9535,21 @@ static ArithOneOp * const f_scalar_uqxtn[] = { }; TRANS(UQXTN_s, do_2misc_narrow_scalar, a, f_scalar_uqxtn) -static void gen_fcvtxn_sd(TCGv_i64 d, TCGv_i64 n) +static bool trans_FCVTXN_s(DisasContext *s, arg_rr_e *a) { - /* - * 64 bit to 32 bit float conversion - * with von Neumann rounding (round to odd) - */ - TCGv_i32 tmp = tcg_temp_new_i32(); - gen_helper_fcvtx_f64_to_f32(tmp, n, fpstatus_ptr(FPST_A64)); - tcg_gen_extu_i32_i64(d, tmp); + if (fp_access_check(s)) { + /* + * 64 bit to 32 bit float conversion + * with von Neumann rounding (round to odd) + */ + TCGv_i64 src = read_fp_dreg(s, a->rn); + TCGv_i32 dst = tcg_temp_new_i32(); + gen_helper_fcvtx_f64_to_f32(dst, src, fpstatus_ptr(FPST_A64)); + write_fp_sreg_merging(s, a->rd, a->rd, dst); + } + return true; } -static ArithOneOp * const f_scalar_fcvtxn[] = { - NULL, - NULL, - gen_fcvtxn_sd, -}; -TRANS(FCVTXN_s, do_2misc_narrow_scalar, a, f_scalar_fcvtxn) - #undef WRAP_ENV static bool do_gvec_fn2(DisasContext *s, arg_qrr_e *a, GVecGen2Fn *fn) @@ -9216,11 +9651,27 @@ static void gen_fcvtn_sd(TCGv_i64 d, TCGv_i64 n) tcg_gen_extu_i32_i64(d, tmp); } +static void gen_fcvtxn_sd(TCGv_i64 d, TCGv_i64 n) +{ + /* + * 64 bit to 32 bit float conversion + * with von Neumann rounding (round to odd) + */ + TCGv_i32 tmp = tcg_temp_new_i32(); + gen_helper_fcvtx_f64_to_f32(tmp, n, fpstatus_ptr(FPST_A64)); + tcg_gen_extu_i32_i64(d, tmp); +} + static ArithOneOp * const f_vector_fcvtn[] = { NULL, gen_fcvtn_hs, gen_fcvtn_sd, }; +static ArithOneOp * const f_scalar_fcvtxn[] = { + NULL, + NULL, + gen_fcvtxn_sd, +}; TRANS(FCVTN_v, do_2misc_narrow_vector, a, f_vector_fcvtn) TRANS(FCVTXN_v, do_2misc_narrow_vector, a, f_scalar_fcvtxn) @@ -9232,12 +9683,27 @@ static void gen_bfcvtn_hs(TCGv_i64 d, TCGv_i64 n) tcg_gen_extu_i32_i64(d, tmp); } -static ArithOneOp * const f_vector_bfcvtn[] = { - NULL, - gen_bfcvtn_hs, - NULL, +static void gen_bfcvtn_ah_hs(TCGv_i64 d, TCGv_i64 n) +{ + TCGv_ptr fpst = fpstatus_ptr(FPST_AH); + TCGv_i32 tmp = tcg_temp_new_i32(); + gen_helper_bfcvt_pair(tmp, n, fpst); + tcg_gen_extu_i32_i64(d, tmp); +} + +static ArithOneOp * const f_vector_bfcvtn[2][3] = { + { + NULL, + gen_bfcvtn_hs, + NULL, + }, { + NULL, + gen_bfcvtn_ah_hs, + NULL, + } }; -TRANS_FEAT(BFCVTN_v, aa64_bf16, do_2misc_narrow_vector, a, f_vector_bfcvtn) +TRANS_FEAT(BFCVTN_v, aa64_bf16, do_2misc_narrow_vector, a, + f_vector_bfcvtn[s->fpcr_ah]) static bool 
trans_SHLL_v(DisasContext *s, arg_qrr_e *a) { @@ -9350,9 +9816,10 @@ TRANS_FEAT(FRINT64Z_v, aa64_frint, do_fp1_vector, a, &f_scalar_frint64, FPROUNDING_ZERO) TRANS_FEAT(FRINT64X_v, aa64_frint, do_fp1_vector, a, &f_scalar_frint64, -1) -static bool do_gvec_op2_fpst(DisasContext *s, MemOp esz, bool is_q, - int rd, int rn, int data, - gen_helper_gvec_2_ptr * const fns[3]) +static bool do_gvec_op2_fpst_with_fpsttype(DisasContext *s, MemOp esz, + bool is_q, int rd, int rn, int data, + gen_helper_gvec_2_ptr * const fns[3], + ARMFPStatusFlavour fpsttype) { int check = fp_access_check_vector_hsd(s, is_q, esz); TCGv_ptr fpst; @@ -9361,7 +9828,7 @@ static bool do_gvec_op2_fpst(DisasContext *s, MemOp esz, bool is_q, return check == 0; } - fpst = fpstatus_ptr(esz == MO_16 ? FPST_A64_F16 : FPST_A64); + fpst = fpstatus_ptr(fpsttype); tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), fpst, is_q ? 16 : 8, vec_full_reg_size(s), @@ -9369,6 +9836,23 @@ static bool do_gvec_op2_fpst(DisasContext *s, MemOp esz, bool is_q, return true; } +static bool do_gvec_op2_fpst(DisasContext *s, MemOp esz, bool is_q, + int rd, int rn, int data, + gen_helper_gvec_2_ptr * const fns[3]) +{ + return do_gvec_op2_fpst_with_fpsttype(s, esz, is_q, rd, rn, data, fns, + esz == MO_16 ? FPST_A64_F16 : + FPST_A64); +} + +static bool do_gvec_op2_ah_fpst(DisasContext *s, MemOp esz, bool is_q, + int rd, int rn, int data, + gen_helper_gvec_2_ptr * const fns[3]) +{ + return do_gvec_op2_fpst_with_fpsttype(s, esz, is_q, rd, rn, data, + fns, select_ah_fpst(s, esz)); +} + static gen_helper_gvec_2_ptr * const f_scvtf_v[] = { gen_helper_gvec_vcvt_sh, gen_helper_gvec_vcvt_sf, @@ -9478,14 +9962,26 @@ static gen_helper_gvec_2_ptr * const f_frecpe[] = { gen_helper_gvec_frecpe_s, gen_helper_gvec_frecpe_d, }; -TRANS(FRECPE_v, do_gvec_op2_fpst, a->esz, a->q, a->rd, a->rn, 0, f_frecpe) +static gen_helper_gvec_2_ptr * const f_frecpe_rpres[] = { + gen_helper_gvec_frecpe_h, + gen_helper_gvec_frecpe_rpres_s, + gen_helper_gvec_frecpe_d, +}; +TRANS(FRECPE_v, do_gvec_op2_ah_fpst, a->esz, a->q, a->rd, a->rn, 0, + s->fpcr_ah && dc_isar_feature(aa64_rpres, s) ? f_frecpe_rpres : f_frecpe) static gen_helper_gvec_2_ptr * const f_frsqrte[] = { gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s, gen_helper_gvec_frsqrte_d, }; -TRANS(FRSQRTE_v, do_gvec_op2_fpst, a->esz, a->q, a->rd, a->rn, 0, f_frsqrte) +static gen_helper_gvec_2_ptr * const f_frsqrte_rpres[] = { + gen_helper_gvec_frsqrte_h, + gen_helper_gvec_frsqrte_rpres_s, + gen_helper_gvec_frsqrte_d, +}; +TRANS(FRSQRTE_v, do_gvec_op2_ah_fpst, a->esz, a->q, a->rd, a->rn, 0, + s->fpcr_ah && dc_isar_feature(aa64_rpres, s) ? f_frsqrte_rpres : f_frsqrte) static bool trans_FCVTL_v(DisasContext *s, arg_qrr_e *a) { @@ -9655,6 +10151,8 @@ static void aarch64_tr_init_disas_context(DisasContextBase *dcbase, dc->nv2 = EX_TBFLAG_A64(tb_flags, NV2); dc->nv2_mem_e20 = EX_TBFLAG_A64(tb_flags, NV2_MEM_E20); dc->nv2_mem_be = EX_TBFLAG_A64(tb_flags, NV2_MEM_BE); + dc->fpcr_ah = EX_TBFLAG_A64(tb_flags, AH); + dc->fpcr_nep = EX_TBFLAG_A64(tb_flags, NEP); dc->vec_len = 0; dc->vec_stride = 0; dc->cp_regs = arm_cpu->cp_regs; diff --git a/target/arm/tcg/translate-a64.h b/target/arm/tcg/translate-a64.h index 0fcf7cb..7d3b59c 100644 --- a/target/arm/tcg/translate-a64.h +++ b/target/arm/tcg/translate-a64.h @@ -185,6 +185,19 @@ static inline TCGv_ptr pred_full_reg_ptr(DisasContext *s, int regno) return ret; } +/* + * Return the ARMFPStatusFlavour to use based on element size and + * whether FPCR.AH is set. 
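+ * With FPCR.AH set this is FPST_AH or FPST_AH_F16; otherwise the
+ * usual FPST_A64 or FPST_A64_F16.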
+ */ +static inline ARMFPStatusFlavour select_ah_fpst(DisasContext *s, MemOp esz) +{ + if (s->fpcr_ah) { + return esz == MO_16 ? FPST_AH_F16 : FPST_AH; + } else { + return esz == MO_16 ? FPST_A64_F16 : FPST_A64; + } +} + bool disas_sve(DisasContext *, uint32_t); bool disas_sme(DisasContext *, uint32_t); diff --git a/target/arm/tcg/translate-sve.c b/target/arm/tcg/translate-sve.c index e178833..d23be47 100644 --- a/target/arm/tcg/translate-sve.c +++ b/target/arm/tcg/translate-sve.c @@ -137,11 +137,11 @@ static bool gen_gvec_fpst_zz(DisasContext *s, gen_helper_gvec_2_ptr *fn, return true; } -static bool gen_gvec_fpst_arg_zz(DisasContext *s, gen_helper_gvec_2_ptr *fn, - arg_rr_esz *a, int data) +static bool gen_gvec_fpst_ah_arg_zz(DisasContext *s, gen_helper_gvec_2_ptr *fn, + arg_rr_esz *a, int data) { return gen_gvec_fpst_zz(s, fn, a->rd, a->rn, data, - a->esz == MO_16 ? FPST_A64_F16 : FPST_A64); + select_ah_fpst(s, a->esz)); } /* Invoke an out-of-line helper on 3 Zregs. */ @@ -194,6 +194,13 @@ static bool gen_gvec_fpst_arg_zzz(DisasContext *s, gen_helper_gvec_3_ptr *fn, a->esz == MO_16 ? FPST_A64_F16 : FPST_A64); } +static bool gen_gvec_fpst_ah_arg_zzz(DisasContext *s, gen_helper_gvec_3_ptr *fn, + arg_rrr_esz *a, int data) +{ + return gen_gvec_fpst_zzz(s, fn, a->rd, a->rn, a->rm, data, + select_ah_fpst(s, a->esz)); +} + /* Invoke an out-of-line helper on 4 Zregs. */ static bool gen_gvec_ool_zzzz(DisasContext *s, gen_helper_gvec_4 *fn, int rd, int rn, int rm, int ra, int data) @@ -776,13 +783,23 @@ static gen_helper_gvec_3 * const fabs_fns[4] = { NULL, gen_helper_sve_fabs_h, gen_helper_sve_fabs_s, gen_helper_sve_fabs_d, }; -TRANS_FEAT(FABS, aa64_sve, gen_gvec_ool_arg_zpz, fabs_fns[a->esz], a, 0) +static gen_helper_gvec_3 * const fabs_ah_fns[4] = { + NULL, gen_helper_sve_ah_fabs_h, + gen_helper_sve_ah_fabs_s, gen_helper_sve_ah_fabs_d, +}; +TRANS_FEAT(FABS, aa64_sve, gen_gvec_ool_arg_zpz, + s->fpcr_ah ? fabs_ah_fns[a->esz] : fabs_fns[a->esz], a, 0) static gen_helper_gvec_3 * const fneg_fns[4] = { NULL, gen_helper_sve_fneg_h, gen_helper_sve_fneg_s, gen_helper_sve_fneg_d, }; -TRANS_FEAT(FNEG, aa64_sve, gen_gvec_ool_arg_zpz, fneg_fns[a->esz], a, 0) +static gen_helper_gvec_3 * const fneg_ah_fns[4] = { + NULL, gen_helper_sve_ah_fneg_h, + gen_helper_sve_ah_fneg_s, gen_helper_sve_ah_fneg_d, +}; +TRANS_FEAT(FNEG, aa64_sve, gen_gvec_ool_arg_zpz, + s->fpcr_ah ? 
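+           /* AH=1 FNEG returns NaN inputs unchanged rather than
+            * flipping their sign */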
fneg_ah_fns[a->esz] : fneg_fns[a->esz], a, 0) static gen_helper_gvec_3 * const sxtb_fns[4] = { NULL, gen_helper_sve_sxtb_h, @@ -1221,14 +1238,14 @@ static gen_helper_gvec_2 * const fexpa_fns[4] = { gen_helper_sve_fexpa_s, gen_helper_sve_fexpa_d, }; TRANS_FEAT_NONSTREAMING(FEXPA, aa64_sve, gen_gvec_ool_zz, - fexpa_fns[a->esz], a->rd, a->rn, 0) + fexpa_fns[a->esz], a->rd, a->rn, s->fpcr_ah) static gen_helper_gvec_3 * const ftssel_fns[4] = { NULL, gen_helper_sve_ftssel_h, gen_helper_sve_ftssel_s, gen_helper_sve_ftssel_d, }; TRANS_FEAT_NONSTREAMING(FTSSEL, aa64_sve, gen_gvec_ool_arg_zzz, - ftssel_fns[a->esz], a, 0) + ftssel_fns[a->esz], a, s->fpcr_ah) /* *** SVE Predicate Logical Operations Group @@ -3507,21 +3524,24 @@ DO_SVE2_RRXR_ROT(CDOT_zzxw_d, gen_helper_sve2_cdot_idx_d) *** SVE Floating Point Multiply-Add Indexed Group */ -static bool do_FMLA_zzxz(DisasContext *s, arg_rrxr_esz *a, bool sub) -{ - static gen_helper_gvec_4_ptr * const fns[4] = { - NULL, - gen_helper_gvec_fmla_idx_h, - gen_helper_gvec_fmla_idx_s, - gen_helper_gvec_fmla_idx_d, - }; - return gen_gvec_fpst_zzzz(s, fns[a->esz], a->rd, a->rn, a->rm, a->ra, - (a->index << 1) | sub, - a->esz == MO_16 ? FPST_A64_F16 : FPST_A64); -} +static gen_helper_gvec_4_ptr * const fmla_idx_fns[4] = { + NULL, gen_helper_gvec_fmla_idx_h, + gen_helper_gvec_fmla_idx_s, gen_helper_gvec_fmla_idx_d +}; +TRANS_FEAT(FMLA_zzxz, aa64_sve, gen_gvec_fpst_zzzz, + fmla_idx_fns[a->esz], a->rd, a->rn, a->rm, a->ra, a->index, + a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) -TRANS_FEAT(FMLA_zzxz, aa64_sve, do_FMLA_zzxz, a, false) -TRANS_FEAT(FMLS_zzxz, aa64_sve, do_FMLA_zzxz, a, true) +static gen_helper_gvec_4_ptr * const fmls_idx_fns[4][2] = { + { NULL, NULL }, + { gen_helper_gvec_fmls_idx_h, gen_helper_gvec_ah_fmls_idx_h }, + { gen_helper_gvec_fmls_idx_s, gen_helper_gvec_ah_fmls_idx_s }, + { gen_helper_gvec_fmls_idx_d, gen_helper_gvec_ah_fmls_idx_d }, +}; +TRANS_FEAT(FMLS_zzxz, aa64_sve, gen_gvec_fpst_zzzz, + fmls_idx_fns[a->esz][s->fpcr_ah], + a->rd, a->rn, a->rm, a->ra, a->index, + a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) /* *** SVE Floating Point Multiply Indexed Group @@ -3581,11 +3601,23 @@ static bool do_reduce(DisasContext *s, arg_rpr_esz *a, }; \ TRANS_FEAT(NAME, aa64_sve, do_reduce, a, name##_fns[a->esz]) +#define DO_VPZ_AH(NAME, name) \ + static gen_helper_fp_reduce * const name##_fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d, \ + }; \ + static gen_helper_fp_reduce * const name##_ah_fns[4] = { \ + NULL, gen_helper_sve_ah_##name##_h, \ + gen_helper_sve_ah_##name##_s, gen_helper_sve_ah_##name##_d, \ + }; \ + TRANS_FEAT(NAME, aa64_sve, do_reduce, a, \ + s->fpcr_ah ? name##_ah_fns[a->esz] : name##_fns[a->esz]) + DO_VPZ(FADDV, faddv) DO_VPZ(FMINNMV, fminnmv) DO_VPZ(FMAXNMV, fmaxnmv) -DO_VPZ(FMINV, fminv) -DO_VPZ(FMAXV, fmaxv) +DO_VPZ_AH(FMINV, fminv) +DO_VPZ_AH(FMAXV, fmaxv) #undef DO_VPZ @@ -3597,13 +3629,25 @@ static gen_helper_gvec_2_ptr * const frecpe_fns[] = { NULL, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s, gen_helper_gvec_frecpe_d, }; -TRANS_FEAT(FRECPE, aa64_sve, gen_gvec_fpst_arg_zz, frecpe_fns[a->esz], a, 0) +static gen_helper_gvec_2_ptr * const frecpe_rpres_fns[] = { + NULL, gen_helper_gvec_frecpe_h, + gen_helper_gvec_frecpe_rpres_s, gen_helper_gvec_frecpe_d, +}; +TRANS_FEAT(FRECPE, aa64_sve, gen_gvec_fpst_ah_arg_zz, + s->fpcr_ah && dc_isar_feature(aa64_rpres, s) ? 
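+           /* the RPRES increased-precision estimate exists only for f32
+            * and only applies when FPCR.AH is also 1 */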
+ frecpe_rpres_fns[a->esz] : frecpe_fns[a->esz], a, 0) static gen_helper_gvec_2_ptr * const frsqrte_fns[] = { NULL, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s, gen_helper_gvec_frsqrte_d, }; -TRANS_FEAT(FRSQRTE, aa64_sve, gen_gvec_fpst_arg_zz, frsqrte_fns[a->esz], a, 0) +static gen_helper_gvec_2_ptr * const frsqrte_rpres_fns[] = { + NULL, gen_helper_gvec_frsqrte_h, + gen_helper_gvec_frsqrte_rpres_s, gen_helper_gvec_frsqrte_d, +}; +TRANS_FEAT(FRSQRTE, aa64_sve, gen_gvec_fpst_ah_arg_zz, + s->fpcr_ah && dc_isar_feature(aa64_rpres, s) ? + frsqrte_rpres_fns[a->esz] : frsqrte_fns[a->esz], a, 0) /* *** SVE Floating Point Compare with Zero Group @@ -3653,7 +3697,8 @@ static gen_helper_gvec_3_ptr * const ftmad_fns[4] = { gen_helper_sve_ftmad_s, gen_helper_sve_ftmad_d, }; TRANS_FEAT_NONSTREAMING(FTMAD, aa64_sve, gen_gvec_fpst_zzz, - ftmad_fns[a->esz], a->rd, a->rn, a->rm, a->imm, + ftmad_fns[a->esz], a->rd, a->rn, a->rm, + a->imm | (s->fpcr_ah << 3), a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) /* @@ -3707,11 +3752,23 @@ static bool trans_FADDA(DisasContext *s, arg_rprr_esz *a) }; \ TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_arg_zzz, name##_fns[a->esz], a, 0) +#define DO_FP3_AH(NAME, name) \ + static gen_helper_gvec_3_ptr * const name##_fns[4] = { \ + NULL, gen_helper_gvec_##name##_h, \ + gen_helper_gvec_##name##_s, gen_helper_gvec_##name##_d \ + }; \ + static gen_helper_gvec_3_ptr * const name##_ah_fns[4] = { \ + NULL, gen_helper_gvec_ah_##name##_h, \ + gen_helper_gvec_ah_##name##_s, gen_helper_gvec_ah_##name##_d \ + }; \ + TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_ah_arg_zzz, \ + s->fpcr_ah ? name##_ah_fns[a->esz] : name##_fns[a->esz], a, 0) + DO_FP3(FADD_zzz, fadd) DO_FP3(FSUB_zzz, fsub) DO_FP3(FMUL_zzz, fmul) -DO_FP3(FRECPS, recps) -DO_FP3(FRSQRTS, rsqrts) +DO_FP3_AH(FRECPS, recps) +DO_FP3_AH(FRSQRTS, rsqrts) #undef DO_FP3 @@ -3733,14 +3790,27 @@ TRANS_FEAT_NONSTREAMING(FTSMUL, aa64_sve, gen_gvec_fpst_arg_zzz, }; \ TRANS_FEAT(NAME, FEAT, gen_gvec_fpst_arg_zpzz, name##_zpzz_fns[a->esz], a) +#define DO_ZPZZ_AH_FP(NAME, FEAT, name, ah_name) \ + static gen_helper_gvec_4_ptr * const name##_zpzz_fns[4] = { \ + NULL, gen_helper_##name##_h, \ + gen_helper_##name##_s, gen_helper_##name##_d \ + }; \ + static gen_helper_gvec_4_ptr * const name##_ah_zpzz_fns[4] = { \ + NULL, gen_helper_##ah_name##_h, \ + gen_helper_##ah_name##_s, gen_helper_##ah_name##_d \ + }; \ + TRANS_FEAT(NAME, FEAT, gen_gvec_fpst_arg_zpzz, \ + s->fpcr_ah ? 
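+               /* FMIN, FMAX and FABD are the ops needing AH NaN handling */ \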
name##_ah_zpzz_fns[a->esz] : \ + name##_zpzz_fns[a->esz], a) + DO_ZPZZ_FP(FADD_zpzz, aa64_sve, sve_fadd) DO_ZPZZ_FP(FSUB_zpzz, aa64_sve, sve_fsub) DO_ZPZZ_FP(FMUL_zpzz, aa64_sve, sve_fmul) -DO_ZPZZ_FP(FMIN_zpzz, aa64_sve, sve_fmin) -DO_ZPZZ_FP(FMAX_zpzz, aa64_sve, sve_fmax) +DO_ZPZZ_AH_FP(FMIN_zpzz, aa64_sve, sve_fmin, sve_ah_fmin) +DO_ZPZZ_AH_FP(FMAX_zpzz, aa64_sve, sve_fmax, sve_ah_fmax) DO_ZPZZ_FP(FMINNM_zpzz, aa64_sve, sve_fminnum) DO_ZPZZ_FP(FMAXNM_zpzz, aa64_sve, sve_fmaxnum) -DO_ZPZZ_FP(FABD, aa64_sve, sve_fabd) +DO_ZPZZ_AH_FP(FABD, aa64_sve, sve_fabd, sve_ah_fabd) DO_ZPZZ_FP(FSCALE, aa64_sve, sve_fscalbn) DO_ZPZZ_FP(FDIV, aa64_sve, sve_fdiv) DO_ZPZZ_FP(FMULX, aa64_sve, sve_fmulx) @@ -3795,14 +3865,35 @@ static bool do_fp_imm(DisasContext *s, arg_rpri_esz *a, uint64_t imm, TRANS_FEAT(NAME##_zpzi, aa64_sve, do_fp_imm, a, \ name##_const[a->esz][a->imm], name##_fns[a->esz]) +#define DO_FP_AH_IMM(NAME, name, const0, const1) \ + static gen_helper_sve_fp2scalar * const name##_fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, \ + gen_helper_sve_##name##_d \ + }; \ + static gen_helper_sve_fp2scalar * const name##_ah_fns[4] = { \ + NULL, gen_helper_sve_ah_##name##_h, \ + gen_helper_sve_ah_##name##_s, \ + gen_helper_sve_ah_##name##_d \ + }; \ + static uint64_t const name##_const[4][2] = { \ + { -1, -1 }, \ + { float16_##const0, float16_##const1 }, \ + { float32_##const0, float32_##const1 }, \ + { float64_##const0, float64_##const1 }, \ + }; \ + TRANS_FEAT(NAME##_zpzi, aa64_sve, do_fp_imm, a, \ + name##_const[a->esz][a->imm], \ + s->fpcr_ah ? name##_ah_fns[a->esz] : name##_fns[a->esz]) + DO_FP_IMM(FADD, fadds, half, one) DO_FP_IMM(FSUB, fsubs, half, one) DO_FP_IMM(FMUL, fmuls, half, two) DO_FP_IMM(FSUBR, fsubrs, half, one) DO_FP_IMM(FMAXNM, fmaxnms, zero, one) DO_FP_IMM(FMINNM, fminnms, zero, one) -DO_FP_IMM(FMAX, fmaxs, zero, one) -DO_FP_IMM(FMIN, fmins, zero, one) +DO_FP_AH_IMM(FMAX, fmaxs, zero, one) +DO_FP_AH_IMM(FMIN, fmins, zero, one) #undef DO_FP_IMM @@ -3846,22 +3937,28 @@ static gen_helper_gvec_4_ptr * const fcadd_fns[] = { gen_helper_sve_fcadd_s, gen_helper_sve_fcadd_d, }; TRANS_FEAT(FCADD, aa64_sve, gen_gvec_fpst_zzzp, fcadd_fns[a->esz], - a->rd, a->rn, a->rm, a->pg, a->rot, + a->rd, a->rn, a->rm, a->pg, a->rot | (s->fpcr_ah << 1), a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) -#define DO_FMLA(NAME, name) \ +#define DO_FMLA(NAME, name, ah_name) \ static gen_helper_gvec_5_ptr * const name##_fns[4] = { \ NULL, gen_helper_sve_##name##_h, \ gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ }; \ - TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_zzzzp, name##_fns[a->esz], \ + static gen_helper_gvec_5_ptr * const name##_ah_fns[4] = { \ + NULL, gen_helper_sve_##ah_name##_h, \ + gen_helper_sve_##ah_name##_s, gen_helper_sve_##ah_name##_d \ + }; \ + TRANS_FEAT(NAME, aa64_sve, gen_gvec_fpst_zzzzp, \ + s->fpcr_ah ? name##_ah_fns[a->esz] : name##_fns[a->esz], \ a->rd, a->rn, a->rm, a->ra, a->pg, 0, \ a->esz == MO_16 ? 
FPST_A64_F16 : FPST_A64) -DO_FMLA(FMLA_zpzzz, fmla_zpzzz) -DO_FMLA(FMLS_zpzzz, fmls_zpzzz) -DO_FMLA(FNMLA_zpzzz, fnmla_zpzzz) -DO_FMLA(FNMLS_zpzzz, fnmls_zpzzz) +/* We don't need an ah_fmla_zpzzz because fmla doesn't negate anything */ +DO_FMLA(FMLA_zpzzz, fmla_zpzzz, fmla_zpzzz) +DO_FMLA(FMLS_zpzzz, fmls_zpzzz, ah_fmls_zpzzz) +DO_FMLA(FNMLA_zpzzz, fnmla_zpzzz, ah_fnmla_zpzzz) +DO_FMLA(FNMLS_zpzzz, fnmls_zpzzz, ah_fnmls_zpzzz) #undef DO_FMLA @@ -3870,7 +3967,7 @@ static gen_helper_gvec_5_ptr * const fcmla_fns[4] = { gen_helper_sve_fcmla_zpzzz_s, gen_helper_sve_fcmla_zpzzz_d, }; TRANS_FEAT(FCMLA_zpzzz, aa64_sve, gen_gvec_fpst_zzzzp, fcmla_fns[a->esz], - a->rd, a->rn, a->rm, a->ra, a->pg, a->rot, + a->rd, a->rn, a->rm, a->ra, a->pg, a->rot | (s->fpcr_ah << 2), a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) static gen_helper_gvec_4_ptr * const fcmla_idx_fns[4] = { @@ -3890,7 +3987,8 @@ TRANS_FEAT(FCVT_hs, aa64_sve, gen_gvec_fpst_arg_zpz, gen_helper_sve_fcvt_hs, a, 0, FPST_A64_F16) TRANS_FEAT(BFCVT, aa64_sve_bf16, gen_gvec_fpst_arg_zpz, - gen_helper_sve_bfcvt, a, 0, FPST_A64) + gen_helper_sve_bfcvt, a, 0, + s->fpcr_ah ? FPST_AH : FPST_A64) TRANS_FEAT(FCVT_dh, aa64_sve, gen_gvec_fpst_arg_zpz, gen_helper_sve_fcvt_dh, a, 0, FPST_A64) @@ -3993,7 +4091,7 @@ static gen_helper_gvec_3_ptr * const frecpx_fns[] = { gen_helper_sve_frecpx_s, gen_helper_sve_frecpx_d, }; TRANS_FEAT(FRECPX, aa64_sve, gen_gvec_fpst_arg_zpz, frecpx_fns[a->esz], - a, 0, a->esz == MO_16 ? FPST_A64_F16 : FPST_A64) + a, 0, select_ah_fpst(s, a->esz)) static gen_helper_gvec_3_ptr * const fsqrt_fns[] = { NULL, gen_helper_sve_fsqrt_h, @@ -7040,7 +7138,8 @@ TRANS_FEAT(FCVTNT_ds, aa64_sve2, gen_gvec_fpst_arg_zpz, gen_helper_sve2_fcvtnt_ds, a, 0, FPST_A64) TRANS_FEAT(BFCVTNT, aa64_sve_bf16, gen_gvec_fpst_arg_zpz, - gen_helper_sve_bfcvtnt, a, 0, FPST_A64) + gen_helper_sve_bfcvtnt, a, 0, + s->fpcr_ah ? FPST_AH : FPST_A64) TRANS_FEAT(FCVTLT_hs, aa64_sve2, gen_gvec_fpst_arg_zpz, gen_helper_sve2_fcvtlt_hs, a, 0, FPST_A64) @@ -7101,7 +7200,8 @@ TRANS_FEAT_NONSTREAMING(BFMMLA, aa64_sve_bf16, gen_gvec_env_arg_zzzz, static bool do_BFMLAL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sel) { return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlal, - a->rd, a->rn, a->rm, a->ra, sel, FPST_A64); + a->rd, a->rn, a->rm, a->ra, sel, + s->fpcr_ah ? FPST_AH : FPST_A64); } TRANS_FEAT(BFMLALB_zzzw, aa64_sve_bf16, do_BFMLAL_zzzw, a, false) @@ -7111,7 +7211,8 @@ static bool do_BFMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel) { return gen_gvec_fpst_zzzz(s, gen_helper_gvec_bfmlal_idx, a->rd, a->rn, a->rm, a->ra, - (a->index << 1) | sel, FPST_A64); + (a->index << 1) | sel, + s->fpcr_ah ? FPST_AH : FPST_A64); } TRANS_FEAT(BFMLALB_zzxw, aa64_sve_bf16, do_BFMLAL_zzxw, a, false) diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h index 084ee63..f8dc2f0 100644 --- a/target/arm/tcg/translate.h +++ b/target/arm/tcg/translate.h @@ -155,6 +155,10 @@ typedef struct DisasContext { bool nv2_mem_e20; /* True if NV2 enabled and NV2 RAM accesses are big-endian */ bool nv2_mem_be; + /* True if FPCR.AH is 1 (alternate floating point handling) */ + bool fpcr_ah; + /* True if FPCR.NEP is 1 (FEAT_AFP scalar upper-element result handling) */ + bool fpcr_nep; /* * >= 0, a copy of PSTATE.BTYPE, which will be 0 without v8.5-BTI. * < 0, set by the current instruction. @@ -666,66 +670,18 @@ static inline CPUARMTBFlags arm_tbflags_from_tb(const TranslationBlock *tb) return (CPUARMTBFlags){ tb->flags, tb->cs_base }; } -/* - * Enum for argument to fpstatus_ptr(). 
- */ -typedef enum ARMFPStatusFlavour { - FPST_A32, - FPST_A64, - FPST_A32_F16, - FPST_A64_F16, - FPST_STD, - FPST_STD_F16, -} ARMFPStatusFlavour; - /** * fpstatus_ptr: return TCGv_ptr to the specified fp_status field * * We have multiple softfloat float_status fields in the Arm CPU state struct * (see the comment in cpu.h for details). Return a TCGv_ptr which has * been set up to point to the requested field in the CPU state struct. - * The options are: - * - * FPST_A32 - * for AArch32 non-FP16 operations controlled by the FPCR - * FPST_A64 - * for AArch64 non-FP16 operations controlled by the FPCR - * FPST_A32_F16 - * for AArch32 operations controlled by the FPCR where FPCR.FZ16 is to be used - * FPST_A64_F16 - * for AArch64 operations controlled by the FPCR where FPCR.FZ16 is to be used - * FPST_STD - * for A32/T32 Neon operations using the "standard FPSCR value" - * FPST_STD_F16 - * as FPST_STD, but where FPCR.FZ16 is to be used */ static inline TCGv_ptr fpstatus_ptr(ARMFPStatusFlavour flavour) { TCGv_ptr statusptr = tcg_temp_new_ptr(); - int offset; - - switch (flavour) { - case FPST_A32: - offset = offsetof(CPUARMState, vfp.fp_status_a32); - break; - case FPST_A64: - offset = offsetof(CPUARMState, vfp.fp_status_a64); - break; - case FPST_A32_F16: - offset = offsetof(CPUARMState, vfp.fp_status_f16_a32); - break; - case FPST_A64_F16: - offset = offsetof(CPUARMState, vfp.fp_status_f16_a64); - break; - case FPST_STD: - offset = offsetof(CPUARMState, vfp.standard_fp_status); - break; - case FPST_STD_F16: - offset = offsetof(CPUARMState, vfp.standard_fp_status_f16); - break; - default: - g_assert_not_reached(); - } + int offset = offsetof(CPUARMState, vfp.fp_status[flavour]); + tcg_gen_addi_ptr(statusptr, tcg_env, offset); return statusptr; } diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index 7330b37..986eaf8 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -879,19 +879,21 @@ void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, float16 *d = vd; float16 *n = vn; float16 *m = vm; - uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = neg_real ^ 1; + bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); + bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1); uintptr_t i; - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 15; - neg_imag <<= 15; - for (i = 0; i < opr_sz / 2; i += 2) { float16 e0 = n[H2(i)]; - float16 e1 = m[H2(i + 1)] ^ neg_imag; + float16 e1 = m[H2(i + 1)]; float16 e2 = n[H2(i + 1)]; - float16 e3 = m[H2(i)] ^ neg_real; + float16 e3 = m[H2(i)]; + + if (rot) { + e3 = float16_maybe_ah_chs(e3, fpcr_ah); + } else { + e1 = float16_maybe_ah_chs(e1, fpcr_ah); + } d[H2(i)] = float16_add(e0, e1, fpst); d[H2(i + 1)] = float16_add(e2, e3, fpst); @@ -906,19 +908,21 @@ void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, float32 *d = vd; float32 *n = vn; float32 *m = vm; - uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = neg_real ^ 1; + bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); + bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1); uintptr_t i; - /* Shift boolean to the sign bit so we can xor to negate. 
*/ - neg_real <<= 31; - neg_imag <<= 31; - for (i = 0; i < opr_sz / 4; i += 2) { float32 e0 = n[H4(i)]; - float32 e1 = m[H4(i + 1)] ^ neg_imag; + float32 e1 = m[H4(i + 1)]; float32 e2 = n[H4(i + 1)]; - float32 e3 = m[H4(i)] ^ neg_real; + float32 e3 = m[H4(i)]; + + if (rot) { + e3 = float32_maybe_ah_chs(e3, fpcr_ah); + } else { + e1 = float32_maybe_ah_chs(e1, fpcr_ah); + } d[H4(i)] = float32_add(e0, e1, fpst); d[H4(i + 1)] = float32_add(e2, e3, fpst); @@ -933,19 +937,21 @@ void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, float64 *d = vd; float64 *n = vn; float64 *m = vm; - uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); - uint64_t neg_imag = neg_real ^ 1; + bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); + bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1); uintptr_t i; - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 63; - neg_imag <<= 63; - for (i = 0; i < opr_sz / 8; i += 2) { float64 e0 = n[i]; - float64 e1 = m[i + 1] ^ neg_imag; + float64 e1 = m[i + 1]; float64 e2 = n[i + 1]; - float64 e3 = m[i] ^ neg_real; + float64 e3 = m[i]; + + if (rot) { + e3 = float64_maybe_ah_chs(e3, fpcr_ah); + } else { + e1 = float64_maybe_ah_chs(e1, fpcr_ah); + } d[i] = float64_add(e0, e1, fpst); d[i + 1] = float64_add(e2, e3, fpst); @@ -959,22 +965,26 @@ void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, uintptr_t opr_sz = simd_oprsz(desc); float16 *d = vd, *n = vn, *m = vm, *a = va; intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint32_t neg_real = flip ^ neg_imag; + uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); + uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t negf_real = flip ^ negf_imag; + float16 negx_imag, negx_real; uintptr_t i; - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 15; - neg_imag <<= 15; + /* With AH=0, use negx; with AH=1 use negf. */ + negx_real = (negf_real & ~fpcr_ah) << 15; + negx_imag = (negf_imag & ~fpcr_ah) << 15; + negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); + negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); for (i = 0; i < opr_sz / 2; i += 2) { float16 e2 = n[H2(i + flip)]; - float16 e1 = m[H2(i + flip)] ^ neg_real; + float16 e1 = m[H2(i + flip)] ^ negx_real; float16 e4 = e2; - float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; + float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag; - d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); - d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); + d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst); + d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst); } clear_tail(d, opr_sz, simd_maxsz(desc)); } @@ -985,29 +995,33 @@ void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, uintptr_t opr_sz = simd_oprsz(desc); float16 *d = vd, *n = vn, *m = vm, *a = va; intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); - uint32_t neg_real = flip ^ neg_imag; + uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1); + uint32_t negf_real = flip ^ negf_imag; intptr_t elements = opr_sz / sizeof(float16); intptr_t eltspersegment = MIN(16 / sizeof(float16), elements); + float16 negx_imag, negx_real; intptr_t i, j; - /* Shift boolean to the sign bit so we can xor to negate. 
*/ - neg_real <<= 15; - neg_imag <<= 15; + /* With AH=0, use negx; with AH=1 use negf. */ + negx_real = (negf_real & ~fpcr_ah) << 15; + negx_imag = (negf_imag & ~fpcr_ah) << 15; + negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); + negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); for (i = 0; i < elements; i += eltspersegment) { float16 mr = m[H2(i + 2 * index + 0)]; float16 mi = m[H2(i + 2 * index + 1)]; - float16 e1 = neg_real ^ (flip ? mi : mr); - float16 e3 = neg_imag ^ (flip ? mr : mi); + float16 e1 = negx_real ^ (flip ? mi : mr); + float16 e3 = negx_imag ^ (flip ? mr : mi); for (j = i; j < i + eltspersegment; j += 2) { float16 e2 = n[H2(j + flip)]; float16 e4 = e2; - d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); - d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); + d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst); + d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst); } } clear_tail(d, opr_sz, simd_maxsz(desc)); @@ -1019,22 +1033,26 @@ void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, uintptr_t opr_sz = simd_oprsz(desc); float32 *d = vd, *n = vn, *m = vm, *a = va; intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint32_t neg_real = flip ^ neg_imag; + uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); + uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t negf_real = flip ^ negf_imag; + float32 negx_imag, negx_real; uintptr_t i; - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 31; - neg_imag <<= 31; + /* With AH=0, use negx; with AH=1 use negf. */ + negx_real = (negf_real & ~fpcr_ah) << 31; + negx_imag = (negf_imag & ~fpcr_ah) << 31; + negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); + negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); for (i = 0; i < opr_sz / 4; i += 2) { float32 e2 = n[H4(i + flip)]; - float32 e1 = m[H4(i + flip)] ^ neg_real; + float32 e1 = m[H4(i + flip)] ^ negx_real; float32 e4 = e2; - float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; + float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag; - d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); - d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); + d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst); + d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst); } clear_tail(d, opr_sz, simd_maxsz(desc)); } @@ -1045,29 +1063,33 @@ void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, uintptr_t opr_sz = simd_oprsz(desc); float32 *d = vd, *n = vn, *m = vm, *a = va; intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); - uint32_t neg_real = flip ^ neg_imag; + uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1); + uint32_t negf_real = flip ^ negf_imag; intptr_t elements = opr_sz / sizeof(float32); intptr_t eltspersegment = MIN(16 / sizeof(float32), elements); + float32 negx_imag, negx_real; intptr_t i, j; - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 31; - neg_imag <<= 31; + /* With AH=0, use negx; with AH=1 use negf. */ + negx_real = (negf_real & ~fpcr_ah) << 31; + negx_imag = (negf_imag & ~fpcr_ah) << 31; + negf_real = (negf_real & fpcr_ah ? 
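+    /* folding the negation into muladd (negf) keeps NaN operands'
+     * signs intact, as FPCR.AH=1 requires */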
float_muladd_negate_product : 0); + negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); for (i = 0; i < elements; i += eltspersegment) { float32 mr = m[H4(i + 2 * index + 0)]; float32 mi = m[H4(i + 2 * index + 1)]; - float32 e1 = neg_real ^ (flip ? mi : mr); - float32 e3 = neg_imag ^ (flip ? mr : mi); + float32 e1 = negx_real ^ (flip ? mi : mr); + float32 e3 = negx_imag ^ (flip ? mr : mi); for (j = i; j < i + eltspersegment; j += 2) { float32 e2 = n[H4(j + flip)]; float32 e4 = e2; - d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); - d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); + d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst); + d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst); } } clear_tail(d, opr_sz, simd_maxsz(desc)); @@ -1079,22 +1101,26 @@ void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, uintptr_t opr_sz = simd_oprsz(desc); float64 *d = vd, *n = vn, *m = vm, *a = va; intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); - uint64_t neg_real = flip ^ neg_imag; + uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); + uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t negf_real = flip ^ negf_imag; + float64 negx_real, negx_imag; uintptr_t i; - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 63; - neg_imag <<= 63; + /* With AH=0, use negx; with AH=1 use negf. */ + negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63; + negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63; + negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); + negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); for (i = 0; i < opr_sz / 8; i += 2) { float64 e2 = n[i + flip]; - float64 e1 = m[i + flip] ^ neg_real; + float64 e1 = m[i + flip] ^ negx_real; float64 e4 = e2; - float64 e3 = m[i + 1 - flip] ^ neg_imag; + float64 e3 = m[i + 1 - flip] ^ negx_imag; - d[i] = float64_muladd(e2, e1, a[i], 0, fpst); - d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); + d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst); + d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst); } clear_tail(d, opr_sz, simd_maxsz(desc)); } @@ -1210,10 +1236,12 @@ void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) +DO_2OP(gvec_frecpe_rpres_s, helper_recpe_rpres_f32, float32) DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) +DO_2OP(gvec_frsqrte_rpres_s, helper_rsqrte_rpres_f32, float32) DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) @@ -1302,6 +1330,25 @@ static float64 float64_abd(float64 op1, float64 op2, float_status *stat) return float64_abs(float64_sub(op1, op2, stat)); } +/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */ +static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat) +{ + float16 r = float16_sub(op1, op2, stat); + return float16_is_any_nan(r) ? r : float16_abs(r); +} + +static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat) +{ + float32 r = float32_sub(op1, op2, stat); + return float32_is_any_nan(r) ? 
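+           /* hand the NaN back unmodified; abs() would clear its sign bit */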
r : float32_abs(r); +} + +static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat) +{ + float64 r = float64_sub(op1, op2, stat); + return float64_is_any_nan(r) ? r : float64_abs(r); +} + /* * Reciprocal step. These are the AArch32 version which uses a * non-fused multiply-and-subtract. @@ -1389,6 +1436,10 @@ DO_3OP(gvec_fabd_h, float16_abd, float16) DO_3OP(gvec_fabd_s, float32_abd, float32) DO_3OP(gvec_fabd_d, float64_abd, float64) +DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16) +DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32) +DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64) + DO_3OP(gvec_fceq_h, float16_ceq, float16) DO_3OP(gvec_fceq_s, float32_ceq, float32) DO_3OP(gvec_fceq_d, float64_ceq, float64) @@ -1448,6 +1499,22 @@ DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) +DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16) +DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32) +DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64) + +DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16) +DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32) +DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64) + +DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16) +DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32) +DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64) + +DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16) +DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32) +DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64) + #endif #undef DO_3OP @@ -1513,6 +1580,24 @@ static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2, return float64_muladd(float64_chs(op1), op2, dest, 0, stat); } +static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2, + float_status *stat) +{ + return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat); +} + +static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2, + float_status *stat) +{ + return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat); +} + +static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2, + float_status *stat) +{ + return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat); +} + #define DO_MULADD(NAME, FUNC, TYPE) \ void HELPER(NAME)(void *vd, void *vn, void *vm, \ float_status *stat, uint32_t desc) \ @@ -1539,6 +1624,10 @@ DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64) +DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16) +DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32) +DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64) + /* For the indexed ops, SVE applies the index per 128-bit vector segment. * For AdvSIMD, there is of course only one such vector segment. 
*/ @@ -1635,29 +1724,35 @@ DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4) #undef DO_FMUL_IDX -#define DO_FMLA_IDX(NAME, TYPE, H) \ +#define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF) \ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ float_status *stat, uint32_t desc) \ { \ intptr_t i, j, oprsz = simd_oprsz(desc); \ intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ - TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ - intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ + intptr_t idx = simd_data(desc); \ TYPE *d = vd, *n = vn, *m = vm, *a = va; \ - op1_neg <<= (8 * sizeof(TYPE) - 1); \ for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ TYPE mm = m[H(i + idx)]; \ for (j = 0; j < segment; j++) { \ - d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ - mm, a[i + j], 0, stat); \ + d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm, \ + a[i + j], NEGF, stat); \ } \ } \ clear_tail(d, oprsz, simd_maxsz(desc)); \ } -DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) -DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) -DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) +DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0) +DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0) +DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0) + +DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0) +DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0) +DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0) + +DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product) +DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product) +DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product) #undef DO_FMLA_IDX @@ -2030,28 +2125,29 @@ static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) * as there is not yet SVE versions that might use blocking. */ -static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, - uint32_t desc, bool fz16) +static void do_fmlal(float32 *d, void *vn, void *vm, + CPUARMState *env, uint32_t desc, + ARMFPStatusFlavour fpst_idx, + uint64_t negx, int negf) { + float_status *fpst = &env->vfp.fp_status[fpst_idx]; + bool fz16 = env->vfp.fpcr & FPCR_FZ16; intptr_t i, oprsz = simd_oprsz(desc); - int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); int is_q = oprsz == 16; uint64_t n_4, m_4; - /* Pre-load all of the f16 data, avoiding overlap issues. */ - n_4 = load4_f16(vn, is_q, is_2); + /* + * Pre-load all of the f16 data, avoiding overlap issues. + * Negate all inputs for AH=0 FMLSL at once. + */ + n_4 = load4_f16(vn, is_q, is_2) ^ negx; m_4 = load4_f16(vm, is_q, is_2); - /* Negate all inputs for FMLSL at once. */ - if (is_s) { - n_4 ^= 0x8000800080008000ull; - } - for (i = 0; i < oprsz / 4; i++) { float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); - d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); + d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst); } clear_tail(d, oprsz, simd_maxsz(desc)); } @@ -2059,61 +2155,82 @@ static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, CPUARMState *env, uint32_t desc) { - do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, - get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); + bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); + uint64_t negx = is_s ? 
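+                   /* no FPCR.AH check here: FEAT_AFP does not exist for
+                    * AArch32, so FMLSL always negates via the f16 sign bits */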
0x8000800080008000ull : 0; + + do_fmlal(vd, vn, vm, env, desc, FPST_STD, negx, 0); } void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, CPUARMState *env, uint32_t desc) { - do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc, - get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); + bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); + uint64_t negx = 0; + int negf = 0; + + if (is_s) { + if (env->vfp.fpcr & FPCR_AH) { + negf = float_muladd_negate_product; + } else { + negx = 0x8000800080008000ull; + } + } + do_fmlal(vd, vn, vm, env, desc, FPST_A64, negx, negf); } void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, CPUARMState *env, uint32_t desc) { intptr_t i, oprsz = simd_oprsz(desc); - uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; + bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); - float_status *status = &env->vfp.fp_status_a64; - bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); + float_status *status = &env->vfp.fp_status[FPST_A64]; + bool fz16 = env->vfp.fpcr & FPCR_FZ16; + int negx = 0, negf = 0; + + if (is_s) { + if (env->vfp.fpcr & FPCR_AH) { + negf = float_muladd_negate_product; + } else { + negx = 0x8000; + } + } for (i = 0; i < oprsz; i += sizeof(float32)) { - float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; + float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx; float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); float32 nn = float16_to_float32_by_bits(nn_16, fz16); float32 mm = float16_to_float32_by_bits(mm_16, fz16); float32 aa = *(float32 *)(va + H1_4(i)); - *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); + *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status); } } -static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, - uint32_t desc, bool fz16) +static void do_fmlal_idx(float32 *d, void *vn, void *vm, + CPUARMState *env, uint32_t desc, + ARMFPStatusFlavour fpst_idx, + uint64_t negx, int negf) { + float_status *fpst = &env->vfp.fp_status[fpst_idx]; + bool fz16 = env->vfp.fpcr & FPCR_FZ16; intptr_t i, oprsz = simd_oprsz(desc); - int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); int is_q = oprsz == 16; uint64_t n_4; float32 m_1; - /* Pre-load all of the f16 data, avoiding overlap issues. */ - n_4 = load4_f16(vn, is_q, is_2); - - /* Negate all inputs for FMLSL at once. */ - if (is_s) { - n_4 ^= 0x8000800080008000ull; - } - + /* + * Pre-load all of the f16 data, avoiding overlap issues. + * Negate all inputs for AH=0 FMLSL at once. + */ + n_4 = load4_f16(vn, is_q, is_2) ^ negx; m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); for (i = 0; i < oprsz / 4; i++) { float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); - d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); + d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst); } clear_tail(d, oprsz, simd_maxsz(desc)); } @@ -2121,38 +2238,58 @@ static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, CPUARMState *env, uint32_t desc) { - do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, - get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); + bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); + uint64_t negx = is_s ? 
0x8000800080008000ull : 0; + + do_fmlal_idx(vd, vn, vm, env, desc, FPST_STD, negx, 0); } void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, CPUARMState *env, uint32_t desc) { - do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc, - get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); + bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); + uint64_t negx = 0; + int negf = 0; + + if (is_s) { + if (env->vfp.fpcr & FPCR_AH) { + negf = float_muladd_negate_product; + } else { + negx = 0x8000800080008000ull; + } + } + do_fmlal_idx(vd, vn, vm, env, desc, FPST_A64, negx, negf); } void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, CPUARMState *env, uint32_t desc) { intptr_t i, j, oprsz = simd_oprsz(desc); - uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; + bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); - float_status *status = &env->vfp.fp_status_a64; - bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); + float_status *status = &env->vfp.fp_status[FPST_A64]; + bool fz16 = env->vfp.fpcr & FPCR_FZ16; + int negx = 0, negf = 0; + if (is_s) { + if (env->vfp.fpcr & FPCR_AH) { + negf = float_muladd_negate_product; + } else { + negx = 0x8000; + } + } for (i = 0; i < oprsz; i += 16) { float16 mm_16 = *(float16 *)(vm + i + idx); float32 mm = float16_to_float32_by_bits(mm_16, fz16); for (j = 0; j < 16; j += sizeof(float32)) { - float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; + float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx; float32 nn = float16_to_float32_by_bits(nn_16, fz16); float32 aa = *(float32 *)(va + H1_4(i + j)); *(float32 *)(vd + H1_4(i + j)) = - float32_muladd(nn, mm, aa, 0, status); + float32_muladd(nn, mm, aa, negf, status); } } } @@ -2436,6 +2573,16 @@ DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) +#ifdef TARGET_AARCH64 +DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2) +DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4) +DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, ) + +DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2) +DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4) +DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, ) +#endif + #undef DO_3OP_PAIR #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ @@ -2808,7 +2955,7 @@ bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) */ bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; - *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32; + *statusp = env->vfp.fp_status[is_a64(env) ? FPST_A64 : FPST_A32]; set_default_nan_mode(true, statusp); if (ebf) { diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h index 094f5c1..6b93b5a 100644 --- a/target/arm/tcg/vec_internal.h +++ b/target/arm/tcg/vec_internal.h @@ -20,6 +20,8 @@ #ifndef TARGET_ARM_VEC_INTERNAL_H #define TARGET_ARM_VEC_INTERNAL_H +#include "fpu/softfloat.h" + /* * Note that vector data is stored in host-endian 64-bit chunks, * so addressing units smaller than that needs a host-endian fixup. @@ -265,4 +267,37 @@ float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, */ bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp); +/* + * Negate as for FPCR.AH=1 -- do not negate NaNs. 
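+ * A NaN input is returned unchanged rather than having its sign bit flipped.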
+ */ +static inline float16 float16_ah_chs(float16 a) +{ + return float16_is_any_nan(a) ? a : float16_chs(a); +} + +static inline float32 float32_ah_chs(float32 a) +{ + return float32_is_any_nan(a) ? a : float32_chs(a); +} + +static inline float64 float64_ah_chs(float64 a) +{ + return float64_is_any_nan(a) ? a : float64_chs(a); +} + +static inline float16 float16_maybe_ah_chs(float16 a, bool fpcr_ah) +{ + return fpcr_ah && float16_is_any_nan(a) ? a : float16_chs(a); +} + +static inline float32 float32_maybe_ah_chs(float32 a, bool fpcr_ah) +{ + return fpcr_ah && float32_is_any_nan(a) ? a : float32_chs(a); +} + +static inline float64 float64_maybe_ah_chs(float64 a, bool fpcr_ah) +{ + return fpcr_ah && float64_is_any_nan(a) ? a : float64_chs(a); +} + #endif /* TARGET_ARM_VEC_INTERNAL_H */ diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c index 3c8f3e6..5d42447 100644 --- a/target/arm/vfp_helper.c +++ b/target/arm/vfp_helper.c @@ -22,19 +22,63 @@ #include "exec/helper-proto.h" #include "internals.h" #include "cpu-features.h" +#include "fpu/softfloat.h" #ifdef CONFIG_TCG #include "qemu/log.h" -#include "fpu/softfloat.h" #endif /* VFP support. We follow the convention used for VFP instructions: Single precision routines have a "s" suffix, double precision a "d" suffix. */ +/* + * Set the float_status behaviour to match the Arm defaults: + * * tininess-before-rounding + * * 2-input NaN propagation prefers SNaN over QNaN, and then + * operand A over operand B (see FPProcessNaNs() pseudocode) + * * 3-input NaN propagation prefers SNaN over QNaN, and then + * operand C over A over B (see FPProcessNaNs3() pseudocode, + * but note that for QEMU muladd is a * b + c, whereas for + * the pseudocode function the arguments are in the order c, a, b. + * * 0 * Inf + NaN returns the default NaN if the input NaN is quiet, + * and the input NaN if it is signalling + * * Default NaN has sign bit clear, msb frac bit set + */ +void arm_set_default_fp_behaviours(float_status *s) +{ + set_float_detect_tininess(float_tininess_before_rounding, s); + set_float_ftz_detection(float_ftz_before_rounding, s); + set_float_2nan_prop_rule(float_2nan_prop_s_ab, s); + set_float_3nan_prop_rule(float_3nan_prop_s_cab, s); + set_float_infzeronan_rule(float_infzeronan_dnan_if_qnan, s); + set_float_default_nan_pattern(0b01000000, s); +} + +/* + * Set the float_status behaviour to match the FEAT_AFP + * FPCR.AH=1 requirements: + * * tininess-after-rounding + * * 2-input NaN propagation prefers the first NaN + * * 3-input NaN propagation prefers a over b over c + * * 0 * Inf + NaN always returns the input NaN and doesn't + * set Invalid for a QNaN + * * default NaN has sign bit set, msb frac bit set + */ +void arm_set_ah_fp_behaviours(float_status *s) +{ + set_float_detect_tininess(float_tininess_after_rounding, s); + set_float_ftz_detection(float_ftz_after_rounding, s); + set_float_2nan_prop_rule(float_2nan_prop_ab, s); + set_float_3nan_prop_rule(float_3nan_prop_abc, s); + set_float_infzeronan_rule(float_infzeronan_dnan_never | + float_infzeronan_suppress_invalid, s); + set_float_default_nan_pattern(0b11000000, s); +} + #ifdef CONFIG_TCG /* Convert host exception flags to vfp form. 
*/ -static inline uint32_t vfp_exceptbits_from_host(int host_bits) +static inline uint32_t vfp_exceptbits_from_host(int host_bits, bool ah) { uint32_t target_bits = 0; @@ -56,24 +100,52 @@ static inline uint32_t vfp_exceptbits_from_host(int host_bits) if (host_bits & float_flag_input_denormal_flushed) { target_bits |= FPSR_IDC; } + /* + * With FPCR.AH, IDC is set when an input denormal is used, + * and flushing an output denormal to zero sets both IXC and UFC. + */ + if (ah && (host_bits & float_flag_input_denormal_used)) { + target_bits |= FPSR_IDC; + } + if (ah && (host_bits & float_flag_output_denormal_flushed)) { + target_bits |= FPSR_IXC; + } return target_bits; } static uint32_t vfp_get_fpsr_from_host(CPUARMState *env) { - uint32_t i = 0; + uint32_t a32_flags = 0, a64_flags = 0; - i |= get_float_exception_flags(&env->vfp.fp_status_a32); - i |= get_float_exception_flags(&env->vfp.fp_status_a64); - i |= get_float_exception_flags(&env->vfp.standard_fp_status); + a32_flags |= get_float_exception_flags(&env->vfp.fp_status[FPST_A32]); + a32_flags |= get_float_exception_flags(&env->vfp.fp_status[FPST_STD]); /* FZ16 does not generate an input denormal exception. */ - i |= (get_float_exception_flags(&env->vfp.fp_status_f16_a32) + a32_flags |= (get_float_exception_flags(&env->vfp.fp_status[FPST_A32_F16]) & ~float_flag_input_denormal_flushed); - i |= (get_float_exception_flags(&env->vfp.fp_status_f16_a64) + a32_flags |= (get_float_exception_flags(&env->vfp.fp_status[FPST_STD_F16]) & ~float_flag_input_denormal_flushed); - i |= (get_float_exception_flags(&env->vfp.standard_fp_status_f16) - & ~float_flag_input_denormal_flushed); - return vfp_exceptbits_from_host(i); + + a64_flags |= get_float_exception_flags(&env->vfp.fp_status[FPST_A64]); + a64_flags |= (get_float_exception_flags(&env->vfp.fp_status[FPST_A64_F16]) + & ~(float_flag_input_denormal_flushed | float_flag_input_denormal_used)); + /* + * We do not merge in flags from FPST_AH or FPST_AH_F16, because + * they are used for insns that must not set the cumulative exception bits. + */ + + /* + * Flushing an input denormal *only* because FPCR.FIZ == 1 does + * not set FPSR.IDC; if FPCR.FZ is also set then this takes + * precedence and IDC is set (see the FPUnpackBase pseudocode). + * So squash it unless (FPCR.AH == 0 && FPCR.FZ == 1). + * We only do this for the a64 flags because FIZ has no effect + * on AArch32 even if it is set. + */ + if ((env->vfp.fpcr & (FPCR_FZ | FPCR_AH)) != FPCR_FZ) { + a64_flags &= ~float_flag_input_denormal_flushed; + } + return vfp_exceptbits_from_host(a64_flags, env->vfp.fpcr & FPCR_AH) | + vfp_exceptbits_from_host(a32_flags, false); } static void vfp_clear_float_status_exc_flags(CPUARMState *env) @@ -83,12 +155,25 @@ static void vfp_clear_float_status_exc_flags(CPUARMState *env) * values. The caller should have arranged for env->vfp.fpsr to * be the architecturally up-to-date exception flag information first. 
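The IDC-squashing guard in vfp_get_fpsr_from_host() above keeps float_flag_input_denormal_flushed only when FZ caused the flush architecturally; a stand-alone truth table of that test (hypothetical demo, not patch code):

#include <stdbool.h>
#include <stdio.h>

/* Mirrors (fpcr & (FPCR_FZ | FPCR_AH)) == FPCR_FZ: the flag survives
 * only when FPCR.FZ is set and FPCR.AH is clear; a flush caused by
 * FPCR.FIZ alone (FZ=0) must not become FPSR.IDC. */
static bool idc_reported(bool fz, bool ah)
{
    return fz && !ah;
}

int main(void)
{
    for (int fz = 0; fz < 2; fz++) {
        for (int ah = 0; ah < 2; ah++) {
            printf("FZ=%d AH=%d -> IDC %s\n", fz, ah,
                   idc_reported(fz, ah) ? "reported" : "squashed");
        }
    }
    return 0;
}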
*/ - set_float_exception_flags(0, &env->vfp.fp_status_a32); - set_float_exception_flags(0, &env->vfp.fp_status_a64); - set_float_exception_flags(0, &env->vfp.fp_status_f16_a32); - set_float_exception_flags(0, &env->vfp.fp_status_f16_a64); - set_float_exception_flags(0, &env->vfp.standard_fp_status); - set_float_exception_flags(0, &env->vfp.standard_fp_status_f16); + set_float_exception_flags(0, &env->vfp.fp_status[FPST_A32]); + set_float_exception_flags(0, &env->vfp.fp_status[FPST_A64]); + set_float_exception_flags(0, &env->vfp.fp_status[FPST_A32_F16]); + set_float_exception_flags(0, &env->vfp.fp_status[FPST_A64_F16]); + set_float_exception_flags(0, &env->vfp.fp_status[FPST_STD]); + set_float_exception_flags(0, &env->vfp.fp_status[FPST_STD_F16]); + set_float_exception_flags(0, &env->vfp.fp_status[FPST_AH]); + set_float_exception_flags(0, &env->vfp.fp_status[FPST_AH_F16]); +} + +static void vfp_sync_and_clear_float_status_exc_flags(CPUARMState *env) +{ + /* + * Synchronize any pending exception-flag information in the + * float_status values into env->vfp.fpsr, and then clear out + * the float_status data. + */ + env->vfp.fpsr |= vfp_get_fpsr_from_host(env); + vfp_clear_float_status_exc_flags(env); } static void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask) @@ -113,33 +198,66 @@ static void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask) i = float_round_to_zero; break; } - set_float_rounding_mode(i, &env->vfp.fp_status_a32); - set_float_rounding_mode(i, &env->vfp.fp_status_a64); - set_float_rounding_mode(i, &env->vfp.fp_status_f16_a32); - set_float_rounding_mode(i, &env->vfp.fp_status_f16_a64); + set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A32]); + set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A64]); + set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A32_F16]); + set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A64_F16]); } if (changed & FPCR_FZ16) { bool ftz_enabled = val & FPCR_FZ16; - set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a32); - set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a64); - set_flush_to_zero(ftz_enabled, &env->vfp.standard_fp_status_f16); - set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a32); - set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a64); - set_flush_inputs_to_zero(ftz_enabled, &env->vfp.standard_fp_status_f16); + set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32_F16]); + set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64_F16]); + set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_STD_F16]); + set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_AH_F16]); + set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32_F16]); + set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64_F16]); + set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_STD_F16]); + set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_AH_F16]); } if (changed & FPCR_FZ) { bool ftz_enabled = val & FPCR_FZ; - set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_a32); - set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_a32); - set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_a64); - set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_a64); + set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32]); + set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64]); + /* FIZ is A64 only so FZ always makes A32 code flush inputs to zero */ + set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32]); + 
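For reference, the RMode switch at the top of vfp_set_fpcr_to_host() (largely elided by the diff context above) maps the architectural FPCR.RMode field, bits [23:22], onto softfloat's rounding modes; a sketch using the enum names from QEMU's fpu/softfloat-types.h:

#include "fpu/softfloat-types.h"   /* QEMU-internal header */

static FloatRoundMode fpcr_rmode_to_softfloat(unsigned rmode)
{
    switch (rmode & 3) {
    case 0:  return float_round_nearest_even; /* RN */
    case 1:  return float_round_up;           /* RP */
    case 2:  return float_round_down;         /* RM */
    default: return float_round_to_zero;      /* RZ, the case visible above */
    }
}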
} + if (changed & (FPCR_FZ | FPCR_AH | FPCR_FIZ)) { + /* + * A64: Flush denormalized inputs to zero if FPCR.FIZ = 1, or + * both FPCR.AH = 0 and FPCR.FZ = 1. + */ + bool fitz_enabled = (val & FPCR_FIZ) || + (val & (FPCR_FZ | FPCR_AH)) == FPCR_FZ; + set_flush_inputs_to_zero(fitz_enabled, &env->vfp.fp_status[FPST_A64]); } if (changed & FPCR_DN) { bool dnan_enabled = val & FPCR_DN; - set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_a32); - set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_a64); - set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_f16_a32); - set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_f16_a64); + set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_A32]); + set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_A64]); + set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_A32_F16]); + set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_A64_F16]); + set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_AH]); + set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_AH_F16]); + } + if (changed & FPCR_AH) { + bool ah_enabled = val & FPCR_AH; + + if (ah_enabled) { + /* Change behaviours for A64 FP operations */ + arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_A64]); + arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_A64_F16]); + } else { + arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64]); + arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64_F16]); + } + } + /* + * If any bits changed that we look at in vfp_get_fpsr_from_host(), + * we must sync the float_status flags into vfp.fpsr now (under the + * old regime) before we update vfp.fpcr. + */ + if (changed & (FPCR_FZ | FPCR_AH | FPCR_FIZ)) { + vfp_sync_and_clear_float_status_exc_flags(env); } } @@ -242,6 +360,9 @@ static void vfp_set_fpcr_masked(CPUARMState *env, uint32_t val, uint32_t mask) if (!cpu_isar_feature(any_fp16, cpu)) { val &= ~FPCR_FZ16; } + if (!cpu_isar_feature(aa64_afp, cpu)) { + val &= ~(FPCR_FIZ | FPCR_AH | FPCR_NEP); + } if (!cpu_isar_feature(aa64_ebf16, cpu)) { val &= ~FPCR_EBF; @@ -271,12 +392,14 @@ static void vfp_set_fpcr_masked(CPUARMState *env, uint32_t val, uint32_t mask) * We don't implement trapped exception handling, so the * trap enable bits, IDE|IXE|UFE|OFE|DZE|IOE are all RAZ/WI (not RES0!) * - * The FPCR bits we keep in vfp.fpcr are AHP, DN, FZ, RMode, EBF - * and FZ16. Len, Stride and LTPSIZE we just handled. Store those bits + * The FPCR bits we keep in vfp.fpcr are AHP, DN, FZ, RMode, EBF, FZ16, + * FIZ, AH, and NEP. + * Len, Stride and LTPSIZE we just handled. Store those bits * there, and zero any of the other FPCR bits and the RES0 and RAZ/WI * bits. 
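The fitz_enabled expression above is the whole of the A64 input-flushing rule; spelled out as a stand-alone predicate with its truth table (hypothetical demo code, not part of the patch):

#include <stdbool.h>

/* Inputs are flushed to zero when FPCR.FIZ is set, or when FPCR.FZ
 * is set while FPCR.AH is clear. */
static bool a64_flush_inputs(bool fz, bool ah, bool fiz)
{
    return fiz || (fz && !ah);
}
/*
 *  FZ AH FIZ -> flush inputs
 *   0  0  0     no
 *   1  0  0     yes  (legacy FZ behaviour)
 *   1  1  0     no   (AH decouples FZ from input flushing)
 *   x  x  1     yes  (FIZ forces it)
 */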
*/ - val &= FPCR_AHP | FPCR_DN | FPCR_FZ | FPCR_RMODE_MASK | FPCR_FZ16 | FPCR_EBF; + val &= FPCR_AHP | FPCR_DN | FPCR_FZ | FPCR_RMODE_MASK | FPCR_FZ16 | + FPCR_EBF | FPCR_FIZ | FPCR_AH | FPCR_NEP; env->vfp.fpcr &= ~mask; env->vfp.fpcr |= val; } @@ -366,16 +489,16 @@ static void softfloat_to_vfp_compare(CPUARMState *env, FloatRelation cmp) void VFP_HELPER(cmp, P)(ARGTYPE a, ARGTYPE b, CPUARMState *env) \ { \ softfloat_to_vfp_compare(env, \ - FLOATTYPE ## _compare_quiet(a, b, &env->vfp.FPST)); \ + FLOATTYPE ## _compare_quiet(a, b, &env->vfp.fp_status[FPST])); \ } \ void VFP_HELPER(cmpe, P)(ARGTYPE a, ARGTYPE b, CPUARMState *env) \ { \ softfloat_to_vfp_compare(env, \ - FLOATTYPE ## _compare(a, b, &env->vfp.FPST)); \ + FLOATTYPE ## _compare(a, b, &env->vfp.fp_status[FPST])); \ } -DO_VFP_cmp(h, float16, dh_ctype_f16, fp_status_f16_a32) -DO_VFP_cmp(s, float32, float32, fp_status_a32) -DO_VFP_cmp(d, float64, float64, fp_status_a32) +DO_VFP_cmp(h, float16, dh_ctype_f16, FPST_A32_F16) +DO_VFP_cmp(s, float32, float32, FPST_A32) +DO_VFP_cmp(d, float64, float64, FPST_A32) #undef DO_VFP_cmp /* Integer to float and float to integer conversions */ @@ -611,6 +734,33 @@ static int recip_estimate(int input) } /* + * Increased precision version: + * input is a 13 bit fixed point number + * input range 2048 .. 4095 for a number from 0.5 <= x < 1.0. + * result range 4096 .. 8191 for a number from 1.0 to 2.0 + */ +static int recip_estimate_incprec(int input) +{ + int a, b, r; + assert(2048 <= input && input < 4096); + a = (input * 2) + 1; + /* + * The pseudocode expresses this as an operation on infinite + * precision reals where it calculates 2^25 / a and then looks + * at the error between that and the rounded-down-to-integer + * value to see if it should instead round up. We instead + * follow the same approach as the pseudocode for the 8-bit + * precision version, and calculate (2 * (2^25 / a)) as an + * integer so we can do the "add one and halve" to round it. + * So the 1 << 26 here is correct. + */ + b = (1 << 26) / a; + r = (b + 1) >> 1; + assert(4096 <= r && r < 8192); + return r; +} + +/* * Common wrapper to call recip_estimate * * The parameters are exponent and 64 bit fraction (without implicit @@ -619,7 +769,8 @@ static int recip_estimate(int input) * callee. 
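The "add one and halve" in recip_estimate_incprec() above can be checked in isolation: for the odd divisors a = 2*input + 1 that the function uses, ((2^26 / a) + 1) >> 1 equals 2^25 / a rounded to the nearest integer (a stand-alone check, not patch code):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    for (int64_t a = 4097; a <= 8191; a += 2) {
        int64_t r = (((INT64_C(1) << 26) / a) + 1) >> 1;
        /* floor(2^25/a + 1/2), the round-to-nearest reference */
        int64_t nearest = ((INT64_C(1) << 26) + a) / (2 * a);
        assert(r == nearest);
    }
    return 0;
}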
*/ -static uint64_t call_recip_estimate(int *exp, int exp_off, uint64_t frac) +static uint64_t call_recip_estimate(int *exp, int exp_off, uint64_t frac, + bool increasedprecision) { uint32_t scaled, estimate; uint64_t result_frac; @@ -635,12 +786,22 @@ static uint64_t call_recip_estimate(int *exp, int exp_off, uint64_t frac) } } - /* scaled = UInt('1':fraction<51:44>) */ - scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8)); - estimate = recip_estimate(scaled); + if (increasedprecision) { + /* scaled = UInt('1':fraction<51:41>) */ + scaled = deposit32(1 << 11, 0, 11, extract64(frac, 41, 11)); + estimate = recip_estimate_incprec(scaled); + } else { + /* scaled = UInt('1':fraction<51:44>) */ + scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8)); + estimate = recip_estimate(scaled); + } result_exp = exp_off - *exp; - result_frac = deposit64(0, 44, 8, estimate); + if (increasedprecision) { + result_frac = deposit64(0, 40, 12, estimate); + } else { + result_frac = deposit64(0, 44, 8, estimate); + } if (result_exp == 0) { result_frac = deposit64(result_frac >> 1, 51, 1, 1); } else if (result_exp == -1) { @@ -709,7 +870,7 @@ uint32_t HELPER(recpe_f16)(uint32_t input, float_status *fpst) } f64_frac = call_recip_estimate(&f16_exp, 29, - ((uint64_t) f16_frac) << (52 - 10)); + ((uint64_t) f16_frac) << (52 - 10), false); /* result = sign : result_exp<4:0> : fraction<51:42> */ f16_val = deposit32(0, 15, 1, f16_sign); @@ -718,7 +879,11 @@ uint32_t HELPER(recpe_f16)(uint32_t input, float_status *fpst) return make_float16(f16_val); } -float32 HELPER(recpe_f32)(float32 input, float_status *fpst) +/* + * FEAT_RPRES means the f32 FRECPE has an "increased precision" variant + * which is used when FPCR.AH == 1. + */ +static float32 do_recpe_f32(float32 input, float_status *fpst, bool rpres) { float32 f32 = float32_squash_input_denormal(input, fpst); uint32_t f32_val = float32_val(f32); @@ -758,7 +923,7 @@ float32 HELPER(recpe_f32)(float32 input, float_status *fpst) } f64_frac = call_recip_estimate(&f32_exp, 253, - ((uint64_t) f32_frac) << (52 - 23)); + ((uint64_t) f32_frac) << (52 - 23), rpres); /* result = sign : result_exp<7:0> : fraction<51:29> */ f32_val = deposit32(0, 31, 1, f32_sign); @@ -767,6 +932,16 @@ float32 HELPER(recpe_f32)(float32 input, float_status *fpst) return make_float32(f32_val); } +float32 HELPER(recpe_f32)(float32 input, float_status *fpst) +{ + return do_recpe_f32(input, fpst, false); +} + +float32 HELPER(recpe_rpres_f32)(float32 input, float_status *fpst) +{ + return do_recpe_f32(input, fpst, true); +} + float64 HELPER(recpe_f64)(float64 input, float_status *fpst) { float64 f64 = float64_squash_input_denormal(input, fpst); @@ -806,7 +981,7 @@ float64 HELPER(recpe_f64)(float64 input, float_status *fpst) return float64_set_sign(float64_zero, float64_is_neg(f64)); } - f64_frac = call_recip_estimate(&f64_exp, 2045, f64_frac); + f64_frac = call_recip_estimate(&f64_exp, 2045, f64_frac, false); /* result = sign : result_exp<10:0> : fraction<51:0>; */ f64_val = deposit64(0, 63, 1, f64_sign); @@ -840,8 +1015,36 @@ static int do_recip_sqrt_estimate(int a) return estimate; } +static int do_recip_sqrt_estimate_incprec(int a) +{ + /* + * The Arm ARM describes the 12-bit precision version of RecipSqrtEstimate + * in terms of an infinite-precision floating point calculation of a + * square root. We implement this using the same kind of pure integer + * algorithm as the 8-bit mantissa, to get the same bit-for-bit result. 
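In call_recip_estimate() above, the pseudocode's scaled = UInt('1':fraction<51:41>) is the implicit leading one glued onto the top 11 stored fraction bits; the deposit32/extract64 pair is equivalent to this plain-C helper (hypothetical name, not patch code):

#include <stdint.h>

static uint32_t scaled_incprec(uint64_t frac52)
{
    /* yields 2048..4095, the fixed-point form of 0.5 <= x < 1.0 */
    return (1u << 11) | (uint32_t)((frac52 >> 41) & 0x7ff);
}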
+ */ + int64_t b, estimate; + + assert(1024 <= a && a < 4096); + if (a < 2048) { + a = a * 2 + 1; + } else { + a = (a >> 1) << 1; + a = (a + 1) * 2; + } + b = 8192; + while (a * (b + 1) * (b + 1) < (1ULL << 39)) { + b += 1; + } + estimate = (b + 1) / 2; + + assert(4096 <= estimate && estimate < 8192); -static uint64_t recip_sqrt_estimate(int *exp , int exp_off, uint64_t frac) + return estimate; +} + +static uint64_t recip_sqrt_estimate(int *exp , int exp_off, uint64_t frac, + bool increasedprecision) { int estimate; uint32_t scaled; @@ -854,17 +1057,32 @@ static uint64_t recip_sqrt_estimate(int *exp , int exp_off, uint64_t frac) frac = extract64(frac, 0, 51) << 1; } - if (*exp & 1) { - /* scaled = UInt('01':fraction<51:45>) */ - scaled = deposit32(1 << 7, 0, 7, extract64(frac, 45, 7)); + if (increasedprecision) { + if (*exp & 1) { + /* scaled = UInt('01':fraction<51:42>) */ + scaled = deposit32(1 << 10, 0, 10, extract64(frac, 42, 10)); + } else { + /* scaled = UInt('1':fraction<51:41>) */ + scaled = deposit32(1 << 11, 0, 11, extract64(frac, 41, 11)); + } + estimate = do_recip_sqrt_estimate_incprec(scaled); } else { - /* scaled = UInt('1':fraction<51:44>) */ - scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8)); + if (*exp & 1) { + /* scaled = UInt('01':fraction<51:45>) */ + scaled = deposit32(1 << 7, 0, 7, extract64(frac, 45, 7)); + } else { + /* scaled = UInt('1':fraction<51:44>) */ + scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8)); + } + estimate = do_recip_sqrt_estimate(scaled); } - estimate = do_recip_sqrt_estimate(scaled); *exp = (exp_off - *exp) / 2; - return extract64(estimate, 0, 8) << 44; + if (increasedprecision) { + return extract64(estimate, 0, 12) << 40; + } else { + return extract64(estimate, 0, 8) << 44; + } } uint32_t HELPER(rsqrte_f16)(uint32_t input, float_status *s) @@ -903,7 +1121,7 @@ uint32_t HELPER(rsqrte_f16)(uint32_t input, float_status *s) f64_frac = ((uint64_t) f16_frac) << (52 - 10); - f64_frac = recip_sqrt_estimate(&f16_exp, 44, f64_frac); + f64_frac = recip_sqrt_estimate(&f16_exp, 44, f64_frac, false); /* result = sign : result_exp<4:0> : estimate<7:0> : Zeros(2) */ val = deposit32(0, 15, 1, f16_sign); @@ -912,7 +1130,11 @@ uint32_t HELPER(rsqrte_f16)(uint32_t input, float_status *s) return make_float16(val); } -float32 HELPER(rsqrte_f32)(float32 input, float_status *s) +/* + * FEAT_RPRES means the f32 FRSQRTE has an "increased precision" variant + * which is used when FPCR.AH == 1. 
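The search loop in do_recip_sqrt_estimate_incprec() above leaves b as the integer square root of 2^39 / a, i.e. the unique value with a*b^2 < 2^39 <= a*(b+1)^2. A stand-alone check over the operand range the helper can produce (hypothetical demo, not patch code):

#include <assert.h>
#include <stdint.h>

static int64_t search_b(int64_t a)
{
    int64_t b = 8192;
    while (a * (b + 1) * (b + 1) < (INT64_C(1) << 39)) {
        b += 1;
    }
    return b;
}

int main(void)
{
    /* the transformed operand computed by the helper lies in [2049, 8190] */
    for (int64_t a = 2049; a <= 8190; a++) {
        int64_t b = search_b(a);
        assert(a * b * b < (INT64_C(1) << 39));
        assert(a * (b + 1) * (b + 1) >= (INT64_C(1) << 39));
    }
    return 0;
}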
+ */ +static float32 do_rsqrte_f32(float32 input, float_status *s, bool rpres) { float32 f32 = float32_squash_input_denormal(input, s); uint32_t val = float32_val(f32); @@ -948,15 +1170,33 @@ float32 HELPER(rsqrte_f32)(float32 input, float_status *s) f64_frac = ((uint64_t) f32_frac) << 29; - f64_frac = recip_sqrt_estimate(&f32_exp, 380, f64_frac); + f64_frac = recip_sqrt_estimate(&f32_exp, 380, f64_frac, rpres); - /* result = sign : result_exp<4:0> : estimate<7:0> : Zeros(15) */ + /* + * result = sign : result_exp<7:0> : estimate<7:0> : Zeros(15) + * or for increased precision + * result = sign : result_exp<7:0> : estimate<11:0> : Zeros(11) + */ val = deposit32(0, 31, 1, f32_sign); val = deposit32(val, 23, 8, f32_exp); - val = deposit32(val, 15, 8, extract64(f64_frac, 52 - 8, 8)); + if (rpres) { + val = deposit32(val, 11, 12, extract64(f64_frac, 52 - 12, 12)); + } else { + val = deposit32(val, 15, 8, extract64(f64_frac, 52 - 8, 8)); + } return make_float32(val); } +float32 HELPER(rsqrte_f32)(float32 input, float_status *s) +{ + return do_rsqrte_f32(input, s, false); +} + +float32 HELPER(rsqrte_rpres_f32)(float32 input, float_status *s) +{ + return do_rsqrte_f32(input, s, true); +} + float64 HELPER(rsqrte_f64)(float64 input, float_status *s) { float64 f64 = float64_squash_input_denormal(input, s); @@ -987,7 +1227,7 @@ float64 HELPER(rsqrte_f64)(float64 input, float_status *s) return float64_zero; } - f64_frac = recip_sqrt_estimate(&f64_exp, 3068, f64_frac); + f64_frac = recip_sqrt_estimate(&f64_exp, 3068, f64_frac, false); /* result = sign : result_exp<4:0> : estimate<7:0> : Zeros(44) */ val = deposit64(0, 61, 1, f64_sign); @@ -1145,7 +1385,7 @@ uint64_t HELPER(fjcvtzs)(float64 value, float_status *status) uint32_t HELPER(vjcvt)(float64 value, CPUARMState *env) { - uint64_t pair = HELPER(fjcvtzs)(value, &env->vfp.fp_status_a32); + uint64_t pair = HELPER(fjcvtzs)(value, &env->vfp.fp_status[FPST_A32]); uint32_t result = pair; uint32_t z = (pair >> 32) == 0; diff --git a/target/hppa/fpu_helper.c b/target/hppa/fpu_helper.c index 239c027..8ff4b44 100644 --- a/target/hppa/fpu_helper.c +++ b/target/hppa/fpu_helper.c @@ -67,6 +67,17 @@ void HELPER(loaded_fr0)(CPUHPPAState *env) set_float_infzeronan_rule(float_infzeronan_dnan_never, &env->fp_status); /* Default NaN: sign bit clear, msb-1 frac bit set */ set_float_default_nan_pattern(0b00100000, &env->fp_status); + /* + * "PA-RISC 2.0 Architecture" says it is IMPDEF whether the flushing + * enabled by FPSR.D happens before or after rounding. We pick "before" + * for consistency with tininess detection. + */ + set_float_ftz_detection(float_ftz_before_rounding, &env->fp_status); + /* + * TODO: "PA-RISC 2.0 Architecture" chapter 10 says that we should + * detect tininess before rounding, but we don't set that here so we + * get the default tininess after rounding. + */ } void cpu_hppa_loaded_fr0(CPUHPPAState *env) diff --git a/target/i386/tcg/fpu_helper.c b/target/i386/tcg/fpu_helper.c index de6d0b2..f112c6c 100644 --- a/target/i386/tcg/fpu_helper.c +++ b/target/i386/tcg/fpu_helper.c @@ -188,6 +188,14 @@ void cpu_init_fp_statuses(CPUX86State *env) set_float_default_nan_pattern(0b11000000, &env->fp_status); set_float_default_nan_pattern(0b11000000, &env->mmx_status); set_float_default_nan_pattern(0b11000000, &env->sse_status); + /* + * TODO: x86 does flush-to-zero detection after rounding (the SDM + * section 10.2.3.3 on the FTZ bit of MXCSR says that we flush + * when we detect underflow, which x86 does after rounding). 
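The result packing in do_rsqrte_f32() above differs between the two variants only in how many estimate bits are kept below the exponent; as plain shifts and masks (hypothetical helper, not patch code):

#include <stdint.h>

static uint32_t pack_rsqrte_f32(uint32_t sign, uint32_t exp,
                                uint64_t frac52, int rpres)
{
    uint32_t val = (sign << 31) | ((exp & 0xff) << 23);
    if (rpres) {
        /* FEAT_RPRES: 12 estimate bits in result bits [22:11] */
        val |= ((uint32_t)(frac52 >> 40) & 0xfff) << 11;
    } else {
        /* classic: 8 estimate bits in result bits [22:15] */
        val |= ((uint32_t)(frac52 >> 44) & 0xff) << 15;
    }
    return val;
}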
+ */ + set_float_ftz_detection(float_ftz_before_rounding, &env->fp_status); + set_float_ftz_detection(float_ftz_before_rounding, &env->mmx_status); + set_float_ftz_detection(float_ftz_before_rounding, &env->sse_status); } static inline uint8_t save_exception_flags(CPUX86State *env) diff --git a/target/mips/fpu_helper.h b/target/mips/fpu_helper.h index 6ad1e46..08fb409 100644 --- a/target/mips/fpu_helper.h +++ b/target/mips/fpu_helper.h @@ -84,6 +84,12 @@ static inline void fp_reset(CPUMIPSState *env) */ set_float_2nan_prop_rule(float_2nan_prop_s_ab, &env->active_fpu.fp_status); + /* + * TODO: the spec doesn't say clearly whether FTZ happens before + * or after rounding for normal FPU operations. + */ + set_float_ftz_detection(float_ftz_before_rounding, + &env->active_fpu.fp_status); } /* MSA */ diff --git a/target/mips/msa.c b/target/mips/msa.c index fc77bfc..32c6acb 100644 --- a/target/mips/msa.c +++ b/target/mips/msa.c @@ -48,6 +48,15 @@ void msa_reset(CPUMIPSState *env) /* tininess detected after rounding.*/ set_float_detect_tininess(float_tininess_after_rounding, &env->active_tc.msa_fp_status); + /* + * MSACSR.FS detects tiny results to flush to zero before rounding + * (per "MIPS Architecture for Programmers Volume IV-j: The MIPS64 SIMD + * Architecture Module, Revision 1.1" section 3.5.4), even though it + * detects tininess after rounding for underflow purposes (section 3.4.2 + * table 3.3). + */ + set_float_ftz_detection(float_ftz_before_rounding, + &env->active_tc.msa_fp_status); /* * According to MIPS specifications, if one of the two operands is diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c index 8e49051..062a6e8 100644 --- a/target/ppc/cpu_init.c +++ b/target/ppc/cpu_init.c @@ -7262,6 +7262,9 @@ static void ppc_cpu_reset_hold(Object *obj, ResetType type) /* tininess for underflow is detected before rounding */ set_float_detect_tininess(float_tininess_before_rounding, &env->fp_status); + /* Similarly for flush-to-zero */ + set_float_ftz_detection(float_ftz_before_rounding, &env->fp_status); + /* * PowerPC propagation rules: * 1. A if it sNaN or qNaN diff --git a/target/rx/cpu.c b/target/rx/cpu.c index 8c50c7a..37a6fdd 100644 --- a/target/rx/cpu.c +++ b/target/rx/cpu.c @@ -103,6 +103,14 @@ static void rx_cpu_reset_hold(Object *obj, ResetType type) set_float_2nan_prop_rule(float_2nan_prop_x87, &env->fp_status); /* Default NaN value: sign bit clear, set frac msb */ set_float_default_nan_pattern(0b01000000, &env->fp_status); + /* + * TODO: "RX Family RXv1 Instruction Set Architecture" is not 100% clear + * on whether flush-to-zero should happen before or after rounding, but + * section 1.3.2 says that it happens when underflow is detected, and + * implies that underflow is detected after rounding. So this may not + * be the correct setting. + */ + set_float_ftz_detection(float_ftz_before_rounding, &env->fp_status); } static ObjectClass *rx_cpu_class_by_name(const char *cpu_model) diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c index 24a2272..4ac693d 100644 --- a/target/sh4/cpu.c +++ b/target/sh4/cpu.c @@ -130,6 +130,14 @@ static void superh_cpu_reset_hold(Object *obj, ResetType type) set_default_nan_mode(1, &env->fp_status); /* sign bit clear, set all frac bits other than msb */ set_float_default_nan_pattern(0b00111111, &env->fp_status); + /* + * TODO: "SH-4 CPU Core Architecture ADCS 7182230F" doesn't say whether + * it detects tininess before or after rounding. 
Section 6.4 is clear + * that flush-to-zero happens when the result underflows, though, so + * either this should be "detect ftz after rounding" or else we should + * be setting "detect tininess before rounding". + */ + set_float_ftz_detection(float_ftz_before_rounding, &env->fp_status); } static void superh_cpu_disas_set_info(CPUState *cpu, disassemble_info *info) diff --git a/target/tricore/helper.c b/target/tricore/helper.c index e8b0ec5..9898752 100644 --- a/target/tricore/helper.c +++ b/target/tricore/helper.c @@ -116,6 +116,7 @@ void fpu_set_state(CPUTriCoreState *env) set_flush_inputs_to_zero(1, &env->fp_status); set_flush_to_zero(1, &env->fp_status); set_float_detect_tininess(float_tininess_before_rounding, &env->fp_status); + set_float_ftz_detection(float_ftz_before_rounding, &env->fp_status); set_default_nan_mode(1, &env->fp_status); /* Default NaN pattern: sign bit clear, frac msb set */ set_float_default_nan_pattern(0b01000000, &env->fp_status); diff --git a/tests/fp/fp-bench.c b/tests/fp/fp-bench.c index eacb39b..d90f542 100644 --- a/tests/fp/fp-bench.c +++ b/tests/fp/fp-bench.c @@ -496,6 +496,7 @@ static void run_bench(void) set_float_3nan_prop_rule(float_3nan_prop_s_cab, &soft_status); set_float_infzeronan_rule(float_infzeronan_dnan_if_qnan, &soft_status); set_float_default_nan_pattern(0b01000000, &soft_status); + set_float_ftz_detection(float_ftz_before_rounding, &soft_status); f = bench_funcs[operation][precision]; g_assert(f); |
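A borderline value makes the before/after-rounding distinction that all of these set_float_ftz_detection() calls pin down concrete: 2^-126 * (1 - 2^-24) is below the smallest float32 normal, yet nearest-even rounding rounds it up to exactly 2^-126. A sketch against QEMU's softfloat API (QEMU-internal headers; an illustration, not patch code):

#include "qemu/osdep.h"
#include "fpu/softfloat.h"

static float32 ftz_probe(float_status *st)
{
    /* exact product: (2^24 - 1) * 2^-150, i.e. just under 2^-126 */
    return float32_mul(make_float32(0x00800000),   /* 2^-126 */
                       make_float32(0x3f7fffff),   /* 1 - 2^-24 */
                       st);
}
/*
 * With flush_to_zero set and default nearest-even rounding:
 * float_ftz_before_rounding flushes the tiny pre-rounding value to +0
 * and raises float_flag_output_denormal_flushed, while
 * float_ftz_after_rounding (combined with tininess detection after
 * rounding, as Arm's FPCR.AH configuration uses) keeps the rounded
 * result 0x00800000 -- the smallest normal -- unflushed.
 */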