aboutsummaryrefslogtreecommitdiff
path: root/target/arm/vfp_helper.c
diff options
context:
space:
mode:
Diffstat (limited to 'target/arm/vfp_helper.c')
-rw-r--r--target/arm/vfp_helper.c374
1 files changed, 307 insertions, 67 deletions
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index 3c8f3e6..5d42447 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -22,19 +22,63 @@
#include "exec/helper-proto.h"
#include "internals.h"
#include "cpu-features.h"
+#include "fpu/softfloat.h"
#ifdef CONFIG_TCG
#include "qemu/log.h"
-#include "fpu/softfloat.h"
#endif
/* VFP support. We follow the convention used for VFP instructions:
Single precision routines have a "s" suffix, double precision a
"d" suffix. */
+/*
+ * Set the float_status behaviour to match the Arm defaults:
+ * * tininess-before-rounding
+ * * 2-input NaN propagation prefers SNaN over QNaN, and then
+ * operand A over operand B (see FPProcessNaNs() pseudocode)
+ * * 3-input NaN propagation prefers SNaN over QNaN, and then
+ * operand C over A over B (see FPProcessNaNs3() pseudocode,
+ * but note that for QEMU muladd is a * b + c, whereas for
+ * the pseudocode function the arguments are in the order c, a, b.
+ * * 0 * Inf + NaN returns the default NaN if the input NaN is quiet,
+ * and the input NaN if it is signalling
+ * * Default NaN has sign bit clear, msb frac bit set
+ */
+void arm_set_default_fp_behaviours(float_status *s)
+{
+ set_float_detect_tininess(float_tininess_before_rounding, s);
+ set_float_ftz_detection(float_ftz_before_rounding, s);
+ set_float_2nan_prop_rule(float_2nan_prop_s_ab, s);
+ set_float_3nan_prop_rule(float_3nan_prop_s_cab, s);
+ set_float_infzeronan_rule(float_infzeronan_dnan_if_qnan, s);
+ set_float_default_nan_pattern(0b01000000, s);
+}
+
+/*
+ * Set the float_status behaviour to match the FEAT_AFP
+ * FPCR.AH=1 requirements:
+ * * tininess-after-rounding
+ * * 2-input NaN propagation prefers the first NaN
+ * * 3-input NaN propagation prefers a over b over c
+ * * 0 * Inf + NaN always returns the input NaN and doesn't
+ * set Invalid for a QNaN
+ * * default NaN has sign bit set, msb frac bit set
+ */
+void arm_set_ah_fp_behaviours(float_status *s)
+{
+ set_float_detect_tininess(float_tininess_after_rounding, s);
+ set_float_ftz_detection(float_ftz_after_rounding, s);
+ set_float_2nan_prop_rule(float_2nan_prop_ab, s);
+ set_float_3nan_prop_rule(float_3nan_prop_abc, s);
+ set_float_infzeronan_rule(float_infzeronan_dnan_never |
+ float_infzeronan_suppress_invalid, s);
+ set_float_default_nan_pattern(0b11000000, s);
+}
+
#ifdef CONFIG_TCG
/* Convert host exception flags to vfp form. */
-static inline uint32_t vfp_exceptbits_from_host(int host_bits)
+static inline uint32_t vfp_exceptbits_from_host(int host_bits, bool ah)
{
uint32_t target_bits = 0;
@@ -56,24 +100,52 @@ static inline uint32_t vfp_exceptbits_from_host(int host_bits)
if (host_bits & float_flag_input_denormal_flushed) {
target_bits |= FPSR_IDC;
}
+ /*
+ * With FPCR.AH, IDC is set when an input denormal is used,
+ * and flushing an output denormal to zero sets both IXC and UFC.
+ */
+ if (ah && (host_bits & float_flag_input_denormal_used)) {
+ target_bits |= FPSR_IDC;
+ }
+ if (ah && (host_bits & float_flag_output_denormal_flushed)) {
+ target_bits |= FPSR_IXC;
+ }
return target_bits;
}
static uint32_t vfp_get_fpsr_from_host(CPUARMState *env)
{
- uint32_t i = 0;
+ uint32_t a32_flags = 0, a64_flags = 0;
- i |= get_float_exception_flags(&env->vfp.fp_status_a32);
- i |= get_float_exception_flags(&env->vfp.fp_status_a64);
- i |= get_float_exception_flags(&env->vfp.standard_fp_status);
+ a32_flags |= get_float_exception_flags(&env->vfp.fp_status[FPST_A32]);
+ a32_flags |= get_float_exception_flags(&env->vfp.fp_status[FPST_STD]);
/* FZ16 does not generate an input denormal exception. */
- i |= (get_float_exception_flags(&env->vfp.fp_status_f16_a32)
+ a32_flags |= (get_float_exception_flags(&env->vfp.fp_status[FPST_A32_F16])
& ~float_flag_input_denormal_flushed);
- i |= (get_float_exception_flags(&env->vfp.fp_status_f16_a64)
+ a32_flags |= (get_float_exception_flags(&env->vfp.fp_status[FPST_STD_F16])
& ~float_flag_input_denormal_flushed);
- i |= (get_float_exception_flags(&env->vfp.standard_fp_status_f16)
- & ~float_flag_input_denormal_flushed);
- return vfp_exceptbits_from_host(i);
+
+ a64_flags |= get_float_exception_flags(&env->vfp.fp_status[FPST_A64]);
+ a64_flags |= (get_float_exception_flags(&env->vfp.fp_status[FPST_A64_F16])
+ & ~(float_flag_input_denormal_flushed | float_flag_input_denormal_used));
+ /*
+ * We do not merge in flags from FPST_AH or FPST_AH_F16, because
+ * they are used for insns that must not set the cumulative exception bits.
+ */
+
+ /*
+ * Flushing an input denormal *only* because FPCR.FIZ == 1 does
+ * not set FPSR.IDC; if FPCR.FZ is also set then this takes
+ * precedence and IDC is set (see the FPUnpackBase pseudocode).
+ * So squash it unless (FPCR.AH == 0 && FPCR.FZ == 1).
+ * We only do this for the a64 flags because FIZ has no effect
+ * on AArch32 even if it is set.
+ */
+ if ((env->vfp.fpcr & (FPCR_FZ | FPCR_AH)) != FPCR_FZ) {
+ a64_flags &= ~float_flag_input_denormal_flushed;
+ }
+ return vfp_exceptbits_from_host(a64_flags, env->vfp.fpcr & FPCR_AH) |
+ vfp_exceptbits_from_host(a32_flags, false);
}
static void vfp_clear_float_status_exc_flags(CPUARMState *env)
@@ -83,12 +155,25 @@ static void vfp_clear_float_status_exc_flags(CPUARMState *env)
* values. The caller should have arranged for env->vfp.fpsr to
* be the architecturally up-to-date exception flag information first.
*/
- set_float_exception_flags(0, &env->vfp.fp_status_a32);
- set_float_exception_flags(0, &env->vfp.fp_status_a64);
- set_float_exception_flags(0, &env->vfp.fp_status_f16_a32);
- set_float_exception_flags(0, &env->vfp.fp_status_f16_a64);
- set_float_exception_flags(0, &env->vfp.standard_fp_status);
- set_float_exception_flags(0, &env->vfp.standard_fp_status_f16);
+ set_float_exception_flags(0, &env->vfp.fp_status[FPST_A32]);
+ set_float_exception_flags(0, &env->vfp.fp_status[FPST_A64]);
+ set_float_exception_flags(0, &env->vfp.fp_status[FPST_A32_F16]);
+ set_float_exception_flags(0, &env->vfp.fp_status[FPST_A64_F16]);
+ set_float_exception_flags(0, &env->vfp.fp_status[FPST_STD]);
+ set_float_exception_flags(0, &env->vfp.fp_status[FPST_STD_F16]);
+ set_float_exception_flags(0, &env->vfp.fp_status[FPST_AH]);
+ set_float_exception_flags(0, &env->vfp.fp_status[FPST_AH_F16]);
+}
+
+static void vfp_sync_and_clear_float_status_exc_flags(CPUARMState *env)
+{
+ /*
+ * Synchronize any pending exception-flag information in the
+ * float_status values into env->vfp.fpsr, and then clear out
+ * the float_status data.
+ */
+ env->vfp.fpsr |= vfp_get_fpsr_from_host(env);
+ vfp_clear_float_status_exc_flags(env);
}
static void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask)
@@ -113,33 +198,66 @@ static void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask)
i = float_round_to_zero;
break;
}
- set_float_rounding_mode(i, &env->vfp.fp_status_a32);
- set_float_rounding_mode(i, &env->vfp.fp_status_a64);
- set_float_rounding_mode(i, &env->vfp.fp_status_f16_a32);
- set_float_rounding_mode(i, &env->vfp.fp_status_f16_a64);
+ set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A32]);
+ set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A64]);
+ set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A32_F16]);
+ set_float_rounding_mode(i, &env->vfp.fp_status[FPST_A64_F16]);
}
if (changed & FPCR_FZ16) {
bool ftz_enabled = val & FPCR_FZ16;
- set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a32);
- set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a64);
- set_flush_to_zero(ftz_enabled, &env->vfp.standard_fp_status_f16);
- set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a32);
- set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a64);
- set_flush_inputs_to_zero(ftz_enabled, &env->vfp.standard_fp_status_f16);
+ set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32_F16]);
+ set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64_F16]);
+ set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_STD_F16]);
+ set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_AH_F16]);
+ set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32_F16]);
+ set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64_F16]);
+ set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_STD_F16]);
+ set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_AH_F16]);
}
if (changed & FPCR_FZ) {
bool ftz_enabled = val & FPCR_FZ;
- set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_a32);
- set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_a32);
- set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_a64);
- set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_a64);
+ set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32]);
+ set_flush_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A64]);
+ /* FIZ is A64 only so FZ always makes A32 code flush inputs to zero */
+ set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status[FPST_A32]);
+ }
+ if (changed & (FPCR_FZ | FPCR_AH | FPCR_FIZ)) {
+ /*
+ * A64: Flush denormalized inputs to zero if FPCR.FIZ = 1, or
+ * both FPCR.AH = 0 and FPCR.FZ = 1.
+ */
+ bool fitz_enabled = (val & FPCR_FIZ) ||
+ (val & (FPCR_FZ | FPCR_AH)) == FPCR_FZ;
+ set_flush_inputs_to_zero(fitz_enabled, &env->vfp.fp_status[FPST_A64]);
}
if (changed & FPCR_DN) {
bool dnan_enabled = val & FPCR_DN;
- set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_a32);
- set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_a64);
- set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_f16_a32);
- set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_f16_a64);
+ set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_A32]);
+ set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_A64]);
+ set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_A32_F16]);
+ set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_A64_F16]);
+ set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_AH]);
+ set_default_nan_mode(dnan_enabled, &env->vfp.fp_status[FPST_AH_F16]);
+ }
+ if (changed & FPCR_AH) {
+ bool ah_enabled = val & FPCR_AH;
+
+ if (ah_enabled) {
+ /* Change behaviours for A64 FP operations */
+ arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_A64]);
+ arm_set_ah_fp_behaviours(&env->vfp.fp_status[FPST_A64_F16]);
+ } else {
+ arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64]);
+ arm_set_default_fp_behaviours(&env->vfp.fp_status[FPST_A64_F16]);
+ }
+ }
+ /*
+ * If any bits changed that we look at in vfp_get_fpsr_from_host(),
+ * we must sync the float_status flags into vfp.fpsr now (under the
+ * old regime) before we update vfp.fpcr.
+ */
+ if (changed & (FPCR_FZ | FPCR_AH | FPCR_FIZ)) {
+ vfp_sync_and_clear_float_status_exc_flags(env);
}
}
@@ -242,6 +360,9 @@ static void vfp_set_fpcr_masked(CPUARMState *env, uint32_t val, uint32_t mask)
if (!cpu_isar_feature(any_fp16, cpu)) {
val &= ~FPCR_FZ16;
}
+ if (!cpu_isar_feature(aa64_afp, cpu)) {
+ val &= ~(FPCR_FIZ | FPCR_AH | FPCR_NEP);
+ }
if (!cpu_isar_feature(aa64_ebf16, cpu)) {
val &= ~FPCR_EBF;
@@ -271,12 +392,14 @@ static void vfp_set_fpcr_masked(CPUARMState *env, uint32_t val, uint32_t mask)
* We don't implement trapped exception handling, so the
* trap enable bits, IDE|IXE|UFE|OFE|DZE|IOE are all RAZ/WI (not RES0!)
*
- * The FPCR bits we keep in vfp.fpcr are AHP, DN, FZ, RMode, EBF
- * and FZ16. Len, Stride and LTPSIZE we just handled. Store those bits
+ * The FPCR bits we keep in vfp.fpcr are AHP, DN, FZ, RMode, EBF, FZ16,
+ * FIZ, AH, and NEP.
+ * Len, Stride and LTPSIZE we just handled. Store those bits
* there, and zero any of the other FPCR bits and the RES0 and RAZ/WI
* bits.
*/
- val &= FPCR_AHP | FPCR_DN | FPCR_FZ | FPCR_RMODE_MASK | FPCR_FZ16 | FPCR_EBF;
+ val &= FPCR_AHP | FPCR_DN | FPCR_FZ | FPCR_RMODE_MASK | FPCR_FZ16 |
+ FPCR_EBF | FPCR_FIZ | FPCR_AH | FPCR_NEP;
env->vfp.fpcr &= ~mask;
env->vfp.fpcr |= val;
}
@@ -366,16 +489,16 @@ static void softfloat_to_vfp_compare(CPUARMState *env, FloatRelation cmp)
void VFP_HELPER(cmp, P)(ARGTYPE a, ARGTYPE b, CPUARMState *env) \
{ \
softfloat_to_vfp_compare(env, \
- FLOATTYPE ## _compare_quiet(a, b, &env->vfp.FPST)); \
+ FLOATTYPE ## _compare_quiet(a, b, &env->vfp.fp_status[FPST])); \
} \
void VFP_HELPER(cmpe, P)(ARGTYPE a, ARGTYPE b, CPUARMState *env) \
{ \
softfloat_to_vfp_compare(env, \
- FLOATTYPE ## _compare(a, b, &env->vfp.FPST)); \
+ FLOATTYPE ## _compare(a, b, &env->vfp.fp_status[FPST])); \
}
-DO_VFP_cmp(h, float16, dh_ctype_f16, fp_status_f16_a32)
-DO_VFP_cmp(s, float32, float32, fp_status_a32)
-DO_VFP_cmp(d, float64, float64, fp_status_a32)
+DO_VFP_cmp(h, float16, dh_ctype_f16, FPST_A32_F16)
+DO_VFP_cmp(s, float32, float32, FPST_A32)
+DO_VFP_cmp(d, float64, float64, FPST_A32)
#undef DO_VFP_cmp
/* Integer to float and float to integer conversions */
@@ -611,6 +734,33 @@ static int recip_estimate(int input)
}
/*
+ * Increased precision version:
+ * input is a 13 bit fixed point number
+ * input range 2048 .. 4095 for a number from 0.5 <= x < 1.0.
+ * result range 4096 .. 8191 for a number from 1.0 to 2.0
+ */
+static int recip_estimate_incprec(int input)
+{
+ int a, b, r;
+ assert(2048 <= input && input < 4096);
+ a = (input * 2) + 1;
+ /*
+ * The pseudocode expresses this as an operation on infinite
+ * precision reals where it calculates 2^25 / a and then looks
+ * at the error between that and the rounded-down-to-integer
+ * value to see if it should instead round up. We instead
+ * follow the same approach as the pseudocode for the 8-bit
+ * precision version, and calculate (2 * (2^25 / a)) as an
+ * integer so we can do the "add one and halve" to round it.
+ * So the 1 << 26 here is correct.
+ */
+ b = (1 << 26) / a;
+ r = (b + 1) >> 1;
+ assert(4096 <= r && r < 8192);
+ return r;
+}
+
+/*
* Common wrapper to call recip_estimate
*
* The parameters are exponent and 64 bit fraction (without implicit
@@ -619,7 +769,8 @@ static int recip_estimate(int input)
* callee.
*/
-static uint64_t call_recip_estimate(int *exp, int exp_off, uint64_t frac)
+static uint64_t call_recip_estimate(int *exp, int exp_off, uint64_t frac,
+ bool increasedprecision)
{
uint32_t scaled, estimate;
uint64_t result_frac;
@@ -635,12 +786,22 @@ static uint64_t call_recip_estimate(int *exp, int exp_off, uint64_t frac)
}
}
- /* scaled = UInt('1':fraction<51:44>) */
- scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8));
- estimate = recip_estimate(scaled);
+ if (increasedprecision) {
+ /* scaled = UInt('1':fraction<51:41>) */
+ scaled = deposit32(1 << 11, 0, 11, extract64(frac, 41, 11));
+ estimate = recip_estimate_incprec(scaled);
+ } else {
+ /* scaled = UInt('1':fraction<51:44>) */
+ scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8));
+ estimate = recip_estimate(scaled);
+ }
result_exp = exp_off - *exp;
- result_frac = deposit64(0, 44, 8, estimate);
+ if (increasedprecision) {
+ result_frac = deposit64(0, 40, 12, estimate);
+ } else {
+ result_frac = deposit64(0, 44, 8, estimate);
+ }
if (result_exp == 0) {
result_frac = deposit64(result_frac >> 1, 51, 1, 1);
} else if (result_exp == -1) {
@@ -709,7 +870,7 @@ uint32_t HELPER(recpe_f16)(uint32_t input, float_status *fpst)
}
f64_frac = call_recip_estimate(&f16_exp, 29,
- ((uint64_t) f16_frac) << (52 - 10));
+ ((uint64_t) f16_frac) << (52 - 10), false);
/* result = sign : result_exp<4:0> : fraction<51:42> */
f16_val = deposit32(0, 15, 1, f16_sign);
@@ -718,7 +879,11 @@ uint32_t HELPER(recpe_f16)(uint32_t input, float_status *fpst)
return make_float16(f16_val);
}
-float32 HELPER(recpe_f32)(float32 input, float_status *fpst)
+/*
+ * FEAT_RPRES means the f32 FRECPE has an "increased precision" variant
+ * which is used when FPCR.AH == 1.
+ */
+static float32 do_recpe_f32(float32 input, float_status *fpst, bool rpres)
{
float32 f32 = float32_squash_input_denormal(input, fpst);
uint32_t f32_val = float32_val(f32);
@@ -758,7 +923,7 @@ float32 HELPER(recpe_f32)(float32 input, float_status *fpst)
}
f64_frac = call_recip_estimate(&f32_exp, 253,
- ((uint64_t) f32_frac) << (52 - 23));
+ ((uint64_t) f32_frac) << (52 - 23), rpres);
/* result = sign : result_exp<7:0> : fraction<51:29> */
f32_val = deposit32(0, 31, 1, f32_sign);
@@ -767,6 +932,16 @@ float32 HELPER(recpe_f32)(float32 input, float_status *fpst)
return make_float32(f32_val);
}
+float32 HELPER(recpe_f32)(float32 input, float_status *fpst)
+{
+ return do_recpe_f32(input, fpst, false);
+}
+
+float32 HELPER(recpe_rpres_f32)(float32 input, float_status *fpst)
+{
+ return do_recpe_f32(input, fpst, true);
+}
+
float64 HELPER(recpe_f64)(float64 input, float_status *fpst)
{
float64 f64 = float64_squash_input_denormal(input, fpst);
@@ -806,7 +981,7 @@ float64 HELPER(recpe_f64)(float64 input, float_status *fpst)
return float64_set_sign(float64_zero, float64_is_neg(f64));
}
- f64_frac = call_recip_estimate(&f64_exp, 2045, f64_frac);
+ f64_frac = call_recip_estimate(&f64_exp, 2045, f64_frac, false);
/* result = sign : result_exp<10:0> : fraction<51:0>; */
f64_val = deposit64(0, 63, 1, f64_sign);
@@ -840,8 +1015,36 @@ static int do_recip_sqrt_estimate(int a)
return estimate;
}
+static int do_recip_sqrt_estimate_incprec(int a)
+{
+ /*
+ * The Arm ARM describes the 12-bit precision version of RecipSqrtEstimate
+ * in terms of an infinite-precision floating point calculation of a
+ * square root. We implement this using the same kind of pure integer
+ * algorithm as the 8-bit mantissa, to get the same bit-for-bit result.
+ */
+ int64_t b, estimate;
+
+ assert(1024 <= a && a < 4096);
+ if (a < 2048) {
+ a = a * 2 + 1;
+ } else {
+ a = (a >> 1) << 1;
+ a = (a + 1) * 2;
+ }
+ b = 8192;
+ while (a * (b + 1) * (b + 1) < (1ULL << 39)) {
+ b += 1;
+ }
+ estimate = (b + 1) / 2;
+
+ assert(4096 <= estimate && estimate < 8192);
-static uint64_t recip_sqrt_estimate(int *exp , int exp_off, uint64_t frac)
+ return estimate;
+}
+
+static uint64_t recip_sqrt_estimate(int *exp , int exp_off, uint64_t frac,
+ bool increasedprecision)
{
int estimate;
uint32_t scaled;
@@ -854,17 +1057,32 @@ static uint64_t recip_sqrt_estimate(int *exp , int exp_off, uint64_t frac)
frac = extract64(frac, 0, 51) << 1;
}
- if (*exp & 1) {
- /* scaled = UInt('01':fraction<51:45>) */
- scaled = deposit32(1 << 7, 0, 7, extract64(frac, 45, 7));
+ if (increasedprecision) {
+ if (*exp & 1) {
+ /* scaled = UInt('01':fraction<51:42>) */
+ scaled = deposit32(1 << 10, 0, 10, extract64(frac, 42, 10));
+ } else {
+ /* scaled = UInt('1':fraction<51:41>) */
+ scaled = deposit32(1 << 11, 0, 11, extract64(frac, 41, 11));
+ }
+ estimate = do_recip_sqrt_estimate_incprec(scaled);
} else {
- /* scaled = UInt('1':fraction<51:44>) */
- scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8));
+ if (*exp & 1) {
+ /* scaled = UInt('01':fraction<51:45>) */
+ scaled = deposit32(1 << 7, 0, 7, extract64(frac, 45, 7));
+ } else {
+ /* scaled = UInt('1':fraction<51:44>) */
+ scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8));
+ }
+ estimate = do_recip_sqrt_estimate(scaled);
}
- estimate = do_recip_sqrt_estimate(scaled);
*exp = (exp_off - *exp) / 2;
- return extract64(estimate, 0, 8) << 44;
+ if (increasedprecision) {
+ return extract64(estimate, 0, 12) << 40;
+ } else {
+ return extract64(estimate, 0, 8) << 44;
+ }
}
uint32_t HELPER(rsqrte_f16)(uint32_t input, float_status *s)
@@ -903,7 +1121,7 @@ uint32_t HELPER(rsqrte_f16)(uint32_t input, float_status *s)
f64_frac = ((uint64_t) f16_frac) << (52 - 10);
- f64_frac = recip_sqrt_estimate(&f16_exp, 44, f64_frac);
+ f64_frac = recip_sqrt_estimate(&f16_exp, 44, f64_frac, false);
/* result = sign : result_exp<4:0> : estimate<7:0> : Zeros(2) */
val = deposit32(0, 15, 1, f16_sign);
@@ -912,7 +1130,11 @@ uint32_t HELPER(rsqrte_f16)(uint32_t input, float_status *s)
return make_float16(val);
}
-float32 HELPER(rsqrte_f32)(float32 input, float_status *s)
+/*
+ * FEAT_RPRES means the f32 FRSQRTE has an "increased precision" variant
+ * which is used when FPCR.AH == 1.
+ */
+static float32 do_rsqrte_f32(float32 input, float_status *s, bool rpres)
{
float32 f32 = float32_squash_input_denormal(input, s);
uint32_t val = float32_val(f32);
@@ -948,15 +1170,33 @@ float32 HELPER(rsqrte_f32)(float32 input, float_status *s)
f64_frac = ((uint64_t) f32_frac) << 29;
- f64_frac = recip_sqrt_estimate(&f32_exp, 380, f64_frac);
+ f64_frac = recip_sqrt_estimate(&f32_exp, 380, f64_frac, rpres);
- /* result = sign : result_exp<4:0> : estimate<7:0> : Zeros(15) */
+ /*
+ * result = sign : result_exp<7:0> : estimate<7:0> : Zeros(15)
+ * or for increased precision
+ * result = sign : result_exp<7:0> : estimate<11:0> : Zeros(11)
+ */
val = deposit32(0, 31, 1, f32_sign);
val = deposit32(val, 23, 8, f32_exp);
- val = deposit32(val, 15, 8, extract64(f64_frac, 52 - 8, 8));
+ if (rpres) {
+ val = deposit32(val, 11, 12, extract64(f64_frac, 52 - 12, 12));
+ } else {
+ val = deposit32(val, 15, 8, extract64(f64_frac, 52 - 8, 8));
+ }
return make_float32(val);
}
+float32 HELPER(rsqrte_f32)(float32 input, float_status *s)
+{
+ return do_rsqrte_f32(input, s, false);
+}
+
+float32 HELPER(rsqrte_rpres_f32)(float32 input, float_status *s)
+{
+ return do_rsqrte_f32(input, s, true);
+}
+
float64 HELPER(rsqrte_f64)(float64 input, float_status *s)
{
float64 f64 = float64_squash_input_denormal(input, s);
@@ -987,7 +1227,7 @@ float64 HELPER(rsqrte_f64)(float64 input, float_status *s)
return float64_zero;
}
- f64_frac = recip_sqrt_estimate(&f64_exp, 3068, f64_frac);
+ f64_frac = recip_sqrt_estimate(&f64_exp, 3068, f64_frac, false);
/* result = sign : result_exp<4:0> : estimate<7:0> : Zeros(44) */
val = deposit64(0, 61, 1, f64_sign);
@@ -1145,7 +1385,7 @@ uint64_t HELPER(fjcvtzs)(float64 value, float_status *status)
uint32_t HELPER(vjcvt)(float64 value, CPUARMState *env)
{
- uint64_t pair = HELPER(fjcvtzs)(value, &env->vfp.fp_status_a32);
+ uint64_t pair = HELPER(fjcvtzs)(value, &env->vfp.fp_status[FPST_A32]);
uint32_t result = pair;
uint32_t z = (pair >> 32) == 0;