aboutsummaryrefslogtreecommitdiff
path: root/fpu/softfloat.c
diff options
context:
space:
mode:
Diffstat (limited to 'fpu/softfloat.c')
-rw-r--r--fpu/softfloat.c726
1 files changed, 259 insertions, 467 deletions
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index bc0f52f..8cd2400 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -96,16 +96,6 @@ this code that are retained.
#include "fpu/softfloat-macros.h"
/*----------------------------------------------------------------------------
-| Functions and definitions to determine: (1) whether tininess for underflow
-| is detected before or after rounding by default, (2) what (if anything)
-| happens when exceptions are raised, (3) how signaling NaNs are distinguished
-| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
-| are propagated from function inputs to output. These details are target-
-| specific.
-*----------------------------------------------------------------------------*/
-#include "softfloat-specialize.h"
-
-/*----------------------------------------------------------------------------
| Returns the fraction bits of the half-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
@@ -124,15 +114,6 @@ static inline int extractFloat16Exp(float16 a)
}
/*----------------------------------------------------------------------------
-| Returns the sign bit of the single-precision floating-point value `a'.
-*----------------------------------------------------------------------------*/
-
-static inline flag extractFloat16Sign(float16 a)
-{
- return float16_val(a)>>15;
-}
-
-/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
@@ -198,10 +179,24 @@ typedef enum __attribute__ ((__packed__)) {
float_class_inf,
float_class_qnan, /* all NaNs from here */
float_class_snan,
- float_class_dnan,
- float_class_msnan, /* maybe silenced */
} FloatClass;
+/* Simple helpers for checking if, or what kind of, NaN we have.
+ *
+ * is_nan:  true for either NaN class.  The >= comparison depends on
+ *          the NaN classes being the final FloatClass enumerators
+ *          ("all NaNs from here"); keep them last if the enum grows.
+ * is_snan: true only for float_class_snan.
+ * is_qnan: true only for float_class_qnan.
+ */
+static inline __attribute__((unused)) bool is_nan(FloatClass c)
+{
+ return unlikely(c >= float_class_qnan);
+}
+
+static inline __attribute__((unused)) bool is_snan(FloatClass c)
+{
+ return c == float_class_snan;
+}
+
+static inline __attribute__((unused)) bool is_qnan(FloatClass c)
+{
+ return c == float_class_qnan;
+}
+
/*
* Structure holding all of the decomposed parts of a float. The
* exponent is unbiased and the fraction is normalized. All
@@ -232,8 +227,10 @@ typedef struct {
* frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
* The following are computed based the size of fraction
* frac_lsb: least significant bit of fraction
- * fram_lsbm1: the bit bellow the least significant bit (for rounding)
+ * frac_lsbm1: the bit below the least significant bit (for rounding)
* round_mask/roundeven_mask: masks used for rounding
+ * The following optional modifiers are available:
+ * arm_althp: handle ARM Alternative Half Precision
*/
typedef struct {
int exp_size;
@@ -245,6 +242,7 @@ typedef struct {
uint64_t frac_lsbm1;
uint64_t round_mask;
uint64_t roundeven_mask;
+ bool arm_althp;
} FloatFmt;
/* Expand fields based on the size of exponent and fraction */
@@ -263,6 +261,11 @@ static const FloatFmt float16_params = {
FLOAT_PARAMS(5, 10)
};
+static const FloatFmt float16_params_ahp = {
+ FLOAT_PARAMS(5, 10), /* same field sizes as float16_params */
+ .arm_althp = true /* ARM Alternative Half Precision: no Inf/NaN encodings */
+};
+
static const FloatFmt float32_params = {
FLOAT_PARAMS(8, 23)
};
@@ -322,24 +325,27 @@ static inline float64 float64_pack_raw(FloatParts p)
return make_float64(pack_raw(float64_params, p));
}
+/*----------------------------------------------------------------------------
+| Functions and definitions to determine: (1) whether tininess for underflow
+| is detected before or after rounding by default, (2) what (if anything)
+| happens when exceptions are raised, (3) how signaling NaNs are distinguished
+| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
+| are propagated from function inputs to output. These details are target-
+| specific.
+*----------------------------------------------------------------------------*/
+#include "softfloat-specialize.h"
+
/* Canonicalize EXP and FRAC, setting CLS. */
static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
float_status *status)
{
- if (part.exp == parm->exp_max) {
+ if (part.exp == parm->exp_max && !parm->arm_althp) {
if (part.frac == 0) {
part.cls = float_class_inf;
} else {
-#ifdef NO_SIGNALING_NANS
- part.cls = float_class_qnan;
-#else
- int64_t msb = part.frac << (parm->frac_shift + 2);
- if ((msb < 0) == status->snan_bit_is_one) {
- part.cls = float_class_snan;
- } else {
- part.cls = float_class_qnan;
- }
-#endif
+ part.frac <<= parm->frac_shift;
+ part.cls = (parts_is_snan_frac(part.frac, status)
+ ? float_class_snan : float_class_qnan);
}
} else if (part.exp == 0) {
if (likely(part.frac == 0)) {
@@ -422,7 +428,15 @@ static FloatParts round_canonical(FloatParts p, float_status *s,
}
frac >>= frac_shift;
- if (unlikely(exp >= exp_max)) {
+ if (parm->arm_althp) {
+ /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
+ if (unlikely(exp > exp_max)) {
+ /* Overflow. Return the maximum normal. */
+ flags = float_flag_invalid;
+ exp = exp_max;
+ frac = -1;
+ }
+ } else if (unlikely(exp >= exp_max)) {
flags |= float_flag_overflow | float_flag_inexact;
if (overflow_norm) {
exp = exp_max - 1;
@@ -473,13 +487,16 @@ static FloatParts round_canonical(FloatParts p, float_status *s,
case float_class_inf:
do_inf:
+ assert(!parm->arm_althp);
exp = exp_max;
frac = 0;
break;
case float_class_qnan:
case float_class_snan:
+ assert(!parm->arm_althp);
exp = exp_max;
+ frac >>= parm->frac_shift;
break;
default:
@@ -492,22 +509,27 @@ static FloatParts round_canonical(FloatParts p, float_status *s,
return p;
}
+/* Explicit FloatFmt version: unpack the raw float16 fields and
+ * canonicalize them against the caller-supplied format description,
+ * so one routine serves both float16_params and float16_params_ahp.
+ */
+static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
+ const FloatFmt *params)
+{
+ return canonicalize(float16_unpack_raw(f), params, s);
+}
+
static FloatParts float16_unpack_canonical(float16 f, float_status *s)
{
- return canonicalize(float16_unpack_raw(f), &float16_params, s);
+ return float16a_unpack_canonical(f, s, &float16_params);
+}
+
+static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
+ const FloatFmt *params)
+{ /* Explicit FloatFmt version: run round_canonical() with the
+   * caller-supplied format, then pack the result as raw float16 bits.
+   */
+ return float16_pack_raw(round_canonical(p, s, params));
}
static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
{
- switch (p.cls) {
- case float_class_dnan:
- return float16_default_nan(s);
- case float_class_msnan:
- return float16_maybe_silence_nan(float16_pack_raw(p), s);
- default:
- p = round_canonical(p, s, &float16_params);
- return float16_pack_raw(p);
- }
+ return float16a_round_pack_canonical(p, s, &float16_params);
}
static FloatParts float32_unpack_canonical(float32 f, float_status *s)
@@ -517,15 +539,7 @@ static FloatParts float32_unpack_canonical(float32 f, float_status *s)
static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
{
- switch (p.cls) {
- case float_class_dnan:
- return float32_default_nan(s);
- case float_class_msnan:
- return float32_maybe_silence_nan(float32_pack_raw(p), s);
- default:
- p = round_canonical(p, s, &float32_params);
- return float32_pack_raw(p);
- }
+ return float32_pack_raw(round_canonical(p, s, &float32_params));
}
static FloatParts float64_unpack_canonical(float64 f, float_status *s)
@@ -535,29 +549,7 @@ static FloatParts float64_unpack_canonical(float64 f, float_status *s)
static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
{
- switch (p.cls) {
- case float_class_dnan:
- return float64_default_nan(s);
- case float_class_msnan:
- return float64_maybe_silence_nan(float64_pack_raw(p), s);
- default:
- p = round_canonical(p, s, &float64_params);
- return float64_pack_raw(p);
- }
-}
-
-/* Simple helpers for checking if what NaN we have */
-static bool is_nan(FloatClass c)
-{
- return unlikely(c >= float_class_qnan);
-}
-static bool is_snan(FloatClass c)
-{
- return c == float_class_snan;
-}
-static bool is_qnan(FloatClass c)
-{
- return c == float_class_qnan;
+ return float64_pack_raw(round_canonical(p, s, &float64_params));
}
static FloatParts return_nan(FloatParts a, float_status *s)
@@ -565,11 +557,11 @@ static FloatParts return_nan(FloatParts a, float_status *s)
switch (a.cls) {
case float_class_snan:
s->float_exception_flags |= float_flag_invalid;
- a.cls = float_class_msnan;
+ a = parts_silence_nan(a, s);
/* fall through */
case float_class_qnan:
if (s->default_nan_mode) {
- a.cls = float_class_dnan;
+ return parts_default_nan(s);
}
break;
@@ -586,15 +578,16 @@ static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
}
if (s->default_nan_mode) {
- a.cls = float_class_dnan;
+ return parts_default_nan(s);
} else {
- if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
- is_qnan(b.cls), is_snan(b.cls),
+ if (pickNaN(a.cls, b.cls,
a.frac > b.frac ||
(a.frac == b.frac && a.sign < b.sign))) {
a = b;
}
- a.cls = float_class_msnan;
+ if (is_snan(a.cls)) {
+ return parts_silence_nan(a, s);
+ }
}
return a;
}
@@ -608,17 +601,13 @@ static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
s->float_exception_flags |= float_flag_invalid;
}
- which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
- is_qnan(b.cls), is_snan(b.cls),
- is_qnan(c.cls), is_snan(c.cls),
- inf_zero, s);
+ which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
if (s->default_nan_mode) {
/* Note that this check is after pickNaNMulAdd so that function
* has an opportunity to set the Invalid flag.
*/
- a.cls = float_class_dnan;
- return a;
+ which = 3;
}
switch (which) {
@@ -631,13 +620,14 @@ static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
a = c;
break;
case 3:
- a.cls = float_class_dnan;
- return a;
+ return parts_default_nan(s);
default:
g_assert_not_reached();
}
- a.cls = float_class_msnan;
+ if (is_snan(a.cls)) {
+ return parts_silence_nan(a, s);
+ }
return a;
}
@@ -685,7 +675,7 @@ static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
if (a.cls == float_class_inf) {
if (b.cls == float_class_inf) {
float_raise(float_flag_invalid, s);
- a.cls = float_class_dnan;
+ return parts_default_nan(s);
}
return a;
}
@@ -831,9 +821,7 @@ static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
(a.cls == float_class_zero && b.cls == float_class_inf)) {
s->float_exception_flags |= float_flag_invalid;
- a.cls = float_class_dnan;
- a.sign = sign;
- return a;
+ return parts_default_nan(s);
}
/* Multiply by 0 or Inf */
if (a.cls == float_class_inf || a.cls == float_class_zero) {
@@ -911,8 +899,7 @@ static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
if (inf_zero) {
s->float_exception_flags |= float_flag_invalid;
- a.cls = float_class_dnan;
- return a;
+ return parts_default_nan(s);
}
if (flags & float_muladd_negate_c) {
@@ -936,12 +923,12 @@ static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
if (c.cls == float_class_inf) {
if (p_class == float_class_inf && p_sign != c.sign) {
s->float_exception_flags |= float_flag_invalid;
- a.cls = float_class_dnan;
+ return parts_default_nan(s);
} else {
a.cls = float_class_inf;
a.sign = c.sign ^ sign_flip;
+ return a;
}
- return a;
}
if (p_class == float_class_inf) {
@@ -1151,8 +1138,7 @@ static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
&&
(a.cls == float_class_inf || a.cls == float_class_zero)) {
s->float_exception_flags |= float_flag_invalid;
- a.cls = float_class_dnan;
- return a;
+ return parts_default_nan(s);
}
/* Inf / x or 0 / x */
if (a.cls == float_class_inf || a.cls == float_class_zero) {
@@ -1203,6 +1189,104 @@ float64 float64_div(float64 a, float64 b, float_status *status)
}
/*
+ * Float to Float conversions
+ *
+ * Returns the result of converting one float format to another. The
+ * conversion is performed according to the IEC/IEEE Standard for
+ * Binary Floating-Point Arithmetic.
+ *
+ * The float_to_float helper only needs to take care of raising
+ * invalid exceptions and handling the conversion on NaNs.
+ *
+ * a:    source value in decomposed (FloatParts) form
+ * dstf: description of the destination format
+ * s:    status; float_flag_invalid may be ORed into
+ *       float_exception_flags
+ *
+ * If dstf->arm_althp is set, a NaN input becomes a zero that keeps
+ * the input sign and an Inf input becomes a normal with exp_max and
+ * every fraction bit set; both raise Invalid.  Otherwise a signaling
+ * NaN raises Invalid and is silenced, and any NaN is replaced by
+ * parts_default_nan() when default_nan_mode is set.  Every other
+ * class is returned unchanged; rounding to the destination width is
+ * left to the caller's round/pack step.
+ */
+
+static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
+ float_status *s)
+{
+ if (dstf->arm_althp) {
+ switch (a.cls) {
+ case float_class_qnan:
+ case float_class_snan:
+ /* There is no NaN in the destination format. Raise Invalid
+ * and return a zero with the sign of the input NaN.
+ */
+ s->float_exception_flags |= float_flag_invalid;
+ a.cls = float_class_zero;
+ a.frac = 0;
+ a.exp = 0;
+ break;
+
+ case float_class_inf:
+ /* There is no Inf in the destination format. Raise Invalid
+ * and return the maximum normal with the correct sign.
+ */
+ s->float_exception_flags |= float_flag_invalid;
+ a.cls = float_class_normal;
+ a.exp = dstf->exp_max;
+ a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; /* all fraction bits set */
+ break;
+
+ default:
+ break;
+ }
+ } else if (is_nan(a.cls)) {
+ if (is_snan(a.cls)) {
+ s->float_exception_flags |= float_flag_invalid;
+ a = parts_silence_nan(a, s);
+ }
+ if (s->default_nan_mode) {
+ return parts_default_nan(s);
+ }
+ }
+ return a;
+}
+
+float32 float16_to_float32(float16 a, bool ieee, float_status *s)
+{ /* Half -> single.  `ieee' true decodes the input with
+   * float16_params; false decodes it with float16_params_ahp
+   * (ARM Alternative Half Precision).
+   */
+ const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
+ FloatParts p = float16a_unpack_canonical(a, s, fmt16);
+ FloatParts pr = float_to_float(p, &float32_params, s);
+ return float32_round_pack_canonical(pr, s);
+}
+
+float64 float16_to_float64(float16 a, bool ieee, float_status *s)
+{ /* Half -> double.  `ieee' true decodes the input with
+   * float16_params; false decodes it with float16_params_ahp
+   * (ARM Alternative Half Precision).
+   */
+ const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
+ FloatParts p = float16a_unpack_canonical(a, s, fmt16);
+ FloatParts pr = float_to_float(p, &float64_params, s);
+ return float64_round_pack_canonical(pr, s);
+}
+
+float16 float32_to_float16(float32 a, bool ieee, float_status *s)
+{ /* Single -> half.  `ieee' true targets float16_params; false
+   * targets float16_params_ahp, in which case float_to_float()
+   * replaces NaN and Inf inputs and raises Invalid.
+   */
+ const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
+ FloatParts p = float32_unpack_canonical(a, s);
+ FloatParts pr = float_to_float(p, fmt16, s);
+ return float16a_round_pack_canonical(pr, s, fmt16);
+}
+
+float64 float32_to_float64(float32 a, float_status *s)
+{ /* Single -> double: unpack, let float_to_float() deal with NaNs,
+   * then round and pack with float64_params.
+   */
+ FloatParts p = float32_unpack_canonical(a, s);
+ FloatParts pr = float_to_float(p, &float64_params, s);
+ return float64_round_pack_canonical(pr, s);
+}
+
+float16 float64_to_float16(float64 a, bool ieee, float_status *s)
+{ /* Double -> half.  `ieee' true targets float16_params; false
+   * targets float16_params_ahp, in which case float_to_float()
+   * replaces NaN and Inf inputs and raises Invalid.
+   */
+ const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
+ FloatParts p = float64_unpack_canonical(a, s);
+ FloatParts pr = float_to_float(p, fmt16, s);
+ return float16a_round_pack_canonical(pr, s, fmt16);
+}
+
+float32 float64_to_float32(float64 a, float_status *s)
+{ /* Double -> single: unpack, let float_to_float() deal with NaNs,
+   * then round and pack with float32_params.
+   */
+ FloatParts p = float64_unpack_canonical(a, s);
+ FloatParts pr = float_to_float(p, &float32_params, s);
+ return float32_round_pack_canonical(pr, s);
+}
+
+/*
* Rounds the floating-point value `a' to an integer, and returns the
* result as a floating-point value. The operation is performed
* according to the IEC/IEEE Standard for Binary Floating-Point
@@ -1350,8 +1434,6 @@ static int64_t round_to_int_and_pack(FloatParts in, int rmode,
switch (p.cls) {
case float_class_snan:
case float_class_qnan:
- case float_class_dnan:
- case float_class_msnan:
s->float_exception_flags = orig_flags | float_flag_invalid;
return max;
case float_class_inf:
@@ -1442,8 +1524,6 @@ static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
switch (p.cls) {
case float_class_snan:
case float_class_qnan:
- case float_class_dnan:
- case float_class_msnan:
s->float_exception_flags = orig_flags | float_flag_invalid;
return max;
case float_class_inf:
@@ -1943,8 +2023,7 @@ static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
}
if (a.sign) {
s->float_exception_flags |= float_flag_invalid;
- a.cls = float_class_dnan;
- return a;
+ return parts_default_nan(s);
}
if (a.cls == float_class_inf) {
return a; /* sqrt(+inf) = +inf */
@@ -2013,6 +2092,78 @@ float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
return float64_round_pack_canonical(pr, status);
}
+/*----------------------------------------------------------------------------
+| The pattern for a default generated NaN.
+|
+| Each fixed-width variant takes the FloatParts from parts_default_nan(),
+| shifts the fraction right by the format's frac_shift and packs the raw
+| fields.  NOTE(review): parts_default_nan() is defined outside this view;
+| it presumably returns the fraction in the left-shifted position that
+| canonicalize() uses for NaNs -- confirm in softfloat-specialize.h.
+*----------------------------------------------------------------------------*/
+
+float16 float16_default_nan(float_status *status)
+{
+ FloatParts p = parts_default_nan(status);
+ p.frac >>= float16_params.frac_shift;
+ return float16_pack_raw(p);
+}
+
+float32 float32_default_nan(float_status *status)
+{ /* Default NaN from parts_default_nan(), packed as raw float32. */
+ FloatParts p = parts_default_nan(status);
+ p.frac >>= float32_params.frac_shift; /* down to the raw fraction position */
+ return float32_pack_raw(p);
+}
+
+float64 float64_default_nan(float_status *status)
+{ /* Default NaN from parts_default_nan(), packed as raw float64. */
+ FloatParts p = parts_default_nan(status);
+ p.frac >>= float64_params.frac_shift; /* down to the raw fraction position */
+ return float64_pack_raw(p);
+}
+
+float128 float128_default_nan(float_status *status)
+{ /* Default NaN for the 128-bit format, assembled by hand from the
+   * FloatParts that parts_default_nan() returns.
+   */
+ FloatParts p = parts_default_nan(status);
+ float128 r;
+
+ /* Extrapolate from the choices made by parts_default_nan to fill
+ * in the quad-floating format. If the low bit is set, assume we
+ * want to set all non-snan bits.
+ */
+ r.low = -(p.frac & 1); /* zero, or all ones when the low frac bit is set */
+ r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); /* binary point moves down to bit 48 */
+ r.high |= LIT64(0x7FFF000000000000); /* bits 62..48 (exponent field) all ones */
+ r.high |= (uint64_t)p.sign << 63; /* sign in the top bit */
+
+ return r;
+}
+
+/*----------------------------------------------------------------------------
+| Returns a quiet NaN from a signalling NaN for the floating point value `a'.
+|
+| Each variant unpacks the raw fields, shifts the fraction left by the
+| format's frac_shift (the same shift canonicalize() applies to NaN
+| fractions), calls parts_silence_nan(), then shifts back and repacks.
+| The input class is not checked here; callers are presumably expected to
+| pass a NaN -- NOTE(review): confirm against callers.
+*----------------------------------------------------------------------------*/
+
+float16 float16_silence_nan(float16 a, float_status *status)
+{
+ FloatParts p = float16_unpack_raw(a);
+ p.frac <<= float16_params.frac_shift;
+ p = parts_silence_nan(p, status);
+ p.frac >>= float16_params.frac_shift;
+ return float16_pack_raw(p);
+}
+
+float32 float32_silence_nan(float32 a, float_status *status)
+{ /* Silence `a' via parts_silence_nan() and repack as raw float32. */
+ FloatParts p = float32_unpack_raw(a);
+ p.frac <<= float32_params.frac_shift; /* same shift canonicalize() applies to NaNs */
+ p = parts_silence_nan(p, status);
+ p.frac >>= float32_params.frac_shift; /* back to the raw fraction position */
+ return float32_pack_raw(p);
+}
+
+float64 float64_silence_nan(float64 a, float_status *status)
+{ /* Silence `a' via parts_silence_nan() and repack as raw float64. */
+ FloatParts p = float64_unpack_raw(a);
+ p.frac <<= float64_params.frac_shift; /* same shift canonicalize() applies to NaNs */
+ p = parts_silence_nan(p, status);
+ p.frac >>= float64_params.frac_shift; /* back to the raw fraction position */
+ return float64_pack_raw(p);
+}
/*----------------------------------------------------------------------------
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
@@ -3147,42 +3298,7 @@ float128 uint64_to_float128(uint64_t a, float_status *status)
if (a == 0) {
return float128_zero;
}
- return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
-}
-
-
-
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the single-precision floating-point value
-| `a' to the double-precision floating-point format. The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float64 float32_to_float64(float32 a, float_status *status)
-{
- flag aSign;
- int aExp;
- uint32_t aSig;
- a = float32_squash_input_denormal(a, status);
-
- aSig = extractFloat32Frac( a );
- aExp = extractFloat32Exp( a );
- aSign = extractFloat32Sign( a );
- if ( aExp == 0xFF ) {
- if (aSig) {
- return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
- }
- return packFloat64( aSign, 0x7FF, 0 );
- }
- if ( aExp == 0 ) {
- if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
- normalizeFloat32Subnormal( aSig, &aExp, &aSig );
- --aExp;
- }
- return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
-
+ return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
}
/*----------------------------------------------------------------------------
@@ -3703,173 +3819,6 @@ int float32_unordered_quiet(float32 a, float32 b, float_status *status)
return 0;
}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the double-precision floating-point value
-| `a' to the single-precision floating-point format. The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float32 float64_to_float32(float64 a, float_status *status)
-{
- flag aSign;
- int aExp;
- uint64_t aSig;
- uint32_t zSig;
- a = float64_squash_input_denormal(a, status);
-
- aSig = extractFloat64Frac( a );
- aExp = extractFloat64Exp( a );
- aSign = extractFloat64Sign( a );
- if ( aExp == 0x7FF ) {
- if (aSig) {
- return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
- }
- return packFloat32( aSign, 0xFF, 0 );
- }
- shift64RightJamming( aSig, 22, &aSig );
- zSig = aSig;
- if ( aExp || zSig ) {
- zSig |= 0x40000000;
- aExp -= 0x381;
- }
- return roundAndPackFloat32(aSign, aExp, zSig, status);
-
-}
-
-
-/*----------------------------------------------------------------------------
-| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
-| half-precision floating-point value, returning the result. After being
-| shifted into the proper positions, the three fields are simply added
-| together to form the result. This means that any integer portion of `zSig'
-| will be added into the exponent. Since a properly normalized significand
-| will have an integer portion equal to 1, the `zExp' input should be 1 less
-| than the desired result exponent whenever `zSig' is a complete, normalized
-| significand.
-*----------------------------------------------------------------------------*/
-static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
-{
- return make_float16(
- (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
-}
-
-/*----------------------------------------------------------------------------
-| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
-| and significand `zSig', and returns the proper half-precision floating-
-| point value corresponding to the abstract input. Ordinarily, the abstract
-| value is simply rounded and packed into the half-precision format, with
-| the inexact exception raised if the abstract input cannot be represented
-| exactly. However, if the abstract value is too large, the overflow and
-| inexact exceptions are raised and an infinity or maximal finite value is
-| returned. If the abstract value is too small, the input value is rounded to
-| a subnormal number, and the underflow and inexact exceptions are raised if
-| the abstract input cannot be represented exactly as a subnormal half-
-| precision floating-point number.
-| The `ieee' flag indicates whether to use IEEE standard half precision, or
-| ARM-style "alternative representation", which omits the NaN and Inf
-| encodings in order to raise the maximum representable exponent by one.
-| The input significand `zSig' has its binary point between bits 22
-| and 23, which is 13 bits to the left of the usual location. This shifted
-| significand must be normalized or smaller. If `zSig' is not normalized,
-| `zExp' must be 0; in that case, the result returned is a subnormal number,
-| and it must not require rounding. In the usual case that `zSig' is
-| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
-| Note the slightly odd position of the binary point in zSig compared with the
-| other roundAndPackFloat functions. This should probably be fixed if we
-| need to implement more float16 routines than just conversion.
-| The handling of underflow and overflow follows the IEC/IEEE Standard for
-| Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-static float16 roundAndPackFloat16(flag zSign, int zExp,
- uint32_t zSig, flag ieee,
- float_status *status)
-{
- int maxexp = ieee ? 29 : 30;
- uint32_t mask;
- uint32_t increment;
- bool rounding_bumps_exp;
- bool is_tiny = false;
-
- /* Calculate the mask of bits of the mantissa which are not
- * representable in half-precision and will be lost.
- */
- if (zExp < 1) {
- /* Will be denormal in halfprec */
- mask = 0x00ffffff;
- if (zExp >= -11) {
- mask >>= 11 + zExp;
- }
- } else {
- /* Normal number in halfprec */
- mask = 0x00001fff;
- }
-
- switch (status->float_rounding_mode) {
- case float_round_nearest_even:
- increment = (mask + 1) >> 1;
- if ((zSig & mask) == increment) {
- increment = zSig & (increment << 1);
- }
- break;
- case float_round_ties_away:
- increment = (mask + 1) >> 1;
- break;
- case float_round_up:
- increment = zSign ? 0 : mask;
- break;
- case float_round_down:
- increment = zSign ? mask : 0;
- break;
- default: /* round_to_zero */
- increment = 0;
- break;
- }
-
- rounding_bumps_exp = (zSig + increment >= 0x01000000);
-
- if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
- if (ieee) {
- float_raise(float_flag_overflow | float_flag_inexact, status);
- return packFloat16(zSign, 0x1f, 0);
- } else {
- float_raise(float_flag_invalid, status);
- return packFloat16(zSign, 0x1f, 0x3ff);
- }
- }
-
- if (zExp < 0) {
- /* Note that flush-to-zero does not affect half-precision results */
- is_tiny =
- (status->float_detect_tininess == float_tininess_before_rounding)
- || (zExp < -1)
- || (!rounding_bumps_exp);
- }
- if (zSig & mask) {
- float_raise(float_flag_inexact, status);
- if (is_tiny) {
- float_raise(float_flag_underflow, status);
- }
- }
-
- zSig += increment;
- if (rounding_bumps_exp) {
- zSig >>= 1;
- zExp++;
- }
-
- if (zExp < -10) {
- return packFloat16(zSign, 0, 0);
- }
- if (zExp < 0) {
- zSig >>= -zExp;
- zExp = 0;
- }
- return packFloat16(zSign, zExp, zSig >> 13);
-}
-
/*----------------------------------------------------------------------------
| If `a' is denormal and we are in flush-to-zero mode then set the
| input-denormal exception and return zero. Otherwise just return the value.
@@ -3885,163 +3834,6 @@ float16 float16_squash_input_denormal(float16 a, float_status *status)
return a;
}
-static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
- uint32_t *zSigPtr)
-{
- int8_t shiftCount = countLeadingZeros32(aSig) - 21;
- *zSigPtr = aSig << shiftCount;
- *zExpPtr = 1 - shiftCount;
-}
-
-/* Half precision floats come in two formats: standard IEEE and "ARM" format.
- The latter gains extra exponent range by omitting the NaN/Inf encodings. */
-
-float32 float16_to_float32(float16 a, flag ieee, float_status *status)
-{
- flag aSign;
- int aExp;
- uint32_t aSig;
-
- aSign = extractFloat16Sign(a);
- aExp = extractFloat16Exp(a);
- aSig = extractFloat16Frac(a);
-
- if (aExp == 0x1f && ieee) {
- if (aSig) {
- return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
- }
- return packFloat32(aSign, 0xff, 0);
- }
- if (aExp == 0) {
- if (aSig == 0) {
- return packFloat32(aSign, 0, 0);
- }
-
- normalizeFloat16Subnormal(aSig, &aExp, &aSig);
- aExp--;
- }
- return packFloat32( aSign, aExp + 0x70, aSig << 13);
-}
-
-float16 float32_to_float16(float32 a, flag ieee, float_status *status)
-{
- flag aSign;
- int aExp;
- uint32_t aSig;
-
- a = float32_squash_input_denormal(a, status);
-
- aSig = extractFloat32Frac( a );
- aExp = extractFloat32Exp( a );
- aSign = extractFloat32Sign( a );
- if ( aExp == 0xFF ) {
- if (aSig) {
- /* Input is a NaN */
- if (!ieee) {
- float_raise(float_flag_invalid, status);
- return packFloat16(aSign, 0, 0);
- }
- return commonNaNToFloat16(
- float32ToCommonNaN(a, status), status);
- }
- /* Infinity */
- if (!ieee) {
- float_raise(float_flag_invalid, status);
- return packFloat16(aSign, 0x1f, 0x3ff);
- }
- return packFloat16(aSign, 0x1f, 0);
- }
- if (aExp == 0 && aSig == 0) {
- return packFloat16(aSign, 0, 0);
- }
- /* Decimal point between bits 22 and 23. Note that we add the 1 bit
- * even if the input is denormal; however this is harmless because
- * the largest possible single-precision denormal is still smaller
- * than the smallest representable half-precision denormal, and so we
- * will end up ignoring aSig and returning via the "always return zero"
- * codepath.
- */
- aSig |= 0x00800000;
- aExp -= 0x71;
-
- return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
-}
-
-float64 float16_to_float64(float16 a, flag ieee, float_status *status)
-{
- flag aSign;
- int aExp;
- uint32_t aSig;
-
- aSign = extractFloat16Sign(a);
- aExp = extractFloat16Exp(a);
- aSig = extractFloat16Frac(a);
-
- if (aExp == 0x1f && ieee) {
- if (aSig) {
- return commonNaNToFloat64(
- float16ToCommonNaN(a, status), status);
- }
- return packFloat64(aSign, 0x7ff, 0);
- }
- if (aExp == 0) {
- if (aSig == 0) {
- return packFloat64(aSign, 0, 0);
- }
-
- normalizeFloat16Subnormal(aSig, &aExp, &aSig);
- aExp--;
- }
- return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
-}
-
-float16 float64_to_float16(float64 a, flag ieee, float_status *status)
-{
- flag aSign;
- int aExp;
- uint64_t aSig;
- uint32_t zSig;
-
- a = float64_squash_input_denormal(a, status);
-
- aSig = extractFloat64Frac(a);
- aExp = extractFloat64Exp(a);
- aSign = extractFloat64Sign(a);
- if (aExp == 0x7FF) {
- if (aSig) {
- /* Input is a NaN */
- if (!ieee) {
- float_raise(float_flag_invalid, status);
- return packFloat16(aSign, 0, 0);
- }
- return commonNaNToFloat16(
- float64ToCommonNaN(a, status), status);
- }
- /* Infinity */
- if (!ieee) {
- float_raise(float_flag_invalid, status);
- return packFloat16(aSign, 0x1f, 0x3ff);
- }
- return packFloat16(aSign, 0x1f, 0);
- }
- shift64RightJamming(aSig, 29, &aSig);
- zSig = aSig;
- if (aExp == 0 && zSig == 0) {
- return packFloat16(aSign, 0, 0);
- }
- /* Decimal point between bits 22 and 23. Note that we add the 1 bit
- * even if the input is denormal; however this is harmless because
- * the largest possible single-precision denormal is still smaller
- * than the smallest representable half-precision denormal, and so we
- * will end up ignoring aSig and returning via the "always return zero"
- * codepath.
- */
- zSig |= 0x00800000;
- aExp -= 0x3F1;
-
- return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
-}
-
/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format. The conversion