//===-- A class to store high precision floating point numbers --*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H #include "FEnvImpl.h" #include "FPBits.h" #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" #include "multiply_add.h" #include "rounding_mode.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/big_int.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/types.h" #include namespace LIBC_NAMESPACE_DECL { namespace fputil { // Decide whether to round a UInt up, down or not at all at a given bit // position, based on the current rounding mode. The assumption is that the // caller is going to make the integer `value >> rshift`, and then might need // to round it up by 1 depending on the value of the bits shifted off the // bottom. // // `logical_sign` causes the behavior of FE_DOWNWARD and FE_UPWARD to // be reversed, which is what you'd want if this is the mantissa of a // negative floating-point number. // // Return value is +1 if the value should be rounded up; -1 if it should be // rounded down; 0 if it's exact and needs no rounding. template LIBC_INLINE constexpr int rounding_direction(const LIBC_NAMESPACE::UInt &value, size_t rshift, Sign logical_sign) { if (rshift == 0 || (rshift < Bits && (value << (Bits - rshift)) == 0) || (rshift >= Bits && value == 0)) return 0; // exact switch (quick_get_round()) { case FE_TONEAREST: if (rshift > 0 && rshift <= Bits && value.get_bit(rshift - 1)) { // We round up, unless the value is an exact halfway case and // the bit that will end up in the units place is 0, in which // case tie-break-to-even says round down. bool round_bit = rshift < Bits ? value.get_bit(rshift) : 0; return round_bit != 0 || (value << (Bits - rshift + 1)) != 0 ? +1 : -1; } else { return -1; } case FE_TOWARDZERO: return -1; case FE_DOWNWARD: return logical_sign.is_neg() && (rshift < Bits && (value << (Bits - rshift)) != 0) ? +1 : -1; case FE_UPWARD: return logical_sign.is_pos() && (rshift < Bits && (value << (Bits - rshift)) != 0) ? +1 : -1; default: __builtin_unreachable(); } } // A generic class to perform computations of high precision floating points. // We store the value in dyadic format, including 3 fields: // sign : boolean value - false means positive, true means negative // exponent: the exponent value of the least significant bit of the mantissa. // mantissa: unsigned integer of length `Bits`. // So the real value that is stored is: // real value = (-1)^sign * 2^exponent * (mantissa as unsigned integer) // The stored data is normal if for non-zero mantissa, the leading bit is 1. // The outputs of the constructors and most functions will be normalized. // To simplify and improve the efficiency, many functions will assume that the // inputs are normal. template struct DyadicFloat { using MantissaType = LIBC_NAMESPACE::UInt; Sign sign = Sign::POS; int exponent = 0; MantissaType mantissa = MantissaType(0); LIBC_INLINE constexpr DyadicFloat() = default; template , int> = 0> LIBC_INLINE constexpr DyadicFloat(T x) { static_assert(FPBits::FRACTION_LEN < Bits); FPBits x_bits(x); sign = x_bits.sign(); exponent = x_bits.get_explicit_exponent() - FPBits::FRACTION_LEN; mantissa = MantissaType(x_bits.get_explicit_mantissa()); normalize(); } LIBC_INLINE constexpr DyadicFloat(Sign s, int e, const MantissaType &m) : sign(s), exponent(e), mantissa(m) { normalize(); } // Normalizing the mantissa, bringing the leading 1 bit to the most // significant bit. LIBC_INLINE constexpr DyadicFloat &normalize() { if (!mantissa.is_zero()) { int shift_length = cpp::countl_zero(mantissa); exponent -= shift_length; mantissa <<= static_cast(shift_length); } return *this; } // Used for aligning exponents. Output might not be normalized. LIBC_INLINE constexpr DyadicFloat &shift_left(unsigned shift_length) { if (shift_length < Bits) { exponent -= static_cast(shift_length); mantissa <<= shift_length; } else { exponent = 0; mantissa = MantissaType(0); } return *this; } // Used for aligning exponents. Output might not be normalized. LIBC_INLINE constexpr DyadicFloat &shift_right(unsigned shift_length) { if (shift_length < Bits) { exponent += static_cast(shift_length); mantissa >>= shift_length; } else { exponent = 0; mantissa = MantissaType(0); } return *this; } // Assume that it is already normalized. Output the unbiased exponent. LIBC_INLINE constexpr int get_unbiased_exponent() const { return exponent + (Bits - 1); } // Produce a correctly rounded DyadicFloat from a too-large mantissa, // by shifting it down and rounding if necessary. template LIBC_INLINE constexpr static DyadicFloat round(Sign result_sign, int result_exponent, const LIBC_NAMESPACE::UInt &input_mantissa, size_t rshift) { MantissaType result_mantissa(input_mantissa >> rshift); if (rounding_direction(input_mantissa, rshift, result_sign) > 0) { ++result_mantissa; if (result_mantissa == 0) { // Rounding up made the mantissa integer wrap round to 0, // carrying a bit off the top. So we've rounded up to the next // exponent. result_mantissa.set_bit(Bits - 1); ++result_exponent; } } return DyadicFloat(result_sign, result_exponent, result_mantissa); } template LIBC_INLINE constexpr cpp::enable_if_t< cpp::is_floating_point_v && (FPBits::FRACTION_LEN < Bits), T> generic_as() const { using FPBits = FPBits; using StorageType = typename FPBits::StorageType; constexpr int EXTRA_FRACTION_LEN = Bits - 1 - FPBits::FRACTION_LEN; if (mantissa == 0) return FPBits::zero(sign).get_val(); int unbiased_exp = get_unbiased_exponent(); if (unbiased_exp + FPBits::EXP_BIAS >= FPBits::MAX_BIASED_EXPONENT) { if constexpr (ShouldSignalExceptions) { set_errno_if_required(ERANGE); raise_except_if_required(FE_OVERFLOW | FE_INEXACT); } switch (quick_get_round()) { case FE_TONEAREST: return FPBits::inf(sign).get_val(); case FE_TOWARDZERO: return FPBits::max_normal(sign).get_val(); case FE_DOWNWARD: if (sign.is_pos()) return FPBits::max_normal(Sign::POS).get_val(); return FPBits::inf(Sign::NEG).get_val(); case FE_UPWARD: if (sign.is_neg()) return FPBits::max_normal(Sign::NEG).get_val(); return FPBits::inf(Sign::POS).get_val(); default: __builtin_unreachable(); } } StorageType out_biased_exp = 0; StorageType out_mantissa = 0; bool round = false; bool sticky = false; bool underflow = false; if (unbiased_exp < -FPBits::EXP_BIAS - FPBits::FRACTION_LEN) { sticky = true; underflow = true; } else if (unbiased_exp == -FPBits::EXP_BIAS - FPBits::FRACTION_LEN) { round = true; MantissaType sticky_mask = (MantissaType(1) << (Bits - 1)) - 1; sticky = (mantissa & sticky_mask) != 0; } else { int extra_fraction_len = EXTRA_FRACTION_LEN; if (unbiased_exp < 1 - FPBits::EXP_BIAS) { underflow = true; extra_fraction_len += 1 - FPBits::EXP_BIAS - unbiased_exp; } else { out_biased_exp = static_cast(unbiased_exp + FPBits::EXP_BIAS); } MantissaType round_mask = MantissaType(1) << (extra_fraction_len - 1); round = (mantissa & round_mask) != 0; MantissaType sticky_mask = round_mask - 1; sticky = (mantissa & sticky_mask) != 0; out_mantissa = static_cast(mantissa >> extra_fraction_len); } bool lsb = (out_mantissa & 1) != 0; StorageType result = FPBits::create_value(sign, out_biased_exp, out_mantissa).uintval(); switch (quick_get_round()) { case FE_TONEAREST: if (round && (lsb || sticky)) ++result; break; case FE_DOWNWARD: if (sign.is_neg() && (round || sticky)) ++result; break; case FE_UPWARD: if (sign.is_pos() && (round || sticky)) ++result; break; default: break; } if (ShouldSignalExceptions && (round || sticky)) { int excepts = FE_INEXACT; if (FPBits(result).is_inf()) { set_errno_if_required(ERANGE); excepts |= FE_OVERFLOW; } else if (underflow) { set_errno_if_required(ERANGE); excepts |= FE_UNDERFLOW; } raise_except_if_required(excepts); } return FPBits(result).get_val(); } template && (FPBits::FRACTION_LEN < Bits), void>> LIBC_INLINE constexpr T fast_as() const { if (LIBC_UNLIKELY(mantissa.is_zero())) return FPBits::zero(sign).get_val(); // Assume that it is normalized, and output is also normal. constexpr uint32_t PRECISION = FPBits::FRACTION_LEN + 1; using output_bits_t = typename FPBits::StorageType; constexpr output_bits_t IMPLICIT_MASK = FPBits::SIG_MASK - FPBits::FRACTION_MASK; int exp_hi = exponent + static_cast((Bits - 1) + FPBits::EXP_BIAS); if (LIBC_UNLIKELY(exp_hi > 2 * FPBits::EXP_BIAS)) { // Results overflow. T d_hi = FPBits::create_value(sign, 2 * FPBits::EXP_BIAS, IMPLICIT_MASK) .get_val(); // volatile prevents constant propagation that would result in infinity // always being returned no matter the current rounding mode. volatile T two = static_cast(2.0); T r = two * d_hi; // TODO: Whether rounding down the absolute value to max_normal should // also raise FE_OVERFLOW and set ERANGE is debatable. if (ShouldSignalExceptions && FPBits(r).is_inf()) set_errno_if_required(ERANGE); return r; } bool denorm = false; uint32_t shift = Bits - PRECISION; if (LIBC_UNLIKELY(exp_hi <= 0)) { // Output is denormal. denorm = true; shift = (Bits - PRECISION) + static_cast(1 - exp_hi); exp_hi = FPBits::EXP_BIAS; } int exp_lo = exp_hi - static_cast(PRECISION) - 1; MantissaType m_hi = shift >= MantissaType::BITS ? MantissaType(0) : mantissa >> shift; T d_hi = FPBits::create_value( sign, static_cast(exp_hi), (static_cast(m_hi) & FPBits::SIG_MASK) | IMPLICIT_MASK) .get_val(); MantissaType round_mask = shift - 1 >= MantissaType::BITS ? 0 : MantissaType(1) << (shift - 1); MantissaType sticky_mask = round_mask - MantissaType(1); bool round_bit = !(mantissa & round_mask).is_zero(); bool sticky_bit = !(mantissa & sticky_mask).is_zero(); int round_and_sticky = int(round_bit) * 2 + int(sticky_bit); T d_lo; if (LIBC_UNLIKELY(exp_lo <= 0)) { // d_lo is denormal, but the output is normal. int scale_up_exponent = 1 - exp_lo; T scale_up_factor = FPBits::create_value(Sign::POS, static_cast( FPBits::EXP_BIAS + scale_up_exponent), IMPLICIT_MASK) .get_val(); T scale_down_factor = FPBits::create_value(Sign::POS, static_cast( FPBits::EXP_BIAS - scale_up_exponent), IMPLICIT_MASK) .get_val(); d_lo = FPBits::create_value( sign, static_cast(exp_lo + scale_up_exponent), IMPLICIT_MASK) .get_val(); return multiply_add(d_lo, T(round_and_sticky), d_hi * scale_up_factor) * scale_down_factor; } d_lo = FPBits::create_value(sign, static_cast(exp_lo), IMPLICIT_MASK) .get_val(); // Still correct without FMA instructions if `d_lo` is not underflow. T r = multiply_add(d_lo, T(round_and_sticky), d_hi); if (LIBC_UNLIKELY(denorm)) { // Exponent before rounding is in denormal range, simply clear the // exponent field. output_bits_t clear_exp = static_cast( output_bits_t(exp_hi) << FPBits::SIG_LEN); output_bits_t r_bits = FPBits(r).uintval() - clear_exp; if (!(r_bits & FPBits::EXP_MASK)) { // Output is denormal after rounding, clear the implicit bit for 80-bit // long double. r_bits -= IMPLICIT_MASK; // TODO: IEEE Std 754-2019 lets implementers choose whether to check for // "tininess" before or after rounding for base-2 formats, as long as // the same choice is made for all operations. Our choice to check after // rounding might not be the same as the hardware's. if (ShouldSignalExceptions && round_and_sticky) { set_errno_if_required(ERANGE); raise_except_if_required(FE_UNDERFLOW); } } return FPBits(r_bits).get_val(); } return r; } // Assume that it is already normalized. // Output is rounded correctly with respect to the current rounding mode. template && (FPBits::FRACTION_LEN < Bits), void>> LIBC_INLINE constexpr T as() const { if constexpr (cpp::is_same_v #if defined(LIBC_TYPES_HAS_FLOAT16) && !defined(__LIBC_USE_FLOAT16_CONVERSION) || cpp::is_same_v #endif ) return generic_as(); else return fast_as(); } template && (FPBits::FRACTION_LEN < Bits), void>> LIBC_INLINE explicit constexpr operator T() const { return as(); } LIBC_INLINE constexpr MantissaType as_mantissa_type() const { if (mantissa.is_zero()) return 0; MantissaType new_mant = mantissa; if (exponent > 0) { new_mant <<= exponent; } else { // Cast the exponent to size_t before negating it, rather than after, // to avoid undefined behavior negating INT_MIN as an integer (although // exponents coming in to this function _shouldn't_ be that large). The // result should always end up as a positive size_t. size_t shift = -static_cast(exponent); new_mant >>= shift; } if (sign.is_neg()) { new_mant = (~new_mant) + 1; } return new_mant; } LIBC_INLINE constexpr MantissaType as_mantissa_type_rounded(int *round_dir_out = nullptr) const { int round_dir = 0; MantissaType new_mant; if (mantissa.is_zero()) { new_mant = 0; } else { new_mant = mantissa; if (exponent > 0) { new_mant <<= exponent; } else if (exponent < 0) { // Cast the exponent to size_t before negating it, rather than after, // to avoid undefined behavior negating INT_MIN as an integer (although // exponents coming in to this function _shouldn't_ be that large). The // result should always end up as a positive size_t. size_t shift = -static_cast(exponent); if (shift >= Bits) new_mant = 0; else new_mant >>= shift; round_dir = rounding_direction(mantissa, shift, sign); if (round_dir > 0) ++new_mant; } if (sign.is_neg()) { new_mant = (~new_mant) + 1; } } if (round_dir_out) *round_dir_out = round_dir; return new_mant; } LIBC_INLINE constexpr DyadicFloat operator-() const { return DyadicFloat(sign.negate(), exponent, mantissa); } }; // Quick add - Add 2 dyadic floats with rounding toward 0 and then normalize the // output: // - Align the exponents so that: // new a.exponent = new b.exponent = max(a.exponent, b.exponent) // - Add or subtract the mantissas depending on the signs. // - Normalize the result. // The absolute errors compared to the mathematical sum is bounded by: // | quick_add(a, b) - (a + b) | < MSB(a + b) * 2^(-Bits + 2), // i.e., errors are up to 2 ULPs. // Assume inputs are normalized (by constructors or other functions) so that we // don't need to normalize the inputs again in this function. If the inputs are // not normalized, the results might lose precision significantly. template LIBC_INLINE constexpr DyadicFloat quick_add(DyadicFloat a, DyadicFloat b) { if (LIBC_UNLIKELY(a.mantissa.is_zero())) return b; if (LIBC_UNLIKELY(b.mantissa.is_zero())) return a; // Align exponents if (a.exponent > b.exponent) b.shift_right(static_cast(a.exponent - b.exponent)); else if (b.exponent > a.exponent) a.shift_right(static_cast(b.exponent - a.exponent)); DyadicFloat result; if (a.sign == b.sign) { // Addition result.sign = a.sign; result.exponent = a.exponent; result.mantissa = a.mantissa; if (result.mantissa.add_overflow(b.mantissa)) { // Mantissa addition overflow. result.shift_right(1); result.mantissa.val[DyadicFloat::MantissaType::WORD_COUNT - 1] |= (uint64_t(1) << 63); } // Result is already normalized. return result; } // Subtraction if (a.mantissa >= b.mantissa) { result.sign = a.sign; result.exponent = a.exponent; result.mantissa = a.mantissa - b.mantissa; } else { result.sign = b.sign; result.exponent = b.exponent; result.mantissa = b.mantissa - a.mantissa; } return result.normalize(); } template LIBC_INLINE constexpr DyadicFloat quick_sub(DyadicFloat a, DyadicFloat b) { return quick_add(a, -b); } // Quick Mul - Slightly less accurate but efficient multiplication of 2 dyadic // floats with rounding toward 0 and then normalize the output: // result.exponent = a.exponent + b.exponent + Bits, // result.mantissa = quick_mul_hi(a.mantissa + b.mantissa) // ~ (full product a.mantissa * b.mantissa) >> Bits. // The errors compared to the mathematical product is bounded by: // 2 * errors of quick_mul_hi = 2 * (UInt::WORD_COUNT - 1) in ULPs. // Assume inputs are normalized (by constructors or other functions) so that we // don't need to normalize the inputs again in this function. If the inputs are // not normalized, the results might lose precision significantly. template LIBC_INLINE constexpr DyadicFloat quick_mul(const DyadicFloat &a, const DyadicFloat &b) { DyadicFloat result; result.sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS; result.exponent = a.exponent + b.exponent + static_cast(Bits); if (!(a.mantissa.is_zero() || b.mantissa.is_zero())) { result.mantissa = a.mantissa.quick_mul_hi(b.mantissa); // Check the leading bit directly, should be faster than using clz in // normalize(). if (result.mantissa.val[DyadicFloat::MantissaType::WORD_COUNT - 1] >> (DyadicFloat::MantissaType::WORD_SIZE - 1) == 0) result.shift_left(1); } else { result.mantissa = (typename DyadicFloat::MantissaType)(0); } return result; } // Correctly rounded multiplication of 2 dyadic floats, assuming the // exponent remains within range. template LIBC_INLINE constexpr DyadicFloat rounded_mul(const DyadicFloat &a, const DyadicFloat &b) { using DblMant = LIBC_NAMESPACE::UInt<(2 * Bits)>; Sign result_sign = (a.sign != b.sign) ? Sign::NEG : Sign::POS; int result_exponent = a.exponent + b.exponent + static_cast(Bits); auto product = DblMant(a.mantissa) * DblMant(b.mantissa); // As in quick_mul(), renormalize by 1 bit manually rather than countl_zero if (product.get_bit(2 * Bits - 1) == 0) { product <<= 1; result_exponent -= 1; } return DyadicFloat::round(result_sign, result_exponent, product, Bits); } // Approximate reciprocal - given a nonzero a, make a good approximation to 1/a. // The method is Newton-Raphson iteration, based on quick_mul. template = 32)>> LIBC_INLINE constexpr DyadicFloat approx_reciprocal(const DyadicFloat &a) { // Given an approximation x to 1/a, a better one is x' = x(2-ax). // // You can derive this by using the Newton-Raphson formula with the function // f(x) = 1/x - a. But another way to see that it works is to say: suppose // that ax = 1-e for some small error e. Then ax' = ax(2-ax) = (1-e)(1+e) = // 1-e^2. So the error in x' is the square of the error in x, i.e. the number // of correct bits in x' is double the number in x. // An initial approximation to the reciprocal DyadicFloat x(Sign::POS, -32 - a.exponent - int(Bits), uint64_t(0xFFFFFFFFFFFFFFFF) / static_cast(a.mantissa >> (Bits - 32))); // The constant 2, which we'll need in every iteration DyadicFloat two(Sign::POS, 1, 1); // We expect at least 31 correct bits from our 32-bit starting approximation size_t ok_bits = 31; // The number of good bits doubles in each iteration, except that rounding // errors introduce a little extra each time. Subtract a bit from our // accuracy assessment to account for that. while (ok_bits < Bits) { x = quick_mul(x, quick_sub(two, quick_mul(a, x))); ok_bits = 2 * ok_bits - 1; } return x; } // Correctly rounded division of 2 dyadic floats, assuming the // exponent remains within range. template LIBC_INLINE constexpr DyadicFloat rounded_div(const DyadicFloat &af, const DyadicFloat &bf) { using DblMant = LIBC_NAMESPACE::UInt<(Bits * 2 + 64)>; // Make an approximation to the quotient as a * (1/b). Both the // multiplication and the reciprocal are a bit sloppy, which doesn't // matter, because we're going to correct for that below. auto qf = fputil::quick_mul(af, fputil::approx_reciprocal(bf)); // Switch to BigInt and stop using quick_add and quick_mul: now // we're working in exact integers so as to get the true remainder. DblMant a = af.mantissa, b = bf.mantissa, q = qf.mantissa; q <<= 2; // leave room for a round bit, even if exponent decreases a <<= af.exponent - bf.exponent - qf.exponent + 2; DblMant qb = q * b; if (qb < a) { DblMant too_small = a - b; while (qb <= too_small) { qb += b; ++q; } } else { while (qb > a) { qb -= b; --q; } } DyadicFloat<(Bits * 2)> qbig(qf.sign, qf.exponent - 2, q); return DyadicFloat::round(qbig.sign, qbig.exponent + Bits, qbig.mantissa, Bits); } // Simple polynomial approximation. template LIBC_INLINE constexpr DyadicFloat multiply_add(const DyadicFloat &a, const DyadicFloat &b, const DyadicFloat &c) { return quick_add(c, quick_mul(a, b)); } // Simple exponentiation implementation for printf. Only handles positive // exponents, since division isn't implemented. template LIBC_INLINE constexpr DyadicFloat pow_n(const DyadicFloat &a, uint32_t power) { DyadicFloat result = 1.0; DyadicFloat cur_power = a; while (power > 0) { if ((power % 2) > 0) { result = quick_mul(result, cur_power); } power = power >> 1; cur_power = quick_mul(cur_power, cur_power); } return result; } template LIBC_INLINE constexpr DyadicFloat mul_pow_2(const DyadicFloat &a, int32_t pow_2) { DyadicFloat result = a; result.exponent += pow_2; return result; } } // namespace fputil } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_DYADIC_FLOAT_H