aboutsummaryrefslogtreecommitdiff
path: root/libc/utils/FPUtil/NormalFloat.h
diff options
context:
space:
mode:
Diffstat (limited to 'libc/utils/FPUtil/NormalFloat.h')
-rw-r--r--libc/utils/FPUtil/NormalFloat.h228
1 files changed, 228 insertions, 0 deletions
diff --git a/libc/utils/FPUtil/NormalFloat.h b/libc/utils/FPUtil/NormalFloat.h
new file mode 100644
index 0000000..e0e6911
--- /dev/null
+++ b/libc/utils/FPUtil/NormalFloat.h
@@ -0,0 +1,228 @@
+//===-- A class to store a normalized floating point number -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_UTILS_FPUTIL_NORMAL_FLOAT_H
+#define LLVM_LIBC_UTILS_FPUTIL_NORMAL_FLOAT_H
+
+#include "FPBits.h"
+
+#include "utils/CPP/TypeTraits.h"
+
+#include <stdint.h>
+
+namespace __llvm_libc {
+namespace fputil {
+
+// A class which stores the normalized form of a floating point value.
+// The special IEEE-754 bits patterns of Zero, infinity and NaNs are
+// are not handled by this class.
+//
+// A normalized floating point number is of this form:
+// (-1)*sign * 2^exponent * <mantissa>
+// where <mantissa> is of the form 1.<...>.
+template <typename T> struct NormalFloat {
+ static_assert(
+ cpp::IsFloatingPointType<T>::Value,
+ "NormalFloat template parameter has to be a floating point type.");
+
+ using UIntType = typename FPBits<T>::UIntType;
+ static constexpr UIntType one = (UIntType(1) << MantissaWidth<T>::value);
+
+ // Unbiased exponent value.
+ int32_t exponent;
+
+ UIntType mantissa;
+ // We want |UIntType| to have atleast one bit more than the actual mantissa
+ // bit width to accommodate the implicit 1 value.
+ static_assert(sizeof(UIntType) * 8 >= MantissaWidth<T>::value + 1,
+ "Bad type for mantissa in NormalFloat.");
+
+ bool sign;
+
+ NormalFloat(int32_t e, UIntType m, bool s)
+ : exponent(e), mantissa(m), sign(s) {
+ if (mantissa >= one)
+ return;
+
+ unsigned normalizationShift = evaluateNormalizationShift(mantissa);
+ mantissa = mantissa << normalizationShift;
+ exponent -= normalizationShift;
+ }
+
+ explicit NormalFloat(T x) { initFromBits(FPBits<T>(x)); }
+
+ explicit NormalFloat(FPBits<T> bits) { initFromBits(bits); }
+
+ // Compares this normalized number with another normalized number.
+ // Returns -1 is this number is less than |other|, 0 if this number is equal
+ // to |other|, and 1 if this number is greater than |other|.
+ int cmp(const NormalFloat<T> &other) const {
+ if (sign != other.sign)
+ return sign ? -1 : 1;
+
+ if (exponent > other.exponent) {
+ return sign ? -1 : 1;
+ } else if (exponent == other.exponent) {
+ if (mantissa > other.mantissa)
+ return sign ? -1 : 1;
+ else if (mantissa == other.mantissa)
+ return 0;
+ else
+ return sign ? 1 : -1;
+ } else {
+ return sign ? 1 : -1;
+ }
+ }
+
+ // Returns a new normalized floating point number which is equal in value
+ // to this number multiplied by 2^e. That is:
+ // new = this * 2^e
+ NormalFloat<T> mul2(int e) const {
+ NormalFloat<T> result = *this;
+ result.exponent += e;
+ return result;
+ }
+
+ operator T() const {
+ int biasedExponent = exponent + FPBits<T>::exponentBias;
+ // Max exponent is of the form 0xFF...E. That is why -2 and not -1.
+ constexpr int maxExponentValue = (1 << ExponentWidth<T>::value) - 2;
+ if (biasedExponent > maxExponentValue) {
+ // TODO: Should infinity with the correct sign be returned?
+ return FPBits<T>::buildNaN(1);
+ }
+
+ FPBits<T> result(T(0.0));
+
+ constexpr int subnormalExponent = -FPBits<T>::exponentBias + 1;
+ if (exponent < subnormalExponent) {
+ unsigned shift = subnormalExponent - exponent;
+ if (shift <= MantissaWidth<T>::value) {
+ // Generate a subnormal number. Might lead to loss of precision.
+ result.exponent = 0;
+ result.mantissa = mantissa >> shift;
+ result.sign = sign;
+ return result;
+ } else {
+ // TODO: Should zero with the correct sign be returned?
+ return FPBits<T>::buildNaN(1);
+ }
+ }
+
+ result.exponent = exponent + FPBits<T>::exponentBias;
+ result.mantissa = mantissa;
+ result.sign = sign;
+ return result;
+ }
+
+private:
+ void initFromBits(FPBits<T> bits) {
+ sign = bits.sign;
+
+ if (bits.isInfOrNaN() || bits.isZero()) {
+ // Ignore special bit patterns. Implementations deal with them separately
+ // anyway so this should not be a problem.
+ exponent = 0;
+ mantissa = 0;
+ return;
+ }
+
+ // Normalize subnormal numbers.
+ if (bits.exponent == 0) {
+ unsigned shift = evaluateNormalizationShift(bits.mantissa);
+ mantissa = UIntType(bits.mantissa) << shift;
+ exponent = 1 - FPBits<T>::exponentBias - shift;
+ } else {
+ exponent = bits.exponent - FPBits<T>::exponentBias;
+ mantissa = one | bits.mantissa;
+ }
+ }
+
+ unsigned evaluateNormalizationShift(UIntType m) {
+ unsigned shift = 0;
+ for (; (one & m) == 0 && (shift < MantissaWidth<T>::value);
+ m <<= 1, ++shift)
+ ;
+ return shift;
+ }
+};
+
+#if defined(__x86_64__) || defined(__i386__)
+template <>
+inline void NormalFloat<long double>::initFromBits(FPBits<long double> bits) {
+ sign = bits.sign;
+
+ if (bits.isInfOrNaN() || bits.isZero()) {
+ // Ignore special bit patterns. Implementations deal with them separately
+ // anyway so this should not be a problem.
+ exponent = 0;
+ mantissa = 0;
+ return;
+ }
+
+ if (bits.exponent == 0) {
+ if (bits.implicitBit == 0) {
+ // Since we ignore zero value, the mantissa in this case is non-zero.
+ int normalizationShift = evaluateNormalizationShift(bits.mantissa);
+ exponent = -16382 - normalizationShift;
+ mantissa = (bits.mantissa << normalizationShift);
+ } else {
+ exponent = -16382;
+ mantissa = one | bits.mantissa;
+ }
+ } else {
+ if (bits.implicitBit == 0) {
+ // Invalid number so just store 0 similar to a NaN.
+ exponent = 0;
+ mantissa = 0;
+ } else {
+ exponent = bits.exponent - 16383;
+ mantissa = one | bits.mantissa;
+ }
+ }
+}
+
+template <> inline NormalFloat<long double>::operator long double() const {
+ int biasedExponent = exponent + FPBits<long double>::exponentBias;
+ // Max exponent is of the form 0xFF...E. That is why -2 and not -1.
+ constexpr int maxExponentValue = (1 << ExponentWidth<long double>::value) - 2;
+ if (biasedExponent > maxExponentValue) {
+ // TODO: Should infinity with the correct sign be returned?
+ return FPBits<long double>::buildNaN(1);
+ }
+
+ FPBits<long double> result(0.0l);
+
+ constexpr int subnormalExponent = -FPBits<long double>::exponentBias + 1;
+ if (exponent < subnormalExponent) {
+ unsigned shift = subnormalExponent - exponent;
+ if (shift <= MantissaWidth<long double>::value) {
+ // Generate a subnormal number. Might lead to loss of precision.
+ result.exponent = 0;
+ result.mantissa = mantissa >> shift;
+ result.implicitBit = 0;
+ result.sign = sign;
+ return result;
+ } else {
+ // TODO: Should zero with the correct sign be returned?
+ return FPBits<long double>::buildNaN(1);
+ }
+ }
+
+ result.exponent = biasedExponent;
+ result.mantissa = mantissa;
+ result.implicitBit = 1;
+ result.sign = sign;
+ return result;
+}
+#endif
+
+} // namespace fputil
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_UTILS_FPUTIL_NORMAL_FLOAT_H