diff options
author | Krzysztof Drewniak <Krzysztof.Drewniak@amd.com> | 2023-01-10 21:18:10 +0000 |
---|---|---|
committer | Krzysztof Drewniak <Krzysztof.Drewniak@amd.com> | 2023-02-09 22:08:00 +0000 |
commit | 6109e70c72fc5171d25c4467fc3cfe6eb2029f50 (patch) | |
tree | 82d5ba119cc763aeab3df49113240abe0d851938 /llvm/lib/Support/APFloat.cpp | |
parent | 848c700b66f569adc893518c37d500f80a5412e2 (diff) | |
download | llvm-6109e70c72fc5171d25c4467fc3cfe6eb2029f50.zip llvm-6109e70c72fc5171d25c4467fc3cfe6eb2029f50.tar.gz llvm-6109e70c72fc5171d25c4467fc3cfe6eb2029f50.tar.bz2 |
[llvm][APFloat] Add NaN-in-negative-zero formats by AMD and GraphCore
AMD, GraphCore, and Qualcomm have published a standard for 8-bit floats that
differs from the 8-bit floats defined by Nvidia, Intel, and ARM. This
commit adds support for these alternate 8-bit floats to APFloat in
order to enable their usage in MLIR. These formats are presented in
the paper at https://arxiv.org/abs/2206.02915 and are implemented in
GraphCore hardware whose ISA is available at
https://docs.graphcore.ai/projects/isa-mk2-with-fp8/en/latest/_static/TileVertexISA-IPU21-1.3.1.pdf .
In these formats, like the existing Float8E4M3FN, there are no
infinity values and there is only one NaN. Unlike in that format,
however, the NaN value is 0x80, which would be negative 0 in IEEE
formats. This means that these formats also make 0 unsigned.
To allow for these new variant semantics, this commit adds
fltNanEncoding, which can be IEEE (the default), AllOnes (used by
Float8E4M3FN), or NegativeZero (used by the new formats,
Float8E5M2FNUZ and Float8E4M3FNUZ). Normalization, arithmetic, and
other such routines have been updated to account for the potential
variant semantics.
The two new formats are Float8E5M2FNUZ (5 bits exponent, 2 bits
mantissa, finite, unsigned zero) and Float8E4M3FNUZ (4 bits exponent,
3 bits mantissa, finite, unsigned zero).
Reviewed By: jakeh-gc, reedwm, lattner
Differential Revision: https://reviews.llvm.org/D141863
Diffstat (limited to 'llvm/lib/Support/APFloat.cpp')
-rw-r--r-- | llvm/lib/Support/APFloat.cpp | 353 |
1 files changed, 295 insertions, 58 deletions
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index e9998d7..2e7926b 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/StringExtras.h" @@ -51,49 +52,75 @@ static_assert(APFloatBase::integerPartWidth % 4 == 0, "Part width must be divisi namespace llvm { - // How the nonfinite values Inf and NaN are represented. - enum class fltNonfiniteBehavior { - // Represents standard IEEE 754 behavior. A value is nonfinite if the - // exponent field is all 1s. In such cases, a value is Inf if the - // significand bits are all zero, and NaN otherwise - IEEE754, - - // Only the Float8E5M2 has this behavior. There is no Inf representation. A - // value is NaN if the exponent field and the mantissa field are all 1s. - // This behavior matches the FP8 E4M3 type described in - // https://arxiv.org/abs/2209.05433. We treat both signed and unsigned NaNs - // as non-signalling, although the paper does not state whether the NaN - // values are signalling or not. - NanOnly, - }; +// How the nonfinite values Inf and NaN are represented. +enum class fltNonfiniteBehavior { + // Represents standard IEEE 754 behavior. A value is nonfinite if the + // exponent field is all 1s. In such cases, a value is Inf if the + // significand bits are all zero, and NaN otherwise + IEEE754, + + // This behavior is present in the Float8ExMyFN* types (Float8E4M3FN, + // Float8E5M2FNUZ, and Float8E4M3FNUZ). There is no representation for Inf, + // and operations that would ordinarily produce Inf produce NaN instead. + // The details of the NaN representation(s) in this form are determined by the + // `fltNanEncoding` enum. 
We treat all NaNs as quiet, as the available + // encodings do not distinguish between signalling and quiet NaN. + NanOnly, +}; - /* Represents floating point arithmetic semantics. */ - struct fltSemantics { - /* The largest E such that 2^E is representable; this matches the - definition of IEEE 754. */ - APFloatBase::ExponentType maxExponent; +// How NaN values are represented. This is curently only used in combination +// with fltNonfiniteBehavior::NanOnly, and using a variant other than IEEE +// while having IEEE non-finite behavior is liable to lead to unexpected +// results. +enum class fltNanEncoding { + // Represents the standard IEEE behavior where a value is NaN if its + // exponent is all 1s and the significand is non-zero. + IEEE, + + // Represents the behavior in the Float8E4M3 floating point type where NaN is + // represented by having the exponent and mantissa set to all 1s. + // This behavior matches the FP8 E4M3 type described in + // https://arxiv.org/abs/2209.05433. We treat both signed and unsigned NaNs + // as non-signalling, although the paper does not state whether the NaN + // values are signalling or not. + AllOnes, + + // Represents the behavior in Float8E{5,4}E{2,3}FNUZ floating point types + // where NaN is represented by a sign bit of 1 and all 0s in the exponent + // and mantissa (i.e. the negative zero encoding in a IEEE float). Since + // there is only one NaN value, it is treated as quiet NaN. This matches the + // behavior described in https://arxiv.org/abs/2206.02915 . + NegativeZero, +}; - /* The smallest E such that 2^E is a normalized number; this - matches the definition of IEEE 754. */ - APFloatBase::ExponentType minExponent; +/* Represents floating point arithmetic semantics. */ +struct fltSemantics { + /* The largest E such that 2^E is representable; this matches the + definition of IEEE 754. */ + APFloatBase::ExponentType maxExponent; - /* Number of bits in the significand. This includes the integer - bit. 
*/ - unsigned int precision; + /* The smallest E such that 2^E is a normalized number; this + matches the definition of IEEE 754. */ + APFloatBase::ExponentType minExponent; - /* Number of bits actually used in the semantics. */ - unsigned int sizeInBits; + /* Number of bits in the significand. This includes the integer + bit. */ + unsigned int precision; - fltNonfiniteBehavior nonFiniteBehavior = fltNonfiniteBehavior::IEEE754; + /* Number of bits actually used in the semantics. */ + unsigned int sizeInBits; - // Returns true if any number described by this semantics can be precisely - // represented by the specified semantics. Does not take into account - // the value of fltNonfiniteBehavior. - bool isRepresentableBy(const fltSemantics &S) const { - return maxExponent <= S.maxExponent && minExponent >= S.minExponent && - precision <= S.precision; - } - }; + fltNonfiniteBehavior nonFiniteBehavior = fltNonfiniteBehavior::IEEE754; + + fltNanEncoding nanEncoding = fltNanEncoding::IEEE; + // Returns true if any number described by this semantics can be precisely + // represented by the specified semantics. Does not take into account + // the value of fltNonfiniteBehavior. 
+ bool isRepresentableBy(const fltSemantics &S) const { + return maxExponent <= S.maxExponent && minExponent >= S.minExponent && + precision <= S.precision; + } +}; static const fltSemantics semIEEEhalf = {15, -14, 11, 16}; static const fltSemantics semBFloat = {127, -126, 8, 16}; @@ -101,8 +128,16 @@ namespace llvm { static const fltSemantics semIEEEdouble = {1023, -1022, 53, 64}; static const fltSemantics semIEEEquad = {16383, -16382, 113, 128}; static const fltSemantics semFloat8E5M2 = {15, -14, 3, 8}; - static const fltSemantics semFloat8E4M3FN = {8, -6, 4, 8, - fltNonfiniteBehavior::NanOnly}; + static const fltSemantics semFloat8E5M2FNUZ = {15, + -15, + 3, + 8, + fltNonfiniteBehavior::NanOnly, + fltNanEncoding::NegativeZero}; + static const fltSemantics semFloat8E4M3FN = { + 8, -6, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::AllOnes}; + static const fltSemantics semFloat8E4M3FNUZ = { + 7, -7, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; static const fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80}; static const fltSemantics semBogus = {0, 0, 0, 0}; @@ -160,8 +195,12 @@ namespace llvm { return PPCDoubleDouble(); case S_Float8E5M2: return Float8E5M2(); + case S_Float8E5M2FNUZ: + return Float8E5M2FNUZ(); case S_Float8E4M3FN: return Float8E4M3FN(); + case S_Float8E4M3FNUZ: + return Float8E4M3FNUZ(); case S_x87DoubleExtended: return x87DoubleExtended(); } @@ -184,8 +223,12 @@ namespace llvm { return S_PPCDoubleDouble; else if (&Sem == &llvm::APFloat::Float8E5M2()) return S_Float8E5M2; + else if (&Sem == &llvm::APFloat::Float8E5M2FNUZ()) + return S_Float8E5M2FNUZ; else if (&Sem == &llvm::APFloat::Float8E4M3FN()) return S_Float8E4M3FN; + else if (&Sem == &llvm::APFloat::Float8E4M3FNUZ()) + return S_Float8E4M3FNUZ; else if (&Sem == &llvm::APFloat::x87DoubleExtended()) return S_x87DoubleExtended; else @@ -209,7 +252,13 @@ namespace llvm { return semPPCDoubleDouble; } const fltSemantics &APFloatBase::Float8E5M2() { return 
semFloat8E5M2; } + const fltSemantics &APFloatBase::Float8E5M2FNUZ() { + return semFloat8E5M2FNUZ; + } const fltSemantics &APFloatBase::Float8E4M3FN() { return semFloat8E4M3FN; } + const fltSemantics &APFloatBase::Float8E4M3FNUZ() { + return semFloat8E4M3FNUZ; + } const fltSemantics &APFloatBase::x87DoubleExtended() { return semX87DoubleExtended; } @@ -808,10 +857,15 @@ void IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) { APInt fill_storage; if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) { - // The only NaN representation is where the mantissa is all 1s, which is - // non-signalling. + // Finite-only types do not distinguish signalling and quiet NaN, so + // make them all signalling. SNaN = false; - fill_storage = APInt::getAllOnes(semantics->precision - 1); + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) { + sign = true; + fill_storage = APInt::getZero(semantics->precision - 1); + } else { + fill_storage = APInt::getAllOnes(semantics->precision - 1); + } fill = &fill_storage; } @@ -842,6 +896,9 @@ void IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) { // conventionally, this is the next bit down from the QNaN bit. if (APInt::tcIsZero(significand, numParts)) APInt::tcSetBit(significand, QNaNBit - 1); + } else if (semantics->nanEncoding == fltNanEncoding::NegativeZero) { + // The only NaN is a quiet NaN, and it has no bits sets in the significand. + // Do nothing. } else { // We always have to set the QNaN bit to make it a QNaN. 
APInt::tcSetBit(significand, QNaNBit); @@ -986,7 +1043,8 @@ bool IEEEFloat::isSignificandAllZerosExceptMSB() const { } bool IEEEFloat::isLargest() const { - if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) { + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly && + semantics->nanEncoding == fltNanEncoding::AllOnes) { // The largest number by magnitude in our format will be the floating point // number with maximum exponent and with significand that is all ones except // the LSB. @@ -1428,7 +1486,8 @@ IEEEFloat::opStatus IEEEFloat::handleOverflow(roundingMode rounding_mode) { exponent = semantics->maxExponent; tcSetLeastSignificantBits(significandParts(), partCount(), semantics->precision); - if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly && + semantics->nanEncoding == fltNanEncoding::AllOnes) APInt::tcClearBit(significandParts(), 0); return opInexact; @@ -1529,7 +1588,10 @@ IEEEFloat::opStatus IEEEFloat::normalize(roundingMode rounding_mode, } } + // The all-ones values is an overflow if NaN is all ones. If NaN is + // represented by negative zero, then it is a valid finite value. if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly && + semantics->nanEncoding == fltNanEncoding::AllOnes && exponent == semantics->maxExponent && isSignificandAllOnes()) return handleOverflow(rounding_mode); @@ -1540,8 +1602,11 @@ IEEEFloat::opStatus IEEEFloat::normalize(roundingMode rounding_mode, underflow for exact results. */ if (lost_fraction == lfExactlyZero) { /* Canonicalize zeroes. */ - if (omsb == 0) + if (omsb == 0) { category = fcZero; + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; + } return opOK; } @@ -1559,18 +1624,22 @@ IEEEFloat::opStatus IEEEFloat::normalize(roundingMode rounding_mode, /* Renormalize by incrementing the exponent and shifting our significand right one. 
However if we already have the maximum exponent we overflow to infinity. */ - if (exponent == semantics->maxExponent) { - category = fcInfinity; - - return (opStatus) (opOverflow | opInexact); - } + if (exponent == semantics->maxExponent) + // Invoke overflow handling with a rounding mode that will guarantee + // that the result gets turned into the correct infinity representation. + // This is needed instead of just setting the category to infinity to + // account for 8-bit floating point types that have no inf, only NaN. + return handleOverflow(sign ? rmTowardNegative : rmTowardPositive); shiftSignificandRight(1); return opInexact; } + // The all-ones values is an overflow if NaN is all ones. If NaN is + // represented by negative zero, then it is a valid finite value. if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly && + semantics->nanEncoding == fltNanEncoding::AllOnes && exponent == semantics->maxExponent && isSignificandAllOnes()) return handleOverflow(rounding_mode); } @@ -1584,8 +1653,11 @@ IEEEFloat::opStatus IEEEFloat::normalize(roundingMode rounding_mode, assert(omsb < semantics->precision); /* Canonicalize zeroes. */ - if (omsb == 0) + if (omsb == 0) { category = fcZero; + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; + } /* The fcZero case is a denormal that underflowed to zero. */ return (opStatus) (opUnderflow | opInexact); @@ -1887,6 +1959,11 @@ IEEEFloat::opStatus IEEEFloat::remainderSpecials(const IEEEFloat &rhs) { /* Change sign. */ void IEEEFloat::changeSign() { + // With NaN-as-negative-zero, neither NaN or negative zero can change + // their signs. + if (semantics->nanEncoding == fltNanEncoding::NegativeZero && + (isZero() || isNaN())) + return; /* Look mummy, this one's easy. 
*/ sign = !sign; } @@ -1916,6 +1993,9 @@ IEEEFloat::opStatus IEEEFloat::addOrSubtract(const IEEEFloat &rhs, if (category == fcZero) { if (rhs.category != fcZero || (sign == rhs.sign) == subtract) sign = (rounding_mode == rmTowardNegative); + // NaN-in-negative-zero means zeros need to be normalized to +0. + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; } return fs; @@ -1941,6 +2021,8 @@ IEEEFloat::opStatus IEEEFloat::multiply(const IEEEFloat &rhs, sign ^= rhs.sign; fs = multiplySpecials(rhs); + if (isZero() && semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; if (isFiniteNonZero()) { lostFraction lost_fraction = multiplySignificand(rhs); fs = normalize(rounding_mode, lost_fraction); @@ -1959,6 +2041,8 @@ IEEEFloat::opStatus IEEEFloat::divide(const IEEEFloat &rhs, sign ^= rhs.sign; fs = divideSpecials(rhs); + if (isZero() && semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; if (isFiniteNonZero()) { lostFraction lost_fraction = divideSignificand(rhs); fs = normalize(rounding_mode, lost_fraction); @@ -2067,8 +2151,13 @@ IEEEFloat::opStatus IEEEFloat::remainder(const IEEEFloat &rhs) { } } - if (isZero()) + if (isZero()) { sign = origSign; // IEEE754 requires this + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + // But some 8-bit floats only have positive 0. 
+ sign = false; + } + else sign ^= origSign; return fs; @@ -2093,8 +2182,11 @@ IEEEFloat::opStatus IEEEFloat::mod(const IEEEFloat &rhs) { fs = subtract(V, rmNearestTiesToEven); assert(fs==opOK); } - if (isZero()) + if (isZero()) { sign = origSign; // fmod requires this + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; + } return fs; } @@ -2122,8 +2214,11 @@ IEEEFloat::opStatus IEEEFloat::fusedMultiplyAdd(const IEEEFloat &multiplicand, /* If two numbers add (exactly) to zero, IEEE 754 decrees it is a positive zero unless rounding to minus infinity, except that adding two like-signed zeroes gives that zero. */ - if (category == fcZero && !(fs & opUnderflow) && sign != addend.sign) + if (category == fcZero && !(fs & opUnderflow) && sign != addend.sign) { sign = (rounding_mode == rmTowardNegative); + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; + } } else { fs = multiplySpecials(multiplicand); @@ -2399,6 +2494,12 @@ IEEEFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics, return is_signaling ? opInvalidOp : opOK; } + // If NaN is negative zero, we need to create a new NaN to avoid converting + // NaN to -Inf. + if (fromSemantics.nanEncoding == fltNanEncoding::NegativeZero && + semantics->nanEncoding != fltNanEncoding::NegativeZero) + makeNaN(false, false); + *losesInfo = lostFraction != lfExactlyZero || X86SpecialNan; // For x87 extended precision, we want to make a NaN, not a special NaN if @@ -2420,6 +2521,14 @@ IEEEFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics, makeNaN(false, sign); *losesInfo = true; fs = opInexact; + } else if (category == fcZero && + semantics->nanEncoding == fltNanEncoding::NegativeZero) { + // Negative zero loses info, but positive zero doesn't. + *losesInfo = + fromSemantics.nanEncoding != fltNanEncoding::NegativeZero && sign; + fs = *losesInfo ? 
opInexact : opOK; + // NaN is negative zero means -0 -> +0, which can lose information + sign = false; } else { *losesInfo = false; fs = opOK; @@ -2887,9 +2996,11 @@ IEEEFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode) { if (D.firstSigDigit == str.end() || decDigitValue(*D.firstSigDigit) >= 10U) { category = fcZero; fs = opOK; + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; - /* Check whether the normalized exponent is high enough to overflow - max during the log-rebasing in the max-exponent check below. */ + /* Check whether the normalized exponent is high enough to overflow + max during the log-rebasing in the max-exponent check below. */ } else if (D.normalizedExponent - 1 > INT_MAX / 42039) { fs = handleOverflow(rounding_mode); @@ -3517,6 +3628,33 @@ APInt IEEEFloat::convertFloat8E5M2APFloatToAPInt() const { (mysignificand & 0x3))); } +APInt IEEEFloat::convertFloat8E5M2FNUZAPFloatToAPInt() const { + assert(semantics == (const llvm::fltSemantics *)&semFloat8E5M2FNUZ); + assert(partCount() == 1); + + uint32_t myexponent, mysignificand; + + if (isFiniteNonZero()) { + myexponent = exponent + 16; // bias + mysignificand = (uint32_t)*significandParts(); + if (myexponent == 1 && !(mysignificand & 0x4)) + myexponent = 0; // denormal + } else if (category == fcZero) { + myexponent = 0; + mysignificand = 0; + } else if (category == fcInfinity) { + myexponent = 0; + mysignificand = 0; + } else { + assert(category == fcNaN && "Unknown category!"); + myexponent = 0; + mysignificand = (uint32_t)*significandParts(); + } + + return APInt(8, (((sign & 1) << 7) | ((myexponent & 0x1f) << 2) | + (mysignificand & 0x3))); +} + APInt IEEEFloat::convertFloat8E4M3FNAPFloatToAPInt() const { assert(semantics == (const llvm::fltSemantics *)&semFloat8E4M3FN); assert(partCount() == 1); @@ -3544,6 +3682,33 @@ APInt IEEEFloat::convertFloat8E4M3FNAPFloatToAPInt() const { (mysignificand & 0x7))); } +APInt 
IEEEFloat::convertFloat8E4M3FNUZAPFloatToAPInt() const { + assert(semantics == (const llvm::fltSemantics *)&semFloat8E4M3FNUZ); + assert(partCount() == 1); + + uint32_t myexponent, mysignificand; + + if (isFiniteNonZero()) { + myexponent = exponent + 8; // bias + mysignificand = (uint32_t)*significandParts(); + if (myexponent == 1 && !(mysignificand & 0x8)) + myexponent = 0; // denormal + } else if (category == fcZero) { + myexponent = 0; + mysignificand = 0; + } else if (category == fcInfinity) { + myexponent = 0; + mysignificand = 0; + } else { + assert(category == fcNaN && "Unknown category!"); + myexponent = 0; + mysignificand = (uint32_t)*significandParts(); + } + + return APInt(8, (((sign & 1) << 7) | ((myexponent & 0xf) << 3) | + (mysignificand & 0x7))); +} + // This function creates an APInt that is just a bit map of the floating // point constant as it would appear in memory. It is not a conversion, // and treating the result as a normal integer is unlikely to be useful. @@ -3570,9 +3735,15 @@ APInt IEEEFloat::bitcastToAPInt() const { if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2) return convertFloat8E5M2APFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloat8E5M2FNUZ) + return convertFloat8E5M2FNUZAPFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FN) return convertFloat8E4M3FNAPFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3FNUZ) + return convertFloat8E4M3FNUZAPFloatToAPInt(); + assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended && "unknown format!"); return convertF80LongDoubleAPFloatToAPInt(); @@ -3828,6 +3999,32 @@ void IEEEFloat::initFromFloat8E5M2APInt(const APInt &api) { } } +void IEEEFloat::initFromFloat8E5M2FNUZAPInt(const APInt &api) { + uint32_t i = (uint32_t)*api.getRawData(); + uint32_t myexponent = (i >> 2) & 0x1f; + uint32_t mysignificand = i & 0x3; + + initialize(&semFloat8E5M2FNUZ); + assert(partCount() == 1); + + sign = i >> 7; + 
if (myexponent == 0 && mysignificand == 0 && sign == 0) { + makeZero(sign); + } else if (myexponent == 0 && mysignificand == 0 && sign == 1) { + category = fcNaN; + exponent = exponentNaN(); + *significandParts() = mysignificand; + } else { + category = fcNormal; + exponent = myexponent - 16; // bias + *significandParts() = mysignificand; + if (myexponent == 0) // denormal + exponent = -15; + else + *significandParts() |= 0x4; // integer bit + } +} + void IEEEFloat::initFromFloat8E4M3FNAPInt(const APInt &api) { uint32_t i = (uint32_t)*api.getRawData(); uint32_t myexponent = (i >> 3) & 0xf; @@ -3854,6 +4051,32 @@ void IEEEFloat::initFromFloat8E4M3FNAPInt(const APInt &api) { } } +void IEEEFloat::initFromFloat8E4M3FNUZAPInt(const APInt &api) { + uint32_t i = (uint32_t)*api.getRawData(); + uint32_t myexponent = (i >> 3) & 0xf; + uint32_t mysignificand = i & 0x7; + + initialize(&semFloat8E4M3FNUZ); + assert(partCount() == 1); + + sign = i >> 7; + if (myexponent == 0 && mysignificand == 0 && sign == 0) { + makeZero(sign); + } else if (myexponent == 0 && mysignificand == 0 && sign == 1) { + category = fcNaN; + exponent = exponentNaN(); + *significandParts() = mysignificand; + } else { + category = fcNormal; + exponent = myexponent - 8; // bias + *significandParts() = mysignificand; + if (myexponent == 0) // denormal + exponent = -7; + else + *significandParts() |= 0x8; // integer bit + } +} + /// Treat api as containing the bits of a floating point number. 
void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) { assert(api.getBitWidth() == Sem->sizeInBits); @@ -3873,8 +4096,12 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) { return initFromPPCDoubleDoubleAPInt(api); if (Sem == &semFloat8E5M2) return initFromFloat8E5M2APInt(api); + if (Sem == &semFloat8E5M2FNUZ) + return initFromFloat8E5M2FNUZAPInt(api); if (Sem == &semFloat8E4M3FN) return initFromFloat8E4M3FNAPInt(api); + if (Sem == &semFloat8E4M3FNUZ) + return initFromFloat8E4M3FNUZAPInt(api); llvm_unreachable(nullptr); } @@ -3903,7 +4130,8 @@ void IEEEFloat::makeLargest(bool Negative) { ? (~integerPart(0) >> NumUnusedHighBits) : 0; - if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly && + semantics->nanEncoding == fltNanEncoding::AllOnes) significand[0] &= ~integerPart(1); } @@ -4331,6 +4559,8 @@ IEEEFloat::opStatus IEEEFloat::next(bool nextDown) { APInt::tcSet(significandParts(), 0, partCount()); category = fcZero; exponent = 0; + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + sign = false; break; } @@ -4417,8 +4647,11 @@ IEEEFloat::opStatus IEEEFloat::next(bool nextDown) { } APFloatBase::ExponentType IEEEFloat::exponentNaN() const { - if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) + if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) { + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) + return semantics->minExponent; return semantics->maxExponent; + } return semantics->maxExponent + 1; } @@ -4445,6 +4678,10 @@ void IEEEFloat::makeInf(bool Negative) { void IEEEFloat::makeZero(bool Negative) { category = fcZero; sign = Negative; + if (semantics->nanEncoding == fltNanEncoding::NegativeZero) { + // Merge negative zero to positive because 0b10000...000 is used for NaN + sign = false; + } exponent = exponentZero(); APInt::tcSet(significandParts(), 0, partCount()); } |