diff options
author | Alexander Pivovarov <pivovaa@amazon.com> | 2024-07-30 00:11:10 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-07-30 00:11:10 -0700 |
commit | abc2fe31fc622c4eab3766d739576110eb6f16c3 (patch) | |
tree | d101961650708fab7e30ccfbb34c4219b551abc9 /llvm/lib/Support/APFloat.cpp | |
parent | fcd6bd5587cc376cd8f43b60d1c7d61fdfe0f535 (diff) | |
download | llvm-abc2fe31fc622c4eab3766d739576110eb6f16c3.zip llvm-abc2fe31fc622c4eab3766d739576110eb6f16c3.tar.gz llvm-abc2fe31fc622c4eab3766d739576110eb6f16c3.tar.bz2 |
[APFloat] Add support for f8E3M4 IEEE 754 type (#99698)
This PR adds the `f8E3M4` type to APFloat.
The `f8E3M4` type follows the IEEE 754 convention.
```c
f8E3M4 (IEEE 754)
- Exponent bias: 3
- Maximum stored exponent value: 6 (binary 110)
- Maximum unbiased exponent value: 6 - 3 = 3
- Minimum stored exponent value: 1 (binary 001)
- Minimum unbiased exponent value: 1 − 3 = −2
- Precision specifies the total number of bits used for the significand (mantissa),
including implicit leading integer bit = 4 + 1 = 5
- Follows IEEE 754 conventions for representation of special values
- Has Positive and Negative zero
- Has Positive and Negative infinity
- Has NaNs
Additional details:
- Max exp (unbiased): 3
- Min exp (unbiased): -2
- Infinities (+/-): S.111.0000
- Zeros (+/-): S.000.0000
- NaNs: S.111.{0,1}⁴ except S.111.0000
- Max normal number: S.110.1111 = +/-2^(6-3) x (1 + 15/16) = +/-2^3 x 31 x 2^(-4) = +/-15.5
- Min normal number: S.001.0000 = +/-2^(1-3) x (1 + 0) = +/-2^(-2)
- Max subnormal number: S.000.1111 = +/-2^(-2) x 15/16 = +/-2^(-2) x 15 x 2^(-4) = +/-15 x 2^(-6)
- Min subnormal number: S.000.0001 = +/-2^(-2) x 1/16 = +/-2^(-2) x 2^(-4) = +/-2^(-6)
```
Related PRs:
- [PR-97179](https://github.com/llvm/llvm-project/pull/97179) [APFloat]
Add support for f8E4M3 IEEE 754 type
Diffstat (limited to 'llvm/lib/Support/APFloat.cpp')
-rw-r--r-- | llvm/lib/Support/APFloat.cpp | 20 |
1 files changed, 20 insertions, 0 deletions
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index 26b4f8e..7f68c5a 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -143,6 +143,7 @@ static constexpr fltSemantics semFloat8E4M3FNUZ = { 7, -7, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; static constexpr fltSemantics semFloat8E4M3B11FNUZ = { 4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero}; +static constexpr fltSemantics semFloat8E3M4 = {3, -2, 5, 8}; static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19}; static constexpr fltSemantics semFloat6E3M2FN = { 4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly}; @@ -217,6 +218,8 @@ const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) { return Float8E4M3FNUZ(); case S_Float8E4M3B11FNUZ: return Float8E4M3B11FNUZ(); + case S_Float8E3M4: + return Float8E3M4(); case S_FloatTF32: return FloatTF32(); case S_Float6E3M2FN: @@ -257,6 +260,8 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) { return S_Float8E4M3FNUZ; else if (&Sem == &llvm::APFloat::Float8E4M3B11FNUZ()) return S_Float8E4M3B11FNUZ; + else if (&Sem == &llvm::APFloat::Float8E3M4()) + return S_Float8E3M4; else if (&Sem == &llvm::APFloat::FloatTF32()) return S_FloatTF32; else if (&Sem == &llvm::APFloat::Float6E3M2FN()) @@ -287,6 +292,7 @@ const fltSemantics &APFloatBase::Float8E4M3FNUZ() { return semFloat8E4M3FNUZ; } const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() { return semFloat8E4M3B11FNUZ; } +const fltSemantics &APFloatBase::Float8E3M4() { return semFloat8E3M4; } const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; } const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; } const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; } @@ -3643,6 +3649,11 @@ APInt IEEEFloat::convertFloat8E4M3B11FNUZAPFloatToAPInt() const { return convertIEEEFloatToAPInt<semFloat8E4M3B11FNUZ>(); } +APInt 
IEEEFloat::convertFloat8E3M4APFloatToAPInt() const { + assert(partCount() == 1); + return convertIEEEFloatToAPInt<semFloat8E3M4>(); +} + APInt IEEEFloat::convertFloatTF32APFloatToAPInt() const { assert(partCount() == 1); return convertIEEEFloatToAPInt<semFloatTF32>(); @@ -3704,6 +3715,9 @@ APInt IEEEFloat::bitcastToAPInt() const { if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3B11FNUZ) return convertFloat8E4M3B11FNUZAPFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloat8E3M4) + return convertFloat8E3M4APFloatToAPInt(); + if (semantics == (const llvm::fltSemantics *)&semFloatTF32) return convertFloatTF32APFloatToAPInt(); @@ -3932,6 +3946,10 @@ void IEEEFloat::initFromFloat8E4M3B11FNUZAPInt(const APInt &api) { initFromIEEEAPInt<semFloat8E4M3B11FNUZ>(api); } +void IEEEFloat::initFromFloat8E3M4APInt(const APInt &api) { + initFromIEEEAPInt<semFloat8E3M4>(api); +} + void IEEEFloat::initFromFloatTF32APInt(const APInt &api) { initFromIEEEAPInt<semFloatTF32>(api); } @@ -3977,6 +3995,8 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) { return initFromFloat8E4M3FNUZAPInt(api); if (Sem == &semFloat8E4M3B11FNUZ) return initFromFloat8E4M3B11FNUZAPInt(api); + if (Sem == &semFloat8E3M4) + return initFromFloat8E3M4APInt(api); if (Sem == &semFloatTF32) return initFromFloatTF32APInt(api); if (Sem == &semFloat6E3M2FN) |