diff options
author | Alexander Pivovarov <pivovaa@amazon.com> | 2024-07-30 00:11:10 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-07-30 00:11:10 -0700 |
commit | abc2fe31fc622c4eab3766d739576110eb6f16c3 (patch) | |
tree | d101961650708fab7e30ccfbb34c4219b551abc9 /llvm/unittests/ADT/APFloatTest.cpp | |
parent | fcd6bd5587cc376cd8f43b60d1c7d61fdfe0f535 (diff) | |
download | llvm-abc2fe31fc622c4eab3766d739576110eb6f16c3.zip llvm-abc2fe31fc622c4eab3766d739576110eb6f16c3.tar.gz llvm-abc2fe31fc622c4eab3766d739576110eb6f16c3.tar.bz2 |
[APFloat] Add support for f8E3M4 IEEE 754 type (#99698)
This PR adds the `f8E3M4` type to APFloat.
The `f8E3M4` type follows the IEEE 754 conventions:
```c
f8E3M4 (IEEE 754)
- Exponent bias: 3
- Maximum stored exponent value: 6 (binary 110)
- Maximum unbiased exponent value: 6 - 3 = 3
- Minimum stored exponent value: 1 (binary 001)
- Minimum unbiased exponent value: 1 - 3 = -2
- Precision (the total number of bits used for the significand (mantissa),
including the implicit leading integer bit): 4 + 1 = 5
- Follows IEEE 754 conventions for representation of special values
- Has Positive and Negative zero
- Has Positive and Negative infinity
- Has NaNs
Additional details:
- Max exp (unbiased): 3
- Min exp (unbiased): -2
- Infinities (+/-): S.111.0000
- Zeros (+/-): S.000.0000
- NaNs: S.111.{0,1}⁴ except S.111.0000
- Max normal number: S.110.1111 = +/-2^(6-3) x (1 + 15/16) = +/-2^3 x 31 x 2^(-4) = +/-15.5
- Min normal number: S.001.0000 = +/-2^(1-3) x (1 + 0) = +/-2^(-2)
- Max subnormal number: S.000.1111 = +/-2^(-2) x 15/16 = +/-2^(-2) x 15 x 2^(-4) = +/-15 x 2^(-6)
- Min subnormal number: S.000.0001 = +/-2^(-2) x 1/16 = +/-2^(-2) x 2^(-4) = +/-2^(-6)
```
Related PRs:
- [PR-97179](https://github.com/llvm/llvm-project/pull/97179) [APFloat]
Add support for f8E4M3 IEEE 754 type
Diffstat (limited to 'llvm/unittests/ADT/APFloatTest.cpp')
-rw-r--r-- | llvm/unittests/ADT/APFloatTest.cpp | 81 |
1 files changed, 81 insertions, 0 deletions
diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index d50bdf4..be675bb 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -2141,6 +2141,8 @@ TEST(APFloatTest, getZero) { {&APFloat::Float8E4M3FNUZ(), true, false, {0, 0}, 1}, {&APFloat::Float8E4M3B11FNUZ(), false, false, {0, 0}, 1}, {&APFloat::Float8E4M3B11FNUZ(), true, false, {0, 0}, 1}, + {&APFloat::Float8E3M4(), false, true, {0, 0}, 1}, + {&APFloat::Float8E3M4(), true, true, {0x80ULL, 0}, 1}, {&APFloat::FloatTF32(), false, true, {0, 0}, 1}, {&APFloat::FloatTF32(), true, true, {0x40000ULL, 0}, 1}, {&APFloat::Float6E3M2FN(), false, true, {0, 0}, 1}, @@ -6636,6 +6638,45 @@ TEST(APFloatTest, Float8E4M3FNUZToDouble) { EXPECT_TRUE(std::isnan(QNaN.convertToDouble())); } +TEST(APFloatTest, Float8E3M4ToDouble) { + APFloat PosZero = APFloat::getZero(APFloat::Float8E3M4(), false); + APFloat PosZeroToDouble(PosZero.convertToDouble()); + EXPECT_TRUE(PosZeroToDouble.isPosZero()); + APFloat NegZero = APFloat::getZero(APFloat::Float8E3M4(), true); + APFloat NegZeroToDouble(NegZero.convertToDouble()); + EXPECT_TRUE(NegZeroToDouble.isNegZero()); + + APFloat One(APFloat::Float8E3M4(), "1.0"); + EXPECT_EQ(1.0, One.convertToDouble()); + APFloat Two(APFloat::Float8E3M4(), "2.0"); + EXPECT_EQ(2.0, Two.convertToDouble()); + APFloat PosLargest = APFloat::getLargest(APFloat::Float8E3M4(), false); + EXPECT_EQ(15.5F, PosLargest.convertToDouble()); + APFloat NegLargest = APFloat::getLargest(APFloat::Float8E3M4(), true); + EXPECT_EQ(-15.5F, NegLargest.convertToDouble()); + APFloat PosSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E3M4(), false); + EXPECT_EQ(0x1.p-2, PosSmallest.convertToDouble()); + APFloat NegSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E3M4(), true); + EXPECT_EQ(-0x1.p-2, NegSmallest.convertToDouble()); + + APFloat PosSmallestDenorm = + APFloat::getSmallest(APFloat::Float8E3M4(), false); + 
EXPECT_TRUE(PosSmallestDenorm.isDenormal()); + EXPECT_EQ(0x1.p-6, PosSmallestDenorm.convertToDouble()); + APFloat NegSmallestDenorm = APFloat::getSmallest(APFloat::Float8E3M4(), true); + EXPECT_TRUE(NegSmallestDenorm.isDenormal()); + EXPECT_EQ(-0x1.p-6, NegSmallestDenorm.convertToDouble()); + + APFloat PosInf = APFloat::getInf(APFloat::Float8E3M4()); + EXPECT_EQ(std::numeric_limits<double>::infinity(), PosInf.convertToDouble()); + APFloat NegInf = APFloat::getInf(APFloat::Float8E3M4(), true); + EXPECT_EQ(-std::numeric_limits<double>::infinity(), NegInf.convertToDouble()); + APFloat QNaN = APFloat::getQNaN(APFloat::Float8E3M4()); + EXPECT_TRUE(std::isnan(QNaN.convertToDouble())); +} + TEST(APFloatTest, FloatTF32ToDouble) { APFloat One(APFloat::FloatTF32(), "1.0"); EXPECT_EQ(1.0, One.convertToDouble()); @@ -6944,6 +6985,46 @@ TEST(APFloatTest, Float8E4M3FNToFloat) { EXPECT_TRUE(std::isnan(QNaN.convertToFloat())); } +TEST(APFloatTest, Float8E3M4ToFloat) { + APFloat PosZero = APFloat::getZero(APFloat::Float8E3M4(), false); + APFloat PosZeroToFloat(PosZero.convertToFloat()); + EXPECT_TRUE(PosZeroToFloat.isPosZero()); + APFloat NegZero = APFloat::getZero(APFloat::Float8E3M4(), true); + APFloat NegZeroToFloat(NegZero.convertToFloat()); + EXPECT_TRUE(NegZeroToFloat.isNegZero()); + + APFloat One(APFloat::Float8E3M4(), "1.0"); + EXPECT_EQ(1.0F, One.convertToFloat()); + APFloat Two(APFloat::Float8E3M4(), "2.0"); + EXPECT_EQ(2.0F, Two.convertToFloat()); + + APFloat PosLargest = APFloat::getLargest(APFloat::Float8E3M4(), false); + EXPECT_EQ(15.5F, PosLargest.convertToFloat()); + APFloat NegLargest = APFloat::getLargest(APFloat::Float8E3M4(), true); + EXPECT_EQ(-15.5F, NegLargest.convertToFloat()); + APFloat PosSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E3M4(), false); + EXPECT_EQ(0x1.p-2, PosSmallest.convertToFloat()); + APFloat NegSmallest = + APFloat::getSmallestNormalized(APFloat::Float8E3M4(), true); + EXPECT_EQ(-0x1.p-2, NegSmallest.convertToFloat()); + + 
APFloat PosSmallestDenorm = + APFloat::getSmallest(APFloat::Float8E3M4(), false); + EXPECT_TRUE(PosSmallestDenorm.isDenormal()); + EXPECT_EQ(0x1.p-6, PosSmallestDenorm.convertToFloat()); + APFloat NegSmallestDenorm = APFloat::getSmallest(APFloat::Float8E3M4(), true); + EXPECT_TRUE(NegSmallestDenorm.isDenormal()); + EXPECT_EQ(-0x1.p-6, NegSmallestDenorm.convertToFloat()); + + APFloat PosInf = APFloat::getInf(APFloat::Float8E3M4()); + EXPECT_EQ(std::numeric_limits<float>::infinity(), PosInf.convertToFloat()); + APFloat NegInf = APFloat::getInf(APFloat::Float8E3M4(), true); + EXPECT_EQ(-std::numeric_limits<float>::infinity(), NegInf.convertToFloat()); + APFloat QNaN = APFloat::getQNaN(APFloat::Float8E3M4()); + EXPECT_TRUE(std::isnan(QNaN.convertToFloat())); +} + TEST(APFloatTest, FloatTF32ToFloat) { APFloat PosZero = APFloat::getZero(APFloat::FloatTF32()); APFloat PosZeroToFloat(PosZero.convertToFloat()); |