aboutsummaryrefslogtreecommitdiff
path: root/llvm/unittests/ADT/APFloatTest.cpp
diff options
context:
space:
mode:
authorAlexander Pivovarov <pivovaa@amazon.com>2024-07-30 00:11:10 -0700
committerGitHub <noreply@github.com>2024-07-30 00:11:10 -0700
commitabc2fe31fc622c4eab3766d739576110eb6f16c3 (patch)
treed101961650708fab7e30ccfbb34c4219b551abc9 /llvm/unittests/ADT/APFloatTest.cpp
parentfcd6bd5587cc376cd8f43b60d1c7d61fdfe0f535 (diff)
downloadllvm-abc2fe31fc622c4eab3766d739576110eb6f16c3.zip
llvm-abc2fe31fc622c4eab3766d739576110eb6f16c3.tar.gz
llvm-abc2fe31fc622c4eab3766d739576110eb6f16c3.tar.bz2
[APFloat] Add support for f8E3M4 IEEE 754 type (#99698)
This PR adds `f8E3M4` type to APFloat. `f8E3M4` type follows IEEE 754 convention ```c f8E3M4 (IEEE 754) - Exponent bias: 3 - Maximum stored exponent value: 6 (binary 110) - Maximum unbiased exponent value: 6 - 3 = 3 - Minimum stored exponent value: 1 (binary 001) - Minimum unbiased exponent value: 1 − 3 = −2 - Precision specifies the total number of bits used for the significand (mantissa), including implicit leading integer bit = 4 + 1 = 5 - Follows IEEE 754 conventions for representation of special values - Has Positive and Negative zero - Has Positive and Negative infinity - Has NaNs Additional details: - Max exp (unbiased): 3 - Min exp (unbiased): -2 - Infinities (+/-): S.111.0000 - Zeros (+/-): S.000.0000 - NaNs: S.111.{0,1}⁴ except S.111.0000 - Max normal number: S.110.1111 = +/-2^(6-3) x (1 + 15/16) = +/-2^3 x 31 x 2^(-4) = +/-15.5 - Min normal number: S.001.0000 = +/-2^(1-3) x (1 + 0) = +/-2^(-2) - Max subnormal number: S.000.1111 = +/-2^(-2) x 15/16 = +/-2^(-2) x 15 x 2^(-4) = +/-15 x 2^(-6) - Min subnormal number: S.000.0001 = +/-2^(-2) x 1/16 = +/-2^(-2) x 2^(-4) = +/-2^(-6) ``` Related PRs: - [PR-97179](https://github.com/llvm/llvm-project/pull/97179) [APFloat] Add support for f8E4M3 IEEE 754 type
Diffstat (limited to 'llvm/unittests/ADT/APFloatTest.cpp')
-rw-r--r--llvm/unittests/ADT/APFloatTest.cpp81
1 files changed, 81 insertions, 0 deletions
diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp
index d50bdf4..be675bb 100644
--- a/llvm/unittests/ADT/APFloatTest.cpp
+++ b/llvm/unittests/ADT/APFloatTest.cpp
@@ -2141,6 +2141,8 @@ TEST(APFloatTest, getZero) {
{&APFloat::Float8E4M3FNUZ(), true, false, {0, 0}, 1},
{&APFloat::Float8E4M3B11FNUZ(), false, false, {0, 0}, 1},
{&APFloat::Float8E4M3B11FNUZ(), true, false, {0, 0}, 1},
+ {&APFloat::Float8E3M4(), false, true, {0, 0}, 1},
+ {&APFloat::Float8E3M4(), true, true, {0x80ULL, 0}, 1},
{&APFloat::FloatTF32(), false, true, {0, 0}, 1},
{&APFloat::FloatTF32(), true, true, {0x40000ULL, 0}, 1},
{&APFloat::Float6E3M2FN(), false, true, {0, 0}, 1},
@@ -6636,6 +6638,45 @@ TEST(APFloatTest, Float8E4M3FNUZToDouble) {
EXPECT_TRUE(std::isnan(QNaN.convertToDouble()));
}
+TEST(APFloatTest, Float8E3M4ToDouble) {
+  // Verifies convertToDouble() on Float8E3M4 values: signed zeros, 1.0 and
+  // 2.0, the largest finite value, the smallest normalized value, the
+  // smallest denormal, infinities and a quiet NaN.
+
+  // Signed zeros are round-tripped through an APFloat built from the double
+  // so the sign can be checked with isPosZero()/isNegZero().
+  APFloat PosZero = APFloat::getZero(APFloat::Float8E3M4(), false);
+  APFloat PosZeroToDouble(PosZero.convertToDouble());
+  EXPECT_TRUE(PosZeroToDouble.isPosZero());
+  APFloat NegZero = APFloat::getZero(APFloat::Float8E3M4(), true);
+  APFloat NegZeroToDouble(NegZero.convertToDouble());
+  EXPECT_TRUE(NegZeroToDouble.isNegZero());
+
+  APFloat One(APFloat::Float8E3M4(), "1.0");
+  EXPECT_EQ(1.0, One.convertToDouble());
+  APFloat Two(APFloat::Float8E3M4(), "2.0");
+  EXPECT_EQ(2.0, Two.convertToDouble());
+
+  // Expected values are double literals so both EXPECT_EQ operands have the
+  // same type as the convertToDouble() result.
+  APFloat PosLargest = APFloat::getLargest(APFloat::Float8E3M4(), false);
+  EXPECT_EQ(15.5, PosLargest.convertToDouble());
+  APFloat NegLargest = APFloat::getLargest(APFloat::Float8E3M4(), true);
+  EXPECT_EQ(-15.5, NegLargest.convertToDouble());
+  APFloat PosSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E3M4(), false);
+  EXPECT_EQ(0x1.p-2, PosSmallest.convertToDouble());
+  APFloat NegSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E3M4(), true);
+  EXPECT_EQ(-0x1.p-2, NegSmallest.convertToDouble());
+
+  // getSmallest() is expected to yield a denormal for this format.
+  APFloat PosSmallestDenorm =
+      APFloat::getSmallest(APFloat::Float8E3M4(), false);
+  EXPECT_TRUE(PosSmallestDenorm.isDenormal());
+  EXPECT_EQ(0x1.p-6, PosSmallestDenorm.convertToDouble());
+  APFloat NegSmallestDenorm = APFloat::getSmallest(APFloat::Float8E3M4(), true);
+  EXPECT_TRUE(NegSmallestDenorm.isDenormal());
+  EXPECT_EQ(-0x1.p-6, NegSmallestDenorm.convertToDouble());
+
+  APFloat PosInf = APFloat::getInf(APFloat::Float8E3M4());
+  EXPECT_EQ(std::numeric_limits<double>::infinity(), PosInf.convertToDouble());
+  APFloat NegInf = APFloat::getInf(APFloat::Float8E3M4(), true);
+  EXPECT_EQ(-std::numeric_limits<double>::infinity(), NegInf.convertToDouble());
+  APFloat QNaN = APFloat::getQNaN(APFloat::Float8E3M4());
+  EXPECT_TRUE(std::isnan(QNaN.convertToDouble()));
+}
+
TEST(APFloatTest, FloatTF32ToDouble) {
APFloat One(APFloat::FloatTF32(), "1.0");
EXPECT_EQ(1.0, One.convertToDouble());
@@ -6944,6 +6985,46 @@ TEST(APFloatTest, Float8E4M3FNToFloat) {
EXPECT_TRUE(std::isnan(QNaN.convertToFloat()));
}
+TEST(APFloatTest, Float8E3M4ToFloat) {
+  // Verifies convertToFloat() on Float8E3M4 values: signed zeros, 1.0 and
+  // 2.0, the largest finite value, the smallest normalized value, the
+  // smallest denormal, infinities and a quiet NaN.
+
+  // Signed zeros are round-tripped through an APFloat built from the float
+  // so the sign can be checked with isPosZero()/isNegZero().
+  APFloat PosZero = APFloat::getZero(APFloat::Float8E3M4(), false);
+  APFloat PosZeroToFloat(PosZero.convertToFloat());
+  EXPECT_TRUE(PosZeroToFloat.isPosZero());
+  APFloat NegZero = APFloat::getZero(APFloat::Float8E3M4(), true);
+  APFloat NegZeroToFloat(NegZero.convertToFloat());
+  EXPECT_TRUE(NegZeroToFloat.isNegZero());
+
+  APFloat One(APFloat::Float8E3M4(), "1.0");
+  EXPECT_EQ(1.0F, One.convertToFloat());
+  APFloat Two(APFloat::Float8E3M4(), "2.0");
+  EXPECT_EQ(2.0F, Two.convertToFloat());
+
+  // Expected values carry the F suffix so both EXPECT_EQ operands have the
+  // same type as the convertToFloat() result.
+  APFloat PosLargest = APFloat::getLargest(APFloat::Float8E3M4(), false);
+  EXPECT_EQ(15.5F, PosLargest.convertToFloat());
+  APFloat NegLargest = APFloat::getLargest(APFloat::Float8E3M4(), true);
+  EXPECT_EQ(-15.5F, NegLargest.convertToFloat());
+  APFloat PosSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E3M4(), false);
+  EXPECT_EQ(0x1.p-2F, PosSmallest.convertToFloat());
+  APFloat NegSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float8E3M4(), true);
+  EXPECT_EQ(-0x1.p-2F, NegSmallest.convertToFloat());
+
+  // getSmallest() is expected to yield a denormal for this format.
+  APFloat PosSmallestDenorm =
+      APFloat::getSmallest(APFloat::Float8E3M4(), false);
+  EXPECT_TRUE(PosSmallestDenorm.isDenormal());
+  EXPECT_EQ(0x1.p-6F, PosSmallestDenorm.convertToFloat());
+  APFloat NegSmallestDenorm = APFloat::getSmallest(APFloat::Float8E3M4(), true);
+  EXPECT_TRUE(NegSmallestDenorm.isDenormal());
+  EXPECT_EQ(-0x1.p-6F, NegSmallestDenorm.convertToFloat());
+
+  APFloat PosInf = APFloat::getInf(APFloat::Float8E3M4());
+  EXPECT_EQ(std::numeric_limits<float>::infinity(), PosInf.convertToFloat());
+  APFloat NegInf = APFloat::getInf(APFloat::Float8E3M4(), true);
+  EXPECT_EQ(-std::numeric_limits<float>::infinity(), NegInf.convertToFloat());
+  APFloat QNaN = APFloat::getQNaN(APFloat::Float8E3M4());
+  EXPECT_TRUE(std::isnan(QNaN.convertToFloat()));
+}
+
TEST(APFloatTest, FloatTF32ToFloat) {
APFloat PosZero = APFloat::getZero(APFloat::FloatTF32());
APFloat PosZeroToFloat(PosZero.convertToFloat());