1 files changed, 41 insertions, 11 deletions
diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h
index b9490b5..3d16a3c 100644
--- a/libc/src/__support/FPUtil/double_double.h
+++ b/libc/src/__support/FPUtil/double_double.h
@@ -21,12 +21,22 @@ using DoubleDouble = LIBC_NAMESPACE::NumberPair<double>;
 // The output of Dekker's FastTwoSum algorithm is correct, i.e.:
 //   r.hi + r.lo = a + b exactly
 //   and |r.lo| < eps(r.lo)
-// if ssumption: |a| >= |b|, or a = 0.
+// Assumption: |a| >= |b|, or a = 0.
+template <bool FAST2SUM = true>
 LIBC_INLINE constexpr DoubleDouble exact_add(double a, double b) {
   DoubleDouble r{0.0, 0.0};
-  r.hi = a + b;
-  double t = r.hi - a;
-  r.lo = b - t;
+  if constexpr (FAST2SUM) {
+    r.hi = a + b;
+    double t = r.hi - a;
+    r.lo = b - t;
+  } else {
+    r.hi = a + b;
+    double t1 = r.hi - a;
+    double t2 = r.hi - t1;
+    double t3 = b - t1;
+    double t4 = a - t2;
+    r.lo = t3 + t4;
+  }
   return r;
 }
 
@@ -40,15 +50,20 @@ LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a,
 
 // Assumption: |a.hi| >= |b|
 LIBC_INLINE constexpr DoubleDouble add(const DoubleDouble &a, double b) {
-  DoubleDouble r = exact_add(a.hi, b);
+  DoubleDouble r = exact_add<false>(a.hi, b);
   return exact_add(r.hi, r.lo + a.lo);
 }
 
-// Velkamp's Splitting for double precision.
-LIBC_INLINE constexpr DoubleDouble split(double a) {
+// Veltkamp's Splitting for double precision.
+// Note: This is proved to be correct for all rounding modes:
+//   Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
+//   Roundings," https://inria.hal.science/hal-04480440.
+// Default splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1.
+template <size_t N = 27> LIBC_INLINE constexpr DoubleDouble split(double a) {
   DoubleDouble r{0.0, 0.0};
-  // Splitting constant = 2^ceil(prec(double)/2) + 1 = 2^27 + 1.
-  constexpr double C = 0x1.0p27 + 1.0;
+  // CN = 2^N.
+  constexpr double CN = static_cast<double>(1 << N);
+  constexpr double C = CN + 1.0;
   double t1 = C * a;
   double t2 = a - t1;
   r.hi = t1 + t2;
@@ -56,6 +71,14 @@ LIBC_INLINE constexpr DoubleDouble split(double a) {
   return r;
 }
 
+// Note: When FMA instruction is not available, the `exact_mult` function is
+// only correct for round-to-nearest mode.  See:
+//   Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
+//   Roundings," https://inria.hal.science/hal-04480440.
+// Using Theorem 1 in the paper above, without FMA instruction, if we restrict
+// the generated constants to precision <= 51, and splitting it by 2^28 + 1,
+// then a * b = r.hi + r.lo is exact for all rounding modes.
+template <bool NO_FMA_ALL_ROUNDINGS = false>
 LIBC_INLINE DoubleDouble exact_mult(double a, double b) {
   DoubleDouble r{0.0, 0.0};
 
@@ -65,7 +88,13 @@ LIBC_INLINE DoubleDouble exact_mult(double a, double b) {
 #else
   // Dekker's Product.
   DoubleDouble as = split(a);
-  DoubleDouble bs = split(b);
+  DoubleDouble bs;
+
+  if constexpr (NO_FMA_ALL_ROUNDINGS)
+    bs = split<28>(b);
+  else
+    bs = split(b);
+
   r.hi = a * b;
   double t1 = as.hi * bs.hi - r.hi;
   double t2 = as.hi * bs.lo + t1;
@@ -82,9 +111,10 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) {
   return r;
 }
 
+template <bool NO_FMA_ALL_ROUNDINGS = false>
 LIBC_INLINE DoubleDouble quick_mult(const DoubleDouble &a,
                                     const DoubleDouble &b) {
-  DoubleDouble r = exact_mult(a.hi, b.hi);
+  DoubleDouble r = exact_mult<NO_FMA_ALL_ROUNDINGS>(a.hi, b.hi);
   double t1 = multiply_add(a.hi, b.lo, r.lo);
   double t2 = multiply_add(a.lo, b.hi, t1);
   r.lo = t2;