sysdeps/x86/fpu/powl_helper.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237

/* Implement powl for x86 using extra-precision log.
   Copyright (C) 2012-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <math.h>
#include <math_private.h>
#include <math-underflow.h>
#include <stdbool.h>

/* High parts and low parts of -log (k/16), for integer k from 12 to
   24.  */

static const long double powl_log_table[] =
  {
    0x4.9a58844d36e49e1p-4L, -0x1.0522624fd558f574p-68L,
    0x3.527da7915b3c6de4p-4L, 0x1.7d4ef4b901b99b9ep-68L,
    0x2.22f1d044fc8f7bc8p-4L, -0x1.8e97c071a42fc388p-68L,
    0x1.08598b59e3a0688ap-4L, 0x3.fd9bf503372c12fcp-72L,
    -0x0p+0L, 0x0p+0L,
    -0xf.85186008b15330cp-8L, 0x1.9b47488a6687672cp-72L,
    -0x1.e27076e2af2e5e9ep-4L, -0xa.87ffe1fe9e155dcp-72L,
    -0x2.bfe60e14f27a791p-4L, 0x1.83bebf1bdb88a032p-68L,
    -0x3.91fef8f353443584p-4L, -0xb.b03de5ff734495cp-72L,
    -0x4.59d72aeae98380e8p-4L, 0xc.e0aa3be4747dc1p-72L,
    -0x5.1862f08717b09f4p-4L, -0x2.decdeccf1cd10578p-68L,
    -0x5.ce75fdaef401a738p-4L, -0x9.314feb4fbde5aaep-72L,
    -0x6.7cc8fb2fe612fcbp-4L, 0x2.5ca2642feb779f98p-68L,
  };

/* High 32 bits of log2 (e), and remainder rounded to 64 bits.  */
static const long double log2e_hi = 0x1.71547652p+0L;
static const long double log2e_lo = 0xb.82fe1777d0ffda1p-36L;

/* Given a number with high part HI and low part LO, add the number X
   to it and store the result in *RHI and *RLO.  It is given that
   either |X| < |0.7 * HI|, or HI == LO == 0, and that the values are
   small enough that no overflow occurs.  The result does not need to
   be exact to 128 bits; 78-bit accuracy of the final accumulated
   result suffices.  */

static inline void
acc_split (long double *rhi, long double *rlo, long double hi, long double lo,
	   long double x)
{
  long double thi = hi + x;
  long double tlo = (hi - thi) + x + lo;
  *rhi = thi + tlo;
  *rlo = (thi - *rhi) + tlo;
}

extern long double __powl_helper (long double x, long double y);
libm_hidden_proto (__powl_helper)

/* Given X a value that is finite and nonzero, or a NaN, and Y a
   finite nonzero value with 0x1p-79 <= |Y| <= 0x1p78, compute X to
   the power Y.  */

long double
__powl_helper (long double x, long double y)
{
  if (isnan (x))
    return __ieee754_expl (y * __ieee754_logl (x));
  bool negate;
  if (x < 0)
    {
      long double absy = fabsl (y);
      if (absy >= 0x1p64L)
	negate = false;
      else
	{
	  unsigned long long yll = absy;
	  if (yll != absy)
	    return __ieee754_expl (y * __ieee754_logl (x));
	  negate = (yll & 1) != 0;
	}
      x = fabsl (x);
    }
  else
    negate = false;

  /* We need to compute Y * log2 (X) to at least 64 bits after the
     point for normal results (that is, to at least 78 bits
     precision).  */
  int x_int_exponent;
  long double x_frac;
  x_frac = __frexpl (x, &x_int_exponent);
  if (x_frac <= 0x0.aaaaaaaaaaaaaaaap0L) /* 2.0L / 3.0L, rounded down */
    {
      x_frac *= 2.0;
      x_int_exponent--;
    }

  long double log_x_frac_hi, log_x_frac_lo;
  /* Determine an initial approximation to log (X_FRAC) using
     POWL_LOG_TABLE, and multiply by a value K/16 to reduce to an
     interval (24/25, 26/25).  */
  int k = (int) ((16.0L / x_frac) + 0.5L);
  log_x_frac_hi = powl_log_table[2 * k - 24];
  log_x_frac_lo = powl_log_table[2 * k - 23];
  long double x_frac_low;
  if (k == 16)
    x_frac_low = 0.0L;
  else
    {
      /* Mask off low 5 bits of X_FRAC so the multiplication by K/16
	 is exact.  These bits are small enough that they can be
	 corrected for by adding log2 (e) * X_FRAC_LOW to the final
	 result.  */
      int32_t se;
      uint32_t i0, i1;
      GET_LDOUBLE_WORDS (se, i0, i1, x_frac);
      x_frac_low = x_frac;
      i1 &= 0xffffffe0;
      SET_LDOUBLE_WORDS (x_frac, se, i0, i1);
      x_frac_low -= x_frac;
      x_frac_low /= x_frac;
      x_frac *= k / 16.0L;
    }

  /* Now compute log (X_FRAC) for X_FRAC in (24/25, 26/25).  Separate
     W = X_FRAC - 1 into high 16 bits and remaining bits, so that
     multiplications for low-order power series terms are exact.  The
     remaining bits are small enough that adding a 64-bit value of
     log2 (1 + W_LO / (1 + W_HI)) will be a sufficient correction for
     them.  */
  long double w = x_frac - 1;
  long double w_hi, w_lo;
  int32_t se;
  uint32_t i0, i1;
  GET_LDOUBLE_WORDS (se, i0, i1, w);
  i0 &= 0xffff0000;
  i1 = 0;
  SET_LDOUBLE_WORDS (w_hi, se, i0, i1);
  w_lo = w - w_hi;
  long double wp = w_hi;
  acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, wp);
  wp *= -w_hi;
  acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
	     wp / 2.0L);
  wp *= -w_hi;
  acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
	     wp * 0x0.5555p0L); /* -W_HI**3 / 3, high part.  */
  acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
	     wp * 0x0.5555555555555555p-16L); /* -W_HI**3 / 3, low part.  */
  wp *= -w_hi;
  acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
	     wp / 4.0L);
  /* Subsequent terms are small enough that they only need be computed
     to 64 bits.  */
  for (int i = 5; i <= 17; i++)
    {
      wp *= -w_hi;
      acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
		 wp / i);
    }

  /* Convert LOG_X_FRAC_HI + LOG_X_FRAC_LO to a base-2 logarithm.  */
  long double log2_x_frac_hi, log2_x_frac_lo;
  long double log_x_frac_hi32, log_x_frac_lo64;
  GET_LDOUBLE_WORDS (se, i0, i1, log_x_frac_hi);
  i1 = 0;
  SET_LDOUBLE_WORDS (log_x_frac_hi32, se, i0, i1);
  log_x_frac_lo64 = (log_x_frac_hi - log_x_frac_hi32) + log_x_frac_lo;
  long double log2_x_frac_hi1 = log_x_frac_hi32 * log2e_hi;
  long double log2_x_frac_lo1
    = log_x_frac_lo64 * log2e_hi + log_x_frac_hi * log2e_lo;
  log2_x_frac_hi = log2_x_frac_hi1 + log2_x_frac_lo1;
  log2_x_frac_lo = (log2_x_frac_hi1 - log2_x_frac_hi) + log2_x_frac_lo1;

  /* Correct for the masking off of W_LO.  */
  long double log2_1p_w_lo;
  asm ("fyl2xp1"
       : "=t" (log2_1p_w_lo)
       : "0" (w_lo / (1.0L + w_hi)), "u" (1.0L)
       : "st(1)");
  acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo,
	     log2_1p_w_lo);

  /* Correct for the masking off of X_FRAC_LOW.  */
  acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo,
	     x_frac_low * M_LOG2El);

  /* Add the integer and fractional parts of the base-2 logarithm.  */
  long double log2_x_hi, log2_x_lo;
  log2_x_hi = x_int_exponent + log2_x_frac_hi;
  log2_x_lo = ((x_int_exponent - log2_x_hi) + log2_x_frac_hi) + log2_x_frac_lo;

  /* Compute the base-2 logarithm of the result.  */
  long double log2_res_hi, log2_res_lo;
  long double log2_x_hi32, log2_x_lo64;
  GET_LDOUBLE_WORDS (se, i0, i1, log2_x_hi);
  i1 = 0;
  SET_LDOUBLE_WORDS (log2_x_hi32, se, i0, i1);
  log2_x_lo64 = (log2_x_hi - log2_x_hi32) + log2_x_lo;
  long double y_hi32, y_lo32;
  GET_LDOUBLE_WORDS (se, i0, i1, y);
  i1 = 0;
  SET_LDOUBLE_WORDS (y_hi32, se, i0, i1);
  y_lo32 = y - y_hi32;
  log2_res_hi = log2_x_hi32 * y_hi32;
  log2_res_lo = log2_x_hi32 * y_lo32 + log2_x_lo64 * y;

  /* Split the base-2 logarithm of the result into integer and
     fractional parts.  */
  long double log2_res_int = roundl (log2_res_hi);
  long double log2_res_frac = log2_res_hi - log2_res_int + log2_res_lo;
  /* If the integer part is very large, the computed fractional part
     may be outside the valid range for f2xm1.  */
  if (fabsl (log2_res_int) > 16500)
    log2_res_frac = 0;

  /* Compute the final result.  */
  long double res;
  asm ("f2xm1" : "=t" (res) : "0" (log2_res_frac));
  res += 1.0L;
  if (negate)
    res = -res;
  asm ("fscale" : "=t" (res) : "0" (res), "u" (log2_res_int));
  math_check_force_underflow (res);
  return res;
}

libm_hidden_def (__powl_helper)