From d6752ccd696c71d23cd3df8fb9cc60b61c32e65a Mon Sep 17 00:00:00 2001 From: Siddhesh Poyarekar Date: Thu, 14 Feb 2013 10:31:09 +0530 Subject: New __sqr function as a faster special case of __mul --- sysdeps/ieee754/dbl-64/mpa.c | 91 ++++++++++++++++++++++++++ sysdeps/ieee754/dbl-64/mpa.h | 1 + sysdeps/ieee754/dbl-64/mpexp.c | 4 +- sysdeps/powerpc/powerpc32/power4/fpu/mpa.c | 100 +++++++++++++++++++++++++++++ sysdeps/powerpc/powerpc64/power4/fpu/mpa.c | 100 +++++++++++++++++++++++++++++ sysdeps/x86_64/fpu/multiarch/mpa-avx.c | 1 + sysdeps/x86_64/fpu/multiarch/mpa-fma4.c | 1 + 7 files changed, 296 insertions(+), 2 deletions(-) (limited to 'sysdeps') diff --git a/sysdeps/ieee754/dbl-64/mpa.c b/sysdeps/ieee754/dbl-64/mpa.c index 5b50b0d..542d81b 100644 --- a/sysdeps/ieee754/dbl-64/mpa.c +++ b/sysdeps/ieee754/dbl-64/mpa.c @@ -702,6 +702,97 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p) Z[0] = X[0] * Y[0]; } +/* Square *X and store result in *Y. X and Y may not overlap. For P in + [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the + error is bounded by 1.001 ULP. This is a faster special case of + multiplication. */ +void +SECTION +__sqr (const mp_no *x, mp_no *y, int p) +{ + long i, j, k, ip; + double u, yk; + + /* Is z=0? */ + if (__glibc_unlikely (X[0] == ZERO)) + { + Y[0] = ZERO; + return; + } + + /* We need not iterate through all X's since it's pointless to + multiply zeroes. */ + for (ip = p; ip > 0; ip--) + if (X[ip] != ZERO) + break; + + k = (__glibc_unlikely (p < 3)) ? p + p : p + 3; + + while (k > 2 * ip + 1) + Y[k--] = ZERO; + + yk = ZERO; + + while (k > p) + { + double yk2 = 0.0; + long lim = k / 2; + + if (k % 2 == 0) + { + yk += X[lim] * X[lim]; + lim--; + } + + for (i = k - p, j = p; i <= lim; i++, j--) + yk2 += X[i] * X[j]; + + yk += 2.0 * yk2; + + u = (yk + CUTTER) - CUTTER; + if (u > yk) + u -= RADIX; + Y[k--] = yk - u; + yk = u * RADIXI; + } + + while (k > 1) + { + double yk2 = 0.0; + long lim = k / 2; + + if (k % 2 == 0) + { + yk += X[lim] * X[lim]; + lim--; + } + + for (i = 1, j = k - 1; i <= lim; i++, j--) + yk2 += X[i] * X[j]; + + yk += 2.0 * yk2; + + u = (yk + CUTTER) - CUTTER; + if (u > yk) + u -= RADIX; + Y[k--] = yk - u; + yk = u * RADIXI; + } + Y[k] = yk; + + /* Squares are always positive. */ + Y[0] = 1.0; + + EY = 2 * EX; + /* Is there a carry beyond the most significant digit? */ + if (__glibc_unlikely (Y[1] == ZERO)) + { + for (i = 1; i <= p; i++) + Y[i] = Y[i + 1]; + EY--; + } +} + /* Invert *X and store in *Y. Relative error bound: - For P = 2: 1.001 * R ^ (1 - P) - For P = 3: 1.063 * R ^ (1 - P) diff --git a/sysdeps/ieee754/dbl-64/mpa.h b/sysdeps/ieee754/dbl-64/mpa.h index 06343d4..168b334 100644 --- a/sysdeps/ieee754/dbl-64/mpa.h +++ b/sysdeps/ieee754/dbl-64/mpa.h @@ -115,6 +115,7 @@ void __dbl_mp (double, mp_no *, int); void __add (const mp_no *, const mp_no *, mp_no *, int); void __sub (const mp_no *, const mp_no *, mp_no *, int); void __mul (const mp_no *, const mp_no *, mp_no *, int); +void __sqr (const mp_no *, mp_no *, int); void __dvd (const mp_no *, const mp_no *, mp_no *, int); extern void __mpatan (mp_no *, mp_no *, int); diff --git a/sysdeps/ieee754/dbl-64/mpexp.c b/sysdeps/ieee754/dbl-64/mpexp.c index 5b3ff04..565c6c8 100644 --- a/sysdeps/ieee754/dbl-64/mpexp.c +++ b/sysdeps/ieee754/dbl-64/mpexp.c @@ -145,14 +145,14 @@ __mpexp (mp_no *x, mp_no *y, int p) /* Raise polynomial value to the power of 2**m. Put result in y. */ for (k = 0, j = 0; k < m;) { - __mul (&mpt2, &mpt2, &mpt1, p); + __sqr (&mpt2, &mpt1, p); k++; if (k == m) { j = 1; break; } - __mul (&mpt1, &mpt1, &mpt2, p); + __sqr (&mpt1, &mpt2, p); k++; } if (j) diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c b/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c index b1784f2..7ebf50b 100644 --- a/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c +++ b/sysdeps/powerpc/powerpc32/power4/fpu/mpa.c @@ -687,6 +687,106 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p) return; } +/* Square *X and store result in *Y. X and Y may not overlap. For P in + [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the + error is bounded by 1.001 ULP. This is a faster special case of + multiplication. */ +void +__sqr (const mp_no *x, mp_no *y, int p) +{ + long i, j, k, ip; + double u, yk; + + /* Is z=0? */ + if (__glibc_unlikely (X[0] == ZERO)) + { + Y[0] = ZERO; + return; + } + + /* We need not iterate through all X's since it's pointless to + multiply zeroes. */ + for (ip = p; ip > 0; ip--) + if (X[ip] != ZERO) + break; + + k = (__glibc_unlikely (p < 3)) ? p + p : p + 3; + + while (k > 2 * ip + 1) + Y[k--] = ZERO; + + yk = ZERO; + + while (k > p) + { + double yk2 = 0.0; + long lim = k / 2; + + if (k % 2 == 0) + { + yk += X[lim] * X[lim]; + lim--; + } + + /* In __mul, this loop (and the one within the next while loop) run + between a range to calculate the mantissa as follows: + + Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1] + + X[n] * Y[k] + + For X == Y, we can get away with summing halfway and doubling the + result. For cases where the range size is even, the mid-point needs + to be added separately (above). */ + for (i = k - p, j = p; i <= lim; i++, j--) + yk2 += X[i] * X[j]; + + yk += 2.0 * yk2; + + u = (yk + CUTTER) - CUTTER; + if (u > yk) + u -= RADIX; + Y[k--] = yk - u; + yk = u * RADIXI; + } + + while (k > 1) + { + double yk2 = 0.0; + long lim = k / 2; + + if (k % 2 == 0) + { + yk += X[lim] * X[lim]; + lim--; + } + + /* Likewise for this loop. */ + for (i = 1, j = k - 1; i <= lim; i++, j--) + yk2 += X[i] * X[j]; + + yk += 2.0 * yk2; + + u = (yk + CUTTER) - CUTTER; + if (u > yk) + u -= RADIX; + Y[k--] = yk - u; + yk = u * RADIXI; + } + Y[k] = yk; + + /* Squares are always positive. */ + Y[0] = 1.0; + + EY = 2 * EX; + /* Is there a carry beyond the most significant digit? */ + if (__glibc_unlikely (Y[1] == ZERO)) + { + for (i = 1; i <= p; i++) + Y[i] = Y[i + 1]; + EY--; + } +} + /* Invert *X and store in *Y. Relative error bound: - For P = 2: 1.001 * R ^ (1 - P) - For P = 3: 1.063 * R ^ (1 - P) diff --git a/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c b/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c index b1784f2..7ebf50b 100644 --- a/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c +++ b/sysdeps/powerpc/powerpc64/power4/fpu/mpa.c @@ -687,6 +687,106 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p) return; } +/* Square *X and store result in *Y. X and Y may not overlap. For P in + [1, 2, 3], the exact result is truncated to P digits. In case P > 3 the + error is bounded by 1.001 ULP. This is a faster special case of + multiplication. */ +void +__sqr (const mp_no *x, mp_no *y, int p) +{ + long i, j, k, ip; + double u, yk; + + /* Is z=0? */ + if (__glibc_unlikely (X[0] == ZERO)) + { + Y[0] = ZERO; + return; + } + + /* We need not iterate through all X's since it's pointless to + multiply zeroes. */ + for (ip = p; ip > 0; ip--) + if (X[ip] != ZERO) + break; + + k = (__glibc_unlikely (p < 3)) ? p + p : p + 3; + + while (k > 2 * ip + 1) + Y[k--] = ZERO; + + yk = ZERO; + + while (k > p) + { + double yk2 = 0.0; + long lim = k / 2; + + if (k % 2 == 0) + { + yk += X[lim] * X[lim]; + lim--; + } + + /* In __mul, this loop (and the one within the next while loop) run + between a range to calculate the mantissa as follows: + + Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1] + + X[n] * Y[k] + + For X == Y, we can get away with summing halfway and doubling the + result. For cases where the range size is even, the mid-point needs + to be added separately (above). */ + for (i = k - p, j = p; i <= lim; i++, j--) + yk2 += X[i] * X[j]; + + yk += 2.0 * yk2; + + u = (yk + CUTTER) - CUTTER; + if (u > yk) + u -= RADIX; + Y[k--] = yk - u; + yk = u * RADIXI; + } + + while (k > 1) + { + double yk2 = 0.0; + long lim = k / 2; + + if (k % 2 == 0) + { + yk += X[lim] * X[lim]; + lim--; + } + + /* Likewise for this loop. */ + for (i = 1, j = k - 1; i <= lim; i++, j--) + yk2 += X[i] * X[j]; + + yk += 2.0 * yk2; + + u = (yk + CUTTER) - CUTTER; + if (u > yk) + u -= RADIX; + Y[k--] = yk - u; + yk = u * RADIXI; + } + Y[k] = yk; + + /* Squares are always positive. */ + Y[0] = 1.0; + + EY = 2 * EX; + /* Is there a carry beyond the most significant digit? */ + if (__glibc_unlikely (Y[1] == ZERO)) + { + for (i = 1; i <= p; i++) + Y[i] = Y[i + 1]; + EY--; + } +} + /* Invert *X and store in *Y. Relative error bound: - For P = 2: 1.001 * R ^ (1 - P) - For P = 3: 1.063 * R ^ (1 - P) diff --git a/sysdeps/x86_64/fpu/multiarch/mpa-avx.c b/sysdeps/x86_64/fpu/multiarch/mpa-avx.c index d3f4d7a..366b0b7 100644 --- a/sysdeps/x86_64/fpu/multiarch/mpa-avx.c +++ b/sysdeps/x86_64/fpu/multiarch/mpa-avx.c @@ -1,5 +1,6 @@ #define __add __add_avx #define __mul __mul_avx +#define __sqr __sqr_avx #define __sub __sub_avx #define __dbl_mp __dbl_mp_avx #define __dvd __dvd_avx diff --git a/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c index 6abb671..a4a7594 100644 --- a/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c +++ b/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c @@ -1,5 +1,6 @@ #define __add __add_fma4 #define __mul __mul_fma4 +#define __sqr __sqr_fma4 #define __sub __sub_fma4 #define __dbl_mp __dbl_mp_fma4 #define __dvd __dvd_fma4 -- cgit v1.1