diff options
author | Ulrich Drepper <drepper@gmail.com> | 2011-10-24 20:19:17 -0400 |
---|---|---|
committer | Ulrich Drepper <drepper@gmail.com> | 2011-10-24 20:19:17 -0400 |
commit | af968f62f24c5c0ef4e7e5ab41acae946908c112 (patch) | |
tree | e1e0570eeb00c434cc751cbadfbeae150eeea11a | |
parent | 58985aa92f57ff46e96b32388ce65e7fdd8c8b9e (diff) | |
download | glibc-af968f62f24c5c0ef4e7e5ab41acae946908c112.zip glibc-af968f62f24c5c0ef4e7e5ab41acae946908c112.tar.gz glibc-af968f62f24c5c0ef4e7e5ab41acae946908c112.tar.bz2 |
Optimize accurate 64-bit routines for FMA4 on x86-64
48 files changed, 595 insertions, 177 deletions
@@ -1,5 +1,62 @@ 2011-10-24 Ulrich Drepper <drepper@gmail.com> + * config.make.in: Add have-mfma4 entry. + * configure.in: Substitute libc_cv_cc_fma4. + * math/Makefile (dbl-only-routines): Add sincostab. + * sysdeps/ieee754/dbl-64/dosincos.c: Don't include sincos.tbl. + Use __sincostab not sincos. + * sysdeps/ieee754/dbl-64/e_asin.c: Don't define aliases when function + name is a macro. + * sysdeps/ieee754/dbl-64/e_exp.c: Likewise. + * sysdeps/ieee754/dbl-64/e_log.c: Likewise. + * sysdeps/ieee754/dbl-64/e_pow.c: Likewise. + * sysdeps/ieee754/dbl-64/e_atan2.c: Likewise. Define singArctan2 + using __copysign. + * sysdeps/ieee754/dbl-64/mpa.c: Don't export __acr. Don't define + __cr and __cpymn. Define __cpy unless NO___CPY is defined. Define + norm, denorm, and __mp_dbl unless NO___MP_DBL is defined. + * sysdeps/ieee754/dbl-64/mpa.h: Don't declare __acr, __cr, __cpymn, + and __inv. + * sysdeps/ieee754/dbl-64/mpsqrt.c: Make fastiroot static. + * sysdeps/ieee754/dbl-64/s_atan.c: Define __signArctan using + __copysign. + * sysdeps/ieee754/dbl-64/s_sin.c: Use __sincostab not sincos. Don't + define aliases when function name is a macro. + * sysdeps/ieee754/dbl-64/sincostab.c: Renamed from + sysdeps/ieee754/dbl-64/sincos.tbl. + * sysdeps/x86_64/fpu/multiarch/Makefile: Add entries to build + fma4-enabled routines. + * sysdeps/x86_64/fpu/multiarch/brandred-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/doasin-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_asin.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_atan2.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_exp.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_log-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_log.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_pow.c: New file. + * sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mpa-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mplog-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mptan-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/s_atan.c: New file. + * sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/s_sin.c: New file. + * sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/s_tan.c: New file. + * sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c: New file. + * sysdeps/ieee754/dbl-64/doasin.c: Adjust for DLA_FMA -> DLA_FMS rename. * sysdeps/ieee754/dbl-64/dosincos.c: Likewise. diff --git a/config.make.in b/config.make.in index 2181d05..d2baf6d 100644 --- a/config.make.in +++ b/config.make.in @@ -59,6 +59,7 @@ have-cpp-asm-debuginfo = @libc_cv_cpp_asm_debuginfo@ enable-check-abi = @enable_check_abi@ have-forced-unwind = @libc_cv_forced_unwind@ have-fpie = @libc_cv_fpie@ +have-mfma4 = @libc_cv_cc_fma4@ gnu89-inline-CFLAGS = @gnu89_inline@ have-ssp = @libc_cv_ssp@ have-selinux = @have_selinux@ @@ -623,6 +623,7 @@ elf ldd_rewrite_script use_ldconfig libc_cv_as_i686 +libc_cv_cc_fma4 libc_cv_cc_novzeroupper libc_cv_cc_avx libc_cv_cc_sse4 @@ -7944,6 +7945,7 @@ fi + if test $elf = yes; then $as_echo "#define HAVE_ELF 1" >>confdefs.h diff --git a/configure.in b/configure.in index 6977fe1..9678cbe 100644 --- a/configure.in +++ b/configure.in @@ -2339,6 +2339,7 @@ AC_SUBST(libc_cv_cpp_asm_debuginfo) AC_SUBST(libc_cv_cc_sse4) AC_SUBST(libc_cv_cc_avx) AC_SUBST(libc_cv_cc_novzeroupper) +AC_SUBST(libc_cv_cc_fma4) AC_SUBST(libc_cv_as_i686) AC_SUBST(use_ldconfig) diff --git a/math/Makefile b/math/Makefile index 431eb5a..41340da 100644 --- a/math/Makefile +++ b/math/Makefile @@ -66,7 +66,7 @@ include ../Makeconfig dbl-only-routines := branred doasin dosincos halfulp mpa mpatan2 \ mpatan mpexp mplog mpsqrt mptan sincos32 slowexp \ - slowpow + slowpow sincostab libm-routines = $(strip $(libm-support) $(libm-calls) \ $(patsubst %_rf,%f_r,$(libm-calls:=f)) \ $(long-m-$(long-double-fcts))) \ diff --git a/sysdeps/ieee754/dbl-64/dosincos.c b/sysdeps/ieee754/dbl-64/dosincos.c index d5c6a14..712d585 100644 --- a/sysdeps/ieee754/dbl-64/dosincos.c +++ b/sysdeps/ieee754/dbl-64/dosincos.c @@ -35,11 +35,16 @@ #include "endian.h" #include "mydefs.h" -#include "sincos.tbl" #include <dla.h> #include "dosincos.h" #include "math_private.h" +extern const union +{ + int4 i[880]; + double x[440]; +} __sincostab attribute_hidden; + /***********************************************************************/ /* Routine receive Double-Length number (x+dx) and computing sin(x+dx) */ /* as Double-Length number and store it at array v .It computes it by */ @@ -66,10 +71,10 @@ void __dubsin(double x, double dx, double v[]) { dd=(x-d)+dx; /* sin(x+dx)=sin(Xi+t)=sin(Xi)*cos(t) + cos(Xi)sin(t) where t ->0 */ MUL2(d,dd,d,dd,d2,dd2,p,hx,tx,hy,ty,q,c,cc); - sn=sincos.x[k]; /* */ - ssn=sincos.x[k+1]; /* sin(Xi) and cos(Xi) */ - cs=sincos.x[k+2]; /* */ - ccs=sincos.x[k+3]; /* */ + sn=__sincostab.x[k]; /* */ + ssn=__sincostab.x[k+1]; /* sin(Xi) and cos(Xi) */ + cs=__sincostab.x[k+2]; /* */ + ccs=__sincostab.x[k+3]; /* */ MUL2(d2,dd2,s7.x,ss7.x,ds,dss,p,hx,tx,hy,ty,q,c,cc); /* Taylor */ ADD2(ds,dss,s5.x,ss5.x,ds,dss,r,s); MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc); /* series */ @@ -118,10 +123,10 @@ void __dubcos(double x, double dx, double v[]) { d=x+dx; dd=(x-d)+dx; /* cos(x+dx)=cos(Xi+t)=cos(Xi)cos(t) - sin(Xi)sin(t) */ MUL2(d,dd,d,dd,d2,dd2,p,hx,tx,hy,ty,q,c,cc); - sn=sincos.x[k]; /* */ - ssn=sincos.x[k+1]; /* sin(Xi) and cos(Xi) */ - cs=sincos.x[k+2]; /* */ - ccs=sincos.x[k+3]; /* */ + sn=__sincostab.x[k]; /* */ + ssn=__sincostab.x[k+1]; /* sin(Xi) and cos(Xi) */ + cs=__sincostab.x[k+2]; /* */ + ccs=__sincostab.x[k+3]; /* */ MUL2(d2,dd2,s7.x,ss7.x,ds,dss,p,hx,tx,hy,ty,q,c,cc); ADD2(ds,dss,s5.x,ss5.x,ds,dss,r,s); MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc); diff --git a/sysdeps/ieee754/dbl-64/e_asin.c b/sysdeps/ieee754/dbl-64/e_asin.c index 02efb7a..cd4cc2e 100644 --- a/sysdeps/ieee754/dbl-64/e_asin.c +++ b/sysdeps/ieee754/dbl-64/e_asin.c @@ -324,7 +324,9 @@ double __ieee754_asin(double x){ return u.x/v.x; /* NaN */ } } +#ifndef __ieee754_asin strong_alias (__ieee754_asin, __asin_finite) +#endif /*******************************************************************/ /* */ @@ -636,4 +638,6 @@ double __ieee754_acos(double x) return u.x/v.x; } } +#ifndef __ieee754_acos strong_alias (__ieee754_acos, __acos_finite) +#endif diff --git a/sysdeps/ieee754/dbl-64/e_atan2.c b/sysdeps/ieee754/dbl-64/e_atan2.c index 264791e..9caaccc 100644 --- a/sysdeps/ieee754/dbl-64/e_atan2.c +++ b/sysdeps/ieee754/dbl-64/e_atan2.c @@ -51,7 +51,11 @@ /* round to nearest mode of IEEE 754 standard. */ /************************************************************************/ static double atan2Mp(double ,double ,const int[]); -static double signArctan2(double ,double); + /* Fix the sign and return after stage 1 or stage 2 */ +static double signArctan2(double y,double z) +{ + return __copysign(z, y); +} static double normalized(double ,double,double ,double); void __mpatan2(mp_no *,mp_no *,mp_no *,int); @@ -375,7 +379,9 @@ double __ieee754_atan2(double y,double x) { } } } +#ifndef __ieee754_atan2 strong_alias (__ieee754_atan2, __atan2_finite) +#endif /* Treat the Denormalized case */ static double normalized(double ax,double ay,double y, double z) @@ -387,11 +393,6 @@ static double normalized(double ax,double ay,double y, double z) __sub(&mpz,&mperr,&mpz2,p); __mp_dbl(&mpz2,&z,p); return signArctan2(y,z); } - /* Fix the sign and return after stage 1 or stage 2 */ -static double signArctan2(double y,double z) -{ - return ((y<ZERO) ? -z : z); -} /* Stage 3: Perform a multi-Precision computation */ static double atan2Mp(double x,double y,const int pr[]) { diff --git a/sysdeps/ieee754/dbl-64/e_exp.c b/sysdeps/ieee754/dbl-64/e_exp.c index f4b34a6..48bbb05 100644 --- a/sysdeps/ieee754/dbl-64/e_exp.c +++ b/sysdeps/ieee754/dbl-64/e_exp.c @@ -145,7 +145,9 @@ double __ieee754_exp(double x) { else return __slowexp(x); } } +#ifndef __ieee754_exp strong_alias (__ieee754_exp, __exp_finite) +#endif /************************************************************************/ /* Compute e^(x+xx)(Double-Length number) .The routine also receive */ diff --git a/sysdeps/ieee754/dbl-64/e_log.c b/sysdeps/ieee754/dbl-64/e_log.c index b7df81b..7a0a26f 100644 --- a/sysdeps/ieee754/dbl-64/e_log.c +++ b/sysdeps/ieee754/dbl-64/e_log.c @@ -207,4 +207,6 @@ double __ieee754_log(double x) { } return y1; } +#ifndef __ieee754_log strong_alias (__ieee754_log, __log_finite) +#endif diff --git a/sysdeps/ieee754/dbl-64/e_pow.c b/sysdeps/ieee754/dbl-64/e_pow.c index 0c7abb6..94b1ab8 100644 --- a/sysdeps/ieee754/dbl-64/e_pow.c +++ b/sysdeps/ieee754/dbl-64/e_pow.c @@ -153,7 +153,9 @@ double __ieee754_pow(double x, double y) { if (y<0) return (x<1.0)?INF.x:0; return 0; /* unreachable, to make the compiler happy */ } +#ifndef __ieee754_pow strong_alias (__ieee754_pow, __pow_finite) +#endif /**************************************************************************/ /* Computing x^y using more accurate but more slow log routine */ diff --git a/sysdeps/ieee754/dbl-64/mpa.c b/sysdeps/ieee754/dbl-64/mpa.c index 68647ba..ad5a639 100644 --- a/sysdeps/ieee754/dbl-64/mpa.c +++ b/sysdeps/ieee754/dbl-64/mpa.c @@ -1,8 +1,7 @@ - /* * IBM Accurate Mathematical Library * written by International Business Machines Corp. - * Copyright (C) 2001 Free Software Foundation + * Copyright (C) 2001, 2011 Free Software Foundation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -64,7 +63,7 @@ static int mcr(const mp_no *x, const mp_no *y, int p) { /* acr() compares the absolute values of two multiple precision numbers */ -int __acr(const mp_no *x, const mp_no *y, int p) { +static int __acr(const mp_no *x, const mp_no *y, int p) { int i; if (X[0] == ZERO) { @@ -82,8 +81,9 @@ int __acr(const mp_no *x, const mp_no *y, int p) { } +#if 0 /* cr90 compares the values of two multiple precision numbers */ -int __cr(const mp_no *x, const mp_no *y, int p) { +static int __cr(const mp_no *x, const mp_no *y, int p) { int i; if (X[0] > Y[0]) i= 1; @@ -93,26 +93,26 @@ int __cr(const mp_no *x, const mp_no *y, int p) { return i; } +#endif +#ifndef NO___CPY /* Copy a multiple precision number. Set *y=*x. x=y is permissible. */ void __cpy(const mp_no *x, mp_no *y, int p) { - int i; - EY = EX; - for (i=0; i <= p; i++) Y[i] = X[i]; - - return; + for (int i=0; i <= p; i++) Y[i] = X[i]; } +#endif +#if 0 /* Copy a multiple precision number x of precision m into a */ /* multiple precision number y of precision n. In case n>m, */ /* the digits of y beyond the m'th are set to zero. In case */ /* n<m, the digits of x beyond the n'th are ignored. */ /* x=y is permissible. */ -void __cpymn(const mp_no *x, int m, mp_no *y, int n) { +static void __cpymn(const mp_no *x, int m, mp_no *y, int n) { int i,k; @@ -122,7 +122,10 @@ void __cpymn(const mp_no *x, int m, mp_no *y, int n) { return; } +#endif + +#ifndef NO___MP_DBL /* Convert a multiple precision number *x into a double precision */ /* number *y, normalized case (|x| >= 2**(-1022))) */ static void norm(const mp_no *x, double *y, int p) @@ -141,7 +144,7 @@ static void norm(const mp_no *x, double *y, int p) } else { for (a=ONE, z[1]=X[1]; z[1] < TWO23; ) - {a *= TWO; z[1] *= TWO; } + {a *= TWO; z[1] *= TWO; } for (i=2; i<5; i++) { z[i] = X[i]*a; @@ -157,10 +160,10 @@ static void norm(const mp_no *x, double *y, int p) if (v == TWO18) { if (z[4] == ZERO) { - for (i=5; i <= p; i++) { - if (X[i] == ZERO) continue; - else {z[3] += ONE; break; } - } + for (i=5; i <= p; i++) { + if (X[i] == ZERO) continue; + else {z[3] += ONE; break; } + } } else z[3] += ONE; } @@ -242,6 +245,7 @@ void __mp_dbl(const mp_no *x, double *y, int p) { else if (EX==-42 && X[1]>=TWO10) norm(x,y,p); else denorm(x,y,p); } +#endif /* dbl_mp() converts a double precision number x into a multiple precision */ @@ -336,11 +340,11 @@ static void sub_magnitudes(const mp_no *x, const mp_no *y, mp_no *z, int p) { else { i=p; j=p+1-j; k=p; if (Y[j] > ZERO) { - Z[k+1] = RADIX - Y[j--]; - Z[k] = MONE; } + Z[k+1] = RADIX - Y[j--]; + Z[k] = MONE; } else { - Z[k+1] = ZERO; - Z[k] = ZERO; j--;} + Z[k+1] = ZERO; + Z[k] = ZERO; j--;} } } @@ -431,11 +435,11 @@ void __mul(const mp_no *x, const mp_no *y, mp_no *z, int p) { int i, i1, i2, j, k, k2; double u; - /* Is z=0? */ + /* Is z=0? */ if (X[0]*Y[0]==ZERO) { Z[0]=ZERO; return; } - /* Multiply, add and carry */ + /* Multiply, add and carry */ k2 = (p<3) ? p+p : p+3; Z[k2]=ZERO; for (k=k2; k>1; ) { @@ -449,7 +453,7 @@ void __mul(const mp_no *x, const mp_no *y, mp_no *z, int p) { Z[--k] = u*RADIXI; } - /* Is there a carry beyond the most significant digit? */ + /* Is there a carry beyond the most significant digit? */ if (Z[1] == ZERO) { for (i=1; i<=p; i++) Z[i]=Z[i+1]; EZ = EX + EY - 1; } @@ -466,7 +470,7 @@ void __mul(const mp_no *x, const mp_no *y, mp_no *z, int p) { /* 2.001*r**(1-p) for p>3. */ /* *x=0 is not permissible. *x is left unchanged. */ -void __inv(const mp_no *x, mp_no *y, int p) { +static void __inv(const mp_no *x, mp_no *y, int p) { int i; #if 0 int l; @@ -474,11 +478,11 @@ void __inv(const mp_no *x, mp_no *y, int p) { double t; mp_no z,w; static const int np1[] = {0,0,0,0,1,2,2,2,2,3,3,3,3,3,3,3,3,3, - 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4}; + 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4}; const mp_no mptwo = {1,{1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, - 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, - 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, - 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}}; + 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, + 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, + 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}}; __cpy(x,&z,p); z.e=0; __mp_dbl(&z,&t,p); t=ONE/t; __dbl_mp(t,y,p); EY -= EX; diff --git a/sysdeps/ieee754/dbl-64/mpa.h b/sysdeps/ieee754/dbl-64/mpa.h index 4aec48e..3ca0ca5 100644 --- a/sysdeps/ieee754/dbl-64/mpa.h +++ b/sysdeps/ieee754/dbl-64/mpa.h @@ -1,8 +1,7 @@ - /* * IBM Accurate Mathematical Library * Written by International Business Machines Corp. - * Copyright (C) 2001 Free Software Foundation, Inc. + * Copyright (C) 2001, 2011 Free Software Foundation, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -45,14 +44,14 @@ typedef struct {/* This structure holds the details of a multi-precision */ int e; /* floating point number, x: d[0] holds its sign (-1,0 or 1) */ double d[40]; /* e holds its exponent (...,-2,-1,0,1,2,...) and */ } mp_no; /* d[1]...d[p] hold its mantissa digits. The value of x is, */ - /* x = d[1]*r**(e-1) + d[2]*r**(e-2) + ... + d[p]*r**(e-p). */ - /* Here r = 2**24, 0 <= d[i] < r and 1 <= p <= 32. */ - /* p is a global variable. A multi-precision number is */ - /* always normalized. Namely, d[1] > 0. An exception is */ - /* a zero which is characterized by d[0] = 0. The terms */ - /* d[p+1], d[p+2], ... of a none zero number have no */ - /* significance and so are the terms e, d[1],d[2],... */ - /* of a zero. */ + /* x = d[1]*r**(e-1) + d[2]*r**(e-2) + ... + d[p]*r**(e-p). */ + /* Here r = 2**24, 0 <= d[i] < r and 1 <= p <= 32. */ + /* p is a global variable. A multi-precision number is */ + /* always normalized. Namely, d[1] > 0. An exception is */ + /* a zero which is characterized by d[0] = 0. The terms */ + /* d[p+1], d[p+2], ... of a none zero number have no */ + /* significance and so are the terms e, d[1],d[2],... */ + /* of a zero. */ typedef union { int i[2]; double d; } number; @@ -65,16 +64,16 @@ typedef union { int i[2]; double d; } number; #define ABS(x) ((x) < 0 ? -(x) : (x)) -int __acr(const mp_no *, const mp_no *, int); -int __cr(const mp_no *, const mp_no *, int); +// int __acr(const mp_no *, const mp_no *, int); +// int __cr(const mp_no *, const mp_no *, int); void __cpy(const mp_no *, mp_no *, int); -void __cpymn(const mp_no *, int, mp_no *, int); +// void __cpymn(const mp_no *, int, mp_no *, int); void __mp_dbl(const mp_no *, double *, int); void __dbl_mp(double, mp_no *, int); void __add(const mp_no *, const mp_no *, mp_no *, int); void __sub(const mp_no *, const mp_no *, mp_no *, int); void __mul(const mp_no *, const mp_no *, mp_no *, int); -void __inv(const mp_no *, mp_no *, int); +// void __inv(const mp_no *, mp_no *, int); void __dvd(const mp_no *, const mp_no *, mp_no *, int); extern void __mpatan (mp_no *, mp_no *, int); diff --git a/sysdeps/ieee754/dbl-64/mpsqrt.c b/sysdeps/ieee754/dbl-64/mpsqrt.c index 9945de3..bea6232 100644 --- a/sysdeps/ieee754/dbl-64/mpsqrt.c +++ b/sysdeps/ieee754/dbl-64/mpsqrt.c @@ -1,8 +1,7 @@ - /* * IBM Accurate Mathematical Library * written by International Business Machines Corp. - * Copyright (C) 2001 Free Software Foundation + * Copyright (C) 2001, 2011 Free Software Foundation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -41,7 +40,7 @@ /* p as integer. Routine computes sqrt(*x) and stores result in *y */ /****************************************************************************/ -double fastiroot(double); +static double fastiroot(double); void __mpsqrt(mp_no *x, mp_no *y, int p) { #include "mpsqrt.h" @@ -50,11 +49,11 @@ void __mpsqrt(mp_no *x, mp_no *y, int p) { double dx,dy; mp_no mphalf = {0,{0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, - 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, - 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}}, + 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, + 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}}, mp3halfs = {0,{0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, - 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, - 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}}; + 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, + 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}}; mp_no mpxn,mpz,mpu,mpt1,mpt2; /* Prepare multi-precision 1/2 and 3/2 */ @@ -82,7 +81,7 @@ void __mpsqrt(mp_no *x, mp_no *y, int p) { /* Compute a double precision approximation for 1/sqrt(x) */ /* with the relative error bounded by 2**-51. */ /***********************************************************/ -double fastiroot(double x) { +static double fastiroot(double x) { union {int i[2]; double d;} p,q; double y,z, t; int n; diff --git a/sysdeps/ieee754/dbl-64/s_atan.c b/sysdeps/ieee754/dbl-64/s_atan.c index c7f5f3e..65369ff 100644 --- a/sysdeps/ieee754/dbl-64/s_atan.c +++ b/sysdeps/ieee754/dbl-64/s_atan.c @@ -46,7 +46,13 @@ void __mpatan(mp_no *,mp_no *,int); /* see definition in mpatan.c */ static double atanMp(double,const int[]); -double __signArctan(double,double); + + /* Fix the sign of y and return */ +static double __signArctan(double x,double y){ + return __copysign(y, x); +} + + /* An ultimate atan() routine. Given an IEEE double machine number x, */ /* routine computes the correctly rounded (to nearest) value of atan(x). */ double atan(double x) { @@ -203,14 +209,6 @@ double atan(double x) { } - - /* Fix the sign of y and return */ -double __signArctan(double x,double y){ - - if (x<ZERO) return -y; - else return y; -} - /* Final stages. Compute atan(x) by multiple precision arithmetic */ static double atanMp(double x,const int pr[]){ mp_no mpx,mpy,mpy2,mperr,mpt1,mpy1; diff --git a/sysdeps/ieee754/dbl-64/s_sin.c b/sysdeps/ieee754/dbl-64/s_sin.c index b40776f..02d428c 100644 --- a/sysdeps/ieee754/dbl-64/s_sin.c +++ b/sysdeps/ieee754/dbl-64/s_sin.c @@ -1,7 +1,7 @@ /* * IBM Accurate Mathematical Library * written by International Business Machines Corp. - * Copyright (C) 2001, 2009 Free Software Foundation + * Copyright (C) 2001, 2009, 2011 Free Software Foundation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -53,15 +53,20 @@ #include "mydefs.h" #include "usncs.h" #include "MathLib.h" -#include "sincos.tbl" #include "math_private.h" +extern const union +{ + int4 i[880]; + double x[440]; +} __sincostab attribute_hidden; + static const double - sn3 = -1.66666666666664880952546298448555E-01, - sn5 = 8.33333214285722277379541354343671E-03, - cs2 = 4.99999999999999999999950396842453E-01, - cs4 = -4.16666666666664434524222570944589E-02, - cs6 = 1.38888874007937613028114285595617E-03; + sn3 = -1.66666666666664880952546298448555E-01, + sn5 = 8.33333214285722277379541354343671E-03, + cs2 = 4.99999999999999999999950396842453E-01, + cs4 = -4.16666666666664434524222570944589E-02, + cs6 = 1.38888874007937613028114285595617E-03; void __dubsin(double x, double dx, double w[]); void __docos(double x, double dx, double w[]); @@ -120,10 +125,10 @@ double __sin(double x){ s = y + y*xx*(sn3 +xx*sn5); c = xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=(m>0)?sincos.x[k]:-sincos.x[k]; - ssn=(m>0)?sincos.x[k+1]:-sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=(m>0)?__sincostab.x[k]:-__sincostab.x[k]; + ssn=(m>0)?__sincostab.x[k+1]:-__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; cor=(ssn+s*ccs-sn*c)+cs*s; res=sn+cor; cor=(sn-res)+cor; @@ -146,10 +151,10 @@ double __sin(double x){ s = y + y*xx*(sn3 +xx*sn5); c = xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; cor=(ccs-s*ssn-cs*c)-sn*s; res=cs+cor; cor=(cs-res)+cor; @@ -174,7 +179,7 @@ double __sin(double x){ xx = a*a; if (n) {a=-a;da=-da;} if (xx < 0.01588) { - /*Taylor series */ + /*Taylor series */ t = (((((s5.x*xx + s4.x)*xx + s3.x)*xx + s2.x)*xx + s1.x)*a - 0.5*da)*xx+da; res = a+t; cor = (a-res)+t; @@ -192,10 +197,10 @@ double __sin(double x){ s = y + (db+y*xx*(sn3 +xx*sn5)); c = y*db+xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; cor=(ssn+s*ccs-sn*c)+cs*s; res=sn+cor; cor=(sn-res)+cor; @@ -212,10 +217,10 @@ double __sin(double x){ y=a-(u.x-big.x)+da; xx=y*y; k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; s = y + y*xx*(sn3 +xx*sn5); c = xx*(cs2 +xx*(cs4 + xx*cs6)); cor=(ccs-s*ssn-cs*c)-sn*s; @@ -253,7 +258,7 @@ double __sin(double x){ xx = a*a; if (n) {a=-a;da=-da;} if (xx < 0.01588) { - /* Taylor series */ + /* Taylor series */ t = (((((s5.x*xx + s4.x)*xx + s3.x)*xx + s2.x)*xx + s1.x)*a - 0.5*da)*xx+da; res = a+t; cor = (a-res)+t; @@ -269,10 +274,10 @@ double __sin(double x){ s = y + (db+y*xx*(sn3 +xx*sn5)); c = y*db+xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; cor=(ssn+s*ccs-sn*c)+cs*s; res=sn+cor; cor=(sn-res)+cor; @@ -289,10 +294,10 @@ double __sin(double x){ y=a-(u.x-big.x)+da; xx=y*y; k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; s = y + y*xx*(sn3 +xx*sn5); c = xx*(cs2 +xx*(cs4 + xx*cs6)); cor=(ccs-s*ssn-cs*c)-sn*s; @@ -364,10 +369,10 @@ double __cos(double x) s = y + y*xx*(sn3 +xx*sn5); c = xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; cor=(ccs-s*ssn-cs*c)-sn*s; res=cs+cor; cor=(cs-res)+cor; @@ -396,10 +401,10 @@ double __cos(double x) s = y + (db+y*xx*(sn3 +xx*sn5)); c = y*db+xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; cor=(ssn+s*ccs-sn*c)+cs*s; res=sn+cor; cor=(sn-res)+cor; @@ -442,10 +447,10 @@ double __cos(double x) s = y + (db+y*xx*(sn3 +xx*sn5)); c = y*db+xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; cor=(ssn+s*ccs-sn*c)+cs*s; res=sn+cor; cor=(sn-res)+cor; @@ -461,10 +466,10 @@ double __cos(double x) y=a-(u.x-big.x)+da; xx=y*y; k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; s = y + y*xx*(sn3 +xx*sn5); c = xx*(cs2 +xx*(cs4 + xx*cs6)); cor=(ccs-s*ssn-cs*c)-sn*s; @@ -473,7 +478,7 @@ double __cos(double x) cor = (cor>0)? 1.025*cor+eps : 1.025*cor-eps; return (res==res+cor)? ((n)?-res:res) : csloww2(a,da,x,n); - break; + break; } @@ -517,10 +522,10 @@ double __cos(double x) s = y + (db+y*xx*(sn3 +xx*sn5)); c = y*db+xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; cor=(ssn+s*ccs-sn*c)+cs*s; res=sn+cor; cor=(sn-res)+cor; @@ -536,10 +541,10 @@ double __cos(double x) y=a-(u.x-big.x)+da; xx=y*y; k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; s = y + y*xx*(sn3 +xx*sn5); c = xx*(cs2 +xx*(cs4 + xx*cs6)); cor=(ccs-s*ssn-cs*c)-sn*s; @@ -611,7 +616,7 @@ static const double th2_36 = 206158430208.0; /* 1.5*2**37 */ } } /*******************************************************************************/ -/* Routine compute sin(x) for 0.25<|x|< 0.855469 by sincos.tbl and Taylor */ +/* Routine compute sin(x) for 0.25<|x|< 0.855469 by __sincostab.tbl and Taylor */ /* and if result still doesn't accurate enough by mpsin or dubsin */ /*******************************************************************************/ @@ -627,10 +632,10 @@ static double slow1(double x) { s = y*xx*(sn3 +xx*sn5); c = xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; /* Data */ - ssn=sincos.x[k+1]; /* from */ - cs=sincos.x[k+2]; /* tables */ - ccs=sincos.x[k+3]; /* sincos.tbl */ + sn=__sincostab.x[k]; /* Data */ + ssn=__sincostab.x[k+1]; /* from */ + cs=__sincostab.x[k+2]; /* tables */ + ccs=__sincostab.x[k+3]; /* __sincostab.tbl */ y1 = (y+t22)-t22; y2 = y - y1; c1 = (cs+t22)-t22; @@ -648,7 +653,7 @@ static double slow1(double x) { } } /**************************************************************************/ -/* Routine compute sin(x) for 0.855469 <|x|<2.426265 by sincos.tbl */ +/* Routine compute sin(x) for 0.855469 <|x|<2.426265 by __sincostab.tbl */ /* and if result still doesn't accurate enough by mpsin or dubsin */ /**************************************************************************/ static double slow2(double x) { @@ -672,10 +677,10 @@ static double slow2(double x) { s = y*xx*(sn3 +xx*sn5); c = y*del+xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; y1 = (y+t22)-t22; y2 = (y - y1)+del; e1 = (sn+t22)-t22; @@ -763,10 +768,10 @@ static double sloww1(double x, double dx, double orig) { s = y*xx*(sn3 +xx*sn5); c = xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; y1 = (y+t22)-t22; y2 = (y - y1)+dx; c1 = (cs+t22)-t22; @@ -805,10 +810,10 @@ static double sloww2(double x, double dx, double orig, int n) { s = y*xx*(sn3 +xx*sn5); c = y*dx+xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; y1 = (y+t22)-t22; y2 = (y - y1)+dx; @@ -882,10 +887,10 @@ mynumber u; s = y*xx*(sn3 +xx*sn5); c = xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; y1 = (y+t22)-t22; y2 = (y - y1)+dx; c1 = (cs+t22)-t22; @@ -925,10 +930,10 @@ mynumber u; s = y*xx*(sn3 +xx*sn5); c = y*dx+xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; y1 = (y+t22)-t22; y2 = (y - y1)+dx; @@ -966,10 +971,10 @@ static double cslow2(double x) { s = y*xx*(sn3 +xx*sn5); c = xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; y1 = (y+t22)-t22; y2 = y - y1; e1 = (sn+t22)-t22; @@ -1059,10 +1064,10 @@ static double csloww1(double x, double dx, double orig) { s = y*xx*(sn3 +xx*sn5); c = xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; y1 = (y+t22)-t22; y2 = (y - y1)+dx; c1 = (cs+t22)-t22; @@ -1103,10 +1108,10 @@ static double csloww2(double x, double dx, double orig, int n) { s = y*xx*(sn3 +xx*sn5); c = y*dx+xx*(cs2 +xx*(cs4 + xx*cs6)); k=u.i[LOW_HALF]<<2; - sn=sincos.x[k]; - ssn=sincos.x[k+1]; - cs=sincos.x[k+2]; - ccs=sincos.x[k+3]; + sn=__sincostab.x[k]; + ssn=__sincostab.x[k+1]; + cs=__sincostab.x[k+2]; + ccs=__sincostab.x[k+3]; y1 = (y+t22)-t22; y2 = (y - y1)+dx; @@ -1127,12 +1132,17 @@ static double csloww2(double x, double dx, double orig, int n) { } } +#ifndef __cos weak_alias (__cos, cos) +# ifdef NO_LONG_DOUBLE +strong_alias (__cos, __cosl) +weak_alias (__cos, cosl) +# endif +#endif +#ifndef __sin weak_alias (__sin, sin) - -#ifdef NO_LONG_DOUBLE +# ifdef NO_LONG_DOUBLE strong_alias (__sin, __sinl) weak_alias (__sin, sinl) -strong_alias (__cos, __cosl) -weak_alias (__cos, cosl) +# endif #endif diff --git a/sysdeps/ieee754/dbl-64/sincos.tbl b/sysdeps/ieee754/dbl-64/sincostab.c index 9343f24..49fccac 100644 --- a/sysdeps/ieee754/dbl-64/sincos.tbl +++ b/sysdeps/ieee754/dbl-64/sincostab.c @@ -1,7 +1,7 @@ /* * IBM Accurate Mathematical Library * Written by International Business Machines Corp. - * Copyright (C) 2001, 2007 Free Software Foundation, Inc. + * Copyright (C) 2001, 2007, 2011 Free Software Foundation, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -18,13 +18,16 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +#include <mydefs.h> +#include <endian.h> + /****************************************************************/ /* TABLES FOR THE usin() and ucos() FUNCTION */ /****************************************************************/ #ifdef BIG_ENDI -static const union {int4 i[880]; double x[440];}sincos = { .i = { +const union {int4 i[880]; double x[440];}__sincostab = { .i = { /**/ 0x00000000, 0x00000000, /**/ 0x00000000, 0x00000000, /**/ 0x3FF00000, 0x00000000, @@ -467,7 +470,7 @@ static const union {int4 i[880]; double x[440];}sincos = { .i = { /**/ 0x3C747A10, 0x8073C259 } }; #else #ifdef LITTLE_ENDI -static const union {int4 i[880]; double x[440];} sincos = { .i = { +const union {int4 i[880]; double x[440];} __sincostab = { .i = { /**/ 0x00000000, 0x00000000, /**/ 0x00000000, 0x00000000, /**/ 0x00000000, 0x3FF00000, diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile index bd07e98..70cb740 100644 --- a/sysdeps/x86_64/fpu/multiarch/Makefile +++ b/sysdeps/x86_64/fpu/multiarch/Makefile @@ -1,4 +1,36 @@ ifeq ($(subdir),math) libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \ s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c + +ifeq ($(have-mfma4),yes) +libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \ + e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \ + mplog-fma4 mpa-fma4 slowexp-fma4 slowpow-fma4 \ + sincos32-fma4 doasin-fma4 dosincos-fma4 \ + brandred-fma4 halfulp-fma4 mpexp-fma4 \ + mpatan2-fma4 mpatan-fma4 mpsqrt-fma4 mptan-fma4 + +CFLAGS-brandred-fma4.c = -mfma4 +CFLAGS-doasin-fma4.c = -mfma4 +CFLAGS-dosincos-fma4.c = -mfma4 +CFLAGS-e_asin-fma4.c = -mfma4 +CFLAGS-e_atan2-fma4.c = -mfma4 +CFLAGS-e_exp-fma4.c = -mfma4 +CFLAGS-e_log-fma4.c = -mfma4 +CFLAGS-e_pow-fma4.c = -mfma4 +CFLAGS-halfulp-fma4.c = -mfma4 +CFLAGS-mpa-fma4.c = -mfma4 +CFLAGS-mpatan-fma4.c = -mfma4 +CFLAGS-mpatan2-fma4.c = -mfma4 +CFLAGS-mpexp-fma4.c = -mfma4 +CFLAGS-mplog-fma4.c = -mfma4 +CFLAGS-mpsqrt-fma4.c = -mfma4 +CFLAGS-mptan-fma4.c = -mfma4 +CFLAGS-s_atan-fma4.c = -mfma4 +CFLAGS-sincos32-fma4.c = -mfma4 +CFLAGS-slowexp-fma4.c = -mfma4 +CFLAGS-slowpow-fma4.c = -mfma4 +CLFAGS-s_sin-fma4.c = -mfma4 +CLFAGS-s_tan-fma4.c = -mfma4 +endif endif diff --git a/sysdeps/x86_64/fpu/multiarch/brandred-fma4.c b/sysdeps/x86_64/fpu/multiarch/brandred-fma4.c new file mode 100644 index 0000000..93fb5a1 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/brandred-fma4.c @@ -0,0 +1,3 @@ +#define __branred __branred_fma4 + +#include <sysdeps/ieee754/dbl-64/branred.c> diff --git a/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c b/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c new file mode 100644 index 0000000..d7ba67e --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/doasin-fma4.c @@ -0,0 +1,3 @@ +#define __doasin __doasin_fma4 + +#include <sysdeps/ieee754/dbl-64/doasin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c b/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c new file mode 100644 index 0000000..02b420b --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c @@ -0,0 +1,5 @@ +#define __docos __docos_fma4 +#define __dubcos __dubcos_fma4 +#define __dubsin __dubsin_fma4 + +#include <sysdeps/ieee754/dbl-64/dosincos.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c new file mode 100644 index 0000000..938bc84 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c @@ -0,0 +1,10 @@ +#define __ieee754_acos __ieee754_acos_fma4 +#define __ieee754_asin __ieee754_asin_fma4 +#define __cos32 __cos32_fma4 +#define __doasin __doasin_fma4 +#define __docos __docos_fma4 +#define __dubcos __dubcos_fma4 +#define __dubsin __dubsin_fma4 +#define __sin32 __sin32_fma4 + +#include <sysdeps/ieee754/dbl-64/e_asin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_asin.c b/sysdeps/x86_64/fpu/multiarch/e_asin.c new file mode 100644 index 0000000..8882cea --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_asin.c @@ -0,0 +1,23 @@ +#ifdef HAVE_FMA4_SUPPORT +# include <init-arch.h> +# include <math_private.h> + +extern double __ieee754_acos_sse2 (double); +extern double __ieee754_acos_fma4 (double); +extern double __ieee754_asin_sse2 (double); +extern double __ieee754_asin_fma4 (double); + +libm_ifunc (__ieee754_acos, + HAS_FMA4 ? __ieee754_acos_fma4 : __ieee754_acos_sse2); +strong_alias (__ieee754_acos, __acos_finite) + +libm_ifunc (__ieee754_asin, + HAS_FMA4 ? __ieee754_asin_fma4 : __ieee754_asin_sse2); +strong_alias (__ieee754_asin, __asin_finite) + +# define __ieee754_acos __ieee754_acos_sse2 +# define __ieee754_asin __ieee754_asin_sse2 +#endif + + +#include <sysdeps/ieee754/dbl-64/e_asin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c new file mode 100644 index 0000000..84a6f86 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c @@ -0,0 +1,9 @@ +#define __ieee754_atan2 __ieee754_atan2_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __dvd __dvd_fma4 +#define __mpatan2 __mpatan2_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 + +#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_atan2.c b/sysdeps/x86_64/fpu/multiarch/e_atan2.c new file mode 100644 index 0000000..12fc929 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_atan2.c @@ -0,0 +1,16 @@ +#ifdef HAVE_FMA4_SUPPORT +# include <init-arch.h> +# include <math_private.h> + +extern double __ieee754_atan2_sse2 (double, double); +extern double __ieee754_atan2_fma4 (double, double); + +libm_ifunc (__ieee754_atan2, + HAS_FMA4 ? __ieee754_atan2_fma4 : __ieee754_atan2_sse2); +strong_alias (__ieee754_atan2, __atan2_finite) + +# define __ieee754_atan2 __ieee754_atan2_sse2 +#endif + + +#include <sysdeps/ieee754/dbl-64/e_atan2.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c new file mode 100644 index 0000000..942dfff --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c @@ -0,0 +1,5 @@ +#define __ieee754_exp __ieee754_exp_fma4 +#define __exp1 __exp1_fma4 +#define __slowexp __slowexp_fma4 + +#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_exp.c b/sysdeps/x86_64/fpu/multiarch/e_exp.c new file mode 100644 index 0000000..fc1096b --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_exp.c @@ -0,0 +1,15 @@ +#ifdef HAVE_FMA4_SUPPORT +# include <init-arch.h> +# include <math_private.h> + +extern double __ieee754_exp_sse2 (double); +extern double __ieee754_exp_fma4 (double); + +libm_ifunc (__ieee754_exp, HAS_FMA4 ? __ieee754_exp_fma4 : __ieee754_exp_sse2); +strong_alias (__ieee754_exp, __exp_finite) + +# define __ieee754_exp __ieee754_exp_sse2 +#endif + + +#include <sysdeps/ieee754/dbl-64/e_exp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c new file mode 100644 index 0000000..0be66d1 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_log-fma4.c @@ -0,0 +1,7 @@ +#define __ieee754_log __ieee754_log_fma4 +#define __mplog __mplog_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __sub __sub_fma4 + +#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_log.c b/sysdeps/x86_64/fpu/multiarch/e_log.c new file mode 100644 index 0000000..c542646 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_log.c @@ -0,0 +1,15 @@ +#ifdef HAVE_FMA4_SUPPORT +# include <init-arch.h> +# include <math_private.h> + +extern double __ieee754_log_sse2 (double); +extern double __ieee754_log_fma4 (double); + +libm_ifunc (__ieee754_log, HAS_FMA4 ? __ieee754_log_fma4 : __ieee754_log_sse2); +strong_alias (__ieee754_log, __log_finite) + +# define __ieee754_log __ieee754_log_sse2 +#endif + + +#include <sysdeps/ieee754/dbl-64/e_log.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c b/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c new file mode 100644 index 0000000..20313be --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c @@ -0,0 +1,5 @@ +#define __ieee754_pow __ieee754_pow_fma4 +#define __exp1 __exp1_fma4 +#define __slowpow __slowpow_fma4 + +#include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/sysdeps/x86_64/fpu/multiarch/e_pow.c b/sysdeps/x86_64/fpu/multiarch/e_pow.c new file mode 100644 index 0000000..a740b6c --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/e_pow.c @@ -0,0 +1,15 @@ +#ifdef HAVE_FMA4_SUPPORT +# include <init-arch.h> +# include <math_private.h> + +extern double __ieee754_pow_sse2 (double, double); +extern double __ieee754_pow_fma4 (double, double); + +libm_ifunc (__ieee754_pow, HAS_FMA4 ? __ieee754_pow_fma4 : __ieee754_pow_sse2); +strong_alias (__ieee754_pow, __pow_finite) + +# define __ieee754_pow __ieee754_pow_sse2 +#endif + + +#include <sysdeps/ieee754/dbl-64/e_pow.c> diff --git a/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c b/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c new file mode 100644 index 0000000..3fc223e --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c @@ -0,0 +1,3 @@ +#define __halfulp __halfulp_fma4 + +#include <sysdeps/ieee754/dbl-64/halfulp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c new file mode 100644 index 0000000..7b9e2ef --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/mpa-fma4.c @@ -0,0 +1,10 @@ +#define __add __add_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __dvd __dvd_fma4 + +#define NO___CPY 1 +#define NO___MP_DBL 1 + +#include <sysdeps/ieee754/dbl-64/mpa.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c new file mode 100644 index 0000000..942974b --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c @@ -0,0 +1,8 @@ +#define __mpatan __mpatan_fma4 +#define __add __add_fma4 +#define __dvd __dvd_fma4 +#define __mpsqrt __mpsqrt_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 + +#include <sysdeps/ieee754/dbl-64/mpatan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c new file mode 100644 index 0000000..e7c469e --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c @@ -0,0 +1,8 @@ +#define __mpatan2 __mpatan2_fma4 +#define __add __add_fma4 +#define __dvd __dvd_fma4 +#define __mpatan __mpatan_fma4 +#define __mpsqrt __mpsqrt_fma4 +#define __mul __mul_fma4 + +#include <sysdeps/ieee754/dbl-64/mpatan2.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c new file mode 100644 index 0000000..021970c --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c @@ -0,0 +1,7 @@ +#define __mpexp __mpexp_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __dvd __dvd_fma4 +#define __mul __mul_fma4 + +#include <sysdeps/ieee754/dbl-64/mpexp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c b/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c new file mode 100644 index 0000000..9581eaf --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/mplog-fma4.c @@ -0,0 +1,7 @@ +#define __mplog __mplog_fma4 +#define __add __add_fma4 +#define __mpexp __mpexp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 + +#include <sysdeps/ieee754/dbl-64/mplog.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c b/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c new file mode 100644 index 0000000..43b6493 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c @@ -0,0 +1,6 @@ +#define __mpsqrt __mpsqrt_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 + +#include <sysdeps/ieee754/dbl-64/mpsqrt.c> diff --git a/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c b/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c new file mode 100644 index 0000000..767924e --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/mptan-fma4.c @@ -0,0 +1,6 @@ +#define __mptan __mptan_fma4 +#define __c32 __c32_fma4 +#define __dvd __dvd_fma4 +#define __mpranred __mpranred_fma4 + +#include <sysdeps/ieee754/dbl-64/mptan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c new file mode 100644 index 0000000..a8f0977 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c @@ -0,0 +1,8 @@ +#define atan __atan_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpatan __mpatan_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 + +#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_atan.c b/sysdeps/x86_64/fpu/multiarch/s_atan.c new file mode 100644 index 0000000..ffc4a56 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_atan.c @@ -0,0 +1,14 @@ +#ifdef HAVE_FMA4_SUPPORT +# include <init-arch.h> +# include <math.h> + +extern double __atan_sse2 (double); +extern double __atan_fma4 (double); + +libm_ifunc (atan, HAS_FMA4 ? __atan_fma4 : __atan_sse2); + +# define atan __atan_sse2 +#endif + + +#include <sysdeps/ieee754/dbl-64/s_atan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c new file mode 100644 index 0000000..97cef8b --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c @@ -0,0 +1,11 @@ +#define __cos __cos_fma4 +#define __sin __sin_fma4 +#define __branred __branred_fma4 +#define __docos __docos_fma4 +#define __dubsin __dubsin_fma4 +#define __mpcos __mpcos_fma4 +#define __mpcos1 __mpcos1_fma4 +#define __mpsin __mpsin_fma4 +#define __mpsin1 __mpsin1_fma4 + +#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_sin.c b/sysdeps/x86_64/fpu/multiarch/s_sin.c new file mode 100644 index 0000000..a7c35dc --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_sin.c @@ -0,0 +1,22 @@ +#ifdef HAVE_FMA4_SUPPORT +# include <init-arch.h> +# include <math.h> +# undef NAN + +extern double __cos_sse2 (double); +extern double __cos_fma4 (double); +extern double __sin_sse2 (double); +extern double __sin_fma4 (double); + +libm_ifunc (__cos, HAS_FMA4 ? __cos_fma4 : __cos_sse2); +weak_alias (__cos, cos) + +libm_ifunc (__sin, HAS_FMA4 ? __sin_fma4 : __sin_sse2); +weak_alias (__sin, sin) + +# define __cos __cos_sse2 +# define __sin __sin_sse2 +#endif + + +#include <sysdeps/ieee754/dbl-64/s_sin.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c b/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c new file mode 100644 index 0000000..c3cefc2 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c @@ -0,0 +1,9 @@ +#define tan __tan_fma4 +#define __branred __branred_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpranred __mpranred_fma4 +#define __mptan __mptan_fma4 +#define __sub __sub_fma4 + + +#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/s_tan.c b/sysdeps/x86_64/fpu/multiarch/s_tan.c new file mode 100644 index 0000000..cca02b5 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/s_tan.c @@ -0,0 +1,14 @@ +#ifdef HAVE_FMA4_SUPPORT +# include <init-arch.h> +# include <math.h> + +extern double __tan_sse2 (double); +extern double __tan_fma4 (double); + +libm_ifunc (tan, HAS_FMA4 ? __tan_fma4 : __tan_sse2); + +# define tan __tan_sse2 +#endif + + +#include <sysdeps/ieee754/dbl-64/s_tan.c> diff --git a/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c b/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c new file mode 100644 index 0000000..f0d2d27 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c @@ -0,0 +1,14 @@ +#define __cos32 __cos32_fma4 +#define __sin32 __sin32_fma4 +#define __c32 __c32_fma4 +#define __mpsin __mpsin_fma4 +#define __mpsin1 __mpsin1_fma4 +#define __mpcos __mpcos_fma4 +#define __mpcos1 __mpcos1_fma4 +#define __mpranred __mpranred_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 + +#include <sysdeps/ieee754/dbl-64/sincos32.c> diff --git a/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c b/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c new file mode 100644 index 0000000..83cb359 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c @@ -0,0 +1,8 @@ +#define __slowexp __slowexp_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpexp __mpexp_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 + +#include <sysdeps/ieee754/dbl-64/slowexp.c> diff --git a/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c b/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c new file mode 100644 index 0000000..744f3f6 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c @@ -0,0 +1,10 @@ +#define __slowpow __slowpow_fma4 +#define __add __add_fma4 +#define __dbl_mp __dbl_mp_fma4 +#define __mpexp __mpexp_fma4 +#define __mplog __mplog_fma4 +#define __mul __mul_fma4 +#define __sub __sub_fma4 +#define __halfulp __halfulp_fma4 + +#include <sysdeps/ieee754/dbl-64/slowpow.c> |