diff options
21 files changed, 1103 insertions, 47 deletions
@@ -251,6 +251,58 @@ * sysdeps/x86_64/fpu/test-float-libmvec-sincosf.c: Likewise. * sysdeps/x86_64/fpu/Makefile: Added new tests. +2016-10-14 Anton Blanchard <anton@samba.org> + + * sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile + [$(subdir) = math] (libm-sysdep_routines): Add s_sinf-power8 and + s_sinf-ppc64. + * sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf-power8.S: New file. + * sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf-ppc64.c: Likewise. + * sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf.c: Likewise. + * sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S: Likewise. + +2016-10-14 Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com> + + * sysdeps/powerpc/fpu/libm-test-ulps: Update. + * sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile + [$(subdir) = math] (libm-sysdep_routines): Add e_expf-power8 and + e_expf-ppc64. + * sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-power8.S: New file. + * sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-ppc64.c: Likewise. + * sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf.c: Likewise. + * sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S: Likewise. + +2016-04-12 Joseph Myers <joseph@codesourcery.com> + + * sysdeps/unix/sysv/linux/i386/lowlevellock.h + (lll_unlock_elision): Add adapt_count parameter. + +2016-04-12 Paul E. Murphy <murphyp@linux.vnet.ibm.com> + + * sysdeps/unix/sysv/linux/powerpc/elision-lock.c + (__lll_lock_elision): Remove adapt_count decrement... + * sysdeps/unix/sysv/linux/powerpc/elision-trylock.c + (__lll_trylock_elision): Likewise. + * sysdeps/unix/sysv/linux/powerpc/elision-unlock.c + (__lll_unlock_elision): ... to here. And utilize + new adapt_count parameter. + * sysdeps/unix/sysv/linux/powerpc/lowlevellock.h + (__lll_unlock_elision): Update to include adapt_count + parameter. + (lll_unlock_elision): Pass pointer to adapt_count + variable. + +2016-04-12 Paul E. 
Murphy <murphyp@linux.vnet.ibm.com> + + * nptl/pthread_mutex_unlock.c (lll_unlock_elision): + Add elision adapt_count parameter to list of arguments. + * sysdeps/unix/sysv/linux/powerpc/lowlevellock.h + (lll_unlock_elision): Update with new parameter list. + * sysdeps/unix/sysv/linux/s390/lowlevellock.h + (lll_unlock_elision): Likewise. + * sysdeps/unix/sysv/linux/x86_64/lowlevellock.h + (lll_unlock_elision): Likewise. + 2016-08-17 Florian Weimer <fweimer@redhat.com> Reduce time to expected nptl/tst-once5 failure. @@ -700,6 +752,16 @@ 2016-02-22 Paul E. Murphy <murphyp@linux.vnet.ibm.com> + * sysdeps/unix/sysv/linux/powerpc/elision-trylock.c + (__lll_trylock_elision): Fix setting of adapt_count. + * sysdeps/unix/sysv/linux/powerpc/htm.h + (_ABORT_PERSISTENT): Define to clarify persistent aborts. + (_ABORT_NESTED_TRYLOCK): Renumber, and make persistent. + (_ABORT_SYSCALL): Renumber, and clarify definition. + (_ABORT_LOCK_BUSY): Renumber, make non-persistent. + +2016-02-22 Paul E. Murphy <murphyp@linux.vnet.ibm.com> + * sysdeps/unix/sysv/linux/powerpc/htm.h (__libc_tbegin): Remove semicolon. (__libc_tend): Likewise. @@ -1042,6 +1104,25 @@ * sysdeps/hppa/dl-symaddr.c (_dl_symbol_address): Add rtld_hidden_def. +2015-08-26 Paul E. Murphy <murphyp@linux.vnet.ibm.com> + + * sysdeps/unix/sysv/linux/powerpc/elision-lock.c + (__arch_compare_and_exchange_val_32_acq): Remove and use common + definition. ISA 2.07B no longer requires full sync. + +2015-08-26 Paul E. Murphy <murphyp@linux.vnet.ibm.com> + + * sysdeps/powerpc/powerpc32/sysdep.h (ABORT_TRANSACTION): Use + register other than r0 for tabort, because r0 has special meaning. + * sysdeps/powerpc/powerpc64/sysdep.h (ABORT_TRANSACTION): Likewise. + * sysdeps/unix/sysv/linux/powerpc/syscall.S (syscall): Abort + transaction before starting syscall. 
+ +2015-08-06 Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com> + + * elf/get-dynamic-info.h (elf_get_dynamic_info): Remove assert + if DT_RUNPATH and DT_RPATH flags are found in ld.so. + 2015-08-05 Zack Weinberg <zackw@panix.com> * misc/regexp.h: Update comments. diff --git a/elf/get-dynamic-info.h b/elf/get-dynamic-info.h index dc8359d..529ed6c 100644 --- a/elf/get-dynamic-info.h +++ b/elf/get-dynamic-info.h @@ -138,9 +138,6 @@ elf_get_dynamic_info (struct link_map *l, ElfW(Dyn) *temp) || (info[VERSYMIDX (DT_FLAGS_1)]->d_un.d_val & ~DF_1_NOW) == 0); assert (info[DT_FLAGS] == NULL || (info[DT_FLAGS]->d_un.d_val & ~DF_BIND_NOW) == 0); - /* Flags must not be set for ld.so. */ - assert (info[DT_RUNPATH] == NULL); - assert (info[DT_RPATH] == NULL); #else if (info[DT_FLAGS] != NULL) { diff --git a/nptl/pthread_mutex_unlock.c b/nptl/pthread_mutex_unlock.c index 80939ba..9e864c1 100644 --- a/nptl/pthread_mutex_unlock.c +++ b/nptl/pthread_mutex_unlock.c @@ -24,7 +24,7 @@ #include <stap-probe.h> #ifndef lll_unlock_elision -#define lll_unlock_elision(a,b) ({ lll_unlock (a,b); 0; }) +#define lll_unlock_elision(a,b,c) ({ lll_unlock (a,c); 0; }) #endif static int @@ -63,7 +63,7 @@ __pthread_mutex_unlock_usercnt (mutex, decr) else if (__glibc_likely (type == PTHREAD_MUTEX_TIMED_ELISION_NP)) { /* Don't reset the owner/users fields for elision. 
*/ - return lll_unlock_elision (mutex->__data.__lock, + return lll_unlock_elision (mutex->__data.__lock, mutex->__data.__elision, PTHREAD_MUTEX_PSHARED (mutex)); } else if (__builtin_expect (PTHREAD_MUTEX_TYPE (mutex) diff --git a/sysdeps/powerpc/fpu/libm-test-ulps b/sysdeps/powerpc/fpu/libm-test-ulps index 5a78b0b..3a47163 100644 --- a/sysdeps/powerpc/fpu/libm-test-ulps +++ b/sysdeps/powerpc/fpu/libm-test-ulps @@ -1574,8 +1574,10 @@ ildouble: 1 ldouble: 1 Function: "exp_upward": +float: 1 double: 1 idouble: 1 +ifloat: 1 ildouble: 1 ldouble: 1 diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile index 0e3eac7..add1fb8 100644 --- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile @@ -24,7 +24,9 @@ libm-sysdep_routines += s_isnan-power7 s_isnan-power6x s_isnan-power6 \ s_modff-power5+ s_modff-ppc64 e_hypot-ppc64 \ e_hypot-power7 e_hypotf-ppc64 e_hypotf-power7 \ s_isnan-power8 s_isinf-power8 s_finite-power8 \ - s_llrint-power8 s_llround-power8 + s_llrint-power8 s_llround-power8 \ + e_expf-power8 e_expf-ppc64 \ + s_sinf-ppc64 s_sinf-power8 CFLAGS-s_logbf-power7.c = -mcpu=power7 CFLAGS-s_logbl-power7.c = -mcpu=power7 diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-power8.S b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-power8.S new file mode 100644 index 0000000..02eff24 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-power8.S @@ -0,0 +1,26 @@ +/* __ieee754_expf() POWER8 version. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#undef strong_alias +#define strong_alias(a, b) + +#define __ieee754_expf __ieee754_expf_power8 + +#include <sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-ppc64.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-ppc64.c new file mode 100644 index 0000000..40f9e3a --- /dev/null +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf-ppc64.c @@ -0,0 +1,24 @@ +/* __ieee754_expf() PowerPC64 version. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#undef strong_alias +#define strong_alias(a, b) + +#define __ieee754_expf __ieee754_expf_ppc64 + +#include <sysdeps/ieee754/flt-32/e_expf.c> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf.c new file mode 100644 index 0000000..1d9a8c6 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/e_expf.c @@ -0,0 +1,31 @@ +/* Multiple versions of ieee754_expf. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> +#include <math_ldbl_opt.h> +#include "init-arch.h" + +extern __typeof (__ieee754_expf) __ieee754_expf_ppc64 attribute_hidden; +extern __typeof (__ieee754_expf) __ieee754_expf_power8 attribute_hidden; + +libc_ifunc (__ieee754_expf, + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __ieee754_expf_power8 + : __ieee754_expf_ppc64); + +strong_alias (__ieee754_expf, __expf_finite) diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf-power8.S b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf-power8.S new file mode 100644 index 0000000..579019c --- /dev/null +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf-power8.S @@ -0,0 +1,26 @@ +/* sinf(). PowerPC64/POWER8 version. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#undef weak_alias +#define weak_alias(a, b) + +#define __sinf __sinf_power8 + +#include <sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf-ppc64.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf-ppc64.c new file mode 100644 index 0000000..eaf83fa --- /dev/null +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf-ppc64.c @@ -0,0 +1,26 @@ +/* sinf(). PowerPC64 default version. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#undef weak_alias +#define weak_alias(a, b) + +#define __sinf __sinf_ppc64 + +#include <sysdeps/ieee754/flt-32/s_sinf.c> diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf.c new file mode 100644 index 0000000..4269d58 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sinf.c @@ -0,0 +1,31 @@ +/* Multiple versions of sinf. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> +#include <shlib-compat.h> +#include "init-arch.h" + +extern __typeof (__sinf) __sinf_ppc64 attribute_hidden; +extern __typeof (__sinf) __sinf_power8 attribute_hidden; + +libc_ifunc (__sinf, + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __sinf_power8 + : __sinf_ppc64); + +weak_alias (__sinf, sinf) diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S b/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S new file mode 100644 index 0000000..a5e68bb --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/fpu/e_expf.S @@ -0,0 +1,303 @@ +/* Optimized expf(). PowerPC64/POWER8 version. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Short algorithm description: + * + * Let K = 64 (table size). + * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y)) + * where: + * x = m*log(2)/K + y, y in [0.0..log(2)/K] + * m = n*K + j, m,n,j - signed integer, j in [0..K-1] + * values of 2^(j/K) are tabulated as T[j]. + * + * P(y) is a minimax polynomial approximation of expf(y)-1 + * on small interval [0.0..log(2)/K]. + * + * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as + * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y + * + * Special cases: + * expf(NaN) = NaN + * expf(+INF) = +INF + * expf(-INF) = 0 + * expf(x) = 1 for subnormals + * for finite argument, only expf(0)=1 is exact + * expf(x) overflows if x>88.7228317260742190 + * expf(x) underflows if x<-103.972076416015620 + */ + +#define C1 0x42ad496b /* Single precision 125*log(2). */ +#define C2 0x31800000 /* Single precision 2^(-28). */ +#define SP_INF 0x7f800000 /* Single precision Inf. */ +#define SP_EXP_BIAS 0x1fc0 /* Single precision exponent bias. 
*/ + +#define DATA_OFFSET r9 + +/* Implements the function + + float [fp1] expf (float [fp1] x) */ + + .machine power8 +EALIGN(__ieee754_expf, 4, 0) + addis DATA_OFFSET,r2,.Lanchor@toc@ha + addi DATA_OFFSET,DATA_OFFSET,.Lanchor@toc@l + + xscvdpspn v0,v1 + mfvsrd r8,v0 /* r8 = x */ + lfd fp2,(.KLN2-.Lanchor)(DATA_OFFSET) + lfd fp3,(.P2-.Lanchor)(DATA_OFFSET) + rldicl r3,r8,32,33 /* r3 = |x| */ + lis r4,C1@ha /* r4 = 125*log(2) */ + ori r4,r4,C1@l + cmpw r3,r4 + lfd fp5,(.P3-.Lanchor)(DATA_OFFSET) + lfd fp4,(.RS-.Lanchor)(DATA_OFFSET) + fmadd fp2,fp1,fp2,fp4 /* fp2 = x * K/log(2) + (2^23 + 2^22) */ + bge L(special_paths) /* |x| >= 125*log(2) ? */ + + lis r4,C2@ha + ori r4,r4,C2@l + cmpw r3,r4 + blt L(small_args) /* |x| < 2^(-28) ? */ + + /* Main path: here if 2^(-28) <= |x| < 125*log(2) */ + frsp fp6,fp2 + xscvdpsp v2,v2 + mfvsrd r8,v2 + mr r3,r8 /* r3 = m */ + rldicl r8,r8,32,58 /* r8 = j */ + lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET) + fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */ + srdi r3,r3,32 + clrrwi r3,r3,6 /* r3 = n */ + lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET) + fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */ + fmul fp2,fp0,fp0 /* fp2 = z = y^2 */ + lfd fp4,(.P1-.Lanchor)(DATA_OFFSET) + lfd fp6,(.P0-.Lanchor)(DATA_OFFSET) + lis r4,SP_EXP_BIAS@ha + ori r4,r4,SP_EXP_BIAS@l + add r3,r3,r4 + rldic r3,r3,49,1 /* r3 = 2^n */ + fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */ + fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */ + mtvsrd v1,r3 + xscvspdp v1,v1 + fmul fp4,fp4,fp2 /* fp4 = (P3 * z + P1)*z */ + fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */ + sldi r8,r8,3 /* Access doublewords from T[j]. */ + addi r6,DATA_OFFSET,(.Ttable-.Lanchor) + lfdx fp3,r6,r8 + fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */ + fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */ + frsp fp1,fp1 + blr + + .align 4 +/* x is either underflow, overflow, infinite or NaN. */ +L(special_paths): + srdi r8,r8,32 + rlwinm r8,r8,3,29,29 /* r8 = 0, if x positive. + r8 = 4, otherwise. 
*/ + addi r6,DATA_OFFSET,(.SPRANGE-.Lanchor) + lwzx r4,r6,r8 /* r4 = .SPRANGE[signbit(x)] */ + cmpw r3,r4 + /* |x| <= .SPRANGE[signbit(x)] */ + ble L(near_under_or_overflow) + + lis r4,SP_INF@ha + ori r4,r4,SP_INF@l + cmpw r3,r4 + bge L(arg_inf_or_nan) /* |x| >= Inf, i.e. x is Inf or NaN ? */ + + addi r6,DATA_OFFSET,(.SPLARGE_SMALL-.Lanchor) + lfsx fp1,r6,r8 + fmuls fp1,fp1,fp1 /* Square 2^100 or 2^-100 to force overflow or underflow. */ + blr + + + .align 4 +L(small_args): + /* expf(x) = 1.0, where |x| < |2^(-28)| */ + lfs fp2,(.SPone-.Lanchor)(DATA_OFFSET) + fadds fp1,fp1,fp2 /* fp1 = x + 1.0, in single precision */ + blr + + + .align 4 +L(arg_inf_or_nan:) + bne L(arg_nan) + + /* expf(+INF) = +INF + expf(-INF) = 0 */ + addi r6,DATA_OFFSET,(.INF_ZERO-.Lanchor) + lfsx fp1,r6,r8 + blr + + + .align 4 +L(arg_nan): + /* expf(NaN) = NaN */ + fadd fp1,fp1,fp1 + frsp fp1,fp1 + blr + + .align 4 +L(near_under_or_overflow): + frsp fp6,fp2 + xscvdpsp v2,v2 + mfvsrd r8,v2 + mr r3,r8 /* r3 = m */ + rldicl r8,r8,32,58 /* r8 = j */ + lfs fp4,(.SP_RS-.Lanchor)(DATA_OFFSET) + fsubs fp2,fp6,fp4 /* fp2 = m = x * K/log(2) */ + srdi r3,r3,32 + clrrwi r3,r3,6 /* r3 = n */ + lfd fp6,(.NLN2K-.Lanchor)(DATA_OFFSET) + fmadd fp0,fp2,fp6,fp1 /* fp0 = y = x - m*log(2)/K */ + fmul fp2,fp0,fp0 /* fp2 = z = y^2 */ + lfd fp4,(.P1-.Lanchor)(DATA_OFFSET) + lfd fp6,(.P0-.Lanchor)(DATA_OFFSET) + ld r4,(.DP_EXP_BIAS-.Lanchor)(DATA_OFFSET) + add r3,r3,r4 + rldic r3,r3,46,1 /* r3 = 2^n, built with the double precision bias */ + fmadd fp4,fp5,fp2,fp4 /* fp4 = P3 * z + P1 */ + fmadd fp6,fp3,fp2,fp6 /* fp6 = P2 * z + P0 */ + mtvsrd v1,r3 + fmul fp4,fp4,fp2 /* fp4 = (P3*z + P1)*z */ + fmadd fp0,fp0,fp6,fp4 /* fp0 = P(y) */ + sldi r8,r8,3 /* Access doublewords from T[j]. */ + addi r6,DATA_OFFSET,(.Ttable-.Lanchor) + lfdx fp3,r6,r8 + fmadd fp0,fp0,fp3,fp3 /* fp0 = T[j] * (1 + P(y)) */ + fmul fp1,fp1,fp0 /* fp1 = 2^n * T[j] * (1 + P(y)) */ + frsp fp1,fp1 + blr +END(__ieee754_expf) + + .section .rodata, "a",@progbits +.Lanchor: + .balign 8 +/* Table T[j] = 2^(j/K). Double precision. 
*/ +.Ttable: + .8byte 0x3ff0000000000000 + .8byte 0x3ff02c9a3e778061 + .8byte 0x3ff059b0d3158574 + .8byte 0x3ff0874518759bc8 + .8byte 0x3ff0b5586cf9890f + .8byte 0x3ff0e3ec32d3d1a2 + .8byte 0x3ff11301d0125b51 + .8byte 0x3ff1429aaea92de0 + .8byte 0x3ff172b83c7d517b + .8byte 0x3ff1a35beb6fcb75 + .8byte 0x3ff1d4873168b9aa + .8byte 0x3ff2063b88628cd6 + .8byte 0x3ff2387a6e756238 + .8byte 0x3ff26b4565e27cdd + .8byte 0x3ff29e9df51fdee1 + .8byte 0x3ff2d285a6e4030b + .8byte 0x3ff306fe0a31b715 + .8byte 0x3ff33c08b26416ff + .8byte 0x3ff371a7373aa9cb + .8byte 0x3ff3a7db34e59ff7 + .8byte 0x3ff3dea64c123422 + .8byte 0x3ff4160a21f72e2a + .8byte 0x3ff44e086061892d + .8byte 0x3ff486a2b5c13cd0 + .8byte 0x3ff4bfdad5362a27 + .8byte 0x3ff4f9b2769d2ca7 + .8byte 0x3ff5342b569d4f82 + .8byte 0x3ff56f4736b527da + .8byte 0x3ff5ab07dd485429 + .8byte 0x3ff5e76f15ad2148 + .8byte 0x3ff6247eb03a5585 + .8byte 0x3ff6623882552225 + .8byte 0x3ff6a09e667f3bcd + .8byte 0x3ff6dfb23c651a2f + .8byte 0x3ff71f75e8ec5f74 + .8byte 0x3ff75feb564267c9 + .8byte 0x3ff7a11473eb0187 + .8byte 0x3ff7e2f336cf4e62 + .8byte 0x3ff82589994cce13 + .8byte 0x3ff868d99b4492ed + .8byte 0x3ff8ace5422aa0db + .8byte 0x3ff8f1ae99157736 + .8byte 0x3ff93737b0cdc5e5 + .8byte 0x3ff97d829fde4e50 + .8byte 0x3ff9c49182a3f090 + .8byte 0x3ffa0c667b5de565 + .8byte 0x3ffa5503b23e255d + .8byte 0x3ffa9e6b5579fdbf + .8byte 0x3ffae89f995ad3ad + .8byte 0x3ffb33a2b84f15fb + .8byte 0x3ffb7f76f2fb5e47 + .8byte 0x3ffbcc1e904bc1d2 + .8byte 0x3ffc199bdd85529c + .8byte 0x3ffc67f12e57d14b + .8byte 0x3ffcb720dcef9069 + .8byte 0x3ffd072d4a07897c + .8byte 0x3ffd5818dcfba487 + .8byte 0x3ffda9e603db3285 + .8byte 0x3ffdfc97337b9b5f + .8byte 0x3ffe502ee78b3ff6 + .8byte 0x3ffea4afa2a490da + .8byte 0x3ffefa1bee615a27 + .8byte 0x3fff50765b6e4540 + .8byte 0x3fffa7c1819e90d8 + +.KLN2: + .8byte 0x40571547652b82fe /* Double precision K/log(2). */ + +/* Double precision polynomial coefficients. 
*/ +.P0: + .8byte 0x3fefffffffffe7c6 +.P1: + .8byte 0x3fe00000008d6118 +.P2: + .8byte 0x3fc55550da752d4f +.P3: + .8byte 0x3fa56420eb78fa85 + +.RS: + .8byte 0x4168000000000000 /* Double precision 2^23 + 2^22. */ +.NLN2K: + .8byte 0xbf862e42fefa39ef /* Double precision -log(2)/K. */ +.DP_EXP_BIAS: + .8byte 0x000000000000ffc0 /* Double precision exponent bias. */ + + .balign 4 +.SPone: + .4byte 0x3f800000 /* Single precision 1.0. */ +.SP_RS: + .4byte 0x4b400000 /* Single precision 2^23 + 2^22. */ + +.SPRANGE: /* Single precision overflow/underflow bounds. */ + .4byte 0x42b17217 /* if x>this bound, then result overflows. */ + .4byte 0x42cff1b4 /* if x<this bound, then result underflows. */ + +.SPLARGE_SMALL: + .4byte 0x71800000 /* 2^100. */ + .4byte 0x0d800000 /* 2^-100. */ + +.INF_ZERO: + .4byte 0x7f800000 /* Single precision Inf. */ + .4byte 0 /* Single precision zero. */ + +strong_alias (__ieee754_expf, __expf_finite) diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S new file mode 100644 index 0000000..3b8f5af --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_sinf.S @@ -0,0 +1,519 @@ +/* Optimized sinf(). PowerPC64/POWER8 version. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#define _ERRNO_H 1 +#include <bits/errno.h> + +#define FRAMESIZE (FRAME_MIN_SIZE+16) + +#define FLOAT_EXPONENT_SHIFT 23 +#define FLOAT_EXPONENT_BIAS 127 +#define INTEGER_BITS 3 + +#define PI_4 0x3f490fdb /* PI/4 */ +#define NINEPI_4 0x40e231d6 /* 9 * PI/4 */ +#define TWO_PN5 0x3d000000 /* 2^-5 */ +#define TWO_PN27 0x32000000 /* 2^-27 */ +#define INFINITY 0x7f800000 /* Single precision +Inf */ +#define TWO_P23 0x4b000000 /* 2^23 */ +#define FX_FRACTION_1_28 0x9249250 /* 0x100000000 / 28 + 1 */ + + /* Implements the function + + float [fp1] sinf (float [fp1] x) */ + + .machine power8 +EALIGN(__sinf, 4, 0) + addis r9,r2,L(anchor)@toc@ha + addi r9,r9,L(anchor)@toc@l + + lis r4,PI_4@h + ori r4,r4,PI_4@l + + xscvdpspn v0,v1 + mfvsrd r8,v0 + rldicl r3,r8,32,33 /* Remove sign bit. */ + + cmpw r3,r4 + bge L(greater_or_equal_pio4) + + lis r4,TWO_PN5@h + ori r4,r4,TWO_PN5@l + + cmpw r3,r4 + blt L(less_2pn5) + + /* Chebyshev polynomial of the form: + * x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ + + lfd fp9,(L(S0)-L(anchor))(r9) + lfd fp10,(L(S1)-L(anchor))(r9) + lfd fp11,(L(S2)-L(anchor))(r9) + lfd fp12,(L(S3)-L(anchor))(r9) + lfd fp13,(L(S4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + + fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */ + fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */ + fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */ + fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */ + fmadd fp1,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */ + frsp fp1,fp1 /* Round to single precision. */ + + blr + + .balign 16 +L(greater_or_equal_pio4): + lis r4,NINEPI_4@h + ori r4,r4,NINEPI_4@l + cmpw r3,r4 + bge L(greater_or_equal_9pio4) + + /* Calculate quotient of |x|/(PI/4). */ + lfd fp2,(L(invpio4)-L(anchor))(r9) + fabs fp1,fp1 /* |x| */ + fmul fp2,fp1,fp2 /* |x|/(PI/4) */ + fctiduz fp2,fp2 + mfvsrd r3,v2 /* n = trunc(|x|/(PI/4)), the integer quotient */ + + /* Now use that quotient to find |x| mod (PI/2). 
*/ + addi r7,r3,1 + rldicr r5,r7,2,60 /* ((n+1) >> 1) << 3 */ + addi r6,r9,(L(pio2_table)-L(anchor)) + lfdx fp4,r5,r6 + fsub fp1,fp1,fp4 + + .balign 16 +L(reduced): + /* Now we are in the range -PI/4 to PI/4. */ + + /* Work out if we are in a positive or negative primary interval. */ + rldicl r4,r7,62,63 /* ((n+1) >> 2) & 1 */ + + /* We are operating on |x|, so we need to add back the original + sign. */ + rldicl r8,r8,33,63 /* (x >> 31) & 1, ie the sign bit. */ + xor r4,r4,r8 /* 0 if result should be positive, + 1 if negative. */ + + /* Load a 1.0 or -1.0. */ + addi r5,r9,(L(ones)-L(anchor)) + sldi r4,r4,3 + lfdx fp0,r4,r5 + + /* Are we in the primary interval of sin or cos? */ + andi. r4,r7,0x2 + bne L(cos) + + /* Chebyshev polynomial of the form: + x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ + + lfd fp9,(L(S0)-L(anchor))(r9) + lfd fp10,(L(S1)-L(anchor))(r9) + lfd fp11,(L(S2)-L(anchor))(r9) + lfd fp12,(L(S3)-L(anchor))(r9) + lfd fp13,(L(S4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + + fmadd fp4,fp2,fp13,fp12 /* S3+x^2*S4 */ + fmadd fp4,fp2,fp4,fp11 /* S2+x^2*(S3+x^2*S4) */ + fmadd fp4,fp2,fp4,fp10 /* S1+x^2*(S2+x^2*(S3+x^2*S4)) */ + fmadd fp4,fp2,fp4,fp9 /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))) */ + fmadd fp4,fp3,fp4,fp1 /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))) */ + fmul fp4,fp4,fp0 /* Add in the sign. */ + frsp fp1,fp4 /* Round to single precision. */ + + blr + + .balign 16 +L(cos): + /* Chebyshev polynomial of the form: + 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). 
*/ + + lfd fp9,(L(C0)-L(anchor))(r9) + lfd fp10,(L(C1)-L(anchor))(r9) + lfd fp11,(L(C2)-L(anchor))(r9) + lfd fp12,(L(C3)-L(anchor))(r9) + lfd fp13,(L(C4)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + lfd fp3,(L(DPone)-L(anchor))(r9) + + fmadd fp4,fp2,fp13,fp12 /* C3+x^2*C4 */ + fmadd fp4,fp2,fp4,fp11 /* C2+x^2*(C3+x^2*C4) */ + fmadd fp4,fp2,fp4,fp10 /* C1+x^2*(C2+x^2*(C3+x^2*C4)) */ + fmadd fp4,fp2,fp4,fp9 /* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))) */ + fmadd fp4,fp2,fp4,fp3 /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))) */ + fmul fp4,fp4,fp0 /* Add in the sign. */ + frsp fp1,fp4 /* Round to single precision. */ + + blr + + .balign 16 +L(greater_or_equal_9pio4): + lis r4,INFINITY@h + ori r4,r4,INFINITY@l + cmpw r3,r4 + bge L(inf_or_nan) + + lis r4,TWO_P23@h + ori r4,r4,TWO_P23@l + cmpw r3,r4 + bge L(greater_or_equal_2p23) + + fabs fp1,fp1 /* |x| */ + + /* Calculate quotient of |x|/(PI/4). */ + lfd fp2,(L(invpio4)-L(anchor))(r9) + + lfd fp3,(L(DPone)-L(anchor))(r9) + lfd fp4,(L(DPhalf)-L(anchor))(r9) + fmul fp2,fp1,fp2 /* |x|/(PI/4) */ + friz fp2,fp2 /* n = floor(|x|/(PI/4)) */ + + /* Calculate (n + 1) / 2. */ + fadd fp2,fp2,fp3 /* n + 1 */ + fmul fp3,fp2,fp4 /* (n + 1) / 2 */ + friz fp3,fp3 + + lfd fp4,(L(pio2hi)-L(anchor))(r9) + lfd fp5,(L(pio2lo)-L(anchor))(r9) + + fmul fp6,fp4,fp3 + fadd fp6,fp6,fp1 + fmadd fp1,fp5,fp3,fp6 + + fctiduz fp2,fp2 + mfvsrd r7,v2 /* n + 1 */ + + b L(reduced) + + .balign 16 +L(inf_or_nan): + bne L(skip_errno_setting) /* Is a NAN? */ + + /* We delayed the creation of the stack frame, as well as the saving of + the link register, because only at this point, we are sure that + doing so is actually needed. */ + + stfd fp1,-8(r1) + + /* Save the link register. */ + mflr r0 + std r0,16(r1) + cfi_offset(lr, 16) + + /* Create the stack frame. */ + stdu r1,-FRAMESIZE(r1) + cfi_adjust_cfa_offset(FRAMESIZE) + + bl JUMPTARGET(__errno_location) + nop + + /* Restore the stack frame. 
*/ + addi r1,r1,FRAMESIZE + cfi_adjust_cfa_offset(-FRAMESIZE) + /* Restore the link register. */ + ld r0,16(r1) + mtlr r0 + + lfd fp1,-8(r1) + + /* errno = EDOM */ + li r4,EDOM + stw r4,0(r3) + +L(skip_errno_setting): + fsub fp1,fp1,fp1 /* x - x */ + blr + + .balign 16 +L(greater_or_equal_2p23): + fabs fp1,fp1 + + srwi r4,r3,FLOAT_EXPONENT_SHIFT + subi r4,r4,FLOAT_EXPONENT_BIAS + + /* We reduce the input modulo pi/4, so we need 3 bits of integer + to determine where in 2*pi we are. Index into our array + accordingly. */ + addi r4,r4,INTEGER_BITS + + /* To avoid an expensive divide, for the range we care about (0 - 127) + we can transform x/28 into: + + x/28 = (x * ((0x100000000 / 28) + 1)) >> 32 + + mulhwu returns the top 32 bits of the 64 bit result, doing the + shift for us in the same instruction. The top 32 bits are undefined, + so we have to mask them. */ + + lis r6,FX_FRACTION_1_28@h + ori r6,r6,FX_FRACTION_1_28@l + mulhwu r5,r4,r6 + clrldi r5,r5,32 + + /* Get our pointer into the invpio4_table array. */ + sldi r4,r5,3 + addi r6,r9,(L(invpio4_table)-L(anchor)) + add r4,r4,r6 + + lfd fp2,0(r4) + lfd fp3,8(r4) + lfd fp4,16(r4) + lfd fp5,24(r4) + + fmul fp6,fp2,fp1 + fmul fp7,fp3,fp1 + fmul fp8,fp4,fp1 + fmul fp9,fp5,fp1 + + /* Mask off larger integer bits in highest double word that we don't + care about to avoid losing precision when combining with smaller + values. */ + fctiduz fp10,fp6 + mfvsrd r7,v10 + rldicr r7,r7,0,(63-INTEGER_BITS) + mtvsrd v10,r7 + fcfidu fp10,fp10 /* Integer bits. */ + + fsub fp6,fp6,fp10 /* highest -= integer bits */ + + /* Work out the integer component, rounded down. Use the top two + limbs for this. */ + fadd fp10,fp6,fp7 /* highest + higher */ + + fctiduz fp10,fp10 + mfvsrd r7,v10 + andi. r0,r7,1 + fcfidu fp10,fp10 + + /* Subtract integer component from highest limb. */ + fsub fp12,fp6,fp10 + + beq L(even_integer) + + /* Our integer component is odd, so we are in the -PI/4 to 0 primary + region. 
We need to shift our result down by PI/4, and to do this + in the mod (4/PI) space we simply subtract 1. */ + lfd fp11,(L(DPone)-L(anchor))(r9) + fsub fp12,fp12,fp11 + + /* Now add up all the limbs in order. */ + fadd fp12,fp12,fp7 + fadd fp12,fp12,fp8 + fadd fp12,fp12,fp9 + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + +L(even_integer): + lfd fp11,(L(DPone)-L(anchor))(r9) + + /* Now add up all the limbs in order. */ + fadd fp12,fp12,fp7 + fadd fp12,r12,fp8 + fadd fp12,r12,fp9 + + /* We need to check if the addition of all the limbs resulted in us + overflowing 1.0. */ + fcmpu 0,fp12,fp11 + bgt L(greater_than_one) + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + +L(greater_than_one): + /* We did overflow 1.0 when adding up all the limbs. Add 1.0 to our + integer, and subtract 1.0 from our result. Since that makes the + integer component odd, we need to subtract another 1.0 as + explained above. */ + addi r7,r7,1 + + lfd fp11,(L(DPtwo)-L(anchor))(r9) + fsub fp12,fp12,fp11 + + /* And finally multiply by pi/4. */ + lfd fp13,(L(pio4)-L(anchor))(r9) + fmul fp1,fp12,fp13 + + addi r7,r7,1 + b L(reduced) + + .balign 16 +L(less_2pn5): + lis r4,TWO_PN27@h + ori r4,r4,TWO_PN27@l + + cmpw r3,r4 + blt L(less_2pn27) + + /* A simpler Chebyshev approximation is close enough for this range: + x+x^3*(SS0+x^2*SS1). */ + + lfd fp10,(L(SS0)-L(anchor))(r9) + lfd fp11,(L(SS1)-L(anchor))(r9) + + fmul fp2,fp1,fp1 /* x^2 */ + fmul fp3,fp2,fp1 /* x^3 */ + + fmadd fp4,fp2,fp11,fp10 /* SS0+x^2*SS1 */ + fmadd fp1,fp3,fp4,fp1 /* x+x^3*(SS0+x^2*SS1) */ + + frsp fp1,fp1 /* Round to single precision. */ + + blr + + .balign 16 +L(less_2pn27): + cmpwi r3,0 + beq L(zero) + + /* Handle some special cases: + + sinf(subnormal) raises inexact/underflow + sinf(min_normalized) raises inexact/underflow + sinf(normalized) raises inexact. 
*/ + + lfd fp2,(L(small)-L(anchor))(r9) + + fmul fp2,fp1,fp2 /* x * small */ + fsub fp1,fp1,fp2 /* x - x * small */ + + frsp fp1,fp1 + + blr + + .balign 16 +L(zero): + blr + +END (__sinf) + + .section .rodata, "a" + + .balign 8 + +L(anchor): + + /* Chebyshev constants for sin, range -PI/4 - PI/4. */ +L(S0): .8byte 0xbfc5555555551cd9 +L(S1): .8byte 0x3f81111110c2688b +L(S2): .8byte 0xbf2a019f8b4bd1f9 +L(S3): .8byte 0x3ec71d7264e6b5b4 +L(S4): .8byte 0xbe5a947e1674b58a + + /* Chebyshev constants for sin, range 2^-27 - 2^-5. */ +L(SS0): .8byte 0xbfc555555543d49d +L(SS1): .8byte 0x3f8110f475cec8c5 + + /* Chebyshev constants for cos, range -PI/4 - PI/4. */ +L(C0): .8byte 0xbfdffffffffe98ae +L(C1): .8byte 0x3fa55555545c50c7 +L(C2): .8byte 0xbf56c16b348b6874 +L(C3): .8byte 0x3efa00eb9ac43cc0 +L(C4): .8byte 0xbe923c97dd8844d7 + +L(invpio2): + .8byte 0x3fe45f306dc9c883 /* 2/PI */ + +L(invpio4): + .8byte 0x3ff45f306dc9c883 /* 4/PI */ + +L(invpio4_table): + .8byte 0x0000000000000000 + .8byte 0x3ff45f306c000000 + .8byte 0x3e3c9c882a000000 + .8byte 0x3c54fe13a8000000 + .8byte 0x3aaf47d4d0000000 + .8byte 0x38fbb81b6c000000 + .8byte 0x3714acc9e0000000 + .8byte 0x3560e4107c000000 + .8byte 0x33bca2c756000000 + .8byte 0x31fbd778ac000000 + .8byte 0x300b7246e0000000 + .8byte 0x2e5d2126e8000000 + .8byte 0x2c97003248000000 + .8byte 0x2ad77504e8000000 + .8byte 0x290921cfe0000000 + .8byte 0x274deb1cb0000000 + .8byte 0x25829a73e0000000 + .8byte 0x23fd1046be000000 + .8byte 0x2224baed10000000 + .8byte 0x20709d338e000000 + .8byte 0x1e535a2f80000000 + .8byte 0x1cef904e64000000 + .8byte 0x1b0d639830000000 + .8byte 0x1964ce7d24000000 + .8byte 0x17b908bf16000000 + +L(pio4): + .8byte 0x3fe921fb54442d18 /* PI/4 */ + +/* PI/2 as a sum of two doubles. We only use 32 bits of the upper limb + to avoid losing significant bits when multiplying with up to + (2^22)/(pi/2). 
*/ +L(pio2hi): + .8byte 0xbff921fb54400000 + +L(pio2lo): + .8byte 0xbdd0b4611a626332 + +L(pio2_table): + .8byte 0 + .8byte 0x3ff921fb54442d18 /* 1 * PI/2 */ + .8byte 0x400921fb54442d18 /* 2 * PI/2 */ + .8byte 0x4012d97c7f3321d2 /* 3 * PI/2 */ + .8byte 0x401921fb54442d18 /* 4 * PI/2 */ + .8byte 0x401f6a7a2955385e /* 5 * PI/2 */ + .8byte 0x4022d97c7f3321d2 /* 6 * PI/2 */ + .8byte 0x4025fdbbe9bba775 /* 7 * PI/2 */ + .8byte 0x402921fb54442d18 /* 8 * PI/2 */ + .8byte 0x402c463abeccb2bb /* 9 * PI/2 */ + .8byte 0x402f6a7a2955385e /* 10 * PI/2 */ + +L(small): + .8byte 0x3cd0000000000000 /* 2^-50 */ + +L(ones): + .8byte 0x3ff0000000000000 /* +1.0 */ + .8byte 0xbff0000000000000 /* -1.0 */ + +L(DPhalf): + .8byte 0x3fe0000000000000 /* 0.5 */ + +L(DPone): + .8byte 0x3ff0000000000000 /* 1.0 */ + +L(DPtwo): + .8byte 0x4000000000000000 /* 2.0 */ + +weak_alias(__sinf, sinf) diff --git a/sysdeps/unix/sysv/linux/i386/lowlevellock.h b/sysdeps/unix/sysv/linux/i386/lowlevellock.h index 58f5638..b8ccd31 100644 --- a/sysdeps/unix/sysv/linux/i386/lowlevellock.h +++ b/sysdeps/unix/sysv/linux/i386/lowlevellock.h @@ -317,7 +317,7 @@ extern int __lll_trylock_elision(int *lock, short *adapt_count) #define lll_lock_elision(futex, adapt_count, private) \ __lll_lock_elision (&(futex), &(adapt_count), private) -#define lll_unlock_elision(futex, private) \ +#define lll_unlock_elision(futex, adapt_count, private) \ __lll_unlock_elision (&(futex), private) #define lll_trylock_elision(futex, adapt_count) \ __lll_trylock_elision(&(futex), &(adapt_count)) diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-lock.c b/sysdeps/unix/sysv/linux/powerpc/elision-lock.c index e11ad1d..2a0e540 100644 --- a/sysdeps/unix/sysv/linux/powerpc/elision-lock.c +++ b/sysdeps/unix/sysv/linux/powerpc/elision-lock.c @@ -23,27 +23,6 @@ #include <elision-conf.h> #include "htm.h" -/* PowerISA 2.0.7 Section B.5.5 defines isync to be insufficient as a - barrier in acquire mechanism for HTM operations, a strong 'sync' is - 
required. */ -#undef __arch_compare_and_exchange_val_32_acq -#define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval) \ - ({ \ - __typeof (*(mem)) __tmp; \ - __typeof (mem) __memp = (mem); \ - __asm __volatile ( \ - "1: lwarx %0,0,%1" MUTEX_HINT_ACQ "\n" \ - " cmpw %0,%2\n" \ - " bne 2f\n" \ - " stwcx. %3,0,%1\n" \ - " bne- 1b\n" \ - "2: sync" \ - : "=&r" (__tmp) \ - : "b" (__memp), "r" (oldval), "r" (newval) \ - : "cr0", "memory"); \ - __tmp; \ - }) - #if !defined(LLL_LOCK) && !defined(EXTRAARG) /* Make sure the configuration code is always linked in for static libraries. */ @@ -68,7 +47,6 @@ __lll_lock_elision (int *lock, short *adapt_count, EXTRAARG int pshared) { if (*adapt_count > 0) { - (*adapt_count)--; goto use_lock; } diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c b/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c index edec155..b391116 100644 --- a/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c +++ b/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c @@ -36,7 +36,6 @@ __lll_trylock_elision (int *futex, short *adapt_count) /* Only try a transaction if it's worth it. */ if (*adapt_count > 0) { - (*adapt_count)--; goto use_lock; } @@ -45,8 +44,12 @@ __lll_trylock_elision (int *futex, short *adapt_count) if (*futex == 0) return 0; - /* Lock was busy. Fall back to normal locking. */ - __libc_tabort (_ABORT_LOCK_BUSY); + /* Lock was busy. This is never a nested transaction. + End it, and set the adapt count. 
*/ + __libc_tend (0); + + if (aconf.skip_lock_busy > 0) + *adapt_count = aconf.skip_lock_busy; } else { @@ -58,9 +61,6 @@ __lll_trylock_elision (int *futex, short *adapt_count) if (aconf.skip_trylock_internal_abort > 0) *adapt_count = aconf.skip_trylock_internal_abort; } - - if (aconf.skip_lock_busy > 0) - *adapt_count = aconf.skip_lock_busy; } use_lock: diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c index 7234db6..4b4ae62 100644 --- a/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c +++ b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c @@ -21,12 +21,20 @@ #include "htm.h" int -__lll_unlock_elision(int *lock, int pshared) +__lll_unlock_elision (int *lock, short *adapt_count, int pshared) { /* When the lock was free we're in a transaction. */ if (*lock == 0) __libc_tend (0); else - lll_unlock ((*lock), pshared); + { + lll_unlock ((*lock), pshared); + + /* Update the adapt count AFTER completing the critical section. + Doing this here prevents unneeded stalling when entering + a critical section. Saving about 8% runtime on P8. */ + if (*adapt_count > 0) + (*adapt_count)--; + } return 0; } diff --git a/sysdeps/unix/sysv/linux/powerpc/htm.h b/sysdeps/unix/sysv/linux/powerpc/htm.h index 7b49817..75c99c2 100644 --- a/sysdeps/unix/sysv/linux/powerpc/htm.h +++ b/sysdeps/unix/sysv/linux/powerpc/htm.h @@ -160,10 +160,12 @@ #endif /* __ASSEMBLER__ */ -/* Definitions used for TEXASR Failure code (bits 0:6), they need to be even - because tabort. always sets the first bit. */ -#define _ABORT_LOCK_BUSY 0x3f /* Lock already used. */ -#define _ABORT_NESTED_TRYLOCK 0x3e /* Write operation in trylock. */ -#define _ABORT_SYSCALL 0x3d /* Syscall issued. */ +/* Definitions used for TEXASR Failure code (bits 0:7). If the failure + should be persistent, the abort code must be odd. 0xd0 through 0xff + are reserved for the kernel and potential hypervisor. 
*/ +#define _ABORT_PERSISTENT 0x01 /* An unspecified persistent abort. */ +#define _ABORT_LOCK_BUSY 0x34 /* Busy lock, not persistent. */ +#define _ABORT_NESTED_TRYLOCK (0x32 | _ABORT_PERSISTENT) +#define _ABORT_SYSCALL (0x30 | _ABORT_PERSISTENT) #endif diff --git a/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h index 67db1de..6769c25 100644 --- a/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h +++ b/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h @@ -32,7 +32,7 @@ extern int __lll_timedlock_elision extern int __lll_lock_elision (int *futex, short *adapt_count, int private) attribute_hidden; -extern int __lll_unlock_elision(int *lock, int private) +extern int __lll_unlock_elision (int *lock, short *adapt_count, int private) attribute_hidden; extern int __lll_trylock_elision(int *lock, short *adapt_count) @@ -40,8 +40,8 @@ extern int __lll_trylock_elision(int *lock, short *adapt_count) #define lll_lock_elision(futex, adapt_count, private) \ __lll_lock_elision (&(futex), &(adapt_count), private) -#define lll_unlock_elision(futex, private) \ - __lll_unlock_elision (&(futex), private) +#define lll_unlock_elision(futex, adapt_count, private) \ + __lll_unlock_elision (&(futex), &(adapt_count), private) #define lll_trylock_elision(futex, adapt_count) \ __lll_trylock_elision (&(futex), &(adapt_count)) diff --git a/sysdeps/unix/sysv/linux/s390/lowlevellock.h b/sysdeps/unix/sysv/linux/s390/lowlevellock.h index 163a731..cab5f4c 100644 --- a/sysdeps/unix/sysv/linux/s390/lowlevellock.h +++ b/sysdeps/unix/sysv/linux/s390/lowlevellock.h @@ -41,7 +41,7 @@ extern int __lll_trylock_elision(int *futex, short *adapt_count) # define lll_lock_elision(futex, adapt_count, private) \ __lll_lock_elision (&(futex), &(adapt_count), private) -# define lll_unlock_elision(futex, private) \ +# define lll_unlock_elision(futex, adapt_count, private) \ __lll_unlock_elision (&(futex), private) # define lll_trylock_elision(futex, adapt_count) \ 
__lll_trylock_elision(&(futex), &(adapt_count)) diff --git a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h index de525cd..1fbd31e 100644 --- a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h +++ b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h @@ -342,7 +342,7 @@ extern int __lll_trylock_elision (int *lock, short *adapt_count) #define lll_lock_elision(futex, adapt_count, private) \ __lll_lock_elision (&(futex), &(adapt_count), private) -#define lll_unlock_elision(futex, private) \ +#define lll_unlock_elision(futex, adapt_count, private) \ __lll_unlock_elision (&(futex), private) #define lll_trylock_elision(futex, adapt_count) \ __lll_trylock_elision (&(futex), &(adapt_count)) |