Diffstat (limited to 'sysdeps')
-rw-r--r--  sysdeps/powerpc/powerpc32/970/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc32/power5+/Implies | 3
-rw-r--r--  sysdeps/powerpc/powerpc32/power5+/fpu/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc32/power5/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc32/power5/fpu/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc32/power6/Implies | 2
-rw-r--r--  sysdeps/powerpc/powerpc32/power6x/Implies | 3
-rw-r--r--  sysdeps/powerpc/powerpc32/power6x/fpu/Implies | 3
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/Implies | 3
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S | 89
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S | 1
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S | 88
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S | 1
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S | 92
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S | 1
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/memcmp.S | 988
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/memcpy.S | 469
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/memset.S | 434
-rw-r--r--  sysdeps/powerpc/powerpc32/power7/strncmp.S | 177
-rw-r--r--  sysdeps/powerpc/powerpc64/970/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power5+/Implies | 3
-rw-r--r--  sysdeps/powerpc/powerpc64/power5+/fpu/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power5/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power5/fpu/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power6/Implies | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power6x/Implies | 2
-rw-r--r--  sysdeps/powerpc/powerpc64/power6x/fpu/Implies | 3
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/Implies | 3
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/Implies | 2
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S | 68
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S | 71
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S | 69
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S | 1
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memcmp.S | 984
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memcpy.S | 449
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memset.S | 398
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strncmp.S | 181
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/970/Implies | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/970/fpu/Implies | 3
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/Implies | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power4/Implies (renamed from sysdeps/powerpc/powerpc32/970/fpu/Implies) | 1
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power4/fpu/Implies | 3
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power5+/Implies (renamed from sysdeps/powerpc/powerpc32/power6/fpu/Implies) | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power5+/fpu/Implies | 3
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power5/Implies | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power5/fpu/Implies | 3
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power6/Implies | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power6/fpu/Implies | 3
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power6x/Implies | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power6x/fpu/Implies | 4
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/Implies | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies | 3
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/970/Implies | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/970/fpu/Implies | 1
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/Implies | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power4/Implies (renamed from sysdeps/powerpc/powerpc64/970/fpu/Implies) | 1
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power4/fpu/Implies | 1
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power5+/Implies (renamed from sysdeps/powerpc/powerpc64/power6/fpu/Implies) | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power5+/fpu/Implies | 3
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power5/Implies | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power5/fpu/Implies | 1
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power6/Implies | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power6/fpu/Implies | 3
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power6x/Implies | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power6x/fpu/Implies | 4
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/Implies | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies | 3
69 files changed, 4608 insertions(+), 59 deletions(-)
diff --git a/sysdeps/powerpc/powerpc32/970/Implies b/sysdeps/powerpc/powerpc32/970/Implies
index 4e3a983..17139bf 100644
--- a/sysdeps/powerpc/powerpc32/970/Implies
+++ b/sysdeps/powerpc/powerpc32/970/Implies
@@ -1 +1,2 @@
+powerpc/powerpc32/power4/fpu
powerpc/powerpc32/power4
diff --git a/sysdeps/powerpc/powerpc32/power5+/Implies b/sysdeps/powerpc/powerpc32/power5+/Implies
index 4e3a983..a51d2fd 100644
--- a/sysdeps/powerpc/powerpc32/power5+/Implies
+++ b/sysdeps/powerpc/powerpc32/power5+/Implies
@@ -1 +1,2 @@
-powerpc/powerpc32/power4
+powerpc/powerpc32/power5/fpu
+powerpc/powerpc32/power5
diff --git a/sysdeps/powerpc/powerpc32/power5+/fpu/Implies b/sysdeps/powerpc/powerpc32/power5+/fpu/Implies
deleted file mode 100644
index 7c16e45..0000000
--- a/sysdeps/powerpc/powerpc32/power5+/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc32/power4/fpu
diff --git a/sysdeps/powerpc/powerpc32/power5/Implies b/sysdeps/powerpc/powerpc32/power5/Implies
index 4e3a983..17139bf 100644
--- a/sysdeps/powerpc/powerpc32/power5/Implies
+++ b/sysdeps/powerpc/powerpc32/power5/Implies
@@ -1 +1,2 @@
+powerpc/powerpc32/power4/fpu
powerpc/powerpc32/power4
diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/Implies b/sysdeps/powerpc/powerpc32/power5/fpu/Implies
deleted file mode 100644
index 7c16e45..0000000
--- a/sysdeps/powerpc/powerpc32/power5/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc32/power4/fpu
diff --git a/sysdeps/powerpc/powerpc32/power6/Implies b/sysdeps/powerpc/powerpc32/power6/Implies
index c9a2d47..8e5b58a 100644
--- a/sysdeps/powerpc/powerpc32/power6/Implies
+++ b/sysdeps/powerpc/powerpc32/power6/Implies
@@ -1,2 +1,2 @@
+powerpc/powerpc32/power5+/fpu
powerpc/powerpc32/power5+
-powerpc/powerpc32/power4
diff --git a/sysdeps/powerpc/powerpc32/power6x/Implies b/sysdeps/powerpc/powerpc32/power6x/Implies
index 1ac063b..c0e1bea 100644
--- a/sysdeps/powerpc/powerpc32/power6x/Implies
+++ b/sysdeps/powerpc/powerpc32/power6x/Implies
@@ -1,3 +1,2 @@
+powerpc/powerpc32/power6/fpu
powerpc/powerpc32/power6
-powerpc/powerpc32/power5+
-powerpc/powerpc32/power4
diff --git a/sysdeps/powerpc/powerpc32/power6x/fpu/Implies b/sysdeps/powerpc/powerpc32/power6x/fpu/Implies
deleted file mode 100644
index 081f750..0000000
--- a/sysdeps/powerpc/powerpc32/power6x/fpu/Implies
+++ /dev/null
@@ -1,3 +0,0 @@
-powerpc/powerpc32/power6/fpu
-powerpc/powerpc32/power5+/fpu
-powerpc/powerpc32/power4/fpu
diff --git a/sysdeps/powerpc/powerpc32/power7/Implies b/sysdeps/powerpc/powerpc32/power7/Implies
index 03899d8..c0e1bea 100644
--- a/sysdeps/powerpc/powerpc32/power7/Implies
+++ b/sysdeps/powerpc/powerpc32/power7/Implies
@@ -1 +1,2 @@
-powerpc/powerpc32/power5
+powerpc/powerpc32/power6/fpu
+powerpc/powerpc32/power6
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
new file mode 100644
index 0000000..5b0d950
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
@@ -0,0 +1,89 @@
+/* finite(). PowerPC32/POWER7 version.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __finite(x) */
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 3
+.LC0: /* 1.0 */
+ .quad 0x3ff0000000000000
+
+ .section ".text"
+ .type __finite, @function
+ .machine power7
+ENTRY (__finite)
+#ifdef SHARED
+ mflr r11
+ cfi_register(lr,r11)
+
+ bcl 20,31,1f
+1: mflr r9
+ addis r9,r9,.LC0-1b@ha
+ lfd fp0,.LC0-1b@l(r9)
+
+ mtlr r11
+ cfi_same_value (lr)
+#else
+ lis r9,.LC0@ha
+ lfd fp0,.LC0@l(r9)
+#endif
+ ftdiv cr7,fp1,fp0
+ li r3,1
+ bflr 30
+
+ /* We have -INF/+INF/NaN or a denormal. */
+
+ stwu r1,-16(r1) /* Allocate stack space. */
+ stfd fp1,8(r1) /* Transfer FP to GPR's. */
+
+ ori 2,2,0 /* Force a new dispatch group. */
+ lhz r0,8(r1) /* Fetch the upper portion of the high word of
+ the FP value (where the exponent and sign bits
+ are). */
+ clrlwi r0,r0,17 /* r0 = abs(r0). */
+ addi r1,r1,16 /* Reset the stack pointer. */
+ cmpwi cr7,r0,0x7ff0 /* r0 == 0x7ff0? */
+ bltlr cr7 /* LT means we have a denormal. */
+ li r3,0
+ blr
+ END (__finite)
+
+hidden_def (__finite)
+weak_alias (__finite, finite)
+
+/* It turns out that the 'double' version will also always work for
+ single-precision. */
+strong_alias (__finite, __finitef)
+hidden_def (__finitef)
+weak_alias (__finitef, finitef)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__finite, __finitel)
+weak_alias (__finite, finitel)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
+compat_symbol (libc, finite, finitel, GLIBC_2_0);
+# endif
+#endif
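
The ftdiv above is only a coarse classifier: for ordinary finite inputs the
branch on its result returns immediately, and the exponent field is
inspected through the stack only in the INF/NaN/denormal cases. As a plain
C sketch of the test being implemented (illustrative only, not glibc's
code; my_finite is a hypothetical name), finite() reduces to the biased
exponent field not being all ones:

    #include <stdint.h>
    #include <string.h>

    /* Sketch: x is finite iff its biased exponent field != 0x7ff.
       An all-ones exponent encodes INF (zero mantissa) or NaN.  */
    static int
    my_finite (double x)
    {
      uint64_t bits;
      memcpy (&bits, &x, sizeof bits);  /* Reinterpret without aliasing UB. */
      return ((bits >> 52) & 0x7ff) != 0x7ff;
    }
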
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S
new file mode 100644
index 0000000..54bd941
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_finitef.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_finite.S. */
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
new file mode 100644
index 0000000..2979534
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
@@ -0,0 +1,88 @@
+/* isinf(). PowerPC32/POWER7 version.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isinf(x) */
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 3
+.LC0: /* 1.0 */
+ .quad 0x3ff0000000000000
+
+ .section ".text"
+ .type __isinf, @function
+ .machine power7
+ENTRY (__isinf)
+#ifdef SHARED
+ mflr r11
+ cfi_register(lr,r11)
+
+ bcl 20,31,1f
+1: mflr r9
+ addis r9,r9,.LC0-1b@ha
+ lfd fp0,.LC0-1b@l(r9)
+
+ mtlr r11
+ cfi_same_value (lr)
+#else
+ lis r9,.LC0@ha
+ lfd fp0,.LC0@l(r9)
+#endif
+ ftdiv cr7,fp1,fp0
+ li r3,0
+ bflr 29 /* If not INF, return. */
+
+ /* Either we have -INF/+INF or a denormal. */
+
+ stwu r1,-16(r1) /* Allocate stack space. */
+ stfd fp1,8(r1) /* Transfer FP to GPR's. */
+ ori 2,2,0 /* Force a new dispatch group. */
+ lhz r4,8(r1) /* Fetch the upper portion of the high word of
+ the FP value (where the exponent and sign bits
+ are). */
+ addi r1,r1,16 /* Reset the stack pointer. */
+ cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */
+ li r3,1
+ beqlr cr7 /* EQ means INF, otherwise -INF. */
+ li r3,-1
+ blr
+ END (__isinf)
+
+hidden_def (__isinf)
+weak_alias (__isinf, isinf)
+
+/* It turns out that the 'double' version will also always work for
+ single-precision. */
+strong_alias (__isinf, __isinff)
+hidden_def (__isinff)
+weak_alias (__isinff, isinff)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isinf, __isinfl)
+weak_alias (__isinf, isinfl)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
+compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
+# endif
+#endif
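
The same ftdiv filter is used here, but with __isinf's return convention:
1 for +INF, -1 for -INF, 0 otherwise (the beqlr returns the 1 loaded
earlier; the fall-through returns -1). A hedged C sketch of that
convention (my_isinf is a hypothetical name, not glibc's implementation):

    #include <stdint.h>
    #include <string.h>

    /* Sketch: 1 for +INF, -1 for -INF, 0 for everything else.  */
    static int
    my_isinf (double x)
    {
      uint64_t bits;
      memcpy (&bits, &x, sizeof bits);
      if ((bits & 0x7fffffffffffffffULL) != 0x7ff0000000000000ULL)
        return 0;                    /* Not an infinity.  */
      return (bits >> 63) ? -1 : 1;  /* Sign bit selects -INF vs +INF.  */
    }
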
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S
new file mode 100644
index 0000000..be759e0
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinff.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isinf.S. */
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
new file mode 100644
index 0000000..852539f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
@@ -0,0 +1,92 @@
+/* isnan(). PowerPC32/POWER7 version.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isnan(x) */
+ .section .rodata.cst8,"aM",@progbits,8
+ .align 3
+.LC0: /* 1.0 */
+ .quad 0x3ff0000000000000
+
+ .section ".text"
+ .type __isnan, @function
+ .machine power7
+ENTRY (__isnan)
+#ifdef SHARED
+ mflr r11
+ cfi_register(lr,r11)
+
+ bcl 20,31,1f
+1: mflr r9
+ addis r9,r9,.LC0-1b@ha
+ lfd fp0,.LC0-1b@l(r9)
+
+ mtlr r11
+ cfi_same_value (lr)
+#else
+ lis r9,.LC0@ha
+ lfd fp0,.LC0@l(r9)
+#endif
+ ftdiv cr7,fp1,fp0
+ li r3,0
+ bflr 30 /* If not NaN or Inf, finish. */
+
+ /* We have -INF/+INF/NaN or a denormal. */
+
+ stwu r1,-16(r1) /* Allocate stack space. */
+ stfd fp1,8(r1) /* Transfer FP to GPR's. */
+ ori 2,2,0 /* Force a new dispatch group. */
+ lwz r4,8(r1) /* Load the upper half of the FP value. */
+ lwz r5,12(r1) /* Load the lower half of the FP value. */
+ addi r1,r1,16 /* Reset the stack pointer. */
+ lis r0,0x7ff0 /* Load the upper portion for an INF/NaN. */
+ clrlwi r4,r4,1 /* r4 = abs(r4). */
+ cmpw cr7,r4,r0 /* if (abs(r4) <= inf). */
+ cmpwi cr6,r5,0 /* r5 == 0x00000000? */
+ bltlr cr7 /* LT means we have a denormal. */
+ bgt cr7,L(NaN) /* GT means we have a NaN. */
+ beqlr cr6 /* EQ means we have +/-INF. */
+L(NaN):
+ li r3,1 /* x is a NaN. */
+ blr
+ END (__isnan)
+
+hidden_def (__isnan)
+weak_alias (__isnan, isnan)
+
+/* It turns out that the 'double' version will also always work for
+ single-precision. */
+strong_alias (__isnan, __isnanf)
+hidden_def (__isnanf)
+weak_alias (__isnanf, isnanf)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isnan, __isnanl)
+weak_alias (__isnan, isnanl)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
+compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
+# endif
+#endif
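
The slow path above separates NaN from +/-INF by comparing the high word
against 0x7ff0 and testing whether the low word is zero. In C the same
predicate collapses to one unsigned comparison on the absolute bit
pattern (a sketch only; my_isnan is a hypothetical name):

    #include <stdint.h>
    #include <string.h>

    /* Sketch: NaN iff the exponent is all ones and the mantissa is
       nonzero, i.e. |bits| is strictly greater than the +INF pattern.  */
    static int
    my_isnan (double x)
    {
      uint64_t bits;
      memcpy (&bits, &x, sizeof bits);
      return (bits & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL;
    }
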
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S
new file mode 100644
index 0000000..b48c85e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnanf.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isnan.S. */
diff --git a/sysdeps/powerpc/powerpc32/power7/memcmp.S b/sysdeps/powerpc/powerpc32/power7/memcmp.S
new file mode 100644
index 0000000..d529b49
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/memcmp.S
@@ -0,0 +1,988 @@
+/* Optimized memcmp implementation for POWER7/PowerPC32.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+ 02110-1301 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* int [r3] memcmp (const char *s1 [r3],
+ const char *s2 [r4],
+ size_t size [r5]) */
+
+ .machine power7
+EALIGN (BP_SYM(memcmp),4,0)
+ CALL_MCOUNT
+
+#define rTMP r0
+#define rRTN r3
+#define rSTR1 r3 /* first string arg */
+#define rSTR2 r4 /* second string arg */
+#define rN r5 /* max string length */
+#define rWORD1 r6 /* current word in s1 */
+#define rWORD2 r7 /* current word in s2 */
+#define rWORD3 r8 /* next word in s1 */
+#define rWORD4 r9 /* next word in s2 */
+#define rWORD5 r10 /* next word in s1 */
+#define rWORD6 r11 /* next word in s2 */
+#define rBITDIF r12 /* bits that differ in s1 & s2 words */
+#define rWORD7 r30 /* next word in s1 */
+#define rWORD8 r31 /* next word in s2 */
+
+ xor rTMP,rSTR2,rSTR1
+ cmplwi cr6,rN,0
+ cmplwi cr1,rN,12
+ clrlwi. rTMP,rTMP,30
+ clrlwi rBITDIF,rSTR1,30
+ cmplwi cr5,rBITDIF,0
+ beq- cr6,L(zeroLength)
+ dcbt 0,rSTR1
+ dcbt 0,rSTR2
+
+ /* If less than 8 bytes or not aligned, use the unaligned
+ byte loop. */
+
+ blt cr1,L(bytealigned)
+ stwu 1,-64(1)
+ cfi_adjust_cfa_offset(64)
+ stw r31,48(1)
+ cfi_offset(31,(48-64))
+ stw r30,44(1)
+ cfi_offset(30,(44-64))
+ bne L(unaligned)
+/* At this point we know both strings have the same alignment and the
+ compare length is at least 8 bytes. rBITDIF contains the low order
+ 2 bits of rSTR1 and cr5 contains the result of the logical compare
+ of rBITDIF to 0. If rBITDIF == 0 then we are already word
+ aligned and can perform the word aligned loop.
+
+ Otherwise we know the two strings have the same alignment (but not
+ yet word aligned). So we force the string addresses to the next lower
+ word boundary and special case this first word using shift left to
+ eliminate bits preceding the first byte. Since we want to join the
+ normal (word aligned) compare loop, starting at the second word,
+ we need to adjust the length (rN) and special case the loop
+ versioning for the first word. This ensures that the loop count is
+ correct and the first word (shifted) is in the expected register pair. */
+ .align 4
+L(samealignment):
+ clrrwi rSTR1,rSTR1,2
+ clrrwi rSTR2,rSTR2,2
+ beq cr5,L(Waligned)
+ add rN,rN,rBITDIF
+ slwi r11,rBITDIF,3
+ srwi rTMP,rN,4 /* Divide by 16 */
+ andi. rBITDIF,rN,12 /* Get the word remainder */
+ lwz rWORD1,0(rSTR1)
+ lwz rWORD2,0(rSTR2)
+ cmplwi cr1,rBITDIF,8
+ cmplwi cr7,rN,16
+ clrlwi rN,rN,30
+ beq L(dPs4)
+ mtctr rTMP
+ bgt cr1,L(dPs3)
+ beq cr1,L(dPs2)
+
+/* Remainder is 4 */
+ .align 3
+L(dsP1):
+ slw rWORD5,rWORD1,r11
+ slw rWORD6,rWORD2,r11
+ cmplw cr5,rWORD5,rWORD6
+ blt cr7,L(dP1x)
+/* Do something useful in this cycle since we have to branch anyway. */
+ lwz rWORD1,4(rSTR1)
+ lwz rWORD2,4(rSTR2)
+ cmplw cr0,rWORD1,rWORD2
+ b L(dP1e)
+/* Remainder is 8 */
+ .align 4
+L(dPs2):
+ slw rWORD5,rWORD1,r11
+ slw rWORD6,rWORD2,r11
+ cmplw cr6,rWORD5,rWORD6
+ blt cr7,L(dP2x)
+/* Do something useful in this cycle since we have to branch anyway. */
+ lwz rWORD7,4(rSTR1)
+ lwz rWORD8,4(rSTR2)
+ cmplw cr5,rWORD7,rWORD8
+ b L(dP2e)
+/* Remainder is 12 */
+ .align 4
+L(dPs3):
+ slw rWORD3,rWORD1,r11
+ slw rWORD4,rWORD2,r11
+ cmplw cr1,rWORD3,rWORD4
+ b L(dP3e)
+/* Count is a multiple of 16, remainder is 0 */
+ .align 4
+L(dPs4):
+ mtctr rTMP
+ slw rWORD1,rWORD1,r11
+ slw rWORD2,rWORD2,r11
+ cmplw cr0,rWORD1,rWORD2
+ b L(dP4e)
+
+/* At this point we know both strings are word aligned and the
+ compare length is at least 8 bytes. */
+ .align 4
+L(Waligned):
+ andi. rBITDIF,rN,12 /* Get the word remainder */
+ srwi rTMP,rN,4 /* Divide by 16 */
+ cmplwi cr1,rBITDIF,8
+ cmplwi cr7,rN,16
+ clrlwi rN,rN,30
+ beq L(dP4)
+ bgt cr1,L(dP3)
+ beq cr1,L(dP2)
+
+/* Remainder is 4 */
+ .align 4
+L(dP1):
+ mtctr rTMP
+/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+ (8-15 byte compare), we want to use only volatile registers. This
+ means we can avoid restoring non-volatile registers since we did not
+ change any on the early exit path. The key here is the non-early
+ exit path only cares about the condition code (cr5), not about which
+ register pair was used. */
+ lwz rWORD5,0(rSTR1)
+ lwz rWORD6,0(rSTR2)
+ cmplw cr5,rWORD5,rWORD6
+ blt cr7,L(dP1x)
+ lwz rWORD1,4(rSTR1)
+ lwz rWORD2,4(rSTR2)
+ cmplw cr0,rWORD1,rWORD2
+L(dP1e):
+ lwz rWORD3,8(rSTR1)
+ lwz rWORD4,8(rSTR2)
+ cmplw cr1,rWORD3,rWORD4
+ lwz rWORD5,12(rSTR1)
+ lwz rWORD6,12(rSTR2)
+ cmplw cr6,rWORD5,rWORD6
+ bne cr5,L(dLcr5)
+ bne cr0,L(dLcr0)
+
+ lwzu rWORD7,16(rSTR1)
+ lwzu rWORD8,16(rSTR2)
+ bne cr1,L(dLcr1)
+ cmplw cr5,rWORD7,rWORD8
+ bdnz L(dLoop)
+ bne cr6,L(dLcr6)
+ lwz r30,44(1)
+ lwz r31,48(1)
+ .align 3
+L(dP1x):
+ slwi. r12,rN,3
+ bne cr5,L(dLcr5)
+ subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
+ lwz 1,0(1)
+ bne L(d00)
+ li rRTN,0
+ blr
+
+/* Remainder is 8 */
+ .align 4
+L(dP2):
+ mtctr rTMP
+ lwz rWORD5,0(rSTR1)
+ lwz rWORD6,0(rSTR2)
+ cmplw cr6,rWORD5,rWORD6
+ blt cr7,L(dP2x)
+ lwz rWORD7,4(rSTR1)
+ lwz rWORD8,4(rSTR2)
+ cmplw cr5,rWORD7,rWORD8
+L(dP2e):
+ lwz rWORD1,8(rSTR1)
+ lwz rWORD2,8(rSTR2)
+ cmplw cr0,rWORD1,rWORD2
+ lwz rWORD3,12(rSTR1)
+ lwz rWORD4,12(rSTR2)
+ cmplw cr1,rWORD3,rWORD4
+ addi rSTR1,rSTR1,4
+ addi rSTR2,rSTR2,4
+ bne cr6,L(dLcr6)
+ bne cr5,L(dLcr5)
+ b L(dLoop2)
+/* Again we are on an early exit path (16-23 byte compare); we want to
+ only use volatile registers and avoid restoring non-volatile
+ registers. */
+ .align 4
+L(dP2x):
+ lwz rWORD3,4(rSTR1)
+ lwz rWORD4,4(rSTR2)
+ cmplw cr5,rWORD3,rWORD4
+ slwi. r12,rN,3
+ bne cr6,L(dLcr6)
+ addi rSTR1,rSTR1,4
+ addi rSTR2,rSTR2,4
+ bne cr5,L(dLcr5)
+ subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
+ lwz 1,0(1)
+ bne L(d00)
+ li rRTN,0
+ blr
+
+/* Remainder is 12 */
+ .align 4
+L(dP3):
+ mtctr rTMP
+ lwz rWORD3,0(rSTR1)
+ lwz rWORD4,0(rSTR2)
+ cmplw cr1,rWORD3,rWORD4
+L(dP3e):
+ lwz rWORD5,4(rSTR1)
+ lwz rWORD6,4(rSTR2)
+ cmplw cr6,rWORD5,rWORD6
+ blt cr7,L(dP3x)
+ lwz rWORD7,8(rSTR1)
+ lwz rWORD8,8(rSTR2)
+ cmplw cr5,rWORD7,rWORD8
+ lwz rWORD1,12(rSTR1)
+ lwz rWORD2,12(rSTR2)
+ cmplw cr0,rWORD1,rWORD2
+ addi rSTR1,rSTR1,8
+ addi rSTR2,rSTR2,8
+ bne cr1,L(dLcr1)
+ bne cr6,L(dLcr6)
+ b L(dLoop1)
+/* Again we are on an early exit path (24-31 byte compare); we want to
+ only use volatile registers and avoid restoring non-volatile
+ registers. */
+ .align 4
+L(dP3x):
+ lwz rWORD1,8(rSTR1)
+ lwz rWORD2,8(rSTR2)
+ cmplw cr5,rWORD1,rWORD2
+ slwi. r12,rN,3
+ bne cr1,L(dLcr1)
+ addi rSTR1,rSTR1,8
+ addi rSTR2,rSTR2,8
+ bne cr6,L(dLcr6)
+ subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
+ bne cr5,L(dLcr5)
+ lwz 1,0(1)
+ bne L(d00)
+ li rRTN,0
+ blr
+
+/* Count is a multiple of 16, remainder is 0 */
+ .align 4
+L(dP4):
+ mtctr rTMP
+ lwz rWORD1,0(rSTR1)
+ lwz rWORD2,0(rSTR2)
+ cmplw cr0,rWORD1,rWORD2
+L(dP4e):
+ lwz rWORD3,4(rSTR1)
+ lwz rWORD4,4(rSTR2)
+ cmplw cr1,rWORD3,rWORD4
+ lwz rWORD5,8(rSTR1)
+ lwz rWORD6,8(rSTR2)
+ cmplw cr6,rWORD5,rWORD6
+ lwzu rWORD7,12(rSTR1)
+ lwzu rWORD8,12(rSTR2)
+ cmplw cr5,rWORD7,rWORD8
+ bne cr0,L(dLcr0)
+ bne cr1,L(dLcr1)
+ bdz- L(d24) /* Adjust CTR as we start with +4 */
+/* This is the primary loop */
+ .align 4
+L(dLoop):
+ lwz rWORD1,4(rSTR1)
+ lwz rWORD2,4(rSTR2)
+ cmplw cr1,rWORD3,rWORD4
+ bne cr6,L(dLcr6)
+L(dLoop1):
+ lwz rWORD3,8(rSTR1)
+ lwz rWORD4,8(rSTR2)
+ cmplw cr6,rWORD5,rWORD6
+ bne cr5,L(dLcr5)
+L(dLoop2):
+ lwz rWORD5,12(rSTR1)
+ lwz rWORD6,12(rSTR2)
+ cmplw cr5,rWORD7,rWORD8
+ bne cr0,L(dLcr0)
+L(dLoop3):
+ lwzu rWORD7,16(rSTR1)
+ lwzu rWORD8,16(rSTR2)
+ bne cr1,L(dLcr1)
+ cmplw cr0,rWORD1,rWORD2
+ bdnz L(dLoop)
+
+L(dL4):
+ cmplw cr1,rWORD3,rWORD4
+ bne cr6,L(dLcr6)
+ cmplw cr6,rWORD5,rWORD6
+ bne cr5,L(dLcr5)
+ cmplw cr5,rWORD7,rWORD8
+L(d44):
+ bne cr0,L(dLcr0)
+L(d34):
+ bne cr1,L(dLcr1)
+L(d24):
+ bne cr6,L(dLcr6)
+L(d14):
+ slwi. r12,rN,3
+ bne cr5,L(dLcr5)
+L(d04):
+ lwz r30,44(1)
+ lwz r31,48(1)
+ lwz 1,0(1)
+ subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
+ beq L(zeroLength)
+/* At this point we have a remainder of 1 to 3 bytes to compare. Since
+ we are aligned it is safe to load the whole word, and use
+ shift right to eliminate bits beyond the compare length. */
+L(d00):
+ lwz rWORD1,4(rSTR1)
+ lwz rWORD2,4(rSTR2)
+ srw rWORD1,rWORD1,rN
+ srw rWORD2,rWORD2,rN
+ cmplw rWORD1,rWORD2
+ li rRTN,0
+ beqlr
+ li rRTN,1
+ bgtlr
+ li rRTN,-1
+ blr
+
+ .align 4
+L(dLcr0):
+ lwz r30,44(1)
+ lwz r31,48(1)
+ li rRTN,1
+ lwz 1,0(1)
+ bgtlr cr0
+ li rRTN,-1
+ blr
+ .align 4
+L(dLcr1):
+ lwz r30,44(1)
+ lwz r31,48(1)
+ li rRTN,1
+ lwz 1,0(1)
+ bgtlr cr1
+ li rRTN,-1
+ blr
+ .align 4
+L(dLcr6):
+ lwz r30,44(1)
+ lwz r31,48(1)
+ li rRTN,1
+ lwz 1,0(1)
+ bgtlr cr6
+ li rRTN,-1
+ blr
+ .align 4
+L(dLcr5):
+ lwz r30,44(1)
+ lwz r31,48(1)
+L(dLcr5x):
+ li rRTN,1
+ lwz 1,0(1)
+ bgtlr cr5
+ li rRTN,-1
+ blr
+
+ .align 4
+L(bytealigned):
+ cfi_adjust_cfa_offset(-64)
+ mtctr rN
+
+/* We need to prime this loop. This loop is swing modulo scheduled
+ to avoid pipe delays. The dependent instruction latencies (load to
+ compare to conditional branch) are 2 to 3 cycles. In this loop each
+ dispatch group ends in a branch and takes 1 cycle. Effectively
+ the first iteration of the loop only serves to load operands, and
+ branches based on compares are delayed until the next iteration.
+
+ So we must precondition some registers and condition codes so that
+ we don't exit the loop early on the first iteration. */
+ lbz rWORD1,0(rSTR1)
+ lbz rWORD2,0(rSTR2)
+ bdz L(b11)
+ cmplw cr0,rWORD1,rWORD2
+ lbz rWORD3,1(rSTR1)
+ lbz rWORD4,1(rSTR2)
+ bdz L(b12)
+ cmplw cr1,rWORD3,rWORD4
+ lbzu rWORD5,2(rSTR1)
+ lbzu rWORD6,2(rSTR2)
+ bdz L(b13)
+ .align 4
+L(bLoop):
+ lbzu rWORD1,1(rSTR1)
+ lbzu rWORD2,1(rSTR2)
+ bne cr0,L(bLcr0)
+
+ cmplw cr6,rWORD5,rWORD6
+ bdz L(b3i)
+
+ lbzu rWORD3,1(rSTR1)
+ lbzu rWORD4,1(rSTR2)
+ bne cr1,L(bLcr1)
+
+ cmplw cr0,rWORD1,rWORD2
+ bdz L(b2i)
+
+ lbzu rWORD5,1(rSTR1)
+ lbzu rWORD6,1(rSTR2)
+ bne cr6,L(bLcr6)
+
+ cmplw cr1,rWORD3,rWORD4
+ bdnz L(bLoop)
+
+/* We speculatively load bytes before we have tested the previous
+ bytes. But we must avoid overrunning the length (in the ctr) to
+ prevent these speculative loads from causing a segfault. In that
+ case the loop will exit early (before all pending bytes are
+ tested), and we must complete the pending operations
+ before returning. */
+L(b1i):
+ bne cr0,L(bLcr0)
+ bne cr1,L(bLcr1)
+ b L(bx56)
+ .align 4
+L(b2i):
+ bne cr6,L(bLcr6)
+ bne cr0,L(bLcr0)
+ b L(bx34)
+ .align 4
+L(b3i):
+ bne cr1,L(bLcr1)
+ bne cr6,L(bLcr6)
+ b L(bx12)
+ .align 4
+L(bLcr0):
+ li rRTN,1
+ bgtlr cr0
+ li rRTN,-1
+ blr
+L(bLcr1):
+ li rRTN,1
+ bgtlr cr1
+ li rRTN,-1
+ blr
+L(bLcr6):
+ li rRTN,1
+ bgtlr cr6
+ li rRTN,-1
+ blr
+
+L(b13):
+ bne cr0,L(bx12)
+ bne cr1,L(bx34)
+L(bx56):
+ sub rRTN,rWORD5,rWORD6
+ blr
+ nop
+L(b12):
+ bne cr0,L(bx12)
+L(bx34):
+ sub rRTN,rWORD3,rWORD4
+ blr
+
+L(b11):
+L(bx12):
+ sub rRTN,rWORD1,rWORD2
+ blr
+
+ .align 4
+L(zeroLengthReturn):
+
+L(zeroLength):
+ li rRTN,0
+ blr
+
+ cfi_adjust_cfa_offset(64)
+ .align 4
+/* At this point we know the strings have different alignment and the
+ compare length is at least 8 bytes. rBITDIF contains the low order
+ 2 bits of rSTR1 and cr5 contains the result of the logical compare
+ of rBITDIF to 0. If rBITDIF == 0 then rSTR1 is word aligned and we
+ can perform the Wunaligned loop.
+
+ Otherwise we know that rSTR1 is not yet word aligned.
+ So we can force the string addresses to the next lower word
+ boundary and special case this first word using shift left to
+ eliminate bits preceding the first byte. Since we want to join the
+ normal (Wunaligned) compare loop, starting at the second word,
+ we need to adjust the length (rN) and special case the loop
+ versioning for the first W. This ensures that the loop count is
+ correct and the first W (shifted) is in the expected register pair. */
+#define rSHL r29 /* Unaligned shift left count. */
+#define rSHR r28 /* Unaligned shift right count. */
+#define rB r27 /* Left rotation temp for rWORD2. */
+#define rD r26 /* Left rotation temp for rWORD4. */
+#define rF r25 /* Left rotation temp for rWORD6. */
+#define rH r24 /* Left rotation temp for rWORD8. */
+#define rA r0 /* Right rotation temp for rWORD2. */
+#define rC r12 /* Right rotation temp for rWORD4. */
+#define rE r0 /* Right rotation temp for rWORD6. */
+#define rG r12 /* Right rotation temp for rWORD8. */
+L(unaligned):
+ stw r29,40(r1)
+ cfi_offset(r29,(40-64))
+ clrlwi rSHL,rSTR2,30
+ stw r28,36(r1)
+ cfi_offset(r28,(36-64))
+ beq cr5,L(Wunaligned)
+ stw r27,32(r1)
+ cfi_offset(r27,(32-64))
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
+ in the 1st rSTR1 W. */
+ sub r27,rSTR2,rBITDIF
+/* But do not attempt to address the W before the W that contains
+ the actual start of rSTR2. */
+ clrrwi rSTR2,rSTR2,2
+ stw r26,28(r1)
+ cfi_offset(r26,(28-64))
+/* Compute the left/right shift counts for the unaligned rSTR2,
+ compensating for the logical (W aligned) start of rSTR1. */
+ clrlwi rSHL,r27,30
+ clrrwi rSTR1,rSTR1,2
+ stw r25,24(r1)
+ cfi_offset(r25,(24-64))
+ slwi rSHL,rSHL,3
+ cmplw cr5,r27,rSTR2
+ add rN,rN,rBITDIF
+ slwi r11,rBITDIF,3
+ stw r24,20(r1)
+ cfi_offset(r24,(20-64))
+ subfic rSHR,rSHL,32
+ srwi rTMP,rN,4 /* Divide by 16 */
+ andi. rBITDIF,rN,12 /* Get the W remainder */
+/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
+ this special case those bits may be discarded anyway. Also we
+ must avoid loading a W where none of the bits are part of rSTR2 as
+ this may cross a page boundary and cause a page fault. */
+ li rWORD8,0
+ blt cr5,L(dus0)
+ lwz rWORD8,0(rSTR2)
+ la rSTR2,4(rSTR2)
+ slw rWORD8,rWORD8,rSHL
+
+L(dus0):
+ lwz rWORD1,0(rSTR1)
+ lwz rWORD2,0(rSTR2)
+ cmplwi cr1,rBITDIF,8
+ cmplwi cr7,rN,16
+ srw rG,rWORD2,rSHR
+ clrlwi rN,rN,30
+ beq L(duPs4)
+ mtctr rTMP
+ or rWORD8,rG,rWORD8
+ bgt cr1,L(duPs3)
+ beq cr1,L(duPs2)
+
+/* Remainder is 4 */
+ .align 4
+L(dusP1):
+ slw rB,rWORD2,rSHL
+ slw rWORD7,rWORD1,r11
+ slw rWORD8,rWORD8,r11
+ bge cr7,L(duP1e)
+/* At this point we exit early with the first word compare
+ complete and remainder of 0 to 3 bytes. See L(du14) for details on
+ how we handle the remaining bytes. */
+ cmplw cr5,rWORD7,rWORD8
+ slwi. rN,rN,3
+ bne cr5,L(duLcr5)
+ cmplw cr7,rN,rSHR
+ beq L(duZeroReturn)
+ li rA,0
+ ble cr7,L(dutrim)
+ lwz rWORD2,4(rSTR2)
+ srw rA,rWORD2,rSHR
+ b L(dutrim)
+/* Remainder is 8 */
+ .align 4
+L(duPs2):
+ slw rH,rWORD2,rSHL
+ slw rWORD5,rWORD1,r11
+ slw rWORD6,rWORD8,r11
+ b L(duP2e)
+/* Remainder is 12 */
+ .align 4
+L(duPs3):
+ slw rF,rWORD2,rSHL
+ slw rWORD3,rWORD1,r11
+ slw rWORD4,rWORD8,r11
+ b L(duP3e)
+/* Count is a multiple of 16, remainder is 0 */
+ .align 4
+L(duPs4):
+ mtctr rTMP
+ or rWORD8,rG,rWORD8
+ slw rD,rWORD2,rSHL
+ slw rWORD1,rWORD1,r11
+ slw rWORD2,rWORD8,r11
+ b L(duP4e)
+
+/* At this point we know rSTR1 is word aligned and the
+ compare length is at least 8 bytes. */
+ .align 4
+L(Wunaligned):
+ stw r27,32(r1)
+ cfi_offset(r27,(32-64))
+ clrrwi rSTR2,rSTR2,2
+ stw r26,28(r1)
+ cfi_offset(r26,(28-64))
+ srwi rTMP,rN,4 /* Divide by 16 */
+ stw r25,24(r1)
+ cfi_offset(r25,(24-64))
+ andi. rBITDIF,rN,12 /* Get the W remainder */
+ stw r24,20(r1)
+ cfi_offset(r24,(20-64))
+ slwi rSHL,rSHL,3
+ lwz rWORD6,0(rSTR2)
+ lwzu rWORD8,4(rSTR2)
+ cmplwi cr1,rBITDIF,8
+ cmplwi cr7,rN,16
+ clrlwi rN,rN,30
+ subfic rSHR,rSHL,32
+ slw rH,rWORD6,rSHL
+ beq L(duP4)
+ mtctr rTMP
+ bgt cr1,L(duP3)
+ beq cr1,L(duP2)
+
+/* Remainder is 4 */
+ .align 4
+L(duP1):
+ srw rG,rWORD8,rSHR
+ lwz rWORD7,0(rSTR1)
+ slw rB,rWORD8,rSHL
+ or rWORD8,rG,rH
+ blt cr7,L(duP1x)
+L(duP1e):
+ lwz rWORD1,4(rSTR1)
+ lwz rWORD2,4(rSTR2)
+ cmplw cr5,rWORD7,rWORD8
+ srw rA,rWORD2,rSHR
+ slw rD,rWORD2,rSHL
+ or rWORD2,rA,rB
+ lwz rWORD3,8(rSTR1)
+ lwz rWORD4,8(rSTR2)
+ cmplw cr0,rWORD1,rWORD2
+ srw rC,rWORD4,rSHR
+ slw rF,rWORD4,rSHL
+ bne cr5,L(duLcr5)
+ or rWORD4,rC,rD
+ lwz rWORD5,12(rSTR1)
+ lwz rWORD6,12(rSTR2)
+ cmplw cr1,rWORD3,rWORD4
+ srw rE,rWORD6,rSHR
+ slw rH,rWORD6,rSHL
+ bne cr0,L(duLcr0)
+ or rWORD6,rE,rF
+ cmplw cr6,rWORD5,rWORD6
+ b L(duLoop3)
+ .align 4
+/* At this point we exit early with the first word compare
+ complete and remainder of 0 to 3 bytes. See L(du14) for details on
+ how we handle the remaining bytes. */
+L(duP1x):
+ cmplw cr5,rWORD7,rWORD8
+ slwi. rN,rN,3
+ bne cr5,L(duLcr5)
+ cmplw cr7,rN,rSHR
+ beq L(duZeroReturn)
+ li rA,0
+ ble cr7,L(dutrim)
+ lwz rWORD2,4(rSTR2)
+ srw rA,rWORD2,rSHR
+ b L(dutrim)
+/* Remainder is 8 */
+ .align 4
+L(duP2):
+ srw rE,rWORD8,rSHR
+ lwz rWORD5,0(rSTR1)
+ or rWORD6,rE,rH
+ slw rH,rWORD8,rSHL
+L(duP2e):
+ lwz rWORD7,4(rSTR1)
+ lwz rWORD8,4(rSTR2)
+ cmplw cr6,rWORD5,rWORD6
+ srw rG,rWORD8,rSHR
+ slw rB,rWORD8,rSHL
+ or rWORD8,rG,rH
+ blt cr7,L(duP2x)
+ lwz rWORD1,8(rSTR1)
+ lwz rWORD2,8(rSTR2)
+ cmplw cr5,rWORD7,rWORD8
+ bne cr6,L(duLcr6)
+ srw rA,rWORD2,rSHR
+ slw rD,rWORD2,rSHL
+ or rWORD2,rA,rB
+ lwz rWORD3,12(rSTR1)
+ lwz rWORD4,12(rSTR2)
+ cmplw cr0,rWORD1,rWORD2
+ bne cr5,L(duLcr5)
+ srw rC,rWORD4,rSHR
+ slw rF,rWORD4,rSHL
+ or rWORD4,rC,rD
+ addi rSTR1,rSTR1,4
+ addi rSTR2,rSTR2,4
+ cmplw cr1,rWORD3,rWORD4
+ b L(duLoop2)
+ .align 4
+L(duP2x):
+ cmplw cr5,rWORD7,rWORD8
+ addi rSTR1,rSTR1,4
+ addi rSTR2,rSTR2,4
+ bne cr6,L(duLcr6)
+ slwi. rN,rN,3
+ bne cr5,L(duLcr5)
+ cmplw cr7,rN,rSHR
+ beq L(duZeroReturn)
+ li rA,0
+ ble cr7,L(dutrim)
+ lwz rWORD2,4(rSTR2)
+ srw rA,rWORD2,rSHR
+ b L(dutrim)
+
+/* Remainder is 12 */
+ .align 4
+L(duP3):
+ srw rC,rWORD8,rSHR
+ lwz rWORD3,0(rSTR1)
+ slw rF,rWORD8,rSHL
+ or rWORD4,rC,rH
+L(duP3e):
+ lwz rWORD5,4(rSTR1)
+ lwz rWORD6,4(rSTR2)
+ cmplw cr1,rWORD3,rWORD4
+ srw rE,rWORD6,rSHR
+ slw rH,rWORD6,rSHL
+ or rWORD6,rE,rF
+ lwz rWORD7,8(rSTR1)
+ lwz rWORD8,8(rSTR2)
+ cmplw cr6,rWORD5,rWORD6
+ bne cr1,L(duLcr1)
+ srw rG,rWORD8,rSHR
+ slw rB,rWORD8,rSHL
+ or rWORD8,rG,rH
+ blt cr7,L(duP3x)
+ lwz rWORD1,12(rSTR1)
+ lwz rWORD2,12(rSTR2)
+ cmplw cr5,rWORD7,rWORD8
+ bne cr6,L(duLcr6)
+ srw rA,rWORD2,rSHR
+ slw rD,rWORD2,rSHL
+ or rWORD2,rA,rB
+ addi rSTR1,rSTR1,8
+ addi rSTR2,rSTR2,8
+ cmplw cr0,rWORD1,rWORD2
+ b L(duLoop1)
+ .align 4
+L(duP3x):
+ addi rSTR1,rSTR1,8
+ addi rSTR2,rSTR2,8
+ bne cr1,L(duLcr1)
+ cmplw cr5,rWORD7,rWORD8
+ bne cr6,L(duLcr6)
+ slwi. rN,rN,3
+ bne cr5,L(duLcr5)
+ cmplw cr7,rN,rSHR
+ beq L(duZeroReturn)
+ li rA,0
+ ble cr7,L(dutrim)
+ lwz rWORD2,4(rSTR2)
+ srw rA,rWORD2,rSHR
+ b L(dutrim)
+
+/* Count is a multiple of 16, remainder is 0 */
+ .align 4
+L(duP4):
+ mtctr rTMP
+ srw rA,rWORD8,rSHR
+ lwz rWORD1,0(rSTR1)
+ slw rD,rWORD8,rSHL
+ or rWORD2,rA,rH
+L(duP4e):
+ lwz rWORD3,4(rSTR1)
+ lwz rWORD4,4(rSTR2)
+ cmplw cr0,rWORD1,rWORD2
+ srw rC,rWORD4,rSHR
+ slw rF,rWORD4,rSHL
+ or rWORD4,rC,rD
+ lwz rWORD5,8(rSTR1)
+ lwz rWORD6,8(rSTR2)
+ cmplw cr1,rWORD3,rWORD4
+ bne cr0,L(duLcr0)
+ srw rE,rWORD6,rSHR
+ slw rH,rWORD6,rSHL
+ or rWORD6,rE,rF
+ lwzu rWORD7,12(rSTR1)
+ lwzu rWORD8,12(rSTR2)
+ cmplw cr6,rWORD5,rWORD6
+ bne cr1,L(duLcr1)
+ srw rG,rWORD8,rSHR
+ slw rB,rWORD8,rSHL
+ or rWORD8,rG,rH
+ cmplw cr5,rWORD7,rWORD8
+ bdz L(du24) /* Adjust CTR as we start with +4 */
+/* This is the primary loop */
+ .align 4
+L(duLoop):
+ lwz rWORD1,4(rSTR1)
+ lwz rWORD2,4(rSTR2)
+ cmplw cr1,rWORD3,rWORD4
+ bne cr6,L(duLcr6)
+ srw rA,rWORD2,rSHR
+ slw rD,rWORD2,rSHL
+ or rWORD2,rA,rB
+L(duLoop1):
+ lwz rWORD3,8(rSTR1)
+ lwz rWORD4,8(rSTR2)
+ cmplw cr6,rWORD5,rWORD6
+ bne cr5,L(duLcr5)
+ srw rC,rWORD4,rSHR
+ slw rF,rWORD4,rSHL
+ or rWORD4,rC,rD
+L(duLoop2):
+ lwz rWORD5,12(rSTR1)
+ lwz rWORD6,12(rSTR2)
+ cmplw cr5,rWORD7,rWORD8
+ bne cr0,L(duLcr0)
+ srw rE,rWORD6,rSHR
+ slw rH,rWORD6,rSHL
+ or rWORD6,rE,rF
+L(duLoop3):
+ lwzu rWORD7,16(rSTR1)
+ lwzu rWORD8,16(rSTR2)
+ cmplw cr0,rWORD1,rWORD2
+ bne cr1,L(duLcr1)
+ srw rG,rWORD8,rSHR
+ slw rB,rWORD8,rSHL
+ or rWORD8,rG,rH
+ bdnz L(duLoop)
+
+L(duL4):
+ bne cr1,L(duLcr1)
+ cmplw cr1,rWORD3,rWORD4
+ bne cr6,L(duLcr6)
+ cmplw cr6,rWORD5,rWORD6
+ bne cr5,L(duLcr5)
+ cmplw cr5,rWORD7,rWORD8
+L(du44):
+ bne cr0,L(duLcr0)
+L(du34):
+ bne cr1,L(duLcr1)
+L(du24):
+ bne cr6,L(duLcr6)
+L(du14):
+ slwi. rN,rN,3
+ bne cr5,L(duLcr5)
+/* At this point we have a remainder of 1 to 3 bytes to compare. We use
+ shift right to eliminate bits beyond the compare length.
+
+ However it may not be safe to load rWORD2 which may be beyond the
+ string length. So we compare the bit length of the remainder to
+ the right shift count (rSHR). If the bit count is less than or equal
+ to rSHR, we do not need to load rWORD2 (all significant bits are already in
+ rB). */
+ cmplw cr7,rN,rSHR
+ beq L(duZeroReturn)
+ li rA,0
+ ble cr7,L(dutrim)
+ lwz rWORD2,4(rSTR2)
+ srw rA,rWORD2,rSHR
+ .align 4
+L(dutrim):
+ lwz rWORD1,4(rSTR1)
+ lwz r31,48(1)
+ subfic rN,rN,32 /* Shift count is 32 - (rN * 8). */
+ or rWORD2,rA,rB
+ lwz r30,44(1)
+ lwz r29,40(r1)
+ srw rWORD1,rWORD1,rN
+ srw rWORD2,rWORD2,rN
+ lwz r28,36(r1)
+ lwz r27,32(r1)
+ cmplw rWORD1,rWORD2
+ li rRTN,0
+ beq L(dureturn26)
+ li rRTN,1
+ bgt L(dureturn26)
+ li rRTN,-1
+ b L(dureturn26)
+ .align 4
+L(duLcr0):
+ lwz r31,48(1)
+ lwz r30,44(1)
+ li rRTN,1
+ bgt cr0,L(dureturn29)
+ lwz r29,40(r1)
+ lwz r28,36(r1)
+ li rRTN,-1
+ b L(dureturn27)
+ .align 4
+L(duLcr1):
+ lwz r31,48(1)
+ lwz r30,44(1)
+ li rRTN,1
+ bgt cr1,L(dureturn29)
+ lwz r29,40(r1)
+ lwz r28,36(r1)
+ li rRTN,-1
+ b L(dureturn27)
+ .align 4
+L(duLcr6):
+ lwz r31,48(1)
+ lwz r30,44(1)
+ li rRTN,1
+ bgt cr6,L(dureturn29)
+ lwz r29,40(r1)
+ lwz r28,36(r1)
+ li rRTN,-1
+ b L(dureturn27)
+ .align 4
+L(duLcr5):
+ lwz r31,48(1)
+ lwz r30,44(1)
+ li rRTN,1
+ bgt cr5,L(dureturn29)
+ lwz r29,40(r1)
+ lwz r28,36(r1)
+ li rRTN,-1
+ b L(dureturn27)
+ .align 3
+L(duZeroReturn):
+ li rRTN,0
+ .align 4
+L(dureturn):
+ lwz r31,48(1)
+ lwz r30,44(1)
+L(dureturn29):
+ lwz r29,40(r1)
+ lwz r28,36(r1)
+L(dureturn27):
+ lwz r27,32(r1)
+L(dureturn26):
+ lwz r26,28(r1)
+L(dureturn25):
+ lwz r25,24(r1)
+ lwz r24,20(r1)
+ lwz 1,0(1)
+ blr
+END (BP_SYM (memcmp))
+libc_hidden_builtin_def (memcmp)
+weak_alias (memcmp,bcmp)
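
Stripped of the software pipelining, the early-exit paths, and the
shift/merge machinery for mismatched alignment, the algorithm above is a
word-at-a-time compare with a byte tail. A hedged C sketch
(memcmp_word_sketch is a hypothetical name; it assumes 4-byte-aligned
inputs and a big-endian target such as these PowerPC systems, where
comparing words as unsigned integers orders the first differing byte
correctly):

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the aligned fast path: compare one word per iteration,
       then finish the 0-3 byte remainder a byte at a time.  */
    static int
    memcmp_word_sketch (const void *s1, const void *s2, size_t n)
    {
      const uint32_t *w1 = s1;
      const uint32_t *w2 = s2;
      while (n >= 4)
        {
          if (*w1 != *w2)
            return *w1 > *w2 ? 1 : -1;  /* Big-endian: word order = byte order. */
          w1++, w2++;
          n -= 4;
        }
      const unsigned char *b1 = (const unsigned char *) w1;
      const unsigned char *b2 = (const unsigned char *) w2;
      while (n-- > 0)
        {
          if (*b1 != *b2)
            return *b1 - *b2;
          b1++, b2++;
        }
      return 0;
    }
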
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
new file mode 100644
index 0000000..f0c332f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -0,0 +1,469 @@
+/* Optimized memcpy implementation for PowerPC32/POWER7.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+ 02110-1301 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+ Returns 'dst'. */
+
+ .machine power7
+EALIGN (BP_SYM (memcpy), 5, 0)
+ CALL_MCOUNT
+
+ stwu 1,-32(1)
+ cfi_adjust_cfa_offset(32)
+ stw 30,20(1)
+ cfi_offset(30,(20-32))
+ stw 31,24(1)
+ mr 30,3
+ cmplwi cr1,5,31
+ neg 0,3
+ cfi_offset(31,-8)
+ ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
+ code. */
+
+ andi. 11,3,7 /* Check alignment of DST. */
+ clrlwi 10,4,29 /* Check alignment of SRC. */
+ cmplw cr6,10,11 /* SRC and DST alignments match? */
+ mr 12,4
+ mr 31,5
+ bne cr6,L(copy_GE_32_unaligned)
+
+ srwi 9,5,3 /* Number of full doublewords remaining. */
+
+ beq L(copy_GE_32_aligned_cont)
+
+ clrlwi 0,0,29
+ mtcrf 0x01,0
+ subf 31,0,5
+
+ /* Get the SRC aligned to 8 bytes. */
+
+1: bf 31,2f
+ lbz 6,0(12)
+ addi 12,12,1
+ stb 6,0(3)
+ addi 3,3,1
+2: bf 30,4f
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+4: bf 29,0f
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+0:
+ clrlwi 10,12,29 /* Check alignment of SRC again. */
+ srwi 9,31,3 /* Number of full doublewords remaining. */
+
+L(copy_GE_32_aligned_cont):
+
+ clrlwi 11,31,29
+ mtcrf 0x01,9
+
+ srwi 8,31,5
+ cmplwi cr1,9,4
+ cmplwi cr6,11,0
+ mr 11,12
+
+ /* Copy 1~3 doublewords so the main loop starts
+ at a multiple of 32 bytes. */
+
+ bf 30,1f
+ lfd 6,0(12)
+ lfd 7,8(12)
+ addi 11,12,16
+ mtctr 8
+ stfd 6,0(3)
+ stfd 7,8(3)
+ addi 10,3,16
+ bf 31,4f
+ lfd 0,16(12)
+ stfd 0,16(3)
+ blt cr1,3f
+ addi 11,12,24
+ addi 10,3,24
+ b 4f
+
+ .align 4
+1: /* Copy 1 doubleword and set the counter. */
+ mr 10,3
+ mtctr 8
+ bf 31,4f
+ lfd 6,0(12)
+ addi 11,12,8
+ stfd 6,0(3)
+ addi 10,3,8
+
+ .align 4
+4: /* Main aligned copy loop. Copies 32-bytes at a time. */
+ lfd 6,0(11)
+ lfd 7,8(11)
+ lfd 8,16(11)
+ lfd 0,24(11)
+ addi 11,11,32
+
+ stfd 6,0(10)
+ stfd 7,8(10)
+ stfd 8,16(10)
+ stfd 0,24(10)
+ addi 10,10,32
+ bdnz 4b
+3:
+
+ /* Check for tail bytes. */
+
+ clrrwi 0,31,3
+ mtcrf 0x01,31
+ beq cr6,0f
+
+.L9:
+ add 3,3,0
+ add 12,12,0
+
+ /* At this point we have a tail of 0-7 bytes and we know that the
+ destination is doubleword-aligned. */
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+2: /* Copy 2 bytes. */
+ bf 30,1f
+
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+1: /* Copy 1 byte. */
+ bf 31,0f
+
+ lbz 6,0(12)
+ stb 6,0(3)
+0: /* Return original DST pointer. */
+ mr 3,30
+ lwz 30,20(1)
+ lwz 31,24(1)
+ addi 1,1,32
+ blr
+
+ /* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ cmplwi cr6,5,8
+ mr 12,4
+ mtcrf 0x01,5
+ ble cr6,L(copy_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg 8,4
+ clrrwi 11,4,2
+ andi. 0,8,3
+ cmplwi cr1,5,16
+ mr 10,5
+ beq L(copy_LT_32_aligned)
+
+ /* Force 4-bytes alignment for SRC. */
+ mtocrf 0x01,0
+ subf 10,0,5
+2: bf 30,1f
+
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+1: bf 31,L(end_4bytes_alignment)
+
+ lbz 6,0(12)
+ addi 12,12,1
+ stb 6,0(3)
+ addi 3,3,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmplwi cr1,10,16
+ mtcrf 0x01,10
+
+L(copy_LT_32_aligned):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz 6,0(12)
+ lwz 7,4(12)
+ stw 6,0(3)
+ lwz 8,8(12)
+ stw 7,4(3)
+ lwz 6,12(12)
+ addi 12,12,16
+ stw 8,8(3)
+ stw 6,12(3)
+ addi 3,3,16
+8: /* Copy 8 bytes. */
+ bf 28,4f
+
+ lwz 6,0(12)
+ lwz 7,4(12)
+ addi 12,12,8
+ stw 6,0(3)
+ stw 7,4(3)
+ addi 3,3,8
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+2: /* Copy 2-3 bytes. */
+ bf 30,1f
+
+ lhz 6,0(12)
+ sth 6,0(3)
+ bf 31,0f
+ lbz 7,2(12)
+ stb 7,2(3)
+
+ /* Return original DST pointer. */
+ mr 3,30
+ lwz 30,20(1)
+ addi 1,1,32
+ blr
+
+ .align 4
+1: /* Copy 1 byte. */
+ bf 31,0f
+
+ lbz 6,0(12)
+ stb 6,0(3)
+0: /* Return original DST pointer. */
+ mr 3,30
+ lwz 30,20(1)
+ addi 1,1,32
+ blr
+
+ /* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,4f
+
+ /* Though we could've used lfd/stfd here, they are still
+ slow for unaligned cases. */
+
+ lwz 6,0(4)
+ lwz 7,4(4)
+ stw 6,0(3)
+ stw 7,4(3)
+
+ /* Return original DST pointer. */
+ mr 3,30
+ lwz 30,20(1)
+ addi 1,1,32
+ blr
+
+ .align 4
+4: /* Copies 4~7 bytes. */
+ bf 29,2b
+
+ lwz 6,0(4)
+ stw 6,0(3)
+ bf 30,5f
+ lhz 7,4(4)
+ sth 7,4(3)
+ bf 31,0f
+ lbz 8,6(4)
+ stb 8,6(3)
+
+ /* Return original DST pointer. */
+ mr 3,30
+ lwz 30,20(1)
+ addi 1,1,32
+ blr
+
+ .align 4
+5: /* Copy 1 byte. */
+ bf 31,0f
+
+ lbz 6,4(4)
+ stb 6,4(3)
+
+0: /* Return original DST pointer. */
+ mr 3,30
+ lwz 30,20(1)
+ addi 1,1,32
+ blr
+
+ /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned):
+ andi. 11,3,15 /* Check alignment of DST. */
+ clrlwi 0,0,28 /* Number of bytes until the 1st
+ quadword of DST. */
+ srwi 9,5,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont)
+
+ /* DST is not quadword aligned; get it aligned. */
+
+ mtcrf 0x01,0
+ subf 31,0,5
+
+ /* Vector instructions work best when proper alignment (16-bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1: /* Copy 1 byte. */
+ bf 31,2f
+
+ lbz 6,0(12)
+ addi 12,12,1
+ stb 6,0(3)
+ addi 3,3,1
+2: /* Copy 2 bytes. */
+ bf 30,4f
+
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+4: /* Copy 4 bytes. */
+ bf 29,8f
+
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+8: /* Copy 8 bytes. */
+ bf 28,0f
+
+ lfd 6,0(12)
+ addi 12,12,8
+ stfd 6,0(3)
+ addi 3,3,8
+0:
+ clrlwi 10,12,28 /* Check alignment of SRC. */
+ srwi 9,31,4 /* Number of full quadwords remaining. */
+
+ /* The proper alignment is present, it is OK to copy the bytes now. */
+L(copy_GE_32_unaligned_cont):
+
+ /* Setup two indexes to speed up the indexed vector operations. */
+ clrlwi 11,31,28
+ li 6,16 /* Index for 16-bytes offsets. */
+ li 7,32 /* Index for 32-bytes offsets. */
+ cmplwi cr1,11,0
+ srwi 8,31,5 /* Setup the loop counter. */
+ mr 10,3
+ mr 11,12
+ mtcrf 0x01,9
+ cmplwi cr6,9,1
+ lvsl 5,0,12
+ lvx 3,0,12
+ bf 31,L(setup_unaligned_loop)
+
+ /* Copy another 16 bytes so the main loop below starts 32-byte aligned. */
+ lvx 4,12,6
+ vperm 6,3,4,5
+ addi 11,12,16
+ addi 10,3,16
+ stvx 6,0,3
+ vor 3,4,4
+
+L(setup_unaligned_loop):
+ mtctr 8
+ ble cr6,L(end_unaligned_loop)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than having unaligned
+ vector instructions though. */
+
+ lvx 4,11,6 /* vr4 = r11+16. */
+ vperm 6,3,4,5 /* Merge the correctly-aligned portions
+ of vr3/vr4 into vr6. */
+ lvx 3,11,7 /* vr3 = r11+32. */
+ vperm 10,4,3,5 /* Merge the correctly-aligned portions
+ of vr3/vr4 into vr10. */
+ addi 11,11,32
+ stvx 6,0,10
+ stvx 10,10,6
+ addi 10,10,32
+
+ bdnz L(unaligned_loop)
+
+ .align 4
+L(end_unaligned_loop):
+
+ /* Check for tail bytes. */
+ clrrwi 0,31,4
+ mtcrf 0x01,31
+ beq cr1,0f
+
+ add 3,3,0
+ add 12,12,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+8: /* Copy 8 bytes. */
+ bf 28,4f
+
+ lwz 6,0(12)
+ lwz 7,4(12)
+ addi 12,12,8
+ stw 6,0(3)
+ stw 7,4(3)
+ addi 3,3,8
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+2: /* Copy 2~3 bytes. */
+ bf 30,1f
+
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+1: /* Copy 1 byte. */
+ bf 31,0f
+
+ lbz 6,0(12)
+ stb 6,0(3)
+0: /* Return original DST pointer. */
+ mr 3,30
+ lwz 30,20(1)
+ lwz 31,24(1)
+ addi 1,1,32
+ blr
+
+END (BP_SYM (memcpy))
+libc_hidden_builtin_def (memcpy)
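
The recurring bf 31/bf 30/bf 29 prologues above peel off 1, 2, and then 4
bytes according to the bits of the misalignment, so the wide loops always
run on aligned addresses; the fully unaligned path instead merges pairs of
aligned vector loads with lvsl/vperm. A rough C rendering of the peeling
step (a sketch only; align_prologue is a hypothetical name, and it assumes
source and destination are equally misaligned, as on the matched-alignment
path above):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch: consume (8 - (src & 7)) & 7 bytes in 1/2/4-byte steps,
       mirroring the bf 31/30/29 branches, so the copy loop that follows
       sees 8-byte-aligned pointers.  */
    static void
    align_prologue (unsigned char **dst, const unsigned char **src, size_t *len)
    {
      size_t fix = (8 - ((uintptr_t) *src & 7)) & 7;
      if (fix > *len)
        fix = *len;
      *len -= fix;
      if (fix & 1) { **dst = **src; *dst += 1; *src += 1; }
      if (fix & 2) { memcpy (*dst, *src, 2); *dst += 2; *src += 2; }
      if (fix & 4) { memcpy (*dst, *src, 4); *dst += 4; *src += 4; }
    }
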
diff --git a/sysdeps/powerpc/powerpc32/power7/memset.S b/sysdeps/powerpc/powerpc32/power7/memset.S
new file mode 100644
index 0000000..8aabb49
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/memset.S
@@ -0,0 +1,434 @@
+/* Optimized memset implementation for PowerPC32/POWER7.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+ 02110-1301 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+ Returns 's'. */
+
+ .machine power7
+EALIGN (BP_SYM (memset), 5, 0)
+ CALL_MCOUNT
+
+ .align 4
+L(_memset):
+ cmplwi cr7,5,31
+ cmplwi cr6,5,8
+ mr 10,3 /* Save original argument for later. */
+ mr 7,1 /* Save original r1 for later. */
+ cfi_offset(31,-8)
+
+ /* Replicate byte to word. */
+ rlwimi 4,4,8,16,23
+ rlwimi 4,4,16,0,15
+
+ ble cr6,L(small) /* If length <= 8, use short copy code. */
+
+ neg 0,3
+ ble cr7,L(medium) /* If length < 32, use medium copy code. */
+
+ /* Save our word twice to create a doubleword that we will later
+ copy to a FPR. */
+ stwu 1,-32(1)
+ andi. 11,10,7 /* Check alignment of DST. */
+ mr 12,5
+ stw 4,24(1)
+ stw 4,28(1)
+ beq L(big_aligned)
+
+ clrlwi 0,0,29
+ mtocrf 0x01,0
+ subf 5,0,5
+
+ /* Get DST aligned to 8 bytes. */
+1: bf 31,2f
+
+ stb 4,0(10)
+ addi 10,10,1
+2: bf 30,4f
+
+ sth 4,0(10)
+ addi 10,10,2
+4: bf 29,L(big_aligned)
+
+ stw 4,0(10)
+ addi 10,10,4
+
+ .align 4
+L(big_aligned):
+ cmplwi cr5,5,255
+ li 0,32
+ cmplwi cr1,5,160
+ dcbtst 0,10
+ cmplwi cr6,4,0
+ srwi 9,5,3 /* Number of full doublewords remaining. */
+ crand 27,26,21
+ mtocrf 0x01,9
+ bt 27,L(huge)
+
+ /* From this point on, we'll copy 32+ bytes and the value
+ isn't 0 (so we can't use dcbz). */
+
+ srwi 8,5,5
+ clrlwi 11,5,29
+ cmplwi cr6,11,0
+ cmplwi cr1,9,4
+ mtctr 8
+
+ /* Copy 1~3 doublewords so the main loop starts
+ at a multiple of 32 bytes. */
+
+ bf 30,1f
+
+ stw 4,0(10)
+ stw 4,4(10)
+ stw 4,8(10)
+ stw 4,12(10)
+ addi 10,10,16
+ bf 31,L(big_loop)
+
+ stw 4,0(10)
+ stw 4,4(10)
+ addi 10,10,8
+ mr 12,10
+ blt cr1,L(tail_bytes)
+
+ b L(big_loop)
+
+ .align 4
+1: /* Copy 1 doubleword. */
+ bf 31,L(big_loop)
+
+ stw 4,0(10)
+ stw 4,4(10)
+ addi 10,10,8
+
+ /* First use a 32-byte loop with stw's to try to avoid the LHS
+ (load-hit-store hazard) caused by the lfd we will do next. Also,
+ ping-pong through r10 and r12 to avoid AGEN delays. */
+ .align 4
+L(big_loop):
+ addi 12,10,32
+ stw 4,0(10)
+ stw 4,4(10)
+ stw 4,8(10)
+ stw 4,12(10)
+ stw 4,16(10)
+ stw 4,20(10)
+ stw 4,24(10)
+ stw 4,28(10)
+ bdz L(tail_bytes)
+
+ addi 10,10,64
+ stw 4,0(12)
+ stw 4,4(12)
+ stw 4,8(12)
+ stw 4,12(12)
+ stw 4,16(12)
+ stw 4,20(12)
+ stw 4,24(12)
+ stw 4,28(12)
+ bdnz L(big_loop_fast_setup)
+
+ mr 12,10
+ b L(tail_bytes)
+
+ /* Now that we're probably past the LHS window, use the VSX to
+ speed up the loop. */
+L(big_loop_fast_setup):
+ li 11,24
+ li 6,16
+ lxvdsx 4,1,11
+
+ .align 4
+L(big_loop_fast):
+ addi 12,10,32
+ stxvd2x 4,0,10
+ stxvd2x 4,10,6
+ bdz L(tail_bytes)
+
+ addi 10,10,64
+ stxvd2x 4,0,12
+ stxvd2x 4,12,6
+ bdnz L(big_loop_fast)
+
+ mr 12,10
+
+ .align 4
+L(tail_bytes):
+
+ /* Check for tail bytes. */
+ mr 1,7 /* Restore r1. */
+ beqlr cr6
+
+ clrlwi 0,5,29
+ mtocrf 0x01,0
+
+ /* At this point we have a tail of 0-7 bytes and we know that the
+ destination is doubleword-aligned. */
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ stw 4,0(12)
+ addi 12,12,4
+2: /* Copy 2 bytes. */
+ bf 30,1f
+
+ sth 4,0(12)
+ addi 12,12,2
+1: /* Copy 1 byte. */
+ bflr 31
+
+ stb 4,0(12)
+ blr
+
+
+ /* Special case when the value is 0 and we have a long length to deal
+ with. Use dcbz to zero out 128 bytes at a time. Before using
+ dcbz though, we need to get the destination 128-byte aligned. */
+ .align 4
+L(huge):
+ lfd 4,24(1)
+ andi. 11,10,127
+ neg 0,10
+ beq L(huge_aligned)
+
+ clrlwi 0,0,25
+ subf 5,0,5
+ srwi 0,0,3
+ mtocrf 0x01,0
+
+ /* Get DST aligned to 128 bytes. */
+8: bf 28,4f
+
+ stfd 4,0(10)
+ stfd 4,8(10)
+ stfd 4,16(10)
+ stfd 4,24(10)
+ stfd 4,32(10)
+ stfd 4,40(10)
+ stfd 4,48(10)
+ stfd 4,56(10)
+ addi 10,10,64
+ .align 4
+4: bf 29,2f
+
+ stfd 4,0(10)
+ stfd 4,8(10)
+ stfd 4,16(10)
+ stfd 4,24(10)
+ addi 10,10,32
+ .align 4
+2: bf 30,1f
+
+ stfd 4,0(10)
+ stfd 4,8(10)
+ addi 10,10,16
+ .align 4
+1: bf 31,L(huge_aligned)
+
+ stfd 4,0(10)
+ addi 10,10,8
+
+L(huge_aligned):
+ srwi 8,5,7
+ clrlwi 11,5,25
+ cmplwi cr6,11,0
+ mtctr 8
+
+ /* Copies 128-bytes at a time. */
+ .align 4
+L(huge_loop):
+ dcbz 0,10
+ addi 10,10,128
+ bdnz L(huge_loop)
+
+ /* We have a tail of 0~127 bytes to handle. */
+ mr 1,7 /* Restore r1. */
+ beqlr cr6
+
+ subf 9,3,10
+ subf 5,9,12
+ srwi 8,5,3
+ cmplwi cr6,8,0
+ mtocrf 0x01,8
+
+ /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
+ speed. We'll handle the resulting tail bytes later. */
+ beq cr6,L(tail)
+
+8: bf 28,4f
+
+ stfd 4,0(10)
+ stfd 4,8(10)
+ stfd 4,16(10)
+ stfd 4,24(10)
+ stfd 4,32(10)
+ stfd 4,40(10)
+ stfd 4,48(10)
+ stfd 4,56(10)
+ addi 10,10,64
+ .align 4
+4: bf 29,2f
+
+ stfd 4,0(10)
+ stfd 4,8(10)
+ stfd 4,16(10)
+ stfd 4,24(10)
+ addi 10,10,32
+ .align 4
+2: bf 30,1f
+
+ stfd 4,0(10)
+ stfd 4,8(10)
+ addi 10,10,16
+ .align 4
+1: bf 31,L(tail)
+
+ stfd 4,0(10)
+ addi 10,10,8
+
+ /* Handle the rest of the tail bytes here. */
+L(tail):
+ mtocrf 0x01,5
+
+ .align 4
+4: bf 29,2f
+
+ stw 4,0(10)
+ addi 10,10,4
+ .align 4
+2: bf 30,1f
+
+ sth 4,0(10)
+ addi 10,10,2
+ .align 4
+1: bflr 31
+
+ stb 4,0(10)
+ blr
+
+
+ /* Expanded tree to copy tail bytes without increments. */
+ .align 4
+L(copy_tail):
+ bf 29,L(FXX)
+
+ stw 4,0(10)
+ bf 30,L(TFX)
+
+ sth 4,4(10)
+ bflr 31
+
+ stb 4,6(10)
+ blr
+
+ .align 4
+L(FXX): bf 30,L(FFX)
+
+ sth 4,0(10)
+ bflr 31
+
+ stb 4,2(10)
+ blr
+
+ .align 4
+L(TFX): bflr 31
+
+ stb 4,4(10)
+ blr
+
+ .align 4
+L(FFX): bflr 31
+
+ stb 4,0(10)
+ blr
+
+ /* Handle copies of 9~31 bytes. */
+ .align 4
+L(medium):
+ /* At least 9 bytes to go. */
+ andi. 11,10,3
+ clrlwi 0,0,30
+ beq L(medium_aligned)
+
+ /* Force 4-bytes alignment for DST. */
+ mtocrf 0x01,0
+ subf 5,0,5
+1: /* Copy 1 byte. */
+ bf 31,2f
+
+ stb 4,0(10)
+ addi 10,10,1
+2: /* Copy 2 bytes. */
+ bf 30,L(medium_aligned)
+
+ sth 4,0(10)
+ addi 10,10,2
+
+ .align 4
+L(medium_aligned):
+ /* At least 6 bytes to go, and DST is word-aligned. */
+ cmplwi cr1,5,16
+ mtocrf 0x01,5
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ stw 4,0(10)
+ stw 4,4(10)
+ stw 4,8(10)
+ stw 4,12(10)
+ addi 10,10,16
+8: /* Copy 8 bytes. */
+ bf 28,4f
+
+ stw 4,0(10)
+ stw 4,4(10)
+ addi 10,10,8
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ stw 4,0(10)
+ addi 10,10,4
+2: /* Copy 2-3 bytes. */
+ bf 30,1f
+
+ sth 4,0(10)
+ addi 10,10,2
+1: /* Copy 1 byte. */
+ bflr 31
+
+ stb 4,0(10)
+ blr
+
+ /* Handles copies of 0~8 bytes. */
+ .align 4
+L(small):
+ mtocrf 0x01,5
+ bne cr6,L(copy_tail)
+
+ stw 4,0(10)
+ stw 4,4(10)
+ blr
+
+END (BP_SYM (memset))
+libc_hidden_builtin_def (memset)
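
Two details above carry most of the weight: the rlwimi pair splats the
fill byte across a 32-bit word before any wide stores happen, and the
zero-fill path at L(huge) clears whole 128-byte cache blocks with dcbz
instead of storing. The splat step in C (a sketch; splat_byte is a
hypothetical name, not glibc's code):

    #include <stdint.h>

    /* Sketch of the rlwimi byte replication: 0x000000cc -> 0xcccccccc.  */
    static uint32_t
    splat_byte (int c)
    {
      uint32_t w = (uint32_t) c & 0xff;
      w |= w << 8;   /* 0x000000cc -> 0x0000cccc */
      w |= w << 16;  /* 0x0000cccc -> 0xcccccccc */
      return w;
    }
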
diff --git a/sysdeps/powerpc/powerpc32/power7/strncmp.S b/sysdeps/powerpc/powerpc32/power7/strncmp.S
new file mode 100644
index 0000000..ba72d0a
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/strncmp.S
@@ -0,0 +1,177 @@
+/* Optimized strncmp implementation for POWER7/PowerPC32.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+ 02110-1301 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* See strlen.s for comments on how the end-of-string testing works. */
+
+/* int [r3] strncmp (const char *s1 [r3],
+ const char *s2 [r4],
+ size_t size [r5]) */
+
+EALIGN (BP_SYM(strncmp),4,0)
+
+#define rTMP r0
+#define rRTN r3
+#define rSTR1 r3 /* first string arg */
+#define rSTR2 r4 /* second string arg */
+#define rN r5 /* max string length */
+/* Note: The Bounded pointer support in this code is broken. This code
+ was inherited from PPC32 and that support was never completed.
+ Current PPC gcc does not support -fbounds-check or -fbounded-pointers. */
+#define rWORD1 r6 /* current word in s1 */
+#define rWORD2 r7 /* current word in s2 */
+#define rWORD3 r10
+#define rWORD4 r11
+#define rFEFE r8 /* constant 0xfefefeff (-0x01010101) */
+#define r7F7F r9 /* constant 0x7f7f7f7f */
+#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f) */
+#define rBITDIF r11 /* bits that differ in s1 & s2 words */
+
+ dcbt 0,rSTR1
+ or rTMP,rSTR2,rSTR1
+ lis r7F7F,0x7f7f
+ dcbt 0,rSTR2
+ clrlwi. rTMP,rTMP,30
+ cmplwi cr1,rN,0
+ lis rFEFE,-0x101
+ bne L(unaligned)
+/* We are word aligned so set up for two loops: first a word
+ loop, then fall into the byte loop if there is any residual. */
+ srwi. rTMP,rN,2
+ clrlwi rN,rN,30
+ addi rFEFE,rFEFE,-0x101
+ addi r7F7F,r7F7F,0x7f7f
+ cmplwi cr1,rN,0
+ beq L(unaligned)
+
+ mtctr rTMP
+ lwz rWORD1,0(rSTR1)
+ lwz rWORD2,0(rSTR2)
+ b L(g1)
+
+L(g0):
+ lwzu rWORD1,4(rSTR1)
+ bne cr1,L(different)
+ lwzu rWORD2,4(rSTR2)
+L(g1): add rTMP,rFEFE,rWORD1
+ nor rNEG,r7F7F,rWORD1
+ bdz L(tail)
+ and. rTMP,rTMP,rNEG
+ cmpw cr1,rWORD1,rWORD2
+ beq L(g0)
+
+/* OK. We've hit the end of the string. We need to be careful that
+ we don't compare two strings as different because of gunk beyond
+ the end of the strings... */
+
+L(endstring):
+ and rTMP,r7F7F,rWORD1
+ beq cr1,L(equal)
+ add rTMP,rTMP,r7F7F
+ xor. rBITDIF,rWORD1,rWORD2
+
+ andc rNEG,rNEG,rTMP
+ blt L(highbit)
+ cntlzw rBITDIF,rBITDIF
+ cntlzw rNEG,rNEG
+ addi rNEG,rNEG,7
+ cmpw cr1,rNEG,rBITDIF
+ sub rRTN,rWORD1,rWORD2
+ blt cr1,L(equal)
+ srawi rRTN,rRTN,31
+ ori rRTN,rRTN,1
+ blr
+L(equal):
+ li rRTN,0
+ blr
+
+L(different):
+ lwzu rWORD1,-4(rSTR1)
+ xor. rBITDIF,rWORD1,rWORD2
+ sub rRTN,rWORD1,rWORD2
+ blt L(highbit)
+ srawi rRTN,rRTN,31
+ ori rRTN,rRTN,1
+ blr
+L(highbit):
+ srwi rWORD2,rWORD2,24
+ srwi rWORD1,rWORD1,24
+ sub rRTN,rWORD1,rWORD2
+ blr
+
+
+/* Oh well. In this case, we just do a byte-by-byte comparison. */
+ .align 4
+L(tail):
+ and. rTMP,rTMP,rNEG
+ cmpw cr1,rWORD1,rWORD2
+ bne L(endstring)
+ addi rSTR1,rSTR1,4
+ bne cr1,L(different)
+ addi rSTR2,rSTR2,4
+ cmplwi cr1,rN,0
+L(unaligned):
+ mtctr rN
+ ble cr1,L(ux)
+L(uz):
+ lbz rWORD1,0(rSTR1)
+ lbz rWORD2,0(rSTR2)
+ .align 4
+L(u1):
+ cmpwi cr1,rWORD1,0
+ bdz L(u4)
+ cmpw rWORD1,rWORD2
+ beq cr1,L(u4)
+ lbzu rWORD3,1(rSTR1)
+ lbzu rWORD4,1(rSTR2)
+ bne L(u4)
+ cmpwi cr1,rWORD3,0
+ bdz L(u3)
+ cmpw rWORD3,rWORD4
+ beq cr1,L(u3)
+ lbzu rWORD1,1(rSTR1)
+ lbzu rWORD2,1(rSTR2)
+ bne L(u3)
+ cmpwi cr1,rWORD1,0
+ bdz L(u4)
+ cmpw rWORD1,rWORD2
+ beq cr1,L(u4)
+ lbzu rWORD3,1(rSTR1)
+ lbzu rWORD4,1(rSTR2)
+ bne L(u4)
+ cmpwi cr1,rWORD3,0
+ bdz L(u3)
+ cmpw rWORD3,rWORD4
+ beq cr1,L(u3)
+ lbzu rWORD1,1(rSTR1)
+ lbzu rWORD2,1(rSTR2)
+ beq L(u1)
+
+L(u3): sub rRTN,rWORD3,rWORD4
+ blr
+L(u4): sub rRTN,rWORD1,rWORD2
+ blr
+L(ux):
+ li rRTN,0
+ blr
+END (BP_SYM (strncmp))
+libc_hidden_builtin_def (strncmp)
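
The magic constants above implement the classic zero-byte test; as a hedged C
restatement (illustrative only, not taken from the patch), adding rFEFE
(-0x01010101) borrows through any zero byte while rNEG keeps only the high
bits of bytes in the 0x00-0x7f range, so their AND is nonzero exactly when
the word contains a NUL:

#include <stdint.h>

static int
has_zero_byte (uint32_t word)
{
  uint32_t tmp = word + 0xfefefeffu;    /* add rTMP,rFEFE,rWORD1 */
  uint32_t neg = ~(word | 0x7f7f7f7fu); /* nor rNEG,r7F7F,rWORD1 */
  return (tmp & neg) != 0;              /* and. rTMP,rTMP,rNEG   */
}

The test fires if and only if the word has a zero byte, but the raw tmp & neg
value can carry spurious bits in bytes other than the NUL itself, which is
why L(endstring) recomputes a cleaned mask (the and/add/andc sequence) before
counting leading zeros.
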
diff --git a/sysdeps/powerpc/powerpc64/970/Implies b/sysdeps/powerpc/powerpc64/970/Implies
index ac431fa..bedb20b 100644
--- a/sysdeps/powerpc/powerpc64/970/Implies
+++ b/sysdeps/powerpc/powerpc64/970/Implies
@@ -1 +1,2 @@
+powerpc/powerpc64/power4/fpu
powerpc/powerpc64/power4
diff --git a/sysdeps/powerpc/powerpc64/power5+/Implies b/sysdeps/powerpc/powerpc64/power5+/Implies
index ac431fa..a01a13a 100644
--- a/sysdeps/powerpc/powerpc64/power5+/Implies
+++ b/sysdeps/powerpc/powerpc64/power5+/Implies
@@ -1 +1,2 @@
-powerpc/powerpc64/power4
+powerpc/powerpc64/power5/fpu
+powerpc/powerpc64/power5
diff --git a/sysdeps/powerpc/powerpc64/power5+/fpu/Implies b/sysdeps/powerpc/powerpc64/power5+/fpu/Implies
deleted file mode 100644
index 558a5fb..0000000
--- a/sysdeps/powerpc/powerpc64/power5+/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power4/fpu
diff --git a/sysdeps/powerpc/powerpc64/power5/Implies b/sysdeps/powerpc/powerpc64/power5/Implies
index ac431fa..bedb20b 100644
--- a/sysdeps/powerpc/powerpc64/power5/Implies
+++ b/sysdeps/powerpc/powerpc64/power5/Implies
@@ -1 +1,2 @@
+powerpc/powerpc64/power4/fpu
powerpc/powerpc64/power4
diff --git a/sysdeps/powerpc/powerpc64/power5/fpu/Implies b/sysdeps/powerpc/powerpc64/power5/fpu/Implies
deleted file mode 100644
index 558a5fb..0000000
--- a/sysdeps/powerpc/powerpc64/power5/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power4/fpu
diff --git a/sysdeps/powerpc/powerpc64/power6/Implies b/sysdeps/powerpc/powerpc64/power6/Implies
index 0f35f37..4c782d4 100644
--- a/sysdeps/powerpc/powerpc64/power6/Implies
+++ b/sysdeps/powerpc/powerpc64/power6/Implies
@@ -1 +1,2 @@
+powerpc/powerpc64/power5+/fpu
powerpc/powerpc64/power5+
diff --git a/sysdeps/powerpc/powerpc64/power6x/Implies b/sysdeps/powerpc/powerpc64/power6x/Implies
index d993b0c..9d68f39 100644
--- a/sysdeps/powerpc/powerpc64/power6x/Implies
+++ b/sysdeps/powerpc/powerpc64/power6x/Implies
@@ -1,2 +1,2 @@
+powerpc/powerpc64/power6/fpu
powerpc/powerpc64/power6
-powerpc/powerpc64/power5+
diff --git a/sysdeps/powerpc/powerpc64/power6x/fpu/Implies b/sysdeps/powerpc/powerpc64/power6x/fpu/Implies
deleted file mode 100644
index 8451099..0000000
--- a/sysdeps/powerpc/powerpc64/power6x/fpu/Implies
+++ /dev/null
@@ -1,3 +0,0 @@
-powerpc/powerpc64/power6/fpu
-powerpc/powerpc64/power5+/fpu
-powerpc/powerpc64/power4/fpu
diff --git a/sysdeps/powerpc/powerpc64/power7/Implies b/sysdeps/powerpc/powerpc64/power7/Implies
index 13b0330..9d68f39 100644
--- a/sysdeps/powerpc/powerpc64/power7/Implies
+++ b/sysdeps/powerpc/powerpc64/power7/Implies
@@ -1 +1,2 @@
-powerpc/powerpc64/power5
+powerpc/powerpc64/power6/fpu
+powerpc/powerpc64/power6
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/powerpc/powerpc64/power7/fpu/Implies
index 13b0330..f00c50f 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/Implies
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/Implies
@@ -1 +1 @@
-powerpc/powerpc64/power5
+powerpc/powerpc64/power5/fpu
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
new file mode 100644
index 0000000..6763d1a
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
@@ -0,0 +1,68 @@
+/* finite(). PowerPC64/POWER7 version.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+ 02110-1301 USA. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __finite(x) */
+ .section ".toc","aw"
+.LC0: /* 1.0 */
+ .tc FD_ONE[TC],0x3ff0000000000000
+ .section ".text"
+ .type __finite, @function
+ .machine power7
+EALIGN (__finite, 4, 0)
+ CALL_MCOUNT 0
+ lfd fp0,.LC0@toc(r2)
+ ftdiv cr7,fp1,fp0
+ li r3,1
+ bflr 30
+
+ /* If we are here, we either have +/-INF,
+ NaN or denormal. */
+
+ stfd fp1,-16(r1) /* Transfer FP to GPR's. */
+ ori 2,2,0 /* Force a new dispatch group. */
+
+ lhz r4,-16(r1) /* Fetch the upper portion of the high word of
+ the FP value (where the exponent and sign bits
+ are). */
+ clrlwi r4,r4,17 /* r4 = abs(r4). */
+ cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */
+ bltlr cr7 /* LT means finite, otherwise non-finite. */
+ li r3,0
+ blr
+ END (__finite)
+
+hidden_def (__finite)
+weak_alias (__finite, finite)
+
+/* It turns out that the 'double' version will also always work for
+ single-precision. */
+strong_alias (__finite, __finitef)
+hidden_def (__finitef)
+weak_alias (__finitef, finitef)
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
+compat_symbol (libc, finite, finitel, GLIBC_2_0);
+# endif
+#endif
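
As a hedged C rendering of the fallback path (a sketch only; the names are
illustrative), the stfd/lhz/clrlwi/cmpwi sequence amounts to clearing the
sign from the top 16 bits of the double and checking for a non-maximal
exponent:

#include <stdint.h>
#include <string.h>

static int
finite_fallback (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);                 /* stfd fp1, reload */
  unsigned top = (unsigned) (bits >> 48) & 0x7fff; /* clrlwi: drop sign */
  return top < 0x7ff0;                             /* bltlr cr7 */
}
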
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
new file mode 100644
index 0000000..54bd941
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_finite.S. */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
new file mode 100644
index 0000000..f896d38
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
@@ -0,0 +1,71 @@
+/* isinf(). PowerPC64/POWER7 version.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+ 02110-1301 USA. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isinf(x) */
+ .section ".toc","aw"
+.LC0: /* 1.0 */
+ .tc FD_ONE[TC],0x3ff0000000000000
+ .section ".text"
+ .type __isinf, @function
+ .machine power7
+EALIGN (__isinf, 4, 0)
+ CALL_MCOUNT 0
+ lfd fp0,.LC0@toc(r2)
+ ftdiv cr7,fp1,fp0
+ li r3,0
+ bflr 29 /* If not INF, return. */
+
+ /* Either we have -INF/+INF or a denormal. */
+
+ stfd fp1,-16(r1) /* Transfer FP to GPR's. */
+ ori 2,2,0 /* Force a new dispatch group. */
+ lhz r4,-16(r1) /* Fetch the upper portion of the high word of
+ the FP value (where the exponent and sign bits
+ are). */
+ cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */
+ li r3,1
+ beqlr cr7 /* EQ means INF, otherwise -INF. */
+ li r3,-1
+ blr
+ END (__isinf)
+
+hidden_def (__isinf)
+weak_alias (__isinf, isinf)
+
+/* It turns out that the 'double' version will also always work for
+ single-precision. */
+strong_alias (__isinf, __isinff)
+hidden_def (__isinff)
+weak_alias (__isinff, isinff)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isinf, __isinfl)
+weak_alias (__isinf, isinfl)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
+compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
+# endif
+#endif
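
Once the ftdiv test has flagged the value, only the sign remains to be
decided; a hedged C model of the final compare (illustrative only, not the
library code):

#include <stdint.h>
#include <string.h>

static int
isinf_sign (double x)   /* Precondition: x is +INF or -INF.  */
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  /* The top halfword is 0x7ff0 for +INF and 0xfff0 for -INF.  */
  return (bits >> 48) == 0x7ff0 ? 1 : -1;
}
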
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
new file mode 100644
index 0000000..be759e0
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isinf.S. */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
new file mode 100644
index 0000000..8877012
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
@@ -0,0 +1,69 @@
+/* isnan(). PowerPC64/POWER7 version.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+ 02110-1301 USA. */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isnan(x) */
+ .section ".toc","aw"
+.LC0: /* 1.0 */
+ .tc FD_ONE[TC],0x3ff0000000000000
+ .section ".text"
+ .type __isnan, @function
+ .machine power7
+EALIGN (__isnan, 4, 0)
+ CALL_MCOUNT 0
+ lfd fp0,.LC0@toc(r2)
+ ftdiv cr7,fp1,fp0
+ li r3,0
+ bflr 30 /* If not NaN, finish. */
+
+ stfd fp1,-16(r1) /* Transfer FP to GPR's. */
+ ori 2,2,0 /* Force a new dispatch group. */
+ ld r4,-16(r1) /* Load FP into GPR. */
+ lis r0,0x7ff0
+ sldi r0,r0,32 /* const long r0 0x7ff00000 00000000. */
+ clrldi r4,r4,1 /* x = fabs(x) */
+ cmpd cr7,r4,r0 /* if (fabs(x) <= inf) */
+ blelr cr7 /* LE means not NaN. */
+ li r3,1 /* else return 1 */
+ blr
+ END (__isnan)
+
+hidden_def (__isnan)
+weak_alias (__isnan, isnan)
+
+/* It turns out that the 'double' version will also always work for
+ single-precision. */
+strong_alias (__isnan, __isnanf)
+hidden_def (__isnanf)
+weak_alias (__isnanf, isnanf)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isnan, __isnanl)
+weak_alias (__isnan, isnanl)
+#endif
+
+#ifndef IS_IN_libm
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
+compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
+# endif
+#endif
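
The slow path reduces to one unsigned compare against the bit pattern of
+INF; a minimal C sketch (not the library code):

#include <stdint.h>
#include <string.h>

static int
isnan_fallback (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);      /* stfd fp1 / ld r4 */
  bits &= ~(1ULL << 63);                /* clrldi: x = fabs(x) */
  return bits > 0x7ff0000000000000ULL;  /* only NaNs exceed +INF */
}
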
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
new file mode 100644
index 0000000..b48c85e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isnan.S. */
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S
new file mode 100644
index 0000000..f1afffb
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S
@@ -0,0 +1,984 @@
+/* Optimized memcmp implementation for POWER7/PowerPC64.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+ 02110-1301 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* int [r3] memcmp (const char *s1 [r3],
+ const char *s2 [r4],
+ size_t size [r5]) */
+
+ .machine power7
+EALIGN (BP_SYM(memcmp),4,0)
+ CALL_MCOUNT 3
+
+#define rTMP r0
+#define rRTN r3
+#define rSTR1 r3 /* first string arg */
+#define rSTR2 r4 /* second string arg */
+#define rN r5 /* max string length */
+/* Note: The Bounded pointer support in this code is broken. This code
+ was inherited from PPC32 and that support was never completed.
+ Current PPC gcc does not support -fbounds-check or -fbounded-pointers. */
+#define rWORD1 r6 /* current word in s1 */
+#define rWORD2 r7 /* current word in s2 */
+#define rWORD3 r8 /* next word in s1 */
+#define rWORD4 r9 /* next word in s2 */
+#define rWORD5 r10 /* next word in s1 */
+#define rWORD6 r11 /* next word in s2 */
+#define rBITDIF r12 /* bits that differ in s1 & s2 words */
+#define rWORD7 r30 /* next word in s1 */
+#define rWORD8 r31 /* next word in s2 */
+
+ xor rTMP,rSTR2,rSTR1
+ cmpldi cr6,rN,0
+ cmpldi cr1,rN,12
+ clrldi. rTMP,rTMP,61
+ clrldi rBITDIF,rSTR1,61
+ cmpldi cr5,rBITDIF,0
+ beq- cr6,L(zeroLength)
+ dcbt 0,rSTR1
+ dcbt 0,rSTR2
+/* If less than 8 bytes or not aligned, use the unaligned
+ byte loop. */
+ blt cr1,L(bytealigned)
+ std rWORD8,-8(r1)
+ cfi_offset(rWORD8,-8)
+ std rWORD7,-16(r1)
+ cfi_offset(rWORD7,-16)
+ bne L(unaligned)
+/* At this point we know both strings have the same alignment and the
+ compare length is at least 8 bytes. rBITDIF contains the low order
+ 3 bits of rSTR1 and cr5 contains the result of the logical compare
+ of rBITDIF to 0. If rBITDIF == 0 then we are already double word
+ aligned and can perform the DWaligned loop.
+
+ Otherwise we know the two strings have the same alignment (but not
+ yet DW). So we can force the string addresses to the next lower DW
+ boundary and special case this first DW using shift left to
+ eliminate bits preceding the first byte. Since we want to join the
+ normal (DWaligned) compare loop, starting at the second double word,
+ we need to adjust the length (rN) and special case the loop
+ versioning for the first DW. This ensures that the loop count is
+ correct and the first DW (shifted) is in the expected register pair. */
+ .align 4
+L(samealignment):
+ clrrdi rSTR1,rSTR1,3
+ clrrdi rSTR2,rSTR2,3
+ beq cr5,L(DWaligned)
+ add rN,rN,rBITDIF
+ sldi r11,rBITDIF,3
+ srdi rTMP,rN,5 /* Divide by 32 */
+ andi. rBITDIF,rN,24 /* Get the DW remainder */
+ ld rWORD1,0(rSTR1)
+ ld rWORD2,0(rSTR2)
+ cmpldi cr1,rBITDIF,16
+ cmpldi cr7,rN,32
+ clrldi rN,rN,61
+ beq L(dPs4)
+ mtctr rTMP
+ bgt cr1,L(dPs3)
+ beq cr1,L(dPs2)
+
+/* Remainder is 8 */
+ .align 3
+L(dsP1):
+ sld rWORD5,rWORD1,r11
+ sld rWORD6,rWORD2,r11
+ cmpld cr5,rWORD5,rWORD6
+ blt cr7,L(dP1x)
+/* Do something useful in this cycle since we have to branch anyway. */
+ ld rWORD1,8(rSTR1)
+ ld rWORD2,8(rSTR2)
+ cmpld cr0,rWORD1,rWORD2
+ b L(dP1e)
+/* Remainder is 16 */
+ .align 4
+L(dPs2):
+ sld rWORD5,rWORD1,r11
+ sld rWORD6,rWORD2,r11
+ cmpld cr6,rWORD5,rWORD6
+ blt cr7,L(dP2x)
+/* Do something useful in this cycle since we have to branch anyway. */
+ ld rWORD7,8(rSTR1)
+ ld rWORD8,8(rSTR2)
+ cmpld cr5,rWORD7,rWORD8
+ b L(dP2e)
+/* Remainder is 24 */
+ .align 4
+L(dPs3):
+ sld rWORD3,rWORD1,r11
+ sld rWORD4,rWORD2,r11
+ cmpld cr1,rWORD3,rWORD4
+ b L(dP3e)
+/* Count is a multiple of 32, remainder is 0 */
+ .align 4
+L(dPs4):
+ mtctr rTMP
+ sld rWORD1,rWORD1,r11
+ sld rWORD2,rWORD2,r11
+ cmpld cr0,rWORD1,rWORD2
+ b L(dP4e)
+
+/* At this point we know both strings are double word aligned and the
+ compare length is at least 8 bytes. */
+ .align 4
+L(DWaligned):
+ andi. rBITDIF,rN,24 /* Get the DW remainder */
+ srdi rTMP,rN,5 /* Divide by 32 */
+ cmpldi cr1,rBITDIF,16
+ cmpldi cr7,rN,32
+ clrldi rN,rN,61
+ beq L(dP4)
+ bgt cr1,L(dP3)
+ beq cr1,L(dP2)
+
+/* Remainder is 8 */
+ .align 4
+L(dP1):
+ mtctr rTMP
+/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+ (8-15 byte compare), we want to use only volatile registers. This
+ means we can avoid restoring non-volatile registers since we did not
+ change any on the early exit path. The key here is the non-early
+ exit path only cares about the condition code (cr5), not about which
+ register pair was used. */
+ ld rWORD5,0(rSTR1)
+ ld rWORD6,0(rSTR2)
+ cmpld cr5,rWORD5,rWORD6
+ blt cr7,L(dP1x)
+ ld rWORD1,8(rSTR1)
+ ld rWORD2,8(rSTR2)
+ cmpld cr0,rWORD1,rWORD2
+L(dP1e):
+ ld rWORD3,16(rSTR1)
+ ld rWORD4,16(rSTR2)
+ cmpld cr1,rWORD3,rWORD4
+ ld rWORD5,24(rSTR1)
+ ld rWORD6,24(rSTR2)
+ cmpld cr6,rWORD5,rWORD6
+ bne cr5,L(dLcr5)
+ bne cr0,L(dLcr0)
+
+ ldu rWORD7,32(rSTR1)
+ ldu rWORD8,32(rSTR2)
+ bne cr1,L(dLcr1)
+ cmpld cr5,rWORD7,rWORD8
+ bdnz L(dLoop)
+ bne cr6,L(dLcr6)
+ ld rWORD8,-8(r1)
+ ld rWORD7,-16(r1)
+ .align 3
+L(dP1x):
+ sldi. r12,rN,3
+ bne cr5,L(dLcr5)
+ subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */
+ bne L(d00)
+ li rRTN,0
+ blr
+
+/* Remainder is 16 */
+ .align 4
+L(dP2):
+ mtctr rTMP
+ ld rWORD5,0(rSTR1)
+ ld rWORD6,0(rSTR2)
+ cmpld cr6,rWORD5,rWORD6
+ blt cr7,L(dP2x)
+ ld rWORD7,8(rSTR1)
+ ld rWORD8,8(rSTR2)
+ cmpld cr5,rWORD7,rWORD8
+L(dP2e):
+ ld rWORD1,16(rSTR1)
+ ld rWORD2,16(rSTR2)
+ cmpld cr0,rWORD1,rWORD2
+ ld rWORD3,24(rSTR1)
+ ld rWORD4,24(rSTR2)
+ cmpld cr1,rWORD3,rWORD4
+ addi rSTR1,rSTR1,8
+ addi rSTR2,rSTR2,8
+ bne cr6,L(dLcr6)
+ bne cr5,L(dLcr5)
+ b L(dLoop2)
+/* Again we are on an early exit path (16-23 byte compare); we want to
+ use only volatile registers and avoid restoring non-volatile
+ registers. */
+ .align 4
+L(dP2x):
+ ld rWORD3,8(rSTR1)
+ ld rWORD4,8(rSTR2)
+ cmpld cr5,rWORD3,rWORD4
+ sldi. r12,rN,3
+ bne cr6,L(dLcr6)
+ addi rSTR1,rSTR1,8
+ addi rSTR2,rSTR2,8
+ bne cr5,L(dLcr5)
+ subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */
+ bne L(d00)
+ li rRTN,0
+ blr
+
+/* Remainder is 24 */
+ .align 4
+L(dP3):
+ mtctr rTMP
+ ld rWORD3,0(rSTR1)
+ ld rWORD4,0(rSTR2)
+ cmpld cr1,rWORD3,rWORD4
+L(dP3e):
+ ld rWORD5,8(rSTR1)
+ ld rWORD6,8(rSTR2)
+ cmpld cr6,rWORD5,rWORD6
+ blt cr7,L(dP3x)
+ ld rWORD7,16(rSTR1)
+ ld rWORD8,16(rSTR2)
+ cmpld cr5,rWORD7,rWORD8
+ ld rWORD1,24(rSTR1)
+ ld rWORD2,24(rSTR2)
+ cmpld cr0,rWORD1,rWORD2
+ addi rSTR1,rSTR1,16
+ addi rSTR2,rSTR2,16
+ bne cr1,L(dLcr1)
+ bne cr6,L(dLcr6)
+ b L(dLoop1)
+/* Again we are on an early exit path (24-31 byte compare); we want to
+ use only volatile registers and avoid restoring non-volatile
+ registers. */
+ .align 4
+L(dP3x):
+ ld rWORD1,16(rSTR1)
+ ld rWORD2,16(rSTR2)
+ cmpld cr5,rWORD1,rWORD2
+ sldi. r12,rN,3
+ bne cr1,L(dLcr1)
+ addi rSTR1,rSTR1,16
+ addi rSTR2,rSTR2,16
+ bne cr6,L(dLcr6)
+ subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */
+ bne cr5,L(dLcr5)
+ bne L(d00)
+ li rRTN,0
+ blr
+
+/* Count is a multiple of 32, remainder is 0 */
+ .align 4
+L(dP4):
+ mtctr rTMP
+ ld rWORD1,0(rSTR1)
+ ld rWORD2,0(rSTR2)
+ cmpld cr0,rWORD1,rWORD2
+L(dP4e):
+ ld rWORD3,8(rSTR1)
+ ld rWORD4,8(rSTR2)
+ cmpld cr1,rWORD3,rWORD4
+ ld rWORD5,16(rSTR1)
+ ld rWORD6,16(rSTR2)
+ cmpld cr6,rWORD5,rWORD6
+ ldu rWORD7,24(rSTR1)
+ ldu rWORD8,24(rSTR2)
+ cmpld cr5,rWORD7,rWORD8
+ bne cr0,L(dLcr0)
+ bne cr1,L(dLcr1)
+ bdz- L(d24) /* Adjust CTR as we start with +4 */
+/* This is the primary loop */
+ .align 4
+L(dLoop):
+ ld rWORD1,8(rSTR1)
+ ld rWORD2,8(rSTR2)
+ cmpld cr1,rWORD3,rWORD4
+ bne cr6,L(dLcr6)
+L(dLoop1):
+ ld rWORD3,16(rSTR1)
+ ld rWORD4,16(rSTR2)
+ cmpld cr6,rWORD5,rWORD6
+ bne cr5,L(dLcr5)
+L(dLoop2):
+ ld rWORD5,24(rSTR1)
+ ld rWORD6,24(rSTR2)
+ cmpld cr5,rWORD7,rWORD8
+ bne cr0,L(dLcr0)
+L(dLoop3):
+ ldu rWORD7,32(rSTR1)
+ ldu rWORD8,32(rSTR2)
+ bne cr1,L(dLcr1)
+ cmpld cr0,rWORD1,rWORD2
+ bdnz L(dLoop)
+
+L(dL4):
+ cmpld cr1,rWORD3,rWORD4
+ bne cr6,L(dLcr6)
+ cmpld cr6,rWORD5,rWORD6
+ bne cr5,L(dLcr5)
+ cmpld cr5,rWORD7,rWORD8
+L(d44):
+ bne cr0,L(dLcr0)
+L(d34):
+ bne cr1,L(dLcr1)
+L(d24):
+ bne cr6,L(dLcr6)
+L(d14):
+ sldi. r12,rN,3
+ bne cr5,L(dLcr5)
+L(d04):
+ ld rWORD8,-8(r1)
+ ld rWORD7,-16(r1)
+ subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */
+ beq L(zeroLength)
+/* At this point we have a remainder of 1 to 7 bytes to compare. Since
+ we are aligned it is safe to load the whole double word, and use
+ shift right double to eliminate bits beyond the compare length. */
+L(d00):
+ ld rWORD1,8(rSTR1)
+ ld rWORD2,8(rSTR2)
+ srd rWORD1,rWORD1,rN
+ srd rWORD2,rWORD2,rN
+ cmpld cr5,rWORD1,rWORD2
+ bne cr5,L(dLcr5x)
+ li rRTN,0
+ blr
+ .align 4
+L(dLcr0):
+ ld rWORD8,-8(r1)
+ ld rWORD7,-16(r1)
+ li rRTN,1
+ bgtlr cr0
+ li rRTN,-1
+ blr
+ .align 4
+L(dLcr1):
+ ld rWORD8,-8(r1)
+ ld rWORD7,-16(r1)
+ li rRTN,1
+ bgtlr cr1
+ li rRTN,-1
+ blr
+ .align 4
+L(dLcr6):
+ ld rWORD8,-8(r1)
+ ld rWORD7,-16(r1)
+ li rRTN,1
+ bgtlr cr6
+ li rRTN,-1
+ blr
+ .align 4
+L(dLcr5):
+ ld rWORD8,-8(r1)
+ ld rWORD7,-16(r1)
+L(dLcr5x):
+ li rRTN,1
+ bgtlr cr5
+ li rRTN,-1
+ blr
+
+ .align 4
+L(bytealigned):
+ mtctr rN
+ beq cr6,L(zeroLength)
+
+/* We need to prime this loop. This loop is swing modulo scheduled
+ to avoid pipe delays. The dependent instruction latencies (load to
+ compare to conditional branch) are 2 to 3 cycles. In this loop each
+ dispatch group ends in a branch and takes 1 cycle. Effectively
+ the first iteration of the loop only serves to load operands;
+ branches based on compares are delayed until the next iteration.
+
+ So we must precondition some registers and condition codes so that
+ we don't exit the loop early on the first iteration. */
+
+ lbz rWORD1,0(rSTR1)
+ lbz rWORD2,0(rSTR2)
+ bdz L(b11)
+ cmpld cr0,rWORD1,rWORD2
+ lbz rWORD3,1(rSTR1)
+ lbz rWORD4,1(rSTR2)
+ bdz L(b12)
+ cmpld cr1,rWORD3,rWORD4
+ lbzu rWORD5,2(rSTR1)
+ lbzu rWORD6,2(rSTR2)
+ bdz L(b13)
+ .align 4
+L(bLoop):
+ lbzu rWORD1,1(rSTR1)
+ lbzu rWORD2,1(rSTR2)
+ bne cr0,L(bLcr0)
+
+ cmpld cr6,rWORD5,rWORD6
+ bdz L(b3i)
+
+ lbzu rWORD3,1(rSTR1)
+ lbzu rWORD4,1(rSTR2)
+ bne cr1,L(bLcr1)
+
+ cmpld cr0,rWORD1,rWORD2
+ bdz L(b2i)
+
+ lbzu rWORD5,1(rSTR1)
+ lbzu rWORD6,1(rSTR2)
+ bne cr6,L(bLcr6)
+
+ cmpld cr1,rWORD3,rWORD4
+ bdnz L(bLoop)
+
+/* We speculatively load bytes before we have tested the previous
+ bytes. But we must avoid overrunning the length (in the ctr) to
+ prevent these speculative loads from causing a segfault. In that
+ case the loop will exit early (before all pending bytes are
+ tested), and we must complete the pending operations
+ before returning. */
+L(b1i):
+ bne cr0,L(bLcr0)
+ bne cr1,L(bLcr1)
+ b L(bx56)
+ .align 4
+L(b2i):
+ bne cr6,L(bLcr6)
+ bne cr0,L(bLcr0)
+ b L(bx34)
+ .align 4
+L(b3i):
+ bne cr1,L(bLcr1)
+ bne cr6,L(bLcr6)
+ b L(bx12)
+ .align 4
+L(bLcr0):
+ li rRTN,1
+ bgtlr cr0
+ li rRTN,-1
+ blr
+L(bLcr1):
+ li rRTN,1
+ bgtlr cr1
+ li rRTN,-1
+ blr
+L(bLcr6):
+ li rRTN,1
+ bgtlr cr6
+ li rRTN,-1
+ blr
+
+L(b13):
+ bne cr0,L(bx12)
+ bne cr1,L(bx34)
+L(bx56):
+ sub rRTN,rWORD5,rWORD6
+ blr
+ nop
+L(b12):
+ bne cr0,L(bx12)
+L(bx34):
+ sub rRTN,rWORD3,rWORD4
+ blr
+L(b11):
+L(bx12):
+ sub rRTN,rWORD1,rWORD2
+ blr
+ .align 4
+L(zeroLengthReturn):
+ ld rWORD8,-8(r1)
+ ld rWORD7,-16(r1)
+L(zeroLength):
+ li rRTN,0
+ blr
+
+ .align 4
+/* At this point we know the strings have different alignment and the
+ compare length is at least 8 bytes. rBITDIF contains the low order
+ 3 bits of rSTR1 and cr5 contains the result of the logical compare
+ of rBITDIF to 0. If rBITDIF == 0 then rSTR1 is double word
+ aligned and we can perform the DWunaligned loop.
+
+ Otherwise we know that rSTR1 is not yet DW aligned.
+ So we can force the string addresses to the next lower DW
+ boundary and special case this first DW using shift left to
+ eliminate bits preceding the first byte. Since we want to join the
+ normal (DWaligned) compare loop, starting at the second double word,
+ we need to adjust the length (rN) and special case the loop
+ versioning for the first DW. This ensures that the loop count is
+ correct and the first DW (shifted) is in the expected register pair. */
+#define rSHL r29 /* Unaligned shift left count. */
+#define rSHR r28 /* Unaligned shift right count. */
+#define rB r27 /* Left rotation temp for rWORD2. */
+#define rD r26 /* Left rotation temp for rWORD4. */
+#define rF r25 /* Left rotation temp for rWORD6. */
+#define rH r24 /* Left rotation temp for rWORD8. */
+#define rA r0 /* Right rotation temp for rWORD2. */
+#define rC r12 /* Right rotation temp for rWORD4. */
+#define rE r0 /* Right rotation temp for rWORD6. */
+#define rG r12 /* Right rotation temp for rWORD8. */
+L(unaligned):
+ std r29,-24(r1)
+ cfi_offset(r29,-24)
+ clrldi rSHL,rSTR2,61
+ beq cr6,L(duzeroLength)
+ std r28,-32(r1)
+ cfi_offset(r28,-32)
+ beq cr5,L(DWunaligned)
+ std r27,-40(r1)
+ cfi_offset(r27,-40)
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
+ in the 1st rSTR1 DW. */
+ sub r27,rSTR2,rBITDIF
+/* But do not attempt to address the DW before the DW that contains
+ the actual start of rSTR2. */
+ clrrdi rSTR2,rSTR2,3
+ std r26,-48(r1)
+ cfi_offset(r26,-48)
+/* Compute the left/right shift counts for the unaligned rSTR2,
+ compensating for the logical (DW aligned) start of rSTR1. */
+ clrldi rSHL,r27,61
+ clrrdi rSTR1,rSTR1,3
+ std r25,-56(r1)
+ cfi_offset(r25,-56)
+ sldi rSHL,rSHL,3
+ cmpld cr5,r27,rSTR2
+ add rN,rN,rBITDIF
+ sldi r11,rBITDIF,3
+ std r24,-64(r1)
+ cfi_offset(r24,-64)
+ subfic rSHR,rSHL,64
+ srdi rTMP,rN,5 /* Divide by 32 */
+ andi. rBITDIF,rN,24 /* Get the DW remainder */
+/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
+ this special case those bits may be discarded anyway. Also we
+ must avoid loading a DW where none of the bits are part of rSTR2 as
+ this may cross a page boundary and cause a page fault. */
+ li rWORD8,0
+ blt cr5,L(dus0)
+ ld rWORD8,0(rSTR2)
+ la rSTR2,8(rSTR2)
+ sld rWORD8,rWORD8,rSHL
+
+L(dus0):
+ ld rWORD1,0(rSTR1)
+ ld rWORD2,0(rSTR2)
+ cmpldi cr1,rBITDIF,16
+ cmpldi cr7,rN,32
+ srd rG,rWORD2,rSHR
+ clrldi rN,rN,61
+ beq L(duPs4)
+ mtctr rTMP
+ or rWORD8,rG,rWORD8
+ bgt cr1,L(duPs3)
+ beq cr1,L(duPs2)
+
+/* Remainder is 8 */
+ .align 4
+L(dusP1):
+ sld rB,rWORD2,rSHL
+ sld rWORD7,rWORD1,r11
+ sld rWORD8,rWORD8,r11
+ bge cr7,L(duP1e)
+/* At this point we exit early with the first double word compare
+ complete and remainder of 0 to 7 bytes. See L(du14) for details on
+ how we handle the remaining bytes. */
+ cmpld cr5,rWORD7,rWORD8
+ sldi. rN,rN,3
+ bne cr5,L(duLcr5)
+ cmpld cr7,rN,rSHR
+ beq L(duZeroReturn)
+ li rA,0
+ ble cr7,L(dutrim)
+ ld rWORD2,8(rSTR2)
+ srd rA,rWORD2,rSHR
+ b L(dutrim)
+/* Remainder is 16 */
+ .align 4
+L(duPs2):
+ sld rH,rWORD2,rSHL
+ sld rWORD5,rWORD1,r11
+ sld rWORD6,rWORD8,r11
+ b L(duP2e)
+/* Remainder is 24 */
+ .align 4
+L(duPs3):
+ sld rF,rWORD2,rSHL
+ sld rWORD3,rWORD1,r11
+ sld rWORD4,rWORD8,r11
+ b L(duP3e)
+/* Count is a multiple of 32, remainder is 0 */
+ .align 4
+L(duPs4):
+ mtctr rTMP
+ or rWORD8,rG,rWORD8
+ sld rD,rWORD2,rSHL
+ sld rWORD1,rWORD1,r11
+ sld rWORD2,rWORD8,r11
+ b L(duP4e)
+
+/* At this point we know rSTR1 is double word aligned and the
+ compare length is at least 8 bytes. */
+ .align 4
+L(DWunaligned):
+ std r27,-40(r1)
+ cfi_offset(r27,-40)
+ clrrdi rSTR2,rSTR2,3
+ std r26,-48(r1)
+ cfi_offset(r26,-48)
+ srdi rTMP,rN,5 /* Divide by 32 */
+ std r25,-56(r1)
+ cfi_offset(r25,-56)
+ andi. rBITDIF,rN,24 /* Get the DW remainder */
+ std r24,-64(r1)
+ cfi_offset(r24,-64)
+ sldi rSHL,rSHL,3
+ ld rWORD6,0(rSTR2)
+ ldu rWORD8,8(rSTR2)
+ cmpldi cr1,rBITDIF,16
+ cmpldi cr7,rN,32
+ clrldi rN,rN,61
+ subfic rSHR,rSHL,64
+ sld rH,rWORD6,rSHL
+ beq L(duP4)
+ mtctr rTMP
+ bgt cr1,L(duP3)
+ beq cr1,L(duP2)
+
+/* Remainder is 8 */
+ .align 4
+L(duP1):
+ srd rG,rWORD8,rSHR
+ ld rWORD7,0(rSTR1)
+ sld rB,rWORD8,rSHL
+ or rWORD8,rG,rH
+ blt cr7,L(duP1x)
+L(duP1e):
+ ld rWORD1,8(rSTR1)
+ ld rWORD2,8(rSTR2)
+ cmpld cr5,rWORD7,rWORD8
+ srd rA,rWORD2,rSHR
+ sld rD,rWORD2,rSHL
+ or rWORD2,rA,rB
+ ld rWORD3,16(rSTR1)
+ ld rWORD4,16(rSTR2)
+ cmpld cr0,rWORD1,rWORD2
+ srd rC,rWORD4,rSHR
+ sld rF,rWORD4,rSHL
+ bne cr5,L(duLcr5)
+ or rWORD4,rC,rD
+ ld rWORD5,24(rSTR1)
+ ld rWORD6,24(rSTR2)
+ cmpld cr1,rWORD3,rWORD4
+ srd rE,rWORD6,rSHR
+ sld rH,rWORD6,rSHL
+ bne cr0,L(duLcr0)
+ or rWORD6,rE,rF
+ cmpld cr6,rWORD5,rWORD6
+ b L(duLoop3)
+ .align 4
+/* At this point we exit early with the first double word compare
+ complete and remainder of 0 to 7 bytes. See L(du14) for details on
+ how we handle the remaining bytes. */
+L(duP1x):
+ cmpld cr5,rWORD7,rWORD8
+ sldi. rN,rN,3
+ bne cr5,L(duLcr5)
+ cmpld cr7,rN,rSHR
+ beq L(duZeroReturn)
+ li rA,0
+ ble cr7,L(dutrim)
+ ld rWORD2,8(rSTR2)
+ srd rA,rWORD2,rSHR
+ b L(dutrim)
+/* Remainder is 16 */
+ .align 4
+L(duP2):
+ srd rE,rWORD8,rSHR
+ ld rWORD5,0(rSTR1)
+ or rWORD6,rE,rH
+ sld rH,rWORD8,rSHL
+L(duP2e):
+ ld rWORD7,8(rSTR1)
+ ld rWORD8,8(rSTR2)
+ cmpld cr6,rWORD5,rWORD6
+ srd rG,rWORD8,rSHR
+ sld rB,rWORD8,rSHL
+ or rWORD8,rG,rH
+ blt cr7,L(duP2x)
+ ld rWORD1,16(rSTR1)
+ ld rWORD2,16(rSTR2)
+ cmpld cr5,rWORD7,rWORD8
+ bne cr6,L(duLcr6)
+ srd rA,rWORD2,rSHR
+ sld rD,rWORD2,rSHL
+ or rWORD2,rA,rB
+ ld rWORD3,24(rSTR1)
+ ld rWORD4,24(rSTR2)
+ cmpld cr0,rWORD1,rWORD2
+ bne cr5,L(duLcr5)
+ srd rC,rWORD4,rSHR
+ sld rF,rWORD4,rSHL
+ or rWORD4,rC,rD
+ addi rSTR1,rSTR1,8
+ addi rSTR2,rSTR2,8
+ cmpld cr1,rWORD3,rWORD4
+ b L(duLoop2)
+ .align 4
+L(duP2x):
+ cmpld cr5,rWORD7,rWORD8
+ addi rSTR1,rSTR1,8
+ addi rSTR2,rSTR2,8
+ bne cr6,L(duLcr6)
+ sldi. rN,rN,3
+ bne cr5,L(duLcr5)
+ cmpld cr7,rN,rSHR
+ beq L(duZeroReturn)
+ li rA,0
+ ble cr7,L(dutrim)
+ ld rWORD2,8(rSTR2)
+ srd rA,rWORD2,rSHR
+ b L(dutrim)
+
+/* Remainder is 24 */
+ .align 4
+L(duP3):
+ srd rC,rWORD8,rSHR
+ ld rWORD3,0(rSTR1)
+ sld rF,rWORD8,rSHL
+ or rWORD4,rC,rH
+L(duP3e):
+ ld rWORD5,8(rSTR1)
+ ld rWORD6,8(rSTR2)
+ cmpld cr1,rWORD3,rWORD4
+ srd rE,rWORD6,rSHR
+ sld rH,rWORD6,rSHL
+ or rWORD6,rE,rF
+ ld rWORD7,16(rSTR1)
+ ld rWORD8,16(rSTR2)
+ cmpld cr6,rWORD5,rWORD6
+ bne cr1,L(duLcr1)
+ srd rG,rWORD8,rSHR
+ sld rB,rWORD8,rSHL
+ or rWORD8,rG,rH
+ blt cr7,L(duP3x)
+ ld rWORD1,24(rSTR1)
+ ld rWORD2,24(rSTR2)
+ cmpld cr5,rWORD7,rWORD8
+ bne cr6,L(duLcr6)
+ srd rA,rWORD2,rSHR
+ sld rD,rWORD2,rSHL
+ or rWORD2,rA,rB
+ addi rSTR1,rSTR1,16
+ addi rSTR2,rSTR2,16
+ cmpld cr0,rWORD1,rWORD2
+ b L(duLoop1)
+ .align 4
+L(duP3x):
+ addi rSTR1,rSTR1,16
+ addi rSTR2,rSTR2,16
+ bne cr1,L(duLcr1)
+ cmpld cr5,rWORD7,rWORD8
+ bne cr6,L(duLcr6)
+ sldi. rN,rN,3
+ bne cr5,L(duLcr5)
+ cmpld cr7,rN,rSHR
+ beq L(duZeroReturn)
+ li rA,0
+ ble cr7,L(dutrim)
+ ld rWORD2,8(rSTR2)
+ srd rA,rWORD2,rSHR
+ b L(dutrim)
+
+/* Count is a multiple of 32, remainder is 0 */
+ .align 4
+L(duP4):
+ mtctr rTMP
+ srd rA,rWORD8,rSHR
+ ld rWORD1,0(rSTR1)
+ sld rD,rWORD8,rSHL
+ or rWORD2,rA,rH
+L(duP4e):
+ ld rWORD3,8(rSTR1)
+ ld rWORD4,8(rSTR2)
+ cmpld cr0,rWORD1,rWORD2
+ srd rC,rWORD4,rSHR
+ sld rF,rWORD4,rSHL
+ or rWORD4,rC,rD
+ ld rWORD5,16(rSTR1)
+ ld rWORD6,16(rSTR2)
+ cmpld cr1,rWORD3,rWORD4
+ bne cr0,L(duLcr0)
+ srd rE,rWORD6,rSHR
+ sld rH,rWORD6,rSHL
+ or rWORD6,rE,rF
+ ldu rWORD7,24(rSTR1)
+ ldu rWORD8,24(rSTR2)
+ cmpld cr6,rWORD5,rWORD6
+ bne cr1,L(duLcr1)
+ srd rG,rWORD8,rSHR
+ sld rB,rWORD8,rSHL
+ or rWORD8,rG,rH
+ cmpld cr5,rWORD7,rWORD8
+ bdz L(du24) /* Adjust CTR as we start with +4 */
+/* This is the primary loop */
+ .align 4
+L(duLoop):
+ ld rWORD1,8(rSTR1)
+ ld rWORD2,8(rSTR2)
+ cmpld cr1,rWORD3,rWORD4
+ bne cr6,L(duLcr6)
+ srd rA,rWORD2,rSHR
+ sld rD,rWORD2,rSHL
+ or rWORD2,rA,rB
+L(duLoop1):
+ ld rWORD3,16(rSTR1)
+ ld rWORD4,16(rSTR2)
+ cmpld cr6,rWORD5,rWORD6
+ bne cr5,L(duLcr5)
+ srd rC,rWORD4,rSHR
+ sld rF,rWORD4,rSHL
+ or rWORD4,rC,rD
+L(duLoop2):
+ ld rWORD5,24(rSTR1)
+ ld rWORD6,24(rSTR2)
+ cmpld cr5,rWORD7,rWORD8
+ bne cr0,L(duLcr0)
+ srd rE,rWORD6,rSHR
+ sld rH,rWORD6,rSHL
+ or rWORD6,rE,rF
+L(duLoop3):
+ ldu rWORD7,32(rSTR1)
+ ldu rWORD8,32(rSTR2)
+ cmpld cr0,rWORD1,rWORD2
+ bne- cr1,L(duLcr1)
+ srd rG,rWORD8,rSHR
+ sld rB,rWORD8,rSHL
+ or rWORD8,rG,rH
+ bdnz L(duLoop)
+
+L(duL4):
+ bne cr1,L(duLcr1)
+ cmpld cr1,rWORD3,rWORD4
+ bne cr6,L(duLcr6)
+ cmpld cr6,rWORD5,rWORD6
+ bne cr5,L(duLcr5)
+ cmpld cr5,rWORD7,rWORD8
+L(du44):
+ bne cr0,L(duLcr0)
+L(du34):
+ bne cr1,L(duLcr1)
+L(du24):
+ bne cr6,L(duLcr6)
+L(du14):
+ sldi. rN,rN,3
+ bne cr5,L(duLcr5)
+/* At this point we have a remainder of 1 to 7 bytes to compare. We use
+ shift right double to eliminate bits beyond the compare length.
+ This allows the use of double word subtract to compute the final
+ result.
+
+ However it may not be safe to load rWORD2 which may be beyond the
+ string length. So we compare the bit length of the remainder to
+ the right shift count (rSHR). If the bit count is less than or equal
+ we do not need to load rWORD2 (all significant bits are already in
+ rB). */
+ cmpld cr7,rN,rSHR
+ beq L(duZeroReturn)
+ li rA,0
+ ble cr7,L(dutrim)
+ ld rWORD2,8(rSTR2)
+ srd rA,rWORD2,rSHR
+ .align 4
+L(dutrim):
+ ld rWORD1,8(rSTR1)
+ ld rWORD8,-8(r1)
+ subfic rN,rN,64 /* Shift count is 64 - (rN * 8). */
+ or rWORD2,rA,rB
+ ld rWORD7,-16(r1)
+ ld r29,-24(r1)
+ srd rWORD1,rWORD1,rN
+ srd rWORD2,rWORD2,rN
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+ li rRTN,0
+ cmpld cr0,rWORD1,rWORD2
+ ld r26,-48(r1)
+ ld r25,-56(r1)
+ beq cr0,L(dureturn24)
+ li rRTN,1
+ ld r24,-64(r1)
+ bgtlr cr0
+ li rRTN,-1
+ blr
+ .align 4
+L(duLcr0):
+ ld rWORD8,-8(r1)
+ ld rWORD7,-16(r1)
+ li rRTN,1
+ bgt cr0,L(dureturn29)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ li rRTN,-1
+ b L(dureturn27)
+ .align 4
+L(duLcr1):
+ ld rWORD8,-8(r1)
+ ld rWORD7,-16(r1)
+ li rRTN,1
+ bgt cr1,L(dureturn29)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ li rRTN,-1
+ b L(dureturn27)
+ .align 4
+L(duLcr6):
+ ld rWORD8,-8(r1)
+ ld rWORD7,-16(r1)
+ li rRTN,1
+ bgt cr6,L(dureturn29)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ li rRTN,-1
+ b L(dureturn27)
+ .align 4
+L(duLcr5):
+ ld rWORD8,-8(r1)
+ ld rWORD7,-16(r1)
+ li rRTN,1
+ bgt cr5,L(dureturn29)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ li rRTN,-1
+ b L(dureturn27)
+ .align 3
+L(duZeroReturn):
+ li rRTN,0
+ .align 4
+L(dureturn):
+ ld rWORD8,-8(r1)
+ ld rWORD7,-16(r1)
+L(dureturn29):
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+L(dureturn27):
+ ld r27,-40(r1)
+L(dureturn26):
+ ld r26,-48(r1)
+L(dureturn25):
+ ld r25,-56(r1)
+L(dureturn24):
+ ld r24,-64(r1)
+ blr
+L(duzeroLength):
+ li rRTN,0
+ blr
+
+END (BP_SYM (memcmp))
+libc_hidden_builtin_def (memcmp)
+weak_alias (memcmp,bcmp)
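
A hedged C sketch of the L(d00)/L(dutrim) tail handling (names illustrative;
big-endian layout assumed, as on POWER7): with 1~7 bytes left and the inputs
aligned or already merged, the whole doubleword is loaded and both operands
are shifted right by 64 - 8*remaining bits, so the bytes beyond the compare
length fall away and one unsigned compare decides the result:

#include <stdint.h>

static int
tail_compare (uint64_t w1, uint64_t w2, unsigned remaining) /* 1..7 */
{
  unsigned shift = 64 - remaining * 8;  /* subfic rN,r12,64 */
  w1 >>= shift;                         /* srd rWORD1,rWORD1,rN */
  w2 >>= shift;                         /* srd rWORD2,rWORD2,rN */
  if (w1 == w2)
    return 0;
  return w1 > w2 ? 1 : -1;              /* li 1 / bgtlr / li -1 */
}
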
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
new file mode 100644
index 0000000..2e5beed
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -0,0 +1,449 @@
+/* Optimized memcpy implementation for PowerPC64/POWER7.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+ 02110-1301 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+
+/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+ Returns 'dst'. */
+
+ .machine power7
+EALIGN (BP_SYM (memcpy), 5, 0)
+ CALL_MCOUNT 3
+
+ cmpldi cr1,5,31
+ neg 0,3
+ std 3,-16(1)
+ std 31,-8(1)
+ cfi_offset(31,-8)
+ ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
+ code. */
+
+ andi. 11,3,7 /* Check alignment of DST. */
+
+
+ clrldi 10,4,61 /* Check alignment of SRC. */
+ cmpld cr6,10,11 /* SRC and DST alignments match? */
+ mr 12,4
+ mr 31,5
+ bne cr6,L(copy_GE_32_unaligned)
+
+ srdi 9,5,3 /* Number of full doublewords remaining. */
+
+ beq L(copy_GE_32_aligned_cont)
+
+ clrldi 0,0,61
+ mtcrf 0x01,0
+ subf 31,0,5
+
+ /* Get the SRC aligned to 8 bytes. */
+
+1: bf 31,2f
+ lbz 6,0(12)
+ addi 12,12,1
+ stb 6,0(3)
+ addi 3,3,1
+2: bf 30,4f
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+4: bf 29,0f
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+0:
+ clrldi 10,12,61 /* Check alignment of SRC again. */
+ srdi 9,31,3 /* Number of full doublewords remaining. */
+
+L(copy_GE_32_aligned_cont):
+
+ clrldi 11,31,61
+ mtcrf 0x01,9
+
+ srdi 8,31,5
+ cmpldi cr1,9,4
+ cmpldi cr6,11,0
+ mr 11,12
+
+ /* Copy 1~3 doublewords so the main loop starts
+ at a multiple of 32 bytes. */
+
+ bf 30,1f
+ ld 6,0(12)
+ ld 7,8(12)
+ addi 11,12,16
+ mtctr 8
+ std 6,0(3)
+ std 7,8(3)
+ addi 10,3,16
+ bf 31,4f
+ ld 0,16(12)
+ std 0,16(3)
+ blt cr1,3f
+ addi 11,12,24
+ addi 10,3,24
+ b 4f
+
+ .align 4
+1: /* Copy 1 doubleword and set the counter. */
+ mr 10,3
+ mtctr 8
+ bf 31,4f
+ ld 6,0(12)
+ addi 11,12,8
+ std 6,0(3)
+ addi 10,3,8
+
+ /* Main aligned copy loop. Copies 32-bytes at a time. */
+ .align 4
+4:
+ ld 6,0(11)
+ ld 7,8(11)
+ ld 8,16(11)
+ ld 0,24(11)
+ addi 11,11,32
+
+ std 6,0(10)
+ std 7,8(10)
+ std 8,16(10)
+ std 0,24(10)
+ addi 10,10,32
+ bdnz 4b
+3:
+
+ /* Check for tail bytes. */
+ rldicr 0,31,0,60
+ mtcrf 0x01,31
+ beq cr6,0f
+
+.L9:
+ add 3,3,0
+ add 12,12,0
+
+ /* At this point we have a tail of 0-7 bytes and we know that the
+ destination is doubleword-aligned. */
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+2: /* Copy 2 bytes. */
+ bf 30,1f
+
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+1: /* Copy 1 byte. */
+ bf 31,0f
+
+ lbz 6,0(12)
+ stb 6,0(3)
+0: /* Return original DST pointer. */
+ ld 31,-8(1)
+ ld 3,-16(1)
+ blr
+
+ /* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ cmpldi cr6,5,8
+ mr 12,4
+ mtcrf 0x01,5
+ ble cr6,L(copy_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg 8,4
+ clrrdi 11,4,2
+ andi. 0,8,3
+ cmpldi cr1,5,16
+ mr 10,5
+ beq L(copy_LT_32_aligned)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,0
+ subf 10,0,5
+2: bf 30,1f
+
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+1: bf 31,L(end_4bytes_alignment)
+
+ lbz 6,0(12)
+ addi 12,12,1
+ stb 6,0(3)
+ addi 3,3,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,10,16
+ mtcrf 0x01,10
+
+L(copy_LT_32_aligned):
+ /* At least 6 bytes to go, and SRC is word-aligned. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ lwz 6,0(12)
+ lwz 7,4(12)
+ stw 6,0(3)
+ lwz 8,8(12)
+ stw 7,4(3)
+ lwz 6,12(12)
+ addi 12,12,16
+ stw 8,8(3)
+ stw 6,12(3)
+ addi 3,3,16
+8: /* Copy 8 bytes. */
+ bf 28,4f
+
+ lwz 6,0(12)
+ lwz 7,4(12)
+ addi 12,12,8
+ stw 6,0(3)
+ stw 7,4(3)
+ addi 3,3,8
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+2: /* Copy 2-3 bytes. */
+ bf 30,1f
+
+ lhz 6,0(12)
+ sth 6,0(3)
+ bf 31,0f
+ lbz 7,2(12)
+ stb 7,2(3)
+ ld 3,-16(1)
+ blr
+
+ .align 4
+1: /* Copy 1 byte. */
+ bf 31,0f
+
+ lbz 6,0(12)
+ stb 6,0(3)
+0: /* Return original DST pointer. */
+ ld 3,-16(1)
+ blr
+
+ /* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,4f
+
+ /* Though we could've used ld/std here, they are still
+ slow for unaligned cases. */
+
+ lwz 6,0(4)
+ lwz 7,4(4)
+ stw 6,0(3)
+ stw 7,4(3)
+ ld 3,-16(1) /* Return original DST pointer. */
+ blr
+
+ .align 4
+4: /* Copies 4~7 bytes. */
+ bf 29,2b
+
+ lwz 6,0(4)
+ stw 6,0(3)
+ bf 30,5f
+ lhz 7,4(4)
+ sth 7,4(3)
+ bf 31,0f
+ lbz 8,6(4)
+ stb 8,6(3)
+ ld 3,-16(1)
+ blr
+
+ .align 4
+5: /* Copy 1 byte. */
+ bf 31,0f
+
+ lbz 6,4(4)
+ stb 6,4(3)
+
+0: /* Return original DST pointer. */
+ ld 3,-16(1)
+ blr
+
+ /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+ SRC is not. Use aligned quadword loads from SRC, shifted to realign
+ the data, allowing for aligned DST stores. */
+ .align 4
+L(copy_GE_32_unaligned):
+ clrldi 0,0,60 /* Number of bytes until the 1st
+ quadword. */
+ andi. 11,3,15 /* Check alignment of DST (against
+ quadwords). */
+ srdi 9,5,4 /* Number of full quadwords remaining. */
+
+ beq L(copy_GE_32_unaligned_cont)
+
+ /* DST is not quadword aligned, get it aligned. */
+
+ mtcrf 0x01,0
+ subf 31,0,5
+
+ /* Vector instructions work best when proper alignment (16 bytes)
+ is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
+1: /* Copy 1 byte. */
+ bf 31,2f
+
+ lbz 6,0(12)
+ addi 12,12,1
+ stb 6,0(3)
+ addi 3,3,1
+2: /* Copy 2 bytes. */
+ bf 30,4f
+
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+4: /* Copy 4 bytes. */
+ bf 29,8f
+
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+8: /* Copy 8 bytes. */
+ bf 28,0f
+
+ ld 6,0(12)
+ addi 12,12,8
+ std 6,0(3)
+ addi 3,3,8
+0:
+ clrldi 10,12,60 /* Check alignment of SRC. */
+ srdi 9,31,4 /* Number of full quadwords remaining. */
+
+ /* The proper alignment is present; it is OK to copy the bytes now. */
+L(copy_GE_32_unaligned_cont):
+
+ /* Set up two indexes to speed up the indexed vector operations. */
+ clrldi 11,31,60
+ li 6,16 /* Index for 16-bytes offsets. */
+ li 7,32 /* Index for 32-bytes offsets. */
+ cmpldi cr1,11,0
+ srdi 8,31,5 /* Set up the loop counter. */
+ mr 10,3
+ mr 11,12
+ mtcrf 0x01,9
+ cmpldi cr6,9,1
+ lvsl 5,0,12
+ lvx 3,0,12
+ bf 31,L(setup_unaligned_loop)
+
+ /* Copy another 16 bytes to align to 32 bytes for the loop. */
+ lvx 4,12,6
+ vperm 6,3,4,5
+ addi 11,12,16
+ addi 10,3,16
+ stvx 6,0,3
+ vor 3,4,4
+
+L(setup_unaligned_loop):
+ mtctr 8
+ ble cr6,L(end_unaligned_loop)
+
+ /* Copy 32 bytes at a time using vector instructions. */
+ .align 4
+L(unaligned_loop):
+
+ /* Note: vr6/vr10 may contain data that was already copied,
+ but in order to get proper alignment, we may have to copy
+ some portions again. This is faster than using unaligned
+ vector loads, though. */
+
+ lvx 4,11,6 /* vr4 = r11+16. */
+ vperm 6,3,4,5 /* Merge the correctly-aligned portions
+ of vr3/vr4 into vr6. */
+ lvx 3,11,7 /* vr3 = r11+32. */
+ vperm 10,4,3,5 /* Merge the correctly-aligned portions
+ of vr4/vr3 into vr10. */
+ addi 11,11,32
+ stvx 6,0,10
+ stvx 10,10,6
+ addi 10,10,32
+
+ bdnz L(unaligned_loop)
+
+ .align 4
+L(end_unaligned_loop):
+
+ /* Check for tail bytes. */
+ rldicr 0,31,0,59
+ mtcrf 0x01,31
+ beq cr1,0f
+
+ add 3,3,0
+ add 12,12,0
+
+ /* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
+8: /* Copy 8 bytes. */
+ bf 28,4f
+
+ lwz 6,0(12)
+ lwz 7,4(12)
+ addi 12,12,8
+ stw 6,0(3)
+ stw 7,4(3)
+ addi 3,3,8
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ lwz 6,0(12)
+ addi 12,12,4
+ stw 6,0(3)
+ addi 3,3,4
+2: /* Copy 2~3 bytes. */
+ bf 30,1f
+
+ lhz 6,0(12)
+ addi 12,12,2
+ sth 6,0(3)
+ addi 3,3,2
+1: /* Copy 1 byte. */
+ bf 31,0f
+
+ lbz 6,0(12)
+ stb 6,0(3)
+0: /* Return original DST pointer. */
+ ld 31,-8(1)
+ ld 3,-16(1)
+ blr
+
+END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
+libc_hidden_builtin_def (memcpy)
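
The unaligned loop above is, in effect, a 16-byte-wide shift-and-merge; as a
scalar analogy (a sketch under the stated assumptions, not the vector code),
only aligned loads touch memory and each output chunk is merged from the two
aligned chunks that straddle it, with vperm playing the role of the shift/OR
pair and the lvsl mask encoding the misalignment. Big-endian layout and a
misalignment of 1~7 bytes are assumed; the names are hypothetical:

#include <stddef.h>
#include <stdint.h>

static void
copy_merged (uint64_t *dst, const uint64_t *src_aligned,
             unsigned misalign /* 1..7 bytes */, size_t ndw)
{
  unsigned ls = misalign * 8;   /* bits to drop from chunk i */
  unsigned rs = 64 - ls;        /* bits to take from chunk i+1 */
  uint64_t prev = src_aligned[0];
  for (size_t i = 0; i < ndw; i++)
    {
      uint64_t next = src_aligned[i + 1];
      dst[i] = (prev << ls) | (next >> rs); /* vperm 6,3,4,5 */
      prev = next;                          /* vor 3,4,4 */
    }
}
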
diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S
new file mode 100644
index 0000000..02a9eed
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/memset.S
@@ -0,0 +1,398 @@
+/* Optimized memset implementation for PowerPC64/POWER7.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Luis Machado <luisgpm@br.ibm.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+ 02110-1301 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+ Returns 's'. */
+
+ .machine power7
+EALIGN (BP_SYM (memset), 5, 0)
+ CALL_MCOUNT 3
+
+L(_memset):
+ cmpldi cr7,5,31
+ cmpldi cr6,5,8
+ mr 10,3
+
+ /* Replicate byte to word. */
+ rlwimi 4,4,8,16,23
+ rlwimi 4,4,16,0,15
+ ble cr6,L(small) /* If length <= 8, use short copy code. */
+
+ neg 0,3
+ ble cr7,L(medium) /* If length < 32, use medium copy code. */
+
+ andi. 11,10,7 /* Check alignment of DST. */
+ insrdi 4,4,32,0 /* Replicate word to double word. */
+
+ mr 12,5
+ beq L(big_aligned)
+
+ clrldi 0,0,61
+ mtocrf 0x01,0
+ subf 5,0,5
+
+ /* Get DST aligned to 8 bytes. */
+1: bf 31,2f
+
+ stb 4,0(10)
+ addi 10,10,1
+2: bf 30,4f
+
+ sth 4,0(10)
+ addi 10,10,2
+4: bf 29,L(big_aligned)
+
+ stw 4,0(10)
+ addi 10,10,4
+
+ .align 4
+L(big_aligned):
+
+ cmpldi cr5,5,255
+ li 0,32
+ dcbtst 0,10
+ cmpldi cr6,4,0
+ srdi 9,5,3 /* Number of full doublewords remaining. */
+ crand 27,26,21
+ mtocrf 0x01,9
+ bt 27,L(huge)
+
+ /* From this point on, we'll copy 32+ bytes and the value
+ isn't 0 (so we can't use dcbz). */
+
+ srdi 8,5,5
+ clrldi 11,5,61
+ cmpldi cr6,11,0
+ cmpldi cr1,9,4
+ mtctr 8
+
+ /* Copy 1~3 doublewords so the main loop starts
+ at a multiple of 32 bytes. */
+
+ bf 30,1f
+
+ std 4,0(10)
+ std 4,8(10)
+ addi 10,10,16
+ bf 31,L(big_loop)
+
+ std 4,0(10)
+ addi 10,10,8
+ mr 12,10
+ blt cr1,L(tail_bytes)
+ b L(big_loop)
+
+ .align 4
+1: /* Copy 1 doubleword. */
+ bf 31,L(big_loop)
+
+ std 4,0(10)
+ addi 10,10,8
+
+ /* Main aligned copy loop. Copies 32 bytes at a time and
+ ping-pong through r10 and r12 to avoid AGEN delays. */
+ .align 4
+L(big_loop):
+ addi 12,10,32
+ std 4,0(10)
+ std 4,8(10)
+ std 4,16(10)
+ std 4,24(10)
+ bdz L(tail_bytes)
+
+ addi 10,10,64
+ std 4,0(12)
+ std 4,8(12)
+ std 4,16(12)
+ std 4,24(12)
+ bdnz L(big_loop)
+
+ mr 12,10
+ b L(tail_bytes)
+
+ .align 4
+L(tail_bytes):
+
+ /* Check for tail bytes. */
+ beqlr cr6
+
+ clrldi 0,5,61
+ mtocrf 0x01,0
+
+ /* At this point we have a tail of 0-7 bytes and we know that the
+ destination is doubleword-aligned. */
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ stw 4,0(12)
+ addi 12,12,4
+2: /* Copy 2 bytes. */
+ bf 30,1f
+
+ sth 4,0(12)
+ addi 12,12,2
+1: /* Copy 1 byte. */
+ bflr 31
+
+ stb 4,0(12)
+ blr
+
+ /* Special case when value is 0 and we have a long length to deal
+ with. Use dcbz to zero out 128 bytes at a time. Before using
+ dcbz though, we need to get the destination 128-byte aligned. */
+ .align 4
+L(huge):
+ andi. 11,10,127
+ neg 0,10
+ beq L(huge_aligned)
+
+ clrldi 0,0,57
+ subf 5,0,5
+ srdi 0,0,3
+ mtocrf 0x01,0
+
+ /* Get DST aligned to 128 bytes. */
+8: bf 28,4f
+
+ std 4,0(10)
+ std 4,8(10)
+ std 4,16(10)
+ std 4,24(10)
+ std 4,32(10)
+ std 4,40(10)
+ std 4,48(10)
+ std 4,56(10)
+ addi 10,10,64
+ .align 4
+4: bf 29,2f
+
+ std 4,0(10)
+ std 4,8(10)
+ std 4,16(10)
+ std 4,24(10)
+ addi 10,10,32
+ .align 4
+2: bf 30,1f
+
+ std 4,0(10)
+ std 4,8(10)
+ addi 10,10,16
+ .align 4
+1: bf 31,L(huge_aligned)
+
+ std 4,0(10)
+ addi 10,10,8
+
+
+L(huge_aligned):
+ srdi 8,5,7
+ clrldi 11,5,57
+ cmpldi cr6,11,0
+ mtctr 8
+
+ .align 4
+L(huge_loop):
+ dcbz 0,10
+ addi 10,10,128
+ bdnz L(huge_loop)
+
+ /* Check how many bytes are still left. */
+ beqlr cr6
+
+ subf 9,3,10
+ subf 5,9,12
+ srdi 8,5,3
+ cmpldi cr6,8,0
+ mtocrf 0x01,8
+
+ /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
+ speed. We'll handle the resulting tail bytes later. */
+ beq cr6,L(tail)
+
+8: bf 28,4f
+
+ std 4,0(10)
+ std 4,8(10)
+ std 4,16(10)
+ std 4,24(10)
+ std 4,32(10)
+ std 4,40(10)
+ std 4,48(10)
+ std 4,56(10)
+ addi 10,10,64
+ .align 4
+4: bf 29,2f
+
+ std 4,0(10)
+ std 4,8(10)
+ std 4,16(10)
+ std 4,24(10)
+ addi 10,10,32
+ .align 4
+2: bf 30,1f
+
+ std 4,0(10)
+ std 4,8(10)
+ addi 10,10,16
+ .align 4
+1: bf 31,L(tail)
+
+ std 4,0(10)
+ addi 10,10,8
+
+ /* Handle the rest of the tail bytes here. */
+L(tail):
+ mtocrf 0x01,5
+
+ .align 4
+4: bf 29,2f
+
+ stw 4,0(10)
+ addi 10,10,4
+ .align 4
+2: bf 30,1f
+
+ sth 4,0(10)
+ addi 10,10,2
+ .align 4
+1: bflr 31
+
+ stb 4,0(10)
+ blr
+
+ /* Expanded tree to copy tail bytes without increments. */
+ .align 4
+L(copy_tail):
+ bf 29,L(FXX)
+
+ stw 4,0(10)
+ bf 30,L(TFX)
+
+ sth 4,4(10)
+ bflr 31
+
+ stb 4,6(10)
+ blr
+
+ .align 4
+L(FXX): bf 30,L(FFX)
+
+ sth 4,0(10)
+ bflr 31
+
+ stb 4,2(10)
+ blr
+
+ .align 4
+L(TFX): bflr 31
+
+ stb 4,4(10)
+ blr
+
+ .align 4
+L(FFX): bflr 31
+
+ stb 4,0(10)
+ blr
+
+ /* Handle copies of 9~31 bytes. */
+ .align 4
+L(medium):
+ /* At least 9 bytes to go. */
+ andi. 11,10,3
+ clrldi 0,0,62
+ beq L(medium_aligned)
+
+ /* Force 4-byte alignment for DST. */
+ mtocrf 0x01,0
+ subf 5,0,5
+1: /* Copy 1 byte. */
+ bf 31,2f
+
+ stb 4,0(10)
+ addi 10,10,1
+2: /* Copy 2 bytes. */
+ bf 30,L(medium_aligned)
+
+ sth 4,0(10)
+ addi 10,10,2
+
+ .align 4
+L(medium_aligned):
+ /* At least 6 bytes to go, and DST is word-aligned. */
+ cmpldi cr1,5,16
+ mtocrf 0x01,5
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ stw 4,0(10)
+ stw 4,4(10)
+ stw 4,8(10)
+ stw 4,12(10)
+ addi 10,10,16
+8: /* Copy 8 bytes. */
+ bf 28,4f
+
+ stw 4,0(10)
+ stw 4,4(10)
+ addi 10,10,8
+4: /* Copy 4 bytes. */
+ bf 29,2f
+
+ stw 4,0(10)
+ addi 10,10,4
+2: /* Copy 2-3 bytes. */
+ bf 30,1f
+
+ sth 4,0(10)
+ addi 10,10,2
+1: /* Copy 1 byte. */
+ bflr 31
+
+ stb 4,0(10)
+ blr
+
+ /* Handles copies of 0~8 bytes. */
+ .align 4
+L(small):
+ mtocrf 0x01,5
+ bne cr6,L(copy_tail)
+
+ stw 4,0(10)
+ stw 4,4(10)
+ blr
+
+END_GEN_TB (BP_SYM (memset),TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+ between bzero and memset. */
+ENTRY (BP_SYM (__bzero))
+ CALL_MCOUNT 3
+ mr r5,r4
+ li r4,0
+ b L(_memset)
+END_GEN_TB (BP_SYM (__bzero),TB_TOCLESS)
+
+weak_alias (BP_SYM (__bzero), BP_SYM (bzero))
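
A hedged C equivalent of the fill-value setup at the top of memset (a sketch
only): the rlwimi pair replicates the byte through a word and insrdi doubles
that word into all 64 bits, which is what lets the body store 8 identical
bytes per std:

#include <stdint.h>

static uint64_t
replicate_byte (unsigned char c)
{
  uint64_t v = c;
  v |= v << 8;          /* rlwimi 4,4,8,16,23 */
  v |= v << 16;         /* rlwimi 4,4,16,0,15 */
  v |= v << 32;         /* insrdi 4,4,32,0    */
  return v;
}
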
diff --git a/sysdeps/powerpc/powerpc64/power7/strncmp.S b/sysdeps/powerpc/powerpc64/power7/strncmp.S
new file mode 100644
index 0000000..34f1e52
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strncmp.S
@@ -0,0 +1,181 @@
+/* Optimized strncmp implementation for POWER7/PowerPC64.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
+ 02110-1301 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+/* See strlen.s for comments on how the end-of-string testing works. */
+
+/* int [r3] strncmp (const char *s1 [r3],
+ const char *s2 [r4],
+ size_t size [r5]) */
+
+EALIGN (BP_SYM(strncmp),4,0)
+ CALL_MCOUNT 3
+
+#define rTMP r0
+#define rRTN r3
+#define rSTR1 r3 /* first string arg */
+#define rSTR2 r4 /* second string arg */
+#define rN r5 /* max string length */
+/* Note: The Bounded pointer support in this code is broken. This code
+ was inherited from PPC32 and that support was never completed.
+ Current PPC gcc does not support -fbounds-check or -fbounded-pointers. */
+#define rWORD1 r6 /* current word in s1 */
+#define rWORD2 r7 /* current word in s2 */
+#define rWORD3 r10
+#define rWORD4 r11
+#define rFEFE r8 /* constant 0xfefefefefefefeff (-0x0101010101010101) */
+#define r7F7F r9 /* constant 0x7f7f7f7f7f7f7f7f */
+#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
+#define rBITDIF r11 /* bits that differ in s1 & s2 words */
+
+ dcbt 0,rSTR1
+ or rTMP,rSTR2,rSTR1
+ lis r7F7F,0x7f7f
+ dcbt 0,rSTR2
+ clrldi. rTMP,rTMP,61
+ cmpldi cr1,rN,0
+ lis rFEFE,-0x101
+ bne L(unaligned)
+/* We are doubleword aligned so set up for two loops: first a double word
+ loop, then fall into the byte loop if there is any residual. */
+ srdi. rTMP,rN,3
+ clrldi rN,rN,61
+ addi rFEFE,rFEFE,-0x101
+ addi r7F7F,r7F7F,0x7f7f
+ cmpldi cr1,rN,0
+ beq L(unaligned)
+
+ mtctr rTMP
+ ld rWORD1,0(rSTR1)
+ ld rWORD2,0(rSTR2)
+ sldi rTMP,rFEFE,32
+ insrdi r7F7F,r7F7F,32,0
+ add rFEFE,rFEFE,rTMP
+ b L(g1)
+
+L(g0):
+ ldu rWORD1,8(rSTR1)
+ bne cr1,L(different)
+ ldu rWORD2,8(rSTR2)
+L(g1): add rTMP,rFEFE,rWORD1
+ nor rNEG,r7F7F,rWORD1
+ bdz L(tail)
+ and. rTMP,rTMP,rNEG
+ cmpd cr1,rWORD1,rWORD2
+ beq L(g0)
+
+/* OK. We've hit the end of the string. We need to be careful that
+ we don't compare two strings as different because of gunk beyond
+ the end of the strings... */
+
+L(endstring):
+ and rTMP,r7F7F,rWORD1
+ beq cr1,L(equal)
+ add rTMP,rTMP,r7F7F
+ xor. rBITDIF,rWORD1,rWORD2
+
+ andc rNEG,rNEG,rTMP
+ blt L(highbit)
+ cntlzd rBITDIF,rBITDIF
+ cntlzd rNEG,rNEG
+ addi rNEG,rNEG,7
+ cmpd cr1,rNEG,rBITDIF
+ sub rRTN,rWORD1,rWORD2
+ blt cr1,L(equal)
+ sradi rRTN,rRTN,63
+ ori rRTN,rRTN,1
+ blr
+L(equal):
+ li rRTN,0
+ blr
+
+L(different):
+ ldu rWORD1,-8(rSTR1)
+ xor. rBITDIF,rWORD1,rWORD2
+ sub rRTN,rWORD1,rWORD2
+ blt L(highbit)
+ sradi rRTN,rRTN,63
+ ori rRTN,rRTN,1
+ blr
+L(highbit):
+ srdi rWORD2,rWORD2,56
+ srdi rWORD1,rWORD1,56
+ sub rRTN,rWORD1,rWORD2
+ blr
+
+
+/* Oh well. In this case, we just do a byte-by-byte comparison. */
+ .align 4
+L(tail):
+ and. rTMP,rTMP,rNEG
+ cmpd cr1,rWORD1,rWORD2
+ bne L(endstring)
+ addi rSTR1,rSTR1,8
+ bne cr1,L(different)
+ addi rSTR2,rSTR2,8
+ cmpldi cr1,rN,0
+L(unaligned):
+ mtctr rN
+ ble cr1,L(ux)
+L(uz):
+ lbz rWORD1,0(rSTR1)
+ lbz rWORD2,0(rSTR2)
+ .align 4
+L(u1):
+ cmpdi cr1,rWORD1,0
+ bdz L(u4)
+ cmpd rWORD1,rWORD2
+ beq cr1,L(u4)
+ lbzu rWORD3,1(rSTR1)
+ lbzu rWORD4,1(rSTR2)
+ bne L(u4)
+ cmpdi cr1,rWORD3,0
+ bdz L(u3)
+ cmpd rWORD3,rWORD4
+ beq cr1,L(u3)
+ lbzu rWORD1,1(rSTR1)
+ lbzu rWORD2,1(rSTR2)
+ bne L(u3)
+ cmpdi cr1,rWORD1,0
+ bdz L(u4)
+ cmpd rWORD1,rWORD2
+ beq cr1,L(u4)
+ lbzu rWORD3,1(rSTR1)
+ lbzu rWORD4,1(rSTR2)
+ bne L(u4)
+ cmpdi cr1,rWORD3,0
+ bdz L(u3)
+ cmpd rWORD3,rWORD4
+ beq cr1,L(u3)
+ lbzu rWORD1,1(rSTR1)
+ lbzu rWORD2,1(rSTR2)
+ beq L(u1)
+
+L(u3): sub rRTN,rWORD3,rWORD4
+ blr
+L(u4): sub rRTN,rWORD1,rWORD2
+ blr
+L(ux):
+ li rRTN,0
+ blr
+END (BP_SYM (strncmp))
+libc_hidden_builtin_def (strncmp)
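
Two word-at-a-time bit tricks carry this strncmp. The main loop's
rFEFE/r7F7F test only proves that a zero byte exists somewhere in the
doubleword: the add of -0x0101010101010101 lets carries cross byte
boundaries, so its per-byte flags can smear onto nonzero bytes.
L(endstring) therefore recomputes an exact, carry-free zero mask
before deciding whether the first difference lies beyond the
terminator, which matters because the doubleword loads legitimately
pick up gunk past the end of a string. A C sketch of both tests,
assuming the big-endian byte order this port targets (the function
names are illustrative, not glibc interfaces):

    #include <stdint.h>

    /* Loop test: nonzero iff W contains a zero byte somewhere.
       (w - 0x0101..01) & ~w & 0x8080..80, with the subtraction done as
       an add of rFEFE and the mask as ~(w | r7F7F), as in the asm.  */
    static inline uint64_t
    has_zero_byte (uint64_t w)
    {
      return (w + 0xfefefefefefefeffULL) & ~(w | 0x7f7f7f7f7f7f7f7fULL);
    }

    /* Exact mask, as recomputed at L(endstring): adding 0x7f to the low
       seven bits of each byte cannot carry out of the byte, so the 0x80
       flag is set only for bytes that are exactly zero.  */
    static inline uint64_t
    zero_byte_mask (uint64_t w)
    {
      uint64_t t = (w & 0x7f7f7f7f7f7f7f7fULL) + 0x7f7f7f7f7f7f7f7fULL;
      return ~(w | 0x7f7f7f7f7f7f7f7fULL) & ~t;
    }

    /* L(endstring)'s decision: W1 is known to hold a terminator; the
       words are equal as strings iff the first differing bit falls past
       the end of the first zero byte.  __builtin_clzll stands in for
       cntlzd; smaller counts mean earlier (big-endian) positions.  */
    static inline int
    equal_up_to_terminator (uint64_t w1, uint64_t w2)
    {
      uint64_t diff = w1 ^ w2;
      if (diff == 0)
        return 1;
      return __builtin_clzll (zero_byte_mask (w1)) + 7
             < __builtin_clzll (diff);
    }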
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/970/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/970/Implies
new file mode 100644
index 0000000..0c01883
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/970/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/970/fpu
+powerpc/powerpc32/970
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/970/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/970/fpu/Implies
deleted file mode 100644
index 52993ae..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/970/fpu/Implies
+++ /dev/null
@@ -1,3 +0,0 @@
-# Make sure this comes before the powerpc/powerpc32/fpu that's
-# listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies.
-powerpc/powerpc32/970/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/Implies
new file mode 100644
index 0000000..29c49a4
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/cell/fpu
+powerpc/powerpc32/cell
diff --git a/sysdeps/powerpc/powerpc32/970/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power4/Implies
index 7c16e45..17139bf 100644
--- a/sysdeps/powerpc/powerpc32/970/fpu/Implies
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power4/Implies
@@ -1 +1,2 @@
powerpc/powerpc32/power4/fpu
+powerpc/powerpc32/power4
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power4/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power4/fpu/Implies
deleted file mode 100644
index 3c0690e..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power4/fpu/Implies
+++ /dev/null
@@ -1,3 +0,0 @@
-# Make sure this comes before the powerpc/powerpc32/fpu that's
-# listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies.
-powerpc/powerpc32/power4/fpu
diff --git a/sysdeps/powerpc/powerpc32/power6/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power5+/Implies
index 57b92fb..8e5b58a 100644
--- a/sysdeps/powerpc/powerpc32/power6/fpu/Implies
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power5+/Implies
@@ -1,2 +1,2 @@
powerpc/powerpc32/power5+/fpu
-powerpc/powerpc32/power4/fpu
+powerpc/powerpc32/power5+
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power5+/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power5+/fpu/Implies
deleted file mode 100644
index 37d43bb..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power5+/fpu/Implies
+++ /dev/null
@@ -1,3 +0,0 @@
-# Make sure this comes before the powerpc/powerpc32/fpu that's
-# listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies.
-powerpc/powerpc32/power5+/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power5/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power5/Implies
new file mode 100644
index 0000000..a51d2fd
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power5/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/power5/fpu
+powerpc/powerpc32/power5
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power5/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power5/fpu/Implies
deleted file mode 100644
index d379a2d..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power5/fpu/Implies
+++ /dev/null
@@ -1,3 +0,0 @@
-# Make sure this comes before the powerpc/powerpc32/fpu that's
-# listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies.
-powerpc/powerpc32/power5/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power6/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power6/Implies
new file mode 100644
index 0000000..c0e1bea
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power6/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/power6/fpu
+powerpc/powerpc32/power6
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power6/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power6/fpu/Implies
deleted file mode 100644
index 9fb457d..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power6/fpu/Implies
+++ /dev/null
@@ -1,3 +0,0 @@
-# Make sure this comes before the powerpc/powerpc32/fpu that's
-# listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies.
-powerpc/powerpc32/power6/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power6x/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power6x/Implies
new file mode 100644
index 0000000..bc9e74f
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power6x/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/power6x/fpu
+powerpc/powerpc32/power6x
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power6x/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power6x/fpu/Implies
deleted file mode 100644
index ffca13c..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power6x/fpu/Implies
+++ /dev/null
@@ -1,4 +0,0 @@
-# Make sure this comes before the powerpc/powerpc32/fpu that's
-# listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies.
-powerpc/powerpc32/power6x/fpu
-powerpc/powerpc32/power6/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/Implies
new file mode 100644
index 0000000..083f3e9
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/power7/fpu
+powerpc/powerpc32/power7
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies
deleted file mode 100644
index d379a2d..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/power7/fpu/Implies
+++ /dev/null
@@ -1,3 +0,0 @@
-# Make sure this comes before the powerpc/powerpc32/fpu that's
-# listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies.
-powerpc/powerpc32/power5/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/970/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/970/Implies
new file mode 100644
index 0000000..76a32ce
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/970/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc64/970/fpu
+powerpc/powerpc64/970
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/970/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/970/fpu/Implies
deleted file mode 100644
index 558a5fb..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/970/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power4/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/Implies
new file mode 100644
index 0000000..583d408
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc64/cell/fpu
+powerpc/powerpc64/cell
diff --git a/sysdeps/powerpc/powerpc64/970/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power4/Implies
index 558a5fb..bedb20b 100644
--- a/sysdeps/powerpc/powerpc64/970/fpu/Implies
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power4/Implies
@@ -1 +1,2 @@
powerpc/powerpc64/power4/fpu
+powerpc/powerpc64/power4
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power4/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power4/fpu/Implies
deleted file mode 100644
index 558a5fb..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power4/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power4/fpu
diff --git a/sysdeps/powerpc/powerpc64/power6/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power5+/Implies
index 885e39c..4c782d4 100644
--- a/sysdeps/powerpc/powerpc64/power6/fpu/Implies
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power5+/Implies
@@ -1,2 +1,2 @@
powerpc/powerpc64/power5+/fpu
-powerpc/powerpc64/power4/fpu
+powerpc/powerpc64/power5+
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power5+/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power5+/fpu/Implies
deleted file mode 100644
index cf5913d..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power5+/fpu/Implies
+++ /dev/null
@@ -1,3 +0,0 @@
-# Make sure this comes before the powerpc/powerpc64/fpu that's
-# listed in unix/sysv/linux/powerpc/powerpc64/fpu/Implies.
-powerpc/powerpc64/power5+/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power5/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power5/Implies
new file mode 100644
index 0000000..a01a13a
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power5/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc64/power5/fpu
+powerpc/powerpc64/power5
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power5/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power5/fpu/Implies
deleted file mode 100644
index 558a5fb..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power5/fpu/Implies
+++ /dev/null
@@ -1 +0,0 @@
-powerpc/powerpc64/power4/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power6/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power6/Implies
new file mode 100644
index 0000000..9d68f39
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power6/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc64/power6/fpu
+powerpc/powerpc64/power6
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power6/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power6/fpu/Implies
deleted file mode 100644
index 9451147..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power6/fpu/Implies
+++ /dev/null
@@ -1,3 +0,0 @@
-# Make sure this comes before the powerpc/powerpc64/fpu that's
-# listed in unix/sysv/linux/powerpc/powerpc64/fpu/Implies.
-powerpc/powerpc64/power6/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power6x/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power6x/Implies
new file mode 100644
index 0000000..9019778
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power6x/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc64/power6x/fpu
+powerpc/powerpc64/power6x
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power6x/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power6x/fpu/Implies
deleted file mode 100644
index fd74b83..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power6x/fpu/Implies
+++ /dev/null
@@ -1,4 +0,0 @@
-# Make sure this comes before the powerpc/powerpc64/fpu that's
-# listed in unix/sysv/linux/powerpc/powerpc64/fpu/Implies.
-powerpc/powerpc64/power6x/fpu
-powerpc/powerpc64/power6/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/Implies
new file mode 100644
index 0000000..9a5e3c7
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc64/power7/fpu
+powerpc/powerpc64/power7
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies
deleted file mode 100644
index c46b3d4..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/power7/fpu/Implies
+++ /dev/null
@@ -1,3 +0,0 @@
-# Make sure this comes before the powerpc/powerpc64/fpu that's
-# listed in unix/sysv/linux/powerpc/powerpc64/fpu/Implies.
-powerpc/powerpc64/power5/fpu
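
The Implies shuffle above follows one pattern throughout: each
Linux-specific sysdeps/unix/sysv/linux/powerpc/<bits>/<cpu>/Implies
now names both the generic <cpu>/fpu and <cpu> directories, replacing
the old fpu-only files whose deleted comments warned about ordering
against powerpc/<bits>/fpu. For example, a 64-bit power7 build now
resolves through

    powerpc/powerpc64/power7/fpu
    powerpc/powerpc64/power7

so the integer power7 directory that gains memcmp, memcpy, memset and
strncmp in this patch is searched ahead of the generic powerpc64 code,
while the fpu variants still take precedence over both.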