aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/powerpc/powerpc64/power7
diff options
context:
space:
mode:
authorVidya Ranganathan <vidya@linux.vnet.ibm.com>2014-05-05 19:10:45 -0500
committerAdhemerval Zanella <azanella@linux.vnet.ibm.com>2014-05-06 09:54:25 -0500
commitf360f94a05570045be615649e9a411cefba2e210 (patch)
treeac6fb86d01ea80b3bf914c2f74aa231b42bfda28 /sysdeps/powerpc/powerpc64/power7
parent978a41c35738dbe4592edb7dcaa155fec31dab50 (diff)
downloadglibc-f360f94a05570045be615649e9a411cefba2e210.zip
glibc-f360f94a05570045be615649e9a411cefba2e210.tar.gz
glibc-f360f94a05570045be615649e9a411cefba2e210.tar.bz2
PowerPC: strncpy/stpncpy optimization for PPC64/POWER7
The optimization is achieved by following techniques: > data alignment [gain from aligned memory access on read/write] > POWER7 gains performance with loop unrolling/unwinding [gain by reduction of branch penalty]. > zero padding done by calling optimized memset
Diffstat (limited to 'sysdeps/powerpc/powerpc64/power7')
-rw-r--r--sysdeps/powerpc/powerpc64/power7/stpncpy.S24
-rw-r--r--sysdeps/powerpc/powerpc64/power7/strncpy.S338
2 files changed, 362 insertions, 0 deletions
diff --git a/sysdeps/powerpc/powerpc64/power7/stpncpy.S b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
new file mode 100644
index 0000000..a539093
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER7.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY /* make strncpy.S define __stpncpy and use the stpncpy return values */
+#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy) /* presumably exports stpncpy as a weak alias of __stpncpy; macro is defined elsewhere */
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/power7/strncpy.S b/sysdeps/powerpc/powerpc64/power7/strncpy.S
new file mode 100644
index 0000000..51860df
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strncpy.S
@@ -0,0 +1,338 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Implements the functions
+
+ char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+ AND
+
+ char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+ The algorithm is as follows:
+ > if src and dest are 8 byte aligned, perform double word copy
+ else
+ > copy byte by byte on unaligned addresses.
+
+ The aligned comparisons are made using the cmpb instruction. */
+
+/* The performance optimizations focus on the following:
+ 1. data alignment [gain from aligned memory access on read/write]
+ 2. POWER7 gains performance with loop unrolling/unwinding
+ [gain by reduction of branch penalty].
+ 3. The final pad with null bytes is done by calling an optimized
+ memset. */
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy /* entry point name when USE_AS_STPNCPY is defined */
+#else
+# define FUNC_NAME strncpy /* default entry point name */
+#endif
+
+#define FRAMESIZE (FRAME_MIN_SIZE+32) /* frame size used by the stdu/addi pair in FUNC_NAME */
+
+#ifndef MEMSET
+/* For builds with no IFUNC support, local calls should be made to the
+ internal GLIBC symbol (created by libc_hidden_builtin_def). */
+# ifdef SHARED
+# define MEMSET __GI_memset
+# else
+# define MEMSET memset
+# endif
+#endif
+
+ .machine power7
+EALIGN(FUNC_NAME, 4, 0)
+ CALL_MCOUNT 3
+
+ mflr r0 /* copy the link register (LR) into r0 */
+ or r10, r3, r4 /* merge the dst and src address bits */
+ rldicl. r8, r10, 0, 61 /* r8 = low 3 bits; zero iff both are 8-byte aligned */
+
+ std r19, -8(r1) /* save caller's r19 */
+ std r18, -16(r1) /* save caller's r18 */
+ std r0, 16(r1) /* save the link register */
+ stdu r1, -FRAMESIZE(r1) /* create the stack frame; NOTE(review): no CFI directives describe it */
+
+ mr r9, r3 /* r9 = working dst pointer */
+ mr r18, r3 /* keep the original dst: strncpy return value */
+ bne cr0, L(byte_by_byte) /* not both aligned: copy bytewise */
+
+
+ srdi r11, r5, 3 /* r11 = count = n/8 whole doublewords */
+ cmpldi cr7, r11, 3 /* if count <= 3 ; skip the 4x unrolled loop */
+ ble cr7, L(update1)
+
+ ld r10, 0(r4) /* load doubleword from src */
+ cmpb r8, r10, r8 /* r8 is 0 here ; flag any NULL byte in r10 */
+ cmpdi cr7, r8, 0 /* if cmpb found no NULL byte ; we continue */
+ bne cr7, L(update3)
+
+ std r10, 0(r3) /* copy doubleword at offset=0 */
+ ld r10, 8(r4) /* load next doubleword from offset=8 */
+ cmpb r8, r10, r8 /* r8 is still 0 ; flag any NULL byte in r10 */
+ cmpdi cr7, r8, 0 /* if cmpb found no NULL byte ; we continue */
+ bne cr7, L(HopBy8)
+
+ addi r8, r11, -4 /* r8 = count - 4 */
+ mr r7, r3 /* r7 = dst base for this pass */
+ srdi r8, r8, 2 /* r8 = (count - 4) / 4 */
+ mr r6, r4 /* r6 = src base for this pass */
+ addi r8, r8, 1 /* r8 = number of unrolled passes */
+ li r12, 0 /* r12 = all-zero operand for cmpb */
+ mtctr r8 /* CTR = number of unrolled passes */
+ b L(dwordCopy)
+
+ .p2align 4
+L(dWordUnroll):
+ std r8, 16(r9) /* copy dword at offset=16 */
+ ld r8, 24(r4) /* load dword at offset=24 */
+ cmpb r10, r8, r10 /* r10 is 0 here ; flag any NULL byte in r8 */
+ cmpdi cr7, r10, 0
+ bne cr7, L(HopBy24)
+
+ std r8, 24(r7) /* copy dword at offset=24 */
+ addi r9, r9, 32 /* increment dst by 32 */
+ addi r4, r4, 32 /* increment src by 32 */
+ bdz L(leftDwords) /* CTR exhausted ; handle the leftover dwords */
+
+ ld r3, 32(r6) /* load dword at offset=32 from the old src base */
+ cmpb r8, r3, r10 /* r10 is still 0 ; flag any NULL byte in r3 */
+ cmpdi cr7, r8, 0
+ bne cr7, L(update2)
+
+ std r3, 32(r7) /* copy dword at offset=32 from the old dst base */
+ ld r10, 40(r6) /* load dword at offset=40 from the old src base */
+ cmpb r8, r10, r8 /* r8 is 0 here ; flag any NULL byte in r10 */
+ cmpdi cr7, r8, 0
+ bne cr7, L(HopBy40)
+
+ mr r6, r4 /* new src base */
+ mr r7, r9 /* new dst base */
+ mr r11, r0 /* count = count - 4 */
+ mr r5, r19 /* n = n - 32 */
+
+L(dwordCopy):
+ std r10, 8(r9) /* copy dword at offset=8 */
+ addi r19, r5, -32 /* r19 = n after this 32-byte pass */
+ addi r0, r11, -4 /* r0 = count after this 4-dword pass */
+ ld r8, 16(r4) /* load dword at offset=16 */
+ cmpb r10, r8, r12 /* flag any NULL byte in r8 (r12 is 0) */
+ cmpdi cr7, r10, 0
+ beq cr7, L(dWordUnroll)
+
+ addi r9, r9, 16 /* increment dst by 16 */
+ addi r4, r4, 16 /* increment src by 16 */
+ addi r5, r5, -16 /* decrement length 'n' by 16 */
+ addi r0, r11, -2 /* r0 = count - 2 */
+
+L(dWordUnrollOFF):
+ ld r10, 0(r4) /* load first dword */
+ li r8, 0 /* load mask */
+ cmpb r8, r10, r8 /* flag any NULL byte in r10 */
+ cmpdi cr7, r8, 0
+ bne cr7, L(byte_by_byte) /* NULL byte present ; finish bytewise */
+ mtctr r0 /* CTR = dwords left to copy */
+ li r7, 0 /* r7 = all-zero operand for cmpb */
+ b L(CopyDword)
+
+ .p2align 4
+L(loadDWordandCompare):
+ ld r10, 0(r4) /* load next dword */
+ cmpb r8, r10, r7 /* flag any NULL byte in r10 (r7 is 0) */
+ cmpdi cr7, r8, 0
+ bne cr7, L(byte_by_byte)
+
+L(CopyDword):
+ addi r9, r9, 8 /* increment dst by 8 */
+ std r10, -8(r9) /* store the dword at the old dst */
+ addi r4, r4, 8 /* increment src by 8 */
+ addi r5, r5, -8 /* decrement length 'n' by 8 */
+ bdnz L(loadDWordandCompare)
+
+L(byte_by_byte):
+ cmpldi cr7, r5, 3 /* 3 bytes or fewer left ? */
+ ble cr7, L(verifyByte)
+ srdi r10, r5, 2 /* r10 = n/4 unrolled passes */
+ mr r19, r9 /* r19 = running dst pointer */
+ mtctr r10
+ b L(firstByteUnroll)
+
+ .p2align 4
+L(bytes_unroll):
+ lbz r10, 1(r4) /* load byte from src */
+ cmpdi cr7, r10, 0 /* compare for NULL */
+ stb r10, 1(r19) /* store byte to dst */
+ beq cr7, L(updtDestComputeN2ndByte)
+
+ addi r4, r4, 4 /* advance src */
+
+ lbz r10, -2(r4) /* third byte of this pass */
+ cmpdi cr7, r10, 0
+ stb r10, 2(r19)
+ beq cr7, L(updtDestComputeN3rdByte)
+
+ lbz r10, -1(r4) /* fourth byte of this pass */
+ addi r19, r19, 4 /* advance dst */
+ cmpdi cr7, r10, 0
+ stb r10, -1(r19)
+ beq cr7, L(ComputeNByte)
+
+ bdz L(update0) /* all 4-byte passes done ; copy the tail */
+
+L(firstByteUnroll):
+ lbz r10, 0(r4) /* first byte of this pass */
+ cmpdi cr7, r10, 0 /* compare for NULL */
+ stb r10, 0(r19)
+ bne cr7, L(bytes_unroll)
+ addi r19, r19, 1 /* r19 = one past the NULL just stored */
+
+L(ComputeNByte):
+ subf r9, r19, r9 /* r9 = -(bytes written since byte_by_byte) */
+ add r8, r9, r5 /* r8 = bytes left to zero fill */
+
+L(zeroFill):
+ cmpdi cr7, r8, 0 /* compare if length is zero */
+ beq cr7, L(update3return)
+
+ mr r3, r19 /* fill buffer with */
+ li r4, 0 /* zero fill buffer */
+ mr r5, r8 /* how many bytes to fill buffer with */
+ bl MEMSET /* call optimized memset */
+ nop
+
+L(update3return):
+#ifdef USE_AS_STPNCPY
+ addi r3, r19, -1 /* return value = r19 - 1 */
+#endif
+
+L(hop2return):
+#ifndef USE_AS_STPNCPY
+ mr r3, r18 /* set return value */
+#endif
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
+ ld r0, 16(r1) /* read the saved link register */
+ ld r18, -16(r1) /* restore callers save register, r18 */
+ ld r19, -8(r1) /* restore callers save register, r19 */
+ mtlr r0 /* restore the link register */
+ blr /* return */
+
+ .p2align 4
+L(update0):
+ mr r9, r19 /* r9 = dst after the unrolled byte passes */
+
+ .p2align 4
+L(verifyByte):
+ rldicl. r8, r5, 0, 62 /* r8 = n & 3 tail bytes */
+#ifdef USE_AS_STPNCPY
+ mr r3, r9 /* return value if there is no tail */
+#endif
+ beq cr0, L(hop2return)
+ mtctr r8 /* CTR = tail bytes */
+ addi r4, r4, -1 /* pre-decrement src for lbzu */
+ mr r19, r9 /* r19 = running dst pointer */
+ b L(oneBYone)
+
+ .p2align 4
+L(proceed):
+ bdz L(done)
+
+L(oneBYone):
+ lbzu r10, 1(r4) /* load next byte, updating src */
+ addi r19, r19, 1 /* advance dst */
+ addi r8, r8, -1 /* one fewer byte left */
+ cmpdi cr7, r10, 0 /* compare for NULL */
+ stb r10, -1(r19) /* store byte at the old dst */
+ bne cr7, L(proceed)
+ b L(zeroFill) /* NULL stored ; zero fill the r8 bytes left */
+
+ .p2align 4
+L(done):
+ addi r1, r1, FRAMESIZE /* restore stack pointer */
+#ifdef USE_AS_STPNCPY
+ mr r3, r19 /* set the return value */
+#else
+ mr r3, r18 /* set the return value */
+#endif
+ ld r0, 16(r1) /* read the saved link register */
+ ld r18, -16(r1) /* restore callers save register, r18 */
+ ld r19, -8(r1) /* restore callers save register, r19 */
+ mtlr r0 /* restore the link register */
+ blr /* return */
+
+L(update1):
+ mr r0, r11 /* r0 = dword count */
+ mr r19, r5 /* r19 = n, reloaded into r5 below */
+
+ .p2align 4
+L(leftDwords):
+ cmpdi cr7, r0, 0 /* any whole dwords left ? */
+ mr r5, r19 /* r5 = remaining length */
+ bne cr7, L(dWordUnrollOFF)
+ b L(byte_by_byte)
+
+ .p2align 4
+L(updtDestComputeN2ndByte):
+ addi r19, r19, 2 /* update dst by 2 */
+ subf r9, r19, r9 /* compute distance covered */
+ add r8, r9, r5 /* r8 = bytes left to zero fill */
+ b L(zeroFill)
+
+ .p2align 4
+L(updtDestComputeN3rdByte):
+ addi r19, r19, 3 /* update dst by 3 */
+ subf r9, r19, r9 /* compute distance covered */
+ add r8, r9, r5 /* r8 = bytes left to zero fill */
+ b L(zeroFill)
+
+ .p2align 4
+L(HopBy24):
+ addi r9, r9, 24 /* increment dst by 24 */
+ addi r4, r4, 24 /* increment src by 24 */
+ addi r5, r5, -24 /* decrement length 'n' by 24 */
+ addi r0, r11, -3 /* r0 = count - 3 */
+ b L(dWordUnrollOFF)
+
+ .p2align 4
+L(update2):
+ mr r5, r19 /* r5 = n - 32 ; r0, r4 and r9 are already updated */
+ b L(dWordUnrollOFF)
+
+ .p2align 4
+L(HopBy40):
+ addi r9, r7, 40 /* increment dst by 40 */
+ addi r4, r6, 40 /* increment src by 40 */
+ addi r5, r5, -40 /* decrement length 'n' by 40 */
+ addi r0, r11, -5 /* r0 = count - 5 */
+ b L(dWordUnrollOFF)
+
+L(update3):
+ mr r0, r11 /* r0 = count ; nothing copied yet */
+ b L(dWordUnrollOFF)
+
+L(HopBy8):
+ addi r9, r3, 8 /* increment dst by 8 */
+ addi r4, r4, 8 /* increment src by 8 */
+ addi r5, r5, -8 /* decrement length 'n' by 8 */
+ addi r0, r11, -1 /* r0 = count - 1 */
+ b L(dWordUnrollOFF)
+END(FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif