 ChangeLog                                 |   7 +
 benchtests/bench-strcpy.c                 |  16 ++
 sysdeps/powerpc/powerpc64/power7/strcpy.S | 327 ++++++++++++++++--------
 3 files changed, 268 insertions(+), 82 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 3de3443..3e2d4b0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2014-12-31  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
+	    Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power7/strcpy.S (strcpy): Optimize
+	unaligned path.
+	* benchtests/bench-strcpy.c (test_main): Add more unaligned inputs.
+
2014-12-31  Joseph Myers  <joseph@codesourcery.com>

	* sysdeps/powerpc/bits/fenvinline.h (fegetround): Rename macro to
diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index c3ab4cf..e9445f2 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -171,6 +171,22 @@ test_main (void)
do_test (i, i, 8 << i, BIG_CHAR);
}

+ for (i = 16; i <= 512; i += 4)
+ {
+ do_test (0, 4, i, SMALL_CHAR);
+ do_test (4, 0, i, BIG_CHAR);
+ do_test (4, 4, i, SMALL_CHAR);
+ do_test (2, 2, i, BIG_CHAR);
+ do_test (2, 6, i, SMALL_CHAR);
+ do_test (6, 2, i, BIG_CHAR);
+ do_test (1, 7, i, SMALL_CHAR);
+ do_test (7, 1, i, BIG_CHAR);
+ do_test (3, 4, i, SMALL_CHAR);
+ do_test (4, 3, i, BIG_CHAR);
+ do_test (5, 7, i, SMALL_CHAR);
+ do_test (7, 5, i, SMALL_CHAR);
+ }
+
return ret;
}
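
The loop added above exercises lengths from 16 to 512 in steps of 4 with
mixed sub-doubleword offsets for source and destination, so the benchmark
now reaches the rewritten unaligned path as well as the existing aligned
ones.  A minimal standalone check in the same spirit is sketched below; it
assumes only standard C, and the buffer names and the exhaustive 8x8 offset
sweep are illustrative rather than the glibc harness's do_test cases:

#include <assert.h>
#include <string.h>

int
main (void)
{
  /* Offsets 0..7 cover every sub-doubleword alignment.  */
  static char src_buf[512 + 16], dst_buf[512 + 16];
  for (size_t len = 16; len <= 512; len += 4)
    for (size_t a_src = 0; a_src < 8; a_src++)
      for (size_t a_dst = 0; a_dst < 8; a_dst++)
	{
	  char *src = src_buf + a_src;
	  char *dst = dst_buf + a_dst;
	  memset (src, 'x', len - 1);
	  src[len - 1] = '\0';
	  assert (strcpy (dst, src) == dst);
	  assert (memcmp (dst, src, len) == 0);
	}
  return 0;
}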
diff --git a/sysdeps/powerpc/powerpc64/power7/strcpy.S b/sysdeps/powerpc/powerpc64/power7/strcpy.S
index ce71982..115f98a 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcpy.S
@@ -31,8 +31,6 @@
if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0))
goto aligned_doubleword_copy;
- if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0))
- goto aligned_word_copy;
if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL))
goto same_alignment;
goto unaligned;
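
The hunk above removes the word-aligned special case from the dispatch
sketch: a pointer pair that is not doubleword aligned now either shares its
alignment or takes the rewritten unaligned path.  In plain C the resulting
dispatch is roughly the following; the enum and function name are
illustrative, since the assembly branches to labels rather than calling
functions:

#include <stdint.h>
#include <stdio.h>

enum path { ALIGNED_DOUBLEWORD, SAME_ALIGNMENT, UNALIGNED };

static enum path
classify (const char *dst, const char *src)
{
  if ((((uintptr_t) dst | (uintptr_t) src) & 0x7UL) == 0)
    return ALIGNED_DOUBLEWORD;	/* Both 8-byte aligned.  */
  if (((uintptr_t) dst & 0x7UL) == ((uintptr_t) src & 0x7UL))
    return SAME_ALIGNMENT;	/* Equal offsets: align both up front.  */
  return UNALIGNED;		/* The path this patch rewrites.  */
}

int
main (void)
{
  char buf[16];
  /* buf + 1 and buf + 5 can never share an 8-byte offset: prints 2.  */
  printf ("%d\n", classify (buf + 1, buf + 5));
  return 0;
}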
@@ -70,9 +68,18 @@ EALIGN (FUNC_NAME, 4, 0)
#endif
or rTMP, rSRC, rRTN
clrldi. rTMP, rTMP, 61
- bne L(check_word_alignment)
+ bne L(check_alignment)
b L(aligned_doubleword_copy)
+ .align 4
+L(check_alignment):
+ rldicl rRTNAL, rRTN, 0, 61
+ rldicl rSRCAL, rSRC, 0, 61
+ cmpld cr7, rSRCAL, rRTNAL
+ beq cr7, L(same_alignment)
+ b L(unaligned)
+
+ .align 4
L(same_alignment):
/* Src and dst with same alignment: align both to doubleword. */
mr rALCNT, rRTN
@@ -180,93 +187,249 @@ L(g1):
#endif
blr
-L(check_word_alignment):
- clrldi. rTMP, rTMP, 62
- beq L(aligned_word_copy)
- rldicl rRTNAL, rRTN, 0, 61
- rldicl rSRCAL, rSRC, 0, 61
- cmpld cr7, rSRCAL, rRTNAL
- beq cr7, L(same_alignment)
- b L(unaligned)
-
-/* For word aligned memory, operate using word load and stores. */
.align 4
-L(aligned_word_copy):
- li rMASK, 0
- addi rRTN, rRTN, -4
- lwz rWORD, 0(rSRC)
- b L(g5)
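+/* src and dst alignments differ: read src on doubleword boundaries and
+   merge consecutive doublewords with shifts, so that once dst has been
+   primed byte by byte every store is an aligned std.  */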
+L(unaligned):
+ cmpdi rSRCAL, 0 /* Check src alignment */
+ beq L(srcaligndstunalign)
+ /* src is unaligned */
+ rlwinm r10, rSRC, 3,26,28 /* Calculate padding. */
+ clrrdi rSRC, rSRC, 3 /* Align the addr to dw boundary */
+ ld rWORD, 0(rSRC) /* Load doubleword from memory. */
+ li rTMP, 0
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ srd rALT, rWORD, r10
+#else
+ sld rALT, rWORD, r10
+#endif
+ cmpb rTMP, rALT, rTMP /* Compare each byte against null */
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ sld rTMP, rTMP, r10
+#else
+ srd rTMP, rTMP, r10
+#endif
+ cmpdi rTMP, 0
+ bne L(bytebybyte) /* if it has null, copy byte by byte */
+ subfic r8, r9, 8
+ rlwinm r5, rRTN, 3,26,28 /* Calculate padding in bits. */
+ rldicl r9, rRTN, 0, 61 /* Calculate padding in bytes. */
+ addi rRTN, rRTN, -1
- .align 4
-L(g3): lwzu rALT, 4(rSRC)
- stwu rWORD, 4(rRTN)
- cmpb rTMP, rALT, rMASK
- cmpwi rTMP, 0
- bne L(g4)
- lwzu rWORD, 4(rSRC)
- stwu rALT, 4(rRTN)
-L(g5): cmpb rTMP, rWORD, rMASK
- cmpwi rTMP, 0 /* If rTMP is 0, no null in word. */
- beq L(g3)
-
- mr rALT, rWORD
-/* We've hit the end of the string. Do the rest byte-by-byte. */
-L(g4):
+ cmpdi r5, 0 /* check dest alignment */
+ beq L(srcunaligndstalign)
+
+ /* both src and dst unaligned */
#ifdef __LITTLE_ENDIAN__
- rlwinm. rTMP, rALT, 0, 24, 31
- stbu rALT, 4(rRTN)
- beqlr-
- rlwinm. rTMP, rALT, 24, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr-
- rlwinm. rTMP, rALT, 16, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr-
- rlwinm rTMP, rALT, 8, 24, 31
- stbu rTMP, 1(rRTN)
+ sld rWORD, rALT, r10
+ mr r11, r10
+ addi r11, r11, -8 /* Adjust byte pointer on loaded dw */
#else
- rlwinm. rTMP, rALT, 8, 24, 31
- stbu rTMP, 4(rRTN)
- beqlr
- rlwinm. rTMP, rALT, 16, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr
- rlwinm. rTMP, rALT, 24, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr
- stbu rALT, 1(rRTN)
+ srd rWORD, rALT, r10
+ subfic r11, r10, 64
#endif
- blr
+ /* dst alignment is greater than src alignment? */
+ cmpd cr7, r5, r10
+ blt cr7, L(dst_align_small)
+ /* src alignment is less than dst */
-/* Oh well. In this case, we just do a byte-by-byte copy. */
- .align 4
-L(unaligned):
- lbz rWORD, 0(rSRC)
- addi rRTN, rRTN, -1
- cmpdi rWORD, 0
- beq L(u2)
-
- .align 5
-L(u0): lbzu rALT, 1(rSRC)
- stbu rWORD, 1(rRTN)
- cmpdi rALT, 0
- beq L(u1)
- lbzu rWORD, 1(rSRC)
+ /* Calculate the dst alignment difference */
+ subfic rALT, r9, 8
+ mtctr rALT
+
+ /* Write till dst is aligned */
+ cmpdi rTMP, rALT, 4
+ blt L(storebyte1) /* less than 4, store byte by byte */
+ beq L(equal1) /* if its 4, store word */
+ addi rTMP, rALT, -4 /* greater than 4, so stb and stw */
+ mtctr rTMP
+L(storebyte1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd rALT, rWORD, r11
+ stbu rALT, 1(rRTN)
+ bdnz L(storebyte1)
+
+ subfic rALT, r9, 8 /* Check the remaining bytes */
+ cmpdi rTMP, rALT, 4
+ blt L(proceed)
+
+ .align 4
+L(equal1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+ srd rALT, rWORD, r11
+#else
+ subfic r11, r11, 64
+ sld rALT, rWORD, r11
+ srdi rALT, rALT, 32
+#endif
+ stw rALT, 1(rRTN)
+ addi rRTN, rRTN, 4
+
+L(proceed):
+ mr rALT, rWORD
+ /* Calculate the leftover bits still to be written */
+ subfic r11, r10, 64
+ subfic r5, r5, 64
+ subf r5, r5, r11 /* remaining bits on second dw */
+ subfic r10, r5, 64 /* remaining bits on first dw */
+ subfic r9, r9, 8
+ subf r8, r9, r8 /* recalculate padding */
+L(srcunaligndstalign):
+ addi rRTN, rRTN, 1
+ subfic r5, r10, 64 /* remaining bits on second dw */
+ addi rSRC, rSRC, 8
+ li rTMP, 0
+ b L(storedouble)
+
+ .align 4
+L(dst_align_small):
+ mtctr r8
+ /* Write till src is aligned */
+L(storebyte2):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd rALT, rWORD, r11
stbu rALT, 1(rRTN)
- cmpdi rWORD, 0
- beq L(u2)
- lbzu rALT, 1(rSRC)
- stbu rWORD, 1(rRTN)
- cmpdi rALT, 0
- beq L(u1)
- lbzu rWORD, 1(rSRC)
+ bdnz L(storebyte2)
+
+ addi rSRC, rSRC, 8 /* Increment src pointer */
+ addi rRTN, rRTN, 1 /* Increment dst pointer */
+ rldicl r8, rRTN, 0, 61 /* Recalculate padding */
+
+ /* src is aligned */
+L(srcaligndstunalign):
+ ld rWORD, 0(rSRC)
+ mr rALT, rWORD
+ li rTMP, 0 /* Zero for the cmpb null check */
+ cmpb rTMP, rWORD, rTMP
+ cmpdi rTMP, 0
+ bne L(bytebybyte) /* Do byte by byte if there is NULL */
+ rlwinm r5, rRTN, 3,26,28 /* Calculate padding */
+ addi rRTN, rRTN, -1
+ subfic r10, r8, 8
+ /* write byte by byte till aligned */
+#ifdef __LITTLE_ENDIAN__
+ li r11, -8
+#else
+ li r11, 64
+#endif
+ mtctr r10
+ cmpdi rTMP, r10, 4
+ blt L(storebyte)
+ beq L(equal)
+ addi rTMP, r10, -4
+ mtctr rTMP
+L(storebyte):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd rALT, rWORD, r11
stbu rALT, 1(rRTN)
- cmpdi rWORD, 0
- bne L(u0)
-L(u2): stbu rWORD, 1(rRTN)
- blr
-L(u1): stbu rALT, 1(rRTN)
- blr
+ bdnz L(storebyte)
+
+ cmpdi rTMP, r10, 4
+ blt L(align)
+
+ .align 4
+L(equal):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8
+ srd rALT, rWORD, r11
+#else
+ subfic r11, r11, 64
+ sld rALT, rWORD, r11
+ srdi rALT, rALT, 32
+#endif
+ stw rALT, 1(rRTN)
+ addi rRTN, rRTN, 4
+L(align):
+ addi rRTN, rRTN, 1
+ addi rSRC, rSRC, 8 /* Increment src pointer */
+ subfic r10, r5, 64
+ li rTMP, 0
+ /* dst addr aligned to 8 */
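+ /* Main merge loop: load the next aligned doubleword, stop at the first
+    one containing a null, otherwise combine the tail of the previous
+    doubleword with the head of this one and store the result with a
+    single aligned std.  */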
+L(storedouble):
+ ld rALT, 0(rSRC) /* load next dw */
+ cmpb rTMP, rALT, rTMP
+ cmpdi rTMP, 0 /* check for null on each new dw */
+ bne L(null)
+#ifdef __LITTLE_ENDIAN__
+ srd r9, rWORD, r10 /* bytes from first dw */
+ sld r11, rALT, r5 /* bytes from second dw */
+#else
+ sld r9, rWORD, r10
+ srd r11, rALT, r5
+#endif
+ or r11, r9, r11 /* make as a single dw */
+ std r11, 0(rRTN) /* store as std on aligned addr */
+ mr rWORD, rALT /* still few bytes left to be written */
+ addi rRTN, rRTN, 8 /* increment dst addr */
+ addi rSRC, rSRC, 8 /* increment src addr */
+ b L(storedouble) /* Loop till NULL */
+
+ .align 4
+
+/* We've hit the end of the string. Do the rest byte-by-byte. */
+L(null):
+ addi rRTN, rRTN, -1
+ mr r10, r5
+ mtctr r8
+#ifdef __LITTLE_ENDIAN__
+ subfic r10, r10, 64
+ addi r10, r10, -8
+#endif
+ cmpdi rTMP, r8, 4
+ blt L(loop)
+
+ /* We can still use stw if leftover >= 4.  */
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+ srd r11, rWORD, r10
+#else
+ subfic r10, r10, 64
+ sld r11, rWORD, r10
+ srdi r11, r11, 32
+#endif
+ stw r11, 1(rRTN)
+ addi rRTN, rRTN, 4
+
+ beq L(bytebybyte1)
+ addi r10, r10, 32
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, -8
+#else
+ subfic r10, r10, 64
+#endif
+ addi rTMP, r8, -4
+ mtctr rTMP
+ /* remaining byte by byte part of first dw */
+L(loop):
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+#else
+ addi r10, r10, -8
+#endif
+ srd rTMP, rWORD, r10
+ stbu rTMP, 1(rRTN)
+ bdnz L(loop)
+
+L(bytebybyte1):
+ addi rRTN, rRTN, 1
+ /* remaining byte by byte part of second dw */
+L(bytebybyte):
+ addi rRTN, rRTN, -8
+ b L(g1)
+
END (FUNC_NAME)
#ifndef USE_AS_STPCPY
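
The heart of the new unaligned path is the L(storedouble) loop: src is read
one aligned doubleword at a time, each doubleword is screened for a null
byte with cmpb, and neighbouring doublewords are stitched together with two
shifts and an or so that dst only ever sees aligned 8-byte stores.  A rough
little-endian C model of that idea follows; cmpb is emulated with the usual
has-zero-byte bit trick, the function names are made up for illustration,
and the byte-by-byte head and tail handling of the real code is elided:

#include <stdint.h>
#include <string.h>

/* cmpb-style screen: nonzero iff some byte of v is zero.  */
static int
has_zero_byte (uint64_t v)
{
  return ((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL) != 0;
}

/* Copy whole doublewords from an unaligned src to an aligned dst.
   SRC_DW points at the aligned doubleword holding the first string byte;
   BITS is the string's offset within it, in bits (8..56).  Stops before
   the first doubleword containing a null; the real code then finishes
   byte by byte (L(null) and L(bytebybyte)).  */
static void
merge_copy_le (uint64_t *dst, const uint64_t *src_dw, unsigned bits)
{
  uint64_t cur = *src_dw++;		/* ld rWORD, 0(rSRC) */
  for (;;)
    {
      uint64_t next = *src_dw++;	/* ld rALT, 0(rSRC) */
      if (has_zero_byte (next))		/* cmpb; bne L(null) */
	break;
      /* srd r9, rWORD, r10; sld r11, rALT, r5; or; std.  */
      *dst++ = (cur >> bits) | (next << (64 - bits));
      cur = next;
    }
}

int
main (void)
{
  _Alignas (8) char src[40] = { 0 }, dst[40] = { 0 };
  strcpy (src + 3, "0123456789abcdefghijklmnop");
  merge_copy_le ((uint64_t *) dst, (const uint64_t *) src, 8 * 3);
  /* dst now holds "0123456789abcdef"; the remaining ten characters and
     the terminator would be copied byte by byte.  */
  return 0;
}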