From 759cfef3ac4c07dba1ece0bbc1207e099348816d Mon Sep 17 00:00:00 2001
From: Alan Modra
Date: Sat, 17 Aug 2013 18:47:22 +0930
Subject: PowerPC LE memcpy

http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html

Little-endian support for memcpy.  I spent some time cleaning up the
64-bit power7 memcpy, in order to avoid the extra alignment traps
power7 takes for little-endian.  It probably would have been better
to copy the linux kernel version of memcpy.

	* sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
	* sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise.  Make better
	use of regs.  Use power7 mtocrf.  Tidy function tails.
---
 sysdeps/powerpc/powerpc32/power4/memcpy.S  | 58 +++++++++++++++++++++
 sysdeps/powerpc/powerpc32/power6/memcpy.S  | 81 +++++++++++++++++++++++++++---
 sysdeps/powerpc/powerpc32/power7/memcpy.S  | 24 +++++++--
 sysdeps/powerpc/powerpc32/power7/mempcpy.S | 28 ++++++++---
 4 files changed, 172 insertions(+), 19 deletions(-)

(limited to 'sysdeps/powerpc/powerpc32')

diff --git a/sysdeps/powerpc/powerpc32/power4/memcpy.S b/sysdeps/powerpc/powerpc32/power4/memcpy.S
index d914663..338d3cc 100644
--- a/sysdeps/powerpc/powerpc32/power4/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power4/memcpy.S
@@ -203,15 +203,28 @@ EALIGN (memcpy, 5, 0)
     blt   cr6,5f
     srwi  7,6,16
     bgt   cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
     sth   6,0(3)
+#endif
     b     7f
     .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
     stb   7,0(3)
     sth   6,1(3)
+#endif
     b     7f
     .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
     stb   6,0(3)
 7:
     cmplwi cr1,10,16
@@ -339,13 +352,23 @@ EALIGN (memcpy, 5, 0)
     bf    30,1f

    /* there are at least two words to copy, so copy them */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10  /* shift 1st src word to left align it in R0 */
     srw   8,7,9   /* shift 2nd src word to right align it in R8 */
+#endif
     or    0,0,8   /* or them to get word to store */
     lwz   6,8(5)  /* load the 3rd src word */
     stw   0,0(4)  /* store the 1st dst word */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,7,10
+    slw   8,6,9
+#else
     slw   0,7,10  /* now left align 2nd src word into R0 */
     srw   8,6,9   /* shift 3rd src word to right align it in R8 */
+#endif
     or    0,0,8   /* or them to get word to store */
     lwz   7,12(5)
     stw   0,4(4)  /* store the 2nd dst word */
@@ -353,8 +376,13 @@ EALIGN (memcpy, 5, 0)
     addi  5,5,16
     bf    31,4f
    /* there is a third word to copy, so copy it */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10  /* shift 3rd src word to left align it in R0 */
     srw   8,7,9   /* shift 4th src word to right align it in R8 */
+#endif
     or    0,0,8   /* or them to get word to store */
     stw   0,0(4)  /* store 3rd dst word */
     mr    6,7
@@ -364,8 +392,13 @@ EALIGN (memcpy, 5, 0)
     b     4f
     .align  4
 1:
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10  /* shift 1st src word to left align it in R0 */
     srw   8,7,9   /* shift 2nd src word to right align it in R8 */
+#endif
     addi  5,5,8
     or    0,0,8   /* or them to get word to store */
     bf    31,4f
@@ -378,23 +411,43 @@ EALIGN (memcpy, 5, 0)
     .align  4
 4:
    /* copy 16 bytes at a time */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10
     srw   8,7,9
+#endif
     or    0,0,8
     lwz   6,0(5)
     stw   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srw   0,7,10
+    slw   8,6,9
+#else
     slw   0,7,10
     srw   8,6,9
+#endif
     or    0,0,8
     lwz   7,4(5)
     stw   0,4(4)
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10
     srw   8,7,9
+#endif
     or    0,0,8
     lwz   6,8(5)
     stw   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srw   0,7,10
+    slw   8,6,9
+#else
     slw   0,7,10
     srw   8,6,9
+#endif
     or    0,0,8
     lwz   7,12(5)
     stw   0,12(4)
@@ -403,8 +456,13 @@ EALIGN (memcpy, 5, 0)
     bdnz+ 4b
 8:
    /* calculate and store the final word */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10
     srw   8,7,9
+#endif
     or    0,0,8
     stw   0,0(4)
 3:
diff --git a/sysdeps/powerpc/powerpc32/power6/memcpy.S b/sysdeps/powerpc/powerpc32/power6/memcpy.S
index a76f71e..f58114a 100644
--- a/sysdeps/powerpc/powerpc32/power6/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power6/memcpy.S
@@ -219,15 +219,28 @@ L(word_unaligned_short):
     blt   cr6,5f
     srwi  7,6,16
     bgt   cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
     sth   6,0(3)
+#endif
     b     7f
     .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
     stb   7,0(3)
     sth   6,1(3)
+#endif
     b     7f
     .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
     stb   6,0(3)
 7:
     cmplwi cr1,10,16
@@ -577,7 +590,11 @@ L(wdu1_32):
     lwz   6,-1(4)
     cmplwi cr6,31,4
     srwi  8,31,5    /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,6,8
+#else
     slwi  6,6,8
+#endif
     clrlwi 31,31,27 /* The remaining bytes, < 32.  */
     blt   cr5,L(wdu1_32tail)
     mtctr 8
@@ -585,8 +602,12 @@ L(wdu1_32):

     lwz   8,3(4)
     lwz   7,4(4)
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,24,32
+#else
 /* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
     rlwimi 6,8,8,(32-8),31
+#endif
     b     L(wdu1_loop32x)
     .align  4
 L(wdu1_loop32):
@@ -595,8 +616,12 @@ L(wdu1_loop32):
     lwz   7,4(4)
     stw   10,-8(3)
     stw   11,-4(3)
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,24,32
+#else
 /* Equivalent to srwi 8,8,32-8; or 6,6,8 */
     rlwimi 6,8,8,(32-8),31
+#endif
 L(wdu1_loop32x):
     lwz   10,8(4)
     lwz   11,12(4)
@@ -613,7 +638,11 @@ L(wdu1_loop32x):
     stw   6,16(3)
     stw   7,20(3)
     addi  3,3,32
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,8,8
+#else
     slwi  6,8,8
+#endif
     bdnz+ L(wdu1_loop32)
     stw   10,-8(3)
     stw   11,-4(3)
@@ -624,8 +653,12 @@ L(wdu1_32tail):
     blt   cr6,L(wdu_4tail)
    /* calculate and store the final word */
     lwz   8,3(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,24,32
+#else
+/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
     rlwimi 6,8,8,(32-8),31
+#endif
     b     L(wdu_32tailx)

 L(wdu2_32):
@@ -633,7 +666,11 @@ L(wdu2_32):
     lwz   6,-2(4)
     cmplwi cr6,31,4
     srwi  8,31,5    /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,6,16
+#else
     slwi  6,6,16
+#endif
     clrlwi 31,31,27 /* The remaining bytes, < 32.  */
     blt   cr5,L(wdu2_32tail)
     mtctr 8
@@ -641,8 +678,11 @@ L(wdu2_32):

     lwz   8,2(4)
     lwz   7,4(4)
-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,16,32
+#else
     rlwimi 6,8,16,(32-16),31
+#endif
     b     L(wdu2_loop32x)
     .align  4
 L(wdu2_loop32):
@@ -651,8 +691,11 @@ L(wdu2_loop32):
     lwz   7,4(4)
     stw   10,-8(3)
     stw   11,-4(3)
-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,16,32
+#else
     rlwimi 6,8,16,(32-16),31
+#endif
 L(wdu2_loop32x):
     lwz   10,8(4)
     lwz   11,12(4)
@@ -670,7 +713,11 @@ L(wdu2_loop32x):
     stw   6,16(3)
     stw   7,20(3)
     addi  3,3,32
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,8,16
+#else
     slwi  6,8,16
+#endif
     bdnz+ L(wdu2_loop32)
     stw   10,-8(3)
     stw   11,-4(3)
@@ -681,8 +728,11 @@ L(wdu2_32tail):
     blt   cr6,L(wdu_4tail)
    /* calculate and store the final word */
     lwz   8,2(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,16,32
+#else
     rlwimi 6,8,16,(32-16),31
+#endif
     b     L(wdu_32tailx)

 L(wdu3_32):
@@ -690,7 +740,11 @@ L(wdu3_32):
     lwz   6,-3(4)
     cmplwi cr6,31,4
     srwi  8,31,5    /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,6,24
+#else
     slwi  6,6,24
+#endif
     clrlwi 31,31,27 /* The remaining bytes, < 32.  */
     blt   cr5,L(wdu3_32tail)
     mtctr 8
@@ -698,8 +752,11 @@ L(wdu3_32):

     lwz   8,1(4)
     lwz   7,4(4)
-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,8,32
+#else
     rlwimi 6,8,24,(32-24),31
+#endif
     b     L(wdu3_loop32x)
     .align  4
 L(wdu3_loop32):
@@ -708,8 +765,11 @@ L(wdu3_loop32):
     lwz   7,4(4)
     stw   10,-8(3)
     stw   11,-4(3)
-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,8,32
+#else
     rlwimi 6,8,24,(32-24),31
+#endif
 L(wdu3_loop32x):
     lwz   10,8(4)
     lwz   11,12(4)
@@ -726,7 +786,11 @@ L(wdu3_loop32x):
     stw   6,16(3)
     stw   7,20(3)
     addi  3,3,32
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,8,24
+#else
     slwi  6,8,24
+#endif
     bdnz+ L(wdu3_loop32)
     stw   10,-8(3)
     stw   11,-4(3)
@@ -737,8 +801,11 @@ L(wdu3_32tail):
     blt   cr6,L(wdu_4tail)
    /* calculate and store the final word */
     lwz   8,1(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,8,32
+#else
     rlwimi 6,8,24,(32-24),31
+#endif
     b     L(wdu_32tailx)
     .align  4
 L(wdu_32tailx):
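[Editorial note, not part of the patch.]  The power4 and power6 changes above are all one transformation: the unaligned-source loops read aligned words and splice each aligned destination word out of two adjacent source words, and the shift that positions each word's contribution has to flip direction with the byte order.  A minimal C sketch of that splice, assuming 0 < off < 4 (the aligned case takes a different path in the assembly); merge_words, w0, w1 and off are illustrative names, and the __BYTE_ORDER__ test stands in for the __LITTLE_ENDIAN__ macro the assembly keys on:

#include <stdint.h>

/* w0 is the aligned source word at the lower address, w1 the next
   aligned word, off the source misalignment in bytes (1..3).  */
static uint32_t
merge_words (uint32_t w0, uint32_t w1, unsigned int off)
{
  unsigned int s = 8 * off;
#if defined __BYTE_ORDER__ && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  /* Little endian: the lower address holds the least significant byte,
     so drop off bytes from the bottom of w0 and refill the top from
     w1 -- the srw/slw pair added under #ifdef __LITTLE_ENDIAN__.  */
  return (w0 >> s) | (w1 << (32 - s));
#else
  /* Big endian: the lower address holds the most significant byte, so
     the directions reverse -- the original slw/srw pair.  */
  return (w0 << s) | (w1 >> (32 - s));
#endif
}

/* Example, little endian, off = 1: memory bytes 11 22 33 44 55 66 77 88
   give w0 = 0x44332211 and w1 = 0x88776655, and merge_words returns
   0x55443322, the word that starts one byte into the source.  */

The power6 rlwimi/rldimi lines are the insert-register form of the same splice: the second word's contribution is inserted directly into the shifted first word instead of being combined with a separate or.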
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
index 7f00778..acf3c10 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -383,7 +383,7 @@ L(copy_GE_32_unaligned):

     beq   L(copy_GE_32_unaligned_cont)

-    /* SRC is not quadword aligned, get it aligned.  */
+    /* DST is not quadword aligned, get it aligned.  */

     mtcrf 0x01,0
     subf  31,0,5
@@ -435,13 +435,21 @@ L(copy_GE_32_unaligned_cont):
     mr    11,12
     mtcrf 0x01,9
     cmplwi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+    lvsr  5,0,12
+#else
     lvsl  5,0,12
+#endif
     lvx   3,0,12
     bf    31,L(setup_unaligned_loop)

     /* Copy another 16 bytes to align to 32-bytes due to the loop .  */
     lvx   4,12,6
+#ifdef __LITTLE_ENDIAN__
+    vperm 6,4,3,5
+#else
     vperm 6,3,4,5
+#endif
     addi  11,12,16
     addi  10,3,16
     stvx  6,0,3
@@ -461,11 +469,17 @@ L(unaligned_loop):
        vector instructions though.  */

     lvx   4,11,6         /* vr4 = r11+16.  */
-    vperm 6,3,4,5        /* Merge the correctly-aligned portions
-                            of vr3/vr4 into vr6.  */
+#ifdef __LITTLE_ENDIAN__
+    vperm 6,4,3,5
+#else
+    vperm 6,3,4,5
+#endif
     lvx   3,11,7         /* vr3 = r11+32.  */
-    vperm 10,4,3,5       /* Merge the correctly-aligned portions
-                            of vr3/vr4 into vr10.  */
+#ifdef __LITTLE_ENDIAN__
+    vperm 10,3,4,5
+#else
+    vperm 10,4,3,5
+#endif
     addi  11,11,32
     stvx  6,0,10
     stvx  10,10,6
diff --git a/sysdeps/powerpc/powerpc32/power7/mempcpy.S b/sysdeps/powerpc/powerpc32/power7/mempcpy.S
index 5ad4edb..4610ec5 100644
--- a/sysdeps/powerpc/powerpc32/power7/mempcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/mempcpy.S
@@ -325,7 +325,7 @@ L(copy_GE_32_unaligned):

     beq   L(copy_GE_32_unaligned_cont)

-    /* SRC is not quadword aligned, get it aligned.  */
+    /* DST is not quadword aligned, get it aligned.  */

     mtcrf 0x01,0
     subf  31,0,5
@@ -377,13 +377,21 @@ L(copy_GE_32_unaligned_cont):
     mr    11,12
     mtcrf 0x01,9
     cmplwi cr6,9,1
-    lvsl  5,0,12
+#ifdef __LITTLE_ENDIAN__
+    lvsr  5,0,12
+#else
+    lvsl  5,0,12
+#endif
     lvx   3,0,12
     bf    31,L(setup_unaligned_loop)

     /* Copy another 16 bytes to align to 32-bytes due to the loop .  */
     lvx   4,12,6
-    vperm 6,3,4,5
+#ifdef __LITTLE_ENDIAN__
+    vperm 6,4,3,5
+#else
+    vperm 6,3,4,5
+#endif
     addi  11,12,16
     addi  10,3,16
     stvx  6,0,3
@@ -403,11 +411,17 @@ L(unaligned_loop):
        vector instructions though.  */

     lvx   4,11,6         /* vr4 = r11+16.  */
-    vperm 6,3,4,5        /* Merge the correctly-aligned portions
-                            of vr3/vr4 into vr6.  */
+#ifdef __LITTLE_ENDIAN__
+    vperm 6,4,3,5
+#else
+    vperm 6,3,4,5
+#endif
     lvx   3,11,7         /* vr3 = r11+32.  */
-    vperm 10,4,3,5       /* Merge the correctly-aligned portions
-                            of vr3/vr4 into vr10.  */
+#ifdef __LITTLE_ENDIAN__
+    vperm 10,3,4,5
+#else
+    vperm 10,4,3,5
+#endif
     addi  11,11,32
     stvx  6,0,10
     stvx  10,10,6
--
cgit v1.1
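[Editorial note, not part of the patch.]  The POWER7 memcpy and mempcpy hunks do the same splice sixteen bytes at a time: lvx fetches the two aligned quadwords surrounding the unaligned source and a single vperm gathers the sixteen wanted bytes.  vperm indexes the concatenated register pair in most-significant-byte-first order, while little-endian lvx places the lowest-addressed byte at the least significant end of the register, so the customary little-endian counterpart of the big-endian lvsl selector is lvsr with the two vperm data operands swapped -- which is the substitution made under #ifdef __LITTLE_ENDIAN__ above.  A standalone byte-level model, written against my reading of the instruction semantics with all names illustrative, that checks the two idioms select the same source bytes:

#include <assert.h>
#include <stdint.h>

typedef struct { uint8_t b[16]; } vec;  /* b[0] is the most significant byte */

/* vperm: vt[i] = concat (va, vb)[sel[i] & 31].  */
static vec
vperm (vec va, vec vb, vec sel)
{
  vec vt;
  for (int i = 0; i < 16; i++)
    {
      unsigned int s = sel.b[i] & 31;
      vt.b[i] = s < 16 ? va.b[s] : vb.b[s - 16];
    }
  return vt;
}

/* lvsl yields {off, ..., off+15}; lvsr yields {16-off, ..., 31-off}.  */
static vec
lvsl (unsigned int off)
{
  vec v;
  for (int i = 0; i < 16; i++)
    v.b[i] = off + i;
  return v;
}

static vec
lvsr (unsigned int off)
{
  vec v;
  for (int i = 0; i < 16; i++)
    v.b[i] = 16 - off + i;
  return v;
}

/* lvx of an aligned quadword: big endian puts mem[j] in byte j,
   little endian puts mem[j] in byte 15-j.  */
static vec
lvx (const uint8_t *mem, int little_endian)
{
  vec v;
  for (int j = 0; j < 16; j++)
    v.b[little_endian ? 15 - j : j] = mem[j];
  return v;
}

int
main (void)
{
  uint8_t mem[32];
  for (int i = 0; i < 32; i++)
    mem[i] = (uint8_t) i;

  for (unsigned int off = 1; off < 16; off++)
    {
      /* Big-endian idiom kept in the #else branches: vperm (v3, v4, lvsl).  */
      vec b3 = lvx (mem, 0), b4 = lvx (mem + 16, 0);
      vec be = vperm (b3, b4, lvsl (off));

      /* Little-endian idiom added by the patch: lvsr, inputs swapped.  */
      vec l3 = lvx (mem, 1), l4 = lvx (mem + 16, 1);
      vec le = vperm (l4, l3, lvsr (off));

      /* Both gather mem[off..off+15]; a following stvx writes them back
         in address order on the respective endianness.  */
      for (int k = 0; k < 16; k++)
        {
          assert (be.b[k] == mem[off + k]);
          assert (le.b[15 - k] == mem[off + k]);
        }
    }
  return 0;
}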