author | Christopher Faylor <me@cgf.cx> | 2000-02-17 19:39:52 +0000
---|---|---
committer | Christopher Faylor <me@cgf.cx> | 2000-02-17 19:39:52 +0000
commit | 8a0efa53e44919bcf5ccb1d3353618a82afdf8bc (patch) |
tree | 68c3dbf3f2c6fd5d49777def9914d77b5cd4589d /newlib/libc/machine/sh/memcpy.S |
parent | 1fd5e000ace55b323124c7e556a7a864b972a5c4 (diff) |
download | newlib-8a0efa53e44919bcf5ccb1d3353618a82afdf8bc.zip newlib-8a0efa53e44919bcf5ccb1d3353618a82afdf8bc.tar.gz newlib-8a0efa53e44919bcf5ccb1d3353618a82afdf8bc.tar.bz2 |
import newlib-2000-02-17 snapshot
Diffstat (limited to 'newlib/libc/machine/sh/memcpy.S')
-rw-r--r-- | newlib/libc/machine/sh/memcpy.S | 221
1 file changed, 221 insertions, 0 deletions
diff --git a/newlib/libc/machine/sh/memcpy.S b/newlib/libc/machine/sh/memcpy.S
new file mode 100644
index 0000000..4df72e3
--- /dev/null
+++ b/newlib/libc/machine/sh/memcpy.S
@@ -0,0 +1,221 @@
+!
+! Fast SH memcpy
+!
+! by Toshiyasu Morita (tm@netcom.com)
+! hacked by J"orn Rennecke (amylaar@cygnus.co.uk) ("o for o-umlaut)
+!
+! Entry: r4: destination pointer
+!        r5: source pointer
+!        r6: byte count
+!
+! Exit:  r0: destination pointer
+!        r1-r7: trashed
+!
+! Notes: Usually one wants to do small reads and write a longword, but
+!        unfortunately it is difficult in some cases to concatenate bytes
+!        into a longword on the SH, so this does a longword read and small
+!        writes.
+!
+! This implementation makes two assumptions about how it is called:
+!
+! 1.: If the byte count is nonzero, the address of the last byte to be
+!     copied is unsigned greater than the address of the first byte to
+!     be copied.  This could be easily swapped for a signed comparison,
+!     but the algorithm used needs some comparison.
+!
+! 2.: When there are two or three bytes in the last word of an 11-or-more
+!     bytes memory chunk to be copied, the rest of the word can be read
+!     without side effects.
+!     This could be easily changed by increasing the minimum size of
+!     a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2,
+!     however, this would cost a few extra cycles on average.
+!
+
+#include "asm.h"
+
+ENTRY(memcpy)
+#ifdef __LITTLE_ENDIAN__
+	! Little endian version copies with increasing addresses.
+	mov r4,r3	! Save return value
+	mov #11,r0	! Check if small number of bytes
+	cmp/hs r0,r6
+			! r6 becomes src end address
+	SL(bf, L_small, add r5,r6)
+	mov #1,r1
+	tst r1,r5	! check if source even
+	SL(bt, L_even, mov r6,r7)
+	mov.b @r5+,r0	! no, make it even.
+	mov.b r0,@r4
+	add #1,r4
+L_even:	tst r1,r4	! check if destination is even
+	add #-3,r7
+	SL(bf, L_odddst, mov #2,r1)
+	tst r1,r4	! check if destination is 4-byte aligned
+	mov r4,r0
+	SL(bt, L_al4dst, sub r5,r0)
+	mov.w @r5+,r2
+	mov.w r2,@r4
+	! add #2,r4	r4 is dead here.
+L_al4dst:
+	tst r1,r5
+	bt L_al4both
+	mov.w @r5+,r1
+	swap.w r1,r1
+	add #-6,r0
+	add #-6,r7	! r7 := src end address minus 9.
+	.align 2
+L_2l_loop:
+	mov.l @r5+,r2	! Read & write two longwords per iteration
+	xtrct r2,r1
+	mov.l r1,@(r0,r5)
+	cmp/hs r7,r5
+	mov.l @r5+,r1
+	xtrct r1,r2
+	mov.l r2,@(r0,r5)
+	bf L_2l_loop
+	add #-2,r5
+	bra L_cleanup
+	add #5,r0
+L_al4both:
+	add #-4,r0
+	.align 2
+L_al4both_loop:
+	mov.l @r5+,r4	! Read longword, write longword per iteration
+	cmp/hs r7,r5
+	SL(bf, L_al4both_loop, mov.l r4,@(r0,r5))
+
+	bra L_cleanup
+	add #3,r0
+
+L_odddst:
+	tst r1,r5
+	SL(bt, L_al4src, add #-1,r4)
+	mov.w @r5+,r0
+	mov.b r0,@(1,r4)
+	shlr8 r0
+	mov.b r0,@(2,r4)
+	add #2,r4
+L_al4src:
+	.align 2
+L_odd_loop:
+	mov.l @r5+,r0	! Read longword, write byte, word, byte per iteration
+	cmp/hs r7,r5
+	mov.b r0,@(1,r4)
+	shlr8 r0
+	mov.w r0,@(2,r4)
+	shlr16 r0
+	mov.b r0,@(4,r4)
+	SL(bf, L_odd_loop, add #4,r4)
+	.align 2	! avoid nop in more frequently executed code.
+L_cleanup2:
+	mov r4,r0
+	sub r5,r0
+L_cleanup:
+	cmp/eq r6,r5
+	bt L_ready
+	.align 2
+L_cleanup_loop:
+	mov.b @r5+,r1
+	cmp/eq r6,r5
+	mov.b r1,@(r0,r5)
+	bf L_cleanup_loop
+L_ready:
+	rts
+	mov r3,r0
+L_small:
+	bra L_cleanup2
+	add #-1,r4
+#else
+	! Big endian version copies with decreasing addresses.
+	mov r4,r0
+	add r6,r0
+	sub r4,r5
+	mov #11,r1
+	cmp/hs r1,r6
+	SL(bf, L_small, add #-1,r5)
+	mov r5,r3
+	add r0,r3
+	shlr r3
+	SL(bt, L_even,
+	 mov r4,r7)
+	mov.b @(r0,r5),r2
+	add #-1,r3
+	mov.b r2,@-r0
+L_even:
+	tst #1,r0
+	add #-1,r5
+	SL(bf, L_odddst, add #8,r7)
+	tst #2,r0
+	bt L_al4dst
+	add #-1,r3
+	mov.w @(r0,r5),r1
+	mov.w r1,@-r0
+L_al4dst:
+	shlr r3
+	bt L_al4both
+	mov.w @(r0,r5),r1
+	swap.w r1,r1
+	add #4,r7
+	add #-4,r5
+	.align 2
+L_2l_loop:
+	mov.l @(r0,r5),r2
+	xtrct r2,r1
+	mov.l r1,@-r0
+	cmp/hs r7,r0
+	mov.l @(r0,r5),r1
+	xtrct r1,r2
+	mov.l r2,@-r0
+	bt L_2l_loop
+	bra L_cleanup
+	add #5,r5
+
+	nop	! avoid nop in executed code.
+L_al4both:
+	add #-2,r5
+	.align 2
+L_al4both_loop:
+	mov.l @(r0,r5),r1
+	cmp/hs r7,r0
+	SL(bt, L_al4both_loop,
+	 mov.l r1,@-r0)
+	bra L_cleanup
+	add #3,r5
+
+	nop	! avoid nop in executed code.
+L_odddst:
+	shlr r3
+	bt L_al4src
+	mov.w @(r0,r5),r1
+	mov.b r1,@-r0
+	shlr8 r1
+	mov.b r1,@-r0
+L_al4src:
+	add #-2,r5
+	.align 2
+L_odd_loop:
+	mov.l @(r0,r5),r2
+	cmp/hs r7,r0
+	mov.b r2,@-r0
+	shlr8 r2
+	mov.w r2,@-r0
+	shlr16 r2
+	mov.b r2,@-r0
+	bt L_odd_loop
+
+	add #3,r5
+L_cleanup:
+L_small:
+	cmp/eq r4,r0
+	bt L_ready
+	add #1,r4
+	.align 2
+L_cleanup_loop:
+	mov.b @(r0,r5),r2
+	cmp/eq r4,r0
+	mov.b r2,@-r0
+	bf L_cleanup_loop
+L_ready:
+	rts
+	nop
+#endif
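
For readers less fluent in SH assembly, the core trick described in the file's notes is: do only aligned longword reads from the source, then merge each adjacent pair of words with shifts before storing. Below is a minimal C sketch of that technique, not part of the patch; the function name is hypothetical, and it assumes a little-endian machine and a 32-bit uint32_t (the word-pointer cast is the kind of type pun one only gets away with in implementation code).

```c
#include <stddef.h>
#include <stdint.h>

/* Sketch of the shift-merge copy: aligned longword reads from a
   misaligned source, merged in registers (the job the SH "xtrct"
   instruction does in one step), then small writes as above. */
void *memcpy_shift_merge(void *dst, const void *src, size_t n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;
    size_t off = (uintptr_t)s & 3;      /* source offset within a word */

    if (off == 0 || n < 8) {            /* aligned or tiny: byte loop  */
        while (n--)
            *d++ = *s++;
        return dst;
    }

    const uint32_t *w = (const uint32_t *)(s - off); /* aligned base   */
    uint32_t lo = *w++;                 /* word holding the first byte */
    unsigned shift = 8 * (unsigned)off;

    while (n >= 4) {
        uint32_t hi = *w++;             /* next aligned source word    */
        /* little-endian merge: bytes s..s+3 straddle lo and hi        */
        uint32_t v = (lo >> shift) | (hi << (32 - shift));
        d[0] = (unsigned char)v;        /* small writes, as in the     */
        d[1] = (unsigned char)(v >> 8); /* assembly version            */
        d[2] = (unsigned char)(v >> 16);
        d[3] = (unsigned char)(v >> 24);
        d += 4;
        s += 4;
        lo = hi;
        n -= 4;
    }
    while (n--)                         /* trailing bytes, one by one  */
        *d++ = *s++;
    return dst;
}
```

In the assembly above the merge is a single instruction: by the time L_2l_loop runs, the alignment fixups guarantee the source is 2-byte aligned, so the shift is always 16 and `xtrct r2,r1` computes (r1 >> 16) | (r2 << 16) in one step. The loop also handles two longwords per iteration so that r1 and r2 simply swap roles as the carry-over register, avoiding an extra move each time.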