Diffstat (limited to 'newlib/libc/machine/arc64/memcpy.S')
-rw-r--r--  newlib/libc/machine/arc64/memcpy.S  236
1 file changed, 236 insertions, 0 deletions
diff --git a/newlib/libc/machine/arc64/memcpy.S b/newlib/libc/machine/arc64/memcpy.S
new file mode 100644
index 0000000..77cf307
--- /dev/null
+++ b/newlib/libc/machine/arc64/memcpy.S
@@ -0,0 +1,236 @@
+/*
+ Copyright (c) 2024, Synopsys, Inc. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1) Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2) Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3) Neither the name of the Synopsys, Inc., nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <sys/asm.h>
+
+; This file contains variants of the same function implemented with
+; different instruction sets. The generic one, the last implementation
+; before the #else directive, is the most thoroughly commented.
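+;
+; All variants share one overall strategy; a rough C sketch of it
+; (CHUNK, copy_chunk, and copy_tail are hypothetical names, for
+; illustration only):
+;
+;   void *memcpy (void *dst, const void *src, size_t n)
+;   {
+;     char *d = dst; const char *s = src;
+;     for (; n >= CHUNK; n -= CHUNK)   /* bulk loop, widest loads/stores */
+;       copy_chunk (&d, &s);
+;     copy_tail (&d, &s, n);           /* "bi"-dispatched + small copies */
+;     return dst;
+;   }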
+
+; Using 128-bit memory operations
+#if defined (__ARC64_M128__)
+
+ENTRY (memcpy)
+	lsrl.f r12, r2, 6	; Check size < 64 bytes
+ beq.d @.L_write_1_bytes
+ movl r3, r0
+.L_write_64_bytes:
+ lddl.ab r4r5, [r1, +16]
+ lddl.ab r6r7, [r1, +16]
+ lddl.ab r8r9, [r1, +16]
+ lddl.ab r10r11, [r1, +16]
+ stdl.ab r4r5, [r3, +16]
+ stdl.ab r6r7, [r3, +16]
+ stdl.ab r8r9, [r3, +16]
+ dbnz.d r12, @.L_write_64_bytes
+ stdl.ab r10r11, [r3, +16]
+.L_write_1_bytes:
+	;; Handle anything left over, 15 bytes < size < 64 bytes
+	;; The algorithm has two phases:
+	;; - copy 16, 32, or 48 bytes of data using 128-bit operations
+	;; - copy the remaining (up to 15) bytes of data using a single lddl/stdl pair
+ bmsk.f r2, r2, 5 ; Check size == 0
+ jeq.d [blink]
+	lsr.f r12, r2, 4	; Check size < 16 bytes
+ beq.d @1f
+ xor r12, r12, 3
+	;; Before the XOR, R12 can be 3, 2, or 1, indicating how much data
+	;; to copy: 3 -> 48 bytes, 2 -> 32 bytes, 1 -> 16 bytes.
+	;; The zero case cannot happen, as it is checked for above.
+	;; I then use the BI instruction to implement the following code:
+ ;; switch ($R12)
+ ;; case 3:
+ ;; lddl RA, ...
+ ;; stdl RA, ...
+ ;; case 2:
+ ;; lddl RA, ...
+ ;; stdl RA, ...
+ ;; case 1:
+ ;; lddl RA, ...
+ ;; stdl RA, ...
+ ;; case 0:
+ ;; break
+	;; N.B. the BI instruction works the other way around than one
+	;; might expect: entry 0 is the one closest to the instruction
+	;; itself, hence R12 must be bit-inverted to get the desired
+	;; behaviour (done by the XOR above).
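+	;; Worked example for size = 40: size >> 4 = 2, XORed to 1,
+	;; doubled to 2, so BI skips exactly one lddl/stdl pair and
+	;; executes the remaining two, copying 2 * 16 = 32 bytes; the
+	;; final 8 bytes are handled by the windowed copy below.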
+	asl r12, r12, 1
+ bi [r12]
+ lddl.ab r4r5, [r1, +16]
+ stdl.ab r4r5, [r3, +16]
+ lddl.ab r6r7, [r1, +16]
+ stdl.ab r6r7, [r3, +16]
+ lddl.ab r8r9, [r1, +16]
+ stdl.ab r8r9, [r3, +16]
+ bmsk.f r2, r2, 3 ; Check size == 0
+ jeq.d [blink]
+ subl r2, r2, 16
+	;; We still have up to 15 bytes left to transfer, exactly as in
+	;; the byte-by-byte case below. However, at least 16 bytes have
+	;; already been transferred, so we can issue one final 16-byte
+	;; load/store pair that re-reads part of the already transferred
+	;; data AND the up to 15 bytes still to be transferred.
+	;; The position of the window is controlled by $r2, which at this
+	;; point holds the number of remaining bytes minus 16, i.e. a
+	;; negative offset from the current source/destination pointers.
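+	;; A rough C sketch of this trick (load16/store16 are hypothetical
+	;; helper names):
+	;;
+	;;   /* rem in [1..15]; s/d point just past the data copied so far */
+	;;   long off = rem - 16;          /* negative offset              */
+	;;   load16  (tmp, s + off);       /* window ends exactly at the   */
+	;;   store16 (d + off, tmp);       /* last byte to be copied       */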
+ addl r3, r3, r2
+ lddl r4r5, [r1, r2]
+ j_s.d [blink]
+ stdl r4r5, [r3]
+1:
+	;; For any size < 16, go byte by byte.
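+	;; (in C terms, roughly: do { *d++ = *s++; } while (--size); )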
+ ldb.ab r4, [r1, +1]
+ dbnz.d r2, @1b
+ stb.ab r4, [r3, +1]
+ j_s [blink]
+ENDFUNC (memcpy)
+
+; The 64-bit crunching implementation.
+#elif defined (__ARC64_ARCH64__) \
+ || (defined (__ARC64_ARCH32__) && defined (__ARC64_LL64__))
+
+; R0: dest
+; R1: source
+; R2: count
+; ret (R0): dest
+; clobber: r1, r3, r4r5, r6r7, r8r9, r10r11, r12
+ENTRY (memcpy)
+ LSRP.f r12, r2, 5 ; counter for 32-byte chunks
+ beq.d @.L_write_31_bytes
+ MOVP r3, r0 ; do not clobber the "dest"
+.L_write_32_bytes: ; Take care of 32 byte chunks
+ LD64.ab r4, [r1, +8]
+ LD64.ab r6, [r1, +8]
+ LD64.ab r8, [r1, +8]
+ LD64.ab r10,[r1, +8]
+ ST64.ab r4, [r3, +8]
+ ST64.ab r6, [r3, +8]
+ ST64.ab r8, [r3, +8]
+ dbnz.d r12, @.L_write_32_bytes
+ ST64.ab r10, [r3, +8] ; Shove store in delay slot
+	bmsk_s	r2, r2, 4	; From now on, only the remainder (size % 32) matters
+
+
+; The remainder bits indicate how many more bytes to copy:
+; .------------------------.
+; | b4 | b3 | b2 | b1 | b0 |
+; `------------------------'
+; 16 8 4 2 1
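+;
+; In C terms, the tail below is roughly (illustrative sketch only,
+; copy4/copy2/copy1 are hypothetical names):
+;
+;   if (rem & 4) copy4 (&d, &s);           /* b2              */
+;   if (rem & 2) copy2 (&d, &s);           /* b1              */
+;   if (rem & 1) copy1 (&d, &s);           /* b0              */
+;   switch (rem >> 3) { ... }              /* b4:b3, via "bi" */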
+.L_write_31_bytes:
+ bbit0.d r2, 2, @1f ; is b2 set? then copy 4 bytes
+ lsr r12, r2, 3 ; see the notes below
+ ld.ab r4, [r1, 4]
+ st.ab r4, [r3, 4]
+1:
+ bbit0.d r2, 1, @1f ; is b1 set? then copy 2 bytes
+ xor r12, r12, 3
+ ldh.ab r4, [r1, 2]
+ sth.ab r4, [r3, 2]
+1:
+ bbit0.d r2, 0, @1f ; is b0 set? then copy 1 byte
+ asl r12, r12, 1
+ ldb.ab r4, [r1, 1]
+ stb.ab r4, [r3, 1]
+
+; Interpreting bits (b4,b3) [1] and how they map to the branch index:
+;
+; (b4,b3) | bytes to copy | branch index
+; --------+---------------+-------------
+; 00b | 0 | 3 (11b)
+; 01b | 8 | 2 (10b)
+; 10b | 16 | 1 (01b)
+; 11b | 24 | 0 (00b)
+;
+; To go from (b4,b3) to branch index, the bits must be flipped.
+; In other words, they must be XORed with 11b [2].
+;
+; Last but not least, "bi" jumps in units of 4 bytes. The index must
+; be doubled, because each LD64/ST64 pair occupies 8 bytes [3].
+;
+; Hence, the 3 operations for calculating the branch index that are spread
+; in "bbit0" delay slots:
+;
+; lsr r12, r2, 3 [1]
+; xor r12, r12, 3 [2]
+; asl r12, r12, 1 [3]
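+;
+; Worked example: rem = 24 -> (b4,b3) = 11b -> lsr gives 3, xor gives
+; 0, asl gives 0, and "bi" falls through all three LD64/ST64 pairs,
+; copying all 24 bytes.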
+1:
+ bi [r12]
+ LD64.ab r4, [r1, 8]
+ ST64.ab r4, [r3, 8]
+ LD64.ab r4, [r1, 8]
+ ST64.ab r4, [r3, 8]
+ LD64.ab r4, [r1, 8]
+ ST64.ab r4, [r3, 8]
+
+ j_s [blink]
+ENDFUNC (memcpy)
+
+#elif defined (__ARC64_ARCH32__)
+
+ENTRY (memcpy)
+ lsr.f r11, r2, 4 ; counter for 16-byte chunks
+ beq.d @.L_write_15_bytes
+ mov r3, r0 ; work on a copy of "r0"
+.L_write_16_bytes:
+ ld.ab r4, [r1, 4]
+ ld.ab r5, [r1, 4]
+ ld.ab r6, [r1, 4]
+ ld.ab r7, [r1, 4]
+ st.ab r4, [r3, 4]
+ st.ab r5, [r3, 4]
+ st.ab r6, [r3, 4]
+ dbnz.d r11, @.L_write_16_bytes
+ st.ab r7, [r3, 4]
+	bmsk_s r2, r2, 3	; Only the remainder (size % 16) matters now
+
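+; The tail handling below mirrors the 64-bit variant above, only with
+; 4-byte words: b1 and b0 are copied directly, while bits (b3,b2) are
+; turned into the "bi" index with the same lsr/xor/asl sequence,
+; selecting 0, 4, 8, or 12 bytes' worth of word copies.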
+.L_write_15_bytes:
+ bbit0.d r2, 1, @1f
+ lsr r11, r2, 2
+ ldh.ab r4, [r1, 2]
+ sth.ab r4, [r3, 2]
+1:
+ bbit0.d r2, 0, @1f
+ xor r11, r11, 3
+ ldb.ab r4, [r1, 1]
+ stb.ab r4, [r3, 1]
+1:
+ asl r11, r11, 1
+ bi [r11]
+	ld.ab r4, [r1, 4]
+	st.ab r4, [r3, 4]
+	ld.ab r4, [r1, 4]
+	st.ab r4, [r3, 4]
+	ld r4, [r1]
+	st r4, [r3]
+
+ j_s [blink]
+ENDFUNC (memcpy)
+
+#else
+# error Unknown configuration
+#endif