1 files changed, 592 insertions, 0 deletions
diff --git a/newlib/libc/machine/arc64/strcat.S b/newlib/libc/machine/arc64/strcat.S
new file mode 100644
index 0000000..48e6ce1
--- /dev/null
+++ b/newlib/libc/machine/arc64/strcat.S
@@ -0,0 +1,592 @@
+/*
+   Copyright (c) 2024, Synopsys, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+   1) Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+   2) Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+   3) Neither the name of the Synopsys, Inc., nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+   POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <sys/asm.h>
+
+
+; r0 char* dest
+; r1 const char* src
+
+; dest and src MUST NOT intercept
+
+; Brief:
+; Perform the same operation as strlen for finding the end of r0 string
+; If r0 and r1 have
+; If 4 byte aligned
+; 	Do 4 byte search until there are no more 4 byte chunks
+;	Then, do 1 byte search
+; Otherwise, 1 byte search until alignment
+;	Then, do 4 byte search as previously specified
+;
+;; More in depth description at the end
+;
+; R0 char* dest (destination string)
+; R1 const char* src (source string)
+; ret (R0):
+;		- char* (destiantion string)
+;
+
+#if defined (__ARC64_ARCH32__)
+
+ENTRY (strcat)
+; Find end of r0 string
+; ========================== STRLEN CODE START ==========================
+
+; Preserve r0 for size calculation when returning
+	mov	r13, r0
+	xor	r6, r6, r6
+
+; Setup byte detector (more information below) [1]
+	mov	r8, NULL_32DT_1
+	asl	r9, r8, 7
+
+.L_4_4B_search:
+
+#if defined (__ARC64_LL64__)
+
+	ldd.ab	r2r3, [r13, +8]
+	ldd.ab	r4r5, [r13, +8]
+
+#else
+
+	ld.ab	r2, [r13, +4]
+	ld.ab	r3, [r13, +4]
+	ld.ab	r4, [r13, +4]
+	ld.ab	r5, [r13, +4]
+
+#endif
+
+; NULL byte position is detected and encoded in r6 [0] [9]
+	sub	r10, r2, r8
+	sub	r11, r3, r8
+	sub	r12, r4, r8
+	sub	r7, r5, r8
+
+	bic	r10, r10, r2
+	bic	r11, r11, r3
+	bic	r12, r12, r4
+	bic	r7, r7, r5
+
+	tst	r10, r9
+	bset.ne	r6, r6, 4
+
+	tst	r11, r9
+	bset.ne	r6, r6, 3
+
+	tst	r12, r9
+	bset.ne	r6, r6, 2
+
+	tst	r7, r9
+	bset.ne	r6, r6, 1
+
+	breq.d	r6, 0, @.L_4_4B_search
+
+	fls	r5, r6 ; [2]
+
+; Point r13 to first NULL byte containing double word [3]
+	sub2	r13, r13, r5
+
+	; Select appropriate register to analyze [4]
+	mov	r2, r7
+
+	asr.f	r6, r6, 3
+	mov.c	r2, r12
+
+	asr.f	r6, r6, 1
+	mov.c	r2, r11
+
+	asr.f	r6, r6, 1
+	mov.c	r2, r10
+
+; Point r13 to first NULL byte in selected double word
+	and	r2, r2, r9 ; [5]
+
+	ffs	r2, r2 ; [6]
+
+	xbfu 	r2, r2, 0b0111000011 ; [7]
+
+	add	r13, r13, r2 ; [8]
+
+
+; ========================== STRLEN CODE END >|< ==========================
+
+	xor	r6, r6, r6
+
+.L_4_4B_search_src:
+
+#if defined (__ARC64_LL64__)
+
+	ldd.ab	r2r3, [r1, +8]
+	ldd.ab	r4r5, [r1, +8]
+
+#else
+
+	ld.ab	r2, [r1, +4]
+	ld.ab	r3, [r1, +4]
+	ld.ab	r4, [r1, +4]
+	ld.ab	r5, [r1, +4]
+
+#endif
+
+; NULL byte position is detected and encoded in r6 [0] [9]
+	sub	r10, r2, r8
+	sub	r11, r3, r8
+	sub	r12, r4, r8
+	sub	r7, r5, r8
+
+	bic	r10, r10, r2
+	bic	r11, r11, r3
+	bic	r12, r12, r4
+	bic	r7, r7, r5
+
+	tst	r10, r9
+	bset.ne	r6, r6, 4
+
+	tst	r11, r9
+	bset.ne	r6, r6, 3
+
+	tst	r12, r9
+	bset.ne	r6, r6, 2
+
+	tst	r7, r9
+	bset.ne	r6, r6, 1
+
+	brne	r6, 0, @.L_found_in_32B
+
+#if defined (__ARC64_LL64__)
+
+	std.ab	r2r3, [r13, +8]
+	std.ab	r4r5, [r13, +8]
+
+#else
+
+	st.ab	r2, [r13, +4]
+	st.ab	r3, [r13, +4]
+	st.ab	r4, [r13, +4]
+	st.ab	r5, [r13, +4]
+
+#endif
+
+	b	@.L_4_4B_search_src
+
+.L_found_in_32B:
+
+	fls	r6, r6 ; [2]
+
+; Point r1 to first NULL byte containing double word [3]
+	sub2	r1, r1, r6
+
+;; Store the already loaded data
+
+	; 4 -> 1 to 3 -> 0
+	;subl	r6, r6, 1
+
+; Invert so the biggest branch is at the end, and we dont need to increase
+; block size
+	; 3 -> 0 to 0 -> 3
+	;subl	r6, 3, r6
+
+	; Condense the two subs here
+	rsub	r6, r6, 4
+
+	asl	r6, r6, 2
+
+; Store double words
+	bi	[r6]
+
+	b.d	@.L_store_lastL32bits
+	mov	r11, r2
+	nop
+	nop
+
+	st.ab	r2, [r13, +4]
+	b.d	@.L_store_lastL32bits
+	mov	r11, r3
+	nop
+
+	st.ab	r2, [r13, +4]
+	st.ab	r3, [r13, +4]
+	b.d	@.L_store_lastL32bits
+	mov	r11, r4
+
+	st.ab	r2, [r13, +4]
+	st.ab	r3, [r13, +4]
+	st.ab	r4, [r13, +4]
+	mov	r11, r5
+
+; r11 now contains the data to write
+.L_store_lastL32bits:
+	sub r10, r11, r8
+	bic	r10, r10, r11
+	and	r10, r10, r9 ; [5]
+
+	ffs	r2, r10 ; [6]
+	add	r2, r2, 1
+
+	xbfu	r2, r2, 0b0111000011 ; [7]
+
+	mov	r3, -1; Bitmask setup
+
+	; If the NULL byte is in byte 3 (starting from the right)
+	; we want to store 8-3 bytes
+	rsub	r2, r2, 8
+	asl	r2, r2, 3
+
+	; According to the target byte, setup masks
+	lsr	r3, r3, r2
+	not	r4, r3
+
+	; Obtain relevant data from destination
+	ld	r10, [r13]
+
+	; Get which data from dest is not to be overwritten and OR it
+	; with the relevant data to write
+	and	r3, r3, r11
+	and	r4, r4, r10
+
+	or	r3, r3, r4
+
+	j_s.d	[blink]
+	st.ab	r3, [r13, +4]
+
+
+
+ENDFUNC (strcat)
+
+#else
+
+ENTRY (strcat)
+; Find end of r0 string
+; ========================== STRLEN CODE START ==========================
+
+; Preserve r0 for size calculation when returning
+	movl	r13, r0
+	xorl	r6, r6, r6
+
+; Setup byte detector (more information below) [1]
+	vpack2wl	r8, NULL_32DT_1, NULL_32DT_1
+	asll	r9, r8, 7
+
+.L_4_8B_search:
+
+; Using 128-bit memory operations
+#if defined (__ARC64_M128__)
+
+	lddl.ab r2r3, [r13, +16]
+	lddl.ab r4r5, [r13, +16]
+
+; The 64-bit crunching implementation.
+#elif defined (__ARC64_ARCH64__)
+
+	ldl.ab	r2, [r13, +8]
+	ldl.ab	r3, [r13, +8]
+	ldl.ab	r4, [r13, +8]
+	ldl.ab	r5, [r13, +8]
+
+#else
+# error Unknown configuration
+#endif
+
+; NULL byte position is detected and encoded in r6 [0] [9]
+	subl	r10, r2, r8
+	subl	r11, r3, r8
+	subl	r12, r4, r8
+	subl	r7, r5, r8
+
+	bicl	r10, r10, r2
+	bicl	r11, r11, r3
+	bicl	r12, r12, r4
+	bicl	r7, r7, r5
+
+	tstl	r10, r9
+	bset.ne	r6, r6, 4
+
+	tstl	r11, r9
+	bset.ne	r6, r6, 3
+
+	tstl	r12, r9
+	bset.ne	r6, r6, 2
+
+	tstl	r7, r9
+	bset.ne	r6, r6, 1
+
+	breq.d	r6, 0, @.L_4_8B_search
+
+	fls	r5, r6 ; [2]
+
+; Point r13 to first NULL byte containing double word [3]
+	sub3l	r13, r13, r5
+
+	; Select appropriate register to analyze [4]
+	MOVP	r2, r7
+
+	asr.f	r6, r6, 3
+	MOVP.c	r2, r12
+
+	asr.f	r6, r6, 1
+	MOVP.c	r2, r11
+
+	asr.f	r6, r6, 1
+	MOVP.c	r2, r10
+
+; Point r13 to first NULL byte in selected double word
+	andl	r2, r2, r9 ; [5]
+
+	ffsl	r2, r2 ; [6]
+
+	xbful 	r2, r2, 0b0111000011 ; [7]
+
+	addl	r13, r13, r2 ; [8]
+
+
+; ========================== STRLEN CODE END >|< ==========================
+
+	xorl	r6, r6, r6
+
+.L_4_8B_search_src:
+#if defined (__ARC64_M128__)
+
+	lddl.ab	r2r3, [r1, +16]
+	lddl.ab	r4r5, [r1, +16]
+
+#elif defined (__ARC64_ARCH64__)
+
+	ldl.ab	r2, [r1, +8]
+	ldl.ab	r3, [r1, +8]
+	ldl.ab	r4, [r1, +8]
+	ldl.ab	r5, [r1, +8]
+
+#else
+	# error Unknown configuration
+#endif
+
+; NULL byte position is detected and encoded in r6 [0] [9]
+	subl	r10, r2, r8
+	subl	r11, r3, r8
+	subl	r12, r4, r8
+	subl	r7, r5, r8
+
+	bicl	r10, r10, r2
+	bicl	r11, r11, r3
+	bicl	r12, r12, r4
+	bicl	r7, r7, r5
+
+	tstl	r10, r9
+	bset.ne	r6, r6, 4
+
+	tstl	r11, r9
+	bset.ne	r6, r6, 3
+
+	tstl	r12, r9
+	bset.ne	r6, r6, 2
+
+	tstl	r7, r9
+	bset.ne	r6, r6, 1
+
+	brne	r6, 0, @.L_found_in_32B
+
+#if defined (__ARC64_M128__)
+
+	stdl.ab	r2r3, [r13, +16]
+	stdl.ab	r4r5, [r13, +16]
+
+#elif defined (__ARC64_ARCH64__)
+
+	stl.ab	r2, [r13, +8]
+	stl.ab	r3, [r13, +8]
+	stl.ab	r4, [r13, +8]
+	stl.ab	r5, [r13, +8]
+
+#else
+# error Unknown configuration
+#endif
+
+	b	@.L_4_8B_search_src
+
+.L_found_in_32B:
+
+	fls	r6, r6 ; [2]
+
+; Point r1 to first NULL byte containing double word [3]
+	sub3l	r1, r1, r6
+
+;; Store the already loaded data
+
+	; 4 -> 1 to 3 -> 0
+	;subl	r6, r6, 1
+
+; Invert so the biggest branch is at the end, and we dont need to increase
+; block size
+	; 3 -> 0 to 0 -> 3
+	;subl	r6, 3, r6
+
+	; Condense the two subs here
+	rsubl	r6, r6, 4
+
+	asll	r6, r6, 2
+
+; Store double words
+	bi	[r6]
+
+	b.d	@.L_store_lastL64bits
+	MOVP	r11, r2
+	nop
+	nop
+
+	stl.ab	r2, [r13, +8]
+	b.d	@.L_store_lastL64bits
+	MOVP	r11, r3
+	nop
+
+	stl.ab	r2, [r13, +8]
+	stl.ab	r3, [r13, +8]
+	b.d	@.L_store_lastL64bits
+	MOVP	r11, r4
+
+	stl.ab	r2, [r13, +8]
+	stl.ab	r3, [r13, +8]
+	stl.ab	r4, [r13, +8]
+	MOVP	r11, r5
+
+; r11 now contains the data to write
+.L_store_lastL64bits:
+	subl	r10, r11, r8
+	bicl	r10, r10, r11
+
+	andl	r10, r10, r9 ; [5]
+
+	ffsl	r2, r10 ; [6]
+	addl	r2, r2, 1
+
+	xbful 	r2, r2, 0b0111000011 ; [7]
+
+	movl	r3, -1; Bitmask setup
+
+	; If the NULL byte is in byte 3 (starting from the right)
+	; we want to store 8-3 bytes
+	rsubl	r2, r2, 8
+	asl	r2, r2, 3
+
+	; According to the target byte, setup masks
+	lsrl	r3, r3, r2
+	notl	r4, r3
+
+	; Obtain relevant data from destination
+	ldl	r10, [r13]
+
+	; Get which data from dest is not to be overwritten and OR it
+	; with the relevant data to write
+	andl	r3, r3, r11
+	andl	r4, r4, r10
+
+	orl	r3, r3, r4
+
+	j_s.d	[blink]
+	stl.ab	r3, [r13, +8]
+
+
+ENDFUNC (strcat)
+
+#endif
+
+;; This code uses a common technique for NULL byte detection inside a word.
+;; Details on this technique can be found in:
+;; (https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord)
+;
+; In sum, this technique allows for detecting a NULL byte inside any given
+; amount of bits by performing the following operation
+; 		DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080) [0]
+;
+; The code above implements this by setting r8 to a 0x01010101... sequence and
+; r9 to a 0x80808080... sequence of appropriate length
+; As LIMM are 32 bit only, we need to perform MOVHL and ORL [1] operations to
+; have the appropriate 64 bit values in place
+;
+;; Search is done 32 bytes at a time, either with 64 bit loads or 128 bit loads
+;; If a NULL byte is detected, the position of the double word is encoded
+;; in r6, which is then used to adjust r13 to the exact byte
+;
+; r6 is set via bset, which means we can simply use a fls to obtain the first
+; match (or ffs depending on the values in bset) [2].
+; The reason for starting at 1 and not 0 is so r6 encodes how many double
+; words to go back, and it wouldnt make sense to go back 0 (the NULL would be
+; in the next loop iteration).
+;
+; The first step to take is point r13 to the appropriate double word.
+; As the chosen encoded information is how many double words to go back,
+; we can simply multiply r6 by 8 and reduce r13 by that amount [3]
+;
+; Then, we need to place the loaded double word containing the first NULL byte
+; into a "common" register we can operate on later [4].
+;
+; To do this without any jumps, we can shift r6 and perform a conditional mov
+; based on the carry flag value.
+; The order is very important because the NULL byte can appear in several
+; double words, so we want to analyze from last to first.
+;
+; We can ignore the first asr (which would be asr.f 2, as we started r6 on 1)
+; because if r7 isnt the NULL byte, r2 will always be overwritten so we can
+; just decide to start at r7, and overwrite it if needed.
+;
+; Now comes the tricky part. In order to obtain the first NULL byte, we need to
+; understand the NULL byte detection operation. It is explained in depth in the
+; link above but in short, it works by first setting the highest bit of each
+; byte to 1, if the corresponding byte is either 0 or less than 0x80
+; Then, separately, it makes the highest bit of each byte 1, if the byte is
+; less than 0x80. The last step is to and these two values (this operation is
+; simplified with the subl, bicl and tst instructions).
+;
+; This means that the evaluated equation result value [5] has zeros for all non
+; zero bytes, except for the NULL bytes. Therefore, we can simply find the
+; first non zero bit (counting from bit 0) which will be inside the position of
+; the first NULL byte.
+;
+; One thing to note, is that ffs oddly returns 31 if no bit is found, setting
+; the zero flag. As r9 is never all 0s at this stage (would mean there is no
+; NULL byte and we wouldnt be here) we dont need to worry about that. [6]
+;
+; We can then convert the bit position into the last byte position by looking
+; into bits 3 to 5, and shifting 3 bits to the right. This can be combined into
+; a single xbful operation. The bottom 000011 represent shift by 3 and the top
+; 0111 represents the mask (3 to 5 shifted by 3 is 0 to 2). We dont need to worry
+; about the case where ffs does not find a bit, because we know for sure there is
+; at least one NULL byte, and therefore one of the highest bits is set to 1 [7]
+;
+; Finally, we can add the NULL byte position inside the loaded double word to
+; r13 and subtract r0 from r13 to obtain the string size [8]
+;
+; Some operations are re-ordered such that register dependency is reduced,
+; allowing the CPU to run more instructions in parallel [9]
+;
+;
+; Some data was already read, and needs to be stored following the same read
+; order. To do this, we need to make the
+;
+;