Diffstat (limited to 'newlib/libc/machine/arc64/memcmp.S')
-rw-r--r--  newlib/libc/machine/arc64/memcmp.S  269
1 file changed, 269 insertions, 0 deletions
diff --git a/newlib/libc/machine/arc64/memcmp.S b/newlib/libc/machine/arc64/memcmp.S
new file mode 100644
index 0000000..5defd0c
--- /dev/null
+++ b/newlib/libc/machine/arc64/memcmp.S
@@ -0,0 +1,269 @@
+/*
+ Copyright (c) 2024, Synopsys, Inc. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ 1) Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ 2) Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ 3) Neither the name of the Synopsys, Inc., nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <sys/asm.h>
+
+#if defined (__ARC64_ARCH64__)
+
+; R0: lhs
+; R1: rhs
+; R2: count
+; ret (R0):
+; - lhs < rhs: <0
+; - lhs = rhs: 0
+; - lhs > rhs: >0
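+;
+; For reference, the contract above is the standard ISO C "memcmp"
+; contract; a minimal C model of what this routine computes
+; (illustrative only, not the algorithm used below):
+;
+;   int memcmp (const void *lhs, const void *rhs, size_t count)
+;   {
+;     const unsigned char *l = lhs, *r = rhs;
+;     for (; count != 0; count--, l++, r++)
+;       if (*l != *r)
+;         return *l - *r;   /* <0, 0, or >0 as described above */
+;     return 0;
+;   }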
+ENTRY (memcmp)
+ cmpl r2, 64
+ bls.d @.L_compare_1_bytes
+ movl r3, r0 ; "r0" will be used as return value
+ ; If one is curious why the code below looks the way it does,
+ ; there is documentation at the end of this file.
+ lsrl r12, r2, 5 ; counter for 32-byte chunks
+ xor r13, r13, r13 ; the mask marking unequal register pairs
+ ldl.ab r4, [r3, +8]
+ ldl.ab r5, [r1, +8]
+.L_compare_32_bytes:
+ ldl.ab r6, [r3, +8]
+ ldl.ab r7, [r1, +8]
+ ldl.ab r8, [r3, +8]
+ ldl.ab r9, [r1, +8]
+ ldl.ab r10, [r3, +8]
+ ldl.ab r11, [r1, +8]
+ xorl.f 0, r4, r5
+ xor.ne r13, r13, 0b0001
+ xorl.f 0, r6, r7
+ xor.ne r13, r13, 0b0010
+ xorl.f 0, r8, r9
+ xor.ne r13, r13, 0b0100
+ xorl.f 0, r10, r11
+ xor.ne r13, r13, 0b1000
+ brne r13, 0, @.L_unequal_find
+ ldl.ab r4, [r3, +8]
+ dbnz.d r12, @.L_compare_32_bytes
+ ldl.ab r5, [r1, +8]
+ ; Adjusting the pointers because of the extra loads at the end
+ subl r1, r1, 8
+ subl r3, r3, 8
+ bmsk_s r2, r2, 4 ; remaining bytes (count % 32) to compare
+.L_compare_1_bytes:
+ cmp r2, 0
+ jeq.d [blink]
+ xor_s r0, r0, r0
+ ldb.ab r4, [r3, +1]
+ ldb.ab r5, [r1, +1]
+2:
+ sub.f r0, r4, r5
+ jne.d [blink]
+ ldb.ab r4, [r3, +1]
+ dbnz.d r2, @2b
+ ldb.ab r5, [r1, +1] ; this load may read beyond the "count".
+ j_s [blink]
+; At this point, we want to find the _first_ comparison that marked the
+; inequality of "lhs" and "rhs". The rest acts like a multiplexer:
+;
+; if r4 was not equal to r5 --> r1=r4, r2=r5
+; if r6 was not equal to r7 --> r1=r6, r2=r7
+; if r8 was not equal to r9 --> r1=r8, r2=r9
+; if r10 was not equal to r11 --> r1=r10, r2=r11
+; find_different_byte(r1, r2)
+;
+; About the "bi [n]" (branch index) instruction: this instruction alters
+; the next PC (program counter):
+;
+; next_pc = current_pc + n*4 (n*4 is the same as n<<2)
+;
+; In other words, it tells the processor to execute the n'th instruction
+; from where we are (assuming all the next instructions are 4 bytes long).
+;
+; We used this to our benefit. We made each "case" (unequal_r4r5,
+; unequal_r6r7, ...) 16 bytes long (a power of 2) and fed "bi" an
+; index that is already multiplied by 4 (asl r13, r13, 2). This
+; translates into "bi [n]" jumping between 16-byte slots. The last
+; slot is not padded to 16 bytes with a "nop", because nothing needs
+; to be addressed after it.
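+;
+; Worked example: if the second comparison is the first unequal one
+; (r6 != r7), then r13 = 0b0010. "ffs" yields 1, "asl r13, r13, 2"
+; turns it into 4, and "bi [4]" advances the PC by 4*4 = 16 bytes,
+; skipping the 16-byte .L_unequal_r4r5 slot and landing on
+; .L_unequal_r6r7.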
+.L_unequal_find:
+ ffs r13, r13
+ asl r13, r13, 2
+ bi [r13]
+.L_unequal_r4r5:
+ movl r1, r4
+ b.d @.L_diff_byte_in_regs
+ movl r2, r5
+ nop
+.L_unequal_r6r7:
+ movl r1, r6
+ b.d @.L_diff_byte_in_regs
+ movl r2, r7
+ nop
+.L_unequal_r8r9:
+ movl r1, r8
+ b.d @.L_diff_byte_in_regs
+ movl r2, r9
+ nop
+.L_unequal_r10r11:
+ movl r1, r10
+ movl r2, r11
+ ; fall-through
+; If we're here, that means the two operands are not equal.
+; 1) First we have to get a mask of their inequality through "xor"
+; 2) Then, find the first bit position that they're different: "ffs"
+; 3) Depending on the bit position, we want the whole byte containing
+; that bit, in both operands, to become the very first byte (least
+; significant byte), so that we can subtract one from another.
+; Below is an illustration of bit positions and how much we should
+; shift the numbers right:
+; bit position range : (in binary)       | shift right by : (in binary)
+; ----------------------------------------+-----------------------------
+; [ 0, 7]            : (000000 - 000111)  | lsr 0          : 000000
+; [ 8,15]            : (001000 - 001111)  | lsr 8          : 001000
+; [16,23]            : (010000 - 010111)  | lsr 16         : 010000
+; [24,31]            : (011000 - 011111)  | lsr 24         : 011000
+; ...                : ...                | ...            : ...
+; [56,63]            : (111000 - 111111)  | lsr 56         : 111000
+; We need to ignore the least significant 3 bits of the "position" to
+; get the "shift right" amount: "and r0, r0, 0x38".
+; 4) When the bytes are positioned at byte #0, mask out the rest of the
+; bytes and subtract the two operands: lhs - rhs
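+;
+; Worked example (values made up for illustration): take
+; r1 = 0x1122334455669A88 and r2 = 0x1122334455667A88. The xor (1)
+; gives 0x000000000000E000, so "ffsl" (2) returns 13. Masking with
+; 0x38 (3) gives 8; shifting both operands right by 8 moves the
+; differing bytes 0x9A and 0x7A into byte #0, and (4) yields
+; 0x9A - 0x7A = 0x20 > 0, i.e. lhs > rhs.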
+.L_diff_byte_in_regs:
+ xorl r0, r1, r2 ; (1)
+ ffsl r0, r0 ; (2)
+ and r0, r0, 0x38 ; (3)
+ lsrl r1, r1, r0 ; (3)
+ lsrl r2, r2, r0 ; (3)
+ bmsk_s r1, r1, 7 ; (4)
+ bmsk_s r2, r2, 7 ; (4)
+ j_s.d [blink]
+ subl r0, r1, r2 ; (4)
+ENDFUNC (memcmp)
+
+; __ARC64_ARCH64__
+#endif
+
+; The loop at the heart of the "memcmp" function follows some specific
+; logic and has gone through a few rounds of optimisation. Knowing
+; these will help in understanding the code.
+;
+; The comparison logic
+; --------------------
+; In each iteration of the loop, we compare 32 bytes of data from
+; "lhs" and "rhs". Those comparisons take place using 4 pairs of
+; registers:
+;
+; r4  == r5     xorl.f 0, r4, r5       lhs[i+0]  == rhs[i+0]
+; r6  == r7     xorl.f 0, r6, r7       lhs[i+8]  == rhs[i+8]
+; r8  == r9     xorl.f 0, r8, r9       lhs[i+16] == rhs[i+16]
+; r10 == r11    xorl.f 0, r10, r11     lhs[i+24] == rhs[i+24]
+;
+; The idea is to set a corresponding bit in r13 register for each
+; comparison that fails. The relation between the bits and the
+; comparisons are:
+;
+; r13[0..63] = 0
+; r13[0] = 1 if r4 != r5
+; r13[1] = 1 if r6 != r7
+; r13[2] = 1 if r8 != r9
+; r13[3] = 1 if r10 != r11
+;
+; If r13 remains 0, the next possible iteration of the loop begins.
+; If it is not 0 anymore, the algorithm will be interested in the
+; lowest bit that is set to 1. That is achieved by the "ffs"
+; (find first set) instruction.
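+;
+; In C-like pseudo code, one iteration of the loop amounts to the
+; following (a sketch; the names "m" and "a0".."b3" are illustrative
+; and stand for r13 and the loaded register pairs):
+;
+;   m = 0;
+;   if (a0 != b0) m |= 0b0001;   /* r4  vs r5  */
+;   if (a1 != b1) m |= 0b0010;   /* r6  vs r7  */
+;   if (a2 != b2) m |= 0b0100;   /* r8  vs r9  */
+;   if (a3 != b3) m |= 0b1000;   /* r10 vs r11 */
+;   if (m != 0) goto unequal_find;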
+;
+; The loop transformation
+; -----------------------
+; 1) At first, the loop looks like below:
+;
+; .loop
+; ldl.ab r4, [r3, +8]
+; ldl.ab r5, [r1, +8]
+; ...
+; ldl.ab r10, [r3, +8]
+; ldl.ab r11, [r1, +8]
+; xorl.f 0, r4, r5
+; xor.ne r13, r13, 0b0001
+; ...
+; xorl.f 0, r10, r11
+; xor.ne r13, r13, 0b1000
+; brne r13, 0, @.unequal_find
+; dbnz r12, @.loop
+;
+; 2) "dbnz" instruction has a delay slot. To make the code more
+; efficient, we can bring the first 2 instructions of the loop
+; to the end (they will be executed just before the next iteration
+; begins). To make the logic of the program sound, those 2
+; instructions need to be duplicated before the loop start as well:
+;
+; ldl.ab r4, [r3, +8]
+; ldl.ab r5, [r1, +8]
+; .loop
+; ldl.ab r6, [r3, +8]
+; ldl.ab r7, [r1, +8]
+; ...
+; ldl.ab r10, [r3, +8]
+; ldl.ab r11, [r1, +8]
+; xorl.f 0, r4, r5
+; xor.ne r13, r13, 0b0001
+; ...
+; xorl.f 0, r10, r11
+; xor.ne r13, r13, 0b1000
+; brne r13, 0, @.unequal_find
+; ldl.ab r4, [r3, +8]
+; dbnz.d r12, @.loop
+; ldl.ab r5, [r1, +8]
+;
+; There is one more loose end to take care of: at the last iteration
+; of the loop, there is an extra load into the r4 and r5 registers
+; while incrementing the pointers (r3 and r1). We have to correct for
+; that after the loop:
+;
+; .loop:
+; ..
+; brne r13, 0, @.unequal_find
+; ldl.ab r4, [r3, +8]
+; dbnz.d r12, @.loop
+; ldl.ab r5, [r1, +8]
+; subl r1, r1, 8
+; subl r3, r3, 8
+;
+; One last remark about NOT filling the delay slot of "brne" with
+; "ldl.ab r4, ...": if the branch is taken, the rest of the code that
+; is responsible for finding the differentiating bytes relies on the
+; fact that all 8 registers hold the comparison data of the loop.
+; Putting "ldl.ab r4, ..." into the delay slot of "brne ..." would
+; clobber the "r4" register:
+;
+; .loop:
+; ..
+; brne.d r13, 0, @.unequal_find --> this branch might be taken
+; ldl.ab r4, [r3, +8] --> clobbers r4
+; dbnz.d r12, @.loop
+; ldl.ab r5, [r1, +8]
+;
+; Having "ldl.ab r4, ..." between "brne" and "dbnz", the two
+; control-flow-altering instructions, is good enough.
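+;
+; For reference, the word-at-a-time strategy expressed in C (an
+; illustrative sketch only, not a drop-in replacement; it assumes a
+; little-endian target, just as the shift-based byte extraction above
+; does, compares one 8-byte word per step instead of four, and uses
+; the GCC builtin "__builtin_ctzll" in place of "ffsl"):
+;
+;   #include <string.h>   /* memcpy */
+;
+;   int memcmp_words (const void *lhs, const void *rhs, size_t count)
+;   {
+;     const unsigned char *l = lhs, *r = rhs;
+;     unsigned long long a, b;
+;     int shift;
+;
+;     while (count >= 8) {
+;       memcpy (&a, l, 8);          /* unaligned-safe 8-byte loads */
+;       memcpy (&b, r, 8);
+;       if (a != b) {
+;         /* first differing bit, rounded down to a byte boundary;
+;            this is what "ffsl" + "and 0x38" compute above */
+;         shift = (__builtin_ctzll (a ^ b) / 8) * 8;
+;         return (int) ((a >> shift) & 0xff)
+;              - (int) ((b >> shift) & 0xff);
+;       }
+;       l += 8; r += 8; count -= 8;
+;     }
+;     for (; count != 0; count--, l++, r++)   /* byte-wise tail */
+;       if (*l != *r)
+;         return *l - *r;
+;     return 0;
+;   }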