-rw-r--r-- | newlib/ChangeLog                          |  14
-rw-r--r-- | newlib/libc/machine/arm/Makefile.am       |   6
-rw-r--r-- | newlib/libc/machine/arm/Makefile.in       |   7
-rw-r--r-- | newlib/libc/machine/arm/strcmp-arm-tiny.S |  46
-rw-r--r-- | newlib/libc/machine/arm/strcmp-armv4.S    | 381
-rw-r--r-- | newlib/libc/machine/arm/strcmp-armv4t.S   |  53
-rw-r--r-- | newlib/libc/machine/arm/strcmp-armv6.S    | 469
-rw-r--r-- | newlib/libc/machine/arm/strcmp-armv7.S    | 468
-rw-r--r-- | newlib/libc/machine/arm/strcmp-armv7m.S   | 377
-rw-r--r-- | newlib/libc/machine/arm/strcmp.S          | 777
10 files changed, 1857 insertions(+), 741 deletions(-)
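At a glance, the rewritten strcmp.S (the last hunk in this diff) is no longer an implementation itself: it keeps the shared S2LO/S2HI/MSB/LSB and def_fn macro definitions and then selects one of the new per-architecture sources at preprocessing time. A rough paraphrase of that selection follows; the body of the final fall-back branch lies beyond the quoted hunks and is assumed here to pick the generic ARMv4 file.

    /* Paraphrase of the dispatch in the new strcmp.S; see the last hunk below.  */
    #if defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) \
        || (__ARM_ARCH == 6 && __ARM_ARCH_PROFILE == 'M')
      /* Size-optimised builds and ARMv6-M: strcmp-armv4t.S for Thumb1-only
         targets, strcmp-arm-tiny.S otherwise.  */
    #elif __ARM_ARCH >= 7
      /* strcmp-armv7.S when __ARM_FEATURE_SIMD32 is available,
         strcmp-armv7m.S for Thumb-2-only cores without it.  */
    #elif __ARM_ARCH >= 6
      /* strcmp-armv6.S (ldrd plus uadd8/sel, ARM state).  */
    #else
      /* Assumed: strcmp-armv4.S, the generic ARM fallback.  */
    #endif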
diff --git a/newlib/ChangeLog b/newlib/ChangeLog index 1e3c533..1e94592 100644 --- a/newlib/ChangeLog +++ b/newlib/ChangeLog @@ -1,3 +1,17 @@ +2014-04-22 Richard Earnshaw <rearnsha@arm.com> + + * libc/machine/arm/strcmp-arm-tiny.S: New file. + * libc/machine/arm/strcmp-armv4.S: New file. + * libc/machine/arm/strcmp-armv4t.S: New file. + * libc/machine/arm/strcmp-armv6.S: New file. + * libc/machine/arm/strcmp-armv7.S: New file. + * libc/machine/arm/strcmp-armv7m.S: New file. + * libc/machine/arm/strcmp.S: Replace with wrapper for various + implementations. + * libc/machine/arm/Makefile.am (strcmp.o, strcmp.obj): Add + dependencies. + * libc/machine/arm/Makefile.in: Regenerated. + 2014-04-14 Sebastian Huber <sebastian.huber@embedded-brains.de> * libc/sys/rtems/sys/cpuset.h (CPU_SET_S): Add const qualifier. diff --git a/newlib/libc/machine/arm/Makefile.am b/newlib/libc/machine/arm/Makefile.am index c5e797e..fb33926 100644 --- a/newlib/libc/machine/arm/Makefile.am +++ b/newlib/libc/machine/arm/Makefile.am @@ -18,7 +18,13 @@ ACLOCAL_AMFLAGS = -I ../../.. -I ../../../.. CONFIG_STATUS_DEPENDENCIES = $(newlib_basedir)/configure.host MEMCPY_DEP=memcpy-armv7a.S memcpy-armv7m.S +STRCMP_DEP=strcmp-arm-tiny.S strcmp-armv4.S strcmp-armv4t.S strcmp-armv6.S \ + strcmp-armv7.S strcmp-armv7m.S $(lpfx)memcpy.o: $(MEMCPY_DEP) $(lpfx)memcpy.obj: $(MEMCPY_DEP) + +$(lpfx)strcmp.o: $(STRCMP_DEP) + +$(lpfx)strcmp.obj: $(STRCMP_DEP) diff --git a/newlib/libc/machine/arm/Makefile.in b/newlib/libc/machine/arm/Makefile.in index 975103f..1ccfac5 100644 --- a/newlib/libc/machine/arm/Makefile.in +++ b/newlib/libc/machine/arm/Makefile.in @@ -209,6 +209,9 @@ lib_a_CFLAGS = $(AM_CFLAGS) ACLOCAL_AMFLAGS = -I ../../.. -I ../../../.. CONFIG_STATUS_DEPENDENCIES = $(newlib_basedir)/configure.host MEMCPY_DEP = memcpy-armv7a.S memcpy-armv7m.S +STRCMP_DEP = strcmp-arm-tiny.S strcmp-armv4.S strcmp-armv4t.S strcmp-armv6.S \ + strcmp-armv7.S strcmp-armv7m.S + all: all-am .SUFFIXES: @@ -508,6 +511,10 @@ $(lpfx)memcpy.o: $(MEMCPY_DEP) $(lpfx)memcpy.obj: $(MEMCPY_DEP) +$(lpfx)strcmp.o: $(STRCMP_DEP) + +$(lpfx)strcmp.obj: $(STRCMP_DEP) + # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. .NOEXPORT: diff --git a/newlib/libc/machine/arm/strcmp-arm-tiny.S b/newlib/libc/machine/arm/strcmp-arm-tiny.S new file mode 100644 index 0000000..158133f --- /dev/null +++ b/newlib/libc/machine/arm/strcmp-arm-tiny.S @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2012-2014 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Tiny version of strcmp in ARM state. Used only when optimizing + for size. Also supports Thumb-2. */ + + .syntax unified +def_fn strcmp + .cfi_startproc +1: + ldrb r2, [r0], #1 + ldrb r3, [r1], #1 + cmp r2, #1 + it cs + cmpcs r2, r3 + beq 1b +2: + subs r0, r2, r3 + RETURN + .cfi_endproc + .size strcmp, . - strcmp diff --git a/newlib/libc/machine/arm/strcmp-armv4.S b/newlib/libc/machine/arm/strcmp-armv4.S new file mode 100644 index 0000000..b18c3db --- /dev/null +++ b/newlib/libc/machine/arm/strcmp-armv4.S @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2012-2014 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + /* Basic ARM implementation. This should run on anything except + for ARMv6-M, but there are better implementations for later + revisions of the architecture. This version can support ARMv4T + ARM/Thumb interworking. */ +/* Parameters and result. */ +#define src1 r0 +#define src2 r1 +#define result r0 /* Overlaps src1. */ + +/* Internal variables. */ +#define data1 r2 +#define data2 r3 +#define magic1 r4 +#define tmp2 r5 +#define tmp1 r12 +#define syndrome r12 /* Overlaps tmp1 */ + + .arm +def_fn strcmp + .cfi_startproc + eor tmp1, src1, src2 + tst tmp1, #3 + /* Strings not at same byte offset from a word boundary. */ + bne .Lstrcmp_unaligned + ands tmp1, src1, #3 + bic src1, src1, #3 + bic src2, src2, #3 + ldr data1, [src1], #4 + ldreq data2, [src2], #4 + beq 1f + /* Although s1 and s2 have identical initial alignment, they are + not currently word aligned. Rather than comparing bytes, + make sure that any bytes fetched from before the addressed + bytes are forced to 0xff. 
Then they will always compare + equal. */ + eor tmp1, tmp1, #3 + mvn data2, #MSB + lsl tmp1, tmp1, #3 + S2LO tmp1, data2, tmp1 + ldr data2, [src2], #4 + orr data1, data1, tmp1 + orr data2, data2, tmp1 +1: + /* Load the 'magic' constant 0x01010101. */ + str r4, [sp, #-4]! + .cfi_def_cfa_offset 4 + .cfi_offset 4, -4 + mov magic1, #1 + orr magic1, magic1, magic1, lsl #8 + orr magic1, magic1, magic1, lsl #16 + .p2align 2 +4: + sub syndrome, data1, magic1 + cmp data1, data2 + /* check for any zero bytes in first word */ + biceq syndrome, syndrome, data1 + tsteq syndrome, magic1, lsl #7 + ldreq data1, [src1], #4 + ldreq data2, [src2], #4 + beq 4b +2: + /* There's a zero or a different byte in the word */ + S2HI result, data1, #24 + S2LO data1, data1, #8 + cmp result, #1 + cmpcs result, data2, S2HI #24 + S2LOEQ data2, data2, #8 + beq 2b + /* On a big-endian machine, RESULT contains the desired byte in bits + 0-7; on a little-endian machine they are in bits 24-31. In + both cases the other bits in RESULT are all zero. For DATA2 the + interesting byte is at the other end of the word, but the + other bits are not necessarily zero. We need a signed result + representing the differnece in the unsigned bytes, so for the + little-endian case we can't just shift the interesting bits + up. */ +#ifdef __ARM_BIG_ENDIAN + sub result, result, data2, lsr #24 +#else + and data2, data2, #255 + rsb result, data2, result, lsr #24 +#endif + ldr r4, [sp], #4 + .cfi_restore 4 + .cfi_def_cfa_offset 0 + RETURN + + +#if 0 + /* The assembly code below is based on the following alogrithm. */ +#ifdef __ARM_BIG_ENDIAN +#define RSHIFT << +#define LSHIFT >> +#else +#define RSHIFT >> +#define LSHIFT << +#endif + +#define body(shift) \ + mask = 0xffffffffU RSHIFT shift; \ + data1 = *src1++; \ + data2 = *src2++; \ + do \ + { \ + tmp2 = data1 & mask; \ + if (__builtin_expect(tmp2 != data2 RSHIFT shift, 0)) \ + { \ + data2 RSHIFT= shift; \ + break; \ + } \ + if (__builtin_expect(((data1 - b1) & ~data1) & (b1 << 7), 0)) \ + { \ + /* See comment in assembler below re syndrome on big-endian */\ + if ((((data1 - b1) & ~data1) & (b1 << 7)) & mask) \ + data2 RSHIFT= shift; \ + else \ + { \ + data2 = *src2; \ + tmp2 = data1 RSHIFT (32 - shift); \ + data2 = (data2 LSHIFT (32 - shift)) RSHIFT (32 - shift); \ + } \ + break; \ + } \ + data2 = *src2++; \ + tmp2 ^= data1; \ + if (__builtin_expect(tmp2 != data2 LSHIFT (32 - shift), 0)) \ + { \ + tmp2 = data1 >> (32 - shift); \ + data2 = (data2 << (32 - shift)) RSHIFT (32 - shift); \ + break; \ + } \ + data1 = *src1++; \ + } while (1) + + const unsigned* src1; + const unsigned* src2; + unsigned data1, data2; + unsigned mask; + unsigned shift; + unsigned b1 = 0x01010101; + char c1, c2; + unsigned tmp2; + + while (((unsigned) s1) & 3) + { + c1 = *s1++; + c2 = *s2++; + if (c1 == 0 || c1 != c2) + return c1 - (int)c2; + } + src1 = (unsigned*) (((unsigned)s1) & ~3); + src2 = (unsigned*) (((unsigned)s2) & ~3); + tmp2 = ((unsigned) s2) & 3; + if (tmp2 == 1) + { + body(8); + } + else if (tmp2 == 2) + { + body(16); + } + else + { + body (24); + } + + do + { +#ifdef __ARM_BIG_ENDIAN + c1 = (char) tmp2 >> 24; + c2 = (char) data2 >> 24; +#else /* not __ARM_BIG_ENDIAN */ + c1 = (char) tmp2; + c2 = (char) data2; +#endif /* not __ARM_BIG_ENDIAN */ + tmp2 RSHIFT= 8; + data2 RSHIFT= 8; + } while (c1 != 0 && c1 == c2); + return c1 - c2; +#endif /* 0 */ + + + /* First of all, compare bytes until src1(sp1) is word-aligned. 
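The word-at-a-time loop above (and the #if 0 reference algorithm) relies on the usual "magic constant" test for spotting a NUL byte without examining bytes individually. A minimal C restatement of just that test, with a helper name of our own rather than anything in the patch:

    #include <stdint.h>

    /* Nonzero iff some byte of w is 0x00: subtracting 0x01010101 borrows
       into bit 7 of every byte that was zero, and "& ~w" discards bytes
       whose own bit 7 was already set.  This mirrors the
       (data1 - magic1) & ~data1 & (magic1 << 7) syndrome in the assembly.  */
    static int has_zero_byte (uint32_t w)
    {
      return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
    }

The assembly folds the same three operations into SUB, BIC and TST so that one conditional chain tests for both "bytes differ" and "string ended".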
*/ +.Lstrcmp_unaligned: + tst src1, #3 + beq 2f + ldrb data1, [src1], #1 + ldrb data2, [src2], #1 + cmp data1, #1 + cmpcs data1, data2 + beq .Lstrcmp_unaligned + sub result, data1, data2 + RETURN + +2: + stmfd sp!, {r4, r5} + .cfi_def_cfa_offset 8 + .cfi_offset 4, -8 + .cfi_offset 5, -4 + mov magic1, #1 + orr magic1, magic1, magic1, lsl #8 + orr magic1, magic1, magic1, lsl #16 + + ldr data1, [src1], #4 + and tmp2, src2, #3 + bic src2, src2, #3 + ldr data2, [src2], #4 + cmp tmp2, #2 + beq .Loverlap2 + bhi .Loverlap1 + + /* Critical inner Loop: Block with 3 bytes initial overlap */ + .p2align 2 +.Loverlap3: + bic tmp2, data1, #MSB + cmp tmp2, data2, S2LO #8 + sub syndrome, data1, magic1 + bic syndrome, syndrome, data1 + bne 4f + ands syndrome, syndrome, magic1, lsl #7 + ldreq data2, [src2], #4 + bne 5f + eor tmp2, tmp2, data1 + cmp tmp2, data2, S2HI #24 + bne 6f + ldr data1, [src1], #4 + b .Loverlap3 +4: + S2LO data2, data2, #8 + b .Lstrcmp_tail + +5: +#ifdef __ARM_BIG_ENDIAN + /* The syndrome value may contain false ones if the string ends + with the bytes 0x01 0x00. */ + tst data1, #0xff000000 + tstne data1, #0x00ff0000 + tstne data1, #0x0000ff00 + beq .Lstrcmp_done_equal +#else + bics syndrome, syndrome, #0xff000000 + bne .Lstrcmp_done_equal +#endif + ldrb data2, [src2] + S2LO tmp2, data1, #24 +#ifdef __ARM_BIG_ENDIAN + lsl data2, data2, #24 +#endif + b .Lstrcmp_tail + +6: + S2LO tmp2, data1, #24 + and data2, data2, #LSB + b .Lstrcmp_tail + + /* Critical inner Loop: Block with 2 bytes initial overlap. */ + .p2align 2 +.Loverlap2: + S2HI tmp2, data1, #16 + sub syndrome, data1, magic1 + S2LO tmp2, tmp2, #16 + bic syndrome, syndrome, data1 + cmp tmp2, data2, S2LO #16 + bne 4f + ands syndrome, syndrome, magic1, lsl #7 + ldreq data2, [src2], #4 + bne 5f + eor tmp2, tmp2, data1 + cmp tmp2, data2, S2HI #16 + bne 6f + ldr data1, [src1], #4 + b .Loverlap2 + +5: +#ifdef __ARM_BIG_ENDIAN + /* The syndrome value may contain false ones if the string ends + with the bytes 0x01 0x00 */ + tst data1, #0xff000000 + tstne data1, #0x00ff0000 + beq .Lstrcmp_done_equal +#else + lsls syndrome, syndrome, #16 + bne .Lstrcmp_done_equal +#endif + ldrh data2, [src2] + S2LO tmp2, data1, #16 +#ifdef __ARM_BIG_ENDIAN + lsl data2, data2, #16 +#endif + b .Lstrcmp_tail + +6: + S2HI data2, data2, #16 + S2LO tmp2, data1, #16 +4: + S2LO data2, data2, #16 + b .Lstrcmp_tail + + /* Critical inner Loop: Block with 1 byte initial overlap. */ + .p2align 2 +.Loverlap1: + and tmp2, data1, #LSB + cmp tmp2, data2, S2LO #24 + sub syndrome, data1, magic1 + bic syndrome, syndrome, data1 + bne 4f + ands syndrome, syndrome, magic1, lsl #7 + ldreq data2, [src2], #4 + bne 5f + eor tmp2, tmp2, data1 + cmp tmp2, data2, S2HI #8 + bne 6f + ldr data1, [src1], #4 + b .Loverlap1 +4: + S2LO data2, data2, #24 + b .Lstrcmp_tail +5: + /* The syndrome value may contain false ones if the string ends + with the bytes 0x01 0x00. */ + tst data1, #LSB + beq .Lstrcmp_done_equal + ldr data2, [src2], #4 +6: + S2LO tmp2, data1, #8 + bic data2, data2, #MSB + b .Lstrcmp_tail +.Lstrcmp_done_equal: + mov result, #0 + .cfi_remember_state + ldmfd sp!, {r4, r5} + .cfi_restore 4 + .cfi_restore 5 + .cfi_def_cfa_offset 0 + RETURN + +.Lstrcmp_tail: + .cfi_restore_state + and r2, tmp2, #LSB + and result, data2, #LSB + cmp result, #1 + cmpcs result, r2 + S2LOEQ tmp2, tmp2, #8 + S2LOEQ data2, data2, #8 + beq .Lstrcmp_tail + sub result, r2, result + ldmfd sp!, {r4, r5} + .cfi_restore 4 + .cfi_restore 5 + .cfi_def_cfa_offset 0 + RETURN + .cfi_endproc + .size strcmp, . 
- strcmp diff --git a/newlib/libc/machine/arm/strcmp-armv4t.S b/newlib/libc/machine/arm/strcmp-armv4t.S new file mode 100644 index 0000000..2716b87 --- /dev/null +++ b/newlib/libc/machine/arm/strcmp-armv4t.S @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2012-2014 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + /* This version is only used when we want a very basic Thumb1 + implementation or for size, otherwise we use the base ARMv4 + version. This is also suitable for ARMv6-M. */ + + .thumb + .syntax unified + .arch armv4t + .eabi_attribute Tag_also_compatible_with, "\006\013" /* ARMv6-M. */ + .eabi_attribute Tag_ARM_ISA_use, 0 +def_fn strcmp + .cfi_startproc +1: + ldrb r2, [r0] + ldrb r3, [r1] + cmp r2, #0 + beq 2f + adds r0, r0, #1 + adds r1, r1, #1 + cmp r2, r3 + beq 1b +2: + subs r0, r2, r3 + bx lr + .cfi_endproc + .size strcmp, . - strcmp diff --git a/newlib/libc/machine/arm/strcmp-armv6.S b/newlib/libc/machine/arm/strcmp-armv6.S new file mode 100644 index 0000000..a557fc5 --- /dev/null +++ b/newlib/libc/machine/arm/strcmp-armv6.S @@ -0,0 +1,469 @@ +/* + * Copyright (c) 2012-2014 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + /* Implementation of strcmp for ARMv6. Use ldrd to support wider + loads, provided the data is sufficiently aligned. Use + saturating arithmetic to optimize the compares. */ + + /* Build Options: + STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first + byte in the string. If comparing completely random strings + the pre-check will save time, since there is a very high + probability of a mismatch in the first character: we save + significant overhead if this is the common case. However, + if strings are likely to be identical (eg because we're + verifying a hit in a hash table), then this check is largely + redundant. */ + + .arm + +/* Parameters and result. */ +#define src1 r0 +#define src2 r1 +#define result r0 /* Overlaps src1. */ + +/* Internal variables. */ +#define tmp1 r4 +#define tmp2 r5 +#define const_m1 r12 + +/* Additional internal variables for 64-bit aligned data. */ +#define data1a r2 +#define data1b r3 +#define data2a r6 +#define data2b r7 +#define syndrome_a tmp1 +#define syndrome_b tmp2 + +/* Additional internal variables for 32-bit aligned data. */ +#define data1 r2 +#define data2 r3 +#define syndrome tmp2 + + + /* Macro to compute and return the result value for word-aligned + cases. */ + .macro strcmp_epilogue_aligned synd d1 d2 restore_r6 +#ifdef __ARM_BIG_ENDIAN + /* If data1 contains a zero byte, then syndrome will contain a 1 in + bit 7 of that byte. Otherwise, the highest set bit in the + syndrome will highlight the first different bit. It is therefore + sufficient to extract the eight bits starting with the syndrome + bit. */ + clz tmp1, \synd + lsl r1, \d2, tmp1 + .if \restore_r6 + ldrd r6, r7, [sp, #8] + .endif + .cfi_restore 6 + .cfi_restore 7 + lsl \d1, \d1, tmp1 + .cfi_remember_state + lsr result, \d1, #24 + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + sub result, result, r1, lsr #24 + bx lr +#else + /* To use the big-endian trick we'd have to reverse all three words. + that's slower than this approach. */ + rev \synd, \synd + clz tmp1, \synd + bic tmp1, tmp1, #7 + lsr r1, \d2, tmp1 + .cfi_remember_state + .if \restore_r6 + ldrd r6, r7, [sp, #8] + .endif + .cfi_restore 6 + .cfi_restore 7 + lsr \d1, \d1, tmp1 + and result, \d1, #255 + and r1, r1, #255 + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + sub result, result, r1 + + bx lr +#endif + .endm + + .text + .p2align 5 +.Lstrcmp_start_addr: +#ifndef STRCMP_NO_PRECHECK +.Lfastpath_exit: + sub r0, r2, r3 + bx lr +#endif +def_fn strcmp +#ifndef STRCMP_NO_PRECHECK + ldrb r2, [src1] + ldrb r3, [src2] + cmp r2, #1 + cmpcs r2, r3 + bne .Lfastpath_exit +#endif + .cfi_startproc + strd r4, r5, [sp, #-16]! 
+ .cfi_def_cfa_offset 16 + .cfi_offset 4, -16 + .cfi_offset 5, -12 + orr tmp1, src1, src2 + strd r6, r7, [sp, #8] + .cfi_offset 6, -8 + .cfi_offset 7, -4 + mvn const_m1, #0 + tst tmp1, #7 + beq .Lloop_aligned8 + +.Lnot_aligned: + eor tmp1, src1, src2 + tst tmp1, #7 + bne .Lmisaligned8 + + /* Deal with mutual misalignment by aligning downwards and then + masking off the unwanted loaded data to prevent a difference. */ + and tmp1, src1, #7 + bic src1, src1, #7 + and tmp2, tmp1, #3 + bic src2, src2, #7 + lsl tmp2, tmp2, #3 /* Bytes -> bits. */ + ldrd data1a, data1b, [src1], #16 + tst tmp1, #4 + ldrd data2a, data2b, [src2], #16 + /* In ARM code we can't use ORN, but with do have MVN with a + register shift. */ + mvn tmp1, const_m1, S2HI tmp2 + orr data1a, data1a, tmp1 + orr data2a, data2a, tmp1 + beq .Lstart_realigned8 + orr data1b, data1b, tmp1 + mov data1a, const_m1 + orr data2b, data2b, tmp1 + mov data2a, const_m1 + b .Lstart_realigned8 + + /* Unwind the inner loop by a factor of 2, giving 16 bytes per + pass. */ + .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ + .p2align 2 /* Always word aligned. */ +.Lloop_aligned8: + ldrd data1a, data1b, [src1], #16 + ldrd data2a, data2b, [src2], #16 +.Lstart_realigned8: + uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ + eor syndrome_a, data1a, data2a + sel syndrome_a, syndrome_a, const_m1 + uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ + eor syndrome_b, data1b, data2b + sel syndrome_b, syndrome_b, const_m1 + orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ + bne .Ldiff_found + + ldrd data1a, data1b, [src1, #-8] + ldrd data2a, data2b, [src2, #-8] + uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ + eor syndrome_a, data1a, data2a + sel syndrome_a, syndrome_a, const_m1 + uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ + eor syndrome_b, data1b, data2b + sel syndrome_b, syndrome_b, const_m1 + orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ + beq .Lloop_aligned8 + +.Ldiff_found: + cmp syndrome_a, #0 + bne .Ldiff_in_a + +.Ldiff_in_b: + strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 + +.Ldiff_in_a: + .cfi_restore_state + strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 + + .cfi_restore_state +.Lmisaligned8: + tst tmp1, #3 + bne .Lmisaligned4 + ands tmp1, src1, #3 + bne .Lmutual_align4 + + /* Unrolled by a factor of 2, to reduce the number of post-increment + operations. */ +.Lloop_aligned4: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned4: + uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ + eor syndrome, data1, data2 + sel syndrome, syndrome, const_m1 + cmp syndrome, #0 + bne .Laligned4_done + + ldr data1, [src1, #-4] + ldr data2, [src2, #-4] + uadd8 syndrome, data1, const_m1 + eor syndrome, data1, data2 + sel syndrome, syndrome, const_m1 + cmp syndrome, #0 + beq .Lloop_aligned4 + +.Laligned4_done: + strcmp_epilogue_aligned syndrome, data1, data2, 0 + +.Lmutual_align4: + .cfi_restore_state + /* Deal with mutual misalignment by aligning downwards and then + masking off the unwanted loaded data to prevent a difference. */ + lsl tmp1, tmp1, #3 /* Bytes -> bits. */ + bic src1, src1, #3 + ldr data1, [src1], #8 + bic src2, src2, #3 + ldr data2, [src2], #8 + + /* In ARM code we can't use ORN, but with do have MVN with a + register shift. 
*/ + mvn tmp1, const_m1, S2HI tmp1 + orr data1, data1, tmp1 + orr data2, data2, tmp1 + b .Lstart_realigned4 + +.Lmisaligned4: + ands tmp1, src1, #3 + beq .Lsrc1_aligned + sub src2, src2, tmp1 + bic src1, src1, #3 + lsls tmp1, tmp1, #31 + ldr data1, [src1], #4 + beq .Laligned_m2 + bcs .Laligned_m1 + +#ifdef STRCMP_NO_PRECHECK + ldrb data2, [src2, #1] + uxtb tmp1, data1, ror #BYTE1_OFFSET + cmp tmp1, #1 + cmpcs tmp1, data2 + bne .Lmisaligned_exit + +.Laligned_m2: + ldrb data2, [src2, #2] + uxtb tmp1, data1, ror #BYTE2_OFFSET + cmp tmp1, #1 + cmpcs tmp1, data2 + bne .Lmisaligned_exit + +.Laligned_m1: + ldrb data2, [src2, #3] + uxtb tmp1, data1, ror #BYTE3_OFFSET + cmp tmp1, #1 + cmpcs tmp1, data2 + beq .Lsrc1_aligned + +#else /* STRCMP_NO_PRECHECK */ + /* If we've done the pre-check, then we don't need to check the + first byte again here. */ + ldrb data2, [src2, #2] + uxtb tmp1, data1, ror #BYTE2_OFFSET + cmp tmp1, #1 + cmpcs tmp1, data2 + bne .Lmisaligned_exit + +.Laligned_m2: + ldrb data2, [src2, #3] + uxtb tmp1, data1, ror #BYTE3_OFFSET + cmp tmp1, #1 + cmpcs tmp1, data2 + beq .Laligned_m1 +#endif + +.Lmisaligned_exit: + .cfi_remember_state + sub result, tmp1, data2 + ldr r4, [sp], #16 + .cfi_restore 4 + bx lr + +#ifndef STRCMP_NO_PRECHECK +.Laligned_m1: + add src2, src2, #4 +#endif +.Lsrc1_aligned: + .cfi_restore_state + /* src1 is word aligned, but src2 has no common alignment + with it. */ + ldr data1, [src1], #4 + lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */ + + bic src2, src2, #3 + ldr data2, [src2], #4 + bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */ + bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */ + + /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ +.Loverlap3: + bic tmp1, data1, #MSB + uadd8 syndrome, data1, const_m1 + eors syndrome, tmp1, data2, S2LO #8 + sel syndrome, syndrome, const_m1 + bne 4f + cmp syndrome, #0 + ldreq data2, [src2], #4 + bne 5f + + eor tmp1, tmp1, data1 + cmp tmp1, data2, S2HI #24 + bne 6f + ldr data1, [src1], #4 + b .Loverlap3 +4: + S2LO data2, data2, #8 + b .Lstrcmp_tail + +5: + bics syndrome, syndrome, #MSB + bne .Lstrcmp_done_equal + + /* We can only get here if the MSB of data1 contains 0, so + fast-path the exit. */ + ldrb result, [src2] + .cfi_remember_state + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + /* R6/7 Not used in this sequence. */ + .cfi_restore 6 + .cfi_restore 7 + neg result, result + bx lr + +6: + .cfi_restore_state + S2LO data1, data1, #24 + and data2, data2, #LSB + b .Lstrcmp_tail + + .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ +.Loverlap2: + and tmp1, data1, const_m1, S2LO #16 + uadd8 syndrome, data1, const_m1 + eors syndrome, tmp1, data2, S2LO #16 + sel syndrome, syndrome, const_m1 + bne 4f + cmp syndrome, #0 + ldreq data2, [src2], #4 + bne 5f + eor tmp1, tmp1, data1 + cmp tmp1, data2, S2HI #16 + bne 6f + ldr data1, [src1], #4 + b .Loverlap2 +4: + S2LO data2, data2, #16 + b .Lstrcmp_tail +5: + ands syndrome, syndrome, const_m1, S2LO #16 + bne .Lstrcmp_done_equal + + ldrh data2, [src2] + S2LO data1, data1, #16 +#ifdef __ARM_BIG_ENDIAN + lsl data2, data2, #16 +#endif + b .Lstrcmp_tail + +6: + S2LO data1, data1, #16 + and data2, data2, const_m1, S2LO #16 + b .Lstrcmp_tail + + .p2align 5,,12 /* Ensure at least 3 instructions in cache line. 
*/ +.Loverlap1: + and tmp1, data1, #LSB + uadd8 syndrome, data1, const_m1 + eors syndrome, tmp1, data2, S2LO #24 + sel syndrome, syndrome, const_m1 + bne 4f + cmp syndrome, #0 + ldreq data2, [src2], #4 + bne 5f + eor tmp1, tmp1, data1 + cmp tmp1, data2, S2HI #8 + bne 6f + ldr data1, [src1], #4 + b .Loverlap1 +4: + S2LO data2, data2, #24 + b .Lstrcmp_tail +5: + tst syndrome, #LSB + bne .Lstrcmp_done_equal + ldr data2, [src2] +6: + S2LO data1, data1, #8 + bic data2, data2, #MSB + b .Lstrcmp_tail + +.Lstrcmp_done_equal: + mov result, #0 + .cfi_remember_state + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + /* R6/7 not used in this sequence. */ + .cfi_restore 6 + .cfi_restore 7 + bx lr + +.Lstrcmp_tail: + .cfi_restore_state +#ifndef __ARM_BIG_ENDIAN + rev data1, data1 + rev data2, data2 + /* Now everything looks big-endian... */ +#endif + uadd8 tmp1, data1, const_m1 + eor tmp1, data1, data2 + sel syndrome, tmp1, const_m1 + clz tmp1, syndrome + lsl data1, data1, tmp1 + lsl data2, data2, tmp1 + lsr result, data1, #24 + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + /* R6/7 not used in this sequence. */ + .cfi_restore 6 + .cfi_restore 7 + sub result, result, data2, lsr #24 + bx lr + .cfi_endproc + .size strcmp, . - .Lstrcmp_start_addr diff --git a/newlib/libc/machine/arm/strcmp-armv7.S b/newlib/libc/machine/arm/strcmp-armv7.S new file mode 100644 index 0000000..e2c47ff --- /dev/null +++ b/newlib/libc/machine/arm/strcmp-armv7.S @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2012-2014 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + /* Implementation of strcmp for ARMv7 when DSP instructions are + available. Use ldrd to support wider loads, provided the data + is sufficiently aligned. Use saturating arithmetic to optimize + the compares. */ + + /* Build Options: + STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first + byte in the string. If comparing completely random strings + the pre-check will save time, since there is a very high + probability of a mismatch in the first character: we save + significant overhead if this is the common case. 
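In C terms, the pre-check described above is a single-byte comparison performed before committing to the word loop. A sketch of the idea (the helper is illustrative only, not code from the patch):

    /* First-byte pre-check, as built when STRCMP_NO_PRECHECK is not
       defined: take the fast-path exit immediately if the first byte of
       s1 is NUL or differs from the first byte of s2; the result is then
       simply *s1 - *s2.  */
    static inline int precheck_decides (const unsigned char *s1,
                                        const unsigned char *s2)
    {
      return *s1 == '\0' || *s1 != *s2;
    }

Only when this cannot decide the comparison does the code fall through into the aligned double-word loop.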
However, + if strings are likely to be identical (eg because we're + verifying a hit in a hash table), then this check is largely + redundant. */ + + /* This version uses Thumb-2 code. */ + .thumb + .syntax unified + +/* Parameters and result. */ +#define src1 r0 +#define src2 r1 +#define result r0 /* Overlaps src1. */ + +/* Internal variables. */ +#define tmp1 r4 +#define tmp2 r5 +#define const_m1 r12 + +/* Additional internal variables for 64-bit aligned data. */ +#define data1a r2 +#define data1b r3 +#define data2a r6 +#define data2b r7 +#define syndrome_a tmp1 +#define syndrome_b tmp2 + +/* Additional internal variables for 32-bit aligned data. */ +#define data1 r2 +#define data2 r3 +#define syndrome tmp2 + + + /* Macro to compute and return the result value for word-aligned + cases. */ + .macro strcmp_epilogue_aligned synd d1 d2 restore_r6 +#ifdef __ARM_BIG_ENDIAN + /* If data1 contains a zero byte, then syndrome will contain a 1 in + bit 7 of that byte. Otherwise, the highest set bit in the + syndrome will highlight the first different bit. It is therefore + sufficient to extract the eight bits starting with the syndrome + bit. */ + clz tmp1, \synd + lsl r1, \d2, tmp1 + .if \restore_r6 + ldrd r6, r7, [sp, #8] + .endif + .cfi_restore 6 + .cfi_restore 7 + lsl \d1, \d1, tmp1 + .cfi_remember_state + lsr result, \d1, #24 + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + sub result, result, r1, lsr #24 + bx lr +#else + /* To use the big-endian trick we'd have to reverse all three words. + that's slower than this approach. */ + rev \synd, \synd + clz tmp1, \synd + bic tmp1, tmp1, #7 + lsr r1, \d2, tmp1 + .cfi_remember_state + .if \restore_r6 + ldrd r6, r7, [sp, #8] + .endif + .cfi_restore 6 + .cfi_restore 7 + lsr \d1, \d1, tmp1 + and result, \d1, #255 + and r1, r1, #255 + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + sub result, result, r1 + + bx lr +#endif + .endm + + .text + .p2align 5 +.Lstrcmp_start_addr: +#ifndef STRCMP_NO_PRECHECK +.Lfastpath_exit: + sub r0, r2, r3 + bx lr + nop +#endif +def_fn strcmp +#ifndef STRCMP_NO_PRECHECK + ldrb r2, [src1] + ldrb r3, [src2] + cmp r2, #1 + it cs + cmpcs r2, r3 + bne .Lfastpath_exit +#endif + .cfi_startproc + strd r4, r5, [sp, #-16]! + .cfi_def_cfa_offset 16 + .cfi_offset 4, -16 + .cfi_offset 5, -12 + orr tmp1, src1, src2 + strd r6, r7, [sp, #8] + .cfi_offset 6, -8 + .cfi_offset 7, -4 + mvn const_m1, #0 + lsl r2, tmp1, #29 + cbz r2, .Lloop_aligned8 + +.Lnot_aligned: + eor tmp1, src1, src2 + tst tmp1, #7 + bne .Lmisaligned8 + + /* Deal with mutual misalignment by aligning downwards and then + masking off the unwanted loaded data to prevent a difference. */ + and tmp1, src1, #7 + bic src1, src1, #7 + and tmp2, tmp1, #3 + bic src2, src2, #7 + lsl tmp2, tmp2, #3 /* Bytes -> bits. */ + ldrd data1a, data1b, [src1], #16 + tst tmp1, #4 + ldrd data2a, data2b, [src2], #16 + /* In thumb code we can't use MVN with a register shift, but + we do have ORN. */ + S2HI tmp1, const_m1, tmp2 + orn data1a, data1a, tmp1 + orn data2a, data2a, tmp1 + beq .Lstart_realigned8 + orn data1b, data1b, tmp1 + mov data1a, const_m1 + orn data2b, data2b, tmp1 + mov data2a, const_m1 + b .Lstart_realigned8 + + /* Unwind the inner loop by a factor of 2, giving 16 bytes per + pass. */ + .p2align 5,,12 /* Don't start in the tail bytes of a cache line. */ + .p2align 2 /* Always word aligned. 
*/ +.Lloop_aligned8: + ldrd data1a, data1b, [src1], #16 + ldrd data2a, data2b, [src2], #16 +.Lstart_realigned8: + uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ + eor syndrome_a, data1a, data2a + sel syndrome_a, syndrome_a, const_m1 + cbnz syndrome_a, .Ldiff_in_a + uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ + eor syndrome_b, data1b, data2b + sel syndrome_b, syndrome_b, const_m1 + cbnz syndrome_b, .Ldiff_in_b + + ldrd data1a, data1b, [src1, #-8] + ldrd data2a, data2b, [src2, #-8] + uadd8 syndrome_b, data1a, const_m1 /* Only want GE bits, */ + eor syndrome_a, data1a, data2a + sel syndrome_a, syndrome_a, const_m1 + uadd8 syndrome_b, data1b, const_m1 /* Only want GE bits. */ + eor syndrome_b, data1b, data2b + sel syndrome_b, syndrome_b, const_m1 + /* Can't use CBZ for backwards branch. */ + orrs syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */ + beq .Lloop_aligned8 + +.Ldiff_found: + cbnz syndrome_a, .Ldiff_in_a + +.Ldiff_in_b: + strcmp_epilogue_aligned syndrome_b, data1b, data2b 1 + +.Ldiff_in_a: + .cfi_restore_state + strcmp_epilogue_aligned syndrome_a, data1a, data2a 1 + + .cfi_restore_state +.Lmisaligned8: + tst tmp1, #3 + bne .Lmisaligned4 + ands tmp1, src1, #3 + bne .Lmutual_align4 + + /* Unrolled by a factor of 2, to reduce the number of post-increment + operations. */ +.Lloop_aligned4: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned4: + uadd8 syndrome, data1, const_m1 /* Only need GE bits. */ + eor syndrome, data1, data2 + sel syndrome, syndrome, const_m1 + cbnz syndrome, .Laligned4_done + ldr data1, [src1, #-4] + ldr data2, [src2, #-4] + uadd8 syndrome, data1, const_m1 + eor syndrome, data1, data2 + sel syndrome, syndrome, const_m1 + cmp syndrome, #0 + beq .Lloop_aligned4 + +.Laligned4_done: + strcmp_epilogue_aligned syndrome, data1, data2, 0 + +.Lmutual_align4: + .cfi_restore_state + /* Deal with mutual misalignment by aligning downwards and then + masking off the unwanted loaded data to prevent a difference. */ + lsl tmp1, tmp1, #3 /* Bytes -> bits. */ + bic src1, src1, #3 + ldr data1, [src1], #8 + bic src2, src2, #3 + ldr data2, [src2], #8 + + /* In thumb code we can't use MVN with a register shift, but + we do have ORN. */ + S2HI tmp1, const_m1, tmp1 + orn data1, data1, tmp1 + orn data2, data2, tmp1 + b .Lstart_realigned4 + +.Lmisaligned4: + ands tmp1, src1, #3 + beq .Lsrc1_aligned + sub src2, src2, tmp1 + bic src1, src1, #3 + lsls tmp1, tmp1, #31 + ldr data1, [src1], #4 + beq .Laligned_m2 + bcs .Laligned_m1 + +#ifdef STRCMP_NO_PRECHECK + ldrb data2, [src2, #1] + uxtb tmp1, data1, ror #BYTE1_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + cbz data2, .Lmisaligned_exit + +.Laligned_m2: + ldrb data2, [src2, #2] + uxtb tmp1, data1, ror #BYTE2_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + cbz data2, .Lmisaligned_exit + +.Laligned_m1: + ldrb data2, [src2, #3] + uxtb tmp1, data1, ror #BYTE3_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + add src2, src2, #4 + cbnz data2, .Lsrc1_aligned +#else /* STRCMP_NO_PRECHECK */ + /* If we've done the pre-check, then we don't need to check the + first byte again here. 
*/ + ldrb data2, [src2, #2] + uxtb tmp1, data1, ror #BYTE2_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + cbz data2, .Lmisaligned_exit + +.Laligned_m2: + ldrb data2, [src2, #3] + uxtb tmp1, data1, ror #BYTE3_OFFSET + subs tmp1, tmp1, data2 + bne .Lmisaligned_exit + cbnz data2, .Laligned_m1 +#endif + +.Lmisaligned_exit: + .cfi_remember_state + mov result, tmp1 + ldr r4, [sp], #16 + .cfi_restore 4 + bx lr + +#ifndef STRCMP_NO_PRECHECK +.Laligned_m1: + add src2, src2, #4 +#endif +.Lsrc1_aligned: + .cfi_restore_state + /* src1 is word aligned, but src2 has no common alignment + with it. */ + ldr data1, [src1], #4 + lsls tmp1, src2, #31 /* C=src2[1], Z=src2[0]. */ + + bic src2, src2, #3 + ldr data2, [src2], #4 + bhi .Loverlap1 /* C=1, Z=0 => src2[1:0] = 0b11. */ + bcs .Loverlap2 /* C=1, Z=1 => src2[1:0] = 0b10. */ + + /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01. */ +.Loverlap3: + bic tmp1, data1, #MSB + uadd8 syndrome, data1, const_m1 + eors syndrome, tmp1, data2, S2LO #8 + sel syndrome, syndrome, const_m1 + bne 4f + cbnz syndrome, 5f + ldr data2, [src2], #4 + eor tmp1, tmp1, data1 + cmp tmp1, data2, S2HI #24 + bne 6f + ldr data1, [src1], #4 + b .Loverlap3 +4: + S2LO data2, data2, #8 + b .Lstrcmp_tail + +5: + bics syndrome, syndrome, #MSB + bne .Lstrcmp_done_equal + + /* We can only get here if the MSB of data1 contains 0, so + fast-path the exit. */ + ldrb result, [src2] + .cfi_remember_state + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + /* R6/7 Not used in this sequence. */ + .cfi_restore 6 + .cfi_restore 7 + neg result, result + bx lr + +6: + .cfi_restore_state + S2LO data1, data1, #24 + and data2, data2, #LSB + b .Lstrcmp_tail + + .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ +.Loverlap2: + and tmp1, data1, const_m1, S2LO #16 + uadd8 syndrome, data1, const_m1 + eors syndrome, tmp1, data2, S2LO #16 + sel syndrome, syndrome, const_m1 + bne 4f + cbnz syndrome, 5f + ldr data2, [src2], #4 + eor tmp1, tmp1, data1 + cmp tmp1, data2, S2HI #16 + bne 6f + ldr data1, [src1], #4 + b .Loverlap2 +4: + S2LO data2, data2, #16 + b .Lstrcmp_tail +5: + ands syndrome, syndrome, const_m1, S2LO #16 + bne .Lstrcmp_done_equal + + ldrh data2, [src2] + S2LO data1, data1, #16 +#ifdef __ARM_BIG_ENDIAN + lsl data2, data2, #16 +#endif + b .Lstrcmp_tail + +6: + S2LO data1, data1, #16 + and data2, data2, const_m1, S2LO #16 + b .Lstrcmp_tail + + .p2align 5,,12 /* Ensure at least 3 instructions in cache line. */ +.Loverlap1: + and tmp1, data1, #LSB + uadd8 syndrome, data1, const_m1 + eors syndrome, tmp1, data2, S2LO #24 + sel syndrome, syndrome, const_m1 + bne 4f + cbnz syndrome, 5f + ldr data2, [src2], #4 + eor tmp1, tmp1, data1 + cmp tmp1, data2, S2HI #8 + bne 6f + ldr data1, [src1], #4 + b .Loverlap1 +4: + S2LO data2, data2, #24 + b .Lstrcmp_tail +5: + tst syndrome, #LSB + bne .Lstrcmp_done_equal + ldr data2, [src2] +6: + S2LO data1, data1, #8 + bic data2, data2, #MSB + b .Lstrcmp_tail + +.Lstrcmp_done_equal: + mov result, #0 + .cfi_remember_state + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + /* R6/7 not used in this sequence. */ + .cfi_restore 6 + .cfi_restore 7 + bx lr + +.Lstrcmp_tail: + .cfi_restore_state +#ifndef __ARM_BIG_ENDIAN + rev data1, data1 + rev data2, data2 + /* Now everything looks big-endian... 
*/ +#endif + uadd8 tmp1, data1, const_m1 + eor tmp1, data1, data2 + sel syndrome, tmp1, const_m1 + clz tmp1, syndrome + lsl data1, data1, tmp1 + lsl data2, data2, tmp1 + lsr result, data1, #24 + ldrd r4, r5, [sp], #16 + .cfi_restore 4 + .cfi_restore 5 + /* R6/7 not used in this sequence. */ + .cfi_restore 6 + .cfi_restore 7 + sub result, result, data2, lsr #24 + bx lr + .cfi_endproc + .size strcmp, . - .Lstrcmp_start_addr diff --git a/newlib/libc/machine/arm/strcmp-armv7m.S b/newlib/libc/machine/arm/strcmp-armv7m.S new file mode 100644 index 0000000..d66d393 --- /dev/null +++ b/newlib/libc/machine/arm/strcmp-armv7m.S @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2012-2014 ARM Ltd + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the company may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Very similar to the generic code, but uses Thumb2 as implemented + in ARMv7-M. */ + +/* Parameters and result. */ +#define src1 r0 +#define src2 r1 +#define result r0 /* Overlaps src1. */ + +/* Internal variables. */ +#define data1 r2 +#define data2 r3 +#define tmp2 r5 +#define tmp1 r12 +#define syndrome r12 /* Overlaps tmp1 */ + + .thumb + .syntax unified +def_fn strcmp + .cfi_startproc + eor tmp1, src1, src2 + tst tmp1, #3 + /* Strings not at same byte offset from a word boundary. */ + bne .Lstrcmp_unaligned + ands tmp1, src1, #3 + bic src1, src1, #3 + bic src2, src2, #3 + ldr data1, [src1], #4 + it eq + ldreq data2, [src2], #4 + beq 4f + /* Although s1 and s2 have identical initial alignment, they are + not currently word aligned. Rather than comparing bytes, + make sure that any bytes fetched from before the addressed + bytes are forced to 0xff. Then they will always compare + equal. */ + eor tmp1, tmp1, #3 + mvn data2, #MSB + lsl tmp1, tmp1, #3 + S2LO tmp1, data2, tmp1 + ldr data2, [src2], #4 + orr data1, data1, tmp1 + orr data2, data2, tmp1 + .p2align 2 + /* Critical loop. 
*/ +4: + sub syndrome, data1, #0x01010101 + cmp data1, data2 + /* check for any zero bytes in first word */ + itttt eq + biceq syndrome, syndrome, data1 + tsteq syndrome, #0x80808080 + ldreq data1, [src1], #4 + ldreq data2, [src2], #4 + beq 4b +2: + /* There's a zero or a different byte in the word */ + S2HI result, data1, #24 + S2LO data1, data1, #8 + cmp result, #1 + it cs + cmpcs result, data2, S2HI #24 + it eq + S2LOEQ data2, data2, #8 + beq 2b + /* On a big-endian machine, RESULT contains the desired byte in bits + 0-7; on a little-endian machine they are in bits 24-31. In + both cases the other bits in RESULT are all zero. For DATA2 the + interesting byte is at the other end of the word, but the + other bits are not necessarily zero. We need a signed result + representing the differnece in the unsigned bytes, so for the + little-endian case we can't just shift the interesting bits + up. */ +#ifdef __ARM_BIG_ENDIAN + sub result, result, data2, lsr #24 +#else + and data2, data2, #255 + lsrs result, result, #24 + subs result, result, data2 +#endif + RETURN + + +#if 0 + /* The assembly code below is based on the following alogrithm. */ +#ifdef __ARM_BIG_ENDIAN +#define RSHIFT << +#define LSHIFT >> +#else +#define RSHIFT >> +#define LSHIFT << +#endif + +#define body(shift) \ + mask = 0xffffffffU RSHIFT shift; \ + data1 = *src1++; \ + data2 = *src2++; \ + do \ + { \ + tmp2 = data1 & mask; \ + if (__builtin_expect(tmp2 != data2 RSHIFT shift, 0)) \ + { \ + data2 RSHIFT= shift; \ + break; \ + } \ + if (__builtin_expect(((data1 - b1) & ~data1) & (b1 << 7), 0)) \ + { \ + /* See comment in assembler below re syndrome on big-endian */\ + if ((((data1 - b1) & ~data1) & (b1 << 7)) & mask) \ + data2 RSHIFT= shift; \ + else \ + { \ + data2 = *src2; \ + tmp2 = data1 RSHIFT (32 - shift); \ + data2 = (data2 LSHIFT (32 - shift)) RSHIFT (32 - shift); \ + } \ + break; \ + } \ + data2 = *src2++; \ + tmp2 ^= data1; \ + if (__builtin_expect(tmp2 != data2 LSHIFT (32 - shift), 0)) \ + { \ + tmp2 = data1 >> (32 - shift); \ + data2 = (data2 << (32 - shift)) RSHIFT (32 - shift); \ + break; \ + } \ + data1 = *src1++; \ + } while (1) + + const unsigned* src1; + const unsigned* src2; + unsigned data1, data2; + unsigned mask; + unsigned shift; + unsigned b1 = 0x01010101; + char c1, c2; + unsigned tmp2; + + while (((unsigned) s1) & 3) + { + c1 = *s1++; + c2 = *s2++; + if (c1 == 0 || c1 != c2) + return c1 - (int)c2; + } + src1 = (unsigned*) (((unsigned)s1) & ~3); + src2 = (unsigned*) (((unsigned)s2) & ~3); + tmp2 = ((unsigned) s2) & 3; + if (tmp2 == 1) + { + body(8); + } + else if (tmp2 == 2) + { + body(16); + } + else + { + body (24); + } + + do + { +#ifdef __ARM_BIG_ENDIAN + c1 = (char) tmp2 >> 24; + c2 = (char) data2 >> 24; +#else /* not __ARM_BIG_ENDIAN */ + c1 = (char) tmp2; + c2 = (char) data2; +#endif /* not __ARM_BIG_ENDIAN */ + tmp2 RSHIFT= 8; + data2 RSHIFT= 8; + } while (c1 != 0 && c1 == c2); + return c1 - c2; +#endif /* 0 */ + + + /* First of all, compare bytes until src1(sp1) is word-aligned. 
*/ +.Lstrcmp_unaligned: + tst src1, #3 + beq 2f + ldrb data1, [src1], #1 + ldrb data2, [src2], #1 + cmp data1, #1 + it cs + cmpcs data1, data2 + beq .Lstrcmp_unaligned + sub result, data1, data2 + bx lr + +2: + stmfd sp!, {r5} + .cfi_def_cfa_offset 4 + .cfi_offset 5, -4 + + ldr data1, [src1], #4 + and tmp2, src2, #3 + bic src2, src2, #3 + ldr data2, [src2], #4 + cmp tmp2, #2 + beq .Loverlap2 + bhi .Loverlap1 + + /* Critical inner Loop: Block with 3 bytes initial overlap */ + .p2align 2 +.Loverlap3: + bic tmp2, data1, #MSB + cmp tmp2, data2, S2LO #8 + sub syndrome, data1, #0x01010101 + bic syndrome, syndrome, data1 + bne 4f + ands syndrome, syndrome, #0x80808080 + it eq + ldreq data2, [src2], #4 + bne 5f + eor tmp2, tmp2, data1 + cmp tmp2, data2, S2HI #24 + bne 6f + ldr data1, [src1], #4 + b .Loverlap3 +4: + S2LO data2, data2, #8 + b .Lstrcmp_tail + +5: +#ifdef __ARM_BIG_ENDIAN + /* The syndrome value may contain false ones if the string ends + with the bytes 0x01 0x00. */ + tst data1, #0xff000000 + itt ne + tstne data1, #0x00ff0000 + tstne data1, #0x0000ff00 + beq .Lstrcmp_done_equal +#else + bics syndrome, syndrome, #0xff000000 + bne .Lstrcmp_done_equal +#endif + ldrb data2, [src2] + S2LO tmp2, data1, #24 +#ifdef __ARM_BIG_ENDIAN + lsl data2, data2, #24 +#endif + b .Lstrcmp_tail + +6: + S2LO tmp2, data1, #24 + and data2, data2, #LSB + b .Lstrcmp_tail + + /* Critical inner Loop: Block with 2 bytes initial overlap. */ + .p2align 2 +.Loverlap2: + S2HI tmp2, data1, #16 + sub syndrome, data1, #0x01010101 + S2LO tmp2, tmp2, #16 + bic syndrome, syndrome, data1 + cmp tmp2, data2, S2LO #16 + bne 4f + ands syndrome, syndrome, #0x80808080 + it eq + ldreq data2, [src2], #4 + bne 5f + eor tmp2, tmp2, data1 + cmp tmp2, data2, S2HI #16 + bne 6f + ldr data1, [src1], #4 + b .Loverlap2 + +5: +#ifdef __ARM_BIG_ENDIAN + /* The syndrome value may contain false ones if the string ends + with the bytes 0x01 0x00 */ + tst data1, #0xff000000 + it ne + tstne data1, #0x00ff0000 + beq .Lstrcmp_done_equal +#else + lsls syndrome, syndrome, #16 + bne .Lstrcmp_done_equal +#endif + ldrh data2, [src2] + S2LO tmp2, data1, #16 +#ifdef __ARM_BIG_ENDIAN + lsl data2, data2, #16 +#endif + b .Lstrcmp_tail + +6: + S2HI data2, data2, #16 + S2LO tmp2, data1, #16 +4: + S2LO data2, data2, #16 + b .Lstrcmp_tail + + /* Critical inner Loop: Block with 1 byte initial overlap. */ + .p2align 2 +.Loverlap1: + and tmp2, data1, #LSB + cmp tmp2, data2, S2LO #24 + sub syndrome, data1, #0x01010101 + bic syndrome, syndrome, data1 + bne 4f + ands syndrome, syndrome, #0x80808080 + it eq + ldreq data2, [src2], #4 + bne 5f + eor tmp2, tmp2, data1 + cmp tmp2, data2, S2HI #8 + bne 6f + ldr data1, [src1], #4 + b .Loverlap1 +4: + S2LO data2, data2, #24 + b .Lstrcmp_tail +5: + /* The syndrome value may contain false ones if the string ends + with the bytes 0x01 0x00. */ + tst data1, #LSB + beq .Lstrcmp_done_equal + ldr data2, [src2], #4 +6: + S2LO tmp2, data1, #8 + bic data2, data2, #MSB + b .Lstrcmp_tail +.Lstrcmp_done_equal: + mov result, #0 + .cfi_remember_state + ldmfd sp!, {r5} + .cfi_restore 5 + .cfi_def_cfa_offset 0 + RETURN + +.Lstrcmp_tail: + .cfi_restore_state + and r2, tmp2, #LSB + and result, data2, #LSB + cmp result, #1 + it cs + cmpcs result, r2 + itt eq + S2LOEQ tmp2, tmp2, #8 + S2LOEQ data2, data2, #8 + beq .Lstrcmp_tail + sub result, r2, result + ldmfd sp!, {r5} + .cfi_restore 5 + .cfi_def_cfa_offset 0 + RETURN + .cfi_endproc + .size strcmp, . 
- strcmp diff --git a/newlib/libc/machine/arm/strcmp.S b/newlib/libc/machine/arm/strcmp.S index f3e7387..1742322 100644 --- a/newlib/libc/machine/arm/strcmp.S +++ b/newlib/libc/machine/arm/strcmp.S @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 ARM Ltd + * Copyright (c) 2012-2014 ARM Ltd * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -26,769 +26,64 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +/* Wrapper for the various implementations of strcmp. */ + #include "arm_asm.h" -#ifdef __ARMEB__ -#define S2LOMEM lsl -#define S2LOMEMEQ lsleq -#define S2HIMEM lsr +#ifdef __ARM_BIG_ENDIAN +#define S2LO lsl +#define S2LOEQ lsleq +#define S2HI lsr #define MSB 0x000000ff #define LSB 0xff000000 #define BYTE0_OFFSET 24 #define BYTE1_OFFSET 16 #define BYTE2_OFFSET 8 #define BYTE3_OFFSET 0 -#else /* not __ARMEB__ */ -#define S2LOMEM lsr -#define S2LOMEMEQ lsreq -#define S2HIMEM lsl +#else /* not __ARM_BIG_ENDIAN */ +#define S2LO lsr +#define S2LOEQ lsreq +#define S2HI lsl #define BYTE0_OFFSET 0 #define BYTE1_OFFSET 8 #define BYTE2_OFFSET 16 #define BYTE3_OFFSET 24 #define MSB 0xff000000 #define LSB 0x000000ff -#endif /* not __ARMEB__ */ - -.syntax unified - -#if defined (__thumb__) - .thumb - .thumb_func -#if !defined (__thumb2__) - /* If we have thumb1 only, we need to explictly mark the - compatibility. */ - .arch armv4t - .eabi_attribute Tag_also_compatible_with, "\006\013" /* v6-M. */ - .eabi_attribute Tag_ARM_ISA_use, 0 -#endif -#endif - .global strcmp - .type strcmp, %function -strcmp: - -#if (defined (__thumb__) && !defined (__thumb2__)) -1: - ldrb r2, [r0] - ldrb r3, [r1] - adds r0, r0, #1 - adds r1, r1, #1 - cmp r2, #0 - beq 2f - cmp r2, r3 - beq 1b -2: - subs r0, r2, r3 - bx lr -#elif (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) -1: - ldrb r2, [r0], #1 - ldrb r3, [r1], #1 - cmp r2, #1 - it cs - cmpcs r2, r3 - beq 1b - subs r0, r2, r3 - RETURN - - -#elif (defined (_ISA_THUMB_2) || defined (_ISA_ARM_6)) - /* Use LDRD whenever possible. */ - -/* The main thing to look out for when comparing large blocks is that - the loads do not cross a page boundary when loading past the index - of the byte with the first difference or the first string-terminator. - - For example, if the strings are identical and the string-terminator - is at index k, byte by byte comparison will not load beyond address - s1+k and s2+k; word by word comparison may load up to 3 bytes beyond - k; double word - up to 7 bytes. If the load of these bytes crosses - a page boundary, it might cause a memory fault (if the page is not mapped) - that would not have happened in byte by byte comparison. - - If an address is (double) word aligned, then a load of a (double) word - from that address will not cross a page boundary. - Therefore, the algorithm below considers word and double-word alignment - of strings separately. */ - -/* High-level description of the algorithm. - - * The fast path: if both strings are double-word aligned, - use LDRD to load two words from each string in every loop iteration. - * If the strings have the same offset from a word boundary, - use LDRB to load and compare byte by byte until - the first string is aligned to a word boundary (at most 3 bytes). - This is optimized for quick return on short unaligned strings. - * If the strings have the same offset from a double-word boundary, - use LDRD to load two words from each string in every loop iteration, as in the fast path. 
- * If the strings do not have the same offset from a double-word boundary, - load a word from the second string before the loop to initialize the queue. - Use LDRD to load two words from every string in every loop iteration. - Inside the loop, load the second word from the second string only after comparing - the first word, using the queued value, to guarantee safety across page boundaries. - * If the strings do not have the same offset from a word boundary, - use LDR and a shift queue. Order of loads and comparisons matters, - similarly to the previous case. - - * Use UADD8 and SEL to compare words, and use REV and CLZ to compute the return value. - * The only difference between ARM and Thumb modes is the use of CBZ instruction. - * The only difference between big and little endian is the use of REV in little endian - to compute the return value, instead of MOV. - * No preload. [TODO.] -*/ - - .macro m_cbz reg label -#ifdef __thumb2__ - cbz \reg, \label -#else /* not defined __thumb2__ */ - cmp \reg, #0 - beq \label -#endif /* not defined __thumb2__ */ - .endm /* m_cbz */ - - .macro m_cbnz reg label -#ifdef __thumb2__ - cbnz \reg, \label -#else /* not defined __thumb2__ */ - cmp \reg, #0 - bne \label -#endif /* not defined __thumb2__ */ - .endm /* m_cbnz */ - - .macro init - /* Macro to save temporary registers and prepare magic values. */ - subs sp, sp, #16 - strd r4, r5, [sp, #8] - strd r6, r7, [sp] - mvn r6, #0 /* all F */ - mov r7, #0 /* all 0 */ - .endm /* init */ - - .macro magic_compare_and_branch w1 w2 label - /* Macro to compare registers w1 and w2 and conditionally branch to label. */ - cmp \w1, \w2 /* Are w1 and w2 the same? */ - magic_find_zero_bytes \w1 - it eq - cmpeq ip, #0 /* Is there a zero byte in w1? */ - bne \label - .endm /* magic_compare_and_branch */ - - .macro magic_find_zero_bytes w1 - /* Macro to find all-zero bytes in w1, result is in ip. */ -#if (defined (__ARM_FEATURE_DSP)) - uadd8 ip, \w1, r6 - sel ip, r7, r6 -#else /* not defined (__ARM_FEATURE_DSP) */ - /* __ARM_FEATURE_DSP is not defined for some Cortex-M processors. - Coincidently, these processors only have Thumb-2 mode, where we can use the - the (large) magic constant available directly as an immediate in instructions. - Note that we cannot use the magic constant in ARM mode, where we need - to create the constant in a register. */ - sub ip, \w1, #0x01010101 - bic ip, ip, \w1 - and ip, ip, #0x80808080 -#endif /* not defined (__ARM_FEATURE_DSP) */ - .endm /* magic_find_zero_bytes */ - - .macro setup_return w1 w2 -#ifdef __ARMEB__ - mov r1, \w1 - mov r2, \w2 -#else /* not __ARMEB__ */ - rev r1, \w1 - rev r2, \w2 -#endif /* not __ARMEB__ */ - .endm /* setup_return */ - - /* - optpld r0, #0 - optpld r1, #0 - */ - - /* Are both strings double-word aligned? */ - orr ip, r0, r1 - tst ip, #7 - bne .Ldo_align - - /* Fast path. */ - init - -.Ldoubleword_aligned: - - /* Get here when the strings to compare are double-word aligned. */ - /* Compare two words in every iteration. */ - .p2align 2 -2: - /* - optpld r0, #16 - optpld r1, #16 - */ - - /* Load the next double-word from each string. */ - ldrd r2, r3, [r0], #8 - ldrd r4, r5, [r1], #8 - - magic_compare_and_branch w1=r2, w2=r4, label=.Lreturn_24 - magic_compare_and_branch w1=r3, w2=r5, label=.Lreturn_35 - b 2b - -.Ldo_align: - /* Is the first string word-aligned? */ - ands ip, r0, #3 - beq .Lword_aligned_r0 - - /* Fast compare byte by byte until the first string is word-aligned. */ - /* The offset of r0 from a word boundary is in ip. 
Thus, the number of bytes - to read until the next word boudnary is 4-ip. */ - bic r0, r0, #3 - ldr r2, [r0], #4 - lsls ip, ip, #31 - beq .Lbyte2 - bcs .Lbyte3 - -.Lbyte1: - ldrb ip, [r1], #1 - uxtb r3, r2, ror #BYTE1_OFFSET - subs ip, r3, ip - bne .Lfast_return - m_cbz reg=r3, label=.Lfast_return - -.Lbyte2: - ldrb ip, [r1], #1 - uxtb r3, r2, ror #BYTE2_OFFSET - subs ip, r3, ip - bne .Lfast_return - m_cbz reg=r3, label=.Lfast_return - -.Lbyte3: - ldrb ip, [r1], #1 - uxtb r3, r2, ror #BYTE3_OFFSET - subs ip, r3, ip - bne .Lfast_return - m_cbnz reg=r3, label=.Lword_aligned_r0 - -.Lfast_return: - mov r0, ip - bx lr - -.Lword_aligned_r0: - init - /* The first string is word-aligned. */ - /* Is the second string word-aligned? */ - ands ip, r1, #3 - bne .Lstrcmp_unaligned - -.Lword_aligned: - /* The strings are word-aligned. */ - /* Is the first string double-word aligned? */ - tst r0, #4 - beq .Ldoubleword_aligned_r0 - - /* If r0 is not double-word aligned yet, align it by loading - and comparing the next word from each string. */ - ldr r2, [r0], #4 - ldr r4, [r1], #4 - magic_compare_and_branch w1=r2 w2=r4 label=.Lreturn_24 - -.Ldoubleword_aligned_r0: - /* Get here when r0 is double-word aligned. */ - /* Is r1 doubleword_aligned? */ - tst r1, #4 - beq .Ldoubleword_aligned - - /* Get here when the strings to compare are word-aligned, - r0 is double-word aligned, but r1 is not double-word aligned. */ - - /* Initialize the queue. */ - ldr r5, [r1], #4 - - /* Compare two words in every iteration. */ - .p2align 2 -3: - /* - optpld r0, #16 - optpld r1, #16 - */ - - /* Load the next double-word from each string and compare. */ - ldrd r2, r3, [r0], #8 - magic_compare_and_branch w1=r2 w2=r5 label=.Lreturn_25 - ldrd r4, r5, [r1], #8 - magic_compare_and_branch w1=r3 w2=r4 label=.Lreturn_34 - b 3b +#endif /* not __ARM_BIG_ENDIAN */ - .macro miscmp_word offsetlo offsethi - /* Macro to compare misaligned strings. */ - /* r0, r1 are word-aligned, and at least one of the strings - is not double-word aligned. */ - /* Compare one word in every loop iteration. */ - /* OFFSETLO is the original bit-offset of r1 from a word-boundary, - OFFSETHI is 32 - OFFSETLO (i.e., offset from the next word). */ + .macro def_fn f p2align=0 + .text + .p2align \p2align + .global \f + .type \f, %function +\f: + .endm - /* Initialize the shift queue. */ - ldr r5, [r1], #4 +#if defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) \ + || (__ARM_ARCH == 6 && __ARM_ARCH_PROFILE == 'M') - /* Compare one word from each string in every loop iteration. */ - .p2align 2 -7: - ldr r3, [r0], #4 - S2LOMEM r5, r5, #\offsetlo - magic_find_zero_bytes w1=r3 - cmp r7, ip, S2HIMEM #\offsetlo - and r2, r3, r6, S2LOMEM #\offsetlo - it eq - cmpeq r2, r5 - bne .Lreturn_25 - ldr r5, [r1], #4 - cmp ip, #0 - eor r3, r2, r3 - S2HIMEM r2, r5, #\offsethi - it eq - cmpeq r3, r2 - bne .Lreturn_32 - b 7b - .endm /* miscmp_word */ +# if defined (__thumb__) && !defined (__thumb2__) +/* Thumb1 only variant. */ +# include "strcmp-armv4t.S" +# else +# include "strcmp-arm-tiny.S" +# endif -.Lstrcmp_unaligned: - /* r0 is word-aligned, r1 is at offset ip from a word. */ - /* Align r1 to the (previous) word-boundary. */ - bic r1, r1, #3 +#elif __ARM_ARCH >= 7 - /* Unaligned comparison word by word using LDRs. */ - cmp ip, #2 - beq .Lmiscmp_word_16 /* If ip == 2. */ - bge .Lmiscmp_word_24 /* If ip == 3. */ - miscmp_word offsetlo=8 offsethi=24 /* If ip == 1. 
*/ -.Lmiscmp_word_16: miscmp_word offsetlo=16 offsethi=16 -.Lmiscmp_word_24: miscmp_word offsetlo=24 offsethi=8 +# ifdef __ARM_FEATURE_SIMD32 +# include "strcmp-armv7.S" +# else +# include "strcmp-armv7m.S" +# endif +#elif __ARM_ARCH >= 6 -.Lreturn_32: - setup_return w1=r3, w2=r2 - b .Ldo_return -.Lreturn_34: - setup_return w1=r3, w2=r4 - b .Ldo_return -.Lreturn_25: - setup_return w1=r2, w2=r5 - b .Ldo_return -.Lreturn_35: - setup_return w1=r3, w2=r5 - b .Ldo_return -.Lreturn_24: - setup_return w1=r2, w2=r4 +# include "strcmp-armv6.S" -.Ldo_return: - -#ifdef __ARMEB__ - mov r0, ip -#else /* not __ARMEB__ */ - rev r0, ip -#endif /* not __ARMEB__ */ - - /* Restore temporaries early, before computing the return value. */ - ldrd r6, r7, [sp] - ldrd r4, r5, [sp, #8] - adds sp, sp, #16 - - /* There is a zero or a different byte between r1 and r2. */ - /* r0 contains a mask of all-zero bytes in r1. */ - /* Using r0 and not ip here because cbz requires low register. */ - m_cbz reg=r0, label=.Lcompute_return_value - clz r0, r0 - /* r0 contains the number of bits on the left of the first all-zero byte in r1. */ - rsb r0, r0, #24 - /* Here, r0 contains the number of bits on the right of the first all-zero byte in r1. */ - lsr r1, r1, r0 - lsr r2, r2, r0 - -.Lcompute_return_value: - movs r0, #1 - cmp r1, r2 - /* The return value is computed as follows. - If r1>r2 then (C==1 and Z==0) and LS doesn't hold and r0 is #1 at return. - If r1<r2 then (C==0 and Z==0) and we execute SBC with carry_in=0, - which means r0:=r0-r0-1 and r0 is #-1 at return. - If r1=r2 then (C==1 and Z==1) and we execute SBC with carry_in=1, - which means r0:=r0-r0 and r0 is #0 at return. - (C==0 and Z==1) cannot happen because the carry bit is "not borrow". */ - it ls - sbcls r0, r0, r0 - bx lr - - -#else /* !(defined (_ISA_THUMB_2) || defined (_ISA_ARM_6) - defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || - (defined (__thumb__) && !defined (__thumb2__))) */ - - /* Use LDR whenever possible. */ - -#ifdef __thumb2__ -#define magic1(REG) 0x01010101 -#define magic2(REG) 0x80808080 #else -#define magic1(REG) REG -#define magic2(REG) REG, lsl #7 -#endif - - optpld r0 - optpld r1 - eor r2, r0, r1 - tst r2, #3 - /* Strings not at same byte offset from a word boundary. */ - bne .Lstrcmp_unaligned - ands r2, r0, #3 - bic r0, r0, #3 - bic r1, r1, #3 - ldr ip, [r0], #4 - it eq - ldreq r3, [r1], #4 - beq 1f - /* Although s1 and s2 have identical initial alignment, they are - not currently word aligned. Rather than comparing bytes, - make sure that any bytes fetched from before the addressed - bytes are forced to 0xff. Then they will always compare - equal. */ - eor r2, r2, #3 - lsl r2, r2, #3 - mvn r3, MSB - S2LOMEM r2, r3, r2 - ldr r3, [r1], #4 - orr ip, ip, r2 - orr r3, r3, r2 -1: -#ifndef __thumb2__ - /* Load the 'magic' constant 0x01010101. */ - str r4, [sp, #-4]! - mov r4, #1 - orr r4, r4, r4, lsl #8 - orr r4, r4, r4, lsl #16 -#endif - .p2align 2 -4: - optpld r0, #8 - optpld r1, #8 - sub r2, ip, magic1(r4) - cmp ip, r3 - itttt eq - /* check for any zero bytes in first word */ - biceq r2, r2, ip - tsteq r2, magic2(r4) - ldreq ip, [r0], #4 - ldreq r3, [r1], #4 - beq 4b -2: - /* There's a zero or a different byte in the word */ - S2HIMEM r0, ip, #24 - S2LOMEM ip, ip, #8 - cmp r0, #1 - it cs - cmpcs r0, r3, S2HIMEM #24 - it eq - S2LOMEMEQ r3, r3, #8 - beq 2b - /* On a big-endian machine, r0 contains the desired byte in bits - 0-7; on a little-endian machine they are in bits 24-31. 
In - both cases the other bits in r0 are all zero. For r3 the - interesting byte is at the other end of the word, but the - other bits are not necessarily zero. We need a signed result - representing the differnece in the unsigned bytes, so for the - little-endian case we can't just shift the interesting bits - up. */ -#ifdef __ARMEB__ - sub r0, r0, r3, lsr #24 -#else - and r3, r3, #255 -#ifdef __thumb2__ - /* No RSB instruction in Thumb2 */ - lsr r0, r0, #24 - sub r0, r0, r3 -#else - rsb r0, r3, r0, lsr #24 -#endif -#endif -#ifndef __thumb2__ - ldr r4, [sp], #4 -#endif - RETURN +# include "strcmp-armv4.S" -.Lstrcmp_unaligned: - -#if 0 - /* The assembly code below is based on the following alogrithm. */ -#ifdef __ARMEB__ -#define RSHIFT << -#define LSHIFT >> -#else -#define RSHIFT >> -#define LSHIFT << #endif - -#define body(shift) \ - mask = 0xffffffffU RSHIFT shift; \ - w1 = *wp1++; \ - w2 = *wp2++; \ - do \ - { \ - t1 = w1 & mask; \ - if (__builtin_expect(t1 != w2 RSHIFT shift, 0)) \ - { \ - w2 RSHIFT= shift; \ - break; \ - } \ - if (__builtin_expect(((w1 - b1) & ~w1) & (b1 << 7), 0)) \ - { \ - /* See comment in assembler below re syndrome on big-endian */\ - if ((((w1 - b1) & ~w1) & (b1 << 7)) & mask) \ - w2 RSHIFT= shift; \ - else \ - { \ - w2 = *wp2; \ - t1 = w1 RSHIFT (32 - shift); \ - w2 = (w2 LSHIFT (32 - shift)) RSHIFT (32 - shift); \ - } \ - break; \ - } \ - w2 = *wp2++; \ - t1 ^= w1; \ - if (__builtin_expect(t1 != w2 LSHIFT (32 - shift), 0)) \ - { \ - t1 = w1 >> (32 - shift); \ - w2 = (w2 << (32 - shift)) RSHIFT (32 - shift); \ - break; \ - } \ - w1 = *wp1++; \ - } while (1) - - const unsigned* wp1; - const unsigned* wp2; - unsigned w1, w2; - unsigned mask; - unsigned shift; - unsigned b1 = 0x01010101; - char c1, c2; - unsigned t1; - - while (((unsigned) s1) & 3) - { - c1 = *s1++; - c2 = *s2++; - if (c1 == 0 || c1 != c2) - return c1 - (int)c2; - } - wp1 = (unsigned*) (((unsigned)s1) & ~3); - wp2 = (unsigned*) (((unsigned)s2) & ~3); - t1 = ((unsigned) s2) & 3; - if (t1 == 1) - { - body(8); - } - else if (t1 == 2) - { - body(16); - } - else - { - body (24); - } - - do - { -#ifdef __ARMEB__ - c1 = (char) t1 >> 24; - c2 = (char) w2 >> 24; -#else /* not __ARMEB__ */ - c1 = (char) t1; - c2 = (char) w2; -#endif /* not __ARMEB__ */ - t1 RSHIFT= 8; - w2 RSHIFT= 8; - } while (c1 != 0 && c1 == c2); - return c1 - c2; -#endif /* 0 */ - - - wp1 .req r0 - wp2 .req r1 - b1 .req r2 - w1 .req r4 - w2 .req r5 - t1 .req ip - @ r3 is scratch - - /* First of all, compare bytes until wp1(sp1) is word-aligned. */ -1: - tst wp1, #3 - beq 2f - ldrb r2, [wp1], #1 - ldrb r3, [wp2], #1 - cmp r2, #1 - it cs - cmpcs r2, r3 - beq 1b - sub r0, r2, r3 - RETURN - -2: - str r5, [sp, #-4]! - str r4, [sp, #-4]! 
- //stmfd sp!, {r4, r5} - mov b1, #1 - orr b1, b1, b1, lsl #8 - orr b1, b1, b1, lsl #16 - - and t1, wp2, #3 - bic wp2, wp2, #3 - ldr w1, [wp1], #4 - ldr w2, [wp2], #4 - cmp t1, #2 - beq 2f - bhi 3f - - /* Critical inner Loop: Block with 3 bytes initial overlap */ - .p2align 2 -1: - bic t1, w1, MSB - cmp t1, w2, S2LOMEM #8 - sub r3, w1, b1 - bic r3, r3, w1 - bne 4f - ands r3, r3, b1, lsl #7 - it eq - ldreq w2, [wp2], #4 - bne 5f - eor t1, t1, w1 - cmp t1, w2, S2HIMEM #24 - bne 6f - ldr w1, [wp1], #4 - b 1b -4: - S2LOMEM w2, w2, #8 - b 8f - -5: -#ifdef __ARMEB__ - /* The syndrome value may contain false ones if the string ends - with the bytes 0x01 0x00 */ - tst w1, #0xff000000 - itt ne - tstne w1, #0x00ff0000 - tstne w1, #0x0000ff00 - beq 7f -#else - bics r3, r3, #0xff000000 - bne 7f -#endif - ldrb w2, [wp2] - S2LOMEM t1, w1, #24 -#ifdef __ARMEB__ - lsl w2, w2, #24 -#endif - b 8f - -6: - S2LOMEM t1, w1, #24 - and w2, w2, LSB - b 8f - - /* Critical inner Loop: Block with 2 bytes initial overlap */ - .p2align 2 -2: - S2HIMEM t1, w1, #16 - sub r3, w1, b1 - S2LOMEM t1, t1, #16 - bic r3, r3, w1 - cmp t1, w2, S2LOMEM #16 - bne 4f - ands r3, r3, b1, lsl #7 - it eq - ldreq w2, [wp2], #4 - bne 5f - eor t1, t1, w1 - cmp t1, w2, S2HIMEM #16 - bne 6f - ldr w1, [wp1], #4 - b 2b - -5: -#ifdef __ARMEB__ - /* The syndrome value may contain false ones if the string ends - with the bytes 0x01 0x00 */ - tst w1, #0xff000000 - it ne - tstne w1, #0x00ff0000 - beq 7f -#else - lsls r3, r3, #16 - bne 7f -#endif - ldrh w2, [wp2] - S2LOMEM t1, w1, #16 -#ifdef __ARMEB__ - lsl w2, w2, #16 -#endif - b 8f - -6: - S2HIMEM w2, w2, #16 - S2LOMEM t1, w1, #16 -4: - S2LOMEM w2, w2, #16 - b 8f - - /* Critical inner Loop: Block with 1 byte initial overlap */ - .p2align 2 -3: - and t1, w1, LSB - cmp t1, w2, S2LOMEM #24 - sub r3, w1, b1 - bic r3, r3, w1 - bne 4f - ands r3, r3, b1, lsl #7 - it eq - ldreq w2, [wp2], #4 - bne 5f - eor t1, t1, w1 - cmp t1, w2, S2HIMEM #8 - bne 6f - ldr w1, [wp1], #4 - b 3b -4: - S2LOMEM w2, w2, #24 - b 8f -5: - /* The syndrome value may contain false ones if the string ends - with the bytes 0x01 0x00 */ - tst w1, LSB - beq 7f - ldr w2, [wp2], #4 -6: - S2LOMEM t1, w1, #8 - bic w2, w2, MSB - b 8f -7: - mov r0, #0 - //ldmfd sp!, {r4, r5} - ldr r4, [sp], #4 - ldr r5, [sp], #4 - RETURN -8: - and r2, t1, LSB - and r0, w2, LSB - cmp r0, #1 - it cs - cmpcs r0, r2 - itt eq - S2LOMEMEQ t1, t1, #8 - S2LOMEMEQ w2, w2, #8 - beq 8b - sub r0, r2, r0 - //ldmfd sp!, {r4, r5} - ldr r4, [sp], #4 - ldr r5, [sp], #4 - RETURN - -#endif /* !(defined (_ISA_THUMB_2) || defined (_ISA_ARM_6) - defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || - (defined (__thumb__) && !defined (__thumb2__))) */ |
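
The comments in the deleted code above describe the word-at-a-time NUL detection that both the old strcmp and several of the new per-architecture files rely on: where the DSP extensions are available the assembly uses UADD8/SEL, otherwise it builds the classic 0x01010101/0x80808080 magic constants. As a rough illustration only (not part of this commit; the names null_byte_mask and strcmp_aligned_sketch are made up for the example), the same idea looks like this in C for the easy case where both inputs are already word aligned:

    #include <stdint.h>

    /* Nonzero iff some byte of X is 0x00.  The "& ~x" term removes the
       false positives that a plain (x - 0x01010101) & 0x80808080 test
       would report for bytes in the range 0x80..0xff.  */
    static inline uint32_t
    null_byte_mask (uint32_t x)
    {
      return (x - 0x01010101u) & ~x & 0x80808080u;
    }

    /* Sketch of the aligned fast path: compare a word at a time until a
       difference or a terminator shows up, then resolve the result byte
       by byte (the assembly uses REV/CLZ for this step instead).  Each
       load stays inside one aligned word, which is why the page-boundary
       argument in the comments above holds.  Misaligned inputs would
       need the byte/shift preambles implemented by the assembly.  */
    static int
    strcmp_aligned_sketch (const char *s1, const char *s2)
    {
      const uint32_t *w1 = (const uint32_t *) s1;
      const uint32_t *w2 = (const uint32_t *) s2;
      uint32_t a, b;

      do
        {
          a = *w1++;
          b = *w2++;
        }
      while (a == b && !null_byte_mask (a));

      /* A difference or a NUL terminator lies somewhere in this word.  */
      const unsigned char *p1 = (const unsigned char *) (w1 - 1);
      const unsigned char *p2 = (const unsigned char *) (w2 - 1);
      for (int i = 0; i < 4; i++)
        if (p1[i] != p2[i] || p1[i] == 0)
          return p1[i] - p2[i];
      return 0;  /* Not reached: the loop above only exits on a
                    difference or a NUL inside this word.  */
    }

The UADD8/SEL form used when __ARM_FEATURE_DSP is defined computes the same byte mask in two instructions; the new wrapper similarly keys its ARMv7 choice on __ARM_FEATURE_SIMD32, selecting strcmp-armv7.S when those instructions are available and falling back to strcmp-armv7m.S otherwise.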