From 8f5ca04bc7fd53741d80117df992995ace8f6d2d Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 16 Oct 1995 01:37:51 +0000 Subject: Sat Oct 14 02:52:36 1995 Ulrich Drepper * malloc/malloc.c (_malloc_internal): Performance fix. Move if statement out of loop. * stdio/_itoa.c, stdio/_itoa.h: Complete rewrite. Much faster implementation using GMP functions. Contributed by Torbjorn Granlund and Ulrich Drepper. * stdio/test_rdwr.c: Include . * sysdeps/i386/i586/Implies: New file. New highly optimized string functions for i[345]86. * sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files. * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files. * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files. * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files. * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files. * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files. * sysdeps/i386/i586/strlen.S: New file. * sysdeps/i386/memchr.c: Removed. There is now an assembler version. * sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did not correspond to used values. * sysdeps/unix/sysv/linux/nfs/nfs.h: New file. Simply a wrapper around a kernel header file. * sysdeps/unix/sysv/linux/Dist: Add it. * sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers): Likewise. * sysdeps/unix/sysv/linux/local_lim.h: Rewrite. Instead of defining ourself we use a kernel header file. * sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system call handler for i586. * sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up. Sat Oct 14 02:52:36 1995 Ulrich Drepper * malloc/malloc.c (_malloc_internal): Performance fix. Move if statement out of loop. * stdio/_itoa.c, stdio/_itoa.h: Complete rewrite. Much faster implementation using GMP functions. Contributed by Torbjorn Granlund and Ulrich Drepper. * stdio/test_rdwr.c: Include . * sysdeps/i386/i586/Implies: New file. New highly optimized string functions for i[345]86. 
* sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files. * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files. * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files. * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files. * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files. * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files. * sysdeps/i386/i586/strlen.S: New file. * sysdeps/i386/memchr.c: Removed. There is now an assembler version. * sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did not correspond to used values. * sysdeps/unix/sysv/linux/nfs/nfs.h: New file. Simply a wrapper around a kernel header file. * sysdeps/unix/sysv/linux/Dist: Add it. * sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers): Likewise. * sysdeps/unix/sysv/linux/local_lim.h: Rewrite. Instead of defining ourself we use a kernel header file. * sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system call handler for i586. * sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up. 
--- sysdeps/i386/add_n.S | 18 ++- sysdeps/i386/gmp-mparam.h | 28 ++++ sysdeps/i386/i486/strcat.S | 260 +++++++++++++++++++++++++++++++++ sysdeps/i386/i486/strlen.S | 132 +++++++++++++++++ sysdeps/i386/i586/Implies | 2 + sysdeps/i386/i586/add_n.S | 136 ++++++++++++++++++ sysdeps/i386/i586/addmul_1.S | 84 +++++++++++ sysdeps/i386/i586/lshift.S | 213 +++++++++++++++++++++++++++ sysdeps/i386/i586/memcopy.h | 6 +- sysdeps/i386/i586/mul_1.S | 78 ++++++++++ sysdeps/i386/i586/rshift.S | 213 +++++++++++++++++++++++++++ sysdeps/i386/i586/strchr.S | 334 +++++++++++++++++++++++++++++++++++++++++++ sysdeps/i386/i586/strlen.S | 185 ++++++++++++++++++++++++ sysdeps/i386/i586/sub_n.S | 136 ++++++++++++++++++ sysdeps/i386/i586/submul_1.S | 82 +++++++++++ sysdeps/i386/memchr.S | 315 ++++++++++++++++++++++++++++++++++++++++ sysdeps/i386/memchr.c | 48 ------- sysdeps/i386/memcmp.S | 68 +++++++++ sysdeps/i386/stpcpy.S | 87 +++++++++++ sysdeps/i386/stpncpy.S | 143 ++++++++++++++++++ sysdeps/i386/strchr.S | 278 +++++++++++++++++++++++++++++++++++ sysdeps/i386/strcspn.S | 176 +++++++++++++++++++++++ sysdeps/i386/strpbrk.S | 177 +++++++++++++++++++++++ sysdeps/i386/strrchr.S | 321 +++++++++++++++++++++++++++++++++++++++++ sysdeps/i386/strspn.S | 176 +++++++++++++++++++++++ sysdeps/i386/sub_n.S | 26 ++-- 26 files changed, 3653 insertions(+), 69 deletions(-) create mode 100644 sysdeps/i386/gmp-mparam.h create mode 100644 sysdeps/i386/i486/strcat.S create mode 100644 sysdeps/i386/i486/strlen.S create mode 100644 sysdeps/i386/i586/Implies create mode 100644 sysdeps/i386/i586/add_n.S create mode 100644 sysdeps/i386/i586/addmul_1.S create mode 100644 sysdeps/i386/i586/lshift.S create mode 100644 sysdeps/i386/i586/mul_1.S create mode 100644 sysdeps/i386/i586/rshift.S create mode 100644 sysdeps/i386/i586/strchr.S create mode 100644 sysdeps/i386/i586/strlen.S create mode 100644 sysdeps/i386/i586/sub_n.S create mode 100644 sysdeps/i386/i586/submul_1.S create mode 100644 sysdeps/i386/memchr.S delete 
mode 100644 sysdeps/i386/memchr.c create mode 100644 sysdeps/i386/memcmp.S create mode 100644 sysdeps/i386/stpcpy.S create mode 100644 sysdeps/i386/stpncpy.S create mode 100644 sysdeps/i386/strchr.S create mode 100644 sysdeps/i386/strcspn.S create mode 100644 sysdeps/i386/strpbrk.S create mode 100644 sysdeps/i386/strrchr.S create mode 100644 sysdeps/i386/strspn.S (limited to 'sysdeps/i386') diff --git a/sysdeps/i386/add_n.S b/sysdeps/i386/add_n.S index c4e71ea..c3b3c3e 100644 --- a/sysdeps/i386/add_n.S +++ b/sysdeps/i386/add_n.S @@ -1,7 +1,7 @@ /* i80386 __mpn_add_n -- Add two limb vectors of the same length > 0 and store sum in a third limb vector. -Copyright (C) 1992, 1994 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -54,14 +54,18 @@ C_SYMBOL_NAME(__mpn_add_n:) subl %eax,%edx /* ... enter the loop */ shrl $2,%eax /* restore previous value */ #ifdef PIC - call here -here: leal (Loop - 3 - here)(%eax,%eax,8),%eax - addl %eax,(%esp) - ret +/* Calculate start address in loop for PIC. Due to limitations in some + assemblers, Loop-L0-3 cannot be put into the leal */ + call L0 +L0: leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $(Loop-L0-3),%eax + addl $4,%esp #else - leal (Loop - 3)(%eax,%eax,8),%eax /* calc start addr in loop */ - jmp *%eax /* jump into loop */ +/* Calculate start address in loop for non-PIC. */ + leal (Loop - 3)(%eax,%eax,8),%eax #endif + jmp *%eax /* jump into loop */ ALIGN (3) Loop: movl (%esi),%eax adcl (%edx),%eax diff --git a/sysdeps/i386/gmp-mparam.h b/sysdeps/i386/gmp-mparam.h new file mode 100644 index 0000000..687f12a --- /dev/null +++ b/sysdeps/i386/gmp-mparam.h @@ -0,0 +1,28 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +#define IEEE_DOUBLE_BIG_ENDIAN 0 diff --git a/sysdeps/i386/i486/strcat.S b/sysdeps/i386/i486/strcat.S new file mode 100644 index 0000000..e3d2181 --- /dev/null +++ b/sysdeps/i386/i486/strcat.S @@ -0,0 +1,260 @@ +/* strcat(dest, src) -- Append SRC on the end of DEST. +For Intel 80x86, x>=4. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper . +Optimised a little by Alan Modra +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. 
+ +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + dest (sp + 4) + src (sp + 8) + RESULT: dest, returned in %eax */ + + .text +ENTRY (strcat) + pushl %edi /* Save callee-saved register. */ + + movl 12(%esp), %ecx /* load source pointer */ + movl 8(%esp), %edx /* load destination pointer */ + + testb $0xff, (%ecx) /* Is source string empty? */ + jz L8 /* yes => return */ + + /* Test the first bytes separately until destination is aligned. */ + testb $3, %dl /* destination pointer aligned? */ + jz L1 /* yes => begin scan loop */ + testb $0xff, (%edx) /* is end of string? */ + jz L2 /* yes => start appending */ + incl %edx /* increment destination pointer */ + + testb $3, %dl /* destination pointer aligned? */ + jz L1 /* yes => begin scan loop */ + testb $0xff, (%edx) /* is end of string? */ + jz L2 /* yes => start appending */ + incl %edx /* increment destination pointer */ + + testb $3, %dl /* destination pointer aligned? */ + jz L1 /* yes => begin scan loop */ + testb $0xff, (%edx) /* is end of string? */ + jz L2 /* yes => start appending */ + incl %edx /* increment destination pointer */ + + /* Now we are aligned. Begin scan loop. */ + jmp L1 + + ALIGN(4) + +L4: addl $16,%edx /* increment destination pointer for round */ + +L1: movl (%edx), %eax /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + + /* If you compare this with the algorithm in memchr.S you will + notice that here is an `xorl' statement missing. But you must + not forget that we are looking for C == 0 and `xorl $0, %eax' + is a no-op. */ + + addl %eax, %edi /* add the magic value to the word. 
We get + carry bits reported for each byte which + is *not* 0 */ + + /* According to the algorithm we had to reverse the effect of the + XOR first and then test the overflow bits. But because the + following XOR would destroy the carry flag and it would (in a + representation with more than 32 bits) not alter the last + overflow, we can now test this condition. If no carry is signaled + no overflow must have occurred in the last byte => it was 0. */ + jnc L3 + + /* We are only interested in carry bits that change due to the + previous add, so remove original bits */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + + /* Now test for the other three overflow bits. */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + /* If at least one byte of the word is NUL we don't get 0 in %edi. */ + jnz L3 + + movl 4(%edx), %eax /* get word from destination */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L5 /* no carry => a NUL byte was found */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L5 /* one byte is NUL => stop scanning */ + + movl 8(%edx), %eax /* get word from destination */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L6 /* no carry => a NUL byte was found */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. 
*/ + jnz L6 /* one byte is NUL => stop scanning */ + + movl 12(%edx), %eax /* get word from destination */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L7 /* no carry => a NUL byte was found */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz L4 /* no byte is NUL => carry on scanning */ + +L7: addl $4, %edx /* adjust destination pointer */ +L6: addl $4, %edx +L5: addl $4, %edx + +L3: testb %al, %al /* is first byte NUL? */ + jz L2 /* yes => start copying */ + incl %edx /* increment destination pointer */ + + testb %ah, %ah /* is second byte NUL? */ + jz L2 /* yes => start copying */ + incl %edx /* increment destination pointer */ + + testl $0xff0000, %eax /* is third byte NUL? */ + jz L2 /* yes => start copying */ + incl %edx /* increment destination pointer */ + +L2: subl %ecx, %edx /* %edx = dest end - src, so (%ecx,%edx) addresses the destination */ + + /* Now we have to align the source pointer. */ + testb $3, %cl /* pointer correctly aligned? */ + jz L29 /* yes => start copy loop */ + movb (%ecx), %al /* get first byte */ + movb %al, (%ecx,%edx) /* and store it */ + andb %al, %al /* is byte NUL? (byte-wide: upper %eax is stale) */ + jz L8 /* yes => return */ + incl %ecx /* increment pointer */ + + testb $3, %cl /* pointer correctly aligned? */ + jz L29 /* yes => start copy loop */ + movb (%ecx), %al /* get first byte */ + movb %al, (%ecx,%edx) /* and store it */ + andb %al, %al /* is byte NUL? (byte-wide: upper %eax is stale) */ + jz L8 /* yes => return */ + incl %ecx /* increment pointer */ + + testb $3, %cl /* pointer correctly aligned? */ + jz L29 /* yes => start copy loop */ + movb (%ecx), %al /* get first byte */ + movb %al, (%ecx,%edx) /* and store it */ + andb %al, %al /* is byte NUL? (byte-wide: upper %eax is stale) */ + jz L8 /* yes => return */ + incl %ecx /* increment pointer */ + + /* Now we are aligned. 
*/ + jmp L29 /* start copy loop */ + + ALIGN(4) + +L28: movl %eax, 12(%ecx,%edx)/* store word at destination */ + addl $16, %ecx /* adjust pointer for full round */ + +L29: movl (%ecx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L9 /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L9 /* one byte is NUL => stop copying */ + movl %eax, (%ecx,%edx) /* store word to destination */ + + movl 4(%ecx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L91 /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L91 /* one byte is NUL => stop copying */ + movl %eax, 4(%ecx,%edx) /* store word to destination */ + + movl 8(%ecx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L92 /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. 
*/ + jnz L92 /* one byte is NUL => stop copying */ + movl %eax, 8(%ecx,%edx) /* store word to destination */ + + movl 12(%ecx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L93 /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz L28 /* no is NUL => carry on copying */ + +L93: addl $4, %ecx /* adjust pointer */ +L92: addl $4, %ecx +L91: addl $4, %ecx + +L9: movb %al, (%ecx,%edx) /* store first byte of last word */ + orb %al, %al /* is it NUL? */ + jz L8 /* yes => return */ + + movb %ah, 1(%ecx,%edx) /* store second byte of last word */ + orb %ah, %ah /* is it NUL? */ + jz L8 /* yes => return */ + + shrl $16, %eax /* make upper bytes accessible */ + movb %al, 2(%ecx,%edx) /* store third byte of last word */ + orb %al, %al /* is it NUL? */ + jz L8 /* yes => return */ + + movb %ah, 3(%ecx,%edx) /* store fourth byte of last word */ + +L8: movl 8(%esp), %eax /* start address of destination is result */ + popl %edi /* restore saved register */ + + ret diff --git a/sysdeps/i386/i486/strlen.S b/sysdeps/i386/i486/strlen.S new file mode 100644 index 0000000..276563b --- /dev/null +++ b/sysdeps/i386/i486/strlen.S @@ -0,0 +1,132 @@ +/* strlen(str) -- determine the length of the string STR. +Optimized for Intel 80x86, x>=4. +Copyright (C) 1991, 1992, 1993, 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper . +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. 
+ +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + str (sp + 4) + RESULT: length of str, returned in %eax */ + + .text +ENTRY (strlen) + movl 4(%esp), %ecx /* get string pointer */ + movl %ecx, %eax /* duplicate it */ + + andl $3, %ecx /* mask alignment bits */ + jz L1 /* aligned => start loop */ + cmpb %ch, (%eax) /* is byte NUL? (%ch is 0 after the mask) */ + je L2 /* yes => return */ + incl %eax /* increment pointer */ + + xorl $3, %ecx /* was alignment = 3? */ + jz L1 /* yes => now it is aligned and start loop */ + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + addl $1, %eax /* increment pointer */ + + subl $1, %ecx /* was alignment = 2? */ + jz L1 /* yes => now it is aligned and start loop */ + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + +/* Don't change the above `addl $1,%eax' and `subl $1, %ecx' into `incl %eax' + and `decl %ecx' resp. The additional two bytes per instruction make the + label L4 be aligned on a 16 byte boundary with nops. + + The following `subl $15, %eax' is part of this trick, too. Together with + the next instruction (`addl $16, %eax') it is in fact an `incl %eax', just + as expected from the algorithm. But doing so has the advantage that + no jump to label L1 is necessary and so the pipeline is not flushed. 
*/ + + subl $15, %eax /* effectively +1 */ + + +L4: addl $16, %eax /* adjust pointer for full loop */ + +L1: movl (%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edx /* magic value */ + addl %ecx, %edx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L3 /* highest byte is NUL => return pointer */ + xorl %ecx, %edx /* (word+magic)^word */ + orl $0xfefefeff, %edx /* set all non-carry bits */ + incl %edx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L3 /* found NUL => return pointer */ + + movl 4(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edx /* magic value */ + addl %ecx, %edx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L5 /* highest byte is NUL => return pointer */ + xorl %ecx, %edx /* (word+magic)^word */ + orl $0xfefefeff, %edx /* set all non-carry bits */ + incl %edx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L5 /* found NUL => return pointer */ + + movl 8(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edx /* magic value */ + addl %ecx, %edx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L6 /* highest byte is NUL => return pointer */ + xorl %ecx, %edx /* (word+magic)^word */ + orl $0xfefefeff, %edx /* set all non-carry bits */ + incl %edx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L6 /* found NUL => return pointer */ + + movl 12(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edx /* magic value */ + addl %ecx, %edx /* add the magic value to the word. 
We get + carry bits reported for each byte which + is *not* 0 */ + jnc L7 /* highest byte is NUL => return pointer */ + xorl %ecx, %edx /* (word+magic)^word */ + orl $0xfefefeff, %edx /* set all non-carry bits */ + incl %edx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz L4 /* no NUL found => continue loop */ + +L7: addl $4, %eax /* adjust pointer */ +L6: addl $4, %eax +L5: addl $4, %eax + +L3: testb %cl, %cl /* is first byte NUL? */ + jz L2 /* yes => return */ + incl %eax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz L2 /* yes => return */ + incl %eax /* increment pointer */ + + testl $0xff0000, %ecx /* is third byte NUL? */ + jz L2 /* yes => return pointer */ + incl %eax /* increment pointer */ + +L2: subl 4(%esp), %eax /* compute difference to string start */ + + ret diff --git a/sysdeps/i386/i586/Implies b/sysdeps/i386/i586/Implies new file mode 100644 index 0000000..477cd74 --- /dev/null +++ b/sysdeps/i386/i586/Implies @@ -0,0 +1,2 @@ +# Code optimized for i486 is better than simple i386 code. +i386/i486 diff --git a/sysdeps/i386/i586/add_n.S b/sysdeps/i386/i586/add_n.S new file mode 100644 index 0000000..9be45ed --- /dev/null +++ b/sysdeps/i386/i586/add_n.S @@ -0,0 +1,136 @@ +/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store + sum in a third limb vector. + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. 
+ +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s2_ptr (sp + 12) + size (sp + 16) +*/ + +#define r1 %eax +#define r2 %edx +#define src1 %esi +#define src2 %ebp +#define dst %edi +#define x %ebx + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_add_n) +C_SYMBOL_NAME(__mpn_add_n:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),dst /* res_ptr */ + movl 24(%esp),src1 /* s1_ptr */ + movl 28(%esp),src2 /* s2_ptr */ + movl 32(%esp),%ecx /* size */ + + movl (src2),x + + decl %ecx + movl %ecx,r2 + shrl $3,%ecx + andl $7,r2 + testl %ecx,%ecx /* zero carry flag */ + jz Lend + pushl r2 + + ALIGN (3) +Loop: movl 28(dst),%eax /* fetch destination cache line */ + leal 32(dst),dst + +L1: movl (src1),r1 + movl 4(src1),r2 + adcl x,r1 + movl 4(src2),x + adcl x,r2 + movl 8(src2),x + movl r1,-32(dst) + movl r2,-28(dst) + +L2: movl 8(src1),r1 + movl 12(src1),r2 + adcl x,r1 + movl 12(src2),x + adcl x,r2 + movl 16(src2),x + movl r1,-24(dst) + movl r2,-20(dst) + +L3: movl 16(src1),r1 + movl 20(src1),r2 + adcl x,r1 + movl 20(src2),x + adcl x,r2 + movl 24(src2),x + movl r1,-16(dst) + movl r2,-12(dst) + +L4: movl 24(src1),r1 + movl 28(src1),r2 + adcl x,r1 + movl 28(src2),x + adcl x,r2 + movl 32(src2),x + movl r1,-8(dst) + movl r2,-4(dst) + + leal 32(src1),src1 + leal 32(src2),src2 + decl %ecx + jnz Loop + + popl r2 +Lend: + decl r2 /* test r2 w/o clobbering carry */ + js Lend2 + incl r2 +Loop2: + leal 4(dst),dst + movl (src1),r1 + adcl x,r1 + movl 4(src2),x + movl r1,-4(dst) + leal 4(src1),src1 + leal 4(src2),src2 + decl r2 + jnz Loop2 +Lend2: + movl (src1),r1 + adcl x,r1 + movl r1,(dst) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff 
--git a/sysdeps/i386/i586/addmul_1.S b/sysdeps/i386/i586/addmul_1.S new file mode 100644 index 0000000..b222840 --- /dev/null +++ b/sysdeps/i386/i586/addmul_1.S @@ -0,0 +1,84 @@ +/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add + the result to a second limb vector. + +Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr edi +#define s1_ptr esi +#define s2_limb ebp + + TEXT + ALIGN (3) + GLOBL C_SYMBOL_NAME(__mpn_addmul_1) + .type C_SYMBOL_NAME(__mpn_addmul_1),@function +C_SYMBOL_NAME(__mpn_addmul_1:) + + INSN1(push,l ,R(edi)) + INSN1(push,l ,R(esi)) + INSN1(push,l ,R(ebx)) + INSN1(push,l ,R(ebp)) + + INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) + INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) + INSN2(mov,l ,R(ecx),MEM_DISP(esp,28)) + INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) + + INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,ecx,4)) + INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,ecx,4)) + INSN1(neg,l ,R(ecx)) + INSN2(xor,l ,R(edx),R(edx)) + ALIGN (3) +Loop: + INSN2(mov,l ,R(ebx),R(edx)) + INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,ecx,4)) + + INSN1(mul,l ,R(s2_limb)) + + INSN2(add,l ,R(eax),R(ebx)) + INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,ecx,4)) + + INSN2(adc,l ,R(edx),$0) + INSN2(add,l ,R(ebx),R(eax)) + + INSN2(adc,l ,R(edx),$0) + INSN2(mov,l ,MEM_INDEX(res_ptr,ecx,4),R(ebx)) + + INSN1(inc,l ,R(ecx)) + INSN1(jnz, ,Loop) + + + INSN2(mov,l ,R(eax),R(edx)) + INSN1(pop,l ,R(ebp)) + INSN1(pop,l ,R(ebx)) + INSN1(pop,l ,R(esi)) + INSN1(pop,l ,R(edi)) + ret +Lfe1: + .size C_SYMBOL_NAME(__mpn_addmul_1),Lfe1-C_SYMBOL_NAME(__mpn_addmul_1) diff --git a/sysdeps/i386/i586/lshift.S b/sysdeps/i386/i586/lshift.S new file mode 100644 index 0000000..b9f8131 --- /dev/null +++ b/sysdeps/i386/i586/lshift.S @@ -0,0 +1,213 @@ +/* Pentium optimized __mpn_lshift -- + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + size (sp + 12) + cnt (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_lshift) +C_SYMBOL_NAME(__mpn_lshift:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s_ptr */ + movl 28(%esp),%ebp /* size */ + movl 32(%esp),%ecx /* cnt */ + + cmp $1,%ecx + jne Lnormal + movl %edi,%eax + subl %esi,%eax + cmpl %ebp,%eax + jnc Lspecial + +Lnormal: + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + xorl %eax,%eax + shldl %cl,%edx,%eax /* compute carry limb */ + pushl %eax /* push carry limb onto stack */ + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz Lend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +Loop: movl -28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + shldl %cl,%eax,%ebx + shldl %cl,%edx,%eax + movl %ebx,(%edi) + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + shldl %cl,%ebx,%edx + shldl %cl,%eax,%ebx + movl %edx,-8(%edi) + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + shldl %cl,%edx,%eax + shldl %cl,%ebx,%edx + movl %eax,-16(%edi) + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + shldl %cl,%eax,%ebx + shldl %cl,%edx,%eax + movl %ebx,-24(%edi) + movl %eax,-28(%edi) + + subl $32,%esi + subl $32,%edi + decl %ebp + jnz Loop + 
+Lend: popl %ebp + andl $7,%ebp + jz Lend2 +Loop2: movl (%esi),%eax + shldl %cl,%eax,%edx + movl %edx,(%edi) + movl %eax,%edx + subl $4,%esi + subl $4,%edi + decl %ebp + jnz Loop2 + +Lend2: shll %cl,%edx /* compute least significant limb */ + movl %edx,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +/* We loop from least significant end of the arrays, which is only + permissible if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. + Only reached when cnt is 1 (see the test at function entry). */ + +Lspecial: + movl (%esi),%edx + addl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + addl %edx,%edx /* shift limb left by 1; CF = bit shifted out */ + incl %ebp + decl %ebp /* test %ebp w/o clobbering carry */ + jz LLend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +LLoop: movl 28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + adcl %eax,%eax + movl %ebx,(%edi) + adcl %edx,%edx + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + adcl %ebx,%ebx + movl %edx,8(%edi) + adcl %eax,%eax + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + adcl %edx,%edx + movl %eax,16(%edi) + adcl %ebx,%ebx + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %eax,%eax + movl %ebx,24(%edi) + adcl %edx,%edx + movl %eax,28(%edi) + + leal 32(%esi),%esi /* use leal not to clobber carry */ + leal 32(%edi),%edi + decl %ebp + jnz LLoop + +LLend: popl %ebp + sbbl %eax,%eax /* save carry in %eax */ + andl $7,%ebp + jz LLend2 + addl %eax,%eax /* restore carry from eax */ +LLoop2: movl %edx,%ebx + movl (%esi),%edx + adcl %edx,%edx + movl %ebx,(%edi) + + leal 4(%esi),%esi /* use leal not to clobber carry */ + leal 4(%edi),%edi + decl %ebp + jnz LLoop2 + + jmp LL1 +LLend2: addl %eax,%eax /* restore carry from eax */ +LL1: movl %edx,(%edi) /* store last limb */ + + sbbl %eax,%eax /* %eax = -CF (0 or -1) */ + negl %eax /* return the carry-out bit (0 or 1) */ + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git 
a/sysdeps/i386/i586/memcopy.h b/sysdeps/i386/i586/memcopy.h index a9bb9e7..0a87687 100644 --- a/sysdeps/i386/i586/memcopy.h +++ b/sysdeps/i386/i586/memcopy.h @@ -1,5 +1,5 @@ /* memcopy.h -- definitions for memory copy functions. Pentium version. - Copyright (C) 1994 Free Software Foundation, Inc. + Copyright (C) 1994, 1995 Free Software Foundation, Inc. Contributed by Torbjorn Granlund (tege@sics.se). This file is part of the GNU C Library. @@ -88,7 +88,7 @@ Cambridge, MA 02139, USA. */ "subl $32,%2\n" \ "jns 1b\n" \ "2: addl $32,%2" : \ - "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) : \ - "0" (dst_bp), "1" (src_bp), "2" (nbytes) : \ + "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) : \ + "0" (dst_ep), "1" (src_ep), "2" (nbytes) : \ "ax", "dx"); \ } while (0) diff --git a/sysdeps/i386/i586/mul_1.S b/sysdeps/i386/i586/mul_1.S new file mode 100644 index 0000000..2b7258e --- /dev/null +++ b/sysdeps/i386/i586/mul_1.S @@ -0,0 +1,78 @@ +/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store + the result in a second limb vector. + +Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) destination limb vector + s1_ptr (sp + 8) source limb vector + size (sp + 12) number of 32-bit limbs + s2_limb (sp + 16) the single limb every source limb is multiplied by +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebp + + TEXT + ALIGN (3) + GLOBL C_SYMBOL_NAME(__mpn_mul_1) +C_SYMBOL_NAME(__mpn_mul_1:) + + INSN1(push,l ,R(edi)) + INSN1(push,l ,R(esi)) + INSN1(push,l ,R(ebx)) + INSN1(push,l ,R(ebp)) + + INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) + INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) + INSN2(mov,l ,R(size),MEM_DISP(esp,28)) + INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) + + INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) /* point past the end of both vectors ... */ + INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) + INSN1(neg,l ,R(size)) /* ... and index them with -size .. -1 */ + INSN2(xor,l ,R(edx),R(edx)) /* incoming carry limb = 0 */ + ALIGN (3) +Loop: + INSN2(mov,l ,R(ebx),R(edx)) /* ebx = carry limb from the previous round */ + INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) + + INSN1(mul,l ,R(s2_limb)) /* edx:eax = source limb * s2_limb */ + + INSN2(add,l ,R(eax),R(ebx)) /* add previous carry to the low half */ + + INSN2(adc,l ,R(edx),$0) /* propagate into the high half = next carry */ + INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(eax)) + + INSN1(inc,l ,R(size)) + INSN1(jnz, ,Loop) /* until the negative index reaches 0 */ + + + INSN2(mov,l ,R(eax),R(edx)) /* return the final carry limb */ + INSN1(pop,l ,R(ebp)) + INSN1(pop,l ,R(ebx)) + INSN1(pop,l ,R(esi)) + INSN1(pop,l ,R(edi)) + ret diff --git a/sysdeps/i386/i586/rshift.S b/sysdeps/i386/i586/rshift.S new file mode 100644 index 0000000..51cde8f --- /dev/null +++ b/sysdeps/i386/i586/rshift.S @@ -0,0 +1,213 @@ +/* Pentium optimized __mpn_rshift -- + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) destination limb vector + s_ptr (sp + 8) source limb vector + size (sp + 12) number of 32-bit limbs + cnt (sp + 16) shift count in bits + RETURN VALUE (%eax): bits shifted out of the least significant limb, in the top bits */ + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_rshift) +C_SYMBOL_NAME(__mpn_rshift:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s_ptr */ + movl 28(%esp),%ebp /* size */ + movl 32(%esp),%ecx /* cnt */ + + cmp $1,%ecx /* shift by exactly one bit? */ + jne Lnormal /* no => general shrd loop */ + leal 4(%edi),%eax + cmpl %esi,%eax + jnc Lspecial /* jump if res_ptr + 1 >= s_ptr */ + leal (%edi,%ebp,4),%eax + cmpl %eax,%esi + jnc Lspecial /* jump if s_ptr >= res_ptr + size */ + +Lnormal: /* general case: loop from the least significant limb upwards */ + movl (%esi),%edx + addl $4,%esi + xorl %eax,%eax + shrdl %cl,%edx,%eax /* compute carry limb */ + pushl %eax /* push carry limb onto stack */ + + decl %ebp + pushl %ebp /* save size - 1 for the tail loop */ + shrl $3,%ebp /* number of 8-limb rounds */ + jz Lend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +Loop: movl 28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + shrdl %cl,%eax,%ebx + shrdl %cl,%edx,%eax + movl %ebx,(%edi) + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + shrdl %cl,%ebx,%edx + shrdl %cl,%eax,%ebx + movl %edx,8(%edi) + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + shrdl %cl,%edx,%eax + shrdl %cl,%ebx,%edx + movl %eax,16(%edi) + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + shrdl %cl,%eax,%ebx + shrdl %cl,%edx,%eax + movl %ebx,24(%edi) + movl %eax,28(%edi) + + addl $32,%esi + addl $32,%edi + decl %ebp + jnz Loop + +Lend: popl %ebp + andl $7,%ebp /* limbs left over after the unrolled loop */ + jz Lend2 +Loop2: movl (%esi),%eax + shrdl %cl,%eax,%edx /* compute result limb */ + movl %edx,(%edi) + movl %eax,%edx + addl $4,%esi + addl $4,%edi + decl %ebp + jnz Loop2 + +Lend2: shrl %cl,%edx /* compute most significant limb
*/ + movl %edx,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +/* Shift-by-1 special case: loop from the most significant end of the arrays, + shifting each limb with rcr. The entry test makes this overlap-safe. */ + +Lspecial: + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + + decl %ebp + pushl %ebp /* save size - 1 for the tail loop */ + shrl $3,%ebp + + shrl $1,%edx /* shift top limb; its low bit goes to carry */ + incl %ebp + decl %ebp /* inc/dec pair: test %ebp for zero, carry preserved */ + jz LLend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +LLoop: movl -28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + rcrl $1,%eax + movl %ebx,(%edi) + rcrl $1,%edx + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + rcrl $1,%ebx + movl %edx,-8(%edi) + rcrl $1,%eax + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + rcrl $1,%edx + movl %eax,-16(%edi) + rcrl $1,%ebx + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + rcrl $1,%eax + movl %ebx,-24(%edi) + rcrl $1,%edx + movl %eax,-28(%edi) + + leal -32(%esi),%esi /* use leal not to clobber carry */ + leal -32(%edi),%edi + decl %ebp + jnz LLoop + +LLend: popl %ebp + sbbl %eax,%eax /* save carry in %eax */ + andl $7,%ebp + jz LLend2 + addl %eax,%eax /* restore carry from eax */ +LLoop2: movl %edx,%ebx + movl (%esi),%edx + rcrl $1,%edx + movl %ebx,(%edi) + + leal -4(%esi),%esi /* use leal not to clobber carry */ + leal -4(%edi),%edi + decl %ebp + jnz LLoop2 + + jmp LL1 +LLend2: addl %eax,%eax /* restore carry from eax */ +LL1: movl %edx,(%edi) /* store last limb */ + + movl $0,%eax /* movl, not xorl, so the carry survives */ + rcrl $1,%eax /* shifted-out bit becomes bit 31 of the result */ + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/strchr.S b/sysdeps/i386/i586/strchr.S new file mode 100644 index 0000000..982c80e --- /dev/null +++ b/sysdeps/i386/i586/strchr.S @@ -0,0 +1,334 @@ +/* strchr -- find character CH in a
NUL terminated string. +Highly optimized version for ix86, x>=5. +Copyright (C) 1995 Free Software Foundation, Inc. +This file is part of the GNU C Library. +Contributed by Ulrich Drepper, . + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include + +/* This version is especially optimized for the i586 (and following?) + processors. This is mainly done by using the two pipelines. The + version optimized for i486 is weak in this aspect because to get + as much parallelism we have to execute some *more* instructions. + + The code below is structured to reflect the pairing of the instructions + as *I think* it is. I have no processor data book to verify this. + If you find something you think is incorrect let me know. */ + + +/* The magic value which is used throughout in the whole code. */ +#define magic 0xfefefeff + +/* + INPUT PARAMETERS: + str (sp + 4) + ch (sp + 8) +*/ + + .text +ENTRY (strchr) + pushl %edi /* Save callee-saved registers. */ + pushl %esi + + pushl %ebx + pushl %ebp + + movl 20(%esp), %eax /* get string pointer */ + movl 24(%esp), %edx /* get character we are looking for */ + + movl %eax, %edi /* duplicate string pointer for later */ + xorl %ecx, %ecx /* clear %ecx */ + + /* At the moment %edx contains C.
What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require a + prefix byte (and one more cycle). */ + movb %dl, %dh /* now it is 0|0|c|c */ + movb %dl, %cl /* we construct the lower half in %ecx */ + + shll $16, %edx /* now %edx is c|c|0|0 */ + movb %cl, %ch /* now %ecx is 0|0|c|c */ + + orl %ecx, %edx /* and finally c|c|c|c */ + andl $3, %edi /* mask alignment bits */ + + jz L11 /* alignment is 0 => start loop */ + + movb (%eax), %cl /* load single byte */ + cmpb %cl, %dl /* is byte == C? */ + + je L2 /* yes => return pointer */ + + cmp $0, %cl /* is byte NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + cmp $3, %edi /* was alignment == 3? */ + + je L11 /* yes => start loop */ + + movb (%eax), %cl /* load single byte */ + cmpb %cl, %dl /* is byte == C? */ + + je L2 /* yes => return pointer */ + + cmp $0, %cl /* is byte NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + cmp $2, %edi /* was alignment == 2? */ + + je L11 /* yes => start loop */ + + movb (%eax), %cl /* load single byte */ + cmpb %cl, %dl /* is byte == C? */ + + je L2 /* yes => return pointer */ + + cmp $0, %cl /* is byte NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + + /* The following code is the preparation for the loop. The + four instructions up to `L1' will not be executed in the loop + because the same code is found at the end of the loop, but + there it is executed in parallel with other instructions. */ +L11: movl (%eax), %ecx + movl $magic, %ebp + + movl $magic, %edi + addl %ecx, %ebp + + /* The main loop: it looks complex and indeed it is. I would + love to say `it was hard to write, so it should be hard to + read' but I will give some more hints. To fully understand + this code you should first take a look at the i486 version.
+ The basic algorithm is the same, but here the code is organized + in a way which permits to use both pipelines all the time. + + I tried to make it a bit more understandable by indenting + the code according to stage in the algorithm. It goes as + follows: + check for 0 in 1st word + check for C in 1st word + check for 0 in 2nd word + check for C in 2nd word + check for 0 in 3rd word + check for C in 3rd word + check for 0 in 4th word + check for C in 4th word + + Please note that doing the test for NUL before the test for + C allows us to overlap the test for 0 in the next word with + the test for C. */ + +L1: xorl %ecx, %ebp /* (word+magic)^word */ + addl %ecx, %edi /* add magic word */ + + leal 4(%eax), %eax /* increment pointer */ + jnc L4 /* previous addl caused overflow? */ + + movl %ecx, %ebx /* duplicate original word */ + orl $magic, %ebp /* ((word+magic)^word)|magic */ + + addl $1, %ebp /* ((word+magic)^word)|magic == 0xffffffff? */ + jne L4 /* no => we found word with NUL */ + + movl $magic, %esi /* load magic value */ + xorl %edx, %ebx /* clear bytes which are C */ + + movl (%eax), %ecx + addl %ebx, %esi /* (word+magic) */ + + movl $magic, %edi + jnc L5 /* previous addl caused overflow?
*/ + + movl %edi, %ebp + xorl %ebx, %esi /* (word+magic)^word */ + + addl %ecx, %ebp + orl $magic, %esi /* ((word+magic)^word)|magic */ + + addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/ + jne L5 /* no => we found word with C */ + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L4 + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L4 + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L5 + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + jne L5 + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L4 + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L4 + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L5 + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + jne L5 + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L4 + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L4 + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L5 + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + + je L1 + + /* We know there is no NUL byte but a C byte in the word. + %ebx contains NUL in this particular byte. */ +L5: subl $4, %eax /* adjust pointer */ + testb %bl, %bl /* first byte == C? */ + + jz L2 /* yes => return pointer */ + + incl %eax /* increment pointer */ + testb %bh, %bh /* second byte == C? */ + + jz L2 /* yes => return pointer */ + + shrl $16, %ebx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmp $0, %bl /* third byte == C? */ + je L2 /* yes => return pointer */ + + incl %eax /* increment pointer */ + +L2: popl %ebp /* restore saved registers */ + popl %ebx + + popl %esi + popl %edi + + ret + + /* We know there is a NUL byte in the word.
But we have to test + whether there is a C byte before it in the word. */ +L4: subl $4, %eax /* adjust pointer */ + cmpb %dl, %cl /* first byte == C? */ + + je L2 /* yes => return pointer */ + + cmpb $0, %cl /* first byte == NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + + cmpb %dl, %ch /* second byte == C? */ + je L2 /* yes => return pointer */ + + cmpb $0, %ch /* second byte == NUL? */ + je L3 /* yes => return NULL */ + + shrl $16, %ecx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmpb %dl, %cl /* third byte == C? */ + je L2 /* yes => return pointer */ + + cmpb $0, %cl /* third byte == NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + + /* The test for the fourth byte is necessary! */ + cmpb %dl, %ch /* fourth byte == C? */ + je L2 /* yes => return pointer */ + +L3: xorl %eax, %eax /* set return value = NULL */ + + popl %ebp /* restore saved registers */ + popl %ebx + + popl %esi + popl %edi + + ret + +#undef index +weak_alias (strchr, index) diff --git a/sysdeps/i386/i586/strlen.S b/sysdeps/i386/i586/strlen.S new file mode 100644 index 0000000..b807ed4 --- /dev/null +++ b/sysdeps/i386/i586/strlen.S @@ -0,0 +1,185 @@ +/* strlen -- Compute length of NUL terminated string. +Highly optimized version for ix86, x>=5. +Copyright (C) 1995 Free Software Foundation, Inc. +This file is part of the GNU C Library. +Contributed by Ulrich Drepper, . + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details.
+ +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include + +/* This version is especially optimized for the i586 (and following?) + processors. This is mainly done by using the two pipelines. The + version optimized for i486 is weak in this aspect because to get + as much parallelism we have to execute some *more* instructions. + + The code below is structured to reflect the pairing of the instructions + as *I think* it is. I have no processor data book to verify this. + If you find something you think is incorrect let me know. */ + + +/* The magic value which is used throughout in the whole code. */ +#define magic 0xfefefeff + +/* + INPUT PARAMETERS: + str (sp + 4) NUL terminated string; its length is returned in %eax +*/ + + .text +ENTRY(strlen) + movl 4(%esp), %eax /* get string pointer */ + + movl %eax, %ecx /* duplicate it */ + andl $3, %ecx /* mask alignment bits */ + + jz L11 /* aligned => start loop */ + + cmpb %ch, (%eax) /* is byte NUL? (%ch == 0 since %ecx <= 3) */ + je L2 /* yes => return */ + + incl %eax /* increment pointer */ + cmpl $3, %ecx /* was alignment = 3? */ + + je L11 /* yes => now it is aligned and start loop */ + + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + + incl %eax /* increment pointer */ + cmpl $2, %ecx /* was alignment = 2? */ + + je L11 /* yes => now it is aligned and start loop */ + + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + + incl %eax /* increment pointer */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop.
Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. */ +L11: xorl %edx, %edx /* We need %edx == 0 for later */ + +L1: + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^~word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L3 /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^~word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L3 /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow?
*/ + + xorl %ecx, %edx /* (word+magic)^~word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L3 /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^~word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + je L1 /* no => start loop again */ + + + /* Every exit from the loop arrives here with %ecx == word+magic; the + jnc exits leave before anything could undo the addl. So the word + is restored once, here, before its bytes are inspected. */ +L3: subl $4, %eax /* correct too early pointer increment */ + subl $magic, %ecx /* undo the addl to restore word */ + testb %cl, %cl /* lowest byte NUL? */ + + jz L2 /* yes => return */ + + inc %eax /* increment pointer */ + testb %ch, %ch /* second byte NUL? */ + + jz L2 /* yes => return */ + + shrl $16, %ecx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmpb $0, %cl /* is third byte NUL? */ + jz L2 /* yes => return */ + + incl %eax /* increment pointer */ + +L2: subl 4(%esp), %eax /* now compute the length as difference + between start and terminating NUL + character */ + + ret diff --git a/sysdeps/i386/i586/sub_n.S b/sysdeps/i386/i586/sub_n.S new file mode 100644 index 0000000..1382e66 --- /dev/null +++ b/sysdeps/i386/i586/sub_n.S @@ -0,0 +1,136 @@ +/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0 + and store difference in a third limb vector. + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version.
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) destination limb vector + s1_ptr (sp + 8) minuend limb vector + s2_ptr (sp + 12) subtrahend limb vector + size (sp + 16) number of 32-bit limbs in each vector + RETURN VALUE (%eax): borrow out of the most significant limb (0 or 1) */ + +#define r1 %eax +#define r2 %edx +#define src1 %esi +#define src2 %ebp +#define dst %edi +#define x %ebx + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_sub_n) +C_SYMBOL_NAME(__mpn_sub_n:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),dst /* res_ptr */ + movl 24(%esp),src1 /* s1_ptr */ + movl 28(%esp),src2 /* s2_ptr */ + movl 32(%esp),%ecx /* size */ + + movl (src2),x /* preload first subtrahend limb */ + + decl %ecx /* the last limb is handled separately at Lend2 */ + movl %ecx,r2 + shrl $3,%ecx /* number of 8-limb rounds */ + andl $7,r2 /* limbs left for the tail loop */ + testl %ecx,%ecx /* zero carry flag */ + jz Lend + pushl r2 /* r2 is reused as a temporary inside the loop */ + + ALIGN (3) +Loop: movl 28(dst),%eax /* fetch destination cache line */ + leal 32(dst),dst + +L1: movl (src1),r1 + movl 4(src1),r2 + sbbl x,r1 + movl 4(src2),x + sbbl x,r2 + movl 8(src2),x + movl r1,-32(dst) + movl r2,-28(dst) + +L2: movl 8(src1),r1 + movl 12(src1),r2 + sbbl x,r1 + movl 12(src2),x + sbbl x,r2 + movl 16(src2),x + movl r1,-24(dst) + movl r2,-20(dst) + +L3: movl 16(src1),r1 + movl 20(src1),r2 + sbbl x,r1 + movl 20(src2),x + sbbl x,r2 + movl 24(src2),x + movl r1,-16(dst) + movl r2,-12(dst) + +L4: movl 24(src1),r1 + movl 28(src1),r2 + sbbl x,r1 + movl 28(src2),x + sbbl x,r2 + movl 32(src2),x + movl r1,-8(dst) + movl r2,-4(dst) + + leal 32(src1),src1 /* leal and decl leave the borrow in carry intact */ + leal 32(src2),src2 + decl %ecx + jnz Loop + + popl r2 /* recover the tail limb count */ +Lend: + decl r2 /* test r2 w/o clobbering carry */ + js Lend2 + incl r2 +Loop2: + leal 4(dst),dst + movl (src1),r1 +
sbbl x,r1 + movl 4(src2),x + movl r1,-4(dst) + leal 4(src1),src1 + leal 4(src2),src2 + decl r2 + jnz Loop2 +Lend2: + movl (src1),r1 /* final limb: x was already loaded */ + sbbl x,r1 + movl r1,(dst) + + sbbl %eax,%eax /* %eax = -borrow */ + negl %eax /* return the borrow as 0 or 1 */ + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/submul_1.S b/sysdeps/i386/i586/submul_1.S new file mode 100644 index 0000000..14bfe54 --- /dev/null +++ b/sysdeps/i386/i586/submul_1.S @@ -0,0 +1,82 @@ +/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract + the result from a second limb vector. + +Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) limb vector the product is subtracted from, in place + s1_ptr (sp + 8) source limb vector + size (sp + 12) number of 32-bit limbs + s2_limb (sp + 16) the single limb every source limb is multiplied by +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebp + + TEXT + ALIGN (3) + GLOBL C_SYMBOL_NAME(__mpn_submul_1) +C_SYMBOL_NAME(__mpn_submul_1:) + + INSN1(push,l ,R(edi)) + INSN1(push,l ,R(esi)) + INSN1(push,l ,R(ebx)) + INSN1(push,l ,R(ebp)) + + INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) + INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) + INSN2(mov,l ,R(size),MEM_DISP(esp,28)) + INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) + + INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) /* point past the end of both vectors ... */ + INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) + INSN1(neg,l ,R(size)) /* ... and index them with -size .. -1 */ + INSN2(xor,l ,R(edx),R(edx)) /* incoming carry limb = 0 */ + ALIGN (3) +Loop: + INSN2(mov,l ,R(ebx),R(edx)) /* ebx = carry limb from the previous round */ + INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) + + INSN1(mul,l ,R(s2_limb)) /* edx:eax = source limb * s2_limb */ + + INSN2(add,l ,R(eax),R(ebx)) /* add previous carry to the low half */ + INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,size,4)) /* mov leaves the carry flag alone */ + + INSN2(adc,l ,R(edx),$0) /* carry of the add goes into the high half */ + INSN2(sub,l ,R(ebx),R(eax)) /* result limb - low half */ + + INSN2(adc,l ,R(edx),$0) /* borrow of the sub goes into the high half */ + INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(ebx)) + + INSN1(inc,l ,R(size)) + INSN1(jnz, ,Loop) /* until the negative index reaches 0 */ + + + INSN2(mov,l ,R(eax),R(edx)) /* return the final carry limb */ + INSN1(pop,l ,R(ebp)) + INSN1(pop,l ,R(ebx)) + INSN1(pop,l ,R(esi)) + INSN1(pop,l ,R(edi)) + ret diff --git a/sysdeps/i386/memchr.S b/sysdeps/i386/memchr.S new file mode 100644 index 0000000..9931f97 --- /dev/null +++ b/sysdeps/i386/memchr.S @@ -0,0 +1,315 @@ +/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less + than N. +For Intel 80x86, x>=3. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper +Optimised a little by Alan Modra +This file is part of the GNU C Library.
+ +This version is developed using the same algorithm as the fast C +version which carries the following introduction: + +Based on strlen implemention by Torbjorn Granlund (tege@sics.se), +with help from Dan Sahlin (dan@sics.se) and +commentary by Jim Blandy (jimb@ai.mit.edu); +adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu), +and implemented by Roland McGrath (roland@ai.mit.edu). + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + str (sp + 4) + c (sp + 8) + len (sp + 12) +*/ + + .text +ENTRY (memchr) + /* Save callee-safe registers used in this function. */ + pushl %esi + pushl %edi + + /* Load parameters into registers. */ + movl 12(%esp), %eax /* str: pointer to memory block. */ + movl 16(%esp), %edx /* c: byte we are looking for. */ + movl 20(%esp), %esi /* len: length of memory block. */ + + /* If my must not test more than three characters test + them one by one. This is especially true for 0. */ + cmpl $4, %esi + jb L3 + + /* At the moment %edx contains C. What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). 
*/ + movb %dl, %dh /* Now it is 0|0|c|c */ + movl %edx, %ecx + shll $16, %edx /* Now c|c|0|0 */ + movw %cx, %dx /* And finally c|c|c|c */ + + /* Better performance can be achieved if the word (32 + bit) memory access is aligned on a four-byte-boundary. + So process first bytes one by one until boundary is + reached. Don't use a loop for better performance. */ + + testb $3, %eax /* correctly aligned ? */ + je L2 /* yes => begin loop */ + cmpb %dl, (%eax) /* compare byte */ + je L9 /* target found => return */ + incl %eax /* increment source pointer */ + decl %esi /* decrement length counter */ + je L4 /* len==0 => return NULL */ + + testb $3, %eax /* correctly aligned ? */ + je L2 /* yes => begin loop */ + cmpb %dl, (%eax) /* compare byte */ + je L9 /* target found => return */ + incl %eax /* increment source pointer */ + decl %esi /* decrement length counter */ + je L4 /* len==0 => return NULL */ + + testb $3, %eax /* correctly aligned ? */ + je L2 /* yes => begin loop */ + cmpb %dl, (%eax) /* compare byte */ + je L9 /* target found => return */ + incl %eax /* increment source pointer */ + decl %esi /* decrement length counter */ + /* no test for len==0 here, because this is done in the + loop head */ + jmp L2 + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. 
Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + + /* Each round the main loop processes 16 bytes. */ + + ALIGN (4) + +L1: movl (%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + + /* According to the algorithm we had to reverse the effect of the + XOR first and then test the overflow bits. But because the + following XOR would destroy the carry flag and it would (in a + representation with more than 32 bits) not alter then last + overflow, we can now test this condition. If no carry is signaled + no overflow must have occured in the last byte => it was 0. */ + jnc L8 + + /* We are only interested in carry bits that change due to the + previous add, so remove original bits */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + + /* Now test for the other three overflow bits. */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + /* If at least one byte of the word is C we don't get 0 in %edi. */ + jnz L8 /* found it => return pointer */ + + /* This process is unfolded four times for better performance. + we don't increment the source pointer each time. Instead we + use offsets and increment by 16 in each run of the loop. But + before probing for the matching byte we need some extra code + (following LL(13) below). Even the len can be compared with + constants instead of decrementing each time. 
*/ + + movl 4(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L7 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L7 /* found it => return pointer */ + + movl 8(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L6 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L6 /* found it => return pointer */ + + movl 12(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L5 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L5 /* found it => return pointer */ + + /* Adjust both counters for a full round, i.e. 16 bytes. 
*/ + addl $16, %eax +L2: subl $16, %esi + jae L1 /* Still more than 16 bytes remaining */ + + /* Process remaining bytes separately. */ + cmpl $4-16, %esi /* rest < 4 bytes? */ + jb L3 /* yes, than test byte by byte */ + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L8 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jne L8 /* found it => return pointer */ + addl $4, %eax /* adjust source pointer */ + + cmpl $8-16, %esi /* rest < 8 bytes? */ + jb L3 /* yes, than test byte by byte */ + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L8 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jne L8 /* found it => return pointer */ + addl $4, %eax /* adjust source pointer */ + + cmpl $12-16, %esi /* rest < 12 bytes? */ + jb L3 /* yes, than test byte by byte */ + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. 
We get + carry bits reported for each byte which + is *not* 0 */ + jnc L8 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jne L8 /* found it => return pointer */ + addl $4, %eax /* adjust source pointer */ + + /* Check the remaining bytes one by one. */ +L3: andl $3, %esi /* mask out uninteresting bytes */ + jz L4 /* no remaining bytes => return NULL */ + + cmpb %dl, (%eax) /* compare byte with C */ + je L9 /* equal, than return pointer */ + incl %eax /* increment source pointer */ + decl %esi /* decrement length */ + jz L4 /* no remaining bytes => return NULL */ + + cmpb %dl, (%eax) /* compare byte with C */ + je L9 /* equal, than return pointer */ + incl %eax /* increment source pointer */ + decl %esi /* decrement length */ + jz L4 /* no remaining bytes => return NULL */ + + cmpb %dl, (%eax) /* compare byte with C */ + je L9 /* equal, than return pointer */ + +L4: /* no byte found => return NULL */ + xorl %eax, %eax + jmp L9 + + /* add missing source pointer increments */ +L5: addl $4, %eax +L6: addl $4, %eax +L7: addl $4, %eax + + /* Test for the matching byte in the word. %ecx contains a NUL + char in the byte which originally was the byte we are looking + at. */ +L8: testb %cl, %cl /* test first byte in dword */ + jz L9 /* if zero => return pointer */ + incl %eax /* increment source pointer */ + + testb %ch, %ch /* test second byte in dword */ + jz L9 /* if zero => return pointer */ + incl %eax /* increment source pointer */ + + testl $0xff0000, %ecx /* test third byte in dword */ + jz L9 /* if zero => return pointer */ + incl %eax /* increment source pointer */ + + /* No further test needed we we known it is one of the four byytes. 
*/ + +L9: popl %edi /* pop saved registers */ + popl %esi + + ret diff --git a/sysdeps/i386/memchr.c b/sysdeps/i386/memchr.c deleted file mode 100644 index ff0f8d9..0000000 --- a/sysdeps/i386/memchr.c +++ /dev/null @@ -1,48 +0,0 @@ -/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less - than N. - For Intel 80x86, x>=3. - Copyright (C) 1991, 1992, 1993 Free Software Foundation, Inc. - Contributed by Torbjorn Granlund (tege@sics.se). - -The GNU C Library is free software; you can redistribute it and/or -modify it under the terms of the GNU Library General Public License as -published by the Free Software Foundation; either version 2 of the -License, or (at your option) any later version. - -The GNU C Library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Library General Public License for more details. - -You should have received a copy of the GNU Library General Public -License along with the GNU C Library; see the file COPYING.LIB. If -not, write to the Free Software Foundation, Inc., 675 Mass Ave, -Cambridge, MA 02139, USA. */ - -#include -#include - -#ifdef __GNUC__ - -PTR -DEFUN(memchr, (str, c, len), - CONST PTR str AND int c AND size_t len) -{ - PTR retval; - asm("cld\n" /* Search forward. */ - "testl %1,%1\n" /* Clear Z flag, to handle LEN == 0. */ - /* Some old versions of gas need `repne' instead of `repnz'. */ - "repnz\n" /* Search for C in al. */ - "scasb\n" - "movl %2,%0\n" /* Set %0 to 0 (without affecting Z flag). */ - "jnz done\n" /* Jump if we found nothing equal to C. */ - "leal -1(%1),%0\n" /* edi has been incremented. Return edi-1. 
*/
-       "done:" :
-       "=a" (retval), "=D" (str), "=c" (len) :
-       "0" (c), "1" (str), "2" (len));
-  return retval;
-}
-
-#else
-#include
-#endif
diff --git a/sysdeps/i386/memcmp.S b/sysdeps/i386/memcmp.S
new file mode 100644
index 0000000..f16b44a
--- /dev/null
+++ b/sysdeps/i386/memcmp.S
@@ -0,0 +1,68 @@
+/* memcmp -- compare two memory blocks for differences in the first COUNT
+   bytes.
+Copyright (C) 1995 Free Software Foundation, Inc.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include
+#include "asm-syntax.h"
+
+/* Arguments are passed on the stack, the result is returned in %eax.
+   INPUT PARAMETERS (offsets relative to %esp on entry):
+   block1       (sp + 4)
+   block2       (sp + 8)
+   len          (sp + 12)
+   RETURNS 0, 1 or -1.  Uses %ecx and %edx; %esi and %edi are preserved.  */
+
+        .text
+ENTRY (memcmp)
+        pushl %esi              /* Save callee-safe register.  The arguments are now 4 bytes further up.  */
+        movl %edi, %edx         /* Note that %edx is not used and can
+                                   so be used to save %edi.  It's faster.  */
+
+        movl 8(%esp), %esi      /* Load address of block #1 (4 + 4 for the saved %esi).  */
+        movl 12(%esp), %edi     /* Load address of block #2.  */
+        movl 16(%esp), %ecx     /* Load maximal length of compare area.  */
+
+        cld                     /* Set direction of comparison.  */
+
+        xorl %eax, %eax         /* Default result.  This also sets ZF, so a zero length returns 0.  */
+
+        repe                    /* Compare at most %ecx bytes.  */
+        cmpsb
+        jz L1                   /* If even last byte was equal we return 0.  */
+
+        /* The memory blocks are not equal.  So result of the last
+           subtraction is present in the carry flag.  It is set when
+           the byte in block #2 is bigger.  In this case we have to
+           return -1 (=0xffffffff), else 1.  */
+        sbbl %eax, %eax         /* This is tricky.  %eax - %eax - carry gives 0
+                                   or -1 depending on last subtraction.  */
+
+        /* At this point %eax == 0, if the byte of block #1 was bigger, and
+           0xffffffff if the last byte of block #2 was bigger.  The latter
+           case is already correct but the former needs a little adjustment.
+           Note that the following operation does not change 0xffffffff.  */
+        orb $1, %al             /* Change 0 to 1.  */
+
+L1:     popl %esi               /* Restore registers.  */
+        movl %edx, %edi
+
+        ret
+
+#undef bcmp
+weak_alias (memcmp, bcmp)
diff --git a/sysdeps/i386/stpcpy.S b/sysdeps/i386/stpcpy.S
new file mode 100644
index 0000000..f38a908
--- /dev/null
+++ b/sysdeps/i386/stpcpy.S
@@ -0,0 +1,87 @@
+/* stpcpy -- copy SRC to DEST returning the address of the terminating '\0'
+   in DEST.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper (drepper@gnu.ai.mit.edu).
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+/* This function is defined neither in ANSI nor POSIX standards but is
+   also not invented here.
*/
+
+#include
+#include "asm-syntax.h"
+
+/* Arguments are passed on the stack, the result is returned in %eax.
+   INPUT PARAMETERS:
+   dest         (sp + 4)
+   src          (sp + 8)
+   RETURNS the address of the NUL byte stored in DEST.  Uses %ecx, %edx.  */
+
+        .text
+ENTRY (__stpcpy)
+        movl 4(%esp), %eax      /* load destination pointer */
+        movl 8(%esp), %ecx      /* load source pointer */
+
+        subl %eax, %ecx         /* %ecx = src - dest, so (%eax,%ecx) addresses the
+                                   source byte belonging to destination (%eax) */
+
+        /* Here we would like to write
+
+        subl    $4, %eax
+        ALIGN (4)
+
+           but the assembler is too smart and optimizes for the shortest
+           form where the number only needs one byte.  But if we could
+           have the long form we would not need the alignment.  */
+
+        .byte 0x81, 0xe8        /* This is `subl $0x00000004, %eax' */
+        .long 0x00000004
+
+        /* Four times unfolded loop with only one loop counter.  This
+           is achieved by the use of index+base addressing mode.  As the
+           loop counter we use the destination address because this is
+           also the result.  */
+L1:     addl $4, %eax           /* increment loop counter (undoes the subl before the first round) */
+
+        movb (%eax,%ecx), %dl   /* load current char */
+        movb %dl, (%eax)        /* and store it */
+        testb %dl, %dl          /* was it NUL? */
+        jz L2                   /* yes, then exit */
+
+        movb 1(%eax,%ecx), %dl  /* load current char */
+        movb %dl, 1(%eax)       /* and store it */
+        testb %dl, %dl          /* was it NUL? */
+        jz L3                   /* yes, then exit */
+
+        movb 2(%eax,%ecx), %dl  /* load current char */
+        movb %dl, 2(%eax)       /* and store it */
+        testb %dl, %dl          /* was it NUL? */
+        jz L4                   /* yes, then exit */
+
+        movb 3(%eax,%ecx), %dl  /* load current char */
+        movb %dl, 3(%eax)       /* and store it */
+        testb %dl, %dl          /* was it NUL? */
+        jnz L1                  /* no, then continue loop */
+
+        incl %eax               /* fall through: NUL was the 4th byte of the round */
+L4:     incl %eax               /* entry when NUL was the 3rd byte */
+L3:     incl %eax               /* entry when NUL was the 2nd byte */
+L2:
+        ret
+
+weak_alias (__stpcpy, stpcpy)
diff --git a/sysdeps/i386/stpncpy.S b/sysdeps/i386/stpncpy.S
new file mode 100644
index 0000000..59192e6
--- /dev/null
+++ b/sysdeps/i386/stpncpy.S
@@ -0,0 +1,143 @@
+/* stpncpy -- copy no more than N bytes from SRC to DEST, returning the
+   address of the terminating '\0' in DEST.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper
+Some bug fixes by Alan Modra
+  - original wrote n+1 chars in some cases.
+  - stpncpy() ought to behave like strncpy() ie. not null-terminate
+    if limited by n.  glibc-1.09 stpncpy() does this.
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include
+#include "asm-syntax.h"
+
+/* Arguments are passed on the stack, the result is returned in %eax.
+   INPUT PARAMETERS:
+   dest         (sp + 4)
+   src          (sp + 8)
+   maxlen       (sp + 12)
+   RETURNS the address of the first NUL stored in DEST, or DEST + MAXLEN if none was stored.  */
+
+        .text
+ENTRY (__stpncpy)
+
+        pushl %esi              /* save callee-saved register; arguments are now 4 bytes further up */
+
+        movl 8(%esp), %eax      /* load destination pointer */
+        movl 12(%esp), %esi     /* load source pointer */
+        movl 16(%esp), %ecx     /* load maximal length */
+
+        subl %eax, %esi         /* %esi = src - dest, so (%eax,%esi) addresses the
+                                   source byte belonging to destination (%eax) */
+        jmp L1                  /* jump to loop "head" */
+
+        ALIGN(4)
+
+        /* Four times unfolded loop with two loop counters.  We get
+           the third value (the source address) by using the index+base
+           addressing mode.  */
+L2:     movb (%eax,%esi), %dl   /* load current char */
+        movb %dl, (%eax)        /* and store it */
+        testb %dl, %dl          /* was it NUL? */
+        jz L7                   /* yes, then exit */
+
+        movb 1(%eax,%esi), %dl  /* load current char */
+        movb %dl, 1(%eax)       /* and store it */
+        testb %dl, %dl          /* was it NUL? */
+        jz L6                   /* yes, then exit */
+
+        movb 2(%eax,%esi), %dl  /* load current char */
+        movb %dl, 2(%eax)       /* and store it */
+        testb %dl, %dl          /* was it NUL? */
+        jz L5                   /* yes, then exit */
+
+        movb 3(%eax,%esi), %dl  /* load current char */
+        movb %dl, 3(%eax)       /* and store it */
+        testb %dl, %dl          /* was it NUL? */
+        jz L4                   /* yes, then exit */
+
+        addl $4, %eax           /* increment loop counter for full round */
+
+L1:     subl $4, %ecx           /* at least 4 bytes still allowed? */
+        jae L2                  /* yes, then go to start of loop */
+
+        /* The at most 3 remaining bytes are not processed in a loop.  */
+
+        addl $4, %ecx           /* correct above subtraction */
+        jz L9                   /* maximal allowed char reached => go to end */
+
+        movb (%eax,%esi), %dl   /* load current char */
+        movb %dl, (%eax)        /* and store it */
+        testb %dl, %dl          /* was it NUL? */
+        jz L3                   /* yes, then exit */
+
+        incl %eax               /* increment pointer */
+        decl %ecx               /* decrement length counter */
+        jz L9                   /* no more allowed => exit */
+
+        movb (%eax,%esi), %dl   /* load current char */
+        movb %dl, (%eax)        /* and store it */
+        testb %dl, %dl          /* was it NUL? */
+        jz L3                   /* yes, then exit */
+
+        incl %eax               /* increment pointer */
+        decl %ecx               /* decrement length counter */
+        jz L9                   /* no more allowed => exit */
+
+        movb (%eax,%esi), %dl   /* load current char */
+        movb %dl, (%eax)        /* and store it */
+        testb %dl, %dl          /* was it NUL? */
+        jz L3                   /* yes, then exit */
+
+        incl %eax               /* increment pointer */
+        jmp L9                  /* we don't have to test for counter underflow
+                                   because we know we had at most 3 bytes
+                                   remaining => exit */
+
+        /* When coming from the main loop we have to adjust the pointer.  */
+L4:     decl %ecx               /* decrement counter */
+        incl %eax               /* increment pointer */
+
+L5:     decl %ecx               /* decrement counter */
+        incl %eax               /* increment pointer */
+
+L6:     decl %ecx               /* decrement counter */
+        incl %eax               /* increment pointer */
+L7:
+
+        addl $3, %ecx           /* correct pre-decrementation of counter
+                                   at the beginning of the loop; but why 3
+                                   and not 4?  Very simple, we have to count
+                                   the NUL char we already wrote.  */
+        jz L9                   /* counter is also 0 => exit */
+
+        /* We now have to fill the rest of the buffer with NUL.  This
+           is done in a tricky way.  Please note that the addressing mode
+           used below is not the same we used above.  Here we use the
+           %ecx register.  */
+L8:
+        movb $0, (%ecx,%eax)    /* store NUL char, last remaining byte first */
+L3:     decl %ecx               /* all bytes written? */
+        jnz L8                  /* no, then again */
+
+L9:     popl %esi               /* restore saved register content */
+
+        ret
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/i386/strchr.S b/sysdeps/i386/strchr.S
new file mode 100644
index 0000000..de947cd
--- /dev/null
+++ b/sysdeps/i386/strchr.S
@@ -0,0 +1,278 @@
+/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper
+Some optimisations by Alan Modra
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include
+#include "asm-syntax.h"
+
+/* Arguments are passed on the stack, the result is returned in %eax.
+   INPUT PARAMETERS:
+   str          (sp + 4)
+   ch           (sp + 8)
+   RETURNS a pointer to the first CH in STR, or NULL if a NUL comes first.  */
+
+        .text
+ENTRY (strchr)
+        pushl %edi              /* Save callee-safe registers used here.
*/
+
+        movl 8(%esp), %eax      /* get string pointer */
+        movl 12(%esp), %edx     /* get character we are looking for */
+
+        /* At the moment %edx contains C.  What we need for the
+           algorithm is C in all bytes of the dword.  Avoid
+           operations on 16 bit words because these require a
+           prefix byte (and one more cycle).  */
+        movb %dl, %dh           /* now it is 0|0|c|c */
+        movl %edx, %ecx
+        shll $16, %edx          /* now it is c|c|0|0 */
+        movw %cx, %dx           /* and finally c|c|c|c */
+
+        /* Before we start with the main loop we process single bytes
+           until the source pointer is aligned.  This has two reasons:
+           1. aligned 32-bit memory access is faster
+           and (more important)
+           2. we process in the main loop 32 bit in one step although
+              we don't know the end of the string.  But accessing at
+              4-byte alignment guarantees that we never access illegal
+              memory if this would not also be done by the trivial
+              implementation (this is because all processor inherent
+              boundaries are multiples of 4).  */
+
+        testb $3, %al           /* correctly aligned ? */
+        jz L11                  /* yes => begin loop */
+        movb (%eax), %cl        /* load byte in question (we need it twice) */
+        cmpb %cl, %dl           /* compare byte */
+        je L6                   /* target found => return */
+        testb %cl, %cl          /* is NUL? */
+        jz L2                   /* yes => return NULL */
+        incl %eax               /* increment pointer */
+
+        testb $3, %al           /* correctly aligned ? */
+        jz L11                  /* yes => begin loop */
+        movb (%eax), %cl        /* load byte in question (we need it twice) */
+        cmpb %cl, %dl           /* compare byte */
+        je L6                   /* target found => return */
+        testb %cl, %cl          /* is NUL? */
+        jz L2                   /* yes => return NULL */
+        incl %eax               /* increment pointer */
+
+        testb $3, %al           /* correctly aligned ? */
+        jz L11                  /* yes => begin loop */
+        movb (%eax), %cl        /* load byte in question (we need it twice) */
+        cmpb %cl, %dl           /* compare byte */
+        je L6                   /* target found => return */
+        testb %cl, %cl          /* is NUL? */
+        jz L2                   /* yes => return NULL */
+        incl %eax               /* increment pointer */
+
+        /* Now we have reached alignment.  */
+        jmp L11                 /* begin loop */
+
+        /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+           change any of the hole bits of LONGWORD.
+
+           1) Is this safe?  Will it catch all the zero bytes?
+           Suppose there is a byte with all zeros.  Any carry bits
+           propagating from its left will fall into the hole at its
+           least significant bit and stop.  Since there will be no
+           carry from its most significant bit, the LSB of the
+           byte to the left will be unchanged, and the zero will be
+           detected.
+
+           2) Is this worthwhile?  Will it ignore everything except
+           zero bytes?  Suppose every byte of LONGWORD has a bit set
+           somewhere.  There will be a carry into bit 8.  If bit 8
+           is set, this will carry into bit 16.  If bit 8 is clear,
+           one of bits 9-15 must be set, so there will be a carry
+           into bit 16.  Similarly, there will be a carry into bit
+           24.  If one of bits 24-31 is set, there will be a carry
+           into bit 32 (=carry flag), so all of the hole bits will
+           be changed.
+
+           3) But wait!  Aren't we looking for C, not zero?
+           Good point.  So what we do is XOR LONGWORD with a longword,
+           each of whose bytes is C.  This turns each byte that is C
+           into a zero.  */
+
+        /* Each round the main loop processes 16 bytes.  */
+
+        ALIGN(4)
+
+L1:     addl $16, %eax          /* adjust pointer for whole round */
+
+L11:    movl (%eax), %ecx       /* get word (= 4 bytes) in question */
+        xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                   are now 0 */
+        movl $0xfefefeff, %edi  /* magic value */
+        addl %ecx, %edi         /* add the magic value to the word.  We get
+                                   carry bits reported for each byte which
+                                   is *not* C */
+
+        /* According to the algorithm we had to reverse the effect of the
+           XOR first and then test the overflow bits.  But because the
+           following XOR would destroy the carry flag and it would (in a
+           representation with more than 32 bits) not alter the last
+           overflow, we can now test this condition.  If no carry is signaled
+           the carry chain was broken => one byte of the word was 0.  */
+        jnc L7                  /* no carry out of bit 31 => word contains C */
+
+        /* We are only interested in carry bits that change due to the
+           previous add, so remove original bits */
+        xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+
+        /* Now test for the other three overflow bits.  */
+        orl $0xfefefeff, %edi   /* set all non-carry bits */
+        incl %edi               /* add 1: if one carry bit was *not* set
+                                   the addition will not result in 0.  */
+
+        /* If at least one byte of the word is C we don't get 0 in %edi.  */
+        jnz L7                  /* found it => return pointer */
+
+        /* Now we made sure the dword does not contain the character we are
+           looking for.  But because we deal with strings we have to check
+           for the end of string before testing the next dword.  */
+
+        xorl %edx, %ecx         /* restore original dword without reload */
+        movl $0xfefefeff, %edi  /* magic value */
+        addl %ecx, %edi         /* add the magic value to the word.  We get
+                                   carry bits reported for each byte which
+                                   is *not* 0 */
+        jnc L2                  /* no carry out => word contains NUL => return NULL */
+        xorl %ecx, %edi         /* (word+magic)^word */
+        orl $0xfefefeff, %edi   /* set all non-carry bits */
+        incl %edi               /* add 1: if one carry bit was *not* set
+                                   the addition will not result in 0.  */
+        jnz L2                  /* found NUL => return NULL */
+
+        movl 4(%eax), %ecx      /* get word (= 4 bytes) in question */
+        xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                   are now 0 */
+        movl $0xfefefeff, %edi  /* magic value */
+        addl %ecx, %edi         /* add the magic value to the word.  We get
+                                   carry bits reported for each byte which
+                                   is *not* C */
+        jnc L71                 /* no carry out => word contains C => return pointer */
+        xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+        orl $0xfefefeff, %edi   /* set all non-carry bits */
+        incl %edi               /* add 1: if one carry bit was *not* set
+                                   the addition will not result in 0.  */
+        jnz L71                 /* found it => return pointer */
+        xorl %edx, %ecx         /* restore original dword without reload */
+        movl $0xfefefeff, %edi  /* magic value */
+        addl %ecx, %edi         /* add the magic value to the word.  We get
+                                   carry bits reported for each byte which
+                                   is *not* 0 */
+        jnc L2                  /* no carry out => word contains NUL => return NULL */
+        xorl %ecx, %edi         /* (word+magic)^word */
+        orl $0xfefefeff, %edi   /* set all non-carry bits */
+        incl %edi               /* add 1: if one carry bit was *not* set
+                                   the addition will not result in 0.  */
+        jnz L2                  /* found NUL => return NULL */
+
+        movl 8(%eax), %ecx      /* get word (= 4 bytes) in question */
+        xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                   are now 0 */
+        movl $0xfefefeff, %edi  /* magic value */
+        addl %ecx, %edi         /* add the magic value to the word.  We get
+                                   carry bits reported for each byte which
+                                   is *not* C */
+        jnc L72                 /* no carry out => word contains C => return pointer */
+        xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+        orl $0xfefefeff, %edi   /* set all non-carry bits */
+        incl %edi               /* add 1: if one carry bit was *not* set
+                                   the addition will not result in 0.  */
+        jnz L72                 /* found it => return pointer */
+        xorl %edx, %ecx         /* restore original dword without reload */
+        movl $0xfefefeff, %edi  /* magic value */
+        addl %ecx, %edi         /* add the magic value to the word.  We get
+                                   carry bits reported for each byte which
+                                   is *not* 0 */
+        jnc L2                  /* no carry out => word contains NUL => return NULL */
+        xorl %ecx, %edi         /* (word+magic)^word */
+        orl $0xfefefeff, %edi   /* set all non-carry bits */
+        incl %edi               /* add 1: if one carry bit was *not* set
+                                   the addition will not result in 0.  */
+        jnz L2                  /* found NUL => return NULL */
+
+        movl 12(%eax), %ecx     /* get word (= 4 bytes) in question */
+        xorl %edx, %ecx         /* XOR with word c|c|c|c => bytes of str == c
+                                   are now 0 */
+        movl $0xfefefeff, %edi  /* magic value */
+        addl %ecx, %edi         /* add the magic value to the word.  We get
+                                   carry bits reported for each byte which
+                                   is *not* C */
+        jnc L73                 /* no carry out => word contains C => return pointer */
+        xorl %ecx, %edi         /* ((word^charmask)+magic)^(word^charmask) */
+        orl $0xfefefeff, %edi   /* set all non-carry bits */
+        incl %edi               /* add 1: if one carry bit was *not* set
+                                   the addition will not result in 0.  */
+        jnz L73                 /* found it => return pointer */
+        xorl %edx, %ecx         /* restore original dword without reload */
+        movl $0xfefefeff, %edi  /* magic value */
+        addl %ecx, %edi         /* add the magic value to the word.  We get
+                                   carry bits reported for each byte which
+                                   is *not* 0 */
+        jnc L2                  /* no carry out => word contains NUL => return NULL */
+        xorl %ecx, %edi         /* (word+magic)^word */
+        orl $0xfefefeff, %edi   /* set all non-carry bits */
+        incl %edi               /* add 1: if one carry bit was *not* set
+                                   the addition will not result in 0.  */
+        jz L1                   /* no NUL found => restart loop */
+
+L2:     /* Return NULL.  */
+        xorl %eax, %eax         /* load NULL in return value register */
+        popl %edi               /* restore saved register content */
+        ret
+
+L73:    addl $4, %eax           /* adjust pointer: match is in the 4th dword of the round */
+L72:    addl $4, %eax           /* ... in the 3rd dword */
+L71:    addl $4, %eax           /* ... in the 2nd dword */
+
+        /* We now scan for the byte in which the character was matched.
+           But we have to take care of the case that a NUL char is
+           found before this in the dword.  %ecx holds dword ^ c|c|c|c.  */
+
+L7:     testb %cl, %cl          /* is first byte C? */
+        jz L6                   /* yes => return pointer */
+        cmpb %dl, %cl           /* is first byte NUL? */
+        je L2                   /* yes => return NULL */
+        incl %eax               /* it's not in the first byte */
+
+        testb %ch, %ch          /* is second byte C? */
+        jz L6                   /* yes => return pointer */
+        cmpb %dl, %ch           /* is second byte NUL? */
+        je L2                   /* yes => return NULL */
+        incl %eax               /* it's not in the second byte */
+
+        shrl $16, %ecx          /* make upper byte accessible */
+        testb %cl, %cl          /* is third byte C? */
+        jz L6                   /* yes => return pointer */
+        cmpb %dl, %cl           /* is third byte NUL? */
+        je L2                   /* yes => return NULL */
+
+        /* It must be in the fourth byte and it cannot be NUL.  */
+        incl %eax
+
+L6:     popl %edi               /* restore saved register content */
+
+        ret
+
+weak_alias (strchr, index)
diff --git a/sysdeps/i386/strcspn.S b/sysdeps/i386/strcspn.S
new file mode 100644
index 0000000..b0e789b
--- /dev/null
+++ b/sysdeps/i386/strcspn.S
@@ -0,0 +1,176 @@
+/* strcspn (str, ss) -- Return the length of the initial segment of STR
+   which contains no characters from SS.
+For Intel 80x86, x>=3.
+Copyright (C) 1994, 1995 Free Software Foundation, Inc.
+Contributed by Ulrich Drepper
+Bug fixes by Alan Modra
+This file is part of the GNU C Library.
+
+The GNU C Library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+The GNU C Library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with the GNU C Library; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+#include
+#include "asm-syntax.h"
+
+/*
+   INPUT PARAMETERS:
+   str          (sp + 4)
+   stopset      (sp + 8)
+*/
+
+        .text
+ENTRY (strcspn)
+        movl 4(%esp), %edx      /* get string pointer */
+        movl 8(%esp), %eax      /* get stopset pointer */
+
+        /* First we create a table with flags for all possible characters.
+           For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+           supported by the C string functions we have 256 characters.
+           Before inserting marks for the stop characters we clear the whole
+           table.  The unrolled form is much faster than a loop.  */
+        xorl %ecx, %ecx         /* %ecx = 0 !!!
*/ + + pushl %ecx /* make a 256 bytes long block filled with 0 */ + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl $0 /* These immediate values make the label 2 */ + pushl $0 /* to be aligned on a 16 byte boundary to */ + pushl $0 /* get a better performance of the loop. */ + pushl $0 + pushl $0 + pushl $0 + +/* For understanding the following code remember that %ecx == 0 now. + Although all the following instruction only modify %cl we always + have a correct zero-extended 32-bit value in %ecx. */ + +/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want + longer instructions so that the next loop aligns without adding nops. */ + +L2: movb (%eax), %cl /* get byte from stopset */ + testb %cl, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 1(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 2(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? 
*/ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 3(%eax), %cl /* get byte from stopset */ + addl $4, %eax /* increment stopset pointer */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + testb $0xff, %cl /* is NUL char? */ + jnz L2 /* no => process next dword from stopset */ + +L1: leal -4(%edx), %eax /* prepare loop */ + + /* We use a neat trick for the following loop. Normally we would + have to test for two termination conditions + 1. a character in the stopset was found + and + 2. the end of the string was found + But as a sign that the chracter is in the stopset we store its + value in the table. But the value of NUL is NUL so the loop + terminates for NUL in every case. */ + +L3: addl $4, %eax /* adjust pointer for full loop round */ + + movb (%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L4 /* yes => return */ + + movb 1(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L5 /* yes => return */ + + movb 2(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L6 /* yes => return */ + + movb 3(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + jne L3 /* yes => return */ + + incl %eax /* adjust pointer */ +L6: incl %eax +L5: incl %eax + +L4: subl %edx, %eax /* we have to return the number of valid + characters, so compute distance to first + non-valid character */ + addl $256, %esp /* remove stopset */ + + ret diff --git a/sysdeps/i386/strpbrk.S b/sysdeps/i386/strpbrk.S new file mode 100644 index 0000000..245bf1a --- /dev/null +++ b/sysdeps/i386/strpbrk.S @@ -0,0 +1,177 @@ +/* strcspn (str, ss) -- Return the length of the initial segement of STR + which contains no characters from SS. +For Intel 80x86, x>=3. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. 
+Contributed by Ulrich Drepper +Bug fixes by Alan Modra +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + str (sp + 4) + stopset (sp + 8) +*/ + + .text +ENTRY (strpbrk) + movl 4(%esp), %edx /* get string pointer */ + movl 8(%esp), %eax /* get stopset pointer */ + + /* First we create a table with flags for all possible characters. + For the ASCII (7bit/8bit) or ISO-8859-X character sets which are + supported by the C string functions we have 256 characters. + Before inserting marks for the stop characters we clear the whole + table. The unrolled form is much faster than a loop. */ + xorl %ecx, %ecx /* %ecx = 0 !!! 
*/ + + pushl %ecx /* make a 256 bytes long block filled with 0 */ + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl $0 /* These immediate values make the label 2 */ + pushl $0 /* to be aligned on a 16 byte boundary to */ + pushl $0 /* get a better performance of the loop. */ + pushl $0 + pushl $0 + pushl $0 + +/* For understanding the following code remember that %ecx == 0 now. + Although all the following instruction only modify %cl we always + have a correct zero-extended 32-bit value in %ecx. */ + +/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want + longer instructions so that the next loop aligns without adding nops. */ + +L2: movb (%eax), %cl /* get byte from stopset */ + testb %cl, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 1(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 2(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? 
*/ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 3(%eax), %cl /* get byte from stopset */ + addl $4, %eax /* increment stopset pointer */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + testb $0xff, %cl /* is NUL char? */ + jnz L2 /* no => process next dword from stopset */ + +L1: leal -4(%edx), %eax /* prepare loop */ + + /* We use a neat trick for the following loop. Normally we would + have to test for two termination conditions + 1. a character in the stopset was found + and + 2. the end of the string was found + But as a sign that the chracter is in the stopset we store its + value in the table. But the value of NUL is NUL so the loop + terminates for NUL in every case. */ + +L3: addl $4, %eax /* adjust pointer for full loop round */ + + movb (%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L4 /* yes => return */ + + movb 1(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L5 /* yes => return */ + + movb 2(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L6 /* yes => return */ + + movb 3(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + jne L3 /* yes => return */ + + incl %eax /* adjust pointer */ +L6: incl %eax +L5: incl %eax + +L4: addl $256, %esp /* remove stopset */ + + orb %cl, %cl /* was last character NUL? */ + jnz L7 /* no => return pointer */ + xorl %eax, %eax /* return NULL */ + +L7: ret diff --git a/sysdeps/i386/strrchr.S b/sysdeps/i386/strrchr.S new file mode 100644 index 0000000..468a940 --- /dev/null +++ b/sysdeps/i386/strrchr.S @@ -0,0 +1,321 @@ +/* strchr (str, ch) -- Return pointer to last occurrence of CH in STR. +For Intel 80x86, x>=3. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. 
+Contributed by Ulrich Drepper +Some optimisations by Alan Modra +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + str (sp + 4) + ch (sp + 8) +*/ + + .text +ENTRY (strrchr) + pushl %edi /* Save callee-saved registers used here. */ + pushl %esi + + xorl %eax, %eax /* result so far: NULL (no match seen yet) */ + movl 12(%esp), %esi /* get string pointer */ + movl 16(%esp), %ecx /* get character we are looking for */ + + /* At the moment %ecx contains C. What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require a + prefix byte (and one more cycle). */ + movb %cl, %ch /* now it is 0|0|c|c */ + movl %ecx, %edx + shll $16, %ecx /* now it is c|c|0|0 */ + movw %dx, %cx /* and finally c|c|c|c */ + + /* Before we start with the main loop we process single bytes + until the source pointer is aligned. This has two reasons: + 1. aligned 32-bit memory access is faster + and (more important) + 2. we process in the main loop 32 bit in one step although + we don't know the end of the string. 
But accessing at + 4-byte alignment guarantees that we never access illegal + memory if this would not also be done by the trivial + implementation (this is because all processor inherent + boundaries are multiples of 4). */ + + testl $3, %esi /* correctly aligned ? */ + jz L19 /* yes => begin loop */ + movb (%esi), %dl /* load byte in question (we need it twice) */ + cmpb %dl, %cl /* compare byte */ + jne L11 /* no match => skip */ + movl %esi, %eax /* remember pointer as possible result */ +L11: orb %dl, %dl /* is NUL? */ + jz L2 /* yes => return result found so far */ + incl %esi /* increment pointer */ + + testl $3, %esi /* correctly aligned ? */ + jz L19 /* yes => begin loop */ + movb (%esi), %dl /* load byte in question (we need it twice) */ + cmpb %dl, %cl /* compare byte */ + jne L12 /* no match => skip */ + movl %esi, %eax /* remember pointer as possible result */ +L12: orb %dl, %dl /* is NUL? */ + jz L2 /* yes => return result found so far */ + incl %esi /* increment pointer */ + + testl $3, %esi /* correctly aligned ? */ + jz L19 /* yes => begin loop */ + movb (%esi), %dl /* load byte in question (we need it twice) */ + cmpb %dl, %cl /* compare byte */ + jne L13 /* no match => skip */ + movl %esi, %eax /* remember pointer as possible result */ +L13: orb %dl, %dl /* is NUL? (test the loaded byte, not C) */ + jz L2 /* yes => return result found so far */ + incl %esi /* increment pointer */ + + /* Now we have reached alignment. */ + jmp L19 /* begin loop */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? 
Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + /* Each round the main loop processes 16 bytes. */ + + /* Jump to here when the character is detected. We chose this + way around because the character one is looking for is not + as frequent as the rest and taking a conditional jump is more + expensive than ignoring it. + + Some more words to the code below: it might not be obvious why + we decrement the source pointer here. In the loop the pointer + is not pre-incremented and so it still points before the word + we are looking at. But you should take a look at the instruction + which gets executed before we get into the loop: `addl $16, %esi'. + This makes the following subs into adds. */ + + /* These fill bytes are meant to align the main loop; their number + depends on the size of the code above (re-tune it when that changes). + We cannot use align because it is not the next instruction which should be aligned. */ + .byte 0, 0, 0, 0, 0, 0, 0, 0 + +L4: subl $4, %esi /* adjust pointer */ +L41: subl $4, %esi +L42: subl $4, %esi +L43: testl $0xff000000, %edx /* is highest byte == C? */ + jnz L33 /* no => try other bytes */ + leal 15(%esi), %eax /* store address as result */ + jmp L1 /* and start loop again */ + +L3: subl $4, %esi /* adjust pointer */ +L31: subl $4, %esi +L32: subl $4, %esi +L33: testl $0xff0000, %edx /* is C in third byte? 
*/ + jnz L51 /* no => try other bytes */ + leal 14(%esi), %eax /* store address as result */ + jmp L1 /* and start loop again */ + +L51: + /* At this point we know that the byte is in one of the lower bytes. + We make a guess and correct it if necessary. This reduces the + number of necessary jumps. */ + leal 12(%esi), %eax /* guess address of lowest byte as result */ + testb %dh, %dh /* is guess correct? */ + jnz L1 /* yes => start loop */ + leal 13(%esi), %eax /* correct guess to second byte */ + +L1: addl $16, %esi /* increment pointer for full round */ + +L19: movl (%esi), %edx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + + /* According to the algorithm we had to reverse the effect of the + XOR first and then test the overflow bits. But because the + following XOR would destroy the carry flag and it would (in a + representation with more than 32 bits) not alter the last + overflow, we can now test this condition. If no carry is signaled + no overflow must have occurred in the last byte => it was 0. */ + + jnc L20 /* found NUL => check last word */ + + /* We are only interested in carry bits that change due to the + previous add, so remove original bits */ + xorl %edx, %edi /* (word+magic)^word */ + + /* Now test for the other three overflow bits. */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + /* If at least one byte of the word is NUL we don't get 0 in %edi. */ + jnz L20 /* found NUL => check last word */ + + /* Now we made sure the dword does not contain a NUL byte, so the + string continues past it. What is left is to check whether the + dword contains the character we are looking for. 
*/ + + xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L4 /* highest byte is C => examine dword */ + xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L3 /* C is detected in the word => examine it */ + + movl 4(%esi), %edx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L21 /* found NUL => check last word */ + xorl %edx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L21 /* found NUL => check last word */ + xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L41 /* highest byte is C => examine dword */ + xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L31 /* C is detected in the word => examine it */ + + movl 8(%esi), %edx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. 
We get + carry bits reported for each byte which + is *not* 0 */ + jnc L22 /* found NUL => check last word */ + xorl %edx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L22 /* found NUL => check last word */ + xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L42 /* highest byte is C => examine dword */ + xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L32 /* C is detected in the word => examine it */ + + movl 12(%esi), %edx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L23 /* found NUL => check last word */ + xorl %edx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L23 /* found NUL => check last word */ + xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L43 /* highest byte is C => examine dword */ + xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. 
*/ + jz L1 /* C is not detected => restart loop */ + jmp L33 /* examine word */ + +L23: addl $4, %esi /* adjust pointer */ +L22: addl $4, %esi +L21: addl $4, %esi + + /* What remains to do is to test which byte the NUL char is and + whether the searched character appears in one of the bytes + before. A special case is that the searched byte maybe NUL. + In this case a pointer to the terminating NUL char has to be + returned. */ + +L20: cmpb %cl, %dl /* is first byte == C? */ + jne L24 /* no => skip */ + movl %esi, %eax /* store address as result */ +L24: testb %dl, %dl /* is first byte == NUL? */ + jz L2 /* yes => return */ + + cmpb %cl, %dh /* is second byte == C? */ + jne L25 /* no => skip */ + leal 1(%esi), %eax /* store address as result */ +L25: testb %dh, %dh /* is second byte == NUL? */ + jz L2 /* yes => return */ + + shrl $16,%edx /* make upper bytes accessible */ + cmpb %cl, %dl /* is third byte == C */ + jne L26 /* no => skip */ + leal 2(%esi), %eax /* store address as result */ +L26: testb %dl, %dl /* is third byte == NUL */ + jz L2 /* yes => return */ + + cmpb %cl, %dh /* is fourth byte == C */ + jne L2 /* no => skip */ + leal 3(%esi), %eax /* store address as result */ + +L2: popl %esi /* restore saved register content */ + popl %edi + + ret + +weak_alias (strrchr, rindex) diff --git a/sysdeps/i386/strspn.S b/sysdeps/i386/strspn.S new file mode 100644 index 0000000..1a02026 --- /dev/null +++ b/sysdeps/i386/strspn.S @@ -0,0 +1,176 @@ +/* strspn (str, ss) -- Return the length of the initial segment of STR + which contains only characters from SS. +For Intel 80x86, x>=3. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper +Bug fixes by Alan Modra +This file is part of the GNU C Library. 
+ +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + str (sp + 4) + skipset (sp + 8) +*/ + + .text +ENTRY (strspn) + movl 4(%esp), %edx /* get string pointer */ + movl 8(%esp), %eax /* get skipset pointer */ + + /* First we create a table with flags for all possible characters. + For the ASCII (7bit/8bit) or ISO-8859-X character sets which are + supported by the C string functions we have 256 characters. + Before inserting marks for the stop characters we clear the whole + table. The unrolled form is much faster than a loop. */ + xorl %ecx, %ecx /* %ecx = 0 !!! 
*/ + + pushl %ecx /* make a 256 bytes long block filled with 0 */ + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl $0 /* These immediate values make the label 2 */ + pushl $0 /* to be aligned on a 16 byte boundary to */ + pushl $0 /* get a better performance of the loop. */ + pushl $0 + pushl $0 + pushl $0 + +/* For understanding the following code remember that %ecx == 0 now. + Although all the following instruction only modify %cl we always + have a correct zero-extended 32-bit value in %ecx. */ + +/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want + longer instructions so that the next loop aligns without adding nops. */ + +L2: movb (%eax), %cl /* get byte from stopset */ + testb %cl, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 1(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 2(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? 
*/ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 3(%eax), %cl /* get byte from stopset */ + addl $4, %eax /* increment stopset pointer */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + testb $0xff, %cl /* is NUL char? */ + jnz L2 /* no => process next dword from stopset */ + +L1: leal -4(%edx), %eax /* prepare loop */ + + /* We use a neat trick for the following loop. Normally we would + have to test for two termination conditions + 1. a character in the stopset was found + and + 2. the end of the string was found + But as a sign that the chracter is in the stopset we store its + value in the table. But the value of NUL is NUL so the loop + terminates for NUL in every case. */ + +L3: addl $4, %eax /* adjust pointer for full loop round */ + + movb (%eax), %cl /* get byte from string */ + testb %cl, (%esp,%ecx) /* is it contained in skipset? */ + jz L4 /* no => return */ + + movb 1(%eax), %cl /* get byte from string */ + testb %cl, (%esp,%ecx) /* is it contained in skipset? */ + jz L5 /* no => return */ + + movb 2(%eax), %cl /* get byte from string */ + testb %cl, (%esp,%ecx) /* is it contained in skipset? */ + jz L6 /* no => return */ + + movb 3(%eax), %cl /* get byte from string */ + testb %cl, (%esp,%ecx) /* is it contained in skipset? */ + jnz L3 /* yes => start loop again */ + + incl %eax /* adjust pointer */ +L6: incl %eax +L5: incl %eax + +L4: subl %edx, %eax /* we have to return the number of valid + characters, so compute distance to first + non-valid character */ + addl $256, %esp /* remove stopset */ + + ret diff --git a/sysdeps/i386/sub_n.S b/sysdeps/i386/sub_n.S index 64d2c25..e18a708 100644 --- a/sysdeps/i386/sub_n.S +++ b/sysdeps/i386/sub_n.S @@ -1,7 +1,7 @@ /* i80386 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store sum in a third limb vector. -Copyright (C) 1992, 1994 Free Software Foundation, Inc. 
+Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -37,10 +37,10 @@ C_SYMBOL_NAME(__mpn_sub_n:) pushl %edi pushl %esi - movl 12(%esp),%edi /* res_ptr */ - movl 16(%esp),%esi /* s1_ptr */ - movl 20(%esp),%edx /* s2_ptr */ - movl 24(%esp),%ecx /* size */ + movl 12(%esp),%edi /* res_ptr */ + movl 16(%esp),%esi /* s1_ptr */ + movl 20(%esp),%edx /* s2_ptr */ + movl 24(%esp),%ecx /* size */ movl %ecx,%eax shrl $3,%ecx /* compute count for unrolled loop */ @@ -54,14 +54,18 @@ C_SYMBOL_NAME(__mpn_sub_n:) subl %eax,%edx /* ... enter the loop */ shrl $2,%eax /* restore previous value */ #ifdef PIC - call here -here: leal (Loop - 3 - here)(%eax,%eax,8),%eax - addl %eax,(%esp) - ret +/* Calculate start address in loop for PIC. Due to limitations in some + assemblers, Loop-L0-3 cannot be put into the leal */ + call L0 +L0: leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $(Loop-L0-3),%eax + addl $4,%esp #else - leal (Loop - 3)(%eax,%eax,8),%eax /* calc start addr in loop */ - jmp *%eax /* jump into loop */ +/* Calculate start address in loop for non-PIC. */ + leal (Loop - 3)(%eax,%eax,8),%eax #endif + jmp *%eax /* jump into loop */ ALIGN (3) Loop: movl (%esi),%eax sbbl (%edx),%eax -- cgit v1.1