From 78df0fcb80247ca7573a8ed07cc992b7031674c1 Mon Sep 17 00:00:00 2001 From: Andreas Jaeger Date: Sat, 31 Aug 2002 17:45:33 +0000 Subject: Update. * sysdeps/x86_64/dl-machine.h (elf_machine_runtime_setup): Declare external functions with hidden attribute. (elf_machine_rela): Optimize. * sysdeps/x86_64/memset.S: New file. * sysdeps/x86_64/bzero.S: New file. * sysdeps/x86_64/stpcpy.S: New file. * sysdeps/x86_64/strcat.S: New file. * sysdeps/x86_64/strchr.S: New file. * sysdeps/x86_64/strcpy.S: New file. * sysdeps/x86_64/strcspn.S: New file. * sysdeps/x86_64/strlen.S: New file. * sysdeps/x86_64/strpbrk.S: New file. * sysdeps/x86_64/strspn.S: New file. * sysdeps/x86_64/strcmp.S: New file. * sysdeps/x86_64/strtok_r.S: New file. * sysdeps/x86_64/strtok.S: New file. * sysdeps/x86_64/memcpy.S: New file. * sysdeps/x86_64/mempcpy.S: New file. --- ChangeLog | 20 +++ sysdeps/x86_64/bzero.S | 3 + sysdeps/x86_64/dl-machine.h | 9 +- sysdeps/x86_64/memcpy.S | 92 ++++++++++++++ sysdeps/x86_64/mempcpy.S | 5 + sysdeps/x86_64/memset.S | 131 ++++++++++++++++++++ sysdeps/x86_64/stpcpy.S | 7 ++ sysdeps/x86_64/strcat.S | 257 +++++++++++++++++++++++++++++++++++++++ sysdeps/x86_64/strchr.S | 290 ++++++++++++++++++++++++++++++++++++++++++++ sysdeps/x86_64/strcmp.S | 44 +++++++ sysdeps/x86_64/strcpy.S | 156 ++++++++++++++++++++++++ sysdeps/x86_64/strcspn.S | 123 +++++++++++++++++++ sysdeps/x86_64/strlen.S | 138 +++++++++++++++++++++ sysdeps/x86_64/strpbrk.S | 2 + sysdeps/x86_64/strspn.S | 113 +++++++++++++++++ sysdeps/x86_64/strtok.S | 208 +++++++++++++++++++++++++++++++ sysdeps/x86_64/strtok_r.S | 4 + 17 files changed, 1599 insertions(+), 3 deletions(-) create mode 100644 sysdeps/x86_64/bzero.S create mode 100644 sysdeps/x86_64/memcpy.S create mode 100644 sysdeps/x86_64/mempcpy.S create mode 100644 sysdeps/x86_64/memset.S create mode 100644 sysdeps/x86_64/stpcpy.S create mode 100644 sysdeps/x86_64/strcat.S create mode 100644 sysdeps/x86_64/strchr.S create mode 100644 sysdeps/x86_64/strcmp.S create mode 100644 sysdeps/x86_64/strcpy.S create mode 100644 sysdeps/x86_64/strcspn.S create mode 100644 sysdeps/x86_64/strlen.S create mode 100644 sysdeps/x86_64/strpbrk.S create mode 100644 sysdeps/x86_64/strspn.S create mode 100644 sysdeps/x86_64/strtok.S create mode 100644 sysdeps/x86_64/strtok_r.S diff --git a/ChangeLog b/ChangeLog index 44451b2..3881b41 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,25 @@ 2002-08-31 Andreas Jaeger + * sysdeps/x86_64/dl-machine.h (elf_machine_runtime_setup): Declare + external functions with hidden attribute. + (elf_machine_rela): Optimize. + + * sysdeps/x86_64/memset.S: New file. + * sysdeps/x86_64/bzero.S: New file. + * sysdeps/x86_64/stpcpy.S: New file. + * sysdeps/x86_64/strcat.S: New file. + * sysdeps/x86_64/strchr.S: New file. + * sysdeps/x86_64/strcpy.S: New file. + * sysdeps/x86_64/strcspn.S: New file. + * sysdeps/x86_64/strlen.S: New file. + * sysdeps/x86_64/strpbrk.S: New file. + * sysdeps/x86_64/strspn.S: New file. + * sysdeps/x86_64/strcmp.S: New file. + * sysdeps/x86_64/strtok_r.S: New file. + * sysdeps/x86_64/strtok.S: New file. + * sysdeps/x86_64/memcpy.S: New file. + * sysdeps/x86_64/mempcpy.S: New file. + * sysdeps/x86_64/fpu/s_copysign.S: Fix algorithm. * sysdeps/x86_64/fpu/libm-test-ulps: Add ulps for double tests. diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S new file mode 100644 index 0000000..2688f45 --- /dev/null +++ b/sysdeps/x86_64/bzero.S @@ -0,0 +1,3 @@ +#define memset __bzero +#include +weak_alias (__bzero, bzero) diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h index adf108c..2800a9e 100644 --- a/sysdeps/x86_64/dl-machine.h +++ b/sysdeps/x86_64/dl-machine.h @@ -84,8 +84,8 @@ static inline int __attribute__ ((unused)) elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) { Elf64_Addr *got; - extern void _dl_runtime_resolve (Elf64_Word); - extern void _dl_runtime_profile (Elf64_Word); + extern void _dl_runtime_resolve (Elf64_Word) attribute_hidden; + extern void _dl_runtime_profile (Elf64_Word) attribute_hidden; if (l->l_info[DT_JMPREL] && lazy) { @@ -367,7 +367,10 @@ elf_machine_rela (struct link_map *map, const Elf64_Rela *reloc, const Elf64_Sym *const refsym = sym; #endif Elf64_Addr value = RESOLVE (&sym, version, r_type); - if (sym) + +# ifndef RTLD_BOOTSTRAP + if (sym != NULL) +# endif value += sym->st_value; #ifdef RTLD_BOOTSTRAP diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S new file mode 100644 index 0000000..1339036 --- /dev/null +++ b/sysdeps/x86_64/memcpy.S @@ -0,0 +1,92 @@ +/* Highly optimized version for x86-64. + Copyright (C) 1997, 2000, 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Based on i586 version contributed by Ulrich Drepper , 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + +/* BEWARE: `#ifdef memcpy' means that memcpy is redefined as `mempcpy', + and the return value is the byte after the last one copied in + the destination. */ +#define MEMPCPY_P (defined memcpy) + + .text +ENTRY (BP_SYM (memcpy)) + /* Cutoff for the big loop is a size of 32 bytes since otherwise + the loop will never be entered. */ + cmpq $32, %rdx + movq %rdx, %rcx +#if !MEMPCPY_P + movq %rdi, %r10 /* Save value. */ +#endif + + /* We need this in any case. */ + cld + + jbe 1f + + /* Align destination. */ + movq %rdi, %rax + negq %rax + andq $3, %rax + subq %rax, %rcx + xchgq %rax, %rcx + + rep; movsb + + movq %rax, %rcx + subq $32, %rcx + js 2f + + .p2align 4 +3: + + /* Now correct the loop counter. Please note that in the following + code the flags are not changed anymore. */ + subq $32, %rcx + + movq (%rsi), %rax + movq 8(%rsi), %rdx + movq 16(%rsi), %r8 + movq 24(%rsi), %r9 + movq %rax, (%rdi) + movq %rdx, 8(%rdi) + movq %r8, 16(%rdi) + movq %r9, 24(%rdi) + + leaq 32(%rsi), %rsi + leaq 32(%rdi), %rdi + + jns 3b + + /* Correct extra loop counter modification. */ +2: addq $32, %rcx +1: rep; movsb + +#if MEMPCPY_P + movq %rdi, %rax /* Set return value. */ +#else + movq %r10, %rax /* Set return value. */ + +#endif + ret + +END (BP_SYM (memcpy)) diff --git a/sysdeps/x86_64/mempcpy.S b/sysdeps/x86_64/mempcpy.S new file mode 100644 index 0000000..38fdd05 --- /dev/null +++ b/sysdeps/x86_64/mempcpy.S @@ -0,0 +1,5 @@ +#define memcpy __mempcpy +#include + +libc_hidden_def (BP_SYM (__mempcpy)) +weak_alias (BP_SYM (__mempcpy), BP_SYM (mempcpy)) diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S new file mode 100644 index 0000000..b95ca40 --- /dev/null +++ b/sysdeps/x86_64/memset.S @@ -0,0 +1,131 @@ +/* memset/bzero -- set memory area to CH/0 + Optimized version for x86-64. + Copyright (C) 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + +/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero' */ +#define BZERO_P (defined memset) + +/* This is somehow experimental and could made dependend on the cache + size. */ +#define LARGE $120000 + + .text +ENTRY (memset) +#if BZERO_P + mov %rsi,%rdx /* Adjust parameter. */ + xorq %rsi,%rsi /* Fill with 0s. */ +#endif + cmp $0x7,%rdx /* Check for small length. */ + mov %rdi,%rcx /* Save ptr as return value. */ + jbe 7f + +#if BZERO_P + mov %rsi,%r8 /* Just copy 0. */ +#else + /* Populate 8 bit data to full 64-bit. */ + movabs $0x0101010101010101,%r8 + movzbl %sil,%eax + imul %rax,%r8 +#endif + test $0x7,%edi /* Check for alignment. */ + je 2f + + .p2align 4 +1: /* Align ptr to 8 byte. */ + mov %sil,(%rcx) + dec %rdx + inc %rcx + test $0x7,%ecx + jne 1b + +2: /* Check for really large regions. */ + mov %rdx,%rax + shr $0x6,%rax + je 4f + cmp LARGE, %rdx + jae 11f + + .p2align 4 +3: /* Copy 64 bytes. */ + mov %r8,(%rcx) + mov %r8,0x8(%rcx) + mov %r8,0x10(%rcx) + mov %r8,0x18(%rcx) + mov %r8,0x20(%rcx) + mov %r8,0x28(%rcx) + mov %r8,0x30(%rcx) + mov %r8,0x38(%rcx) + add $0x40,%rcx + dec %rax + jne 3b + +4: /* Copy final bytes. */ + and $0x3f,%edx + mov %rdx,%rax + shr $0x3,%rax + je 6f + +5: /* First in chunks of 8 bytes. */ + mov %r8,(%rcx) + add $0x8,%rcx + dec %rax + jne 5b +6: + and $0x7,%edx +7: + test %rdx,%rdx + je 9f +8: /* And finally as bytes (up to 7). */ + mov %sil,(%rcx) + inc %rcx + dec %rdx + jne 8b +9: +#if BZERO_P + nop +#else + /* Load result (only if used as memset). */ + mov %rdi,%rax /* start address of destination is result */ +#endif + retq + + .p2align 4 +11: /* Copy 64 bytes without polluting the cache. */ + /* We could use movntdq %xmm0,(%rcx) here to further + speed up for large cases but let's not use XMM registers. */ + movnti %r8,(%rcx) + movnti %r8,0x8(%rcx) + movnti %r8,0x10(%rcx) + movnti %r8,0x18(%rcx) + movnti %r8,0x20(%rcx) + movnti %r8,0x28(%rcx) + movnti %r8,0x30(%rcx) + movnti %r8,0x38(%rcx) + add $0x40,%rcx + dec %rax + jne 11b + jmp 4b + +END (memset) diff --git a/sysdeps/x86_64/stpcpy.S b/sysdeps/x86_64/stpcpy.S new file mode 100644 index 0000000..b9bbcd9 --- /dev/null +++ b/sysdeps/x86_64/stpcpy.S @@ -0,0 +1,7 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy + +#include + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S new file mode 100644 index 0000000..549fd21 --- /dev/null +++ b/sysdeps/x86_64/strcat.S @@ -0,0 +1,257 @@ +/* strcat(dest, src) -- Append SRC on the end of DEST. + Optimized for x86-64. + Copyright (C) 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger , 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + + + .text +ENTRY (BP_SYM (strcat)) + movq %rdi, %rcx /* Dest. register. */ + andl $7, %ecx /* mask alignment bits */ + movq %rdi, %rax /* Duplicate destination pointer. */ + movq $0xfefefefefefefeff,%r8 + + /* First step: Find end of destination. */ + jz 4f /* aligned => start loop */ + + neg %ecx /* We need to align to 8 bytes. */ + addl $8,%ecx + /* Search the first bytes directly. */ +0: cmpb $0x0,(%rax) /* is byte NUL? */ + je 2f /* yes => start copy */ + incq %rax /* increment pointer */ + decl %ecx + jnz 0b + + + + /* Now the source is aligned. Scan for NUL byte. */ + .p2align 4 +4: + /* First unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Second unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Third unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Fourth unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => continue loop */ + + .p2align 4 /* Align, it's a jump target. */ +3: subq $8,%rax /* correct pointer increment. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0x00ff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + testl $0xff000000, %ecx /* is fourth byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + shrq $32, %rcx /* look at other half. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0xff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + +2: + /* Second step: Copy source to destination. */ + + movq %rax, %rcx /* duplicate */ + andl $7,%ecx /* mask alignment bits */ + movq %rax, %rdx /* move around */ + jz 22f /* aligned => start loop */ + + /* Align the source pointer. */ +21: + movb (%rsi), %al /* Fetch a byte */ + testb %al, %al /* Is it NUL? */ + movb %al, (%rdx) /* Store it */ + jz 24f /* If it was NUL, done! */ + incq %rsi + incq %rdx + decl %ecx + jnz 21b + + /* Now the sources is aligned. Unfortunatly we cannot force + to have both source and destination aligned, so ignore the + alignment of the destination. */ + .p2align 4 +22: + /* 1st unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 2nd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 3rd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 4th unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + jmp 22b /* Next iteration. */ + + /* Do the last few bytes. %rax contains the value to write. + The loop is unrolled twice. */ + .p2align 4 +23: + movb %al, (%rdx) /* 1st byte. */ + testb %al, %al /* Is it NUL. */ + jz 24f /* yes, finish. */ + incq %rdx /* Increment destination. */ + movb %ah, (%rdx) /* 2nd byte. */ + testb %ah, %ah /* Is it NUL?. */ + jz 24f /* yes, finish. */ + incq %rdx /* Increment destination. */ + shrq $16, %rax /* Shift... */ + jmp 23b /* and look at next two bytes in %rax. */ + + +24: + movq %rdi, %rax /* Source is return value. */ + retq +END (BP_SYM (strcat)) diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S new file mode 100644 index 0000000..391f575 --- /dev/null +++ b/sysdeps/x86_64/strchr.S @@ -0,0 +1,290 @@ +/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. + For AMD x86-64. + Copyright (C) 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + + + .text +ENTRY (BP_SYM (strchr)) + + /* Before we start with the main loop we process single bytes + until the source pointer is aligned. This has two reasons: + 1. aligned 64-bit memory access is faster + and (more important) + 2. we process in the main loop 64 bit in one step although + we don't know the end of the string. But accessing at + 8-byte alignment guarantees that we never access illegal + memory if this would not also be done by the trivial + implementation (this is because all processor inherent + boundaries are multiples of 8. */ + + movq %rdi, %rcx + andl $7, %ecx /* Mask alignment bits */ + movq %rdi, %rax /* duplicate destination. */ + jz 1f /* aligned => start loop */ + neg %ecx + addl $8, %ecx /* Align to 8 bytes. */ + + /* Search the first bytes directly. */ +0: movb (%rax), %cl /* load byte */ + cmpb %cl,%sil /* compare byte. */ + je 6f /* target found */ + testb %cl,%cl /* is byte NUL? */ + je 7f /* yes => return NULL */ + incq %rax /* increment pointer */ + decl %ecx + jnz 0b + + +1: + /* At the moment %rsi contains C. What we need for the + algorithm is C in all bytes of the register. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + /* Populate 8 bit data to full 64-bit. */ + movabs $0x0101010101010101,%r9 + movzbl %sil,%edx + imul %rdx,%r9 + + movq $0xfefefefefefefeff, %r8 /* Save magic. */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of QUARDWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24 tec.. If one of bits 54-63 is set, there will be a carry + into bit 64 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + .p2align 4 +4: + /* Main Loop is unrolled 4 times. */ + /* First unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + + /* Second unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + /* Third unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + /* Fourth unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => restart loop */ + + +7: /* Return NULL. */ + xorq %rax, %rax + retq + + + /* We now scan for the byte in which the character was matched. + But we have to take care of the case that a NUL char is + found before this in the dword. Note that we XORed %rcx + with the byte we're looking for, therefore the tests below look + reversed. */ + + + .p2align 4 /* Align, it's a jump target. */ +3: movq %r9,%rdx /* move to %rdx so that we can access bytes */ + subq $8,%rax /* correct pointer increment. */ + testb %cl, %cl /* is first byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is first byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is second byte NUL? */ + je 7b /* yes => return NULL? */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is third byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is third byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is fourth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is fourth byte NUL? */ + je 7b /* yes => return NULL? */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is fifth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is fifth byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is sixth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is sixth byte NUL? */ + je 7b /* yes => return NULL? */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is seventh byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is seventh byte NUL? */ + je 7b /* yes => return NULL */ + + /* It must be in the eigth byte and it cannot be NUL. */ + incq %rax + +6: + nop + retq +END (BP_SYM (strchr)) + +weak_alias (BP_SYM (strchr), BP_SYM (index)) diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S new file mode 100644 index 0000000..6e6bdcb --- /dev/null +++ b/sysdeps/x86_64/strcmp.S @@ -0,0 +1,44 @@ +/* Highly optimized version for x86-64. + Copyright (C) 1999, 2000, 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Based on i686 version contributed by Ulrich Drepper + , 1999. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + + .text +ENTRY (BP_SYM (strcmp)) +L(oop): movb (%rdi), %al + cmpb (%rsi), %al + jne L(neq) + incq %rdi + incq %rsi + testb %al, %al + jnz L(oop) + + xorq %rax, %rax + ret + +L(neq): movl $1, %eax + movl $-1, %ecx + cmovbl %ecx, %eax + ret +END (BP_SYM (strcmp)) diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S new file mode 100644 index 0000000..f178b9b --- /dev/null +++ b/sysdeps/x86_64/strcpy.S @@ -0,0 +1,156 @@ +/* strcpy/stpcpy implementation for x86-64. + Copyright (C) 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger , 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + +#ifndef USE_AS_STPCPY +# define STRCPY strcpy +#endif + + .text +ENTRY (BP_SYM (STRCPY)) + movq %rsi, %rcx /* Source register. */ + andl $7, %ecx /* mask alignment bits */ + movq %rdi, %rdx /* Duplicate destination pointer. */ + + jz 5f /* aligned => start loop */ + + neg %ecx /* We need to align to 8 bytes. */ + addl $8,%ecx + /* Search the first bytes directly. */ +0: + movb (%rsi), %al /* Fetch a byte */ + testb %al, %al /* Is it NUL? */ + movb %al, (%rdx) /* Store it */ + jz 4f /* If it was NUL, done! */ + incq %rsi + incq %rdx + decl %ecx + jnz 0b + +5: + movq $0xfefefefefefefeff,%r8 + + /* Now the sources is aligned. Unfortunatly we cannot force + to have both source and destination aligned, so ignore the + alignment of the destination. */ + .p2align 4 +1: + /* 1st unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 2nd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 3rd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 4th unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + jmp 1b /* Next iteration. */ + + /* Do the last few bytes. %rax contains the value to write. + The loop is unrolled twice. */ + .p2align 4 +3: + /* Note that stpcpy needs to return with the value of the NUL + byte. */ + movb %al, (%rdx) /* 1st byte. */ + testb %al, %al /* Is it NUL. */ + jz 4f /* yes, finish. */ + incq %rdx /* Increment destination. */ + movb %ah, (%rdx) /* 2nd byte. */ + testb %ah, %ah /* Is it NUL?. */ + jz 4f /* yes, finish. */ + incq %rdx /* Increment destination. */ + shrq $16, %rax /* Shift... */ + jmp 3b /* and look at next two bytes in %rax. */ + +4: +#ifdef USE_AS_STPCPY + movq %rdx, %rax /* Destination is return value. */ +#else + movq %rdi, %rax /* Source is return value. */ +#endif + retq +END (BP_SYM (STRCPY)) diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S new file mode 100644 index 0000000..b488161 --- /dev/null +++ b/sysdeps/x86_64/strcspn.S @@ -0,0 +1,123 @@ +/* strcspn (str, ss) -- Return the length of the initial segment of STR + which contains no characters from SS. + For AMD x86-64. + Copyright (C) 1994, 1995, 1996, 1997, 2000, 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper . + Bug fixes by Alan Modra . + Adopted for x86-64 by Andreas Jaeger . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include "asm-syntax.h" + +/* BEWARE: `#ifdef strcspn' means that strcspn is redefined as `strpbrk' */ +#define STRPBRK_P (defined strcspn) + + .text +ENTRY (strcspn) + + movq %rdi, %rdx /* Save SRC. */ + + /* First we create a table with flags for all possible characters. + For the ASCII (7bit/8bit) or ISO-8859-X character sets which are + supported by the C string functions we have 256 characters. + Before inserting marks for the stop characters we clear the whole + table. */ + movq %rdi, %r8 /* Save value. */ + subq $256, %rsp /* Make space for 256 bytes. */ + movq $32, %rcx /* 32*8 bytes = 256 bytes. */ + movq %rsp, %rdi + xorq %rax, %rax /* We store 0s. */ + cld + rep + stosq + + movq %rsi, %rax /* Setup skipset. */ + +/* For understanding the following code remember that %rcx == 0 now. + Although all the following instruction only modify %cl we always + have a correct zero-extended 64-bit value in %rcx. */ + + .p2align 4 +L(2): movb (%rax), %cl /* get byte from skipset */ + testb %cl, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ + + movb 1(%rax), %cl /* get byte from skipset */ + testb $0xff, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ + + movb 2(%rax), %cl /* get byte from skipset */ + testb $0xff, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ + + movb 3(%rax), %cl /* get byte from skipset */ + addq $4, %rax /* increment skipset pointer */ + movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ + testb $0xff, %cl /* is NUL char? */ + jnz L(2) /* no => process next dword from skipset */ + +L(1): leaq -4(%rdx), %rax /* prepare loop */ + + /* We use a neat trick for the following loop. Normally we would + have to test for two termination conditions + 1. a character in the skipset was found + and + 2. the end of the string was found + But as a sign that the character is in the skipset we store its + value in the table. But the value of NUL is NUL so the loop + terminates for NUL in every case. */ + + .p2align 4 +L(3): addq $4, %rax /* adjust pointer for full loop round */ + + movb (%rax), %cl /* get byte from string */ + cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ + je L(4) /* yes => return */ + + movb 1(%rax), %cl /* get byte from string */ + cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ + je L(5) /* yes => return */ + + movb 2(%rax), %cl /* get byte from string */ + cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ + jz L(6) /* yes => return */ + + movb 3(%rax), %cl /* get byte from string */ + cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ + jne L(3) /* no => start loop again */ + + incq %rax /* adjust pointer */ +L(6): incq %rax +L(5): incq %rax + +L(4): addq $256, %rsp /* remove skipset */ +#if STRPBRK_P + xorq %rdx,%rdx + orb %cl, %cl /* was last character NUL? */ + cmovzq %rdx, %rax /* Yes: return NULL */ +#else + subq %rdx, %rax /* we have to return the number of valid + characters, so compute distance to first + non-valid character */ +#endif + ret +END (strcspn) diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S new file mode 100644 index 0000000..4441ba7 --- /dev/null +++ b/sysdeps/x86_64/strlen.S @@ -0,0 +1,138 @@ +/* strlen(str) -- determine the length of the string STR. + Copyright (C) 2002 Free Software Foundation, Inc. + Based on i486 version contributed by Ulrich Drepper . + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + + + .text +ENTRY (strlen) + movq %rdi, %rcx /* Duplicate source pointer. */ + andl $7, %ecx /* mask alignment bits */ + movq %rdi, %rax /* duplicate destination. */ + jz 1f /* aligned => start loop */ + + neg %ecx /* We need to align to 8 bytes. */ + addl $8,%ecx + /* Search the first bytes directly. */ +0: cmpb $0x0,(%rax) /* is byte NUL? */ + je 2f /* yes => return */ + incq %rax /* increment pointer */ + decl %ecx + jnz 0b + +1: movq $0xfefefefefefefeff,%r8 /* Save magic. */ + + .p2align 4 /* Align loop. */ +4: /* Main Loop is unrolled 4 times. */ + /* First unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Second unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Third unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Fourth unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => continue loop */ + + .p2align 4 /* Align, it's a jump target. */ +3: subq $8,%rax /* correct pointer increment. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0x00ff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + testl $0xff000000, %ecx /* is fourth byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + shrq $32, %rcx /* look at other half. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0xff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ +2: + subq %rdi, %rax /* compute difference to string start */ + ret +END (strlen) diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S new file mode 100644 index 0000000..9b97ada --- /dev/null +++ b/sysdeps/x86_64/strpbrk.S @@ -0,0 +1,2 @@ +#define strcspn strpbrk +#include diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S new file mode 100644 index 0000000..a8f0c07 --- /dev/null +++ b/sysdeps/x86_64/strspn.S @@ -0,0 +1,113 @@ +/* strspn (str, ss) -- Return the length of the initial segment of STR + which contains only characters from SS. + For AMD x86-64. + Copyright (C) 1994, 1995, 1996, 1997, 2000, 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper . + Bug fixes by Alan Modra . + Adopted for x86-64 by Andreas Jaeger . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include + + .text +ENTRY (strspn) + + movq %rdi, %rdx /* Save SRC. */ + + /* First we create a table with flags for all possible characters. + For the ASCII (7bit/8bit) or ISO-8859-X character sets which are + supported by the C string functions we have 256 characters. + Before inserting marks for the stop characters we clear the whole + table. */ + movq %rdi, %r8 /* Save value. */ + subq $256, %rsp /* Make space for 256 bytes. */ + movq $32, %rcx /* 32*8 bytes = 256 bytes. */ + movq %rsp, %rdi + xorq %rax, %rax /* We store 0s. */ + cld + rep + stosq + + movq %rsi, %rax /* Setup stopset. */ + +/* For understanding the following code remember that %rcx == 0 now. + Although all the following instruction only modify %cl we always + have a correct zero-extended 64-bit value in %rcx. */ + + .p2align 4 +L(2): movb (%rax), %cl /* get byte from stopset */ + testb %cl, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ + + movb 1(%rax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ + + movb 2(%rax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ + + movb 3(%rax), %cl /* get byte from stopset */ + addq $4, %rax /* increment stopset pointer */ + movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ + testb $0xff, %cl /* is NUL char? */ + jnz L(2) /* no => process next dword from stopset */ + +L(1): leaq -4(%rdx), %rax /* prepare loop */ + + /* We use a neat trick for the following loop. Normally we would + have to test for two termination conditions + 1. a character in the stopset was found + and + 2. the end of the string was found + But as a sign that the character is in the stopset we store its + value in the table. But the value of NUL is NUL so the loop + terminates for NUL in every case. */ + + .p2align 4 +L(3): addq $4, %rax /* adjust pointer for full loop round */ + + movb (%rax), %cl /* get byte from string */ + testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ + jz L(4) /* no => return */ + + movb 1(%rax), %cl /* get byte from string */ + testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ + jz L(5) /* no => return */ + + movb 2(%rax), %cl /* get byte from string */ + testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ + jz L(6) /* no => return */ + + movb 3(%rax), %cl /* get byte from string */ + testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ + jnz L(3) /* yes => start loop again */ + + incq %rax /* adjust pointer */ +L(6): incq %rax +L(5): incq %rax + +L(4): addq $256, %rsp /* remove stopset */ + subq %rdx, %rax /* we have to return the number of valid + characters, so compute distance to first + non-valid character */ + ret +END (strspn) diff --git a/sysdeps/x86_64/strtok.S b/sysdeps/x86_64/strtok.S new file mode 100644 index 0000000..771bd2d --- /dev/null +++ b/sysdeps/x86_64/strtok.S @@ -0,0 +1,208 @@ +/* strtok (str, delim) -- Return next DELIM separated token from STR. + For AMD x86-64. + Copyright (C) 1998, 2000, 2001, 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Based on i686 version contributed by Ulrich Drepper + , 1998. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + +/* This file can be used for the strtok and strtok_r functions: + + strtok: + INPUT PARAMETER: + str %rdi + delim %rsi + + strtok_r: + INPUT PARAMETER: + str %rdi + delim %rsi + save_ptr %rdx + + We do a common implementation here. */ + +#ifdef USE_AS_STRTOK_R +# define SAVE_PTR (%r9) +#else + .bss + .local save_ptr + ASM_TYPE_DIRECTIVE (save_ptr, @object) + .size save_ptr, 8 +save_ptr: + .space 8 + +# ifdef PIC +# define SAVE_PTR save_ptr(%rip) +# else +# define SAVE_PTR save_ptr +# endif + +# define FUNCTION strtok +#endif + + .text +ENTRY (BP_SYM (FUNCTION)) + /* First we create a table with flags for all possible characters. + For the ASCII (7bit/8bit) or ISO-8859-X character sets which are + supported by the C string functions we have 256 characters. + Before inserting marks for the stop characters we clear the whole + table. */ + movq %rdi, %r8 /* Save value. */ + subq $256, %rsp /* Make space for 256 bytes. */ + movq $32, %rcx /* 32*8 bytes = 256 bytes. */ + movq %rsp, %rdi + xorq %rax, %rax /* We store 0s. */ + cld + rep + stosq + + /* Note: %rcx = 0 !!! */ + +#ifdef USE_AS_STRTOK_R + /* The value is stored in the third argument. */ + movq %rdx, %rax + movq %rdx, %r9 /* Save value - see def. of SAVE_PTR. */ + movq (%rax), %rax +#else + /* The value is in the local variable defined above. But + we have to take care for PIC code. */ + movq SAVE_PTR, %rax +#endif + movq %r8, %rdx /* Get start of string. */ + + /* If the pointer is NULL we have to use the stored value of + the last run. */ + cmpq $0, %rdx + cmove %rax, %rdx + testq %rdx, %rdx + jz L(returnNULL) + movq %rsi, %rax /* Get start of delimiter set. */ + +/* For understanding the following code remember that %rcx == 0 now. + Although all the following instruction only modify %cl we always + have a correct zero-extended 64-bit value in %rcx. */ + +L(2): movb (%rax), %cl /* get byte from stopset */ + testb %cl, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ + + movb 1(%rax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ + + movb 2(%rax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ + + movb 3(%rax), %cl /* get byte from stopset */ + addq $4, %rax /* increment stopset pointer */ + movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ + testb $0xff, %cl /* is NUL char? */ + jnz L(2) /* no => process next dword from stopset */ + +L(1): + + leaq -4(%rdx), %rax /* prepare loop */ + + /* We use a neat trick for the following loop. Normally we would + have to test for two termination conditions + 1. a character in the stopset was found + and + 2. the end of the string was found + As a sign that the character is in the stopset we store its + value in the table. The value of NUL is NUL so the loop + terminates for NUL in every case. */ + +L(3): addq $4, %rax /* adjust pointer for full loop round */ + + movb (%rax), %cl /* get byte from string */ + testb %cl, (%rsp,%rcx) /* is it contained in stopset? */ + jz L(4) /* no => start of token */ + + movb 1(%rax), %cl /* get byte from string */ + testb %cl, (%rsp,%rcx) /* is it contained in stopset? */ + jz L(5) /* no => start of token */ + + movb 2(%rax), %cl /* get byte from string */ + testb %cl, (%rsp,%rcx) /* is it contained in stopset? */ + jz L(6) /* no => start of token */ + + movb 3(%rax), %cl /* get byte from string */ + testb %cl, (%rsp,%rcx) /* is it contained in stopset? */ + jnz L(3) /* yes => start of loop */ + + incq %rax /* adjust pointer */ +L(6): incq %rax +L(5): incq %rax + + /* Now we have to terminate the string. */ + +L(4): leaq -4(%rax), %rdx /* We use %rDX for the next run. */ + +L(7): addq $4, %rdx /* adjust pointer for full loop round */ + + movb (%rdx), %cl /* get byte from string */ + cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ + je L(8) /* yes => return */ + + movb 1(%rdx), %cl /* get byte from string */ + cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ + je L(9) /* yes => return */ + + movb 2(%rdx), %cl /* get byte from string */ + cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ + je L(10) /* yes => return */ + + movb 3(%rdx), %cl /* get byte from string */ + cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ + jne L(7) /* no => start loop again */ + + incq %rdx /* adjust pointer */ +L(10): incq %rdx +L(9): incq %rdx + +L(8): cmpq %rax, %rdx + je L(returnNULL) /* There was no token anymore. */ + + movb $0, (%rdx) /* Terminate string. */ + + /* Are we at end of string? */ + cmpb $0, %cl + leaq 1(%rdx), %rcx + cmovne %rcx, %rdx + + /* Store the pointer to the next character. */ + movq %rdx, SAVE_PTR + +L(epilogue): + /* Remove the stopset table. */ + addq $256, %rsp + retq + +L(returnNULL): + xorq %rax, %rax + jmp L(epilogue) + +END (BP_SYM (FUNCTION)) diff --git a/sysdeps/x86_64/strtok_r.S b/sysdeps/x86_64/strtok_r.S new file mode 100644 index 0000000..0248f27 --- /dev/null +++ b/sysdeps/x86_64/strtok_r.S @@ -0,0 +1,4 @@ +#define FUNCTION __strtok_r +#define USE_AS_STRTOK_R 1 +#include +weak_alias (BP_SYM (__strtok_r), BP_SYM (strtok_r)) -- cgit v1.1