diff options
author | Ulrich Drepper <drepper@redhat.com> | 2009-04-06 03:29:26 +0000 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2009-04-06 03:29:26 +0000 |
commit | 1df6f9d80819118f25bcff2aecaf5846aba8e766 (patch) | |
tree | e7a7985088b2d0e88da084e5fa818ef8c1b29c63 | |
parent | a152f366dc38314c891c523639706a20e353dd6b (diff) | |
download | glibc-1df6f9d80819118f25bcff2aecaf5846aba8e766.zip glibc-1df6f9d80819118f25bcff2aecaf5846aba8e766.tar.gz glibc-1df6f9d80819118f25bcff2aecaf5846aba8e766.tar.bz2 |
* sysdeps/x86_64/strchr.S: Likewise.
-rw-r--r-- | ChangeLog | 1 | ||||
-rw-r--r-- | NEWS | 2 | ||||
-rw-r--r-- | sysdeps/x86_64/strchr.S | 313 |
3 files changed, 48 insertions, 268 deletions
@@ -1,6 +1,7 @@ 2009-04-05 Ulrich Drepper <drepper@redhat.com> * sysdeps/x86_64/strlen.S: Optimize by using SSE2 instructions. + * sysdeps/x86_64/strchr.S: Likewise. 2009-04-03 Ulrich Drepper <drepper@redhat.com> @@ -37,7 +37,7 @@ Version 2.10 * New locale: nan_TW@latin -* Faster strlen on x86-64. +* Faster strlen and strchr on x86-64. Implemented by Ulrich Drepper. diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S index 8934697..8833cd0 100644 --- a/sysdeps/x86_64/strchr.S +++ b/sysdeps/x86_64/strchr.S @@ -1,6 +1,6 @@ /* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. For AMD x86-64. - Copyright (C) 2002, 2005 Free Software Foundation, Inc. + Copyright (C) 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,273 +19,52 @@ 02111-1307 USA. */ #include <sysdep.h> -#include "asm-syntax.h" -#include "bp-sym.h" -#include "bp-asm.h" .text -ENTRY (BP_SYM (strchr)) - - /* Before we start with the main loop we process single bytes - until the source pointer is aligned. This has two reasons: - 1. aligned 64-bit memory access is faster - and (more important) - 2. we process in the main loop 64 bit in one step although - we don't know the end of the string. But accessing at - 8-byte alignment guarantees that we never access illegal - memory if this would not also be done by the trivial - implementation (this is because all processor inherent - boundaries are multiples of 8). */ - - movq %rdi, %rdx - andl $7, %edx /* Mask alignment bits */ - movq %rdi, %rax /* duplicate destination. */ - jz 1f /* aligned => start loop */ - neg %edx - addl $8, %edx /* Align to 8 bytes. */ - - /* Search the first bytes directly. */ -0: movb (%rax), %cl /* load byte */ - cmpb %cl,%sil /* compare byte. */ - je 6f /* target found */ - testb %cl,%cl /* is byte NUL? */ - je 7f /* yes => return NULL */ - incq %rax /* increment pointer */ - decl %edx - jnz 0b - - -1: - /* At the moment %rsi contains C. What we need for the - algorithm is C in all bytes of the register. Avoid - operations on 16 bit words because these require an - prefix byte (and one more cycle). */ - /* Populate 8 bit data to full 64-bit. */ - movabs $0x0101010101010101,%r9 - movzbl %sil,%edx - imul %rdx,%r9 - - movq $0xfefefefefefefeff, %r8 /* Save magic. */ - - /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to - change any of the hole bits of LONGWORD. - - 1) Is this safe? Will it catch all the zero bytes? - Suppose there is a byte with all zeros. Any carry bits - propagating from its left will fall into the hole at its - least significant bit and stop. Since there will be no - carry from its most significant bit, the LSB of the - byte to the left will be unchanged, and the zero will be - detected. - - 2) Is this worthwhile? Will it ignore everything except - zero bytes? Suppose every byte of QUARDWORD has a bit set - somewhere. There will be a carry into bit 8. If bit 8 - is set, this will carry into bit 16. If bit 8 is clear, - one of bits 9-15 must be set, so there will be a carry - into bit 16. Similarly, there will be a carry into bit - 24 tec.. If one of bits 54-63 is set, there will be a carry - into bit 64 (=carry flag), so all of the hole bits will - be changed. - - 3) But wait! Aren't we looking for C, not zero? - Good point. So what we do is XOR LONGWORD with a longword, - each of whose bytes is C. This turns each byte that is C - into a zero. */ - - .p2align 4 -4: - /* Main Loop is unrolled 4 times. */ - /* First unroll. */ - movq (%rax), %rcx /* get double word (= 8 bytes) in question */ - addq $8,%rax /* adjust pointer for next word */ - movq %r8, %rdx /* magic value */ - xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c - are now 0 */ - addq %rcx, %rdx /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 3f /* highest byte is NUL => return pointer */ - xorq %rcx, %rdx /* (word+magic)^word */ - orq %r8, %rdx /* set all non-carry bits */ - incq %rdx /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - jnz 3f /* found c => return pointer */ - - /* The quadword we looked at does not contain the value we're looking - for. Let's search now whether we have reached the end of the - string. */ - xorq %r9, %rcx /* restore original dword without reload */ - movq %r8, %rdx /* magic value */ - addq %rcx, %rdx /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 7f /* highest byte is NUL => return NULL */ - xorq %rcx, %rdx /* (word+magic)^word */ - orq %r8, %rdx /* set all non-carry bits */ - incq %rdx /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - jnz 7f /* found NUL => return NULL */ - - /* Second unroll. */ - movq (%rax), %rcx /* get double word (= 8 bytes) in question */ - addq $8,%rax /* adjust pointer for next word */ - movq %r8, %rdx /* magic value */ - xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c - are now 0 */ - addq %rcx, %rdx /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 3f /* highest byte is NUL => return pointer */ - xorq %rcx, %rdx /* (word+magic)^word */ - orq %r8, %rdx /* set all non-carry bits */ - incq %rdx /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - jnz 3f /* found c => return pointer */ - - /* The quadword we looked at does not contain the value we're looking - for. Let's search now whether we have reached the end of the - string. */ - xorq %r9, %rcx /* restore original dword without reload */ - movq %r8, %rdx /* magic value */ - addq %rcx, %rdx /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 7f /* highest byte is NUL => return NULL */ - xorq %rcx, %rdx /* (word+magic)^word */ - orq %r8, %rdx /* set all non-carry bits */ - incq %rdx /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - jnz 7f /* found NUL => return NULL */ - /* Third unroll. */ - movq (%rax), %rcx /* get double word (= 8 bytes) in question */ - addq $8,%rax /* adjust pointer for next word */ - movq %r8, %rdx /* magic value */ - xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c - are now 0 */ - addq %rcx, %rdx /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 3f /* highest byte is NUL => return pointer */ - xorq %rcx, %rdx /* (word+magic)^word */ - orq %r8, %rdx /* set all non-carry bits */ - incq %rdx /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - jnz 3f /* found c => return pointer */ - - /* The quadword we looked at does not contain the value we're looking - for. Let's search now whether we have reached the end of the - string. */ - xorq %r9, %rcx /* restore original dword without reload */ - movq %r8, %rdx /* magic value */ - addq %rcx, %rdx /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 7f /* highest byte is NUL => return NULL */ - xorq %rcx, %rdx /* (word+magic)^word */ - orq %r8, %rdx /* set all non-carry bits */ - incq %rdx /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - jnz 7f /* found NUL => return NULL */ - /* Fourth unroll. */ - movq (%rax), %rcx /* get double word (= 8 bytes) in question */ - addq $8,%rax /* adjust pointer for next word */ - movq %r8, %rdx /* magic value */ - xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c - are now 0 */ - addq %rcx, %rdx /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 3f /* highest byte is NUL => return pointer */ - xorq %rcx, %rdx /* (word+magic)^word */ - orq %r8, %rdx /* set all non-carry bits */ - incq %rdx /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - jnz 3f /* found c => return pointer */ - - /* The quadword we looked at does not contain the value we're looking - for. Let's search now whether we have reached the end of the - string. */ - xorq %r9, %rcx /* restore original dword without reload */ - movq %r8, %rdx /* magic value */ - addq %rcx, %rdx /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 7f /* highest byte is NUL => return NULL */ - xorq %rcx, %rdx /* (word+magic)^word */ - orq %r8, %rdx /* set all non-carry bits */ - incq %rdx /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - jz 4b /* no NUL found => restart loop */ - - -7: /* Return NULL. */ - xorl %eax, %eax - retq - - - /* We now scan for the byte in which the character was matched. - But we have to take care of the case that a NUL char is - found before this in the dword. Note that we XORed %rcx - with the byte we're looking for, therefore the tests below look - reversed. */ - - - .p2align 4 /* Align, it's a jump target. */ -3: movq %r9,%rdx /* move to %rdx so that we can access bytes */ - subq $8,%rax /* correct pointer increment. */ - testb %cl, %cl /* is first byte C? */ - jz 6f /* yes => return pointer */ - cmpb %dl, %cl /* is first byte NUL? */ - je 7b /* yes => return NULL */ - incq %rax /* increment pointer */ - - testb %ch, %ch /* is second byte C? */ - jz 6f /* yes => return pointer */ - cmpb %dl, %ch /* is second byte NUL? */ - je 7b /* yes => return NULL? */ - incq %rax /* increment pointer */ - - shrq $16, %rcx /* make upper bytes accessible */ - testb %cl, %cl /* is third byte C? */ - jz 6f /* yes => return pointer */ - cmpb %dl, %cl /* is third byte NUL? */ - je 7b /* yes => return NULL */ - incq %rax /* increment pointer */ - - testb %ch, %ch /* is fourth byte C? */ - jz 6f /* yes => return pointer */ - cmpb %dl, %ch /* is fourth byte NUL? */ - je 7b /* yes => return NULL? */ - incq %rax /* increment pointer */ - - shrq $16, %rcx /* make upper bytes accessible */ - testb %cl, %cl /* is fifth byte C? */ - jz 6f /* yes => return pointer */ - cmpb %dl, %cl /* is fifth byte NUL? */ - je 7b /* yes => return NULL */ - incq %rax /* increment pointer */ - - testb %ch, %ch /* is sixth byte C? */ - jz 6f /* yes => return pointer */ - cmpb %dl, %ch /* is sixth byte NUL? */ - je 7b /* yes => return NULL? */ - incq %rax /* increment pointer */ - - shrq $16, %rcx /* make upper bytes accessible */ - testb %cl, %cl /* is seventh byte C? */ - jz 6f /* yes => return pointer */ - cmpb %dl, %cl /* is seventh byte NUL? */ - je 7b /* yes => return NULL */ - - /* It must be in the eigth byte and it cannot be NUL. */ - incq %rax - -6: - nop - retq -END (BP_SYM (strchr)) - -weak_alias (BP_SYM (strchr), BP_SYM (index)) +ENTRY (strchr) + movd %esi, %xmm1 + movq %rdi, %rcx + punpcklbw %xmm1, %xmm1 + andq $~15, %rdi + pxor %xmm2, %xmm2 + punpcklbw %xmm1, %xmm1 + orl $0xffffffff, %esi + movdqa (%rdi), %xmm0 + pshufd $0, %xmm1, %xmm1 + subq %rdi, %rcx + movdqa %xmm0, %xmm3 + leaq 16(%rdi), %rdi + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm3 + shl %cl, %esi + pmovmskb %xmm0, %edx + pmovmskb %xmm3, %ecx + andl %esi, %edx + andl %esi, %ecx + orl %edx, %ecx + jnz 1f + +2: movdqa (%rdi), %xmm0 + leaq 16(%rdi), %rdi + movdqa %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm3 + pmovmskb %xmm0, %edx + pmovmskb %xmm3, %ecx + orl %edx, %ecx + jz 2b + +1: bsfl %edx, %edx + jz 4f + bsfl %ecx, %ecx + leaq -16(%rdi,%rdx), %rax + cmpl %edx, %ecx + je 5f +4: xorl %eax, %eax +5: ret +END (strchr) + +weak_alias (strchr, index) libc_hidden_builtin_def (strchr) + |