From 5905e7b3e29139dbef84c065ca39315485f497e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20B=C3=ADlka?= Date: Wed, 11 Sep 2013 17:07:38 +0200 Subject: Faster strchr implementation. --- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 - sysdeps/x86_64/multiarch/strchr.S | 127 ------------------- sysdeps/x86_64/strchr.S | 191 ++++++++++++++++++++++++----- sysdeps/x86_64/strchrnul.S | 41 +------ 4 files changed, 160 insertions(+), 200 deletions(-) (limited to 'sysdeps/x86_64') diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index f8756d7..1a65ac0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -110,7 +110,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strchr.S. */ IFUNC_IMPL (i, name, strchr, - IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE4_2, __strchr_sse42) IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf) IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2)) diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S index f170238..3f0b2c5 100644 --- a/sysdeps/x86_64/multiarch/strchr.S +++ b/sysdeps/x86_64/multiarch/strchr.S @@ -29,12 +29,6 @@ ENTRY(strchr) jne 1f call __init_cpu_features 1: leaq __strchr_sse2(%rip), %rax - testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip) - jnz 2f - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) - jz 2f - leaq __strchr_sse42(%rip), %rax - ret 2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) jz 3f leaq __strchr_sse2_no_bsf(%rip), %rax @@ -42,127 +36,6 @@ ENTRY(strchr) END(strchr) -/* - This implementation uses SSE4 instructions to compare up to 16 bytes - at a time looking for the first occurrence of the character c in the - string s: - - char *strchr (const char *s, int c); - - We use 0xa: - _SIDD_SBYTE_OPS - | _SIDD_CMP_EQUAL_EACH - | _SIDD_LEAST_SIGNIFICANT - on pcmpistri to compare xmm/mem128 - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - X X X X X X X X X X X X X X X X - - against xmm - - 0 1 2 3 4 5 6 7 8 9 A B C D E F - C C C C C C C C C C C C C C C C - - to find out if the first 16byte data element has a byte C and the - offset of the first byte. There are 3 cases: - - 1. The first 16byte data element has the byte C at the offset X. - 2. The first 16byte data element has EOS and doesn't have the byte C. - 3. The first 16byte data element is valid and doesn't have the byte C. - - Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases: - - case ECX CFlag ZFlag SFlag - 1 X 1 0/1 0 - 2 16 0 1 0 - 3 16 0 0 0 - - We exit from the loop for cases 1 and 2 with jbe which branches - when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset - X for case 1. */ - - .section .text.sse4.2,"ax",@progbits - .align 16 - .type __strchr_sse42, @function - .globl __strchr_sse42 - .hidden __strchr_sse42 -__strchr_sse42: - cfi_startproc - CALL_MCOUNT - testb %sil, %sil - je __strend_sse4 - pxor %xmm2, %xmm2 - movd %esi, %xmm1 - movl %edi, %ecx - pshufb %xmm2, %xmm1 - andl $15, %ecx - movq %rdi, %r8 - je L(aligned_start) - -/* Handle unaligned string. */ - andq $-16, %r8 - movdqa (%r8), %xmm0 - pcmpeqb %xmm0, %xmm2 - pcmpeqb %xmm1, %xmm0 - /* Find where NULL is. */ - pmovmskb %xmm2, %edx - /* Check if there is a match. */ - pmovmskb %xmm0, %esi - /* Remove the leading bytes. */ - sarl %cl, %edx - sarl %cl, %esi - testl %esi, %esi - je L(unaligned_no_match) - /* Check which byte is a match. */ - bsfl %esi, %eax - /* Is there a NULL? */ - testl %edx, %edx - je L(unaligned_match) - bsfl %edx, %esi - cmpl %esi, %eax - /* Return NULL if NULL comes first. */ - ja L(return_null) -L(unaligned_match): - addq %rdi, %rax - ret - - .p2align 4 -L(unaligned_no_match): - testl %edx, %edx - jne L(return_null) - -/* Loop start on aligned string. */ -L(loop): - addq $16, %r8 -L(aligned_start): - pcmpistri $0x2, (%r8), %xmm1 - jbe L(wrap) - addq $16, %r8 - pcmpistri $0x2, (%r8), %xmm1 - jbe L(wrap) - addq $16, %r8 - pcmpistri $0x2, (%r8), %xmm1 - jbe L(wrap) - addq $16, %r8 - pcmpistri $0x2, (%r8), %xmm1 - jbe L(wrap) - jmp L(loop) -L(wrap): - jc L(loop_exit) - -/* Return NULL. */ -L(return_null): - xorl %eax, %eax - ret - -/* Loop exit. */ - .p2align 4 -L(loop_exit): - leaq (%r8,%rcx), %rax - ret - cfi_endproc - .size __strchr_sse42, .-__strchr_sse42 - # undef ENTRY # define ENTRY(name) \ diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S index d89f1eb..1900b37 100644 --- a/sysdeps/x86_64/strchr.S +++ b/sysdeps/x86_64/strchr.S @@ -19,51 +19,174 @@ #include +# ifndef ALIGN +# define ALIGN(n) .p2align n +# endif + .text ENTRY (strchr) movd %esi, %xmm1 - movq %rdi, %rcx - punpcklbw %xmm1, %xmm1 - andq $~15, %rdi - pxor %xmm2, %xmm2 + movl %edi, %eax + andl $4095, %eax punpcklbw %xmm1, %xmm1 - orl $0xffffffff, %esi - movdqa (%rdi), %xmm0 + cmpl $4032, %eax + punpcklwd %xmm1, %xmm1 pshufd $0, %xmm1, %xmm1 - subq %rdi, %rcx - movdqa %xmm0, %xmm3 - leaq 16(%rdi), %rdi + jg L(cross_page) + movdqu (%rdi), %xmm0 + pxor %xmm3, %xmm3 + movdqa %xmm0, %xmm4 pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm3 - shl %cl, %esi - pmovmskb %xmm0, %edx - pmovmskb %xmm3, %ecx - andl %esi, %edx - andl %esi, %ecx - orl %edx, %ecx - jnz 1f + pcmpeqb %xmm3, %xmm4 + por %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + je L(next_48_bytes) + bsf %eax, %eax +#ifdef AS_STRCHRNUL + leaq (%rdi,%rax), %rax +#else + movl $0, %edx + leaq (%rdi,%rax), %rax + cmpb %sil, (%rax) + cmovne %rdx, %rax +#endif + ret -2: movdqa (%rdi), %xmm0 - leaq 16(%rdi), %rdi - movdqa %xmm0, %xmm3 + ALIGN(3) + L(next_48_bytes): + movdqu 16(%rdi), %xmm0 + movdqa %xmm0, %xmm4 pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm3, %ecx - orl %edx, %ecx - jz 2b + pcmpeqb %xmm3, %xmm4 + por %xmm4, %xmm0 + pmovmskb %xmm0, %ecx + movdqu 32(%rdi), %xmm0 + movdqa %xmm0, %xmm4 + pcmpeqb %xmm1, %xmm0 + salq $16, %rcx + pcmpeqb %xmm3, %xmm4 + por %xmm4, %xmm0 + pmovmskb %xmm0, %eax + movdqu 48(%rdi), %xmm0 + pcmpeqb %xmm0, %xmm3 + salq $32, %rax + pcmpeqb %xmm1, %xmm0 + orq %rcx, %rax + por %xmm3, %xmm0 + pmovmskb %xmm0, %ecx + salq $48, %rcx + orq %rcx, %rax + testq %rax, %rax + jne L(return) +L(loop_start): + /* We use this alignment to force loop be aligned to 8 but not + 16 bytes. This gives better sheduling on AMD processors. */ + ALIGN(4) + pxor %xmm6, %xmm6 + andq $-64, %rdi + ALIGN(3) +L(loop64): + addq $64, %rdi + movdqa (%rdi), %xmm5 + movdqa 16(%rdi), %xmm2 + movdqa 32(%rdi), %xmm3 + pxor %xmm1, %xmm5 + movdqa 48(%rdi), %xmm4 + pxor %xmm1, %xmm2 + pxor %xmm1, %xmm3 + pminub (%rdi), %xmm5 + pxor %xmm1, %xmm4 + pminub 16(%rdi), %xmm2 + pminub 32(%rdi), %xmm3 + pminub %xmm2, %xmm5 + pminub 48(%rdi), %xmm4 + pminub %xmm3, %xmm5 + pminub %xmm4, %xmm5 + pcmpeqb %xmm6, %xmm5 + pmovmskb %xmm5, %eax + + testl %eax, %eax + je L(loop64) + + movdqa (%rdi), %xmm5 + movdqa %xmm5, %xmm0 + pcmpeqb %xmm1, %xmm5 + pcmpeqb %xmm6, %xmm0 + por %xmm0, %xmm5 + pcmpeqb %xmm6, %xmm2 + pcmpeqb %xmm6, %xmm3 + pcmpeqb %xmm6, %xmm4 + + pmovmskb %xmm5, %ecx + pmovmskb %xmm2, %eax + salq $16, %rax + pmovmskb %xmm3, %r8d + pmovmskb %xmm4, %edx + salq $32, %r8 + orq %r8, %rax + orq %rcx, %rax + salq $48, %rdx + orq %rdx, %rax + ALIGN(3) +L(return): + bsfq %rax, %rax +#ifdef AS_STRCHRNUL + leaq (%rdi,%rax), %rax +#else + movl $0, %edx + leaq (%rdi,%rax), %rax + cmpb %sil, (%rax) + cmovne %rdx, %rax +#endif + ret + ALIGN(4) + +L(cross_page): + movq %rdi, %rdx + pxor %xmm2, %xmm2 + andq $-64, %rdx + movdqa %xmm1, %xmm0 + movdqa (%rdx), %xmm3 + movdqa %xmm3, %xmm4 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + por %xmm4, %xmm3 + pmovmskb %xmm3, %r8d + movdqa 16(%rdx), %xmm3 + movdqa %xmm3, %xmm4 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + por %xmm4, %xmm3 + pmovmskb %xmm3, %eax + movdqa 32(%rdx), %xmm3 + movdqa %xmm3, %xmm4 + pcmpeqb %xmm1, %xmm3 + salq $16, %rax + pcmpeqb %xmm2, %xmm4 + por %xmm4, %xmm3 + pmovmskb %xmm3, %r9d + movdqa 48(%rdx), %xmm3 + pcmpeqb %xmm3, %xmm2 + salq $32, %r9 + pcmpeqb %xmm3, %xmm0 + orq %r9, %rax + orq %r8, %rax + por %xmm2, %xmm0 + pmovmskb %xmm0, %ecx + salq $48, %rcx + orq %rcx, %rax + movl %edi, %ecx + subb %dl, %cl + shrq %cl, %rax + testq %rax, %rax + jne L(return) + jmp L(loop_start) -1: bsfl %edx, %edx - jz 4f - bsfl %ecx, %ecx - leaq -16(%rdi,%rdx), %rax - cmpl %edx, %ecx - je 5f -4: xorl %eax, %eax -5: ret END (strchr) +#ifndef AS_STRCHRNUL weak_alias (strchr, index) libc_hidden_builtin_def (strchr) - +#endif diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S index d8c345b..bceeb61 100644 --- a/sysdeps/x86_64/strchrnul.S +++ b/sysdeps/x86_64/strchrnul.S @@ -20,43 +20,8 @@ #include - - .text -ENTRY (__strchrnul) - movd %esi, %xmm1 - movq %rdi, %rcx - punpcklbw %xmm1, %xmm1 - andq $~15, %rdi - pxor %xmm2, %xmm2 - punpcklbw %xmm1, %xmm1 - orl $0xffffffff, %esi - movdqa (%rdi), %xmm0 - pshufd $0, %xmm1, %xmm1 - subq %rdi, %rcx - movdqa %xmm0, %xmm3 - leaq 16(%rdi), %rdi - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm3 - shl %cl, %esi - pmovmskb %xmm0, %edx - pmovmskb %xmm3, %ecx - orl %edx, %ecx - andl %esi, %ecx - jnz 1f - -2: movdqa (%rdi), %xmm0 - leaq 16(%rdi), %rdi - movdqa %xmm0, %xmm3 - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm2, %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm3, %ecx - orl %edx, %ecx - jz 2b - -1: bsfl %ecx, %edx - leaq -16(%rdi,%rdx), %rax - ret -END (__strchrnul) +#define strchr __strchrnul +#define AS_STRCHRNUL +#include "strchr.S" weak_alias (__strchrnul, strchrnul) -- cgit v1.1