diff options
author | Ondřej Bílka <neleai@seznam.cz> | 2013-09-03 16:21:38 +0200 |
---|---|---|
committer | Ondřej Bílka <neleai@seznam.cz> | 2013-09-03 16:27:10 +0200 |
commit | 8f02859f17d01ce0cf542d934a04a79f048b73fd (patch) | |
tree | 2173f12761daf5f4df32efeb9217bba4f980712c | |
parent | d34202f67408b3a6363d8d5a9ef78ae8a264d03f (diff) | |
download | glibc-8f02859f17d01ce0cf542d934a04a79f048b73fd.zip glibc-8f02859f17d01ce0cf542d934a04a79f048b73fd.tar.gz glibc-8f02859f17d01ce0cf542d934a04a79f048b73fd.tar.bz2 |
Add unaligned strcmp.
-rw-r--r-- | ChangeLog | 9 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 6 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 210 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcmp.S | 7 |
5 files changed, 231 insertions, 2 deletions
@@ -1,3 +1,12 @@ +2013-09-03 Ondřej Bílka <neleai@seznam.cz> + + * sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: New file. + * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): + Add ifunc. + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): + Add strcmp-sse2-unaligned + * sysdeps/x86_64/multiarch/strcmp.S (strcmp): Add ifunc. + 2013-09-02 Mike Frysinger <vapier@gentoo.org> * Versions.def (libc): Add GLIBC_2.19. diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 203d16e..5ab950a 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -6,8 +6,10 @@ endif ifeq ($(subdir),string) -sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ - strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \ +sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ + strcmp-sse2-unaligned strncmp-ssse3 \ + strend-sse4 memcmp-sse4 memcpy-ssse3 \ + memcpy-sse2-unaligned mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index d0992e1..f8756d7 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -118,6 +118,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcmp, IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42) IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) /* Support sysdeps/x86_64/multiarch/strcpy.S. 
*/ diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S new file mode 100644 index 0000000..eed8432 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S @@ -0,0 +1,210 @@ +/* strcmp with unaligned loads + Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
 */
+
+#include "sysdep.h"
+#define ALIGN(x) .p2align x
+/* __strcmp_sse2_unaligned: compares the byte strings addressed by %rdi
+   and %rsi and returns in %eax the difference between the first pair of
+   bytes that differ, or that end the %rdi string (each byte
+   zero-extended before the subtraction).
+   NOTE(review): presumably called as
+   int strcmp (const char *s1, const char *s2) under the SysV AMD64
+   convention - confirm against the ifunc selector.
+
+   Outline:
+     1. If neither pointer is within 64 bytes of the end of its
+        4096-byte block, test bytes 0..63 with unaligned 16-byte loads.
+     2. Otherwise test up to 64 bytes one at a time (L(cross_page)).
+     3. Then loop over 64-byte blocks with %rax (first string) aligned
+        to 64 and %rdx (second string) unaligned.  %rsi counts the
+        blocks left before %rdx reaches a 4096-byte boundary, which is
+        handled in L(loop_cross_page).
+
+   Registers written: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r10,
+   %xmm0-%xmm7, %xmm9 and the flags.  The stack is not used.  */
+ENTRY ( __strcmp_sse2_unaligned)
+ movl %edi, %eax
+ xorl %edx, %edx                /* %rdx = 0, start index for L(cross_page).  */
+ pxor %xmm7, %xmm7              /* %xmm7 = 0, the zero operand of the 64-byte loops.  */
+ orl %esi, %eax
+ andl $4095, %eax               /* Low 12 bits of (%rdi | %rsi).  */
+ cmpl $4032, %eax               /* 4032 = 4096 - 64.  At or below it, 64 bytes read
+                                   from either pointer stay inside its 4096-byte block.  */
+ jg L(cross_page)
+ movdqu (%rdi), %xmm1
+ movdqu (%rsi), %xmm0
+ pcmpeqb %xmm1, %xmm0           /* 0xff where the two bytes are equal, else 0.  */
+ pminub %xmm1, %xmm0            /* 0 where the bytes differ or the (%rdi) byte is 0.  */
+ pxor %xmm1, %xmm1              /* %xmm1 = 0, also used by L(next_48_bytes).  */
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax           /* Bit i set: byte i decides the result.  */
+ testq %rax, %rax
+ je L(next_48_bytes)
+L(return):                      /* In: %rax = non-zero mask, bit i refers to offset i
+                                   from %rdi and %rsi.  */
+ bsfq %rax, %rdx                /* Lowest set bit = offset of the deciding byte.  */
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+ ret
+/* Bytes 16..63: the same equal / min / compare-with-zero test on three
+   more 16-byte chunks, with the instructions of the three chunks
+   interleaved.  The masks are merged into %rax at bit offsets 16, 32
+   and 48.  Bits 0..15 stay clear because the first chunk had no hit.  */
+ ALIGN (4)
+L(next_48_bytes):
+ movdqu 16(%rdi), %xmm6
+ movdqu 16(%rsi), %xmm3
+ movdqu 32(%rdi), %xmm5
+ pcmpeqb %xmm6, %xmm3
+ movdqu 32(%rsi), %xmm2
+ pminub %xmm6, %xmm3
+ pcmpeqb %xmm1, %xmm3
+ movdqu 48(%rdi), %xmm4
+ pcmpeqb %xmm5, %xmm2
+ pmovmskb %xmm3, %edx           /* Mask for bytes 16..31.  */
+ movdqu 48(%rsi), %xmm0
+ pminub %xmm5, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm2, %eax           /* Mask for bytes 32..47.  */
+ salq $16, %rdx
+ pminub %xmm4, %xmm0
+ pcmpeqb %xmm1, %xmm0
+ salq $32, %rax
+ orq %rdx, %rax
+ pmovmskb %xmm0, %ecx           /* Mask for bytes 48..63.  */
+ movq %rcx, %rdx
+ salq $48, %rdx
+ orq %rdx, %rax                 /* Sets ZF when all three masks are empty.  */
+ jne L(return)
+L(main_loop_header):            /* Reached after 64 bytes were compared without a
+                                   hit, from above or from L(cross_page_loop).  */
+ leaq 64(%rdi), %rdx
+ movl $4096, %ecx
+ pxor %xmm9, %xmm9              /* NOTE(review): %xmm9 is not read anywhere in this
+                                   function - looks like a leftover, confirm.  */
+ andq $-64, %rdx                /* (%rdi + 64) rounded down to a multiple of 64.  */
+ subq %rdi, %rdx                /* Distance from %rdi to that address, 1..64.  */
+ leaq (%rdi, %rdx), %rax        /* %rax = 64-byte aligned pointer into the first string.  */
+ addq %rsi, %rdx                /* %rdx = second string advanced by the same distance.  */
+ movq %rdx, %rsi
+ andl $4095, %esi
+ subq %rsi, %rcx
+ shrq $6, %rcx                  /* (4096 - (%rdx & 4095)) / 64.  */
+ movq %rcx, %rsi                /* %rsi = iterations allowed before L(loop_cross_page).  */
+ jmp L(loop_start)
+/* Main loop, 64 bytes per iteration.  %rax is read with aligned loads
+   (movdqa) and %rdx with unaligned loads (movdqu).  */
+ ALIGN (4)
+L(loop):
+ addq $64, %rax
+ addq $64, %rdx
+L(loop_start):
+ testq %rsi, %rsi
+ leaq -1(%rsi), %rsi            /* Decrement with lea so the flags of testq survive.  */
+ je L(loop_cross_page)          /* Counter was zero.  */
+L(back_to_loop):
+ movdqu (%rdx), %xmm0
+ movdqu 16(%rdx), %xmm1
+ movdqa (%rax), %xmm2
+ movdqa 16(%rax), %xmm3
+ pcmpeqb %xmm2, %xmm0
+ movdqu 32(%rdx), %xmm5
+ pcmpeqb %xmm3, %xmm1
+ pminub %xmm2, %xmm0
+ movdqu 48(%rdx), %xmm6
+ pminub %xmm3, %xmm1            /* %xmm1 keeps the result for bytes 16..31.  */
+ movdqa 32(%rax), %xmm2
+ pminub %xmm1, %xmm0
+ movdqa 48(%rax), %xmm3
+ pcmpeqb %xmm2, %xmm5
+ pcmpeqb %xmm3, %xmm6
+ pminub %xmm2, %xmm5            /* %xmm5 keeps the result for bytes 32..47.  */
+ pminub %xmm3, %xmm6            /* %xmm6 keeps the result for bytes 48..63.  */
+ pminub %xmm5, %xmm0
+ pminub %xmm6, %xmm0            /* %xmm0 = byte-wise minimum over all four chunks.  */
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %ecx
+ testl %ecx, %ecx
+ je L(loop)                     /* Minimum has no zero byte: no mismatch and no zero
+                                   byte of the %rax string within these 64 bytes.  */
+ pcmpeqb %xmm7, %xmm5           /* Hit: build one 16-bit mask per chunk.  */
+ movdqu (%rdx), %xmm0           /* Chunk 0 is recomputed because %xmm0 now holds the
+                                   combined minimum.  */
+ pcmpeqb %xmm7, %xmm1
+ movdqa (%rax), %xmm2
+ pcmpeqb %xmm2, %xmm0
+ pminub %xmm2, %xmm0
+ pcmpeqb %xmm7, %xmm6
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm5, %r8d
+ pmovmskb %xmm0, %edi           /* %rdi and %rsi are plain scratch from here on.  */
+ salq $16, %rcx
+ salq $32, %r8
+ pmovmskb %xmm6, %esi
+ orq %r8, %rcx
+ orq %rdi, %rcx
+ salq $48, %rsi
+ orq %rsi, %rcx                 /* %rcx = 64-bit mask, bit i refers to offset i.  */
+ bsfq %rcx, %rcx                /* Offset of the deciding byte.  */
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+ ret
+/* Entered when the %rsi counter is zero.  Tests the 64 bytes that start
+   at %rdx rounded down to a multiple of 64 (aligned loads) against the
+   bytes at the same negative offset from %rax (unaligned loads), so the
+   %rdx side is not read past the end of its current 64-byte block.  */
+ ALIGN (4)
+L(loop_cross_page):
+ xor %r10, %r10
+ movq %rdx, %r9
+ and $63, %r9                   /* %r9 = %rdx & 63.  */
+ subq %r9, %r10                 /* %r10 = -%r9.  */
+
+ movdqa (%rdx, %r10), %xmm0
+ movdqa 16(%rdx, %r10), %xmm1
+ movdqu (%rax, %r10), %xmm2
+ movdqu 16(%rax, %r10), %xmm3
+ pcmpeqb %xmm2, %xmm0
+ movdqa 32(%rdx, %r10), %xmm5
+ pcmpeqb %xmm3, %xmm1
+ pminub %xmm2, %xmm0
+ movdqa 48(%rdx, %r10), %xmm6
+ pminub %xmm3, %xmm1
+ movdqu 32(%rax, %r10), %xmm2
+ movdqu 48(%rax, %r10), %xmm3
+ pcmpeqb %xmm2, %xmm5
+ pcmpeqb %xmm3, %xmm6
+ pminub %xmm2, %xmm5
+ pminub %xmm3, %xmm6
+
+ pcmpeqb %xmm7, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pcmpeqb %xmm7, %xmm5
+ pcmpeqb %xmm7, %xmm6
+
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm5, %r8d
+ pmovmskb %xmm0, %edi
+ salq $16, %rcx
+ salq $32, %r8
+ pmovmskb %xmm6, %esi
+ orq %r8, %rdi
+ orq %rcx, %rdi
+ salq $48, %rsi
+ orq %rsi, %rdi                 /* %rdi = 64-bit mask for the block at %rdx - %r9.  */
+ movq %r9, %rcx
+ movq $63, %rsi                 /* Counter restart: L(loop_start) lets 63 more iterations
+                                   through after the one entered at L(back_to_loop),
+                                   64 * 64 = 4096 bytes until the next visit here.  */
+ shrq %cl, %rdi                 /* Drop the %r9 low bits, which belong to bytes located
+                                   before %rdx.  Bit i now refers to offset i from
+                                   %rdx and %rax.  */
+ test %rdi, %rdi
+ je L(back_to_loop)
+ bsfq %rdi, %rcx
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+ ret
+/* Byte-at-a-time path taken from the entry check.  %rdx is the index
+   (zeroed at entry).  Execution starts at L(cross_page).  After 64
+   bytes that are equal and non-zero it joins L(main_loop_header).  */
+ ALIGN (4)
+L(cross_page_loop):
+ cmpb %cl, %al
+ jne L(different)
+ addq $1, %rdx
+ cmpq $64, %rdx
+ je L(main_loop_header)
+L(cross_page):
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %ecx
+ testb %al, %al
+ jne L(cross_page_loop)
+ xorl %eax, %eax                /* The (%rdi) byte is 0 here: result is 0 - %ecx.  */
+L(different):
+ subl %ecx, %eax
+ ret
+END (__strcmp_sse2_unaligned)
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index 1d4d711..c5dcd1a 100644
--- 
a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -66,6 +66,7 @@ # define STRCMP_SSE2 __strncasecmp_l_sse2 # define __GI_STRCMP __GI___strncasecmp_l #else +# define USE_AS_STRCMP # define UPDATE_STRNCMP_COUNTER # ifndef STRCMP # define STRCMP strcmp @@ -88,11 +89,17 @@ ENTRY(STRCMP) jne 1f call __init_cpu_features 1: +#ifdef USE_AS_STRCMP + leaq __strcmp_sse2_unaligned(%rip), %rax + testl $bit_Fast_Unaligned_Load, __cpu_features+CPUID_OFFSET+index_Fast_Unaligned_Load(%rip) + jnz 3f +#else testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip) jnz 2f leaq STRCMP_SSE42(%rip), %rax testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) jnz 3f +#endif 2: leaq STRCMP_SSSE3(%rip), %rax testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) jnz 3f |