aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorUlrich Drepper <drepper@gmail.com>2011-12-17 14:39:23 -0500
committerUlrich Drepper <drepper@gmail.com>2011-12-17 14:39:23 -0500
commit1d3e4b618ae0217f1736753f3085f9c4fcc827bf (patch)
tree90a3f8d19f941a684e1482b8813c534d82cfb19e
parenta2d18b64edb486825fb5946eefc2131426ccfec9 (diff)
downloadglibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.zip
glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.gz
glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.bz2
Optimized wcschr and wcscpy for x86-64 and x86-32
-rw-r--r--ChangeLog25
-rw-r--r--NEWS3
-rw-r--r--string/test-strcmp.c17
-rw-r--r--sysdeps/i386/i686/multiarch/Makefile3
-rw-r--r--sysdeps/i386/i686/multiarch/wcschr-c.c8
-rw-r--r--sysdeps/i386/i686/multiarch/wcschr-sse2.S220
-rw-r--r--sysdeps/i386/i686/multiarch/wcschr.S54
-rw-r--r--sysdeps/i386/i686/multiarch/wcscpy-c.c5
-rw-r--r--sysdeps/i386/i686/multiarch/wcscpy-ssse3.S621
-rw-r--r--sysdeps/i386/i686/multiarch/wcscpy.S46
-rw-r--r--sysdeps/i386/i686/multiarch/wcsrchr-c.c5
-rw-r--r--sysdeps/i386/i686/multiarch/wcsrchr-sse2.S355
-rw-r--r--sysdeps/i386/i686/multiarch/wcsrchr.S54
-rw-r--r--sysdeps/x86_64/multiarch/Makefile6
-rw-r--r--sysdeps/x86_64/multiarch/wcscpy-c.c5
-rw-r--r--sysdeps/x86_64/multiarch/wcscpy-ssse3.S566
-rw-r--r--sysdeps/x86_64/multiarch/wcscpy.S43
-rw-r--r--sysdeps/x86_64/wcschr.S155
-rw-r--r--sysdeps/x86_64/wcsrchr.S283
-rw-r--r--wcsmbs/wcschr.c5
20 files changed, 2466 insertions, 13 deletions
diff --git a/ChangeLog b/ChangeLog
index f3ce260..19f2cde 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,28 @@
+2011-11-14 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
+
+ * sysdeps/x86_64/multiarch/Makefile [subdir=wcsmbs] (sysdep_routines):
+ Add wcscpy-ssse3 wcscpy-c.
+ * sysdeps/x86_64/multiarch/wcscpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/wcscpy-c.c: New file.
+ * sysdeps/x86_64/multiarch/wcscpy.S: New file.
+ * sysdeps/x86_64/wcschr.S: New file.
+ * sysdeps/x86_64/wcsrchr.S: New file.
+ * string/test-strcmp.c: Remove checking of wcscmp function for
+ wrong alignments.
+ * sysdeps/i386/i686/multiarch/Makefile [subdir=wcsmbs]
+ (sysdep_routines): Add wcscpy-ssse3 wcscpy-c wcschr-sse2 wcschr-c
+ wcsrchr-sse2 wcsrchr-c.
+ * sysdeps/i386/i686/multiarch/wcschr.S: New file.
+ * sysdeps/i386/i686/multiarch/wcschr-c.c: New file.
+ * sysdeps/i386/i686/multiarch/wcschr-sse2.S: New file.
+ * sysdeps/i386/i686/multiarch/wcsrchr.S: New file.
+ * sysdeps/i386/i686/multiarch/wcsrchr-c.c: New file.
+ * sysdeps/i386/i686/multiarch/wcsrchr-sse2.S: New file.
+ * sysdeps/i386/i686/multiarch/wcscpy.S: New file.
+ * sysdeps/i386/i686/multiarch/wcscpy-c.c: New file.
+ * sysdeps/i386/i686/multiarch/wcscpy-ssse3.S: New file.
+ * wcsmbc/wcschr.c (WCSCHR): New macro.
+
2011-11-17 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
* wcsmbs/Makefile (strop-tests): Add wcsrchr wcscpy.
diff --git a/NEWS b/NEWS
index 5b5324c..6181a01 100644
--- a/NEWS
+++ b/NEWS
@@ -34,7 +34,8 @@ Version 2.15
* Optimized strchr and strrchr for SSE on x86-32.
Contributed by Liubov Dmitrieva.
-* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp for x86-64 and x86-32.
+* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp, wcschr, wcscpy
+ for x86-64 and x86-32.
Contributed by Liubov Dmitrieva.
* New interfaces: scandirat, scandirat64
diff --git a/string/test-strcmp.c b/string/test-strcmp.c
index 9ea4df7..5ed9626 100644
--- a/string/test-strcmp.c
+++ b/string/test-strcmp.c
@@ -221,14 +221,14 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
static void
do_random_tests (void)
{
- for (size_t a = 0; a < CHARBYTES; a += CHARALIGN)
- for (size_t b = 0; b < CHARBYTES; b += CHARALIGN)
- {
- UCHAR *p1 = (UCHAR *) (buf1 + page_size - 512 * CHARBYTES - a);
- UCHAR *p2 = (UCHAR *) (buf2 + page_size - 512 * CHARBYTES - b);
+ UCHAR *p1 = (UCHAR *) (buf1 + page_size - 512 * CHARBYTES);
+ UCHAR *p2 = (UCHAR *) (buf2 + page_size - 512 * CHARBYTES);
for (size_t n = 0; n < ITERATIONS; n++)
{
+ /* for wcscmp case align1 and align2 mean here alignment in wchar_t symbols, it
+ equal 4*k alignment in bytes, we don't check other alignments like for example p1 = (wchar_t *)(buf1 + 1)
+ because it's wrong using of wchar_t type. */
size_t align1 = random () & 31;
size_t align2;
if (random () & 1)
@@ -274,7 +274,7 @@ do_random_tests (void)
}
int result = 0;
- MEMCPY ((CHAR *) (p2 + align2), (CHAR *) (p1 + align1), pos);
+ MEMCPY (p2 + align2, p1 + align1, pos);
if (pos < len1)
{
if (p2[align2 + pos] == p1[align1 + pos])
@@ -302,13 +302,12 @@ do_random_tests (void)
|| (r < 0 && result >= 0)
|| (r > 0 && result <= 0))
{
- error (0, 0, "Iteration %zd - wrong result in function %s (%zd, %zd, %zd, %zd, %zd) %d != %d, p1 %p p2 %p",
+ error (0, 0, "Iteration %zd - wrong result in function %s (align in bytes: %zd, align in bytes: %zd, len1: %zd, len2: %zd, pos: %zd) %d != %d, p1 %p p2 %p",
n, impl->name, (size_t) (p1 + align1) & 63, (size_t) (p1 + align2) & 63, len1, len2, pos, r, result, p1, p2);
ret = 1;
}
}
- }
- }
+ }
}
static void
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 426b718..b764e5b 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -37,7 +37,8 @@ endif
ifeq ($(subdir),wcsmbs)
sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \
- wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
+ wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \
+ wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c
endif
ifeq (mathyes,$(subdir)$(config-cflags-avx))
diff --git a/sysdeps/i386/i686/multiarch/wcschr-c.c b/sysdeps/i386/i686/multiarch/wcschr-c.c
new file mode 100644
index 0000000..a63e50e
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcschr-c.c
@@ -0,0 +1,8 @@
+#ifndef NOT_IN_libc
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32);
+# define WCSCHR __wcschr_ia32
+#endif
+
+#include "wcsmbs/wcschr.c"
diff --git a/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/sysdeps/i386/i686/multiarch/wcschr-sse2.S
new file mode 100644
index 0000000..cc8204c
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcschr-sse2.S
@@ -0,0 +1,220 @@
+/* wcschr with SSE2, without using bsf instructions
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ atom_text_section
+ENTRY (__wcschr_sse2)
+
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ mov %ecx, %eax
+ punpckldq %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ punpckldq %xmm1, %xmm1
+
+ and $63, %eax
+ cmp $48, %eax
+ ja L(cross_cache)
+
+ movdqu (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ and $-16, %ecx
+ jmp L(loop)
+
+ .p2align 4
+L(cross_cache):
+ PUSH (%edi)
+ mov %ecx, %edi
+ mov %eax, %ecx
+ and $-16, %edi
+ and $15, %ecx
+ movdqa (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+
+ sarl %cl, %edx
+ sarl %cl, %eax
+ test %eax, %eax
+ jz L(unaligned_no_match)
+
+ add %edi, %ecx
+ POP (%edi)
+
+ test %edx, %edx
+ jz L(match_case1)
+ test %al, %al
+ jz L(match_higth_case2)
+ test $15, %al
+ jnz L(match_case2_4)
+ test $15, %dl
+ jnz L(return_null)
+ lea 4(%ecx), %eax
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(unaligned_no_match):
+ mov %edi, %ecx
+ POP (%edi)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ pxor %xmm2, %xmm2
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ add $16, %ecx
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ add $16, %ecx
+
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ add $16, %ecx
+
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ add $16, %ecx
+
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jz L(loop)
+
+ .p2align 4
+L(matches):
+ pmovmskb %xmm2, %edx
+ test %eax, %eax
+ jz L(return_null)
+ test %edx, %edx
+ jz L(match_case1)
+
+ .p2align 4
+L(match_case2):
+ test %al, %al
+ jz L(match_higth_case2)
+ test $15, %al
+ jnz L(match_case2_4)
+ test $15, %dl
+ jnz L(return_null)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_case2_4):
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(match_higth_case2):
+ test %dl, %dl
+ jnz L(return_null)
+ test $15, %ah
+ jnz L(match_case2_12)
+ test $15, %dh
+ jnz L(return_null)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_case2_12):
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_case1):
+ test %al, %al
+ jz L(match_higth_case1)
+
+ test $0x01, %al
+ jnz L(exit0)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_higth_case1):
+ test $0x01, %ah
+ jnz L(exit3)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit0):
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit3):
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ ret
+
+END (__wcschr_sse2)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcschr.S b/sysdeps/i386/i686/multiarch/wcschr.S
new file mode 100644
index 0000000..bf0d6d5
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcschr.S
@@ -0,0 +1,54 @@
+/* Multiple versions of wcschr
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(wcschr)
+ .type wcschr, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+1: leal __wcschr_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __wcschr_sse2@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4);
+ cfi_restore (ebx)
+ ret
+END(wcschr)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcscpy-c.c b/sysdeps/i386/i686/multiarch/wcscpy-c.c
new file mode 100644
index 0000000..a3c4024
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define wcscpy __wcscpy_ia32
+#endif
+
+#include "wcsmbs/wcscpy.c"
diff --git a/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000..84d92a8
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,621 @@
+/* wcscpy with SSSE3
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+ atom_text_section
+ENTRY (__wcscpy_ssse3)
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %ecx
+
+ cmp $0, (%ecx)
+ jz L(ExitTail4)
+ cmp $0, 4(%ecx)
+ jz L(ExitTail8)
+ cmp $0, 8(%ecx)
+ jz L(ExitTail12)
+ cmp $0, 12(%ecx)
+ jz L(ExitTail16)
+
+ PUSH (%edi)
+ mov %edx, %edi
+
+ PUSH (%esi)
+ lea 16(%ecx), %esi
+
+ and $-16, %esi
+
+ pxor %xmm0, %xmm0
+ pcmpeqd (%esi), %xmm0
+ movdqu (%ecx), %xmm1
+ movdqu %xmm1, (%edx)
+
+ pmovmskb %xmm0, %eax
+ sub %ecx, %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %edx, %eax
+ lea 16(%edx), %edx
+ and $-16, %edx
+ sub %edx, %eax
+
+ sub %eax, %ecx
+ mov %ecx, %eax
+ and $0xf, %eax
+ mov $0, %esi
+
+ jz L(Align16Both)
+ cmp $4, %eax
+ je L(Shl4)
+ cmp $8, %eax
+ je L(Shl8)
+ jmp L(Shl12)
+
+L(Align16Both):
+ movaps (%ecx), %xmm1
+ movaps 16(%ecx), %xmm2
+ movaps %xmm1, (%edx)
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqd %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm4
+ movaps %xmm3, (%edx, %esi)
+ pcmpeqd %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm1
+ movaps %xmm4, (%edx, %esi)
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm2
+ movaps %xmm1, (%edx, %esi)
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqd %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%edx, %esi)
+ mov %ecx, %eax
+ lea 16(%ecx, %esi), %ecx
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+
+ mov $-0x40, %esi
+
+L(Aligned64Loop):
+ movaps (%ecx), %xmm2
+ movaps 32(%ecx), %xmm3
+ movaps %xmm2, %xmm4
+ movaps 16(%ecx), %xmm5
+ movaps %xmm3, %xmm6
+ movaps 48(%ecx), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ lea 64(%edx), %edx
+ pcmpeqd %xmm0, %xmm3
+ lea 64(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+
+ test %eax, %eax
+ jnz L(Aligned64Leave)
+ movaps %xmm4, -64(%edx)
+ movaps %xmm5, -48(%edx)
+ movaps %xmm6, -32(%edx)
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+L(Aligned64Leave):
+ pcmpeqd %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqd %xmm5, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqd %xmm6, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%edx)
+ pcmpeqd %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ mov $-0x40, %esi
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+ .p2align 4
+L(Shl4):
+ movaps -4(%ecx), %xmm1
+ movaps 12(%ecx), %xmm2
+L(Shl4Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 28(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -12(%ecx), %ecx
+ sub %eax, %edx
+
+ movaps -4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+ movaps 12(%ecx), %xmm2
+ movaps 28(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $4, %xmm3, %xmm4
+ jnz L(Shl4Start)
+
+ palignr $4, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $12, %xmm6
+ palignr $4, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ add $12, %edx
+ add $12, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(Shl8):
+ movaps -8(%ecx), %xmm1
+ movaps 8(%ecx), %xmm2
+L(Shl8Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 24(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -8(%ecx), %ecx
+ sub %eax, %edx
+
+ movaps -8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+ movaps 8(%ecx), %xmm2
+ movaps 24(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $8, %xmm3, %xmm4
+ jnz L(Shl8Start)
+
+ palignr $8, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $8, %xmm6
+ palignr $8, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ add $8, %edx
+ add $8, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(Shl12):
+ movaps -12(%ecx), %xmm1
+ movaps 4(%ecx), %xmm2
+L(Shl12Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 20(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -4(%ecx), %ecx
+ sub %eax, %edx
+
+ movaps -12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+ movaps 4(%ecx), %xmm2
+ movaps 20(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $12, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $12, %xmm3, %xmm4
+ jnz L(Shl12Start)
+
+ palignr $12, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $4, %xmm6
+ mov $4, %esi
+ palignr $12, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ .p2align 4
+L(ExitHigh):
+ test $0x01, %ah
+ jnz L(Exit12)
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+ movl %edi, %eax
+ RETURN
+
+CFI_POP (%edi)
+
+ .p2align 4
+L(ExitTail4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl %edx, %eax
+ ret
+
+ .p2align 4
+L(ExitTail8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edx, %eax
+ ret
+
+ .p2align 4
+L(ExitTail12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+ movl %edx, %eax
+ ret
+
+ .p2align 4
+L(ExitTail16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ movl %edx, %eax
+ ret
+
+END (__wcscpy_ssse3)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcscpy.S b/sysdeps/i386/i686/multiarch/wcscpy.S
new file mode 100644
index 0000000..c7bafbe
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcscpy.S
@@ -0,0 +1,46 @@
+/* Multiple versions of wcscpy
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+ .text
+ENTRY(wcscpy)
+ .type wcscpy, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+1: leal __wcscpy_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __wcscpy_ssse3@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(wcscpy)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/sysdeps/i386/i686/multiarch/wcsrchr-c.c
new file mode 100644
index 0000000..c7444ce
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcsrchr-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define wcsrchr __wcsrchr_ia32
+#endif
+
+#include "wcsmbs/wcsrchr.c"
diff --git a/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
new file mode 100644
index 0000000..2859f7e
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
@@ -0,0 +1,355 @@
+/* wcsrchr with SSE2, without using bsf instructions.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8
+# define ENTRANCE PUSH (%edi);
+# define RETURN POP (%edi); ret; CFI_PUSH (%edi);
+# define STR1 PARMS
+# define STR2 STR1+4
+
+ atom_text_section
+ENTRY (__wcsrchr_sse2)
+
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ mov %ecx, %edi
+ punpckldq %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ punpckldq %xmm1, %xmm1
+
+/* ECX has OFFSET. */
+ and $63, %ecx
+ cmp $48, %ecx
+ ja L(crosscache)
+
+/* unaligned string. */
+ movdqu (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+/* Find where NULL is. */
+ pmovmskb %xmm2, %ecx
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match1)
+
+ test %ecx, %ecx
+ jnz L(return_null)
+
+ and $-16, %edi
+
+ PUSH (%esi)
+
+ xor %edx, %edx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+
+ .p2align 4
+L(unaligned_match1):
+ test %ecx, %ecx
+ jnz L(prolog_find_zero_1)
+
+ PUSH (%esi)
+
+/* Save current match */
+ mov %eax, %edx
+ mov %edi, %esi
+ and $-16, %edi
+ jmp L(loop)
+
+ CFI_POP (%esi)
+
+ .p2align 4
+L(crosscache):
+/* Hancle unaligned string. */
+ and $15, %ecx
+ and $-16, %edi
+ pxor %xmm3, %xmm3
+ movdqa (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm3
+ pcmpeqd %xmm1, %xmm0
+/* Find where NULL is. */
+ pmovmskb %xmm3, %edx
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+/* Remove the leading bytes. */
+ shr %cl, %edx
+ shr %cl, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ PUSH (%esi)
+
+ xor %edx, %edx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+
+ .p2align 4
+L(unaligned_match):
+ test %edx, %edx
+ jnz L(prolog_find_zero)
+
+ PUSH (%esi)
+
+ mov %eax, %edx
+ lea (%edi, %ecx), %esi
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm3
+ pcmpeqd %xmm3, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm4
+ pcmpeqd %xmm4, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm4
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm4, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm5
+ pcmpeqd %xmm5, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm5
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm5, %eax
+ or %eax, %ecx
+ jz L(loop)
+
+ .p2align 4
+L(matches):
+ test %eax, %eax
+ jnz L(match)
+L(return_value):
+ test %edx, %edx
+ jz L(return_null_1)
+ mov %edx, %eax
+ mov %esi, %edi
+
+ POP (%esi)
+
+ test %ah, %ah
+ jnz L(match_third_or_fourth_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(return_null_1):
+ POP (%esi)
+
+ xor %eax, %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match):
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(find_zero)
+/* save match info */
+ mov %eax, %edx
+ mov %edi, %esi
+ jmp L(loop)
+
+ .p2align 4
+L(find_zero):
+ test %cl, %cl
+ jz L(find_zero_in_third_or_fourth_wchar)
+ test $15, %cl
+ jz L(find_zero_in_second_wchar)
+ and $1, %eax
+ jz L(return_value)
+
+ POP (%esi)
+
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_in_second_wchar):
+ and $1 << 5 - 1, %eax
+ jz L(return_value)
+
+ POP (%esi)
+
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_in_third_or_fourth_wchar):
+ test $15, %ch
+ jz L(find_zero_in_fourth_wchar)
+ and $1 << 9 - 1, %eax
+ jz L(return_value)
+
+ POP (%esi)
+
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_in_fourth_wchar):
+
+ POP (%esi)
+
+ test %ah, %ah
+ jnz L(match_third_or_fourth_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match_second_wchar):
+ lea -12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_third_or_fourth_wchar):
+ test $15 << 4, %ah
+ jnz L(match_fourth_wchar)
+ lea -8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_third_wchar):
+ lea -8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_fourth_wchar):
+ lea -4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero):
+ add %ecx, %edi
+ mov %edx, %ecx
+L(prolog_find_zero_1):
+ test %cl, %cl
+ jz L(prolog_find_zero_in_third_or_fourth_wchar)
+ test $15, %cl
+ jz L(prolog_find_zero_in_second_wchar)
+ and $1, %eax
+ jz L(return_null)
+
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_in_second_wchar):
+ and $1 << 5 - 1, %eax
+ jz L(return_null)
+
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_in_third_or_fourth_wchar):
+ test $15, %ch
+ jz L(prolog_find_zero_in_fourth_wchar)
+ and $1 << 9 - 1, %eax
+ jz L(return_null)
+
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_in_fourth_wchar):
+ test %ah, %ah
+ jnz L(match_third_or_fourth_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+END (__wcsrchr_sse2)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcsrchr.S b/sysdeps/i386/i686/multiarch/wcsrchr.S
new file mode 100644
index 0000000..8240063
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcsrchr.S
@@ -0,0 +1,54 @@
+/* Multiple versions of wcsrchr
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
+ .text
+ENTRY(wcsrchr)
+ .type wcsrchr, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f
+ call __init_cpu_features
+1: leal __wcsrchr_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __wcsrchr_sse2@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4);
+ cfi_restore (ebx)
+ ret
+END(wcsrchr)
+#endif
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4cf4cf4..9a183f0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strcat-sse2-unaligned strncat-sse2-unaligned \
strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
- memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
+ memcmp-ssse3
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
@@ -28,3 +28,7 @@ CFLAGS-strcasestr.c += -msse4
CFLAGS-strcasestr-nonascii.c += -msse4
endif
endif
+
+ifeq ($(subdir),wcsmbs)
+sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
+endif
diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c
new file mode 100644
index 0000000..f27c069
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define wcscpy __wcscpy_sse2
+#endif
+
+#include "wcsmbs/wcscpy.c"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000..133700f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,566 @@
+/* wcscpy with SSSE3
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+
+.text
+ENTRY (__wcscpy_ssse3)
+ mov %rsi, %rcx
+ mov %rdi, %rdx
+
+ cmpl $0, (%rcx)
+ jz L(Exit4)
+ cmpl $0, 4(%rcx)
+ jz L(Exit8)
+ cmpl $0, 8(%rcx)
+ jz L(Exit12)
+ cmpl $0, 12(%rcx)
+ jz L(Exit16)
+
+ lea 16(%rcx), %rsi
+ and $-16, %rsi
+
+ pxor %xmm0, %xmm0
+ mov (%rcx), %r9
+ mov %r9, (%rdx)
+
+ pcmpeqd (%rsi), %xmm0
+ mov 8(%rcx), %r9
+ mov %r9, 8(%rdx)
+
+ pmovmskb %xmm0, %rax
+ sub %rcx, %rsi
+
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %rdx, %rax
+ lea 16(%rdx), %rdx
+ and $-16, %rdx
+ sub %rdx, %rax
+ sub %rax, %rcx
+ mov %rcx, %rax
+ and $0xf, %rax
+ mov $0, %rsi
+
+/* case: rcx_offset == rdx_offset */
+
+ jz L(Align16Both)
+
+ cmp $4, %rax
+ je L(Shl4)
+ cmp $8, %rax
+ je L(Shl8)
+ jmp L(Shl12)
+
+L(Align16Both):
+ movaps (%rcx), %xmm1
+ movaps 16(%rcx), %xmm2
+ movaps %xmm1, (%rdx)
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm3
+ movaps %xmm2, (%rdx, %rsi)
+ pcmpeqd %xmm3, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm4
+ movaps %xmm3, (%rdx, %rsi)
+ pcmpeqd %xmm4, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm1
+ movaps %xmm4, (%rdx, %rsi)
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm2
+ movaps %xmm1, (%rdx, %rsi)
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm3
+ movaps %xmm2, (%rdx, %rsi)
+ pcmpeqd %xmm3, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%rdx, %rsi)
+ mov %rcx, %rax
+ lea 16(%rcx, %rsi), %rcx
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ sub %rax, %rdx
+
+ mov $-0x40, %rsi
+
+L(Aligned64Loop):
+ movaps (%rcx), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%rcx), %xmm5
+ movaps 32(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%rcx), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqd %xmm0, %xmm3
+ pmovmskb %xmm3, %rax
+ lea 64(%rdx), %rdx
+ lea 64(%rcx), %rcx
+ test %rax, %rax
+ jnz L(Aligned64Leave)
+ movaps %xmm4, -64(%rdx)
+ movaps %xmm5, -48(%rdx)
+ movaps %xmm6, -32(%rdx)
+ movaps %xmm7, -16(%rdx)
+ jmp L(Aligned64Loop)
+
+L(Aligned64Leave):
+ pcmpeqd %xmm4, %xmm0
+ pmovmskb %xmm0, %rax
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqd %xmm5, %xmm0
+
+ pmovmskb %xmm0, %rax
+ movaps %xmm4, -64(%rdx)
+ test %rax, %rax
+ lea 16(%rsi), %rsi
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqd %xmm6, %xmm0
+
+ pmovmskb %xmm0, %rax
+ movaps %xmm5, -48(%rdx)
+ test %rax, %rax
+ lea 16(%rsi), %rsi
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%rdx)
+ pcmpeqd %xmm7, %xmm0
+
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov $-0x40, %rsi
+ movaps %xmm7, -16(%rdx)
+ jmp L(Aligned64Loop)
+
+ .p2align 4
+L(Shl4):
+ movaps -4(%rcx), %xmm1
+ movaps 12(%rcx), %xmm2
+L(Shl4Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3
+
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 28(%rcx), %rcx
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ lea -12(%rcx), %rcx
+ sub %rax, %rdx
+
+ movaps -4(%rcx), %xmm1
+
+L(Shl4LoopStart):
+ movaps 12(%rcx), %xmm2
+ movaps 28(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $4, %xmm3, %xmm4
+ jnz L(Shl4Start)
+
+ palignr $4, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+ movaps (%rdx), %xmm6
+ psrldq $12, %xmm6
+ mov $12, %rsi
+ palignr $4, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl8):
+ movaps -8(%rcx), %xmm1
+ movaps 8(%rcx), %xmm2
+L(Shl8Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3
+
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 24(%rcx), %rcx
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ lea -8(%rcx), %rcx
+ sub %rax, %rdx
+
+ movaps -8(%rcx), %xmm1
+
+L(Shl8LoopStart):
+ movaps 8(%rcx), %xmm2
+ movaps 24(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $8, %xmm3, %xmm4
+ jnz L(Shl8Start)
+
+ palignr $8, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+ movaps (%rdx), %xmm6
+ psrldq $8, %xmm6
+ mov $8, %rsi
+ palignr $8, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl12):
+ movaps -12(%rcx), %xmm1
+ movaps 4(%rcx), %xmm2
+L(Shl12Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3
+
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 20(%rcx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 20(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 20(%rcx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 20(%rcx), %rcx
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ lea -4(%rcx), %rcx
+ sub %rax, %rdx
+
+ movaps -12(%rcx), %xmm1
+
+L(Shl12LoopStart):
+ movaps 4(%rcx), %xmm2
+ movaps 20(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7
+ palignr $12, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $12, %xmm3, %xmm4
+ jnz L(Shl12Start)
+ palignr $12, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+ movaps (%rdx), %xmm6
+ psrldq $4, %xmm6
+ mov $4, %rsi
+ palignr $12, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %rsi, %rdx
+ add %rsi, %rcx
+
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov %rdi, %rax
+ ret
+
+ .p2align 4
+L(ExitHigh):
+ test $0x01, %ah
+ jnz L(Exit12)
+
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 8(%rcx), %rax
+ mov %rax, 8(%rdx)
+ mov %rdi, %rax
+ ret
+
+ .p2align 4
+L(Exit4):
+ movl (%rcx), %eax
+ movl %eax, (%rdx)
+ mov %rdi, %rax
+ ret
+
+ .p2align 4
+L(Exit8):
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov %rdi, %rax
+ ret
+
+ .p2align 4
+L(Exit12):
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 8(%rcx), %eax
+ mov %eax, 8(%rdx)
+ mov %rdi, %rax
+ ret
+
+ .p2align 4
+L(Exit16):
+ movdqu (%rcx), %xmm0
+ movdqu %xmm0, (%rdx)
+ mov %rdi, %rax
+ ret
+
+END(__wcscpy_ssse3)
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S
new file mode 100644
index 0000000..818c554
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy.S
@@ -0,0 +1,43 @@
+/* Multiple versions of wcscpy
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+
+ .text
+ENTRY(wcscpy)
+ .type wcscpy, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+
+1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jnz 2f
+ leaq __wcscpy_sse2(%rip), %rax
+ ret
+
+2: leaq __wcscpy_ssse3(%rip), %rax
+ ret
+
+END(wcscpy)
+#endif
diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S
new file mode 100644
index 0000000..b3a1b3b
--- /dev/null
+++ b/sysdeps/x86_64/wcschr.S
@@ -0,0 +1,155 @@
+/* wcschr with SSSE3
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY (wcschr)
+
+ movd %rsi, %xmm1
+ pxor %xmm2, %xmm2
+ mov %rdi, %rcx
+ punpckldq %xmm1, %xmm1
+ punpckldq %xmm1, %xmm1
+
+ and $63, %rcx
+ cmp $48, %rcx
+ ja L(cross_cache)
+
+ movdqu (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+ or %rax, %rdx
+ jnz L(matches)
+
+ and $-16, %rdi
+
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+ or %rax, %rdx
+ jnz L(matches)
+
+ jmp L(loop)
+
+L(cross_cache):
+ and $15, %rcx
+ and $-16, %rdi
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+
+ sar %cl, %rdx
+ sar %cl, %rax
+ test %rax, %rax
+ je L(unaligned_no_match)
+
+ bsf %rax, %rax
+ test %rdx, %rdx
+ je L(unaligned_match)
+ bsf %rdx, %rdx
+ cmp %rdx, %rax
+ ja L(return_null)
+
+L(unaligned_match):
+ add %rdi, %rax
+ add %rcx, %rax
+ ret
+
+ .p2align 4
+L(unaligned_no_match):
+ test %rdx, %rdx
+ jne L(return_null)
+ pxor %xmm2, %xmm2
+
+ add $16, %rdi
+
+ .p2align 4
+/* Loop start on aligned string. */
+L(loop):
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+ or %rax, %rdx
+ jnz L(matches)
+
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+ or %rax, %rdx
+ jnz L(matches)
+
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+ or %rax, %rdx
+ jnz L(matches)
+
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+ or %rax, %rdx
+ jnz L(matches)
+ jmp L(loop)
+
+ .p2align 4
+L(matches):
+ pmovmskb %xmm2, %rdx
+ test %rax, %rax
+ jz L(return_null)
+ bsf %rax, %rax
+ test %rdx, %rdx
+ je L(match)
+ bsf %rdx, %rcx
+ cmp %rcx, %rax
+ ja L(return_null)
+L(match):
+ sub $16, %rdi
+ add %rdi, %rax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %rax, %rax
+ ret
+
+END (wcschr)
+
+libc_hidden_def(wcschr)
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
new file mode 100644
index 0000000..c2e4b7e
--- /dev/null
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -0,0 +1,283 @@
+/* wcsrchr with SSSE3
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY (wcsrchr)
+
+ movd %rsi, %xmm1
+ mov %rdi, %rcx
+ punpckldq %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ punpckldq %xmm1, %xmm1
+ and $63, %rcx
+ cmp $48, %rcx
+ ja L(crosscache)
+
+ movdqu (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rcx
+ pmovmskb %xmm0, %rax
+ add $16, %rdi
+
+ test %rax, %rax
+ jnz L(unaligned_match1)
+
+ test %rcx, %rcx
+ jnz L(return_null)
+
+ and $-16, %rdi
+ xor %r8, %r8
+ jmp L(loop)
+
+ .p2align 4
+L(unaligned_match1):
+ test %rcx, %rcx
+ jnz L(prolog_find_zero_1)
+
+ mov %rax, %r8
+ mov %rdi, %rsi
+ and $-16, %rdi
+ jmp L(loop)
+
+ .p2align 4
+L(crosscache):
+ and $15, %rcx
+ and $-16, %rdi
+ pxor %xmm3, %xmm3
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm3
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm3, %rdx
+ pmovmskb %xmm0, %rax
+ shr %cl, %rdx
+ shr %cl, %rax
+ add $16, %rdi
+
+ test %rax, %rax
+ jnz L(unaligned_match)
+
+ test %rdx, %rdx
+ jnz L(return_null)
+
+ xor %r8, %r8
+ jmp L(loop)
+
+ .p2align 4
+L(unaligned_match):
+ test %rdx, %rdx
+ jnz L(prolog_find_zero)
+
+ mov %rax, %r8
+ lea (%rdi, %rcx), %rsi
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rcx
+ pmovmskb %xmm0, %rax
+ or %rax, %rcx
+ jnz L(matches)
+
+ movdqa (%rdi), %xmm3
+ pcmpeqd %xmm3, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm3
+ pmovmskb %xmm2, %rcx
+ pmovmskb %xmm3, %rax
+ or %rax, %rcx
+ jnz L(matches)
+
+ movdqa (%rdi), %xmm4
+ pcmpeqd %xmm4, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm4
+ pmovmskb %xmm2, %rcx
+ pmovmskb %xmm4, %rax
+ or %rax, %rcx
+ jnz L(matches)
+
+ movdqa (%rdi), %xmm5
+ pcmpeqd %xmm5, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm5
+ pmovmskb %xmm2, %rcx
+ pmovmskb %xmm5, %rax
+ or %rax, %rcx
+ jz L(loop)
+
+ .p2align 4
+L(matches):
+ test %rax, %rax
+ jnz L(match)
+L(return_value):
+ test %r8, %r8
+ jz L(return_null)
+ mov %r8, %rax
+ mov %rsi, %rdi
+
+ test $15 << 4, %ah
+ jnz L(match_fourth_wchar)
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%rdi), %rax
+ ret
+
+ .p2align 4
+L(match):
+ pmovmskb %xmm2, %rcx
+ test %rcx, %rcx
+ jnz L(find_zero)
+ mov %rax, %r8
+ mov %rdi, %rsi
+ jmp L(loop)
+
+ .p2align 4
+L(find_zero):
+ test $15, %cl
+ jnz L(find_zero_in_first_wchar)
+ test %cl, %cl
+ jnz L(find_zero_in_second_wchar)
+ test $15, %ch
+ jnz L(find_zero_in_third_wchar)
+
+ and $1 << 13 - 1, %rax
+ jz L(return_value)
+
+ test $15 << 4, %ah
+ jnz L(match_fourth_wchar)
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%rdi), %rax
+ ret
+
+ .p2align 4
+L(find_zero_in_first_wchar):
+ test $1, %rax
+ jz L(return_value)
+ lea -16(%rdi), %rax
+ ret
+
+ .p2align 4
+L(find_zero_in_second_wchar):
+ and $1 << 5 - 1, %rax
+ jz L(return_value)
+
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%rdi), %rax
+ ret
+
+ .p2align 4
+L(find_zero_in_third_wchar):
+ and $1 << 9 - 1, %rax
+ jz L(return_value)
+
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%rdi), %rax
+ ret
+
+ .p2align 4
+L(prolog_find_zero):
+ add %rcx, %rdi
+ mov %rdx, %rcx
+L(prolog_find_zero_1):
+ test $15, %cl
+ jnz L(prolog_find_zero_in_first_wchar)
+ test %cl, %cl
+ jnz L(prolog_find_zero_in_second_wchar)
+ test $15, %ch
+ jnz L(prolog_find_zero_in_third_wchar)
+
+ and $1 << 13 - 1, %rax
+ jz L(return_null)
+
+ test $15 << 4, %ah
+ jnz L(match_fourth_wchar)
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%rdi), %rax
+ ret
+
+ .p2align 4
+L(prolog_find_zero_in_first_wchar):
+ test $1, %rax
+ jz L(return_null)
+ lea -16(%rdi), %rax
+ ret
+
+ .p2align 4
+L(prolog_find_zero_in_second_wchar):
+ and $1 << 5 - 1, %rax
+ jz L(return_null)
+
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%rdi), %rax
+ ret
+
+ .p2align 4
+L(prolog_find_zero_in_third_wchar):
+ and $1 << 9 - 1, %rax
+ jz L(return_null)
+
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%rdi), %rax
+ ret
+
+ .p2align 4
+L(match_second_wchar):
+ lea -12(%rdi), %rax
+ ret
+
+ .p2align 4
+L(match_third_wchar):
+ lea -8(%rdi), %rax
+ ret
+
+ .p2align 4
+L(match_fourth_wchar):
+ lea -4(%rdi), %rax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %rax, %rax
+ ret
+
+END (wcsrchr)
diff --git a/wcsmbs/wcschr.c b/wcsmbs/wcschr.c
index 15b55d4..1a31f74 100644
--- a/wcsmbs/wcschr.c
+++ b/wcsmbs/wcschr.c
@@ -18,8 +18,11 @@
#include <wchar.h>
-
/* Find the first occurrence of WC in WCS. */
+#ifdef WCSCHR
+# define wcschr WCSCHR
+#endif
+
wchar_t *
wcschr (wcs, wc)
register const wchar_t *wcs;