Optimized wcschr and wcscpy for x86-64 and x86-32

author: Ulrich Drepper <drepper@gmail.com> 2011-12-17 14:39:23 -0500
committer: Ulrich Drepper <drepper@gmail.com> 2011-12-17 14:39:23 -0500
commit: 1d3e4b618ae0217f1736753f3085f9c4fcc827bf (patch)
tree: 90a3f8d19f941a684e1482b8813c534d82cfb19e
parent: a2d18b64edb486825fb5946eefc2131426ccfec9 (diff)
download: glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.zip
glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.gz
glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.bz2
20 files changed, 2466 insertions, 13 deletions
diff --git a/ChangeLog b/ChangeLog
index f3ce260..19f2cde 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,28 @@
+2011-11-14  Liubov Dmitrieva  <liubov.dmitrieva@gmail.com>
+
+	* sysdeps/x86_64/multiarch/Makefile [subdir=wcsmbs] (sysdep_routines):
+	Add wcscpy-ssse3 wcscpy-c.
+	* sysdeps/x86_64/multiarch/wcscpy-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/wcscpy-c.c: New file.
+	* sysdeps/x86_64/multiarch/wcscpy.S: New file.
+	* sysdeps/x86_64/wcschr.S: New file.
+	* sysdeps/x86_64/wcsrchr.S: New file.
+	* string/test-strcmp.c: Remove checking of wcscmp function for
+	wrong alignments.
+	* sysdeps/i386/i686/multiarch/Makefile [subdir=wcsmbs]
+	(sysdep_routines): Add wcscpy-ssse3 wcscpy-c wcschr-sse2 wcschr-c
+	wcsrchr-sse2 wcsrchr-c.
+	* sysdeps/i386/i686/multiarch/wcschr.S: New file.
+	* sysdeps/i386/i686/multiarch/wcschr-c.c: New file.
+	* sysdeps/i386/i686/multiarch/wcschr-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/wcsrchr.S: New file.
+	* sysdeps/i386/i686/multiarch/wcsrchr-c.c: New file.
+	* sysdeps/i386/i686/multiarch/wcsrchr-sse2.S: New file.
+	* sysdeps/i386/i686/multiarch/wcscpy.S: New file.
+	* sysdeps/i386/i686/multiarch/wcscpy-c.c: New file.
+	* sysdeps/i386/i686/multiarch/wcscpy-ssse3.S: New file.
+	* wcsmbc/wcschr.c (WCSCHR): New macro.
+
 2011-11-17  Liubov Dmitrieva  <liubov.dmitrieva@gmail.com>
 
 	* wcsmbs/Makefile (strop-tests): Add wcsrchr wcscpy.
diff --git a/NEWS b/NEWS
index 5b5324c..6181a01 100644
--- a/NEWS
+++ b/NEWS
@@ -34,7 +34,8 @@ Version 2.15
 * Optimized strchr and strrchr for SSE on x86-32.
   Contributed by Liubov Dmitrieva.
 
-* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp for x86-64 and x86-32.
+* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp, wcschr, wcscpy
+  for x86-64 and x86-32.
   Contributed by Liubov Dmitrieva.
 
 * New interfaces: scandirat, scandirat64
diff --git a/string/test-strcmp.c b/string/test-strcmp.c
index 9ea4df7..5ed9626 100644
--- a/string/test-strcmp.c
+++ b/string/test-strcmp.c
@@ -221,14 +221,14 @@ do_test (size_t align1, size_t align2, size_t len, int max_char,
 static void
 do_random_tests (void)
 {
-  for (size_t a = 0; a < CHARBYTES; a += CHARALIGN)
-    for (size_t b = 0; b < CHARBYTES; b += CHARALIGN)
-      {
-	UCHAR *p1 = (UCHAR *) (buf1 + page_size - 512 * CHARBYTES - a);
-	UCHAR *p2 = (UCHAR *) (buf2 + page_size - 512 * CHARBYTES - b);
+	UCHAR *p1 = (UCHAR *) (buf1 + page_size - 512 * CHARBYTES);
+	UCHAR *p2 = (UCHAR *) (buf2 + page_size - 512 * CHARBYTES);
 
 	for (size_t n = 0; n < ITERATIONS; n++)
 	  {
+      /* for wcscmp case align1 and align2 mean here alignment in wchar_t symbols, it 
+       equal 4*k alignment in bytes, we don't check other alignments like for example p1 = (wchar_t *)(buf1 + 1)
+       because it's wrong using of wchar_t type. */
 	    size_t align1 = random () & 31;
 	    size_t align2;
 	    if (random () & 1)
@@ -274,7 +274,7 @@ do_random_tests (void)
 	      }
 
 	    int result = 0;
-	    MEMCPY ((CHAR *) (p2 + align2), (CHAR *) (p1 + align1), pos);
+	    MEMCPY (p2 + align2, p1 + align1, pos);
 	    if (pos < len1)
 	      {
 		if (p2[align2 + pos] == p1[align1 + pos])
@@ -302,13 +302,12 @@ do_random_tests (void)
 		    || (r < 0 && result >= 0)
 		    || (r > 0 && result <= 0))
 		  {
-		    error (0, 0, "Iteration %zd - wrong result in function %s (%zd, %zd, %zd, %zd, %zd) %d != %d, p1 %p p2 %p",
+		    error (0, 0, "Iteration %zd - wrong result in function %s (align in bytes: %zd, align in bytes: %zd, len1:  %zd, len2: %zd, pos: %zd) %d != %d, p1 %p p2 %p",
 			   n, impl->name, (size_t) (p1 + align1) & 63, (size_t) (p1 + align2) & 63, len1, len2, pos, r, result, p1, p2);
 		    ret = 1;
 		  }
 	      }
-	  }
-      }
+     }
 }
 
 static void
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 426b718..b764e5b 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -37,7 +37,8 @@ endif
 
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \
-		   wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
+		   wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \
+		   wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c
 endif
 
 ifeq (mathyes,$(subdir)$(config-cflags-avx))
diff --git a/sysdeps/i386/i686/multiarch/wcschr-c.c b/sysdeps/i386/i686/multiarch/wcschr-c.c
new file mode 100644
index 0000000..a63e50e
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcschr-c.c
@@ -0,0 +1,8 @@
+#ifndef NOT_IN_libc
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+  __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32);
+# define WCSCHR  __wcschr_ia32
+#endif
+
+#include "wcsmbs/wcschr.c"
diff --git a/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/sysdeps/i386/i686/multiarch/wcschr-sse2.S
new file mode 100644
index 0000000..cc8204c
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcschr-sse2.S
@@ -0,0 +1,220 @@
+/* wcschr with SSE2, without using bsf instructions
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4
+# define STR1	PARMS
+# define STR2	STR1+4
+
+	atom_text_section
+ENTRY (__wcschr_sse2)
+
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	mov	%ecx, %eax
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	punpckldq %xmm1, %xmm1
+
+	and	$63, %eax
+	cmp	$48, %eax
+	ja	L(cross_cache)
+
+	movdqu	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	and	$-16, %ecx
+	jmp	L(loop)
+
+	.p2align 4
+L(cross_cache):
+	PUSH	(%edi)
+	mov	%ecx, %edi
+	mov	%eax, %ecx
+	and	$-16, %edi
+	and	$15, %ecx
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	jz	L(unaligned_no_match)
+
+	add	%edi, %ecx
+	POP	(%edi)
+
+	test	%edx, %edx
+	jz	L(match_case1)
+	test	%al, %al
+	jz	L(match_higth_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)
+	test	$15, %dl
+	jnz	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(unaligned_no_match):
+	mov	%edi, %ecx
+	POP	(%edi)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	pxor	%xmm2, %xmm2
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	add	$16, %ecx
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %edx
+	test	%eax, %eax
+	jz	L(return_null)
+	test	%edx, %edx
+	jz	L(match_case1)
+
+	.p2align 4
+L(match_case2):
+	test	%al, %al
+	jz	L(match_higth_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)
+	test	$15, %dl
+	jnz	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case2_4):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(match_higth_case2):
+	test	%dl, %dl
+	jnz	L(return_null)
+	test	$15, %ah
+	jnz	L(match_case2_12)
+	test	$15, %dh
+	jnz	L(return_null)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case2_12):
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case1):
+	test	%al, %al
+	jz	L(match_higth_case1)
+
+	test	$0x01, %al
+	jnz	L(exit0)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_higth_case1):
+	test	$0x01, %ah
+	jnz	L(exit3)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit0):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit3):
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+END (__wcschr_sse2)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcschr.S b/sysdeps/i386/i686/multiarch/wcschr.S
new file mode 100644
index 0000000..bf0d6d5
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcschr.S
@@ -0,0 +1,54 @@
+/* Multiple versions of wcschr
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(wcschr)
+	.type	wcschr, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__wcschr_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__wcschr_sse2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4);
+	cfi_restore (ebx)
+	ret
+END(wcschr)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcscpy-c.c b/sysdeps/i386/i686/multiarch/wcscpy-c.c
new file mode 100644
index 0000000..a3c4024
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define wcscpy  __wcscpy_ia32
+#endif
+
+#include "wcsmbs/wcscpy.c"
diff --git a/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000..84d92a8
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,621 @@
+/* wcscpy with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)
+# define STR1	PARMS
+# define STR2	STR1+4
+# define LEN	STR2+4
+
+	atom_text_section
+ENTRY (__wcscpy_ssse3)
+	mov	STR1(%esp), %edx
+	mov	STR2(%esp), %ecx
+
+	cmp	$0, (%ecx)
+	jz	L(ExitTail4)
+	cmp	$0, 4(%ecx)
+	jz	L(ExitTail8)
+	cmp	$0, 8(%ecx)
+	jz	L(ExitTail12)
+	cmp	$0, 12(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	mov	%edx, %edi
+
+	PUSH	(%esi)
+	lea	16(%ecx), %esi
+
+	and	$-16, %esi
+
+	pxor	%xmm0, %xmm0
+	pcmpeqd	(%esi), %xmm0
+	movdqu	(%ecx), %xmm1
+	movdqu	%xmm1, (%edx)
+
+	pmovmskb %xmm0, %eax
+	sub	%ecx, %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+	and	$-16, %edx
+	sub	%edx, %eax
+
+	sub	%eax, %ecx
+	mov	%ecx, %eax
+	and	$0xf, %eax
+	mov	$0, %esi
+
+	jz	L(Align16Both)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$8, %eax
+	je	L(Shl8)
+	jmp	L(Shl12)
+
+L(Align16Both):
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	sub	%eax, %edx
+
+	mov	$-0x40, %esi
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	lea	64(%edx), %edx
+	pcmpeqd	%xmm0, %xmm3
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqd	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	$-0x40, %esi
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%ecx), %xmm1
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)
+
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$12, %xmm6
+	palignr	$4, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	add	$12, %edx
+	add	$12, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%ecx), %xmm1
+	movaps	8(%ecx), %xmm2
+L(Shl8Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-8(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$8, %xmm6
+	palignr	$8, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	add	$8, %edx
+	add	$8, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl12):
+	movaps	-12(%ecx), %xmm1
+	movaps	4(%ecx), %xmm2
+L(Shl12Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-4(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)
+
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movaps	(%edx), %xmm6
+	psrldq	$4, %xmm6
+	mov	$4, %esi
+	palignr	$12, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit12)
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN
+
+CFI_POP	(%edi)
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+END (__wcscpy_ssse3)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcscpy.S b/sysdeps/i386/i686/multiarch/wcscpy.S
new file mode 100644
index 0000000..c7bafbe
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcscpy.S
@@ -0,0 +1,46 @@
+/* Multiple versions of wcscpy
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(wcscpy)
+	.type	wcscpy, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__wcscpy_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__wcscpy_ssse3@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(wcscpy)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/sysdeps/i386/i686/multiarch/wcsrchr-c.c
new file mode 100644
index 0000000..c7444ce
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcsrchr-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define wcsrchr  __wcsrchr_ia32
+#endif
+
+#include "wcsmbs/wcsrchr.c"
diff --git a/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
new file mode 100644
index 0000000..2859f7e
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
@@ -0,0 +1,355 @@
+/* wcsrchr with SSE2, without using bsf instructions.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	8
+# define ENTRANCE	PUSH (%edi);
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi);
+# define STR1	PARMS
+# define STR2	STR1+4
+
+	atom_text_section
+ENTRY (__wcsrchr_sse2)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	mov	%ecx, %edi
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	punpckldq %xmm1, %xmm1
+
+/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	ja	L(crosscache)
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is.  */
+	pmovmskb %xmm2, %ecx
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%ecx, %ecx
+	jnz	L(return_null)
+
+	and	$-16, %edi
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%ecx, %ecx
+	jnz	L(prolog_find_zero_1)
+
+	PUSH	(%esi)
+
+/* Save current match */
+	mov	%eax, %edx
+	mov	%edi, %esi
+	and	$-16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(crosscache):
+/* Hancle unaligned string.  */
+	and	$15, %ecx
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm3
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is.  */
+	pmovmskb %xmm3, %edx
+/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes.  */
+	shr	%cl, %edx
+	shr	%cl, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(prolog_find_zero)
+
+	PUSH	(%esi)
+
+	mov	%eax, %edx
+	lea	(%edi, %ecx), %esi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm3
+	pcmpeqd	%xmm3, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm4
+	pcmpeqd	%xmm4, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm4
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm4, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm5
+	pcmpeqd	%xmm5, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm5
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm5, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	test	%edx, %edx
+	jz	L(return_null_1)
+	mov	%edx, %eax
+	mov	%esi, %edi
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(return_null_1):
+	POP	(%esi)
+
+	xor	%eax, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(find_zero)
+/* save match info */
+	mov	%eax, %edx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	%cl, %cl
+	jz	L(find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(find_zero_in_second_wchar)
+	and	$1, %eax
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_fourth_wchar):
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match_second_wchar):
+	lea	-12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_or_fourth_wchar):
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_wchar):
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_fourth_wchar):
+	lea	-4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%ecx, %edi
+	mov     %edx, %ecx
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(prolog_find_zero_in_second_wchar)
+	and	$1, %eax
+	jz	L(return_null)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_null)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(prolog_find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax
+	jz	L(return_null)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_fourth_wchar):
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+END (__wcsrchr_sse2)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcsrchr.S b/sysdeps/i386/i686/multiarch/wcsrchr.S
new file mode 100644
index 0000000..8240063
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcsrchr.S
@@ -0,0 +1,54 @@
+/* Multiple versions of wcsrchr
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
+	.text
+ENTRY(wcsrchr)
+	.type	wcsrchr, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__wcsrchr_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__wcsrchr_sse2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4);
+	cfi_restore (ebx)
+	ret
+END(wcsrchr)
+#endif
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4cf4cf4..9a183f0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
 		   strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
-		   memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
+		   memcmp-ssse3
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
@@ -28,3 +28,7 @@ CFLAGS-strcasestr.c += -msse4
 CFLAGS-strcasestr-nonascii.c += -msse4
 endif
 endif
+
+ifeq ($(subdir),wcsmbs)
+sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
+endif
diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c
new file mode 100644
index 0000000..f27c069
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define wcscpy  __wcscpy_sse2
+#endif
+
+#include "wcsmbs/wcscpy.c"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000..133700f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,566 @@
+/* wcscpy with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+
+.text
+ENTRY (__wcscpy_ssse3)
+	mov	%rsi, %rcx
+	mov	%rdi, %rdx
+
+	cmpl	$0, (%rcx)
+	jz	L(Exit4)
+	cmpl	$0, 4(%rcx)
+	jz	L(Exit8)
+	cmpl	$0, 8(%rcx)
+	jz	L(Exit12)
+	cmpl	$0, 12(%rcx)
+	jz	L(Exit16)
+
+	lea	16(%rcx), %rsi
+	and	$-16, %rsi
+
+	pxor	%xmm0, %xmm0
+	mov	(%rcx), %r9
+	mov	%r9, (%rdx)
+
+	pcmpeqd	(%rsi), %xmm0
+	mov	8(%rcx), %r9
+	mov	%r9, 8(%rdx)
+
+	pmovmskb %xmm0, %rax
+	sub	%rcx, %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%rdx, %rax
+	lea	16(%rdx), %rdx
+	and	$-16, %rdx
+	sub	%rdx, %rax
+	sub	%rax, %rcx
+	mov	%rcx, %rax
+	and	$0xf, %rax
+	mov	$0, %rsi
+
+/* case: rcx_offset == rdx_offset */
+
+	jz	L(Align16Both)
+
+	cmp	$4, %rax
+	je	L(Shl4)
+	cmp	$8, %rax
+	je	L(Shl8)
+	jmp	L(Shl12)
+
+L(Align16Both):
+	movaps	(%rcx), %xmm1
+	movaps	16(%rcx), %xmm2
+	movaps	%xmm1, (%rdx)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm4
+	movaps	%xmm3, (%rdx, %rsi)
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm1
+	movaps	%xmm4, (%rdx, %rsi)
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm2
+	movaps	%xmm1, (%rdx, %rsi)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%rdx, %rsi)
+	mov	%rcx, %rax
+	lea	16(%rcx, %rsi), %rcx
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	sub	%rax, %rdx
+
+	mov	$-0x40, %rsi
+
+L(Aligned64Loop):
+	movaps	(%rcx), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%rcx), %xmm5
+	movaps	32(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%rcx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqd	%xmm0, %xmm3
+	pmovmskb %xmm3, %rax
+	lea	64(%rdx), %rdx
+	lea	64(%rcx), %rcx
+	test	%rax, %rax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%rdx)
+	movaps	%xmm5, -48(%rdx)
+	movaps	%xmm6, -32(%rdx)
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm5, %xmm0
+
+	pmovmskb %xmm0, %rax
+	movaps	%xmm4, -64(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm6, %xmm0
+
+	pmovmskb %xmm0, %rax
+	movaps	%xmm5, -48(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%rdx)
+	pcmpeqd	%xmm7, %xmm0
+
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	$-0x40, %rsi
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%rcx), %xmm1
+	movaps	12(%rcx), %xmm2
+L(Shl4Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	28(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-12(%rcx), %rcx
+	sub	%rax, %rdx
+
+	movaps	-4(%rcx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%rcx), %xmm2
+	movaps	28(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)
+
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movaps	(%rdx), %xmm6
+	psrldq	$12, %xmm6
+	mov	$12, %rsi
+	palignr	$4, %xmm1, %xmm6
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%rcx), %xmm1
+	movaps	8(%rcx), %xmm2
+L(Shl8Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	24(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-8(%rcx), %rcx
+	sub	%rax, %rdx
+
+	movaps	-8(%rcx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%rcx), %xmm2
+	movaps	24(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movaps	(%rdx), %xmm6
+	psrldq	$8, %xmm6
+	mov	$8, %rsi
+	palignr	$8, %xmm1, %xmm6
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl12):
+	movaps	-12(%rcx), %xmm1
+	movaps	4(%rcx), %xmm2
+L(Shl12Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	20(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-4(%rcx), %rcx
+	sub	%rax, %rdx
+
+	movaps	-12(%rcx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%rcx), %xmm2
+	movaps	20(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movaps	(%rdx), %xmm6
+	psrldq	$4, %xmm6
+	mov	$4, %rsi
+	palignr	$12, %xmm1, %xmm6
+	movaps	%xmm6, (%rdx)
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit12)
+
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %rax
+	mov	%rax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit4):
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit8):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit12):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %eax
+	mov	%eax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%rcx), %xmm0
+	movdqu	%xmm0, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+END(__wcscpy_ssse3)
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S
new file mode 100644
index 0000000..818c554
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy.S
@@ -0,0 +1,43 @@
+/* Multiple versions of wcscpy
+   Copyright (C)  2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+
+	.text
+ENTRY(wcscpy)
+	.type	wcscpy, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
+	jne	1f
+	call	__init_cpu_features
+
+1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jnz	2f
+	leaq	__wcscpy_sse2(%rip), %rax
+	ret
+
+2:	leaq	__wcscpy_ssse3(%rip), %rax
+	ret
+
+END(wcscpy)
+#endif
diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S
new file mode 100644
index 0000000..b3a1b3b
--- /dev/null
+++ b/sysdeps/x86_64/wcschr.S
@@ -0,0 +1,155 @@
+/* wcschr with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY (wcschr)
+
+	movd	%rsi, %xmm1
+	pxor	%xmm2, %xmm2
+	mov	%rdi, %rcx
+	punpckldq %xmm1, %xmm1
+	punpckldq %xmm1, %xmm1
+
+	and	$63, %rcx
+	cmp	$48, %rcx
+	ja	L(cross_cache)
+
+	movdqu	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	and	$-16, %rdi
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	jmp	L(loop)
+
+L(cross_cache):
+	and	$15, %rcx
+	and	$-16, %rdi
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+
+	sar	%cl, %rdx
+	sar	%cl, %rax
+	test	%rax, %rax
+	je	L(unaligned_no_match)
+
+	bsf	%rax, %rax
+	test	%rdx, %rdx
+	je	L(unaligned_match)
+	bsf	%rdx, %rdx
+	cmp	%rdx, %rax
+	ja	L(return_null)
+
+L(unaligned_match):
+	add	%rdi, %rax
+	add	%rcx, %rax
+	ret
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%rdx, %rdx
+	jne	L(return_null)
+	pxor	%xmm2, %xmm2
+
+	add	$16, %rdi
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop):
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+	jmp	L(loop)
+
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %rdx
+	test	%rax, %rax
+	jz	L(return_null)
+	bsf	%rax, %rax
+	test	%rdx, %rdx
+	je	L(match)
+	bsf	%rdx, %rcx
+	cmp	%rcx, %rax
+	ja	L(return_null)
+L(match):
+	sub	$16, %rdi
+	add	%rdi, %rax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+END (wcschr)
+
+libc_hidden_def(wcschr)
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
new file mode 100644
index 0000000..c2e4b7e
--- /dev/null
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -0,0 +1,283 @@
+/* wcsrchr with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY (wcsrchr)
+
+	movd	%rsi, %xmm1
+	mov	%rdi, %rcx
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	punpckldq %xmm1, %xmm1
+	and	$63, %rcx
+	cmp	$48, %rcx
+	ja	L(crosscache)
+
+	movdqu	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm0, %rax
+	add	$16, %rdi
+
+	test	%rax, %rax
+	jnz	L(unaligned_match1)
+
+	test	%rcx, %rcx
+	jnz	L(return_null)
+
+	and	$-16, %rdi
+	xor	%r8, %r8
+	jmp	L(loop)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%rcx, %rcx
+	jnz	L(prolog_find_zero_1)
+
+	mov	%rax, %r8
+	mov	%rdi, %rsi
+	and	$-16, %rdi
+	jmp	L(loop)
+
+	.p2align 4
+L(crosscache):
+	and	$15, %rcx
+	and	$-16, %rdi
+	pxor	%xmm3, %xmm3
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm3
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm3, %rdx
+	pmovmskb %xmm0, %rax
+	shr	%cl, %rdx
+	shr	%cl, %rax
+	add	$16, %rdi
+
+	test	%rax, %rax
+	jnz	L(unaligned_match)
+
+	test	%rdx, %rdx
+	jnz	L(return_null)
+
+	xor	%r8, %r8
+	jmp	L(loop)
+
+	.p2align 4
+L(unaligned_match):
+	test	%rdx, %rdx
+	jnz	L(prolog_find_zero)
+
+	mov	%rax, %r8
+	lea	(%rdi, %rcx), %rsi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm3
+	pcmpeqd	%xmm3, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm3, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm4
+	pcmpeqd	%xmm4, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm4
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm4, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm5
+	pcmpeqd	%xmm5, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm5
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm5, %rax
+	or	%rax, %rcx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	test	%rax, %rax
+	jnz	L(match)
+L(return_value):
+	test	%r8, %r8
+	jz	L(return_null)
+	mov	%r8, %rax
+	mov	%rsi, %rdi
+
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %rcx
+	test	%rcx, %rcx
+	jnz	L(find_zero)
+	mov	%rax, %r8
+	mov	%rdi, %rsi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	$15, %cl
+	jnz	L(find_zero_in_first_wchar)
+	test	%cl, %cl
+	jnz	L(find_zero_in_second_wchar)
+	test	$15, %ch
+	jnz	L(find_zero_in_third_wchar)
+
+	and	$1 << 13 - 1, %rax
+	jz	L(return_value)
+
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(find_zero_in_first_wchar):
+	test	$1, %rax
+	jz	L(return_value)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %rax
+	jz	L(return_value)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(find_zero_in_third_wchar):
+	and	$1 << 9 - 1, %rax
+	jz	L(return_value)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%rcx, %rdi
+	mov     %rdx, %rcx
+L(prolog_find_zero_1):
+	test	$15, %cl
+	jnz	L(prolog_find_zero_in_first_wchar)
+	test	%cl, %cl
+	jnz	L(prolog_find_zero_in_second_wchar)
+	test	$15, %ch
+	jnz	L(prolog_find_zero_in_third_wchar)
+
+	and	$1 << 13 - 1, %rax
+	jz	L(return_null)
+
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_in_first_wchar):
+	test	$1, %rax
+	jz	L(return_null)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %rax
+	jz	L(return_null)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_in_third_wchar):
+	and	$1 << 9 - 1, %rax
+	jz	L(return_null)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_second_wchar):
+	lea	-12(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_third_wchar):
+	lea	-8(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_fourth_wchar):
+	lea	-4(%rdi), %rax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+END (wcsrchr)
diff --git a/wcsmbs/wcschr.c b/wcsmbs/wcschr.c
index 15b55d4..1a31f74 100644
--- a/wcsmbs/wcschr.c
+++ b/wcsmbs/wcschr.c
@@ -18,8 +18,11 @@
 
 #include <wchar.h>
 
-
 /* Find the first occurrence of WC in WCS.  */
+#ifdef WCSCHR
+# define wcschr WCSCHR
+#endif
+
 wchar_t *
 wcschr (wcs, wc)
      register const wchar_t *wcs;
author	Ulrich Drepper <drepper@gmail.com>	2011-12-17 14:39:23 -0500
committer	Ulrich Drepper <drepper@gmail.com>	2011-12-17 14:39:23 -0500
commit	1d3e4b618ae0217f1736753f3085f9c4fcc827bf (patch)
tree	90a3f8d19f941a684e1482b8813c534d82cfb19e
parent	a2d18b64edb486825fb5946eefc2131426ccfec9 (diff)
download	glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.zip glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.gz glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.bz2