aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/x86_64
diff options
context:
space:
mode:
authorLiubov Dmitrieva <liubov.dmitrieva@gmail.com>2011-07-19 17:11:54 -0400
committerUlrich Drepper <drepper@gmail.com>2011-07-19 17:11:54 -0400
commit99710781cc47002612e609c7dc5f34692b64e9b3 (patch)
treeac3c980ce57d0420fff758faffbd59d111026219 /sysdeps/x86_64
parent7dc6bd90c569c49807462b0740b18e32fab4d8b7 (diff)
downloadglibc-99710781cc47002612e609c7dc5f34692b64e9b3.zip
glibc-99710781cc47002612e609c7dc5f34692b64e9b3.tar.gz
glibc-99710781cc47002612e609c7dc5f34692b64e9b3.tar.bz2
Improve 64 bit strcat functions with SSE2/SSSE3
Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r--sysdeps/x86_64/multiarch/Makefile6
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.c10
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.h2
-rw-r--r--sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S55
-rw-r--r--sysdeps/x86_64/multiarch/strcat-ssse3.S559
-rw-r--r--sysdeps/x86_64/multiarch/strcat.S85
-rw-r--r--sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S451
-rw-r--r--sysdeps/x86_64/multiarch/strcpy-ssse3.S280
-rw-r--r--sysdeps/x86_64/multiarch/strlen-no-bsf.S74
-rw-r--r--sysdeps/x86_64/multiarch/strlen-sse2-pminub.S260
-rw-r--r--sysdeps/x86_64/multiarch/strlen.S5
-rw-r--r--sysdeps/x86_64/multiarch/strncat-c.c8
-rw-r--r--sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S3
-rw-r--r--sysdeps/x86_64/multiarch/strncat-ssse3.S3
-rw-r--r--sysdeps/x86_64/multiarch/strncat.S3
15 files changed, 1486 insertions, 318 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 88410b3..c959dd1 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -5,14 +5,16 @@ endif
ifeq ($(subdir),string)
-sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
+sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
- stpcpy-sse2-unaligned stpncpy-sse2-unaligned
+ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+ strcat-sse2-unaligned strncat-sse2-unaligned \
+ strcat-ssse3 strncat-ssse3 strlen-sse2-pminub
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 81b2378..0a145ca 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -97,18 +97,22 @@ __init_cpu_features (void)
case 0x2c:
case 0x2e:
case 0x2f:
- /* Rep string instructions, copy backward and unaligned loads
- are fast on Intel Core i3, i5 and i7. */
+ /* Rep string instructions, copy backward, unaligned loads
+ and pminub are fast on Intel Core i3, i5 and i7. */
#if index_Fast_Rep_String != index_Fast_Copy_Backward
# error index_Fast_Rep_String != index_Fast_Copy_Backward
#endif
#if index_Fast_Rep_String != index_Fast_Unaligned_Load
# error index_Fast_Rep_String != index_Fast_Unaligned_Load
#endif
+#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
+# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
+#endif
__cpu_features.feature[index_Fast_Rep_String]
|= (bit_Fast_Rep_String
| bit_Fast_Copy_Backward
- | bit_Fast_Unaligned_Load);
+ | bit_Fast_Unaligned_Load
+ | bit_Prefer_PMINUB_for_stringop);
break;
}
}
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index addf5f3..6cfdbdd 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -21,6 +21,7 @@
#define bit_Slow_BSF (1 << 2)
#define bit_Prefer_SSE_for_memop (1 << 3)
#define bit_Fast_Unaligned_Load (1 << 4)
+#define bit_Prefer_PMINUB_for_stringop (1 << 5)
#ifdef __ASSEMBLER__
@@ -41,6 +42,7 @@
# define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE
# define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE
# define index_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE
#else /* __ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
new file mode 100644
index 0000000..1150281
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -0,0 +1,55 @@
+/* strcat with SSE2
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+# define STRCAT __strcat_sse2_unaligned
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+ mov %rdi, %r9 /* keep the incoming first argument (destination) in %r9 */
+# ifdef USE_AS_STRNCAT
+ mov %rdx, %r8 /* keep the incoming third argument (length limit) in %r8 */
+# endif
+
+# define RETURN jmp L(StartStrcpyPart)
+# include "strlen-sse2-pminub.S" /* wherever the included code uses RETURN it jumps to L(StartStrcpyPart) */
+# undef RETURN
+
+L(StartStrcpyPart): /* NOTE(review): %rax is presumably the destination length left by the included code - confirm */
+ lea (%r9, %rax), %rdi /* %rdi = saved destination + %rax */
+ mov %rsi, %rcx /* %rcx = copy of the second argument (source) */
+ mov %r9, %rax /* save result */
+
+# ifdef USE_AS_STRNCAT
+ test %r8, %r8
+ jz L(ExitZero) /* limit is zero; L(ExitZero) is not defined in this file, presumably it comes from the include below */
+# define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-sse2-unaligned.S"
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
new file mode 100644
index 0000000..66736a7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
@@ -0,0 +1,559 @@
+/* strcat with SSSE3
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+# define STRCAT __strcat_ssse3
+# endif
+
+# define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+# ifdef USE_AS_STRNCAT
+ mov %rdx, %r8 /* keep the incoming third argument (length limit) in %r8 */
+# endif
+
+# define RETURN jmp L(StartStrcpyPart)
+# include "strlen-no-bsf.S" /* wherever the included code uses RETURN it jumps to L(StartStrcpyPart) */
+
+# undef RETURN
+
+L(StartStrcpyPart): /* NOTE(review): %rax is presumably the destination length left by the included code - confirm */
+ mov %rsi, %rcx /* %rcx = copy of the second argument (source) */
+ lea (%rdi, %rax), %rdx /* %rdx = %rdi + %rax, the address the exit paths store to */
+# ifdef USE_AS_STRNCAT
+ test %r8, %r8
+ jz L(StrncatExit0) /* limit is zero: return without storing anything */
+ cmp $8, %r8
+ jbe L(StrncatExit8Bytes) /* limit is 1..8: take the bounded byte-by-byte path */
+# endif
+ cmpb $0, (%rcx) /* probe source bytes 0..8; a zero at offset N-1 selects L(ExitN) */
+ jz L(Exit1)
+ cmpb $0, 1(%rcx)
+ jz L(Exit2)
+ cmpb $0, 2(%rcx)
+ jz L(Exit3)
+ cmpb $0, 3(%rcx)
+ jz L(Exit4)
+ cmpb $0, 4(%rcx)
+ jz L(Exit5)
+ cmpb $0, 5(%rcx)
+ jz L(Exit6)
+ cmpb $0, 6(%rcx)
+ jz L(Exit7)
+ cmpb $0, 7(%rcx)
+ jz L(Exit8)
+ cmpb $0, 8(%rcx)
+ jz L(Exit9)
+# ifdef USE_AS_STRNCAT
+ cmp $16, %r8
+ jb L(StrncatExit15Bytes) /* limit is 9..15 here (values <= 8 branched away above) */
+# endif
+ cmpb $0, 9(%rcx) /* probe source bytes 9..15 the same way */
+ jz L(Exit10)
+ cmpb $0, 10(%rcx)
+ jz L(Exit11)
+ cmpb $0, 11(%rcx)
+ jz L(Exit12)
+ cmpb $0, 12(%rcx)
+ jz L(Exit13)
+ cmpb $0, 13(%rcx)
+ jz L(Exit14)
+ cmpb $0, 14(%rcx)
+ jz L(Exit15)
+ cmpb $0, 15(%rcx)
+ jz L(Exit16)
+# ifdef USE_AS_STRNCAT
+ cmp $16, %r8
+ je L(StrncatExit16) /* no zero byte in source bytes 0..15 and the limit is exactly 16 */
+# define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-ssse3.S"
+
+ .p2align 4
+L(CopyFrom1To16Bytes): /* dispatch on the lowest set bit of %ax; NOTE(review): presumably a per-byte terminator mask set by the included copy code - confirm */
+ add %rsi, %rdx /* advance the store address by the offset in %rsi */
+ add %rsi, %rcx /* advance the load address by the same offset */
+
+ test %al, %al
+ jz L(ExitHigh) /* low byte of the mask is clear: examine %ah */
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ movlpd (%rcx), %xmm0 /* %al is nonzero with bits 0..6 clear, so bit 7 is set: copy 8 bytes */
+ movlpd %xmm0, (%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(ExitHigh): /* reached from L(CopyFrom1To16Bytes) when %al is zero: dispatch on the bits of %ah */
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ movlpd (%rcx), %xmm0 /* none of bits 0..6 of %ah set: copy 16 bytes as two 8-byte moves */
+ movlpd 8(%rcx), %xmm1
+ movlpd %xmm0, (%rdx)
+ movlpd %xmm1, 8(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit1): /* store a zero byte at offset 1, then fall through to the 1-byte copy */
+ xor %ah, %ah
+ movb %ah, 1(%rdx)
+L(Exit1): /* copy 1 byte from (%rcx) to (%rdx) */
+ movb (%rcx), %al
+ movb %al, (%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit2): /* store a zero byte at offset 2, then fall through to the 2-byte copy */
+ xor %ah, %ah
+ movb %ah, 2(%rdx)
+L(Exit2): /* copy 2 bytes from (%rcx) to (%rdx) */
+ movw (%rcx), %ax
+ movw %ax, (%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit3): /* store a zero byte at offset 3, then fall through to the 3-byte copy */
+ xor %ah, %ah
+ movb %ah, 3(%rdx)
+L(Exit3): /* copy 3 bytes from (%rcx) to (%rdx): one 2-byte move plus one byte */
+ movw (%rcx), %ax
+ movw %ax, (%rdx)
+ movb 2(%rcx), %al
+ movb %al, 2(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit4): /* store a zero byte at offset 4, then fall through to the 4-byte copy */
+ xor %ah, %ah
+ movb %ah, 4(%rdx)
+L(Exit4): /* copy 4 bytes from (%rcx) to (%rdx) */
+ mov (%rcx), %eax
+ mov %eax, (%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit5): /* store a zero byte at offset 5, then fall through to the 5-byte copy */
+ xor %ah, %ah
+ movb %ah, 5(%rdx)
+L(Exit5): /* copy 5 bytes from (%rcx) to (%rdx): one 4-byte move plus one byte */
+ mov (%rcx), %eax
+ mov %eax, (%rdx)
+ movb 4(%rcx), %al
+ movb %al, 4(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit6): /* store a zero byte at offset 6, then fall through to the 6-byte copy */
+ xor %ah, %ah
+ movb %ah, 6(%rdx)
+L(Exit6): /* copy 6 bytes from (%rcx) to (%rdx): one 4-byte move plus one 2-byte move */
+ mov (%rcx), %eax
+ mov %eax, (%rdx)
+ movw 4(%rcx), %ax
+ movw %ax, 4(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit7): /* store a zero byte at offset 7, then fall through to the 7-byte copy */
+ xor %ah, %ah
+ movb %ah, 7(%rdx)
+L(Exit7): /* copy 7 bytes from (%rcx) to (%rdx) with two overlapping 4-byte moves */
+ mov (%rcx), %eax
+ mov %eax, (%rdx)
+ mov 3(%rcx), %eax /* second move covers bytes 3..6, overlapping byte 3 */
+ mov %eax, 3(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit8): /* store a zero byte at offset 8, then fall through to the 8-byte copy */
+ xor %ah, %ah
+ movb %ah, 8(%rdx)
+L(Exit8): /* copy 8 bytes from (%rcx) to (%rdx) through %xmm0 */
+ movlpd (%rcx), %xmm0
+ movlpd %xmm0, (%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit9): /* store a zero byte at offset 9, then fall through to the 9-byte copy */
+ xor %ah, %ah
+ movb %ah, 9(%rdx)
+L(Exit9): /* copy 9 bytes from (%rcx) to (%rdx): one 8-byte move plus one byte */
+ movlpd (%rcx), %xmm0
+ movlpd %xmm0, (%rdx)
+ movb 8(%rcx), %al
+ movb %al, 8(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit10): /* store a zero byte at offset 10, then fall through to the 10-byte copy */
+ xor %ah, %ah
+ movb %ah, 10(%rdx)
+L(Exit10): /* copy 10 bytes from (%rcx) to (%rdx): one 8-byte move plus one 2-byte move */
+ movlpd (%rcx), %xmm0
+ movlpd %xmm0, (%rdx)
+ movw 8(%rcx), %ax
+ movw %ax, 8(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit11): /* store a zero byte at offset 11, then fall through to the 11-byte copy */
+ xor %ah, %ah
+ movb %ah, 11(%rdx)
+L(Exit11): /* copy 11 bytes from (%rcx) to (%rdx): 8-byte move plus an overlapping 4-byte move */
+ movlpd (%rcx), %xmm0
+ movlpd %xmm0, (%rdx)
+ mov 7(%rcx), %eax /* covers bytes 7..10, overlapping byte 7 */
+ mov %eax, 7(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit12): /* store a zero byte at offset 12, then fall through to the 12-byte copy */
+ xor %ah, %ah
+ movb %ah, 12(%rdx)
+L(Exit12): /* copy 12 bytes from (%rcx) to (%rdx): one 8-byte move plus one 4-byte move */
+ movlpd (%rcx), %xmm0
+ movlpd %xmm0, (%rdx)
+ mov 8(%rcx), %eax
+ mov %eax, 8(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit13): /* store a zero byte at offset 13, then fall through to the 13-byte copy */
+ xor %ah, %ah
+ movb %ah, 13(%rdx)
+L(Exit13): /* copy 13 bytes from (%rcx) to (%rdx) with two overlapping 8-byte moves */
+ movlpd (%rcx), %xmm0
+ movlpd %xmm0, (%rdx)
+ movlpd 5(%rcx), %xmm1 /* covers bytes 5..12, overlapping bytes 5..7 */
+ movlpd %xmm1, 5(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit14): /* store a zero byte at offset 14, then fall through to the 14-byte copy */
+ xor %ah, %ah
+ movb %ah, 14(%rdx)
+L(Exit14): /* copy 14 bytes from (%rcx) to (%rdx) with two overlapping 8-byte moves */
+ movlpd (%rcx), %xmm0
+ movlpd %xmm0, (%rdx)
+ movlpd 6(%rcx), %xmm1 /* covers bytes 6..13, overlapping bytes 6..7 */
+ movlpd %xmm1, 6(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit15): /* store a zero byte at offset 15, then fall through to the 15-byte copy */
+ xor %ah, %ah
+ movb %ah, 15(%rdx)
+L(Exit15): /* copy 15 bytes from (%rcx) to (%rdx) with two overlapping 8-byte moves */
+ movlpd (%rcx), %xmm0
+ movlpd %xmm0, (%rdx)
+ movlpd 7(%rcx), %xmm1 /* covers bytes 7..14, overlapping byte 7 */
+ movlpd %xmm1, 7(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit16): /* store a zero byte at offset 16, then fall through to the 16-byte copy */
+ xor %ah, %ah
+ movb %ah, 16(%rdx)
+L(Exit16): /* copy 16 bytes from (%rcx) to (%rdx) as two 8-byte moves */
+ movlpd (%rcx), %xmm0
+ movlpd 8(%rcx), %xmm1
+ movlpd %xmm0, (%rdx)
+ movlpd %xmm1, 8(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+# ifdef USE_AS_STRNCPY
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2): /* entered with %rax nonzero; the mask bits and the limit in %r8 are checked in turn */
+ add $16, %r8 /* NOTE(review): presumably undoes a 16-byte bias applied by the included copy code - confirm */
+ add %rsi, %rcx /* advance the load address by the offset in %rsi */
+ lea (%rsi, %rdx), %rsi /* park the advanced store address in %rsi while %rdx is used as scratch */
+ lea -9(%r8), %rdx /* %rdx = limit - 9 */
+ and $1<<7, %dh /* keep only bit 15 of that difference */
+ or %al, %dh /* merge in the low byte of the mask */
+ test %dh, %dh
+ lea (%rsi), %rdx /* restore the store address; lea leaves the flags from test untouched */
+ jz L(ExitHighCase2) /* %al is zero and bit 15 of (limit - 9) is clear */
+
+ test $0x01, %al
+ jnz L(Exit1)
+ cmp $1, %r8
+ je L(StrncatExit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $2, %r8
+ je L(StrncatExit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $3, %r8
+ je L(StrncatExit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $4, %r8
+ je L(StrncatExit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $5, %r8
+ je L(StrncatExit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $6, %r8
+ je L(StrncatExit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ cmp $7, %r8
+ je L(StrncatExit7)
+ movlpd (%rcx), %xmm0 /* nothing above matched: copy 8 bytes */
+ movlpd %xmm0, (%rdx)
+ lea 7(%rdx), %rax
+ cmpb $1, (%rax) /* carry is set only when the byte just stored at offset 7 is zero */
+ sbb $-1, %rax /* %rax += 1 - carry */
+ xor %cl, %cl
+ movb %cl, (%rax) /* zero byte lands on offset 7 if it was already zero, otherwise on offset 8 */
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(ExitHighCase2): /* reached from L(CopyFrom1To16BytesCase2): check bits of %ah and limits 9..15 in turn */
+ test $0x01, %ah
+ jnz L(Exit9)
+ cmp $9, %r8
+ je L(StrncatExit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $10, %r8
+ je L(StrncatExit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $11, %r8
+ je L(StrncatExit11)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $12, %r8
+ je L(StrncatExit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $13, %r8
+ je L(StrncatExit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $14, %r8
+ je L(StrncatExit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ cmp $15, %r8
+ je L(StrncatExit15)
+ movlpd (%rcx), %xmm0 /* nothing above matched: copy 16 bytes; no separate zero byte is stored on this path */
+ movlpd %xmm0, (%rdx)
+ movlpd 8(%rcx), %xmm1
+ movlpd %xmm1, 8(%rdx)
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+L(CopyFrom1To16BytesCase2OrCase3): /* nonzero %rax goes to Case2; zero falls through to Case3 */
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3): /* %rax is zero here: only the limit in %r8 selects the exit */
+ add $16, %r8 /* NOTE(review): presumably undoes a 16-byte bias applied by the included copy code - confirm */
+ add %rsi, %rdx /* advance the store address by the offset in %rsi */
+ add %rsi, %rcx /* advance the load address by the same offset */
+
+ cmp $8, %r8
+ ja L(ExitHighCase3) /* limit above 8 (unsigned compare) */
+ cmp $1, %r8
+ je L(StrncatExit1)
+ cmp $2, %r8
+ je L(StrncatExit2)
+ cmp $3, %r8
+ je L(StrncatExit3)
+ cmp $4, %r8
+ je L(StrncatExit4)
+ cmp $5, %r8
+ je L(StrncatExit5)
+ cmp $6, %r8
+ je L(StrncatExit6)
+ cmp $7, %r8
+ je L(StrncatExit7)
+ movlpd (%rcx), %xmm0 /* limit is not above 8 and matched none of 1..7: copy 8 bytes */
+ movlpd %xmm0, (%rdx)
+ xor %ah, %ah
+ movb %ah, 8(%rdx) /* zero byte at offset 8 */
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(ExitHighCase3): /* reached from L(CopyFrom1To16BytesCase3) with the limit above 8 */
+ cmp $9, %r8
+ je L(StrncatExit9)
+ cmp $10, %r8
+ je L(StrncatExit10)
+ cmp $11, %r8
+ je L(StrncatExit11)
+ cmp $12, %r8
+ je L(StrncatExit12)
+ cmp $13, %r8
+ je L(StrncatExit13)
+ cmp $14, %r8
+ je L(StrncatExit14)
+ cmp $15, %r8
+ je L(StrncatExit15)
+ movlpd (%rcx), %xmm0 /* limit matched none of 9..15: copy 16 bytes */
+ movlpd %xmm0, (%rdx)
+ movlpd 8(%rcx), %xmm1
+ movlpd %xmm1, 8(%rdx)
+ xor %ah, %ah
+ movb %ah, 16(%rdx) /* zero byte at offset 16 */
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit0): /* reached when the limit in %r8 is zero: nothing is stored */
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit15Bytes): /* entered with limit 9..15 after source bytes 0..8 were found nonzero */
+ cmp $9, %r8 /* alternate limit checks with zero-byte probes of source bytes 9..13 */
+ je L(StrncatExit9)
+ cmpb $0, 9(%rcx)
+ jz L(Exit10)
+ cmp $10, %r8
+ je L(StrncatExit10)
+ cmpb $0, 10(%rcx)
+ jz L(Exit11)
+ cmp $11, %r8
+ je L(StrncatExit11)
+ cmpb $0, 11(%rcx)
+ jz L(Exit12)
+ cmp $12, %r8
+ je L(StrncatExit12)
+ cmpb $0, 12(%rcx)
+ jz L(Exit13)
+ cmp $13, %r8
+ je L(StrncatExit13)
+ cmpb $0, 13(%rcx)
+ jz L(Exit14)
+ cmp $14, %r8
+ je L(StrncatExit14)
+ movlpd (%rcx), %xmm0 /* nothing above matched: copy 15 bytes with two overlapping 8-byte moves */
+ movlpd %xmm0, (%rdx)
+ movlpd 7(%rcx), %xmm1
+ movlpd %xmm1, 7(%rdx)
+ lea 14(%rdx), %rax
+ cmpb $1, (%rax) /* carry is set only when the byte just stored at offset 14 is zero */
+ sbb $-1, %rax /* %rax += 1 - carry */
+ xor %cl, %cl
+ movb %cl, (%rax) /* zero byte lands on offset 14 if it was already zero, otherwise on offset 15 */
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+ .p2align 4
+L(StrncatExit8Bytes): /* entered with limit 1..8 (zero was branched away before the jump here) */
+ cmpb $0, (%rcx) /* alternate zero-byte probes of source bytes 0..6 with limit checks */
+ jz L(Exit1)
+ cmp $1, %r8
+ je L(StrncatExit1)
+ cmpb $0, 1(%rcx)
+ jz L(Exit2)
+ cmp $2, %r8
+ je L(StrncatExit2)
+ cmpb $0, 2(%rcx)
+ jz L(Exit3)
+ cmp $3, %r8
+ je L(StrncatExit3)
+ cmpb $0, 3(%rcx)
+ jz L(Exit4)
+ cmp $4, %r8
+ je L(StrncatExit4)
+ cmpb $0, 4(%rcx)
+ jz L(Exit5)
+ cmp $5, %r8
+ je L(StrncatExit5)
+ cmpb $0, 5(%rcx)
+ jz L(Exit6)
+ cmp $6, %r8
+ je L(StrncatExit6)
+ cmpb $0, 6(%rcx)
+ jz L(Exit7)
+ cmp $7, %r8
+ je L(StrncatExit7)
+ movlpd (%rcx), %xmm0 /* nothing above matched: copy 8 bytes */
+ movlpd %xmm0, (%rdx)
+ lea 7(%rdx), %rax
+ cmpb $1, (%rax) /* carry is set only when the byte just stored at offset 7 is zero */
+ sbb $-1, %rax /* %rax += 1 - carry */
+ xor %cl, %cl
+ movb %cl, (%rax) /* zero byte lands on offset 7 if it was already zero, otherwise on offset 8 */
+ mov %rdi, %rax /* return value = %rdi */
+ ret
+
+# endif
+END (STRCAT)
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S
new file mode 100644
index 0000000..f3ccc8e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat.S
@@ -0,0 +1,85 @@
+/* Multiple versions of strcat
+ Copyright (C) 2009, 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCAT
+# ifndef STRCAT
+# define STRCAT strcat
+# endif
+#endif
+
+#ifdef USE_AS_STRNCAT
+# define STRCAT_SSSE3 __strncat_ssse3
+# define STRCAT_SSE2 __strncat_sse2
+# define STRCAT_SSE2_UNALIGNED __strncat_sse2_unaligned
+# define __GI_STRCAT __GI_strncat
+# define __GI___STRCAT __GI___strncat
+#else
+# define STRCAT_SSSE3 __strcat_ssse3
+# define STRCAT_SSE2 __strcat_sse2
+# define STRCAT_SSE2_UNALIGNED __strcat_sse2_unaligned
+# define __GI_STRCAT __GI_strcat
+# define __GI___STRCAT __GI___strcat
+#endif
+
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+ .text
+ENTRY(STRCAT)
+ .type STRCAT, @gnu_indirect_function /* resolver: leaves the address of the selected variant in %rax */
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f /* field is nonzero: skip the __init_cpu_features call */
+ call __init_cpu_features
+1: leaq STRCAT_SSE2_UNALIGNED(%rip), %rax /* first candidate: the unaligned SSE2 variant */
+ testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+ jnz 2f /* Fast_Unaligned_Load bit set: keep that candidate */
+ leaq STRCAT_SSE2(%rip), %rax /* otherwise fall back to the SSE2 variant */
+ testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jz 2f /* SSSE3 bit clear: keep the SSE2 variant */
+ leaq STRCAT_SSSE3(%rip), %rax /* SSSE3 bit set: select the SSSE3 variant */
+2: ret
+
+# undef ENTRY /* from here on ENTRY ignores its argument and opens the STRCAT_SSE2 symbol */
+# define ENTRY(name) \
+ .type STRCAT_SSE2, @function; \
+ .align 16; \
+ STRCAT_SSE2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END /* END likewise closes STRCAT_SSE2 whatever name is passed */
+# define END(name) \
+ cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2
+# undef libc_hidden_builtin_def /* redefined below to make __GI_STRCAT an alias of STRCAT_SSE2 */
+/* It doesn't make sense to send libc-internal strcat calls through a PLT.
+ The speedup we get from using SSSE3 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2
+# undef libc_hidden_def /* redefined below to make __GI___STRCAT an alias of STRCAT_SSE2 */
+# define libc_hidden_def(name) \
+ .globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2
+#endif
+
+#ifndef USE_AS_STRNCAT
+# include "../strcat.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
index 9a8d186..6de8c47 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -20,10 +20,13 @@
#ifndef NOT_IN_libc
-# include <sysdep.h>
+# ifndef USE_AS_STRCAT
+# include <sysdep.h>
+
+# ifndef STRCPY
+# define STRCPY __strcpy_sse2_unaligned
+# endif
-# ifndef STRCPY
-# define STRCPY __strcpy_sse2_unaligned
# endif
# define JMPTBL(I, B) I - B
@@ -33,16 +36,20 @@
lea (%r11, %rcx), %rcx; \
jmp *%rcx
- .text
+# ifndef USE_AS_STRCAT
+
+.text
ENTRY (STRCPY)
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
mov %rdx, %r8
test %r8, %r8
jz L(ExitZero)
-# endif
+# endif
mov %rsi, %rcx
-# ifndef USE_AS_STPCPY
+# ifndef USE_AS_STPCPY
mov %rdi, %rax /* save result */
+# endif
+
# endif
and $15, %rcx
@@ -59,7 +66,7 @@ ENTRY (STRCPY)
pmovmskb %xmm1, %rdx
shr %cl, %rdx
# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
cmp $16, %r8
# else
cmp $17, %r8
@@ -72,7 +79,7 @@ ENTRY (STRCPY)
pcmpeqb 16(%rsi), %xmm0
pmovmskb %xmm0, %rdx
# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
cmp $32, %r8
# else
cmp $33, %r8
@@ -102,7 +109,7 @@ L(Unalign16Both):
jbe L(CopyFrom1To16BytesCase2OrCase3)
# endif
test %rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyFrom1To16BytesUnalignedXmm2)
# else
jnz L(CopyFrom1To16Bytes)
@@ -118,7 +125,7 @@ L(Unalign16Both):
jbe L(CopyFrom1To16BytesCase2OrCase3)
# endif
test %rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyFrom1To16BytesUnalignedXmm3)
# else
jnz L(CopyFrom1To16Bytes)
@@ -134,7 +141,7 @@ L(Unalign16Both):
jbe L(CopyFrom1To16BytesCase2OrCase3)
# endif
test %rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyFrom1To16BytesUnalignedXmm4)
# else
jnz L(CopyFrom1To16Bytes)
@@ -150,7 +157,7 @@ L(Unalign16Both):
jbe L(CopyFrom1To16BytesCase2OrCase3)
# endif
test %rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyFrom1To16BytesUnalignedXmm1)
# else
jnz L(CopyFrom1To16Bytes)
@@ -166,7 +173,7 @@ L(Unalign16Both):
jbe L(CopyFrom1To16BytesCase2OrCase3)
# endif
test %rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyFrom1To16BytesUnalignedXmm2)
# else
jnz L(CopyFrom1To16Bytes)
@@ -182,7 +189,7 @@ L(Unalign16Both):
jbe L(CopyFrom1To16BytesCase2OrCase3)
# endif
test %rdx, %rdx
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
jnz L(CopyFrom1To16BytesUnalignedXmm3)
# else
jnz L(CopyFrom1To16Bytes)
@@ -264,10 +271,10 @@ L(Unaligned64Leave):
movdqu %xmm4, (%rdi)
movdqu %xmm5, 16(%rdi)
movdqu %xmm6, 32(%rdi)
-# if defined USE_AS_STRNCPY
-# ifdef USE_AS_STPCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
lea 48(%rdi, %rdx), %rax
-# endif
+# endif
movdqu %xmm7, 48(%rdi)
add $15, %r8
sub %rdx, %r8
@@ -288,7 +295,7 @@ L(SourceStringAlignmentZero):
pmovmskb %xmm0, %rdx
# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
cmp $16, %r8
# else
cmp $17, %r8
@@ -303,7 +310,7 @@ L(SourceStringAlignmentZero):
pmovmskb %xmm0, %rdx
# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
cmp $32, %r8
# else
cmp $33, %r8
@@ -314,11 +321,11 @@ L(SourceStringAlignmentZero):
jnz L(CopyFrom1To32Bytes1)
jmp L(Unalign16Both)
-/* ------End of main part with loops--------------------- */
+/*------End of main part with loops---------------------*/
/* Case1 */
-# if (!defined USE_AS_STRNCPY)
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
.p2align 4
L(CopyFrom1To16Bytes):
add %rcx, %rdi
@@ -328,7 +335,7 @@ L(CopyFrom1To16Bytes):
# endif
.p2align 4
L(CopyFrom1To16BytesTail):
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub %rcx, %r8
# endif
add %rcx, %rsi
@@ -339,7 +346,7 @@ L(CopyFrom1To16BytesTail):
L(CopyFrom1To32Bytes1):
add $16, %rsi
add $16, %rdi
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $16, %r8
# endif
L(CopyFrom1To16BytesTail1):
@@ -348,7 +355,7 @@ L(CopyFrom1To16BytesTail1):
.p2align 4
L(CopyFrom1To32Bytes):
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub %rcx, %r8
# endif
bsf %rdx, %rdx
@@ -360,10 +367,10 @@ L(CopyFrom1To32Bytes):
.p2align 4
L(CopyFrom1To16BytesUnaligned_0):
bsf %rdx, %rdx
-# if defined USE_AS_STRNCPY
-# ifdef USE_AS_STPCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
lea (%rdi, %rdx), %rax
-# endif
+# endif
movdqu %xmm4, (%rdi)
add $63, %r8
sub %rdx, %r8
@@ -377,10 +384,10 @@ L(CopyFrom1To16BytesUnaligned_0):
L(CopyFrom1To16BytesUnaligned_16):
bsf %rcx, %rdx
movdqu %xmm4, (%rdi)
-# if defined USE_AS_STRNCPY
-# ifdef USE_AS_STPCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
lea 16(%rdi, %rdx), %rax
-# endif
+# endif
movdqu %xmm5, 16(%rdi)
add $47, %r8
sub %rdx, %r8
@@ -397,10 +404,10 @@ L(CopyFrom1To16BytesUnaligned_32):
bsf %rdx, %rdx
movdqu %xmm4, (%rdi)
movdqu %xmm5, 16(%rdi)
-# if defined USE_AS_STRNCPY
-# ifdef USE_AS_STPCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
lea 32(%rdi, %rdx), %rax
-# endif
+# endif
movdqu %xmm6, 32(%rdi)
add $31, %r8
sub %rdx, %r8
@@ -413,6 +420,7 @@ L(CopyFrom1To16BytesUnaligned_32):
# endif
# ifdef USE_AS_STRNCPY
+# ifndef USE_AS_STRCAT
.p2align 4
L(CopyFrom1To16BytesUnalignedXmm6):
movdqu %xmm6, (%rdi, %rcx)
@@ -437,6 +445,7 @@ L(CopyFrom1To16BytesUnalignedXmm3):
L(CopyFrom1To16BytesUnalignedXmm1):
movdqu %xmm1, (%rdi, %rcx)
jmp L(CopyFrom1To16BytesXmmExit)
+# endif
.p2align 4
L(CopyFrom1To16BytesExit):
@@ -519,7 +528,7 @@ L(CopyFrom1To16BytesTail1Case2OrCase3):
# endif
-/* ----End labels regarding with copying 1-16 bytes--and 1-32 bytes---- */
+/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/
.p2align 4
L(Exit1):
@@ -527,7 +536,7 @@ L(Exit1):
# ifdef USE_AS_STPCPY
lea (%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $1, %r8
lea 1(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -541,7 +550,7 @@ L(Exit2):
# ifdef USE_AS_STPCPY
lea 1(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $2, %r8
lea 2(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -556,7 +565,7 @@ L(Exit3):
# ifdef USE_AS_STPCPY
lea 2(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $3, %r8
lea 3(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -570,7 +579,7 @@ L(Exit4):
# ifdef USE_AS_STPCPY
lea 3(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $4, %r8
lea 4(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -585,7 +594,7 @@ L(Exit5):
# ifdef USE_AS_STPCPY
lea 4(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $5, %r8
lea 5(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -601,7 +610,7 @@ L(Exit6):
# ifdef USE_AS_STPCPY
lea 5(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $6, %r8
lea 6(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -617,7 +626,7 @@ L(Exit7):
# ifdef USE_AS_STPCPY
lea 6(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $7, %r8
lea 7(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -631,7 +640,7 @@ L(Exit8):
# ifdef USE_AS_STPCPY
lea 7(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $8, %r8
lea 8(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -646,7 +655,7 @@ L(Exit9):
# ifdef USE_AS_STPCPY
lea 8(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $9, %r8
lea 9(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -662,7 +671,7 @@ L(Exit10):
# ifdef USE_AS_STPCPY
lea 9(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $10, %r8
lea 10(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -678,7 +687,7 @@ L(Exit11):
# ifdef USE_AS_STPCPY
lea 10(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $11, %r8
lea 11(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -694,7 +703,7 @@ L(Exit12):
# ifdef USE_AS_STPCPY
lea 11(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $12, %r8
lea 12(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -710,7 +719,7 @@ L(Exit13):
# ifdef USE_AS_STPCPY
lea 12(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $13, %r8
lea 13(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -726,7 +735,7 @@ L(Exit14):
# ifdef USE_AS_STPCPY
lea 13(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $14, %r8
lea 14(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -742,7 +751,7 @@ L(Exit15):
# ifdef USE_AS_STPCPY
lea 14(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $15, %r8
lea 15(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -756,7 +765,7 @@ L(Exit16):
# ifdef USE_AS_STPCPY
lea 15(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $16, %r8
lea 16(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -771,7 +780,7 @@ L(Exit17):
# ifdef USE_AS_STPCPY
lea 16(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $17, %r8
lea 17(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -787,7 +796,7 @@ L(Exit18):
# ifdef USE_AS_STPCPY
lea 17(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $18, %r8
lea 18(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -803,7 +812,7 @@ L(Exit19):
# ifdef USE_AS_STPCPY
lea 18(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $19, %r8
lea 19(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -819,7 +828,7 @@ L(Exit20):
# ifdef USE_AS_STPCPY
lea 19(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $20, %r8
lea 20(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -836,7 +845,7 @@ L(Exit21):
# ifdef USE_AS_STPCPY
lea 20(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $21, %r8
lea 21(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -852,7 +861,7 @@ L(Exit22):
# ifdef USE_AS_STPCPY
lea 21(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $22, %r8
lea 22(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -868,7 +877,7 @@ L(Exit23):
# ifdef USE_AS_STPCPY
lea 22(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $23, %r8
lea 23(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -884,7 +893,7 @@ L(Exit24):
# ifdef USE_AS_STPCPY
lea 23(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $24, %r8
lea 24(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -901,7 +910,7 @@ L(Exit25):
# ifdef USE_AS_STPCPY
lea 24(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $25, %r8
lea 25(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -919,7 +928,7 @@ L(Exit26):
# ifdef USE_AS_STPCPY
lea 25(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $26, %r8
lea 26(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -937,7 +946,7 @@ L(Exit27):
# ifdef USE_AS_STPCPY
lea 26(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $27, %r8
lea 27(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -955,7 +964,7 @@ L(Exit28):
# ifdef USE_AS_STPCPY
lea 27(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $28, %r8
lea 28(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -971,7 +980,7 @@ L(Exit29):
# ifdef USE_AS_STPCPY
lea 28(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $29, %r8
lea 29(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -987,7 +996,7 @@ L(Exit30):
# ifdef USE_AS_STPCPY
lea 29(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $30, %r8
lea 30(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -1003,7 +1012,7 @@ L(Exit31):
# ifdef USE_AS_STPCPY
lea 30(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $31, %r8
lea 31(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -1019,7 +1028,7 @@ L(Exit32):
# ifdef USE_AS_STPCPY
lea 31(%rdi), %rax
# endif
-# if defined USE_AS_STRNCPY
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
sub $32, %r8
lea 32(%rdi), %rdi
jnz L(StrncpyFillTailWithZero)
@@ -1030,27 +1039,39 @@ L(Exit32):
.p2align 4
L(StrncpyExit0):
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
mov %rdi, %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, (%rdi)
+# endif
ret
.p2align 4
L(StrncpyExit1):
mov (%rsi), %dl
mov %dl, (%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 1(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 1(%rdi)
+# endif
ret
.p2align 4
L(StrncpyExit2):
mov (%rsi), %dx
mov %dx, (%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 2(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 2(%rdi)
+# endif
ret
.p2align 4
@@ -1059,18 +1080,26 @@ L(StrncpyExit3):
mov 2(%rsi), %dl
mov %cx, (%rdi)
mov %dl, 2(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 3(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 3(%rdi)
+# endif
ret
.p2align 4
L(StrncpyExit4):
mov (%rsi), %edx
mov %edx, (%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 4(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 4(%rdi)
+# endif
ret
.p2align 4
@@ -1079,9 +1108,13 @@ L(StrncpyExit5):
mov 4(%rsi), %dl
mov %ecx, (%rdi)
mov %dl, 4(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 5(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 5(%rdi)
+# endif
ret
.p2align 4
@@ -1090,9 +1123,13 @@ L(StrncpyExit6):
mov 4(%rsi), %dx
mov %ecx, (%rdi)
mov %dx, 4(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 6(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 6(%rdi)
+# endif
ret
.p2align 4
@@ -1101,18 +1138,26 @@ L(StrncpyExit7):
mov 3(%rsi), %edx
mov %ecx, (%rdi)
mov %edx, 3(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 7(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 7(%rdi)
+# endif
ret
.p2align 4
L(StrncpyExit8):
mov (%rsi), %rdx
mov %rdx, (%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 8(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 8(%rdi)
+# endif
ret
.p2align 4
@@ -1121,9 +1166,13 @@ L(StrncpyExit9):
mov 8(%rsi), %dl
mov %rcx, (%rdi)
mov %dl, 8(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 9(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 9(%rdi)
+# endif
ret
.p2align 4
@@ -1132,9 +1181,13 @@ L(StrncpyExit10):
mov 8(%rsi), %dx
mov %rcx, (%rdi)
mov %dx, 8(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 10(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 10(%rdi)
+# endif
ret
.p2align 4
@@ -1143,9 +1196,13 @@ L(StrncpyExit11):
mov 7(%rsi), %edx
mov %rcx, (%rdi)
mov %edx, 7(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 11(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 11(%rdi)
+# endif
ret
.p2align 4
@@ -1154,9 +1211,13 @@ L(StrncpyExit12):
mov 8(%rsi), %edx
mov %rcx, (%rdi)
mov %edx, 8(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 12(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 12(%rdi)
+# endif
ret
.p2align 4
@@ -1165,9 +1226,13 @@ L(StrncpyExit13):
mov 5(%rsi), %rdx
mov %rcx, (%rdi)
mov %rdx, 5(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 13(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 13(%rdi)
+# endif
ret
.p2align 4
@@ -1176,9 +1241,13 @@ L(StrncpyExit14):
mov 6(%rsi), %rdx
mov %rcx, (%rdi)
mov %rdx, 6(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 14(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 14(%rdi)
+# endif
ret
.p2align 4
@@ -1187,18 +1256,26 @@ L(StrncpyExit15):
mov 7(%rsi), %rdx
mov %rcx, (%rdi)
mov %rdx, 7(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 15(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 15(%rdi)
+# endif
ret
.p2align 4
L(StrncpyExit16):
movdqu (%rsi), %xmm0
movdqu %xmm0, (%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 16(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 16(%rdi)
+# endif
ret
.p2align 4
@@ -1207,9 +1284,13 @@ L(StrncpyExit17):
mov 16(%rsi), %cl
movdqu %xmm0, (%rdi)
mov %cl, 16(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 17(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 17(%rdi)
+# endif
ret
.p2align 4
@@ -1218,9 +1299,13 @@ L(StrncpyExit18):
mov 16(%rsi), %cx
movdqu %xmm0, (%rdi)
mov %cx, 16(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 18(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 18(%rdi)
+# endif
ret
.p2align 4
@@ -1229,9 +1314,13 @@ L(StrncpyExit19):
mov 15(%rsi), %ecx
movdqu %xmm0, (%rdi)
mov %ecx, 15(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 19(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 19(%rdi)
+# endif
ret
.p2align 4
@@ -1240,9 +1329,13 @@ L(StrncpyExit20):
mov 16(%rsi), %ecx
movdqu %xmm0, (%rdi)
mov %ecx, 16(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 20(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 20(%rdi)
+# endif
ret
.p2align 4
@@ -1253,9 +1346,13 @@ L(StrncpyExit21):
movdqu %xmm0, (%rdi)
mov %ecx, 16(%rdi)
mov %dl, 20(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 21(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 21(%rdi)
+# endif
ret
.p2align 4
@@ -1264,9 +1361,13 @@ L(StrncpyExit22):
mov 14(%rsi), %rcx
movdqu %xmm0, (%rdi)
mov %rcx, 14(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 22(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 22(%rdi)
+# endif
ret
.p2align 4
@@ -1275,9 +1376,13 @@ L(StrncpyExit23):
mov 15(%rsi), %rcx
movdqu %xmm0, (%rdi)
mov %rcx, 15(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 23(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 23(%rdi)
+# endif
ret
.p2align 4
@@ -1286,9 +1391,13 @@ L(StrncpyExit24):
mov 16(%rsi), %rcx
movdqu %xmm0, (%rdi)
mov %rcx, 16(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 24(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 24(%rdi)
+# endif
ret
.p2align 4
@@ -1299,9 +1408,13 @@ L(StrncpyExit25):
movdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %cl, 24(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 25(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 25(%rdi)
+# endif
ret
.p2align 4
@@ -1312,9 +1425,13 @@ L(StrncpyExit26):
movdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %cx, 24(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 26(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 26(%rdi)
+# endif
ret
.p2align 4
@@ -1325,9 +1442,13 @@ L(StrncpyExit27):
movdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %ecx, 23(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 27(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 27(%rdi)
+# endif
ret
.p2align 4
@@ -1338,9 +1459,13 @@ L(StrncpyExit28):
movdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %ecx, 24(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 28(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 28(%rdi)
+# endif
ret
.p2align 4
@@ -1349,9 +1474,13 @@ L(StrncpyExit29):
movdqu 13(%rsi), %xmm2
movdqu %xmm0, (%rdi)
movdqu %xmm2, 13(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 29(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 29(%rdi)
+# endif
ret
.p2align 4
@@ -1360,9 +1489,13 @@ L(StrncpyExit30):
movdqu 14(%rsi), %xmm2
movdqu %xmm0, (%rdi)
movdqu %xmm2, 14(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 30(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 30(%rdi)
+# endif
ret
.p2align 4
@@ -1371,9 +1504,13 @@ L(StrncpyExit31):
movdqu 15(%rsi), %xmm2
movdqu %xmm0, (%rdi)
movdqu %xmm2, 15(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 31(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 31(%rdi)
+# endif
ret
.p2align 4
@@ -1382,9 +1519,13 @@ L(StrncpyExit32):
movdqu 16(%rsi), %xmm2
movdqu %xmm0, (%rdi)
movdqu %xmm2, 16(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 32(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 32(%rdi)
+# endif
ret
.p2align 4
@@ -1395,8 +1536,14 @@ L(StrncpyExit33):
movdqu %xmm0, (%rdi)
movdqu %xmm2, 16(%rdi)
mov %cl, 32(%rdi)
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 33(%rdi)
+# endif
ret
+# ifndef USE_AS_STRCAT
+
.p2align 4
L(Fill0):
ret
@@ -1498,9 +1645,9 @@ L(CopyFrom1To16BytesXmmExit):
bsf %rdx, %rdx
add $15, %r8
add %rcx, %rdi
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea (%rdi, %rdx), %rax
-# endif
+# endif
sub %rdx, %r8
lea 1(%rdi, %rdx), %rdi
@@ -1553,6 +1700,9 @@ L(StrncpyFillExit):
add $16, %r8
BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+/* end of ifndef USE_AS_STRCAT */
+# endif
+
.p2align 4
L(UnalignedLeaveCase2OrCase3):
test %rdx, %rdx
@@ -1572,9 +1722,13 @@ L(Unaligned64LeaveCase3):
sub $16, %r8
jb L(CopyFrom1To16BytesCase3)
movdqu %xmm7, 48(%rdi)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 64(%rdi), %rax
-# endif
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 64(%rdi)
+# endif
ret
.p2align 4
@@ -1585,8 +1739,11 @@ L(Unaligned64LeaveCase2):
add $48, %r8
jle L(CopyFrom1To16BytesCase2OrCase3)
test %rdx, %rdx
+# ifndef USE_AS_STRCAT
jnz L(CopyFrom1To16BytesUnalignedXmm4)
-
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
pcmpeqb %xmm5, %xmm0
pmovmskb %xmm0, %rdx
movdqu %xmm4, (%rdi)
@@ -1594,7 +1751,11 @@ L(Unaligned64LeaveCase2):
sub $16, %r8
jbe L(CopyFrom1To16BytesCase2OrCase3)
test %rdx, %rdx
+# ifndef USE_AS_STRCAT
jnz L(CopyFrom1To16BytesUnalignedXmm5)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
pcmpeqb %xmm6, %xmm0
pmovmskb %xmm0, %rdx
@@ -1603,7 +1764,11 @@ L(Unaligned64LeaveCase2):
sub $16, %r8
jbe L(CopyFrom1To16BytesCase2OrCase3)
test %rdx, %rdx
+# ifndef USE_AS_STRCAT
jnz L(CopyFrom1To16BytesUnalignedXmm6)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
pcmpeqb %xmm7, %xmm0
pmovmskb %xmm0, %rdx
@@ -1617,13 +1782,18 @@ L(Unaligned64LeaveCase2):
.p2align 4
L(ExitZero):
+# ifndef USE_AS_STRCAT
mov %rdi, %rax
+# endif
ret
# endif
+# ifndef USE_AS_STRCAT
END (STRCPY)
-
+# else
+END (STRCAT)
+# endif
.p2align 4
.section .rodata
L(ExitTable):
@@ -1695,6 +1865,7 @@ L(ExitStrncpyTable):
.int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
.int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+# ifndef USE_AS_STRCAT
.p2align 4
L(FillTable):
.int JMPTBL(L(Fill0), L(FillTable))
@@ -1714,5 +1885,7 @@ L(FillTable):
.int JMPTBL(L(Fill14), L(FillTable))
.int JMPTBL(L(Fill15), L(FillTable))
.int JMPTBL(L(Fill16), L(FillTable))
+# endif
# endif
#endif
+
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
index efbd3bf..05faf0d 100644
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -20,25 +20,26 @@
#ifndef NOT_IN_libc
-# include <sysdep.h>
+# ifndef USE_AS_STRCAT
+# include <sysdep.h>
-# ifndef STRCPY
-# define STRCPY __strcpy_ssse3
-# endif
+# ifndef STRCPY
+# define STRCPY __strcpy_ssse3
+# endif
.section .text.ssse3,"ax",@progbits
ENTRY (STRCPY)
mov %rsi, %rcx
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
mov %rdx, %r8
-# endif
+# endif
mov %rdi, %rdx
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
test %r8, %r8
jz L(Exit0)
cmp $8, %r8
jbe L(StrncpyExit8Bytes)
-# endif
+# endif
cmpb $0, (%rcx)
jz L(Exit1)
cmpb $0, 1(%rcx)
@@ -55,10 +56,10 @@ ENTRY (STRCPY)
jz L(Exit7)
cmpb $0, 7(%rcx)
jz L(Exit8)
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
cmp $16, %r8
jb L(StrncpyExit15Bytes)
-# endif
+# endif
cmpb $0, 8(%rcx)
jz L(Exit9)
cmpb $0, 9(%rcx)
@@ -73,12 +74,13 @@ ENTRY (STRCPY)
jz L(Exit14)
cmpb $0, 14(%rcx)
jz L(Exit15)
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
cmp $16, %r8
je L(Exit16)
-# endif
+# endif
cmpb $0, 15(%rcx)
jz L(Exit16)
+# endif
# ifdef USE_AS_STRNCPY
mov %rcx, %rsi
@@ -2180,12 +2182,12 @@ L(Shl15LoopExit):
jmp L(CopyFrom1To16Bytes)
# endif
-
+# ifndef USE_AS_STRCAT
.p2align 4
L(CopyFrom1To16Bytes):
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
add $16, %r8
-# endif
+# endif
add %rsi, %rdx
add %rsi, %rcx
@@ -2210,20 +2212,20 @@ L(CopyFrom1To16Bytes):
L(Exit8):
mov (%rcx), %rax
mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 7(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $8, %r8
lea 8(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
@@ -2249,23 +2251,23 @@ L(Exit16):
mov %rax, (%rdx)
mov 8(%rcx), %rax
mov %rax, 8(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 15(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $16, %r8
lea 16(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
.p2align 4
L(CopyFrom1To16BytesCase2):
@@ -2381,46 +2383,46 @@ L(Less12Case3): /* but more than 8 */
jl L(Exit9)
je L(Exit10)
jg L(Exit11)
-# endif
+# endif
.p2align 4
L(Exit1):
movb (%rcx), %al
movb %al, (%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea (%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $1, %r8
lea 1(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
L(Exit2):
movw (%rcx), %ax
movw %ax, (%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 1(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $2, %r8
lea 2(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
@@ -2429,40 +2431,40 @@ L(Exit3):
movw %ax, (%rdx)
movb 2(%rcx), %al
movb %al, 2(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 2(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $3, %r8
lea 3(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
L(Exit4):
movl (%rcx), %eax
movl %eax, (%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 3(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $4, %r8
lea 4(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
@@ -2471,20 +2473,20 @@ L(Exit5):
movl %eax, (%rdx)
movb 4(%rcx), %al
movb %al, 4(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 4(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $5, %r8
lea 5(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
@@ -2493,20 +2495,20 @@ L(Exit6):
movl %eax, (%rdx)
movw 4(%rcx), %ax
movw %ax, 4(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 5(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $6, %r8
lea 6(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
@@ -2515,20 +2517,20 @@ L(Exit7):
movl %eax, (%rdx)
movl 3(%rcx), %eax
movl %eax, 3(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 6(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $7, %r8
lea 7(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
@@ -2537,20 +2539,20 @@ L(Exit9):
mov %rax, (%rdx)
mov 5(%rcx), %eax
mov %eax, 5(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 8(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $9, %r8
lea 9(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
@@ -2559,20 +2561,20 @@ L(Exit10):
mov %rax, (%rdx)
mov 6(%rcx), %eax
mov %eax, 6(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 9(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $10, %r8
lea 10(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
@@ -2581,20 +2583,20 @@ L(Exit11):
mov %rax, (%rdx)
mov 7(%rcx), %eax
mov %eax, 7(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 10(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $11, %r8
lea 11(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
@@ -2603,20 +2605,20 @@ L(Exit12):
mov %rax, (%rdx)
mov 8(%rcx), %eax
mov %eax, 8(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 11(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $12, %r8
lea 12(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
@@ -2625,20 +2627,20 @@ L(Exit13):
mov %rax, (%rdx)
mov 5(%rcx), %rax
mov %rax, 5(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 12(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $13, %r8
lea 13(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
@@ -2647,20 +2649,20 @@ L(Exit14):
mov %rax, (%rdx)
mov 6(%rcx), %rax
mov %rax, 6(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 13(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $14, %r8
lea 14(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
.p2align 4
@@ -2669,23 +2671,23 @@ L(Exit15):
mov %rax, (%rdx)
mov 7(%rcx), %rax
mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 14(%rdx), %rax
-# else
+# else
mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
+# endif
+# ifdef USE_AS_STRNCPY
sub $15, %r8
lea 15(%rdx), %rcx
jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
-# endif
+# endif
+# endif
ret
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
.p2align 4
L(Fill0):
ret
@@ -2902,13 +2904,13 @@ L(StrncpyExit15Bytes):
mov %rax, (%rdx)
mov 7(%rcx), %rax
mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 14(%rdx), %rax
cmpb $1, (%rax)
sbb $-1, %rax
-# else
+# else
mov %rdi, %rax
-# endif
+# endif
ret
.p2align 4
@@ -2943,15 +2945,17 @@ L(StrncpyExit8Bytes):
jz L(Exit7)
mov (%rcx), %rax
mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
+# ifdef USE_AS_STPCPY
lea 7(%rdx), %rax
cmpb $1, (%rax)
sbb $-1, %rax
-# else
+# else
mov %rdi, %rax
-# endif
+# endif
ret
+# endif
+
# endif
# ifdef USE_AS_STRNCPY
@@ -3715,7 +3719,7 @@ L(StrncpyExit15):
lea 1(%rsi), %rsi
jmp L(CopyFrom1To16BytesCase3)
# endif
-
+# ifndef USE_AS_STRCAT
END (STRCPY)
-
+# endif
#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-no-bsf.S
index 3e52f81..c730e0a 100644
--- a/sysdeps/x86_64/multiarch/strlen-no-bsf.S
+++ b/sysdeps/x86_64/multiarch/strlen-no-bsf.S
@@ -1,5 +1,5 @@
-/* strlen without BSF
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* strlen SSE2 without bsf
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -18,12 +18,17 @@
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
-#if defined SHARED && !defined NOT_IN_libc
+#if (defined SHARED || defined USE_AS_STRCAT) && !defined NOT_IN_libc
-#include <sysdep.h>
+# ifndef USE_AS_STRCAT
- .section .text.slow,"ax",@progbits
+# include <sysdep.h>
+
+# define RETURN ret
+
+ .section .text.sse2,"ax",@progbits
ENTRY (__strlen_no_bsf)
+# endif
xor %eax, %eax
cmpb $0, (%rdi)
jz L(exit_tail0)
@@ -165,39 +170,37 @@ ENTRY (__strlen_no_bsf)
jnz L(exit)
and $-0x40, %rax
- xor %r8d, %r8d
L(aligned_64):
pcmpeqb (%rax), %xmm0
pcmpeqb 16(%rax), %xmm1
pcmpeqb 32(%rax), %xmm2
pcmpeqb 48(%rax), %xmm3
pmovmskb %xmm0, %edx
- pmovmskb %xmm1, %esi
- pmovmskb %xmm2, %edi
+ pmovmskb %xmm1, %r11d
+ pmovmskb %xmm2, %r10d
pmovmskb %xmm3, %r9d
- or %edx, %r8d
- or %esi, %r8d
- or %edi, %r8d
- or %r9d, %r8d
+ or %edx, %r9d
+ or %r11d, %r9d
+ or %r10d, %r9d
lea 64(%rax), %rax
jz L(aligned_64)
test %edx, %edx
jnz L(aligned_64_exit_16)
- test %esi, %esi
+ test %r11d, %r11d
jnz L(aligned_64_exit_32)
- test %edi, %edi
+ test %r10d, %r10d
jnz L(aligned_64_exit_48)
L(aligned_64_exit_64):
- mov %r9d, %edx
+ pmovmskb %xmm3, %edx
jmp L(aligned_64_exit)
L(aligned_64_exit_48):
lea -16(%rax), %rax
- mov %edi, %edx
+ mov %r10d, %edx
jmp L(aligned_64_exit)
L(aligned_64_exit_32):
lea -32(%rax), %rax
- mov %esi, %edx
+ mov %r11d, %edx
jmp L(aligned_64_exit)
L(aligned_64_exit_16):
lea -48(%rax), %rax
@@ -228,7 +231,7 @@ L(exit):
jnz L(exit_tail6)
add $7, %eax
L(exit_tail0):
- ret
+ RETURN
L(exit_high):
add $8, %eax
@@ -253,57 +256,58 @@ L(exit_high):
test $0x40, %dh
jnz L(exit_tail6)
add $7, %eax
- ret
+ RETURN
.p2align 4
L(exit_tail1):
add $1, %eax
- ret
+ RETURN
L(exit_tail2):
add $2, %eax
- ret
+ RETURN
L(exit_tail3):
add $3, %eax
- ret
+ RETURN
L(exit_tail4):
add $4, %eax
- ret
+ RETURN
L(exit_tail5):
add $5, %eax
- ret
+ RETURN
L(exit_tail6):
add $6, %eax
- ret
+ RETURN
L(exit_tail7):
add $7, %eax
- ret
+ RETURN
L(exit_tail8):
add $8, %eax
- ret
+ RETURN
L(exit_tail9):
add $9, %eax
- ret
+ RETURN
L(exit_tail10):
add $10, %eax
- ret
+ RETURN
L(exit_tail11):
add $11, %eax
- ret
+ RETURN
L(exit_tail12):
add $12, %eax
- ret
+ RETURN
L(exit_tail13):
add $13, %eax
- ret
+ RETURN
L(exit_tail14):
add $14, %eax
- ret
+ RETURN
L(exit_tail15):
add $15, %eax
- ret
+# ifndef USE_AS_STRCAT
+ RETURN
END (__strlen_no_bsf)
-
+# endif
#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
new file mode 100644
index 0000000..57778cf
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
@@ -0,0 +1,260 @@
+/* strlen SSE2
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#if !defined NOT_IN_libc && (defined SHARED || defined USE_AS_STRCAT)
+
+# ifndef USE_AS_STRCAT
+
+# include <sysdep.h>
+
+# define RETURN ret
+
+ .section .text.sse2,"ax",@progbits
+ENTRY (__strlen_sse2_pminub)
+
+# endif
+ xor %rax, %rax
+ mov %edi, %ecx
+ and $0x3f, %ecx
+ pxor %xmm0, %xmm0
+ cmp $0x30, %ecx
+ ja L(next)
+ movdqu (%rdi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit_less16)
+ mov %rdi, %rax
+ and $-16, %rax
+ jmp L(align16_start)
+L(next):
+ mov %rdi, %rax
+ and $-16, %rax
+ pcmpeqb (%rax), %xmm0
+ mov $-1, %r10d
+ sub %rax, %rcx
+ shl %cl, %r10d
+ pmovmskb %xmm0, %edx
+ and %r10d, %edx
+ jnz L(exit)
+L(align16_start):
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pcmpeqb 16(%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $80, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%rax), %xmm1
+ add $16, %rax
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%rax), %xmm2
+ add $16, %rax
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%rax), %xmm3
+ add $16, %rax
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ add $16, %rax
+ .p2align 4
+ L(align64_loop):
+ movaps (%rax), %xmm4
+ pminub 16(%rax), %xmm4
+ movaps 32(%rax), %xmm5
+ pminub 48(%rax), %xmm5
+ add $64, %rax
+ pminub %xmm4, %xmm5
+ pcmpeqb %xmm0, %xmm5
+ pmovmskb %xmm5, %edx
+ test %edx, %edx
+ jz L(align64_loop)
+
+
+ pcmpeqb -64(%rax), %xmm0
+ sub $80, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $64, %rax
+ RETURN
+
+ .p2align 4
+L(exit):
+ sub %rdi, %rax
+L(exit_less16):
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ RETURN
+ .p2align 4
+L(exit16):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $16, %rax
+ RETURN
+ .p2align 4
+L(exit32):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $32, %rax
+ RETURN
+ .p2align 4
+L(exit48):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $48, %rax
+ RETURN
+ .p2align 4
+L(exit64):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $64, %rax
+# ifndef USE_AS_STRCAT
+ RETURN
+
+END (__strlen_sse2_pminub)
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 83a88ec..d789707 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -32,7 +32,10 @@ ENTRY(strlen)
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
-1: leaq __strlen_sse2(%rip), %rax
+1: leaq __strlen_sse2_pminub(%rip), %rax
+ testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
+ jnz 2f
+ leaq __strlen_sse2(%rip), %rax
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
jz 2f
leaq __strlen_sse42(%rip), %rax
diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c
new file mode 100644
index 0000000..a3cdbff
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-c.c
@@ -0,0 +1,8 @@
+#define STRNCAT __strncat_sse2
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name) \
+ __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2);
+#endif
+
+#include "string/strncat.c"
diff --git a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
new file mode 100644
index 0000000..133e1d2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_sse2_unaligned
+#include "strcat-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
new file mode 100644
index 0000000..6c45ff3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCAT
+#define STRCAT __strncat_ssse3
+#include "strcat-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strncat.S b/sysdeps/x86_64/multiarch/strncat.S
new file mode 100644
index 0000000..fd569c2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncat.S
@@ -0,0 +1,3 @@
+#define STRCAT strncat
+#define USE_AS_STRNCAT
+#include "strcat.S"