author     Ondřej Bílka <neleai@seznam.cz>    2015-06-17 15:32:54 +0200
committer  Ondřej Bílka <neleai@seznam.cz>    2015-06-20 08:32:10 +0200
commit     d0731dac4e35206d4cd7a512e357ef66353b3581 (patch)
tree       37ba6a1128fb33b77600ae1aaf7d1012777e8a20
parent     c10b9b13f7471b08273effc8cd7e51b119df9348 (diff)
new sse2 and avx2 strcpy and stpcpy
-rw-r--r--  math/Makefile                                      |    2
-rw-r--r--  sysdeps/x86_64/multiarch/Makefile                  |    2
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-impl-list.c         |    2
-rw-r--r--  sysdeps/x86_64/multiarch/stpcpy-avx2.S             |    3
-rw-r--r--  sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S   |  439
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S  |    3
-rw-r--r--  sysdeps/x86_64/multiarch/stpncpy.S                 |    5
-rw-r--r--  sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S   |    2
-rw-r--r--  sysdeps/x86_64/multiarch/strchrnul_avx2.S          |    3
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-avx2.S             |    4
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S   | 1890
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy.S                  |   22
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S  | 1891
-rw-r--r--  sysdeps/x86_64/multiarch/strncpy.S                 |   88
14 files changed, 2435 insertions(+), 1921 deletions(-)
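The new implementation drops the per-length jump tables of the old SSE2 code: unless the source string starts in the last 128 bytes of a page, it finds the terminating NUL with wide pcmpeqb/pmovmskb compares (vpcmpeqb/vpmovmskb in the AVX2 loop) and then copies everything up to and including the NUL with overlapping unaligned loads and stores. A minimal C sketch of that scheme, for orientation only — stpcpy_sketch and its cutoff are illustrative, not code from this patch:

  #include <string.h>

  /* Scalar stand-in for the vectorized scan-then-overlapping-copy idea.  */
  static char *
  stpcpy_sketch (char *d, const char *s)
  {
    size_t len = strlen (s);          /* asm: pcmpeqb/pmovmskb + bsf */
    if (len >= 7)
      {
        memcpy (d, s, 8);                         /* head             */
        memcpy (d + len - 7, s + len - 7, 8);     /* tail, incl. NUL  */
      }
    else
      memcpy (d, s, len + 1);                     /* short strings    */
    return d + len;                               /* stpcpy result    */
  }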
diff --git a/math/Makefile b/math/Makefile
index 7f6b85e..143fa47 100644
--- a/math/Makefile
+++ b/math/Makefile
@@ -115,7 +115,7 @@ tests-static = test-fpucw-static test-fpucw-ieee-static
test-longdouble-yes = test-ldouble test-ildoubl
ifneq (no,$(PERL))
-libm-vec-tests = $(addprefix test-,$(libmvec-tests))
+#libm-vec-tests = $(addprefix test-,$(libmvec-tests))
libm-tests = test-float test-double $(test-longdouble-$(long-double-fcts)) \
test-ifloat test-idouble $(libm-vec-tests)
libm-tests.o = $(addsuffix .o,$(libm-tests))
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d7002a9..c573744 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -29,7 +29,7 @@ CFLAGS-strspn-c.c += -msse4
endif
ifeq (yes,$(config-cflags-avx2))
-sysdep_routines += memset-avx2
+sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2
endif
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index b64e4f1..d398e43 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -88,6 +88,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpcpy.S. */
IFUNC_IMPL (i, name, stpcpy,
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_AVX2, __stpcpy_avx2)
IFUNC_IMPL_ADD (array, i, stpcpy, HAS_SSSE3, __stpcpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
@@ -137,6 +138,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcpy.S. */
IFUNC_IMPL (i, name, strcpy,
+ IFUNC_IMPL_ADD (array, i, strcpy, HAS_AVX2, __strcpy_avx2)
IFUNC_IMPL_ADD (array, i, strcpy, HAS_SSSE3, __strcpy_ssse3)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2.S b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
new file mode 100644
index 0000000..bd30ef6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AVX2
+#define STPCPY __stpcpy_avx2
+#include "stpcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
index 34231f8..695a236 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
@@ -1,3 +1,436 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_sse2_unaligned
-#include "strcpy-sse2-unaligned.S"
+/* stpcpy with SSE2 and unaligned load
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#ifndef STPCPY
+# define STPCPY __stpcpy_sse2_unaligned
+#endif
+
+ENTRY(STPCPY)
+ mov %esi, %edx
+#ifdef AS_STRCPY
+ movq %rdi, %rax
+#endif
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
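+	/* If the source starts in the last 128 bytes of a 4 KiB page,
+	   the unaligned loads below (which read up to 128 bytes ahead)
+	   could fault on the next page, so take the cross-page path.  */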
+ andl $4095, %edx
+ cmp $3968, %edx
+ ja L(cross_page)
+
+ movdqu (%rsi), %xmm0
+ pcmpeqb %xmm0, %xmm4
+ pmovmskb %xmm4, %edx
+ testl %edx, %edx
+ je L(more16bytes)
+ bsf %edx, %ecx
+#ifndef AS_STRCPY
+ lea (%rdi, %rcx), %rax
+#endif
+ cmp $7, %ecx
+ movq (%rsi), %rdx
+ jb L(less_8_bytesb)
+L(8bytes_from_cross):
+ movq -7(%rsi, %rcx), %rsi
+ movq %rdx, (%rdi)
+#ifdef AS_STRCPY
+ movq %rsi, -7(%rdi, %rcx)
+#else
+ movq %rsi, -7(%rax)
+#endif
+ ret
+
+ .p2align 4
+L(less_8_bytesb):
+ cmp $2, %ecx
+ jbe L(less_4_bytes)
+L(4bytes_from_cross):
+ mov -3(%rsi, %rcx), %esi
+ mov %edx, (%rdi)
+#ifdef AS_STRCPY
+ mov %esi, -3(%rdi, %rcx)
+#else
+ mov %esi, -3(%rax)
+#endif
+ ret
+
+	.p2align 4
+L(less_4_bytes):
+	/* TODO: benchmark a branching version against this branchless
+	   sequence, which handles a terminating NUL at index i = 0, 1 or 2:
+	     d[i] = 0;
+	     d[i/2] = s[1];
+	     d[0] = s[0];
+	 */
+#ifdef AS_STRCPY
+ movb $0, (%rdi, %rcx)
+#endif
+
+ shr $1, %ecx
+ mov %edx, %esi
+ shr $8, %edx
+ movb %dl, (%rdi, %rcx)
+#ifndef AS_STRCPY
+ movb $0, (%rax)
+#endif
+ movb %sil, (%rdi)
+ ret
+
+ .p2align 4
+L(more16bytes):
+ pxor %xmm6, %xmm6
+ movdqu 16(%rsi), %xmm1
+ pxor %xmm7, %xmm7
+ pcmpeqb %xmm1, %xmm5
+ pmovmskb %xmm5, %edx
+ testl %edx, %edx
+ je L(more32bytes)
+ bsf %edx, %edx
+#ifdef AS_STRCPY
+ movdqu 1(%rsi, %rdx), %xmm1
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 1(%rdi, %rdx)
+#else
+ lea 16(%rdi, %rdx), %rax
+ movdqu 1(%rsi, %rdx), %xmm1
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, -15(%rax)
+#endif
+ ret
+
+ .p2align 4
+L(more32bytes):
+ movdqu 32(%rsi), %xmm2
+ movdqu 48(%rsi), %xmm3
+
+ pcmpeqb %xmm2, %xmm6
+ pcmpeqb %xmm3, %xmm7
+ pmovmskb %xmm7, %edx
+ shl $16, %edx
+ pmovmskb %xmm6, %ecx
+ or %ecx, %edx
+ je L(more64bytes)
+ bsf %edx, %edx
+#ifndef AS_STRCPY
+ lea 32(%rdi, %rdx), %rax
+#endif
+ movdqu 1(%rsi, %rdx), %xmm2
+ movdqu 17(%rsi, %rdx), %xmm3
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+#ifdef AS_STRCPY
+ movdqu %xmm2, 1(%rdi, %rdx)
+ movdqu %xmm3, 17(%rdi, %rdx)
+#else
+ movdqu %xmm2, -31(%rax)
+ movdqu %xmm3, -15(%rax)
+#endif
+ ret
+
+ .p2align 4
+L(more64bytes):
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, 32(%rdi)
+ movdqu %xmm3, 48(%rdi)
+ movdqu 64(%rsi), %xmm0
+ movdqu 80(%rsi), %xmm1
+ movdqu 96(%rsi), %xmm2
+ movdqu 112(%rsi), %xmm3
+
+ pcmpeqb %xmm0, %xmm4
+ pcmpeqb %xmm1, %xmm5
+ pcmpeqb %xmm2, %xmm6
+ pcmpeqb %xmm3, %xmm7
+ pmovmskb %xmm4, %ecx
+ pmovmskb %xmm5, %edx
+ pmovmskb %xmm6, %r8d
+ pmovmskb %xmm7, %r9d
+ shl $16, %edx
+ or %ecx, %edx
+ shl $32, %r8
+ shl $48, %r9
+ or %r8, %rdx
+ or %r9, %rdx
+ test %rdx, %rdx
+ je L(prepare_loop)
+ bsf %rdx, %rdx
+#ifndef AS_STRCPY
+ lea 64(%rdi, %rdx), %rax
+#endif
+ movdqu 1(%rsi, %rdx), %xmm0
+ movdqu 17(%rsi, %rdx), %xmm1
+ movdqu 33(%rsi, %rdx), %xmm2
+ movdqu 49(%rsi, %rdx), %xmm3
+#ifdef AS_STRCPY
+ movdqu %xmm0, 1(%rdi, %rdx)
+ movdqu %xmm1, 17(%rdi, %rdx)
+ movdqu %xmm2, 33(%rdi, %rdx)
+ movdqu %xmm3, 49(%rdi, %rdx)
+#else
+ movdqu %xmm0, -63(%rax)
+ movdqu %xmm1, -47(%rax)
+ movdqu %xmm2, -31(%rax)
+ movdqu %xmm3, -15(%rax)
+#endif
+ ret
+
+
+ .p2align 4
+L(prepare_loop):
+ movdqu %xmm0, 64(%rdi)
+ movdqu %xmm1, 80(%rdi)
+ movdqu %xmm2, 96(%rdi)
+ movdqu %xmm3, 112(%rdi)
+
+ subq %rsi, %rdi
+ add $64, %rsi
+ andq $-64, %rsi
+ addq %rsi, %rdi
+ jmp L(loop_entry)
+
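+	/* Main loop: the source is now 64-byte aligned.  Each iteration
+	   loads 64 bytes and folds them with (v)pminub into a single
+	   vector that has a zero byte iff any loaded byte was NUL; a
+	   block is stored only after it has been checked, and the block
+	   containing the terminator is handled by the tail code.  */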
+#ifdef USE_AVX2
+ .p2align 4
+L(loop):
+ vmovdqu %ymm1, (%rdi)
+ vmovdqu %ymm3, 32(%rdi)
+L(loop_entry):
+ vmovdqa 96(%rsi), %ymm3
+ vmovdqa 64(%rsi), %ymm1
+ vpminub %ymm3, %ymm1, %ymm2
+ addq $64, %rsi
+ addq $64, %rdi
+ vpcmpeqb %ymm5, %ymm2, %ymm0
+ vpmovmskb %ymm0, %edx
+ test %edx, %edx
+ je L(loop)
+ salq $32, %rdx
+ vpcmpeqb %ymm5, %ymm1, %ymm4
+ vpmovmskb %ymm4, %ecx
+ or %rcx, %rdx
+ bsfq %rdx, %rdx
+#ifndef AS_STRCPY
+ lea (%rdi, %rdx), %rax
+#endif
+ vmovdqu -63(%rsi, %rdx), %ymm0
+ vmovdqu -31(%rsi, %rdx), %ymm2
+#ifdef AS_STRCPY
+ vmovdqu %ymm0, -63(%rdi, %rdx)
+ vmovdqu %ymm2, -31(%rdi, %rdx)
+#else
+ vmovdqu %ymm0, -63(%rax)
+ vmovdqu %ymm2, -31(%rax)
+#endif
+ vzeroupper
+ ret
+#else
+ .p2align 4
+L(loop):
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ movdqu %xmm3, 32(%rdi)
+ movdqu %xmm4, 48(%rdi)
+L(loop_entry):
+ movdqa 96(%rsi), %xmm3
+ movdqa 112(%rsi), %xmm4
+ movdqa %xmm3, %xmm0
+ movdqa 80(%rsi), %xmm2
+ pminub %xmm4, %xmm0
+ movdqa 64(%rsi), %xmm1
+ pminub %xmm2, %xmm0
+ pminub %xmm1, %xmm0
+ addq $64, %rsi
+ addq $64, %rdi
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ je L(loop)
+ salq $48, %rdx
+ pcmpeqb %xmm1, %xmm5
+ pcmpeqb %xmm2, %xmm6
+ pmovmskb %xmm5, %ecx
+#ifdef AS_STRCPY
+ pmovmskb %xmm6, %r8d
+ pcmpeqb %xmm3, %xmm7
+ pmovmskb %xmm7, %r9d
+ sal $16, %r8d
+ or %r8d, %ecx
+#else
+ pmovmskb %xmm6, %eax
+ pcmpeqb %xmm3, %xmm7
+ pmovmskb %xmm7, %r9d
+ sal $16, %eax
+ or %eax, %ecx
+#endif
+ salq $32, %r9
+ orq %rcx, %rdx
+ orq %r9, %rdx
+ bsfq %rdx, %rdx
+#ifndef AS_STRCPY
+ lea (%rdi, %rdx), %rax
+#endif
+ movdqu -63(%rsi, %rdx), %xmm0
+ movdqu -47(%rsi, %rdx), %xmm1
+ movdqu -31(%rsi, %rdx), %xmm2
+ movdqu -15(%rsi, %rdx), %xmm3
+#ifdef AS_STRCPY
+ movdqu %xmm0, -63(%rdi, %rdx)
+ movdqu %xmm1, -47(%rdi, %rdx)
+ movdqu %xmm2, -31(%rdi, %rdx)
+ movdqu %xmm3, -15(%rdi, %rdx)
+#else
+ movdqu %xmm0, -63(%rax)
+ movdqu %xmm1, -47(%rax)
+ movdqu %xmm2, -31(%rax)
+ movdqu %xmm3, -15(%rax)
+#endif
+ ret
+#endif
+
+ .p2align 4
+L(cross_page):
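+	/* Near a page boundary: align the source down to 16 bytes so the
+	   first load cannot fault, then use shr/shl by the misalignment
+	   to discard NUL matches that precede the real start of the
+	   string.  */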
+ movq %rsi, %rcx
+ pxor %xmm0, %xmm0
+ and $15, %ecx
+ movq %rsi, %r9
+ movq %rdi, %r10
+ subq %rcx, %rsi
+ subq %rcx, %rdi
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ shr %cl, %edx
+ shl %cl, %edx
+ test %edx, %edx
+ jne L(less_32_cross)
+
+ addq $16, %rsi
+ addq $16, %rdi
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jne L(less_32_cross)
+ movdqu %xmm1, (%rdi)
+
+ movdqu (%r9), %xmm0
+ movdqu %xmm0, (%r10)
+
+ mov $8, %rcx
+L(cross_loop):
+ addq $16, %rsi
+ addq $16, %rdi
+ pxor %xmm0, %xmm0
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jne L(return_cross)
+ movdqu %xmm1, (%rdi)
+ sub $1, %rcx
+ ja L(cross_loop)
+
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+
+ lea -64(%rsi), %rdx
+ andq $-64, %rdx
+ addq %rdx, %rdi
+ subq %rsi, %rdi
+ movq %rdx, %rsi
+ jmp L(loop_entry)
+
+ .p2align 4
+L(return_cross):
+ bsf %edx, %edx
+#ifdef AS_STRCPY
+ movdqu -15(%rsi, %rdx), %xmm0
+ movdqu %xmm0, -15(%rdi, %rdx)
+#else
+ lea (%rdi, %rdx), %rax
+ movdqu -15(%rsi, %rdx), %xmm0
+ movdqu %xmm0, -15(%rax)
+#endif
+ ret
+
+ .p2align 4
+L(less_32_cross):
+ bsf %rdx, %rdx
+ lea (%rdi, %rdx), %rcx
+#ifndef AS_STRCPY
+ mov %rcx, %rax
+#endif
+ mov %r9, %rsi
+ mov %r10, %rdi
+ sub %rdi, %rcx
+ cmp $15, %ecx
+ jb L(less_16_cross)
+ movdqu (%rsi), %xmm0
+ movdqu -15(%rsi, %rcx), %xmm1
+ movdqu %xmm0, (%rdi)
+#ifdef AS_STRCPY
+ movdqu %xmm1, -15(%rdi, %rcx)
+#else
+ movdqu %xmm1, -15(%rax)
+#endif
+ ret
+
+L(less_16_cross):
+ cmp $7, %ecx
+ jb L(less_8_bytes_cross)
+ movq (%rsi), %rdx
+ jmp L(8bytes_from_cross)
+
+L(less_8_bytes_cross):
+ cmp $2, %ecx
+ jbe L(3_bytes_cross)
+ mov (%rsi), %edx
+ jmp L(4bytes_from_cross)
+
+L(3_bytes_cross):
+ jb L(1_2bytes_cross)
+ movzwl (%rsi), %edx
+ jmp L(_3_bytesb)
+
+L(1_2bytes_cross):
+ movb (%rsi), %dl
+ jmp L(0_2bytes_from_cross)
+
+ .p2align 4
+L(less_4_bytesb):
+ je L(_3_bytesb)
+L(0_2bytes_from_cross):
+ movb %dl, (%rdi)
+#ifdef AS_STRCPY
+ movb $0, (%rdi, %rcx)
+#else
+ movb $0, (%rax)
+#endif
+ ret
+
+ .p2align 4
+L(_3_bytesb):
+ movw %dx, (%rdi)
+ movb $0, 2(%rdi)
+ ret
+
+END(STPCPY)
diff --git a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
index 658520f..3f35068 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
@@ -1,4 +1,3 @@
#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
#define STRCPY __stpncpy_sse2_unaligned
-#include "strcpy-sse2-unaligned.S"
+#include "strncpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S
index 2698ca6..159604a 100644
--- a/sysdeps/x86_64/multiarch/stpncpy.S
+++ b/sysdeps/x86_64/multiarch/stpncpy.S
@@ -1,8 +1,7 @@
/* Multiple versions of stpncpy
All versions must be listed in ifunc-impl-list.c. */
-#define STRCPY __stpncpy
+#define STRNCPY __stpncpy
#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#include "strcpy.S"
+#include "strncpy.S"
weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
index 81f1b40..1faa49d 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -275,5 +275,5 @@ L(StartStrcpyPart):
# define USE_AS_STRNCPY
# endif
-# include "strcpy-sse2-unaligned.S"
+# include "strncpy-sse2-unaligned.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/strchrnul_avx2.S b/sysdeps/x86_64/multiarch/strchrnul_avx2.S
new file mode 100644
index 0000000..4dcb981
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul_avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRCHRNUL
+#define __strchr_avx2 __strchrnul_avx2
+#include "strchr_avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
new file mode 100644
index 0000000..a3133a4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -0,0 +1,4 @@
+#define USE_AVX2
+#define AS_STRCPY
+#define STPCPY __strcpy_avx2
+#include "stpcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
index 8f03d1d..310e4fa 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -1,1887 +1,3 @@
-/* strcpy with SSE2 and unaligned load
- Copyright (C) 2011-2015 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_sse2_unaligned
-# endif
-
-# endif
-
-# define JMPTBL(I, B) I - B
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), %rcx; \
- lea (%r11, %rcx), %rcx; \
- jmp *%rcx
-
-# ifndef USE_AS_STRCAT
-
-.text
-ENTRY (STRCPY)
-# ifdef USE_AS_STRNCPY
- mov %rdx, %r8
- test %r8, %r8
- jz L(ExitZero)
-# endif
- mov %rsi, %rcx
-# ifndef USE_AS_STPCPY
- mov %rdi, %rax /* save result */
-# endif
-
-# endif
-
- and $63, %rcx
- cmp $32, %rcx
- jbe L(SourceStringAlignmentLess32)
-
- and $-16, %rsi
- and $15, %rcx
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
-
- pcmpeqb (%rsi), %xmm1
- pmovmskb %xmm1, %rdx
- shr %cl, %rdx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- mov $16, %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# else
- mov $17, %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# endif
- jbe L(CopyFrom1To16BytesTailCase2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesTail)
-
- pcmpeqb 16(%rsi), %xmm0
- pmovmskb %xmm0, %rdx
-
-# ifdef USE_AS_STRNCPY
- add $16, %r10
- cmp %r10, %r8
- jbe L(CopyFrom1To32BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(CopyFrom1To32Bytes)
-
- movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */
- movdqu %xmm1, (%rdi)
-
-/* If source address alignment != destination address alignment */
- .p2align 4
-L(Unalign16Both):
- sub %rcx, %rdi
-# ifdef USE_AS_STRNCPY
- add %rcx, %r8
-# endif
- mov $16, %rcx
- movdqa (%rsi, %rcx), %xmm1
- movaps 16(%rsi, %rcx), %xmm2
- movdqu %xmm1, (%rdi, %rcx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $48, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm2)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm3
- movdqu %xmm2, (%rdi, %rcx)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm3)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm4
- movdqu %xmm3, (%rdi, %rcx)
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm4)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm1
- movdqu %xmm4, (%rdi, %rcx)
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm1)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm2
- movdqu %xmm1, (%rdi, %rcx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm2)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movaps 16(%rsi, %rcx), %xmm3
- movdqu %xmm2, (%rdi, %rcx)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rdx
- add $16, %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm3)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- movdqu %xmm3, (%rdi, %rcx)
- mov %rsi, %rdx
- lea 16(%rsi, %rcx), %rsi
- and $-0x40, %rsi
- sub %rsi, %rdx
- sub %rdx, %rdi
-# ifdef USE_AS_STRNCPY
- lea 128(%r8, %rdx), %r8
-# endif
-L(Unaligned64Loop):
- movaps (%rsi), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rsi), %xmm5
- movaps 32(%rsi), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rsi), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rdx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(UnalignedLeaveCase2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(Unaligned64Leave)
-
-L(Unaligned64Loop_start):
- add $64, %rdi
- add $64, %rsi
- movdqu %xmm4, -64(%rdi)
- movaps (%rsi), %xmm2
- movdqa %xmm2, %xmm4
- movdqu %xmm5, -48(%rdi)
- movaps 16(%rsi), %xmm5
- pminub %xmm5, %xmm2
- movaps 32(%rsi), %xmm3
- movdqu %xmm6, -32(%rdi)
- movaps %xmm3, %xmm6
- movdqu %xmm7, -16(%rdi)
- movaps 48(%rsi), %xmm7
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rdx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(UnalignedLeaveCase2OrCase3)
-# endif
- test %rdx, %rdx
- jz L(Unaligned64Loop_start)
-
-L(Unaligned64Leave):
- pxor %xmm1, %xmm1
-
- pcmpeqb %xmm4, %xmm0
- pcmpeqb %xmm5, %xmm1
- pmovmskb %xmm0, %rdx
- pmovmskb %xmm1, %rcx
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesUnaligned_0)
- test %rcx, %rcx
- jnz L(CopyFrom1To16BytesUnaligned_16)
-
- pcmpeqb %xmm6, %xmm0
- pcmpeqb %xmm7, %xmm1
- pmovmskb %xmm0, %rdx
- pmovmskb %xmm1, %rcx
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesUnaligned_32)
-
- bsf %rcx, %rdx
- movdqu %xmm4, (%rdi)
- movdqu %xmm5, 16(%rdi)
- movdqu %xmm6, 32(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea 48(%rdi, %rdx), %rax
-# endif
- movdqu %xmm7, 48(%rdi)
- add $15, %r8
- sub %rdx, %r8
- lea 49(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $48, %rsi
- add $48, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
-
-/* If source address alignment == destination address alignment */
-
-L(SourceStringAlignmentLess32):
- pxor %xmm0, %xmm0
- movdqu (%rsi), %xmm1
- movdqu 16(%rsi), %xmm2
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rdx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- cmp $16, %r8
-# else
- cmp $17, %r8
-# endif
- jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesTail1)
-
- pcmpeqb %xmm2, %xmm0
- movdqu %xmm1, (%rdi)
- pmovmskb %xmm0, %rdx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- cmp $32, %r8
-# else
- cmp $33, %r8
-# endif
- jbe L(CopyFrom1To32Bytes1Case2OrCase3)
-# endif
- test %rdx, %rdx
- jnz L(CopyFrom1To32Bytes1)
-
- and $-16, %rsi
- and $15, %rcx
- jmp L(Unalign16Both)
-
-/*------End of main part with loops---------------------*/
-
-/* Case1 */
-
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
- .p2align 4
-L(CopyFrom1To16Bytes):
- add %rcx, %rdi
- add %rcx, %rsi
- bsf %rdx, %rdx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
- .p2align 4
-L(CopyFrom1To16BytesTail):
- add %rcx, %rsi
- bsf %rdx, %rdx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-
- .p2align 4
-L(CopyFrom1To32Bytes1):
- add $16, %rsi
- add $16, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $16, %r8
-# endif
-L(CopyFrom1To16BytesTail1):
- bsf %rdx, %rdx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-
- .p2align 4
-L(CopyFrom1To32Bytes):
- bsf %rdx, %rdx
- add %rcx, %rsi
- add $16, %rdx
- sub %rcx, %rdx
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-
- .p2align 4
-L(CopyFrom1To16BytesUnaligned_0):
- bsf %rdx, %rdx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
- movdqu %xmm4, (%rdi)
- add $63, %r8
- sub %rdx, %r8
- lea 1(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
-
- .p2align 4
-L(CopyFrom1To16BytesUnaligned_16):
- bsf %rcx, %rdx
- movdqu %xmm4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea 16(%rdi, %rdx), %rax
-# endif
- movdqu %xmm5, 16(%rdi)
- add $47, %r8
- sub %rdx, %r8
- lea 17(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $16, %rsi
- add $16, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
-
- .p2align 4
-L(CopyFrom1To16BytesUnaligned_32):
- bsf %rdx, %rdx
- movdqu %xmm4, (%rdi)
- movdqu %xmm5, 16(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea 32(%rdi, %rdx), %rax
-# endif
- movdqu %xmm6, 32(%rdi)
- add $31, %r8
- sub %rdx, %r8
- lea 33(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $32, %rsi
- add $32, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-# endif
-
-# ifdef USE_AS_STRNCPY
-# ifndef USE_AS_STRCAT
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm6):
- movdqu %xmm6, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm5):
- movdqu %xmm5, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm4):
- movdqu %xmm4, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm3):
- movdqu %xmm3, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm1):
- movdqu %xmm1, (%rdi, %rcx)
- jmp L(CopyFrom1To16BytesXmmExit)
-# endif
-
- .p2align 4
-L(CopyFrom1To16BytesExit):
- BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
-
-/* Case2 */
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rcx, %rdi
- add %rcx, %rsi
- bsf %rdx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(CopyFrom1To32BytesCase2):
- add %rcx, %rsi
- bsf %rdx, %rdx
- add $16, %rdx
- sub %rcx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
-L(CopyFrom1To16BytesTailCase2):
- add %rcx, %rsi
- bsf %rdx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
-L(CopyFrom1To16BytesTail1Case2):
- bsf %rdx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
-/* Case2 or Case3, Case3 */
-
- .p2align 4
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesCase2)
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rcx, %rdi
- add %rcx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(CopyFrom1To32BytesCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyFrom1To32BytesCase2)
- add %rcx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(CopyFrom1To16BytesTailCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesTailCase2)
- add %rcx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(CopyFrom1To32Bytes1Case2OrCase3):
- add $16, %rdi
- add $16, %rsi
- sub $16, %r8
-L(CopyFrom1To16BytesTail1Case2OrCase3):
- test %rdx, %rdx
- jnz L(CopyFrom1To16BytesTail1Case2)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
-# endif
-
-/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/
-
- .p2align 4
-L(Exit1):
- mov %dh, (%rdi)
-# ifdef USE_AS_STPCPY
- lea (%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $1, %r8
- lea 1(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit2):
- mov (%rsi), %dx
- mov %dx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $2, %r8
- lea 2(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit3):
- mov (%rsi), %cx
- mov %cx, (%rdi)
- mov %dh, 2(%rdi)
-# ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $3, %r8
- lea 3(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit4):
- mov (%rsi), %edx
- mov %edx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $4, %r8
- lea 4(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit5):
- mov (%rsi), %ecx
- mov %dh, 4(%rdi)
- mov %ecx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 4(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $5, %r8
- lea 5(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit6):
- mov (%rsi), %ecx
- mov 4(%rsi), %dx
- mov %ecx, (%rdi)
- mov %dx, 4(%rdi)
-# ifdef USE_AS_STPCPY
- lea 5(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $6, %r8
- lea 6(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit7):
- mov (%rsi), %ecx
- mov 3(%rsi), %edx
- mov %ecx, (%rdi)
- mov %edx, 3(%rdi)
-# ifdef USE_AS_STPCPY
- lea 6(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $7, %r8
- lea 7(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit8):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 7(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $8, %r8
- lea 8(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit9):
- mov (%rsi), %rcx
- mov %dh, 8(%rdi)
- mov %rcx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 8(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $9, %r8
- lea 9(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit10):
- mov (%rsi), %rcx
- mov 8(%rsi), %dx
- mov %rcx, (%rdi)
- mov %dx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 9(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $10, %r8
- lea 10(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit11):
- mov (%rsi), %rcx
- mov 7(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 10(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $11, %r8
- lea 11(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rsi), %rcx
- mov 8(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 11(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $12, %r8
- lea 12(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit13):
- mov (%rsi), %rcx
- mov 5(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 5(%rdi)
-# ifdef USE_AS_STPCPY
- lea 12(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $13, %r8
- lea 13(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit14):
- mov (%rsi), %rcx
- mov 6(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 6(%rdi)
-# ifdef USE_AS_STPCPY
- lea 13(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $14, %r8
- lea 14(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit15):
- mov (%rsi), %rcx
- mov 7(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 14(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $15, %r8
- lea 15(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit16):
- movdqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 15(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $16, %r8
- lea 16(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit17):
- movdqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
- mov %dh, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 16(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $17, %r8
- lea 17(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit18):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %cx
- movdqu %xmm0, (%rdi)
- mov %cx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 17(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $18, %r8
- lea 18(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit19):
- movdqu (%rsi), %xmm0
- mov 15(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 18(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $19, %r8
- lea 19(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit20):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 19(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $20, %r8
- lea 20(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit21):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
- mov %dh, 20(%rdi)
-# ifdef USE_AS_STPCPY
- lea 20(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $21, %r8
- lea 21(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit22):
- movdqu (%rsi), %xmm0
- mov 14(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 21(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $22, %r8
- lea 22(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit23):
- movdqu (%rsi), %xmm0
- mov 15(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 22(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $23, %r8
- lea 23(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit24):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 23(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $24, %r8
- lea 24(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit25):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 16(%rdi)
- mov %dh, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 24(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $25, %r8
- lea 25(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit26):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %cx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %cx, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 25(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $26, %r8
- lea 26(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit27):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 23(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 23(%rdi)
-# ifdef USE_AS_STPCPY
- lea 26(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $27, %r8
- lea 27(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit28):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 27(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $28, %r8
- lea 28(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit29):
- movdqu (%rsi), %xmm0
- movdqu 13(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 13(%rdi)
-# ifdef USE_AS_STPCPY
- lea 28(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $29, %r8
- lea 29(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit30):
- movdqu (%rsi), %xmm0
- movdqu 14(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 29(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $30, %r8
- lea 30(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit31):
- movdqu (%rsi), %xmm0
- movdqu 15(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 30(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $31, %r8
- lea 31(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
- .p2align 4
-L(Exit32):
- movdqu (%rsi), %xmm0
- movdqu 16(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 31(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $32, %r8
- lea 32(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(StrncpyExit0):
-# ifdef USE_AS_STPCPY
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, (%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit1):
- mov (%rsi), %dl
- mov %dl, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 1(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit2):
- mov (%rsi), %dx
- mov %dx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 2(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit3):
- mov (%rsi), %cx
- mov 2(%rsi), %dl
- mov %cx, (%rdi)
- mov %dl, 2(%rdi)
-# ifdef USE_AS_STPCPY
- lea 3(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 3(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit4):
- mov (%rsi), %edx
- mov %edx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 4(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 4(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit5):
- mov (%rsi), %ecx
- mov 4(%rsi), %dl
- mov %ecx, (%rdi)
- mov %dl, 4(%rdi)
-# ifdef USE_AS_STPCPY
- lea 5(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 5(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit6):
- mov (%rsi), %ecx
- mov 4(%rsi), %dx
- mov %ecx, (%rdi)
- mov %dx, 4(%rdi)
-# ifdef USE_AS_STPCPY
- lea 6(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 6(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit7):
- mov (%rsi), %ecx
- mov 3(%rsi), %edx
- mov %ecx, (%rdi)
- mov %edx, 3(%rdi)
-# ifdef USE_AS_STPCPY
- lea 7(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 7(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit8):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 8(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 8(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit9):
- mov (%rsi), %rcx
- mov 8(%rsi), %dl
- mov %rcx, (%rdi)
- mov %dl, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 9(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 9(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit10):
- mov (%rsi), %rcx
- mov 8(%rsi), %dx
- mov %rcx, (%rdi)
- mov %dx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 10(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 10(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit11):
- mov (%rsi), %rcx
- mov 7(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 11(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 11(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit12):
- mov (%rsi), %rcx
- mov 8(%rsi), %edx
- mov %rcx, (%rdi)
- mov %edx, 8(%rdi)
-# ifdef USE_AS_STPCPY
- lea 12(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 12(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit13):
- mov (%rsi), %rcx
- mov 5(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 5(%rdi)
-# ifdef USE_AS_STPCPY
- lea 13(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 13(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit14):
- mov (%rsi), %rcx
- mov 6(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 6(%rdi)
-# ifdef USE_AS_STPCPY
- lea 14(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 14(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit15):
- mov (%rsi), %rcx
- mov 7(%rsi), %rdx
- mov %rcx, (%rdi)
- mov %rdx, 7(%rdi)
-# ifdef USE_AS_STPCPY
- lea 15(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 15(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit16):
- movdqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 16(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 16(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit17):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %cl
- movdqu %xmm0, (%rdi)
- mov %cl, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 17(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 17(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit18):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %cx
- movdqu %xmm0, (%rdi)
- mov %cx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 18(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 18(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit19):
- movdqu (%rsi), %xmm0
- mov 15(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 19(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 19(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit20):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 20(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 20(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit21):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %ecx
- mov 20(%rsi), %dl
- movdqu %xmm0, (%rdi)
- mov %ecx, 16(%rdi)
- mov %dl, 20(%rdi)
-# ifdef USE_AS_STPCPY
- lea 21(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 21(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit22):
- movdqu (%rsi), %xmm0
- mov 14(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 22(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 22(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit23):
- movdqu (%rsi), %xmm0
- mov 15(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 23(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 23(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit24):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rcx
- movdqu %xmm0, (%rdi)
- mov %rcx, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 24(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 24(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit25):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %cl
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %cl, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 25(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 25(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit26):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %cx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %cx, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 26(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 26(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit27):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 23(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 23(%rdi)
-# ifdef USE_AS_STPCPY
- lea 27(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 27(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit28):
- movdqu (%rsi), %xmm0
- mov 16(%rsi), %rdx
- mov 24(%rsi), %ecx
- movdqu %xmm0, (%rdi)
- mov %rdx, 16(%rdi)
- mov %ecx, 24(%rdi)
-# ifdef USE_AS_STPCPY
- lea 28(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 28(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit29):
- movdqu (%rsi), %xmm0
- movdqu 13(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 13(%rdi)
-# ifdef USE_AS_STPCPY
- lea 29(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 29(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit30):
- movdqu (%rsi), %xmm0
- movdqu 14(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 14(%rdi)
-# ifdef USE_AS_STPCPY
- lea 30(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 30(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit31):
- movdqu (%rsi), %xmm0
- movdqu 15(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 15(%rdi)
-# ifdef USE_AS_STPCPY
- lea 31(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 31(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit32):
- movdqu (%rsi), %xmm0
- movdqu 16(%rsi), %xmm2
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 16(%rdi)
-# ifdef USE_AS_STPCPY
- lea 32(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 32(%rdi)
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit33):
- movdqu (%rsi), %xmm0
- movdqu 16(%rsi), %xmm2
- mov 32(%rsi), %cl
- movdqu %xmm0, (%rdi)
- movdqu %xmm2, 16(%rdi)
- mov %cl, 32(%rdi)
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 33(%rdi)
-# endif
- ret
-
-# ifndef USE_AS_STRCAT
-
- .p2align 4
-L(Fill0):
- ret
-
- .p2align 4
-L(Fill1):
- mov %dl, (%rdi)
- ret
-
- .p2align 4
-L(Fill2):
- mov %dx, (%rdi)
- ret
-
- .p2align 4
-L(Fill3):
- mov %edx, -1(%rdi)
- ret
-
- .p2align 4
-L(Fill4):
- mov %edx, (%rdi)
- ret
-
- .p2align 4
-L(Fill5):
- mov %edx, (%rdi)
- mov %dl, 4(%rdi)
- ret
-
- .p2align 4
-L(Fill6):
- mov %edx, (%rdi)
- mov %dx, 4(%rdi)
- ret
-
- .p2align 4
-L(Fill7):
- mov %rdx, -1(%rdi)
- ret
-
- .p2align 4
-L(Fill8):
- mov %rdx, (%rdi)
- ret
-
- .p2align 4
-L(Fill9):
- mov %rdx, (%rdi)
- mov %dl, 8(%rdi)
- ret
-
- .p2align 4
-L(Fill10):
- mov %rdx, (%rdi)
- mov %dx, 8(%rdi)
- ret
-
- .p2align 4
-L(Fill11):
- mov %rdx, (%rdi)
- mov %edx, 7(%rdi)
- ret
-
- .p2align 4
-L(Fill12):
- mov %rdx, (%rdi)
- mov %edx, 8(%rdi)
- ret
-
- .p2align 4
-L(Fill13):
- mov %rdx, (%rdi)
- mov %rdx, 5(%rdi)
- ret
-
- .p2align 4
-L(Fill14):
- mov %rdx, (%rdi)
- mov %rdx, 6(%rdi)
- ret
-
- .p2align 4
-L(Fill15):
- movdqu %xmm0, -1(%rdi)
- ret
-
- .p2align 4
-L(Fill16):
- movdqu %xmm0, (%rdi)
- ret
-
- .p2align 4
-L(CopyFrom1To16BytesUnalignedXmm2):
- movdqu %xmm2, (%rdi, %rcx)
-
- .p2align 4
-L(CopyFrom1To16BytesXmmExit):
- bsf %rdx, %rdx
- add $15, %r8
- add %rcx, %rdi
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
- sub %rdx, %r8
- lea 1(%rdi, %rdx), %rdi
-
- .p2align 4
-L(StrncpyFillTailWithZero):
- pxor %xmm0, %xmm0
- xor %rdx, %rdx
- sub $16, %r8
- jbe L(StrncpyFillExit)
-
- movdqu %xmm0, (%rdi)
- add $16, %rdi
-
- mov %rdi, %rsi
- and $0xf, %rsi
- sub %rsi, %rdi
- add %rsi, %r8
- sub $64, %r8
- jb L(StrncpyFillLess64)
-
-L(StrncpyFillLoopMovdqa):
- movdqa %xmm0, (%rdi)
- movdqa %xmm0, 16(%rdi)
- movdqa %xmm0, 32(%rdi)
- movdqa %xmm0, 48(%rdi)
- add $64, %rdi
- sub $64, %r8
- jae L(StrncpyFillLoopMovdqa)
-
-L(StrncpyFillLess64):
- add $32, %r8
- jl L(StrncpyFillLess32)
- movdqa %xmm0, (%rdi)
- movdqa %xmm0, 16(%rdi)
- add $32, %rdi
- sub $16, %r8
- jl L(StrncpyFillExit)
- movdqa %xmm0, (%rdi)
- add $16, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
-
-L(StrncpyFillLess32):
- add $16, %r8
- jl L(StrncpyFillExit)
- movdqa %xmm0, (%rdi)
- add $16, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
-
-L(StrncpyFillExit):
- add $16, %r8
- BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
-
-/* end of ifndef USE_AS_STRCAT */
-# endif
-
- .p2align 4
-L(UnalignedLeaveCase2OrCase3):
- test %rdx, %rdx
- jnz L(Unaligned64LeaveCase2)
-L(Unaligned64LeaveCase3):
- lea 64(%r8), %rcx
- and $-16, %rcx
- add $48, %r8
- jl L(CopyFrom1To16BytesCase3)
- movdqu %xmm4, (%rdi)
- sub $16, %r8
- jb L(CopyFrom1To16BytesCase3)
- movdqu %xmm5, 16(%rdi)
- sub $16, %r8
- jb L(CopyFrom1To16BytesCase3)
- movdqu %xmm6, 32(%rdi)
- sub $16, %r8
- jb L(CopyFrom1To16BytesCase3)
- movdqu %xmm7, 48(%rdi)
-# ifdef USE_AS_STPCPY
- lea 64(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- xor %ch, %ch
- movb %ch, 64(%rdi)
-# endif
- ret
-
- .p2align 4
-L(Unaligned64LeaveCase2):
- xor %rcx, %rcx
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rdx
- add $48, %r8
- jle L(CopyFrom1To16BytesCase2OrCase3)
- test %rdx, %rdx
-# ifndef USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm4)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
- pcmpeqb %xmm5, %xmm0
- pmovmskb %xmm0, %rdx
- movdqu %xmm4, (%rdi)
- add $16, %rcx
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rdx, %rdx
-# ifndef USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm5)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- pcmpeqb %xmm6, %xmm0
- pmovmskb %xmm0, %rdx
- movdqu %xmm5, 16(%rdi)
- add $16, %rcx
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rdx, %rdx
-# ifndef USE_AS_STRCAT
- jnz L(CopyFrom1To16BytesUnalignedXmm6)
-# else
- jnz L(CopyFrom1To16Bytes)
-# endif
-
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %rdx
- movdqu %xmm6, 32(%rdi)
- lea 16(%rdi, %rcx), %rdi
- lea 16(%rsi, %rcx), %rsi
- bsf %rdx, %rdx
- cmp %r8, %rdx
- jb L(CopyFrom1To16BytesExit)
- BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
-
- .p2align 4
-L(ExitZero):
-# ifndef USE_AS_STRCAT
- mov %rdi, %rax
-# endif
- ret
-
-# endif
-
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
-# endif
- .p2align 4
- .section .rodata
-L(ExitTable):
- .int JMPTBL(L(Exit1), L(ExitTable))
- .int JMPTBL(L(Exit2), L(ExitTable))
- .int JMPTBL(L(Exit3), L(ExitTable))
- .int JMPTBL(L(Exit4), L(ExitTable))
- .int JMPTBL(L(Exit5), L(ExitTable))
- .int JMPTBL(L(Exit6), L(ExitTable))
- .int JMPTBL(L(Exit7), L(ExitTable))
- .int JMPTBL(L(Exit8), L(ExitTable))
- .int JMPTBL(L(Exit9), L(ExitTable))
- .int JMPTBL(L(Exit10), L(ExitTable))
- .int JMPTBL(L(Exit11), L(ExitTable))
- .int JMPTBL(L(Exit12), L(ExitTable))
- .int JMPTBL(L(Exit13), L(ExitTable))
- .int JMPTBL(L(Exit14), L(ExitTable))
- .int JMPTBL(L(Exit15), L(ExitTable))
- .int JMPTBL(L(Exit16), L(ExitTable))
- .int JMPTBL(L(Exit17), L(ExitTable))
- .int JMPTBL(L(Exit18), L(ExitTable))
- .int JMPTBL(L(Exit19), L(ExitTable))
- .int JMPTBL(L(Exit20), L(ExitTable))
- .int JMPTBL(L(Exit21), L(ExitTable))
- .int JMPTBL(L(Exit22), L(ExitTable))
- .int JMPTBL(L(Exit23), L(ExitTable))
- .int JMPTBL(L(Exit24), L(ExitTable))
- .int JMPTBL(L(Exit25), L(ExitTable))
- .int JMPTBL(L(Exit26), L(ExitTable))
- .int JMPTBL(L(Exit27), L(ExitTable))
- .int JMPTBL(L(Exit28), L(ExitTable))
- .int JMPTBL(L(Exit29), L(ExitTable))
- .int JMPTBL(L(Exit30), L(ExitTable))
- .int JMPTBL(L(Exit31), L(ExitTable))
- .int JMPTBL(L(Exit32), L(ExitTable))
-# ifdef USE_AS_STRNCPY
-L(ExitStrncpyTable):
- .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
- .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
-# ifndef USE_AS_STRCAT
- .p2align 4
-L(FillTable):
- .int JMPTBL(L(Fill0), L(FillTable))
- .int JMPTBL(L(Fill1), L(FillTable))
- .int JMPTBL(L(Fill2), L(FillTable))
- .int JMPTBL(L(Fill3), L(FillTable))
- .int JMPTBL(L(Fill4), L(FillTable))
- .int JMPTBL(L(Fill5), L(FillTable))
- .int JMPTBL(L(Fill6), L(FillTable))
- .int JMPTBL(L(Fill7), L(FillTable))
- .int JMPTBL(L(Fill8), L(FillTable))
- .int JMPTBL(L(Fill9), L(FillTable))
- .int JMPTBL(L(Fill10), L(FillTable))
- .int JMPTBL(L(Fill11), L(FillTable))
- .int JMPTBL(L(Fill12), L(FillTable))
- .int JMPTBL(L(Fill13), L(FillTable))
- .int JMPTBL(L(Fill14), L(FillTable))
- .int JMPTBL(L(Fill15), L(FillTable))
- .int JMPTBL(L(Fill16), L(FillTable))
-# endif
-# endif
-#endif
+#define AS_STRCPY
+#define STPCPY __strcpy_sse2_unaligned
+#include "stpcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
index 9464ee8..92be04c 100644
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -28,31 +28,18 @@
#endif
#ifdef USE_AS_STPCPY
-# ifdef USE_AS_STRNCPY
-# define STRCPY_SSSE3 __stpncpy_ssse3
-# define STRCPY_SSE2 __stpncpy_sse2
-# define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned
-# define __GI_STRCPY __GI_stpncpy
-# define __GI___STRCPY __GI___stpncpy
-# else
# define STRCPY_SSSE3 __stpcpy_ssse3
# define STRCPY_SSE2 __stpcpy_sse2
+# define STRCPY_AVX2 __stpcpy_avx2
# define STRCPY_SSE2_UNALIGNED __stpcpy_sse2_unaligned
# define __GI_STRCPY __GI_stpcpy
# define __GI___STRCPY __GI___stpcpy
-# endif
#else
-# ifdef USE_AS_STRNCPY
-# define STRCPY_SSSE3 __strncpy_ssse3
-# define STRCPY_SSE2 __strncpy_sse2
-# define STRCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned
-# define __GI_STRCPY __GI_strncpy
-# else
# define STRCPY_SSSE3 __strcpy_ssse3
+# define STRCPY_AVX2 __strcpy_avx2
# define STRCPY_SSE2 __strcpy_sse2
# define STRCPY_SSE2_UNALIGNED __strcpy_sse2_unaligned
# define __GI_STRCPY __GI_strcpy
-# endif
#endif
@@ -64,7 +51,10 @@ ENTRY(STRCPY)
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
-1: leaq STRCPY_SSE2_UNALIGNED(%rip), %rax
+1: leaq STRCPY_AVX2(%rip), %rax
+ testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+ jnz 2f
+ leaq STRCPY_SSE2_UNALIGNED(%rip), %rax
testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
jnz 2f
leaq STRCPY_SSE2(%rip), %rax
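The selector added above prefers the AVX2 routine when the CPU reports AVX_Fast_Unaligned_Load, then the unaligned SSE2 routine on Fast_Unaligned_Load, and otherwise falls through to the pre-existing SSE2/SSSE3 choice (continued below this hunk). Roughly, as C — select_strcpy and the cpu_has_* variables are illustrative stand-ins for the __cpu_features bit tests, not glibc identifiers:

  typedef char *strcpy_fn (char *, const char *);
  extern strcpy_fn __strcpy_avx2, __strcpy_sse2_unaligned, __strcpy_sse2;
  extern int cpu_has_avx_fast_unaligned_load, cpu_has_fast_unaligned_load;

  /* Sketch of the resolver's preference order after this hunk.  */
  static strcpy_fn *
  select_strcpy (void)
  {
    if (cpu_has_avx_fast_unaligned_load)
      return &__strcpy_avx2;
    if (cpu_has_fast_unaligned_load)
      return &__strcpy_sse2_unaligned;
    /* ...pre-existing SSSE3/SSE2 selection continues past the hunk.  */
    return &__strcpy_sse2;
  }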
diff --git a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
index fcc23a7..e4c98e7 100644
--- a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
@@ -1,3 +1,1888 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_sse2_unaligned
-#include "strcpy-sse2-unaligned.S"
+/* strcpy with SSE2 and unaligned load
+ Copyright (C) 2011-2015 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+# include <sysdep.h>
+
+# ifndef STRCPY
+# define STRCPY __strncpy_sse2_unaligned
+# endif
+
+# define USE_AS_STRNCPY
+# endif
+
+# define JMPTBL(I, B) I - B
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), %rcx; \
+ lea (%r11, %rcx), %rcx; \
+ jmp *%rcx
+
+# ifndef USE_AS_STRCAT
+
+.text
+ENTRY (STRCPY)
+# ifdef USE_AS_STRNCPY
+ mov %rdx, %r8
+ test %r8, %r8
+ jz L(ExitZero)
+# endif
+ mov %rsi, %rcx
+# ifndef USE_AS_STPCPY
+ mov %rdi, %rax /* save result */
+# endif
+
+# endif
+
+ and $63, %rcx
+ cmp $32, %rcx
+ jbe L(SourceStringAlignmentLess32)
+
+ and $-16, %rsi
+ and $15, %rcx
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+ pcmpeqb (%rsi), %xmm1
+ pmovmskb %xmm1, %rdx
+ shr %cl, %rdx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ mov $16, %r10
+ sub %rcx, %r10
+ cmp %r10, %r8
+# else
+ mov $17, %r10
+ sub %rcx, %r10
+ cmp %r10, %r8
+# endif
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail)
+
+ pcmpeqb 16(%rsi), %xmm0
+ pmovmskb %xmm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+ add $16, %r10
+ cmp %r10, %r8
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32Bytes)
+
+ movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%rdi)
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(Unalign16Both):
+ sub %rcx, %rdi
+# ifdef USE_AS_STRNCPY
+ add %rcx, %r8
+# endif
+ mov $16, %rcx
+ movdqa (%rsi, %rcx), %xmm1
+ movaps 16(%rsi, %rcx), %xmm2
+ movdqu %xmm1, (%rdi, %rcx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $48, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm3
+ movdqu %xmm2, (%rdi, %rcx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm4
+ movdqu %xmm3, (%rdi, %rcx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm1
+ movdqu %xmm4, (%rdi, %rcx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm1)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm2
+ movdqu %xmm1, (%rdi, %rcx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm3
+ movdqu %xmm2, (%rdi, %rcx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movdqu %xmm3, (%rdi, %rcx)
+ mov %rsi, %rdx
+ lea 16(%rsi, %rcx), %rsi
+ and $-0x40, %rsi
+ sub %rsi, %rdx
+ sub %rdx, %rdi
+# ifdef USE_AS_STRNCPY
+ lea 128(%r8, %rdx), %r8
+# endif
+L(Unaligned64Loop):
+ movaps (%rsi), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%rsi), %xmm5
+ movaps 32(%rsi), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%rsi), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %rdx
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(UnalignedLeaveCase2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(Unaligned64Leave)
+
+L(Unaligned64Loop_start):
+ add $64, %rdi
+ add $64, %rsi
+ movdqu %xmm4, -64(%rdi)
+ movaps (%rsi), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%rdi)
+ movaps 16(%rsi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%rsi), %xmm3
+ movdqu %xmm6, -32(%rdi)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%rdi)
+ movaps 48(%rsi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %rdx
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(UnalignedLeaveCase2OrCase3)
+# endif
+ test %rdx, %rdx
+ jz L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %rdx
+ pmovmskb %xmm1, %rcx
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnaligned_0)
+ test %rcx, %rcx
+ jnz L(CopyFrom1To16BytesUnaligned_16)
+
+ pcmpeqb %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %rdx
+ pmovmskb %xmm1, %rcx
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnaligned_32)
+
+ bsf %rcx, %rdx
+ movdqu %xmm4, (%rdi)
+ movdqu %xmm5, 16(%rdi)
+ movdqu %xmm6, 32(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea 48(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm7, 48(%rdi)
+ add $15, %r8
+ sub %rdx, %r8
+ lea 49(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $48, %rsi
+ add $48, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentLess32):
+ pxor %xmm0, %xmm0
+ movdqu (%rsi), %xmm1
+ movdqu 16(%rsi), %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ cmp $16, %r8
+# else
+ cmp $17, %r8
+# endif
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail1)
+
+ pcmpeqb %xmm2, %xmm0
+ movdqu %xmm1, (%rdi)
+ pmovmskb %xmm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ cmp $32, %r8
+# else
+ cmp $33, %r8
+# endif
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32Bytes1)
+
+ and $-16, %rsi
+ and $15, %rcx
+ jmp L(Unalign16Both)
+
+/*------End of main part with loops---------------------*/
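Everything above this marker is the main scan-and-copy machinery: pcmpeqb/pmovmskb locate the terminating NUL within a 16-byte block, and the 64-byte loop folds four blocks into one test with pminub before working out which block actually holds the NUL. A minimal SSE2 intrinsics sketch of the basic 16-byte step, illustrative only: unlike the assembly it uses unaligned loads throughout (so it may read past the terminator) and ignores the strncpy length limit:

    #include <emmintrin.h>   /* SSE2 intrinsics */
    #include <string.h>

    static char *
    strcpy_sse2_sketch (char *dst, const char *src)
    {
      char *d = dst;
      for (;;)
        {
          __m128i chunk = _mm_loadu_si128 ((const __m128i *) src);
          __m128i zeros = _mm_cmpeq_epi8 (chunk, _mm_setzero_si128 ());
          int mask = _mm_movemask_epi8 (zeros);   /* bit i set => src[i] == 0 */
          if (mask != 0)
            {
              int len = __builtin_ctz (mask);      /* like bsf */
              memcpy (d, src, (size_t) len + 1);   /* include the NUL */
              return dst;
            }
          _mm_storeu_si128 ((__m128i *) d, chunk); /* copy 16 bytes, keep going */
          src += 16;
          d += 16;
        }
    }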
+
+/* Case1 */
+
+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+ .p2align 4
+L(CopyFrom1To16BytesTail):
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1):
+ add $16, %rsi
+ add $16, %rdi
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $16, %r8
+# endif
+L(CopyFrom1To16BytesTail1):
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes):
+ bsf %rdx, %rdx
+ add %rcx, %rsi
+ add $16, %rdx
+ sub %rcx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+ bsf %rdx, %rdx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+ movdqu %xmm4, (%rdi)
+ add $63, %r8
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+ bsf %rcx, %rdx
+ movdqu %xmm4, (%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm5, 16(%rdi)
+ add $47, %r8
+ sub %rdx, %r8
+ lea 17(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $16, %rsi
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+ bsf %rdx, %rdx
+ movdqu %xmm4, (%rdi)
+ movdqu %xmm5, 16(%rdi)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea 32(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm6, 32(%rdi)
+ add $31, %r8
+ sub %rdx, %r8
+ lea 33(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $32, %rsi
+ add $32, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+# ifdef USE_AS_STRNCPY
+# ifndef USE_AS_STRCAT
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+ movdqu %xmm6, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+ movdqu %xmm5, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+ movdqu %xmm4, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+ movdqu %xmm3, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+ movdqu %xmm1, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+# endif
+
+ .p2align 4
+L(CopyFrom1To16BytesExit):
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2):
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ add $16, %rdx
+ sub %rcx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+ add $16, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32BytesCase2)
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+ add $16, %rdi
+ add $16, %rsi
+ sub $16, %r8
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+# endif
+
+/*-----------End of labels for copying 1-16 bytes and 1-32 bytes--------------*/
+
+ .p2align 4
+L(Exit1):
+ mov %dh, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea (%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $1, %r8
+ lea 1(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit2):
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $2, %r8
+ lea 2(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit3):
+ mov (%rsi), %cx
+ mov %cx, (%rdi)
+ mov %dh, 2(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $3, %r8
+ lea 3(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit4):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $4, %r8
+ lea 4(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit5):
+ mov (%rsi), %ecx
+ mov %dh, 4(%rdi)
+ mov %ecx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $5, %r8
+ lea 5(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit6):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $6, %r8
+ lea 6(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit7):
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $7, %r8
+ lea 7(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit8):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $8, %r8
+ lea 8(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit9):
+ mov (%rsi), %rcx
+ mov %dh, 8(%rdi)
+ mov %rcx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $9, %r8
+ lea 9(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit10):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $10, %r8
+ lea 10(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit11):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $11, %r8
+ lea 11(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit12):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $12, %r8
+ lea 12(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit13):
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $13, %r8
+ lea 13(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit14):
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $14, %r8
+ lea 14(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit15):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $15, %r8
+ lea 15(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $16, %r8
+ lea 16(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit17):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+ mov %dh, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $17, %r8
+ lea 17(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit18):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $18, %r8
+ lea 18(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit19):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $19, %r8
+ lea 19(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit20):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $20, %r8
+ lea 20(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit21):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dh, 20(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $21, %r8
+ lea 21(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit22):
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $22, %r8
+ lea 22(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit23):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $23, %r8
+ lea 23(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit24):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $24, %r8
+ lea 24(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit25):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+ mov %dh, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $25, %r8
+ lea 25(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit26):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $26, %r8
+ lea 26(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit27):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $27, %r8
+ lea 27(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit28):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $28, %r8
+ lea 28(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit29):
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $29, %r8
+ lea 29(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit30):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $30, %r8
+ lea 30(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit31):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $31, %r8
+ lea 31(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit32):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $32, %r8
+ lea 32(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
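The L(ExitN) blocks above copy an exact byte count with at most a pair of possibly overlapping loads and stores instead of a byte loop; L(Exit11), for instance, moves 8 bytes from offset 0 and 4 bytes from offset 7. A small C illustration of the same trick for lengths between 8 and 16 (copy_upto16 is an illustrative name, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Copy exactly n bytes, 8 <= n <= 16, with two 8-byte moves: one
       from the start and one ending at the last byte.  Both loads
       happen before the stores, and the overlapping destination bytes
       receive the same source bytes, so the result is src[0..n-1].  */
    static void
    copy_upto16 (char *dst, const char *src, size_t n)
    {
      uint64_t head, tail;
      memcpy (&head, src, 8);
      memcpy (&tail, src + n - 8, 8);
      memcpy (dst, &head, 8);
      memcpy (dst + n - 8, &tail, 8);
    }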
+
+# ifdef USE_AS_STRNCPY
+
+ .p2align 4
+L(StrncpyExit0):
+# ifdef USE_AS_STPCPY
+ mov %rdi, %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, (%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit1):
+ mov (%rsi), %dl
+ mov %dl, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 1(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit2):
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 2(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit3):
+ mov (%rsi), %cx
+ mov 2(%rsi), %dl
+ mov %cx, (%rdi)
+ mov %dl, 2(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 3(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit4):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 4(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit5):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dl
+ mov %ecx, (%rdi)
+ mov %dl, 4(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 5(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit6):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 6(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit7):
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 7(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit8):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 8(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit9):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dl
+ mov %rcx, (%rdi)
+ mov %dl, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 9(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit10):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 10(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit11):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 11(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit12):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 12(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit13):
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 13(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit14):
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 14(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit15):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 15(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 16(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit17):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %cl, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 17(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit18):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 18(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit19):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 19(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit20):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 20(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit21):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ mov 20(%rsi), %dl
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dl, 20(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 21(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit22):
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 22(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit23):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 23(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit24):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 24(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit25):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cl, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 25(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit26):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 26(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit27):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 27(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit28):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 28(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit29):
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 29(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit30):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 30(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit31):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 31(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit32):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 32(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 32(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit33):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ mov 32(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ mov %cl, 32(%rdi)
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 33(%rdi)
+# endif
+ ret
+
+# ifndef USE_AS_STRCAT
+
+ .p2align 4
+L(Fill0):
+ ret
+
+ .p2align 4
+L(Fill1):
+ mov %dl, (%rdi)
+ ret
+
+ .p2align 4
+L(Fill2):
+ mov %dx, (%rdi)
+ ret
+
+ .p2align 4
+L(Fill3):
+ mov %edx, -1(%rdi)
+ ret
+
+ .p2align 4
+L(Fill4):
+ mov %edx, (%rdi)
+ ret
+
+ .p2align 4
+L(Fill5):
+ mov %edx, (%rdi)
+ mov %dl, 4(%rdi)
+ ret
+
+ .p2align 4
+L(Fill6):
+ mov %edx, (%rdi)
+ mov %dx, 4(%rdi)
+ ret
+
+ .p2align 4
+L(Fill7):
+ mov %rdx, -1(%rdi)
+ ret
+
+ .p2align 4
+L(Fill8):
+ mov %rdx, (%rdi)
+ ret
+
+ .p2align 4
+L(Fill9):
+ mov %rdx, (%rdi)
+ mov %dl, 8(%rdi)
+ ret
+
+ .p2align 4
+L(Fill10):
+ mov %rdx, (%rdi)
+ mov %dx, 8(%rdi)
+ ret
+
+ .p2align 4
+L(Fill11):
+ mov %rdx, (%rdi)
+ mov %edx, 7(%rdi)
+ ret
+
+ .p2align 4
+L(Fill12):
+ mov %rdx, (%rdi)
+ mov %edx, 8(%rdi)
+ ret
+
+ .p2align 4
+L(Fill13):
+ mov %rdx, (%rdi)
+ mov %rdx, 5(%rdi)
+ ret
+
+ .p2align 4
+L(Fill14):
+ mov %rdx, (%rdi)
+ mov %rdx, 6(%rdi)
+ ret
+
+ .p2align 4
+L(Fill15):
+ movdqu %xmm0, -1(%rdi)
+ ret
+
+ .p2align 4
+L(Fill16):
+ movdqu %xmm0, (%rdi)
+ ret
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+ movdqu %xmm2, (%rdi, %rcx)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmmExit):
+ bsf %rdx, %rdx
+ add $15, %r8
+ add %rcx, %rdi
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+
+ .p2align 4
+L(StrncpyFillTailWithZero):
+ pxor %xmm0, %xmm0
+ xor %rdx, %rdx
+ sub $16, %r8
+ jbe L(StrncpyFillExit)
+
+ movdqu %xmm0, (%rdi)
+ add $16, %rdi
+
+ mov %rdi, %rsi
+ and $0xf, %rsi
+ sub %rsi, %rdi
+ add %rsi, %r8
+ sub $64, %r8
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm0, 16(%rdi)
+ movdqa %xmm0, 32(%rdi)
+ movdqa %xmm0, 48(%rdi)
+ add $64, %rdi
+ sub $64, %r8
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+ add $32, %r8
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm0, 16(%rdi)
+ add $32, %rdi
+ sub $16, %r8
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%rdi)
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillLess32):
+ add $16, %r8
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%rdi)
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillExit):
+ add $16, %r8
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+/* end of ifndef USE_AS_STRCAT */
+# endif
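L(StrncpyFillTailWithZero) and the L(FillN) table implement the padding that strncpy requires: once the terminator has been copied, every remaining destination byte up to n must be written as NUL. The assembly does this with 16- and 64-byte SSE stores plus the fill table for the remainder; a scalar reference for the required semantics only (strncpy_sketch is an illustrative name):

    #include <stddef.h>

    static char *
    strncpy_sketch (char *dst, const char *src, size_t n)
    {
      size_t i = 0;
      for (; i < n && src[i] != '\0'; i++)   /* copy at most n bytes */
        dst[i] = src[i];
      for (; i < n; i++)                     /* zero-fill the tail */
        dst[i] = '\0';
      return dst;
    }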
+
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+ lea 64(%r8), %rcx
+ and $-16, %rcx
+ add $48, %r8
+ jl L(CopyFrom1To16BytesCase3)
+ movdqu %xmm4, (%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm5, 16(%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm6, 32(%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm7, 48(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 64(%rdi), %rax
+# endif
+# ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 64(%rdi)
+# endif
+ ret
+
+ .p2align 4
+L(Unaligned64LeaveCase2):
+ xor %rcx, %rcx
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $48, %r8
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+# ifndef USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm4, (%rdi)
+ add $16, %rcx
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+# ifndef USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm5)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm5, 16(%rdi)
+ add $16, %rcx
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+# ifndef USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm6)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm6, 32(%rdi)
+ lea 16(%rdi, %rcx), %rdi
+ lea 16(%rsi, %rcx), %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(ExitZero):
+# ifndef USE_AS_STRCAT
+ mov %rdi, %rax
+# endif
+ ret
+
+# endif
+
+# ifndef USE_AS_STRCAT
+END (STRCPY)
+# else
+END (STRCAT)
+# endif
+ .p2align 4
+ .section .rodata
+L(ExitTable):
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+# ifdef USE_AS_STRNCPY
+L(ExitStrncpyTable):
+ .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+# ifndef USE_AS_STRCAT
+ .p2align 4
+L(FillTable):
+ .int JMPTBL(L(Fill0), L(FillTable))
+ .int JMPTBL(L(Fill1), L(FillTable))
+ .int JMPTBL(L(Fill2), L(FillTable))
+ .int JMPTBL(L(Fill3), L(FillTable))
+ .int JMPTBL(L(Fill4), L(FillTable))
+ .int JMPTBL(L(Fill5), L(FillTable))
+ .int JMPTBL(L(Fill6), L(FillTable))
+ .int JMPTBL(L(Fill7), L(FillTable))
+ .int JMPTBL(L(Fill8), L(FillTable))
+ .int JMPTBL(L(Fill9), L(FillTable))
+ .int JMPTBL(L(Fill10), L(FillTable))
+ .int JMPTBL(L(Fill11), L(FillTable))
+ .int JMPTBL(L(Fill12), L(FillTable))
+ .int JMPTBL(L(Fill13), L(FillTable))
+ .int JMPTBL(L(Fill14), L(FillTable))
+ .int JMPTBL(L(Fill15), L(FillTable))
+ .int JMPTBL(L(Fill16), L(FillTable))
+# endif
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy.S b/sysdeps/x86_64/multiarch/strncpy.S
index 6d87a0b..afbd870 100644
--- a/sysdeps/x86_64/multiarch/strncpy.S
+++ b/sysdeps/x86_64/multiarch/strncpy.S
@@ -1,5 +1,85 @@
-/* Multiple versions of strncpy
- All versions must be listed in ifunc-impl-list.c. */
-#define STRCPY strncpy
+/* Multiple versions of strncpy
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2009-2015 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
#define USE_AS_STRNCPY
-#include "strcpy.S"
+#ifndef STRNCPY
+#define STRNCPY strncpy
+#endif
+
+#ifdef USE_AS_STPCPY
+# define STRNCPY_SSSE3 __stpncpy_ssse3
+# define STRNCPY_SSE2 __stpncpy_sse2
+# define STRNCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned
+# define __GI_STRNCPY __GI_stpncpy
+# define __GI___STRNCPY __GI___stpncpy
+#else
+# define STRNCPY_SSSE3 __strncpy_ssse3
+# define STRNCPY_SSE2 __strncpy_sse2
+# define STRNCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned
+# define __GI_STRNCPY __GI_strncpy
+#endif
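The USE_AS_STPCPY block above shows how the same template also builds stpncpy simply by renaming the selected symbols; in the copy code itself the only behavioural difference is the value left in %rax, since strncpy returns dst while stpncpy returns a pointer to the first NUL written (or dst + n when the source does not fit). A scalar sketch of that return-value difference (stpncpy_sketch is an illustrative name):

    #define _POSIX_C_SOURCE 200809L   /* for strnlen */
    #include <stddef.h>
    #include <string.h>

    static char *
    stpncpy_sketch (char *dst, const char *src, size_t n)
    {
      size_t len = strnlen (src, n);   /* number of bytes copied from src */
      memcpy (dst, src, len);
      memset (dst + len, '\0', n - len);
      return dst + len;                /* strncpy would return dst instead */
    }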
+
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(STRNCPY)
+ .type STRNCPY, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq STRNCPY_SSE2_UNALIGNED(%rip), %rax
+ testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+ jnz 2f
+ leaq STRNCPY_SSE2(%rip), %rax
+ testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jz 2f
+ leaq STRNCPY_SSSE3(%rip), %rax
+2: ret
+END(STRNCPY)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type STRNCPY_SSE2, @function; \
+ .align 16; \
+ .globl STRNCPY_SSE2; \
+ .hidden STRNCPY_SSE2; \
+ STRNCPY_SSE2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRNCPY_SSE2, .-STRNCPY_SSE2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strncpy calls through a PLT.
+ The speedup we get from using SSSE3 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRNCPY; __GI_STRNCPY = STRNCPY_SSE2
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___STRNCPY; __GI___STRNCPY = STRNCPY_SSE2
+#endif
+
+#ifndef USE_AS_STRNCPY
+#include "../strcpy.S"
+#endif