aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorH.J. Lu <hongjiu.lu@intel.com>2011-06-24 15:14:22 -0400
committerUlrich Drepper <drepper@gmail.com>2011-06-24 15:14:22 -0400
commit8912479f9ea9f56dc188d3d00c4ba4259f600661 (patch)
treefc91331de86b054859ce0dfe3fdec2a06812aa4c
parentd5495a116c6271c0ae8f6955b64b7b010b1b341a (diff)
downloadglibc-8912479f9ea9f56dc188d3d00c4ba4259f600661.zip
glibc-8912479f9ea9f56dc188d3d00c4ba4259f600661.tar.gz
glibc-8912479f9ea9f56dc188d3d00c4ba4259f600661.tar.bz2
Improved st{r,p}{,n}cpy for SSE2 and SSSE3 on x86-64
-rw-r--r--ChangeLog17
-rw-r--r--NEWS3
-rw-r--r--sysdeps/x86_64/multiarch/Makefile7
-rw-r--r--sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S3
-rw-r--r--sysdeps/x86_64/multiarch/stpcpy-ssse3.S3
-rw-r--r--sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S4
-rw-r--r--sysdeps/x86_64/multiarch/stpncpy-ssse3.S4
-rw-r--r--sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S1718
-rw-r--r--sysdeps/x86_64/multiarch/strcpy-ssse3.S3721
-rw-r--r--sysdeps/x86_64/multiarch/strcpy.S1860
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S3
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-ssse3.S3
12 files changed, 5508 insertions, 1838 deletions
diff --git a/ChangeLog b/ChangeLog
index 8bf8eeb..b950dcc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2011-06-22 H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3
+ strcpy-sse2-unaligned strncpy-sse2-unaligned
+ stpcpy-sse2-unaligned stpncpy-sse2-unaligned.
+ * sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S: New file.
+ * sysdeps/x86_64/multiarch/stpcpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S: New file.
+ * sysdeps/x86_64/multiarch/stpncpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: New file.
+ * sysdeps/x86_64/multiarch/strcpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S: New file.
+ * sysdeps/x86_64/multiarch/strncpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/strcpy.S: Remove strcpy with SSSE3.
+ (STRCPY): Support SSE2 and SSSE3 versions.
+
2011-06-24 Ulrich Drepper <drepper@gmail.com>
[BZ #12874]
diff --git a/NEWS b/NEWS
index dd28004..bc77d2d 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,9 @@ Version 2.15
* Optimized strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-32.
Contributed by HJ Lu.
+
+* Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64.
+ Contributed by HJ Lu.
Version 2.14
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 19aa4be..88410b3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -4,12 +4,15 @@ gen-as-const-headers += ifunc-defines.sym
endif
ifeq ($(subdir),string)
+
sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
- strncase_l-ssse3 strlen-sse4 strlen-no-bsf \
- memset-x86-64
+ strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \
+ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
+ strcpy-sse2-unaligned strncpy-sse2-unaligned \
+ stpcpy-sse2-unaligned stpncpy-sse2-unaligned
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
new file mode 100644
index 0000000..34231f8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY /* stpcpy variant: %rax is set to the end of the copy instead of the destination start */
+#define STRCPY __stpcpy_sse2_unaligned /* symbol name used by ENTRY (STRCPY) in the included file */
+#include "strcpy-sse2-unaligned.S" /* shared implementation, specialised by the macros above */
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
new file mode 100644
index 0000000..d971c2d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY /* request the stpcpy variant from the included source */
+#define STRCPY __stpcpy_ssse3 /* symbol name for this build of the routine */
+#include "strcpy-ssse3.S" /* shared implementation; presumably keyed on the macros above -- confirm in that file */
diff --git a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
new file mode 100644
index 0000000..658520f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY /* stpcpy-style return value: %rax points at the end of the copy */
+#define USE_AS_STRNCPY /* enable the length-limited paths (limit kept in %r8) and the zero fill of the tail */
+#define STRCPY __stpncpy_sse2_unaligned /* symbol name used by ENTRY (STRCPY) in the included file */
+#include "strcpy-sse2-unaligned.S" /* shared implementation, specialised by the macros above */
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
new file mode 100644
index 0000000..14ed16f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY /* request the stpcpy-style return value from the included source */
+#define USE_AS_STRNCPY /* request the length-limited variant from the included source */
+#define STRCPY __stpncpy_ssse3 /* symbol name for this build of the routine */
+#include "strcpy-ssse3.S" /* shared implementation; presumably keyed on the macros above -- confirm in that file */
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
new file mode 100644
index 0000000..9a8d186
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -0,0 +1,1718 @@
+/* strcpy with SSE2 and unaligned load
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCPY
+# define STRCPY __strcpy_sse2_unaligned
+# endif
+
+# define JMPTBL(I, B) I - B /* jump-table entry: offset of label I relative to table base B */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) /* indirect jump through TABLE; clobbers %r11 and %rcx */ \
+ lea TABLE(%rip), %r11; /* r11 = address of the table */ \
+ movslq (%r11, INDEX, SCALE), %rcx; /* rcx = sign-extended 32-bit entry at TABLE + INDEX * SCALE */ \
+ lea (%r11, %rcx), %rcx; /* rcx = table address + entry = target address */ \
+ jmp *%rcx
+
+ .text
+ENTRY (STRCPY)
+# ifdef USE_AS_STRNCPY
+ mov %rdx, %r8 /* r8 = length limit n (third integer argument) */
+ test %r8, %r8
+ jz L(ExitZero) /* n == 0 */
+# endif
+ mov %rsi, %rcx
+# ifndef USE_AS_STPCPY
+ mov %rdi, %rax /* save result: the destination pointer is the return value */
+# endif
+
+ and $15, %rcx /* rcx = offset of the source inside its 16-byte block */
+ jz L(SourceStringAlignmentZero) /* source is already 16-byte aligned */
+
+ and $-16, %rsi /* round the source pointer down to a 16-byte boundary */
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+ pcmpeqb (%rsi), %xmm1 /* mark the NUL bytes of the aligned block */
+# ifdef USE_AS_STRNCPY
+ add %rcx, %r8 /* count the limit from the aligned base; NOTE(review): no carry check, may wrap when n is close to SIZE_MAX -- confirm */
+# endif
+ pmovmskb %xmm1, %rdx
+ shr %cl, %rdx /* discard mask bits of the bytes that precede the real start */
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY
+ cmp $16, %r8
+# else
+ cmp $17, %r8
+# endif
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3) /* small limit: finish through the length-limited exits */
+# endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail) /* NUL found in the first block */
+
+ pcmpeqb 16(%rsi), %xmm0 /* mark the NUL bytes of the next aligned block */
+ pmovmskb %xmm0, %rdx
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY
+ cmp $32, %r8
+# else
+ cmp $33, %r8
+# endif
+ jbe L(CopyFrom1To32BytesCase2OrCase3) /* small limit: finish through the length-limited exits */
+# endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32Bytes) /* NUL found in the second block */
+
+ movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%rdi)
+
+ sub %rcx, %rdi /* bias rdi so that (%rdi, x) is the destination of (%rsi, x) */
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(Unalign16Both):
+ mov $16, %rcx
+ movdqa (%rsi, %rcx), %xmm1
+ movaps 16(%rsi, %rcx), %xmm2
+ movdqu %xmm1, (%rdi, %rcx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $48, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm3
+ movdqu %xmm2, (%rdi, %rcx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm4
+ movdqu %xmm3, (%rdi, %rcx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm1
+ movdqu %xmm4, (%rdi, %rcx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY
+ jnz L(CopyFrom1To16BytesUnalignedXmm1)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm2
+ movdqu %xmm1, (%rdi, %rcx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm3
+ movdqu %xmm2, (%rdi, %rcx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movdqu %xmm3, (%rdi, %rcx)
+ mov %rsi, %rdx
+ lea 16(%rsi, %rcx), %rsi
+ and $-0x40, %rsi
+ sub %rsi, %rdx
+ sub %rdx, %rdi
+# ifdef USE_AS_STRNCPY
+ lea 128(%r8, %rdx), %r8
+# endif
+L(Unaligned64Loop):
+ movaps (%rsi), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%rsi), %xmm5
+ movaps 32(%rsi), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%rsi), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %rdx
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(UnalignedLeaveCase2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(Unaligned64Leave)
+
+L(Unaligned64Loop_start):
+ add $64, %rdi
+ add $64, %rsi
+ movdqu %xmm4, -64(%rdi)
+ movaps (%rsi), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%rdi)
+ movaps 16(%rsi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%rsi), %xmm3
+ movdqu %xmm6, -32(%rdi)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%rdi)
+ movaps 48(%rsi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %rdx
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(UnalignedLeaveCase2OrCase3)
+# endif
+ test %rdx, %rdx
+ jz L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %rdx
+ pmovmskb %xmm1, %rcx
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnaligned_0)
+ test %rcx, %rcx
+ jnz L(CopyFrom1To16BytesUnaligned_16)
+
+ pcmpeqb %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %rdx
+ pmovmskb %xmm1, %rcx
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnaligned_32)
+
+ bsf %rcx, %rdx
+ movdqu %xmm4, (%rdi)
+ movdqu %xmm5, 16(%rdi)
+ movdqu %xmm6, 32(%rdi)
+# if defined USE_AS_STRNCPY
+# ifdef USE_AS_STPCPY
+ lea 48(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm7, 48(%rdi)
+ add $15, %r8
+ sub %rdx, %r8
+ lea 49(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $48, %rsi
+ add $48, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero):
+ pxor %xmm0, %xmm0
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY
+ cmp $16, %r8
+# else
+ cmp $17, %r8
+# endif
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail1)
+
+ pcmpeqb 16(%rsi), %xmm0
+ movdqu %xmm1, (%rdi)
+ pmovmskb %xmm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY
+ cmp $32, %r8
+# else
+ cmp $33, %r8
+# endif
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32Bytes1)
+ jmp L(Unalign16Both)
+
+/* ------End of main part with loops--------------------- */
+
+/* Case1 */
+
+# if (!defined USE_AS_STRNCPY)
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+ .p2align 4
+L(CopyFrom1To16BytesTail):
+# if defined USE_AS_STRNCPY
+ sub %rcx, %r8
+# endif
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1):
+ add $16, %rsi
+ add $16, %rdi
+# if defined USE_AS_STRNCPY
+ sub $16, %r8
+# endif
+L(CopyFrom1To16BytesTail1):
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes):
+# if defined USE_AS_STRNCPY
+ sub %rcx, %r8
+# endif
+ bsf %rdx, %rdx
+ add %rcx, %rsi
+ add $16, %rdx
+ sub %rcx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+ bsf %rdx, %rdx
+# if defined USE_AS_STRNCPY
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+ movdqu %xmm4, (%rdi)
+ add $63, %r8
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+ bsf %rcx, %rdx
+ movdqu %xmm4, (%rdi)
+# if defined USE_AS_STRNCPY
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm5, 16(%rdi)
+ add $47, %r8
+ sub %rdx, %r8
+ lea 17(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $16, %rsi
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+ bsf %rdx, %rdx
+ movdqu %xmm4, (%rdi)
+ movdqu %xmm5, 16(%rdi)
+# if defined USE_AS_STRNCPY
+# ifdef USE_AS_STPCPY
+ lea 32(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm6, 32(%rdi)
+ add $31, %r8
+ sub %rdx, %r8
+ lea 33(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $32, %rsi
+ add $32, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+# ifdef USE_AS_STRNCPY
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+ movdqu %xmm6, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+ movdqu %xmm5, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+ movdqu %xmm4, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+ movdqu %xmm3, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+ movdqu %xmm1, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesExit):
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2):
+ sub %rcx, %r8
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ add $16, %rdx
+ sub %rcx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+ sub %rcx, %r8
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+ add $16, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32BytesCase2)
+ sub %rcx, %r8
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ sub %rcx, %r8
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+ add $16, %rdi
+ add $16, %rsi
+ sub $16, %r8
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+# endif
+
+/* ----End of labels for copying 1-16 bytes and 1-32 bytes---- */
+
+ .p2align 4
+L(Exit1):
+ mov %dh, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea (%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $1, %r8
+ lea 1(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit2):
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $2, %r8
+ lea 2(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit3):
+ mov (%rsi), %cx
+ mov %cx, (%rdi)
+ mov %dh, 2(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $3, %r8
+ lea 3(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit4):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $4, %r8
+ lea 4(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit5):
+ mov (%rsi), %ecx
+ mov %dh, 4(%rdi)
+ mov %ecx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $5, %r8
+ lea 5(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit6):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $6, %r8
+ lea 6(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit7):
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $7, %r8
+ lea 7(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit8):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $8, %r8
+ lea 8(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit9):
+ mov (%rsi), %rcx
+ mov %dh, 8(%rdi)
+ mov %rcx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $9, %r8
+ lea 9(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit10):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $10, %r8
+ lea 10(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit11):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $11, %r8
+ lea 11(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit12):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $12, %r8
+ lea 12(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit13):
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $13, %r8
+ lea 13(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit14):
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $14, %r8
+ lea 14(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit15):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $15, %r8
+ lea 15(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $16, %r8
+ lea 16(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit17):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+ mov %dh, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $17, %r8
+ lea 17(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit18):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $18, %r8
+ lea 18(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit19):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $19, %r8
+ lea 19(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit20):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $20, %r8
+ lea 20(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit21):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dh, 20(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $21, %r8
+ lea 21(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit22):
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $22, %r8
+ lea 22(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit23):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $23, %r8
+ lea 23(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit24):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $24, %r8
+ lea 24(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit25):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+ mov %dh, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $25, %r8
+ lea 25(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit26):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $26, %r8
+ lea 26(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit27):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $27, %r8
+ lea 27(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit28):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $28, %r8
+ lea 28(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit29):
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $29, %r8
+ lea 29(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit30):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $30, %r8
+ lea 30(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit31):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $31, %r8
+ lea 31(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit32):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $32, %r8
+ lea 32(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+# ifdef USE_AS_STRNCPY
+
+ .p2align 4
+L(StrncpyExit0):
+# ifdef USE_AS_STPCPY
+ mov %rdi, %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit1):
+ mov (%rsi), %dl
+ mov %dl, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit2):
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit3):
+ mov (%rsi), %cx
+ mov 2(%rsi), %dl
+ mov %cx, (%rdi)
+ mov %dl, 2(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit4):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit5):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dl
+ mov %ecx, (%rdi)
+ mov %dl, 4(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit6):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit7):
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit8):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit9):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dl
+ mov %rcx, (%rdi)
+ mov %dl, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit10):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit11):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit12):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit13):
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit14):
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit15):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit17):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %cl, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit18):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit19):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit20):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit21):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ mov 20(%rsi), %dl
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dl, 20(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit22):
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit23):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit24):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit25):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cl, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit26):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit27):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit28):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit29):
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit30):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit31):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit32):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 32(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit33):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ mov 32(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ mov %cl, 32(%rdi)
+ ret
+
+ .p2align 4
+L(Fill0): /* FillN stanzas: store the last N zero bytes at rdi and return; presumably reached through L(FillTable) -- confirm */
+ ret
+
+ .p2align 4
+L(Fill1):
+ mov %dl, (%rdi) /* rdx (and xmm0) are cleared by L(StrncpyFillTailWithZero) before the dispatch */
+ ret
+
+ .p2align 4
+L(Fill2):
+ mov %dx, (%rdi)
+ ret
+
+ .p2align 4
+L(Fill3):
+ mov %edx, -1(%rdi) /* one 4-byte store covers rdi-1..rdi+2; the byte at -1 is presumably already zero -- verify */
+ ret
+
+ .p2align 4
+L(Fill4):
+ mov %edx, (%rdi)
+ ret
+
+ .p2align 4
+L(Fill5):
+ mov %edx, (%rdi)
+ mov %dl, 4(%rdi)
+ ret
+
+ .p2align 4
+L(Fill6):
+ mov %edx, (%rdi)
+ mov %dx, 4(%rdi)
+ ret
+
+ .p2align 4
+L(Fill7):
+ mov %rdx, -1(%rdi) /* one 8-byte store covers rdi-1..rdi+6; the byte at -1 is presumably already zero -- verify */
+ ret
+
+ .p2align 4
+L(Fill8):
+ mov %rdx, (%rdi)
+ ret
+
+ .p2align 4
+L(Fill9):
+ mov %rdx, (%rdi)
+ mov %dl, 8(%rdi)
+ ret
+
+ .p2align 4
+L(Fill10):
+ mov %rdx, (%rdi)
+ mov %dx, 8(%rdi)
+ ret
+
+ .p2align 4
+L(Fill11):
+ mov %rdx, (%rdi)
+ mov %edx, 7(%rdi) /* overlaps the previous store by one byte to reach 11 bytes */
+ ret
+
+ .p2align 4
+L(Fill12):
+ mov %rdx, (%rdi)
+ mov %edx, 8(%rdi)
+ ret
+
+ .p2align 4
+L(Fill13):
+ mov %rdx, (%rdi)
+ mov %rdx, 5(%rdi) /* overlapping 8-byte stores cover 13 bytes */
+ ret
+
+ .p2align 4
+L(Fill14):
+ mov %rdx, (%rdi)
+ mov %rdx, 6(%rdi) /* overlapping 8-byte stores cover 14 bytes */
+ ret
+
+ .p2align 4
+L(Fill15):
+ movdqu %xmm0, -1(%rdi) /* one 16-byte store covers rdi-1..rdi+14; the byte at -1 is presumably already zero -- verify */
+ ret
+
+ .p2align 4
+L(Fill16):
+ movdqu %xmm0, (%rdi)
+ ret
+
+ .p2align 4
+/* Store the 16 bytes held in %xmm2 at (%rdi, %rcx), then fall through
+   into L(CopyFrom1To16BytesXmmExit).  */
+L(CopyFrom1To16BytesUnalignedXmm2):
+ movdqu %xmm2, (%rdi, %rcx)
+
+ .p2align 4
+/* On entry %rdx holds a bit mask (presumably the pmovmskb result that
+   marks NUL bytes in the chunk just stored -- confirm against the
+   callers outside this view).  Converts it to the index of the lowest
+   set bit, advances %rdi to the byte after that index and updates
+   %r8 as %r8 + 15 - index.  For USE_AS_STPCPY, %rax is set to
+   the address of the byte at that index.  Execution then falls
+   through into L(StrncpyFillTailWithZero).  */
+L(CopyFrom1To16BytesXmmExit):
+ bsf %rdx, %rdx
+ add $15, %r8
+ add %rcx, %rdi
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+
+ .p2align 4
+/* Zero-fill %r8 bytes starting at %rdi.
+   %xmm0 and %rdx are cleared and used as the source of zeros.  If at
+   most 16 bytes are requested the L(FillTable) stub for that size does
+   all the work.  Otherwise one unaligned 16-byte store is done first,
+   %rdi is rounded down to a 16-byte boundary (with %r8 increased by
+   the same amount), aligned stores write 64 bytes per iteration, and
+   the remainder (< 64) is finished with up to three aligned 16-byte
+   stores followed by a L(FillTable) stub.
+   BRANCH_TO_JMPTBL_ENTRY is defined outside this view; presumably it
+   jumps through entry %r8 of the given table -- confirm.  */
+L(StrncpyFillTailWithZero):
+ pxor %xmm0, %xmm0
+ xor %rdx, %rdx
+ sub $16, %r8
+/* 16 bytes or fewer: L(StrncpyFillExit) restores %r8 and dispatches.  */
+ jbe L(StrncpyFillExit)
+
+ movdqu %xmm0, (%rdi)
+ add $16, %rdi
+
+/* %rsi = misalignment of %rdi; step back to the 16-byte boundary and
+   give those bytes back to the count.  */
+ mov %rdi, %rsi
+ and $0xf, %rsi
+ sub %rsi, %rdi
+ add %rsi, %r8
+ sub $64, %r8
+ jb L(StrncpyFillLess64)
+
+/* 64 zero bytes per iteration; %r8 is kept biased by -64.  */
+L(StrncpyFillLoopMovdqa):
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm0, 16(%rdi)
+ movdqa %xmm0, 32(%rdi)
+ movdqa %xmm0, 48(%rdi)
+ add $64, %rdi
+ sub $64, %r8
+ jae L(StrncpyFillLoopMovdqa)
+
+/* Fewer than 64 bytes left (%r8 is still biased by -64 here).  */
+L(StrncpyFillLess64):
+ add $32, %r8
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm0, 16(%rdi)
+ add $32, %rdi
+ sub $16, %r8
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%rdi)
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+/* Fewer than 32 bytes left (%r8 is biased by -32 here).  */
+L(StrncpyFillLess32):
+ add $16, %r8
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%rdi)
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+/* Undo the -16 bias on %r8 and let the L(FillTable) stub for that
+   size write the last bytes.  */
+L(StrncpyFillExit):
+ add $16, %r8
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+ .p2align 4
+/* Leave the 64-byte loop on the count-limited path.  A non-zero %rdx
+   selects L(Unaligned64LeaveCase2); %rdx == 0 falls into
+   L(Unaligned64LeaveCase3) (%rdx is presumably the NUL mask of the
+   current 64-byte group -- confirm against the loop outside this
+   view).  */
+L(UnalignedLeaveCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(Unaligned64LeaveCase2)
+/* Case 3: store %xmm4..%xmm7 at (%rdi)..48(%rdi) one 16-byte chunk at
+   a time for as long as the count in %r8 allows; as soon as it does
+   not, hand over to L(CopyFrom1To16BytesCase3).  %rcx is set to
+   (%r8 + 64) rounded down to a multiple of 16 beforehand.  If all
+   four chunks are stored, return (with %rax = %rdi + 64 for
+   USE_AS_STPCPY).  */
+L(Unaligned64LeaveCase3):
+ lea 64(%r8), %rcx
+ and $-16, %rcx
+ add $48, %r8
+ jl L(CopyFrom1To16BytesCase3)
+ movdqu %xmm4, (%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm5, 16(%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm6, 32(%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm7, 48(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 64(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+/* Case 2: walk the four chunks %xmm4..%xmm7 in order.  For each chunk
+   a byte mask of the positions equal to %xmm0 is built in %rdx (%xmm0
+   is expected to be all zero on entry, so this is a NUL mask; after a
+   compare that finds nothing %xmm0 is all zero again).  %rcx is the
+   byte offset of the chunk being examined and %r8 the running count.
+   A chunk is stored only after the following one has been compared.
+   Exits: L(CopyFrom1To16BytesCase2OrCase3) when the count runs out,
+   L(CopyFrom1To16BytesUnalignedXmmN) when a match is found in chunk
+   N, and for the last chunk either L(CopyFrom1To16BytesExit) (match
+   index below %r8) or the L(ExitStrncpyTable) entry for %r8.  */
+L(Unaligned64LeaveCase2):
+ xor %rcx, %rcx
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $48, %r8
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm4, (%rdi)
+ add $16, %rcx
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnalignedXmm5)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm5, 16(%rdi)
+ add $16, %rcx
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnalignedXmm6)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm6, 32(%rdi)
+/* Point %rdi/%rsi at the last chunk (offset %rcx + 16).  */
+ lea 16(%rdi, %rcx), %rdi
+ lea 16(%rsi, %rcx), %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+/* Nothing to copy: return with %rax = %rdi.  */
+L(ExitZero):
+ mov %rdi, %rax
+ ret
+
+# endif
+
+END (STRCPY)
+
+/* Jump table for the plain exit stubs: 32 entries, entry i (0-based)
+   refers to L(Exit<i+1>).  JMPTBL is defined outside this view;
+   presumably it yields the target's offset relative to the table
+   base -- confirm.
+   The section switch comes first so that .p2align applies to the
+   table in .rodata; issued before .section it would only pad the end
+   of the text section and leave the table unaligned.  */
+ .section .rodata
+ .p2align 4
+L(ExitTable):
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+# ifdef USE_AS_STRNCPY
+/* Jump table for the count-limited exit stubs: 34 entries, entry i
+   refers to L(StrncpyExit<i>) for i = 0..33.  It is indexed with %r8
+   by BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4).  */
+L(ExitStrncpyTable):
+ .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+ .p2align 4
+/* Jump table for the zero-fill stubs: 17 entries, entry i refers to
+   L(Fill<i>) for i = 0..16.  It is indexed with %r8 by
+   BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4).  */
+L(FillTable):
+ .int JMPTBL(L(Fill0), L(FillTable))
+ .int JMPTBL(L(Fill1), L(FillTable))
+ .int JMPTBL(L(Fill2), L(FillTable))
+ .int JMPTBL(L(Fill3), L(FillTable))
+ .int JMPTBL(L(Fill4), L(FillTable))
+ .int JMPTBL(L(Fill5), L(FillTable))
+ .int JMPTBL(L(Fill6), L(FillTable))
+ .int JMPTBL(L(Fill7), L(FillTable))
+ .int JMPTBL(L(Fill8), L(FillTable))
+ .int JMPTBL(L(Fill9), L(FillTable))
+ .int JMPTBL(L(Fill10), L(FillTable))
+ .int JMPTBL(L(Fill11), L(FillTable))
+ .int JMPTBL(L(Fill12), L(FillTable))
+ .int JMPTBL(L(Fill13), L(FillTable))
+ .int JMPTBL(L(Fill14), L(FillTable))
+ .int JMPTBL(L(Fill15), L(FillTable))
+ .int JMPTBL(L(Fill16), L(FillTable))
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000..efbd3bf
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -0,0 +1,3721 @@
+/* strcpy with SSSE3
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCPY
+# define STRCPY __strcpy_ssse3
+# endif
+
+ .section .text.ssse3,"ax",@progbits
+/* Entry point.  The incoming %rsi is kept in %rcx (read pointer) and
+   the incoming %rdi in %rdx (write pointer); for USE_AS_STRNCPY the
+   incoming %rdx is kept in %r8 (count).  Under the x86-64 SysV
+   calling convention these are the second, first and third argument
+   respectively.
+   The first 16 source bytes are probed one at a time; a NUL at byte
+   i leaves through L(Exit<i+1>).  For USE_AS_STRNCPY, counts of 0,
+   1..8, 9..15 and exactly 16 are diverted to their own exits before
+   bytes beyond the count are probed.  */
+ENTRY (STRCPY)
+ mov %rsi, %rcx
+# ifdef USE_AS_STRNCPY
+ mov %rdx, %r8
+# endif
+ mov %rdi, %rdx
+# ifdef USE_AS_STRNCPY
+ test %r8, %r8
+ jz L(Exit0)
+ cmp $8, %r8
+ jbe L(StrncpyExit8Bytes)
+# endif
+ cmpb $0, (%rcx)
+ jz L(Exit1)
+ cmpb $0, 1(%rcx)
+ jz L(Exit2)
+ cmpb $0, 2(%rcx)
+ jz L(Exit3)
+ cmpb $0, 3(%rcx)
+ jz L(Exit4)
+ cmpb $0, 4(%rcx)
+ jz L(Exit5)
+ cmpb $0, 5(%rcx)
+ jz L(Exit6)
+ cmpb $0, 6(%rcx)
+ jz L(Exit7)
+ cmpb $0, 7(%rcx)
+ jz L(Exit8)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %r8
+ jb L(StrncpyExit15Bytes)
+# endif
+ cmpb $0, 8(%rcx)
+ jz L(Exit9)
+ cmpb $0, 9(%rcx)
+ jz L(Exit10)
+ cmpb $0, 10(%rcx)
+ jz L(Exit11)
+ cmpb $0, 11(%rcx)
+ jz L(Exit12)
+ cmpb $0, 12(%rcx)
+ jz L(Exit13)
+ cmpb $0, 13(%rcx)
+ jz L(Exit14)
+ cmpb $0, 14(%rcx)
+ jz L(Exit15)
+# ifdef USE_AS_STRNCPY
+/* Count is exactly 16: copy 16 bytes without looking at byte 15.  */
+ cmp $16, %r8
+ je L(Exit16)
+# endif
+ cmpb $0, 15(%rcx)
+ jz L(Exit16)
+
+/* No NUL in the first 16 bytes.  Copy those 16 bytes with two 8-byte
+   moves, test the next 16-byte aligned source chunk for NUL, round
+   the write pointer %rdx up to a 16-byte boundary, move %rcx back by
+   the same distance, and dispatch on the resulting source
+   misalignment (0..15) to L(Align16Both) or one of L(Shl1)..L(Shl15).  */
+# ifdef USE_AS_STRNCPY
+ mov %rcx, %rsi
+ and $0xf, %rsi
+
+/* add 16 bytes rcx_shift to r8 */
+
+ add %rsi, %r8
+# endif
+ lea 16(%rcx), %rsi
+/* Now:
+ rsi = alignment_16(rcx) + rcx_shift + 16;
+ rcx_shift = rcx - alignment_16(rcx)
+*/
+ and $-16, %rsi
+/* Now:
+ rsi = alignment_16(rcx) + 16
+*/
+ pxor %xmm0, %xmm0
+ mov (%rcx), %r9
+ mov %r9, (%rdx)
+/*
+ look if there is zero symbol in next 16 bytes of string
+ from rsi to rsi + 15 and form mask in xmm0
+*/
+ pcmpeqb (%rsi), %xmm0
+ mov 8(%rcx), %r9
+ mov %r9, 8(%rdx)
+
+/* convert byte mask in xmm0 to bit mask */
+
+ pmovmskb %xmm0, %rax
+ sub %rcx, %rsi
+
+/* rsi = 16 - rcx_shift */
+
+/* rax = 0: there isn't end of string from position rsi to rsi+15 */
+
+# ifdef USE_AS_STRNCPY
+ sub $32, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %rdx, %rax
+ lea 16(%rdx), %rdx
+/* Now:
+ rdx = rdx + 16 = alignment_16(rdx) + rdx_shift + 16
+*/
+ and $-16, %rdx
+
+/* Now: rdx = alignment_16(rdx) + 16 */
+
+ sub %rdx, %rax
+
+/* Now: rax = rdx_shift - 16 */
+
+# ifdef USE_AS_STRNCPY
+/* rsi = (16 - rcx_shift) + (rdx_shift - 16) - 1; only its bit 31 is
+   kept.  When that bit is clear, r8 gets an extra 16.  */
+ add %rax, %rsi
+ lea -1(%rsi), %rsi
+ and $1<<31, %esi
+ test %rsi, %rsi
+ jnz L(ContinueCopy)
+ lea 16(%r8), %r8
+
+L(ContinueCopy):
+# endif
+ sub %rax, %rcx
+/* Now:
+ case rcx_shift >= rdx_shift:
+ rcx = alignment_16(rcx) + (rcx_shift - rdx_shift) + 16
+ case rcx_shift < rdx_shift:
+ rcx = alignment_16(rcx) + (16 + rcx_shift - rdx_shift)
+*/
+ mov %rcx, %rax
+ and $0xf, %rax
+/* Now:
+ case rcx_shift >= rdx_shift: rax = rcx_shift - rdx_shift
+ case rcx_shift < rdx_shift: rax = (16 + rcx_shift - rdx_shift)
+ rax can be 0, 1, ..., 15
+*/
+/* mov (not xor) is used to clear rsi so that the flags set by the
+   `and' above are still intact for the jz below.  */
+ mov $0, %rsi
+
+/* case: rcx_shift == rdx_shift */
+
+ jz L(Align16Both)
+
+/* Linear dispatch on rax: 8..15 go through L(ShlHigh8), 1..6 are
+   matched explicitly and 7 is the fall-through default.  */
+ cmp $8, %rax
+ jae L(ShlHigh8)
+ cmp $1, %rax
+ je L(Shl1)
+ cmp $2, %rax
+ je L(Shl2)
+ cmp $3, %rax
+ je L(Shl3)
+ cmp $4, %rax
+ je L(Shl4)
+ cmp $5, %rax
+ je L(Shl5)
+ cmp $6, %rax
+ je L(Shl6)
+ jmp L(Shl7)
+
+/* Second half of the shift dispatch, reached with %rax in 8..15.
+   The first `je' still tests the flags of the `cmp $8, %rax' executed
+   right before the jump to this label; 9..14 are matched explicitly
+   and 15 is the default.  */
+L(ShlHigh8):
+ je L(Shl8)
+ cmp $9, %rax
+ je L(Shl9)
+ cmp $10, %rax
+ je L(Shl10)
+ cmp $11, %rax
+ je L(Shl11)
+ cmp $12, %rax
+ je L(Shl12)
+ cmp $13, %rax
+ je L(Shl13)
+ cmp $14, %rax
+ je L(Shl14)
+ jmp L(Shl15)
+
+/* Read and write pointers share the same 16-byte alignment, so
+   aligned loads and stores can be used directly.  Six unrolled steps:
+   each loads the next 16-byte chunk, stores the previous one, builds
+   the NUL mask of the new chunk in %rax and advances the offset %rsi
+   by 16.  A non-zero mask leaves through L(CopyFrom1To16Bytes); for
+   USE_AS_STRNCPY an exhausted count in %r8 leaves through
+   L(CopyFrom1To16BytesCase2OrCase3).  %xmm0 is all zero whenever a
+   step completes without finding a NUL.  If nothing is found, the
+   pointers are re-based on a 64-byte source boundary and execution
+   falls through into L(Aligned64Loop).  */
+L(Align16Both):
+ movaps (%rcx), %xmm1
+ movaps 16(%rcx), %xmm2
+ movaps %xmm1, (%rdx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm3
+ movaps %xmm2, (%rdx, %rsi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm4
+ movaps %xmm3, (%rdx, %rsi)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm1
+ movaps %xmm4, (%rdx, %rsi)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm2
+ movaps %xmm1, (%rdx, %rsi)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm3
+ movaps %xmm2, (%rdx, %rsi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%rdx, %rsi)
+/* rcx = (rcx + rsi + 16) rounded down to 64 bytes; rax = old rcx
+   minus new rcx; rdx is moved by the same rax so both pointers stay
+   in step, and r8 is re-biased for the 64-byte loop.  */
+ mov %rcx, %rax
+ lea 16(%rcx, %rsi), %rcx
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ sub %rax, %rdx
+# ifdef USE_AS_STRNCPY
+ lea 48+64(%r8, %rax), %r8
+# endif
+ mov $-0x40, %rsi
+
+/* Main loop for the mutually aligned case: 64 bytes per iteration.
+   The four chunks are kept in %xmm4..%xmm7; their byte-wise minimum
+   (pminub) is compared with %xmm0 (expected to be all zero here) so
+   that a single mask in %rax tells whether any of the 64 bytes is
+   NUL.  Both pointers are advanced before the tests, hence the
+   negative store offsets.  A NUL leaves through L(Aligned64Leave);
+   for USE_AS_STRNCPY an exhausted count leaves through
+   L(StrncpyLeaveCase2OrCase3).  */
+L(Aligned64Loop):
+ movaps (%rcx), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%rcx), %xmm5
+ movaps 32(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%rcx), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %rax
+ lea 64(%rdx), %rdx
+ lea 64(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeaveCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Aligned64Leave)
+ movaps %xmm4, -64(%rdx)
+ movaps %xmm5, -48(%rdx)
+ movaps %xmm6, -32(%rdx)
+ movaps %xmm7, -16(%rdx)
+ jmp L(Aligned64Loop)
+
+/* A NUL is somewhere in %xmm4..%xmm7.  Test the chunks in order,
+   storing each chunk that is NUL-free at its slot below %rdx and
+   advancing %rsi by 16 (and, for USE_AS_STRNCPY, lowering %r8 by 16)
+   per chunk skipped.  L(CopyFrom1To16Bytes) is entered with the mask
+   of the chunk containing the NUL in %rax; the last chunk needs no
+   test because the NUL must be in it.  */
+L(Aligned64Leave):
+# ifdef USE_AS_STRNCPY
+ lea 48(%r8), %r8
+# endif
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rax
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%r8), %r8
+# endif
+ pmovmskb %xmm0, %rax
+ movaps %xmm4, -64(%rdx)
+ test %rax, %rax
+/* lea does not modify flags, so jnz still sees the test result.  */
+ lea 16(%rsi), %rsi
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%r8), %r8
+# endif
+ pmovmskb %xmm0, %rax
+ movaps %xmm5, -48(%rdx)
+ test %rax, %rax
+ lea 16(%rsi), %rsi
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%rdx)
+ pcmpeqb %xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%r8), %r8
+# endif
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+/* Source is 1 byte past a 16-byte boundary relative to the aligned
+   write pointer %rdx (the dispatcher jumps here when %rcx & 0xf is 1).
+   %xmm1/%xmm2 hold the aligned source chunks at -1(%rcx) and
+   15(%rcx).  From L(Shl1Start) on, each of four unrolled steps tests
+   the newer chunk for NUL (mask in %rax; %xmm0 is expected to be all
+   zero on entry to each step) and, if none is found, uses
+   palignr $1 to build bytes 1..15 of the older chunk followed by
+   byte 0 of the newer one and stores that at (%rdx).  A NUL leaves
+   through L(Shl1LoopExit); for USE_AS_STRNCPY an exhausted count
+   leaves through L(StrncpyExit1Case2OrCase3).  */
+L(Shl1):
+ movaps -1(%rcx), %xmm1
+ movaps 15(%rcx), %xmm2
+L(Shl1Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 31(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 31(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 31(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 31(%rcx), %rcx
+ lea 16(%rdx), %rdx
+
+/* Re-base for the 64-byte loop: round the chunk address down to a
+   64-byte boundary, keep the distance in rax, move rdx (and r8 for
+   strncpy) by that distance, and reload the carry-in chunk xmm1.  */
+ mov %rcx, %rax
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ lea -15(%rcx), %rcx
+ sub %rax, %rdx
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8
+# endif
+ movaps -1(%rcx), %xmm1
+
+/* Shift-by-1 main loop, 64 source bytes per iteration.  Loads the
+   aligned chunks at 15/31/47/63(%rcx) into %xmm2..%xmm5 and tests all
+   of them for NUL at once (pminub, then pcmpeqb with %xmm0).  If a
+   NUL is present, L(Shl1Start) is re-entered with %xmm1/%xmm2 still
+   unmodified to finish 16 bytes at a time.  Otherwise the chunks are
+   merged pairwise with palignr $1, stored at (%rdx)..48(%rdx), and
+   the unshifted last chunk (saved in %xmm7) becomes the next %xmm1.  */
+L(Shl1LoopStart):
+ movaps 15(%rcx), %xmm2
+ movaps 31(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 47(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 63(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7
+ palignr $1, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $1, %xmm3, %xmm4
+ jnz L(Shl1Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave1)
+# endif
+ palignr $1, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl1LoopStart)
+
+/* A NUL was found in the newer chunk.  Write the 15 source bytes still
+   pending in %xmm1 (its bytes 1..15) to (%rdx)..14(%rdx): the current
+   destination contents are loaded, reduced by psrldq to byte 15 only,
+   and merged by palignr so that the aligned 16-byte store leaves
+   15(%rdx) unchanged.  %rsi is set to 15 before continuing at
+   L(CopyFrom1To16Bytes).  */
+L(Shl1LoopExit):
+ movaps (%rdx), %xmm6
+ psrldq $15, %xmm6
+ mov $15, %rsi
+ palignr $1, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+/* Source is 2 bytes past a 16-byte boundary relative to the aligned
+   write pointer %rdx (the dispatcher jumps here when %rcx & 0xf is 2).
+   %xmm1/%xmm2 hold the aligned source chunks at -2(%rcx) and
+   14(%rcx).  From L(Shl2Start) on, each of four unrolled steps tests
+   the newer chunk for NUL (mask in %rax; %xmm0 is expected to be all
+   zero on entry to each step) and, if none is found, merges the two
+   chunks with palignr $2 and stores 16 bytes at (%rdx).  A NUL leaves
+   through L(Shl2LoopExit); for USE_AS_STRNCPY an exhausted count
+   leaves through L(StrncpyExit2Case2OrCase3).  */
+L(Shl2):
+ movaps -2(%rcx), %xmm1
+ movaps 14(%rcx), %xmm2
+L(Shl2Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 30(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 30(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 30(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 30(%rcx), %rcx
+ lea 16(%rdx), %rdx
+
+/* Re-base for the 64-byte loop: round the chunk address down to a
+   64-byte boundary, keep the distance in rax, move rdx (and r8 for
+   strncpy) by that distance, and reload the carry-in chunk xmm1.  */
+ mov %rcx, %rax
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ lea -14(%rcx), %rcx
+ sub %rax, %rdx
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8
+# endif
+ movaps -2(%rcx), %xmm1
+
+/* Shift-by-2 main loop, 64 source bytes per iteration.  Loads the
+   aligned chunks at 14/30/46/62(%rcx) into %xmm2..%xmm5 and tests all
+   of them for NUL at once (pminub, then pcmpeqb with %xmm0).  If a
+   NUL is present, L(Shl2Start) is re-entered with %xmm1/%xmm2 still
+   unmodified.  Otherwise the chunks are merged pairwise with
+   palignr $2, stored at (%rdx)..48(%rdx), and the unshifted last
+   chunk (saved in %xmm7) becomes the next %xmm1.  */
+L(Shl2LoopStart):
+ movaps 14(%rcx), %xmm2
+ movaps 30(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 46(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 62(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7
+ palignr $2, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $2, %xmm3, %xmm4
+ jnz L(Shl2Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave2)
+# endif
+ palignr $2, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl2LoopStart)
+
+/* A NUL was found in the newer chunk.  Write the 14 source bytes still
+   pending in %xmm1 (its bytes 2..15) to (%rdx)..13(%rdx); the current
+   destination bytes 14..15 are read back (psrldq $14) and merged by
+   palignr so that the aligned 16-byte store leaves them unchanged.
+   %rsi is set to 14 before continuing at L(CopyFrom1To16Bytes).  */
+L(Shl2LoopExit):
+ movaps (%rdx), %xmm6
+ psrldq $14, %xmm6
+ mov $14, %rsi
+ palignr $2, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+/* Source is 3 bytes past a 16-byte boundary relative to the aligned
+   write pointer %rdx (the dispatcher jumps here when %rcx & 0xf is 3).
+   %xmm1/%xmm2 hold the aligned source chunks at -3(%rcx) and
+   13(%rcx).  From L(Shl3Start) on, each of four unrolled steps tests
+   the newer chunk for NUL (mask in %rax; %xmm0 is expected to be all
+   zero on entry to each step) and, if none is found, merges the two
+   chunks with palignr $3 and stores 16 bytes at (%rdx).  A NUL leaves
+   through L(Shl3LoopExit); for USE_AS_STRNCPY an exhausted count
+   leaves through L(StrncpyExit3Case2OrCase3).  */
+L(Shl3):
+ movaps -3(%rcx), %xmm1
+ movaps 13(%rcx), %xmm2
+L(Shl3Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 29(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 29(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 29(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 29(%rcx), %rcx
+ lea 16(%rdx), %rdx
+
+/* Re-base for the 64-byte loop: round the chunk address down to a
+   64-byte boundary, keep the distance in rax, move rdx (and r8 for
+   strncpy) by that distance, and reload the carry-in chunk xmm1.  */
+ mov %rcx, %rax
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ lea -13(%rcx), %rcx
+ sub %rax, %rdx
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8
+# endif
+ movaps -3(%rcx), %xmm1
+
+/* Shift-by-3 main loop, 64 source bytes per iteration.  Loads the
+   aligned chunks at 13/29/45/61(%rcx) into %xmm2..%xmm5 and tests all
+   of them for NUL at once (pminub, then pcmpeqb with %xmm0).  If a
+   NUL is present, L(Shl3Start) is re-entered with %xmm1/%xmm2 still
+   unmodified.  Otherwise the chunks are merged pairwise with
+   palignr $3, stored at (%rdx)..48(%rdx), and the unshifted last
+   chunk (saved in %xmm7) becomes the next %xmm1.  */
+L(Shl3LoopStart):
+ movaps 13(%rcx), %xmm2
+ movaps 29(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 45(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 61(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7
+ palignr $3, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $3, %xmm3, %xmm4
+ jnz L(Shl3Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave3)
+# endif
+ palignr $3, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl3LoopStart)
+
+/* A NUL was found in the newer chunk.  Write the 13 source bytes still
+   pending in %xmm1 (its bytes 3..15) to (%rdx)..12(%rdx); the current
+   destination bytes 13..15 are read back (psrldq $13) and merged by
+   palignr so that the aligned 16-byte store leaves them unchanged.
+   %rsi is set to 13 before continuing at L(CopyFrom1To16Bytes).  */
+L(Shl3LoopExit):
+ movaps (%rdx), %xmm6
+ psrldq $13, %xmm6
+ mov $13, %rsi
+ palignr $3, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+/* Source is 4 bytes past a 16-byte boundary relative to the aligned
+   write pointer %rdx (the dispatcher jumps here when %rcx & 0xf is 4).
+   %xmm1/%xmm2 hold the aligned source chunks at -4(%rcx) and
+   12(%rcx).  From L(Shl4Start) on, each of four unrolled steps tests
+   the newer chunk for NUL (mask in %rax; %xmm0 is expected to be all
+   zero on entry to each step) and, if none is found, merges the two
+   chunks with palignr $4 and stores 16 bytes at (%rdx).  A NUL leaves
+   through L(Shl4LoopExit); for USE_AS_STRNCPY an exhausted count
+   leaves through L(StrncpyExit4Case2OrCase3).  */
+L(Shl4):
+ movaps -4(%rcx), %xmm1
+ movaps 12(%rcx), %xmm2
+L(Shl4Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 28(%rcx), %rcx
+ lea 16(%rdx), %rdx
+
+/* Re-base for the 64-byte loop: round the chunk address down to a
+   64-byte boundary, keep the distance in rax, move rdx (and r8 for
+   strncpy) by that distance, and reload the carry-in chunk xmm1.  */
+ mov %rcx, %rax
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ lea -12(%rcx), %rcx
+ sub %rax, %rdx
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8
+# endif
+ movaps -4(%rcx), %xmm1
+
+/* Shift-by-4 main loop, 64 source bytes per iteration.  Loads the
+   aligned chunks at 12/28/44/60(%rcx) into %xmm2..%xmm5 and tests all
+   of them for NUL at once (pminub, then pcmpeqb with %xmm0).  If a
+   NUL is present, L(Shl4Start) is re-entered with %xmm1/%xmm2 still
+   unmodified.  Otherwise the chunks are merged pairwise with
+   palignr $4, stored at (%rdx)..48(%rdx), and the unshifted last
+   chunk (saved in %xmm7) becomes the next %xmm1.  */
+L(Shl4LoopStart):
+ movaps 12(%rcx), %xmm2
+ movaps 28(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $4, %xmm3, %xmm4
+ jnz L(Shl4Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave4)
+# endif
+ palignr $4, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl4LoopStart)
+
+/* A NUL was found in the newer chunk.  Write the 12 source bytes still
+   pending in %xmm1 (its bytes 4..15) to (%rdx)..11(%rdx); the current
+   destination bytes 12..15 are read back (psrldq $12) and merged by
+   palignr so that the aligned 16-byte store leaves them unchanged.
+   %rsi is set to 12 before continuing at L(CopyFrom1To16Bytes).  */
+L(Shl4LoopExit):
+ movaps (%rdx), %xmm6
+ psrldq $12, %xmm6
+ mov $12, %rsi
+ palignr $4, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+/* Source is 5 bytes past a 16-byte boundary relative to the aligned
+   write pointer %rdx (the dispatcher jumps here when %rcx & 0xf is 5).
+   %xmm1/%xmm2 hold the aligned source chunks at -5(%rcx) and
+   11(%rcx).  From L(Shl5Start) on, each of four unrolled steps tests
+   the newer chunk for NUL (mask in %rax; %xmm0 is expected to be all
+   zero on entry to each step) and, if none is found, merges the two
+   chunks with palignr $5 and stores 16 bytes at (%rdx).  A NUL leaves
+   through L(Shl5LoopExit); for USE_AS_STRNCPY an exhausted count
+   leaves through L(StrncpyExit5Case2OrCase3).  */
+L(Shl5):
+ movaps -5(%rcx), %xmm1
+ movaps 11(%rcx), %xmm2
+L(Shl5Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 27(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 27(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 27(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 27(%rcx), %rcx
+ lea 16(%rdx), %rdx
+
+/* Re-base for the 64-byte loop: round the chunk address down to a
+   64-byte boundary, keep the distance in rax, move rdx (and r8 for
+   strncpy) by that distance, and reload the carry-in chunk xmm1.  */
+ mov %rcx, %rax
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ lea -11(%rcx), %rcx
+ sub %rax, %rdx
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8
+# endif
+ movaps -5(%rcx), %xmm1
+
+/* Shift-by-5 main loop, 64 source bytes per iteration.  Loads the
+   aligned chunks at 11/27/43/59(%rcx) into %xmm2..%xmm5 and tests all
+   of them for NUL at once (pminub, then pcmpeqb with %xmm0).  If a
+   NUL is present, L(Shl5Start) is re-entered with %xmm1/%xmm2 still
+   unmodified.  Otherwise the chunks are merged pairwise with
+   palignr $5, stored at (%rdx)..48(%rdx), and the unshifted last
+   chunk (saved in %xmm7) becomes the next %xmm1.  */
+L(Shl5LoopStart):
+ movaps 11(%rcx), %xmm2
+ movaps 27(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 43(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 59(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7
+ palignr $5, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $5, %xmm3, %xmm4
+ jnz L(Shl5Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave5)
+# endif
+ palignr $5, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl5LoopStart)
+
+/* A NUL was found in the newer chunk.  Write the 11 source bytes still
+   pending in %xmm1 (its bytes 5..15) to (%rdx)..10(%rdx); the current
+   destination bytes 11..15 are read back (psrldq $11) and merged by
+   palignr so that the aligned 16-byte store leaves them unchanged.
+   %rsi is set to 11 before continuing at L(CopyFrom1To16Bytes).  */
+L(Shl5LoopExit):
+ movaps (%rdx), %xmm6
+ psrldq $11, %xmm6
+ mov $11, %rsi
+ palignr $5, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+/* Source is 6 bytes past a 16-byte boundary relative to the aligned
+   write pointer %rdx (the dispatcher jumps here when %rcx & 0xf is 6).
+   %xmm1/%xmm2 hold the aligned source chunks at -6(%rcx) and
+   10(%rcx).  From L(Shl6Start) on, each of four unrolled steps tests
+   the newer chunk for NUL (mask in %rax; %xmm0 is expected to be all
+   zero on entry to each step) and, if none is found, merges the two
+   chunks with palignr $6 and stores 16 bytes at (%rdx).  A NUL leaves
+   through L(Shl6LoopExit); for USE_AS_STRNCPY an exhausted count
+   leaves through L(StrncpyExit6Case2OrCase3).  */
+L(Shl6):
+ movaps -6(%rcx), %xmm1
+ movaps 10(%rcx), %xmm2
+L(Shl6Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 26(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 26(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 26(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 26(%rcx), %rcx
+ lea 16(%rdx), %rdx
+
+/* Re-base for the 64-byte loop: round the chunk address down to a
+   64-byte boundary, keep the distance in rax, move rdx (and r8 for
+   strncpy) by that distance, and reload the carry-in chunk xmm1.  */
+ mov %rcx, %rax
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ lea -10(%rcx), %rcx
+ sub %rax, %rdx
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8
+# endif
+ movaps -6(%rcx), %xmm1
+
+/* Shift-by-6 main loop, 64 source bytes per iteration.  Loads the
+   aligned chunks at 10/26/42/58(%rcx) into %xmm2..%xmm5 and tests all
+   of them for NUL at once (pminub, then pcmpeqb with %xmm0).  If a
+   NUL is present, L(Shl6Start) is re-entered with %xmm1/%xmm2 still
+   unmodified.  Otherwise the chunks are merged pairwise with
+   palignr $6, stored at (%rdx)..48(%rdx), and the unshifted last
+   chunk (saved in %xmm7) becomes the next %xmm1.  */
+L(Shl6LoopStart):
+ movaps 10(%rcx), %xmm2
+ movaps 26(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 42(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 58(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7
+ palignr $6, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $6, %xmm3, %xmm4
+ jnz L(Shl6Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave6)
+# endif
+ palignr $6, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl6LoopStart)
+
+/* A NUL was found in the newer chunk.  Write the 10 source bytes still
+   pending in %xmm1 (its bytes 6..15) to (%rdx)..9(%rdx); the current
+   destination bytes 10..15 are read back (psrldq $10) and merged by
+   palignr so that the aligned 16-byte store leaves them unchanged.
+   %rsi is set to 10 before continuing at L(CopyFrom1To16Bytes).  */
+L(Shl6LoopExit):
+ movaps (%rdx), %xmm6
+ psrldq $10, %xmm6
+ mov $10, %rsi
+ palignr $6, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+/* Source is 7 bytes past a 16-byte boundary relative to the aligned
+   write pointer %rdx (the dispatcher falls through to here when
+   %rcx & 0xf is 7).  %xmm1/%xmm2 hold the aligned source chunks at
+   -7(%rcx) and 9(%rcx).  From L(Shl7Start) on, each of four unrolled
+   steps tests the newer chunk for NUL (mask in %rax; %xmm0 is
+   expected to be all zero on entry to each step) and, if none is
+   found, merges the two chunks with palignr $7 and stores 16 bytes
+   at (%rdx).  A NUL leaves through L(Shl7LoopExit); for
+   USE_AS_STRNCPY an exhausted count leaves through
+   L(StrncpyExit7Case2OrCase3).  */
+L(Shl7):
+ movaps -7(%rcx), %xmm1
+ movaps 9(%rcx), %xmm2
+L(Shl7Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 25(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 25(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 25(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 25(%rcx), %rcx
+ lea 16(%rdx), %rdx
+
+/* Re-base for the 64-byte loop: round the chunk address down to a
+   64-byte boundary, keep the distance in rax, move rdx (and r8 for
+   strncpy) by that distance, and reload the carry-in chunk xmm1.  */
+ mov %rcx, %rax
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ lea -9(%rcx), %rcx
+ sub %rax, %rdx
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8
+# endif
+ movaps -7(%rcx), %xmm1
+
+/* Shift-by-7 main loop, 64 source bytes per iteration.  Loads the
+   aligned chunks at 9/25/41/57(%rcx) into %xmm2..%xmm5 and tests all
+   of them for NUL at once (pminub, then pcmpeqb with %xmm0).  If a
+   NUL is present, L(Shl7Start) is re-entered with %xmm1/%xmm2 still
+   unmodified.  Otherwise the chunks are merged pairwise with
+   palignr $7, stored at (%rdx)..48(%rdx), and the unshifted last
+   chunk (saved in %xmm7) becomes the next %xmm1.  */
+L(Shl7LoopStart):
+ movaps 9(%rcx), %xmm2
+ movaps 25(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 41(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 57(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7
+ palignr $7, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $7, %xmm3, %xmm4
+ jnz L(Shl7Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave7)
+# endif
+ palignr $7, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl7LoopStart)
+
+/* A NUL was found in the newer chunk.  Write the 9 source bytes still
+   pending in %xmm1 (its bytes 7..15) to (%rdx)..8(%rdx); the current
+   destination bytes 9..15 are read back (psrldq $9) and merged by
+   palignr so that the aligned 16-byte store leaves them unchanged.
+   %rsi is set to 9 before continuing at L(CopyFrom1To16Bytes).  */
+L(Shl7LoopExit):
+ movaps (%rdx), %xmm6
+ psrldq $9, %xmm6
+ mov $9, %rsi
+ palignr $7, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+/* Source is 8 bytes past a 16-byte boundary relative to the aligned
+   write pointer %rdx (L(ShlHigh8) jumps here when %rcx & 0xf is 8).
+   %xmm1/%xmm2 hold the aligned source chunks at -8(%rcx) and
+   8(%rcx).  From L(Shl8Start) on, each of four unrolled steps tests
+   the newer chunk for NUL (mask in %rax; %xmm0 is expected to be all
+   zero on entry to each step) and, if none is found, merges the two
+   chunks with palignr $8 and stores 16 bytes at (%rdx).  A NUL leaves
+   through L(Shl8LoopExit); for USE_AS_STRNCPY an exhausted count
+   leaves through L(StrncpyExit8Case2OrCase3).  */
+L(Shl8):
+ movaps -8(%rcx), %xmm1
+ movaps 8(%rcx), %xmm2
+L(Shl8Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 24(%rcx), %rcx
+ lea 16(%rdx), %rdx
+
+/* Re-base for the 64-byte loop: round the chunk address down to a
+   64-byte boundary, keep the distance in rax, move rdx (and r8 for
+   strncpy) by that distance, and reload the carry-in chunk xmm1.  */
+ mov %rcx, %rax
+ and $-0x40, %rcx
+ sub %rcx, %rax
+ lea -8(%rcx), %rcx
+ sub %rax, %rdx
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8
+# endif
+ movaps -8(%rcx), %xmm1
+
+L(Shl8LoopStart):
+ movaps 8(%rcx), %xmm2
+ movaps 24(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $8, %xmm3, %xmm4
+ jnz L(Shl8Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave8)
+# endif
+ palignr $8, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+ movaps (%rdx), %xmm6
+ psrldq $8, %xmm6
+ mov $8, %rsi
+ palignr $8, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl9): /* Copy path for rcx == 9 (mod 16): movaps needs 16-byte-aligned addresses, so whole aligned vectors are loaded and every 16 output bytes are spliced together with palignr $9.  */
+ movaps -9(%rcx), %xmm1 /* xmm1 = aligned vector that contains the byte at rcx */
+ movaps 7(%rcx), %xmm2 /* xmm2 = the aligned vector after it */
+L(Shl9Start): /* Also entered from L(Shl9LoopStart) with xmm1/xmm2 already loaded.  */
+ pcmpeqb %xmm2, %xmm0 /* per-byte xmm2 == xmm0; xmm0 is presumably all-zero on entry, making this a NUL test -- TODO confirm */
+ pmovmskb %xmm0, %rax /* rax = one bit per matching byte; when rax == 0, xmm0 is left all-zero */
+ movaps %xmm2, %xmm3 /* keep the unshifted vector; it becomes the next xmm1 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8 /* r8 looks like the remaining length budget -- TODO confirm against the entry code */
+ jbe L(StrncpyExit9Case2OrCase3) /* budget ends within this vector */
+# endif
+ test %rax, %rax
+ jnz L(Shl9LoopExit) /* a byte of xmm2 matched: finish byte-wise */
+
+ palignr $9, %xmm1, %xmm2 /* xmm2 = bytes 9..15 of xmm1 followed by bytes 0..8 of xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 23(%rcx), %xmm2 /* next aligned source vector */
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 2 of 4: same test/splice/store pattern */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 23(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 23(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 23(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax /* Re-base for the 64-byte loop: round the load address down to a 64-byte boundary */
+ and $-0x40, %rcx
+ sub %rcx, %rax /* rax = number of bytes backed up by the rounding */
+ lea -7(%rcx), %rcx /* restore the 7-byte bias used by the displacements below */
+ sub %rax, %rdx /* move the destination back by the same amount; those bytes are stored again by the loop */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give the same amount back to the budget */
+# endif
+ movaps -9(%rcx), %xmm1 /* vector preceding the first 64-byte block */
+
+L(Shl9LoopStart): /* Main loop: 64 source bytes per iteration.  */
+ movaps 7(%rcx), %xmm2
+ movaps 23(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 39(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 55(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* fold the four vectors with unsigned byte minimum ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... so xmm7 has a zero byte iff any of the four has one */
+ pcmpeqb %xmm0, %xmm7 /* compare the minimum against xmm0 (presumably zero -- TODO confirm) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, next iteration's xmm1 */
+ palignr $9, %xmm4, %xmm5
+ test %rax, %rax /* palignr does not modify flags, so the jnz below still sees this result */
+ palignr $9, %xmm3, %xmm4
+ jnz L(Shl9Start) /* match somewhere in the block: redo it 16 bytes at a time (xmm1/xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave9) /* budget ends within this 64-byte block */
+# endif
+ palignr $9, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl9LoopStart)
+
+L(Shl9LoopExit): /* Match found in xmm2: first flush the 7 source bytes still pending in xmm1.  */
+ movaps (%rdx), %xmm6 /* current 16 destination bytes */
+ psrldq $7, %xmm6 /* bring destination bytes 7..15 down to bytes 0..8 */
+ mov $7, %rsi /* rsi = bytes flushed here; L(CopyFrom1To16Bytes) adds it to rcx and rdx */
+ palignr $9, %xmm1, %xmm6 /* low 7 bytes = bytes 9..15 of xmm1, high 9 bytes = preserved destination bytes */
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl10): /* Copy path for rcx == 10 (mod 16): movaps needs 16-byte-aligned addresses, so whole aligned vectors are loaded and every 16 output bytes are spliced together with palignr $10.  */
+ movaps -10(%rcx), %xmm1 /* xmm1 = aligned vector that contains the byte at rcx */
+ movaps 6(%rcx), %xmm2 /* xmm2 = the aligned vector after it */
+L(Shl10Start): /* Also entered from L(Shl10LoopStart) with xmm1/xmm2 already loaded.  */
+ pcmpeqb %xmm2, %xmm0 /* per-byte xmm2 == xmm0; xmm0 is presumably all-zero on entry, making this a NUL test -- TODO confirm */
+ pmovmskb %xmm0, %rax /* rax = one bit per matching byte; when rax == 0, xmm0 is left all-zero */
+ movaps %xmm2, %xmm3 /* keep the unshifted vector; it becomes the next xmm1 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8 /* r8 looks like the remaining length budget -- TODO confirm against the entry code */
+ jbe L(StrncpyExit10Case2OrCase3) /* budget ends within this vector */
+# endif
+ test %rax, %rax
+ jnz L(Shl10LoopExit) /* a byte of xmm2 matched: finish byte-wise */
+
+ palignr $10, %xmm1, %xmm2 /* xmm2 = bytes 10..15 of xmm1 followed by bytes 0..9 of xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 22(%rcx), %xmm2 /* next aligned source vector */
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 2 of 4: same test/splice/store pattern */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 22(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 22(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 22(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax /* Re-base for the 64-byte loop: round the load address down to a 64-byte boundary */
+ and $-0x40, %rcx
+ sub %rcx, %rax /* rax = number of bytes backed up by the rounding */
+ lea -6(%rcx), %rcx /* restore the 6-byte bias used by the displacements below */
+ sub %rax, %rdx /* move the destination back by the same amount; those bytes are stored again by the loop */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give the same amount back to the budget */
+# endif
+ movaps -10(%rcx), %xmm1 /* vector preceding the first 64-byte block */
+
+L(Shl10LoopStart): /* Main loop: 64 source bytes per iteration.  */
+ movaps 6(%rcx), %xmm2
+ movaps 22(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 38(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 54(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* fold the four vectors with unsigned byte minimum ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... so xmm7 has a zero byte iff any of the four has one */
+ pcmpeqb %xmm0, %xmm7 /* compare the minimum against xmm0 (presumably zero -- TODO confirm) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, next iteration's xmm1 */
+ palignr $10, %xmm4, %xmm5
+ test %rax, %rax /* palignr does not modify flags, so the jnz below still sees this result */
+ palignr $10, %xmm3, %xmm4
+ jnz L(Shl10Start) /* match somewhere in the block: redo it 16 bytes at a time (xmm1/xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave10) /* budget ends within this 64-byte block */
+# endif
+ palignr $10, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl10LoopStart)
+
+L(Shl10LoopExit): /* Match found in xmm2: first flush the 6 source bytes still pending in xmm1.  */
+ movaps (%rdx), %xmm6 /* current 16 destination bytes */
+ psrldq $6, %xmm6 /* bring destination bytes 6..15 down to bytes 0..9 */
+ mov $6, %rsi /* rsi = bytes flushed here; L(CopyFrom1To16Bytes) adds it to rcx and rdx */
+ palignr $10, %xmm1, %xmm6 /* low 6 bytes = bytes 10..15 of xmm1, high 10 bytes = preserved destination bytes */
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl11): /* Copy path for rcx == 11 (mod 16): movaps needs 16-byte-aligned addresses, so whole aligned vectors are loaded and every 16 output bytes are spliced together with palignr $11.  */
+ movaps -11(%rcx), %xmm1 /* xmm1 = aligned vector that contains the byte at rcx */
+ movaps 5(%rcx), %xmm2 /* xmm2 = the aligned vector after it */
+L(Shl11Start): /* Also entered from L(Shl11LoopStart) with xmm1/xmm2 already loaded.  */
+ pcmpeqb %xmm2, %xmm0 /* per-byte xmm2 == xmm0; xmm0 is presumably all-zero on entry, making this a NUL test -- TODO confirm */
+ pmovmskb %xmm0, %rax /* rax = one bit per matching byte; when rax == 0, xmm0 is left all-zero */
+ movaps %xmm2, %xmm3 /* keep the unshifted vector; it becomes the next xmm1 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8 /* r8 looks like the remaining length budget -- TODO confirm against the entry code */
+ jbe L(StrncpyExit11Case2OrCase3) /* budget ends within this vector */
+# endif
+ test %rax, %rax
+ jnz L(Shl11LoopExit) /* a byte of xmm2 matched: finish byte-wise */
+
+ palignr $11, %xmm1, %xmm2 /* xmm2 = bytes 11..15 of xmm1 followed by bytes 0..10 of xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 21(%rcx), %xmm2 /* next aligned source vector */
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 2 of 4: same test/splice/store pattern */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 21(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 21(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 21(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax /* Re-base for the 64-byte loop: round the load address down to a 64-byte boundary */
+ and $-0x40, %rcx
+ sub %rcx, %rax /* rax = number of bytes backed up by the rounding */
+ lea -5(%rcx), %rcx /* restore the 5-byte bias used by the displacements below */
+ sub %rax, %rdx /* move the destination back by the same amount; those bytes are stored again by the loop */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give the same amount back to the budget */
+# endif
+ movaps -11(%rcx), %xmm1 /* vector preceding the first 64-byte block */
+
+L(Shl11LoopStart): /* Main loop: 64 source bytes per iteration.  */
+ movaps 5(%rcx), %xmm2
+ movaps 21(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 37(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 53(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* fold the four vectors with unsigned byte minimum ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... so xmm7 has a zero byte iff any of the four has one */
+ pcmpeqb %xmm0, %xmm7 /* compare the minimum against xmm0 (presumably zero -- TODO confirm) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, next iteration's xmm1 */
+ palignr $11, %xmm4, %xmm5
+ test %rax, %rax /* palignr does not modify flags, so the jnz below still sees this result */
+ palignr $11, %xmm3, %xmm4
+ jnz L(Shl11Start) /* match somewhere in the block: redo it 16 bytes at a time (xmm1/xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave11) /* budget ends within this 64-byte block */
+# endif
+ palignr $11, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl11LoopStart)
+
+L(Shl11LoopExit): /* Match found in xmm2: first flush the 5 source bytes still pending in xmm1.  */
+ movaps (%rdx), %xmm6 /* current 16 destination bytes */
+ psrldq $5, %xmm6 /* bring destination bytes 5..15 down to bytes 0..10 */
+ mov $5, %rsi /* rsi = bytes flushed here; L(CopyFrom1To16Bytes) adds it to rcx and rdx */
+ palignr $11, %xmm1, %xmm6 /* low 5 bytes = bytes 11..15 of xmm1, high 11 bytes = preserved destination bytes */
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl12): /* Copy path for rcx == 12 (mod 16): movaps needs 16-byte-aligned addresses, so whole aligned vectors are loaded and every 16 output bytes are spliced together with palignr $12.  */
+ movaps -12(%rcx), %xmm1 /* xmm1 = aligned vector that contains the byte at rcx */
+ movaps 4(%rcx), %xmm2 /* xmm2 = the aligned vector after it */
+L(Shl12Start): /* Also entered from L(Shl12LoopStart) with xmm1/xmm2 already loaded.  */
+ pcmpeqb %xmm2, %xmm0 /* per-byte xmm2 == xmm0; xmm0 is presumably all-zero on entry, making this a NUL test -- TODO confirm */
+ pmovmskb %xmm0, %rax /* rax = one bit per matching byte; when rax == 0, xmm0 is left all-zero */
+ movaps %xmm2, %xmm3 /* keep the unshifted vector; it becomes the next xmm1 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8 /* r8 looks like the remaining length budget -- TODO confirm against the entry code */
+ jbe L(StrncpyExit12Case2OrCase3) /* budget ends within this vector */
+# endif
+ test %rax, %rax
+ jnz L(Shl12LoopExit) /* a byte of xmm2 matched: finish byte-wise */
+
+ palignr $12, %xmm1, %xmm2 /* xmm2 = bytes 12..15 of xmm1 followed by bytes 0..11 of xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 20(%rcx), %xmm2 /* next aligned source vector */
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 2 of 4: same test/splice/store pattern */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 20(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 20(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 20(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax /* Re-base for the 64-byte loop: round the load address down to a 64-byte boundary */
+ and $-0x40, %rcx
+ sub %rcx, %rax /* rax = number of bytes backed up by the rounding */
+ lea -4(%rcx), %rcx /* restore the 4-byte bias used by the displacements below */
+ sub %rax, %rdx /* move the destination back by the same amount; those bytes are stored again by the loop */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give the same amount back to the budget */
+# endif
+ movaps -12(%rcx), %xmm1 /* vector preceding the first 64-byte block */
+
+L(Shl12LoopStart): /* Main loop: 64 source bytes per iteration.  */
+ movaps 4(%rcx), %xmm2
+ movaps 20(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* fold the four vectors with unsigned byte minimum ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... so xmm7 has a zero byte iff any of the four has one */
+ pcmpeqb %xmm0, %xmm7 /* compare the minimum against xmm0 (presumably zero -- TODO confirm) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, next iteration's xmm1 */
+ palignr $12, %xmm4, %xmm5
+ test %rax, %rax /* palignr does not modify flags, so the jnz below still sees this result */
+ palignr $12, %xmm3, %xmm4
+ jnz L(Shl12Start) /* match somewhere in the block: redo it 16 bytes at a time (xmm1/xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave12) /* budget ends within this 64-byte block */
+# endif
+ palignr $12, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit): /* Match found in xmm2: first flush the 4 source bytes still pending in xmm1.  */
+ movaps (%rdx), %xmm6 /* current 16 destination bytes */
+ psrldq $4, %xmm6 /* bring destination bytes 4..15 down to bytes 0..11 */
+ mov $4, %rsi /* rsi = bytes flushed here; L(CopyFrom1To16Bytes) adds it to rcx and rdx */
+ palignr $12, %xmm1, %xmm6 /* low 4 bytes = bytes 12..15 of xmm1, high 12 bytes = preserved destination bytes */
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl13): /* Copy path for rcx == 13 (mod 16): movaps needs 16-byte-aligned addresses, so whole aligned vectors are loaded and every 16 output bytes are spliced together with palignr $13.  */
+ movaps -13(%rcx), %xmm1 /* xmm1 = aligned vector that contains the byte at rcx */
+ movaps 3(%rcx), %xmm2 /* xmm2 = the aligned vector after it */
+L(Shl13Start): /* Also entered from L(Shl13LoopStart) with xmm1/xmm2 already loaded.  */
+ pcmpeqb %xmm2, %xmm0 /* per-byte xmm2 == xmm0; xmm0 is presumably all-zero on entry, making this a NUL test -- TODO confirm */
+ pmovmskb %xmm0, %rax /* rax = one bit per matching byte; when rax == 0, xmm0 is left all-zero */
+ movaps %xmm2, %xmm3 /* keep the unshifted vector; it becomes the next xmm1 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8 /* r8 looks like the remaining length budget -- TODO confirm against the entry code */
+ jbe L(StrncpyExit13Case2OrCase3) /* budget ends within this vector */
+# endif
+ test %rax, %rax
+ jnz L(Shl13LoopExit) /* a byte of xmm2 matched: finish byte-wise */
+
+ palignr $13, %xmm1, %xmm2 /* xmm2 = bytes 13..15 of xmm1 followed by bytes 0..12 of xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 19(%rcx), %xmm2 /* next aligned source vector */
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 2 of 4: same test/splice/store pattern */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 19(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 19(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 19(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax /* Re-base for the 64-byte loop: round the load address down to a 64-byte boundary */
+ and $-0x40, %rcx
+ sub %rcx, %rax /* rax = number of bytes backed up by the rounding */
+ lea -3(%rcx), %rcx /* restore the 3-byte bias used by the displacements below */
+ sub %rax, %rdx /* move the destination back by the same amount; those bytes are stored again by the loop */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give the same amount back to the budget */
+# endif
+ movaps -13(%rcx), %xmm1 /* vector preceding the first 64-byte block */
+
+L(Shl13LoopStart): /* Main loop: 64 source bytes per iteration.  */
+ movaps 3(%rcx), %xmm2
+ movaps 19(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 35(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 51(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* fold the four vectors with unsigned byte minimum ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... so xmm7 has a zero byte iff any of the four has one */
+ pcmpeqb %xmm0, %xmm7 /* compare the minimum against xmm0 (presumably zero -- TODO confirm) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, next iteration's xmm1 */
+ palignr $13, %xmm4, %xmm5
+ test %rax, %rax /* palignr does not modify flags, so the jnz below still sees this result */
+ palignr $13, %xmm3, %xmm4
+ jnz L(Shl13Start) /* match somewhere in the block: redo it 16 bytes at a time (xmm1/xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave13) /* budget ends within this 64-byte block */
+# endif
+ palignr $13, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl13LoopStart)
+
+L(Shl13LoopExit): /* Match found in xmm2: first flush the 3 source bytes still pending in xmm1.  */
+ movaps (%rdx), %xmm6 /* current 16 destination bytes */
+ psrldq $3, %xmm6 /* bring destination bytes 3..15 down to bytes 0..12 */
+ mov $3, %rsi /* rsi = bytes flushed here; L(CopyFrom1To16Bytes) adds it to rcx and rdx */
+ palignr $13, %xmm1, %xmm6 /* low 3 bytes = bytes 13..15 of xmm1, high 13 bytes = preserved destination bytes */
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl14): /* Copy path for rcx == 14 (mod 16): movaps needs 16-byte-aligned addresses, so whole aligned vectors are loaded and every 16 output bytes are spliced together with palignr $14.  */
+ movaps -14(%rcx), %xmm1 /* xmm1 = aligned vector that contains the byte at rcx */
+ movaps 2(%rcx), %xmm2 /* xmm2 = the aligned vector after it */
+L(Shl14Start): /* Also entered from L(Shl14LoopStart) with xmm1/xmm2 already loaded.  */
+ pcmpeqb %xmm2, %xmm0 /* per-byte xmm2 == xmm0; xmm0 is presumably all-zero on entry, making this a NUL test -- TODO confirm */
+ pmovmskb %xmm0, %rax /* rax = one bit per matching byte; when rax == 0, xmm0 is left all-zero */
+ movaps %xmm2, %xmm3 /* keep the unshifted vector; it becomes the next xmm1 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8 /* r8 looks like the remaining length budget -- TODO confirm against the entry code */
+ jbe L(StrncpyExit14Case2OrCase3) /* budget ends within this vector */
+# endif
+ test %rax, %rax
+ jnz L(Shl14LoopExit) /* a byte of xmm2 matched: finish byte-wise */
+
+ palignr $14, %xmm1, %xmm2 /* xmm2 = bytes 14..15 of xmm1 followed by bytes 0..13 of xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 18(%rcx), %xmm2 /* next aligned source vector */
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 2 of 4: same test/splice/store pattern */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 18(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 18(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 18(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax /* Re-base for the 64-byte loop: round the load address down to a 64-byte boundary */
+ and $-0x40, %rcx
+ sub %rcx, %rax /* rax = number of bytes backed up by the rounding */
+ lea -2(%rcx), %rcx /* restore the 2-byte bias used by the displacements below */
+ sub %rax, %rdx /* move the destination back by the same amount; those bytes are stored again by the loop */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give the same amount back to the budget */
+# endif
+ movaps -14(%rcx), %xmm1 /* vector preceding the first 64-byte block */
+
+L(Shl14LoopStart): /* Main loop: 64 source bytes per iteration.  */
+ movaps 2(%rcx), %xmm2
+ movaps 18(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 34(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 50(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* fold the four vectors with unsigned byte minimum ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... so xmm7 has a zero byte iff any of the four has one */
+ pcmpeqb %xmm0, %xmm7 /* compare the minimum against xmm0 (presumably zero -- TODO confirm) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, next iteration's xmm1 */
+ palignr $14, %xmm4, %xmm5
+ test %rax, %rax /* palignr does not modify flags, so the jnz below still sees this result */
+ palignr $14, %xmm3, %xmm4
+ jnz L(Shl14Start) /* match somewhere in the block: redo it 16 bytes at a time (xmm1/xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave14) /* budget ends within this 64-byte block */
+# endif
+ palignr $14, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl14LoopStart)
+
+L(Shl14LoopExit): /* Match found in xmm2: first flush the 2 source bytes still pending in xmm1.  */
+ movaps (%rdx), %xmm6 /* current 16 destination bytes */
+ psrldq $2, %xmm6 /* bring destination bytes 2..15 down to bytes 0..13 */
+ mov $2, %rsi /* rsi = bytes flushed here; L(CopyFrom1To16Bytes) adds it to rcx and rdx */
+ palignr $14, %xmm1, %xmm6 /* low 2 bytes = bytes 14..15 of xmm1, high 14 bytes = preserved destination bytes */
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl15): /* Copy path for rcx == 15 (mod 16): movaps needs 16-byte-aligned addresses, so whole aligned vectors are loaded and every 16 output bytes are spliced together with palignr $15.  */
+ movaps -15(%rcx), %xmm1 /* xmm1 = aligned vector that contains the byte at rcx */
+ movaps 1(%rcx), %xmm2 /* xmm2 = the aligned vector after it */
+L(Shl15Start): /* Also entered from L(Shl15LoopStart) with xmm1/xmm2 already loaded.  */
+ pcmpeqb %xmm2, %xmm0 /* per-byte xmm2 == xmm0; xmm0 is presumably all-zero on entry, making this a NUL test -- TODO confirm */
+ pmovmskb %xmm0, %rax /* rax = one bit per matching byte; when rax == 0, xmm0 is left all-zero */
+ movaps %xmm2, %xmm3 /* keep the unshifted vector; it becomes the next xmm1 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8 /* r8 looks like the remaining length budget -- TODO confirm against the entry code */
+ jbe L(StrncpyExit15Case2OrCase3) /* budget ends within this vector */
+# endif
+ test %rax, %rax
+ jnz L(Shl15LoopExit) /* a byte of xmm2 matched: finish byte-wise */
+
+ palignr $15, %xmm1, %xmm2 /* xmm2 = byte 15 of xmm1 followed by bytes 0..14 of xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 17(%rcx), %xmm2 /* next aligned source vector */
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 2 of 4: same test/splice/store pattern */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 17(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 17(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* unrolled step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 17(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax /* Re-base for the 64-byte loop: round the load address down to a 64-byte boundary */
+ and $-0x40, %rcx
+ sub %rcx, %rax /* rax = number of bytes backed up by the rounding */
+ lea -1(%rcx), %rcx /* restore the 1-byte bias used by the displacements below */
+ sub %rax, %rdx /* move the destination back by the same amount; those bytes are stored again by the loop */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give the same amount back to the budget */
+# endif
+ movaps -15(%rcx), %xmm1 /* vector preceding the first 64-byte block */
+
+L(Shl15LoopStart): /* Main loop: 64 source bytes per iteration.  */
+ movaps 1(%rcx), %xmm2
+ movaps 17(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 33(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 49(%rcx), %xmm5
+ pminub %xmm2, %xmm6 /* fold the four vectors with unsigned byte minimum ... */
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* ... so xmm7 has a zero byte iff any of the four has one */
+ pcmpeqb %xmm0, %xmm7 /* compare the minimum against xmm0 (presumably zero -- TODO confirm) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, next iteration's xmm1 */
+ palignr $15, %xmm4, %xmm5
+ test %rax, %rax /* palignr does not modify flags, so the jnz below still sees this result */
+ palignr $15, %xmm3, %xmm4
+ jnz L(Shl15Start) /* match somewhere in the block: redo it 16 bytes at a time (xmm1/xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave15) /* budget ends within this 64-byte block */
+# endif
+ palignr $15, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl15LoopStart)
+
+L(Shl15LoopExit): /* Match found in xmm2: first flush the 1 source byte still pending in xmm1.  */
+ movaps (%rdx), %xmm6 /* current 16 destination bytes */
+ psrldq $1, %xmm6 /* bring destination bytes 1..15 down to bytes 0..14 */
+ mov $1, %rsi /* rsi = bytes flushed here; L(CopyFrom1To16Bytes) adds it to rcx and rdx */
+ palignr $15, %xmm1, %xmm6 /* low byte = byte 15 of xmm1, high 15 bytes = preserved destination bytes */
+ movaps %xmm6, (%rdx)
+# ifdef USE_AS_STRCAT
+ jmp L(CopyFrom1To16Bytes) /* without USE_AS_STRCAT, execution falls through the alignment padding into L(CopyFrom1To16Bytes) below */
+# endif
+
+
+ .p2align 4
+L(CopyFrom1To16Bytes): /* Final partial copy: rax holds a byte-match mask for the 16 bytes at rcx + rsi; copy up to and including the first matching byte.  */
+# ifdef USE_AS_STRNCPY
+ add $16, %r8 /* undo the 16-byte debit made before jumping here (see the `sub $16, %r8' in the L(ShlNStart) blocks) */
+# endif
+ add %rsi, %rdx /* skip the rsi bytes the caller has already stored */
+ add %rsi, %rcx
+
+ test %al, %al
+ jz L(ExitHigh) /* no match in bytes 0..7: look at mask bits 8..15 */
+ test $0x01, %al /* dispatch on the lowest set bit: bit k selects L(Exit<k+1>) */
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7) /* otherwise only bit 7 can be set: fall through the padding into L(Exit8) */
+
+ .p2align 4
+L(Exit8): /* Copy exactly 8 bytes from (%rcx) to (%rdx) and return.  */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $8, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 8(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(ExitHigh): /* Continuation of L(CopyFrom1To16Bytes) when mask bits 0..7 are clear: dispatch on bits 8..15 held in %ah.  */
+ test $0x01, %ah /* bit 8+k selects L(Exit<9+k>) */
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15) /* otherwise fall through the padding into L(Exit16) */
+
+ .p2align 4
+L(Exit16): /* Copy exactly 16 bytes from (%rcx) to (%rdx) as two 8-byte moves and return.  */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 8(%rcx), %rax
+ mov %rax, 8(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 15(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 16(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+# ifdef USE_AS_STRNCPY
+
+ .p2align 4
+ /* strncpy tail, "case 2": the current 16-byte vector contains a
+    matching byte (rax != 0) and the length budget also ends inside it.
+    On entry rax is the byte-match mask, rsi the number of bytes to skip
+    in both buffers, and r8 the budget after the caller's 16-byte debit.
+    Copies min (index of first match + 1, r8) bytes via L(ExitN).
+    The dispatch assumes 1 <= r8 <= 16 after the add below
+    -- TODO confirm for all callers.  */
+L(CopyFrom1To16BytesCase2):
+ add $16, %r8 /* undo the caller's 16-byte debit */
+ add %rsi, %rcx
+ lea (%rsi, %rdx), %rsi /* rsi = advanced destination; rdx is used as scratch next */
+ lea -9(%r8), %rdx /* negative iff r8 < 9 */
+ and $1<<7, %dh /* dh = 0x80 iff r8 < 9 (bit 15 of r8 - 9), else 0 */
+ or %al, %dh /* ZF = 1 iff r8 >= 9 and no match in bytes 0..7 */
+ lea (%rsi), %rdx /* restore rdx; lea keeps the flags set by `or', so no extra test is needed */
+ jz L(ExitHighCase2)
+
+ cmp $1, %r8 /* for k = 1..7: stop at k bytes if the budget is k ... */
+ je L(Exit1)
+ test $0x01, %al /* ... or if byte k-1 matched */
+ jnz L(Exit1)
+ cmp $2, %r8
+ je L(Exit2)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $3, %r8
+ je L(Exit3)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $4, %r8
+ je L(Exit4)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $5, %r8
+ je L(Exit5)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $6, %r8
+ je L(Exit6)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $7, %r8
+ je L(Exit7)
+ test $0x40, %al
+ jnz L(Exit7)
+ jmp L(Exit8) /* remaining possibilities: budget of 8, or match in byte 7 */
+
+ .p2align 4
+L(ExitHighCase2): /* Continuation of L(CopyFrom1To16BytesCase2) when r8 >= 9 and mask bits 0..7 are clear: stop at whichever comes first, the budget r8 or the first match in %ah.  */
+ cmp $9, %r8 /* for k = 9..15: stop at k bytes if the budget is k ... */
+ je L(Exit9)
+ test $0x01, %ah /* ... or if byte k-1 matched */
+ jnz L(Exit9)
+ cmp $10, %r8
+ je L(Exit10)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $11, %r8
+ je L(Exit11)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $12, %r8
+ je L(Exit12)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $13, %r8
+ je L(Exit13)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $14, %r8
+ je L(Exit14)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $15, %r8
+ je L(Exit15)
+ test $0x40, %ah
+ jnz L(Exit15)
+ jmp L(Exit16) /* neither a smaller budget nor an earlier match: copy all 16 */
+
+L(CopyFrom1To16BytesCase2OrCase3): /* strncpy tail selector: rax != 0 means a byte matched in the current vector (case 2), rax == 0 means only the budget ran out (case 3).  */
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3): /* Budget ends in this vector and no byte matched: copy exactly r8 bytes.  */
+ add $16, %r8 /* undo the 16-byte debit made before jumping here */
+ add %rsi, %rdx /* skip the rsi bytes already handled */
+ add %rsi, %rcx
+
+ cmp $16, %r8 /* branch tree on r8; the paths cover 1..16 (values below 2 go to L(Exit1)) */
+ je L(Exit16)
+ cmp $8, %r8
+ je L(Exit8)
+ jg L(More8Case3)
+ cmp $4, %r8
+ je L(Exit4)
+ jg L(More4Case3)
+ cmp $2, %r8
+ jl L(Exit1)
+ je L(Exit2)
+ jg L(Exit3)
+L(More8Case3): /* but less than 16 */
+ cmp $12, %r8
+ je L(Exit12)
+ jl L(Less12Case3)
+ cmp $14, %r8
+ jl L(Exit13)
+ je L(Exit14)
+ jg L(Exit15)
+L(More4Case3): /* but less than 8 */
+ cmp $6, %r8
+ jl L(Exit5)
+ je L(Exit6)
+ jg L(Exit7)
+L(Less12Case3): /* but more than 8 */
+ cmp $10, %r8
+ jl L(Exit9)
+ je L(Exit10)
+ jg L(Exit11)
+# endif
+
+ .p2align 4
+L(Exit1): /* Copy exactly 1 byte from (%rcx) to (%rdx) and return.  */
+ movb (%rcx), %al
+ movb %al, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea (%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $1, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 1(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit2): /* Copy exactly 2 bytes from (%rcx) to (%rdx) and return.  */
+ movw (%rcx), %ax
+ movw %ax, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $2, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 2(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit3): /* Copy exactly 3 bytes from (%rcx) to (%rdx) (2-byte move + 1-byte move) and return.  */
+ movw (%rcx), %ax
+ movw %ax, (%rdx)
+ movb 2(%rcx), %al
+ movb %al, 2(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $3, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 3(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit4): /* Copy exactly 4 bytes from (%rcx) to (%rdx) and return.  */
+ movl (%rcx), %eax
+ movl %eax, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $4, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 4(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit5): /* Copy exactly 5 bytes from (%rcx) to (%rdx) (4-byte move + 1-byte move) and return.  */
+ movl (%rcx), %eax
+ movl %eax, (%rdx)
+ movb 4(%rcx), %al
+ movb %al, 4(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 4(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $5, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 5(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit6): /* Copy exactly 6 bytes from (%rcx) to (%rdx) (4-byte move + 2-byte move) and return.  */
+ movl (%rcx), %eax
+ movl %eax, (%rdx)
+ movw 4(%rcx), %ax
+ movw %ax, 4(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 5(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $6, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 6(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit7): /* Copy exactly 7 bytes from (%rcx) to (%rdx) and return.  */
+ movl (%rcx), %eax /* bytes 0..3 */
+ movl %eax, (%rdx)
+ movl 3(%rcx), %eax /* bytes 3..6; overlaps the first move by one byte */
+ movl %eax, 3(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 6(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $7, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 7(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit9): /* Copy exactly 9 bytes from (%rcx) to (%rdx) and return.  */
+ mov (%rcx), %rax /* bytes 0..7 */
+ mov %rax, (%rdx)
+ mov 5(%rcx), %eax /* bytes 5..8; overlaps the first move */
+ mov %eax, 5(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 8(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $9, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 9(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit10): /* Copy exactly 10 bytes from (%rcx) to (%rdx) and return.  */
+ mov (%rcx), %rax /* bytes 0..7 */
+ mov %rax, (%rdx)
+ mov 6(%rcx), %eax /* bytes 6..9; overlaps the first move */
+ mov %eax, 6(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 9(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $10, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 10(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit11): /* Copy exactly 11 bytes from (%rcx) to (%rdx) and return.  */
+ mov (%rcx), %rax /* bytes 0..7 */
+ mov %rax, (%rdx)
+ mov 7(%rcx), %eax /* bytes 7..10; overlaps the first move */
+ mov %eax, 7(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 10(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $11, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 11(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit12): /* Copy exactly 12 bytes from (%rcx) to (%rdx) (8-byte move + 4-byte move) and return.  */
+ mov (%rcx), %rax /* bytes 0..7 */
+ mov %rax, (%rdx)
+ mov 8(%rcx), %eax /* bytes 8..11 */
+ mov %eax, 8(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 11(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $12, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 12(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit13): /* Copy exactly 13 bytes from (%rcx) to (%rdx) and return.  */
+ mov (%rcx), %rax /* bytes 0..7 */
+ mov %rax, (%rdx)
+ mov 5(%rcx), %rax /* bytes 5..12; overlaps the first move */
+ mov %rax, 5(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 12(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $13, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 13(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit14): /* Copy exactly 14 bytes from (%rcx) to (%rdx) and return.  */
+ mov (%rcx), %rax /* bytes 0..7 */
+ mov %rax, (%rdx)
+ mov 6(%rcx), %rax /* bytes 6..13; overlaps the first move */
+ mov %rax, 6(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 13(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $14, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 14(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit15): /* Copy exactly 15 bytes from (%rcx) to (%rdx) and return.  */
+ mov (%rcx), %rax /* bytes 0..7 */
+ mov %rax, (%rdx)
+ mov 7(%rcx), %rax /* bytes 7..14; overlaps the first move by one byte */
+ mov %rax, 7(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdx), %rax /* return value: address of the last byte written */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $15, %r8 /* r8 -= bytes written; the jnz below tests this result (lea leaves flags untouched) */
+ lea 15(%rdx), %rcx /* rcx = first byte after the copied data */
+ jnz L(StrncpyFillTailWithZero1) /* r8 != 0: zero-fill r8 more bytes starting at rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is 0 */
+ sbb $-1, %rax /* rax += 1 - CF: step past the last byte unless it is a NUL */
+# endif
+# endif
+ ret
+
+# ifdef USE_AS_STRNCPY
+ .p2align 4
+L(Fill0): /* L(Fill0)..L(Fill16): store N bytes taken from %rdx at (%rcx) and return.  The jumps visible in this file arrive with rdx == 0 (see L(StrncpyFillTailWithZero1)), so these write N zero bytes.  None of them touches %rax.  */
+ ret
+
+ .p2align 4
+L(Fill1):
+ movb %dl, (%rcx)
+ ret
+
+ .p2align 4
+L(Fill2):
+ movw %dx, (%rcx)
+ ret
+
+ .p2align 4
+L(Fill3):
+ movw %dx, (%rcx)
+ movb %dl, 2(%rcx)
+ ret
+
+ .p2align 4
+L(Fill4):
+ movl %edx, (%rcx)
+ ret
+
+ .p2align 4
+L(Fill5):
+ movl %edx, (%rcx)
+ movb %dl, 4(%rcx)
+ ret
+
+ .p2align 4
+L(Fill6):
+ movl %edx, (%rcx)
+ movw %dx, 4(%rcx)
+ ret
+
+ .p2align 4
+L(Fill7):
+ movl %edx, (%rcx) /* bytes 0..3 */
+ movl %edx, 3(%rcx) /* bytes 3..6; overlapping store */
+ ret
+
+ .p2align 4
+L(Fill8):
+ mov %rdx, (%rcx)
+ ret
+
+ .p2align 4
+L(Fill9):
+ mov %rdx, (%rcx)
+ movb %dl, 8(%rcx)
+ ret
+
+ .p2align 4
+L(Fill10):
+ mov %rdx, (%rcx)
+ movw %dx, 8(%rcx)
+ ret
+
+ .p2align 4
+L(Fill11):
+ mov %rdx, (%rcx) /* bytes 0..7 */
+ movl %edx, 7(%rcx) /* bytes 7..10; overlapping store */
+ ret
+
+ .p2align 4
+L(Fill12):
+ mov %rdx, (%rcx)
+ movl %edx, 8(%rcx)
+ ret
+
+ .p2align 4
+L(Fill13):
+ mov %rdx, (%rcx) /* bytes 0..7 */
+ mov %rdx, 5(%rcx) /* bytes 5..12; overlapping store */
+ ret
+
+ .p2align 4
+L(Fill14):
+ mov %rdx, (%rcx) /* bytes 0..7 */
+ mov %rdx, 6(%rcx) /* bytes 6..13; overlapping store */
+ ret
+
+ .p2align 4
+L(Fill15):
+ mov %rdx, (%rcx) /* bytes 0..7 */
+ mov %rdx, 7(%rcx) /* bytes 7..14; overlapping store */
+ ret
+
+ .p2align 4
+L(Fill16):
+ mov %rdx, (%rcx)
+ mov %rdx, 8(%rcx)
+ ret
+
+ .p2align 4
+L(StrncpyFillExit1): /* Entered after a 16-byte debit of r8 went to zero or below: add the 16 back, then dispatch.  */
+ lea 16(%r8), %r8
+L(FillFrom1To16Bytes): /* Jump to L(Fill<r8>); the branch tree covers r8 = 0..16.  */
+ test %r8, %r8
+ jz L(Fill0)
+ cmp $16, %r8
+ je L(Fill16)
+ cmp $8, %r8
+ je L(Fill8)
+ jg L(FillMore8)
+ cmp $4, %r8
+ je L(Fill4)
+ jg L(FillMore4)
+ cmp $2, %r8
+ jl L(Fill1)
+ je L(Fill2)
+ jg L(Fill3)
+L(FillMore8): /* but less than 16 */
+ cmp $12, %r8
+ je L(Fill12)
+ jl L(FillLess12)
+ cmp $14, %r8
+ jl L(Fill13)
+ je L(Fill14)
+ jg L(Fill15)
+L(FillMore4): /* but less than 8 */
+ cmp $6, %r8
+ jl L(Fill5)
+ je L(Fill6)
+ jg L(Fill7)
+L(FillLess12): /* but more than 8 */
+ cmp $10, %r8
+ jl L(Fill9)
+ je L(Fill10)
+ jmp L(Fill11)
+
+ .p2align 4
+L(StrncpyFillTailWithZero1): /* Write r8 zero bytes starting at rcx, then return via one of the L(FillN) stubs.  %rax is not modified here, so the return value set by the jumping L(ExitN) code survives.  */
+ xor %rdx, %rdx /* rdx = 0: the value stored by the scalar moves and the L(FillN) stubs */
+ sub $16, %r8
+ jbe L(StrncpyFillExit1) /* 16 bytes or fewer: one L(FillN) stub does it all */
+
+ pxor %xmm0, %xmm0 /* xmm0 = 0 for the vector stores */
+ mov %rdx, (%rcx) /* 16 zero bytes with unaligned-safe scalar stores */
+ mov %rdx, 8(%rcx)
+
+ lea 16(%rcx), %rcx
+
+ mov %rcx, %rdx
+ and $0xf, %rdx /* rdx = rcx mod 16 */
+ sub %rdx, %rcx /* round rcx down to a 16-byte boundary so movdqa can be used ... */
+ add %rdx, %r8 /* ... and count the re-covered bytes again */
+ xor %rdx, %rdx /* rdx back to 0 */
+ sub $64, %r8
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa): /* 64 zero bytes per iteration with aligned stores */
+ movdqa %xmm0, (%rcx)
+ movdqa %xmm0, 16(%rcx)
+ movdqa %xmm0, 32(%rcx)
+ movdqa %xmm0, 48(%rcx)
+ lea 64(%rcx), %rcx
+ sub $64, %r8
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64): /* r8 is 64 below the byte count still to write */
+ add $32, %r8
+ jl L(StrncpyFillLess32) /* fewer than 32 bytes left */
+ movdqa %xmm0, (%rcx)
+ movdqa %xmm0, 16(%rcx)
+ lea 32(%rcx), %rcx
+ sub $16, %r8
+ jl L(StrncpyFillExit1) /* fewer than 16 left; L(StrncpyFillExit1) adds the 16 back */
+ movdqa %xmm0, (%rcx)
+ lea 16(%rcx), %rcx
+ jmp L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32): /* r8 is 32 below the byte count still to write */
+ add $16, %r8
+ jl L(StrncpyFillExit1) /* fewer than 16 left; L(StrncpyFillExit1) adds the 16 back */
+ movdqa %xmm0, (%rcx)
+ lea 16(%rcx), %rcx
+ jmp L(FillFrom1To16Bytes)
+
+ .p2align 4
+L(Exit0): /* Return %rdx without copying anything; presumably the zero-length strncpy case, with rdx still equal to the destination -- TODO confirm against the entry code.  */
+ mov %rdx, %rax
+ ret
+
+ .p2align 4
+L(StrncpyExit15Bytes): /* Short strncpy: stop at whichever comes first, the budget r8 or a NUL among source bytes 8..13.  Bytes 0..7 are not examined here; presumably the caller has already checked them and 9 <= r8 <= 15 -- TODO confirm.  */
+ cmp $9, %r8 /* for k = 9..14: copy k bytes if the budget is k ... */
+ je L(Exit9)
+ cmpb $0, 8(%rcx) /* ... or if source byte k-1 is NUL */
+ jz L(Exit9)
+ cmp $10, %r8
+ je L(Exit10)
+ cmpb $0, 9(%rcx)
+ jz L(Exit10)
+ cmp $11, %r8
+ je L(Exit11)
+ cmpb $0, 10(%rcx)
+ jz L(Exit11)
+ cmp $12, %r8
+ je L(Exit12)
+ cmpb $0, 11(%rcx)
+ jz L(Exit12)
+ cmp $13, %r8
+ je L(Exit13)
+ cmpb $0, 12(%rcx)
+ jz L(Exit13)
+ cmp $14, %r8
+ je L(Exit14)
+ cmpb $0, 13(%rcx)
+ jz L(Exit14)
+ mov (%rcx), %rax /* none of the above: copy 15 bytes (bytes 0..7, then the overlapping bytes 7..14) */
+ mov %rax, (%rdx)
+ mov 7(%rcx), %rax
+ mov %rax, 7(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdx), %rax /* address of the last byte written */
+ cmpb $1, (%rax) /* CF = 1 iff that byte is 0 */
+ sbb $-1, %rax /* rax += 1 - CF */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+ ret /* this path neither updates r8 nor zero-fills */
+
+ .p2align 4
+L(StrncpyExit8Bytes): /* Short strncpy: stop at whichever comes first, the budget r8 or a NUL among source bytes 0..6.  Presumably entered with 1 <= r8 <= 8 -- TODO confirm against the entry code.  */
+ cmp $1, %r8 /* for k = 1..7: copy k bytes if the budget is k ... */
+ je L(Exit1)
+ cmpb $0, (%rcx) /* ... or if source byte k-1 is NUL */
+ jz L(Exit1)
+ cmp $2, %r8
+ je L(Exit2)
+ cmpb $0, 1(%rcx)
+ jz L(Exit2)
+ cmp $3, %r8
+ je L(Exit3)
+ cmpb $0, 2(%rcx)
+ jz L(Exit3)
+ cmp $4, %r8
+ je L(Exit4)
+ cmpb $0, 3(%rcx)
+ jz L(Exit4)
+ cmp $5, %r8
+ je L(Exit5)
+ cmpb $0, 4(%rcx)
+ jz L(Exit5)
+ cmp $6, %r8
+ je L(Exit6)
+ cmpb $0, 5(%rcx)
+ jz L(Exit6)
+ cmp $7, %r8
+ je L(Exit7)
+ cmpb $0, 6(%rcx)
+ jz L(Exit7)
+ mov (%rcx), %rax /* none of the above: copy 8 bytes */
+ mov %rax, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdx), %rax /* address of the last byte written */
+ cmpb $1, (%rax) /* CF = 1 iff that byte is 0 */
+ sbb $-1, %rax /* rax += 1 - CF */
+# else
+ mov %rdi, %rax /* return value: rdi (presumably the caller's destination pointer -- TODO confirm) */
+# endif
+ ret /* this path neither updates r8 nor zero-fills */
+
+# endif
+
+# ifdef USE_AS_STRNCPY
+
+L(StrncpyLeaveCase2OrCase3): /* strncpy leave path for a 64-byte step held in xmm4..xmm7 (presumably the aligned 64-byte loop, which is outside this excerpt -- TODO confirm): rax != 0 selects case 2, rax == 0 case 3.  */
+ test %rax, %rax
+ jnz L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3): /* Budget ends in this 64-byte block and rax == 0: store whole vectors while more than 16 bytes of budget remain, then finish with L(CopyFrom1To16BytesCase3).  */
+ lea 64(%r8), %r8 /* presumably undoes a 64-byte debit made by the loop -- TODO confirm */
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase3) /* budget ends inside the first vector */
+ movaps %xmm4, -64(%rdx) /* the stores are relative to rdx - 64 */
+ lea 16(%rsi), %rsi /* rsi tracks the offset that L(CopyFrom1To16BytesCase3) adds to rcx and rdx */
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm5, -48(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm6, -32(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* debit without touching flags; Case3 adds the 16 back */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2): /* Budget ends in this 64-byte block and the block contains a match: test xmm4..xmm7 one vector at a time, storing each vector that is fully consumed.  */
+ pcmpeqb %xmm4, %xmm0 /* per-byte xmm4 == xmm0; xmm0 is presumably all-zero on entry -- TODO confirm */
+ pmovmskb %xmm0, %rax
+ add $48, %r8 /* presumably +64 to undo the loop's debit and -16 for this vector -- TODO confirm */
+ jle L(CopyFrom1To16BytesCase2OrCase3) /* budget ends inside this vector */
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes) /* match inside this vector, budget not exhausted */
+
+ pcmpeqb %xmm5, %xmm0 /* no match above, so xmm0 is all-zero again */
+ pmovmskb %xmm0, %rax
+ movaps %xmm4, -64(%rdx) /* the stores are relative to rdx - 64 */
+ lea 16(%rsi), %rsi /* rsi tracks the offset that the CopyFrom1To16Bytes* code adds to rcx and rdx */
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm5, -48(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm6, -32(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* debit without touching flags; Case2 adds the 16 back */
+ jmp L(CopyFrom1To16BytesCase2) /* last vector: taken unconditionally, without testing rax */
+/*--------------------------------------------------*/
+L(StrncpyExit1Case2OrCase3): /* strncpy exit for the palignr-$1 path: flush the 15 source bytes pending in xmm1, then finish by mask (rax, presumably set by the jumping code -- TODO confirm) and budget.  */
+ movaps (%rdx), %xmm6 /* current 16 destination bytes */
+ psrldq $15, %xmm6 /* bring destination byte 15 down to byte 0 */
+ mov $15, %rsi /* rsi = bytes flushed here; the Case2/Case3 code adds it to rcx and rdx */
+ palignr $1, %xmm1, %xmm6 /* low 15 bytes = bytes 1..15 of xmm1, high byte = preserved destination byte */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* rax == 0 */
+
+L(StrncpyExit2Case2OrCase3): /* Merge the upper 14 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 2 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $14, %xmm6 /* move its top 2 bytes down to the bottom */
+ mov $14, %rsi /* %rsi = 14, the number of bytes taken from %xmm1 */
+ palignr $2, %xmm1, %xmm6 /* low 14 bytes = %xmm1[2..15]; top 2 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit3Case2OrCase3): /* Merge the upper 13 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 3 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $13, %xmm6 /* move its top 3 bytes down to the bottom */
+ mov $13, %rsi /* %rsi = 13, the number of bytes taken from %xmm1 */
+ palignr $3, %xmm1, %xmm6 /* low 13 bytes = %xmm1[3..15]; top 3 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit4Case2OrCase3): /* Merge the upper 12 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 4 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $12, %xmm6 /* move its top 4 bytes down to the bottom */
+ mov $12, %rsi /* %rsi = 12, the number of bytes taken from %xmm1 */
+ palignr $4, %xmm1, %xmm6 /* low 12 bytes = %xmm1[4..15]; top 4 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit5Case2OrCase3): /* Merge the upper 11 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 5 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $11, %xmm6 /* move its top 5 bytes down to the bottom */
+ mov $11, %rsi /* %rsi = 11, the number of bytes taken from %xmm1 */
+ palignr $5, %xmm1, %xmm6 /* low 11 bytes = %xmm1[5..15]; top 5 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit6Case2OrCase3): /* Merge the upper 10 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 6 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $10, %xmm6 /* move its top 6 bytes down to the bottom */
+ mov $10, %rsi /* %rsi = 10, the number of bytes taken from %xmm1 */
+ palignr $6, %xmm1, %xmm6 /* low 10 bytes = %xmm1[6..15]; top 6 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit7Case2OrCase3): /* Merge the upper 9 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 7 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $9, %xmm6 /* move its top 7 bytes down to the bottom */
+ mov $9, %rsi /* %rsi = 9, the number of bytes taken from %xmm1 */
+ palignr $7, %xmm1, %xmm6 /* low 9 bytes = %xmm1[7..15]; top 7 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit8Case2OrCase3): /* Merge the upper 8 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 8 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $8, %xmm6 /* move its top 8 bytes down to the bottom */
+ mov $8, %rsi /* %rsi = 8, the number of bytes taken from %xmm1 */
+ palignr $8, %xmm1, %xmm6 /* low 8 bytes = %xmm1[8..15]; top 8 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit9Case2OrCase3): /* Merge the upper 7 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 9 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $7, %xmm6 /* move its top 9 bytes down to the bottom */
+ mov $7, %rsi /* %rsi = 7, the number of bytes taken from %xmm1 */
+ palignr $9, %xmm1, %xmm6 /* low 7 bytes = %xmm1[9..15]; top 9 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit10Case2OrCase3): /* Merge the upper 6 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 10 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $6, %xmm6 /* move its top 10 bytes down to the bottom */
+ mov $6, %rsi /* %rsi = 6, the number of bytes taken from %xmm1 */
+ palignr $10, %xmm1, %xmm6 /* low 6 bytes = %xmm1[10..15]; top 10 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit11Case2OrCase3): /* Merge the upper 5 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 11 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $5, %xmm6 /* move its top 11 bytes down to the bottom */
+ mov $5, %rsi /* %rsi = 5, the number of bytes taken from %xmm1 */
+ palignr $11, %xmm1, %xmm6 /* low 5 bytes = %xmm1[11..15]; top 11 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit12Case2OrCase3): /* Merge the upper 4 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 12 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $4, %xmm6 /* move its top 12 bytes down to the bottom */
+ mov $4, %rsi /* %rsi = 4, the number of bytes taken from %xmm1 */
+ palignr $12, %xmm1, %xmm6 /* low 4 bytes = %xmm1[12..15]; top 12 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit13Case2OrCase3): /* Merge the upper 3 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 13 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $3, %xmm6 /* move its top 13 bytes down to the bottom */
+ mov $3, %rsi /* %rsi = 3, the number of bytes taken from %xmm1 */
+ palignr $13, %xmm1, %xmm6 /* low 3 bytes = %xmm1[13..15]; top 13 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit14Case2OrCase3): /* Merge the upper 2 bytes of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 14 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $2, %xmm6 /* move its top 14 bytes down to the bottom */
+ mov $2, %rsi /* %rsi = 2, the number of bytes taken from %xmm1 */
+ palignr $14, %xmm1, %xmm6 /* low 2 bytes = %xmm1[14..15]; top 14 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyExit15Case2OrCase3): /* Merge the top byte of %xmm1 into the 16 bytes at (%rdx), keeping that chunk's last 15 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6 /* current destination chunk */
+ psrldq $1, %xmm6 /* move its top 15 bytes down to the bottom */
+ mov $1, %rsi /* %rsi = 1, the number of bytes taken from %xmm1 */
+ palignr $15, %xmm1, %xmm6 /* low byte = %xmm1[15]; top 15 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 */
+ jmp L(CopyFrom1To16BytesCase3) /* %rax == 0 */
+
+L(StrncpyLeave1): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit1. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit1) /* signed: %r8 + 48 <= 0 */
+ palignr $1, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 1 byte */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 31(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 31 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit1) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 31+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit1)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit1 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit1)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit1): /* Merge the upper 15 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last byte. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $15, %xmm6 /* move its top byte down to the bottom */
+ palignr $1, %xmm1, %xmm6 /* low 15 bytes = %xmm1[1..15]; top byte = the saved destination byte */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 15(%rsi), %rsi /* %rsi += 15, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit2. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit2) /* signed: %r8 + 48 <= 0 */
+ palignr $2, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 2 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 30(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 30 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit2) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 30+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit2)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit2 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit2)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit2): /* Merge the upper 14 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 2 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $14, %xmm6 /* move its top 2 bytes down to the bottom */
+ palignr $2, %xmm1, %xmm6 /* low 14 bytes = %xmm1[2..15]; top 2 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 14(%rsi), %rsi /* %rsi += 14, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit3. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit3) /* signed: %r8 + 48 <= 0 */
+ palignr $3, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 3 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 29(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 29 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit3) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 29+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit3)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit3 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit3)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit3): /* Merge the upper 13 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 3 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $13, %xmm6 /* move its top 3 bytes down to the bottom */
+ palignr $3, %xmm1, %xmm6 /* low 13 bytes = %xmm1[3..15]; top 3 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 13(%rsi), %rsi /* %rsi += 13, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit4. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit4) /* signed: %r8 + 48 <= 0 */
+ palignr $4, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 4 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 28 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit4) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 28+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit4)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit4 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit4)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit4): /* Merge the upper 12 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 4 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $12, %xmm6 /* move its top 4 bytes down to the bottom */
+ palignr $4, %xmm1, %xmm6 /* low 12 bytes = %xmm1[4..15]; top 4 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 12(%rsi), %rsi /* %rsi += 12, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit5. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit5) /* signed: %r8 + 48 <= 0 */
+ palignr $5, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 5 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 27(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 27 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit5) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 27+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit5)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit5 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit5)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit5): /* Merge the upper 11 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 5 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $11, %xmm6 /* move its top 5 bytes down to the bottom */
+ palignr $5, %xmm1, %xmm6 /* low 11 bytes = %xmm1[5..15]; top 5 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 11(%rsi), %rsi /* %rsi += 11, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit6. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit6) /* signed: %r8 + 48 <= 0 */
+ palignr $6, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 6 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 26(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 26 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit6) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 26+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit6)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit6 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit6)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit6): /* Merge the upper 10 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 6 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $10, %xmm6 /* move its top 6 bytes down to the bottom */
+ palignr $6, %xmm1, %xmm6 /* low 10 bytes = %xmm1[6..15]; top 6 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 10(%rsi), %rsi /* %rsi += 10, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit7. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit7) /* signed: %r8 + 48 <= 0 */
+ palignr $7, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 7 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 25(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 25 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit7) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 25+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit7)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit7 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit7)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit7): /* Merge the upper 9 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 7 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $9, %xmm6 /* move its top 7 bytes down to the bottom */
+ palignr $7, %xmm1, %xmm6 /* low 9 bytes = %xmm1[7..15]; top 7 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 9(%rsi), %rsi /* %rsi += 9, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit8. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit8) /* signed: %r8 + 48 <= 0 */
+ palignr $8, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 8 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 24 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit8) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 24+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit8)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit8 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit8)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit8): /* Merge the upper 8 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 8 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $8, %xmm6 /* move its top 8 bytes down to the bottom */
+ palignr $8, %xmm1, %xmm6 /* low 8 bytes = %xmm1[8..15]; top 8 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 8(%rsi), %rsi /* %rsi += 8, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit9. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit9) /* signed: %r8 + 48 <= 0 */
+ palignr $9, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 9 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 23(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 23 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit9) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 23+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit9)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit9 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit9)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit9): /* Merge the upper 7 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 9 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $7, %xmm6 /* move its top 9 bytes down to the bottom */
+ palignr $9, %xmm1, %xmm6 /* low 7 bytes = %xmm1[9..15]; top 9 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 7(%rsi), %rsi /* %rsi += 7, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit10. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit10) /* signed: %r8 + 48 <= 0 */
+ palignr $10, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 10 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 22(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 22 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit10) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 22+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit10)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit10 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit10)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit10): /* Merge the upper 6 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 10 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $6, %xmm6 /* move its top 10 bytes down to the bottom */
+ palignr $10, %xmm1, %xmm6 /* low 6 bytes = %xmm1[10..15]; top 10 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 6(%rsi), %rsi /* %rsi += 6, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit11. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit11) /* signed: %r8 + 48 <= 0 */
+ palignr $11, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 11 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 21(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 21 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit11) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 21+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit11)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit11 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit11)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit11): /* Merge the upper 5 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 11 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $5, %xmm6 /* move its top 11 bytes down to the bottom */
+ palignr $11, %xmm1, %xmm6 /* low 5 bytes = %xmm1[11..15]; top 11 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 5(%rsi), %rsi /* %rsi += 5, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit12. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit12) /* signed: %r8 + 48 <= 0 */
+ palignr $12, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 12 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 20(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 20 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit12) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 20+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit12)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit12 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit12)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit12): /* Merge the upper 4 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 12 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $4, %xmm6 /* move its top 12 bytes down to the bottom */
+ palignr $12, %xmm1, %xmm6 /* low 4 bytes = %xmm1[12..15]; top 12 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 4(%rsi), %rsi /* %rsi += 4, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit13. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit13) /* signed: %r8 + 48 <= 0 */
+ palignr $13, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 13 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 19(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 19 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit13) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 19+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit13)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit13 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit13)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit13): /* Merge the upper 3 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 13 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $3, %xmm6 /* move its top 13 bytes down to the bottom */
+ palignr $13, %xmm1, %xmm6 /* low 3 bytes = %xmm1[13..15]; top 13 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 3(%rsi), %rsi /* %rsi += 3, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit14. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit14) /* signed: %r8 + 48 <= 0 */
+ palignr $14, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 14 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 18(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 18 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit14) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 18+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit14)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit14 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit14)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit14): /* Merge the upper 2 bytes of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 14 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $2, %xmm6 /* move its top 14 bytes down to the bottom */
+ palignr $14, %xmm1, %xmm6 /* low 2 bytes = %xmm1[14..15]; top 14 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 2(%rsi), %rsi /* %rsi += 2, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15): /* Store up to four more 16-byte chunks at (%rdx)..48(%rdx) while the count in %r8 lasts, then fall into StrncpyExit15. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy; palignr below overwrites %xmm2 */
+ add $48, %r8
+ jle L(StrncpyExit15) /* signed: %r8 + 48 <= 0 */
+ palignr $15, %xmm1, %xmm2 /* %xmm2 = low 16 bytes of (%xmm2:%xmm1) shifted right by 15 bytes */
+ movaps %xmm3, %xmm1 /* unshifted chunk becomes the low half for the next palignr */
+ movaps %xmm2, (%rdx)
+ movaps 17(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx + 17 must be 16-byte aligned */
+ lea 16(%rsi), %rsi /* %rsi += 16; lea leaves the flags alone */
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit15) /* taken when %r8 was <= 16 (unsigned) before the sub */
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 17+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit15)
+ movaps %xmm2, %xmm1 /* newest loaded chunk feeds the palignr in StrncpyExit15 */
+ movaps %xmm4, 32(%rdx) /* NOTE(review): %xmm4/%xmm5 presumably already hold realigned data from the caller -- confirm */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit15)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* no flags needed: execution falls straight through */
+
+L(StrncpyExit15): /* Merge the top byte of %xmm1 into the 16 bytes at (%rdx,%rsi), keeping that chunk's last 15 bytes. */
+ movaps (%rdx, %rsi), %xmm6 /* current destination chunk */
+ psrldq $1, %xmm6 /* move its top 15 bytes down to the bottom */
+ palignr $15, %xmm1, %xmm6 /* low byte = %xmm1[15]; top 15 bytes = the saved destination bytes */
+ movaps %xmm6, (%rdx, %rsi)
+ lea 1(%rsi), %rsi /* %rsi += 1, the number of bytes taken from %xmm1 */
+ jmp L(CopyFrom1To16BytesCase3)
+# endif
+
+END (STRCPY)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
index 02fa8d0..381060f 100644
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -1,5 +1,5 @@
-/* strcpy with SSSE3
- Copyright (C) 2009 Free Software Foundation, Inc.
+/* Multiple versions of strcpy
+ Copyright (C) 2009, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -29,30 +29,32 @@
#ifdef USE_AS_STPCPY
# ifdef USE_AS_STRNCPY
-# define STRCPY_SSSE3 __stpncpy_ssse3
-# define STRCPY_SSE2 __stpncpy_sse2
-# define __GI_STRCPY __GI_stpncpy
+# define STRCPY_SSSE3 __stpncpy_ssse3
+# define STRCPY_SSE2 __stpncpy_sse2
+# define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned
+# define __GI_STRCPY __GI_stpncpy
+# define __GI___STRCPY __GI___stpncpy
# else
-# define STRCPY_SSSE3 __stpcpy_ssse3
-# define STRCPY_SSE2 __stpcpy_sse2
-# define __GI_STRCPY __GI_stpcpy
-# define __GI___STRCPY __GI___stpcpy
+# define STRCPY_SSSE3 __stpcpy_ssse3
+# define STRCPY_SSE2 __stpcpy_sse2
+# define STRCPY_SSE2_UNALIGNED __stpcpy_sse2_unaligned
+# define __GI_STRCPY __GI_stpcpy
+# define __GI___STRCPY __GI___stpcpy
# endif
#else
# ifdef USE_AS_STRNCPY
-# define STRCPY_SSSE3 __strncpy_ssse3
-# define STRCPY_SSE2 __strncpy_sse2
-# define __GI_STRCPY __GI_strncpy
+# define STRCPY_SSSE3 __strncpy_ssse3
+# define STRCPY_SSE2 __strncpy_sse2
+# define STRCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned
+# define __GI_STRCPY __GI_strncpy
# else
-# define STRCPY_SSSE3 __strcpy_ssse3
-# define STRCPY_SSE2 __strcpy_sse2
-# define __GI_STRCPY __GI_strcpy
+# define STRCPY_SSSE3 __strcpy_ssse3
+# define STRCPY_SSE2 __strcpy_sse2
+# define STRCPY_SSE2_UNALIGNED __strcpy_sse2_unaligned
+# define __GI_STRCPY __GI_strcpy
# endif
#endif
-#ifndef LABEL
-#define LABEL(l) L(l)
-#endif
/* Define multiple versions only for the definition in libc. */
#ifndef NOT_IN_libc
@@ -62,1830 +64,16 @@ ENTRY(STRCPY)
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
-1: leaq STRCPY_SSE2(%rip), %rax
+1: leaq STRCPY_SSE2_UNALIGNED(%rip), %rax
+ testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+ jnz 2f
+ leaq STRCPY_SSE2(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jz 2f
leaq STRCPY_SSSE3(%rip), %rax
2: ret
END(STRCPY)
- .section .text.ssse3,"ax",@progbits
-STRCPY_SSSE3:
- cfi_startproc
- CALL_MCOUNT
-
-/*
- * This implementation uses SSE to copy up to 16 bytes at a time.
- */
-#ifdef USE_AS_STRNCPY
- test %rdx, %rdx
- jz LABEL(strncpy_exitz)
- mov %rdx, %r8
-#else
- xor %edx, %edx
-#endif
- mov %esi, %ecx
- and $0xfffffffffffffff0, %rsi /*force rsi 16 byte align*/
- and $15, %ecx
- mov %rdi, %rax /*store return parameter*/
-
-
- pxor %xmm0, %xmm0 /* clear %xmm0 */
- pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
- pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
- shr %cl, %edx /* get real bits left in edx*/
- test %edx, %edx /* edx must be 0 if there is no null char from rsi+%rcx */
- jnz LABEL(less16bytes)
-
-#ifdef USE_AS_STRNCPY
- lea -16(%r8,%rcx), %r11
- cmp $0, %r11
- jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes. */
-#endif
-
- mov %rcx, %r9
- or %edi, %ecx
- and $15, %ecx
- lea -16(%r9), %r10
- jz LABEL(ashr_0) /* ecx must be 0 if offset of rsi and rdi is 16 byte align*/
-
- neg %r10 /* store the rest in rsi aligned 16 bytes for unaligned_exit*/
-
- pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation*/
- pcmpeqb 16(%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(less32bytes)
- /*
- * at least 16 byte available to fill destination rdi
- */
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(less32bytes_strncpy_truncation)
-#endif
- mov (%rsi, %r9), %rdx
- mov %rdx, (%rdi)
- mov 8(%rsi, %r9), %rdx
- mov %rdx, 8(%rdi)
-
- /*
- * so far destatination rdi may be aligned by 16, re-calculate rsi to jump
- * crossponding case
- * rcx is offset of rsi
- * rax is offset of rdi
- */
-
- and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */
- mov %rax, %rdx /* rax store orignal rdi */
- xor %rdi, %rdx /* equal to and $15, %rdx */
-#ifdef USE_AS_STRNCPY
- add %rdx, %r8
-#endif
-
- add $16, %rdi /* next 16 bytes for rdi */
- sub %rdx, %r9
-
- lea 16(%r9, %rsi), %rsi /*re-calculate rsi by (16 - rdx)+ rcx */
- mov %esi, %ecx /*store offset of rsi */
- and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */
-
- and $15, %ecx /* ecx must be 0 if rdx is equal to rcx*/
- jz LABEL(ashr_0)
-
- lea -16(%rcx), %r10
- mov %rcx, %r9
- neg %r10
- lea LABEL(unaligned_table)(%rip), %r11
- movslq (%r11, %rcx,4), %rcx
- lea (%r11, %rcx), %rcx
- jmp *%rcx
-
- /*
- * The following cases will be handled by ashr_0 & ashr_0_start
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * 0 0 0 ashr_0
- * n(1~15) n(1~15) 0 ashr_0_start
- *
- */
- .p2align 5
-LABEL(ashr_0):
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */
- movdqa %xmm1, (%rdi) /* store first 16 bytes into rdi */
- add $16, %rsi
- add $16, %rdi
- pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
- pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
-
- test %edx, %edx /* edx must be 0 if there is no null char in rsi*/
- jnz LABEL(aligned_16bytes)
-
-LABEL(ashr_0_loop):
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi, %rcx), %xmm1
- movdqa %xmm1, (%rdi, %rcx)
- add $16, %rcx
- pcmpeqb (%rsi, %rcx), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi, %rcx), %xmm1
- movdqa %xmm1, (%rdi, %rcx)
- add $16, %rcx
- pcmpeqb (%rsi, %rcx), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi, %rcx), %xmm1
- movdqa %xmm1, (%rdi, %rcx)
- add $16, %rcx
- pcmpeqb (%rsi, %rcx), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi, %rcx), %xmm1
- movdqa %xmm1, (%rdi, %rcx)
- add $16, %rcx
- pcmpeqb (%rsi, %rcx), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jz LABEL(ashr_0_loop)
-
- jmp LABEL(aligned_exit)
- .p2align 4
-
-/*
- * The following cases will be handled by ashr_15
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(15) n - 15 15((16 - (n -15) + n)%16 ashr_15
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_15):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_15_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $15, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $15, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_15_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_14
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_14):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_14_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $14, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $14, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_14_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_13
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_13):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_13_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $13, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $13, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_13_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_12
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_12):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_12_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $12, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $12, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_12_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_11
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_11):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_11_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $11, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $11, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_11_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_10
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_10):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_10_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $10, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $10, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_10_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_9
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(9~15) n - 9 9((16 - (n -9) + n)%16 ashr_9
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_9):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_9_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $9, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $9, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_9_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_8
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(8~15) n - 8 8((16 - (n -8) + n)%16 ashr_8
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_8):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_8_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $8, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $8, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_8_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_7:
- * rcx (offset of rsi)   rax (offset of rdi)   relative offset               corresponding case
- * n (7~15)              n - 7                 7 = (16 - (n - 7) + n) % 16   ashr_7
- *
- * Based on the above operation, from (%r9 + rsi) to the left of this cache bank there is no null byte.
- */
- .p2align 4
-LABEL(ashr_7): /* entry for relative offset 7 (see the table above) */
- xor %ecx, %ecx /* rcx = 0: running byte offset applied to both rsi and rdi */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8 /* unsigned r8 <= r10: finish through the common exit path */
- jbe LABEL(unaligned_exit)
-#endif
- .p2align 4
-
-LABEL(ashr_7_use_ssse3): /* main loop, unrolled twice; each step stores 16 bytes */
- movdqa 16(%rsi, %rcx), %xmm3 /* aligned load of the next 16 source bytes */
- pcmpeqb %xmm3, %xmm0 /* bytewise compare with xmm0 (presumably all zero on entry -- set outside this view) */
- pmovmskb %xmm0, %edx /* edx = one mask bit per matching byte */
- test %edx, %edx
- jnz LABEL(unaligned_exit) /* a matching byte was found: finish via the tail code */
-#ifdef USE_AS_STRNCPY
- sub $16, %r8 /* consume 16 from r8 (presumably the remaining length limit) */
- jbe LABEL(strncpy_truncation_unaligned) /* r8 ran out before a match was seen */
-#endif
-
- palignr $7, (%rsi, %rcx), %xmm3 /* xmm3 = the 16 bytes starting at 7(%rsi, %rcx) */
- movdqa %xmm3, (%rdi, %rcx) /* aligned 16-byte store to the destination */
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3 /* second unrolled step: same sequence as above */
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $7, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_7_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_6:
- * rcx (offset of rsi)   rax (offset of rdi)   relative offset               corresponding case
- * n (6~15)              n - 6                 6 = (16 - (n - 6) + n) % 16   ashr_6
- *
- * Based on the above operation, from (%r9 + rsi) to the left of this cache bank there is no null byte.
- */
- .p2align 4
-LABEL(ashr_6): /* entry for relative offset 6 (see the table above) */
- xor %ecx, %ecx /* rcx = 0: running byte offset applied to both rsi and rdi */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8 /* unsigned r8 <= r10: finish through the common exit path */
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_6_use_ssse3): /* main loop, unrolled twice; each step stores 16 bytes */
- movdqa 16(%rsi, %rcx), %xmm3 /* aligned load of the next 16 source bytes */
- pcmpeqb %xmm3, %xmm0 /* bytewise compare with xmm0 (presumably all zero on entry -- set outside this view) */
- pmovmskb %xmm0, %edx /* edx = one mask bit per matching byte */
- test %edx, %edx
- jnz LABEL(unaligned_exit) /* a matching byte was found: finish via the tail code */
-#ifdef USE_AS_STRNCPY
- sub $16, %r8 /* consume 16 from r8 (presumably the remaining length limit) */
- jbe LABEL(strncpy_truncation_unaligned) /* r8 ran out before a match was seen */
-#endif
-
- palignr $6, (%rsi, %rcx), %xmm3 /* xmm3 = the 16 bytes starting at 6(%rsi, %rcx) */
- movdqa %xmm3, (%rdi, %rcx) /* aligned 16-byte store to the destination */
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3 /* second unrolled step: same sequence as above */
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $6, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_6_use_ssse3)
-
- /*
- * The following cases will be handled by ashr_5:
- * rcx (offset of rsi)   rax (offset of rdi)   relative offset               corresponding case
- * n (5~15)              n - 5                 5 = (16 - (n - 5) + n) % 16   ashr_5
- *
- * Based on the above operation, from (%r9 + rsi) to the left of this cache bank there is no null byte.
- */
- .p2align 4
-LABEL(ashr_5): /* entry for relative offset 5 (see the table above) */
- xor %ecx, %ecx /* rcx = 0: running byte offset applied to both rsi and rdi */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8 /* unsigned r8 <= r10: finish through the common exit path */
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_5_use_ssse3): /* main loop, unrolled twice; each step stores 16 bytes */
- movdqa 16(%rsi, %rcx), %xmm3 /* aligned load of the next 16 source bytes */
- pcmpeqb %xmm3, %xmm0 /* bytewise compare with xmm0 (presumably all zero on entry -- set outside this view) */
- pmovmskb %xmm0, %edx /* edx = one mask bit per matching byte */
- test %edx, %edx
- jnz LABEL(unaligned_exit) /* a matching byte was found: finish via the tail code */
-#ifdef USE_AS_STRNCPY
- sub $16, %r8 /* consume 16 from r8 (presumably the remaining length limit) */
- jbe LABEL(strncpy_truncation_unaligned) /* r8 ran out before a match was seen */
-#endif
-
- palignr $5, (%rsi, %rcx), %xmm3 /* xmm3 = the 16 bytes starting at 5(%rsi, %rcx) */
- movdqa %xmm3, (%rdi, %rcx) /* aligned 16-byte store to the destination */
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3 /* second unrolled step: same sequence as above */
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $5, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_5_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_4:
- * rcx (offset of rsi)   rax (offset of rdi)   relative offset               corresponding case
- * n (4~15)              n - 4                 4 = (16 - (n - 4) + n) % 16   ashr_4
- *
- * Based on the above operation, from (%r9 + rsi) to the left of this cache bank there is no null byte.
- */
- .p2align 4
-LABEL(ashr_4): /* entry for relative offset 4 (see the table above) */
- xor %ecx, %ecx /* rcx = 0: running byte offset applied to both rsi and rdi */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8 /* unsigned r8 <= r10: finish through the common exit path */
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_4_use_ssse3): /* main loop, unrolled twice; each step stores 16 bytes */
- movdqa 16(%rsi, %rcx), %xmm3 /* aligned load of the next 16 source bytes */
- pcmpeqb %xmm3, %xmm0 /* bytewise compare with xmm0 (presumably all zero on entry -- set outside this view) */
- pmovmskb %xmm0, %edx /* edx = one mask bit per matching byte */
- test %edx, %edx
- jnz LABEL(unaligned_exit) /* a matching byte was found: finish via the tail code */
-#ifdef USE_AS_STRNCPY
- sub $16, %r8 /* consume 16 from r8 (presumably the remaining length limit) */
- jbe LABEL(strncpy_truncation_unaligned) /* r8 ran out before a match was seen */
-#endif
-
- palignr $4, (%rsi, %rcx), %xmm3 /* xmm3 = the 16 bytes starting at 4(%rsi, %rcx) */
- movdqa %xmm3, (%rdi, %rcx) /* aligned 16-byte store to the destination */
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3 /* second unrolled step: same sequence as above */
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $4, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_4_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_3:
- * rcx (offset of rsi)   rax (offset of rdi)   relative offset               corresponding case
- * n (3~15)              n - 3                 3 = (16 - (n - 3) + n) % 16   ashr_3
- *
- * Based on the above operation, from (%r9 + rsi) to the left of this cache bank there is no null byte.
- */
- .p2align 4
-LABEL(ashr_3): /* entry for relative offset 3 (see the table above) */
- xor %ecx, %ecx /* rcx = 0: running byte offset applied to both rsi and rdi */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8 /* unsigned r8 <= r10: finish through the common exit path */
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_3_use_ssse3): /* main loop, unrolled twice; each step stores 16 bytes */
- movdqa 16(%rsi, %rcx), %xmm3 /* aligned load of the next 16 source bytes */
- pcmpeqb %xmm3, %xmm0 /* bytewise compare with xmm0 (presumably all zero on entry -- set outside this view) */
- pmovmskb %xmm0, %edx /* edx = one mask bit per matching byte */
- test %edx, %edx
- jnz LABEL(unaligned_exit) /* a matching byte was found: finish via the tail code */
-#ifdef USE_AS_STRNCPY
- sub $16, %r8 /* consume 16 from r8 (presumably the remaining length limit) */
- jbe LABEL(strncpy_truncation_unaligned) /* r8 ran out before a match was seen */
-#endif
-
- palignr $3, (%rsi, %rcx), %xmm3 /* xmm3 = the 16 bytes starting at 3(%rsi, %rcx) */
- movdqa %xmm3, (%rdi, %rcx) /* aligned 16-byte store to the destination */
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3 /* second unrolled step: same sequence as above */
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $3, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_3_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_2:
- * rcx (offset of rsi)   rax (offset of rdi)   relative offset               corresponding case
- * n (2~15)              n - 2                 2 = (16 - (n - 2) + n) % 16   ashr_2
- *
- * Based on the above operation, from (%r9 + rsi) to the left of this cache bank there is no null byte.
- */
- .p2align 4
-LABEL(ashr_2): /* entry for relative offset 2 (see the table above) */
- xor %ecx, %ecx /* rcx = 0: running byte offset applied to both rsi and rdi */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8 /* unsigned r8 <= r10: finish through the common exit path */
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_2_use_ssse3): /* main loop, unrolled twice; each step stores 16 bytes */
- movdqa 16(%rsi, %rcx), %xmm3 /* aligned load of the next 16 source bytes */
- pcmpeqb %xmm3, %xmm0 /* bytewise compare with xmm0 (presumably all zero on entry -- set outside this view) */
- pmovmskb %xmm0, %edx /* edx = one mask bit per matching byte */
- test %edx, %edx
- jnz LABEL(unaligned_exit) /* a matching byte was found: finish via the tail code */
-#ifdef USE_AS_STRNCPY
- sub $16, %r8 /* consume 16 from r8 (presumably the remaining length limit) */
- jbe LABEL(strncpy_truncation_unaligned) /* r8 ran out before a match was seen */
-#endif
-
- palignr $2, (%rsi, %rcx), %xmm3 /* xmm3 = the 16 bytes starting at 2(%rsi, %rcx) */
- movdqa %xmm3, (%rdi, %rcx) /* aligned 16-byte store to the destination */
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3 /* second unrolled step: same sequence as above */
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $2, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_2_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_1:
- * rcx (offset of rsi)   rax (offset of rdi)   relative offset               corresponding case
- * n (1~15)              n - 1                 1 = (16 - (n - 1) + n) % 16   ashr_1
- *
- * Based on the above operation, from (%r9 + rsi) to the left of this cache bank there is no null byte.
- */
- .p2align 4
-LABEL(ashr_1): /* entry for relative offset 1 (see the table above) */
- xor %ecx, %ecx /* rcx = 0: running byte offset applied to both rsi and rdi */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8 /* unsigned r8 <= r10: finish through the common exit path */
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_1_use_ssse3): /* main loop, unrolled twice; each step stores 16 bytes */
- movdqa 16(%rsi, %rcx), %xmm3 /* aligned load of the next 16 source bytes */
- pcmpeqb %xmm3, %xmm0 /* bytewise compare with xmm0 (presumably all zero on entry -- set outside this view) */
- pmovmskb %xmm0, %edx /* edx = one mask bit per matching byte */
- test %edx, %edx
- jnz LABEL(unaligned_exit) /* a matching byte was found: finish via the tail code */
-#ifdef USE_AS_STRNCPY
- sub $16, %r8 /* consume 16 from r8 (presumably the remaining length limit) */
- jbe LABEL(strncpy_truncation_unaligned) /* r8 ran out before a match was seen */
-#endif
-
- palignr $1, (%rsi, %rcx), %xmm3 /* xmm3 = the 16 bytes starting at 1(%rsi, %rcx) */
- movdqa %xmm3, (%rdi, %rcx) /* aligned 16-byte store to the destination */
- add $16, %rcx
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3 /* second unrolled step: same sequence as above */
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
- palignr $1, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_1_use_ssse3)
-
- .p2align 4
-LABEL(less32bytes): /* zero the running offset, then fall through into the common exit path */
- xor %ecx, %ecx
-LABEL(unaligned_exit): /* exit used by the ashr_N loops: edx = match mask, rcx = offset reached */
- add %r9, %rsi /* r9 stores the original offset of rsi */
- mov %rcx, %r9 /* park rcx in r9: cl is needed as the shift count below */
- mov %r10, %rcx
- shl %cl, %edx /* after shl, calculate the exact number to be filled (mask shifted left by r10 bits) */
- mov %r9, %rcx /* restore the running offset */
- .p2align 4
-LABEL(aligned_exit):
- add %rcx, %rdi /* locate exact address for rdi */
-LABEL(less16bytes):
- add %rcx, %rsi /* locate exact address for rsi */
-LABEL(aligned_16bytes):
-#ifdef USE_AS_STRNCPY
- mov $1, %r9d
- lea -1(%r8), %rcx
- shl %cl, %r9d /* r9d = 1 << (r8 - 1); the CPU masks a 32-bit shift count to 5 bits */
- cmp $32, %r8
- ja LABEL(strncpy_tail) /* r8 > 32 (unsigned): leave the mask untouched */
- or %r9d, %edx /* r8 <= 32: also set bit r8 - 1 so bsf stops no later than that index */
-LABEL(strncpy_tail):
-#endif
- bsf %rdx, %rcx /* rcx = index of the least significant set bit in rdx (undefined if rdx is zero) */
- lea LABEL(tail_table)(%rip), %r11
- movslq (%r11, %rcx,4), %rcx /* sign-extended 32-bit table entry: handler offset relative to tail_table */
- lea (%r11, %rcx), %rcx /* absolute address of tail_<index> */
- jmp *%rcx
-
-#ifdef USE_AS_STRNCPY
- .p2align 4
-LABEL(less32bytes_strncpy_truncation): /* zero the running offset, then fall through */
- xor %ecx, %ecx
-LABEL(strncpy_truncation_unaligned): /* reached from the ashr_N loops when 'sub $16, %r8' left r8 <= 0 */
- add %r9, %rsi /* add r9 to rsi, as unaligned_exit does (there r9 is described as the original offset of rsi) */
-LABEL(strncpy_truncation_aligned):
- add %rcx, %rdi /* advance both pointers by the running offset */
- add %rcx, %rsi
- add $16, %r8 /* add back the 16 subtracted in the ashr_N loops right before the jump here */
- lea -1(%r8), %rcx /* tail index r8 - 1, i.e. tail_<r8-1> copies exactly r8 bytes */
- lea LABEL(tail_table)(%rip), %r11
- movslq (%r11, %rcx,4), %rcx /* sign-extended 32-bit table entry: handler offset relative to tail_table */
- lea (%r11, %rcx), %rcx
- jmp *%rcx
- .p2align 4
-LABEL(strncpy_exitz): /* return rdi as the result without copying (callers are outside this view) */
- mov %rdi, %rax
- ret
-#endif
-
-#ifdef USE_AS_STRNCPY
- .p2align 4
-LABEL(strncpy_fill_tail): /* zero-fill r8 bytes after the cl bytes a tail_N handler just copied at (%rdi) */
- mov %rax, %rdx /* keep the pending return value in rdx while rax is used as scratch */
- movzx %cl, %rax /* rax = number of bytes the tail handler copied */
- mov %r8, %rcx
- add %rax, %rdi /* rdi = first byte to be zeroed */
- xor %eax, %eax /* rax = 0: the fill value */
- shr $3, %ecx /* number of whole qwords; NOTE(review): 32-bit op, only the low 32 bits of r8 are used */
- jz LABEL(strncpy_fill_less_8)
-
- rep stosq /* store rcx zero qwords; advances rdi (assumes the direction flag is clear) */
-LABEL(strncpy_fill_less_8):
- mov %r8, %rcx
- and $7, %ecx /* leftover byte count, 0..7 */
- jz LABEL(strncpy_fill_return)
-LABEL(strncpy_fill_less_7): /* zero the leftover bytes from the highest index down to 0 */
- sub $1, %ecx
- mov %al, (%rdi, %rcx) /* mov leaves flags alone, so jnz below still tests the sub result */
- jnz LABEL(strncpy_fill_less_7)
-LABEL(strncpy_fill_return):
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rdx) /* carry is set only when the byte at (%rdx) is 0 */
- sbb $-1, %rdx /* rdx += 1 unless that byte is NUL */
-#endif
- mov %rdx, %rax /* restore the return value */
- ret
-#endif
- .p2align 4
-LABEL(tail_0): /* tail_table[0]: copy the final 1 byte (offset 0) */
- mov (%rsi), %cl
- mov %cl, (%rdi)
-#ifdef USE_AS_STPCPY
- mov %rdi, %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $1, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $1, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_1): /* tail_table[1]: copy the final 2 bytes (offsets 0..1) */
- mov (%rsi), %cx
- mov %cx, (%rdi)
-#ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $2, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $2, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_2): /* tail_table[2]: copy the final 3 bytes (offsets 0..2) */
- mov (%rsi), %cx
- mov %cx, (%rdi)
- mov 1(%rsi), %cx /* second 2-byte move overlaps the first by one byte */
- mov %cx, 1(%rdi)
-#ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $3, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $3, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_3): /* tail_table[3]: copy the final 4 bytes (offsets 0..3) */
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
-#ifdef USE_AS_STPCPY
- lea 3(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $4, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $4, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_4): /* tail_table[4]: copy the final 5 bytes (offsets 0..4) */
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
- mov 1(%rsi), %edx /* second 4-byte move at offset 1 overlaps the first */
- mov %edx, 1(%rdi)
-#ifdef USE_AS_STPCPY
- lea 4(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $5, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $5, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_5): /* tail_table[5]: copy the final 6 bytes (offsets 0..5) */
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
- mov 2(%rsi), %edx /* second 4-byte move at offset 2 overlaps the first */
- mov %edx, 2(%rdi)
-#ifdef USE_AS_STPCPY
- lea 5(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $6, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $6, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_6): /* tail_table[6]: copy the final 7 bytes (offsets 0..6) */
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
- mov 3(%rsi), %edx /* second 4-byte move at offset 3 overlaps the first by one byte */
- mov %edx,3(%rdi)
-#ifdef USE_AS_STPCPY
- lea 6(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $7, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $7, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_7): /* tail_table[7]: copy the final 8 bytes (offsets 0..7) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
-#ifdef USE_AS_STPCPY
- lea 7(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $8, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $8, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_8): /* tail_table[8]: copy the final 9 bytes (offsets 0..8) */
-
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 5(%rsi), %edx /* 4-byte move at offset 5 overlaps the first 8 bytes and covers offset 8 */
- mov %edx, 5(%rdi)
-#ifdef USE_AS_STPCPY
- lea 8(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $9, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $9, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_9): /* tail_table[9]: copy the final 10 bytes (offsets 0..9) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 6(%rsi), %edx /* 4-byte move at offset 6 overlaps the first 8 bytes and covers offsets 8..9 */
- mov %edx, 6(%rdi)
-#ifdef USE_AS_STPCPY
- lea 9(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $10, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $10, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_10): /* tail_table[10]: copy the final 11 bytes (offsets 0..10) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 7(%rsi), %edx /* 4-byte move at offset 7 overlaps the first 8 bytes by one byte */
- mov %edx, 7(%rdi)
-#ifdef USE_AS_STPCPY
- lea 10(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $11, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $11, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_11): /* tail_table[11]: copy the final 12 bytes (offsets 0..11) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %edx /* 4 more bytes at offsets 8..11 */
- mov %edx, 8(%rdi)
-#ifdef USE_AS_STPCPY
- lea 11(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $12, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $12, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_12): /* tail_table[12]: copy the final 13 bytes (offsets 0..12) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 5(%rsi), %rcx /* second 8-byte move at offset 5 overlaps the first and ends at offset 12 */
- mov %rcx, 5(%rdi)
-#ifdef USE_AS_STPCPY
- lea 12(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $13, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $13, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_13): /* tail_table[13]: copy the final 14 bytes (offsets 0..13) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 6(%rsi), %rcx /* second 8-byte move at offset 6 overlaps the first and ends at offset 13 */
- mov %rcx, 6(%rdi)
-#ifdef USE_AS_STPCPY
- lea 13(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $14, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $14, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_14): /* tail_table[14]: copy the final 15 bytes (offsets 0..14) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 7(%rsi), %rcx /* second 8-byte move at offset 7 overlaps the first by one byte */
- mov %rcx, 7(%rdi)
-#ifdef USE_AS_STPCPY
- lea 14(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $15, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $15, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
-LABEL(tail_15): /* tail_table[15]: copy the final 16 bytes (offsets 0..15); unlike its siblings, no .p2align precedes this label */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx /* second 8-byte move covers offsets 8..15 */
- mov %rdx, 8(%rdi)
-#ifdef USE_AS_STPCPY
- lea 15(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $16, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $16, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
-
- ret
-
- .p2align 4
-LABEL(tail_16): /* tail_table[16]: copy the final 17 bytes (offsets 0..16) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %cl /* one more byte at offset 16 */
- mov %cl, 16(%rdi)
-#ifdef USE_AS_STPCPY
- lea 16(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $17, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $17, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_17): /* tail_table[17]: copy the final 18 bytes (offsets 0..17) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %cx /* two more bytes at offsets 16..17 */
- mov %cx, 16(%rdi)
-#ifdef USE_AS_STPCPY
- lea 17(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $18, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $18, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_18): /* tail_table[18]: copy the final 19 bytes (offsets 0..18) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 15(%rsi), %ecx /* 4-byte move at offset 15 overlaps by one byte and ends at offset 18 */
- mov %ecx,15(%rdi)
-#ifdef USE_AS_STPCPY
- lea 18(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $19, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $19, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_19): /* tail_table[19]: copy the final 20 bytes (offsets 0..19) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %ecx /* four more bytes at offsets 16..19 */
- mov %ecx, 16(%rdi)
-#ifdef USE_AS_STPCPY
- lea 19(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $20, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $20, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_20): /* tail_table[20]: copy the final 21 bytes (offsets 0..20) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 13(%rsi), %rcx /* 8-byte move at offset 13 overlaps the first 16 bytes and ends at offset 20 */
- mov %rcx, 13(%rdi)
-#ifdef USE_AS_STPCPY
- lea 20(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $21, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $21, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_21): /* tail_table[21]: copy the final 22 bytes (offsets 0..21) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 14(%rsi), %rcx /* 8-byte move at offset 14 overlaps the first 16 bytes and ends at offset 21 */
- mov %rcx, 14(%rdi)
-#ifdef USE_AS_STPCPY
- lea 21(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $22, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $22, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_22): /* tail_table[22]: copy the final 23 bytes (offsets 0..22) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 15(%rsi), %rcx /* 8-byte move at offset 15 overlaps by one byte and ends at offset 22 */
- mov %rcx, 15(%rdi)
-#ifdef USE_AS_STPCPY
- lea 22(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $23, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $23, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_23): /* tail_table[23]: copy the final 24 bytes (offsets 0..23) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx /* third 8-byte move covers offsets 16..23 */
- mov %rcx, 16(%rdi)
-#ifdef USE_AS_STPCPY
- lea 23(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $24, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $24, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
-
- ret
-
- .p2align 4
-LABEL(tail_24): /* tail_table[24]: copy the final 25 bytes (offsets 0..24) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 21(%rsi), %edx /* 4-byte move at offset 21 overlaps the first 24 bytes and covers offset 24 */
- mov %edx, 21(%rdi)
-#ifdef USE_AS_STPCPY
- lea 24(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $25, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $25, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_25): /* tail_table[25]: copy the final 26 bytes (offsets 0..25) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 22(%rsi), %edx /* 4-byte move at offset 22 overlaps the first 24 bytes and covers offsets 24..25 */
- mov %edx, 22(%rdi)
-#ifdef USE_AS_STPCPY
- lea 25(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $26, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $26, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_26): /* tail_table[26]: copy the final 27 bytes (offsets 0..26) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 23(%rsi), %edx /* 4-byte move at offset 23 overlaps by one byte and ends at offset 26 */
- mov %edx, 23(%rdi)
-#ifdef USE_AS_STPCPY
- lea 26(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $27, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $27, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_27): /* tail_table[27]: copy the final 28 bytes (offsets 0..27) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 24(%rsi), %edx /* four more bytes at offsets 24..27 */
- mov %edx, 24(%rdi)
-#ifdef USE_AS_STPCPY
- lea 27(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $28, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $28, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_28): /* tail_table[28]: copy the final 29 bytes (offsets 0..28) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 21(%rsi), %rdx /* 8-byte move at offset 21 overlaps the first 24 bytes and ends at offset 28 */
- mov %rdx, 21(%rdi)
-#ifdef USE_AS_STPCPY
- lea 28(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $29, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $29, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
-
- ret
-
- .p2align 4
-LABEL(tail_29): /* tail_table[29]: copy the final 30 bytes (offsets 0..29) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 22(%rsi), %rdx /* 8-byte move at offset 22 overlaps the first 24 bytes and ends at offset 29 */
- mov %rdx, 22(%rdi)
-#ifdef USE_AS_STPCPY
- lea 29(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $30, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $30, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
-
- ret
-
-
- .p2align 4
-LABEL(tail_30): /* tail_table[30]: copy the final 31 bytes (offsets 0..30) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 23(%rsi), %rdx /* 8-byte move at offset 23 overlaps by one byte and ends at offset 30 */
- mov %rdx, 23(%rdi)
-#ifdef USE_AS_STPCPY
- lea 30(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $31, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $31, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_31): /* tail_table[31]: copy the final 32 bytes (offsets 0..31) */
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 24(%rsi), %rdx /* fourth 8-byte move covers offsets 24..31 */
- mov %rdx, 24(%rdi)
-#ifdef USE_AS_STPCPY
- lea 31(%rdi), %rax /* rax = address of the last byte copied */
-#endif
-#ifdef USE_AS_STRNCPY
- mov $32, %cl /* cl = bytes copied, read by strncpy_fill_tail */
- sub $32, %r8 /* bytes still to be written; nonzero means zero padding is needed */
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax) /* carry is set only when the last copied byte is 0 */
- sbb $-1, %rax /* rax += 1 unless that byte is NUL */
-#endif
-#endif
- ret
- cfi_endproc
- .size STRCPY_SSSE3, .-STRCPY_SSSE3
-
- .p2align 4
- .section .rodata.ssse3,"a",@progbits
-LABEL(tail_table): /* jump table: entry k is the 32-bit offset of tail_k from this label; the dispatcher adds the table address back */
- .int LABEL(tail_0) - LABEL(tail_table) /* tail_k copies k + 1 bytes */
- .int LABEL(tail_1) - LABEL(tail_table)
- .int LABEL(tail_2) - LABEL(tail_table)
- .int LABEL(tail_3) - LABEL(tail_table)
- .int LABEL(tail_4) - LABEL(tail_table)
- .int LABEL(tail_5) - LABEL(tail_table)
- .int LABEL(tail_6) - LABEL(tail_table)
- .int LABEL(tail_7) - LABEL(tail_table)
- .int LABEL(tail_8) - LABEL(tail_table)
- .int LABEL(tail_9) - LABEL(tail_table)
- .int LABEL(tail_10) - LABEL(tail_table)
- .int LABEL(tail_11) - LABEL(tail_table)
- .int LABEL(tail_12) - LABEL(tail_table)
- .int LABEL(tail_13) - LABEL(tail_table)
- .int LABEL(tail_14) - LABEL(tail_table)
- .int LABEL(tail_15) - LABEL(tail_table)
- .int LABEL(tail_16) - LABEL(tail_table)
- .int LABEL(tail_17) - LABEL(tail_table)
- .int LABEL(tail_18) - LABEL(tail_table)
- .int LABEL(tail_19) - LABEL(tail_table)
- .int LABEL(tail_20) - LABEL(tail_table)
- .int LABEL(tail_21) - LABEL(tail_table)
- .int LABEL(tail_22) - LABEL(tail_table)
- .int LABEL(tail_23) - LABEL(tail_table)
- .int LABEL(tail_24) - LABEL(tail_table)
- .int LABEL(tail_25) - LABEL(tail_table)
- .int LABEL(tail_26) - LABEL(tail_table)
- .int LABEL(tail_27) - LABEL(tail_table)
- .int LABEL(tail_28) - LABEL(tail_table)
- .int LABEL(tail_29) - LABEL(tail_table)
- .int LABEL(tail_30) - LABEL(tail_table)
- .int LABEL(tail_31) - LABEL(tail_table)
-
- .p2align 4
-LABEL(unaligned_table): /* jump table: entry N is the 32-bit offset of ashr_N from this label */
- .int LABEL(ashr_0) - LABEL(unaligned_table) /* presumably indexed by the relative offset (0..15); the dispatch code is outside this view */
- .int LABEL(ashr_1) - LABEL(unaligned_table)
- .int LABEL(ashr_2) - LABEL(unaligned_table)
- .int LABEL(ashr_3) - LABEL(unaligned_table)
- .int LABEL(ashr_4) - LABEL(unaligned_table)
- .int LABEL(ashr_5) - LABEL(unaligned_table)
- .int LABEL(ashr_6) - LABEL(unaligned_table)
- .int LABEL(ashr_7) - LABEL(unaligned_table)
- .int LABEL(ashr_8) - LABEL(unaligned_table)
- .int LABEL(ashr_9) - LABEL(unaligned_table)
- .int LABEL(ashr_10) - LABEL(unaligned_table)
- .int LABEL(ashr_11) - LABEL(unaligned_table)
- .int LABEL(ashr_12) - LABEL(unaligned_table)
- .int LABEL(ashr_13) - LABEL(unaligned_table)
- .int LABEL(ashr_14) - LABEL(unaligned_table)
- .int LABEL(ashr_15) - LABEL(unaligned_table)
-
# undef ENTRY
# define ENTRY(name) \
.type STRCPY_SSE2, @function; \
diff --git a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
new file mode 100644
index 0000000..fcc23a7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_sse2_unaligned
+#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
new file mode 100644
index 0000000..bf82ee4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_ssse3
+#include "strcpy-ssse3.S"