diff options
author | H.J. Lu <hongjiu.lu@intel.com> | 2011-06-24 15:14:22 -0400 |
---|---|---|
committer | Ulrich Drepper <drepper@gmail.com> | 2011-06-24 15:14:22 -0400 |
commit | 8912479f9ea9f56dc188d3d00c4ba4259f600661 (patch) | |
tree | fc91331de86b054859ce0dfe3fdec2a06812aa4c | |
parent | d5495a116c6271c0ae8f6955b64b7b010b1b341a (diff) | |
download | glibc-8912479f9ea9f56dc188d3d00c4ba4259f600661.zip glibc-8912479f9ea9f56dc188d3d00c4ba4259f600661.tar.gz glibc-8912479f9ea9f56dc188d3d00c4ba4259f600661.tar.bz2 |
Improved st{r,p}{,n}cpy for SSE2 and SSSE3 on x86-64
-rw-r--r-- | ChangeLog | 17 | ||||
-rw-r--r-- | NEWS | 3 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 7 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S | 3 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S | 4 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S | 1718 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3721 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcpy.S | 1860 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S | 3 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 |
12 files changed, 5508 insertions, 1838 deletions
@@ -1,3 +1,20 @@ +2011-06-22 H.J. Lu <hongjiu.lu@intel.com> + + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 + strcpy-sse2-unaligned strncpy-sse2-unaligned + stpcpy-sse2-unaligned stpncpy-sse2-unaligned. + * sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S: New file. + * sysdeps/x86_64/multiarch/stpcpy-ssse3.S: New file. + * sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S: New file. + * sysdeps/x86_64/multiarch/stpncpy-ssse3.S: New file. + * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: New file. + * sysdeps/x86_64/multiarch/strcpy-ssse3.S: New file. + * sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S: New file. + * sysdeps/x86_64/multiarch/strncpy-ssse3.S: New file. + * sysdeps/x86_64/multiarch/strcpy.S: Remove strcpy with SSSE3. + (STRCPY): Support SSE2 and SSSE3 versions. + 2011-06-24 Ulrich Drepper <drepper@gmail.com> [BZ #12874] @@ -20,6 +20,9 @@ Version 2.15 * Optimized strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-32. Contributed by HJ Lu. + +* Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64. + Contributed by HJ Lu. 
Version 2.14 diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 19aa4be..88410b3 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,12 +4,15 @@ gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) + sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ - strncase_l-ssse3 strlen-sse4 strlen-no-bsf \ - memset-x86-64 + strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \ + strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ + strcpy-sse2-unaligned strncpy-sse2-unaligned \ + stpcpy-sse2-unaligned stpncpy-sse2-unaligned ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S new file mode 100644 index 0000000..34231f8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_sse2_unaligned +#include "strcpy-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S new file mode 100644 index 0000000..d971c2d --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S new file mode 100644 index 0000000..658520f --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_sse2_unaligned +#include "strcpy-sse2-unaligned.S" diff --git 
a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S new file mode 100644 index 0000000..14ed16f --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S new file mode 100644 index 0000000..9a8d186 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S @@ -0,0 +1,1718 @@ +/* strcpy with SSE2 and unaligned load + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef NOT_IN_libc + +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_sse2_unaligned +# endif + +# define JMPTBL(I, B) I - B +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + lea (%r11, %rcx), %rcx; \ + jmp *%rcx + + .text +ENTRY (STRCPY) +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 + test %r8, %r8 + jz L(ExitZero) +# endif + mov %rsi, %rcx +# ifndef USE_AS_STPCPY + mov %rdi, %rax /* save result */ +# endif + + and $15, %rcx + jz L(SourceStringAlignmentZero) + + and $-16, %rsi + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%rsi), %xmm1 +# ifdef USE_AS_STRNCPY + add %rcx, %r8 +# endif + pmovmskb %xmm1, %rdx + shr %cl, %rdx +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY + cmp $16, %r8 +# else + cmp $17, %r8 +# endif + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%rsi), %xmm0 + pmovmskb %xmm0, %rdx +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY + cmp $32, %r8 +# else + cmp $33, %r8 +# endif + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes) + + movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%rdi) + + sub %rcx, %rdi + +/* If source adress alignment != destination adress alignment */ + .p2align 4 +L(Unalign16Both): + mov $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $48, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY + jnz L(CopyFrom1To16BytesUnalignedXmm2) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if 
defined USE_AS_STRNCPY + jnz L(CopyFrom1To16BytesUnalignedXmm3) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm4 + movdqu %xmm3, (%rdi, %rcx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY + jnz L(CopyFrom1To16BytesUnalignedXmm4) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm1 + movdqu %xmm4, (%rdi, %rcx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY + jnz L(CopyFrom1To16BytesUnalignedXmm1) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY + jnz L(CopyFrom1To16BytesUnalignedXmm2) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY + jnz L(CopyFrom1To16BytesUnalignedXmm3) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movdqu %xmm3, (%rdi, %rcx) + mov %rsi, %rdx + lea 16(%rsi, %rcx), %rsi + and $-0x40, %rsi + sub %rsi, %rdx + sub %rdx, %rdi +# ifdef USE_AS_STRNCPY + lea 128(%r8, %rdx), %r8 +# endif +L(Unaligned64Loop): + movaps (%rsi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rsi), %xmm5 + movaps 32(%rsi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rsi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe 
L(UnalignedLeaveCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(Unaligned64Leave) + +L(Unaligned64Loop_start): + add $64, %rdi + add $64, %rsi + movdqu %xmm4, -64(%rdi) + movaps (%rsi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%rdi) + movaps 16(%rsi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%rsi), %xmm3 + movdqu %xmm6, -32(%rdi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%rdi) + movaps 48(%rsi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %rdx, %rdx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %rcx, %rcx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) +# if defined USE_AS_STRNCPY +# ifdef USE_AS_STPCPY + lea 48(%rdi, %rdx), %rax +# endif + movdqu %xmm7, 48(%rdi) + add $15, %r8 + sub %rdx, %r8 + lea 49(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $48, %rsi + add $48, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + +/* If source adress alignment == destination adress alignment */ + +L(SourceStringAlignmentZero): + pxor %xmm0, %xmm0 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY + cmp $16, %r8 +# else + cmp $17, %r8 +# endif + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb 16(%rsi), %xmm0 + movdqu %xmm1, (%rdi) + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY + cmp $32, %r8 +# else + cmp $33, %r8 +# 
endif + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes1) + jmp L(Unalign16Both) + +/* ------End of main part with loops--------------------- */ + +/* Case1 */ + +# if (!defined USE_AS_STRNCPY) + .p2align 4 +L(CopyFrom1To16Bytes): + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + .p2align 4 +L(CopyFrom1To16BytesTail): +# if defined USE_AS_STRNCPY + sub %rcx, %r8 +# endif + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %rsi + add $16, %rdi +# if defined USE_AS_STRNCPY + sub $16, %r8 +# endif +L(CopyFrom1To16BytesTail1): + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): +# if defined USE_AS_STRNCPY + sub %rcx, %r8 +# endif + bsf %rdx, %rdx + add %rcx, %rsi + add $16, %rdx + sub %rcx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %rdx, %rdx +# if defined USE_AS_STRNCPY +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + movdqu %xmm4, (%rdi) + add $63, %r8 + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) +# if defined USE_AS_STRNCPY +# ifdef USE_AS_STPCPY + lea 16(%rdi, %rdx), %rax +# endif + movdqu %xmm5, 16(%rdi) + add $47, %r8 + sub %rdx, %r8 + lea 17(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $16, %rsi + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %rdx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) +# if defined USE_AS_STRNCPY +# ifdef USE_AS_STPCPY + lea 32(%rdi, %rdx), %rax +# endif + movdqu %xmm6, 32(%rdi) + add $31, %r8 + sub %rdx, %r8 + lea 33(%rdi, %rdx), 
%rdi + jmp L(StrncpyFillTailWithZero) +# else + add $32, %rsi + add $32, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + sub %rcx, %r8 + add %rcx, %rsi + bsf %rdx, %rdx + add $16, %rdx + sub %rcx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTailCase2): + sub %rcx, %r8 + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To32BytesCase2) + 
sub %rcx, %r8 + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTailCase2) + sub %rcx, %r8 + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %rdi + add $16, %rsi + sub $16, %r8 +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +# endif + +/* ----End labels regarding with copying 1-16 bytes--and 1-32 bytes---- */ + + .p2align 4 +L(Exit1): + mov %dh, (%rdi) +# ifdef USE_AS_STPCPY + lea (%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $1, %r8 + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit2): + mov (%rsi), %dx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $2, %r8 + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit3): + mov (%rsi), %cx + mov %cx, (%rdi) + mov %dh, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $3, %r8 + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit4): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $4, %r8 + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit5): + mov (%rsi), %ecx + mov %dh, 4(%rdi) + mov %ecx, (%rdi) +# ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $5, %r8 + lea 5(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $6, %r8 + lea 6(%rdi), %rdi + jnz 
L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +# ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $7, %r8 + lea 7(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +# ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $8, %r8 + lea 8(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit9): + mov (%rsi), %rcx + mov %dh, 8(%rdi) + mov %rcx, (%rdi) +# ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $9, %r8 + lea 9(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $10, %r8 + lea 10(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $11, %r8 + lea 11(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $12, %r8 + lea 12(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +# ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $13, %r8 + lea 13(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +# ifdef USE_AS_STPCPY + lea 
13(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $14, %r8 + lea 14(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $15, %r8 + lea 15(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +# ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $16, %r8 + lea 16(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit17): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) + mov %dh, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $17, %r8 + lea 17(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $18, %r8 + lea 18(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $19, %r8 + lea 19(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $20, %r8 + lea 20(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dh, 20(%rdi) +# ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $21, %r8 + lea 21(%rdi), %rdi 
+ jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $22, %r8 + lea 22(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $23, %r8 + lea 23(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $24, %r8 + lea 24(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) + mov %dh, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $25, %r8 + lea 25(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $26, %r8 + lea 26(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +# ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $27, %r8 + lea 27(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +# 
ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $28, %r8 + lea 28(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +# ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $29, %r8 + lea 29(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $30, %r8 + lea 30(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $31, %r8 + lea 31(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY + sub $32, %r8 + lea 32(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit0): +# ifdef USE_AS_STPCPY + mov %rdi, %rax +# endif + ret + + .p2align 4 +L(StrncpyExit1): + mov (%rsi), %dl + mov %dl, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit2): + mov (%rsi), %dx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit3): + mov (%rsi), %cx + mov 2(%rsi), %dl + mov %cx, (%rdi) + mov %dl, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit4): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +# endif + ret + + .p2align 4 
+L(StrncpyExit5): + mov (%rsi), %ecx + mov 4(%rsi), %dl + mov %ecx, (%rdi) + mov %dl, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +# ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +# ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit9): + mov (%rsi), %rcx + mov 8(%rsi), %dl + mov %rcx, (%rdi) + mov %dl, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +# ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +# ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +# ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit17): + movdqu (%rsi), 
%xmm0 + mov 16(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %cl, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + mov 20(%rsi), %dl + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dl, 20(%rdi) +# ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cl, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + 
mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +# ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +# ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 32(%rdi), %rax +# endif + ret + + .p2align 4 +L(StrncpyExit33): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + mov 32(%rsi), %cl + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) + mov %cl, 32(%rdi) + ret + + .p2align 4 +L(Fill0): + ret + + .p2align 4 +L(Fill1): + mov %dl, (%rdi) + ret + + .p2align 4 +L(Fill2): + mov %dx, (%rdi) + ret + + .p2align 4 +L(Fill3): + mov %edx, -1(%rdi) + ret + + .p2align 4 +L(Fill4): + mov %edx, (%rdi) + ret + + .p2align 4 +L(Fill5): + mov %edx, (%rdi) + mov %dl, 4(%rdi) + ret + + .p2align 4 +L(Fill6): + mov %edx, (%rdi) + mov %dx, 4(%rdi) + ret + + .p2align 4 +L(Fill7): + mov %rdx, -1(%rdi) + ret + + .p2align 4 +L(Fill8): + mov %rdx, (%rdi) + ret + + .p2align 4 +L(Fill9): + mov %rdx, (%rdi) + mov %dl, 8(%rdi) + ret + + .p2align 4 +L(Fill10): + mov %rdx, (%rdi) + mov %dx, 8(%rdi) + ret + + .p2align 4 +L(Fill11): + mov %rdx, (%rdi) + mov %edx, 7(%rdi) + ret + + .p2align 4 +L(Fill12): + mov %rdx, (%rdi) + mov 
%edx, 8(%rdi) + ret + + .p2align 4 +L(Fill13): + mov %rdx, (%rdi) + mov %rdx, 5(%rdi) + ret + + .p2align 4 +L(Fill14): + mov %rdx, (%rdi) + mov %rdx, 6(%rdi) + ret + + .p2align 4 +L(Fill15): + movdqu %xmm0, -1(%rdi) + ret + + .p2align 4 +L(Fill16): + movdqu %xmm0, (%rdi) + ret + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm2): + movdqu %xmm2, (%rdi, %rcx) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %rdx, %rdx + add $15, %r8 + add %rcx, %rdi +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + + .p2align 4 +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %rdx, %rdx + sub $16, %r8 + jbe L(StrncpyFillExit) + + movdqu %xmm0, (%rdi) + add $16, %rdi + + mov %rdi, %rsi + and $0xf, %rsi + sub %rsi, %rdi + add %rsi, %r8 + sub $64, %r8 + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + movdqa %xmm0, 32(%rdi) + movdqa %xmm0, 48(%rdi) + add $64, %rdi + sub $64, %r8 + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %r8 + jl L(StrncpyFillLess32) + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + add $32, %rdi + sub $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillLess32): + add $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillExit): + add $16, %r8 + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %rdx, %rdx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%r8), %rcx + and $-16, %rcx + add $48, %r8 + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%rdi) +# ifdef USE_AS_STPCPY + lea 64(%rdi), %rax +# 
endif + ret + + .p2align 4 +L(Unaligned64LeaveCase2): /* test %xmm4..%xmm7 in turn with pcmpeqb/pmovmskb against %xmm0; a block is stored only after the next one has been tested, %rcx tracks the byte offset of the block under test, and %r8 is counted down by 16 per block */ + xor %rcx, %rcx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $48, %r8 + jle L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnalignedXmm4) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm4, (%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnalignedXmm5) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm5, 16(%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnalignedXmm6) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm6, 32(%rdi) + lea 16(%rdi, %rcx), %rdi + lea 16(%rsi, %rcx), %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(ExitZero): /* return %rdi in %rax without storing anything */ + mov %rdi, %rax + ret + +# endif + +END (STRCPY) + + .p2align 4 + .section .rodata +L(ExitTable): /* jump table in .rodata: one .int JMPTBL entry for each of L(Exit1)..L(Exit32), each expressed relative to L(ExitTable) */ + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26),
L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +# ifdef USE_AS_STRNCPY +L(ExitStrncpyTable): /* assembled only when USE_AS_STRNCPY is defined: entries L(StrncpyExit0)..L(StrncpyExit33), selected by BRANCH_TO_JMPTBL_ENTRY with %r8 as the index and a scale of 4 */ + .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) + .int
JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) + .p2align 4 +L(FillTable): /* entries L(Fill0)..L(Fill16); the zero-fill paths branch here through BRANCH_TO_JMPTBL_ENTRY with %r8 as the index, so entry N is taken with %r8 == N */ + .int JMPTBL(L(Fill0), L(FillTable)) + .int JMPTBL(L(Fill1), L(FillTable)) + .int JMPTBL(L(Fill2), L(FillTable)) + .int JMPTBL(L(Fill3), L(FillTable)) + .int JMPTBL(L(Fill4), L(FillTable)) + .int JMPTBL(L(Fill5), L(FillTable)) + .int JMPTBL(L(Fill6), L(FillTable)) + .int JMPTBL(L(Fill7), L(FillTable)) + .int JMPTBL(L(Fill8), L(FillTable)) + .int JMPTBL(L(Fill9), L(FillTable)) + .int JMPTBL(L(Fill10), L(FillTable)) + .int JMPTBL(L(Fill11), L(FillTable)) + .int JMPTBL(L(Fill12), L(FillTable)) + .int JMPTBL(L(Fill13), L(FillTable)) + .int JMPTBL(L(Fill14), L(FillTable)) + .int JMPTBL(L(Fill15), L(FillTable)) + .int JMPTBL(L(Fill16), L(FillTable)) +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S new file mode 100644 index 0000000..efbd3bf --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -0,0 +1,3721 @@ +/* strcpy with SSSE3 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA.
*/ + +#ifndef NOT_IN_libc + +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_ssse3 +# endif + + .section .text.ssse3,"ax",@progbits +ENTRY (STRCPY) /* register setup: %rcx = incoming %rsi, %rdx = incoming %rdi and, for strncpy builds, %r8 = incoming %rdx (presumably source, destination and length per the strcpy/strncpy argument order); the first 16 source bytes are then checked one at a time for a zero byte */ + mov %rsi, %rcx +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 +# endif + mov %rdi, %rdx +# ifdef USE_AS_STRNCPY + test %r8, %r8 + jz L(Exit0) + cmp $8, %r8 + jbe L(StrncpyExit8Bytes) +# endif + cmpb $0, (%rcx) + jz L(Exit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmpb $0, 7(%rcx) + jz L(Exit8) +# ifdef USE_AS_STRNCPY + cmp $16, %r8 + jb L(StrncpyExit15Bytes) +# endif + cmpb $0, 8(%rcx) + jz L(Exit9) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmpb $0, 14(%rcx) + jz L(Exit15) +# ifdef USE_AS_STRNCPY + cmp $16, %r8 + je L(Exit16) +# endif + cmpb $0, 15(%rcx) + jz L(Exit16) + +# ifdef USE_AS_STRNCPY + mov %rcx, %rsi + and $0xf, %rsi + +/* add rcx_shift (rcx & 0xf) to r8 */ + + add %rsi, %r8 +# endif + lea 16(%rcx), %rsi +/* Now: + rsi = alignment_16(rcx) + rcx_shift + 16; + rcx_shift = rcx - alignment_16(rcx) +*/ + and $-16, %rsi +/* Now: + rsi = alignment_16(rcx) + 16 +*/ + pxor %xmm0, %xmm0 + mov (%rcx), %r9 + mov %r9, (%rdx) +/* + check whether there is a zero byte in the next 16 bytes of the string, + from rsi to rsi + 15, and form the mask in xmm0 +*/ + pcmpeqb (%rsi), %xmm0 + mov 8(%rcx), %r9 + mov %r9, 8(%rdx) + +/* convert byte mask in xmm0 to bit mask */ + + pmovmskb %xmm0, %rax + sub %rcx, %rsi + +/* rsi = 16 - rcx_shift */ + +/* rax = 0: there is no end of string between positions rsi and rsi+15 */ + +# ifdef USE_AS_STRNCPY + sub $32, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + mov %rdx, %rax + lea 16(%rdx), %rdx +/* Now: + rdx = rdx + 16 = alignment_16(rdx) + rdx_shift + 16 +*/ +
and $-16, %rdx + +/* Now: rdx = alignment_16(rdx) + 16 */ + + sub %rdx, %rax + +/* Now: rax = rdx_shift - 16 */ + +# ifdef USE_AS_STRNCPY + add %rax, %rsi + lea -1(%rsi), %rsi + and $1<<31, %esi /* rsi held rdx_shift - rcx_shift - 1, so bit 31 survives only when that value is negative, i.e. rcx_shift >= rdx_shift */ + test %rsi, %rsi + jnz L(ContinueCopy) + lea 16(%r8), %r8 + +L(ContinueCopy): +# endif + sub %rax, %rcx +/* Now: + case rcx_shift >= rdx_shift: + rcx = alignment_16(rcx) + (rcx_shift - rdx_shift) + 16 + case rcx_shift < rdx_shift: + rcx = alignment_16(rcx) + (16 + rcx_shift - rdx_shift) +*/ + mov %rcx, %rax + and $0xf, %rax +/* Now: + case rcx_shift >= rdx_shift: rax = rcx_shift - rdx_shift + case rcx_shift < rdx_shift: rax = (16 + rcx_shift - rdx_shift) + rax can be 0, 1, ..., 15 +*/ + mov $0, %rsi + +/* case: rcx_shift == rdx_shift (the mov above leaves the flags of the preceding and untouched for this jz) */ + + jz L(Align16Both) + + cmp $8, %rax + jae L(ShlHigh8) + cmp $1, %rax + je L(Shl1) + cmp $2, %rax + je L(Shl2) + cmp $3, %rax + je L(Shl3) + cmp $4, %rax + je L(Shl4) + cmp $5, %rax + je L(Shl5) + cmp $6, %rax + je L(Shl6) + jmp L(Shl7) + +L(ShlHigh8): /* the flags still come from the cmp $8 before the jae, so the first je is taken when rax == 8 */ + je L(Shl8) + cmp $9, %rax + je L(Shl9) + cmp $10, %rax + je L(Shl10) + cmp $11, %rax + je L(Shl11) + cmp $12, %rax + je L(Shl12) + cmp $13, %rax + je L(Shl13) + cmp $14, %rax + je L(Shl14) + jmp L(Shl15) + +L(Align16Both): /* equal-shift path: aligned movaps loads and stores; each newly loaded block is compared against %xmm0 before it is stored, and %rsi advances by 16 per block */ + movaps (%rcx), %xmm1 + movaps 16(%rcx), %xmm2 + movaps %xmm1, (%rdx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm4 + movaps %xmm3, (%rdx, %rsi) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz
L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm1 + movaps %xmm4, (%rdx, %rsi) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm2 + movaps %xmm1, (%rdx, %rsi) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%rdx, %rsi) + mov %rcx, %rax + lea 16(%rcx, %rsi), %rcx + and $-0x40, %rcx + sub %rcx, %rax + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + lea 48+64(%r8, %rax), %r8 +# endif + mov $-0x40, %rsi + +L(Aligned64Loop): /* 64 bytes per iteration from a 64-byte aligned %rcx: pminub folds the four loaded blocks into %xmm3 so a single pcmpeqb/pmovmskb against %xmm0 covers all of them; the blocks are stored only when the mask is zero */ + movaps (%rcx), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rcx), %xmm5 + movaps 32(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rcx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rax + lea 64(%rdx), %rdx + lea 64(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeaveCase2OrCase3) +# endif + test %rax, %rax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%rdx) + movaps %xmm5, -48(%rdx) + movaps %xmm6, -32(%rdx) + movaps %xmm7, -16(%rdx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): /* the combined mask was non-zero: test %xmm4..%xmm7 one block at a time, storing each block that has no hit; the lea between test and jnz is safe because lea does not modify the flags */ +# ifdef USE_AS_STRNCPY + lea 48(%r8), %r8 +# endif + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 +# ifdef USE_AS_STRNCPY +
lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%rdx) + pcmpeqb %xmm7, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl1): /* taken from the dispatch when (%rcx & 0xf) == 1: the source is read with aligned loads at -1(%rcx), 15(%rcx), ... and neighbouring blocks are merged with palignr $1 before each aligned store to (%rdx) */ + movaps -1(%rcx), %xmm1 + movaps 15(%rcx), %xmm2 +L(Shl1Start): /* four unrolled 16-byte steps; each tests the newly loaded block against %xmm0 and leaves through L(Shl1LoopExit) on a hit */ + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 31(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -15(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -1(%rcx), %xmm1 + +L(Shl1LoopStart): /* 64 source bytes per iteration; pminub folds the four loaded blocks so one pcmpeqb/pmovmskb detects a hit in any of them, in which case control returns to L(Shl1Start) */ + movaps 15(%rcx), %xmm2 + movaps 31(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 47(%rcx), %xmm4
+ movaps %xmm4, %xmm7 + movaps 63(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + test %rax, %rax + palignr $1, %xmm3, %xmm4 + jnz L(Shl1Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave1) +# endif + palignr $1, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $1, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl1LoopStart) + +L(Shl1LoopExit): /* rebuild the block at (%rdx): bytes 0..14 come from bytes 1..15 of %xmm1, byte 15 keeps what is already stored there; %rsi = 15 is handed to L(CopyFrom1To16Bytes) */ + movaps (%rdx), %xmm6 + psrldq $15, %xmm6 + mov $15, %rsi + palignr $1, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl2): /* taken from the dispatch when (%rcx & 0xf) == 2: aligned loads at -2(%rcx), 14(%rcx), ... merged with palignr $2 before each aligned store to (%rdx) */ + movaps -2(%rcx), %xmm1 + movaps 14(%rcx), %xmm2 +L(Shl2Start): /* four unrolled 16-byte steps; each tests the newly loaded block against %xmm0 and leaves through L(Shl2LoopExit) on a hit */ + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax +
jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 30(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -14(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -2(%rcx), %xmm1 + +L(Shl2LoopStart): /* 64 source bytes per iteration; pminub folds the four loaded blocks so one pcmpeqb/pmovmskb detects a hit in any of them, in which case control returns to L(Shl2Start) */ + movaps 14(%rcx), %xmm2 + movaps 30(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 46(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 62(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + test %rax, %rax + palignr $2, %xmm3, %xmm4 + jnz L(Shl2Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave2) +# endif + palignr $2, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $2, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl2LoopStart) + +L(Shl2LoopExit): /* rebuild the block at (%rdx): bytes 0..13 come from bytes 2..15 of %xmm1, bytes 14..15 keep what is already stored there; %rsi = 14 is handed to L(CopyFrom1To16Bytes) */ + movaps (%rdx), %xmm6 + psrldq $14, %xmm6 + mov $14, %rsi + palignr $2, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl3): /* taken from the dispatch when (%rcx & 0xf) == 3: aligned loads at -3(%rcx), 13(%rcx), ... merged with palignr $3 before each aligned store to (%rdx) */ + movaps -3(%rcx), %xmm1 + movaps 13(%rcx), %xmm2 +L(Shl3Start): /* four unrolled 16-byte steps; each tests the newly loaded block against %xmm0 and leaves through L(Shl3LoopExit) on a hit */ + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef
USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 29(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -13(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -3(%rcx), %xmm1 + +L(Shl3LoopStart): /* 64 source bytes per iteration; pminub folds the four loaded blocks so one pcmpeqb/pmovmskb detects a hit in any of them, in which case control returns to L(Shl3Start) */ + movaps 13(%rcx), %xmm2 + movaps 29(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 45(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 61(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + test %rax, %rax + palignr $3, %xmm3, %xmm4 + jnz L(Shl3Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave3) +# endif + palignr $3, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $3, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl3LoopStart) + +L(Shl3LoopExit): /* rebuild the block at (%rdx): bytes 0..12 come from bytes 3..15 of %xmm1, bytes 13..15 keep what is already stored there; %rsi = 13 is handed to L(CopyFrom1To16Bytes) */ + movaps (%rdx), %xmm6 + psrldq $13, %xmm6 + mov $13, %rsi + palignr $3, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl4): /* taken from the dispatch when (%rcx & 0xf) == 4: aligned loads at -4(%rcx), 12(%rcx), ... merged with palignr $4 before each aligned store to (%rdx) */ + movaps -4(%rcx), %xmm1 + movaps 12(%rcx), %xmm2 +L(Shl4Start): /* four unrolled 16-byte steps; each tests the newly loaded block against %xmm0 and leaves through L(Shl4LoopExit) on a hit */ + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea
16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 28(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -12(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -4(%rcx), %xmm1 + +L(Shl4LoopStart): /* 64 source bytes per iteration; pminub folds the four loaded blocks so one pcmpeqb/pmovmskb detects a hit in any of them, in which case control returns to L(Shl4Start) */ + movaps 12(%rcx), %xmm2 + movaps 28(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %rax, %rax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave4) +# endif + palignr $4, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): /* rebuild the block at (%rdx): bytes 0..11 come from bytes 4..15 of %xmm1, bytes 12..15 keep what is already stored there; %rsi = 12 is handed to L(CopyFrom1To16Bytes) */ + movaps (%rdx), %xmm6 + psrldq $12, %xmm6 + mov $12, %rsi + palignr $4, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp
L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl5): /* taken from the dispatch when (%rcx & 0xf) == 5: aligned loads at -5(%rcx), 11(%rcx), ... merged with palignr $5 before each aligned store to (%rdx) */ + movaps -5(%rcx), %xmm1 + movaps 11(%rcx), %xmm2 +L(Shl5Start): /* four unrolled 16-byte steps; each tests the newly loaded block against %xmm0 and leaves through L(Shl5LoopExit) on a hit */ + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 27(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -11(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -5(%rcx), %xmm1 + +L(Shl5LoopStart): /* 64 source bytes per iteration; pminub folds the four loaded blocks so one pcmpeqb/pmovmskb detects a hit in any of them, in which case control returns to L(Shl5Start) */ + movaps 11(%rcx), %xmm2 + movaps 27(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 43(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 59(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + test %rax, %rax + palignr $5, %xmm3, %xmm4 + jnz L(Shl5Start) +# ifdef USE_AS_STRNCPY
+ sub $64, %r8 + jbe L(StrncpyLeave5) +# endif + palignr $5, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $5, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl5LoopStart) + +L(Shl5LoopExit): /* rebuild the block at (%rdx): bytes 0..10 come from bytes 5..15 of %xmm1, bytes 11..15 keep what is already stored there; %rsi = 11 is handed to L(CopyFrom1To16Bytes) */ + movaps (%rdx), %xmm6 + psrldq $11, %xmm6 + mov $11, %rsi + palignr $5, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl6): /* taken from the dispatch when (%rcx & 0xf) == 6: aligned loads at -6(%rcx), 10(%rcx), ... merged with palignr $6 before each aligned store to (%rdx) */ + movaps -6(%rcx), %xmm1 + movaps 10(%rcx), %xmm2 +L(Shl6Start): /* four unrolled 16-byte steps; each tests the newly loaded block against %xmm0 and leaves through L(Shl6LoopExit) on a hit */ + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 26(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -10(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -6(%rcx),
%xmm1 + +L(Shl6LoopStart): /* 64 source bytes per iteration; pminub folds the four loaded blocks so one pcmpeqb/pmovmskb detects a hit in any of them, in which case control returns to L(Shl6Start) */ + movaps 10(%rcx), %xmm2 + movaps 26(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 42(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 58(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + test %rax, %rax + palignr $6, %xmm3, %xmm4 + jnz L(Shl6Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave6) +# endif + palignr $6, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $6, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl6LoopStart) + +L(Shl6LoopExit): /* rebuild the block at (%rdx): bytes 0..9 come from bytes 6..15 of %xmm1, bytes 10..15 keep what is already stored there; %rsi = 10 is handed to L(CopyFrom1To16Bytes) */ + movaps (%rdx), %xmm6 + psrldq $10, %xmm6 + mov $10, %rsi + palignr $6, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl7): /* taken from the dispatch when (%rcx & 0xf) == 7: aligned loads at -7(%rcx), 9(%rcx), ... merged with palignr $7 before each aligned store to (%rdx) */ + movaps -7(%rcx), %xmm1 + movaps 9(%rcx), %xmm2 +L(Shl7Start): /* four unrolled 16-byte steps; each tests the newly loaded block against %xmm0 and leaves through L(Shl7LoopExit) on a hit */ + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +
movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 25(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -9(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -7(%rcx), %xmm1 + +L(Shl7LoopStart): /* 64 source bytes per iteration; pminub folds the four loaded blocks so one pcmpeqb/pmovmskb detects a hit in any of them, in which case control returns to L(Shl7Start) */ + movaps 9(%rcx), %xmm2 + movaps 25(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 41(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 57(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + test %rax, %rax + palignr $7, %xmm3, %xmm4 + jnz L(Shl7Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave7) +# endif + palignr $7, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $7, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl7LoopStart) + +L(Shl7LoopExit): /* rebuild the block at (%rdx): bytes 0..8 come from bytes 7..15 of %xmm1, bytes 9..15 keep what is already stored there; %rsi = 9 is handed to L(CopyFrom1To16Bytes) */ + movaps (%rdx), %xmm6 + psrldq $9, %xmm6 + mov $9, %rsi + palignr $7, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl8): /* taken from the dispatch when (%rcx & 0xf) == 8: aligned loads at -8(%rcx), 8(%rcx), ... merged with palignr $8 before each aligned store to (%rdx) */ + movaps -8(%rcx), %xmm1 + movaps 8(%rcx), %xmm2 +L(Shl8Start): /* four unrolled 16-byte steps; each tests the newly loaded block against %xmm0 and leaves through L(Shl8LoopExit) on a hit */ + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + movaps %xmm3, %xmm1 + +
pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 24(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -8(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -8(%rcx), %xmm1 + +L(Shl8LoopStart): /* 64 source bytes per iteration; pminub folds the four loaded blocks so one pcmpeqb/pmovmskb detects a hit in any of them, in which case control returns to L(Shl8Start) */ + movaps 8(%rcx), %xmm2 + movaps 24(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %rax, %rax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave8) +# endif + palignr $8, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): /* rebuild the block at (%rdx): bytes 0..7 come from bytes 8..15 of %xmm1, bytes 8..15 keep what is already stored there; %rsi = 8 is handed to L(CopyFrom1To16Bytes) */ + movaps (%rdx), %xmm6 + psrldq $8, %xmm6 + mov $8, %rsi + palignr $8, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl9): /* taken from the dispatch when (%rcx & 0xf) == 9: aligned loads at -9(%rcx), 7(%rcx), ... merged with palignr $9 before each aligned store to (%rdx) */ + movaps -9(%rcx), %xmm1 + movaps 7(%rcx), %xmm2 +L(Shl9Start): /* four unrolled 16-byte steps; each tests the newly loaded block against %xmm0 and leaves through L(Shl9LoopExit) on a hit */ + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9,
%xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 23(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -7(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -9(%rcx), %xmm1 + +L(Shl9LoopStart): /* 64 source bytes per iteration; pminub folds the four loaded blocks so one pcmpeqb/pmovmskb detects a hit in any of them, in which case control returns to L(Shl9Start) */ + movaps 7(%rcx), %xmm2 + movaps 23(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 39(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 55(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + test %rax, %rax + palignr $9, %xmm3, %xmm4 + jnz L(Shl9Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave9) +# endif + palignr $9, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $9, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl9LoopStart) + +L(Shl9LoopExit): /* rebuild the block at (%rdx): bytes 0..6 come from bytes 9..15 of %xmm1, bytes 7..15 keep what is already stored there; %rsi = 7 is handed to L(CopyFrom1To16Bytes) */ + movaps
(%rdx), %xmm6 + psrldq $7, %xmm6 + mov $7, %rsi + palignr $9, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl10): + movaps -10(%rcx), %xmm1 + movaps 6(%rcx), %xmm2 +L(Shl10Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 22(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -6(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -10(%rcx), %xmm1 + +L(Shl10LoopStart): + movaps 6(%rcx), %xmm2 + movaps 22(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 38(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 54(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, 
%xmm7 + palignr $10, %xmm4, %xmm5 + test %rax, %rax + palignr $10, %xmm3, %xmm4 + jnz L(Shl10Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave10) +# endif + palignr $10, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $10, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl10LoopStart) + +L(Shl10LoopExit): + movaps (%rdx), %xmm6 + psrldq $6, %xmm6 + mov $6, %rsi + palignr $10, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl11): + movaps -11(%rcx), %xmm1 + movaps 5(%rcx), %xmm2 +L(Shl11Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 21(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax 
+ and $-0x40, %rcx + sub %rcx, %rax + lea -5(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -11(%rcx), %xmm1 + +L(Shl11LoopStart): + movaps 5(%rcx), %xmm2 + movaps 21(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 37(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 53(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + test %rax, %rax + palignr $11, %xmm3, %xmm4 + jnz L(Shl11Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave11) +# endif + palignr $11, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $11, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl11LoopStart) + +L(Shl11LoopExit): + movaps (%rdx), %xmm6 + psrldq $5, %xmm6 + mov $5, %rsi + palignr $11, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl12): + movaps -12(%rcx), %xmm1 + movaps 4(%rcx), %xmm2 +L(Shl12Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 
+ movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 20(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -4(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -12(%rcx), %xmm1 + +L(Shl12LoopStart): + movaps 4(%rcx), %xmm2 + movaps 20(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %rax, %rax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave12) +# endif + palignr $12, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + movaps (%rdx), %xmm6 + psrldq $4, %xmm6 + mov $4, %rsi + palignr $12, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl13): + movaps -13(%rcx), %xmm1 + movaps 3(%rcx), %xmm2 +L(Shl13Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe 
L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 19(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -3(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -13(%rcx), %xmm1 + +L(Shl13LoopStart): + movaps 3(%rcx), %xmm2 + movaps 19(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 35(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 51(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + test %rax, %rax + palignr $13, %xmm3, %xmm4 + jnz L(Shl13Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave13) +# endif + palignr $13, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $13, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl13LoopStart) + +L(Shl13LoopExit): + movaps (%rdx), %xmm6 + psrldq $3, %xmm6 + mov $3, %rsi + palignr $13, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl14): + movaps -14(%rcx), %xmm1 + movaps 2(%rcx), %xmm2 +L(Shl14Start): + 
pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 18(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -2(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -14(%rcx), %xmm1 + +L(Shl14LoopStart): + movaps 2(%rcx), %xmm2 + movaps 18(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 34(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 50(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + test %rax, %rax + palignr $14, %xmm3, %xmm4 + jnz L(Shl14Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave14) +# endif + palignr $14, %xmm2, %xmm3 + lea 64(%rcx), %rcx + 
palignr $14, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl14LoopStart) + +L(Shl14LoopExit): + movaps (%rdx), %xmm6 + psrldq $2, %xmm6 + mov $2, %rsi + palignr $14, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl15): + movaps -15(%rcx), %xmm1 + movaps 1(%rcx), %xmm2 +L(Shl15Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + lea 17(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -1(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -15(%rcx), %xmm1 + +L(Shl15LoopStart): + movaps 1(%rcx), %xmm2 + movaps 17(%rcx), %xmm3 + 
movaps %xmm3, %xmm6 + movaps 33(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 49(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + test %rax, %rax + palignr $15, %xmm3, %xmm4 + jnz L(Shl15Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave15) +# endif + palignr $15, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $15, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl15LoopStart) + +L(Shl15LoopExit): + movaps (%rdx), %xmm6 + psrldq $1, %xmm6 + mov $1, %rsi + palignr $15, %xmm1, %xmm6 + movaps %xmm6, (%rdx) +# ifdef USE_AS_STRCAT + jmp L(CopyFrom1To16Bytes) +# endif + + + .p2align 4 +L(CopyFrom1To16Bytes): +# ifdef USE_AS_STRNCPY + add $16, %r8 +# endif + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + + .p2align 4 +L(Exit8): + mov (%rcx), %rax + mov %rax, (%rdx) +# ifdef USE_AS_STPCPY + lea 7(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $8, %r8 + lea 8(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + + .p2align 4 +L(Exit16): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) +# ifdef USE_AS_STPCPY + lea 15(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY 
+ sub $16, %r8 + lea 16(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rsi, %rcx + lea (%rsi, %rdx), %rsi + lea -9(%r8), %rdx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%rsi), %rdx + jz L(ExitHighCase2) + + cmp $1, %r8 + je L(Exit1) + test $0x01, %al + jnz L(Exit1) + cmp $2, %r8 + je L(Exit2) + test $0x02, %al + jnz L(Exit2) + cmp $3, %r8 + je L(Exit3) + test $0x04, %al + jnz L(Exit3) + cmp $4, %r8 + je L(Exit4) + test $0x08, %al + jnz L(Exit4) + cmp $5, %r8 + je L(Exit5) + test $0x10, %al + jnz L(Exit5) + cmp $6, %r8 + je L(Exit6) + test $0x20, %al + jnz L(Exit6) + cmp $7, %r8 + je L(Exit7) + test $0x40, %al + jnz L(Exit7) + jmp L(Exit8) + + .p2align 4 +L(ExitHighCase2): + cmp $9, %r8 + je L(Exit9) + test $0x01, %ah + jnz L(Exit9) + cmp $10, %r8 + je L(Exit10) + test $0x02, %ah + jnz L(Exit10) + cmp $11, %r8 + je L(Exit11) + test $0x04, %ah + jnz L(Exit11) + cmp $12, %r8 + je L(Exit12) + test $0x8, %ah + jnz L(Exit12) + cmp $13, %r8 + je L(Exit13) + test $0x10, %ah + jnz L(Exit13) + cmp $14, %r8 + je L(Exit14) + test $0x20, %ah + jnz L(Exit14) + cmp $15, %r8 + je L(Exit15) + test $0x40, %ah + jnz L(Exit15) + jmp L(Exit16) + +L(CopyFrom1To16BytesCase2OrCase3): + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rsi, %rdx + add %rsi, %rcx + + cmp $16, %r8 + je L(Exit16) + cmp $8, %r8 + je L(Exit8) + jg L(More8Case3) + cmp $4, %r8 + je L(Exit4) + jg L(More4Case3) + cmp $2, %r8 + jl L(Exit1) + je L(Exit2) + jg L(Exit3) +L(More8Case3): /* but less than 16 */ + cmp $12, %r8 + je L(Exit12) + jl L(Less12Case3) + cmp $14, %r8 + jl L(Exit13) + je L(Exit14) + jg L(Exit15) +L(More4Case3): /* but less than 8 */ + cmp $6, %r8 + jl L(Exit5) + je L(Exit6) + jg L(Exit7) +L(Less12Case3): /* but more than 8 */ + cmp $10, %r8 + jl L(Exit9) 
+ je L(Exit10) + jg L(Exit11) +# endif + + .p2align 4 +L(Exit1): + movb (%rcx), %al + movb %al, (%rdx) +# ifdef USE_AS_STPCPY + lea (%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $1, %r8 + lea 1(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit2): + movw (%rcx), %ax + movw %ax, (%rdx) +# ifdef USE_AS_STPCPY + lea 1(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $2, %r8 + lea 2(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit3): + movw (%rcx), %ax + movw %ax, (%rdx) + movb 2(%rcx), %al + movb %al, 2(%rdx) +# ifdef USE_AS_STPCPY + lea 2(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $3, %r8 + lea 3(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit4): + movl (%rcx), %eax + movl %eax, (%rdx) +# ifdef USE_AS_STPCPY + lea 3(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $4, %r8 + lea 4(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit5): + movl (%rcx), %eax + movl %eax, (%rdx) + movb 4(%rcx), %al + movb %al, 4(%rdx) +# ifdef USE_AS_STPCPY + lea 4(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $5, %r8 + lea 5(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit6): + movl (%rcx), %eax + movl %eax, (%rdx) + movw 4(%rcx), %ax + movw %ax, 4(%rdx) +# ifdef USE_AS_STPCPY + lea 5(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $6, %r8 + lea 6(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb 
$1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit7): + movl (%rcx), %eax + movl %eax, (%rdx) + movl 3(%rcx), %eax + movl %eax, 3(%rdx) +# ifdef USE_AS_STPCPY + lea 6(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $7, %r8 + lea 7(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit9): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 5(%rcx), %eax + mov %eax, 5(%rdx) +# ifdef USE_AS_STPCPY + lea 8(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $9, %r8 + lea 9(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit10): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 6(%rcx), %eax + mov %eax, 6(%rdx) +# ifdef USE_AS_STPCPY + lea 9(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $10, %r8 + lea 10(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit11): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %eax + mov %eax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 10(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $11, %r8 + lea 11(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit12): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) +# ifdef USE_AS_STPCPY + lea 11(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $12, %r8 + lea 12(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit13): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 5(%rcx), %rax + mov %rax, 5(%rdx) +# ifdef USE_AS_STPCPY + lea 12(%rdx), %rax +# else + mov 
%rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $13, %r8 + lea 13(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit14): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 6(%rcx), %rax + mov %rax, 6(%rdx) +# ifdef USE_AS_STPCPY + lea 13(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $14, %r8 + lea 14(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit15): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %rax + mov %rax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 14(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $15, %r8 + lea 15(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(Fill0): + ret + + .p2align 4 +L(Fill1): + movb %dl, (%rcx) + ret + + .p2align 4 +L(Fill2): + movw %dx, (%rcx) + ret + + .p2align 4 +L(Fill3): + movw %dx, (%rcx) + movb %dl, 2(%rcx) + ret + + .p2align 4 +L(Fill4): + movl %edx, (%rcx) + ret + + .p2align 4 +L(Fill5): + movl %edx, (%rcx) + movb %dl, 4(%rcx) + ret + + .p2align 4 +L(Fill6): + movl %edx, (%rcx) + movw %dx, 4(%rcx) + ret + + .p2align 4 +L(Fill7): + movl %edx, (%rcx) + movl %edx, 3(%rcx) + ret + + .p2align 4 +L(Fill8): + mov %rdx, (%rcx) + ret + + .p2align 4 +L(Fill9): + mov %rdx, (%rcx) + movb %dl, 8(%rcx) + ret + + .p2align 4 +L(Fill10): + mov %rdx, (%rcx) + movw %dx, 8(%rcx) + ret + + .p2align 4 +L(Fill11): + mov %rdx, (%rcx) + movl %edx, 7(%rcx) + ret + + .p2align 4 +L(Fill12): + mov %rdx, (%rcx) + movl %edx, 8(%rcx) + ret + + .p2align 4 +L(Fill13): + mov %rdx, (%rcx) + mov %rdx, 5(%rcx) + ret + + .p2align 4 +L(Fill14): + mov %rdx, (%rcx) + mov %rdx, 6(%rcx) + ret + + .p2align 4 +L(Fill15): + mov %rdx, (%rcx) + mov %rdx, 7(%rcx) + ret + + .p2align 4 +L(Fill16): + mov 
%rdx, (%rcx) + mov %rdx, 8(%rcx) + ret + + .p2align 4 +L(StrncpyFillExit1): + lea 16(%r8), %r8 +L(FillFrom1To16Bytes): + test %r8, %r8 + jz L(Fill0) + cmp $16, %r8 + je L(Fill16) + cmp $8, %r8 + je L(Fill8) + jg L(FillMore8) + cmp $4, %r8 + je L(Fill4) + jg L(FillMore4) + cmp $2, %r8 + jl L(Fill1) + je L(Fill2) + jg L(Fill3) +L(FillMore8): /* but less than 16 */ + cmp $12, %r8 + je L(Fill12) + jl L(FillLess12) + cmp $14, %r8 + jl L(Fill13) + je L(Fill14) + jg L(Fill15) +L(FillMore4): /* but less than 8 */ + cmp $6, %r8 + jl L(Fill5) + je L(Fill6) + jg L(Fill7) +L(FillLess12): /* but more than 8 */ + cmp $10, %r8 + jl L(Fill9) + je L(Fill10) + jmp L(Fill11) + + .p2align 4 +L(StrncpyFillTailWithZero1): + xor %rdx, %rdx + sub $16, %r8 + jbe L(StrncpyFillExit1) + + pxor %xmm0, %xmm0 + mov %rdx, (%rcx) + mov %rdx, 8(%rcx) + + lea 16(%rcx), %rcx + + mov %rcx, %rdx + and $0xf, %rdx + sub %rdx, %rcx + add %rdx, %r8 + xor %rdx, %rdx + sub $64, %r8 + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%rcx) + movdqa %xmm0, 16(%rcx) + movdqa %xmm0, 32(%rcx) + movdqa %xmm0, 48(%rcx) + lea 64(%rcx), %rcx + sub $64, %r8 + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %r8 + jl L(StrncpyFillLess32) + movdqa %xmm0, (%rcx) + movdqa %xmm0, 16(%rcx) + lea 32(%rcx), %rcx + sub $16, %r8 + jl L(StrncpyFillExit1) + movdqa %xmm0, (%rcx) + lea 16(%rcx), %rcx + jmp L(FillFrom1To16Bytes) + +L(StrncpyFillLess32): + add $16, %r8 + jl L(StrncpyFillExit1) + movdqa %xmm0, (%rcx) + lea 16(%rcx), %rcx + jmp L(FillFrom1To16Bytes) + + .p2align 4 +L(Exit0): + mov %rdx, %rax + ret + + .p2align 4 +L(StrncpyExit15Bytes): + cmp $9, %r8 + je L(Exit9) + cmpb $0, 8(%rcx) + jz L(Exit9) + cmp $10, %r8 + je L(Exit10) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmp $11, %r8 + je L(Exit11) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmp $12, %r8 + je L(Exit12) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmp $13, %r8 + je L(Exit13) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmp $14, %r8 + je L(Exit14) + 
cmpb $0, 13(%rcx) + jz L(Exit14) + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %rax + mov %rax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 14(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax +# else + mov %rdi, %rax +# endif + ret + + .p2align 4 +L(StrncpyExit8Bytes): + cmp $1, %r8 + je L(Exit1) + cmpb $0, (%rcx) + jz L(Exit1) + cmp $2, %r8 + je L(Exit2) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmp $3, %r8 + je L(Exit3) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmp $4, %r8 + je L(Exit4) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmp $5, %r8 + je L(Exit5) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmp $6, %r8 + je L(Exit6) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmp $7, %r8 + je L(Exit7) + cmpb $0, 6(%rcx) + jz L(Exit7) + mov (%rcx), %rax + mov %rax, (%rdx) +# ifdef USE_AS_STPCPY + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax +# else + mov %rdi, %rax +# endif + ret + +# endif + +# ifdef USE_AS_STRNCPY + +L(StrncpyLeaveCase2OrCase3): + test %rax, %rax + jnz L(Aligned64LeaveCase2) + +L(Aligned64LeaveCase3): + lea 64(%r8), %r8 + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm4, -64(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm5, -48(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm6, -32(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + jmp L(CopyFrom1To16BytesCase3) + +L(Aligned64LeaveCase2): + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + add $48, %r8 + jle L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm6, -32(%rdx) + lea 
16(%rsi), %rsi + lea -16(%r8), %r8 + jmp L(CopyFrom1To16BytesCase2) +/*--------------------------------------------------*/ +L(StrncpyExit1Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $15, %xmm6 + mov $15, %rsi + palignr $1, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit2Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $14, %xmm6 + mov $14, %rsi + palignr $2, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit3Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $13, %xmm6 + mov $13, %rsi + palignr $3, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit4Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $12, %xmm6 + mov $12, %rsi + palignr $4, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit5Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $11, %xmm6 + mov $11, %rsi + palignr $5, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit6Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $10, %xmm6 + mov $10, %rsi + palignr $6, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit7Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $9, %xmm6 + mov $9, %rsi + palignr $7, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit8Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $8, %xmm6 + mov $8, %rsi + palignr $8, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit9Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $7, %xmm6 + mov 
$7, %rsi + palignr $9, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit10Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $6, %xmm6 + mov $6, %rsi + palignr $10, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit11Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $5, %xmm6 + mov $5, %rsi + palignr $11, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit12Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $4, %xmm6 + mov $4, %rsi + palignr $12, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit13Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $3, %xmm6 + mov $3, %rsi + palignr $13, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit14Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $2, %xmm6 + mov $2, %rsi + palignr $14, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyExit15Case2OrCase3): + movaps (%rdx), %xmm6 + psrldq $1, %xmm6 + mov $1, %rsi + palignr $15, %xmm1, %xmm6 + movaps %xmm6, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave1): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit1) + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit1) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 31+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit1) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe 
L(StrncpyExit1) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit1): + movaps (%rdx, %rsi), %xmm6 + psrldq $15, %xmm6 + palignr $1, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 15(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave2): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit2) + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit2) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 30+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit2): + movaps (%rdx, %rsi), %xmm6 + psrldq $14, %xmm6 + palignr $2, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 14(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave3): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit3) + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit3) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 29+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit3): + movaps (%rdx, %rsi), %xmm6 + psrldq $13, %xmm6 + palignr $3, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 13(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave4): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit4) + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps 
%xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit4) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 28+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit4): + movaps (%rdx, %rsi), %xmm6 + psrldq $12, %xmm6 + palignr $4, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 12(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave5): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit5) + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit5) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 27+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit5): + movaps (%rdx, %rsi), %xmm6 + psrldq $11, %xmm6 + palignr $5, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 11(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave6): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit6) + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit6) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 26+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + movaps %xmm7, %xmm1 + movaps %xmm5, 
48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit6): + movaps (%rdx, %rsi), %xmm6 + psrldq $10, %xmm6 + palignr $6, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 10(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave7): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit7) + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit7) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 25+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit7): + movaps (%rdx, %rsi), %xmm6 + psrldq $9, %xmm6 + palignr $7, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 9(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave8): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit8) + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit8) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 24+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit8): + movaps (%rdx, %rsi), %xmm6 + psrldq $8, %xmm6 + palignr $8, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 8(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave9): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit9) + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + lea 16(%rsi), 
%rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit9) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 23+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit9): + movaps (%rdx, %rsi), %xmm6 + psrldq $7, %xmm6 + palignr $9, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 7(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave10): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit10) + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit10) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 22+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit10): + movaps (%rdx, %rsi), %xmm6 + psrldq $6, %xmm6 + palignr $10, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 6(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave11): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit11) + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit11) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 21+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea 
-16(%r8), %r8 + +L(StrncpyExit11): + movaps (%rdx, %rsi), %xmm6 + psrldq $5, %xmm6 + palignr $11, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 5(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave12): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit12) + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit12) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 20+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit12): + movaps (%rdx, %rsi), %xmm6 + psrldq $4, %xmm6 + palignr $12, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 4(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave13): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit13) + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit13) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 19+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit13): + movaps (%rdx, %rsi), %xmm6 + psrldq $3, %xmm6 + palignr $13, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 3(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave14): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit14) + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps 
%xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit14) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 18+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit14): + movaps (%rdx, %rsi), %xmm6 + psrldq $2, %xmm6 + palignr $14, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 2(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave15): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit15) + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + lea 16(%rsi), %rsi + movaps %xmm2, %xmm3 + sub $16, %r8 + jbe L(StrncpyExit15) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, 16(%rdx) + movaps 17+16(%rcx), %xmm2 + movaps %xmm3, %xmm1 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + movaps %xmm2, %xmm1 + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit15): + movaps (%rdx, %rsi), %xmm6 + psrldq $1, %xmm6 + palignr $15, %xmm1, %xmm6 + movaps %xmm6, (%rdx, %rsi) + lea 1(%rsi), %rsi + jmp L(CopyFrom1To16BytesCase3) +# endif + +END (STRCPY) + +#endif diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S index 02fa8d0..381060f 100644 --- a/sysdeps/x86_64/multiarch/strcpy.S +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -1,5 +1,5 @@ -/* strcpy with SSSE3 - Copyright (C) 2009 Free Software Foundation, Inc. +/* Multiple versions of strcpy + Copyright (C) 2009, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. 
@@ -29,30 +29,32 @@ #ifdef USE_AS_STPCPY # ifdef USE_AS_STRNCPY -# define STRCPY_SSSE3 __stpncpy_ssse3 -# define STRCPY_SSE2 __stpncpy_sse2 -# define __GI_STRCPY __GI_stpncpy +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned +# define __GI_STRCPY __GI_stpncpy +# define __GI___STRCPY __GI___stpncpy # else -# define STRCPY_SSSE3 __stpcpy_ssse3 -# define STRCPY_SSE2 __stpcpy_sse2 -# define __GI_STRCPY __GI_stpcpy -# define __GI___STRCPY __GI___stpcpy +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define STRCPY_SSE2_UNALIGNED __stpcpy_sse2_unaligned +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy # endif #else # ifdef USE_AS_STRNCPY -# define STRCPY_SSSE3 __strncpy_ssse3 -# define STRCPY_SSE2 __strncpy_sse2 -# define __GI_STRCPY __GI_strncpy +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define STRCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned +# define __GI_STRCPY __GI_strncpy # else -# define STRCPY_SSSE3 __strcpy_ssse3 -# define STRCPY_SSE2 __strcpy_sse2 -# define __GI_STRCPY __GI_strcpy +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define STRCPY_SSE2_UNALIGNED __strcpy_sse2_unaligned +# define __GI_STRCPY __GI_strcpy # endif #endif -#ifndef LABEL -#define LABEL(l) L(l) -#endif /* Define multiple versions only for the definition in libc. 
*/ #ifndef NOT_IN_libc @@ -62,1830 +64,16 @@ ENTRY(STRCPY) cmpl $0, __cpu_features+KIND_OFFSET(%rip) jne 1f call __init_cpu_features -1: leaq STRCPY_SSE2(%rip), %rax +1: leaq STRCPY_SSE2_UNALIGNED(%rip), %rax + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) + jnz 2f + leaq STRCPY_SSE2(%rip), %rax testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) jz 2f leaq STRCPY_SSSE3(%rip), %rax 2: ret END(STRCPY) - .section .text.ssse3,"ax",@progbits -STRCPY_SSSE3: - cfi_startproc - CALL_MCOUNT - -/* - * This implementation uses SSE to copy up to 16 bytes at a time. - */ -#ifdef USE_AS_STRNCPY - test %rdx, %rdx - jz LABEL(strncpy_exitz) - mov %rdx, %r8 -#else - xor %edx, %edx -#endif - mov %esi, %ecx - and $0xfffffffffffffff0, %rsi /*force rsi 16 byte align*/ - and $15, %ecx - mov %rdi, %rax /*store return parameter*/ - - - pxor %xmm0, %xmm0 /* clear %xmm0 */ - pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/ - pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/ - shr %cl, %edx /* get real bits left in edx*/ - test %edx, %edx /* edx must be 0 if there is no null char from rsi+%rcx */ - jnz LABEL(less16bytes) - -#ifdef USE_AS_STRNCPY - lea -16(%r8,%rcx), %r11 - cmp $0, %r11 - jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes. 
*/ -#endif - - mov %rcx, %r9 - or %edi, %ecx - and $15, %ecx - lea -16(%r9), %r10 - jz LABEL(ashr_0) /* ecx must be 0 if offset of rsi and rdi is 16 byte align*/ - - neg %r10 /* store the rest in rsi aligned 16 bytes for unaligned_exit*/ - - pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation*/ - pcmpeqb 16(%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/ - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(less32bytes) - /* - * at least 16 byte available to fill destination rdi - */ -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(less32bytes_strncpy_truncation) -#endif - mov (%rsi, %r9), %rdx - mov %rdx, (%rdi) - mov 8(%rsi, %r9), %rdx - mov %rdx, 8(%rdi) - - /* - * so far destatination rdi may be aligned by 16, re-calculate rsi to jump - * crossponding case - * rcx is offset of rsi - * rax is offset of rdi - */ - - and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */ - mov %rax, %rdx /* rax store orignal rdi */ - xor %rdi, %rdx /* equal to and $15, %rdx */ -#ifdef USE_AS_STRNCPY - add %rdx, %r8 -#endif - - add $16, %rdi /* next 16 bytes for rdi */ - sub %rdx, %r9 - - lea 16(%r9, %rsi), %rsi /*re-calculate rsi by (16 - rdx)+ rcx */ - mov %esi, %ecx /*store offset of rsi */ - and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */ - - and $15, %ecx /* ecx must be 0 if rdx is equal to rcx*/ - jz LABEL(ashr_0) - - lea -16(%rcx), %r10 - mov %rcx, %r9 - neg %r10 - lea LABEL(unaligned_table)(%rip), %r11 - movslq (%r11, %rcx,4), %rcx - lea (%r11, %rcx), %rcx - jmp *%rcx - - /* - * The following cases will be handled by ashr_0 & ashr_0_start - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * 0 0 0 ashr_0 - * n(1~15) n(1~15) 0 ashr_0_start - * - */ - .p2align 5 -LABEL(ashr_0): -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_aligned) -#endif - movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */ - movdqa %xmm1, (%rdi) /* store first 16 bytes 
into rdi */ - add $16, %rsi - add $16, %rdi - pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */ - pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/ - - test %edx, %edx /* edx must be 0 if there is no null char in rsi*/ - jnz LABEL(aligned_16bytes) - -LABEL(ashr_0_loop): -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_aligned) -#endif - movdqa (%rsi, %rcx), %xmm1 - movdqa %xmm1, (%rdi, %rcx) - add $16, %rcx - pcmpeqb (%rsi, %rcx), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(aligned_exit) - -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_aligned) -#endif - movdqa (%rsi, %rcx), %xmm1 - movdqa %xmm1, (%rdi, %rcx) - add $16, %rcx - pcmpeqb (%rsi, %rcx), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(aligned_exit) - -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_aligned) -#endif - movdqa (%rsi, %rcx), %xmm1 - movdqa %xmm1, (%rdi, %rcx) - add $16, %rcx - pcmpeqb (%rsi, %rcx), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(aligned_exit) - -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_aligned) -#endif - movdqa (%rsi, %rcx), %xmm1 - movdqa %xmm1, (%rdi, %rcx) - add $16, %rcx - pcmpeqb (%rsi, %rcx), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jz LABEL(ashr_0_loop) - - jmp LABEL(aligned_exit) - .p2align 4 - -/* - * The following cases will be handled by ashr_15 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(15) n - 15 15((16 - (n -15) + n)%16 ashr_15 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_15): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_15_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz 
LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $15, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $15, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_15_use_ssse3) - -/* - * The following cases will be handled by ashr_14 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_14): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_14_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $14, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $14, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_14_use_ssse3) - -/* - * The following cases will be handled by ashr_13 - * rcx(offset 
of rsi) rax(offset of rdi) relative offset corresponding case - * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_13): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_13_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $13, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $13, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_13_use_ssse3) - -/* - * The following cases will be handled by ashr_12 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_12): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_12_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $12, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, 
%r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $12, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_12_use_ssse3) - -/* - * The following cases will be handled by ashr_11 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_11): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_11_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $11, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $11, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_11_use_ssse3) - -/* - * The following cases will be handled by ashr_10 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - 
*/ - .p2align 4 -LABEL(ashr_10): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_10_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $10, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $10, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_10_use_ssse3) - -/* - * The following cases will be handled by ashr_9 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(9~15) n - 9 9((16 - (n -9) + n)%16 ashr_9 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_9): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_9_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $9, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe 
LABEL(strncpy_truncation_unaligned) -#endif - - palignr $9, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_9_use_ssse3) - -/* - * The following cases will be handled by ashr_8 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(8~15) n - 8 8((16 - (n -8) + n)%16 ashr_8 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_8): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_8_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $8, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $8, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_8_use_ssse3) - -/* - * The following cases will be handled by ashr_7 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(7~15) n - 7 7((16 - (n -7) + n)%16 ashr_7 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_7): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - .p2align 4 - -LABEL(ashr_7_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, 
%xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $7, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $7, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_7_use_ssse3) - -/* - * The following cases will be handled by ashr_6 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(6~15) n - 6 6((16 - (n -6) + n)%16 ashr_6 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_6): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_6_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $6, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $6, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_6_use_ssse3) - - /* - * The following cases 
will be handled by ashr_5 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(5~15) n - 5 5((16 - (n -5) + n)%16 ashr_5 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_5): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_5_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $5, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $5, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_5_use_ssse3) - -/* - * - * The following cases will be handled by ashr_4 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(4~15) n - 4 4((16 - (n -4) + n)%16 ashr_4 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_4): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_4_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $4, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef 
USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $4, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_4_use_ssse3) - -/* - * - * The following cases will be handled by ashr_3 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(3~15) n - 3 3((16 - (n -3) + n)%16 ashr_3 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_3): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_3_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $3, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $3, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_3_use_ssse3) - -/* - * - * The following cases will be handled by ashr_2 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(2~15) n - 2 2((16 - (n -2) + n)%16 ashr_2 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there 
is no null byte - */ - .p2align 4 -LABEL(ashr_2): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_2_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $2, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $2, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_2_use_ssse3) - -/* - * - * The following cases will be handled by ashr_1 - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case - * n(1~15) n - 1 1 ((16 - (n -1) + n)%16 ashr_1 - * - * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte - */ - .p2align 4 -LABEL(ashr_1): - xor %ecx, %ecx /*clear ecx */ -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - .p2align 4 -LABEL(ashr_1_use_ssse3): - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe LABEL(strncpy_truncation_unaligned) -#endif - - palignr $1, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - - movdqa 16(%rsi, %rcx), %xmm3 - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - jnz LABEL(unaligned_exit) -#ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe 
LABEL(strncpy_truncation_unaligned) -#endif - palignr $1, (%rsi, %rcx), %xmm3 - movdqa %xmm3, (%rdi, %rcx) - add $16, %rcx - -#ifdef USE_AS_STRNCPY - cmp %r10, %r8 - jbe LABEL(unaligned_exit) -#endif - jmp LABEL(ashr_1_use_ssse3) - - .p2align 4 -LABEL(less32bytes): - xor %ecx, %ecx -LABEL(unaligned_exit): - add %r9, %rsi /* r9 stores original offset of rsi*/ - mov %rcx, %r9 - mov %r10, %rcx - shl %cl, %edx /* after shl, calculate the exact number to be filled*/ - mov %r9, %rcx - .p2align 4 -LABEL(aligned_exit): - add %rcx, %rdi /*locate exact address for rdi */ -LABEL(less16bytes): - add %rcx, %rsi /*locate exact address for rsi */ -LABEL(aligned_16bytes): -#ifdef USE_AS_STRNCPY - mov $1, %r9d - lea -1(%r8), %rcx - shl %cl, %r9d - cmp $32, %r8 - ja LABEL(strncpy_tail) - or %r9d, %edx -LABEL(strncpy_tail): -#endif - bsf %rdx, %rcx /*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/ - lea LABEL(tail_table)(%rip), %r11 - movslq (%r11, %rcx,4), %rcx - lea (%r11, %rcx), %rcx - jmp *%rcx - -#ifdef USE_AS_STRNCPY - .p2align 4 -LABEL(less32bytes_strncpy_truncation): - xor %ecx, %ecx -LABEL(strncpy_truncation_unaligned): - add %r9, %rsi -LABEL(strncpy_truncation_aligned): - add %rcx, %rdi - add %rcx, %rsi - add $16, %r8 - lea -1(%r8), %rcx - lea LABEL(tail_table)(%rip), %r11 - movslq (%r11, %rcx,4), %rcx - lea (%r11, %rcx), %rcx - jmp *%rcx - .p2align 4 -LABEL(strncpy_exitz): - mov %rdi, %rax - ret -#endif - -#ifdef USE_AS_STRNCPY - .p2align 4 -LABEL(strncpy_fill_tail): - mov %rax, %rdx - movzx %cl, %rax - mov %r8, %rcx - add %rax, %rdi - xor %eax, %eax - shr $3, %ecx - jz LABEL(strncpy_fill_less_8) - - rep stosq -LABEL(strncpy_fill_less_8): - mov %r8, %rcx - and $7, %ecx - jz LABEL(strncpy_fill_return) -LABEL(strncpy_fill_less_7): - sub $1, %ecx - mov %al, (%rdi, %rcx) - jnz LABEL(strncpy_fill_less_7) -LABEL(strncpy_fill_return): -#ifdef USE_AS_STPCPY - cmpb $1, (%rdx) - sbb $-1, %rdx -#endif - mov %rdx, %rax - ret -#endif - .p2align 4 
-LABEL(tail_0): - mov (%rsi), %cl - mov %cl, (%rdi) -#ifdef USE_AS_STPCPY - mov %rdi, %rax -#endif -#ifdef USE_AS_STRNCPY - mov $1, %cl - sub $1, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - .p2align 4 -LABEL(tail_1): - mov (%rsi), %cx - mov %cx, (%rdi) -#ifdef USE_AS_STPCPY - lea 1(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $2, %cl - sub $2, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - .p2align 4 -LABEL(tail_2): - mov (%rsi), %cx - mov %cx, (%rdi) - mov 1(%rsi), %cx - mov %cx, 1(%rdi) -#ifdef USE_AS_STPCPY - lea 2(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $3, %cl - sub $3, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - .p2align 4 -LABEL(tail_3): - mov (%rsi), %ecx - mov %ecx, (%rdi) -#ifdef USE_AS_STPCPY - lea 3(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $4, %cl - sub $4, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - .p2align 4 -LABEL(tail_4): - mov (%rsi), %ecx - mov %ecx, (%rdi) - mov 1(%rsi), %edx - mov %edx, 1(%rdi) -#ifdef USE_AS_STPCPY - lea 4(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $5, %cl - sub $5, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - .p2align 4 -LABEL(tail_5): - mov (%rsi), %ecx - mov %ecx, (%rdi) - mov 2(%rsi), %edx - mov %edx, 2(%rdi) -#ifdef USE_AS_STPCPY - lea 5(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $6, %cl - sub $6, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - .p2align 4 -LABEL(tail_6): - mov (%rsi), %ecx - mov %ecx, (%rdi) - mov 3(%rsi), %edx - mov %edx,3(%rdi) -#ifdef USE_AS_STPCPY - lea 6(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $7, %cl - sub $7, %r8 - jnz LABEL(strncpy_fill_tail) 
-#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_7): - mov (%rsi), %rcx - mov %rcx, (%rdi) -#ifdef USE_AS_STPCPY - lea 7(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $8, %cl - sub $8, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_8): - - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 5(%rsi), %edx - mov %edx, 5(%rdi) -#ifdef USE_AS_STPCPY - lea 8(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $9, %cl - sub $9, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_9): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 6(%rsi), %edx - mov %edx, 6(%rdi) -#ifdef USE_AS_STPCPY - lea 9(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $10, %cl - sub $10, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_10): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 7(%rsi), %edx - mov %edx, 7(%rdi) -#ifdef USE_AS_STPCPY - lea 10(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $11, %cl - sub $11, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - .p2align 4 -LABEL(tail_11): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %edx - mov %edx, 8(%rdi) -#ifdef USE_AS_STPCPY - lea 11(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $12, %cl - sub $12, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - .p2align 4 -LABEL(tail_12): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 5(%rsi), %rcx - mov %rcx, 5(%rdi) -#ifdef USE_AS_STPCPY - lea 12(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $13, %cl - sub $13, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_13): - 
mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 6(%rsi), %rcx - mov %rcx, 6(%rdi) -#ifdef USE_AS_STPCPY - lea 13(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $14, %cl - sub $14, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_14): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 7(%rsi), %rcx - mov %rcx, 7(%rdi) -#ifdef USE_AS_STPCPY - lea 14(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $15, %cl - sub $15, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - -LABEL(tail_15): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) -#ifdef USE_AS_STPCPY - lea 15(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $16, %cl - sub $16, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - - ret - - .p2align 4 -LABEL(tail_16): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 16(%rsi), %cl - mov %cl, 16(%rdi) -#ifdef USE_AS_STPCPY - lea 16(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $17, %cl - sub $17, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - .p2align 4 -LABEL(tail_17): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 16(%rsi), %cx - mov %cx, 16(%rdi) -#ifdef USE_AS_STPCPY - lea 17(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $18, %cl - sub $18, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_18): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 15(%rsi), %ecx - mov %ecx,15(%rdi) -#ifdef USE_AS_STPCPY - lea 18(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $19, %cl - sub $19, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif 
-#endif - ret - - .p2align 4 -LABEL(tail_19): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 16(%rsi), %ecx - mov %ecx, 16(%rdi) -#ifdef USE_AS_STPCPY - lea 19(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $20, %cl - sub $20, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - .p2align 4 -LABEL(tail_20): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 13(%rsi), %rcx - mov %rcx, 13(%rdi) -#ifdef USE_AS_STPCPY - lea 20(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $21, %cl - sub $21, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - .p2align 4 -LABEL(tail_21): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 14(%rsi), %rcx - mov %rcx, 14(%rdi) -#ifdef USE_AS_STPCPY - lea 21(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $22, %cl - sub $22, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_22): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 15(%rsi), %rcx - mov %rcx, 15(%rdi) -#ifdef USE_AS_STPCPY - lea 22(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $23, %cl - sub $23, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_23): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 16(%rsi), %rcx - mov %rcx, 16(%rdi) -#ifdef USE_AS_STPCPY - lea 23(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $24, %cl - sub $24, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - - ret - - .p2align 4 -LABEL(tail_24): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 16(%rsi), %rcx - mov %rcx, 16(%rdi) - mov 
21(%rsi), %edx - mov %edx, 21(%rdi) -#ifdef USE_AS_STPCPY - lea 24(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $25, %cl - sub $25, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_25): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 16(%rsi), %rcx - mov %rcx, 16(%rdi) - mov 22(%rsi), %edx - mov %edx, 22(%rdi) -#ifdef USE_AS_STPCPY - lea 25(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $26, %cl - sub $26, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_26): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 16(%rsi), %rcx - mov %rcx, 16(%rdi) - mov 23(%rsi), %edx - mov %edx, 23(%rdi) -#ifdef USE_AS_STPCPY - lea 26(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $27, %cl - sub $27, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_27): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 16(%rsi), %rcx - mov %rcx, 16(%rdi) - mov 24(%rsi), %edx - mov %edx, 24(%rdi) -#ifdef USE_AS_STPCPY - lea 27(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $28, %cl - sub $28, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - .p2align 4 -LABEL(tail_28): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 16(%rsi), %rcx - mov %rcx, 16(%rdi) - mov 21(%rsi), %rdx - mov %rdx, 21(%rdi) -#ifdef USE_AS_STPCPY - lea 28(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $29, %cl - sub $29, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - - ret - - .p2align 4 -LABEL(tail_29): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 
16(%rsi), %rcx - mov %rcx, 16(%rdi) - mov 22(%rsi), %rdx - mov %rdx, 22(%rdi) -#ifdef USE_AS_STPCPY - lea 29(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $30, %cl - sub $30, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - - ret - - - .p2align 4 -LABEL(tail_30): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 16(%rsi), %rcx - mov %rcx, 16(%rdi) - mov 23(%rsi), %rdx - mov %rdx, 23(%rdi) -#ifdef USE_AS_STPCPY - lea 30(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $31, %cl - sub $31, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - - .p2align 4 -LABEL(tail_31): - mov (%rsi), %rcx - mov %rcx, (%rdi) - mov 8(%rsi), %rdx - mov %rdx, 8(%rdi) - mov 16(%rsi), %rcx - mov %rcx, 16(%rdi) - mov 24(%rsi), %rdx - mov %rdx, 24(%rdi) -#ifdef USE_AS_STPCPY - lea 31(%rdi), %rax -#endif -#ifdef USE_AS_STRNCPY - mov $32, %cl - sub $32, %r8 - jnz LABEL(strncpy_fill_tail) -#ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -#endif -#endif - ret - cfi_endproc - .size STRCPY_SSSE3, .-STRCPY_SSSE3 - - .p2align 4 - .section .rodata.ssse3,"a",@progbits -LABEL(tail_table): - .int LABEL(tail_0) - LABEL(tail_table) - .int LABEL(tail_1) - LABEL(tail_table) - .int LABEL(tail_2) - LABEL(tail_table) - .int LABEL(tail_3) - LABEL(tail_table) - .int LABEL(tail_4) - LABEL(tail_table) - .int LABEL(tail_5) - LABEL(tail_table) - .int LABEL(tail_6) - LABEL(tail_table) - .int LABEL(tail_7) - LABEL(tail_table) - .int LABEL(tail_8) - LABEL(tail_table) - .int LABEL(tail_9) - LABEL(tail_table) - .int LABEL(tail_10) - LABEL(tail_table) - .int LABEL(tail_11) - LABEL(tail_table) - .int LABEL(tail_12) - LABEL(tail_table) - .int LABEL(tail_13) - LABEL(tail_table) - .int LABEL(tail_14) - LABEL(tail_table) - .int LABEL(tail_15) - LABEL(tail_table) - .int LABEL(tail_16) - LABEL(tail_table) - .int LABEL(tail_17) - LABEL(tail_table) - .int 
LABEL(tail_18) - LABEL(tail_table) - .int LABEL(tail_19) - LABEL(tail_table) - .int LABEL(tail_20) - LABEL(tail_table) - .int LABEL(tail_21) - LABEL(tail_table) - .int LABEL(tail_22) - LABEL(tail_table) - .int LABEL(tail_23) - LABEL(tail_table) - .int LABEL(tail_24) - LABEL(tail_table) - .int LABEL(tail_25) - LABEL(tail_table) - .int LABEL(tail_26) - LABEL(tail_table) - .int LABEL(tail_27) - LABEL(tail_table) - .int LABEL(tail_28) - LABEL(tail_table) - .int LABEL(tail_29) - LABEL(tail_table) - .int LABEL(tail_30) - LABEL(tail_table) - .int LABEL(tail_31) - LABEL(tail_table) - - .p2align 4 -LABEL(unaligned_table): - .int LABEL(ashr_0) - LABEL(unaligned_table) - .int LABEL(ashr_1) - LABEL(unaligned_table) - .int LABEL(ashr_2) - LABEL(unaligned_table) - .int LABEL(ashr_3) - LABEL(unaligned_table) - .int LABEL(ashr_4) - LABEL(unaligned_table) - .int LABEL(ashr_5) - LABEL(unaligned_table) - .int LABEL(ashr_6) - LABEL(unaligned_table) - .int LABEL(ashr_7) - LABEL(unaligned_table) - .int LABEL(ashr_8) - LABEL(unaligned_table) - .int LABEL(ashr_9) - LABEL(unaligned_table) - .int LABEL(ashr_10) - LABEL(unaligned_table) - .int LABEL(ashr_11) - LABEL(unaligned_table) - .int LABEL(ashr_12) - LABEL(unaligned_table) - .int LABEL(ashr_13) - LABEL(unaligned_table) - .int LABEL(ashr_14) - LABEL(unaligned_table) - .int LABEL(ashr_15) - LABEL(unaligned_table) - # undef ENTRY # define ENTRY(name) \ .type STRCPY_SSE2, @function; \ diff --git a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S new file mode 100644 index 0000000..fcc23a7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_sse2_unaligned +#include "strcpy-sse2-unaligned.S" diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S new file mode 100644 index 0000000..bf82ee4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-ssse3.S @@ -0,0 +1,3 @@ 
+#define USE_AS_STRNCPY +#define STRCPY __strncpy_ssse3 +#include "strcpy-ssse3.S" |