about | summary | refs | log | tree | commit | diff
path: root/sysdeps
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps')
-rw-r--r-- sysdeps/x86_64/multiarch/strcpy-ssse3.S | 767
-rw-r--r-- sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 64
2 files changed, 323 insertions, 508 deletions
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
index c4ec54c..b104765 100644
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -29,6 +29,7 @@
.section .text.ssse3,"ax",@progbits
ENTRY (STRCPY)
+
mov %rsi, %rcx
# ifdef USE_AS_STRNCPY
mov %rdx, %r8
@@ -39,7 +40,7 @@ ENTRY (STRCPY)
jz L(Exit0)
cmp $8, %r8
jbe L(StrncpyExit8Bytes)
-# endif
+# endif
cmpb $0, (%rcx)
jz L(Exit1)
cmpb $0, 1(%rcx)
@@ -56,10 +57,10 @@ ENTRY (STRCPY)
jz L(Exit7)
cmpb $0, 7(%rcx)
jz L(Exit8)
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
cmp $16, %r8
jb L(StrncpyExit15Bytes)
-# endif
+# endif
cmpb $0, 8(%rcx)
jz L(Exit9)
cmpb $0, 9(%rcx)
@@ -74,10 +75,10 @@ ENTRY (STRCPY)
jz L(Exit14)
cmpb $0, 14(%rcx)
jz L(Exit15)
-# ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
cmp $16, %r8
je L(Exit16)
-# endif
+# endif
cmpb $0, 15(%rcx)
jz L(Exit16)
# endif
@@ -87,25 +88,15 @@ ENTRY (STRCPY)
sub $16, %r8
and $0xf, %rsi
-/* add 16 bytes rcx_shift to r8 */
+/* add 16 bytes rcx_offset to r8 */
+
add %rsi, %r8
# endif
lea 16(%rcx), %rsi
-/* Now:
- rsi = alignment_16(rcx) + rcx_shift + 16;
- rcx_shift = rcx - alignment_16(rcx)
-*/
and $-16, %rsi
-/* Now:
- rsi = alignment_16(rcx) + 16
-*/
pxor %xmm0, %xmm0
mov (%rcx), %r9
mov %r9, (%rdx)
-/*
- look if there is zero symbol in next 16 bytes of string
- from rsi to rsi + 15 and form mask in xmm0
-*/
pcmpeqb (%rsi), %xmm0
mov 8(%rcx), %r9
mov %r9, 8(%rdx)
@@ -115,10 +106,6 @@ ENTRY (STRCPY)
pmovmskb %xmm0, %rax
sub %rcx, %rsi
-/* rsi = 16 - rcx_shift */
-
-/* rax = 0: there isn't end of string from position rsi to rsi+15 */
-
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(CopyFrom1To16BytesCase2OrCase3)
@@ -128,17 +115,9 @@ ENTRY (STRCPY)
mov %rdx, %rax
lea 16(%rdx), %rdx
-/* Now:
- rdx = rdx + 16 = alignment_16(rdx) + rdx_shift + 16
-*/
and $-16, %rdx
-
-/* Now: rdx = alignment_16(rdx) + 16 */
-
sub %rdx, %rax
-/* Now: rax = rdx_shift - 16 */
-
# ifdef USE_AS_STRNCPY
add %rax, %rsi
lea -1(%rsi), %rsi
@@ -150,22 +129,11 @@ ENTRY (STRCPY)
L(ContinueCopy):
# endif
sub %rax, %rcx
-/* Now:
- case rcx_shift >= rdx_shift:
- rcx = alignment_16(rcx) + (rcx_shift - rdx_shift) + 16
- case rcx_shift < rdx_shift:
- rcx = alignment_16(rcx) + (16 + rcx_shift - rdx_shift)
-*/
mov %rcx, %rax
and $0xf, %rax
-/* Now:
- case rcx_shift >= rdx_shift: rax = rcx_shift - rdx_shift
- case rcx_shift < rdx_shift: rax = (16 + rcx_shift - rdx_shift)
- rax can be 0, 1, ..., 15
-*/
mov $0, %rsi
-/* case: rcx_shift == rdx_shift */
+/* case: rcx_offset == rdx_offset */
jz L(Align16Both)
@@ -282,10 +250,11 @@ L(Align16Both):
sub %rcx, %rax
sub %rax, %rdx
# ifdef USE_AS_STRNCPY
- lea 48+64(%r8, %rax), %r8
+ lea 112(%r8, %rax), %r8
# endif
mov $-0x40, %rsi
+ .p2align 4
L(Aligned64Loop):
movaps (%rcx), %xmm2
movaps %xmm2, %xmm4
@@ -366,7 +335,6 @@ L(Shl1Start):
jnz L(Shl1LoopExit)
palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 31(%rcx), %xmm2
@@ -374,7 +342,7 @@ L(Shl1Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit1Case2OrCase3)
@@ -382,10 +350,9 @@ L(Shl1Start):
test %rax, %rax
jnz L(Shl1LoopExit)
- palignr $1, %xmm1, %xmm2
+ palignr $1, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 31(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -400,7 +367,6 @@ L(Shl1Start):
jnz L(Shl1LoopExit)
palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 31(%rcx), %xmm2
@@ -408,7 +374,6 @@ L(Shl1Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit1Case2OrCase3)
@@ -416,8 +381,7 @@ L(Shl1Start):
test %rax, %rax
jnz L(Shl1LoopExit)
- palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $1, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 31(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -432,6 +396,8 @@ L(Shl1Start):
# endif
movaps -1(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl1LoopStart):
movaps 15(%rcx), %xmm2
movaps 31(%rcx), %xmm3
@@ -465,11 +431,9 @@ L(Shl1LoopStart):
jmp L(Shl1LoopStart)
L(Shl1LoopExit):
- movaps (%rdx), %xmm6
- psrldq $15, %xmm6
+ movdqu -1(%rcx), %xmm1
mov $15, %rsi
- palignr $1, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -1(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -488,7 +452,6 @@ L(Shl2Start):
jnz L(Shl2LoopExit)
palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 30(%rcx), %xmm2
@@ -496,7 +459,7 @@ L(Shl2Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit2Case2OrCase3)
@@ -504,10 +467,9 @@ L(Shl2Start):
test %rax, %rax
jnz L(Shl2LoopExit)
- palignr $2, %xmm1, %xmm2
+ palignr $2, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 30(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -522,7 +484,6 @@ L(Shl2Start):
jnz L(Shl2LoopExit)
palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 30(%rcx), %xmm2
@@ -530,7 +491,6 @@ L(Shl2Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit2Case2OrCase3)
@@ -538,8 +498,7 @@ L(Shl2Start):
test %rax, %rax
jnz L(Shl2LoopExit)
- palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $2, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 30(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -554,6 +513,8 @@ L(Shl2Start):
# endif
movaps -2(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl2LoopStart):
movaps 14(%rcx), %xmm2
movaps 30(%rcx), %xmm3
@@ -587,11 +548,9 @@ L(Shl2LoopStart):
jmp L(Shl2LoopStart)
L(Shl2LoopExit):
- movaps (%rdx), %xmm6
- psrldq $14, %xmm6
+ movdqu -2(%rcx), %xmm1
mov $14, %rsi
- palignr $2, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -2(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -610,7 +569,6 @@ L(Shl3Start):
jnz L(Shl3LoopExit)
palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 29(%rcx), %xmm2
@@ -618,7 +576,7 @@ L(Shl3Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit3Case2OrCase3)
@@ -626,10 +584,9 @@ L(Shl3Start):
test %rax, %rax
jnz L(Shl3LoopExit)
- palignr $3, %xmm1, %xmm2
+ palignr $3, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 29(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -644,7 +601,6 @@ L(Shl3Start):
jnz L(Shl3LoopExit)
palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 29(%rcx), %xmm2
@@ -652,7 +608,6 @@ L(Shl3Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit3Case2OrCase3)
@@ -660,8 +615,7 @@ L(Shl3Start):
test %rax, %rax
jnz L(Shl3LoopExit)
- palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $3, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 29(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -676,6 +630,8 @@ L(Shl3Start):
# endif
movaps -3(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl3LoopStart):
movaps 13(%rcx), %xmm2
movaps 29(%rcx), %xmm3
@@ -709,11 +665,9 @@ L(Shl3LoopStart):
jmp L(Shl3LoopStart)
L(Shl3LoopExit):
- movaps (%rdx), %xmm6
- psrldq $13, %xmm6
+ movdqu -3(%rcx), %xmm1
mov $13, %rsi
- palignr $3, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -3(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -732,7 +686,6 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
@@ -740,7 +693,7 @@ L(Shl4Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit4Case2OrCase3)
@@ -748,10 +701,9 @@ L(Shl4Start):
test %rax, %rax
jnz L(Shl4LoopExit)
- palignr $4, %xmm1, %xmm2
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -766,7 +718,6 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
@@ -774,7 +725,6 @@ L(Shl4Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit4Case2OrCase3)
@@ -782,8 +732,7 @@ L(Shl4Start):
test %rax, %rax
jnz L(Shl4LoopExit)
- palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 28(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -798,6 +747,8 @@ L(Shl4Start):
# endif
movaps -4(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl4LoopStart):
movaps 12(%rcx), %xmm2
movaps 28(%rcx), %xmm3
@@ -831,11 +782,9 @@ L(Shl4LoopStart):
jmp L(Shl4LoopStart)
L(Shl4LoopExit):
- movaps (%rdx), %xmm6
- psrldq $12, %xmm6
+ movdqu -4(%rcx), %xmm1
mov $12, %rsi
- palignr $4, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -4(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -854,7 +803,6 @@ L(Shl5Start):
jnz L(Shl5LoopExit)
palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 27(%rcx), %xmm2
@@ -862,7 +810,7 @@ L(Shl5Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit5Case2OrCase3)
@@ -870,10 +818,9 @@ L(Shl5Start):
test %rax, %rax
jnz L(Shl5LoopExit)
- palignr $5, %xmm1, %xmm2
+ palignr $5, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 27(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -888,7 +835,6 @@ L(Shl5Start):
jnz L(Shl5LoopExit)
palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 27(%rcx), %xmm2
@@ -896,7 +842,6 @@ L(Shl5Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit5Case2OrCase3)
@@ -904,8 +849,7 @@ L(Shl5Start):
test %rax, %rax
jnz L(Shl5LoopExit)
- palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $5, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 27(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -920,6 +864,8 @@ L(Shl5Start):
# endif
movaps -5(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl5LoopStart):
movaps 11(%rcx), %xmm2
movaps 27(%rcx), %xmm3
@@ -953,11 +899,9 @@ L(Shl5LoopStart):
jmp L(Shl5LoopStart)
L(Shl5LoopExit):
- movaps (%rdx), %xmm6
- psrldq $11, %xmm6
+ movdqu -5(%rcx), %xmm1
mov $11, %rsi
- palignr $5, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -5(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -976,7 +920,6 @@ L(Shl6Start):
jnz L(Shl6LoopExit)
palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 26(%rcx), %xmm2
@@ -984,7 +927,7 @@ L(Shl6Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit6Case2OrCase3)
@@ -992,10 +935,9 @@ L(Shl6Start):
test %rax, %rax
jnz L(Shl6LoopExit)
- palignr $6, %xmm1, %xmm2
+ palignr $6, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 26(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1010,7 +952,6 @@ L(Shl6Start):
jnz L(Shl6LoopExit)
palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 26(%rcx), %xmm2
@@ -1018,7 +959,6 @@ L(Shl6Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit6Case2OrCase3)
@@ -1026,8 +966,7 @@ L(Shl6Start):
test %rax, %rax
jnz L(Shl6LoopExit)
- palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $6, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 26(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1042,6 +981,8 @@ L(Shl6Start):
# endif
movaps -6(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl6LoopStart):
movaps 10(%rcx), %xmm2
movaps 26(%rcx), %xmm3
@@ -1075,11 +1016,11 @@ L(Shl6LoopStart):
jmp L(Shl6LoopStart)
L(Shl6LoopExit):
- movaps (%rdx), %xmm6
- psrldq $10, %xmm6
+ mov (%rcx), %r9
+ mov 6(%rcx), %esi
+ mov %r9, (%rdx)
+ mov %esi, 6(%rdx)
mov $10, %rsi
- palignr $6, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1098,7 +1039,6 @@ L(Shl7Start):
jnz L(Shl7LoopExit)
palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 25(%rcx), %xmm2
@@ -1106,7 +1046,7 @@ L(Shl7Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit7Case2OrCase3)
@@ -1114,10 +1054,9 @@ L(Shl7Start):
test %rax, %rax
jnz L(Shl7LoopExit)
- palignr $7, %xmm1, %xmm2
+ palignr $7, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 25(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1132,7 +1071,6 @@ L(Shl7Start):
jnz L(Shl7LoopExit)
palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 25(%rcx), %xmm2
@@ -1140,7 +1078,6 @@ L(Shl7Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit7Case2OrCase3)
@@ -1148,8 +1085,7 @@ L(Shl7Start):
test %rax, %rax
jnz L(Shl7LoopExit)
- palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $7, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 25(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1164,6 +1100,8 @@ L(Shl7Start):
# endif
movaps -7(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl7LoopStart):
movaps 9(%rcx), %xmm2
movaps 25(%rcx), %xmm3
@@ -1197,11 +1135,11 @@ L(Shl7LoopStart):
jmp L(Shl7LoopStart)
L(Shl7LoopExit):
- movaps (%rdx), %xmm6
- psrldq $9, %xmm6
+ mov (%rcx), %r9
+ mov 5(%rcx), %esi
+ mov %r9, (%rdx)
+ mov %esi, 5(%rdx)
mov $9, %rsi
- palignr $7, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1220,7 +1158,6 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
@@ -1228,7 +1165,7 @@ L(Shl8Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit8Case2OrCase3)
@@ -1236,10 +1173,9 @@ L(Shl8Start):
test %rax, %rax
jnz L(Shl8LoopExit)
- palignr $8, %xmm1, %xmm2
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1254,7 +1190,6 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
@@ -1262,7 +1197,6 @@ L(Shl8Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit8Case2OrCase3)
@@ -1270,8 +1204,7 @@ L(Shl8Start):
test %rax, %rax
jnz L(Shl8LoopExit)
- palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 24(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1286,6 +1219,8 @@ L(Shl8Start):
# endif
movaps -8(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl8LoopStart):
movaps 8(%rcx), %xmm2
movaps 24(%rcx), %xmm3
@@ -1319,11 +1254,9 @@ L(Shl8LoopStart):
jmp L(Shl8LoopStart)
L(Shl8LoopExit):
- movaps (%rdx), %xmm6
- psrldq $8, %xmm6
+ mov (%rcx), %r9
mov $8, %rsi
- palignr $8, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1342,7 +1275,6 @@ L(Shl9Start):
jnz L(Shl9LoopExit)
palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 23(%rcx), %xmm2
@@ -1350,7 +1282,7 @@ L(Shl9Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit9Case2OrCase3)
@@ -1358,10 +1290,9 @@ L(Shl9Start):
test %rax, %rax
jnz L(Shl9LoopExit)
- palignr $9, %xmm1, %xmm2
+ palignr $9, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 23(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1376,7 +1307,6 @@ L(Shl9Start):
jnz L(Shl9LoopExit)
palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 23(%rcx), %xmm2
@@ -1384,7 +1314,6 @@ L(Shl9Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit9Case2OrCase3)
@@ -1392,8 +1321,7 @@ L(Shl9Start):
test %rax, %rax
jnz L(Shl9LoopExit)
- palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $9, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 23(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1408,6 +1336,8 @@ L(Shl9Start):
# endif
movaps -9(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl9LoopStart):
movaps 7(%rcx), %xmm2
movaps 23(%rcx), %xmm3
@@ -1441,11 +1371,9 @@ L(Shl9LoopStart):
jmp L(Shl9LoopStart)
L(Shl9LoopExit):
- movaps (%rdx), %xmm6
- psrldq $7, %xmm6
+ mov -1(%rcx), %r9
mov $7, %rsi
- palignr $9, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -1(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1464,7 +1392,6 @@ L(Shl10Start):
jnz L(Shl10LoopExit)
palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 22(%rcx), %xmm2
@@ -1472,7 +1399,7 @@ L(Shl10Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit10Case2OrCase3)
@@ -1480,10 +1407,9 @@ L(Shl10Start):
test %rax, %rax
jnz L(Shl10LoopExit)
- palignr $10, %xmm1, %xmm2
+ palignr $10, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 22(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1498,7 +1424,6 @@ L(Shl10Start):
jnz L(Shl10LoopExit)
palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 22(%rcx), %xmm2
@@ -1506,7 +1431,6 @@ L(Shl10Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit10Case2OrCase3)
@@ -1514,8 +1438,7 @@ L(Shl10Start):
test %rax, %rax
jnz L(Shl10LoopExit)
- palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $10, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 22(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1530,6 +1453,8 @@ L(Shl10Start):
# endif
movaps -10(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl10LoopStart):
movaps 6(%rcx), %xmm2
movaps 22(%rcx), %xmm3
@@ -1563,11 +1488,9 @@ L(Shl10LoopStart):
jmp L(Shl10LoopStart)
L(Shl10LoopExit):
- movaps (%rdx), %xmm6
- psrldq $6, %xmm6
+ mov -2(%rcx), %r9
mov $6, %rsi
- palignr $10, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -2(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1586,7 +1509,6 @@ L(Shl11Start):
jnz L(Shl11LoopExit)
palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 21(%rcx), %xmm2
@@ -1594,7 +1516,7 @@ L(Shl11Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit11Case2OrCase3)
@@ -1602,10 +1524,9 @@ L(Shl11Start):
test %rax, %rax
jnz L(Shl11LoopExit)
- palignr $11, %xmm1, %xmm2
+ palignr $11, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 21(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1620,7 +1541,6 @@ L(Shl11Start):
jnz L(Shl11LoopExit)
palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 21(%rcx), %xmm2
@@ -1628,7 +1548,6 @@ L(Shl11Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit11Case2OrCase3)
@@ -1636,8 +1555,7 @@ L(Shl11Start):
test %rax, %rax
jnz L(Shl11LoopExit)
- palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $11, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 21(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1652,6 +1570,8 @@ L(Shl11Start):
# endif
movaps -11(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl11LoopStart):
movaps 5(%rcx), %xmm2
movaps 21(%rcx), %xmm3
@@ -1685,11 +1605,9 @@ L(Shl11LoopStart):
jmp L(Shl11LoopStart)
L(Shl11LoopExit):
- movaps (%rdx), %xmm6
- psrldq $5, %xmm6
+ mov -3(%rcx), %r9
mov $5, %rsi
- palignr $11, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -3(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1708,7 +1626,6 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
@@ -1716,7 +1633,7 @@ L(Shl12Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit12Case2OrCase3)
@@ -1724,10 +1641,9 @@ L(Shl12Start):
test %rax, %rax
jnz L(Shl12LoopExit)
- palignr $12, %xmm1, %xmm2
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1742,7 +1658,6 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
@@ -1750,7 +1665,6 @@ L(Shl12Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit12Case2OrCase3)
@@ -1758,8 +1672,7 @@ L(Shl12Start):
test %rax, %rax
jnz L(Shl12LoopExit)
- palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 20(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1774,6 +1687,8 @@ L(Shl12Start):
# endif
movaps -12(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl12LoopStart):
movaps 4(%rcx), %xmm2
movaps 20(%rcx), %xmm3
@@ -1807,11 +1722,9 @@ L(Shl12LoopStart):
jmp L(Shl12LoopStart)
L(Shl12LoopExit):
- movaps (%rdx), %xmm6
- psrldq $4, %xmm6
+ mov (%rcx), %r9d
mov $4, %rsi
- palignr $12, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1830,7 +1743,6 @@ L(Shl13Start):
jnz L(Shl13LoopExit)
palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 19(%rcx), %xmm2
@@ -1838,7 +1750,7 @@ L(Shl13Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit13Case2OrCase3)
@@ -1846,10 +1758,9 @@ L(Shl13Start):
test %rax, %rax
jnz L(Shl13LoopExit)
- palignr $13, %xmm1, %xmm2
+ palignr $13, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 19(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1864,7 +1775,6 @@ L(Shl13Start):
jnz L(Shl13LoopExit)
palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 19(%rcx), %xmm2
@@ -1872,7 +1782,6 @@ L(Shl13Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit13Case2OrCase3)
@@ -1880,8 +1789,7 @@ L(Shl13Start):
test %rax, %rax
jnz L(Shl13LoopExit)
- palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $13, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 19(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -1896,6 +1804,8 @@ L(Shl13Start):
# endif
movaps -13(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl13LoopStart):
movaps 3(%rcx), %xmm2
movaps 19(%rcx), %xmm3
@@ -1929,11 +1839,9 @@ L(Shl13LoopStart):
jmp L(Shl13LoopStart)
L(Shl13LoopExit):
- movaps (%rdx), %xmm6
- psrldq $3, %xmm6
+ mov -1(%rcx), %r9d
mov $3, %rsi
- palignr $13, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -1(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1952,7 +1860,6 @@ L(Shl14Start):
jnz L(Shl14LoopExit)
palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 18(%rcx), %xmm2
@@ -1960,7 +1867,7 @@ L(Shl14Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit14Case2OrCase3)
@@ -1968,10 +1875,9 @@ L(Shl14Start):
test %rax, %rax
jnz L(Shl14LoopExit)
- palignr $14, %xmm1, %xmm2
+ palignr $14, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 18(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -1986,7 +1892,6 @@ L(Shl14Start):
jnz L(Shl14LoopExit)
palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 18(%rcx), %xmm2
@@ -1994,7 +1899,6 @@ L(Shl14Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit14Case2OrCase3)
@@ -2002,8 +1906,7 @@ L(Shl14Start):
test %rax, %rax
jnz L(Shl14LoopExit)
- palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $14, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 18(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -2018,6 +1921,8 @@ L(Shl14Start):
# endif
movaps -14(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl14LoopStart):
movaps 2(%rcx), %xmm2
movaps 18(%rcx), %xmm3
@@ -2051,11 +1956,9 @@ L(Shl14LoopStart):
jmp L(Shl14LoopStart)
L(Shl14LoopExit):
- movaps (%rdx), %xmm6
- psrldq $2, %xmm6
+ mov -2(%rcx), %r9d
mov $2, %rsi
- palignr $14, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -2(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -2074,7 +1977,6 @@ L(Shl15Start):
jnz L(Shl15LoopExit)
palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 17(%rcx), %xmm2
@@ -2082,7 +1984,7 @@ L(Shl15Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit15Case2OrCase3)
@@ -2090,10 +1992,9 @@ L(Shl15Start):
test %rax, %rax
jnz L(Shl15LoopExit)
- palignr $15, %xmm1, %xmm2
+ palignr $15, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 17(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqb %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -2108,7 +2009,6 @@ L(Shl15Start):
jnz L(Shl15LoopExit)
palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 17(%rcx), %xmm2
@@ -2116,7 +2016,6 @@ L(Shl15Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %r8
jbe L(StrncpyExit15Case2OrCase3)
@@ -2124,8 +2023,7 @@ L(Shl15Start):
test %rax, %rax
jnz L(Shl15LoopExit)
- palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $15, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 17(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -2140,6 +2038,8 @@ L(Shl15Start):
# endif
movaps -15(%rcx), %xmm1
+/* 64 bytes loop */
+ .p2align 4
L(Shl15LoopStart):
movaps 1(%rcx), %xmm2
movaps 17(%rcx), %xmm3
@@ -2173,16 +2073,15 @@ L(Shl15LoopStart):
jmp L(Shl15LoopStart)
L(Shl15LoopExit):
- movaps (%rdx), %xmm6
- psrldq $1, %xmm6
+ mov -3(%rcx), %r9d
mov $1, %rsi
- palignr $15, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -3(%rdx)
# ifdef USE_AS_STRCAT
jmp L(CopyFrom1To16Bytes)
# endif
# ifndef USE_AS_STRCAT
+
.p2align 4
L(CopyFrom1To16Bytes):
# ifdef USE_AS_STRNCPY
@@ -2463,7 +2362,7 @@ L(Exit4):
# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
+# endif
# endif
ret
@@ -2485,7 +2384,7 @@ L(Exit5):
# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
+# endif
# endif
ret
@@ -2507,7 +2406,7 @@ L(Exit6):
# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
+# endif
# endif
ret
@@ -2617,7 +2516,7 @@ L(Exit12):
# ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
-# endif
+# endif
# endif
ret
@@ -2955,11 +2854,10 @@ L(StrncpyExit8Bytes):
ret
# endif
-
# endif
# ifdef USE_AS_STRNCPY
-
+ .p2align 4
L(StrncpyLeaveCase2OrCase3):
test %rax, %rax
jnz L(Aligned64LeaveCase2)
@@ -3014,710 +2912,639 @@ L(Aligned64LeaveCase2):
lea -16(%r8), %r8
jmp L(CopyFrom1To16BytesCase2)
/*--------------------------------------------------*/
+ .p2align 4
L(StrncpyExit1Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $15, %xmm6
+ movdqu -1(%rcx), %xmm0
+ movdqu %xmm0, -1(%rdx)
mov $15, %rsi
- palignr $1, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit2Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $14, %xmm6
+ movdqu -2(%rcx), %xmm0
+ movdqu %xmm0, -2(%rdx)
mov $14, %rsi
- palignr $2, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit3Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $13, %xmm6
+ movdqu -3(%rcx), %xmm0
+ movdqu %xmm0, -3(%rdx)
mov $13, %rsi
- palignr $3, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit4Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $12, %xmm6
+ movdqu -4(%rcx), %xmm0
+ movdqu %xmm0, -4(%rdx)
mov $12, %rsi
- palignr $4, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit5Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $11, %xmm6
+ movdqu -5(%rcx), %xmm0
+ movdqu %xmm0, -5(%rdx)
mov $11, %rsi
- palignr $5, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit6Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $10, %xmm6
- mov $10, %rsi
- palignr $6, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov (%rcx), %rsi
+ mov 6(%rcx), %r9d
+ mov %r9d, 6(%rdx)
+ mov %rsi, (%rdx)
test %rax, %rax
+ mov $10, %rsi
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit7Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $9, %xmm6
- mov $9, %rsi
- palignr $7, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov (%rcx), %rsi
+ mov 5(%rcx), %r9d
+ mov %r9d, 5(%rdx)
+ mov %rsi, (%rdx)
test %rax, %rax
+ mov $9, %rsi
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit8Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $8, %xmm6
+ mov (%rcx), %r9
mov $8, %rsi
- palignr $8, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit9Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $7, %xmm6
+ mov -1(%rcx), %r9
mov $7, %rsi
- palignr $9, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -1(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit10Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $6, %xmm6
+ mov -2(%rcx), %r9
mov $6, %rsi
- palignr $10, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -2(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit11Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $5, %xmm6
+ mov -3(%rcx), %r9
mov $5, %rsi
- palignr $11, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, -3(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit12Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $4, %xmm6
+ mov (%rcx), %r9d
mov $4, %rsi
- palignr $12, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, (%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit13Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $3, %xmm6
+ mov -1(%rcx), %r9d
mov $3, %rsi
- palignr $13, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -1(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit14Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $2, %xmm6
+ mov -2(%rcx), %r9d
mov $2, %rsi
- palignr $14, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -2(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyExit15Case2OrCase3):
- movaps (%rdx), %xmm6
- psrldq $1, %xmm6
+ mov -3(%rcx), %r9d
mov $1, %rsi
- palignr $15, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, -3(%rdx)
test %rax, %rax
jnz L(CopyFrom1To16BytesCase2)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave1):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit1)
palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 31(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit1)
- palignr $1, %xmm1, %xmm2
+ palignr $1, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 31+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit1)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit1)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit1):
- movaps (%rdx, %rsi), %xmm6
- psrldq $15, %xmm6
- palignr $1, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 15(%rsi), %rsi
+ lea 15(%rdx, %rsi), %rdx
+ lea 15(%rcx, %rsi), %rcx
+ mov -15(%rcx), %rsi
+ mov -8(%rcx), %rax
+ mov %rsi, -15(%rdx)
+ mov %rax, -8(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave2):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit2)
palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 30(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit2)
- palignr $2, %xmm1, %xmm2
+ palignr $2, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 30+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit2)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit2)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit2):
- movaps (%rdx, %rsi), %xmm6
- psrldq $14, %xmm6
- palignr $2, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 14(%rsi), %rsi
+ lea 14(%rdx, %rsi), %rdx
+ lea 14(%rcx, %rsi), %rcx
+ mov -14(%rcx), %rsi
+ mov -8(%rcx), %rax
+ mov %rsi, -14(%rdx)
+ mov %rax, -8(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave3):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit3)
palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 29(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit3)
- palignr $3, %xmm1, %xmm2
+ palignr $3, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 29+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit3)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit3)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit3):
- movaps (%rdx, %rsi), %xmm6
- psrldq $13, %xmm6
- palignr $3, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 13(%rsi), %rsi
+ lea 13(%rdx, %rsi), %rdx
+ lea 13(%rcx, %rsi), %rcx
+ mov -13(%rcx), %rsi
+ mov -8(%rcx), %rax
+ mov %rsi, -13(%rdx)
+ mov %rax, -8(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave4):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit4)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit4)
- palignr $4, %xmm1, %xmm2
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 28+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit4)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit4)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit4):
- movaps (%rdx, %rsi), %xmm6
- psrldq $12, %xmm6
- palignr $4, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 12(%rsi), %rsi
+ lea 12(%rdx, %rsi), %rdx
+ lea 12(%rcx, %rsi), %rcx
+ mov -12(%rcx), %rsi
+ mov -4(%rcx), %eax
+ mov %rsi, -12(%rdx)
+ mov %eax, -4(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave5):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit5)
palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 27(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit5)
- palignr $5, %xmm1, %xmm2
+ palignr $5, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 27+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit5)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit5)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit5):
- movaps (%rdx, %rsi), %xmm6
- psrldq $11, %xmm6
- palignr $5, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 11(%rsi), %rsi
+ lea 11(%rdx, %rsi), %rdx
+ lea 11(%rcx, %rsi), %rcx
+ mov -11(%rcx), %rsi
+ mov -4(%rcx), %eax
+ mov %rsi, -11(%rdx)
+ mov %eax, -4(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave6):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit6)
palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 26(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit6)
- palignr $6, %xmm1, %xmm2
+ palignr $6, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 26+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit6)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit6)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit6):
- movaps (%rdx, %rsi), %xmm6
- psrldq $10, %xmm6
- palignr $6, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 10(%rsi), %rsi
+ lea 10(%rdx, %rsi), %rdx
+ lea 10(%rcx, %rsi), %rcx
+ mov -10(%rcx), %rsi
+ movw -2(%rcx), %ax
+ mov %rsi, -10(%rdx)
+ movw %ax, -2(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave7):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit7)
palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 25(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit7)
- palignr $7, %xmm1, %xmm2
+ palignr $7, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 25+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit7)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit7)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit7):
- movaps (%rdx, %rsi), %xmm6
- psrldq $9, %xmm6
- palignr $7, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 9(%rsi), %rsi
+ lea 9(%rdx, %rsi), %rdx
+ lea 9(%rcx, %rsi), %rcx
+ mov -9(%rcx), %rsi
+ movb -1(%rcx), %ah
+ mov %rsi, -9(%rdx)
+ movb %ah, -1(%rdx)
+ xor %rsi, %rsi
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave8):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit8)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit8)
- palignr $8, %xmm1, %xmm2
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 24+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit8)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit8)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit8):
- movaps (%rdx, %rsi), %xmm6
- psrldq $8, %xmm6
- palignr $8, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 8(%rsi), %rsi
+ lea 8(%rdx, %rsi), %rdx
+ lea 8(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi
+ mov %rax, -8(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave9):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit9)
palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 23(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit9)
- palignr $9, %xmm1, %xmm2
+ palignr $9, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 23+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit9)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit9)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit9):
- movaps (%rdx, %rsi), %xmm6
- psrldq $7, %xmm6
- palignr $9, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 7(%rsi), %rsi
+ lea 7(%rdx, %rsi), %rdx
+ lea 7(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi
+ mov %rax, -8(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave10):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit10)
palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 22(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit10)
- palignr $10, %xmm1, %xmm2
+ palignr $10, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 22+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit10)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit10)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit10):
- movaps (%rdx, %rsi), %xmm6
- psrldq $6, %xmm6
- palignr $10, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 6(%rsi), %rsi
+ lea 6(%rdx, %rsi), %rdx
+ lea 6(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi
+ mov %rax, -8(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave11):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit11)
palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 21(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit11)
- palignr $11, %xmm1, %xmm2
+ palignr $11, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 21+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit11)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit11)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit11):
- movaps (%rdx, %rsi), %xmm6
- psrldq $5, %xmm6
- palignr $11, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 5(%rsi), %rsi
+ lea 5(%rdx, %rsi), %rdx
+ lea 5(%rcx, %rsi), %rcx
+ mov -8(%rcx), %rax
+ xor %rsi, %rsi
+ mov %rax, -8(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave12):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit12)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit12)
- palignr $12, %xmm1, %xmm2
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 20+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit12)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit12)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit12):
- movaps (%rdx, %rsi), %xmm6
- psrldq $4, %xmm6
- palignr $12, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 4(%rsi), %rsi
+ lea 4(%rdx, %rsi), %rdx
+ lea 4(%rcx, %rsi), %rcx
+ mov -4(%rcx), %eax
+ xor %rsi, %rsi
+ mov %eax, -4(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave13):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit13)
palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 19(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit13)
- palignr $13, %xmm1, %xmm2
+ palignr $13, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 19+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit13)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit13)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit13):
- movaps (%rdx, %rsi), %xmm6
- psrldq $3, %xmm6
- palignr $13, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 3(%rsi), %rsi
+ lea 3(%rdx, %rsi), %rdx
+ lea 3(%rcx, %rsi), %rcx
+ mov -4(%rcx), %eax
+ xor %rsi, %rsi
+ mov %eax, -4(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave14):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit14)
palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 18(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit14)
- palignr $14, %xmm1, %xmm2
+ palignr $14, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 18+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit14)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit14)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit14):
- movaps (%rdx, %rsi), %xmm6
- psrldq $2, %xmm6
- palignr $14, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 2(%rsi), %rsi
+ lea 2(%rdx, %rsi), %rdx
+ lea 2(%rcx, %rsi), %rcx
+ movw -2(%rcx), %ax
+ xor %rsi, %rsi
+ movw %ax, -2(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+ .p2align 4
L(StrncpyLeave15):
movaps %xmm2, %xmm3
add $48, %r8
jle L(StrncpyExit15)
palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 17(%rcx), %xmm2
lea 16(%rsi), %rsi
- movaps %xmm2, %xmm3
sub $16, %r8
jbe L(StrncpyExit15)
- palignr $15, %xmm1, %xmm2
+ palignr $15, %xmm3, %xmm2
movaps %xmm2, 16(%rdx)
- movaps 17+16(%rcx), %xmm2
- movaps %xmm3, %xmm1
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit15)
- movaps %xmm2, %xmm1
movaps %xmm4, 32(%rdx)
lea 16(%rsi), %rsi
sub $16, %r8
jbe L(StrncpyExit15)
- movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
lea 16(%rsi), %rsi
lea -16(%r8), %r8
L(StrncpyExit15):
- movaps (%rdx, %rsi), %xmm6
- psrldq $1, %xmm6
- palignr $15, %xmm1, %xmm6
- movaps %xmm6, (%rdx, %rsi)
- lea 1(%rsi), %rsi
+ lea 1(%rdx, %rsi), %rdx
+ lea 1(%rcx, %rsi), %rcx
+ movb -1(%rcx), %ah
+ xor %rsi, %rsi
+ movb %ah, -1(%rdx)
jmp L(CopyFrom1To16BytesCase3)
+
# endif
# ifndef USE_AS_STRCAT
END (STRCPY)
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
index 4e292f3..477b2cb 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -21,8 +21,9 @@
#ifndef NOT_IN_libc
# include <sysdep.h>
-.text
+ .section .text.ssse3,"ax",@progbits
ENTRY (__wcscpy_ssse3)
+
mov %rsi, %rcx
mov %rdi, %rdx
@@ -136,6 +137,7 @@ L(Align16Both):
mov $-0x40, %rsi
+ .p2align 4
L(Aligned64Loop):
movaps (%rcx), %xmm2
movaps %xmm2, %xmm4
@@ -205,7 +207,6 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
@@ -213,15 +214,14 @@ L(Shl4Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
test %rax, %rax
jnz L(Shl4LoopExit)
- palignr $4, %xmm1, %xmm2
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -233,7 +233,6 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
@@ -245,8 +244,7 @@ L(Shl4Start):
test %rax, %rax
jnz L(Shl4LoopExit)
- palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 28(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -259,6 +257,7 @@ L(Shl4Start):
movaps -4(%rcx), %xmm1
+ .p2align 4
L(Shl4LoopStart):
movaps 12(%rcx), %xmm2
movaps 28(%rcx), %xmm3
@@ -289,11 +288,9 @@ L(Shl4LoopStart):
jmp L(Shl4LoopStart)
L(Shl4LoopExit):
- movaps (%rdx), %xmm6
- psrldq $12, %xmm6
+ movdqu -4(%rcx), %xmm1
mov $12, %rsi
- palignr $4, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ movdqu %xmm1, -4(%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -309,7 +306,6 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
@@ -317,15 +313,14 @@ L(Shl8Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
test %rax, %rax
jnz L(Shl8LoopExit)
- palignr $8, %xmm1, %xmm2
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -337,7 +332,6 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
@@ -345,13 +339,11 @@ L(Shl8Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl8LoopExit)
- palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 24(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -364,6 +356,7 @@ L(Shl8Start):
movaps -8(%rcx), %xmm1
+ .p2align 4
L(Shl8LoopStart):
movaps 8(%rcx), %xmm2
movaps 24(%rcx), %xmm3
@@ -394,11 +387,9 @@ L(Shl8LoopStart):
jmp L(Shl8LoopStart)
L(Shl8LoopExit):
- movaps (%rdx), %xmm6
- psrldq $8, %xmm6
+ mov (%rcx), %r9
mov $8, %rsi
- palignr $8, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9, (%rdx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -414,7 +405,6 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
@@ -422,15 +412,14 @@ L(Shl12Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
+ movaps %xmm2, %xmm1
test %rax, %rax
jnz L(Shl12LoopExit)
- palignr $12, %xmm1, %xmm2
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
- movaps %xmm3, %xmm1
pcmpeqd %xmm2, %xmm0
lea 16(%rdx), %rdx
@@ -442,7 +431,6 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
@@ -450,13 +438,11 @@ L(Shl12Start):
lea 16(%rdx), %rdx
pmovmskb %xmm0, %rax
lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
test %rax, %rax
jnz L(Shl12LoopExit)
- palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, (%rdx)
lea 20(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -469,6 +455,7 @@ L(Shl12Start):
movaps -12(%rcx), %xmm1
+ .p2align 4
L(Shl12LoopStart):
movaps 4(%rcx), %xmm2
movaps 20(%rcx), %xmm3
@@ -498,11 +485,10 @@ L(Shl12LoopStart):
jmp L(Shl12LoopStart)
L(Shl12LoopExit):
- movaps (%rdx), %xmm6
- psrldq $4, %xmm6
+ mov (%rcx), %r9d
mov $4, %rsi
- palignr $12, %xmm1, %xmm6
- movaps %xmm6, (%rdx)
+ mov %r9d, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
.p2align 4
L(CopyFrom1To16Bytes):
@@ -556,8 +542,10 @@ L(Exit12):
.p2align 4
L(Exit16):
- movdqu (%rcx), %xmm0
- movdqu %xmm0, (%rdx)
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 8(%rcx), %rax
+ mov %rax, 8(%rdx)
mov %rdi, %rax
ret