aboutsummaryrefslogtreecommitdiff
path: root/sysdeps
diff options
context:
space:
mode:
authorAndrew Senkevich <andrew.senkevich@intel.com>2014-12-29 14:39:46 +0300
committerH.J. Lu <hjl.tools@gmail.com>2014-12-30 07:19:38 -0800
commit8b4416d83c79ba77b0669203741c712880a09ae4 (patch)
treec0701090d02b9e3c9ddcb840e2ad62084c498b4a /sysdeps
parent6d6d7fde04c8ef830205a9900bf101597a2f4b18 (diff)
downloadglibc-8b4416d83c79ba77b0669203741c712880a09ae4.zip
glibc-8b4416d83c79ba77b0669203741c712880a09ae4.tar.gz
glibc-8b4416d83c79ba77b0669203741c712880a09ae4.tar.bz2
i386: memcpy functions with SSE2 unaligned load/store
These new memcpy functions are the 32-bit version of x86_64 SSE2 unaligned memcpy. Memcpy average performace benefit is 18% on Silvermont, other platforms also improved about 35%, benchmarked on Silvermont, Haswell, Ivy Bridge, Sandy Bridge and Westmere, performance results attached in https://sourceware.org/ml/libc-alpha/2014-07/msg00157.html * sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S: New file. * sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S: Likewise. * sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S: Likewise. * sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S: Likewise. * sysdeps/i386/i686/multiarch/bcopy.S: Select the sse2_unaligned version if bit_Fast_Unaligned_Load is set. * sysdeps/i386/i686/multiarch/memcpy.S: Likewise. * sysdeps/i386/i686/multiarch/memcpy_chk.S: Likewise. * sysdeps/i386/i686/multiarch/memmove.S: Likewise. * sysdeps/i386/i686/multiarch/memmove_chk.S: Likewise. * sysdeps/i386/i686/multiarch/mempcpy.S: Likewise. * sysdeps/i386/i686/multiarch/mempcpy_chk.S: Likewise. * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add bcopy-sse2-unaligned, memcpy-sse2-unaligned, memmove-sse2-unaligned and mempcpy-sse2-unaligned. * sysdeps/i386/i686/multiarch/ifunc-impl-list.c (MAX_IFUNC): Set to 4. (__libc_ifunc_impl_list): Test __bcopy_sse2_unaligned, __memmove_chk_sse2_unaligned, __memmove_sse2_unaligned, __memcpy_chk_sse2_unaligned, __memcpy_sse2_unaligned, __mempcpy_chk_sse2_unaligned, and __mempcpy_sse2_unaligned.
Diffstat (limited to 'sysdeps')
-rw-r--r--sysdeps/i386/i686/multiarch/Makefile4
-rw-r--r--sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S4
-rw-r--r--sysdeps/i386/i686/multiarch/bcopy.S5
-rw-r--r--sysdeps/i386/i686/multiarch/ifunc-impl-list.c16
-rw-r--r--sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S681
-rw-r--r--sysdeps/i386/i686/multiarch/memcpy.S5
-rw-r--r--sysdeps/i386/i686/multiarch/memcpy_chk.S5
-rw-r--r--sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S4
-rw-r--r--sysdeps/i386/i686/multiarch/memmove.S10
-rw-r--r--sysdeps/i386/i686/multiarch/memmove_chk.S22
-rw-r--r--sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S4
-rw-r--r--sysdeps/i386/i686/multiarch/mempcpy.S5
-rw-r--r--sysdeps/i386/i686/multiarch/mempcpy_chk.S7
13 files changed, 769 insertions, 3 deletions
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 55778cb..11ce4ba 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -23,7 +23,9 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
strnlen-sse2 strnlen-c \
strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \
strncase_l-c strncase-c strncase_l-ssse3 \
- strcasecmp_l-sse4 strncase_l-sse4
+ strcasecmp_l-sse4 strncase_l-sse4 \
+ bcopy-sse2-unaligned memcpy-sse2-unaligned \
+ mempcpy-sse2-unaligned memmove-sse2-unaligned
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
new file mode 100644
index 0000000..efef2a1
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY __bcopy_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/bcopy.S
index a0fca88..4041eed 100644
--- a/sysdeps/i386/i686/multiarch/bcopy.S
+++ b/sysdeps/i386/i686/multiarch/bcopy.S
@@ -35,6 +35,11 @@ ENTRY(bcopy)
jne 1f
call __init_cpu_features
1: leal __bcopy_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __bcopy_sse2_unaligned@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+ jnz 2f
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __bcopy_ssse3@GOTOFF(%ebx), %eax
diff --git a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
index e475776..4efa9c5 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
+++ b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -23,7 +23,7 @@
#include "init-arch.h"
/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 3
+#define MAX_IFUNC 4
/* Fill ARRAY of MAX elements with IFUNC implementations for function
NAME and return the number of valid entries. */
@@ -41,6 +41,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSSE3,
__bcopy_ssse3_rep)
IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSSE3, __bcopy_ssse3)
+ IFUNC_IMPL_ADD (array, i, bcopy, HAS_SSE2,
+ __bcopy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
/* Support sysdeps/i386/i686/multiarch/bzero.S. */
@@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_chk_ssse3_rep)
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
__memmove_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSE2,
+ __memmove_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_ia32))
@@ -78,6 +82,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_ssse3_rep)
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
__memmove_ssse3)
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_SSE2,
+ __memmove_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
/* Support sysdeps/i386/i686/multiarch/memrchr.S. */
@@ -268,6 +274,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_chk_ssse3_rep)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
__memcpy_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSE2,
+ __memcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_ia32))
@@ -276,6 +284,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
__memcpy_ssse3_rep)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSE2,
+ __memcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
/* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S. */
@@ -284,6 +294,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_chk_ssse3_rep)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
__mempcpy_chk_ssse3)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSE2,
+ __mempcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_ia32))
@@ -293,6 +305,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3_rep)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
__mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSE2,
+ __mempcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
/* Support sysdeps/i386/i686/multiarch/strlen.S. */
diff --git a/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000..ff89de2
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,681 @@
+/* memcpy optimized with SSE2 unaligned memory access instructions.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc) \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+# define MEMCPY __memcpy_sse2_unaligned
+# define MEMCPY_CHK __memcpy_chk_sse2_unaligned
+# endif
+
+# ifdef USE_AS_BCOPY
+# define SRC PARMS
+# define DEST SRC+4
+# define LEN DEST+4
+# else
+# define DEST PARMS
+# define SRC DEST+4
+# define LEN SRC+4
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 8 /* Preserve EBX. */
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+
+ .section .text.sse2,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+
+ENTRY (MEMCPY)
+ ENTRANCE
+ movl LEN(%esp), %ecx
+ movl SRC(%esp), %eax
+ movl DEST(%esp), %edx
+ cmp %edx, %eax
+
+# ifdef USE_AS_MEMMOVE
+ jg L(check_forward)
+
+L(mm_len_0_or_more_backward):
+/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
+ separately. */
+ cmp $16, %ecx
+ jbe L(mm_len_0_16_bytes_backward)
+
+ cmpl $32, %ecx
+ jg L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return. */
+ movdqu (%eax), %xmm0
+ movdqu -16(%eax, %ecx), %xmm1
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_32_or_more_backward):
+ cmpl $64, %ecx
+ jg L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu -16(%eax, %ecx), %xmm2
+ movdqu -32(%eax, %ecx), %xmm3
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, -16(%edx, %ecx)
+ movdqu %xmm3, -32(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_64_or_more_backward):
+ cmpl $128, %ecx
+ jg L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqu -64(%eax, %ecx), %xmm4
+ movdqu -48(%eax, %ecx), %xmm5
+ movdqu -32(%eax, %ecx), %xmm6
+ movdqu -16(%eax, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_128_or_more_backward):
+ add %ecx, %eax
+ cmp %edx, %eax
+ movl SRC(%esp), %eax
+ jle L(forward)
+ PUSH (%esi)
+ PUSH (%edi)
+ PUSH (%ebx)
+
+/* Aligning the address of destination. */
+ movdqu (%eax), %xmm4
+ movdqu 16(%eax), %xmm5
+ movdqu 32(%eax), %xmm6
+ movdqu 48(%eax), %xmm7
+ leal (%edx, %ecx), %esi
+ movdqu -16(%eax, %ecx), %xmm0
+ subl $16, %esp
+ movdqu %xmm0, (%esp)
+ mov %ecx, %edi
+ movl %esi, %ecx
+ andl $-16, %ecx
+ leal (%ecx), %ebx
+ subl %edx, %ebx
+ leal (%eax, %ebx), %eax
+ shrl $6, %ebx
+
+# ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %edi
+# else
+# ifdef SHARED
+ PUSH (%ebx)
+ SETUP_PIC_REG (bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+ POP (%ebx)
+# else
+ cmp __x86_shared_cache_size_half, %edi
+# endif
+# endif
+ jae L(mm_large_page_loop_backward)
+
+ .p2align 4
+L(mm_main_loop_backward):
+
+ prefetcht0 -128(%eax)
+
+ movdqu -64(%eax), %xmm0
+ movdqu -48(%eax), %xmm1
+ movdqu -32(%eax), %xmm2
+ movdqu -16(%eax), %xmm3
+ movaps %xmm0, -64(%ecx)
+ subl $64, %eax
+ movaps %xmm1, -48(%ecx)
+ movaps %xmm2, -32(%ecx)
+ movaps %xmm3, -16(%ecx)
+ subl $64, %ecx
+ sub $1, %ebx
+ jnz L(mm_main_loop_backward)
+ movdqu (%esp), %xmm0
+ addl $16, %esp
+ movdqu %xmm0, -16(%esi)
+ movdqu %xmm4, (%edx)
+ movdqu %xmm5, 16(%edx)
+ movdqu %xmm6, 32(%edx)
+ movdqu %xmm7, 48(%edx)
+ POP (%ebx)
+ jmp L(mm_return_pop_all)
+
+/* Copy [0..16] and return. */
+L(mm_len_0_16_bytes_backward):
+ testb $24, %cl
+ jnz L(mm_len_9_16_bytes_backward)
+ testb $4, %cl
+ .p2align 4,,5
+ jnz L(mm_len_5_8_bytes_backward)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(return)
+ testb $2, %cl
+ .p2align 4,,1
+ jne L(mm_len_3_4_bytes_backward)
+ movzbl -1(%eax,%ecx), %ebx
+ movzbl (%eax), %eax
+ movb %bl, -1(%edx,%ecx)
+ movb %al, (%edx)
+ jmp L(return)
+
+L(mm_len_3_4_bytes_backward):
+ movzwl -2(%eax,%ecx), %ebx
+ movzwl (%eax), %eax
+ movw %bx, -2(%edx,%ecx)
+ movw %ax, (%edx)
+ jmp L(return)
+
+L(mm_len_9_16_bytes_backward):
+ PUSH (%esi)
+ movl -4(%eax,%ecx), %ebx
+ movl -8(%eax,%ecx), %esi
+ movl %ebx, -4(%edx,%ecx)
+ movl %esi, -8(%edx,%ecx)
+ subl $8, %ecx
+ POP (%esi)
+ jmp L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+ movl (%eax), %ebx
+ movl -4(%eax,%ecx), %eax
+ movl %ebx, (%edx)
+ movl %eax, -4(%edx,%ecx)
+ jmp L(return)
+
+/* Big length copy backward part. */
+ .p2align 4
+L(mm_large_page_loop_backward):
+ movdqu -64(%eax), %xmm0
+ movdqu -48(%eax), %xmm1
+ movdqu -32(%eax), %xmm2
+ movdqu -16(%eax), %xmm3
+ movntdq %xmm0, -64(%ecx)
+ subl $64, %eax
+ movntdq %xmm1, -48(%ecx)
+ movntdq %xmm2, -32(%ecx)
+ movntdq %xmm3, -16(%ecx)
+ subl $64, %ecx
+ sub $1, %ebx
+ jnz L(mm_large_page_loop_backward)
+ sfence
+ movdqu (%esp), %xmm0
+ addl $16, %esp
+ movdqu %xmm0, -16(%esi)
+ movdqu %xmm4, (%edx)
+ movdqu %xmm5, 16(%edx)
+ movdqu %xmm6, 32(%edx)
+ movdqu %xmm7, 48(%edx)
+ POP (%ebx)
+ jmp L(mm_return_pop_all)
+
+L(check_forward):
+ add %edx, %ecx
+ cmp %eax, %ecx
+ movl LEN(%esp), %ecx
+ jle L(forward)
+
+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
+ separately. */
+ cmp $16, %ecx
+ jbe L(mm_len_0_16_bytes_forward)
+
+ cmpl $32, %ecx
+ ja L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return. */
+ movdqu (%eax), %xmm0
+ movdqu -16(%eax, %ecx), %xmm1
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_32_or_more_forward):
+ cmpl $64, %ecx
+ ja L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu -16(%eax, %ecx), %xmm2
+ movdqu -32(%eax, %ecx), %xmm3
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, -16(%edx, %ecx)
+ movdqu %xmm3, -32(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_64_or_more_forward):
+ cmpl $128, %ecx
+ ja L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqu -64(%eax, %ecx), %xmm4
+ movdqu -48(%eax, %ecx), %xmm5
+ movdqu -32(%eax, %ecx), %xmm6
+ movdqu -16(%eax, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+ jmp L(return)
+
+L(mm_len_128_or_more_forward):
+ PUSH (%esi)
+ PUSH (%edi)
+ PUSH (%ebx)
+
+/* Aligning the address of destination. */
+ movdqu -16(%eax, %ecx), %xmm4
+ movdqu -32(%eax, %ecx), %xmm5
+ movdqu -48(%eax, %ecx), %xmm6
+ movdqu -64(%eax, %ecx), %xmm7
+ leal (%edx, %ecx), %esi
+ movdqu (%eax), %xmm0
+ subl $16, %esp
+ movdqu %xmm0, (%esp)
+ mov %ecx, %edi
+ leal 16(%edx), %ecx
+ andl $-16, %ecx
+ movl %ecx, %ebx
+ subl %edx, %ebx
+ addl %ebx, %eax
+ movl %esi, %ebx
+ subl %ecx, %ebx
+ shrl $6, %ebx
+
+# ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %edi
+# else
+# ifdef SHARED
+ PUSH (%ebx)
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi
+ POP (%ebx)
+# else
+ cmp __x86_shared_cache_size_half, %edi
+# endif
+# endif
+ jae L(mm_large_page_loop_forward)
+
+ .p2align 4
+L(mm_main_loop_forward):
+
+ prefetcht0 128(%eax)
+
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqa %xmm0, (%ecx)
+ addl $64, %eax
+ movaps %xmm1, 16(%ecx)
+ movaps %xmm2, 32(%ecx)
+ movaps %xmm3, 48(%ecx)
+ addl $64, %ecx
+ sub $1, %ebx
+ jnz L(mm_main_loop_forward)
+ movdqu (%esp), %xmm0
+ addl $16, %esp
+ movdqu %xmm0, (%edx)
+ movdqu %xmm4, -16(%esi)
+ movdqu %xmm5, -32(%esi)
+ movdqu %xmm6, -48(%esi)
+ movdqu %xmm7, -64(%esi)
+ POP (%ebx)
+ jmp L(mm_return_pop_all)
+
+L(mm_len_0_16_bytes_forward):
+ testb $24, %cl
+ jne L(mm_len_9_16_bytes_forward)
+ testb $4, %cl
+ .p2align 4,,5
+ jne L(mm_len_5_8_bytes_forward)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(return)
+ testb $2, %cl
+ .p2align 4,,1
+ jne L(mm_len_2_4_bytes_forward)
+ movzbl -1(%eax,%ecx), %ebx
+ movzbl (%eax), %eax
+ movb %bl, -1(%edx,%ecx)
+ movb %al, (%edx)
+ jmp L(return)
+
+L(mm_len_2_4_bytes_forward):
+ movzwl -2(%eax,%ecx), %ebx
+ movzwl (%eax), %eax
+ movw %bx, -2(%edx,%ecx)
+ movw %ax, (%edx)
+ jmp L(return)
+
+L(mm_len_5_8_bytes_forward):
+ movl (%eax), %ebx
+ movl -4(%eax,%ecx), %eax
+ movl %ebx, (%edx)
+ movl %eax, -4(%edx,%ecx)
+ jmp L(return)
+
+L(mm_len_9_16_bytes_forward):
+ movq (%eax), %xmm0
+ movq -8(%eax, %ecx), %xmm1
+ movq %xmm0, (%edx)
+ movq %xmm1, -8(%edx, %ecx)
+ jmp L(return)
+
+L(mm_return_pop_all):
+ movl %edx, %eax
+ POP (%edi)
+ POP (%esi)
+ RETURN
+
+/* Big length copy forward part. */
+ .p2align 4
+L(mm_large_page_loop_forward):
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movntdq %xmm0, (%ecx)
+ addl $64, %eax
+ movntdq %xmm1, 16(%ecx)
+ movntdq %xmm2, 32(%ecx)
+ movntdq %xmm3, 48(%ecx)
+ addl $64, %ecx
+ sub $1, %ebx
+ jnz L(mm_large_page_loop_forward)
+ sfence
+ movdqu (%esp), %xmm0
+ addl $16, %esp
+ movdqu %xmm0, (%edx)
+ movdqu %xmm4, -16(%esi)
+ movdqu %xmm5, -32(%esi)
+ movdqu %xmm6, -48(%esi)
+ movdqu %xmm7, -64(%esi)
+ POP (%ebx)
+ jmp L(mm_return_pop_all)
+# endif
+
+L(forward):
+ cmp $16, %ecx
+ jbe L(len_0_16_bytes)
+
+# ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_shared_cache_size_half, %ecx
+# endif
+# endif
+ jae L(large_page)
+
+ movdqu (%eax), %xmm0
+ movdqu -16(%eax, %ecx), %xmm1
+ cmpl $32, %ecx
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jbe L(return)
+
+ movdqu 16(%eax), %xmm0
+ movdqu -32(%eax, %ecx), %xmm1
+ cmpl $64, %ecx
+ movdqu %xmm0, 16(%edx)
+ movdqu %xmm1, -32(%edx, %ecx)
+ jbe L(return)
+
+ movdqu 32(%eax), %xmm0
+ movdqu 48(%eax), %xmm1
+ movdqu -48(%eax, %ecx), %xmm2
+ movdqu -64(%eax, %ecx), %xmm3
+ cmpl $128, %ecx
+ movdqu %xmm0, 32(%edx)
+ movdqu %xmm1, 48(%edx)
+ movdqu %xmm2, -48(%edx, %ecx)
+ movdqu %xmm3, -64(%edx, %ecx)
+ jbe L(return)
+
+/* Now the main loop: we align the address of the destination. */
+ leal 64(%edx), %ebx
+ andl $-64, %ebx
+
+ addl %edx, %ecx
+ andl $-64, %ecx
+
+ subl %edx, %eax
+
+/* We should stop two iterations before the termination
+ (in order not to misprefetch). */
+ subl $64, %ecx
+ cmpl %ebx, %ecx
+ je L(main_loop_just_one_iteration)
+
+ subl $64, %ecx
+ cmpl %ebx, %ecx
+ je L(main_loop_last_two_iterations)
+
+ .p2align 4
+L(main_loop_cache):
+
+ prefetcht0 128(%ebx, %eax)
+
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqa %xmm0, (%ebx)
+ movaps %xmm1, 16(%ebx)
+ movaps %xmm2, 32(%ebx)
+ movaps %xmm3, 48(%ebx)
+ lea 64(%ebx), %ebx
+ cmpl %ebx, %ecx
+ jne L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqu 64(%ebx, %eax), %xmm4
+ movdqu 80(%ebx, %eax), %xmm5
+ movdqu 96(%ebx, %eax), %xmm6
+ movdqu 112(%ebx, %eax), %xmm7
+ movdqa %xmm0, (%ebx)
+ movaps %xmm1, 16(%ebx)
+ movaps %xmm2, 32(%ebx)
+ movaps %xmm3, 48(%ebx)
+ movaps %xmm4, 64(%ebx)
+ movaps %xmm5, 80(%ebx)
+ movaps %xmm6, 96(%ebx)
+ movaps %xmm7, 112(%ebx)
+ jmp L(return)
+
+L(main_loop_just_one_iteration):
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqa %xmm0, (%ebx)
+ movaps %xmm1, 16(%ebx)
+ movaps %xmm2, 32(%ebx)
+ movaps %xmm3, 48(%ebx)
+ jmp L(return)
+
+L(large_page):
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqu -64(%eax, %ecx), %xmm4
+ movdqu -48(%eax, %ecx), %xmm5
+ movdqu -32(%eax, %ecx), %xmm6
+ movdqu -16(%eax, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+
+ movdqu 64(%eax), %xmm0
+ movdqu 80(%eax), %xmm1
+ movdqu 96(%eax), %xmm2
+ movdqu 112(%eax), %xmm3
+ movdqu -128(%eax, %ecx), %xmm4
+ movdqu -112(%eax, %ecx), %xmm5
+ movdqu -96(%eax, %ecx), %xmm6
+ movdqu -80(%eax, %ecx), %xmm7
+ movdqu %xmm0, 64(%edx)
+ movdqu %xmm1, 80(%edx)
+ movdqu %xmm2, 96(%edx)
+ movdqu %xmm3, 112(%edx)
+ movdqu %xmm4, -128(%edx, %ecx)
+ movdqu %xmm5, -112(%edx, %ecx)
+ movdqu %xmm6, -96(%edx, %ecx)
+ movdqu %xmm7, -80(%edx, %ecx)
+
+/* Now the main loop with non temporal stores. We align
+ the address of the destination. */
+ leal 128(%edx), %ebx
+ andl $-128, %ebx
+
+ addl %edx, %ecx
+ andl $-128, %ecx
+
+ subl %edx, %eax
+
+ .p2align 4
+L(main_loop_large_page):
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqu 64(%ebx, %eax), %xmm4
+ movdqu 80(%ebx, %eax), %xmm5
+ movdqu 96(%ebx, %eax), %xmm6
+ movdqu 112(%ebx, %eax), %xmm7
+ movntdq %xmm0, (%ebx)
+ movntdq %xmm1, 16(%ebx)
+ movntdq %xmm2, 32(%ebx)
+ movntdq %xmm3, 48(%ebx)
+ movntdq %xmm4, 64(%ebx)
+ movntdq %xmm5, 80(%ebx)
+ movntdq %xmm6, 96(%ebx)
+ movntdq %xmm7, 112(%ebx)
+ lea 128(%ebx), %ebx
+ cmpl %ebx, %ecx
+ jne L(main_loop_large_page)
+ sfence
+ jmp L(return)
+
+L(len_0_16_bytes):
+ testb $24, %cl
+ jne L(len_9_16_bytes)
+ testb $4, %cl
+ .p2align 4,,5
+ jne L(len_5_8_bytes)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(return)
+ movzbl (%eax), %ebx
+ testb $2, %cl
+ movb %bl, (%edx)
+ je L(return)
+ movzwl -2(%eax,%ecx), %ebx
+ movw %bx, -2(%edx,%ecx)
+ jmp L(return)
+
+L(len_9_16_bytes):
+ movq (%eax), %xmm0
+ movq -8(%eax, %ecx), %xmm1
+ movq %xmm0, (%edx)
+ movq %xmm1, -8(%edx, %ecx)
+ jmp L(return)
+
+L(len_5_8_bytes):
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+ movl -4(%eax,%ecx), %ebx
+ movl %ebx, -4(%edx,%ecx)
+
+L(return):
+ movl %edx, %eax
+# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY
+ movl LEN(%esp), %ecx
+ add %ecx, %eax
+# endif
+ RETURN
+
+END (MEMCPY)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S
index 76195a5..845492c 100644
--- a/sysdeps/i386/i686/multiarch/memcpy.S
+++ b/sysdeps/i386/i686/multiarch/memcpy.S
@@ -36,6 +36,11 @@ ENTRY(memcpy)
jne 1f
call __init_cpu_features
1: leal __memcpy_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __memcpy_sse2_unaligned@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+ jnz 2f
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __memcpy_ssse3@GOTOFF(%ebx), %eax
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S
index c67968e..415d910 100644
--- a/sysdeps/i386/i686/multiarch/memcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -37,6 +37,11 @@ ENTRY(__memcpy_chk)
jne 1f
call __init_cpu_features
1: leal __memcpy_chk_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __memcpy_chk_sse2_unaligned@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+ jnz 2f
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __memcpy_chk_ssse3@GOTOFF(%ebx), %eax
diff --git a/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
new file mode 100644
index 0000000..3873594
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_sse2_unaligned
+#define MEMCPY_CHK __memmove_chk_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S
index d8de7c6..29644dd 100644
--- a/sysdeps/i386/i686/multiarch/memmove.S
+++ b/sysdeps/i386/i686/multiarch/memmove.S
@@ -35,6 +35,11 @@ ENTRY(memmove)
jne 1f
call __init_cpu_features
1: leal __memmove_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __memmove_sse2_unaligned@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+ jnz 2f
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __memmove_ssse3@GOTOFF(%ebx), %eax
@@ -63,6 +68,11 @@ ENTRY(memmove)
jne 1f
call __init_cpu_features
1: leal __memmove_ia32, %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+ jz 2f
+ leal __memmove_sse2_unaligned, %eax
+ testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
+ jnz 2f
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
jz 2f
leal __memmove_ssse3, %eax
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S
index 3442ce1..fea9b54 100644
--- a/sysdeps/i386/i686/multiarch/memmove_chk.S
+++ b/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -35,6 +35,11 @@ ENTRY(__memmove_chk)
jne 1f
call __init_cpu_features
1: leal __memmove_chk_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __memmove_chk_sse2_unaligned@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+ jnz 2f
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __memmove_chk_ssse3@GOTOFF(%ebx), %eax
@@ -54,6 +59,11 @@ ENTRY(__memmove_chk)
jne 1f
call __init_cpu_features
1: leal __memmove_chk_ia32, %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+ jz 2f
+ leal __memmove_chk_sse2_unaligned, %eax
+ testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
+ jnz 2f
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
jz 2f
leal __memmove_chk_ssse3, %eax
@@ -63,6 +73,18 @@ ENTRY(__memmove_chk)
2: ret
END(__memmove_chk)
+ .type __memmove_chk_sse2_unaligned, @function
+ .p2align 4;
+__memmove_chk_sse2_unaligned:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_sse2_unaligned
+ cfi_endproc
+ .size __memmove_chk_sse2_unaligned, .-__memmove_chk_sse2_unaligned
+
.type __memmove_chk_ssse3, @function
.p2align 4;
__memmove_chk_ssse3:
diff --git a/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
new file mode 100644
index 0000000..a1cea50
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_sse2_unaligned
+#define MEMCPY_CHK __mempcpy_chk_sse2_unaligned
+#include "memcpy-sse2-unaligned.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S
index 3aab63d..fd8b82c 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -36,6 +36,11 @@ ENTRY(__mempcpy)
jne 1f
call __init_cpu_features
1: leal __mempcpy_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __mempcpy_sse2_unaligned@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+ jnz 2f
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __mempcpy_ssse3@GOTOFF(%ebx), %eax
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
index 5d81f4e..ed23b1b 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -36,7 +36,12 @@ ENTRY(__mempcpy_chk)
cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
jne 1f
call __init_cpu_features
-1: leal __mempcpy_chk_ia32@GOTOFF(%ebx), %eax
+ leal __mempcpy_chk_ia32@GOTOFF(%ebx), %eax
+1: testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __mempcpy_chk_sse2_unaligned@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+ jnz 2f
testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
jz 2f
leal __mempcpy_chk_ssse3@GOTOFF(%ebx), %eax