aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2016-06-08 13:57:50 -0700
committerH.J. Lu <hjl.tools@gmail.com>2016-06-08 13:58:08 -0700
commitc867597bff2562180a18da4b8dba89d24e8b65c4 (patch)
tree3770c51728e718a0fffe569aca738749982b535a /sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
parent5e8c5bb1ac83aa2577d64d82467a653fa413f7ce (diff)
downloadglibc-c867597bff2562180a18da4b8dba89d24e8b65c4.zip
glibc-c867597bff2562180a18da4b8dba89d24e8b65c4.tar.gz
glibc-c867597bff2562180a18da4b8dba89d24e8b65c4.tar.bz2
X86-64: Remove previous default/SSE2/AVX2 memcpy/memmove
Since the new SSE2/AVX2 memcpy/memmove are faster than the previous ones, we can remove the previous SSE2/AVX2 memcpy/memmove and replace them with the new ones. No change in IFUNC selection if SSE2 and AVX2 memcpy/memmove weren't used before. If SSE2 or AVX2 memcpy/memmove were used, the new SSE2 or AVX2 memcpy/memmove optimized with Enhanced REP MOVSB will be used for processors with ERMS. The new AVX512 memcpy/memmove will be used for processors with AVX512 which prefer vzeroupper. Since the new SSE2 memcpy/memmove are faster than the previous default memcpy/memmove used in libc.a and ld.so, we also remove the previous default memcpy/memmove and make them the default memcpy/memmove, except that non-temporal store isn't used in ld.so. Together, it reduces the size of libc.so by about 6 KB and the size of ld.so by about 2 KB. [BZ #19776] * sysdeps/x86_64/memcpy.S: Make it dummy. * sysdeps/x86_64/mempcpy.S: Likewise. * sysdeps/x86_64/memmove.S: New file. * sysdeps/x86_64/memmove_chk.S: Likewise. * sysdeps/x86_64/multiarch/memmove.S: Likewise. * sysdeps/x86_64/multiarch/memmove_chk.S: Likewise. * sysdeps/x86_64/memmove.c: Removed. * sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: Likewise. * sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Likewise. * sysdeps/x86_64/multiarch/memmove-avx-unaligned.S: Likewise. * sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S: Likewise. * sysdeps/x86_64/multiarch/memmove.c: Likewise. * sysdeps/x86_64/multiarch/memmove_chk.c: Likewise. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove memcpy-sse2-unaligned, memmove-avx-unaligned, memcpy-avx-unaligned and memmove-sse2-unaligned-erms. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Replace __memmove_chk_avx512_unaligned_2 with __memmove_chk_avx512_unaligned. Remove __memmove_chk_avx_unaligned_2. Replace __memmove_chk_sse2_unaligned_2 with __memmove_chk_sse2_unaligned. Remove __memmove_chk_sse2 and __memmove_avx_unaligned_2. 
Replace __memmove_avx512_unaligned_2 with __memmove_avx512_unaligned. Replace __memmove_sse2_unaligned_2 with __memmove_sse2_unaligned. Remove __memmove_sse2. Replace __memcpy_chk_avx512_unaligned_2 with __memcpy_chk_avx512_unaligned. Remove __memcpy_chk_avx_unaligned_2. Replace __memcpy_chk_sse2_unaligned_2 with __memcpy_chk_sse2_unaligned. Remove __memcpy_chk_sse2. Remove __memcpy_avx_unaligned_2. Replace __memcpy_avx512_unaligned_2 with __memcpy_avx512_unaligned. Remove __memcpy_sse2_unaligned_2 and __memcpy_sse2. Replace __mempcpy_chk_avx512_unaligned_2 with __mempcpy_chk_avx512_unaligned. Remove __mempcpy_chk_avx_unaligned_2. Replace __mempcpy_chk_sse2_unaligned_2 with __mempcpy_chk_sse2_unaligned. Remove __mempcpy_chk_sse2. Replace __mempcpy_avx512_unaligned_2 with __mempcpy_avx512_unaligned. Remove __mempcpy_avx_unaligned_2. Replace __mempcpy_sse2_unaligned_2 with __mempcpy_sse2_unaligned. Remove __mempcpy_sse2. * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Support __memcpy_avx512_unaligned_erms and __memcpy_avx512_unaligned. Use __memcpy_avx_unaligned_erms and __memcpy_sse2_unaligned_erms if processor has ERMS. Default to __memcpy_sse2_unaligned. (ENTRY): Removed. (END): Likewise. (ENTRY_CHK): Likewise. (libc_hidden_builtin_def): Likewise. Don't include ../memcpy.S. * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Support __memcpy_chk_avx512_unaligned_erms and __memcpy_chk_avx512_unaligned. Use __memcpy_chk_avx_unaligned_erms and __memcpy_chk_sse2_unaligned_erms if processor has ERMS. Default to __memcpy_chk_sse2_unaligned. * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S Change function suffix from unaligned_2 to unaligned. * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Support __mempcpy_avx512_unaligned_erms and __mempcpy_avx512_unaligned. Use __mempcpy_avx_unaligned_erms and __mempcpy_sse2_unaligned_erms if processor has ERMS. Default to __mempcpy_sse2_unaligned. (ENTRY): Removed. (END): Likewise. (ENTRY_CHK): Likewise. 
(libc_hidden_builtin_def): Likewise. Don't include ../mempcpy.S. (mempcpy): New. Add a weak alias. * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Support __mempcpy_chk_avx512_unaligned_erms and __mempcpy_chk_avx512_unaligned. Use __mempcpy_chk_avx_unaligned_erms and __mempcpy_chk_sse2_unaligned_erms if processor has ERMS. Default to __mempcpy_chk_sse2_unaligned.
Diffstat (limited to 'sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S')
-rw-r--r--sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S175
1 files changed, 0 insertions, 175 deletions
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
deleted file mode 100644
index c450983..0000000
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ /dev/null
@@ -1,175 +0,0 @@
-/* memcpy with unaligned loads
- Copyright (C) 2013-2016 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-#include <sysdep.h>
-
-#include "asm-syntax.h"
-
-
-ENTRY(__memcpy_sse2_unaligned) /* Copies %rdx bytes from (%rsi) to (%rdi) with unaligned SSE2 loads; returns the original %rdi in %rax. Scratch: %rcx, %r8-%r10, %xmm8. */
- movq %rsi, %rax
- leaq (%rdx,%rdx), %rcx /* rcx = 2 * n */
- subq %rdi, %rax /* rax = src - dst */
- subq %rdx, %rax /* rax = src - dst - n */
- cmpq %rcx, %rax
- jb L(overlapping) /* taken when (src - dst - n) < 2n, unsigned. NOTE(review): algebraically that is (src - dst) in [n, 3n) modulo 2^64 -- confirm against the intended overlap condition */
- cmpq $16, %rdx
- jbe L(less_16) /* n <= 16: handled with scalar moves */
- movdqu (%rsi), %xmm8 /* first 16 bytes */
- cmpq $32, %rdx /* flags are consumed by the ja below; the SSE moves in between leave them intact */
- movdqu %xmm8, (%rdi)
- movdqu -16(%rsi,%rdx), %xmm8 /* last 16 bytes; overlaps the first store when n < 32 */
- movdqu %xmm8, -16(%rdi,%rdx)
- ja .L31 /* n > 32: more to copy */
-L(return): /* common exit for every path that terminates */
- movq %rdi, %rax /* return value = dst (rdi is never modified) */
- ret
- .p2align 4,,10
- .p2align 4
-.L31: /* n > 32; bytes [0,16) and [n-16,n) are already copied */
- movdqu 16(%rsi), %xmm8 /* bytes [16,32) */
- cmpq $64, %rdx /* consumed by the jbe below */
- movdqu %xmm8, 16(%rdi)
- movdqu -32(%rsi,%rdx), %xmm8 /* bytes [n-32,n-16) */
- movdqu %xmm8, -32(%rdi,%rdx)
- jbe L(return) /* n <= 64: head and tail stores cover everything */
- movdqu 32(%rsi), %xmm8 /* bytes [32,48) */
- cmpq $128, %rdx /* consumed by the jbe after the next four copies */
- movdqu %xmm8, 32(%rdi)
- movdqu -48(%rsi,%rdx), %xmm8 /* bytes [n-48,n-32) */
- movdqu %xmm8, -48(%rdi,%rdx)
- movdqu 48(%rsi), %xmm8 /* bytes [48,64) */
- movdqu %xmm8, 48(%rdi)
- movdqu -64(%rsi,%rdx), %xmm8 /* bytes [n-64,n-48) */
- movdqu %xmm8, -64(%rdi,%rdx)
- jbe L(return) /* n <= 128: first 64 + last 64 bytes cover everything */
- leaq 64(%rdi), %rcx
- addq %rdi, %rdx /* rdx = dst + n (end of destination) */
- andq $-64, %rdx /* rdx = destination end rounded down to a 64-byte boundary */
- andq $-64, %rcx /* rcx = (dst + 64) rounded down to a 64-byte boundary */
- movq %rcx, %rax
- subq %rdi, %rax /* rax = bytes skipped to reach the aligned destination */
- addq %rax, %rsi /* advance src by the same amount */
- cmpq %rdx, %rcx
- je L(return) /* aligned middle region is empty */
- movq %rsi, %r10
- subq %rcx, %r10 /* r10 = src - dst delta, so (%rcx,%r10) is the source of (%rcx) */
- leaq 16(%r10), %r9 /* delta + 16 */
- leaq 32(%r10), %r8 /* delta + 32 */
- leaq 48(%r10), %rax /* delta + 48 */
- .p2align 4,,10
- .p2align 4
-L(loop): /* 64 bytes per iteration: unaligned loads, aligned stores to the 64-byte-aligned rcx */
- movdqu (%rcx,%r10), %xmm8
- movdqa %xmm8, (%rcx)
- movdqu (%rcx,%r9), %xmm8
- movdqa %xmm8, 16(%rcx)
- movdqu (%rcx,%r8), %xmm8
- movdqa %xmm8, 32(%rcx)
- movdqu (%rcx,%rax), %xmm8
- movdqa %xmm8, 48(%rcx)
- addq $64, %rcx
- cmpq %rcx, %rdx
- jne L(loop) /* until rcx reaches the rounded-down destination end */
- jmp L(return)
-L(overlapping): /* reached only from the entry check */
- cmpq %rsi, %rdi
- jae .L3 /* dst >= src (unsigned): backward byte loop */
- testq %rdx, %rdx
- .p2align 4,,5
- je L(return) /* n == 0 (flags come from the testq above the alignment directive) */
- movq %rdx, %r9
- leaq 16(%rsi), %rcx /* rcx = src + 16 */
- leaq 16(%rdi), %r8 /* r8 = dst + 16 */
- shrq $4, %r9 /* r9 = n / 16 = number of whole 16-byte chunks */
- movq %r9, %rax
- salq $4, %rax /* rax = n rounded down to a multiple of 16 */
- cmpq %rcx, %rdi
- setae %cl /* cl = (dst >= src + 16) */
- cmpq %r8, %rsi
- setae %r8b /* r8b = (src >= dst + 16) */
- orl %r8d, %ecx /* only the low byte (cl) is examined below */
- cmpq $15, %rdx
- seta %r8b /* r8b = (n > 15) */
- testb %r8b, %cl
- je .L16 /* pointers closer than 16 bytes, or n <= 15: copy byte by byte from offset 0 */
- testq %rax, %rax
- je .L16 /* no whole 16-byte chunk */
- xorl %ecx, %ecx /* rcx = byte offset */
- xorl %r8d, %r8d /* r8 = chunks copied so far */
-.L7: /* forward copy, 16 bytes per iteration */
- movdqu (%rsi,%rcx), %xmm8
- addq $1, %r8
- movdqu %xmm8, (%rdi,%rcx)
- addq $16, %rcx
- cmpq %r8, %r9
- ja .L7 /* while chunks copied < n / 16 */
- cmpq %rax, %rdx
- je L(return) /* n was a multiple of 16: no tail */
-.L21: /* forward byte loop from offset rax up to n */
- movzbl (%rsi,%rax), %ecx
- movb %cl, (%rdi,%rax)
- addq $1, %rax
- cmpq %rax, %rdx
- ja .L21 /* while rax < n */
- jmp L(return)
-L(less_16): /* n <= 16 */
- testb $24, %dl
- jne L(between_9_16) /* bit 3 or 4 of n set, i.e. 8 <= n <= 16 here */
- testb $4, %dl
- .p2align 4,,5
- jne L(between_5_8) /* bit 2 set, i.e. 4 <= n <= 7 here */
- testq %rdx, %rdx
- .p2align 4,,2
- je L(return) /* n == 0 */
- movzbl (%rsi), %eax /* 1 <= n <= 3: first byte */
- testb $2, %dl /* consumed by the je below; movb leaves flags intact */
- movb %al, (%rdi)
- je L(return) /* n == 1 */
- movzwl -2(%rsi,%rdx), %eax /* n is 2 or 3: last two bytes */
- movw %ax, -2(%rdi,%rdx)
- jmp L(return)
-.L3: /* backward byte copy, entered from L(overlapping) when dst >= src */
- leaq -1(%rdx), %rax /* rax = n - 1 = offset of the last byte */
- .p2align 4,,10
- .p2align 4
-.L11:
- movzbl (%rsi,%rax), %edx
- movb %dl, (%rdi,%rax)
- subq $1, %rax
- jmp .L11 /* NOTE(review): unconditional jump -- no exit condition for this loop is visible in this block */
-L(between_9_16): /* first 8 bytes + last 8 bytes (the two stores may overlap) */
- movq (%rsi), %rax
- movq %rax, (%rdi)
- movq -8(%rsi,%rdx), %rax
- movq %rax, -8(%rdi,%rdx)
- jmp L(return)
-.L16: /* byte-by-byte fallback: start the .L21 loop at offset 0 */
- xorl %eax, %eax
- jmp .L21
-L(between_5_8): /* first 4 bytes + last 4 bytes (the two stores may overlap) */
- movl (%rsi), %eax
- movl %eax, (%rdi)
- movl -4(%rsi,%rdx), %eax
- movl %eax, -4(%rdi,%rdx)
- jmp L(return)
-END(__memcpy_sse2_unaligned)
-
-#endif