Diffstat (limited to 'sysdeps/x86_64/memcmp.S')
-rw-r--r-- | sysdeps/x86_64/memcmp.S | 566
1 file changed, 8 insertions, 558 deletions
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S index 5718a7d..2623ed6 100644 --- a/sysdeps/x86_64/memcmp.S +++ b/sysdeps/x86_64/memcmp.S @@ -1,4 +1,4 @@ -/* memcmp with SSE2 +/* memcmp hook for non-multiarch and RTLD build. Copyright (C) 2009-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,563 +16,13 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include <sysdep.h> +#define MEMCMP memcmp -#ifdef USE_AS_WMEMCMP -# define PCMPEQ pcmpeqd -# define CHAR_SIZE 4 -# define SIZE_OFFSET (0) -#else -# define PCMPEQ pcmpeqb -# define CHAR_SIZE 1 -#endif +#define DEFAULT_IMPL_V1 "multiarch/memcmp-sse2.S" +#define DEFAULT_IMPL_V3 "multiarch/memcmp-avx2-movbe.S" +#define DEFAULT_IMPL_V4 "multiarch/memcmp-evex-movbe.S" -#ifdef USE_AS_MEMCMPEQ -# define SIZE_OFFSET (0) -# define CHECK_CMP(x, y) subl x, y -#else -# ifndef SIZE_OFFSET -# define SIZE_OFFSET (CHAR_PER_VEC * 2) -# endif -# define CHECK_CMP(x, y) cmpl x, y -#endif +#include "isa-default-impl.h" -#define VEC_SIZE 16 -#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - -#ifndef MEMCMP -# define MEMCMP memcmp -#endif - - .text -ENTRY(MEMCMP) -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx -# endif -#ifdef USE_AS_WMEMCMP - /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store - in ecx for code size. This is preferable to using `incw` as - it avoids partial register stalls on older hardware (pre - SnB). */ - movl $0xffff, %ecx -#endif - cmpq $CHAR_PER_VEC, %rdx - ja L(more_1x_vec) - -#ifdef USE_AS_WMEMCMP - /* saves a byte of code keeping the fall through path n = [2, 4] - in the initial cache line. */ - decl %edx - jle L(cmp_0_1) - - movq (%rsi), %xmm0 - movq (%rdi), %xmm1 - PCMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - subl %ecx, %eax - jnz L(ret_nonzero_vec_start_0) - - movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0 - movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1 - PCMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - subl %ecx, %eax - jnz L(ret_nonzero_vec_end_0_adj) -#else - cmpl $8, %edx - ja L(cmp_9_16) - - cmpl $4, %edx - jb L(cmp_0_3) - -# ifdef USE_AS_MEMCMPEQ - movl (%rsi), %eax - subl (%rdi), %eax - - movl -4(%rsi, %rdx), %esi - subl -4(%rdi, %rdx), %esi - - orl %esi, %eax - ret -# else - /* Combine comparisons for lo and hi 4-byte comparisons. */ - movl -4(%rsi, %rdx), %ecx - movl -4(%rdi, %rdx), %eax - shlq $32, %rcx - shlq $32, %rax - movl (%rsi), %esi - movl (%rdi), %edi - orq %rsi, %rcx - orq %rdi, %rax - /* Only compute proper return if not-equal. */ - cmpq %rcx, %rax - jnz L(ret_nonzero) - xorl %eax, %eax - ret -# endif - - .p2align 4,, 10 -L(cmp_9_16): -# ifdef USE_AS_MEMCMPEQ - movq (%rsi), %rax - subq (%rdi), %rax - - movq -8(%rsi, %rdx), %rcx - subq -8(%rdi, %rdx), %rcx - orq %rcx, %rax - /* Convert 64 bit -> 32 bit boolean (we should have made the ABI - return long). */ - setnz %cl - movzbl %cl, %eax -# else - movq (%rsi), %rcx - movq (%rdi), %rax - /* Only compute proper return if not-equal. */ - cmpq %rcx, %rax - jnz L(ret_nonzero) - - movq -8(%rsi, %rdx, CHAR_SIZE), %rcx - movq -8(%rdi, %rdx, CHAR_SIZE), %rax - /* Only compute proper return if not-equal. */ - cmpq %rcx, %rax - jnz L(ret_nonzero) - xorl %eax, %eax -# endif -#endif - ret - - .p2align 4,, 8 -L(cmp_0_1): - /* Flag set by earlier comparison against 1. 
*/ - jne L(cmp_0_0) -#ifdef USE_AS_WMEMCMP - movl (%rdi), %ecx - xorl %edx, %edx - cmpl (%rsi), %ecx - je L(cmp_0_0) - setg %dl - leal -1(%rdx, %rdx), %eax -#else - movzbl (%rdi), %eax - movzbl (%rsi), %ecx - subl %ecx, %eax -#endif - ret - - /* Fits in aligning bytes. */ -L(cmp_0_0): - xorl %eax, %eax - ret - -#ifdef USE_AS_WMEMCMP - .p2align 4 -L(ret_nonzero_vec_start_0): - bsfl %eax, %eax - movl (%rdi, %rax), %ecx - xorl %edx, %edx - cmpl (%rsi, %rax), %ecx - /* NB: no partial register stall here because xorl zero idiom - above. */ - setg %dl - leal -1(%rdx, %rdx), %eax - ret -#else - -# ifndef USE_AS_MEMCMPEQ - .p2align 4,, 14 -L(ret_nonzero): - /* Need to bswap to get proper return without branch. */ - bswapq %rcx - bswapq %rax - subq %rcx, %rax - sbbl %eax, %eax - orl $1, %eax - ret -# endif - - .p2align 4 -L(cmp_0_3): -# ifdef USE_AS_MEMCMPEQ - /* No reason to add to dependency chain on rdx. Saving a the - bytes here doesn't change number of fetch blocks. */ - cmpl $1, %edx - jbe L(cmp_0_1) -# else - /* We need the code size to prevent taking an extra fetch block. - */ - decl %edx - jle L(cmp_0_1) -# endif - movzwl (%rsi), %ecx - movzwl (%rdi), %eax - -# ifdef USE_AS_MEMCMPEQ - subl %ecx, %eax - - movzbl -1(%rsi, %rdx), %esi - movzbl -1(%rdi, %rdx), %edi - subl %edi, %esi - orl %esi, %eax -# else - bswapl %ecx - bswapl %eax - - /* Implicit right shift by one. We just need to displace the - sign bits. */ - shrl %ecx - shrl %eax - - /* Eat a partial register stall here. Saves code stopping - L(cmp_0_3) from bleeding into the next fetch block and saves - an ALU. */ - movb (%rsi, %rdx), %cl - movzbl (%rdi, %rdx), %edi - orl %edi, %eax - subl %ecx, %eax -# endif - ret -#endif - - .p2align 5 -L(more_1x_vec): -#ifndef USE_AS_WMEMCMP - /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store - in ecx for code size. This is preferable to using `incw` as - it avoids partial register stalls on older hardware (pre - SnB). */ - movl $0xffff, %ecx -#endif - movups (%rsi), %xmm0 - movups (%rdi), %xmm1 - PCMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - subl %ecx, %eax - jnz L(ret_nonzero_vec_start_0) -#if SIZE_OFFSET == 0 - cmpq $(CHAR_PER_VEC * 2), %rdx -#else - /* Offset rdx. Saves just enough code size to keep the - L(last_2x_vec) case and the non-zero return in a single - cache line. */ - subq $(CHAR_PER_VEC * 2), %rdx -#endif - ja L(more_2x_vec) - - movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 - movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 - PCMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - subl %ecx, %eax -#ifndef USE_AS_MEMCMPEQ - /* Don't use `incw ax` as machines this code runs on are liable - to have partial register stall. */ - jnz L(ret_nonzero_vec_end_0) -#else - /* Various return targets for memcmpeq. Will always be hot in - Icache and get short encoding. */ -L(ret_nonzero_vec_start_1): -L(ret_nonzero_vec_start_0): -L(ret_nonzero_vec_end_0): -#endif - ret - -#ifndef USE_AS_MEMCMPEQ -# ifdef USE_AS_WMEMCMP - .p2align 4 -L(ret_nonzero_vec_end_0_adj): - addl $3, %edx -# else - .p2align 4,, 8 -# endif -L(ret_nonzero_vec_end_0): - bsfl %eax, %eax -# ifdef USE_AS_WMEMCMP - leal (%rax, %rdx, CHAR_SIZE), %eax - movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx - xorl %edx, %edx - cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx - /* NB: no partial register stall here because xorl zero idiom - above. 
*/ - setg %dl - leal -1(%rdx, %rdx), %eax -# else - addl %edx, %eax - movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx - movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax - subl %ecx, %eax -# endif - ret -# ifndef USE_AS_WMEMCMP - .p2align 4,, 10 -L(ret_nonzero_vec_start_0): - bsfl %eax, %eax - movzbl (%rsi, %rax), %ecx - movzbl (%rdi, %rax), %eax - subl %ecx, %eax - ret -# endif -#else -#endif - - .p2align 5 -L(more_2x_vec): - movups (VEC_SIZE * 1)(%rsi), %xmm0 - movups (VEC_SIZE * 1)(%rdi), %xmm1 - PCMPEQ %xmm0, %xmm1 - pmovmskb %xmm1, %eax - subl %ecx, %eax - jnz L(ret_nonzero_vec_start_1) - - cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx - jbe L(last_2x_vec) - - cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx - ja L(more_8x_vec) - - /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time. - This can harm performance if non-zero return in [65, 80] or - [97, 112] but helps performance otherwise. Generally zero- - return is hotter. */ - movups (VEC_SIZE * 2)(%rsi), %xmm0 - movups (VEC_SIZE * 2)(%rdi), %xmm1 - PCMPEQ %xmm0, %xmm1 - movups (VEC_SIZE * 3)(%rsi), %xmm2 - movups (VEC_SIZE * 3)(%rdi), %xmm3 - PCMPEQ %xmm2, %xmm3 - pand %xmm1, %xmm3 - - pmovmskb %xmm3, %eax - CHECK_CMP (%ecx, %eax) - jnz L(ret_nonzero_vec_start_2_3) - - cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx - jbe L(last_2x_vec) - - movups (VEC_SIZE * 4)(%rsi), %xmm0 - movups (VEC_SIZE * 4)(%rdi), %xmm1 - PCMPEQ %xmm0, %xmm1 - movups (VEC_SIZE * 5)(%rsi), %xmm2 - movups (VEC_SIZE * 5)(%rdi), %xmm3 - PCMPEQ %xmm2, %xmm3 - pand %xmm1, %xmm3 - - pmovmskb %xmm3, %eax - CHECK_CMP (%ecx, %eax) -#ifdef USE_AS_MEMCMPEQ - jz L(last_2x_vec) - ret -#else - jnz L(ret_nonzero_vec_start_4_5) -#endif - .p2align 4 -L(last_2x_vec): - movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 - movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 - PCMPEQ %xmm0, %xmm1 - movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2 - movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3 - PCMPEQ %xmm2, %xmm3 - pand %xmm1, %xmm3 - pmovmskb %xmm3, %eax - subl %ecx, %eax -#ifdef USE_AS_MEMCMPEQ - /* Various return targets for memcmpeq. Will always be hot in - Icache and get short encoding. */ -L(ret_nonzero_vec_start_2_3): -L(ret_nonzero_vec_start_4_5): - ret -#else - jnz L(ret_nonzero_vec_end_1) - ret - - .p2align 4,, 8 -L(ret_nonzero_vec_end_1): - pmovmskb %xmm1, %ecx - /* High 16 bits of eax guranteed to be all ones. Rotate them in - to we can do `or + not` with just `xor`. */ - rorl $16, %eax - xorl %ecx, %eax - /* Partial register stall. */ - - bsfl %eax, %eax -# ifdef USE_AS_WMEMCMP - leal (%rax, %rdx, CHAR_SIZE), %eax - movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx - xorl %edx, %edx - cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx - /* NB: no partial register stall here because xorl zero idiom - above. */ - setg %dl - leal -1(%rdx, %rdx), %eax -# else - addl %edx, %eax - movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx - movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax - subl %ecx, %eax -# endif - ret - - .p2align 4 -L(ret_nonzero_vec_start_4_5): - pmovmskb %xmm1, %edx - sall $16, %eax - leal 1(%rax, %rdx), %eax - bsfl %eax, %eax -# ifdef USE_AS_WMEMCMP - movl (VEC_SIZE * 4)(%rdi, %rax), %ecx - xorl %edx, %edx - cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx - /* NB: no partial register stall here because xorl zero idiom - above. 
*/ - setg %dl - leal -1(%rdx, %rdx), %eax -# else - movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx - movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax - subl %ecx, %eax -# endif - ret - - .p2align 4,, 8 -L(ret_nonzero_vec_start_1): - bsfl %eax, %eax -# ifdef USE_AS_WMEMCMP - movl (VEC_SIZE * 1)(%rdi, %rax), %ecx - xorl %edx, %edx - cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx - /* NB: no partial register stall here because xorl zero idiom - above. */ - setg %dl - leal -1(%rdx, %rdx), %eax -# else - movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx - movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax - subl %ecx, %eax -# endif - ret -#endif - - .p2align 4 -L(more_8x_vec): - subq %rdi, %rsi - leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx - andq $(VEC_SIZE * -1), %rdi - addq %rdi, %rsi - .p2align 4 -L(loop_4x): - movups (VEC_SIZE * 2)(%rsi), %xmm0 - movups (VEC_SIZE * 3)(%rsi), %xmm1 - - PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0 - PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1 - - movups (VEC_SIZE * 4)(%rsi), %xmm2 - movups (VEC_SIZE * 5)(%rsi), %xmm3 - - PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2 - PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3 - - pand %xmm0, %xmm1 - pand %xmm2, %xmm3 - pand %xmm1, %xmm3 - - pmovmskb %xmm3, %eax - subl %ecx, %eax - jnz L(ret_nonzero_loop) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rsi - cmpq %rdi, %rdx - ja L(loop_4x) - /* Get remaining length in edx. */ - subl %edi, %edx - /* Restore offset so we can reuse L(last_2x_vec). */ - addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx -#ifdef USE_AS_WMEMCMP - shrl $2, %edx -#endif - cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx - jbe L(last_2x_vec) - - - movups (VEC_SIZE * 2)(%rsi), %xmm0 - movups (VEC_SIZE * 2)(%rdi), %xmm1 - PCMPEQ %xmm0, %xmm1 - movups (VEC_SIZE * 3)(%rsi), %xmm2 - movups (VEC_SIZE * 3)(%rdi), %xmm3 - PCMPEQ %xmm2, %xmm3 - pand %xmm1, %xmm3 - - pmovmskb %xmm3, %eax - CHECK_CMP (%ecx, %eax) - jz L(last_2x_vec) -#ifdef USE_AS_MEMCMPEQ -L(ret_nonzero_loop): - ret -#else - - .p2align 4 -L(ret_nonzero_vec_start_2_3): - pmovmskb %xmm1, %edx - sall $16, %eax - leal 1(%rax, %rdx), %eax - - bsfl %eax, %eax -# ifdef USE_AS_WMEMCMP - movl (VEC_SIZE * 2)(%rdi, %rax), %ecx - xorl %edx, %edx - cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx - /* NB: no partial register stall here because xorl zero idiom - above. */ - setg %dl - leal -1(%rdx, %rdx), %eax -# else - movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx - movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax - subl %ecx, %eax -# endif - ret - - .p2align 4 -L(ret_nonzero_loop): - pmovmskb %xmm0, %ecx - pmovmskb %xmm1, %edx - sall $(VEC_SIZE * 1), %edx - leal 1(%rcx, %rdx), %edx - pmovmskb %xmm2, %ecx - /* High 16 bits of eax guranteed to be all ones. Rotate them in - to we can do `or + not` with just `xor`. */ - rorl $16, %eax - xorl %ecx, %eax - - salq $32, %rax - orq %rdx, %rax - - bsfq %rax, %rax -# ifdef USE_AS_WMEMCMP - movl (VEC_SIZE * 2)(%rdi, %rax), %ecx - xorl %edx, %edx - cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx - /* NB: no partial register stall here because xorl zero idiom - above. */ - setg %dl - leal -1(%rdx, %rdx), %eax -# else - movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx - movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax - subl %ecx, %eax -# endif - ret -#endif -END(MEMCMP) - -#ifndef USE_AS_WMEMCMP -# ifdef USE_AS_MEMCMPEQ -libc_hidden_def (MEMCMP) -# else -# undef bcmp -weak_alias (MEMCMP, bcmp) -libc_hidden_builtin_def (MEMCMP) -# endif -#endif +libc_hidden_builtin_def(memcmp) +weak_alias (memcmp, bcmp) |
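
As context for the +DEFAULT_IMPL_V* lines added above: the included sysdeps/x86/isa-default-impl.h is expected to pick exactly one of those files according to the ISA level the library is configured for, so the non-multiarch and ld.so builds get a single statically chosen memcmp (SSE2, AVX2-movbe, or EVEX-movbe) instead of carrying the old standalone SSE2 body removed here. A rough, hypothetical sketch of that selection logic follows; the real header may differ in detail, and MINIMUM_X86_ISA_LEVEL is the build-time ISA-level macro assumed for illustration:

    /* Hypothetical sketch, NOT the actual glibc header: include one
       default implementation chosen by the build-time ISA level.  */
    #include <isa-level.h>

    #if MINIMUM_X86_ISA_LEVEL >= 4
    # include DEFAULT_IMPL_V4     /* EVEX variant.  */
    #elif MINIMUM_X86_ISA_LEVEL == 3
    # include DEFAULT_IMPL_V3     /* AVX2 variant.  */
    #else
    # include DEFAULT_IMPL_V1     /* SSE2 baseline.  */
    #endif

Under that reading, the 558 deleted lines are simply the former SSE2 implementation, which now lives only under multiarch/, while this file becomes a thin hook that defines MEMCMP, names the per-ISA default files, and re-exports the memcmp/bcmp symbols.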