From be13f7bff66e1850f9057dd813d6e7be022d9516 Mon Sep 17 00:00:00 2001 From: Liubov Dmitrieva Date: Sat, 15 Oct 2011 11:10:08 -0400 Subject: Optimized memcmp and wmemcmp for x86-64 and x86-32 --- sysdeps/i386/i686/multiarch/Makefile | 3 +- sysdeps/i386/i686/multiarch/memcmp-sse4.S | 396 ++++++++++++++----- sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 565 ++++++++++++++++++---------- sysdeps/i386/i686/multiarch/wmemcmp-c.c | 5 + sysdeps/i386/i686/multiarch/wmemcmp-sse4.S | 4 + sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S | 4 + sysdeps/i386/i686/multiarch/wmemcmp.S | 59 +++ 7 files changed, 754 insertions(+), 282 deletions(-) create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-c.c create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-sse4.S create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp.S (limited to 'sysdeps/i386/i686') diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 8a4c219..98d1ad6 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -17,7 +17,8 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \ wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \ memrchr-sse2 memrchr-sse2-bsf memrchr-c \ - rawmemchr-sse2 rawmemchr-sse2-bsf + rawmemchr-sse2 rawmemchr-sse2-bsf \ + wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S index b1ed778..1f5dbc1 100644 --- a/sysdeps/i386/i686/multiarch/memcmp-sse4.S +++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S @@ -1,5 +1,5 @@ -/* memcmp with SSE4.2 - Copyright (C) 2010 Free Software Foundation, Inc. +/* memcmp with SSE4.2, wmemcmp with SSE4.2 + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -20,84 +20,97 @@ #ifndef NOT_IN_libc -#include -#include "asm-syntax.h" +# include -#ifndef MEMCMP -# define MEMCMP __memcmp_sse4_2 -#endif +# ifndef MEMCMP +# define MEMCMP __memcmp_sse4_2 +# endif -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) -#define PARMS 4 -#define BLK1 PARMS -#define BLK2 BLK1+4 -#define LEN BLK2+4 -#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1 + 4 +# define LEN BLK2 + 4 +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) -#ifdef SHARED -# define JMPTBL(I, B) I - B +# ifdef SHARED +# define JMPTBL(I, B) I - B /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - /* We first load PC into EBX. */ \ - call __i686.get_pc_thunk.bx; \ - /* Get the address of the jump table. */ \ - addl $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table and adjuested EDX/ESI. Go. */ \ - jmp *%ebx - - .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits - .globl __i686.get_pc_thunk.bx - .hidden __i686.get_pc_thunk.bx - ALIGN (4) - .type __i686.get_pc_thunk.bx,@function -__i686.get_pc_thunk.bx: - movl (%esp), %ebx - ret -#else -# define JMPTBL(I, B) I + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +/* We first load PC into EBX. */ \ + call __i686.get_pc_thunk.bx; \ +/* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ +/* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ +/* We loaded the jump table and adjuested EDX/ESI. Go. */ \ + jmp *%ebx +# else +# define JMPTBL(I, B) I /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) -#endif + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ .section .text.sse4.2,"ax",@progbits ENTRY (MEMCMP) movl BLK1(%esp), %eax movl BLK2(%esp), %edx movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(return0) +# else cmp $1, %ecx jbe L(less1bytes) +# endif + pxor %xmm0, %xmm0 cmp $64, %ecx ja L(64bytesormore) cmp $8, %ecx - PUSH (%ebx) + +# ifndef USE_AS_WMEMCMP + PUSH (%ebx) + jb L(less8bytes) +# else jb L(less8bytes) + PUSH (%ebx) +# endif + add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less8bytes): mov (%eax), %bl cmpb (%edx), %bl @@ -141,22 +154,49 @@ L(less8bytes): mov 6(%eax), %bl cmpb 6(%edx), %bl je L(0bytes) + L(nonzero): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(above) neg %eax L(above): ret CFI_PUSH (%ebx) +# endif - ALIGN (4) + .p2align 4 L(0bytes): - POP (%ebx) + POP (%ebx) xor %eax, %eax ret - ALIGN (4) +# ifdef USE_AS_WMEMCMP + +/* for wmemcmp, case N == 1 */ + + .p2align 4 +L(less8bytes): + mov (%eax), %ecx + cmp (%edx), %ecx + je L(return0) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + + .p2align 4 +L(return0): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less1bytes): jb L(0bytesend) movzbl (%eax), %eax @@ -164,14 +204,14 @@ L(less1bytes): sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(0bytesend): xor %eax, %eax ret - - ALIGN (4) +# endif + .p2align 4 L(64bytesormore): - PUSH (%ebx) + PUSH (%ebx) mov %ecx, %ebx mov $64, %ecx sub $64, %ebx @@ -208,7 +248,14 @@ L(64bytesormore_loop): add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) - ALIGN (4) +# ifdef USE_AS_WMEMCMP + +/* Label needs only for table_64bytes filling */ +L(unreal_case): +/* no code here */ + +# endif + .p2align 4 L(find_16diff): sub $16, %ecx L(find_32diff): @@ -218,9 +265,9 @@ L(find_48diff): L(find_64diff): add %ecx, %edx add %ecx, %eax - jmp L(16bytes) - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(16bytes): mov -16(%eax), %ecx mov -16(%edx), %ebx @@ -243,8 +290,30 @@ L(4bytes): mov $0, %eax jne L(find_diff) RETURN +# else + .p2align 4 +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + cmp -4(%edx), %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# endif - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(49bytes): movdqu -49(%eax), %xmm1 movdqu -49(%edx), %xmm2 @@ -285,7 +354,7 @@ L(5bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(50bytes): mov $-50, %ebx movdqu -50(%eax), %xmm1 @@ -330,7 +399,7 @@ L(2bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(51bytes): mov $-51, %ebx movdqu -51(%eax), %xmm1 @@ -378,8 +447,8 @@ L(1bytes): mov $0, %eax jne L(end) RETURN - - ALIGN (4) +# endif + .p2align 4 L(52bytes): movdqu -52(%eax), %xmm1 movdqu -52(%edx), %xmm2 @@ -402,13 +471,18 @@ L(20bytes): ptest %xmm2, %xmm0 jnc L(less16bytes) mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(53bytes): movdqu -53(%eax), %xmm1 movdqu -53(%edx), %xmm2 @@ -440,7 +514,7 @@ L(21bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(54bytes): movdqu -54(%eax), %xmm1 movdqu -54(%edx), %xmm2 @@ -476,7 +550,7 @@ L(22bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(55bytes): movdqu -55(%eax), %xmm1 movdqu -55(%edx), %xmm2 @@ -513,8 +587,8 @@ L(23bytes): mov $0, %eax jne L(end) RETURN - - ALIGN (4) +# endif + .p2align 4 L(56bytes): movdqu -56(%eax), %xmm1 movdqu -56(%edx), %xmm2 @@ -538,18 +612,27 @@ L(24bytes): jnc L(less16bytes) mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -8(%edx), %ebx cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif jne L(find_diff) mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(57bytes): movdqu -57(%eax), %xmm1 movdqu -57(%edx), %xmm2 @@ -585,7 +668,7 @@ L(25bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(58bytes): movdqu -58(%eax), %xmm1 movdqu -58(%edx), %xmm2 @@ -627,7 +710,7 @@ L(26bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(59bytes): movdqu -59(%eax), %xmm1 movdqu -59(%edx), %xmm2 @@ -668,8 +751,8 @@ L(27bytes): mov $0, %eax jne L(end) RETURN - - ALIGN (4) +# endif + .p2align 4 L(60bytes): movdqu -60(%eax), %xmm1 movdqu -60(%edx), %xmm2 @@ -691,22 +774,38 @@ L(28bytes): pxor %xmm1, %xmm2 ptest %xmm2, %xmm0 jnc L(less16bytes) + mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -12(%edx), %ebx cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif jne L(find_diff) + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -8(%edx), %ebx cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif jne L(find_diff) + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(61bytes): movdqu -61(%eax), %xmm1 movdqu -61(%edx), %xmm2 @@ -749,7 +848,7 @@ L(29bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(62bytes): movdqu -62(%eax), %xmm1 movdqu -62(%edx), %xmm2 @@ -792,7 +891,7 @@ L(30bytes): jne L(end) RETURN - ALIGN (4) + .p2align 4 L(63bytes): movdqu -63(%eax), %xmm1 movdqu -63(%edx), %xmm2 @@ -838,8 +937,9 @@ L(31bytes): mov $0, %eax jne L(end) RETURN +# endif - ALIGN (4) + .p2align 4 L(64bytes): movdqu -64(%eax), %xmm1 movdqu -64(%edx), %xmm2 @@ -863,28 +963,45 @@ L(32bytes): jnc L(less16bytes) mov -16(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -16(%edx), %ebx cmp %ebx, %ecx +# else + cmp -16(%edx), %ecx +# endif jne L(find_diff) mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -12(%edx), %ebx cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif jne L(find_diff) mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -8(%edx), %ebx cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif jne L(find_diff) mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP mov -4(%edx), %ebx cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif mov $0, %eax jne L(find_diff) RETURN - ALIGN (4) +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less16bytes): add %ebx, %eax add %ebx, %edx @@ -910,9 +1027,35 @@ L(less16bytes): mov $0, %eax jne L(find_diff) RETURN +# else + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + cmp (%edx), %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + cmp 4(%edx), %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + cmp 8(%edx), %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + cmp 12(%edx), %ecx + + mov $0, %eax + jne L(find_diff) + RETURN +# endif - ALIGN (4) + .p2align 4 L(find_diff): +# ifndef USE_AS_WMEMCMP cmpb %bl, %cl jne L(end) cmp %bx, %cx @@ -923,17 +1066,29 @@ L(find_diff): jne L(end) cmp %bx, %cx L(end): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(bigger) neg %eax L(bigger): ret +# else + POP (%ebx) + mov $1, %eax + jg L(bigger) + neg %eax + ret + + .p2align 4 +L(bigger): + ret +# endif END (MEMCMP) .section .rodata.sse4.2,"a",@progbits - ALIGN (2) + .p2align 2 .type L(table_64bytes), @object +# ifndef USE_AS_WMEMCMP L(table_64bytes): .int JMPTBL (L(0bytes), L(table_64bytes)) .int JMPTBL (L(1bytes), L(table_64bytes)) @@ -1000,5 +1155,72 @@ L(table_64bytes): .int JMPTBL (L(62bytes), L(table_64bytes)) .int JMPTBL (L(63bytes), L(table_64bytes)) .int JMPTBL (L(64bytes), L(table_64bytes)) - .size L(table_64bytes), .-L(table_64bytes) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +# endif #endif diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S index 2e0d15f..eab85c1 100644 --- a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S +++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S @@ -1,5 +1,5 @@ -/* memcmp with SSSE3 - Copyright (C) 2010 Free Software Foundation, Inc. +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2010, 2011 Free Software Foundation, Inc. Contributed by Intel Corporation. This file is part of the GNU C Library. @@ -20,47 +20,64 @@ #ifndef NOT_IN_libc -#include -#include "asm-syntax.h" +# include -#ifndef MEMCMP -# define MEMCMP __memcmp_ssse3 -#endif +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1+4 +# define LEN BLK2+4 +# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret +# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state -#define PARMS 4 -#define BLK1 PARMS -#define BLK2 BLK1+4 -#define LEN BLK2+4 -#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret -#define RETURN RETURN_END; cfi_restore_state; cfi_remember_state +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ - .section .text.ssse3,"ax",@progbits + atom_text_section ENTRY (MEMCMP) movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(zero) +# endif + movl BLK1(%esp), %eax cmp $48, %ecx movl BLK2(%esp), %edx jae L(48bytesormore) + +# ifndef USE_AS_WMEMCMP cmp $1, %ecx jbe L(less1bytes) - PUSH (%ebx) +# endif + + PUSH (%ebx) add %ecx, %edx add %ecx, %eax jmp L(less48bytes) - ALIGN (4) - CFI_POP (%ebx) + CFI_POP (%ebx) + +# ifndef USE_AS_WMEMCMP + .p2align 4 L(less1bytes): jb L(zero) movb (%eax), %cl @@ -71,29 +88,30 @@ L(less1bytes): neg %eax L(1bytesend): ret +# endif - ALIGN (4) + .p2align 4 L(zero): - mov $0, %eax + xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(48bytesormore): - PUSH (%ebx) - PUSH (%esi) - PUSH (%edi) + PUSH (%ebx) + PUSH (%esi) + PUSH (%edi) cfi_remember_state - movdqu (%eax), %xmm3 - movdqu (%edx), %xmm0 + movdqu (%eax), %xmm3 + movdqu (%edx), %xmm0 movl %eax, %edi movl %edx, %esi - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx lea 16(%edi), %edi - sub $0xffff, %edx + sub $0xffff, %edx lea 16(%esi), %esi - jnz L(less16bytes) + jnz L(less16bytes) mov %edi, %edx and $0xf, %edx xor %edx, %edi @@ -104,6 +122,7 @@ L(48bytesormore): jz L(shr_0) xor %edx, %esi +# ifndef USE_AS_WMEMCMP cmp $8, %edx jae L(next_unaligned_table) cmp $0, %edx @@ -122,7 +141,7 @@ L(48bytesormore): je L(shr_6) jmp L(shr_7) - ALIGN (4) + .p2align 2 L(next_unaligned_table): cmp $8, %edx je L(shr_8) @@ -139,8 +158,17 @@ L(next_unaligned_table): cmp $14, %edx je L(shr_14) jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif - ALIGN (4) + .p2align 4 L(shr_0): cmp $80, %ecx jae L(shr_0_gobble) @@ -159,13 +187,13 @@ L(shr_0): lea (%ecx, %edi,1), %eax lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_0_gobble): lea -48(%ecx), %ecx movdqa (%esi), %xmm0 @@ -205,13 +233,14 @@ L(shr_0_gobble_loop_next): jnz L(exit) lea (%ecx, %edi,1), %eax lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_1): cmp $80, %ecx lea -48(%ecx), %ecx @@ -235,13 +264,13 @@ L(shr_1): jnz L(exit) lea (%ecx, %edi,1), %eax lea 1(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_1_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -288,14 +317,14 @@ L(shr_1_gobble_next): lea (%ecx, %edi,1), %eax lea 1(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_2): cmp $80, %ecx lea -48(%ecx), %ecx @@ -319,13 +348,13 @@ L(shr_2): jnz L(exit) lea (%ecx, %edi,1), %eax lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_2_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -372,13 +401,13 @@ L(shr_2_gobble_next): lea (%ecx, %edi,1), %eax lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_3): cmp $80, %ecx lea -48(%ecx), %ecx @@ -402,13 +431,13 @@ L(shr_3): jnz L(exit) lea (%ecx, %edi,1), %eax lea 3(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_3_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -455,13 +484,14 @@ L(shr_3_gobble_next): lea (%ecx, %edi,1), %eax lea 3(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_4): cmp $80, %ecx lea -48(%ecx), %ecx @@ -485,13 +515,13 @@ L(shr_4): jnz L(exit) lea (%ecx, %edi,1), %eax lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_4_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -538,13 +568,14 @@ L(shr_4_gobble_next): lea (%ecx, %edi,1), %eax lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_5): cmp $80, %ecx lea -48(%ecx), %ecx @@ -568,13 +599,13 @@ L(shr_5): jnz L(exit) lea (%ecx, %edi,1), %eax lea 5(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_5_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -621,13 +652,13 @@ L(shr_5_gobble_next): lea (%ecx, %edi,1), %eax lea 5(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_6): cmp $80, %ecx lea -48(%ecx), %ecx @@ -651,13 +682,13 @@ L(shr_6): jnz L(exit) lea (%ecx, %edi,1), %eax lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_6_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -704,13 +735,13 @@ L(shr_6_gobble_next): lea (%ecx, %edi,1), %eax lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_7): cmp $80, %ecx lea -48(%ecx), %ecx @@ -734,13 +765,13 @@ L(shr_7): jnz L(exit) lea (%ecx, %edi,1), %eax lea 7(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_7_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -787,13 +818,14 @@ L(shr_7_gobble_next): lea (%ecx, %edi,1), %eax lea 7(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_8): cmp $80, %ecx lea -48(%ecx), %ecx @@ -817,13 +849,13 @@ L(shr_8): jnz L(exit) lea (%ecx, %edi,1), %eax lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_8_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -870,13 +902,14 @@ L(shr_8_gobble_next): lea (%ecx, %edi,1), %eax lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_9): cmp $80, %ecx lea -48(%ecx), %ecx @@ -900,13 +933,13 @@ L(shr_9): jnz L(exit) lea (%ecx, %edi,1), %eax lea 9(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_9_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -953,13 +986,13 @@ L(shr_9_gobble_next): lea (%ecx, %edi,1), %eax lea 9(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_10): cmp $80, %ecx lea -48(%ecx), %ecx @@ -983,13 +1016,13 @@ L(shr_10): jnz L(exit) lea (%ecx, %edi,1), %eax lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_10_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1036,13 +1069,13 @@ L(shr_10_gobble_next): lea (%ecx, %edi,1), %eax lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_11): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1066,13 +1099,13 @@ L(shr_11): jnz L(exit) lea (%ecx, %edi,1), %eax lea 11(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_11_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1119,13 +1152,14 @@ L(shr_11_gobble_next): lea (%ecx, %edi,1), %eax lea 11(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_12): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1149,13 +1183,13 @@ L(shr_12): jnz L(exit) lea (%ecx, %edi,1), %eax lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_12_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1202,13 +1236,14 @@ L(shr_12_gobble_next): lea (%ecx, %edi,1), %eax lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# ifndef USE_AS_WMEMCMP cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_13): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1232,13 +1267,13 @@ L(shr_13): jnz L(exit) lea (%ecx, %edi,1), %eax lea 13(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_13_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1285,13 +1320,13 @@ L(shr_13_gobble_next): lea (%ecx, %edi,1), %eax lea 13(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_14): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1315,13 +1350,13 @@ L(shr_14): jnz L(exit) lea (%ecx, %edi,1), %eax lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_14_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1368,13 +1403,13 @@ L(shr_14_gobble_next): lea (%ecx, %edi,1), %eax lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_15): cmp $80, %ecx lea -48(%ecx), %ecx @@ -1398,13 +1433,13 @@ L(shr_15): jnz L(exit) lea (%ecx, %edi,1), %eax lea 15(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(shr_15_gobble): sub $32, %ecx movdqa 16(%esi), %xmm0 @@ -1451,13 +1486,14 @@ L(shr_15_gobble_next): lea (%ecx, %edi,1), %eax lea 15(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) + POP (%edi) + POP (%esi) jmp L(less48bytes) +# endif cfi_restore_state cfi_remember_state - ALIGN (4) + .p2align 4 L(exit): pmovmskb %xmm1, %ebx sub $0xffff, %ebx @@ -1465,9 +1501,12 @@ L(exit): lea -16(%esi), %esi lea -16(%edi), %edi mov %ebx, %edx + L(first16bytes): add %eax, %esi L(less16bytes): + +# ifndef USE_AS_WMEMCMP test %dl, %dl jz L(next_24_bytes) @@ -1492,61 +1531,61 @@ L(less16bytes): test $0x40, %dl jnz L(Byte22) L(Byte23): - movzbl -9(%edi), %eax - movzbl -9(%esi), %edx + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte16): - movzbl -16(%edi), %eax - movzbl -16(%esi), %edx + movzbl -16(%edi), %eax + movzbl -16(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte17): - movzbl -15(%edi), %eax - movzbl -15(%esi), %edx + movzbl -15(%edi), %eax + movzbl -15(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte18): - movzbl -14(%edi), %eax - movzbl -14(%esi), %edx + movzbl -14(%edi), %eax + movzbl -14(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte19): - movzbl -13(%edi), %eax - movzbl -13(%esi), %edx + movzbl -13(%edi), %eax + movzbl -13(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte20): - movzbl -12(%edi), %eax - movzbl -12(%esi), %edx + movzbl -12(%edi), %eax + movzbl -12(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte21): - movzbl -11(%edi), %eax - movzbl -11(%esi), %edx + movzbl -11(%edi), %eax + movzbl -11(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(Byte22): - movzbl -10(%edi), %eax - movzbl -10(%esi), %edx + movzbl -10(%edi), %eax + movzbl -10(%esi), %edx sub %edx, %eax RETURN - ALIGN (4) + .p2align 4 L(next_24_bytes): lea 8(%edi), %edi lea 8(%esi), %esi @@ -1571,20 +1610,69 @@ L(next_24_bytes): test $0x40, %dh jnz L(Byte22) - ALIGN (4) + .p2align 4 L(Byte31): - movzbl -9(%edi), %eax - movzbl -9(%esi), %edx + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx sub %edx, %eax RETURN_END +# else + +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%edi), %eax + cmp -16(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word): + mov -12(%edi), %eax + cmp -12(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%edi), %eax + cmp -8(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word): + mov -4(%edi), %eax + cmp -4(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(nequal_bigger) + neg %eax + RETURN + + .p2align 4 +L(nequal_bigger): + RETURN_END +# endif CFI_PUSH (%ebx) - ALIGN (4) + + .p2align 4 L(more8bytes): cmp $16, %ecx jae L(more16bytes) cmp $8, %ecx je L(8bytes) +# ifndef USE_AS_WMEMCMP cmp $9, %ecx je L(9bytes) cmp $10, %ecx @@ -1598,13 +1686,17 @@ L(more8bytes): cmp $14, %ecx je L(14bytes) jmp L(15bytes) +# else + jmp L(12bytes) +# endif - ALIGN (4) + .p2align 4 L(more16bytes): cmp $24, %ecx jae L(more24bytes) cmp $16, %ecx je L(16bytes) +# ifndef USE_AS_WMEMCMP cmp $17, %ecx je L(17bytes) cmp $18, %ecx @@ -1618,13 +1710,17 @@ L(more16bytes): cmp $22, %ecx je L(22bytes) jmp L(23bytes) +# else + jmp L(20bytes) +# endif - ALIGN (4) + .p2align 4 L(more24bytes): cmp $32, %ecx jae L(more32bytes) cmp $24, %ecx je L(24bytes) +# ifndef USE_AS_WMEMCMP cmp $25, %ecx je L(25bytes) cmp $26, %ecx @@ -1638,13 +1734,17 @@ L(more24bytes): cmp $30, %ecx je L(30bytes) jmp L(31bytes) +# else + jmp L(28bytes) +# endif - ALIGN (4) + .p2align 4 L(more32bytes): cmp $40, %ecx jae L(more40bytes) cmp $32, %ecx je L(32bytes) +# ifndef USE_AS_WMEMCMP cmp $33, %ecx je L(33bytes) cmp $34, %ecx @@ -1658,11 +1758,35 @@ L(more32bytes): cmp $38, %ecx je L(38bytes) jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + .p2align 4 +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) +# ifndef USE_AS_WMEMCMP + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif - ALIGN (4) + .p2align 4 L(more40bytes): cmp $40, %ecx je L(40bytes) +# ifndef USE_AS_WMEMCMP cmp $41, %ecx je L(41bytes) cmp $42, %ecx @@ -1677,23 +1801,7 @@ L(more40bytes): je L(46bytes) jmp L(47bytes) - ALIGN (4) -L(less48bytes): - cmp $8, %ecx - jae L(more8bytes) - cmp $2, %ecx - je L(2bytes) - cmp $3, %ecx - je L(3bytes) - cmp $4, %ecx - je L(4bytes) - cmp $5, %ecx - je L(5bytes) - cmp $6, %ecx - je L(6bytes) - jmp L(7bytes) - - ALIGN (4) + .p2align 4 L(44bytes): mov -44(%eax), %ecx mov -44(%edx), %ebx @@ -1750,11 +1858,64 @@ L(4bytes): cmp %ebx, %ecx mov $0, %eax jne L(find_diff) - POP (%ebx) + POP (%ebx) + ret + CFI_PUSH (%ebx) +# else + .p2align 4 +L(44bytes): + mov -44(%eax), %ecx + cmp -44(%edx), %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + cmp -40(%edx), %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + cmp -36(%edx), %ecx + jne L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + cmp -32(%edx), %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + cmp -28(%edx), %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + cmp -24(%edx), %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + cmp -20(%edx), %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + xor %eax, %eax + cmp -4(%edx), %ecx + jne L(find_diff) + POP (%ebx) ret CFI_PUSH (%ebx) +# endif - ALIGN (4) +# ifndef USE_AS_WMEMCMP + + .p2align 4 L(45bytes): mov -45(%eax), %ecx mov -45(%edx), %ebx @@ -1814,11 +1975,11 @@ L(5bytes): cmp -1(%edx), %cl mov $0, %eax jne L(end) - POP (%ebx) + POP (%ebx) ret CFI_PUSH (%ebx) - ALIGN (4) + .p2align 4 L(46bytes): mov -46(%eax), %ecx mov -46(%edx), %ebx @@ -1882,11 +2043,11 @@ L(2bytes): cmp %bh, %ch mov $0, %eax jne L(end) - POP (%ebx) + POP (%ebx) ret CFI_PUSH (%ebx) - ALIGN (4) + .p2align 4 L(47bytes): movl -47(%eax), %ecx movl -47(%edx), %ebx @@ -1953,11 +2114,11 @@ L(3bytes): cmpb -1(%edx), %al mov $0, %eax jne L(end) - POP (%ebx) + POP (%ebx) ret CFI_PUSH (%ebx) - ALIGN (4) + .p2align 4 L(find_diff): cmpb %bl, %cl jne L(end) @@ -1968,14 +2129,30 @@ L(find_diff): cmp %bl, %cl jne L(end) cmp %bx, %cx + + .p2align 4 L(end): - POP (%ebx) + POP (%ebx) mov $1, %eax ja L(bigger) neg %eax L(bigger): ret +# else -END (MEMCMP) +/* for wmemcmp */ + .p2align 4 +L(find_diff): + POP (%ebx) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + .p2align 4 +L(find_diff_bigger): + ret + +# endif +END (MEMCMP) #endif diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/sysdeps/i386/i686/multiarch/wmemcmp-c.c new file mode 100644 index 0000000..94ff615 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wmemcmp-c.c @@ -0,0 +1,5 @@ +#ifndef NOT_IN_libc +# define WMEMCMP __wmemcmp_ia32 +#endif + +#include "wcsmbs/wmemcmp.c" diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S new file mode 100644 index 0000000..1a857c7 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_2 + +#include "memcmp-sse4.S" diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S new file mode 100644 index 0000000..a41ef95 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/wmemcmp.S b/sysdeps/i386/i686/multiarch/wmemcmp.S new file mode 100644 index 0000000..5080c14 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wmemcmp.S @@ -0,0 +1,59 @@ +/* Multiple versions of wmemcmp + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include + +/* Define multiple versions only for the definition in libc. */ + +#ifndef NOT_IN_libc + .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits + .globl __i686.get_pc_thunk.bx + .hidden __i686.get_pc_thunk.bx + .p2align 4 + .type __i686.get_pc_thunk.bx,@function + __i686.get_pc_thunk.bx: + movl (%esp), %ebx + ret + + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + call __i686.get_pc_thunk.bx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx) + jne 1f + call __init_cpu_features +1: leal __wmemcmp_ia32@GOTOFF(%ebx), %eax + testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wmemcmp_ssse3@GOTOFF(%ebx), %eax + testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx) + jz 2f + leal __wmemcmp_sse4_2@GOTOFF(%ebx), %eax +2: popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + ret +END(wmemcmp) +#endif -- cgit v1.1