From cca457f9c51a90cf82cae75432ed3de20942519c Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Fri, 24 Dec 2021 18:54:53 -0600 Subject: x86: Optimize L(less_vec) case in memcmpeq-evex.S No bug. Optimizations are twofold. 1) Replace page cross and 0/1 checks with masked load instructions in L(less_vec). In applications this reduces branch-misses in the hot [0, 32] case. 2) Change control flow so that L(less_vec) case gets the fall through. Change 2) helps copies in the [0, 32] size range but comes at the cost of copies in the [33, 64] size range. From profiles of GCC and Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this appears to be the right tradeoff. Signed-off-by: Noah Goldstein Reviewed-by: H.J. Lu --- sysdeps/x86_64/multiarch/memcmpeq-evex.S | 170 ++++++++----------------------- 1 file changed, 43 insertions(+), 127 deletions(-) (limited to 'sysdeps') diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S index f27e732..b5e1edb 100644 --- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S @@ -39,6 +39,7 @@ # define MEMCMPEQ __memcmpeq_evex # endif +# define VMOVU_MASK vmovdqu8 # define VMOVU vmovdqu64 # define VPCMP vpcmpub # define VPTEST vptestmb @@ -62,12 +63,39 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6) movl %edx, %edx # endif cmp $VEC_SIZE, %RDX_LP - jb L(less_vec) + /* Fall through for [0, VEC_SIZE] as its the hottest. */ + ja L(more_1x_vec) + + /* Create mask of bytes that are guranteed to be valid because + of length (edx). Using masked movs allows us to skip checks for + page crosses/zero size. */ + movl $-1, %ecx + bzhil %edx, %ecx, %ecx + kmovd %ecx, %k2 + + /* Use masked loads as VEC_SIZE could page cross where length + (edx) would not. */ + VMOVU_MASK (%rsi), %YMM2{%k2} + VPCMP $4,(%rdi), %YMM2, %k1{%k2} + kmovd %k1, %eax + ret - /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. 
*/ + +L(last_1x_vec): + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1 + VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1 + kmovd %k1, %eax +L(return_neq0): + ret + + + + .p2align 4 +L(more_1x_vec): + /* From VEC + 1 to 2 * VEC. */ VMOVU (%rsi), %YMM1 /* Use compare not equals to directly check for mismatch. */ - VPCMP $4, (%rdi), %YMM1, %k1 + VPCMP $4,(%rdi), %YMM1, %k1 kmovd %k1, %eax testl %eax, %eax jnz L(return_neq0) @@ -88,13 +116,13 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6) /* Check third and fourth VEC no matter what. */ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 - VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1 + VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1 kmovd %k1, %eax testl %eax, %eax jnz L(return_neq0) VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 - VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1 + VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1 kmovd %k1, %eax testl %eax, %eax jnz L(return_neq0) @@ -132,66 +160,6 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6) /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ VPTEST %YMM4, %YMM4, %k1 kmovd %k1, %eax -L(return_neq0): - ret - - /* Fits in padding needed to .p2align 5 L(less_vec). */ -L(last_1x_vec): - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1 - VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1 - kmovd %k1, %eax - ret - - /* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32 - byte aligned. */ - .p2align 5 -L(less_vec): - /* Check if one or less char. This is necessary for size = 0 but - is also faster for size = 1. */ - cmpl $1, %edx - jbe L(one_or_less) - - /* Check if loading one VEC from either s1 or s2 could cause a - page cross. This can have false positives but is by far the - fastest method. */ - movl %edi, %eax - orl %esi, %eax - andl $(PAGE_SIZE - 1), %eax - cmpl $(PAGE_SIZE - VEC_SIZE), %eax - jg L(page_cross_less_vec) - - /* No page cross possible. */ - VMOVU (%rsi), %YMM2 - VPCMP $4, (%rdi), %YMM2, %k1 - kmovd %k1, %eax - /* Result will be zero if s1 and s2 match. Otherwise first set - bit will be first mismatch. 
*/ - bzhil %edx, %eax, %eax - ret - - /* Relatively cold but placing close to L(less_vec) for 2 byte - jump encoding. */ - .p2align 4 -L(one_or_less): - jb L(zero) - movzbl (%rsi), %ecx - movzbl (%rdi), %eax - subl %ecx, %eax - /* No ymm register was touched. */ - ret - /* Within the same 16 byte block is L(one_or_less). */ -L(zero): - xorl %eax, %eax - ret - - .p2align 4 -L(last_2x_vec): - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1 - vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1 - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2 - vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2 - VPTEST %YMM2, %YMM2, %k1 - kmovd %k1, %eax ret .p2align 4 @@ -211,7 +179,7 @@ L(loop_4x_vec): vpxorq (%rdi), %YMM1, %YMM1 VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 - vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2 + vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2 VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 @@ -238,7 +206,7 @@ L(loop_4x_vec): VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3 /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while oring with YMM4. Result is stored in YMM4. */ - vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4 + vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4 cmpl $(VEC_SIZE * 2), %edi jae L(8x_last_2x_vec) @@ -256,68 +224,16 @@ L(8x_last_2x_vec): L(return_neq2): ret - /* Relatively cold case as page cross are unexpected. */ - .p2align 4 -L(page_cross_less_vec): - cmpl $16, %edx - jae L(between_16_31) - cmpl $8, %edx - ja L(between_9_15) - cmpl $4, %edx - jb L(between_2_3) - /* From 4 to 8 bytes. No branch when size == 4. */ - movl (%rdi), %eax - subl (%rsi), %eax - movl -4(%rdi, %rdx), %ecx - movl -4(%rsi, %rdx), %edi - subl %edi, %ecx - orl %ecx, %eax - ret - - .p2align 4,, 8 -L(between_16_31): - /* From 16 to 31 bytes. No branch when size == 16. */ - - /* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe. 
- */ - vmovdqu (%rsi), %xmm1 - vpcmpeqb (%rdi), %xmm1, %xmm1 - vmovdqu -16(%rsi, %rdx), %xmm2 - vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2 - vpand %xmm1, %xmm2, %xmm2 - vpmovmskb %xmm2, %eax - notw %ax - /* No ymm register was touched. */ - ret - .p2align 4,, 8 -L(between_9_15): - /* From 9 to 15 bytes. */ - movq (%rdi), %rax - subq (%rsi), %rax - movq -8(%rdi, %rdx), %rcx - movq -8(%rsi, %rdx), %rdi - subq %rdi, %rcx - orq %rcx, %rax - /* edx is guranteed to be a non-zero int. */ - cmovnz %edx, %eax - ret - - /* Don't align. This is cold and aligning here will cause code - to spill into next cache line. */ -L(between_2_3): - /* From 2 to 3 bytes. No branch when size == 2. */ - movzwl (%rdi), %eax - movzwl (%rsi), %ecx - subl %ecx, %eax - movzbl -1(%rdi, %rdx), %ecx - /* All machines that support evex will insert a "merging uop" - avoiding any serious partial register stalls. */ - subb -1(%rsi, %rdx), %cl - orl %ecx, %eax - /* No ymm register was touched. */ +L(last_2x_vec): + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1 + vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1 + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2 + vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2 + VPTEST %YMM2, %YMM2, %k1 + kmovd %k1, %eax ret - /* 4 Bytes from next cache line. */ + /* 1 Bytes from next cache line. */ END (MEMCMPEQ) #endif -- cgit v1.1