aboutsummaryrefslogtreecommitdiff
path: root/sysdeps
diff options
context:
space:
mode:
authorNoah Goldstein <goldstein.w.n@gmail.com>2021-12-24 18:54:53 -0600
committerNoah Goldstein <goldstein.w.n@gmail.com>2021-12-27 03:18:58 -0600
commitcca457f9c51a90cf82cae75432ed3de20942519c (patch)
tree84222287827e96165605200016965b48ef0d5928 /sysdeps
parentabddd61de090ae84e380aff68a98bd94ef704667 (diff)
downloadglibc-cca457f9c51a90cf82cae75432ed3de20942519c.zip
glibc-cca457f9c51a90cf82cae75432ed3de20942519c.tar.gz
glibc-cca457f9c51a90cf82cae75432ed3de20942519c.tar.bz2
x86: Optimize L(less_vec) case in memcmpeq-evex.S
No bug. Optimizations are twofold. 1) Replace page cross and 0/1 checks with masked load instructions in L(less_vec). In applications this reduces branch-misses in the hot [0, 32] case. 2) Change control flow so that L(less_vec) case gets the fall through. Change 2) helps copies in the [0, 32] size range but comes at the cost of copies in the [33, 64] size range. From profiles of GCC and Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this appears to be the right tradeoff. Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Diffstat (limited to 'sysdeps')
-rw-r--r--sysdeps/x86_64/multiarch/memcmpeq-evex.S170
1 files changed, 43 insertions, 127 deletions
diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
index f27e732..b5e1edb 100644
--- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
+++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
@@ -39,6 +39,7 @@
# define MEMCMPEQ __memcmpeq_evex
# endif
+# define VMOVU_MASK vmovdqu8
# define VMOVU vmovdqu64
# define VPCMP vpcmpub
# define VPTEST vptestmb
@@ -62,12 +63,39 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
movl %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
- jb L(less_vec)
+ /* Fall through for [0, VEC_SIZE] as it's the hottest. */
+ ja L(more_1x_vec)
+
+ /* Create mask of bytes that are guaranteed to be valid because
+ of length (edx). Using masked movs allows us to skip checks for
+ page crosses/zero size. */
+ movl $-1, %ecx
+ bzhil %edx, %ecx, %ecx
+ kmovd %ecx, %k2
+
+ /* Use masked loads as VEC_SIZE could page cross where length
+ (edx) would not. */
+ VMOVU_MASK (%rsi), %YMM2{%k2}
+ VPCMP $4,(%rdi), %YMM2, %k1{%k2}
+ kmovd %k1, %eax
+ ret
- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+
+L(last_1x_vec):
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
+ VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
+ kmovd %k1, %eax
+L(return_neq0):
+ ret
+
+
+
+ .p2align 4
+L(more_1x_vec):
+ /* From VEC + 1 to 2 * VEC. */
VMOVU (%rsi), %YMM1
/* Use compare not equals to directly check for mismatch. */
- VPCMP $4, (%rdi), %YMM1, %k1
+ VPCMP $4,(%rdi), %YMM1, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_neq0)
@@ -88,13 +116,13 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
/* Check third and fourth VEC no matter what. */
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+ VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_neq0)
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+ VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_neq0)
@@ -132,66 +160,6 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
/* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
VPTEST %YMM4, %YMM4, %k1
kmovd %k1, %eax
-L(return_neq0):
- ret
-
- /* Fits in padding needed to .p2align 5 L(less_vec). */
-L(last_1x_vec):
- VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
- VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
- kmovd %k1, %eax
- ret
-
- /* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32
- byte aligned. */
- .p2align 5
-L(less_vec):
- /* Check if one or less char. This is necessary for size = 0 but
- is also faster for size = 1. */
- cmpl $1, %edx
- jbe L(one_or_less)
-
- /* Check if loading one VEC from either s1 or s2 could cause a
- page cross. This can have false positives but is by far the
- fastest method. */
- movl %edi, %eax
- orl %esi, %eax
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- jg L(page_cross_less_vec)
-
- /* No page cross possible. */
- VMOVU (%rsi), %YMM2
- VPCMP $4, (%rdi), %YMM2, %k1
- kmovd %k1, %eax
- /* Result will be zero if s1 and s2 match. Otherwise first set
- bit will be first mismatch. */
- bzhil %edx, %eax, %eax
- ret
-
- /* Relatively cold but placing close to L(less_vec) for 2 byte
- jump encoding. */
- .p2align 4
-L(one_or_less):
- jb L(zero)
- movzbl (%rsi), %ecx
- movzbl (%rdi), %eax
- subl %ecx, %eax
- /* No ymm register was touched. */
- ret
- /* Within the same 16 byte block is L(one_or_less). */
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
-L(last_2x_vec):
- VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
- vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
- VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
- vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
- VPTEST %YMM2, %YMM2, %k1
- kmovd %k1, %eax
ret
.p2align 4
@@ -211,7 +179,7 @@ L(loop_4x_vec):
vpxorq (%rdi), %YMM1, %YMM1
VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
- vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2
+ vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
@@ -238,7 +206,7 @@ L(loop_4x_vec):
VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
oring with YMM4. Result is stored in YMM4. */
- vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
+ vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
cmpl $(VEC_SIZE * 2), %edi
jae L(8x_last_2x_vec)
@@ -256,68 +224,16 @@ L(8x_last_2x_vec):
L(return_neq2):
ret
- /* Relatively cold case as page cross are unexpected. */
- .p2align 4
-L(page_cross_less_vec):
- cmpl $16, %edx
- jae L(between_16_31)
- cmpl $8, %edx
- ja L(between_9_15)
- cmpl $4, %edx
- jb L(between_2_3)
- /* From 4 to 8 bytes. No branch when size == 4. */
- movl (%rdi), %eax
- subl (%rsi), %eax
- movl -4(%rdi, %rdx), %ecx
- movl -4(%rsi, %rdx), %edi
- subl %edi, %ecx
- orl %ecx, %eax
- ret
-
- .p2align 4,, 8
-L(between_16_31):
- /* From 16 to 31 bytes. No branch when size == 16. */
-
- /* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe.
- */
- vmovdqu (%rsi), %xmm1
- vpcmpeqb (%rdi), %xmm1, %xmm1
- vmovdqu -16(%rsi, %rdx), %xmm2
- vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2
- vpand %xmm1, %xmm2, %xmm2
- vpmovmskb %xmm2, %eax
- notw %ax
- /* No ymm register was touched. */
- ret
-
.p2align 4,, 8
-L(between_9_15):
- /* From 9 to 15 bytes. */
- movq (%rdi), %rax
- subq (%rsi), %rax
- movq -8(%rdi, %rdx), %rcx
- movq -8(%rsi, %rdx), %rdi
- subq %rdi, %rcx
- orq %rcx, %rax
- /* edx is guranteed to be a non-zero int. */
- cmovnz %edx, %eax
- ret
-
- /* Don't align. This is cold and aligning here will cause code
- to spill into next cache line. */
-L(between_2_3):
- /* From 2 to 3 bytes. No branch when size == 2. */
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- subl %ecx, %eax
- movzbl -1(%rdi, %rdx), %ecx
- /* All machines that support evex will insert a "merging uop"
- avoiding any serious partial register stalls. */
- subb -1(%rsi, %rdx), %cl
- orl %ecx, %eax
- /* No ymm register was touched. */
+L(last_2x_vec):
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
+ vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
+ vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
+ VPTEST %YMM2, %YMM2, %k1
+ kmovd %k1, %eax
ret
- /* 4 Bytes from next cache line. */
+ /* 1 byte from next cache line. */
END (MEMCMPEQ)
#endif