author    H.J. Lu <hjl.tools@gmail.com>	2017-06-27 07:55:00 -0700
committer H.J. Lu <hjl.tools@gmail.com>	2017-06-27 07:55:00 -0700
commit    e94c31035739b693c3699b3c4cad0206631fbee7 (patch)
tree      b0666d325944a10e78d4dd6333c19c108c50d74c
parent    6980be7cbf4f108a4936ac64242f58340d56c806 (diff)
x86-64: Optimize memcmp-avx2-movbe.S for short difference

Check the first 32 bytes before checking size when size >= 32 bytes
to avoid an unnecessary branch when the difference is in the first
32 bytes.  Replace vpmovmskb/subl/jnz with vptest/jnc.

On Haswell, the new version is as fast as the previous one.  On
Skylake, the new version is a little bit faster.

	* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S (MEMCMP): Check
	the first 32 bytes before checking size when size >= 32 bytes.
	Replace vpmovmskb/subl/jnz with vptest/jnc.
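
For readers skimming the patch, here is the idiom swap in isolation,
as a reading aid rather than part of the change.  Both snippets are
lifted from the hunks below: %ymm5 holds the byte-wise AND of the
VPCMPEQ results, %ymm0 is the all-ones register set up by
"VPCMPEQ %ymm0, %ymm0, %ymm0", and VEC_MASK is this file's 32-bit
all-ones mask for a full 32-byte vector.

	/* Old check: extract one bit per byte into a GPR and compare
	   against the all-ones mask; a non-zero result means some
	   byte differed.  */
	vpmovmskb %ymm5, %eax
	subl	$VEC_MASK, %eax
	jnz	L(4x_vec_end)

	/* New check: with %ymm0 all ones, vptest sets CF only when
	   every bit of %ymm5 is set, so jnc branches as soon as any
	   byte differs, without moving a mask through a GPR.  */
	vptest	%ymm0, %ymm5
	jnc	L(4x_vec_end)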
-rw-r--r--  ChangeLog                                    |   6
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 118
2 files changed, 68 insertions(+), 56 deletions(-)
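
As a further reading aid for the first hunk, the new entry sequence
for sizes >= VEC_SIZE reads as follows when the added lines are
stitched together (VEC_SIZE is 32 bytes here; all labels already
exist in memcmp-avx2-movbe.S, and the comments are annotations, not
patch text):

	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)		/* size < 32: small-size paths.  */

	/* Compare the first VEC unconditionally, so a difference in
	   the first 32 bytes goes straight to L(first_vec) with no
	   further size test.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl	$VEC_MASK, %eax
	jnz	L(first_vec)

	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_vec)		/* VEC_SIZE <= size <= 2 * VEC_SIZE.  */

	VPCMPEQ	%ymm0, %ymm0, %ymm0	/* All ones, for the vptest checks.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Fall through: 4 * VEC to 8 * VEC, handled inline in the hunk.  */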
diff --git a/ChangeLog b/ChangeLog
index 48821c0..88dde2b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2017-06-27 H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S (MEMCMP): Check
+ the first 32 bytes before checking size when size >= 32 bytes.
+ Replace vpmovmskb/subl/jnz with vptest/jnc.
+
2017-06-27 Stefan Liebler <stli@linux.vnet.ibm.com>
* sysdeps/s390/s390-32/tls-macros.h (TLS_IE): Use r12 for GOT address.
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index abcc61c..16f4630 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -62,9 +62,68 @@ ENTRY (MEMCMP)
# endif
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
+
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
cmpq $(VEC_SIZE * 2), %rdx
- ja L(more_2x_vec)
+ jbe L(last_vec)
+
+ VPCMPEQ %ymm0, %ymm0, %ymm0
+ /* More than 2 * VEC. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ ja L(more_8x_vec)
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(last_4x_vec)
+
+ /* From 4 * VEC to 8 * VEC, inclusively. */
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+
+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+
+ vpand %ymm1, %ymm2, %ymm5
+ vpand %ymm3, %ymm4, %ymm6
+ vpand %ymm5, %ymm6, %ymm5
+
+ vptest %ymm0, %ymm5
+ jnc L(4x_vec_end)
+
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+ vpand %ymm2, %ymm1, %ymm5
+
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+ vpand %ymm3, %ymm5, %ymm5
+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+ vpand %ymm4, %ymm5, %ymm5
+
+ vptest %ymm0, %ymm5
+ jnc L(4x_vec_end)
+ xorl %eax, %eax
+ VZEROUPPER
+ ret
+
+ .p2align 4
L(last_2x_vec):
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
vmovdqu (%rsi), %ymm2
@@ -219,58 +278,6 @@ L(between_16_31):
ret
.p2align 4
-L(more_2x_vec):
- /* More than 2 * VEC. */
- cmpq $(VEC_SIZE * 8), %rdx
- ja L(more_8x_vec)
- cmpq $(VEC_SIZE * 4), %rdx
- jb L(last_4x_vec)
-
- /* From 4 * VEC to 8 * VEC, inclusively. */
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
-
- vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
-
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
-
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
-
- vpand %ymm1, %ymm2, %ymm5
- vpand %ymm3, %ymm4, %ymm6
- vpand %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- subl $VEC_MASK, %eax
- jnz L(4x_vec_end)
-
- leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
- leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
-
- vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
- vpand %ymm2, %ymm1, %ymm5
-
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
- vpand %ymm3, %ymm5, %ymm5
-
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
- vpand %ymm4, %ymm5, %ymm5
-
- vpmovmskb %ymm5, %eax
- subl $VEC_MASK, %eax
- jnz L(4x_vec_end)
- VZEROUPPER
- ret
-
- .p2align 4
L(more_8x_vec):
/* More than 8 * VEC. Check the first VEC. */
vmovdqu (%rsi), %ymm2
@@ -309,9 +316,8 @@ L(loop_4x_vec):
VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
vpand %ymm4, %ymm5, %ymm5
- vpmovmskb %ymm5, %eax
- subl $VEC_MASK, %eax
- jnz L(4x_vec_end)
+ vptest %ymm0, %ymm5
+ jnc L(4x_vec_end)
addq $(VEC_SIZE * 4), %rdi
addq $(VEC_SIZE * 4), %rsi