path: root/sysdeps/x86_64
author     Noah Goldstein <goldstein.w.n@gmail.com>   2022-10-18 17:44:05 -0700
committer  Noah Goldstein <goldstein.w.n@gmail.com>   2022-10-19 17:31:03 -0700
commit     b79f8ff26aa6151d2d2167afcddcd1ec46cfbc81 (patch)
tree       3d956430a93cb6412a7a0a55a67cb4d82ccae86d /sysdeps/x86_64
parent     69717709ec5c2769322678e96a7672d1e270de3a (diff)
x86: Optimize strnlen-evex.S and implement with VMM headers
Optimizations are:

1. Use the fact that bsf(0) leaves the destination unchanged to save a
   branch in the short-string case.
2. Restructure the code so that small strings are given the hot path.
   - This is a net zero on the benchmark suite, but in general it makes
     sense, as smaller sizes are far more common.
3. Use more code-size-efficient instructions.
   - tzcnt ...     -> bsf ...
   - vpcmpb $0 ... -> vpcmpeq ...
4. Align labels less aggressively, especially if doing so doesn't save
   fetch blocks or causes the basic block to span extra cache lines.

The optimizations (especially point 2) make the strnlen and strlen code
essentially incompatible, so strnlen-evex is split out into a new file.

Code size changes:
strlen-evex.S  :  -23 bytes
strnlen-evex.S : -167 bytes

Net perf changes:
Reported as the geometric mean of all improvements / regressions from
N=10 runs of the benchtests. Values are New Time / Old Time, so < 1.0
is an improvement and > 1.0 is a regression.

strlen-evex.S  : 0.992 (no real change)
strnlen-evex.S : 0.947

Full results attached in the email.

Full check passes on x86-64.
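As a side note (not part of the commit), a minimal C sketch of optimization 1, assuming GCC or Clang on x86-64: bsf sets ZF when its source is zero and, on current x86-64 CPUs, leaves the destination register unchanged (AMD documents this; Intel formally leaves the destination undefined). Pre-loading the destination with the length bound therefore folds the "no null byte in this vector" case and the strnlen length bound into a single bsf with no extra branch, which is what the new strnlen entry sequence does with `movq %rsi, %rax; bsfq %rcx, %rax`. The helper name below is illustrative.

#include <stdint.h>
#include <stdio.h>

/* min(index of lowest set bit in MASK, BOUND) without a branch.
   Relies on bsf leaving the destination unchanged when the source is
   zero -- unchanged in practice on current CPUs (and documented by
   AMD), though Intel formally calls the destination undefined.  */
static inline uint64_t
first_match_or_bound (uint64_t mask, uint64_t bound)
{
  uint64_t result = bound;      /* Pre-load the length bound.  */
  __asm__ ("bsfq %1, %0"        /* Overwritten only if MASK != 0.  */
           : "+r" (result)
           : "r" (mask)
           : "cc");
  return result;
}

int
main (void)
{
  /* Bit 4 set -> first null at index 4; empty mask -> the bound.  */
  printf ("%llu\n", (unsigned long long) first_match_or_bound (0x10, 32));
  printf ("%llu\n", (unsigned long long) first_match_or_bound (0, 32));
  return 0;
}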
Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r--  sysdeps/x86_64/multiarch/strlen-evex.S   | 544
-rw-r--r--  sysdeps/x86_64/multiarch/strnlen-evex.S  | 427
-rw-r--r--  sysdeps/x86_64/multiarch/wcsnlen-evex.S  |   5
3 files changed, 572 insertions(+), 404 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 2109ec2..487846f 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -26,466 +26,220 @@
# define STRLEN __strlen_evex
# endif
-# define VMOVA vmovdqa64
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
+# endif
# ifdef USE_AS_WCSLEN
-# define VPCMP vpcmpd
+# define VPCMPEQ vpcmpeqd
+# define VPCMPNEQ vpcmpneqd
+# define VPTESTN vptestnmd
+# define VPTEST vptestmd
# define VPMINU vpminud
-# define SHIFT_REG ecx
# define CHAR_SIZE 4
+# define CHAR_SIZE_SHIFT_REG(reg) sar $2, %reg
# else
-# define VPCMP vpcmpb
+# define VPCMPEQ vpcmpeqb
+# define VPCMPNEQ vpcmpneqb
+# define VPTESTN vptestnmb
+# define VPTEST vptestmb
# define VPMINU vpminub
-# define SHIFT_REG edx
# define CHAR_SIZE 1
+# define CHAR_SIZE_SHIFT_REG(reg)
+
+# define REG_WIDTH VEC_SIZE
# endif
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMM1 ymm17
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
-
-# define VEC_SIZE 32
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
- .section .text.evex,"ax",@progbits
-ENTRY (STRLEN)
-# ifdef USE_AS_STRNLEN
- /* Check zero length. */
- test %RSI_LP, %RSI_LP
- jz L(zero)
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %esi, %esi
-# endif
- mov %RSI_LP, %R8_LP
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 64
+
+# define TAIL_RETURN_LBL first_vec_x2
+# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2)
+
+# define FALLTHROUGH_RETURN_LBL first_vec_x3
+# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
+
+# else
+
+# define TAIL_RETURN_LBL first_vec_x3
+# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3)
+
+# define FALLTHROUGH_RETURN_LBL first_vec_x2
+# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
# endif
+
+# define XZERO VMM_128(0)
+# define VZERO VMM(0)
+# define PAGE_SIZE 4096
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRLEN, 6)
movl %edi, %eax
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
- /* Clear high bits from edi. Only keeping bits relevant to page
- cross check. */
+ vpxorq %XZERO, %XZERO, %XZERO
andl $(PAGE_SIZE - 1), %eax
- /* Check if we may cross page boundary with one vector load. */
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. Each bit in K0 represents a
null byte. */
- VPCMP $0, (%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
-# ifdef USE_AS_STRNLEN
- /* If length < CHAR_PER_VEC handle special. */
- cmpq $CHAR_PER_VEC, %rsi
- jbe L(first_vec_x0)
-# endif
- testl %eax, %eax
+ VPCMPEQ (%rdi), %VZERO, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
jz L(aligned_more)
- tzcntl %eax, %eax
- ret
-# ifdef USE_AS_STRNLEN
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
-L(first_vec_x0):
- /* Set bit for max len so that tzcnt will return min of max len
- and position of first match. */
- btsq %rsi, %rax
- tzcntl %eax, %eax
- ret
-# endif
-
- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %eax
- /* Safe to use 32 bit instructions as these are only called for
- size = [1, 159]. */
-# ifdef USE_AS_STRNLEN
- /* Use ecx which was computed earlier to compute correct value.
- */
- leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
-# else
- subl %edx, %edi
-# ifdef USE_AS_WCSLEN
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarl $2, %edi
-# endif
- leal CHAR_PER_VEC(%rdi, %rax), %eax
-# endif
- ret
-
- .p2align 4
-L(first_vec_x2):
- tzcntl %eax, %eax
- /* Safe to use 32 bit instructions as these are only called for
- size = [1, 159]. */
-# ifdef USE_AS_STRNLEN
- /* Use ecx which was computed earlier to compute correct value.
- */
- leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
-# else
- subl %edx, %edi
-# ifdef USE_AS_WCSLEN
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarl $2, %edi
-# endif
- leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
-# endif
+ bsf %VRAX, %VRAX
ret
- .p2align 4
-L(first_vec_x3):
- tzcntl %eax, %eax
- /* Safe to use 32 bit instructions as these are only called for
- size = [1, 159]. */
-# ifdef USE_AS_STRNLEN
- /* Use ecx which was computed earlier to compute correct value.
- */
- leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
-# else
- subl %edx, %edi
-# ifdef USE_AS_WCSLEN
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarl $2, %edi
-# endif
- leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
-# endif
- ret
-
- .p2align 4
+ .p2align 4,, 8
L(first_vec_x4):
- tzcntl %eax, %eax
- /* Safe to use 32 bit instructions as these are only called for
- size = [1, 159]. */
-# ifdef USE_AS_STRNLEN
- /* Use ecx which was computed earlier to compute correct value.
- */
- leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
-# else
- subl %edx, %edi
-# ifdef USE_AS_WCSLEN
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarl $2, %edi
-# endif
+ bsf %VRAX, %VRAX
+ subl %ecx, %edi
+ CHAR_SIZE_SHIFT_REG (edi)
leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
-# endif
ret
- .p2align 5
+
+
+ /* In the strnlen variant, this point compares the remaining length
+ against 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC
+ before going to the loop. */
+ .p2align 4,, 10
L(aligned_more):
- movq %rdi, %rdx
- /* Align data to VEC_SIZE. */
- andq $-(VEC_SIZE), %rdi
+ movq %rdi, %rcx
+ andq $(VEC_SIZE * -1), %rdi
L(cross_page_continue):
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
-# ifdef USE_AS_STRNLEN
- /* + CHAR_SIZE because it simplies the logic in
- last_4x_vec_or_less. */
- leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
- subq %rdx, %rcx
-# ifdef USE_AS_WCSLEN
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarl $2, %ecx
-# endif
-# endif
- /* Load first VEC regardless. */
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
-# ifdef USE_AS_STRNLEN
- /* Adjust length. If near end handle specially. */
- subq %rcx, %rsi
- jb L(last_4x_vec_or_less)
-# endif
- kmovd %k0, %eax
- testl %eax, %eax
+ /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+ rechecking bounds. */
+ VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
jnz L(first_vec_x1)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- test %eax, %eax
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
jnz L(first_vec_x2)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
jnz L(first_vec_x3)
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
+ VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
jnz L(first_vec_x4)
- addq $VEC_SIZE, %rdi
-# ifdef USE_AS_STRNLEN
- /* Check if at last VEC_SIZE * 4 length. */
- cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
- jbe L(last_4x_vec_or_less_load)
- movl %edi, %ecx
- andl $(VEC_SIZE * 4 - 1), %ecx
-# ifdef USE_AS_WCSLEN
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarl $2, %ecx
-# endif
- /* Readjust length. */
- addq %rcx, %rsi
-# endif
- /* Align data to VEC_SIZE * 4. */
+ subq $(VEC_SIZE * -1), %rdi
+
+# if CHAR_PER_VEC == 64
+ /* No partial register stalls on the processors we use evex512 on,
+ and this saves code size. */
+ xorb %dil, %dil
+# else
andq $-(VEC_SIZE * 4), %rdi
+# endif
+
+
/* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Load first VEC regardless. */
- VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
-# ifdef USE_AS_STRNLEN
- /* Break if at end of length. */
- subq $(CHAR_PER_VEC * 4), %rsi
- jb L(last_4x_vec_or_less_cmpeq)
-# endif
- /* Save some code size by microfusing VPMINU with the load. Since
- the matches in ymm2/ymm4 can only be returned if there where no
- matches in ymm1/ymm3 respectively there is no issue with overlap.
- */
- VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
- VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
- VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
+ VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+ VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
+ VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+ VPTESTN %VMM(2), %VMM(2), %k0
+ VPTESTN %VMM(4), %VMM(4), %k2
- VPCMP $0, %YMM2, %YMMZERO, %k0
- VPCMP $0, %YMM4, %YMMZERO, %k1
subq $-(VEC_SIZE * 4), %rdi
- kortestd %k0, %k1
+ KORTEST %k0, %k2
jz L(loop_4x_vec)
- /* Check if end was in first half. */
- kmovd %k0, %eax
- subq %rdx, %rdi
-# ifdef USE_AS_WCSLEN
- shrq $2, %rdi
-# endif
- testl %eax, %eax
- jz L(second_vec_return)
+ VPTESTN %VMM(1), %VMM(1), %k1
+ KMOV %k1, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x0)
- VPCMP $0, %YMM1, %YMMZERO, %k2
- kmovd %k2, %edx
- /* Combine VEC1 matches (edx) with VEC2 matches (eax). */
-# ifdef USE_AS_WCSLEN
- sall $CHAR_PER_VEC, %eax
- orl %edx, %eax
- tzcntl %eax, %eax
-# else
- salq $CHAR_PER_VEC, %rax
- orq %rdx, %rax
- tzcntq %rax, %rax
-# endif
- addq %rdi, %rax
- ret
-
-
-# ifdef USE_AS_STRNLEN
-
-L(last_4x_vec_or_less_load):
- /* Depending on entry adjust rdi / prepare first VEC in YMM1. */
- VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
-L(last_4x_vec_or_less_cmpeq):
- VPCMP $0, %YMM1, %YMMZERO, %k0
- addq $(VEC_SIZE * 3), %rdi
-L(last_4x_vec_or_less):
- kmovd %k0, %eax
- /* If remaining length > VEC_SIZE * 2. This works if esi is off by
- VEC_SIZE * 4. */
- testl $(CHAR_PER_VEC * 2), %esi
- jnz L(last_4x_vec)
-
- /* length may have been negative or positive by an offset of
- CHAR_PER_VEC * 4 depending on where this was called from. This
- fixes that. */
- andl $(CHAR_PER_VEC * 4 - 1), %esi
- testl %eax, %eax
- jnz L(last_vec_x1_check)
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x1)
- /* Check the end of data. */
- subl $CHAR_PER_VEC, %esi
- jb L(max)
+ VPTESTN %VMM(3), %VMM(3), %k0
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- tzcntl %eax, %eax
- /* Check the end of data. */
- cmpl %eax, %esi
- jb L(max)
-
- subq %rdx, %rdi
-# ifdef USE_AS_WCSLEN
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarq $2, %rdi
-# endif
- leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
- ret
-L(max):
- movq %r8, %rax
- ret
-# endif
-
- /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
- in the 4x VEC loop can use 2 byte encoding. */
- .p2align 4
-L(second_vec_return):
- VPCMP $0, %YMM3, %YMMZERO, %k0
- /* Combine YMM3 matches (k0) with YMM4 matches (k1). */
-# ifdef USE_AS_WCSLEN
- kunpckbw %k0, %k1, %k0
- kmovd %k0, %eax
- tzcntl %eax, %eax
+# if CHAR_PER_VEC == 64
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
+ jnz L(first_vec_x2)
+ KMOV %k2, %VRAX
# else
- kunpckdq %k0, %k1, %k0
- kmovq %k0, %rax
- tzcntq %rax, %rax
+ /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
+ */
+ kmovd %k2, %edx
+ kmovd %k0, %eax
+ salq $CHAR_PER_VEC, %rdx
+ orq %rdx, %rax
# endif
- leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
- ret
-
-# ifdef USE_AS_STRNLEN
-L(last_vec_x1_check):
- tzcntl %eax, %eax
- /* Check the end of data. */
- cmpl %eax, %esi
- jb L(max)
- subq %rdx, %rdi
-# ifdef USE_AS_WCSLEN
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarq $2, %rdi
-# endif
- leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
+ /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
+ */
+ .p2align 4,, 2
+L(FALLTHROUGH_RETURN_LBL):
+ bsfq %rax, %rax
+ subq %rcx, %rdi
+ CHAR_SIZE_SHIFT_REG (rdi)
+ leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
ret
- .p2align 4
-L(last_4x_vec):
- /* Test first 2x VEC normally. */
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- /* Normalize length. */
- andl $(CHAR_PER_VEC * 4 - 1), %esi
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- /* Check the end of data. */
- subl $(CHAR_PER_VEC * 3), %esi
- jb L(max)
-
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- tzcntl %eax, %eax
- /* Check the end of data. */
- cmpl %eax, %esi
- jb L(max_end)
-
- subq %rdx, %rdi
-# ifdef USE_AS_WCSLEN
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarq $2, %rdi
-# endif
- leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
+ .p2align 4,, 8
+L(first_vec_x0):
+ bsf %VRAX, %VRAX
+ sub %rcx, %rdi
+ CHAR_SIZE_SHIFT_REG (rdi)
+ addq %rdi, %rax
ret
- .p2align 4
-L(last_vec_x1):
- tzcntl %eax, %eax
- subq %rdx, %rdi
-# ifdef USE_AS_WCSLEN
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarq $2, %rdi
-# endif
+ .p2align 4,, 10
+L(first_vec_x1):
+ bsf %VRAX, %VRAX
+ sub %rcx, %rdi
+ CHAR_SIZE_SHIFT_REG (rdi)
leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
ret
- .p2align 4
-L(last_vec_x2):
- tzcntl %eax, %eax
- subq %rdx, %rdi
-# ifdef USE_AS_WCSLEN
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarq $2, %rdi
-# endif
- leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
- ret
-
- .p2align 4
-L(last_vec_x3):
- tzcntl %eax, %eax
- subl $(CHAR_PER_VEC * 2), %esi
- /* Check the end of data. */
- cmpl %eax, %esi
- jb L(max_end)
- subq %rdx, %rdi
-# ifdef USE_AS_WCSLEN
- /* NB: Divide bytes by 4 to get the wchar_t count. */
- sarq $2, %rdi
-# endif
- leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
- ret
-L(max_end):
- movq %r8, %rax
+ .p2align 4,, 10
+ /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
+ */
+L(TAIL_RETURN_LBL):
+ bsf %VRAX, %VRAX
+ sub %VRCX, %VRDI
+ CHAR_SIZE_SHIFT_REG (VRDI)
+ lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
ret
-# endif
- /* Cold case for crossing page with first load. */
- .p2align 4
+ .p2align 4,, 8
L(cross_page_boundary):
- movq %rdi, %rdx
+ movq %rdi, %rcx
/* Align data to VEC_SIZE. */
andq $-VEC_SIZE, %rdi
- VPCMP $0, (%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- /* Remove the leading bytes. */
+
+ VPCMPEQ (%rdi), %VZERO, %k0
+
+ KMOV %k0, %VRAX
# ifdef USE_AS_WCSLEN
- /* NB: Divide shift count by 4 since each bit in K0 represent 4
- bytes. */
- movl %edx, %ecx
- shrl $2, %ecx
- andl $(CHAR_PER_VEC - 1), %ecx
-# endif
- /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */
- sarxl %SHIFT_REG, %eax, %eax
+ movl %ecx, %edx
+ shrl $2, %edx
+ andl $(CHAR_PER_VEC - 1), %edx
+ shrx %edx, %eax, %eax
testl %eax, %eax
-# ifndef USE_AS_STRNLEN
- jz L(cross_page_continue)
- tzcntl %eax, %eax
- ret
# else
- jnz L(cross_page_less_vec)
-# ifndef USE_AS_WCSLEN
- movl %edx, %ecx
- andl $(CHAR_PER_VEC - 1), %ecx
-# endif
- movl $CHAR_PER_VEC, %eax
- subl %ecx, %eax
- /* Check the end of data. */
- cmpq %rax, %rsi
- ja L(cross_page_continue)
- movl %esi, %eax
- ret
-L(cross_page_less_vec):
- tzcntl %eax, %eax
- /* Select min of length and position of first null. */
- cmpq %rax, %rsi
- cmovb %esi, %eax
- ret
+ shr %cl, %VRAX
# endif
+ jz L(cross_page_continue)
+ bsf %VRAX, %VRAX
+ ret
END (STRLEN)
#endif
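For reference only (not part of the patch), a small C model of the page-cross handling that the rewritten strlen above keeps: a full unaligned VEC_SIZE-byte load at the start of the string is safe unless the pointer's offset within its 4 KiB page exceeds PAGE_SIZE - VEC_SIZE; on the cross-page path the compare is instead done on the aligned block containing the pointer and the bits for the leading characters are shifted out of the match mask. VEC_SIZE = 32 (the evex256 build) and byte-sized characters are assumed; the function names are illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32   /* evex256; the evex512 build uses 64.  */

/* Mirrors `andl $(PAGE_SIZE - 1), %eax' / `cmpl $(PAGE_SIZE - VEC_SIZE),
   %eax' / `ja L(cross_page_boundary)': true if an unaligned
   VEC_SIZE-byte load at P would touch the next page.  */
static bool
may_cross_page (const void *p)
{
  return ((uintptr_t) p & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

/* On the cross-page path the compare is done on the VEC_SIZE-aligned
   block containing P, so bits for characters that precede P must be
   discarded from the match mask (mirrors the `shr %cl, %VRAX' /
   `shrx' step).  */
static uint32_t
discard_leading_chars (uint32_t mask, const void *p)
{
  return mask >> ((uintptr_t) p & (VEC_SIZE - 1));
}

int
main (void)
{
  const void *near_page_end = (const void *) 0x7fff00000ff0UL;
  const void *mid_page = (const void *) 0x7fff00000f00UL;
  printf ("%d %d\n", may_cross_page (near_page_end),
          may_cross_page (mid_page));                       /* 1 0 */
  printf ("%#x\n", discard_leading_chars (0xffffffffu,
                                          (const void *) 3)); /* 0x1fffffff */
  return 0;
}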
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
index 64a9fc2..443a32a 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
@@ -1,8 +1,423 @@
-#ifndef STRNLEN
-# define STRNLEN __strnlen_evex
-#endif
+/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <isa-level.h>
+#include <sysdep.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNLEN
+# define STRNLEN __strnlen_evex
+# endif
+
+# ifdef USE_AS_WCSLEN
+# define VPCMPEQ vpcmpeqd
+# define VPCMPNEQ vpcmpneqd
+# define VPTESTN vptestnmd
+# define VPTEST vptestmd
+# define VPMINU vpminud
+# define CHAR_SIZE 4
+
+# else
+# define VPCMPEQ vpcmpeqb
+# define VPCMPNEQ vpcmpneqb
+# define VPTESTN vptestnmb
+# define VPTEST vptestmb
+# define VPMINU vpminub
+# define CHAR_SIZE 1
+
+# define REG_WIDTH VEC_SIZE
+# endif
+
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 32
+# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8)
+# else
+# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32)
+# endif
+
+
+
+# if CHAR_PER_VEC == 64
+# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
+# else
+# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
+# endif
+
+
+# define XZERO VMM_128(0)
+# define VZERO VMM(0)
+# define PAGE_SIZE 4096
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRNLEN, 6)
+ /* Check zero length. */
+ test %RSI_LP, %RSI_LP
+ jz L(zero)
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
+# endif
+
+ movl %edi, %eax
+ vpxorq %XZERO, %XZERO, %XZERO
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
+ null byte. */
+ VPCMPEQ (%rdi), %VZERO, %k0
+
+ KMOV %k0, %VRCX
+ movq %rsi, %rax
+
+ /* If src (rcx) is zero, bsf does not change the result. NB:
+ Must use 64-bit bsf here so that upper bits of len are not
+ cleared. */
+ bsfq %rcx, %rax
+ /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
+ CHAR) and rsi must be > CHAR_PER_VEC. */
+ cmpq $CHAR_PER_VEC, %rax
+ ja L(more_1x_vec)
+ /* Check if first match in bounds. */
+ cmpq %rax, %rsi
+ cmovb %esi, %eax
+ ret
+
+
+# if CHAR_PER_VEC != 32
+ .p2align 4,, 2
+L(zero):
+L(max_0):
+ movl %esi, %eax
+ ret
+# endif
+
+ /* For strnlen, this point compares the remaining length against
+ 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
+ going to the loop. */
+ .p2align 4,, 10
+L(more_1x_vec):
+L(cross_page_continue):
+ /* Compute number of words checked after aligning. */
+# ifdef USE_AS_WCSLEN
+ /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+ overflow. */
+ movq %rdi, %rax
+ andq $(VEC_SIZE * -1), %rdi
+ subq %rdi, %rax
+ sarq $2, %rax
+ leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
+# else
+ leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax
+ andq $(VEC_SIZE * -1), %rdi
+ subq %rdi, %rax
+# endif
+
+
+ VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
+
+ cmpq $(CHAR_PER_VEC * 2), %rax
+ ja L(more_2x_vec)
+
+L(last_2x_vec_or_less):
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(last_vec_check)
+
+ /* Check the end of data. */
+ SUB_SHORT (CHAR_PER_VEC, rax)
+ jbe L(max_0)
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jz L(max_0)
+ /* Best place for LAST_VEC_CHECK if ZMM. */
+ .p2align 4,, 8
+L(last_vec_check):
+ bsf %VRDX, %VRDX
+ sub %eax, %edx
+ lea (%rsi, %rdx), %eax
+ cmovae %esi, %eax
+ ret
+
+# if CHAR_PER_VEC == 32
+ .p2align 4,, 2
+L(zero):
+L(max_0):
+ movl %esi, %eax
+ ret
+# endif
+
+ .p2align 4,, 8
+L(last_4x_vec_or_less):
+ addl $(CHAR_PER_VEC * -4), %eax
+ VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
+ subq $(VEC_SIZE * -4), %rdi
+ cmpl $(CHAR_PER_VEC * 2), %eax
+ jbe L(last_2x_vec_or_less)
+
+ .p2align 4,, 6
+L(more_2x_vec):
+ /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+ rechecking bounds. */
-#define USE_AS_STRNLEN 1
-#define STRLEN STRNLEN
+ KMOV %k0, %VRDX
-#include "strlen-evex.S"
+ test %VRDX, %VRDX
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(first_vec_x2)
+
+ cmpq $(CHAR_PER_VEC * 4), %rax
+ ja L(more_4x_vec)
+
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ addl $(CHAR_PER_VEC * -2), %eax
+ test %VRDX, %VRDX
+ jnz L(last_vec_check)
+
+ subl $(CHAR_PER_VEC), %eax
+ jbe L(max_1)
+
+ VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+
+ test %VRDX, %VRDX
+ jnz L(last_vec_check)
+L(max_1):
+ movl %esi, %eax
+ ret
+
+ .p2align 4,, 3
+L(first_vec_x2):
+# if VEC_SIZE == 64
+ /* If VEC_SIZE == 64 we can fit logic for full return label in
+ spare bytes before next cache line. */
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
+ ret
+ .p2align 4,, 6
+# else
+ addl $CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x1):
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
+ ret
+
+
+ .p2align 4,, 6
+L(first_vec_x4):
+# if VEC_SIZE == 64
+ /* If VEC_SIZE == 64 we can fit logic for full return label in
+ spare bytes before next cache line. */
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+ ret
+ .p2align 4,, 6
+# else
+ addl $CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+ bsf %VRDX, %VRDX
+ sub %eax, %esi
+ leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+ ret
+
+ .p2align 4,, 5
+L(more_4x_vec):
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(first_vec_x3)
+
+ VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(first_vec_x4)
+
+ /* Check if at last VEC_SIZE * 4 length before aligning for the
+ loop. */
+ cmpq $(CHAR_PER_VEC * 8), %rax
+ jbe L(last_4x_vec_or_less)
+
+
+ /* Compute number of words checked after aligning. */
+# ifdef USE_AS_WCSLEN
+ /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+ overflow. */
+ leaq (VEC_SIZE * -3)(%rdi), %rdx
+# else
+ leaq (VEC_SIZE * -3)(%rdi, %rax), %rax
+# endif
+
+ subq $(VEC_SIZE * -1), %rdi
+
+ /* Align data to VEC_SIZE * 4. */
+# if VEC_SIZE == 64
+ /* Saves code size. No evex512 processor has partial register
+ stalls. If that changes, this can be replaced with `andq
+ $-(VEC_SIZE * 4), %rdi`. */
+ xorb %dil, %dil
+# else
+ andq $-(VEC_SIZE * 4), %rdi
+# endif
+
+# ifdef USE_AS_WCSLEN
+ subq %rdi, %rdx
+ sarq $2, %rdx
+ addq %rdx, %rax
+# else
+ subq %rdi, %rax
+# endif
+ /* Compare 4 * VEC at a time forward. */
+ .p2align 4,, 11
+L(loop_4x_vec):
+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
+ VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+ VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
+ VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+ VPTESTN %VMM(2), %VMM(2), %k0
+ VPTESTN %VMM(4), %VMM(4), %k2
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Break if at end of length. */
+ subq $(CHAR_PER_VEC * 4), %rax
+ jbe L(loop_len_end)
+
+
+ KORTEST %k0, %k2
+ jz L(loop_4x_vec)
+
+
+L(loop_last_4x_vec):
+ movq %rsi, %rcx
+ subq %rax, %rsi
+ VPTESTN %VMM(1), %VMM(1), %k1
+ KMOV %k1, %VRDX
+ test %VRDX, %VRDX
+ jnz L(last_vec_x0)
+
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(last_vec_x1)
+
+ VPTESTN %VMM(3), %VMM(3), %k0
+
+ /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+ returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
+ individually, for VEC_SIZE == 32 we combine them in a single
+ 64-bit GPR. */
+# if CHAR_PER_VEC == 64
+ KMOV %k0, %VRDX
+ test %VRDX, %VRDX
+ jnz L(last_vec_x2)
+ KMOV %k2, %VRDX
+# else
+ /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
+ */
+ kmovd %k2, %edx
+ kmovd %k0, %eax
+ salq $CHAR_PER_VEC, %rdx
+ orq %rax, %rdx
+# endif
+
+ /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
+ */
+ bsfq %rdx, %rdx
+ leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
+ cmpq %rax, %rcx
+ cmovb %rcx, %rax
+ ret
+
+ /* Handle last 4x VEC after loop. All VECs have been loaded. */
+ .p2align 4,, 4
+L(loop_len_end):
+ KORTEST %k0, %k2
+ jnz L(loop_last_4x_vec)
+ movq %rsi, %rax
+ ret
+
+
+# if CHAR_PER_VEC == 64
+ /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
+ we need a return label for it. */
+ .p2align 4,, 8
+L(last_vec_x2):
+ bsf %VRDX, %VRDX
+ leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
+ cmpq %rax, %rcx
+ cmovb %rcx, %rax
+ ret
+# endif
+
+
+ .p2align 4,, 10
+L(last_vec_x1):
+ addq $CHAR_PER_VEC, %rsi
+L(last_vec_x0):
+ bsf %VRDX, %VRDX
+ leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
+ cmpq %rax, %rcx
+ cmovb %rcx, %rax
+ ret
+
+
+ .p2align 4,, 8
+L(cross_page_boundary):
+ /* Align data to VEC_SIZE. */
+ movq %rdi, %rcx
+ andq $-VEC_SIZE, %rcx
+ VPCMPEQ (%rcx), %VZERO, %k0
+
+ KMOV %k0, %VRCX
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+ andl $(CHAR_PER_VEC - 1), %eax
+# endif
+ shrx %VRAX, %VRCX, %VRCX
+
+ negl %eax
+ andl $(CHAR_PER_VEC - 1), %eax
+ movq %rsi, %rdx
+ bsf %VRCX, %VRDX
+ cmpq %rax, %rdx
+ ja L(cross_page_continue)
+ movl %edx, %eax
+ cmpq %rdx, %rsi
+ cmovb %esi, %eax
+ ret
+END (STRNLEN)
+#endif
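Again purely as an illustration (nothing below is from the patch), a scalar C model of the restructured strnlen entry above, assuming GCC or Clang: the common case -- the first null byte or the length bound falling inside the first vector -- is resolved immediately with a bsf and a cmov instead of separate "no null found" and "bound hit" branches, and only longer bounded strings reach the aligned/loop path. The helper null_mask_of_first_vec and the call to libc's strnlen are stand-ins for the EVEX compare/KMOV pair and for the L(more_1x_vec) path.

#define _POSIX_C_SOURCE 200809L /* for strnlen */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CHAR_PER_VEC 32   /* evex256 with byte chars; 64 for evex512.  */

/* Scalar stand-in for the VPCMPEQ + KMOV pair: bit i is set iff
   s[i] == '\0'.  The real code compares a whole vector at once (safe
   because of the page-cross check); this model stops at the first
   null, so the demo strings must be NUL-terminated.  */
static uint64_t
null_mask_of_first_vec (const char *s)
{
  uint64_t mask = 0;
  for (int i = 0; i < CHAR_PER_VEC; i++)
    if (s[i] == '\0')
      {
        mask |= (uint64_t) 1 << i;
        break;                  /* Only the lowest set bit matters.  */
      }
  return mask;
}

/* Model of the entry sequence `movq %rsi, %rax; bsfq %rcx, %rax;
   cmpq $CHAR_PER_VEC, %rax; ja L(more_1x_vec); cmpq %rax, %rsi;
   cmovb %esi, %eax': resolve the common short case immediately and
   only take the aligned/loop path for longer bounded strings.  */
static size_t
strnlen_model (const char *s, size_t maxlen)
{
  uint64_t mask = null_mask_of_first_vec (s);
  uint64_t result = maxlen;             /* bsf(0) leaves this untouched.  */
  if (mask != 0)
    result = (uint64_t) __builtin_ctzll (mask);
  if (result > CHAR_PER_VEC)            /* No null here and a large bound.  */
    return strnlen (s, maxlen);         /* Stand-in for L(more_1x_vec).  */
  return result < maxlen ? result : maxlen;  /* The final cmovb clamp.  */
}

int
main (void)
{
  printf ("%zu %zu\n", strnlen_model ("hello", 3),
          strnlen_model ("hello", 100));        /* 3 5 */
  return 0;
}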
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
index e2aad94..57a7e93 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
@@ -2,8 +2,7 @@
# define WCSNLEN __wcsnlen_evex
#endif
-#define STRLEN WCSNLEN
+#define STRNLEN WCSNLEN
#define USE_AS_WCSLEN 1
-#define USE_AS_STRNLEN 1
-#include "strlen-evex.S"
+#include "strnlen-evex.S"