From ae86a6bd4825d24cf4a79e6eb040c4507146649d Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Mon, 19 Apr 2021 19:36:06 -0400 Subject: x86: Optimize strlen-evex.S No bug. This commit optimizes strlen-evex.S. The optimizations are mostly small things but they add up to roughly 10-30% performance improvement for strlen. The results for strnlen are a bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen are all passing. Signed-off-by: Noah Goldstein (cherry picked from commit 4ba65586847751372520a36757c17f114588794e) --- sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++++++--------------- 1 file changed, 317 insertions(+), 264 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S index 0583819..4bf6874 100644 --- a/sysdeps/x86_64/multiarch/strlen-evex.S +++ b/sysdeps/x86_64/multiarch/strlen-evex.S @@ -29,11 +29,13 @@ # ifdef USE_AS_WCSLEN # define VPCMP vpcmpd # define VPMINU vpminud -# define SHIFT_REG r9d +# define SHIFT_REG ecx +# define CHAR_SIZE 4 # else # define VPCMP vpcmpb # define VPMINU vpminub -# define SHIFT_REG ecx +# define SHIFT_REG edx +# define CHAR_SIZE 1 # endif # define XMMZERO xmm16 @@ -46,132 +48,165 @@ # define YMM6 ymm22 # define VEC_SIZE 32 +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section .text.evex,"ax",@progbits ENTRY (STRLEN) # ifdef USE_AS_STRNLEN - /* Check for zero length. */ + /* Check zero length. */ test %RSI_LP, %RSI_LP jz L(zero) -# ifdef USE_AS_WCSLEN - shl $2, %RSI_LP -# elif defined __ILP32__ +# ifdef __ILP32__ /* Clear the upper 32 bits. */ movl %esi, %esi # endif mov %RSI_LP, %R8_LP # endif - movl %edi, %ecx - movq %rdi, %rdx + movl %edi, %eax vpxorq %XMMZERO, %XMMZERO, %XMMZERO - + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. */ + andl $(PAGE_SIZE - 1), %eax /* Check if we may cross page boundary with one vector load. */ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. Each bit in K0 represents a null byte. */ VPCMP $0, (%rdi), %YMMZERO, %k0 kmovd %k0, %eax - testl %eax, %eax - # ifdef USE_AS_STRNLEN - jnz L(first_vec_x0_check) - /* Adjust length and check the end of data. */ - subq $VEC_SIZE, %rsi - jbe L(max) -# else - jnz L(first_vec_x0) + /* If length < CHAR_PER_VEC handle special. */ + cmpq $CHAR_PER_VEC, %rsi + jbe L(first_vec_x0) # endif - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + ret # ifdef USE_AS_STRNLEN - /* Adjust length. */ - addq %rcx, %rsi +L(zero): + xorl %eax, %eax + ret - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + .p2align 4 +L(first_vec_x0): + /* Set bit for max len so that tzcnt will return min of max len + and position of first match. */ + btsq %rsi, %rax + tzcntl %eax, %eax + ret # endif - jmp L(more_4x_vec) .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - -# ifdef USE_AS_WCSLEN - /* NB: Divide shift count by 4 since each bit in K0 represent 4 - bytes. */ - movl %ecx, %SHIFT_REG - sarl $2, %SHIFT_REG +L(first_vec_x1): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ +# ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. 
+ */ + leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal CHAR_PER_VEC(%rdi, %rax), %eax # endif - VPCMP $0, (%rdi), %YMMZERO, %k0 - kmovd %k0, %eax + ret - /* Remove the leading bytes. */ - sarxl %SHIFT_REG, %eax, %eax - testl %eax, %eax - jz L(aligned_more) + .p2align 4 +L(first_vec_x2): tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) -# endif - addq %rdi, %rax - addq %rcx, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax # endif ret .p2align 4 -L(aligned_more): +L(first_vec_x3): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" - to void possible addition overflow. */ - negq %rcx - addq $VEC_SIZE, %rcx - - /* Check the end of data. */ - subq %rcx, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax # endif + ret - addq $VEC_SIZE, %rdi - + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + /* Use ecx which was computed earlier to compute correct value. + */ + leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax +# else + subl %edx, %edi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %edi +# endif + leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax # endif + ret -L(more_4x_vec): + .p2align 5 +L(aligned_more): + movq %rdi, %rdx + /* Align data to VEC_SIZE. */ + andq $-(VEC_SIZE), %rdi +L(cross_page_continue): /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. */ - VPCMP $0, (%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x0) - +# ifdef USE_AS_STRNLEN + /* + CHAR_SIZE because it simplifies the logic in + last_4x_vec_or_less. */ + leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx + subq %rdx, %rcx +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif +# endif + /* Load first VEC regardless. */ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 +# ifdef USE_AS_STRNLEN + /* Adjust length. If near end handle specially. 
*/ + subq %rcx, %rsi + jb L(last_4x_vec_or_less) +# endif kmovd %k0, %eax testl %eax, %eax jnz L(first_vec_x1) VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 kmovd %k0, %eax - testl %eax, %eax + test %eax, %eax jnz L(first_vec_x2) VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 @@ -179,258 +214,276 @@ L(more_4x_vec): testl %eax, %eax jnz L(first_vec_x3) - addq $(VEC_SIZE * 4), %rdi - -# ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) -# endif - - /* Align data to 4 * VEC_SIZE. */ - movq %rdi, %rcx - andl $(4 * VEC_SIZE - 1), %ecx - andq $-(4 * VEC_SIZE), %rdi + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x4) + addq $VEC_SIZE, %rdi # ifdef USE_AS_STRNLEN - /* Adjust length. */ + /* Check if at last VEC_SIZE * 4 length. */ + cmpq $(CHAR_PER_VEC * 4 - 1), %rsi + jbe L(last_4x_vec_or_less_load) + movl %edi, %ecx + andl $(VEC_SIZE * 4 - 1), %ecx +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx +# endif + /* Readjust length. */ addq %rcx, %rsi # endif + /* Align data to VEC_SIZE * 4. */ + andq $-(VEC_SIZE * 4), %rdi + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - VMOVA (%rdi), %YMM1 - VMOVA VEC_SIZE(%rdi), %YMM2 - VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 - VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 - - VPMINU %YMM1, %YMM2, %YMM5 - VPMINU %YMM3, %YMM4, %YMM6 + /* Load first VEC regardless. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 +# ifdef USE_AS_STRNLEN + /* Break if at end of length. */ + subq $(CHAR_PER_VEC * 4), %rsi + jb L(last_4x_vec_or_less_cmpeq) +# endif + /* Save some code size by microfusing VPMINU with the load. Since + the matches in ymm2/ymm4 can only be returned if there were no + matches in ymm1/ymm3 respectively, there is no issue with overlap. + */ + VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2 + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 + VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4 + + VPCMP $0, %YMM2, %YMMZERO, %k0 + VPCMP $0, %YMM4, %YMMZERO, %k1 + subq $-(VEC_SIZE * 4), %rdi + kortestd %k0, %k1 + jz L(loop_4x_vec) + + /* Check if end was in first half. */ + kmovd %k0, %eax + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + shrq $2, %rdi +# endif + testl %eax, %eax + jz L(second_vec_return) - VPMINU %YMM5, %YMM6, %YMM5 - VPCMP $0, %YMM5, %YMMZERO, %k0 - ktestd %k0, %k0 - jnz L(4x_vec_end) + VPCMP $0, %YMM1, %YMMZERO, %k2 + kmovd %k2, %edx + /* Combine VEC1 matches (edx) with VEC2 matches (eax). */ +# ifdef USE_AS_WCSLEN + sall $CHAR_PER_VEC, %eax + orl %edx, %eax + tzcntl %eax, %eax +# else + salq $CHAR_PER_VEC, %rax + orq %rdx, %rax + tzcntq %rax, %rax +# endif + addq %rdi, %rax + ret - addq $(VEC_SIZE * 4), %rdi -# ifndef USE_AS_STRNLEN - jmp L(loop_4x_vec) -# else - subq $(VEC_SIZE * 4), %rsi - ja L(loop_4x_vec) +# ifdef USE_AS_STRNLEN +L(last_4x_vec_or_less_load): + /* Depending on entry adjust rdi / prepare first VEC in YMM1. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 +L(last_4x_vec_or_less_cmpeq): + VPCMP $0, %YMM1, %YMMZERO, %k0 + addq $(VEC_SIZE * 3), %rdi L(last_4x_vec_or_less): - /* Less than 4 * VEC and aligned to VEC_SIZE. */ - addl $(VEC_SIZE * 2), %esi - jle L(last_2x_vec) - - VPCMP $0, (%rdi), %YMMZERO, %k0 kmovd %k0, %eax + /* If remaining length > VEC_SIZE * 2. This works if esi is off by + VEC_SIZE * 4. */ + testl $(CHAR_PER_VEC * 2), %esi + jnz L(last_4x_vec) + + /* length may have been negative or positive by an offset of + CHAR_PER_VEC * 4 depending on where this was called from. This + fixes that. 
*/ + andl $(CHAR_PER_VEC * 4 - 1), %esi testl %eax, %eax - jnz L(first_vec_x0) + jnz L(last_vec_x1_check) - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x1) + /* Check the end of data. */ + subl $CHAR_PER_VEC, %esi + jb L(max) VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x2_check) - subl $VEC_SIZE, %esi - jle L(max) + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max) - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x3_check) + subq %rdx, %rdi +# ifdef USE_AS_WCSLEN + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi +# endif + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax + ret +L(max): movq %r8, %rax + ret +# endif + + /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less) + in the 4x VEC loop can use 2 byte encoding. */ + .p2align 4 +L(second_vec_return): + VPCMP $0, %YMM3, %YMMZERO, %k0 + /* Combine YMM3 matches (k0) with YMM4 matches (k1). */ +# ifdef USE_AS_WCSLEN + kunpckbw %k0, %k1, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax +# else + kunpckdq %k0, %k1, %k0 + kmovq %k0, %rax + tzcntq %rax, %rax +# endif + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax + ret + + +# ifdef USE_AS_STRNLEN +L(last_vec_x1_check): + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax ret .p2align 4 -L(last_2x_vec): - addl $(VEC_SIZE * 2), %esi +L(last_4x_vec): + /* Test first 2x VEC normally. */ + testl %eax, %eax + jnz L(last_vec_x1) - VPCMP $0, (%rdi), %YMMZERO, %k0 + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 kmovd %k0, %eax testl %eax, %eax - jnz L(first_vec_x0_check) - subl $VEC_SIZE, %esi - jle L(max) + jnz L(last_vec_x2) - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 + /* Normalize length. */ + andl $(CHAR_PER_VEC * 4 - 1), %esi + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 kmovd %k0, %eax testl %eax, %eax - jnz L(first_vec_x1_check) - movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - ret + jnz L(last_vec_x3) - .p2align 4 -L(first_vec_x0_check): + /* Check the end of data. */ + subl $(CHAR_PER_VEC * 3), %esi + jb L(max) + + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq %rdi, %rax - subq %rdx, %rax + cmpl %eax, %esi + jb L(max_end) + + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax ret .p2align 4 -L(first_vec_x1_check): +L(last_vec_x1): tzcntl %eax, %eax + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $VEC_SIZE, %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax ret .p2align 4 -L(first_vec_x2_check): +L(last_vec_x2): tzcntl %eax, %eax + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ - sall $2, %eax -# endif - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 2), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax ret .p2align 4 -L(first_vec_x3_check): +L(last_vec_x3): tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif + subl $(CHAR_PER_VEC * 2), %esi /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 3), %rax - addq %rdi, %rax - subq %rdx, %rax + cmpl %eax, %esi + jb L(max_end) + subq %rdx, %rdi # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarq $2, %rdi # endif + leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax ret - - .p2align 4 -L(max): +L(max_end): movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - ret - - .p2align 4 -L(zero): - xorl %eax, %eax ret # endif + /* Cold case for crossing page with first load. */ .p2align 4 -L(first_vec_x0): - tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - addq %rdi, %rax - subq %rdx, %rax +L(cross_page_boundary): + movq %rdi, %rdx + /* Align data to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + /* Remove the leading bytes. */ # ifdef USE_AS_WCSLEN - shrq $2, %rax + /* NB: Divide shift count by 4 since each bit in K0 represents 4 + bytes. */ + movl %edx, %ecx + shrl $2, %ecx + andl $(CHAR_PER_VEC - 1), %ecx # endif - ret - - .p2align 4 -L(first_vec_x1): + /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */ + sarxl %SHIFT_REG, %eax, %eax + testl %eax, %eax +# ifndef USE_AS_STRNLEN + jz L(cross_page_continue) tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - addq $VEC_SIZE, %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif ret - - .p2align 4 -L(first_vec_x2): - tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - addq $(VEC_SIZE * 2), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif +# else + jnz L(cross_page_less_vec) +# ifndef USE_AS_WCSLEN + movl %edx, %ecx + andl $(CHAR_PER_VEC - 1), %ecx +# endif + movl $CHAR_PER_VEC, %eax + subl %ecx, %eax + /* Check the end of data. */ + cmpq %rax, %rsi + ja L(cross_page_continue) + movl %esi, %eax ret - - .p2align 4 -L(4x_vec_end): - VPCMP $0, %YMM1, %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x0) - VPCMP $0, %YMM2, %YMMZERO, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(first_vec_x1) - VPCMP $0, %YMM3, %YMMZERO, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(first_vec_x2) - VPCMP $0, %YMM4, %YMMZERO, %k3 - kmovd %k3, %eax -L(first_vec_x3): +L(cross_page_less_vec): tzcntl %eax, %eax -# ifdef USE_AS_WCSLEN - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - sall $2, %eax -# endif - addq $(VEC_SIZE * 3), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif + /* Select min of length and position of first null. 
*/ + cmpq %rax, %rsi + cmovb %esi, %eax ret +# endif END (STRLEN) #endif -- cgit v1.1 From a3dd272e9c048581c32537cb1390d3efbbc2c3a2 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Wed, 9 Jun 2021 16:17:14 -0400 Subject: String: Add overflow tests for strnlen, memchr, and strncat [BZ #27974] This commit adds tests for a bug in the wide char variant of the functions where the implementation may assume that maxlen for wcsnlen or n for wmemchr/strncat will not overflow when multiplied by sizeof(wchar_t). These tests show the following implementations failing on x86_64: wcsnlen-sse4_1 wcsnlen-avx2 wmemchr-sse2 wmemchr-avx2 strncat would fail as well if it were on a system that preferred either of the wcsnlen implementations that failed, as it relies on wcsnlen. Signed-off-by: Noah Goldstein Reviewed-by: H.J. Lu (cherry picked from commit da5a6fba0febbfc90896ce1b2eb75c6d8a88a72d) --- string/test-memchr.c | 39 +++++++++++++++++++++++++++++--- string/test-strncat.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++ string/test-strnlen.c | 33 ++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 3 deletions(-) diff --git a/string/test-memchr.c b/string/test-memchr.c index 58bb54e..32582fd 100644 --- a/string/test-memchr.c +++ b/string/test-memchr.c @@ -65,8 +65,8 @@ do_one_test (impl_t *impl, const CHAR *s, int c, size_t n, CHAR *exp_res) CHAR *res = CALL (impl, s, c, n); if (res != exp_res) { - error (0, 0, "Wrong result in function %s %p %p", impl->name, - res, exp_res); + error (0, 0, "Wrong result in function %s (%p, %d, %zu) -> %p != %p", + impl->name, s, c, n, res, exp_res); ret = 1; return; } @@ -91,7 +91,7 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char) } buf[align + len] = 0; - if (pos < len) + if (pos < MIN(n, len)) { buf[align + pos] = seek_char; buf[align + len] = -seek_char; @@ -108,6 +108,38 @@ do_test (size_t align, size_t pos, size_t len, size_t n, int seek_char) } static void +do_overflow_tests (void) +{ + size_t i, j, len; + const size_t one = 1; + uintptr_t buf_addr = (uintptr_t) buf1; + + for (i = 0; i < 750; ++i) + { + do_test (0, i, 751, SIZE_MAX - i, BIG_CHAR); + do_test (0, i, 751, i - buf_addr, BIG_CHAR); + do_test (0, i, 751, -buf_addr - i, BIG_CHAR); + do_test (0, i, 751, SIZE_MAX - buf_addr - i, BIG_CHAR); + do_test (0, i, 751, SIZE_MAX - buf_addr + i, BIG_CHAR); + + len = 0; + for (j = 8 * sizeof(size_t) - 1; j ; --j) + { + len |= one << j; + do_test (0, i, 751, len - i, BIG_CHAR); + do_test (0, i, 751, len + i, BIG_CHAR); + do_test (0, i, 751, len - buf_addr - i, BIG_CHAR); + do_test (0, i, 751, len - buf_addr + i, BIG_CHAR); + + do_test (0, i, 751, ~len - i, BIG_CHAR); + do_test (0, i, 751, ~len + i, BIG_CHAR); + do_test (0, i, 751, ~len - buf_addr - i, BIG_CHAR); + do_test (0, i, 751, ~len - buf_addr + i, BIG_CHAR); + } + } +} + +static void do_random_tests (void) { size_t i, j, n, align, pos, len; @@ -221,6 +253,7 @@ test_main (void) do_test (page_size / 2 - i, i, i, 1, 0x9B); do_random_tests (); + do_overflow_tests (); return ret; } diff --git a/string/test-strncat.c b/string/test-strncat.c index 4b71f4a..b9c8c57 100644 --- a/string/test-strncat.c +++ b/string/test-strncat.c @@ -135,6 +135,66 @@ do_test (size_t align1, size_t align2, size_t len1, size_t len2, } static void +do_overflow_tests (void) +{ + size_t i, j, len; + const size_t one = 1; + CHAR *s1, *s2; + uintptr_t s1_addr; + s1 = (CHAR *) buf1; + s2 = (CHAR *) buf2; + s1_addr = (uintptr_t)s1; + for (j = 0; j < 200; ++j) + s2[j] = 32 + 23 * j % (BIG_CHAR - 32); 
+ s2[200] = 0; + for (i = 0; i < 750; ++i) { + for (j = 0; j < i; ++j) + s1[j] = 32 + 23 * j % (BIG_CHAR - 32); + s1[i] = '\0'; + + FOR_EACH_IMPL (impl, 0) + { + s2[200] = '\0'; + do_one_test (impl, s2, s1, SIZE_MAX - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, i - s1_addr); + s2[200] = '\0'; + do_one_test (impl, s2, s1, -s1_addr - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, SIZE_MAX - s1_addr + i); + } + + len = 0; + for (j = 8 * sizeof(size_t) - 1; j ; --j) + { + len |= one << j; + FOR_EACH_IMPL (impl, 0) + { + s2[200] = '\0'; + do_one_test (impl, s2, s1, len - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, len + i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, len - s1_addr - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, len - s1_addr + i); + + s2[200] = '\0'; + do_one_test (impl, s2, s1, ~len - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, ~len + i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, ~len - s1_addr - i); + s2[200] = '\0'; + do_one_test (impl, s2, s1, ~len - s1_addr + i); + } + } + } +} + +static void do_random_tests (void) { size_t i, j, n, align1, align2, len1, len2, N; @@ -316,6 +376,7 @@ test_main (void) } do_random_tests (); + do_overflow_tests (); return ret; } diff --git a/string/test-strnlen.c b/string/test-strnlen.c index f7d0896..027b9ad 100644 --- a/string/test-strnlen.c +++ b/string/test-strnlen.c @@ -90,6 +90,38 @@ do_test (size_t align, size_t len, size_t maxlen, int max_char) } static void +do_overflow_tests (void) +{ + size_t i, j, len; + const size_t one = 1; + uintptr_t buf_addr = (uintptr_t) buf1; + + for (i = 0; i < 750; ++i) + { + do_test (0, i, SIZE_MAX - i, BIG_CHAR); + do_test (0, i, i - buf_addr, BIG_CHAR); + do_test (0, i, -buf_addr - i, BIG_CHAR); + do_test (0, i, SIZE_MAX - buf_addr - i, BIG_CHAR); + do_test (0, i, SIZE_MAX - buf_addr + i, BIG_CHAR); + + len = 0; + for (j = 8 * sizeof(size_t) - 1; j ; --j) + { + len |= one << j; + do_test (0, i, len - i, BIG_CHAR); + do_test (0, i, len + i, BIG_CHAR); + do_test (0, i, len - buf_addr - i, BIG_CHAR); + do_test (0, i, len - buf_addr + i, BIG_CHAR); + + do_test (0, i, ~len - i, BIG_CHAR); + do_test (0, i, ~len + i, BIG_CHAR); + do_test (0, i, ~len - buf_addr - i, BIG_CHAR); + do_test (0, i, ~len - buf_addr + i, BIG_CHAR); + } + } +} + +static void do_random_tests (void) { size_t i, j, n, align, len; @@ -274,6 +306,7 @@ test_main (void) do_random_tests (); do_page_tests (); do_page_2_tests (); + do_overflow_tests (); return ret; } -- cgit v1.1 From 25ed98a8827c083c8241cf9b5e2d2ec81c9dbe6f Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 27 Jan 2022 16:02:09 -0800 Subject: NEWS: Add a bug fix entry for BZ #27974 --- NEWS | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS b/NEWS index a89412d..808634b 100644 --- a/NEWS +++ b/NEWS @@ -61,6 +61,7 @@ The following bugs are resolved with this release: [27130] "rep movsb" performance issue [27177] GLIBC_TUNABLES=glibc.cpu.x86_ibt=on:glibc.cpu.x86_shstk=on doesn't work [27457] vzeroupper use in AVX2 multiarch string functions cause HTM aborts + [27974] Overflow bug in some implementation of wcsnlen, wmemchr, and wcsncat [28755] overflow bug in wcsncmp_avx2 and wcsncmp_evex -- cgit v1.1 From 071e2bdd850de9387b22b387f5f42e5c7d6668de Mon Sep 17 00:00:00 2001 From: "H.J. 
Lu" Date: Wed, 30 Jun 2021 10:47:06 -0700 Subject: x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033] From https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html * Intel TSX will be disabled by default. * The processor will force abort all Restricted Transactional Memory (RTM) transactions by default. * A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated, which is set to indicate to updated software that the loaded microcode is forcing RTM abort. * On processors that enumerate support for RTM, the CPUID enumeration bits for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to be set by default after microcode update. * Workloads that were benefited from Intel TSX might experience a change in performance. * System software may use a new bit in Model-Specific Register (MSR) 0x10F TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock Elision (HLE) and RTM bits to indicate to software that Intel TSX is disabled. 1. Add RTM_ALWAYS_ABORT to CPUID features. 2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set. This skips the string/tst-memchr-rtm etc. testcases on the affected processors, which always fail after a microcde update. 3. Check RTM feature, instead of usability, against /proc/cpuinfo. This fixes BZ #28033. (cherry picked from commit ea8e465a6b8d0f26c72bcbe453a854de3abf68ec) --- sysdeps/x86/cpu-features.c | 3 +++ sysdeps/x86/cpu-features.h | 3 +++ sysdeps/x86/tst-get-cpu-features.c | 1 + 3 files changed, 7 insertions(+) diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index a4d1eac..00ca2fe 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -333,6 +333,9 @@ init_cpu_features (struct cpu_features *cpu_features) get_extended_indices (cpu_features); + if (CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT)) + cpu_features->cpuid[index_cpu_RTM].reg_RTM &= ~bit_cpu_RTM; + if (family == 0x06) { model += extended_model; diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h index ca2924b..3599dd8 100644 --- a/sysdeps/x86/cpu-features.h +++ b/sysdeps/x86/cpu-features.h @@ -499,6 +499,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define bit_cpu_AVX512_4VNNIW (1u << 2) #define bit_cpu_AVX512_4FMAPS (1u << 3) #define bit_cpu_FSRM (1u << 4) +#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11) #define bit_cpu_PCONFIG (1u << 18) #define bit_cpu_IBT (1u << 20) #define bit_cpu_IBRS_IBPB (1u << 26) @@ -667,6 +668,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define index_cpu_AVX512_4VNNIW COMMON_CPUID_INDEX_7 #define index_cpu_AVX512_4FMAPS COMMON_CPUID_INDEX_7 #define index_cpu_FSRM COMMON_CPUID_INDEX_7 +#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7 #define index_cpu_PCONFIG COMMON_CPUID_INDEX_7 #define index_cpu_IBT COMMON_CPUID_INDEX_7 #define index_cpu_IBRS_IBPB COMMON_CPUID_INDEX_7 @@ -835,6 +837,7 @@ extern const struct cpu_features *__get_cpu_features (void) #define reg_AVX512_4VNNIW edx #define reg_AVX512_4FMAPS edx #define reg_FSRM edx +#define reg_RTM_ALWAYS_ABORT edx #define reg_PCONFIG edx #define reg_IBT edx #define reg_IBRS_IBPB edx diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c index bf2b9b2..08aa421 100644 --- a/sysdeps/x86/tst-get-cpu-features.c +++ b/sysdeps/x86/tst-get-cpu-features.c @@ -176,6 +176,7 @@ do_test (void) CHECK_CPU_FEATURE (AVX512_4VNNIW); CHECK_CPU_FEATURE (AVX512_4FMAPS); CHECK_CPU_FEATURE (FSRM); + CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT); 
CHECK_CPU_FEATURE (PCONFIG); CHECK_CPU_FEATURE (IBT); CHECK_CPU_FEATURE (IBRS_IBPB); -- cgit v1.1 From 76e30dd44ef7aab743a538dcd4a81761a1f612d9 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 14 Jan 2022 14:48:01 -0800 Subject: x86: Black list more Intel CPUs for TSX [BZ #27398] Disable TSX and enable RTM_ALWAYS_ABORT for Intel CPUs listed in: https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html This fixes BZ #27398. Reviewed-by: Noah Goldstein (cherry picked from commit 1e000d3d33211d5a954300e2a69b90f93f18a1a1) --- sysdeps/x86/cpu-features.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c index 00ca2fe..91215f8 100644 --- a/sysdeps/x86/cpu-features.c +++ b/sysdeps/x86/cpu-features.c @@ -397,11 +397,42 @@ init_cpu_features (struct cpu_features *cpu_features) break; } - /* Disable TSX on some Haswell processors to avoid TSX on kernels that - weren't updated with the latest microcode package (which disables - broken feature by default). */ + /* Disable TSX on some processors to avoid TSX on kernels that + weren't updated with the latest microcode package (which + disables broken feature by default). */ switch (model) { + case 0x55: + if (stepping <= 5) + goto disable_tsx; + break; + case 0x8e: + /* NB: Although the errata documents that for model == 0x8e, + only 0xb stepping or lower are impacted, the intention of + the errata was to disable TSX on all client processors on + all steppings. Include 0xc stepping which is an Intel + Core i7-8665U, a client mobile processor. */ + case 0x9e: + if (stepping > 0xc) + break; + /* Fall through. */ + case 0x4e: + case 0x5e: + { + /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for + processors listed in: + +https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html + */ +disable_tsx: + cpu_features->cpuid[index_cpu_HLE].reg_HLE + &= ~bit_cpu_HLE; + cpu_features->cpuid[index_cpu_RTM].reg_RTM + &= ~bit_cpu_RTM; + cpu_features->cpuid[index_cpu_RTM_ALWAYS_ABORT].reg_RTM_ALWAYS_ABORT + |= bit_cpu_RTM_ALWAYS_ABORT; + } + break; case 0x3f: /* Xeon E7 v3 with stepping >= 4 has working TSX. */ if (stepping >= 4) -- cgit v1.1 From 23bc964c62c155fe50f19ed3bf6bb451d556a29d Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Thu, 8 Jul 2021 16:13:19 -0400 Subject: x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ #28064] The following commit commit 6f573a27b6c8b4236445810a44660612323f5a73 Author: Noah Goldstein Date: Wed Jun 23 01:19:34 2021 -0400 x86-64: Add wcslen optimize for sse4.1 Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did not add wcslen-sse4.1 to the wcslen ifunc implementation list. This commit fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc implementation list and adding wcslen-sse4.1 to the ifunc implementation list. Testing: test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as well as all other tests in wcsmbs and string. Signed-off-by: Noah Goldstein Reviewed-by: H.J. Lu Reviewed-by: H.J. 
Lu (cherry picked from commit 0679442defedf7e52a94264975880ab8674736b2) --- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index e57fb42..28971a6 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && HAS_ARCH_FEATURE (AVX512BW_Usable) && HAS_CPU_FEATURE (BMI2)), __wcslen_evex) - IFUNC_IMPL_ADD (array, i, wcsnlen, + IFUNC_IMPL_ADD (array, i, wcslen, CPU_FEATURE_USABLE (SSE4_1), - __wcsnlen_sse4_1) + __wcslen_sse4_1) IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ -- cgit v1.1 From 8ad5519ae1cbfff987c5335253dfaa6bbccdddcb Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 8 Jul 2021 16:13:17 -0400 Subject: x86-64: Test strlen and wcslen with 0 in the RSI register [BZ #28064] commit 6f573a27b6c8b4236445810a44660612323f5a73 Author: Noah Goldstein Date: Wed Jun 23 01:19:34 2021 -0400 x86-64: Add wcslen optimize for sse4.1 added wcsnlen-sse4.1 to the wcslen ifunc implementation list. Since the random value in the RSI register is larger than the wide-character string length in the existing wcslen test, it didn't trigger the wcslen test failure. Add a test to force 0 into the RSI register before calling wcslen. (cherry picked from commit a6e7c3745d73ff876b4ba6991fb00768a938aef5) --- sysdeps/x86_64/Makefile | 7 ++++ sysdeps/x86_64/tst-rsi-strlen.c | 81 +++++++++++++++++++++++++++++++++++++++++ sysdeps/x86_64/tst-rsi-wcslen.c | 20 ++++++++++ 3 files changed, 108 insertions(+) create mode 100644 sysdeps/x86_64/tst-rsi-strlen.c create mode 100644 sysdeps/x86_64/tst-rsi-wcslen.c diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index d51cf03..b1951ad 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -20,6 +20,8 @@ endif ifeq ($(subdir),string) sysdep_routines += cacheinfo strcasecmp_l-nonascii strncase_l-nonascii gen-as-const-headers += locale-defines.sym +tests += \ + tst-rsi-strlen endif ifeq ($(subdir),elf) @@ -150,6 +152,11 @@ ifeq ($(subdir),csu) gen-as-const-headers += tlsdesc.sym rtld-offsets.sym endif +ifeq ($(subdir),wcsmbs) +tests += \ + tst-rsi-wcslen +endif + $(objpfx)x86_64/tst-x86_64mod-1.os: $(objpfx)tst-x86_64mod-1.os $(make-target-directory) rm -f $@ diff --git a/sysdeps/x86_64/tst-rsi-strlen.c b/sysdeps/x86_64/tst-rsi-strlen.c new file mode 100644 index 0000000..a80c4f8 --- /dev/null +++ b/sysdeps/x86_64/tst-rsi-strlen.c @@ -0,0 +1,81 @@ +/* Test strlen with 0 in the RSI register. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#ifdef WIDE +# define TEST_NAME "wcslen" +#else +# define TEST_NAME "strlen" +#endif /* WIDE */ + +#define TEST_MAIN +#include <string/test-string.h> + +#ifdef WIDE +# include <wchar.h> +# define STRLEN wcslen +# define CHAR wchar_t +#else +# define STRLEN strlen +# define CHAR char +#endif /* WIDE */ + +IMPL (STRLEN, 1) + +typedef size_t (*proto_t) (const CHAR *); + +typedef struct +{ + void (*fn) (void); +} parameter_t; + +size_t +__attribute__ ((weak, noinline, noclone)) +do_strlen (parameter_t *a, int zero, const CHAR *str) +{ + return CALL (a, str); +} + +static int +test_main (void) +{ + test_init (); + + size_t size = page_size / sizeof (CHAR) - 1; + CHAR *buf = (CHAR *) buf2; + buf[size] = 0; + + parameter_t a; + + int ret = 0; + FOR_EACH_IMPL (impl, 0) + { + a.fn = impl->fn; + /* NB: Pass 0 in RSI. */ + size_t res = do_strlen (&a, 0, buf); + if (res != size) + { + error (0, 0, "Wrong result in function %s: %zu != %zu", + impl->name, res, size); + ret = 1; + } + } + + return ret ? EXIT_FAILURE : EXIT_SUCCESS; +} + +#include <support/test-driver.c> diff --git a/sysdeps/x86_64/tst-rsi-wcslen.c b/sysdeps/x86_64/tst-rsi-wcslen.c new file mode 100644 index 0000000..f45a7df --- /dev/null +++ b/sysdeps/x86_64/tst-rsi-wcslen.c @@ -0,0 +1,20 @@ +/* Test wcslen with 0 in the RSI register. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define WIDE 1 +#include "tst-rsi-strlen.c" -- cgit v1.1 From 70522b1c1d1ffa5e3bd55aa3c064ea93a330bde9 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 17 Feb 2022 08:10:35 -0800 Subject: string: Add a testcase for wcsncmp with SIZE_MAX [BZ #28755] Verify that wcsncmp (L("abc"), L("abc"), SIZE_MAX) == 0. The new test fails without commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Author: Noah Goldstein Date: Sun Jan 9 16:02:21 2022 -0600 x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755] and commit 7e08db3359c86c94918feb33a1182cd0ff3bb10b Author: Noah Goldstein Date: Sun Jan 9 16:02:28 2022 -0600 x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755] This is for BZ #28755. 
Reviewed-by: Sunil K Pandey (cherry picked from commit aa5a720056d37cf24924c138a3dbe6dace98e97c) --- string/test-strncmp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/string/test-strncmp.c b/string/test-strncmp.c index 042e85e..af071bd 100644 --- a/string/test-strncmp.c +++ b/string/test-strncmp.c @@ -403,6 +403,18 @@ check2 (void) free (s2); } +static void +check3 (void) +{ + const CHAR *s1 = L ("abc"); + CHAR *s2 = STRDUP (s1); + + FOR_EACH_IMPL (impl, 0) + check_result (impl, s1, s2, SIZE_MAX, 0); + + free (s2); +} + int test_main (void) { @@ -412,6 +424,7 @@ test_main (void) check1 (); check2 (); + check3 (); printf ("%23s", ""); FOR_EACH_IMPL (impl, 0) -- cgit v1.1 From a3ef8a0c3d81ac9e480512137d4eb880cb875a6a Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Tue, 15 Feb 2022 08:18:15 -0600 Subject: x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ #28896] In the overflow fallback, strncmp-avx2-rtm and wcsncmp-avx2-rtm would call strcmp-avx2 and wcscmp-avx2 respectively. These would have no checks around vzeroupper and would trigger spurious aborts. This commit fixes that. test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on AVX2 machines with and without RTM. Co-authored-by: H.J. Lu (cherry picked from commit c6272098323153db373f2986c67786ea8c85f1cf) --- sysdeps/x86/Makefile | 2 +- sysdeps/x86/tst-strncmp-rtm.c | 17 ++++++++++++++++- sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +- sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/strncmp-avx2.S | 1 + sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 2 +- sysdeps/x86_64/multiarch/wcsncmp-avx2.S | 2 +- 7 files changed, 22 insertions(+), 5 deletions(-) diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile index a93139b..b0ac1a6 100644 --- a/sysdeps/x86/Makefile +++ b/sysdeps/x86/Makefile @@ -43,7 +43,7 @@ CFLAGS-tst-memset-rtm.c += -mrtm CFLAGS-tst-strchr-rtm.c += -mrtm CFLAGS-tst-strcpy-rtm.c += -mrtm CFLAGS-tst-strlen-rtm.c += -mrtm -CFLAGS-tst-strncmp-rtm.c += -mrtm +CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error CFLAGS-tst-strrchr-rtm.c += -mrtm endif diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c index 236ad95..4d0004b 100644 --- a/sysdeps/x86/tst-strncmp-rtm.c +++ b/sysdeps/x86/tst-strncmp-rtm.c @@ -16,6 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ +#include <stdint.h> #include <tst-string-rtm.h> #define LOOP 3000 @@ -45,8 +46,22 @@ function (void) return 1; } +__attribute__ ((noinline, noclone)) +static int +function_overflow (void) +{ + if (strncmp (string1, string2, SIZE_MAX) == 0) + return 0; + else + return 1; +} + static int do_test (void) { - return do_test_1 ("strncmp", LOOP, prepare, function); + int status = do_test_1 ("strncmp", LOOP, prepare, function); + if (status != EXIT_SUCCESS) + return status; + status = do_test_1 ("strncmp", LOOP, prepare, function_overflow); + return status; } diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S index f199c91..5c41ba6 100644 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S @@ -95,7 +95,7 @@ ENTRY (STRCMP) length to bound a valid memory region. In these cases just use 'wcscmp'. */ shrq $56, %rcx - jnz __wcscmp_avx2 + jnz OVERFLOW_STRCMP # endif /* Convert units: from wide to byte char. 
*/ shl $2, %RDX_LP diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S index 37d1224..68bad36 100644 --- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S @@ -1,3 +1,4 @@ #define STRCMP __strncmp_avx2_rtm #define USE_AS_STRNCMP 1 +#define OVERFLOW_STRCMP __strcmp_avx2_rtm #include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S index 1678bcc..f138e9f 100644 --- a/sysdeps/x86_64/multiarch/strncmp-avx2.S +++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S @@ -1,3 +1,4 @@ #define STRCMP __strncmp_avx2 #define USE_AS_STRNCMP 1 +#define OVERFLOW_STRCMP __strcmp_avx2 #include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S index 4e88c70..f467582 100644 --- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S @@ -1,5 +1,5 @@ #define STRCMP __wcsncmp_avx2_rtm #define USE_AS_STRNCMP 1 #define USE_AS_WCSCMP 1 - +#define OVERFLOW_STRCMP __wcscmp_avx2_rtm #include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S index 4fa1de4..e9ede52 100644 --- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S @@ -1,5 +1,5 @@ #define STRCMP __wcsncmp_avx2 #define USE_AS_STRNCMP 1 #define USE_AS_WCSCMP 1 - +#define OVERFLOW_STRCMP __wcscmp_avx2 #include "strcmp-avx2.S" -- cgit v1.1 From 6428a662cc128606ee9e889f1872b8e5ff4a558b Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Fri, 18 Feb 2022 14:19:15 -0600 Subject: x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896] In the overflow fallback, strncmp-avx2-rtm and wcsncmp-avx2-rtm would call strcmp-avx2 and wcscmp-avx2 respectively. These would have no checks around vzeroupper and would trigger spurious aborts. This commit fixes that. test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on AVX2 machines with and without RTM. Reviewed-by: H.J. 
Lu (cherry picked from commit 7835d611af0854e69a0c71e3806f8fe379282d6f) --- sysdeps/x86/Makefile | 5 ++++- sysdeps/x86/tst-strncmp-rtm.c | 32 +++++++++++++++++++++++--------- sysdeps/x86/tst-wcsncmp-rtm.c | 21 +++++++++++++++++++++ 3 files changed, 48 insertions(+), 10 deletions(-) create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile index b0ac1a6..b7aec5d 100644 --- a/sysdeps/x86/Makefile +++ b/sysdeps/x86/Makefile @@ -33,7 +33,9 @@ tests += \ tst-strcpy-rtm \ tst-strlen-rtm \ tst-strncmp-rtm \ - tst-strrchr-rtm + tst-strrchr-rtm \ + tst-wcsncmp-rtm \ +# tests CFLAGS-tst-memchr-rtm.c += -mrtm CFLAGS-tst-memcmp-rtm.c += -mrtm @@ -45,6 +47,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm CFLAGS-tst-strlen-rtm.c += -mrtm CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error CFLAGS-tst-strrchr-rtm.c += -mrtm +CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error endif ifeq ($(enable-cet),yes) diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c index 4d0004b..4e9f094 100644 --- a/sysdeps/x86/tst-strncmp-rtm.c +++ b/sysdeps/x86/tst-strncmp-rtm.c @@ -19,18 +19,32 @@ #include <stdint.h> #include <tst-string-rtm.h> +#ifdef WIDE +# define CHAR wchar_t +# define MEMSET wmemset +# define STRNCMP wcsncmp +# define TEST_NAME wcsncmp +#else /* !WIDE */ +# define CHAR char +# define MEMSET memset +# define STRNCMP strncmp +# define TEST_NAME strncmp +#endif /* !WIDE */ + + + #define LOOP 3000 #define STRING_SIZE 1024 -char string1[STRING_SIZE]; -char string2[STRING_SIZE]; +CHAR string1[STRING_SIZE]; +CHAR string2[STRING_SIZE]; __attribute__ ((noinline, noclone)) static int prepare (void) { - memset (string1, 'a', STRING_SIZE - 1); - memset (string2, 'a', STRING_SIZE - 1); - if (strncmp (string1, string2, STRING_SIZE) == 0) + MEMSET (string1, 'a', STRING_SIZE - 1); + MEMSET (string2, 'a', STRING_SIZE - 1); + if (STRNCMP (string1, string2, STRING_SIZE) == 0) return EXIT_SUCCESS; else return EXIT_FAILURE; @@ -40,7 +54,7 @@ __attribute__ ((noinline, noclone)) static int function (void) { - if (strncmp (string1, string2, STRING_SIZE) == 0) + if (STRNCMP (string1, string2, STRING_SIZE) == 0) return 0; else return 1; @@ -50,7 +64,7 @@ __attribute__ ((noinline, noclone)) static int function_overflow (void) { - if (strncmp (string1, string2, SIZE_MAX) == 0) + if (STRNCMP (string1, string2, SIZE_MAX) == 0) return 0; else return 1; @@ -59,9 +73,9 @@ function_overflow (void) static int do_test (void) { - int status = do_test_1 ("strncmp", LOOP, prepare, function); + int status = do_test_1 (TEST_NAME, LOOP, prepare, function); if (status != EXIT_SUCCESS) return status; - status = do_test_1 ("strncmp", LOOP, prepare, function_overflow); + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); return status; } diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c new file mode 100644 index 0000000..bad3b86 --- /dev/null +++ b/sysdeps/x86/tst-wcsncmp-rtm.c @@ -0,0 +1,21 @@ +/* Test case for wcsncmp inside a transactionally executing RTM region. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define WIDE 1 +#include <wchar.h> +#include "tst-strncmp-rtm.c" -- cgit v1.1 From 31615a5f4773e75f2ea007acae668c0605b96f93 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Fri, 18 Feb 2022 17:00:25 -0600 Subject: x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c Previously TEST_NAME was passing a function pointer. This didn't fail because of the -Wno-error flag (to allow for overflow sizes passed to strncmp/wcsncmp). Reviewed-by: H.J. Lu (cherry picked from commit b98d0bbf747f39770e0caba7e984ce9f8f900330) --- sysdeps/x86/tst-strncmp-rtm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c index 4e9f094..aef9866 100644 --- a/sysdeps/x86/tst-strncmp-rtm.c +++ b/sysdeps/x86/tst-strncmp-rtm.c @@ -23,12 +23,12 @@ # define CHAR wchar_t # define MEMSET wmemset # define STRNCMP wcsncmp -# define TEST_NAME wcsncmp +# define TEST_NAME "wcsncmp" #else /* !WIDE */ # define CHAR char # define MEMSET memset # define STRNCMP strncmp -# define TEST_NAME strncmp +# define TEST_NAME "strncmp" #endif /* !WIDE */ -- cgit v1.1 From a79bc55e5be07d797d77b63fc0018eb515ae7049 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 18 Feb 2022 19:13:02 -0800 Subject: NEWS: Add a bug fix entry for BZ #28896 --- NEWS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS b/NEWS index 808634b..8715fd7 100644 --- a/NEWS +++ b/NEWS @@ -63,6 +63,8 @@ The following bugs are resolved with this release: [27457] vzeroupper use in AVX2 multiarch string functions cause HTM aborts [27974] Overflow bug in some implementation of wcsnlen, wmemchr, and wcsncat [28755] overflow bug in wcsncmp_avx2 and wcsncmp_evex + [28896] strncmp-avx2-rtm and wcsncmp-avx2-rtm fallback on non-rtm + variants when avoiding overflow Version 2.30 -- cgit v1.1
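
The overflow bugs patched above (BZ #27974, #28755, #28896) share one root cause: a wide-character count is converted to a byte count by multiplying by sizeof (wchar_t), and that multiplication can wrap. The following standalone C sketch reproduces the failure mode the new overflow tests probe; it is illustrative only — buggy_wcsnlen is a hypothetical stand-in for the broken wrappers, not glibc code:

#include <stdint.h>
#include <stdio.h>
#include <wchar.h>

/* Mimics the broken pattern: convert maxlen to bytes up front, then
   bound a byte-wise scan by the (possibly wrapped) byte count.  */
static size_t
buggy_wcsnlen (const wchar_t *s, size_t maxlen)
{
  size_t maxbytes = maxlen * sizeof (wchar_t);	/* may wrap!  */
  size_t i;
  for (i = 0; i * sizeof (wchar_t) < maxbytes; i++)
    if (s[i] == L'\0')
      return i;
  return i;
}

int
main (void)
{
  const wchar_t s[] = L"abcdef";	/* length 6 */
  /* On x86-64 (4-byte wchar_t), multiplying this by 4 wraps to a
     4-byte bound, i.e. a single wide character.  */
  size_t huge = SIZE_MAX / sizeof (wchar_t) + 2;

  printf ("wcsnlen:       %zu\n", wcsnlen (s, huge));	/* prints 6 */
  printf ("buggy_wcsnlen: %zu\n", buggy_wcsnlen (s, huge));	/* prints 1 */
  return 0;
}

The fixes above take two routes around this: strlen-evex.S now keeps its lengths in character units (CHAR_PER_VEC) so no early multiplication happens, while strcmp-avx2.S checks whether the count could even fit as a byte count (shrq $56, %rcx) and, if not, falls back to the unbounded comparison — routed through OVERFLOW_STRCMP so the RTM builds land on the _rtm-safe variant.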