From 58e6cd4bcbe9f29949f1545953a17145bf732aa0 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Tue, 12 Jul 2022 12:29:01 -0700 Subject: x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S This commit doesn't affect libc.so.6, it's just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch. --- sysdeps/x86_64/multiarch/strlen-vec.S | 267 ---------------------------------- 1 file changed, 267 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S (limited to 'sysdeps/x86_64/multiarch/strlen-vec.S') diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S deleted file mode 100644 index 874123d..0000000 --- a/sysdeps/x86_64/multiarch/strlen-vec.S +++ /dev/null @@ -1,267 +0,0 @@ -/* SSE2 version of strlen and SSE4.1 version of wcslen. - Copyright (C) 2012-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>.
*/ - -#include <sysdep.h> - -#ifdef AS_WCSLEN -# define PMINU pminud -# define PCMPEQ pcmpeqd -# define SHIFT_RETURN shrq $2, %rax -#else -# define PMINU pminub -# define PCMPEQ pcmpeqb -# define SHIFT_RETURN -#endif - -#ifndef SECTION -# define SECTION(p) p -#endif - -/* Long lived register in strlen(s), strnlen(s, n) are: - - %xmm3 - zero - %rdi - s - %r10 (s+n) & (~(64-1)) - %r11 s+n -*/ - - - .section SECTION(.text),"ax",@progbits -ENTRY(strlen) - -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ -#define FIND_ZERO \ - PCMPEQ (%rax), %xmm0; \ - PCMPEQ 16(%rax), %xmm1; \ - PCMPEQ 32(%rax), %xmm2; \ - PCMPEQ 48(%rax), %xmm3; \ - pmovmskb %xmm0, %esi; \ - pmovmskb %xmm1, %edx; \ - pmovmskb %xmm2, %r8d; \ - pmovmskb %xmm3, %ecx; \ - salq $16, %rdx; \ - salq $16, %rcx; \ - orq %rsi, %rdx; \ - orq %r8, %rcx; \ - salq $32, %rcx; \ - orq %rcx, %rdx; - -#ifdef AS_STRNLEN -/* Do not read anything when n==0. */ - test %RSI_LP, %RSI_LP - jne L(n_nonzero) - xor %rax, %rax - ret -L(n_nonzero): -# ifdef AS_WCSLEN -/* Check for overflow from maxlen * sizeof(wchar_t). If it would - overflow the only way this program doesn't have undefined behavior - is if there is a null terminator in valid memory so wcslen will - suffice. */ - mov %RSI_LP, %R10_LP - sar $62, %R10_LP - jnz __wcslen_sse4_1 - sal $2, %RSI_LP -# endif - -/* Initialize long lived registers. */ - add %RDI_LP, %RSI_LP - mov %RSI_LP, %R10_LP - and $-64, %R10_LP - mov %RSI_LP, %R11_LP -#endif - - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - movq %rdi, %rax - movq %rdi, %rcx - andq $4095, %rcx -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ - cmpq $4047, %rcx -/* We cannot unify this branching as it would be ~6 cycles slower. */ - ja L(cross_page) - -#ifdef AS_STRNLEN -/* Test if end is among first 64 bytes.
*/ -# define STRNLEN_PROLOG \ - mov %r11, %rsi; \ - subq %rax, %rsi; \ - andq $-64, %rax; \ - testq $-64, %rsi; \ - je L(strnlen_ret) -#else -# define STRNLEN_PROLOG andq $-64, %rax; -#endif - -/* Ignore bits in mask that come before start of string. */ -#define PROLOG(lab) \ - movq %rdi, %rcx; \ - xorq %rax, %rcx; \ - STRNLEN_PROLOG; \ - sarq %cl, %rdx; \ - test %rdx, %rdx; \ - je L(lab); \ - bsfq %rdx, %rax; \ - SHIFT_RETURN; \ - ret - -#ifdef AS_STRNLEN - andq $-16, %rax - FIND_ZERO -#else - /* Test first 16 bytes unaligned. */ - movdqu (%rax), %xmm4 - PCMPEQ %xmm0, %xmm4 - pmovmskb %xmm4, %edx - test %edx, %edx - je L(next48_bytes) - bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ - SHIFT_RETURN - ret - -L(next48_bytes): -/* Same as FIND_ZERO except we do not check first 16 bytes. */ - andq $-16, %rax - PCMPEQ 16(%rax), %xmm1 - PCMPEQ 32(%rax), %xmm2 - PCMPEQ 48(%rax), %xmm3 - pmovmskb %xmm1, %edx - pmovmskb %xmm2, %r8d - pmovmskb %xmm3, %ecx - salq $16, %rdx - salq $16, %rcx - orq %r8, %rcx - salq $32, %rcx - orq %rcx, %rdx -#endif - - /* When no zero byte is found xmm1-3 are zero so we do not have to - zero them. */ - PROLOG(loop) - - .p2align 4 -L(cross_page): - andq $-64, %rax - FIND_ZERO - PROLOG(loop_init) - -#ifdef AS_STRNLEN -/* We must do this check to correctly handle strnlen (s, -1). */ -L(strnlen_ret): - bts %rsi, %rdx - sarq %cl, %rdx - test %rdx, %rdx - je L(loop_init) - bsfq %rdx, %rax - SHIFT_RETURN - ret -#endif - .p2align 4 -L(loop_init): - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 -#ifdef AS_STRNLEN - .p2align 4 -L(loop): - - addq $64, %rax - cmpq %rax, %r10 - je L(exit_end) - - movdqa (%rax), %xmm0 - PMINU 16(%rax), %xmm0 - PMINU 32(%rax), %xmm0 - PMINU 48(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit) - jmp L(loop) - - .p2align 4 -L(exit_end): - cmp %rax, %r11 - je L(first) /* Do not read when end is at page boundary. 
*/ - pxor %xmm0, %xmm0 - FIND_ZERO - -L(first): - bts %r11, %rdx - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - - .p2align 4 -L(exit): - pxor %xmm0, %xmm0 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - -#else - - /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ - .p2align 4 -L(loop): - - movdqa 64(%rax), %xmm0 - PMINU 80(%rax), %xmm0 - PMINU 96(%rax), %xmm0 - PMINU 112(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit64) - - subq $-128, %rax - - movdqa (%rax), %xmm0 - PMINU 16(%rax), %xmm0 - PMINU 32(%rax), %xmm0 - PMINU 48(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit0) - jmp L(loop) - - .p2align 4 -L(exit64): - addq $64, %rax -L(exit0): - pxor %xmm0, %xmm0 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - -#endif - -END(strlen) -- cgit v1.1