aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/x86_64
diff options
context:
space:
mode:
authorH.J. Lu <hongjiu.lu@intel.com>2010-01-13 07:51:48 -0800
committerUlrich Drepper <drepper@redhat.com>2010-01-13 07:51:48 -0800
commit5a7af22fbb661e351473bfde639beeac16fc32f7 (patch)
tree70b559065a13a15cf87e1907c35cda904f10a64f /sysdeps/x86_64
parent52e96a8092e4c0bfef5bd9748b35861bba6f6b88 (diff)
downloadglibc-5a7af22fbb661e351473bfde639beeac16fc32f7.zip
glibc-5a7af22fbb661e351473bfde639beeac16fc32f7.tar.gz
glibc-5a7af22fbb661e351473bfde639beeac16fc32f7.tar.bz2
Unroll the loop x86-64 SSE4.2 strlen.
Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r--sysdeps/x86_64/multiarch/strlen.S60
1 files changed, 45 insertions, 15 deletions
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 509f9c9..f964113 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -46,28 +46,58 @@ END(strlen)
__strlen_sse42:
cfi_startproc
CALL_MCOUNT
- pxor %xmm2, %xmm2
- movq %rdi, %rcx
+ pxor %xmm1, %xmm1
+ movl %edi, %ecx
movq %rdi, %r8
andq $~15, %rdi
- movdqa %xmm2, %xmm1
- pcmpeqb (%rdi), %xmm2
- orl $0xffffffff, %esi
- subq %rdi, %rcx
- shll %cl, %esi
- pmovmskb %xmm2, %edx
- andl %esi, %edx
- jnz 1f
-
-2: pcmpistri $0x08, 16(%rdi), %xmm1
- leaq 16(%rdi), %rdi
- jnz 2b
+ xor %edi, %ecx
+ pcmpeqb (%rdi), %xmm1
+ pmovmskb %xmm1, %edx
+ shrl %cl, %edx
+ shll %cl, %edx
+ andl %edx, %edx
+ jnz L(less16bytes)
+ pxor %xmm1, %xmm1
+ .p2align 4
+L(more64bytes_loop):
+ pcmpistri $0x08, 16(%rdi), %xmm1
+ jz L(more32bytes)
+
+ pcmpistri $0x08, 32(%rdi), %xmm1
+ jz L(more48bytes)
+
+ pcmpistri $0x08, 48(%rdi), %xmm1
+ jz L(more64bytes)
+
+ add $64, %rdi
+ pcmpistri $0x08, (%rdi), %xmm1
+ jnz L(more64bytes_loop)
leaq (%rdi,%rcx), %rax
subq %r8, %rax
ret
-1: subq %r8, %rdi
+ .p2align 4
+L(more32bytes):
+ leaq 16(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more48bytes):
+ leaq 32(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more64bytes):
+ leaq 48(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(less16bytes):
+ subq %r8, %rdi
bsfl %edx, %eax
addq %rdi, %rax
ret