aboutsummaryrefslogtreecommitdiff
path: root/gcc/builtins.cc
diff options
context:
space:
mode:
author: Roger Sayle <roger@nextmovesoftware.com> 2022-05-13 22:26:29 +0100
committer: Roger Sayle <roger@nextmovesoftware.com> 2022-05-13 22:30:47 +0100
commit: d75d4293dcc029a7b00f902d9b03416c9439af4d (patch)
tree: 99bdb13b7dea11771d72f29801326046d4244765 /gcc/builtins.cc
parent: 14e678a2c4a76433fd4029568d28530c921e11ee (diff)
download: gcc-d75d4293dcc029a7b00f902d9b03416c9439af4d.zip
gcc-d75d4293dcc029a7b00f902d9b03416c9439af4d.tar.gz
gcc-d75d4293dcc029a7b00f902d9b03416c9439af4d.tar.bz2
Improved V1TI (and V2DI) mode equality/inequality on x86_64.
This patch improves support for vector equality and inequality of V1TImode vectors, and V2DImode vectors with sse2 but not sse4. Consider the three functions below:

    typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));
    typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
    typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));

    uv4si eq_v4si(uv4si x, uv4si y) { return x == y; }
    uv2di eq_v2di(uv2di x, uv2di y) { return x == y; }
    uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; }

These all perform vector comparisons of 128-bit SSE2 registers, generating the result as a vector, where ~0 (all 1 bits) represents true and a zero represents false. eq_v4si is trivially implemented by x86_64's pcmpeqd instruction. This patch improves the other two cases:

For v2di, gcc -O2 currently generates:

    movq %xmm0, %rdx
    movq %xmm1, %rax
    movdqa %xmm0, %xmm2
    cmpq %rax, %rdx
    movhlps %xmm2, %xmm3
    movhlps %xmm1, %xmm4
    sete %al
    movq %xmm3, %rdx
    movzbl %al, %eax
    negq %rax
    movq %rax, %xmm0
    movq %xmm4, %rax
    cmpq %rax, %rdx
    sete %al
    movzbl %al, %eax
    negq %rax
    movq %rax, %xmm5
    punpcklqdq %xmm5, %xmm0
    ret

but with this patch we now generate:

    pcmpeqd %xmm0, %xmm1
    pshufd $177, %xmm1, %xmm0
    pand %xmm1, %xmm0
    ret

where the results of a V4SI comparison are shuffled and bit-wise ANDed to produce the desired result. There's no change in the code generated for "-O2 -msse4" where the compiler generates a single "pcmpeqq" insn.
For V1TI mode, the results are equally dramatic, where the current -O2 output looks like:

    movaps %xmm0, -40(%rsp)
    movq -40(%rsp), %rax
    movq -32(%rsp), %rdx
    movaps %xmm1, -24(%rsp)
    movq -24(%rsp), %rcx
    movq -16(%rsp), %rsi
    xorq %rcx, %rax
    xorq %rsi, %rdx
    orq %rdx, %rax
    sete %al
    xorl %edx, %edx
    movzbl %al, %eax
    negq %rax
    adcq $0, %rdx
    movq %rax, %xmm2
    negq %rdx
    movq %rdx, -40(%rsp)
    movhps -40(%rsp), %xmm2
    movdqa %xmm2, %xmm0
    ret

with this patch we now generate:

    pcmpeqd %xmm0, %xmm1
    pshufd $177, %xmm1, %xmm0
    pand %xmm1, %xmm0
    pshufd $78, %xmm0, %xmm1
    pand %xmm1, %xmm0
    ret

performing a V2DI comparison, followed by a shuffle and pand, and with -O2 -msse4 take advantage of SSE4.1's pcmpeqq:

    pcmpeqq %xmm0, %xmm1
    pshufd $78, %xmm1, %xmm0
    pand %xmm1, %xmm0
    ret

2022-05-13 Roger Sayle <roger@nextmovesoftware.com>
           Uroš Bizjak <ubizjak@gmail.com>

gcc/ChangeLog
    * config/i386/sse.md (vec_cmpeqv2div2di): Enable for TARGET_SSE2. For !TARGET_SSE4_1, expand as a V4SI vector comparison, followed by a pshufd and pand.
    (vec_cmpeqv1tiv1ti): New define_expand implementing V1TImode vector equality as a V2DImode vector comparison (see above), followed by a pshufd and pand.

gcc/testsuite/ChangeLog
    * gcc.target/i386/sse2-v1ti-veq.c: New test case.
    * gcc.target/i386/sse2-v1ti-vne.c: New test case.
Diffstat (limited to 'gcc/builtins.cc')
0 files changed, 0 insertions, 0 deletions