diff options
author | Roger Sayle <roger@nextmovesoftware.com> | 2022-05-13 22:26:29 +0100 |
---|---|---|
committer | Roger Sayle <roger@nextmovesoftware.com> | 2022-05-13 22:30:47 +0100 |
commit | d75d4293dcc029a7b00f902d9b03416c9439af4d (patch) | |
tree | 99bdb13b7dea11771d72f29801326046d4244765 | |
parent | 14e678a2c4a76433fd4029568d28530c921e11ee (diff) | |
download | gcc-d75d4293dcc029a7b00f902d9b03416c9439af4d.zip gcc-d75d4293dcc029a7b00f902d9b03416c9439af4d.tar.gz gcc-d75d4293dcc029a7b00f902d9b03416c9439af4d.tar.bz2 |
Improved V1TI (and V2DI) mode equality/inequality on x86_64.
This patch improves support for vector equality and inequality of
V1TImode vectors, and V2DImode vectors with sse2 but not sse4.
Consider the three functions below:
typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));
typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
uv4si eq_v4si(uv4si x, uv4si y) { return x == y; }
uv2di eq_v2di(uv2di x, uv2di y) { return x == y; }
uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; }
These all perform vector comparisons of 128bit SSE2 registers, generating
the result as a vector, where ~0 (all 1 bits) represents true and a zero
represents false. eq_v4si is trivially implemented by x86_64's pcmpeqd
instruction. This patch improves the other two cases:
For v2di, gcc -O2 currently generates:
movq %xmm0, %rdx
movq %xmm1, %rax
movdqa %xmm0, %xmm2
cmpq %rax, %rdx
movhlps %xmm2, %xmm3
movhlps %xmm1, %xmm4
sete %al
movq %xmm3, %rdx
movzbl %al, %eax
negq %rax
movq %rax, %xmm0
movq %xmm4, %rax
cmpq %rax, %rdx
sete %al
movzbl %al, %eax
negq %rax
movq %rax, %xmm5
punpcklqdq %xmm5, %xmm0
ret
but with this patch we now generate:
pcmpeqd %xmm0, %xmm1
pshufd $177, %xmm1, %xmm0
pand %xmm1, %xmm0
ret
where the results of a V4SI comparison are shuffled and bit-wise ANDed
to produce the desired result. There's no change in the code generated
for "-O2 -msse4" where the compiler generates a single "pcmpeqq" insn.
For V1TI mode, the results are equally dramatic, where the current -O2
output looks like:
movaps %xmm0, -40(%rsp)
movq -40(%rsp), %rax
movq -32(%rsp), %rdx
movaps %xmm1, -24(%rsp)
movq -24(%rsp), %rcx
movq -16(%rsp), %rsi
xorq %rcx, %rax
xorq %rsi, %rdx
orq %rdx, %rax
sete %al
xorl %edx, %edx
movzbl %al, %eax
negq %rax
adcq $0, %rdx
movq %rax, %xmm2
negq %rdx
movq %rdx, -40(%rsp)
movhps -40(%rsp), %xmm2
movdqa %xmm2, %xmm0
ret
with this patch we now generate:
pcmpeqd %xmm0, %xmm1
pshufd $177, %xmm1, %xmm0
pand %xmm1, %xmm0
pshufd $78, %xmm0, %xmm1
pand %xmm1, %xmm0
ret
performing a V2DI comparison, followed by a shuffle and pand, and with
-O2 -msse4 take advantages of SSE4.1's pcmpeqq:
pcmpeqq %xmm0, %xmm1
pshufd $78, %xmm1, %xmm0
pand %xmm1, %xmm0
ret
2022-05-13 Roger Sayle <roger@nextmovesoftware.com>
Uroš Bizjak <ubizjak@gmail.com>
gcc/ChangeLog
* config/i386/sse.md (vec_cmpeqv2div2di): Enable for TARGET_SSE2.
For !TARGET_SSE4_1, expand as a V4SI vector comparison, followed
by a pshufd and pand.
(vec_cmpeqv1tiv1ti): New define_expand implementing V1TImode
vector equality as a V2DImode vector comparison (see above),
followed by a pshufd and pand.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse2-v1ti-veq.c: New test case.
* gcc.target/i386/sse2-v1ti-vne.c: New test case.
-rw-r--r-- | gcc/config/i386/sse.md | 48 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c | 12 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c | 13 |
3 files changed, 71 insertions, 2 deletions
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 6578104..88fc521 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -4390,13 +4390,57 @@ (match_operator:V2DI 1 "" [(match_operand:V2DI 2 "register_operand") (match_operand:V2DI 3 "vector_operand")]))] - "TARGET_SSE4_1" + "TARGET_SSE2" { - bool ok = ix86_expand_int_vec_cmp (operands); + bool ok; + if (!TARGET_SSE4_1) + { + rtx ops[4]; + ops[0] = gen_reg_rtx (V4SImode); + ops[2] = gen_lowpart (V4SImode, force_reg (V2DImode, operands[2])); + ops[3] = gen_lowpart (V4SImode, force_reg (V2DImode, operands[3])); + ops[1] = gen_rtx_fmt_ee (GET_CODE (operands[1]), V4SImode, + ops[2], ops[3]); + ok = ix86_expand_int_vec_cmp (ops); + + rtx tmp1 = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_pshufd (tmp1, ops[0], GEN_INT (0xb1))); + + rtx tmp2 = gen_reg_rtx (V4SImode); + emit_insn (gen_andv4si3 (tmp2, tmp1, ops[0])); + + emit_move_insn (operands[0], gen_lowpart (V2DImode, tmp2)); + } + else + ok = ix86_expand_int_vec_cmp (operands); gcc_assert (ok); DONE; }) +(define_expand "vec_cmpeqv1tiv1ti" + [(set (match_operand:V1TI 0 "register_operand") + (match_operator:V1TI 1 "" + [(match_operand:V1TI 2 "register_operand") + (match_operand:V1TI 3 "vector_operand")]))] + "TARGET_SSE2" +{ + rtx dst = gen_reg_rtx (V2DImode); + rtx op1 = gen_lowpart (V2DImode, force_reg (V1TImode, operands[2])); + rtx op2 = gen_lowpart (V2DImode, force_reg (V1TImode, operands[3])); + rtx cmp = gen_rtx_fmt_ee (GET_CODE (operands[1]), V2DImode, op1, op2); + emit_insn (gen_vec_cmpeqv2div2di (dst, cmp, op1, op2)); + + rtx tmp1 = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_lowpart (V4SImode, dst); + emit_insn (gen_sse2_pshufd (tmp1, tmp2, GEN_INT (0x4e))); + + rtx tmp3 = gen_reg_rtx (V4SImode); + emit_insn (gen_andv4si3 (tmp3, tmp2, tmp1)); + + emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3)); + DONE; +}) + (define_expand "vcond<V_512:mode><VF_512:mode>" [(set (match_operand:V_512 0 "register_operand") (if_then_else:V_512 diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c new file mode 100644 index 0000000..b3837c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-veq.c @@ -0,0 +1,12 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -msse2" } */ +typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16))); +typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16))); +typedef unsigned int uv4si __attribute__ ((__vector_size__ (16))); + +uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x == y; } +uv2di eq_v2di(uv2di x, uv2di y) { return x == y; } +uv4si eq_v4si(uv4si x, uv4si y) { return x == y; } + +/* { dg-final { scan-assembler-times "pcmpeq" 3 } } */ +/* { dg-final { scan-assembler "pshufd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c new file mode 100644 index 0000000..767b0e4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c @@ -0,0 +1,13 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -msse2" } */ +typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16))); +typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16))); +typedef unsigned int uv4si __attribute__ ((__vector_size__ (16))); + +uv1ti eq_v1ti(uv1ti x, uv1ti y) { return x != y; } +uv2di eq_v2di(uv2di x, uv2di y) { return x != y; } +uv4si eq_v4si(uv4si x, uv4si y) { return x != y; } + +/* { dg-final { scan-assembler-times "pcmpeq" 6 } } */ +/* { dg-final { scan-assembler-times "pxor" 3 } } */ +/* { dg-final { scan-assembler "pshufd" } } */ |