diff options
author | Roger Sayle <roger@nextmovesoftware.com> | 2022-06-07 07:49:40 +0100 |
---|---|---|
committer | Roger Sayle <roger@nextmovesoftware.com> | 2022-06-07 07:49:40 +0100 |
commit | c4320bde42c6497b701e2e6b8f1c5069bed19818 (patch) | |
tree | a32c7f647ec3f9ef4a79c2e97812b4c89e12007f | |
parent | 63eab5d5775da6c656f3527ad974b7dc25f5ad5e (diff) | |
download | gcc-c4320bde42c6497b701e2e6b8f1c5069bed19818.zip gcc-c4320bde42c6497b701e2e6b8f1c5069bed19818.tar.gz gcc-c4320bde42c6497b701e2e6b8f1c5069bed19818.tar.bz2 |
Recognize vpcmov in combine with -mxop on x86.
By way of an apology for causing PR target/105791, where I'd overlooked
the need to support V1TImode in TARGET_XOP's vpcmov instruction, this
patch further improves support for TARGET_XOP's vpcmov instruction, by
recognizing it in combine.
Currently, the test case:
typedef int v4si __attribute__ ((vector_size (16)));
v4si foo(v4si c, v4si t, v4si f)
{
return (c&t)|(~c&f);
}
on x86_64 with -O2 -mxop generates:
vpxor %xmm2, %xmm1, %xmm1
vpand %xmm0, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm0
ret
but with this patch now generates:
vpcmov %xmm0, %xmm2, %xmm1, %xmm0
ret
On its own, the new combine splitter works fine on TARGET_64BIT, but
alas with -m32 combine incorrectly thinks the replacement instruction
is more expensive, as IF_THEN_ELSE isn't currently/correctly handled
in ix86_rtx_costs. So to avoid the need for a target selector in the
new testcase, I've updated ix86_rtx_costs to report that AMD's vpcmov
has a latency of two cycles [it's now an obsolete instruction set
extension and there's unlikely to ever be a processor where this
instruction has a different timing], and while there I also added
rtx_costs for x86_64's integer conditional move instructions (which
have single cycle latency).
2022-06-07 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386.cc (ix86_rtx_costs): Add a new case for
IF_THEN_ELSE, and provide costs for TARGET_XOP's vpcmov and
TARGET_CMOVE's (scalar integer) conditional moves.
* config/i386/sse.md (define_split): Recognize XOP's vpcmov
from its equivalent (canonical) pxor;pand;pxor sequence.
gcc/testsuite/ChangeLog
* gcc.target/i386/xop-pcmov3.c: New test case.
-rw-r--r-- | gcc/config/i386/i386.cc | 31 | ||||
-rw-r--r-- | gcc/config/i386/sse.md | 24 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/xop-pcmov3.c | 10 |
3 files changed, 65 insertions, 0 deletions
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 11f4ddf..0823161 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -21009,6 +21009,37 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } return false; + case IF_THEN_ELSE: + if (TARGET_XOP + && VECTOR_MODE_P (mode) + && (GET_MODE_SIZE (mode) == 16 || GET_MODE_SIZE (mode) == 32)) + { + /* vpcmov. */ + *total = speed ? COSTS_N_INSNS (2) : COSTS_N_BYTES (6); + if (!REG_P (XEXP (x, 0))) + *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); + if (!REG_P (XEXP (x, 1))) + *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); + if (!REG_P (XEXP (x, 2))) + *total += rtx_cost (XEXP (x, 2), mode, code, 2, speed); + return true; + } + else if (TARGET_CMOVE + && SCALAR_INT_MODE_P (mode) + && GET_MODE_SIZE (mode) <= UNITS_PER_WORD) + { + /* cmov. */ + *total = COSTS_N_INSNS (1); + if (!REG_P (XEXP (x, 0))) + *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); + if (!REG_P (XEXP (x, 1))) + *total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); + if (!REG_P (XEXP (x, 2))) + *total += rtx_cost (XEXP (x, 2), mode, code, 2, speed); + return true; + } + return false; + default: return false; } diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 62688f8..3ca89b9 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -23868,6 +23868,30 @@ "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "sse4arg")]) +;; Recognize XOP's vpcmov from canonical (xor (and (xor t f) c) f) +(define_split + [(set (match_operand:V_128_256 0 "register_operand") + (xor:V_128_256 + (and:V_128_256 + (xor:V_128_256 (match_operand:V_128_256 1 "register_operand") + (match_operand:V_128_256 2 "register_operand")) + (match_operand:V_128_256 3 "nonimmediate_operand")) + (match_operand:V_128_256 4 "register_operand")))] + "TARGET_XOP + && (REGNO (operands[4]) == REGNO (operands[1]) + || REGNO (operands[4]) == REGNO (operands[2]))" + [(set (match_dup 0) 
(if_then_else:V_128_256 (match_dup 3) + (match_dup 5) + (match_dup 4)))] +{ + /* To handle the commutivity of XOR, operands[4] is either operands[1] + or operands[2], we need operands[5] to be the other one. */ + if (REGNO (operands[4]) == REGNO (operands[1])) + operands[5] = operands[2]; + else + operands[5] = operands[1]; +}) + ;; XOP horizontal add/subtract instructions (define_insn "xop_phadd<u>bw" [(set (match_operand:V8HI 0 "register_operand" "=x") diff --git a/gcc/testsuite/gcc.target/i386/xop-pcmov3.c b/gcc/testsuite/gcc.target/i386/xop-pcmov3.c new file mode 100644 index 0000000..6c40f33 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/xop-pcmov3.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mxop" } */ +typedef int v4si __attribute__ ((vector_size (16))); + +v4si foo(v4si c, v4si t, v4si f) +{ + return (c&t)|(~c&f); +} + +/* { dg-final { scan-assembler "vpcmov" } } */ |