aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorliuhongt <hongtao.liu@intel.com>2024-06-11 21:22:42 +0800
committerliuhongt <hongtao.liu@intel.com>2024-06-14 15:02:42 +0800
commitd3fae2bea034edb001cd45d1d86c5ceef146899b (patch)
tree7b4042dc72dac54f49ed8c397f1d39f412688d10 /gcc
parent8b69efd9819f86b973d7a550e987ce455fce6d62 (diff)
downloadgcc-d3fae2bea034edb001cd45d1d86c5ceef146899b.zip
gcc-d3fae2bea034edb001cd45d1d86c5ceef146899b.tar.gz
gcc-d3fae2bea034edb001cd45d1d86c5ceef146899b.tar.bz2
Adjust ix86_rtx_costs for pternlog_operand_p.
r15-1100-gec985bc97a0157 improves the handling of ternlog instructions; GCC can now recognize many different pternlog_operand variants. This patch adjusts rtx_costs for that, so that pass_combine can reasonably generate more optimal vpternlog instructions.

For example, for avx512f-vpternlogd-3.c, with the patch, 2 vpternlog are combined into one:

1532,1533c1526
< vpternlogd $168, %zmm1, %zmm0, %zmm2
< vpternlogd $0x55, %zmm2, %zmm2, %zmm2
> vpternlogd $87, %zmm1, %zmm0, %zmm2
1732,1733c1725,1726
< vpand %xmm0, %xmm1, %xmm0
< vpternlogd $0x55, %zmm0, %zmm0, %zmm0
> vpternlogd $63, %zmm1, %zmm0, %zmm1
> vmovdqa %xmm1, %xmm0
1804,1805c1797
< vpternlogd $188, %zmm2, %zmm0, %zmm1
< vpternlogd $0x55, %zmm1, %zmm1, %zmm1
> vpternlogd $37, %zmm0, %zmm2, %zmm1

gcc/ChangeLog:
* config/i386/i386.cc (ix86_rtx_costs): Adjust rtx_cost for pternlog_operand under AVX512, also adjust VEC_DUPLICATE accordingly since vec_dup:mem can't be that cheap.

gcc/testsuite/ChangeLog:
* gcc.target/i386/avx2-pr98461.c: Scan either notl or vpternlog.
* gcc.target/i386/avx512f-pr96891-3.c: Also scan for inverted condition.
* gcc.target/i386/avx512f-vpternlogd-3.c: Adjust vpternlog number to 673.
* gcc.target/i386/avx512f-vpternlogd-4.c: Ditto.
* gcc.target/i386/avx512f-vpternlogd-5.c: Ditto.
* gcc.target/i386/sse2-v1ti-vne.c: Add -mno-avx512f.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/i386/i386.cc39
-rw-r--r--gcc/testsuite/gcc.target/i386/avx2-pr98461.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-3.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-4.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-5.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c2
7 files changed, 44 insertions, 7 deletions
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index c72f64d..d4ccc24 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21571,6 +21571,31 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
= speed ? ix86_tune_cost : &ix86_size_cost;
int src_cost;
+ /* Handling different vternlog variants. */
+ if ((GET_MODE_SIZE (mode) == 64
+ ? (TARGET_AVX512F && TARGET_EVEX512)
+ : (TARGET_AVX512VL
+ || (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)))
+ && GET_MODE_SIZE (mode) >= 16
+ && outer_code_i == SET
+ && ternlog_operand (x, mode))
+ {
+ rtx args[3];
+
+ args[0] = NULL_RTX;
+ args[1] = NULL_RTX;
+ args[2] = NULL_RTX;
+ int idx = ix86_ternlog_idx (x, args);
+ gcc_assert (idx >= 0);
+
+ *total = cost->sse_op;
+ for (int i = 0; i != 3; i++)
+ if (args[i])
+ *total += rtx_cost (args[i], GET_MODE (args[i]), UNSPEC, i, speed);
+ return true;
+ }
+
+
switch (code)
{
case SET:
@@ -22233,6 +22258,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
else if (XINT (x, 1) == UNSPEC_VTERNLOG)
{
*total = cost->sse_op;
+ *total += rtx_cost (XVECEXP (x, 0, 0), mode, code, 0, speed);
+ *total += rtx_cost (XVECEXP (x, 0, 1), mode, code, 1, speed);
+ *total += rtx_cost (XVECEXP (x, 0, 2), mode, code, 2, speed);
return true;
}
else if (XINT (x, 1) == UNSPEC_PTEST)
@@ -22260,12 +22288,21 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
case VEC_SELECT:
case VEC_CONCAT:
- case VEC_DUPLICATE:
/* ??? Assume all of these vector manipulation patterns are
recognizable. In which case they all pretty much have the
same cost. */
*total = cost->sse_op;
return true;
+ case VEC_DUPLICATE:
+ *total = rtx_cost (XEXP (x, 0),
+ GET_MODE (XEXP (x, 0)),
+ VEC_DUPLICATE, 0, speed);
+ /* It's broadcast instruction, not embedded broadcasting. */
+ if (outer_code == SET)
+ *total += cost->sse_op;
+
+ return true;
+
case VEC_MERGE:
mask = XEXP (x, 2);
/* This is masked instruction, assume the same cost,
diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr98461.c b/gcc/testsuite/gcc.target/i386/avx2-pr98461.c
index 15f49b8..225f2ab 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-pr98461.c
+++ b/gcc/testsuite/gcc.target/i386/avx2-pr98461.c
@@ -2,7 +2,7 @@
/* { dg-do compile } */
/* { dg-options "-O2 -mavx2 -masm=att" } */
/* { dg-final { scan-assembler-times "\tvpmovmskb\t" 6 } } */
-/* { dg-final { scan-assembler-times "\tnotl\t" 6 } } */
+/* { dg-final { scan-assembler-times "\t(?:notl|vpternlog\[dq\])\t" 6 } } */
/* { dg-final { scan-assembler-not "\tvpcmpeq" } } */
/* { dg-final { scan-assembler-not "\tvpxor" } } */
/* { dg-final { scan-assembler-not "\tvpandn" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c b/gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c
index 06db752..5b26081 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr96891-3.c
@@ -3,7 +3,7 @@
/* { dg-final { scan-assembler-not {not[bwlqd]\]} } } */
/* { dg-final { scan-assembler-times {(?n)vpcmp[bwdq][ \t]*\$5} 4} } */
/* { dg-final { scan-assembler-times {(?n)vpcmp[bwdq][ \t]*\$6} 4} } */
-/* { dg-final { scan-assembler-times {(?n)vpcmp[bwdq][ \t]*\$7} 4} } */
+/* { dg-final { scan-assembler-times {(?n)vpcmp[bwdq][ \t]*\$[37]} 4} } */
/* { dg-final { scan-assembler-times {(?n)vcmpp[sd][ \t]*\$5} 2} } */
/* { dg-final { scan-assembler-times {(?n)vcmpp[sd][ \t]*\$6} 2} } */
/* { dg-final { scan-assembler-times {(?n)vcmpp[sd][ \t]*\$7} 2} } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-3.c b/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-3.c
index fc66a9f..9ed4680 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-3.c
@@ -952,4 +952,4 @@ V foo_254_3(V a, V b, V c) { return (c|b)|a; }
V foo_255_1(V a, V b, V c) { return (V){~0,~0,~0,~0}; }
-/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]" 694 } } */
+/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]" 673 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-4.c b/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-4.c
index 1429650..eb39ffc 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-4.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-4.c
@@ -952,4 +952,4 @@ V foo_254_3(V a, V b, V c) { return (c|b)|a; }
V foo_255_1(V a, V b, V c) { return (V){~0,~0,~0,~0}; }
-/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]" 694 } } */
+/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]" 673 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-5.c b/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-5.c
index 3dbd954..85de5b0 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vpternlogd-5.c
@@ -952,4 +952,4 @@ V foo_254_3(V a, V b, V c) { return (c|b)|a; }
V foo_255_1(V a, V b, V c) { return (V){~0,~0,~0,~0}; }
-/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]" 679 } } */
+/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]" 673 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c
index 767b0e4..2394cff 100644
--- a/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c
@@ -1,5 +1,5 @@
/* { dg-do compile { target int128 } } */
-/* { dg-options "-O2 -msse2" } */
+/* { dg-options "-O2 -msse2 -mno-avx512f" } */
typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
typedef unsigned long long uv2di __attribute__ ((__vector_size__ (16)));
typedef unsigned int uv4si __attribute__ ((__vector_size__ (16)));