aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorliuhongt <hongtao.liu@intel.com>2024-06-26 13:52:24 +0800
committerliuhongt <hongtao.liu@intel.com>2024-07-01 09:09:59 +0800
commite62ea4fb8ffcab06ddd02f26db91b29b7270743f (patch)
tree974cc9322edd39a38bb5886c55a1ea892f28dc6e /gcc
parent8e1fa107a63b2e160b6bf69de4fe163dd3cebd80 (diff)
downloadgcc-e62ea4fb8ffcab06ddd02f26db91b29b7270743f.zip
gcc-e62ea4fb8ffcab06ddd02f26db91b29b7270743f.tar.gz
gcc-e62ea4fb8ffcab06ddd02f26db91b29b7270743f.tar.bz2
Enable flate-combine.
Move pass_stv2 and pass_rpad after the pre_reload pass_late_combine, and also define target_insn_cost to prevent the post_reload pass_late_combine from reverting the optimization done in pass_rpad. Adjust testcases since pass_late_combine generates better code but breaks the assembly scans, i.e. under a 32-bit target, gcc used to generate a broadcast from the stack and then do the real operation. After flate_combine, they're combined into embedded broadcast operations. gcc/ChangeLog: * config/i386/i386-features.cc (ix86_rpad_gate): New function. * config/i386/i386-options.cc (ix86_override_options_after_change): Don't disable flate_combine. * config/i386/i386-passes.def: Move pass_stv2 and pass_rpad after pre_reload pass_late_combine. * config/i386/i386-protos.h (ix86_rpad_gate): New declaration. * config/i386/i386.cc (ix86_insn_cost): New function. (TARGET_INSN_COST): Define. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512f-broadcast-pr87767-1.c: Adjust testcase. * gcc.target/i386/avx512f-broadcast-pr87767-5.c: Ditto. * gcc.target/i386/avx512f-fmadd-sf-zmm-7.c: Ditto. * gcc.target/i386/avx512f-fmsub-sf-zmm-7.c: Ditto. * gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c: Ditto. * gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c: Ditto. * gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Ditto. * gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Ditto. * gcc.target/i386/pr91333.c: Ditto. * gcc.target/i386/vect-strided-4.c: Ditto.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/i386/i386-features.cc16
-rw-r--r--gcc/config/i386/i386-options.cc4
-rw-r--r--gcc/config/i386/i386-passes.def4
-rw-r--r--gcc/config/i386/i386-protos.h1
-rw-r--r--gcc/config/i386/i386.cc18
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c4
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c1
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c4
-rw-r--r--gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/pr91333.c2
-rw-r--r--gcc/testsuite/gcc.target/i386/vect-strided-4.c2
15 files changed, 42 insertions, 24 deletions
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 607d199..fc224ed 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -2995,6 +2995,16 @@ make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
return new pass_insert_endbr_and_patchable_area (ctxt);
}
+bool
+ix86_rpad_gate ()
+{
+ return (TARGET_AVX
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ && TARGET_SSE_MATH
+ && optimize
+ && optimize_function_for_speed_p (cfun));
+}
+
/* At entry of the nearest common dominator for basic blocks with
conversions/rcp/sqrt/rsqrt/round, generate a single
vxorps %xmmN, %xmmN, %xmmN
@@ -3232,11 +3242,7 @@ public:
/* opt_pass methods: */
bool gate (function *) final override
{
- return (TARGET_AVX
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
- && TARGET_SSE_MATH
- && optimize
- && optimize_function_for_speed_p (cfun));
+ return ix86_rpad_gate ();
}
unsigned int execute (function *) final override
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 9c12d49..1ef2c71a 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1944,10 +1944,6 @@ ix86_override_options_after_change (void)
flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
}
- /* Late combine tends to undo some of the effects of STV and RPAD,
- by combining instructions back to their original form. */
- if (!OPTION_SET_P (flag_late_combine_instructions))
- flag_late_combine_instructions = 0;
}
/* Clear stack slot assignments remembered from previous functions.
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 7d96766..2d29f65 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -25,11 +25,11 @@ along with GCC; see the file COPYING3. If not see
*/
INSERT_PASS_AFTER (pass_postreload_cse, 1, pass_insert_vzeroupper);
- INSERT_PASS_AFTER (pass_combine, 1, pass_stv, false /* timode_p */);
+ INSERT_PASS_AFTER (pass_late_combine, 1, pass_stv, false /* timode_p */);
/* Run the 64-bit STV pass before the CSE pass so that CONST0_RTX and
CONSTM1_RTX generated by the STV pass can be CSEed. */
INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */);
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area);
- INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
+ INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 1a76090..68f5739 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -425,6 +425,7 @@ extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
(gcc::context *);
extern bool ix86_has_no_direct_extern_access;
+extern bool ix86_rpad_gate ();
/* In i386-expand.cc. */
bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 6f742ef..bd74111 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21382,6 +21382,22 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
}
}
+static int
+ix86_insn_cost (rtx_insn *insn, bool speed)
+{
+ int insn_cost = 0;
+ /* Add extra cost so that the post_reload late_combine does not revert
+ the optimization done in pass_rpad. */
+ if (reload_completed
+ && ix86_rpad_gate ()
+ && recog_memoized (insn) >= 0
+ && get_attr_avx_partial_xmm_update (insn)
+ == AVX_PARTIAL_XMM_UPDATE_TRUE)
+ insn_cost += COSTS_N_INSNS (3);
+
+ return insn_cost + pattern_cost (PATTERN (insn), speed);
+}
+
/* Compute a (partial) cost for rtx X. Return true if the complete
cost has been computed, and false if subexpressions should be
scanned. In either case, *TOTAL contains the cost result. */
@@ -26528,6 +26544,8 @@ static const scoped_attribute_specs *const ix86_attribute_table[] =
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
+#undef TARGET_INSN_COST
+#define TARGET_INSN_COST ix86_insn_cost
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
index 138dbb4..3a50749 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
@@ -3,8 +3,8 @@
/* { dg-options "-O2 -mavx512f -mavx512dq" } */
/* { dg-additional-options "-fno-PIE" { target ia32 } } */
/* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } } }
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } } */
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } } */
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %zmm\[0-9\]+" 3 } } */
/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %zmm\[0-9\]+" 3 { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
index d22251b..ea2f648 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
@@ -3,7 +3,6 @@
/* { dg-options "-O2 -mavx512f" } */
/* { dg-additional-options "-fno-PIE" { target ia32 } } */
/* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } } }
-/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to8\\\}" { target ia32 } } } */
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %zmm\[0-9\]+" 4 } } */
/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %zmm\[0-9\]+" 4 { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c b/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c
index 8c11720..bbcc5ed 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "vfmadd...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */
#define type __m512
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c
index cc705af..fc72dd6 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "vfmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */
#define type __m512
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c b/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c
index db5c346..342de48 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "vfnmadd...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */
#define type __m512
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c b/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c
index 7815251..f56a3f8 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "vfnmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */
#define type __m512
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
index e6df4d2..0889844 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
@@ -3,8 +3,8 @@
/* { dg-options "-O2 -mavx512f -mavx512vl -mavx512dq" } */
/* { dg-additional-options "-fno-PIE" { target ia32 } } */
/* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } } }
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 } } */
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } } */
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 3 } } */
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
index ebdc361..c57a2e2 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
@@ -3,8 +3,6 @@
/* { dg-options "-O2 -mavx512f -mavx512vl" } */
/* { dg-additional-options "-fno-PIE" { target ia32 } } */
/* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } } }
-/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to2\\\}" { target ia32 } } } */
-/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to4\\\}" { target ia32 } } } */
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 4 } } */
/* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 4 } } */
/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %xmm\[0-9\]+" 4 { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr91333.c b/gcc/testsuite/gcc.target/i386/pr91333.c
index 2bdff871..b4940b5 100644
--- a/gcc/testsuite/gcc.target/i386/pr91333.c
+++ b/gcc/testsuite/gcc.target/i386/pr91333.c
@@ -1,6 +1,6 @@
/* { dg-do compile { target { ! ia32 } } } */
/* { dg-options "-O2 -mavx" } */
-/* { dg-final { scan-assembler-times "vmovapd|vmovsd" 3 } } */
+/* { dg-final { scan-assembler-times "vmovapd|vmovsd" 2 } } */
static inline double g (double x){
asm volatile ("" : "+x" (x));
diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-4.c b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
index dd92292..3fb9f07 100644
--- a/gcc/testsuite/gcc.target/i386/vect-strided-4.c
+++ b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
@@ -15,6 +15,6 @@ void foo (int * __restrict a, int * __restrict b, int *c, int s)
/* Vectorization factor two, two two-element stores to a using movq
and two two-element stores to b via pextrq/movhps of the high part. */
-/* { dg-final { scan-assembler-times "movq" 2 } } */
+/* { dg-final { scan-assembler-times "movq\[\t ]+%xmm\[0-9]" 2 } } */
/* { dg-final { scan-assembler-times "pextrq" 2 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "movhps" 2 { target { ia32 } } } } */