aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorliuhongt <hongtao.liu@intel.com>2022-03-30 20:35:55 +0800
committerliuhongt <hongtao.liu@intel.com>2022-04-05 12:51:37 +0800
commite3174d6183e5c042e822d9feabb670235b737441 (patch)
tree3cc052908840aea684e965682f7eec3d2eebc39b /gcc
parent418967ca275853a570b0ae566d7022ff38e7cd0d (diff)
downloadgcc-e3174d6183e5c042e822d9feabb670235b737441.zip
gcc-e3174d6183e5c042e822d9feabb670235b737441.tar.gz
gcc-e3174d6183e5c042e822d9feabb670235b737441.tar.bz2
Split vector load from parm_decl to elemental loads to avoid STLF stalls.
Since the CFG is freed before machine_reorg, the window is only roughly calculated from the insn layout. Also, based on an experiment on CLX, the window size is set to 64. Currently only V2DFmode loads are handled, since they don't need any scratch registers, and that is sufficient to recover cray performance at -O2 compared to GCC 11. gcc/ChangeLog: PR target/101908 * config/i386/i386.cc (ix86_split_stlf_stall_load): New function. (ix86_reorg): Call ix86_split_stlf_stall_load. * config/i386/i386.opt (-param=x86-stlf-window-ninsns=): New param. gcc/testsuite/ChangeLog: * gcc.target/i386/pr101908-1.c: New test. * gcc.target/i386/pr101908-2.c: New test. * gcc.target/i386/pr101908-3.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/config/i386/i386.cc61
-rw-r--r--gcc/config/i386/i386.opt4
-rw-r--r--gcc/testsuite/gcc.target/i386/pr101908-1.c12
-rw-r--r--gcc/testsuite/gcc.target/i386/pr101908-2.c12
-rw-r--r--gcc/testsuite/gcc.target/i386/pr101908-3.c14
5 files changed, 103 insertions, 0 deletions
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d84047a..c959b71 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21942,6 +21942,65 @@ ix86_seh_fixup_eh_fallthru (void)
emit_insn_after (gen_nops (const1_rtx), insn);
}
}
+/* Split vector load from parm_decl to elemental loads to avoid STLF
+ (store-to-load forwarding) stalls.
+
+ Walk the insn chain starting at get_insns ().  For every single-set
+ insn whose source is a V2DFmode MEM whose MEM_EXPR has a PARM_DECL as
+ its base address, replace the load with an sse2_loadlpd of the DFmode
+ half at offset 0 followed by an sse2_loadhpd of the DFmode half at
+ offset 8.  The walk ends at the first unconditional jump, return or
+ call, or once more than x86_stlf_window_ninsns non-debug insns have
+ been counted.  When a detailed dump is requested, the original insn
+ and both replacement patterns are printed to dump_file. */
+static void
+ix86_split_stlf_stall_load ()
+{
+ rtx_insn* insn, *start = get_insns ();
+ unsigned window = 0; /* Count of non-debug insns visited so far.  */
+
+ for (insn = start; insn; insn = NEXT_INSN (insn))
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+ window++;
+ /* Experiment on CLX: when 64 vaddps %xmm18, %xmm19, %xmm20 (no
+ dependence between each other, just emulating the pipeline) are
+ inserted before the stalled load, the STLF stall case is as fast as
+ the no-stall case.  So give up once more than
+ x86_stlf_window_ninsns insns have been counted.
+ Since CFG is freed before machine_reorg, just do a rough
+ calculation of the window according to the layout. */
+ if (window > (unsigned) x86_stlf_window_ninsns)
+ return;
+
+ if (any_uncondjump_p (insn)
+ || ANY_RETURN_P (PATTERN (insn))
+ || CALL_P (insn))
+ return; /* Stop the whole scan at the first jump, return or call.  */
+
+ rtx set = single_set (insn);
+ if (!set)
+ continue;
+ rtx src = SET_SRC (set);
+ if (!MEM_P (src)
+ /* Only handle V2DFmode load since it doesn't need any scratch
+ register. */
+ || GET_MODE (src) != E_V2DFmode
+ || !MEM_EXPR (src)
+ || TREE_CODE (get_base_address (MEM_EXPR (src))) != PARM_DECL)
+ continue;
+
+ rtx zero = CONST0_RTX (V2DFmode);
+ rtx dest = SET_DEST (set);
+ rtx m = adjust_address (src, DFmode, 0); /* DFmode half at offset 0.  */
+ rtx loadlpd = gen_sse2_loadlpd (dest, zero, m);
+ emit_insn_before (loadlpd, insn); /* New insn placed ahead of INSN.  */
+ m = adjust_address (src, DFmode, 8); /* DFmode half at offset 8.  */
+ rtx loadhpd = gen_sse2_loadhpd (dest, dest, m);
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fputs ("Due to potential STLF stall, split instruction:\n",
+ dump_file);
+ print_rtl_single (dump_file, insn);
+ fputs ("To:\n", dump_file);
+ print_rtl_single (dump_file, loadlpd);
+ print_rtl_single (dump_file, loadhpd);
+ }
+ PATTERN (insn) = loadhpd; /* INSN itself is reused for the second load.  */
+ INSN_CODE (insn) = -1; /* Reset so the new pattern is recognized below.  */
+ gcc_assert (recog_memoized (insn) != -1);
+ }
+}
/* Implement machine specific optimizations. We implement padding of returns
for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
@@ -21957,6 +22016,8 @@ ix86_reorg (void)
if (optimize && optimize_function_for_speed_p (cfun))
{
+ if (TARGET_SSE2)
+ ix86_split_stlf_stall_load ();
if (TARGET_PAD_SHORT_FUNCTION)
ix86_pad_short_function ();
else if (TARGET_PAD_RETURNS)
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index d8e8656..a6b0e28 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1210,3 +1210,7 @@ Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX5
mdirect-extern-access
Target Var(ix86_direct_extern_access) Init(1)
Do not use GOT to access external symbols.
+
+-param=x86-stlf-window-ninsns=
+Target Joined UInteger Var(x86_stlf_window_ninsns) Init(64) Param
+Number of instructions above which the STLF stall penalty can be compensated.
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-1.c b/gcc/testsuite/gcc.target/i386/pr101908-1.c
new file mode 100644
index 0000000..33d9684
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]} } } */
+/* Both vectors are built from memory reached through the pointer
+ parameters x and y.  The dg-final directive above requires that no
+ movhpd appears in the assembly for this case (presumably because the
+ load is not split here -- confirm against the generated code). */
+struct X { double x[2]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X* x, struct X* y)
+{
+ return (v2df) {x->x[1], x->x[0] } + (v2df) { y->x[1], y->x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-2.c b/gcc/testsuite/gcc.target/i386/pr101908-2.c
new file mode 100644
index 0000000..45060b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-times {(?n)movhpd[ \t]+} "2" } } */
+/* Both structs are passed by value, so the vectors are built directly
+ from the parameters x and y.  The dg-final directive above requires
+ exactly two movhpd instructions (presumably one per split parameter
+ load -- confirm against the generated code). */
+struct X { double x[4]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+ return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101908-3.c b/gcc/testsuite/gcc.target/i386/pr101908-3.c
new file mode 100644
index 0000000..ddd3e8e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101908-3.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-final { scan-assembler-not {(?n)movhpd[ \t]+} } } */
+/* Same by-value parameters as the two-movhpd case, but bar () is called
+ before the vectors are built.  The dg-final directive above requires
+ that no movhpd appears (presumably because ix86_split_stlf_stall_load
+ stops scanning at the first call insn -- confirm against the
+ generated code). */
+struct X { double x[4]; };
+typedef double v2df __attribute__((vector_size(16)));
+
+extern void bar (void);
+v2df __attribute__((noipa))
+foo (struct X x, struct X y)
+{
+ bar ();
+ return (v2df) {x.x[1], x.x[0] } + (v2df) { y.x[1], y.x[0] };
+}