aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorH.J. Lu <hongjiu.lu@intel.com>2019-02-22 15:54:08 +0000
committerH.J. Lu <hjl@gcc.gnu.org>2019-02-22 07:54:08 -0800
commitf14322806253059eb49011c2cfdbae4c85ad47e4 (patch)
treeec05edc6d020eb2a3f7877c68288cd415f08d978
parent965779b4ad0f0abbdc8ab0addd2fae14165a08f4 (diff)
downloadgcc-f14322806253059eb49011c2cfdbae4c85ad47e4.zip
gcc-f14322806253059eb49011c2cfdbae4c85ad47e4.tar.gz
gcc-f14322806253059eb49011c2cfdbae4c85ad47e4.tar.bz2
i386: Add pass_remove_partial_avx_dependency
With -mavx, for $ cat foo.i extern float f; extern double d; extern int i; void foo (void) { d = f; f = i; } we need to generate vxorp[ds] %xmmN, %xmmN, %xmmN ... vcvtss2sd f(%rip), %xmmN, %xmmX ... vcvtsi2ss i(%rip), %xmmN, %xmmY to avoid partial XMM register stall. This patch adds a pass to generate a single vxorps %xmmN, %xmmN, %xmmN at entry of the nearest dominator for basic blocks with SF/DF conversions, which is in the fake loop that contains the whole function, instead of generating one vxorp[ds] %xmmN, %xmmN, %xmmN for each SF/DF conversion. NB: The LCM algorithm isn't appropriate here since it may place a vxorps inside the loop. Simple testcase show this: $ cat badcase.c extern float f; extern double d; void foo (int n, int k) { for (int j = 0; j != n; j++) if (j < k) d = f; } It generates ... loop: if(j < k) vxorps %xmm0, %xmm0, %xmm0 vcvtss2sd f(%rip), %xmm0, %xmm0 ... loopend ... This is because LCM only works when there is a certain benifit. But for conditional branch, LCM wouldn't move vxorps %xmm0, %xmm0, %xmm0 out of loop. SPEC CPU 2017 on Intel Xeon with AVX512 shows: 1. The nearest dominator |RATE |Improvement| |500.perlbench_r | 0.55% | |538.imagick_r | 8.43% | |544.nab_r | 0.71% | 2. LCM |RATE |Improvement| |500.perlbench_r | -0.76% | |538.imagick_r | 7.96% | |544.nab_r | -0.13% | Performance impacts of SPEC CPU 2017 rate on Intel Xeon with AVX512 using -Ofast -flto -march=skylake-avx512 -funroll-loops before commit e739972ad6ad05e32a1dd5c29c0b950a4c4bd576 Author: uros <uros@138bc75d-0d04-0410-961f-82ee72b054a4> Date: Thu Jan 31 20:06:42 2019 +0000 PR target/89071 * config/i386/i386.md (*extendsfdf2): Split out reg->reg alternative to avoid partial SSE register stall for TARGET_AVX. (truncdfsf2): Ditto. (sse4_1_round<mode>2): Ditto. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@268427 138bc75d-0d04-0410-961f-82ee72b054a4 are: |INT RATE |Improvement| |500.perlbench_r | 0.55% | |502.gcc_r | 0.14% | |505.mcf_r | 0.08% | |523.xalancbmk_r | 0.18% | |525.x264_r |-0.49% | |531.deepsjeng_r |-0.04% | |541.leela_r |-0.26% | |548.exchange2_r |-0.3% | |557.xz_r |BuildSame| |FP RATE |Improvement| |503.bwaves_r |-0.29% | |507.cactuBSSN_r | 0.04% | |508.namd_r |-0.74% | |510.parest_r |-0.01% | |511.povray_r | 2.23% | |519.lbm_r | 0.1% | |521.wrf_r | 0.49% | |526.blender_r | 0.13% | |527.cam4_r | 0.65% | |538.imagick_r | 8.43% | |544.nab_r | 0.71% | |549.fotonik3d_r | 0.15% | |554.roms_r | 0.08% | After commit e739972ad6ad05e32a1dd5c29c0b950a4c4bd576, on Skylake client, impacts on 538.imagick_r with -fno-unsafe-math-optimizations -march=native -Ofast -funroll-loops -flto 1. Size comparision: before: text data bss dec hex filename 2436377 8352 4528 2449257 255f69 imagick_r after: text data bss dec hex filename 2425249 8352 4528 2438129 2533f1 imagick_r 2. Number of vxorps: before after difference 4948 4135 -19.66% 3. Performance improvement: |RATE |Improvement| |538.imagick_r | 5.5% | gcc/ 2019-02-22 H.J. Lu <hongjiu.lu@intel.com> Hongtao Liu <hongtao.liu@intel.com> Sunil K Pandey <sunil.k.pandey@intel.com> PR target/87007 * config/i386/i386-passes.def: Add pass_remove_partial_avx_dependency. * config/i386/i386-protos.h (make_pass_remove_partial_avx_dependency): New. * config/i386/i386.c (make_pass_remove_partial_avx_dependency): New function. (pass_data_remove_partial_avx_dependency): New. (pass_remove_partial_avx_dependency): Likewise. (make_pass_remove_partial_avx_dependency): Likewise. * config/i386/i386.md (avx_partial_xmm_update): New attribute. (*extendsfdf2): Add avx_partial_xmm_update. (truncdfsf2): Likewise. (*float<SWI48:mode><MODEF:mode>2): Likewise. (SF/DF conversion splitters): Disabled for TARGET_AVX. gcc/testsuite/ 2019-02-22 H.J. Lu <hongjiu.lu@intel.com> Hongtao Liu <hongtao.liu@intel.com> Sunil K Pandey <sunil.k.pandey@intel.com> PR target/87007 * gcc.target/i386/pr87007-1.c: New test. * gcc.target/i386/pr87007-2.c: Likewise. Co-Authored-By: Hongtao Liu <hongtao.liu@intel.com> Co-Authored-By: Sunil K Pandey <sunil.k.pandey@intel.com> From-SVN: r269119
-rw-r--r--gcc/ChangeLog20
-rw-r--r--gcc/config/i386/i386-passes.def2
-rw-r--r--gcc/config/i386/i386-protos.h2
-rw-r--r--gcc/config/i386/i386.c174
-rw-r--r--gcc/config/i386/i386.md16
-rw-r--r--gcc/testsuite/ChangeLog8
-rw-r--r--gcc/testsuite/gcc.target/i386/pr87007-1.c15
-rw-r--r--gcc/testsuite/gcc.target/i386/pr87007-2.c18
8 files changed, 252 insertions, 3 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 9d79670..41deb1c 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,23 @@
+2019-02-22 H.J. Lu <hongjiu.lu@intel.com>
+ Hongtao Liu <hongtao.liu@intel.com>
+ Sunil K Pandey <sunil.k.pandey@intel.com>
+
+ PR target/87007
+ * config/i386/i386-passes.def: Add
+ pass_remove_partial_avx_dependency.
+ * config/i386/i386-protos.h
+ (make_pass_remove_partial_avx_dependency): New.
+ * config/i386/i386.c (make_pass_remove_partial_avx_dependency):
+ New function.
+ (pass_data_remove_partial_avx_dependency): New.
+ (pass_remove_partial_avx_dependency): Likewise.
+ (make_pass_remove_partial_avx_dependency): Likewise.
+ * config/i386/i386.md (avx_partial_xmm_update): New attribute.
+ (*extendsfdf2): Add avx_partial_xmm_update.
+ (truncdfsf2): Likewise.
+ (*float<SWI48:mode><MODEF:mode>2): Likewise.
+ (SF/DF conversion splitters): Disabled for TARGET_AVX.
+
2019-02-22 Aldy Hernandez <aldyh@redhat.com>
PR middle-end/85598
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 87cfd94..f4facdc 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -31,3 +31,5 @@ along with GCC; see the file COPYING3. If not see
INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */);
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbranch);
+
+ INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 2d60017..83645e8 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -369,3 +369,5 @@ class rtl_opt_pass;
extern rtl_opt_pass *make_pass_insert_vzeroupper (gcc::context *);
extern rtl_opt_pass *make_pass_stv (gcc::context *);
extern rtl_opt_pass *make_pass_insert_endbranch (gcc::context *);
+extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
+ (gcc::context *);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 81dfed1..e77653d 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2793,6 +2793,180 @@ make_pass_insert_endbranch (gcc::context *ctxt)
return new pass_insert_endbranch (ctxt);
}
+/* At entry of the nearest common dominator for basic blocks with
+ conversions, generate a single
+ vxorps %xmmN, %xmmN, %xmmN
+ for all
+ vcvtss2sd op, %xmmN, %xmmX
+ vcvtsd2ss op, %xmmN, %xmmX
+ vcvtsi2ss op, %xmmN, %xmmX
+ vcvtsi2sd op, %xmmN, %xmmX
+
+ NB: We want to generate only a single vxorps to cover the whole
+ function. The LCM algorithm isn't appropriate here since it may
+ place a vxorps inside the loop. */
+
+static unsigned int
+remove_partial_avx_dependency (void)
+{
+ timevar_push (TV_MACH_DEP);
+
+ bitmap_obstack_initialize (NULL);
+ bitmap convert_bbs = BITMAP_ALLOC (NULL);
+
+ basic_block bb;
+ rtx_insn *insn, *set_insn;
+ rtx set;
+ rtx v4sf_const0 = NULL_RTX;
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+
+ set = single_set (insn);
+ if (!set)
+ continue;
+
+ if (get_attr_avx_partial_xmm_update (insn)
+ != AVX_PARTIAL_XMM_UPDATE_TRUE)
+ continue;
+
+ if (!v4sf_const0)
+ v4sf_const0 = gen_reg_rtx (V4SFmode);
+
+ /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
+ SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and
+ vec_merge with subreg. */
+ rtx src = SET_SRC (set);
+ rtx dest = SET_DEST (set);
+ machine_mode dest_mode = GET_MODE (dest);
+
+ rtx zero;
+ machine_mode dest_vecmode;
+ if (dest_mode == E_SFmode)
+ {
+ dest_vecmode = V4SFmode;
+ zero = v4sf_const0;
+ }
+ else
+ {
+ dest_vecmode = V2DFmode;
+ zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
+ }
+
+ /* Change source to vector mode. */
+ src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
+ src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
+ GEN_INT (HOST_WIDE_INT_1U));
+ /* Change destination to vector mode. */
+ rtx vec = gen_reg_rtx (dest_vecmode);
+ /* Generate an XMM vector SET. */
+ set = gen_rtx_SET (vec, src);
+ set_insn = emit_insn_before (set, insn);
+ df_insn_rescan (set_insn);
+
+ src = gen_rtx_SUBREG (dest_mode, vec, 0);
+ set = gen_rtx_SET (dest, src);
+
+ /* Drop possible dead definitions. */
+ PATTERN (insn) = set;
+
+ INSN_CODE (insn) = -1;
+ recog_memoized (insn);
+ df_insn_rescan (insn);
+ bitmap_set_bit (convert_bbs, bb->index);
+ }
+ }
+
+ if (v4sf_const0)
+ {
+ calculate_dominance_info (CDI_DOMINATORS);
+ df_set_flags (DF_DEFER_INSN_RESCAN);
+ df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+ df_md_add_problem ();
+ df_analyze ();
+
+ /* (Re-)discover loops so that bb->loop_father can be used in the
+ analysis below. */
+ loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+
+ /* Generate a vxorps at entry of the nearest dominator for basic
+ blocks with conversions, which is in the the fake loop that
+ contains the whole function, so that there is only a single
+ vxorps in the whole function. */
+ bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
+ convert_bbs);
+ while (bb->loop_father->latch
+ != EXIT_BLOCK_PTR_FOR_FN (cfun))
+ bb = get_immediate_dominator (CDI_DOMINATORS,
+ bb->loop_father->header);
+
+ insn = BB_HEAD (bb);
+ if (!NONDEBUG_INSN_P (insn))
+ insn = next_nonnote_nondebug_insn (insn);
+ set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
+ set_insn = emit_insn_before (set, insn);
+ df_insn_rescan (set_insn);
+ df_process_deferred_rescans ();
+ loop_optimizer_finalize ();
+ }
+
+ bitmap_obstack_release (NULL);
+ BITMAP_FREE (convert_bbs);
+
+ timevar_pop (TV_MACH_DEP);
+ return 0;
+}
+
+namespace {
+
+const pass_data pass_data_remove_partial_avx_dependency =
+{
+ RTL_PASS, /* type */
+ "rpad", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_MACH_DEP, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_remove_partial_avx_dependency : public rtl_opt_pass
+{
+public:
+ pass_remove_partial_avx_dependency (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *)
+ {
+ return (TARGET_AVX
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ && TARGET_SSE_MATH
+ && optimize
+ && optimize_function_for_speed_p (cfun));
+ }
+
+ virtual unsigned int execute (function *)
+ {
+ return remove_partial_avx_dependency ();
+ }
+}; // class pass_rpad
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
+{
+ return new pass_remove_partial_avx_dependency (ctxt);
+}
+
/* Return true if a red-zone is in use. We can't use red-zone when
there are local indirect jumps, like "indirect_jump" or "tablejump",
which jumps to another place in the function, since "call" in the
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index b1ae88c..90f1660 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -778,6 +778,10 @@
(define_attr "i387_cw" "trunc,floor,ceil,uninitialized,any"
(const_string "any"))
+;; Define attribute to indicate AVX insns with partial XMM register update.
+(define_attr "avx_partial_xmm_update" "false,true"
+ (const_string "false"))
+
;; Define attribute to classify add/sub insns that consumes carry flag (CF)
(define_attr "use_carry" "0,1" (const_string "0"))
@@ -4392,6 +4396,7 @@
}
}
[(set_attr "type" "fmov,fmov,ssecvt,ssecvt")
+ (set_attr "avx_partial_xmm_update" "false,false,false,true")
(set_attr "prefix" "orig,orig,maybe_vex,maybe_vex")
(set_attr "mode" "SF,XF,DF,DF")
(set (attr "enabled")
@@ -4481,7 +4486,8 @@
[(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "nonimmediate_operand")))]
- "TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
+ "!TARGET_AVX
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
&& optimize_function_for_speed_p (cfun)
&& (!REG_P (operands[1])
|| (!TARGET_AVX && REGNO (operands[0]) != REGNO (operands[1])))
@@ -4558,6 +4564,7 @@
}
}
[(set_attr "type" "fmov,fmov,ssecvt,ssecvt")
+ (set_attr "avx_partial_xmm_update" "false,false,false,true")
(set_attr "mode" "SF")
(set (attr "enabled")
(if_then_else
@@ -4641,7 +4648,8 @@
[(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "nonimmediate_operand")))]
- "TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
+ "!TARGET_AVX
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
&& optimize_function_for_speed_p (cfun)
&& (!REG_P (operands[1])
|| (!TARGET_AVX && REGNO (operands[0]) != REGNO (operands[1])))
@@ -5017,6 +5025,7 @@
%vcvtsi2<MODEF:ssemodesuffix><SWI48:rex64suffix>\t{%1, %d0|%d0, %1}
%vcvtsi2<MODEF:ssemodesuffix><SWI48:rex64suffix>\t{%1, %d0|%d0, %1}"
[(set_attr "type" "fmov,sseicvt,sseicvt")
+ (set_attr "avx_partial_xmm_update" "false,true,true")
(set_attr "prefix" "orig,maybe_vex,maybe_vex")
(set_attr "mode" "<MODEF:MODE>")
(set (attr "prefix_rex")
@@ -5145,7 +5154,8 @@
(define_split
[(set (match_operand:MODEF 0 "sse_reg_operand")
(float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))]
- "TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
+ "!TARGET_AVX
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
&& optimize_function_for_speed_p (cfun)
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 1dd1d4b..5b27144 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,11 @@
+2019-02-22 H.J. Lu <hongjiu.lu@intel.com>
+ Hongtao Liu <hongtao.liu@intel.com>
+ Sunil K Pandey <sunil.k.pandey@intel.com>
+
+ PR target/87007
+ * gcc.target/i386/pr87007-1.c: New test.
+ * gcc.target/i386/pr87007-2.c: Likewise.
+
2019-02-22 Richard Biener <rguenther@suse.de>
PR tree-optimization/89440
diff --git a/gcc/testsuite/gcc.target/i386/pr87007-1.c b/gcc/testsuite/gcc.target/i386/pr87007-1.c
new file mode 100644
index 0000000..93cf1dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr87007-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern float f;
+extern double d;
+extern int i;
+
+void
+foo (void)
+{
+ d = f;
+ f = i;
+}
+
+/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr87007-2.c b/gcc/testsuite/gcc.target/i386/pr87007-2.c
new file mode 100644
index 0000000..cca7ae7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr87007-2.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake" } */
+
+extern float f;
+extern double d;
+extern int i;
+
+void
+foo (int n, int k)
+{
+ for (int i = 0; i != n; i++)
+ if(i < k)
+ d = f;
+ else
+ f = i;
+}
+
+/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 1 } } */