diff options
author | H.J. Lu <hongjiu.lu@intel.com> | 2019-02-22 15:54:08 +0000 |
---|---|---|
committer | H.J. Lu <hjl@gcc.gnu.org> | 2019-02-22 07:54:08 -0800 |
commit | f14322806253059eb49011c2cfdbae4c85ad47e4 (patch) | |
tree | ec05edc6d020eb2a3f7877c68288cd415f08d978 | |
parent | 965779b4ad0f0abbdc8ab0addd2fae14165a08f4 (diff) | |
download | gcc-f14322806253059eb49011c2cfdbae4c85ad47e4.zip gcc-f14322806253059eb49011c2cfdbae4c85ad47e4.tar.gz gcc-f14322806253059eb49011c2cfdbae4c85ad47e4.tar.bz2 |
i386: Add pass_remove_partial_avx_dependency
With -mavx, for
$ cat foo.i
extern float f;
extern double d;
extern int i;
void
foo (void)
{
d = f;
f = i;
}
we need to generate
vxorp[ds] %xmmN, %xmmN, %xmmN
...
vcvtss2sd f(%rip), %xmmN, %xmmX
...
vcvtsi2ss i(%rip), %xmmN, %xmmY
to avoid partial XMM register stall. This patch adds a pass to generate
a single
vxorps %xmmN, %xmmN, %xmmN
at entry of the nearest dominator for basic blocks with SF/DF conversions,
which is in the fake loop that contains the whole function, instead of
generating one
vxorp[ds] %xmmN, %xmmN, %xmmN
for each SF/DF conversion.
NB: The LCM algorithm isn't appropriate here since it may place a vxorps
inside the loop. Simple testcase show this:
$ cat badcase.c
extern float f;
extern double d;
void
foo (int n, int k)
{
for (int j = 0; j != n; j++)
if (j < k)
d = f;
}
It generates
...
loop:
if(j < k)
vxorps %xmm0, %xmm0, %xmm0
vcvtss2sd f(%rip), %xmm0, %xmm0
...
loopend
...
This is because LCM only works when there is a certain benifit. But for
conditional branch, LCM wouldn't move
vxorps %xmm0, %xmm0, %xmm0
out of loop. SPEC CPU 2017 on Intel Xeon with AVX512 shows:
1. The nearest dominator
|RATE |Improvement|
|500.perlbench_r | 0.55% |
|538.imagick_r | 8.43% |
|544.nab_r | 0.71% |
2. LCM
|RATE |Improvement|
|500.perlbench_r | -0.76% |
|538.imagick_r | 7.96% |
|544.nab_r | -0.13% |
Performance impacts of SPEC CPU 2017 rate on Intel Xeon with AVX512
using
-Ofast -flto -march=skylake-avx512 -funroll-loops
before
commit e739972ad6ad05e32a1dd5c29c0b950a4c4bd576
Author: uros <uros@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Thu Jan 31 20:06:42 2019 +0000
PR target/89071
* config/i386/i386.md (*extendsfdf2): Split out reg->reg
alternative to avoid partial SSE register stall for TARGET_AVX.
(truncdfsf2): Ditto.
(sse4_1_round<mode>2): Ditto.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@268427 138bc75d-0d04-0410-961f-82ee72b054a4
are:
|INT RATE |Improvement|
|500.perlbench_r | 0.55% |
|502.gcc_r | 0.14% |
|505.mcf_r | 0.08% |
|523.xalancbmk_r | 0.18% |
|525.x264_r |-0.49% |
|531.deepsjeng_r |-0.04% |
|541.leela_r |-0.26% |
|548.exchange2_r |-0.3% |
|557.xz_r |BuildSame|
|FP RATE |Improvement|
|503.bwaves_r |-0.29% |
|507.cactuBSSN_r | 0.04% |
|508.namd_r |-0.74% |
|510.parest_r |-0.01% |
|511.povray_r | 2.23% |
|519.lbm_r | 0.1% |
|521.wrf_r | 0.49% |
|526.blender_r | 0.13% |
|527.cam4_r | 0.65% |
|538.imagick_r | 8.43% |
|544.nab_r | 0.71% |
|549.fotonik3d_r | 0.15% |
|554.roms_r | 0.08% |
After commit e739972ad6ad05e32a1dd5c29c0b950a4c4bd576, on Skylake client,
impacts on 538.imagick_r with
-fno-unsafe-math-optimizations -march=native -Ofast -funroll-loops -flto
1. Size comparision:
before:
text data bss dec hex filename
2436377 8352 4528 2449257 255f69 imagick_r
after:
text data bss dec hex filename
2425249 8352 4528 2438129 2533f1 imagick_r
2. Number of vxorps:
before after difference
4948 4135 -19.66%
3. Performance improvement:
|RATE |Improvement|
|538.imagick_r | 5.5% |
gcc/
2019-02-22 H.J. Lu <hongjiu.lu@intel.com>
Hongtao Liu <hongtao.liu@intel.com>
Sunil K Pandey <sunil.k.pandey@intel.com>
PR target/87007
* config/i386/i386-passes.def: Add
pass_remove_partial_avx_dependency.
* config/i386/i386-protos.h
(make_pass_remove_partial_avx_dependency): New.
* config/i386/i386.c (make_pass_remove_partial_avx_dependency):
New function.
(pass_data_remove_partial_avx_dependency): New.
(pass_remove_partial_avx_dependency): Likewise.
(make_pass_remove_partial_avx_dependency): Likewise.
* config/i386/i386.md (avx_partial_xmm_update): New attribute.
(*extendsfdf2): Add avx_partial_xmm_update.
(truncdfsf2): Likewise.
(*float<SWI48:mode><MODEF:mode>2): Likewise.
(SF/DF conversion splitters): Disabled for TARGET_AVX.
gcc/testsuite/
2019-02-22 H.J. Lu <hongjiu.lu@intel.com>
Hongtao Liu <hongtao.liu@intel.com>
Sunil K Pandey <sunil.k.pandey@intel.com>
PR target/87007
* gcc.target/i386/pr87007-1.c: New test.
* gcc.target/i386/pr87007-2.c: Likewise.
Co-Authored-By: Hongtao Liu <hongtao.liu@intel.com>
Co-Authored-By: Sunil K Pandey <sunil.k.pandey@intel.com>
From-SVN: r269119
-rw-r--r-- | gcc/ChangeLog | 20 | ||||
-rw-r--r-- | gcc/config/i386/i386-passes.def | 2 | ||||
-rw-r--r-- | gcc/config/i386/i386-protos.h | 2 | ||||
-rw-r--r-- | gcc/config/i386/i386.c | 174 | ||||
-rw-r--r-- | gcc/config/i386/i386.md | 16 | ||||
-rw-r--r-- | gcc/testsuite/ChangeLog | 8 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr87007-1.c | 15 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr87007-2.c | 18 |
8 files changed, 252 insertions, 3 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 9d79670..41deb1c 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,23 @@ +2019-02-22 H.J. Lu <hongjiu.lu@intel.com> + Hongtao Liu <hongtao.liu@intel.com> + Sunil K Pandey <sunil.k.pandey@intel.com> + + PR target/87007 + * config/i386/i386-passes.def: Add + pass_remove_partial_avx_dependency. + * config/i386/i386-protos.h + (make_pass_remove_partial_avx_dependency): New. + * config/i386/i386.c (make_pass_remove_partial_avx_dependency): + New function. + (pass_data_remove_partial_avx_dependency): New. + (pass_remove_partial_avx_dependency): Likewise. + (make_pass_remove_partial_avx_dependency): Likewise. + * config/i386/i386.md (avx_partial_xmm_update): New attribute. + (*extendsfdf2): Add avx_partial_xmm_update. + (truncdfsf2): Likewise. + (*float<SWI48:mode><MODEF:mode>2): Likewise. + (SF/DF conversion splitters): Disabled for TARGET_AVX. + 2019-02-22 Aldy Hernandez <aldyh@redhat.com> PR middle-end/85598 diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def index 87cfd94..f4facdc 100644 --- a/gcc/config/i386/i386-passes.def +++ b/gcc/config/i386/i386-passes.def @@ -31,3 +31,5 @@ along with GCC; see the file COPYING3. If not see INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */); INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbranch); + + INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 2d60017..83645e8 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -369,3 +369,5 @@ class rtl_opt_pass; extern rtl_opt_pass *make_pass_insert_vzeroupper (gcc::context *); extern rtl_opt_pass *make_pass_stv (gcc::context *); extern rtl_opt_pass *make_pass_insert_endbranch (gcc::context *); +extern rtl_opt_pass *make_pass_remove_partial_avx_dependency + (gcc::context *); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 81dfed1..e77653d 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -2793,6 +2793,180 @@ make_pass_insert_endbranch (gcc::context *ctxt) return new pass_insert_endbranch (ctxt); } +/* At entry of the nearest common dominator for basic blocks with + conversions, generate a single + vxorps %xmmN, %xmmN, %xmmN + for all + vcvtss2sd op, %xmmN, %xmmX + vcvtsd2ss op, %xmmN, %xmmX + vcvtsi2ss op, %xmmN, %xmmX + vcvtsi2sd op, %xmmN, %xmmX + + NB: We want to generate only a single vxorps to cover the whole + function. The LCM algorithm isn't appropriate here since it may + place a vxorps inside the loop. */ + +static unsigned int +remove_partial_avx_dependency (void) +{ + timevar_push (TV_MACH_DEP); + + bitmap_obstack_initialize (NULL); + bitmap convert_bbs = BITMAP_ALLOC (NULL); + + basic_block bb; + rtx_insn *insn, *set_insn; + rtx set; + rtx v4sf_const0 = NULL_RTX; + + FOR_EACH_BB_FN (bb, cfun) + { + FOR_BB_INSNS (bb, insn) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + + set = single_set (insn); + if (!set) + continue; + + if (get_attr_avx_partial_xmm_update (insn) + != AVX_PARTIAL_XMM_UPDATE_TRUE) + continue; + + if (!v4sf_const0) + v4sf_const0 = gen_reg_rtx (V4SFmode); + + /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF, + SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and + vec_merge with subreg. */ + rtx src = SET_SRC (set); + rtx dest = SET_DEST (set); + machine_mode dest_mode = GET_MODE (dest); + + rtx zero; + machine_mode dest_vecmode; + if (dest_mode == E_SFmode) + { + dest_vecmode = V4SFmode; + zero = v4sf_const0; + } + else + { + dest_vecmode = V2DFmode; + zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0); + } + + /* Change source to vector mode. */ + src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src); + src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero, + GEN_INT (HOST_WIDE_INT_1U)); + /* Change destination to vector mode. */ + rtx vec = gen_reg_rtx (dest_vecmode); + /* Generate an XMM vector SET. */ + set = gen_rtx_SET (vec, src); + set_insn = emit_insn_before (set, insn); + df_insn_rescan (set_insn); + + src = gen_rtx_SUBREG (dest_mode, vec, 0); + set = gen_rtx_SET (dest, src); + + /* Drop possible dead definitions. */ + PATTERN (insn) = set; + + INSN_CODE (insn) = -1; + recog_memoized (insn); + df_insn_rescan (insn); + bitmap_set_bit (convert_bbs, bb->index); + } + } + + if (v4sf_const0) + { + calculate_dominance_info (CDI_DOMINATORS); + df_set_flags (DF_DEFER_INSN_RESCAN); + df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); + df_md_add_problem (); + df_analyze (); + + /* (Re-)discover loops so that bb->loop_father can be used in the + analysis below. */ + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); + + /* Generate a vxorps at entry of the nearest dominator for basic + blocks with conversions, which is in the the fake loop that + contains the whole function, so that there is only a single + vxorps in the whole function. */ + bb = nearest_common_dominator_for_set (CDI_DOMINATORS, + convert_bbs); + while (bb->loop_father->latch + != EXIT_BLOCK_PTR_FOR_FN (cfun)) + bb = get_immediate_dominator (CDI_DOMINATORS, + bb->loop_father->header); + + insn = BB_HEAD (bb); + if (!NONDEBUG_INSN_P (insn)) + insn = next_nonnote_nondebug_insn (insn); + set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode)); + set_insn = emit_insn_before (set, insn); + df_insn_rescan (set_insn); + df_process_deferred_rescans (); + loop_optimizer_finalize (); + } + + bitmap_obstack_release (NULL); + BITMAP_FREE (convert_bbs); + + timevar_pop (TV_MACH_DEP); + return 0; +} + +namespace { + +const pass_data pass_data_remove_partial_avx_dependency = +{ + RTL_PASS, /* type */ + "rpad", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_remove_partial_avx_dependency : public rtl_opt_pass +{ +public: + pass_remove_partial_avx_dependency (gcc::context *ctxt) + : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return (TARGET_AVX + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && TARGET_SSE_MATH + && optimize + && optimize_function_for_speed_p (cfun)); + } + + virtual unsigned int execute (function *) + { + return remove_partial_avx_dependency (); + } +}; // class pass_rpad + +} // anon namespace + +rtl_opt_pass * +make_pass_remove_partial_avx_dependency (gcc::context *ctxt) +{ + return new pass_remove_partial_avx_dependency (ctxt); +} + /* Return true if a red-zone is in use. We can't use red-zone when there are local indirect jumps, like "indirect_jump" or "tablejump", which jumps to another place in the function, since "call" in the diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index b1ae88c..90f1660 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -778,6 +778,10 @@ (define_attr "i387_cw" "trunc,floor,ceil,uninitialized,any" (const_string "any")) +;; Define attribute to indicate AVX insns with partial XMM register update. +(define_attr "avx_partial_xmm_update" "false,true" + (const_string "false")) + ;; Define attribute to classify add/sub insns that consumes carry flag (CF) (define_attr "use_carry" "0,1" (const_string "0")) @@ -4392,6 +4396,7 @@ } } [(set_attr "type" "fmov,fmov,ssecvt,ssecvt") + (set_attr "avx_partial_xmm_update" "false,false,false,true") (set_attr "prefix" "orig,orig,maybe_vex,maybe_vex") (set_attr "mode" "SF,XF,DF,DF") (set (attr "enabled") @@ -4481,7 +4486,8 @@ [(set (match_operand:DF 0 "sse_reg_operand") (float_extend:DF (match_operand:SF 1 "nonimmediate_operand")))] - "TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed + "!TARGET_AVX + && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed && optimize_function_for_speed_p (cfun) && (!REG_P (operands[1]) || (!TARGET_AVX && REGNO (operands[0]) != REGNO (operands[1]))) @@ -4558,6 +4564,7 @@ } } [(set_attr "type" "fmov,fmov,ssecvt,ssecvt") + (set_attr "avx_partial_xmm_update" "false,false,false,true") (set_attr "mode" "SF") (set (attr "enabled") (if_then_else @@ -4641,7 +4648,8 @@ [(set (match_operand:SF 0 "sse_reg_operand") (float_truncate:SF (match_operand:DF 1 "nonimmediate_operand")))] - "TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed + "!TARGET_AVX + && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed && optimize_function_for_speed_p (cfun) && (!REG_P (operands[1]) || (!TARGET_AVX && REGNO (operands[0]) != REGNO (operands[1]))) @@ -5017,6 +5025,7 @@ %vcvtsi2<MODEF:ssemodesuffix><SWI48:rex64suffix>\t{%1, %d0|%d0, %1} %vcvtsi2<MODEF:ssemodesuffix><SWI48:rex64suffix>\t{%1, %d0|%d0, %1}" [(set_attr "type" "fmov,sseicvt,sseicvt") + (set_attr "avx_partial_xmm_update" "false,true,true") (set_attr "prefix" "orig,maybe_vex,maybe_vex") (set_attr "mode" "<MODEF:MODE>") (set (attr "prefix_rex") @@ -5145,7 +5154,8 @@ (define_split [(set (match_operand:MODEF 0 "sse_reg_operand") (float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))] - "TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed + "!TARGET_AVX + && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed && optimize_function_for_speed_p (cfun) && (!EXT_REX_SSE_REG_P (operands[0]) || TARGET_AVX512VL)" diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 1dd1d4b..5b27144 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,11 @@ +2019-02-22 H.J. Lu <hongjiu.lu@intel.com> + Hongtao Liu <hongtao.liu@intel.com> + Sunil K Pandey <sunil.k.pandey@intel.com> + + PR target/87007 + * gcc.target/i386/pr87007-1.c: New test. + * gcc.target/i386/pr87007-2.c: Likewise. + 2019-02-22 Richard Biener <rguenther@suse.de> PR tree-optimization/89440 diff --git a/gcc/testsuite/gcc.target/i386/pr87007-1.c b/gcc/testsuite/gcc.target/i386/pr87007-1.c new file mode 100644 index 0000000..93cf1dc --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr87007-1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake" } */ + +extern float f; +extern double d; +extern int i; + +void +foo (void) +{ + d = f; + f = i; +} + +/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr87007-2.c b/gcc/testsuite/gcc.target/i386/pr87007-2.c new file mode 100644 index 0000000..cca7ae7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr87007-2.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake" } */ + +extern float f; +extern double d; +extern int i; + +void +foo (int n, int k) +{ + for (int i = 0; i != n; i++) + if(i < k) + d = f; + else + f = i; +} + +/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 1 } } */ |