diff options
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config/i386/i386-features.cc | 107 | ||||
-rw-r--r-- | gcc/doc/invoke.texi | 17 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr120941-1.c | 49 |
3 files changed, 116 insertions, 57 deletions
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 53e86c8..9941e61 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3085,21 +3085,63 @@ ix86_rpad_gate () && optimize_function_for_speed_p (cfun)); } +enum x86_cse_kind +{ + X86_CSE_CONST0_VECTOR, + X86_CSE_CONSTM1_VECTOR, + X86_CSE_VEC_DUP +}; + +struct redundant_load +{ + /* Bitmap of basic blocks with broadcast instructions. */ + auto_bitmap bbs; + /* Bitmap of broadcast instructions. */ + auto_bitmap insns; + /* The broadcast inner scalar. */ + rtx val; + /* The inner scalar mode. */ + machine_mode mode; + /* The instruction which sets the inner scalar. Nullptr if the inner + scalar is applied to the whole function, instead of within the same + block. */ + rtx_insn *def_insn; + /* The widest broadcast source. */ + rtx broadcast_source; + /* The widest broadcast register. */ + rtx broadcast_reg; + /* The basic block of the broadcast instruction. */ + basic_block bb; + /* The number of broadcast instructions with the same inner scalar. */ + unsigned HOST_WIDE_INT count; + /* The threshold of broadcast instructions with the same inner + scalar. */ + unsigned int threshold; + /* The widest broadcast size in bytes. */ + unsigned int size; + /* Load kind. */ + x86_cse_kind kind; +}; + /* Generate a vector set, DEST = SRC, at entry of the nearest dominator for basic block map BBS, which is in the fake loop that contains the whole function, so that there is only a single vector set in the - whole function. If not nullptr, INNER_SCALAR is the inner scalar of - SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)). */ + whole function. If not nullptr, LOAD is a pointer to the load. */ static void ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs, - rtx inner_scalar = nullptr) + redundant_load *load = nullptr) { basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); - while (bb->loop_father->latch - != EXIT_BLOCK_PTR_FOR_FN (cfun)) - bb = get_immediate_dominator (CDI_DOMINATORS, - bb->loop_father->header); + /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop + to avoid extra spills. */ + if (!load || load->kind != X86_CSE_VEC_DUP) + { + while (bb->loop_father->latch + != EXIT_BLOCK_PTR_FOR_FN (cfun)) + bb = get_immediate_dominator (CDI_DOMINATORS, + bb->loop_father->header); + } rtx set = gen_rtx_SET (dest, src); @@ -3141,8 +3183,14 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs, } } - if (inner_scalar) + if (load && load->kind == X86_CSE_VEC_DUP) { + /* Get the source from LOAD as (reg:SI 99) in + + (vec_duplicate:V4SI (reg:SI 99)) + + */ + rtx inner_scalar = load->val; /* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */ rtx reg = XEXP (src, 0); if ((REG_P (inner_scalar) || MEM_P (inner_scalar)) @@ -3489,44 +3537,6 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const, } } -enum x86_cse_kind -{ - X86_CSE_CONST0_VECTOR, - X86_CSE_CONSTM1_VECTOR, - X86_CSE_VEC_DUP -}; - -struct redundant_load -{ - /* Bitmap of basic blocks with broadcast instructions. */ - auto_bitmap bbs; - /* Bitmap of broadcast instructions. */ - auto_bitmap insns; - /* The broadcast inner scalar. */ - rtx val; - /* The inner scalar mode. */ - machine_mode mode; - /* The instruction which sets the inner scalar. Nullptr if the inner - scalar is applied to the whole function, instead of within the same - block. */ - rtx_insn *def_insn; - /* The widest broadcast source. */ - rtx broadcast_source; - /* The widest broadcast register. */ - rtx broadcast_reg; - /* The basic block of the broadcast instruction. */ - basic_block bb; - /* The number of broadcast instructions with the same inner scalar. */ - unsigned HOST_WIDE_INT count; - /* The threshold of broadcast instructions with the same inner - scalar. */ - unsigned int threshold; - /* The widest broadcast size in bytes. */ - unsigned int size; - /* Load kind. */ - x86_cse_kind kind; -}; - /* Return the inner scalar if OP is a broadcast, else return nullptr. */ static rtx @@ -3872,10 +3882,7 @@ remove_redundant_vector_load (void) else ix86_place_single_vector_set (load->broadcast_reg, load->broadcast_source, - load->bbs, - (load->kind == X86_CSE_VEC_DUP - ? load->val - : nullptr)); + load->bbs, load); } loop_optimizer_finalize (); diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index c1e708b..105a60d 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -20612,18 +20612,22 @@ LTO output files. @opindex fdump-rtl-@var{pass} @item -d@var{letters} @itemx -fdump-rtl-@var{pass} -@itemx -fdump-rtl-@var{pass}=@var{filename} +@itemx -fdump-rtl-@var{pass}-@var{options} +@itemx -fdump-rtl-@var{pass}-@var{options}=@var{filename} Says to make debugging dumps during compilation at times specified by -@var{letters}. This is used for debugging the RTL-based passes of the +@var{letters} when using @option{-d} or by @var{pass} when using +@option{-fdump-rtl}. This is used for debugging the RTL-based passes of the compiler. Some @option{-d@var{letters}} switches have different meaning when @option{-E} is used for preprocessing. @xref{Preprocessor Options}, for information about preprocessor-specific dump options. -Debug dumps can be enabled with a @option{-fdump-rtl} switch or some -@option{-d} option @var{letters}. Here are the possible -letters for use in @var{pass} and @var{letters}, and their meanings: +The @samp{-@var{options}} form allows greater control over the details of the +dump. See @option{-fdump-tree}. + +Here are actual instances of command-line options following these patterns and +their meanings: @table @gcctabopt @@ -21150,8 +21154,7 @@ GraphViz to @file{@var{file}.@var{passid}.@var{pass}.dot}. Each function in the file is pretty-printed as a subgraph, so that GraphViz can render them all in a single plot. -This option currently only works for RTL dumps, and the RTL is always -dumped in slim form. +RTL is always dumped in slim form. @item vops Enable showing virtual operands for every statement. @item lineno diff --git a/gcc/testsuite/gcc.target/i386/pr120941-1.c b/gcc/testsuite/gcc.target/i386/pr120941-1.c new file mode 100644 index 0000000..b4fc6ac --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr120941-1.c @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=x86-64-v3" } */ +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc'). */ +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */ + +/* +**bar: +**.LFB[0-9]+: +**... +** vbroadcastsd .LC4\(%rip\), %ymm2 +** leal 2\(%rbx\), %eax +** vbroadcastsd .LC2\(%rip\), %ymm4 +** negl %eax +**... +*/ + +extern void foo (int); + +enum { N_CELL_ENTRIES1 = 2 } +typedef LBM_Grid1[64]; +enum { N_CELL_ENTRIES2 = 2 } +typedef LBM_Grid2[64]; +LBM_Grid1 grid1; +LBM_Grid2 grid2; +extern int n; + +void +LBM_handleInOutFlow() +{ + int i, j; + for (; i; i += 2) + { + for (j = 0; j < n; j++) + { + grid1[i] = 1.0 / 36.0 * i; + grid2[i] = 1.0 / 36.0 * i; + } + } +} + +int main_t; +void +bar (void) +{ + for (; main_t; main_t++) { + LBM_handleInOutFlow(); + foo (main_t); + } +} |