From df7b0cc4aae0620d830b8e9f8ed8a586e68f470b Mon Sep 17 00:00:00 2001 From: Enkovich Ilya Date: Tue, 6 Sep 2011 16:42:47 +0000 Subject: PR middle-end/44382: Tree reassociation improvement gcc/ 2011-09-06 Enkovich Ilya PR middle-end/44382 * target.def (reassociation_width): New hook. * doc/tm.texi.in (reassociation_width): Likewise. * doc/tm.texi (reassociation_width): Likewise. * doc/invoke.texi (tree-reassoc-width): New param documented. * hooks.h (hook_int_uint_mode_1): New default hook. * hooks.c (hook_int_uint_mode_1): Likewise. * config/i386/i386.h (ix86_tune_indices): Add X86_TUNE_REASSOC_INT_TO_PARALLEL and X86_TUNE_REASSOC_FP_TO_PARALLEL. (TARGET_REASSOC_INT_TO_PARALLEL): New. (TARGET_REASSOC_FP_TO_PARALLEL): Likewise. * config/i386/i386.c (initial_ix86_tune_features): Add X86_TUNE_REASSOC_INT_TO_PARALLEL and X86_TUNE_REASSOC_FP_TO_PARALLEL. (ix86_reassociation_width) implementation of new hook for i386 target. * params.def (PARAM_TREE_REASSOC_WIDTH): New param added. * tree-ssa-reassoc.c (get_required_cycles): New function. (get_reassociation_width): Likewise. (swap_ops_for_binary_stmt): Likewise. (rewrite_expr_tree_parallel): Likewise. (rewrite_expr_tree): Refactored. Part of code moved into swap_ops_for_binary_stmt. (reassociate_bb): Now checks reassociation width to be used and call rewrite_expr_tree_parallel instead of rewrite_expr_tree if needed. gcc/testsuite/ 2011-09-06 Enkovich Ilya * gcc.dg/tree-ssa/pr38533.c (dg-options): Added option --param tree-reassoc-width=1. * gcc.dg/tree-ssa/reassoc-24.c: New test. * gcc.dg/tree-ssa/reassoc-25.c: Likewise. From-SVN: r178602 --- gcc/ChangeLog | 43 +++++ gcc/config/i386/i386.c | 38 +++- gcc/config/i386/i386.h | 7 + gcc/doc/invoke.texi | 5 + gcc/doc/tm.texi | 5 + gcc/doc/tm.texi.in | 2 + gcc/hooks.c | 7 + gcc/hooks.h | 1 + gcc/params.def | 7 + gcc/target.def | 10 + gcc/testsuite/ChangeLog | 8 + gcc/testsuite/gcc.dg/tree-ssa/pr38533.c | 2 +- gcc/testsuite/gcc.dg/tree-ssa/reassoc-24.c | 25 +++ gcc/testsuite/gcc.dg/tree-ssa/reassoc-25.c | 19 ++ gcc/tree-ssa-reassoc.c | 295 ++++++++++++++++++++++++----- 15 files changed, 425 insertions(+), 49 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/reassoc-24.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/reassoc-25.c (limited to 'gcc') diff --git a/gcc/ChangeLog b/gcc/ChangeLog index f867f97..bf50b80 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,46 @@ +2011-09-06 Enkovich Ilya + + PR middle-end/44382 + * target.def (reassociation_width): New hook. + + * doc/tm.texi.in (reassociation_width): Likewise. + + * doc/tm.texi (reassociation_width): Likewise. + + * doc/invoke.texi (tree-reassoc-width): New param documented. + + * hooks.h (hook_int_uint_mode_1): New default hook. + + * hooks.c (hook_int_uint_mode_1): Likewise. + + * config/i386/i386.h (ix86_tune_indices): Add + X86_TUNE_REASSOC_INT_TO_PARALLEL and + X86_TUNE_REASSOC_FP_TO_PARALLEL. + + (TARGET_REASSOC_INT_TO_PARALLEL): New. + (TARGET_REASSOC_FP_TO_PARALLEL): Likewise. + + * config/i386/i386.c (initial_ix86_tune_features): Add + X86_TUNE_REASSOC_INT_TO_PARALLEL and + X86_TUNE_REASSOC_FP_TO_PARALLEL. + + (ix86_reassociation_width) implementation of + new hook for i386 target. + + * params.def (PARAM_TREE_REASSOC_WIDTH): New param added. + + * tree-ssa-reassoc.c (get_required_cycles): New function. + (get_reassociation_width): Likewise. + (swap_ops_for_binary_stmt): Likewise. + (rewrite_expr_tree_parallel): Likewise. + + (rewrite_expr_tree): Refactored. Part of code moved into + swap_ops_for_binary_stmt. + + (reassociate_bb): Now checks reassociation width to be used + and call rewrite_expr_tree_parallel instead of rewrite_expr_tree + if needed. + 2011-09-06 Richard Guenther PR tree-optimization/47025 diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index a9c0aa7..555db59 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -2168,7 +2168,15 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for the auto-vectorizer. */ - m_BDVER + m_BDVER, + + /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations + during reassociation of integer computation. */ + m_ATOM, + + /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations + during reassociation of fp computation. */ + m_ATOM }; /* Feature tests against the various architecture variations. */ @@ -34843,6 +34851,8 @@ ix86_enum_va_list (int idx, const char **pname, tree *ptree) #define TARGET_SCHED_DISPATCH has_dispatch #undef TARGET_SCHED_DISPATCH_DO #define TARGET_SCHED_DISPATCH_DO do_dispatch +#undef TARGET_SCHED_REASSOCIATION_WIDTH +#define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width /* The size of the dispatch window is the total number of bytes of object code allowed in a window. */ @@ -35640,6 +35650,32 @@ has_dispatch (rtx insn, int action) return false; } +/* Implementation of reassociation_width target hook used by + reassoc phase to identify parallelism level in reassociated + tree. Statements tree_code is passed in OPC. Arguments type + is passed in MODE. + + Currently parallel reassociation is enabled for Atom + processors only and we set reassociation width to be 2 + because Atom may issue up to 2 instructions per cycle. + + Return value should be fixed if parallel reassociation is + enabled for other processors. */ + +static int +ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED, + enum machine_mode mode) +{ + int res = 1; + + if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL) + res = 2; + else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL) + res = 2; + + return res; +} + /* ??? No autovectorization into MMX or 3DNOW until we can reliably place emms and femms instructions. */ diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 47442a0e..21f5075 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -318,6 +318,8 @@ enum ix86_tune_indices { X86_TUNE_VECTORIZE_DOUBLE, X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, X86_TUNE_AVX128_OPTIMAL, + X86_TUNE_REASSOC_INT_TO_PARALLEL, + X86_TUNE_REASSOC_FP_TO_PARALLEL, X86_TUNE_LAST }; @@ -416,6 +418,11 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL] #define TARGET_AVX128_OPTIMAL \ ix86_tune_features[X86_TUNE_AVX128_OPTIMAL] +#define TARGET_REASSOC_INT_TO_PARALLEL \ + ix86_tune_features[X86_TUNE_REASSOC_INT_TO_PARALLEL] +#define TARGET_REASSOC_FP_TO_PARALLEL \ + ix86_tune_features[X86_TUNE_REASSOC_FP_TO_PARALLEL] + /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { X86_ARCH_CMOVE, /* || TARGET_SSE */ diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index c5b19eb..b1ff187 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -9076,6 +9076,11 @@ The smallest number of different values for which it is best to use a jump-table instead of a tree of conditional branches. If the value is 0, use the default for the machine. The default is 0. +@item tree-reassoc-width +Set the maximum number of instructions executed in parallel in +reassociated tree. This parameter overrides target dependent +heuristics used by default if has non zero value. + @end table @end table diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 7364aa1..335c1d1 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6840,6 +6840,11 @@ the order of instructions is important for correctness when scheduling, but also the latencies of operations. @end deftypevr +@deftypefn {Target Hook} int TARGET_SCHED_REASSOCIATION_WIDTH (unsigned int @var{opc}, enum machine_mode @var{mode}) +This hook is called by tree reassociator to determine a level of +parallelism required in output calculations chain. +@end deftypefn + @node Sections @section Dividing the Output into Sections (Texts, Data, @dots{}) @c the above section title is WAY too long. maybe cut the part between diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 4535fd6..6783826 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -6774,6 +6774,8 @@ in its second parameter. @hook TARGET_SCHED_EXPOSED_PIPELINE +@hook TARGET_SCHED_REASSOCIATION_WIDTH + @node Sections @section Dividing the Output into Sections (Texts, Data, @dots{}) @c the above section title is WAY too long. maybe cut the part between diff --git a/gcc/hooks.c b/gcc/hooks.c index 9025183..1ba44f9 100644 --- a/gcc/hooks.c +++ b/gcc/hooks.c @@ -161,6 +161,13 @@ default_can_output_mi_thunk_no_vcall (const_tree a ATTRIBUTE_UNUSED, } int +hook_int_uint_mode_1 (unsigned int a ATTRIBUTE_UNUSED, + enum machine_mode b ATTRIBUTE_UNUSED) +{ + return 1; +} + +int hook_int_const_tree_0 (const_tree a ATTRIBUTE_UNUSED) { return 0; diff --git a/gcc/hooks.h b/gcc/hooks.h index 156d708..54ace24 100644 --- a/gcc/hooks.h +++ b/gcc/hooks.h @@ -68,6 +68,7 @@ extern void hook_void_tree_treeptr (tree, tree *); extern void hook_void_int_int (int, int); extern void hook_void_gcc_optionsp (struct gcc_options *); +extern int hook_int_uint_mode_1 (unsigned int, enum machine_mode); extern int hook_int_const_tree_0 (const_tree); extern int hook_int_const_tree_const_tree_1 (const_tree, const_tree); extern int hook_int_rtx_0 (rtx); diff --git a/gcc/params.def b/gcc/params.def index e8372ed..c376c80 100644 --- a/gcc/params.def +++ b/gcc/params.def @@ -914,6 +914,13 @@ DEFPARAM (PARAM_ALLOW_STORE_DATA_RACES, "Allow new data races on stores to be introduced", 1, 0, 1) +/* Reassociation width to be used by tree reassoc optimization. */ +DEFPARAM (PARAM_TREE_REASSOC_WIDTH, + "tree-reassoc-width", + "Set the maximum number of instructions executed in parallel in " + "reassociated tree. If 0, use the target dependent heuristic.", + 0, 0, 0) + /* Local variables: diff --git a/gcc/target.def b/gcc/target.def index 857f463..1e09ba7 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -913,6 +913,16 @@ the order of instructions is important for correctness when scheduling, but\n\ also the latencies of operations.", bool, false) +/* The following member value is a function that returns number + of operations reassociator should try to put in parallel for + statements of the given type. By default 1 is used. */ +DEFHOOK +(reassociation_width, +"This hook is called by tree reassociator to determine a level of\n\ +parallelism required in output calculations chain.", +int, (unsigned int opc, enum machine_mode mode), +hook_int_uint_mode_1) + HOOK_VECTOR_END (sched) /* Functions relating to vectorization. */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index c1c0073..db1c7fd 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,11 @@ +2011-09-06 Enkovich Ilya + + * gcc.dg/tree-ssa/pr38533.c (dg-options): Added option + --param tree-reassoc-width=1. + + * gcc.dg/tree-ssa/reassoc-24.c: New test. + * gcc.dg/tree-ssa/reassoc-25.c: Likewise. + 2011-09-06 Richard Guenther PR tree-optimization/48149 diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr38533.c b/gcc/testsuite/gcc.dg/tree-ssa/pr38533.c index e787227..a80a5a8 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr38533.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr38533.c @@ -1,6 +1,6 @@ /* PR middle-end/38533 */ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-reassoc1" } */ +/* { dg-options "-O2 --param tree-reassoc-width=1 -fdump-tree-reassoc1" } */ #define A asm volatile ("" : "=r" (f) : "0" (0)); e |= f; #define B A A A A A A A A A A A diff --git a/gcc/testsuite/gcc.dg/tree-ssa/reassoc-24.c b/gcc/testsuite/gcc.dg/tree-ssa/reassoc-24.c new file mode 100644 index 0000000..c871628 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/reassoc-24.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 --param tree-reassoc-width=2 -fdump-tree-reassoc1" } */ + +unsigned int +foo (void) +{ + unsigned int a = 0; + unsigned int b; + + asm volatile ("" : "=r" (b) : "0" (0)); + a += b; + asm volatile ("" : "=r" (b) : "0" (0)); + a += b; + asm volatile ("" : "=r" (b) : "0" (0)); + a += b; + asm volatile ("" : "=r" (b) : "0" (0)); + a += b; + + return a; +} + +/* Verify there are two pairs of __asm__ statements with no + intervening stmts. */ +/* { dg-final { scan-tree-dump-times "__asm__\[^;\n]*;\n *__asm__" 2 "reassoc1"} } */ +/* { dg-final { cleanup-tree-dump "reassoc1" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/reassoc-25.c b/gcc/testsuite/gcc.dg/tree-ssa/reassoc-25.c new file mode 100644 index 0000000..4ff66ef --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/reassoc-25.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 --param tree-reassoc-width=3 -fdump-tree-reassoc1-details" } */ + +unsigned int +foo (int a, int b, int c, int d) +{ + unsigned int s = 0; + + s += a; + s += b; + s += c; + s += d; + + return s; +} + +/* Verify reassociation width was chosen to be 2. */ +/* { dg-final { scan-tree-dump-times "Width = 2" 1 "reassoc1"} } */ +/* { dg-final { cleanup-tree-dump "reassoc1" } } */ diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c index 51f7ef8..03e0672 100644 --- a/gcc/tree-ssa-reassoc.c +++ b/gcc/tree-ssa-reassoc.c @@ -40,6 +40,8 @@ along with GCC; see the file COPYING3. If not see #include "pointer-set.h" #include "cfgloop.h" #include "flags.h" +#include "target.h" +#include "params.h" /* This is a simple global reassociation pass. It is, in part, based on the LLVM pass of the same name (They do some things more/less @@ -1617,6 +1619,62 @@ remove_visited_stmt_chain (tree var) } } +/* This function checks three consequtive operands in + passed operands vector OPS starting from OPINDEX and + swaps two operands if it is profitable for binary operation + consuming OPINDEX + 1 abnd OPINDEX + 2 operands. + + We pair ops with the same rank if possible. + + The alternative we try is to see if STMT is a destructive + update style statement, which is like: + b = phi (a, ...) + a = c + b; + In that case, we want to use the destructive update form to + expose the possible vectorizer sum reduction opportunity. + In that case, the third operand will be the phi node. This + check is not performed if STMT is null. + + We could, of course, try to be better as noted above, and do a + lot of work to try to find these opportunities in >3 operand + cases, but it is unlikely to be worth it. */ + +static void +swap_ops_for_binary_stmt (VEC(operand_entry_t, heap) * ops, + unsigned int opindex, gimple stmt) +{ + operand_entry_t oe1, oe2, oe3; + + oe1 = VEC_index (operand_entry_t, ops, opindex); + oe2 = VEC_index (operand_entry_t, ops, opindex + 1); + oe3 = VEC_index (operand_entry_t, ops, opindex + 2); + + if ((oe1->rank == oe2->rank + && oe2->rank != oe3->rank) + || (stmt && is_phi_for_stmt (stmt, oe3->op) + && !is_phi_for_stmt (stmt, oe1->op) + && !is_phi_for_stmt (stmt, oe2->op))) + { + struct operand_entry temp = *oe3; + oe3->op = oe1->op; + oe3->rank = oe1->rank; + oe1->op = temp.op; + oe1->rank= temp.rank; + } + else if ((oe1->rank == oe3->rank + && oe2->rank != oe3->rank) + || (stmt && is_phi_for_stmt (stmt, oe2->op) + && !is_phi_for_stmt (stmt, oe1->op) + && !is_phi_for_stmt (stmt, oe3->op))) + { + struct operand_entry temp = *oe2; + oe2->op = oe1->op; + oe2->rank = oe1->rank; + oe1->op = temp.op; + oe1->rank= temp.rank; + } +} + /* Recursively rewrite our linearized statements so that the operators match those in OPS[OPINDEX], putting the computation in rank order. */ @@ -1629,53 +1687,10 @@ rewrite_expr_tree (gimple stmt, unsigned int opindex, tree rhs2 = gimple_assign_rhs2 (stmt); operand_entry_t oe; - /* If we have three operands left, then we want to make sure the one - that gets the double binary op are the ones with the same rank. - - The alternative we try is to see if this is a destructive - update style statement, which is like: - b = phi (a, ...) - a = c + b; - In that case, we want to use the destructive update form to - expose the possible vectorizer sum reduction opportunity. - In that case, the third operand will be the phi node. - - We could, of course, try to be better as noted above, and do a - lot of work to try to find these opportunities in >3 operand - cases, but it is unlikely to be worth it. */ + /* If we have three operands left, then we want to make sure the ones + that get the double binary op are chosen wisely. */ if (opindex + 3 == VEC_length (operand_entry_t, ops)) - { - operand_entry_t oe1, oe2, oe3; - - oe1 = VEC_index (operand_entry_t, ops, opindex); - oe2 = VEC_index (operand_entry_t, ops, opindex + 1); - oe3 = VEC_index (operand_entry_t, ops, opindex + 2); - - if ((oe1->rank == oe2->rank - && oe2->rank != oe3->rank) - || (is_phi_for_stmt (stmt, oe3->op) - && !is_phi_for_stmt (stmt, oe1->op) - && !is_phi_for_stmt (stmt, oe2->op))) - { - struct operand_entry temp = *oe3; - oe3->op = oe1->op; - oe3->rank = oe1->rank; - oe1->op = temp.op; - oe1->rank= temp.rank; - } - else if ((oe1->rank == oe3->rank - && oe2->rank != oe3->rank) - || (is_phi_for_stmt (stmt, oe2->op) - && !is_phi_for_stmt (stmt, oe1->op) - && !is_phi_for_stmt (stmt, oe3->op))) - { - struct operand_entry temp = *oe2; - oe2->op = oe1->op; - oe2->rank = oe1->rank; - oe1->op = temp.op; - oe1->rank= temp.rank; - } - } + swap_ops_for_binary_stmt (ops, opindex, stmt); /* The final recursion case for this function is that you have exactly two operations left. @@ -1760,6 +1775,178 @@ rewrite_expr_tree (gimple stmt, unsigned int opindex, rewrite_expr_tree (SSA_NAME_DEF_STMT (rhs1), opindex + 1, ops, moved); } +/* Find out how many cycles we need to compute statements chain. + OPS_NUM holds number os statements in a chain. CPU_WIDTH is a + maximum number of independent statements we may execute per cycle. */ + +static int +get_required_cycles (int ops_num, int cpu_width) +{ + int res; + int elog; + unsigned int rest; + + /* While we have more than 2 * cpu_width operands + we may reduce number of operands by cpu_width + per cycle. */ + res = ops_num / (2 * cpu_width); + + /* Remained operands count may be reduced twice per cycle + until we have only one operand. */ + rest = (unsigned)(ops_num - res * cpu_width); + elog = exact_log2 (rest); + if (elog >= 0) + res += elog; + else + res += floor_log2 (rest) + 1; + + return res; +} + +/* Returns an optimal number of registers to use for computation of + given statements. */ + +static int +get_reassociation_width (int ops_num, enum tree_code opc, + enum machine_mode mode) +{ + int param_width = PARAM_VALUE (PARAM_TREE_REASSOC_WIDTH); + int width; + int width_min; + int cycles_best; + + if (param_width > 0) + width = param_width; + else + width = targetm.sched.reassociation_width (opc, mode); + + if (width == 1) + return width; + + /* Get the minimal time required for sequence computation. */ + cycles_best = get_required_cycles (ops_num, width); + + /* Check if we may use less width and still compute sequence for + the same time. It will allow us to reduce registers usage. + get_required_cycles is monotonically increasing with lower width + so we can perform a binary search for the minimal width that still + results in the optimal cycle count. */ + width_min = 1; + while (width > width_min) + { + int width_mid = (width + width_min) / 2; + + if (get_required_cycles (ops_num, width_mid) == cycles_best) + width = width_mid; + else if (width_min < width_mid) + width_min = width_mid; + else + break; + } + + return width; +} + +/* Recursively rewrite our linearized statements so that the operators + match those in OPS[OPINDEX], putting the computation in rank + order and trying to allow operations to be executed in + parallel. */ + +static void +rewrite_expr_tree_parallel (gimple stmt, int width, + VEC(operand_entry_t, heap) * ops) +{ + enum tree_code opcode = gimple_assign_rhs_code (stmt); + int op_num = VEC_length (operand_entry_t, ops); + int stmt_num = op_num - 1; + gimple *stmts = XALLOCAVEC (gimple, stmt_num); + int op_index = op_num - 1; + int stmt_index = 0; + int ready_stmts_end = 0; + int i = 0; + tree last_rhs1 = gimple_assign_rhs1 (stmt); + tree lhs_var; + + /* We start expression rewriting from the top statements. + So, in this loop we create a full list of statements + we will work with. */ + stmts[stmt_num - 1] = stmt; + for (i = stmt_num - 2; i >= 0; i--) + stmts[i] = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmts[i+1])); + + lhs_var = create_tmp_reg (TREE_TYPE (last_rhs1), NULL); + add_referenced_var (lhs_var); + + for (i = 0; i < stmt_num; i++) + { + tree op1, op2; + + /* Determine whether we should use results of + already handled statements or not. */ + if (ready_stmts_end == 0 + && (i - stmt_index >= width || op_index < 1)) + ready_stmts_end = i; + + /* Now we choose operands for the next statement. Non zero + value in ready_stmts_end means here that we should use + the result of already generated statements as new operand. */ + if (ready_stmts_end > 0) + { + op1 = gimple_assign_lhs (stmts[stmt_index++]); + if (ready_stmts_end > stmt_index) + op2 = gimple_assign_lhs (stmts[stmt_index++]); + else if (op_index >= 0) + op2 = VEC_index (operand_entry_t, ops, op_index--)->op; + else + { + gcc_assert (stmt_index < i); + op2 = gimple_assign_lhs (stmts[stmt_index++]); + } + + if (stmt_index >= ready_stmts_end) + ready_stmts_end = 0; + } + else + { + if (op_index > 1) + swap_ops_for_binary_stmt (ops, op_index - 2, NULL); + op2 = VEC_index (operand_entry_t, ops, op_index--)->op; + op1 = VEC_index (operand_entry_t, ops, op_index--)->op; + } + + /* If we emit the last statement then we should put + operands into the last statement. It will also + break the loop. */ + if (op_index < 0 && stmt_index == i) + i = stmt_num - 1; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Transforming "); + print_gimple_stmt (dump_file, stmts[i], 0, 0); + } + + /* We keep original statement only for the last one. All + others are recreated. */ + if (i == stmt_num - 1) + { + gimple_assign_set_rhs1 (stmts[i], op1); + gimple_assign_set_rhs2 (stmts[i], op2); + update_stmt (stmts[i]); + } + else + stmts[i] = build_and_add_sum (lhs_var, op1, op2, opcode); + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, " into "); + print_gimple_stmt (dump_file, stmts[i], 0, 0); + } + } + + remove_visited_stmt_chain (last_rhs1); +} + /* Transform STMT, which is really (A +B) + (C + D) into the left linear form, ((A+B)+C)+D. Recurse on D if necessary. */ @@ -2282,7 +2469,21 @@ reassociate_bb (basic_block bb) } } else - rewrite_expr_tree (stmt, 0, ops, false); + { + enum machine_mode mode = TYPE_MODE (TREE_TYPE (lhs)); + int ops_num = VEC_length (operand_entry_t, ops); + int width = get_reassociation_width (ops_num, rhs_code, mode); + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "Width = %d was chosen for reassociation\n", width); + + if (width > 1 + && VEC_length (operand_entry_t, ops) > 3) + rewrite_expr_tree_parallel (stmt, width, ops); + else + rewrite_expr_tree (stmt, 0, ops, false); + } VEC_free (operand_entry_t, heap, ops); } -- cgit v1.1