| author | Enkovich Ilya <ilya.enkovich@intel.com> | 2011-09-06 16:42:47 +0000 |
|---|---|---|
| committer | H.J. Lu <hjl@gcc.gnu.org> | 2011-09-06 09:42:47 -0700 |
| commit | df7b0cc4aae0620d830b8e9f8ed8a586e68f470b (patch) | |
| tree | 1f9104f6409decb81158ba4c73d5eed24b186893 /gcc/tree-ssa-reassoc.c | |
| parent | df2f61000e93a5c300703c9f23af007c0621693f (diff) | |
PR middle-end/44382: Tree reassociation improvement
gcc/
2011-09-06 Enkovich Ilya <ilya.enkovich@intel.com>
PR middle-end/44382
* target.def (reassociation_width): New hook.
* doc/tm.texi.in (reassociation_width): Likewise.
* doc/tm.texi (reassociation_width): Likewise.
* doc/invoke.texi (tree-reassoc-width): Document new param.
* hooks.h (hook_int_uint_mode_1): New default hook.
* hooks.c (hook_int_uint_mode_1): Likewise.
* config/i386/i386.h (ix86_tune_indices): Add
X86_TUNE_REASSOC_INT_TO_PARALLEL and
X86_TUNE_REASSOC_FP_TO_PARALLEL.
(TARGET_REASSOC_INT_TO_PARALLEL): New.
(TARGET_REASSOC_FP_TO_PARALLEL): Likewise.
* config/i386/i386.c (initial_ix86_tune_features): Add
X86_TUNE_REASSOC_INT_TO_PARALLEL and
X86_TUNE_REASSOC_FP_TO_PARALLEL.
(ix86_reassociation_width): Implementation of the new
hook for the i386 target.
* params.def (PARAM_TREE_REASSOC_WIDTH): New param added.
* tree-ssa-reassoc.c (get_required_cycles): New function.
(get_reassociation_width): Likewise.
(swap_ops_for_binary_stmt): Likewise.
(rewrite_expr_tree_parallel): Likewise.
(rewrite_expr_tree): Refactored; part of the code moved into
swap_ops_for_binary_stmt.
(reassociate_bb): Now checks the reassociation width to be used
and calls rewrite_expr_tree_parallel instead of rewrite_expr_tree
if needed.
gcc/testsuite/
2011-09-06 Enkovich Ilya <ilya.enkovich@intel.com>
* gcc.dg/tree-ssa/pr38533.c (dg-options): Add option
--param tree-reassoc-width=1.
* gcc.dg/tree-ssa/reassoc-24.c: New test.
* gcc.dg/tree-ssa/reassoc-25.c: Likewise.
From-SVN: r178602
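
For context beyond the ChangeLog: the patch teaches the reassociation pass to regroup long dependent operation chains so that a machine able to issue several independent operations per cycle keeps more of them in flight. The sketch below is illustrative only (the function name and the exact grouping are made up, not taken from the commit); floating-point chains additionally require -ffast-math before GCC may reassociate them.

```c
/* Illustrative sketch, not part of the commit.  A serial chain
   t = ((((((a+b)+c)+d)+e)+f)+g)+h needs 7 dependent additions.
   With reassociation width 2 the pass can instead keep two
   independent partial sums in flight, roughly:

     t1 = a + b;   t2 = c + d;    <- independent
     t3 = t1 + e;  t4 = t2 + f;   <- independent
     t5 = t3 + g;  t6 = t4 + h;   <- independent
     s  = t5 + t6;                   4 dependent steps instead of 7  */
double
sum8 (double a, double b, double c, double d,
      double e, double f, double g, double h)
{
  return a + b + c + d + e + f + g + h;
}
```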
Diffstat (limited to 'gcc/tree-ssa-reassoc.c')
| -rw-r--r-- | gcc/tree-ssa-reassoc.c | 295 |
1 file changed, 248 insertions(+), 47 deletions(-)
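
The new tests (reassoc-24.c, reassoc-25.c) fall outside this diffstat. As a hypothetical way to observe the new behavior (a sketch, not the committed testcase), one can compile an integer chain with the new parameter and the pass's detailed dump enabled, then look for the "Width = N was chosen for reassociation" message emitted in reassociate_bb below. The file and function names here are invented for illustration.

```c
/* Hypothetical example, NOT the committed reassoc-24.c.  Compile with
     gcc -O2 --param tree-reassoc-width=2 -fdump-tree-reassoc1-details -c chain.c
   and search the reassoc1 dump for
     Width = 2 was chosen for reassociation
   Integer chains need no -ffast-math to be reassociated.  */
int
chain (int a, int b, int c, int d, int e, int f, int g, int h)
{
  return a + b + c + d + e + f + g + h;
}
```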
```diff
diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c
index 51f7ef8..03e0672 100644
--- a/gcc/tree-ssa-reassoc.c
+++ b/gcc/tree-ssa-reassoc.c
@@ -40,6 +40,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "pointer-set.h"
 #include "cfgloop.h"
 #include "flags.h"
+#include "target.h"
+#include "params.h"
 
 /* This is a simple global reassociation pass.  It is, in part, based
    on the LLVM pass of the same name (They do some things more/less
@@ -1617,6 +1619,62 @@ remove_visited_stmt_chain (tree var)
     }
 }
 
+/* This function checks three consecutive operands in
+   passed operands vector OPS starting from OPINDEX and
+   swaps two operands if it is profitable for binary operation
+   consuming OPINDEX + 1 and OPINDEX + 2 operands.
+
+   We pair ops with the same rank if possible.
+
+   The alternative we try is to see if STMT is a destructive
+   update style statement, which is like:
+     b = phi (a, ...)
+     a = c + b;
+   In that case, we want to use the destructive update form to
+   expose the possible vectorizer sum reduction opportunity.
+   In that case, the third operand will be the phi node.  This
+   check is not performed if STMT is null.
+
+   We could, of course, try to be better as noted above, and do a
+   lot of work to try to find these opportunities in >3 operand
+   cases, but it is unlikely to be worth it.  */
+
+static void
+swap_ops_for_binary_stmt (VEC(operand_entry_t, heap) * ops,
+                          unsigned int opindex, gimple stmt)
+{
+  operand_entry_t oe1, oe2, oe3;
+
+  oe1 = VEC_index (operand_entry_t, ops, opindex);
+  oe2 = VEC_index (operand_entry_t, ops, opindex + 1);
+  oe3 = VEC_index (operand_entry_t, ops, opindex + 2);
+
+  if ((oe1->rank == oe2->rank
+       && oe2->rank != oe3->rank)
+      || (stmt && is_phi_for_stmt (stmt, oe3->op)
+          && !is_phi_for_stmt (stmt, oe1->op)
+          && !is_phi_for_stmt (stmt, oe2->op)))
+    {
+      struct operand_entry temp = *oe3;
+      oe3->op = oe1->op;
+      oe3->rank = oe1->rank;
+      oe1->op = temp.op;
+      oe1->rank = temp.rank;
+    }
+  else if ((oe1->rank == oe3->rank
+            && oe2->rank != oe3->rank)
+           || (stmt && is_phi_for_stmt (stmt, oe2->op)
+               && !is_phi_for_stmt (stmt, oe1->op)
+               && !is_phi_for_stmt (stmt, oe3->op)))
+    {
+      struct operand_entry temp = *oe2;
+      oe2->op = oe1->op;
+      oe2->rank = oe1->rank;
+      oe1->op = temp.op;
+      oe1->rank = temp.rank;
+    }
+}
+
 /* Recursively rewrite our linearized statements so that the operators
    match those in OPS[OPINDEX], putting the computation in rank
    order.  */
@@ -1629,53 +1687,10 @@ rewrite_expr_tree (gimple stmt, unsigned int opindex,
   tree rhs2 = gimple_assign_rhs2 (stmt);
   operand_entry_t oe;
 
-  /* If we have three operands left, then we want to make sure the one
-     that gets the double binary op are the ones with the same rank.
-
-     The alternative we try is to see if this is a destructive
-     update style statement, which is like:
-     b = phi (a, ...)
-     a = c + b;
-     In that case, we want to use the destructive update form to
-     expose the possible vectorizer sum reduction opportunity.
-     In that case, the third operand will be the phi node.
-
-     We could, of course, try to be better as noted above, and do a
-     lot of work to try to find these opportunities in >3 operand
-     cases, but it is unlikely to be worth it.  */
+  /* If we have three operands left, then we want to make sure the ones
+     that get the double binary op are chosen wisely.  */
   if (opindex + 3 == VEC_length (operand_entry_t, ops))
-    {
-      operand_entry_t oe1, oe2, oe3;
-
-      oe1 = VEC_index (operand_entry_t, ops, opindex);
-      oe2 = VEC_index (operand_entry_t, ops, opindex + 1);
-      oe3 = VEC_index (operand_entry_t, ops, opindex + 2);
-
-      if ((oe1->rank == oe2->rank
-           && oe2->rank != oe3->rank)
-          || (is_phi_for_stmt (stmt, oe3->op)
-              && !is_phi_for_stmt (stmt, oe1->op)
-              && !is_phi_for_stmt (stmt, oe2->op)))
-        {
-          struct operand_entry temp = *oe3;
-          oe3->op = oe1->op;
-          oe3->rank = oe1->rank;
-          oe1->op = temp.op;
-          oe1->rank= temp.rank;
-        }
-      else if ((oe1->rank == oe3->rank
-                && oe2->rank != oe3->rank)
-               || (is_phi_for_stmt (stmt, oe2->op)
-                   && !is_phi_for_stmt (stmt, oe1->op)
-                   && !is_phi_for_stmt (stmt, oe3->op)))
-        {
-          struct operand_entry temp = *oe2;
-          oe2->op = oe1->op;
-          oe2->rank = oe1->rank;
-          oe1->op = temp.op;
-          oe1->rank= temp.rank;
-        }
-    }
+    swap_ops_for_binary_stmt (ops, opindex, stmt);
 
   /* The final recursion case for this function is that you have
      exactly two operations left.
@@ -1760,6 +1775,178 @@ rewrite_expr_tree (gimple stmt, unsigned int opindex,
       rewrite_expr_tree (SSA_NAME_DEF_STMT (rhs1), opindex + 1, ops, moved);
 }
 
+/* Find out how many cycles we need to compute statements chain.
+   OPS_NUM holds number of statements in a chain.  CPU_WIDTH is a
+   maximum number of independent statements we may execute per cycle.  */
+
+static int
+get_required_cycles (int ops_num, int cpu_width)
+{
+  int res;
+  int elog;
+  unsigned int rest;
+
+  /* While we have more than 2 * cpu_width operands
+     we may reduce number of operands by cpu_width
+     per cycle.  */
+  res = ops_num / (2 * cpu_width);
+
+  /* The remaining operand count may be reduced twice per cycle
+     until we have only one operand.  */
+  rest = (unsigned)(ops_num - res * cpu_width);
+  elog = exact_log2 (rest);
+  if (elog >= 0)
+    res += elog;
+  else
+    res += floor_log2 (rest) + 1;
+
+  return res;
+}
+
+/* Returns an optimal number of registers to use for computation of
+   given statements.  */
+
+static int
+get_reassociation_width (int ops_num, enum tree_code opc,
+                         enum machine_mode mode)
+{
+  int param_width = PARAM_VALUE (PARAM_TREE_REASSOC_WIDTH);
+  int width;
+  int width_min;
+  int cycles_best;
+
+  if (param_width > 0)
+    width = param_width;
+  else
+    width = targetm.sched.reassociation_width (opc, mode);
+
+  if (width == 1)
+    return width;
+
+  /* Get the minimal time required for sequence computation.  */
+  cycles_best = get_required_cycles (ops_num, width);
+
+  /* Check if we may use less width and still compute sequence for
+     the same time.  It will allow us to reduce register usage.
+     get_required_cycles is monotonically increasing with lower width
+     so we can perform a binary search for the minimal width that still
+     results in the optimal cycle count.  */
+  width_min = 1;
+  while (width > width_min)
+    {
+      int width_mid = (width + width_min) / 2;
+
+      if (get_required_cycles (ops_num, width_mid) == cycles_best)
+        width = width_mid;
+      else if (width_min < width_mid)
+        width_min = width_mid;
+      else
+        break;
+    }
+
+  return width;
+}
+
+/* Recursively rewrite our linearized statements so that the operators
+   match those in OPS[OPINDEX], putting the computation in rank
+   order and trying to allow operations to be executed in
+   parallel.  */
+
+static void
+rewrite_expr_tree_parallel (gimple stmt, int width,
+                            VEC(operand_entry_t, heap) * ops)
+{
+  enum tree_code opcode = gimple_assign_rhs_code (stmt);
+  int op_num = VEC_length (operand_entry_t, ops);
+  int stmt_num = op_num - 1;
+  gimple *stmts = XALLOCAVEC (gimple, stmt_num);
+  int op_index = op_num - 1;
+  int stmt_index = 0;
+  int ready_stmts_end = 0;
+  int i = 0;
+  tree last_rhs1 = gimple_assign_rhs1 (stmt);
+  tree lhs_var;
+
+  /* We start expression rewriting from the top statements.
+     So, in this loop we create a full list of statements
+     we will work with.  */
+  stmts[stmt_num - 1] = stmt;
+  for (i = stmt_num - 2; i >= 0; i--)
+    stmts[i] = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmts[i+1]));
+
+  lhs_var = create_tmp_reg (TREE_TYPE (last_rhs1), NULL);
+  add_referenced_var (lhs_var);
+
+  for (i = 0; i < stmt_num; i++)
+    {
+      tree op1, op2;
+
+      /* Determine whether we should use results of
+         already handled statements or not.  */
+      if (ready_stmts_end == 0
+          && (i - stmt_index >= width || op_index < 1))
+        ready_stmts_end = i;
+
+      /* Now we choose operands for the next statement.  Nonzero
+         value in ready_stmts_end means here that we should use
+         the result of already generated statements as new operand.  */
+      if (ready_stmts_end > 0)
+        {
+          op1 = gimple_assign_lhs (stmts[stmt_index++]);
+          if (ready_stmts_end > stmt_index)
+            op2 = gimple_assign_lhs (stmts[stmt_index++]);
+          else if (op_index >= 0)
+            op2 = VEC_index (operand_entry_t, ops, op_index--)->op;
+          else
+            {
+              gcc_assert (stmt_index < i);
+              op2 = gimple_assign_lhs (stmts[stmt_index++]);
+            }
+
+          if (stmt_index >= ready_stmts_end)
+            ready_stmts_end = 0;
+        }
+      else
+        {
+          if (op_index > 1)
+            swap_ops_for_binary_stmt (ops, op_index - 2, NULL);
+          op2 = VEC_index (operand_entry_t, ops, op_index--)->op;
+          op1 = VEC_index (operand_entry_t, ops, op_index--)->op;
+        }
+
+      /* If we emit the last statement then we should put
+         operands into the last statement.  It will also
+         break the loop.  */
+      if (op_index < 0 && stmt_index == i)
+        i = stmt_num - 1;
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        {
+          fprintf (dump_file, "Transforming ");
+          print_gimple_stmt (dump_file, stmts[i], 0, 0);
+        }
+
+      /* We keep original statement only for the last one.  All
+         others are recreated.  */
+      if (i == stmt_num - 1)
+        {
+          gimple_assign_set_rhs1 (stmts[i], op1);
+          gimple_assign_set_rhs2 (stmts[i], op2);
+          update_stmt (stmts[i]);
+        }
+      else
+        stmts[i] = build_and_add_sum (lhs_var, op1, op2, opcode);
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        {
+          fprintf (dump_file, " into ");
+          print_gimple_stmt (dump_file, stmts[i], 0, 0);
+        }
+    }
+
+  remove_visited_stmt_chain (last_rhs1);
+}
+
 /* Transform STMT, which is really (A +B) + (C + D) into the left
    linear form, ((A+B)+C)+D.
    Recurse on D if necessary.  */
@@ -2282,7 +2469,21 @@ reassociate_bb (basic_block bb)
             }
         }
       else
-        rewrite_expr_tree (stmt, 0, ops, false);
+        {
+          enum machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
+          int ops_num = VEC_length (operand_entry_t, ops);
+          int width = get_reassociation_width (ops_num, rhs_code, mode);
+
+          if (dump_file && (dump_flags & TDF_DETAILS))
+            fprintf (dump_file,
+                     "Width = %d was chosen for reassociation\n", width);
+
+          if (width > 1
+              && VEC_length (operand_entry_t, ops) > 3)
+            rewrite_expr_tree_parallel (stmt, width, ops);
+          else
+            rewrite_expr_tree (stmt, 0, ops, false);
+        }
 
       VEC_free (operand_entry_t, heap, ops);
     }
```
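
To make the cost model above concrete, here is a small self-contained model of get_required_cycles and the binary search in get_reassociation_width. It is a sketch for experimentation outside GCC, not GCC code: exact_log2/floor_log2 are reimplemented, the target hook and --param lookup are replaced by a plain max_width argument, and the names merely mirror the patch for readability.

```c
#include <stdio.h>

/* Position of the highest set bit, -1 for 0 (models GCC's floor_log2).  */
static int
my_floor_log2 (unsigned int x)
{
  int r = -1;
  while (x) { x >>= 1; r++; }
  return r;
}

/* log2 of exact powers of two, else -1 (models GCC's exact_log2).  */
static int
my_exact_log2 (unsigned int x)
{
  return (x && !(x & (x - 1))) ? my_floor_log2 (x) : -1;
}

/* Same arithmetic as get_required_cycles in the patch: first shrink the
   chain by cpu_width operands per cycle, then halve what remains.  */
static int
required_cycles (int ops_num, int cpu_width)
{
  int res = ops_num / (2 * cpu_width);
  unsigned int rest = (unsigned) (ops_num - res * cpu_width);
  int elog = my_exact_log2 (rest);
  return res + (elog >= 0 ? elog : my_floor_log2 (rest) + 1);
}

/* Same binary search as get_reassociation_width, with the target hook
   replaced by MAX_WIDTH: find the smallest width that still achieves
   the optimal cycle count, to save registers.  */
static int
reassociation_width (int ops_num, int max_width)
{
  int width = max_width, width_min = 1;
  int cycles_best = required_cycles (ops_num, width);

  if (width == 1)
    return width;
  while (width > width_min)
    {
      int width_mid = (width + width_min) / 2;
      if (required_cycles (ops_num, width_mid) == cycles_best)
        width = width_mid;
      else if (width_min < width_mid)
        width_min = width_mid;
      else
        break;
    }
  return width;
}

int
main (void)
{
  printf ("cycles(8,2)=%d cycles(8,4)=%d width(4,4)=%d width(8,4)=%d\n",
          required_cycles (8, 2), required_cycles (8, 4),
          reassociation_width (4, 4), reassociation_width (8, 4));
  return 0;
}
```

Under this model the output is `cycles(8,2)=4 cycles(8,4)=3 width(4,4)=2 width(8,4)=4`: eight operands need four cycles at width 2 but only three at width 4, while for four operands the search proves width 2 already reaches the 2-cycle optimum, so it settles on the narrower width.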