From a1f072e2441c58f6a486f90bb9a32bd5f6c51cb4 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 26 Oct 2018 07:38:59 +0000 Subject: re PR tree-optimization/87105 (Autovectorization [X86, SSE2, AVX2, DoublePrecision]) 2018-10-26 Richard Biener PR tree-optimization/87105 * tree-vectorizer.h (_slp_tree::refcnt): New member. * tree-vect-slp.c (vect_free_slp_tree): Decrement and honor refcnt. (vect_create_new_slp_node): Initialize refcnt to one. (bst_traits): Move. (scalar_stmts_set_t, bst_fail): Remove. (vect_build_slp_tree_2): Add bst_map argument and adjust calls. (vect_build_slp_tree): Add bst_map argument and lookup already created SLP nodes. (vect_print_slp_tree): Handle a SLP graph, print SLP node addresses. (vect_slp_rearrange_stmts): Handle a SLP graph. (vect_analyze_slp_instance): Adjust and free SLP nodes from the CSE map. Fix indenting. (vect_schedule_slp_instance): Add short-cut. * g++.dg/vect/slp-pr87105.cc: Adjust. * gcc.dg/torture/20181024-1.c: New testcase. * g++.dg/opt/20181025-1.C: Likewise. From-SVN: r265522 --- gcc/tree-vect-slp.c | 281 +++++++++++++++++++++++++++++----------------------- 1 file changed, 158 insertions(+), 123 deletions(-) (limited to 'gcc/tree-vect-slp.c') diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 3aae177..ab8504a 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -57,6 +57,9 @@ vect_free_slp_tree (slp_tree node, bool final_p) int i; slp_tree child; + if (--node->refcnt != 0) + return; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) vect_free_slp_tree (child, final_p); @@ -82,7 +85,6 @@ vect_free_slp_tree (slp_tree node, bool final_p) free (node); } - /* Free the memory allocated for the SLP instance. FINAL_P is true if we have vectorized the instance or if we have made a final decision not to vectorize the statements in any way. 
*/ @@ -126,6 +128,7 @@ vect_create_new_slp_node (vec scalar_stmts) SLP_TREE_LOAD_PERMUTATION (node) = vNULL; SLP_TREE_TWO_OPERATORS (node) = false; SLP_TREE_DEF_TYPE (node) = vect_internal_def; + node->refcnt = 1; unsigned i; FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info) @@ -1021,9 +1024,6 @@ bst_traits::equal (value_type existing, value_type candidate) return true; } -typedef hash_set , bst_traits> scalar_stmts_set_t; -static scalar_stmts_set_t *bst_fail; - typedef hash_map , slp_tree, simple_hashmap_traits > scalar_stmts_to_slp_tree_map_t; @@ -1034,30 +1034,33 @@ vect_build_slp_tree_2 (vec_info *vinfo, poly_uint64 *max_nunits, vec *loads, bool *matches, unsigned *npermutes, unsigned *tree_size, - unsigned max_tree_size); + unsigned max_tree_size, + scalar_stmts_to_slp_tree_map_t *bst_map); static slp_tree vect_build_slp_tree (vec_info *vinfo, vec stmts, unsigned int group_size, poly_uint64 *max_nunits, vec *loads, bool *matches, unsigned *npermutes, unsigned *tree_size, - unsigned max_tree_size) + unsigned max_tree_size, + scalar_stmts_to_slp_tree_map_t *bst_map) { - if (bst_fail->contains (stmts)) - return NULL; - slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits, - loads, matches, npermutes, tree_size, - max_tree_size); - /* When SLP build fails for stmts record this, otherwise SLP build - can be exponential in time when we allow to construct parts from - scalars, see PR81723. */ - if (! res) + if (slp_tree *leader = bst_map->get (stmts)) { - vec x; - x.create (stmts.length ()); - x.splice (stmts); - bst_fail->add (x); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n", + *leader ? "" : "failed ", *leader); + if (*leader) + (*leader)->refcnt++; + return *leader; } + slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits, + loads, matches, npermutes, tree_size, + max_tree_size, bst_map); + /* Keep a reference for the bst_map use. 
*/ + if (res) + res->refcnt++; + bst_map->put (stmts.copy (), res); return res; } @@ -1074,7 +1077,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, poly_uint64 *max_nunits, vec<slp_tree> *loads, bool *matches, unsigned *npermutes, unsigned *tree_size, - unsigned max_tree_size) + unsigned max_tree_size, + scalar_stmts_to_slp_tree_map_t *bst_map) { unsigned nops, i, this_tree_size = 0; poly_uint64 this_max_nunits = *max_nunits; @@ -1205,7 +1209,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, group_size, &this_max_nunits, &this_loads, matches, npermutes, &this_tree_size, - max_tree_size)) != NULL) + max_tree_size, bst_map)) != NULL) { /* If we have all children of child built up from scalars then just throw that away and build it up this node from scalars. */ @@ -1348,7 +1352,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, group_size, &this_max_nunits, &this_loads, tem, npermutes, &this_tree_size, - max_tree_size)) != NULL) + max_tree_size, bst_map)) != NULL) { /* ... so if successful we can apply the operand swapping to the GIMPLE IL. This is necessary because for example @@ -1441,21 +1445,37 @@ fail: static void vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, - slp_tree node) + slp_tree node, hash_set<slp_tree> &visited) { int i; stmt_vec_info stmt_info; slp_tree child; - dump_printf_loc (dump_kind, loc, "node%s\n", + if (visited.add (node)) + return; + + dump_printf_loc (dump_kind, loc, "node%s %p\n", SLP_TREE_DEF_TYPE (node) != vect_internal_def - ? " (external)" : ""); + ?
" (external)" : "", node); FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) dump_printf_loc (dump_kind, loc, "\tstmt %d %G", i, stmt_info->stmt); + if (SLP_TREE_CHILDREN (node).is_empty ()) + return; + dump_printf_loc (dump_kind, loc, "\tchildren"); FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - vect_print_slp_tree (dump_kind, loc, child); + dump_printf (dump_kind, " %p", (void *)child); + dump_printf (dump_kind, "\n"); + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) + vect_print_slp_tree (dump_kind, loc, child, visited); } +static void +vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, + slp_tree node) +{ + hash_set<slp_tree> visited; + vect_print_slp_tree (dump_kind, loc, node, visited); +} /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID). If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index @@ -1509,15 +1529,19 @@ vect_mark_slp_stmts_relevant (slp_tree node) static void vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size, - vec<unsigned> permutation) + vec<unsigned> permutation, + hash_set<slp_tree> &visited) { stmt_vec_info stmt_info; vec<stmt_vec_info> tmp_stmts; unsigned int i; slp_tree child; + if (visited.add (node)) + return; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - vect_slp_rearrange_stmts (child, group_size, permutation); + vect_slp_rearrange_stmts (child, group_size, permutation, visited); gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ()); tmp_stmts.create (group_size); @@ -1578,8 +1602,9 @@ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn) statements in the nodes is not important unless they are memory accesses, we can rearrange the statements in all the nodes according to the order of the loads. */ + hash_set<slp_tree> visited; vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size, - node->load_permutation); + node->load_permutation, visited); /* We are done, no actual permutations need to be generated.
*/ poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn); @@ -1889,12 +1914,18 @@ vect_analyze_slp_instance (vec_info *vinfo, /* Build the tree for the SLP instance. */ bool *matches = XALLOCAVEC (bool, group_size); unsigned npermutes = 0; - bst_fail = new scalar_stmts_set_t (); + scalar_stmts_to_slp_tree_map_t *bst_map + = new scalar_stmts_to_slp_tree_map_t (); poly_uint64 max_nunits = nunits; node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, &max_nunits, &loads, matches, &npermutes, - NULL, max_tree_size); - delete bst_fail; + NULL, max_tree_size, bst_map); + /* The map keeps a reference on SLP nodes built, release that. */ + for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); + it != bst_map->end (); ++it) + if ((*it).second) + vect_free_slp_tree ((*it).second, false); + delete bst_map; if (node != NULL) { /* Calculate the unrolling factor based on the smallest type. */ @@ -1924,109 +1955,109 @@ vect_analyze_slp_instance (vec_info *vinfo, } else { - /* Create a new SLP instance. */ - new_instance = XNEW (struct _slp_instance); - SLP_INSTANCE_TREE (new_instance) = node; - SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; - SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; - SLP_INSTANCE_LOADS (new_instance) = loads; - - /* Compute the load permutation. */ - slp_tree load_node; - bool loads_permuted = false; - FOR_EACH_VEC_ELT (loads, i, load_node) - { - vec<unsigned> load_permutation; - int j; - stmt_vec_info load_info; - bool this_load_permuted = false; - load_permutation.create (group_size); - stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT - (SLP_TREE_SCALAR_STMTS (load_node)[0]); - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) + /* Create a new SLP instance.
*/ + new_instance = XNEW (struct _slp_instance); + SLP_INSTANCE_TREE (new_instance) = node; + SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; + SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; + SLP_INSTANCE_LOADS (new_instance) = loads; + + /* Compute the load permutation. */ + slp_tree load_node; + bool loads_permuted = false; + FOR_EACH_VEC_ELT (loads, i, load_node) { - int load_place = vect_get_place_in_interleaving_chain - (load_info, first_stmt_info); - gcc_assert (load_place != -1); - if (load_place != j) - this_load_permuted = true; - load_permutation.safe_push (load_place); + vec<unsigned> load_permutation; + int j; + stmt_vec_info load_info; + bool this_load_permuted = false; + load_permutation.create (group_size); + stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT + (SLP_TREE_SCALAR_STMTS (load_node)[0]); + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) + { + int load_place = vect_get_place_in_interleaving_chain + (load_info, first_stmt_info); + gcc_assert (load_place != -1); + if (load_place != j) + this_load_permuted = true; + load_permutation.safe_push (load_place); + } + if (!this_load_permuted + /* The load requires permutation when unrolling exposes + a gap either because the group is larger than the SLP + group-size or because there is a gap between the groups. */ + && (known_eq (unrolling_factor, 1U) + || (group_size == DR_GROUP_SIZE (first_stmt_info) + && DR_GROUP_GAP (first_stmt_info) == 0))) + { + load_permutation.release (); + continue; + } + SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation; + loads_permuted = true; } - if (!this_load_permuted - /* The load requires permutation when unrolling exposes - a gap either because the group is larger than the SLP - group-size or because there is a gap between the groups.
*/ - && (known_eq (unrolling_factor, 1U) - || (group_size == DR_GROUP_SIZE (first_stmt_info) - && DR_GROUP_GAP (first_stmt_info) == 0))) + + if (loads_permuted) { - load_permutation.release (); - continue; + if (!vect_supported_load_permutation_p (new_instance)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Build SLP failed: unsupported load " + "permutation %G", stmt_info->stmt); + vect_free_slp_instance (new_instance, false); + return false; + } } - SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation; - loads_permuted = true; - } - - if (loads_permuted) - { - if (!vect_supported_load_permutation_p (new_instance)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Build SLP failed: unsupported load " - "permutation %G", stmt_info->stmt); - vect_free_slp_instance (new_instance, false); - return false; - } - } /* If the loads and stores can be handled with load/store-lan - instructions do not generate this SLP instance. */ - if (is_a <loop_vec_info> (vinfo) - && loads_permuted - && dr && vect_store_lanes_supported (vectype, group_size, false)) - { - slp_tree load_node; - FOR_EACH_VEC_ELT (loads, i, load_node) + instructions do not generate this SLP instance. */ + if (is_a <loop_vec_info> (vinfo) + && loads_permuted + && dr && vect_store_lanes_supported (vectype, group_size, false)) { - stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT - (SLP_TREE_SCALAR_STMTS (load_node)[0]); - /* Use SLP for strided accesses (or if we can't load-lanes). */ - if (STMT_VINFO_STRIDED_P (stmt_vinfo) - || ! vect_load_lanes_supported - (STMT_VINFO_VECTYPE (stmt_vinfo), - DR_GROUP_SIZE (stmt_vinfo), false)) - break; + slp_tree load_node; + FOR_EACH_VEC_ELT (loads, i, load_node) + { + stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT + (SLP_TREE_SCALAR_STMTS (load_node)[0]); + /* Use SLP for strided accesses (or if we can't load-lanes). */ + if (STMT_VINFO_STRIDED_P (stmt_vinfo) + || !
vect_load_lanes_supported + (STMT_VINFO_VECTYPE (stmt_vinfo), + DR_GROUP_SIZE (stmt_vinfo), false)) + break; + } + if (i == loads.length ()) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Built SLP cancelled: can use " + "load/store-lanes\n"); + vect_free_slp_instance (new_instance, false); + return false; + } } - if (i == loads.length ()) + + vinfo->slp_instances.safe_push (new_instance); + + if (dump_enabled_p ()) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Built SLP cancelled: can use " - "load/store-lanes\n"); - vect_free_slp_instance (new_instance, false); - return false; + dump_printf_loc (MSG_NOTE, vect_location, + "Final SLP tree for instance:\n"); + vect_print_slp_tree (MSG_NOTE, vect_location, node); } - } - vinfo->slp_instances.safe_push (new_instance); - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, - "Final SLP tree for instance:\n"); - vect_print_slp_tree (MSG_NOTE, vect_location, node); + return true; } - - return true; - } } else { - /* Failed to SLP. */ - /* Free the allocated memory. */ - scalar_stmts.release (); - loads.release (); + /* Failed to SLP. */ + /* Free the allocated memory. */ + scalar_stmts.release (); + loads.release (); } /* For basic block SLP, try to break the group up into multiples of the @@ -3749,8 +3780,13 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance, if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) return; + /* See if we have already vectorized the node in the graph of the + SLP instance. */ + if (SLP_TREE_VEC_STMTS (node).exists ()) + return; + /* See if we have already vectorized the same set of stmts and reuse their - vectorized stmts. */ + vectorized stmts across instances. 
*/ if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node))) { SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader)); @@ -3778,8 +3814,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance, group_size = SLP_INSTANCE_GROUP_SIZE (instance); gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0); - if (!SLP_TREE_VEC_STMTS (node).exists ()) - SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); + SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, -- cgit v1.1