From f74c4b2c4427a4309d48bfc45bc140422a75aa6f Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Wed, 8 Jan 2020 12:49:14 +0000 Subject: re PR tree-optimization/93199 (Compile time hog in sink_clobbers) 2019-01-08 Richard Biener PR middle-end/93199 c/ * gimple-parser.c (c_parser_parse_gimple_body): Remove __PHI IFN permanently. * gimple-fold.c (rewrite_to_defined_overflow): Mark stmt modified. * tree-ssa-loop-im.c (move_computations_worker): Properly adjust virtual operand, also updating SSA use. * gimple-loop-interchange.cc (loop_cand::undo_simple_reduction): Update stmt after resetting virtual operand. (tree_loop_interchange::move_code_to_inner_loop): Likewise. * gimple-iterator.c (gsi_remove): When not removing the stmt permanently do not delink immediate uses or mark the stmt modified. From-SVN: r280000 --- gcc/tree-ssa-loop-im.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index dd9df25..3e64ae7 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -1231,7 +1231,8 @@ move_computations_worker (basic_block bb) gphi *phi = gsi2.phi (); if (virtual_operand_p (gimple_phi_result (phi))) { - gimple_set_vuse (stmt, PHI_ARG_DEF_FROM_EDGE (phi, e)); + SET_USE (gimple_vuse_op (stmt), + PHI_ARG_DEF_FROM_EDGE (phi, e)); break; } } -- cgit v1.1 From 4e3d3e40726e1b68bf52fa205c68495124ea60b8 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Wed, 18 Mar 2020 09:13:17 +0100 Subject: middle-end/94188 fix fold of addr expression generation This adds a missing type conversion to build_fold_addr_expr and adjusts fallout - build_fold_addr_expr was used as a convenience to build an ADDR_EXPR but some callers do not expect the result to be simplified to something else. 2020-03-18 Richard Biener PR middle-end/94188 * fold-const.c (build_fold_addr_expr): Convert address to correct type. * asan.c (maybe_create_ssa_name): Strip useless type conversions. * gimple-fold.c (gimple_fold_stmt_to_constant_1): Use build1 to build the ADDR_EXPR which we don't really want to simplify. * tree-ssa-dom.c (record_equivalences_from_stmt): Likewise. * tree-ssa-loop-im.c (gather_mem_refs_stmt): Likewise. * tree-ssa-forwprop.c (forward_propagate_addr_expr_1): Likewise. (simplify_builtin_call): Strip useless type conversions. * tree-ssa-strlen.c (new_strinfo): Likewise. * gcc.dg/pr94188.c: New testcase. --- gcc/tree-ssa-loop-im.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 3e64ae7..273a580 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -1527,7 +1527,8 @@ gather_mem_refs_stmt (class loop *loop, gimple *stmt) tree ref_alias_type = reference_alias_ptr_type (*mem); unsigned int ref_align = get_object_alignment (*mem); tree ref_type = TREE_TYPE (*mem); - tree tmp = build_fold_addr_expr (unshare_expr (mem_base)); + tree tmp = build1 (ADDR_EXPR, ptr_type_node, + unshare_expr (mem_base)); if (TYPE_ALIGN (ref_type) != ref_align) ref_type = build_aligned_type (ref_type, ref_align); (*slot)->mem.ref -- cgit v1.1 From df30ab70690d6088f367e74757f0b5dd6a2587e5 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Tue, 28 Apr 2020 14:36:54 +0200 Subject: fix regression with MEM commoning This fixes a regression when canonicalizing refs for LIM PR84362. 
This possibly unshares and rewrites the refs in the internal data and thus pointer equality no longer works in ref_always_accessed computation. 2020-04-29 Richard Biener * tree-ssa-loop-im.c (ref_always_accessed::operator ()): Just check whether the stmt stores. --- gcc/tree-ssa-loop-im.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 273a580..abd5f70 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -2179,20 +2179,21 @@ ref_always_accessed::operator () (mem_ref_loc *loc) { class loop *must_exec; - if (!get_lim_data (loc->stmt)) + struct lim_aux_data *lim_data = get_lim_data (loc->stmt); + if (!lim_data) return false; /* If we require an always executed store make sure the statement - stores to the reference. */ + is a store. */ if (stored_p) { tree lhs = gimple_get_lhs (loc->stmt); if (!lhs - || lhs != *loc->ref) + || !(DECL_P (lhs) || REFERENCE_CLASS_P (lhs))) return false; } - must_exec = get_lim_data (loc->stmt)->always_executed_in; + must_exec = lim_data->always_executed_in; if (!must_exec) return false; -- cgit v1.1 From f9e1ea10e657af9fb02fafecf1a600740fd34409 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Thu, 30 Apr 2020 10:47:15 +0200 Subject: tree-optimization/39612 - avoid issueing loads in SM when possible Currently store-motion emits a load of the value in the loop preheader even when the original loop does not contain any read of the reference. This avoids doing this. In the conditional store-motion case we need to mark the sunk stores with no-warning since the control dependence is too tricky to figure out for the uninit warning. 2020-05-04 Richard Biener PR tree-optimization/39612 * tree-ssa-loop-im.c (im_mem_ref::loaded): New member. (set_ref_loaded_in_loop): New. (mark_ref_loaded): Likewise. (gather_mem_refs_stmt): Call mark_ref_loaded for loads. (execute_sm): Avoid issueing a load when it was not there. (execute_sm_if_changed): Avoid issueing warnings for the conditional store. * gcc.dg/tree-ssa/pr39612.c: New testcase. --- gcc/tree-ssa-loop-im.c | 47 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 8 deletions(-) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index abd5f70..18e5c18 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -127,6 +127,8 @@ public: bitmap stored; /* The set of loops in that this memory location is stored to. */ + bitmap loaded; /* The set of loops in that this memory location + is loaded from. */ vec accesses_in_loop; /* The locations of the accesses. Vector indexed by the loop number. */ @@ -1395,6 +1397,7 @@ mem_ref_alloc (ao_ref *mem, unsigned hash, unsigned id) ref->ref_decomposed = false; ref->hash = hash; ref->stored = NULL; + ref->loaded = NULL; bitmap_initialize (&ref->indep_loop, &lim_bitmap_obstack); bitmap_initialize (&ref->dep_loop, &lim_bitmap_obstack); ref->accesses_in_loop.create (1); @@ -1435,6 +1438,27 @@ mark_ref_stored (im_mem_ref *ref, class loop *loop) loop = loop_outer (loop); } +/* Set the LOOP bit in REF loaded bitmap and allocate that if + necessary. Return whether a bit was changed. */ + +static bool +set_ref_loaded_in_loop (im_mem_ref *ref, class loop *loop) +{ + if (!ref->loaded) + ref->loaded = BITMAP_ALLOC (&lim_bitmap_obstack); + return bitmap_set_bit (ref->loaded, loop->num); +} + +/* Marks reference REF as loaded in LOOP. 
*/ + +static void +mark_ref_loaded (im_mem_ref *ref, class loop *loop) +{ + while (loop != current_loops->tree_root + && set_ref_loaded_in_loop (ref, loop)) + loop = loop_outer (loop); +} + /* Gathers memory references in statement STMT in LOOP, storing the information about them in the memory_accesses structure. Marks the vops accessed through unrecognized statements there as @@ -1571,6 +1595,8 @@ gather_mem_refs_stmt (class loop *loop, gimple *stmt) bitmap_set_bit (&memory_accesses.refs_stored_in_loop[loop->num], ref->id); mark_ref_stored (ref, loop); } + else + mark_ref_loaded (ref, loop); init_lim_data (stmt)->ref = ref->id; return; } @@ -1968,6 +1994,8 @@ execute_sm_if_changed (edge ex, tree mem, tree tmp_var, tree flag, gsi = gsi_start_bb (then_bb); /* Insert actual store. */ stmt = gimple_build_assign (unshare_expr (mem), tmp_var); + /* Make sure to not warn about maybe-uninit uses of tmp_var here. */ + gimple_set_no_warning (stmt, true); gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING); edge e1 = single_succ_edge (new_bb); @@ -2115,14 +2143,17 @@ execute_sm (class loop *loop, vec exits, im_mem_ref *ref) by move_computations after all dependencies. */ gsi = gsi_for_stmt (first_mem_ref_loc (loop, ref)->stmt); - /* FIXME/TODO: For the multi-threaded variant, we could avoid this - load altogether, since the store is predicated by a flag. We - could, do the load only if it was originally in the loop. */ - load = gimple_build_assign (tmp_var, unshare_expr (ref->mem.ref)); - lim_data = init_lim_data (load); - lim_data->max_loop = loop; - lim_data->tgt_loop = loop; - gsi_insert_before (&gsi, load, GSI_SAME_STMT); + /* Avoid doing a load if there was no load of the ref in the loop. + Esp. when the ref is not always stored we cannot optimize it + away later. */ + if (ref->loaded && bitmap_bit_p (ref->loaded, loop->num)) + { + load = gimple_build_assign (tmp_var, unshare_expr (ref->mem.ref)); + lim_data = init_lim_data (load); + lim_data->max_loop = loop; + lim_data->tgt_loop = loop; + gsi_insert_before (&gsi, load, GSI_SAME_STMT); + } if (multi_threaded_model_p) { -- cgit v1.1 From 0424a5ece5307cc22bbc0fe97edf4707d7a798ed Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Tue, 5 May 2020 11:00:09 +0200 Subject: tree-optimization/94949 - fix load eliding in SM This fixes the case of not using the multithreaded model when only conditionally storing to the destination. We cannot elide the load in this case. 2020-05-05 Richard Biener PR tree-optimization/94949 * tree-ssa-loop-im.c (execute_sm): Check whether we use the multithreaded model or always compute the stored value before eliding a load. * gcc.dg/torture/pr94949.c: New testcase. --- gcc/tree-ssa-loop-im.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 18e5c18..554dd4b 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -2128,9 +2128,9 @@ execute_sm (class loop *loop, vec exits, im_mem_ref *ref) fmt_data.orig_loop = loop; for_each_index (&ref->mem.ref, force_move_till, &fmt_data); + bool always_stored = ref_always_accessed_p (loop, ref, true); if (bb_in_transaction (loop_preheader_edge (loop)->src) - || (! flag_store_data_races - && ! ref_always_accessed_p (loop, ref, true))) + || (! flag_store_data_races && ! 
always_stored)) multi_threaded_model_p = true; if (multi_threaded_model_p) @@ -2145,8 +2145,10 @@ execute_sm (class loop *loop, vec exits, im_mem_ref *ref) /* Avoid doing a load if there was no load of the ref in the loop. Esp. when the ref is not always stored we cannot optimize it - away later. */ - if (ref->loaded && bitmap_bit_p (ref->loaded, loop->num)) + away later. But when it is not always stored we must use a conditional + store then. */ + if ((!always_stored && !multi_threaded_model_p) + || (ref->loaded && bitmap_bit_p (ref->loaded, loop->num))) { load = gimple_build_assign (tmp_var, unshare_expr (ref->mem.ref)); lim_data = init_lim_data (load); -- cgit v1.1 From 371905d12259c180efb9b1f1b5716e969feb60f9 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Wed, 6 May 2020 09:39:45 +0200 Subject: tree-optimization/94963 - avoid bogus uninit warning with store-motion Eliding the load for store-motion causes an uninitialized variable flowing into the loop, conditionally initialized and used. The uninit warning cannot relate the flag used to guard the initialization and use with the actual initialization so the following robustifies the previous approach of marking the conditional store as not to be warned on by instead initializing the variable on loop entry from an uninitialized variable we mark as not to be warned for. 2020-05-06 Richard Biener PR tree-optimization/94963 * tree-ssa-loop-im.c (execute_sm_if_changed): Remove no-warning marking of the conditional store. (execute_sm): Instead mark the uninitialized state on loop entry to be not warned about. * gcc.dg/pr94963.c: New testcase. --- gcc/tree-ssa-loop-im.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 554dd4b..3056b4b 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -1994,8 +1994,6 @@ execute_sm_if_changed (edge ex, tree mem, tree tmp_var, tree flag, gsi = gsi_start_bb (then_bb); /* Insert actual store. */ stmt = gimple_build_assign (unshare_expr (mem), tmp_var); - /* Make sure to not warn about maybe-uninit uses of tmp_var here. */ - gimple_set_no_warning (stmt, true); gsi_insert_after (&gsi, stmt, GSI_CONTINUE_LINKING); edge e1 = single_succ_edge (new_bb); @@ -2149,13 +2147,19 @@ execute_sm (class loop *loop, vec exits, im_mem_ref *ref) store then. */ if ((!always_stored && !multi_threaded_model_p) || (ref->loaded && bitmap_bit_p (ref->loaded, loop->num))) + load = gimple_build_assign (tmp_var, unshare_expr (ref->mem.ref)); + else { - load = gimple_build_assign (tmp_var, unshare_expr (ref->mem.ref)); - lim_data = init_lim_data (load); - lim_data->max_loop = loop; - lim_data->tgt_loop = loop; - gsi_insert_before (&gsi, load, GSI_SAME_STMT); + /* If not emitting a load mark the uninitialized state on the + loop entry as not to be warned for. 
*/ + tree uninit = create_tmp_reg (TREE_TYPE (tmp_var)); + TREE_NO_WARNING (uninit) = 1; + load = gimple_build_assign (tmp_var, uninit); } + lim_data = init_lim_data (load); + lim_data->max_loop = loop; + lim_data->tgt_loop = loop; + gsi_insert_before (&gsi, load, GSI_SAME_STMT); if (multi_threaded_model_p) { -- cgit v1.1 From 283cb9ea6293e813e48a1b769e1e0779918ea20a Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 27 Apr 2020 14:45:54 +0200 Subject: tree-optimization/57359 - rewrite SM code This rewrites store-motion to process candidates where we can ensure order preserving separately and with no need to disambiguate against all stores. Those candidates we cannot handle this way are validated to be independent on all stores (w/o TBAA) and then processed as "unordered" (all conditionally executed stores are so as well). This will necessary cause FAIL: gcc.dg/graphite/pr80906.c scan-tree-dump graphite "isl AST to Gimple succeeded" because the SM previously performed is not valid for exactly the PR57359 reason, we still perform SM of qc for the innermost loop but that's not enough. There is still room for improvements because we still check some constraints for the order preserving cases that are only necessary in the current strict way for the unordered ones. Leaving that for the furture. 2020-05-07 Richard Biener PR tree-optimization/57359 * tree-ssa-loop-im.c (im_mem_ref::indep_loop): Remove. (in_mem_ref::dep_loop): Repurpose. (LOOP_DEP_BIT): Remove. (enum dep_kind): New. (enum dep_state): Likewise. (record_loop_dependence): New function to populate the dependence cache. (query_loop_dependence): New function to query the dependence cache. (memory_accesses::refs_in_loop): Rename to ... (memory_accesses::refs_loaded_in_loop): ... this and change to only record loads. (outermost_indep_loop): Adjust. (mem_ref_alloc): Likewise. (gather_mem_refs_stmt): Likewise. (mem_refs_may_alias_p): Add tbaa_p parameter and pass it down. (struct sm_aux): New. (execute_sm): Split code generation on exits, record state into new hash-map. (enum sm_kind): New. (execute_sm_exit): Exit code generation part. (sm_seq_push_down): Helper for sm_seq_valid_bb performing dependence checking on stores reached from exits. (sm_seq_valid_bb): New function gathering SM stores on exits. (hoist_memory_references): Re-implement. (refs_independent_p): Add tbaa_p parameter and pass it down. (record_dep_loop): Remove. (ref_indep_loop_p_1): Fold into ... (ref_indep_loop_p): ... this and generalize for three kinds of dependence queries. (can_sm_ref_p): Adjust according to hoist_memory_references changes. (store_motion_loop): Don't do anything if the set of SM candidates is empty. (tree_ssa_lim_initialize): Adjust. (tree_ssa_lim_finalize): Likewise. * gcc.dg/torture/pr57359-1.c: New testcase. * gcc.dg/torture/pr57359-1.c: Likewise. * gcc.dg/tree-ssa/ssa-lim-14.c: Likewise. * gcc.dg/graphite/pr80906.c: XFAIL. --- gcc/tree-ssa-loop-im.c | 580 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 464 insertions(+), 116 deletions(-) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 3056b4b..2aabb54c 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -133,24 +133,41 @@ public: /* The locations of the accesses. Vector indexed by the loop number. */ - /* The following sets are computed on demand. We keep both set and - its complement, so that we know whether the information was - already computed or not. 
*/ - bitmap_head indep_loop; /* The set of loops in that the memory - reference is independent, meaning: - If it is stored in the loop, this store - is independent on all other loads and - stores. - If it is only loaded, then it is independent - on all stores in the loop. */ - bitmap_head dep_loop; /* The complement of INDEP_LOOP. */ + /* The following set is computed on demand. */ + bitmap_head dep_loop; /* The set of loops in that the memory + reference is {in,}dependent in + different modes. */ }; -/* We use two bits per loop in the ref->{in,}dep_loop bitmaps, the first - to record (in)dependence against stores in the loop and its subloops, the - second to record (in)dependence against all references in the loop - and its subloops. */ -#define LOOP_DEP_BIT(loopnum, storedp) (2 * (loopnum) + (storedp ? 1 : 0)) +/* We use six bits per loop in the ref->dep_loop bitmap to record + the dep_kind x dep_state combinations. */ + +enum dep_kind { lim_raw, sm_war, sm_waw }; +enum dep_state { dep_unknown, dep_independent, dep_dependent }; + +/* Populate the loop dependence cache of REF for LOOP, KIND with STATE. */ + +static void +record_loop_dependence (class loop *loop, im_mem_ref *ref, + dep_kind kind, dep_state state) +{ + gcc_assert (state != dep_unknown); + unsigned bit = 6 * loop->num + kind * 2 + state == dep_dependent ? 1 : 0; + bitmap_set_bit (&ref->dep_loop, bit); +} + +/* Query the loop dependence cache of REF for LOOP, KIND. */ + +static dep_state +query_loop_dependence (class loop *loop, im_mem_ref *ref, dep_kind kind) +{ + unsigned first_bit = 6 * loop->num + kind * 2; + if (bitmap_bit_p (&ref->dep_loop, first_bit)) + return dep_independent; + else if (bitmap_bit_p (&ref->dep_loop, first_bit + 1)) + return dep_dependent; + return dep_unknown; +} /* Mem_ref hashtable helpers. */ @@ -211,7 +228,7 @@ static struct vec refs_list; /* The set of memory references accessed in each loop. */ - vec refs_in_loop; + vec refs_loaded_in_loop; /* The set of memory references stored in each loop. */ vec refs_stored_in_loop; @@ -227,8 +244,9 @@ static struct static bitmap_obstack lim_bitmap_obstack; static obstack mem_ref_obstack; -static bool ref_indep_loop_p (class loop *, im_mem_ref *); +static bool ref_indep_loop_p (class loop *, im_mem_ref *, dep_kind); static bool ref_always_accessed_p (class loop *, im_mem_ref *, bool); +static bool refs_independent_p (im_mem_ref *, im_mem_ref *, bool = true); /* Minimum cost of an expensive expression. 
*/ #define LIM_EXPENSIVE ((unsigned) param_lim_expensive) @@ -573,10 +591,10 @@ outermost_indep_loop (class loop *outer, class loop *loop, im_mem_ref *ref) aloop != loop; aloop = superloop_at_depth (loop, loop_depth (aloop) + 1)) if ((!ref->stored || !bitmap_bit_p (ref->stored, aloop->num)) - && ref_indep_loop_p (aloop, ref)) + && ref_indep_loop_p (aloop, ref, lim_raw)) return aloop; - if (ref_indep_loop_p (loop, ref)) + if (ref_indep_loop_p (loop, ref, lim_raw)) return loop; else return NULL; @@ -1398,7 +1416,6 @@ mem_ref_alloc (ao_ref *mem, unsigned hash, unsigned id) ref->hash = hash; ref->stored = NULL; ref->loaded = NULL; - bitmap_initialize (&ref->indep_loop, &lim_bitmap_obstack); bitmap_initialize (&ref->dep_loop, &lim_bitmap_obstack); ref->accesses_in_loop.create (1); @@ -1589,14 +1606,17 @@ gather_mem_refs_stmt (class loop *loop, gimple *stmt) record_mem_ref_loc (ref, stmt, mem); } - bitmap_set_bit (&memory_accesses.refs_in_loop[loop->num], ref->id); if (is_stored) { bitmap_set_bit (&memory_accesses.refs_stored_in_loop[loop->num], ref->id); mark_ref_stored (ref, loop); } - else - mark_ref_loaded (ref, loop); + /* A not simple memory op is also a read when it is a write. */ + if (!is_stored || id == UNANALYZABLE_MEM_ID) + { + bitmap_set_bit (&memory_accesses.refs_loaded_in_loop[loop->num], ref->id); + mark_ref_loaded (ref, loop); + } init_lim_data (stmt)->ref = ref->id; return; } @@ -1698,7 +1718,8 @@ analyze_memory_references (void) static bool mem_refs_may_alias_p (im_mem_ref *mem1, im_mem_ref *mem2, - hash_map **ttae_cache) + hash_map **ttae_cache, + bool tbaa_p) { /* Perform BASE + OFFSET analysis -- if MEM1 and MEM2 are based on the same object and their offset differ in such a way that the locations cannot @@ -1707,7 +1728,7 @@ mem_refs_may_alias_p (im_mem_ref *mem1, im_mem_ref *mem2, aff_tree off1, off2; /* Perform basic offset and type-based disambiguation. */ - if (!refs_may_alias_p_1 (&mem1->mem, &mem2->mem, true)) + if (!refs_may_alias_p_1 (&mem1->mem, &mem2->mem, tbaa_p)) return false; /* The expansion of addresses may be a bit expensive, thus we only do @@ -2094,23 +2115,28 @@ execute_sm_if_changed_flag_set (class loop *loop, im_mem_ref *ref, return flag; } +struct sm_aux +{ + tree tmp_var; + tree store_flag; + hash_set flag_bbs; +}; + /* Executes store motion of memory reference REF from LOOP. Exits from the LOOP are stored in EXITS. The initialization of the temporary variable is put to the preheader of the loop, and assignments to the reference from the temporary variable are emitted to exits. 
*/ static void -execute_sm (class loop *loop, vec exits, im_mem_ref *ref) +execute_sm (class loop *loop, im_mem_ref *ref, + hash_map &aux_map) { - tree tmp_var, store_flag = NULL_TREE; - unsigned i; gassign *load; struct fmt_data fmt_data; - edge ex; struct lim_aux_data *lim_data; bool multi_threaded_model_p = false; gimple_stmt_iterator gsi; - hash_set flag_bbs; + sm_aux *aux = new sm_aux; if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -2119,8 +2145,8 @@ execute_sm (class loop *loop, vec exits, im_mem_ref *ref) fprintf (dump_file, " from loop %d\n", loop->num); } - tmp_var = create_tmp_reg (TREE_TYPE (ref->mem.ref), - get_lsm_tmp_name (ref->mem.ref, ~0)); + aux->tmp_var = create_tmp_reg (TREE_TYPE (ref->mem.ref), + get_lsm_tmp_name (ref->mem.ref, ~0)); fmt_data.loop = loop; fmt_data.orig_loop = loop; @@ -2132,9 +2158,15 @@ execute_sm (class loop *loop, vec exits, im_mem_ref *ref) multi_threaded_model_p = true; if (multi_threaded_model_p) - store_flag = execute_sm_if_changed_flag_set (loop, ref, &flag_bbs); + aux->store_flag + = execute_sm_if_changed_flag_set (loop, ref, &aux->flag_bbs); + else + aux->store_flag = NULL_TREE; + + /* Remember variable setup. */ + aux_map.put (ref, aux); - rewrite_mem_refs (loop, ref, tmp_var); + rewrite_mem_refs (loop, ref, aux->tmp_var); /* Emit the load code on a random exit edge or into the latch if the loop does not exit, so that we are sure it will be processed @@ -2147,14 +2179,14 @@ execute_sm (class loop *loop, vec exits, im_mem_ref *ref) store then. */ if ((!always_stored && !multi_threaded_model_p) || (ref->loaded && bitmap_bit_p (ref->loaded, loop->num))) - load = gimple_build_assign (tmp_var, unshare_expr (ref->mem.ref)); + load = gimple_build_assign (aux->tmp_var, unshare_expr (ref->mem.ref)); else { /* If not emitting a load mark the uninitialized state on the loop entry as not to be warned for. */ - tree uninit = create_tmp_reg (TREE_TYPE (tmp_var)); + tree uninit = create_tmp_reg (TREE_TYPE (aux->tmp_var)); TREE_NO_WARNING (uninit) = 1; - load = gimple_build_assign (tmp_var, uninit); + load = gimple_build_assign (aux->tmp_var, uninit); } lim_data = init_lim_data (load); lim_data->max_loop = loop; @@ -2163,24 +2195,269 @@ execute_sm (class loop *loop, vec exits, im_mem_ref *ref) if (multi_threaded_model_p) { - load = gimple_build_assign (store_flag, boolean_false_node); + load = gimple_build_assign (aux->store_flag, boolean_false_node); lim_data = init_lim_data (load); lim_data->max_loop = loop; lim_data->tgt_loop = loop; gsi_insert_before (&gsi, load, GSI_SAME_STMT); } +} - /* Sink the store to every exit from the loop. */ - FOR_EACH_VEC_ELT (exits, i, ex) - if (!multi_threaded_model_p) +/* sm_ord is used for ordinary stores we can retain order with respect + to other stores + sm_unord is used for conditional executed stores which need to be + able to execute in arbitrary order with respect to other stores + sm_other is used for stores we do not try to apply store motion to. */ +enum sm_kind { sm_ord, sm_unord, sm_other }; +typedef std::pair seq_entry; + +static void +execute_sm_exit (class loop *loop, edge ex, vec &seq, + hash_map &aux_map, sm_kind kind) +{ + /* Sink the stores to exit from the loop. 
*/ + for (unsigned i = seq.length (); i > 0; --i) + { + if (seq[i-1].second != kind) + continue; + im_mem_ref *ref = memory_accesses.refs_list[seq[i-1].first]; + sm_aux *aux = *aux_map.get (ref); + if (!aux->store_flag) + { + gassign *store; + store = gimple_build_assign (unshare_expr (ref->mem.ref), + aux->tmp_var); + gsi_insert_on_edge (ex, store); + } + else + execute_sm_if_changed (ex, ref->mem.ref, aux->tmp_var, aux->store_flag, + loop_preheader_edge (loop), &aux->flag_bbs); + } +} + +/* Push the SM candidate at index PTR in the sequence SEQ down until + we hit the next SM candidate. Return true if that went OK and + false if we could not disambiguate agains another unrelated ref. */ + +static bool +sm_seq_push_down (vec &seq, unsigned ptr) +{ + for (; ptr > 0; --ptr) + { + seq_entry &new_cand = seq[ptr]; + seq_entry &against = seq[ptr-1]; + if (against.second == sm_ord) + /* Found the tail of the sequence. */ + break; + if (!refs_independent_p (memory_accesses.refs_list[new_cand.first], + memory_accesses.refs_list[against.first], + false)) + /* ??? Prune new_cand from the list of refs to apply SM to. */ + return false; + std::swap (new_cand, against); + } + return true; +} + +/* Computes the sequence of stores from candidates in REFS_NOT_IN_SEQ to SEQ + walking backwards from VDEF (or the end of BB if VDEF is NULL). */ + +static int +sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef, + vec &seq, bitmap refs_not_in_seq, + bitmap refs_not_supported, bool forked) +{ + if (!vdef) + for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi); + gsi_prev (&gsi)) { - gassign *store; - store = gimple_build_assign (unshare_expr (ref->mem.ref), tmp_var); - gsi_insert_on_edge (ex, store); + vdef = gimple_vdef (gsi_stmt (gsi)); + if (vdef) + break; } - else - execute_sm_if_changed (ex, ref->mem.ref, tmp_var, store_flag, - loop_preheader_edge (loop), &flag_bbs); + if (!vdef) + { + gphi *vphi = get_virtual_phi (bb); + if (vphi) + vdef = gimple_phi_result (vphi); + } + if (!vdef) + { + if (single_pred_p (bb)) + /* This handles the perfect nest case. */ + return sm_seq_valid_bb (loop, single_pred (bb), vdef, + seq, refs_not_in_seq, refs_not_supported, + forked); + return 0; + } + do + { + gimple *def = SSA_NAME_DEF_STMT (vdef); + if (gimple_bb (def) != bb) + { + /* If we forked by processing a PHI do not allow our walk to + merge again until we handle that robustly. */ + if (forked) + { + /* Mark refs_not_in_seq as unsupported. */ + bitmap_ior_into (refs_not_supported, refs_not_in_seq); + return 1; + } + /* Otherwise it doesn't really matter if we end up in different + BBs. */ + bb = gimple_bb (def); + } + if (gphi *phi = dyn_cast (def)) + { + /* Handle CFG merges. Until we handle forks (gimple_bb (def) != bb) + this is still linear. + Eventually we want to cache intermediate results per BB + (but we can't easily cache for different exits?). */ + /* Stop at PHIs with possible backedges. */ + if (bb == bb->loop_father->header + || bb->flags & BB_IRREDUCIBLE_LOOP) + { + /* Mark refs_not_in_seq as unsupported. 
*/ + bitmap_ior_into (refs_not_supported, refs_not_in_seq); + return 1; + } + if (gimple_phi_num_args (phi) == 1) + return sm_seq_valid_bb (loop, gimple_phi_arg_edge (phi, 0)->src, + gimple_phi_arg_def (phi, 0), seq, + refs_not_in_seq, refs_not_supported, + false); + auto_vec first_edge_seq; + auto_bitmap tem_refs_not_in_seq (&lim_bitmap_obstack); + int eret; + bitmap_copy (tem_refs_not_in_seq, refs_not_in_seq); + eret = sm_seq_valid_bb (loop, gimple_phi_arg_edge (phi, 0)->src, + gimple_phi_arg_def (phi, 0), + first_edge_seq, + tem_refs_not_in_seq, refs_not_supported, + true); + if (eret != 1) + return -1; + /* Simplify our lives by pruning the sequence of !sm_ord. */ + while (!first_edge_seq.is_empty () + && first_edge_seq.last ().second != sm_ord) + first_edge_seq.pop (); + for (unsigned int i = 1; i < gimple_phi_num_args (phi); ++i) + { + tree vuse = gimple_phi_arg_def (phi, i); + edge e = gimple_phi_arg_edge (phi, i); + auto_vec edge_seq; + bitmap_copy (tem_refs_not_in_seq, refs_not_in_seq); + eret = sm_seq_valid_bb (loop, e->src, vuse, edge_seq, + tem_refs_not_in_seq, refs_not_supported, + true); + if (eret != 1) + return -1; + /* Simplify our lives by pruning the sequence of !sm_ord. */ + while (!edge_seq.is_empty () + && edge_seq.last ().second != sm_ord) + edge_seq.pop (); + unsigned min_len = MIN(first_edge_seq.length (), + edge_seq.length ()); + /* Incrementally merge seqs into first_edge_seq. */ + for (unsigned int i = 0; i < min_len; ++i) + { + /* ??? We can more intelligently merge when we face different + order by additional sinking operations in one sequence. + For now we simply mark them as to be processed by the + not order-preserving SM code. */ + if (first_edge_seq[i].first != edge_seq[i].first) + { + bitmap_set_bit (refs_not_supported, + first_edge_seq[i].first); + bitmap_set_bit (refs_not_supported, edge_seq[i].first); + first_edge_seq[i].second = sm_unord; + } + /* sm_unord prevails. */ + else if (first_edge_seq[i].second != edge_seq[i].second) + { + /* This is just an optimization. */ + gcc_assert (bitmap_bit_p (refs_not_supported, + first_edge_seq[i].first)); + first_edge_seq[i].second = sm_unord; + } + } + /* Any excess elements become sm_unord since they are now + coonditionally executed. */ + if (first_edge_seq.length () > edge_seq.length ()) + { + for (unsigned i = edge_seq.length (); + i < first_edge_seq.length (); ++i) + { + bitmap_set_bit (refs_not_supported, + first_edge_seq[i].first); + first_edge_seq[i].second = sm_unord; + } + } + else if (edge_seq.length () > first_edge_seq.length ()) + { + for (unsigned i = first_edge_seq.length (); + i < edge_seq.length (); ++i) + bitmap_set_bit (refs_not_supported, edge_seq[i].first); + } + } + /* Use the sequence from the first edge and push SMs down. */ + for (unsigned i = 0; i < first_edge_seq.length (); ++i) + { + if (first_edge_seq[i].second == sm_other) + break; + unsigned id = first_edge_seq[i].first; + seq.safe_push (first_edge_seq[i]); + if (first_edge_seq[i].second == sm_ord + && !sm_seq_push_down (seq, seq.length () - 1)) + { + bitmap_set_bit (refs_not_supported, id); + /* ??? Mark it sm_unord but it's now "somewhere" ... */ + for (unsigned i = seq.length (); i != 0; --i) + if (seq[i - 1].first == id) + { + seq[i - 1].second = sm_unord; + break; + } + } + } + return 1; + } + lim_aux_data *data = get_lim_data (def); + gcc_assert (data); + if (data->ref == UNANALYZABLE_MEM_ID) + return -1; + /* One of the stores we want to apply SM to and we've not yet seen. 
*/ + else if (bitmap_clear_bit (refs_not_in_seq, data->ref)) + { + seq.safe_push (std::make_pair (data->ref, sm_ord)); + + /* 1) push it down the queue until a SMed + and not ignored ref is reached, skipping all not SMed refs + and ignored refs via non-TBAA disambiguation. */ + if (!sm_seq_push_down (seq, seq.length () - 1)) + { + bitmap_set_bit (refs_not_supported, data->ref); + /* ??? Mark it sm_unord but it's now "somewhere" ... */ + for (unsigned i = seq.length (); i != 0; --i) + if (seq[i - 1].first == data->ref) + { + seq[i - 1].second = sm_unord; + break; + } + } + + /* 2) check whether we've seen all refs we want to SM and if so + declare success for the active exit */ + if (bitmap_empty_p (refs_not_in_seq)) + return 1; + } + else + /* Another store not part of the final sequence. Simply push it. */ + seq.safe_push (std::make_pair (data->ref, sm_other)); + + vdef = gimple_vuse (def); + } + while (1); } /* Hoists memory references MEM_REFS out of LOOP. EXITS is the list of exit @@ -2194,11 +2471,104 @@ hoist_memory_references (class loop *loop, bitmap mem_refs, unsigned i; bitmap_iterator bi; + /* To address PR57359 before actually applying store-motion check + the candidates found for validity with regards to reordering + relative to other stores which we until here disambiguated using + TBAA which isn't valid. + What matters is the order of the last stores to the mem_refs + with respect to the other stores of the loop at the point of the + loop exits. */ + + /* For each exit compute the store order, pruning from mem_refs + on the fly. */ + /* The complexity of this is at least + O(number of exits * number of SM refs) but more approaching + O(number of exits * number of SM refs * number of stores). */ + /* ??? Somehow do this in a single sweep over the loop body. */ + auto_vec > > sms; + auto_bitmap refs_not_supported (&lim_bitmap_obstack); + edge e; + FOR_EACH_VEC_ELT (exits, i, e) + { + vec seq; + seq.create (4); + auto_bitmap refs_not_in_seq (&lim_bitmap_obstack); + bitmap_copy (refs_not_in_seq, mem_refs); + int res = sm_seq_valid_bb (loop, e->src, NULL_TREE, + seq, refs_not_in_seq, + refs_not_supported, false); + if (res != 1) + { + bitmap_copy (refs_not_supported, mem_refs); + break; + } + sms.safe_push (std::make_pair (e, seq)); + } + + /* Prune pruned mem_refs from earlier processed exits. */ + bool changed = !bitmap_empty_p (refs_not_supported); + while (changed) + { + changed = false; + std::pair > *seq; + FOR_EACH_VEC_ELT (sms, i, seq) + { + for (unsigned i = 0; i < seq->second.length (); ++i) + { + if (seq->second[i].second == sm_other) + break; + unsigned id = seq->second[i].first; + if (bitmap_bit_p (refs_not_supported, id)) + seq->second[i].second = sm_other; + else if (!sm_seq_push_down (seq->second, i)) + { + if (bitmap_set_bit (refs_not_supported, id)) + changed = true; + } + } + } + } + + /* Verify dependence for refs we cannot handle with the order preserving + code (refs_not_supported) or prune them from mem_refs. */ + auto_vec unord_refs; + EXECUTE_IF_SET_IN_BITMAP (refs_not_supported, 0, i, bi) + { + ref = memory_accesses.refs_list[i]; + if (!ref_indep_loop_p (loop, ref, sm_waw)) + bitmap_clear_bit (mem_refs, i); + /* We've now verified store order for ref with respect to all other + stores in the loop does not matter. */ + else + unord_refs.safe_push (std::make_pair (i, sm_unord)); + } + + hash_map aux_map; + + /* Execute SM but delay the store materialization for ordered + sequences on exit. 
*/ EXECUTE_IF_SET_IN_BITMAP (mem_refs, 0, i, bi) { ref = memory_accesses.refs_list[i]; - execute_sm (loop, exits, ref); + execute_sm (loop, ref, aux_map); } + + /* Materialize ordered store sequences on exits. */ + FOR_EACH_VEC_ELT (exits, i, e) + { + if (i < sms.length ()) + { + gcc_assert (sms[i].first == e); + execute_sm_exit (loop, e, sms[i].second, aux_map, sm_ord); + sms[i].second.release (); + } + if (!unord_refs.is_empty ()) + execute_sm_exit (loop, e, unord_refs, aux_map, sm_unord); + } + + for (hash_map::iterator iter = aux_map.begin (); + iter != aux_map.end (); ++iter) + delete (*iter).second; } class ref_always_accessed @@ -2254,7 +2624,7 @@ ref_always_accessed_p (class loop *loop, im_mem_ref *ref, bool stored_p) /* Returns true if REF1 and REF2 are independent. */ static bool -refs_independent_p (im_mem_ref *ref1, im_mem_ref *ref2) +refs_independent_p (im_mem_ref *ref1, im_mem_ref *ref2, bool tbaa_p) { if (ref1 == ref2) return true; @@ -2263,7 +2633,7 @@ refs_independent_p (im_mem_ref *ref1, im_mem_ref *ref2) fprintf (dump_file, "Querying dependency of refs %u and %u: ", ref1->id, ref2->id); - if (mem_refs_may_alias_p (ref1, ref2, &memory_accesses.ttae_cache)) + if (mem_refs_may_alias_p (ref1, ref2, &memory_accesses.ttae_cache, tbaa_p)) { if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "dependent.\n"); @@ -2277,32 +2647,23 @@ refs_independent_p (im_mem_ref *ref1, im_mem_ref *ref2) } } -/* Mark REF dependent on stores or loads (according to STORED_P) in LOOP - and its super-loops. */ - -static void -record_dep_loop (class loop *loop, im_mem_ref *ref, bool stored_p) -{ - /* We can propagate dependent-in-loop bits up the loop - hierarchy to all outer loops. */ - while (loop != current_loops->tree_root - && bitmap_set_bit (&ref->dep_loop, LOOP_DEP_BIT (loop->num, stored_p))) - loop = loop_outer (loop); -} - -/* Returns true if REF is independent on all other memory - references in LOOP. */ +/* Returns true if REF is independent on all other accessess in LOOP. + KIND specifies the kind of dependence to consider. + lim_raw assumes REF is not stored in LOOP and disambiguates RAW + dependences so if true REF can be hoisted out of LOOP + sm_war disambiguates a store REF against all other loads to see + whether the store can be sunk across loads out of LOOP + sm_waw disambiguates a store REF against all other stores to see + whether the store can be sunk across stores out of LOOP. */ static bool -ref_indep_loop_p_1 (class loop *loop, im_mem_ref *ref, bool stored_p) +ref_indep_loop_p (class loop *loop, im_mem_ref *ref, dep_kind kind) { - stored_p |= (ref->stored && bitmap_bit_p (ref->stored, loop->num)); - bool indep_p = true; bitmap refs_to_check; - if (stored_p) - refs_to_check = &memory_accesses.refs_in_loop[loop->num]; + if (kind == sm_war) + refs_to_check = &memory_accesses.refs_loaded_in_loop[loop->num]; else refs_to_check = &memory_accesses.refs_stored_in_loop[loop->num]; @@ -2310,15 +2671,15 @@ ref_indep_loop_p_1 (class loop *loop, im_mem_ref *ref, bool stored_p) indep_p = false; else { - if (bitmap_bit_p (&ref->indep_loop, LOOP_DEP_BIT (loop->num, stored_p))) - return true; - if (bitmap_bit_p (&ref->dep_loop, LOOP_DEP_BIT (loop->num, stored_p))) - return false; + /* tri-state, { unknown, independent, dependent } */ + dep_state state = query_loop_dependence (loop, ref, kind); + if (state != dep_unknown) + return state == dep_independent ? 
true : false; class loop *inner = loop->inner; while (inner) { - if (!ref_indep_loop_p_1 (inner, ref, stored_p)) + if (!ref_indep_loop_p (inner, ref, kind)) { indep_p = false; break; @@ -2333,7 +2694,7 @@ ref_indep_loop_p_1 (class loop *loop, im_mem_ref *ref, bool stored_p) EXECUTE_IF_SET_IN_BITMAP (refs_to_check, 0, i, bi) { im_mem_ref *aref = memory_accesses.refs_list[i]; - if (!refs_independent_p (ref, aref)) + if (!refs_independent_p (ref, aref, kind != sm_waw)) { indep_p = false; break; @@ -2343,44 +2704,17 @@ ref_indep_loop_p_1 (class loop *loop, im_mem_ref *ref, bool stored_p) } if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "Querying dependencies of ref %u in loop %d: %s\n", + fprintf (dump_file, "Querying %s dependencies of ref %u in loop %d: %s\n", + kind == lim_raw ? "RAW" : (kind == sm_war ? "SM WAR" : "SM WAW"), ref->id, loop->num, indep_p ? "independent" : "dependent"); /* Record the computed result in the cache. */ - if (indep_p) - { - if (bitmap_set_bit (&ref->indep_loop, LOOP_DEP_BIT (loop->num, stored_p)) - && stored_p) - { - /* If it's independend against all refs then it's independent - against stores, too. */ - bitmap_set_bit (&ref->indep_loop, LOOP_DEP_BIT (loop->num, false)); - } - } - else - { - record_dep_loop (loop, ref, stored_p); - if (!stored_p) - { - /* If it's dependent against stores it's dependent against - all refs, too. */ - record_dep_loop (loop, ref, true); - } - } + record_loop_dependence (loop, ref, kind, + indep_p ? dep_independent : dep_dependent); return indep_p; } -/* Returns true if REF is independent on all other memory references in - LOOP. */ - -static bool -ref_indep_loop_p (class loop *loop, im_mem_ref *ref) -{ - gcc_checking_assert (MEM_ANALYZABLE (ref)); - - return ref_indep_loop_p_1 (loop, ref, false); -} /* Returns true if we can perform store motion of REF from LOOP. */ @@ -2410,12 +2744,25 @@ can_sm_ref_p (class loop *loop, im_mem_ref *ref) base = get_base_address (ref->mem.ref); if ((tree_could_trap_p (ref->mem.ref) || (DECL_P (base) && TREE_READONLY (base))) + /* ??? We can at least use false here, allowing loads? We + are forcing conditional stores if the ref is not always + stored to later anyway. So this would only guard + the load we need to emit. Thus when the ref is not + loaded we can elide this completely? */ && !ref_always_accessed_p (loop, ref, true)) return false; - /* And it must be independent on all other memory references - in LOOP. */ - if (!ref_indep_loop_p (loop, ref)) + /* Verify all loads of ref can be hoisted. */ + if (ref->loaded + && bitmap_bit_p (ref->loaded, loop->num) + && !ref_indep_loop_p (loop, ref, lim_raw)) + return false; + + /* Verify the candidate can be disambiguated against all loads, + that is, we can elide all in-loop stores. Disambiguation + against stores is done later when we cannot guarantee preserving + the order of stores. 
*/ + if (!ref_indep_loop_p (loop, ref, sm_war)) return false; return true; @@ -2473,7 +2820,8 @@ store_motion_loop (class loop *loop, bitmap sm_executed) if (loop_suitable_for_sm (loop, exits)) { find_refs_for_sm (loop, sm_executed, sm_in_loop); - hoist_memory_references (loop, sm_in_loop, exits); + if (!bitmap_empty_p (sm_in_loop)) + hoist_memory_references (loop, sm_in_loop, exits); } exits.release (); @@ -2629,8 +2977,8 @@ tree_ssa_lim_initialize (void) memory_accesses.refs_list.quick_push (mem_ref_alloc (NULL, 0, UNANALYZABLE_MEM_ID)); - memory_accesses.refs_in_loop.create (number_of_loops (cfun)); - memory_accesses.refs_in_loop.quick_grow (number_of_loops (cfun)); + memory_accesses.refs_loaded_in_loop.create (number_of_loops (cfun)); + memory_accesses.refs_loaded_in_loop.quick_grow (number_of_loops (cfun)); memory_accesses.refs_stored_in_loop.create (number_of_loops (cfun)); memory_accesses.refs_stored_in_loop.quick_grow (number_of_loops (cfun)); memory_accesses.all_refs_stored_in_loop.create (number_of_loops (cfun)); @@ -2638,7 +2986,7 @@ tree_ssa_lim_initialize (void) for (i = 0; i < number_of_loops (cfun); i++) { - bitmap_initialize (&memory_accesses.refs_in_loop[i], + bitmap_initialize (&memory_accesses.refs_loaded_in_loop[i], &lim_bitmap_obstack); bitmap_initialize (&memory_accesses.refs_stored_in_loop[i], &lim_bitmap_obstack); @@ -2681,7 +3029,7 @@ tree_ssa_lim_finalize (void) memory_accesses.refs_list.release (); obstack_free (&mem_ref_obstack, NULL); - memory_accesses.refs_in_loop.release (); + memory_accesses.refs_loaded_in_loop.release (); memory_accesses.refs_stored_in_loop.release (); memory_accesses.all_refs_stored_in_loop.release (); -- cgit v1.1 From b6ff3ddecfa93d53867afaaa078f85fc848abbbd Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 8 May 2020 12:03:30 +0200 Subject: tree-optimization/94988 - enhance SM some more This enhances store-order preserving store motion to handle the case of non-invariant dependent stores in the sequence of unconditionally executed stores on exit by re-issueing them as part of the sequence of stores on the exit. This fixes the observed regression of gcc.target/i386/pr64110.c which relies on store-motion of 'b' for a loop like for (int i = 0; i < j; ++i) *b++ = x; where for correctness we now no longer apply store-motion. With the patch we emit the correct tem = b; for (int i = 0; i < j; ++i) { tem = tem + 1; *tem = x; } b = tem; *tem = x; preserving the original order of stores. A testcase reflecting the miscompilation done by earlier GCC is added as well. This also fixes the reported ICE in PR95025 and adds checking code to catch it earlier - the issue was not-supported refs propagation leaving stray refs in the sequence. 2020-05-11 Richard Biener PR tree-optimization/94988 PR tree-optimization/95025 * tree-ssa-loop-im.c (seq_entry): Make a struct, add from. (sm_seq_push_down): Take extra parameter denoting where we moved the ref to. (execute_sm_exit): Re-issue sm_other stores in the correct order. (sm_seq_valid_bb): When always executed, allow sm_other to prevail inbetween sm_ord and record their stored value. (hoist_memory_references): Adjust refs_not_supported propagation and prune sm_other from the end of the ordered sequences. * gcc.dg/torture/pr94988.c: New testcase. * gcc.dg/torture/pr95025.c: Likewise. * gcc.dg/torture/pr95045.c: Likewise. * g++.dg/asan/pr95025.C: New testcase. 
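For readability, here are the two code fragments quoted in the commit message above, reformatted as C (identifiers as in the message; 'b' and 'x' are assumed to be globals so that store motion of 'b' applies; this is the message's own illustration, not literal pass output):

int *b;
int x;

/* The loop quoted in the message; the pr64110.c testcase relies on
   store motion of 'b' for this shape.  */
void f (int j)
{
  for (int i = 0; i < j; ++i)
    *b++ = x;
}

/* The corrected output the message shows: 'b' is kept in the temporary
   'tem' inside the loop, and the dependent store through 'tem' is
   re-issued after the loop so the original order of stores is kept.  */
void f_after_sm (int j)
{
  int *tem = b;
  for (int i = 0; i < j; ++i)
    {
      tem = tem + 1;
      *tem = x;
    }
  b = tem;
  *tem = x;
}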
--- gcc/tree-ssa-loop-im.c | 177 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 128 insertions(+), 49 deletions(-) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 2aabb54c..bb78dfb 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -2209,7 +2209,14 @@ execute_sm (class loop *loop, im_mem_ref *ref, able to execute in arbitrary order with respect to other stores sm_other is used for stores we do not try to apply store motion to. */ enum sm_kind { sm_ord, sm_unord, sm_other }; -typedef std::pair seq_entry; +struct seq_entry +{ + seq_entry (unsigned f, sm_kind k, tree fr = NULL) + : first (f), second (k), from (fr) {} + unsigned first; + sm_kind second; + tree from; +}; static void execute_sm_exit (class loop *loop, edge ex, vec &seq, @@ -2218,35 +2225,54 @@ execute_sm_exit (class loop *loop, edge ex, vec &seq, /* Sink the stores to exit from the loop. */ for (unsigned i = seq.length (); i > 0; --i) { - if (seq[i-1].second != kind) - continue; im_mem_ref *ref = memory_accesses.refs_list[seq[i-1].first]; - sm_aux *aux = *aux_map.get (ref); - if (!aux->store_flag) + if (seq[i-1].second == sm_other) { - gassign *store; - store = gimple_build_assign (unshare_expr (ref->mem.ref), - aux->tmp_var); + gcc_assert (kind == sm_ord && seq[i-1].from != NULL_TREE); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Re-issueing dependent store of "); + print_generic_expr (dump_file, ref->mem.ref); + fprintf (dump_file, " from loop %d on exit %d -> %d\n", + loop->num, ex->src->index, ex->dest->index); + } + gassign *store = gimple_build_assign (unshare_expr (ref->mem.ref), + seq[i-1].from); gsi_insert_on_edge (ex, store); } else - execute_sm_if_changed (ex, ref->mem.ref, aux->tmp_var, aux->store_flag, - loop_preheader_edge (loop), &aux->flag_bbs); + { + sm_aux *aux = *aux_map.get (ref); + if (!aux->store_flag) + { + gassign *store; + store = gimple_build_assign (unshare_expr (ref->mem.ref), + aux->tmp_var); + gsi_insert_on_edge (ex, store); + } + else + execute_sm_if_changed (ex, ref->mem.ref, aux->tmp_var, + aux->store_flag, + loop_preheader_edge (loop), &aux->flag_bbs); + } } } /* Push the SM candidate at index PTR in the sequence SEQ down until we hit the next SM candidate. Return true if that went OK and - false if we could not disambiguate agains another unrelated ref. */ + false if we could not disambiguate agains another unrelated ref. + Update *AT to the index where the candidate now resides. */ static bool -sm_seq_push_down (vec &seq, unsigned ptr) +sm_seq_push_down (vec &seq, unsigned ptr, unsigned *at) { + *at = ptr; for (; ptr > 0; --ptr) { seq_entry &new_cand = seq[ptr]; seq_entry &against = seq[ptr-1]; - if (against.second == sm_ord) + if (against.second == sm_ord + || (against.second == sm_other && against.from != NULL_TREE)) /* Found the tail of the sequence. */ break; if (!refs_independent_p (memory_accesses.refs_list[new_cand.first], @@ -2255,6 +2281,7 @@ sm_seq_push_down (vec &seq, unsigned ptr) /* ??? Prune new_cand from the list of refs to apply SM to. */ return false; std::swap (new_cand, against); + *at = ptr - 1; } return true; } @@ -2367,37 +2394,41 @@ sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef, not order-preserving SM code. 
*/ if (first_edge_seq[i].first != edge_seq[i].first) { - bitmap_set_bit (refs_not_supported, - first_edge_seq[i].first); - bitmap_set_bit (refs_not_supported, edge_seq[i].first); - first_edge_seq[i].second = sm_unord; + if (first_edge_seq[i].second == sm_ord) + bitmap_set_bit (refs_not_supported, + first_edge_seq[i].first); + if (edge_seq[i].second == sm_ord) + bitmap_set_bit (refs_not_supported, edge_seq[i].first); + first_edge_seq[i].second = sm_other; } - /* sm_unord prevails. */ + /* sm_other prevails. */ else if (first_edge_seq[i].second != edge_seq[i].second) { /* This is just an optimization. */ gcc_assert (bitmap_bit_p (refs_not_supported, first_edge_seq[i].first)); - first_edge_seq[i].second = sm_unord; + first_edge_seq[i].second = sm_other; } } - /* Any excess elements become sm_unord since they are now + /* Any excess elements become sm_other since they are now coonditionally executed. */ if (first_edge_seq.length () > edge_seq.length ()) { for (unsigned i = edge_seq.length (); i < first_edge_seq.length (); ++i) { - bitmap_set_bit (refs_not_supported, - first_edge_seq[i].first); - first_edge_seq[i].second = sm_unord; + if (first_edge_seq[i].second == sm_ord) + bitmap_set_bit (refs_not_supported, + first_edge_seq[i].first); + first_edge_seq[i].second = sm_other; } } else if (edge_seq.length () > first_edge_seq.length ()) { for (unsigned i = first_edge_seq.length (); i < edge_seq.length (); ++i) - bitmap_set_bit (refs_not_supported, edge_seq[i].first); + if (edge_seq[i].second == sm_ord) + bitmap_set_bit (refs_not_supported, edge_seq[i].first); } } /* Use the sequence from the first edge and push SMs down. */ @@ -2407,17 +2438,13 @@ sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef, break; unsigned id = first_edge_seq[i].first; seq.safe_push (first_edge_seq[i]); + unsigned new_idx; if (first_edge_seq[i].second == sm_ord - && !sm_seq_push_down (seq, seq.length () - 1)) + && !sm_seq_push_down (seq, seq.length () - 1, &new_idx)) { bitmap_set_bit (refs_not_supported, id); - /* ??? Mark it sm_unord but it's now "somewhere" ... */ - for (unsigned i = seq.length (); i != 0; --i) - if (seq[i - 1].first == id) - { - seq[i - 1].second = sm_unord; - break; - } + /* Mark it sm_other. */ + seq[new_idx].second = sm_other; } } return 1; @@ -2429,21 +2456,21 @@ sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef, /* One of the stores we want to apply SM to and we've not yet seen. */ else if (bitmap_clear_bit (refs_not_in_seq, data->ref)) { - seq.safe_push (std::make_pair (data->ref, sm_ord)); + seq.safe_push (seq_entry (data->ref, sm_ord)); /* 1) push it down the queue until a SMed and not ignored ref is reached, skipping all not SMed refs and ignored refs via non-TBAA disambiguation. */ - if (!sm_seq_push_down (seq, seq.length () - 1)) + unsigned new_idx; + if (!sm_seq_push_down (seq, seq.length () - 1, &new_idx) + /* If that fails but we did not fork yet continue, we'll see + to re-materialize all of the stores in the sequence then. + Further stores will only be pushed up to this one. */ + && forked) { bitmap_set_bit (refs_not_supported, data->ref); - /* ??? Mark it sm_unord but it's now "somewhere" ... */ - for (unsigned i = seq.length (); i != 0; --i) - if (seq[i - 1].first == data->ref) - { - seq[i - 1].second = sm_unord; - break; - } + /* Mark it sm_other. 
*/ + seq[new_idx].second = sm_other; } /* 2) check whether we've seen all refs we want to SM and if so @@ -2453,7 +2480,8 @@ sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef, } else /* Another store not part of the final sequence. Simply push it. */ - seq.safe_push (std::make_pair (data->ref, sm_other)); + seq.safe_push (seq_entry (data->ref, sm_other, + gimple_assign_rhs1 (def))); vdef = gimple_vuse (def); } @@ -2513,21 +2541,72 @@ hoist_memory_references (class loop *loop, bitmap mem_refs, std::pair > *seq; FOR_EACH_VEC_ELT (sms, i, seq) { + bool need_to_push = false; for (unsigned i = 0; i < seq->second.length (); ++i) { - if (seq->second[i].second == sm_other) + sm_kind kind = seq->second[i].second; + if (kind == sm_other && seq->second[i].from == NULL_TREE) break; unsigned id = seq->second[i].first; - if (bitmap_bit_p (refs_not_supported, id)) - seq->second[i].second = sm_other; - else if (!sm_seq_push_down (seq->second, i)) + unsigned new_idx; + if (kind == sm_ord + && bitmap_bit_p (refs_not_supported, id)) { - if (bitmap_set_bit (refs_not_supported, id)) - changed = true; + seq->second[i].second = sm_other; + gcc_assert (seq->second[i].from == NULL_TREE); + need_to_push = true; + } + else if (need_to_push + && !sm_seq_push_down (seq->second, i, &new_idx)) + { + /* We need to push down both sm_ord and sm_other + but for the latter we need to disqualify all + following refs. */ + if (kind == sm_ord) + { + if (bitmap_set_bit (refs_not_supported, id)) + changed = true; + seq->second[new_idx].second = sm_other; + } + else + { + for (unsigned j = seq->second.length () - 1; + j > new_idx; --j) + if (seq->second[j].second == sm_ord + && bitmap_set_bit (refs_not_supported, + seq->second[j].first)) + changed = true; + seq->second.truncate (new_idx); + break; + } } } } } + std::pair > *seq; + FOR_EACH_VEC_ELT (sms, i, seq) + { + /* Prune sm_other from the end. */ + while (!seq->second.is_empty () + && seq->second.last ().second == sm_other) + seq->second.pop (); + /* Prune duplicates from the start. */ + auto_bitmap seen (&lim_bitmap_obstack); + unsigned j, k; + for (j = k = 0; j < seq->second.length (); ++j) + if (bitmap_set_bit (seen, seq->second[j].first)) + { + if (k != j) + seq->second[k] = seq->second[j]; + ++k; + } + seq->second.truncate (k); + /* And verify. */ + seq_entry *e; + FOR_EACH_VEC_ELT (seq->second, j, e) + gcc_assert (e->second == sm_ord + || (e->second == sm_other && e->from != NULL_TREE)); + } /* Verify dependence for refs we cannot handle with the order preserving code (refs_not_supported) or prune them from mem_refs. */ @@ -2540,7 +2619,7 @@ hoist_memory_references (class loop *loop, bitmap mem_refs, /* We've now verified store order for ref with respect to all other stores in the loop does not matter. */ else - unord_refs.safe_push (std::make_pair (i, sm_unord)); + unord_refs.safe_push (seq_entry (i, sm_unord)); } hash_map aux_map; -- cgit v1.1 From bb63ca63e744c08bc5a9ffa53df62ea35f098b0b Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 11 May 2020 15:26:09 +0200 Subject: tree-optimization/95045 - fix SM with exit exiting multiple loops Since we apply SM to an edge which exits multiple loops we have to make sure to commit insertions on it immediately since otherwise store order is not preserved. 2020-05-12 Richard Biener PR tree-optimization/95045 * dbgcnt.def (lim): Add debug-counter. * tree-ssa-loop-im.c: Include dbgcnt.h. (find_refs_for_sm): Use lim debug counter for store motion candidates. (do_store_motion): Rename form store_motion. 
Commit edge insertions... (store_motion_loop): ... here. (tree_ssa_lim): Adjust. --- gcc/tree-ssa-loop-im.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index bb78dfb..0d77aaa 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -47,6 +47,7 @@ along with GCC; see the file COPYING3. If not see #include "alias.h" #include "builtins.h" #include "tree-dfa.h" +#include "dbgcnt.h" /* TODO: Support for predicated code motion. I.e. @@ -2862,7 +2863,7 @@ find_refs_for_sm (class loop *loop, bitmap sm_executed, bitmap refs_to_sm) EXECUTE_IF_AND_COMPL_IN_BITMAP (refs, sm_executed, 0, i, bi) { ref = memory_accesses.refs_list[i]; - if (can_sm_ref_p (loop, ref)) + if (can_sm_ref_p (loop, ref) && dbg_cnt (lim)) bitmap_set_bit (refs_to_sm, i); } } @@ -2900,7 +2901,12 @@ store_motion_loop (class loop *loop, bitmap sm_executed) { find_refs_for_sm (loop, sm_executed, sm_in_loop); if (!bitmap_empty_p (sm_in_loop)) - hoist_memory_references (loop, sm_in_loop, exits); + { + hoist_memory_references (loop, sm_in_loop, exits); + /* Commit edge inserts here to preserve the order of stores + when an exit exits multiple loops. */ + gsi_commit_edge_inserts (); + } } exits.release (); @@ -2915,7 +2921,7 @@ store_motion_loop (class loop *loop, bitmap sm_executed) loops. */ static void -store_motion (void) +do_store_motion (void) { class loop *loop; bitmap sm_executed = BITMAP_ALLOC (&lim_bitmap_obstack); @@ -2924,7 +2930,6 @@ store_motion (void) store_motion_loop (loop, sm_executed); BITMAP_FREE (sm_executed); - gsi_commit_edge_inserts (); } /* Fills ALWAYS_EXECUTED_IN information for basic blocks of LOOP, i.e. @@ -3141,7 +3146,7 @@ tree_ssa_lim (void) /* Execute store motion. Force the necessary invariants to be moved out of the loops as well. */ - store_motion (); + do_store_motion (); /* Move the expressions that are expensive enough. */ todo = move_computations (); -- cgit v1.1 From 52a0f83980082c9995f2d8ec9b88548520fb8a5f Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 18 May 2020 09:17:24 +0200 Subject: tree-optimization/95172 - avoid mixing conditionalized and ordered SM The following testcase shows a missed optimization that then leads to wrong-code when issueing SMed stores on exits. When we were able to compute an ordered sequence of stores for an exit we need to emit that in the correct order and we can emit it disregarding to any conditional for whether a store actually happened (we know it did). We can also improve detection as of whether we need conditional processing at all. Both parts fix the testcase. 2020-05-18 Richard Biener PR tree-optimization/95172 * tree-ssa-loop-im.c (execute_sm): Get flag whether we eventually need the conditional processing. (execute_sm_exit): When processing an orderd sequence avoid doing any conditional processing. (hoist_memory_references): Pass down whether all edges have ordered processing for a ref to execute_sm. * gcc.dg/torture/pr95172.c: New testcase. 
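To illustrate the distinction this commit draws, here is a rough source-level sketch (hypothetical C, names made up, not GIMPLE) of the two ways a store-motioned reference can be materialized on a loop exit: the flag-guarded form used when the store may not have executed, and the unconditional form used for ordered sequences, where the store is known to have happened and its position among the exit stores is fixed.

int g;

/* Flag-guarded ("store if changed") materialization: the in-loop store
   is rewritten to a temporary, a flag records that it executed, and
   the exit store is emitted under that flag.  */
void conditional_sm (int n, int c)
{
  int tmp = g;            /* preheader load (may be elided, see the PR39612 change above) */
  int flag = 0;
  for (int i = 0; i < n; ++i)
    if (c)
      {
        tmp = i;          /* original store to g, rewritten to tmp */
        flag = 1;
      }
  if (flag)               /* on the exit: store only if it happened */
    g = tmp;
}

/* Ordered materialization: the store always executes, so it is emitted
   unconditionally on the exit edge, in the order recorded for that
   exit.  */
void ordered_sm (int n)
{
  int tmp = g;
  for (int i = 0; i < n; ++i)
    tmp = tmp + i;        /* original store to g, rewritten to tmp */
  g = tmp;                /* on the exit: plain store, no flag */
}

Emitting a store that belongs to an ordered exit sequence in the flag-guarded form instead is the mixing of the two styles that the commit avoids.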
--- gcc/tree-ssa-loop-im.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 0d77aaa..63f4ef8 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -2130,7 +2130,7 @@ struct sm_aux static void execute_sm (class loop *loop, im_mem_ref *ref, - hash_map &aux_map) + hash_map &aux_map, bool maybe_mt) { gassign *load; struct fmt_data fmt_data; @@ -2154,8 +2154,9 @@ execute_sm (class loop *loop, im_mem_ref *ref, for_each_index (&ref->mem.ref, force_move_till, &fmt_data); bool always_stored = ref_always_accessed_p (loop, ref, true); - if (bb_in_transaction (loop_preheader_edge (loop)->src) - || (! flag_store_data_races && ! always_stored)) + if (maybe_mt + && (bb_in_transaction (loop_preheader_edge (loop)->src) + || (! flag_store_data_races && ! always_stored))) multi_threaded_model_p = true; if (multi_threaded_model_p) @@ -2244,7 +2245,7 @@ execute_sm_exit (class loop *loop, edge ex, vec &seq, else { sm_aux *aux = *aux_map.get (ref); - if (!aux->store_flag) + if (!aux->store_flag || kind == sm_ord) { gassign *store; store = gimple_build_assign (unshare_expr (ref->mem.ref), @@ -2630,7 +2631,7 @@ hoist_memory_references (class loop *loop, bitmap mem_refs, EXECUTE_IF_SET_IN_BITMAP (mem_refs, 0, i, bi) { ref = memory_accesses.refs_list[i]; - execute_sm (loop, ref, aux_map); + execute_sm (loop, ref, aux_map, bitmap_bit_p (refs_not_supported, i)); } /* Materialize ordered store sequences on exits. */ -- cgit v1.1 From b6ed2e2bca54d1d290f553549d28b0c60a0f240f Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 22 May 2020 08:48:04 +0200 Subject: tree-optimization/95248 - fix oversight in SM rewrite This fixes a leftover early out in determining the sequence of stores to materialize. 2020-05-22 Richard Biener PR tree-optimization/95248 * tree-ssa-loop-im.c (sm_seq_valid_bb): Remove bogus early out. * gcc.dg/torture/pr95248.c: New testcase. --- gcc/tree-ssa-loop-im.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 63f4ef8..fcca099 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -2436,8 +2436,6 @@ sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef, /* Use the sequence from the first edge and push SMs down. */ for (unsigned i = 0; i < first_edge_seq.length (); ++i) { - if (first_edge_seq[i].second == sm_other) - break; unsigned id = first_edge_seq[i].first; seq.safe_push (first_edge_seq[i]); unsigned new_idx; -- cgit v1.1 From 4acca1c0635dfa43cd8c4bfe2b22e17909fc23a3 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 25 May 2020 10:09:44 +0200 Subject: tree-optimization/95295 - fix wrong-code with SM We failed to compare the rematerialized store values when merging paths after walking PHIs. 2020-05-25 Richard Biener PR tree-optimization/95295 * tree-ssa-loop-im.c (sm_seq_valid_bb): Compare remat stores RHSes and drop to full sm_other if they are not equal. * gcc.dg/torture/pr95295-1.c: New testcase. * gcc.dg/torture/pr95295-2.c: Likewise. * gcc.dg/torture/pr95283.c: Likewise. 
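A minimal hypothetical example of the merge problem this commit describes (simplified; whether store motion actually applies here depends on the candidate and aliasing checks earlier in the file):

int g;

void f (int *p, int n, int c)
{
  for (int i = 0; i < n; ++i)
    {
      g = i;        /* store the pass would like to sink to the exit */
      if (c)
        *p = 1;     /* stored value recorded as 1 on this path */
      else
        *p = 2;     /* stored value recorded as 2 on the other path */
    }
}

If the store through 'p' were re-issued on the exit as part of an ordered sequence, no single rematerialized value would be correct for both paths. The fix below compares the values recorded on the merged paths with operand_equal_p and, when they differ, drops the recorded value (from = NULL_TREE) so the store cannot be re-issued with the wrong one.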
--- gcc/tree-ssa-loop-im.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index fcca099..b399bd0 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -2402,6 +2402,7 @@ sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef, if (edge_seq[i].second == sm_ord) bitmap_set_bit (refs_not_supported, edge_seq[i].first); first_edge_seq[i].second = sm_other; + first_edge_seq[i].from = NULL_TREE; } /* sm_other prevails. */ else if (first_edge_seq[i].second != edge_seq[i].second) @@ -2410,7 +2411,14 @@ sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef, gcc_assert (bitmap_bit_p (refs_not_supported, first_edge_seq[i].first)); first_edge_seq[i].second = sm_other; + first_edge_seq[i].from = NULL_TREE; } + else if (first_edge_seq[i].second == sm_other + && first_edge_seq[i].from != NULL_TREE + && (edge_seq[i].from == NULL_TREE + || !operand_equal_p (first_edge_seq[i].from, + edge_seq[i].from, 0))) + first_edge_seq[i].from = NULL_TREE; } /* Any excess elements become sm_other since they are now coonditionally executed. */ -- cgit v1.1 From 6c8e16aea85286721eb5689f9bcae09d36003cb1 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Wed, 27 May 2020 13:05:07 +0200 Subject: tree-optimization/95295 - fix sinking after path merging in new SM code This fixes a missed sinking of remat stores across unrelated stores after merging from different paths. 2020-05-27 Richard Biener PR tree-optimization/95295 * tree-ssa-loop-im.c (sm_seq_valid_bb): Fix sinking after merging stores from paths. * gcc.dg/torture/pr95295-3.c: New testcase. --- gcc/tree-ssa-loop-im.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'gcc/tree-ssa-loop-im.c') diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index b399bd0..d33f533 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -2447,12 +2447,16 @@ sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef, unsigned id = first_edge_seq[i].first; seq.safe_push (first_edge_seq[i]); unsigned new_idx; - if (first_edge_seq[i].second == sm_ord + if ((first_edge_seq[i].second == sm_ord + || (first_edge_seq[i].second == sm_other + && first_edge_seq[i].from != NULL_TREE)) && !sm_seq_push_down (seq, seq.length () - 1, &new_idx)) { - bitmap_set_bit (refs_not_supported, id); + if (first_edge_seq[i].second == sm_ord) + bitmap_set_bit (refs_not_supported, id); /* Mark it sm_other. */ seq[new_idx].second = sm_other; + seq[new_idx].from = NULL_TREE; } } return 1; -- cgit v1.1
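To close, a recap of the underlying transformation that all of the fixes above refine, as a before/after sketch in plain C (illustrative only; identifiers are made up):

int g;

/* Before: g is loaded and stored on every iteration.  */
void before_sm (int *a, int n)
{
  for (int i = 0; i < n; ++i)
    g = g + a[i];
}

/* After store motion: g lives in a register inside the loop, with one
   load in the preheader and one store on the exit edge.  The series
   above refines when the preheader load may be skipped (PR39612,
   PR94949), when the exit store must be flag-guarded, and how exit
   stores are ordered relative to the other stores of the loop
   (PR57359 and the follow-up fixes).  */
void after_sm (int *a, int n)
{
  int tmp = g;
  for (int i = 0; i < n; ++i)
    tmp = tmp + a[i];
  g = tmp;
}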