/* Avoid store forwarding optimization pass.
   Copyright (C) 2024 Free Software Foundation, Inc.
   Contributed by VRULL GmbH.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "avoid-store-forwarding.h"

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "alias.h"
#include "rtlanal.h"
#include "cfgrtl.h"
#include "tree-pass.h"
#include "cselib.h"
#include "predict.h"
#include "insn-config.h"
#include "expmed.h"
#include "recog.h"
#include "regset.h"
#include "df.h"
#include "expr.h"
#include "memmodel.h"
#include "emit-rtl.h"
#include "vec.h"

/* This pass tries to detect and avoid cases of store forwarding.
   On many processors there is a large penalty when smaller stores are
   forwarded to larger loads.  The idea used to avoid the stall is to move
   the store after the load and in addition emit a bit insert sequence so
   the load register has the correct value.  For example the following:

     strb    w2, [x1, 1]
     ldr     x0, [x1]

   Will be transformed to:

     ldr     x0, [x1]
     strb    w2, [x1, 1]
     bfi     x0, x2, 8, 8
*/

namespace {

const pass_data pass_data_avoid_store_forwarding =
{
  RTL_PASS, /* type.  */
  "avoid_store_forwarding", /* name.  */
  OPTGROUP_NONE, /* optinfo_flags.  */
  TV_AVOID_STORE_FORWARDING, /* tv_id.  */
  0, /* properties_required.  */
  0, /* properties_provided.  */
  0, /* properties_destroyed.  */
  0, /* todo_flags_start.  */
  TODO_df_finish /* todo_flags_finish.  */
};

class pass_rtl_avoid_store_forwarding : public rtl_opt_pass
{
public:
  pass_rtl_avoid_store_forwarding (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_avoid_store_forwarding, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
  {
    return flag_avoid_store_forwarding && optimize >= 1;
  }

  virtual unsigned int execute (function *) override;
}; // class pass_rtl_avoid_store_forwarding

/* Handler for finding and avoiding store forwardings.  */
class store_forwarding_analyzer
{
public:
  unsigned int stats_sf_detected = 0;
  unsigned int stats_sf_avoided = 0;

  bool is_store_forwarding (rtx store_mem, rtx load_mem,
                            HOST_WIDE_INT *off_val);
  bool process_store_forwarding (vec<store_fwd_info> &, rtx_insn *load_insn,
                                 rtx load_mem);
  void avoid_store_forwarding (basic_block);
  void update_stats (function *);
};

/* Return a bit insertion sequence that would make DEST have the correct
   value if the store represented by STORE_INFO were to be moved after
   DEST.  */
static rtx_insn *
generate_bit_insert_sequence (store_fwd_info *store_info, rtx dest)
{
  /* Memory size should be a constant at this stage.  */
  unsigned HOST_WIDE_INT store_size
    = MEM_SIZE (store_info->store_mem).to_constant ();

  start_sequence ();

  unsigned HOST_WIDE_INT bitsize = store_size * BITS_PER_UNIT;
  unsigned HOST_WIDE_INT start = store_info->offset * BITS_PER_UNIT;

  /* Adjust START for machines with BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN.
     Given that the bytes will be reversed in this case, we need to
     calculate the starting position from the end of the destination
     register.
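
     For example, with BYTES_BIG_ENDIAN false and BITS_BIG_ENDIAN true,
     a 1-byte store at byte offset 1 into an 8-byte load has BITSIZE 8
     and an unadjusted START of 8; the adjustment below yields
     64 - 8 - 8 = 48, which names the same bit field counted from the
     opposite end of the register.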
     */
  if (BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN)
    {
      unsigned HOST_WIDE_INT load_mode_bitsize
        = (GET_MODE_BITSIZE (GET_MODE (dest))).to_constant ();
      start = load_mode_bitsize - bitsize - start;
    }

  rtx mov_reg = store_info->mov_reg;
  store_bit_field (dest, bitsize, start, 0, 0, GET_MODE (mov_reg), mov_reg,
                   false, false);

  rtx_insn *insns = get_insns ();
  unshare_all_rtl_in_chain (insns);
  end_sequence ();

  for (rtx_insn *insn = insns; insn; insn = NEXT_INSN (insn))
    if (contains_mem_rtx_p (PATTERN (insn))
        || recog_memoized (insn) < 0)
      return NULL;

  return insns;
}

/* Return true iff a store to STORE_MEM would write to a sub-region of the
   bytes that LOAD_MEM would read.  If true, also store the relative byte
   offset of the store within the load to OFF_VAL.  */
bool store_forwarding_analyzer::
is_store_forwarding (rtx store_mem, rtx load_mem, HOST_WIDE_INT *off_val)
{
  poly_int64 load_offset, store_offset;
  rtx load_base = strip_offset (XEXP (load_mem, 0), &load_offset);
  rtx store_base = strip_offset (XEXP (store_mem, 0), &store_offset);
  return (MEM_SIZE (load_mem).is_constant ()
          && rtx_equal_p (load_base, store_base)
          && known_subrange_p (store_offset, MEM_SIZE (store_mem),
                               load_offset, MEM_SIZE (load_mem))
          && (store_offset - load_offset).is_constant (off_val));
}

/* Given a list of small stores that are forwarded to LOAD_INSN, try to
   rearrange them so that a store-forwarding penalty doesn't occur.
   The stores must be given in reverse program order, starting from the
   one closest to LOAD_INSN.  */
bool store_forwarding_analyzer::
process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn,
                          rtx load_mem)
{
  machine_mode load_mem_mode = GET_MODE (load_mem);
  /* Memory sizes should be constants at this stage.  */
  HOST_WIDE_INT load_size = MEM_SIZE (load_mem).to_constant ();

  /* If the stores cover all the bytes of the load without overlap then we
     can eliminate the load entirely and use the computed value instead.  */
  sbitmap forwarded_bytes = sbitmap_alloc (load_size);
  bitmap_clear (forwarded_bytes);

  unsigned int i;
  store_fwd_info *it;
  FOR_EACH_VEC_ELT (stores, i, it)
    {
      HOST_WIDE_INT store_size = MEM_SIZE (it->store_mem).to_constant ();
      if (bitmap_bit_in_range_p (forwarded_bytes, it->offset,
                                 it->offset + store_size - 1))
        break;
      bitmap_set_range (forwarded_bytes, it->offset, store_size);
    }

  bitmap_not (forwarded_bytes, forwarded_bytes);
  bool load_elim = bitmap_empty_p (forwarded_bytes);

  stats_sf_detected++;

  if (dump_file)
    {
      fprintf (dump_file, "Store forwarding detected:\n");

      FOR_EACH_VEC_ELT (stores, i, it)
        {
          fprintf (dump_file, "From: ");
          print_rtl_single (dump_file, it->store_insn);
        }

      fprintf (dump_file, "To: ");
      print_rtl_single (dump_file, load_insn);

      if (load_elim)
        fprintf (dump_file, "(Load elimination candidate)\n");
    }

  rtx load = single_set (load_insn);
  rtx dest;

  if (load_elim)
    dest = gen_reg_rtx (load_mem_mode);
  else
    dest = SET_DEST (load);

  int move_to_front = -1;
  int total_cost = 0;

  /* Check if we can emit bit insert instructions for all forwarded
     stores.  */
  FOR_EACH_VEC_ELT (stores, i, it)
    {
      it->mov_reg = gen_reg_rtx (GET_MODE (it->store_mem));
      rtx_insn *insns = NULL;

      /* If we're eliminating the load then find the store with zero offset
         and use it as the base register to avoid a bit insert if
         possible.  */
      if (load_elim && it->offset == 0)
        {
          start_sequence ();

          /* We can use a paradoxical subreg to force this to a wider mode,
             as the only use will be inserting the bits (i.e., we don't
             care about the value of the higher bits).
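
             lowpart_subreg may fail and return NULL_RTX for mode
             combinations it cannot represent, in which case we fall back
             to the bit insert sequence generated below.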
             */
          rtx ext0 = lowpart_subreg (GET_MODE (dest), it->mov_reg,
                                     GET_MODE (it->mov_reg));
          if (ext0)
            {
              rtx_insn *move0 = emit_move_insn (dest, ext0);
              if (recog_memoized (move0) >= 0)
                {
                  insns = get_insns ();
                  move_to_front = (int) i;
                }
            }

          end_sequence ();
        }

      if (!insns)
        insns = generate_bit_insert_sequence (&(*it), dest);

      if (!insns)
        {
          if (dump_file)
            {
              fprintf (dump_file, "Failed due to: ");
              print_rtl_single (dump_file, it->store_insn);
            }
          return false;
        }

      total_cost += seq_cost (insns, true);
      it->bits_insert_insns = insns;

      rtx store_set = single_set (it->store_insn);

      /* Create a register move at the store's original position to save
         the stored value.  */
      start_sequence ();
      rtx_insn *insn1
        = emit_insn (gen_rtx_SET (it->mov_reg, SET_SRC (store_set)));
      end_sequence ();

      if (recog_memoized (insn1) < 0)
        {
          if (dump_file)
            {
              fprintf (dump_file, "Failed due to unrecognizable insn: ");
              print_rtl_single (dump_file, insn1);
            }
          return false;
        }

      it->save_store_value_insn = insn1;

      /* Create a new store after the load with the saved original value.
         This avoids the forwarding stall.  */
      start_sequence ();
      rtx_insn *insn2
        = emit_insn (gen_rtx_SET (SET_DEST (store_set), it->mov_reg));
      end_sequence ();

      if (recog_memoized (insn2) < 0)
        {
          if (dump_file)
            {
              fprintf (dump_file, "Failed due to unrecognizable insn: ");
              print_rtl_single (dump_file, insn2);
            }
          return false;
        }

      it->store_saved_value_insn = insn2;
    }

  if (load_elim)
    total_cost -= insn_cost (load_insn, true);

  /* Let the target decide if transforming this store forwarding instance
     is profitable.  */
  if (!targetm.avoid_store_forwarding_p (stores, load_mem, total_cost,
                                         load_elim))
    {
      if (dump_file)
        fprintf (dump_file, "Not transformed due to target decision.\n");
      return false;
    }

  /* If we have a move instead of bit insert, it needs to be emitted first
     in the resulting sequence.  */
  if (move_to_front != -1)
    {
      store_fwd_info copy = stores[move_to_front];
      stores.safe_push (copy);
      stores.ordered_remove (move_to_front);
    }

  if (load_elim)
    {
      machine_mode outer_mode = GET_MODE (SET_DEST (load));
      rtx load_move;
      rtx load_value = dest;

      if (outer_mode != load_mem_mode)
        load_value = simplify_gen_unary (GET_CODE (SET_SRC (load)),
                                         outer_mode, dest, load_mem_mode);

      load_move = gen_rtx_SET (SET_DEST (load), load_value);

      start_sequence ();
      rtx_insn *insn = emit_insn (load_move);
      rtx_insn *seq = get_insns ();
      end_sequence ();

      if (recog_memoized (insn) < 0)
        return false;

      emit_insn_after (seq, load_insn);
    }

  if (dump_file)
    {
      fprintf (dump_file, "Store forwarding avoided with bit inserts:\n");

      FOR_EACH_VEC_ELT (stores, i, it)
        {
          if (stores.length () > 1)
            {
              fprintf (dump_file, "For: ");
              print_rtl_single (dump_file, it->store_insn);
            }

          fprintf (dump_file, "With sequence:\n");

          for (rtx_insn *insn = it->bits_insert_insns; insn;
               insn = NEXT_INSN (insn))
            {
              fprintf (dump_file, " ");
              print_rtl_single (dump_file, insn);
            }
        }
    }

  stats_sf_avoided++;

  /* Done, emit all the generated instructions and delete the stores.
     Note that STORES are in reverse program order.  */

  FOR_EACH_VEC_ELT (stores, i, it)
    {
      emit_insn_after (it->bits_insert_insns, load_insn);
      emit_insn_after (it->store_saved_value_insn, load_insn);
    }

  FOR_EACH_VEC_ELT (stores, i, it)
    {
      emit_insn_before (it->save_store_value_insn, it->store_insn);
      delete_insn (it->store_insn);
    }

  df_insn_rescan (load_insn);

  if (load_elim)
    delete_insn (load_insn);

  return true;
}

/* Try to modify BB so that expensive store forwarding cases are avoided.
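
   The insns are scanned in program order while STORE_EXPRS keeps a
   window of recent store candidates; every simple load encountered is
   checked against those candidates for forwarding opportunities.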
   */
void
store_forwarding_analyzer::avoid_store_forwarding (basic_block bb)
{
  if (!optimize_bb_for_speed_p (bb))
    return;

  auto_vec<store_fwd_info> store_exprs;
  rtx_insn *insn;
  unsigned int insn_cnt = 0;

  FOR_BB_INSNS (bb, insn)
    {
      if (!NONDEBUG_INSN_P (insn))
        continue;

      vec_rtx_properties properties;
      properties.add_insn (insn, false);

      rtx set = single_set (insn);

      if (!set || insn_could_throw_p (insn))
        {
          store_exprs.truncate (0);
          continue;
        }

      /* The inner mem RTX if INSN is a load, NULL_RTX otherwise.  */
      rtx load_mem = SET_SRC (set);

      if (GET_CODE (load_mem) == ZERO_EXTEND
          || GET_CODE (load_mem) == SIGN_EXTEND)
        load_mem = XEXP (load_mem, 0);

      if (!MEM_P (load_mem))
        load_mem = NULL_RTX;

      /* The mem RTX if INSN is a store, NULL_RTX otherwise.  */
      rtx store_mem = MEM_P (SET_DEST (set)) ? SET_DEST (set) : NULL_RTX;

      /* We cannot analyze memory RTXs that have unknown size.  */
      if ((store_mem && (!MEM_SIZE_KNOWN_P (store_mem)
                         || !MEM_SIZE (store_mem).is_constant ()))
          || (load_mem && (!MEM_SIZE_KNOWN_P (load_mem)
                           || !MEM_SIZE (load_mem).is_constant ())))
        {
          store_exprs.truncate (0);
          continue;
        }

      bool is_simple = !properties.has_asm
                       && !properties.has_side_effects ();
      bool is_simple_store = is_simple && store_mem
                             && !contains_mem_rtx_p (SET_SRC (set));
      bool is_simple_load = is_simple && load_mem
                            && !contains_mem_rtx_p (SET_DEST (set));

      int removed_count = 0;

      if (is_simple_store)
        {
          /* Record store forwarding candidate.  */
          store_fwd_info info;
          info.store_insn = insn;
          info.store_mem = store_mem;
          info.insn_cnt = insn_cnt;
          info.remove = false;
          info.forwarded = false;
          store_exprs.safe_push (info);
        }

      bool reads_mem = false;
      bool writes_mem = false;
      for (auto ref : properties.refs ())
        if (ref.is_mem ())
          {
            reads_mem |= ref.is_read ();
            writes_mem |= ref.is_write ();
          }
        else if (ref.is_write ())
          {
            /* Drop store forwarding candidates when the address register
               is overwritten.  */
            bool remove_rest = false;
            unsigned int i;
            store_fwd_info *it;
            FOR_EACH_VEC_ELT_REVERSE (store_exprs, i, it)
              {
                if (remove_rest
                    || reg_overlap_mentioned_p (regno_reg_rtx[ref.regno],
                                                it->store_mem))
                  {
                    it->remove = true;
                    removed_count++;
                    remove_rest = true;
                  }
              }
          }

      if (is_simple_load)
        {
          /* Process load for possible store forwarding cases.  Possible
             newly created/moved stores, resulting from a successful
             forwarding, will be processed in subsequent iterations.  */
          auto_vec<store_fwd_info> forwardings;
          bool partial_forwarding = false;
          bool remove_rest = false;

          bool vector_load = VECTOR_MODE_P (GET_MODE (load_mem));

          unsigned int i;
          store_fwd_info *it;
          FOR_EACH_VEC_ELT_REVERSE (store_exprs, i, it)
            {
              rtx store_mem = it->store_mem;
              HOST_WIDE_INT off_val;

              bool vector_store = VECTOR_MODE_P (GET_MODE (store_mem));

              if (remove_rest)
                {
                  it->remove = true;
                  removed_count++;
                }
              else if (vector_load ^ vector_store)
                {
                  /* Vector stores followed by a non-vector load, or the
                     opposite, cause store_bit_field to generate
                     non-canonical expressions, like
                     (subreg:V4SI (reg:DI ...) 0).  Cases like that should
                     be handled using vec_duplicate, so we reject the
                     transformation in those cases.  */
                  it->remove = true;
                  removed_count++;
                  remove_rest = true;
                }
              else if (is_store_forwarding (store_mem, load_mem, &off_val))
                {
                  /* Check if moving this store after the load is legal.
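
                     The move is only legal if no younger store that stays
                     in place writes to overlapping memory; otherwise the
                     two stores would be reordered and the final memory
                     contents would change.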
                     */
                  bool write_dep = false;
                  for (unsigned int j = store_exprs.length () - 1;
                       j != i; j--)
                    {
                      if (!store_exprs[j].forwarded
                          && output_dependence (store_mem,
                                                store_exprs[j].store_mem))
                        {
                          write_dep = true;
                          break;
                        }
                    }

                  if (!write_dep)
                    {
                      it->forwarded = true;
                      it->offset = off_val;
                      forwardings.safe_push (*it);
                    }
                  else
                    partial_forwarding = true;

                  it->remove = true;
                  removed_count++;
                }
              else if (true_dependence (store_mem, GET_MODE (store_mem),
                                        load_mem))
                {
                  /* We cannot keep a store forwarding candidate if it
                     possibly interferes with this load.  */
                  it->remove = true;
                  removed_count++;
                  remove_rest = true;
                }
            }

          if (!forwardings.is_empty () && !partial_forwarding)
            process_store_forwarding (forwardings, insn, load_mem);
        }

      if ((writes_mem && !is_simple_store)
          || (reads_mem && !is_simple_load))
        store_exprs.truncate (0);

      if (removed_count)
        {
          unsigned int i, j;
          store_fwd_info *it;
          VEC_ORDERED_REMOVE_IF (store_exprs, i, j, it, it->remove);
        }

      /* Don't consider store forwarding if the RTL instruction distance is
         more than PARAM_STORE_FORWARDING_MAX_DISTANCE and the cost checks
         are not disabled.  */
      const bool unlimited_cost = (param_store_forwarding_max_distance == 0);
      if (!unlimited_cost && !store_exprs.is_empty ()
          && (store_exprs[0].insn_cnt
              + param_store_forwarding_max_distance <= insn_cnt))
        store_exprs.ordered_remove (0);

      insn_cnt++;
    }
}

/* Update pass statistics.  */
void
store_forwarding_analyzer::update_stats (function *fn)
{
  statistics_counter_event (fn, "Cases of store forwarding detected: ",
                            stats_sf_detected);
  statistics_counter_event (fn, "Cases of store forwarding avoided: ",
                            stats_sf_avoided);
}

unsigned int
pass_rtl_avoid_store_forwarding::execute (function *fn)
{
  df_set_flags (DF_DEFER_INSN_RESCAN);

  init_alias_analysis ();

  store_forwarding_analyzer analyzer;

  basic_block bb;
  FOR_EACH_BB_FN (bb, fn)
    analyzer.avoid_store_forwarding (bb);

  end_alias_analysis ();

  analyzer.update_stats (fn);

  return 0;
}

} // anon namespace.

rtl_opt_pass *
make_pass_rtl_avoid_store_forwarding (gcc::context *ctxt)
{
  return new pass_rtl_avoid_store_forwarding (ctxt);
}