// Late-stage instruction combination pass.
// Copyright (C) 2023-2024 Free Software Foundation, Inc.
//
// This file is part of GCC.
//
// GCC is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 3, or (at your option) any later
// version.
//
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License
// along with GCC; see the file COPYING3. If not see
// .
// The current purpose of this pass is to substitute definitions into
// all uses, so that the definition can be removed. However, it could
// be extended to handle other combination-related optimizations in future.
//
// The pass can run before or after register allocation. When running
// before register allocation, it tries to avoid cases that are likely
// to increase register pressure. For the same reason, it avoids moving
// instructions around, even if doing so would allow an optimization to
// succeed. These limitations are removed when running after register
// allocation.
#define INCLUDE_ALGORITHM
#define INCLUDE_FUNCTIONAL
#define INCLUDE_ARRAY
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "df.h"
#include "rtl-ssa.h"
#include "print-rtl.h"
#include "tree-pass.h"
#include "cfgcleanup.h"
#include "target.h"
#include "dbgcnt.h"
using namespace rtl_ssa;
namespace {
const pass_data pass_data_late_combine =
{
RTL_PASS, // type
"late_combine", // name
OPTGROUP_NONE, // optinfo_flags
TV_LATE_COMBINE, // tv_id
0, // properties_required
0, // properties_provided
0, // properties_destroyed
0, // todo_flags_start
TODO_df_finish, // todo_flags_finish
};
// Represents an attempt to substitute a single-set definition into all
// uses of the definition.
class insn_combination
{
public:
insn_combination (set_info *, rtx, rtx);
bool run ();
array_slice use_changes () const;
private:
use_array get_new_uses (use_info *);
bool substitute_nondebug_use (use_info *);
bool substitute_nondebug_uses (set_info *);
bool try_to_preserve_debug_info (insn_change &, use_info *);
void substitute_debug_use (use_info *);
bool substitute_note (insn_info *, rtx, bool);
void substitute_notes (insn_info *, bool);
void substitute_note_uses (use_info *);
void substitute_optional_uses (set_info *);
// Represents the state of the function's RTL at the start of this
// combination attempt.
insn_change_watermark m_rtl_watermark;
// Represents the rtl-ssa state at the start of this combination attempt.
obstack_watermark m_attempt;
// The instruction that contains the definition, and that we're trying
// to delete.
insn_info *m_def_insn;
// The definition itself.
set_info *m_def;
// The destination and source of the single set that defines m_def.
// The destination is known to be a plain REG.
rtx m_dest;
rtx m_src;
// Contains the full list of changes that we want to make, in reverse
// postorder.
auto_vec m_nondebug_changes;
};
// Class that represents one run of the pass.
class late_combine
{
public:
unsigned int execute (function *);
private:
rtx optimizable_set (insn_info *);
bool check_register_pressure (insn_info *, rtx);
bool check_uses (set_info *, rtx);
bool combine_into_uses (insn_info *, insn_info *);
auto_vec m_worklist;
};
insn_combination::insn_combination (set_info *def, rtx dest, rtx src)
: m_rtl_watermark (),
m_attempt (crtl->ssa->new_change_attempt ()),
m_def_insn (def->insn ()),
m_def (def),
m_dest (dest),
m_src (src),
m_nondebug_changes ()
{
}
array_slice
insn_combination::use_changes () const
{
return { m_nondebug_changes.address () + 1,
m_nondebug_changes.length () - 1 };
}
// USE is a direct or indirect use of m_def. Return the list of uses
// that would be needed after substituting m_def into the instruction.
// The returned list is marked as invalid if USE's insn and m_def_insn
// use different definitions for the same resource (register or memory).
use_array
insn_combination::get_new_uses (use_info *use)
{
auto *def = use->def ();
auto *use_insn = use->insn ();
use_array new_uses = use_insn->uses ();
new_uses = remove_uses_of_def (m_attempt, new_uses, def);
new_uses = merge_access_arrays (m_attempt, m_def_insn->uses (), new_uses);
if (new_uses.is_valid () && use->ebb () != m_def->ebb ())
new_uses = crtl->ssa->make_uses_available (m_attempt, new_uses, use->bb (),
use_insn->is_debug_insn ());
return new_uses;
}
// Start the process of trying to replace USE by substitution, given that
// USE occurs in a non-debug instruction. Check:
//
// - that the substitution can be represented in RTL
//
// - that each use of a resource (register or memory) within the new
// instruction has a consistent definition
//
// - that the new instruction is a recognized pattern
//
// - that the instruction can be placed somewhere that makes all definitions
// and uses valid, and that permits any new hard-register clobbers added
// during the recognition process
//
// Return true on success.
bool
insn_combination::substitute_nondebug_use (use_info *use)
{
insn_info *use_insn = use->insn ();
rtx_insn *use_rtl = use_insn->rtl ();
if (dump_file && (dump_flags & TDF_DETAILS))
dump_insn_slim (dump_file, use->insn ()->rtl ());
// Reject second and subsequent uses if the target does not allow
// the defining instruction to be copied.
if (targetm.cannot_copy_insn_p
&& m_nondebug_changes.length () >= 2
&& targetm.cannot_copy_insn_p (m_def_insn->rtl ()))
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "-- The target does not allow multiple"
" copies of insn %d\n", m_def_insn->uid ());
return false;
}
// Check that we can change the instruction pattern. Leave recognition
// of the result till later.
insn_propagation prop (use_rtl, m_dest, m_src);
if (!prop.apply_to_pattern (&PATTERN (use_rtl))
|| prop.num_replacements == 0)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "-- RTL substitution failed\n");
return false;
}
use_array new_uses = get_new_uses (use);
if (!new_uses.is_valid ())
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "-- could not prove that all sources"
" are available\n");
return false;
}
// Create a tentative change for the use.
auto *where = XOBNEW (m_attempt, insn_change);
auto *use_change = new (where) insn_change (use_insn);
m_nondebug_changes.safe_push (use_change);
use_change->new_uses = new_uses;
struct local_ignore : ignore_nothing
{
local_ignore (const set_info *def, const insn_info *use_insn)
: m_def (def), m_use_insn (use_insn) {}
// We don't limit the number of insns per optimization, so ignoring all
// insns for all insns would lead to quadratic complexity. Just ignore
// the use and definition, which should be enough for most purposes.
bool
should_ignore_insn (const insn_info *insn)
{
return insn == m_def->insn () || insn == m_use_insn;
}
// Ignore the definition that we're removing, and all uses of it.
bool should_ignore_def (const def_info *def) { return def == m_def; }
const set_info *m_def;
const insn_info *m_use_insn;
};
auto ignore = local_ignore (m_def, use_insn);
// Moving instructions before register allocation could increase
// register pressure. Only try moving them after RA.
if (reload_completed && can_move_insn_p (use_insn))
use_change->move_range = { use_insn->bb ()->head_insn (),
use_insn->ebb ()->last_bb ()->end_insn () };
if (!restrict_movement (*use_change, ignore))
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "-- cannot satisfy all definitions and uses"
" in insn %d\n", INSN_UID (use_insn->rtl ()));
return false;
}
if (!recog (m_attempt, *use_change, ignore))
return false;
return true;
}
// Apply substitute_nondebug_use to all direct and indirect uses of DEF.
// There will be at most one level of indirection.
bool
insn_combination::substitute_nondebug_uses (set_info *def)
{
for (use_info *use : def->nondebug_insn_uses ())
if (!use->is_live_out_use ()
&& !use->only_occurs_in_notes ()
&& !substitute_nondebug_use (use))
return false;
for (use_info *use : def->phi_uses ())
if (!substitute_nondebug_uses (use->phi ()))
return false;
return true;
}
// USE_CHANGE.insn () is a debug instruction that uses m_def. Try to
// substitute the definition into the instruction and try to describe
// the result in USE_CHANGE. Return true on success. Failure means that
// the instruction must be reset instead.
bool
insn_combination::try_to_preserve_debug_info (insn_change &use_change,
use_info *use)
{
// Punt on unsimplified subregs of hard registers. In that case,
// propagation can succeed and create a wider reg than the one we
// started with.
if (HARD_REGISTER_NUM_P (use->regno ())
&& use->includes_subregs ())
return false;
insn_info *use_insn = use_change.insn ();
rtx_insn *use_rtl = use_insn->rtl ();
use_change.new_uses = get_new_uses (use);
if (!use_change.new_uses.is_valid ()
|| !restrict_movement (use_change))
return false;
insn_propagation prop (use_rtl, m_dest, m_src);
return prop.apply_to_pattern (&INSN_VAR_LOCATION_LOC (use_rtl));
}
// USE_INSN is a debug instruction that uses m_def. Update it to reflect
// the fact that m_def is going to disappear. Try to preserve the source
// value if possible, but reset the instruction if not.
void
insn_combination::substitute_debug_use (use_info *use)
{
auto *use_insn = use->insn ();
rtx_insn *use_rtl = use_insn->rtl ();
auto use_change = insn_change (use_insn);
if (!try_to_preserve_debug_info (use_change, use))
{
use_change.new_uses = {};
use_change.move_range = use_change.insn ();
INSN_VAR_LOCATION_LOC (use_rtl) = gen_rtx_UNKNOWN_VAR_LOC ();
}
insn_change *changes[] = { &use_change };
crtl->ssa->change_insns (changes);
}
// NOTE is a reg note of USE_INSN, which previously used m_def. Update
// the note to reflect the fact that m_def is going to disappear. Return
// true on success, or false if the note must be deleted.
//
// CAN_PROPAGATE is true if m_dest can be replaced with m_use.
bool
insn_combination::substitute_note (insn_info *use_insn, rtx note,
bool can_propagate)
{
if (REG_NOTE_KIND (note) == REG_EQUAL
|| REG_NOTE_KIND (note) == REG_EQUIV)
{
insn_propagation prop (use_insn->rtl (), m_dest, m_src);
return (prop.apply_to_note (&XEXP (note, 0))
&& (can_propagate || prop.num_replacements == 0));
}
return true;
}
// Update USE_INSN's notes after deciding to go ahead with the optimization.
// CAN_PROPAGATE is true if m_dest can be replaced with m_use.
void
insn_combination::substitute_notes (insn_info *use_insn, bool can_propagate)
{
rtx_insn *use_rtl = use_insn->rtl ();
rtx *ptr = ®_NOTES (use_rtl);
while (rtx note = *ptr)
{
if (substitute_note (use_insn, note, can_propagate))
ptr = &XEXP (note, 1);
else
*ptr = XEXP (note, 1);
}
}
// We've decided to go ahead with the substitution. Update all REG_NOTES
// involving USE.
void
insn_combination::substitute_note_uses (use_info *use)
{
insn_info *use_insn = use->insn ();
bool can_propagate = true;
if (use->only_occurs_in_notes ())
{
// The only uses are in notes. Try to keep the note if we can,
// but removing it is better than aborting the optimization.
insn_change use_change (use_insn);
use_change.new_uses = get_new_uses (use);
if (!use_change.new_uses.is_valid ()
|| !restrict_movement (use_change))
{
use_change.move_range = use_insn;
use_change.new_uses = remove_uses_of_def (m_attempt,
use_insn->uses (),
use->def ());
can_propagate = false;
}
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "%s notes in:\n",
can_propagate ? "updating" : "removing");
dump_insn_slim (dump_file, use_insn->rtl ());
}
substitute_notes (use_insn, can_propagate);
insn_change *changes[] = { &use_change };
crtl->ssa->change_insns (changes);
}
else
// We've already decided to update the insn's pattern and know that m_src
// will be available at the insn's new location. Now update its notes.
substitute_notes (use_insn, can_propagate);
}
// We've decided to go ahead with the substitution and we've dealt with
// all uses that occur in the patterns of non-debug insns. Update all
// other uses for the fact that m_def is about to disappear.
void
insn_combination::substitute_optional_uses (set_info *def)
{
if (auto insn_uses = def->all_insn_uses ())
{
use_info *use = *insn_uses.begin ();
while (use)
{
use_info *next_use = use->next_any_insn_use ();
if (use->is_in_debug_insn ())
substitute_debug_use (use);
else if (!use->is_live_out_use ())
substitute_note_uses (use);
use = next_use;
}
}
for (use_info *use : def->phi_uses ())
substitute_optional_uses (use->phi ());
}
// Try to perform the substitution. Return true on success.
bool
insn_combination::run ()
{
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "\ntrying to combine definition of r%d in:\n",
m_def->regno ());
dump_insn_slim (dump_file, m_def_insn->rtl ());
fprintf (dump_file, "into:\n");
}
auto def_change = insn_change::delete_insn (m_def_insn);
m_nondebug_changes.safe_push (&def_change);
if (!substitute_nondebug_uses (m_def)
|| !changes_are_worthwhile (m_nondebug_changes)
|| !crtl->ssa->verify_insn_changes (m_nondebug_changes))
return false;
// We've now decided that the optimization is valid and profitable.
// Allow it to be suppressed for bisection purposes.
if (!dbg_cnt (::late_combine))
return false;
substitute_optional_uses (m_def);
confirm_change_group ();
crtl->ssa->change_insns (m_nondebug_changes);
return true;
}
// See whether INSN is a single_set that we can optimize. Return the
// set if so, otherwise return null.
rtx
late_combine::optimizable_set (insn_info *insn)
{
if (!insn->can_be_optimized ()
|| insn->is_asm ()
|| insn->is_call ()
|| insn->has_volatile_refs ()
|| insn->has_pre_post_modify ()
|| !can_move_insn_p (insn))
return NULL_RTX;
return single_set (insn->rtl ());
}
// Suppose that we can replace all uses of SET_DEST (SET) with SET_SRC (SET),
// where SET occurs in INSN. Return true if doing so is not likely to
// increase register pressure.
bool
late_combine::check_register_pressure (insn_info *insn, rtx set)
{
// Plain register-to-register moves do not establish a register class
// preference and have no well-defined effect on the register allocator.
// If changes in register class are needed, the register allocator is
// in the best position to place those changes. If no change in
// register class is needed, then the optimization reduces register
// pressure if SET_SRC (set) was already live at uses, otherwise the
// optimization is pressure-neutral.
rtx src = SET_SRC (set);
if (REG_P (src))
return true;
// On the same basis, substituting a SET_SRC that contains a single
// pseudo register either reduces pressure or is pressure-neutral,
// subject to the constraints below. We would need to do more
// analysis for SET_SRCs that use more than one pseudo register.
unsigned int nregs = 0;
for (auto *use : insn->uses ())
if (use->is_reg ()
&& !HARD_REGISTER_NUM_P (use->regno ())
&& !use->only_occurs_in_notes ())
if (++nregs > 1)
return false;
// If there are no pseudo registers in SET_SRC then the optimization
// should improve register pressure.
if (nregs == 0)
return true;
// We'd be substituting (set (reg R1) SRC) where SRC is known to
// contain a single pseudo register R2. Assume for simplicity that
// each new use of R2 would need to be in the same class C as the
// current use of R2. If, for a realistic allocation, C is a
// non-strict superset of the R1's register class, the effect on
// register pressure should be positive or neutral. If instead
// R1 occupies a different register class from R2, or if R1 has
// more allocation freedom than R2, then there's a higher risk that
// the effect on register pressure could be negative.
//
// First use constrain_operands to get the most likely choice of
// alternative. For simplicity, just handle the case where the
// output operand is operand 0.
extract_insn (insn->rtl ());
rtx dest = SET_DEST (set);
if (recog_data.n_operands == 0
|| recog_data.operand[0] != dest)
return false;
if (!constrain_operands (0, get_enabled_alternatives (insn->rtl ())))
return false;
preprocess_constraints (insn->rtl ());
auto *alt = which_op_alt ();
auto dest_class = alt[0].cl;
// Check operands 1 and above.
auto check_src = [&] (unsigned int i)
{
if (recog_data.is_operator[i])
return true;
rtx op = recog_data.operand[i];
if (CONSTANT_P (op))
return true;
if (SUBREG_P (op))
op = SUBREG_REG (op);
if (REG_P (op))
{
// Ignore hard registers. We've already rejected uses of non-fixed
// hard registers in the SET_SRC.
if (HARD_REGISTER_P (op))
return true;
// Make sure that the source operand's class is at least as
// permissive as the destination operand's class.
auto src_class = alternative_class (alt, i);
if (!reg_class_subset_p (dest_class, src_class))
return false;
// Make sure that the source operand occupies no more hard
// registers than the destination operand. This mostly matters
// for subregs.
if (targetm.class_max_nregs (dest_class, GET_MODE (dest))
< targetm.class_max_nregs (src_class, GET_MODE (op)))
return false;
return true;
}
return false;
};
for (int i = 1; i < recog_data.n_operands; ++i)
if (recog_data.operand_type[i] != OP_OUT && !check_src (i))
return false;
return true;
}
// Check uses of DEF to see whether there is anything obvious that
// prevents the substitution of SET into uses of DEF.
bool
late_combine::check_uses (set_info *def, rtx set)
{
use_info *prev_use = nullptr;
for (use_info *use : def->nondebug_insn_uses ())
{
insn_info *use_insn = use->insn ();
if (use->is_live_out_use ())
continue;
if (use->only_occurs_in_notes ())
continue;
// We cannot replace all uses if the value is live on exit.
if (use->is_artificial ())
return false;
// Avoid increasing the complexity of instructions that
// reference allocatable hard registers.
if (!REG_P (SET_SRC (set))
&& !reload_completed
&& (accesses_include_nonfixed_hard_registers (use_insn->uses ())
|| accesses_include_nonfixed_hard_registers (use_insn->defs ())))
return false;
// Don't substitute into a non-local goto, since it can then be
// treated as a jump to local label, e.g. in shorten_branches.
// ??? But this shouldn't be necessary.
if (use_insn->is_jump ()
&& find_reg_note (use_insn->rtl (), REG_NON_LOCAL_GOTO, NULL_RTX))
return false;
// Reject cases where one of the uses is a function argument.
// The combine attempt should fail anyway, but this is a common
// case that is easy to check early.
if (use_insn->is_call ()
&& HARD_REGISTER_P (SET_DEST (set))
&& find_reg_fusage (use_insn->rtl (), USE, SET_DEST (set)))
return false;
// We'll keep the uses in their original order, even if we move
// them relative to other instructions. Make sure that non-final
// uses do not change any values that occur in the SET_SRC.
if (prev_use && prev_use->ebb () == use->ebb ())
{
def_info *ultimate_def = look_through_degenerate_phi (def);
if (insn_clobbers_resources (prev_use->insn (),
ultimate_def->insn ()->uses ()))
return false;
}
prev_use = use;
}
for (use_info *use : def->phi_uses ())
if (!use->phi ()->is_degenerate ()
|| !check_uses (use->phi (), set))
return false;
return true;
}
// Try to remove INSN by substituting a definition into all uses.
// If the optimization moves any instructions before CURSOR, add those
// instructions to the end of m_worklist.
bool
late_combine::combine_into_uses (insn_info *insn, insn_info *cursor)
{
// For simplicity, don't try to handle sets of multiple hard registers.
// And for correctness, don't remove any assignments to the stack or
// frame pointers, since that would implicitly change the set of valid
// memory locations between this assignment and the next.
//
// Removing assignments to the hard frame pointer would invalidate
// backtraces.
set_info *def = single_set_info (insn);
if (!def
|| !def->is_reg ()
|| def->regno () == STACK_POINTER_REGNUM
|| def->regno () == FRAME_POINTER_REGNUM
|| def->regno () == HARD_FRAME_POINTER_REGNUM)
return false;
rtx set = optimizable_set (insn);
if (!set)
return false;
// For simplicity, don't try to handle subreg destinations.
rtx dest = SET_DEST (set);
if (!REG_P (dest) || def->regno () != REGNO (dest))
return false;
// Don't prolong the live ranges of allocatable hard registers, or put
// them into more complicated instructions. Failing to prevent this
// could lead to spill failures, or at least to worst register allocation.
if (!reload_completed
&& accesses_include_nonfixed_hard_registers (insn->uses ()))
return false;
if (!reload_completed && !check_register_pressure (insn, set))
return false;
if (!check_uses (def, set))
return false;
insn_combination combination (def, SET_DEST (set), SET_SRC (set));
if (!combination.run ())
return false;
for (auto *use_change : combination.use_changes ())
if (*use_change->insn () < *cursor)
m_worklist.safe_push (use_change->insn ());
else
break;
return true;
}
// Run the pass on function FN.
unsigned int
late_combine::execute (function *fn)
{
// Initialization.
calculate_dominance_info (CDI_DOMINATORS);
df_analyze ();
crtl->ssa = new rtl_ssa::function_info (fn);
// Don't allow memory_operand to match volatile MEMs.
init_recog_no_volatile ();
insn_info *insn = *crtl->ssa->nondebug_insns ().begin ();
while (insn)
{
if (!insn->is_artificial ())
{
insn_info *prev = insn->prev_nondebug_insn ();
if (combine_into_uses (insn, prev))
{
// Any instructions that get added to the worklist were
// previously after PREV. Thus if we were able to move
// an instruction X before PREV during one combination,
// X cannot depend on any instructions that we move before
// PREV during subsequent combinations. This means that
// the worklist should be free of backwards dependencies,
// even if it isn't necessarily in RPO.
for (unsigned int i = 0; i < m_worklist.length (); ++i)
combine_into_uses (m_worklist[i], prev);
m_worklist.truncate (0);
insn = prev;
}
}
insn = insn->next_nondebug_insn ();
}
// Finalization.
if (crtl->ssa->perform_pending_updates ())
cleanup_cfg (0);
// Make the recognizer allow volatile MEMs again.
init_recog ();
free_dominance_info (CDI_DOMINATORS);
return 0;
}
class pass_late_combine : public rtl_opt_pass
{
public:
pass_late_combine (gcc::context *ctxt)
: rtl_opt_pass (pass_data_late_combine, ctxt)
{}
// opt_pass methods:
opt_pass *clone () override { return new pass_late_combine (m_ctxt); }
bool gate (function *) override;
unsigned int execute (function *) override;
};
bool
pass_late_combine::gate (function *)
{
return optimize > 0 && flag_late_combine_instructions;
}
unsigned int
pass_late_combine::execute (function *fn)
{
return late_combine ().execute (fn);
}
} // end namespace
// Create a new CC fusion pass instance.
rtl_opt_pass *
make_pass_late_combine (gcc::context *ctxt)
{
return new pass_late_combine (ctxt);
}