diff options
author | Artemiy Volkov <artemiyv@acm.org> | 2025-09-06 15:06:36 -0600 |
---|---|---|
committer | Jeff Law <jlaw@ventanamicro.com> | 2025-09-06 15:07:27 -0600 |
commit | 41b0c7a674e87074fdc8088479cb93f6fe1e070f (patch) | |
tree | e5fa08f17781a2eae62ebba1d828c34bb5e171f7 | |
parent | 044d828724cb0169ee7721ae208297c2b8d5a8fd (diff) | |
download | gcc-41b0c7a674e87074fdc8088479cb93f6fe1e070f.zip gcc-41b0c7a674e87074fdc8088479cb93f6fe1e070f.tar.gz gcc-41b0c7a674e87074fdc8088479cb93f6fe1e070f.tar.bz2 |
gcc: introduce the dep_fusion pass
Presently, the scheduler code only considers consecutive instructions
for macro-op fusion (see sched-deps.cc::sched_macro_fuse_insns () for
details). This patch introduces the new dep_fusion pass, which is
intended to uncover more fusion opportunities by reordering eligible
instructions to form fusible pairs (based solely on the value of the
TARGET_SCHED_MACRO_FUSION_PAIR_P hook). This is achieved by using
the RTL-SSA framework, and only single-use instructions (those whose
sole definition has exactly one non-debug use) are considered for the
first instruction of a pair.
Aside from reordering instructions, this pass also sets the SCHED_GROUP
flag for the second instruction so that following passes can implement
special handling of the fused pairs. For instance, register allocation (RA)
and regrename should make use of this information to preserve the
single-output property for some such pairs. Accordingly, in passes.def, this patch adds two
invocations of the new pass: just before IRA and just before regrename.
The new pass is enabled at -O2+ and -Os.
gcc/ChangeLog:
* Makefile.in (OBJS): Add dep-fusion.o.
* common.opt (fdep-fusion): Add option.
* dep-fusion.cc: New pass.
* doc/invoke.texi: Document it.
* opts.cc (default_options_table): Enable it at -O2+ and -Os.
* passes.def: Insert two instances of dep_fusion.
* tree-pass.h (make_pass_dep_fusion): Declare new function.
-rw-r--r-- | gcc/Makefile.in | 1 | ||||
-rw-r--r-- | gcc/common.opt | 4 | ||||
-rw-r--r-- | gcc/dep-fusion.cc | 148 | ||||
-rw-r--r-- | gcc/doc/invoke.texi | 15 | ||||
-rw-r--r-- | gcc/opts.cc | 1 | ||||
-rw-r--r-- | gcc/passes.def | 2 | ||||
-rw-r--r-- | gcc/tree-pass.h | 1 |
7 files changed, 169 insertions, 3 deletions
diff --git a/gcc/Makefile.in b/gcc/Makefile.in index d35fced..4503dab 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1448,6 +1448,7 @@ OBJS = \ dce.o \ ddg.o \ debug.o \ + dep-fusion.o \ df-core.o \ df-problems.o \ df-scan.o \ diff --git a/gcc/common.opt b/gcc/common.opt index cd6a224..f6d93dc 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1394,6 +1394,10 @@ fdelete-null-pointer-checks Common Var(flag_delete_null_pointer_checks) Init(-1) Optimization Delete useless null pointer checks. +fdep-fusion +Common Var(flag_dep_fusion) Optimization Init(1) +Issue defining instructions back to back with their single uses, provided they are macro-fusible in the target microarchitecture. + fdevirtualize-at-ltrans Common Var(flag_ltrans_devirtualize) Stream extra data to support more aggressive devirtualization in LTO local transformation mode. diff --git a/gcc/dep-fusion.cc b/gcc/dep-fusion.cc new file mode 100644 index 0000000..1e69e68 --- /dev/null +++ b/gcc/dep-fusion.cc @@ -0,0 +1,148 @@ +// Dependency fusion reordering pass. +// Copyright (C) 2025 Free Software Foundation, Inc. +// +// This file is part of GCC. +// +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. +// +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. 
+// +// This pass uses the RTL-SSA representation to detect def-use pairs that are +// macro-op-fusible in the current microarchitecture (using the +// macro_fusion_pair_p () target hook) and place them next to one another, if +// possible. + +#define INCLUDE_ALGORITHM +#define INCLUDE_FUNCTIONAL +#define INCLUDE_MEMORY +#define INCLUDE_ARRAY +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "df.h" +#include "rtl-ssa.h" +#include "print-rtl.h" +#include "tree-pass.h" +#include "cfgcleanup.h" +#include "target.h" +#include "dbgcnt.h" + +namespace { +const pass_data pass_data_dep_fusion = +{ + RTL_PASS, // type + "dep_fusion", // name + OPTGROUP_NONE, // optinfo_flags + TV_NONE, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + TODO_df_finish, // todo_flags_finish +}; + +class pass_dep_fusion : public rtl_opt_pass +{ +public: + pass_dep_fusion (gcc::context *ctxt) + : rtl_opt_pass (pass_data_dep_fusion, ctxt) + {} + + // opt_pass methods: + opt_pass *clone () override { return new pass_dep_fusion (m_ctxt); } + bool gate (function *) override; + unsigned int execute (function *) override; +}; + +bool +pass_dep_fusion::gate (function *) +{ + return optimize > 0 && flag_dep_fusion; +} + +unsigned int +pass_dep_fusion::execute (function *fn) +{ + // Initialization. 
+ calculate_dominance_info (CDI_DOMINATORS); + df_analyze (); + crtl->ssa = new rtl_ssa::function_info (fn); + + init_recog_no_volatile (); + + for (rtl_ssa::insn_info *insn = *crtl->ssa->nondebug_insns ().begin (); + insn; + insn = insn->next_nondebug_insn ()) + { + if (!insn->can_be_optimized () || insn->num_defs () != 1) + continue; + + rtl_ssa::set_info *def = single_set_info (insn); + if (!def) + continue; + + rtl_ssa::use_info *use_insn = def->single_nondebug_insn_use (); + if (!use_insn + || !use_insn->insn ()->can_be_optimized () + || !targetm.sched.macro_fusion_pair_p (insn->rtl (), + use_insn->insn ()->rtl ())) + continue; + + auto attempt = crtl->ssa->new_change_attempt (); + rtl_ssa::insn_change change (use_insn->insn ()); + + if (use_insn->insn () != insn->next_any_insn ()) + { + if (!can_move_insn_p (use_insn->insn ())) + continue; + + change.move_range = insn; + if (!rtl_ssa::restrict_movement (change)) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Moved a single-use instruction:\n"); + dump_insn_slim (dump_file, use_insn->insn ()->rtl ()); + fprintf (dump_file, "right after its definition:\n"); + dump_insn_slim (dump_file, insn->rtl ()); + } + } + + SCHED_GROUP_P (use_insn->insn ()->rtl ()) = 1; + confirm_change_group (); + crtl->ssa->change_insn (change); + } + + // Finalization. + if (crtl->ssa->perform_pending_updates ()) + cleanup_cfg (0); + + delete crtl->ssa; + + init_recog (); + free_dominance_info (CDI_DOMINATORS); + return 0; +} + +} // end namespace + +// Create a new dep fusion pass instance. + +rtl_opt_pass * +make_pass_dep_fusion (gcc::context *ctxt) +{ + return new pass_dep_fusion (ctxt); +} diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index c586956..b2a937f 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -587,8 +587,8 @@ Objective-C and Objective-C++ Dialects}. 
-fcse-follow-jumps -fcse-skip-blocks -fcx-fortran-rules -fcx-limited-range -fcx-method -fdata-sections -fdce -fdelayed-branch --fdelete-null-pointer-checks -fdevirtualize -fdevirtualize-speculatively --fdevirtualize-at-ltrans -fdse +-fdelete-null-pointer-checks -fdep-fusion -fdevirtualize +-fdevirtualize-speculatively -fdevirtualize-at-ltrans -fdse -fearly-inlining -fipa-sra -fexpensive-optimizations -ffat-lto-objects -ffast-math -ffinite-math-only -ffloat-store -fexcess-precision=@var{style} -ffinite-loops @@ -13108,7 +13108,7 @@ also turns on the following optimization flags: -fcode-hoisting -fcrossjumping -fcse-follow-jumps -fcse-skip-blocks --fdelete-null-pointer-checks +-fdelete-null-pointer-checks -fdep-fusion -fdevirtualize -fdevirtualize-speculatively -fexpensive-optimizations -ffinite-loops @@ -16028,6 +16028,15 @@ more efficiently if they are adjacent to each other in the instruction flow. Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}. +@opindex fdep-fusion +@item -fdep-fusion +Detect macro-op fusible pairs consisting of single-use instructions and their +uses, and place such pairs together in the instruction stream to increase +fusion opportunities in hardware. This pass is executed once before register +allocation, and another time before register renaming. + +Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}. + @opindex ftracer @item -ftracer Perform tail duplication to enlarge superblock size. 
This transformation diff --git a/gcc/opts.cc b/gcc/opts.cc index baba084..10ce2c3 100644 --- a/gcc/opts.cc +++ b/gcc/opts.cc @@ -636,6 +636,7 @@ static const struct default_options default_options_table[] = { OPT_LEVELS_2_PLUS, OPT_fcode_hoisting, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fcrossjumping, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fcse_follow_jumps, NULL, 1 }, + { OPT_LEVELS_2_PLUS, OPT_fdep_fusion, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fdevirtualize, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fdevirtualize_speculatively, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fexpensive_optimizations, NULL, 1 }, diff --git a/gcc/passes.def b/gcc/passes.def index c870170..e9610de 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -516,6 +516,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_sched); NEXT_PASS (pass_rtl_avoid_store_forwarding); NEXT_PASS (pass_early_remat); + NEXT_PASS (pass_dep_fusion); NEXT_PASS (pass_ira); NEXT_PASS (pass_reload); /* In the following, some passes are tied to 'pass_postreload' and others @@ -537,6 +538,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_sched_fusion); NEXT_PASS (pass_peephole2); NEXT_PASS (pass_if_after_reload); + NEXT_PASS (pass_dep_fusion); NEXT_PASS (pass_regrename); NEXT_PASS (pass_fold_mem_offsets); NEXT_PASS (pass_cprop_hardreg); diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 1c68a69..61cec52 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -625,6 +625,7 @@ extern rtl_opt_pass *make_pass_value_profile_transformations (gcc::context *ctxt); extern rtl_opt_pass *make_pass_postreload_cse (gcc::context *ctxt); extern rtl_opt_pass *make_pass_late_combine (gcc::context *ctxt); +extern rtl_opt_pass *make_pass_dep_fusion (gcc::context *ctxt); extern rtl_opt_pass *make_pass_gcse2 (gcc::context *ctxt); extern rtl_opt_pass *make_pass_split_after_reload (gcc::context *ctxt); extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context |