diff options
author | Richard Sandiford <richard.sandiford@arm.com> | 2025-08-20 13:20:02 +0100 |
---|---|---|
committer | Richard Sandiford <richard.sandiford@arm.com> | 2025-08-20 13:20:02 +0100 |
commit | 724d88900b7aa8f249b737a33e9b11eedf48ebae (patch) | |
tree | 12d18630c279106a50e5912e69adc2c9c2984c03 /gcc | |
parent | 481f96296e87b42b7f25944edd627cc9211dd803 (diff) | |
download | gcc-724d88900b7aa8f249b737a33e9b11eedf48ebae.zip gcc-724d88900b7aa8f249b737a33e9b11eedf48ebae.tar.gz gcc-724d88900b7aa8f249b737a33e9b11eedf48ebae.tar.bz2 |
Merge aarch64-cc-fusion into late-combine
I'd added the aarch64-specific CC fusion pass to fold a PTEST
instruction into the instruction that feeds the PTEST, in cases
where the latter instruction can set the appropriate flags as a
side-effect.
Combine does the same optimisation. However, as explained in the
comments, the PTEST case often has:
A: set predicate P based on inputs X
B: clobber X
C: test P
and so the fusion is only possible if we move C before B.
That's something that combine currently can't do (for the cases
that we needed).
The optimisation was never really AArch64-specific. It's just that,
in an all-too-familiar fashion, we needed it in stage 3, when it was
too late to add something target-independent.
late-combine adds a convenient place to do the optimisation in a
target-independent way, just as combine is a convenient place to
do its related optimisation.
gcc/
* config.gcc (aarch64*-*-*): Remove aarch64-cc-fusion.o from
extra_objs.
* config/aarch64/aarch64-passes.def (pass_cc_fusion): Delete.
* config/aarch64/aarch64-protos.h (make_pass_cc_fusion): Delete.
* config/aarch64/t-aarch64 (aarch64-cc-fusion.o): Delete.
* config/aarch64/aarch64-cc-fusion.cc: Delete.
* late-combine.cc (late_combine::optimizable_set): Take a set_info *
rather than an insn_info * and move destination tests from...
(late_combine::combine_into_uses): ...here. Take a set_info * rather
than an insn_info *. Take the rtx set.
(late_combine::parallelize_insns, late_combine::combine_cc_setter)
(late_combine::combine_insn): New member functions.
(late_combine::m_parallel): New member variable.
* rtlanal.cc (pattern_cost): Handle sets of CC registers in the
same way as comparisons.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/config.gcc | 2 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-cc-fusion.cc | 297 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-passes.def | 1 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-protos.h | 1 | ||||
-rw-r--r-- | gcc/config/aarch64/t-aarch64 | 6 | ||||
-rw-r--r-- | gcc/late-combine.cc | 243 | ||||
-rw-r--r-- | gcc/rtlanal.cc | 3 |
7 files changed, 208 insertions, 345 deletions
diff --git a/gcc/config.gcc b/gcc/config.gcc index 5624638..517df40 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -351,7 +351,7 @@ aarch64*-*-*) c_target_objs="aarch64-c.o" cxx_target_objs="aarch64-c.o" d_target_objs="aarch64-d.o" - extra_objs="aarch64-builtins.o aarch-common.o aarch64-elf-metadata.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o aarch64-sve-builtins-sme.o cortex-a57-fma-steering.o aarch64-speculation.o aarch-bti-insert.o aarch64-cc-fusion.o aarch64-early-ra.o aarch64-ldp-fusion.o" + extra_objs="aarch64-builtins.o aarch-common.o aarch64-elf-metadata.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o aarch64-sve-builtins-sme.o cortex-a57-fma-steering.o aarch64-speculation.o aarch-bti-insert.o aarch64-early-ra.o aarch64-ldp-fusion.o" target_gtfiles="\$(srcdir)/config/aarch64/aarch64-protos.h \$(srcdir)/config/aarch64/aarch64-builtins.h \$(srcdir)/config/aarch64/aarch64-builtins.cc \$(srcdir)/config/aarch64/aarch64-sve-builtins.h \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc" target_has_targetm_common=yes ;; diff --git a/gcc/config/aarch64/aarch64-cc-fusion.cc b/gcc/config/aarch64/aarch64-cc-fusion.cc deleted file mode 100644 index cea54de..0000000 --- a/gcc/config/aarch64/aarch64-cc-fusion.cc +++ /dev/null @@ -1,297 +0,0 @@ -// Pass to fuse CC operations with other instructions. -// Copyright (C) 2021-2025 Free Software Foundation, Inc. -// -// This file is part of GCC. -// -// GCC is free software; you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free -// Software Foundation; either version 3, or (at your option) any later -// version. -// -// GCC is distributed in the hope that it will be useful, but WITHOUT ANY -// WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License -// along with GCC; see the file COPYING3. If not see -// <http://www.gnu.org/licenses/>. - -// This pass looks for sequences of the form: -// -// A: (set (reg R1) X1) -// B: ...instructions that might change the value of X1... -// C: (set (reg CC) X2) // X2 uses R1 -// -// and tries to change them to: -// -// C': [(set (reg CC) X2') -// (set (reg R1) X1)] -// B: ...instructions that might change the value of X1... -// -// where X2' is the result of replacing R1 with X1 in X2. -// -// This sequence occurs in SVE code in two important cases: -// -// (a) Sometimes, to deal correctly with overflow, we need to increment -// an IV after a WHILELO rather than before it. In this case: -// - A is a WHILELO, -// - B includes an IV increment and -// - C is a separate PTEST. -// -// (b) ACLE code of the form: -// -// svbool_t ok = svrdffr (); -// if (svptest_last (pg, ok)) -// ... -// -// must, for performance reasons, be code-generated as: -// -// RDFFRS Pok.B, Pg/Z -// ...branch on flags result... -// -// without a separate PTEST of Pok. In this case: -// - A is an aarch64_rdffr -// - B includes an aarch64_update_ffrt -// - C is a separate PTEST -// -// Combine can handle this optimization if B doesn't exist and if A and -// C are in the same BB. This pass instead handles cases where B does -// exist and cases where A and C are in different BBs of the same EBB. 
- -#define IN_TARGET_CODE 1 - -#define INCLUDE_ALGORITHM -#define INCLUDE_FUNCTIONAL -#define INCLUDE_ARRAY -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "rtl.h" -#include "df.h" -#include "rtl-ssa.h" -#include "tree-pass.h" - -using namespace rtl_ssa; - -namespace { -const pass_data pass_data_cc_fusion = -{ - RTL_PASS, // type - "cc_fusion", // name - OPTGROUP_NONE, // optinfo_flags - TV_NONE, // tv_id - 0, // properties_required - 0, // properties_provided - 0, // properties_destroyed - 0, // todo_flags_start - TODO_df_finish, // todo_flags_finish -}; - -// Class that represents one run of the pass. -class cc_fusion -{ -public: - cc_fusion () : m_parallel () {} - void execute (); - -private: - rtx optimizable_set (const insn_info *); - bool parallelize_insns (def_info *, rtx, def_info *, rtx); - void optimize_cc_setter (def_info *, rtx); - - // A spare PARALLEL rtx, or null if none. - rtx m_parallel; -}; - -// See whether INSN is a single_set that we can optimize. Return the -// set if so, otherwise return null. -rtx -cc_fusion::optimizable_set (const insn_info *insn) -{ - if (!insn->can_be_optimized () - || insn->is_asm () - || insn->has_volatile_refs () - || insn->has_pre_post_modify ()) - return NULL_RTX; - - return single_set (insn->rtl ()); -} - -// CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise -// a single_set that sets (only) OTHER_DEF. CC_SET is known to set the -// CC register and the instruction that contains CC_SET is known to use -// OTHER_DEF. Try to do CC_SET and OTHER_SET in parallel. 
-bool -cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set, - def_info *other_def, rtx other_set) -{ - auto attempt = crtl->ssa->new_change_attempt (); - - insn_info *cc_insn = cc_def->insn (); - insn_info *other_insn = other_def->insn (); - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "trying to parallelize insn %d and insn %d\n", - other_insn->uid (), cc_insn->uid ()); - - // Try to substitute OTHER_SET into CC_INSN. - insn_change_watermark rtl_watermark; - rtx_insn *cc_rtl = cc_insn->rtl (); - insn_propagation prop (cc_rtl, SET_DEST (other_set), - SET_SRC (other_set)); - if (!prop.apply_to_pattern (&PATTERN (cc_rtl)) - || prop.num_replacements == 0) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "-- failed to substitute all uses of r%d\n", - other_def->regno ()); - return false; - } - - // Restrict the uses to those outside notes. - use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ()); - use_array other_set_uses = remove_note_accesses (attempt, - other_insn->uses ()); - - // Remove the use of the substituted value. - access_array_builder uses_builder (attempt); - uses_builder.reserve (cc_uses.size ()); - for (use_info *use : cc_uses) - if (use->def () != other_def) - uses_builder.quick_push (use); - cc_uses = use_array (uses_builder.finish ()); - - // Get the list of uses for the new instruction. - insn_change cc_change (cc_insn); - cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses); - if (!cc_change.new_uses.is_valid ()) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "-- cannot merge uses\n"); - return false; - } - - // The instruction initially defines just two registers. recog can add - // extra clobbers if necessary. 
- auto_vec<access_info *, 2> new_defs; - new_defs.quick_push (cc_def); - new_defs.quick_push (other_def); - sort_accesses (new_defs); - cc_change.new_defs = def_array (access_array (new_defs)); - - // Make sure there is somewhere that the new instruction could live. - auto other_change = insn_change::delete_insn (other_insn); - insn_change *changes[] = { &other_change, &cc_change }; - cc_change.move_range = cc_insn->ebb ()->insn_range (); - if (!restrict_movement (cc_change, ignore_changing_insns (changes))) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "-- cannot satisfy all definitions and uses\n"); - return false; - } - - // Tentatively install the new pattern. By convention, the CC set - // must be first. - if (m_parallel) - { - XVECEXP (m_parallel, 0, 0) = cc_set; - XVECEXP (m_parallel, 0, 1) = other_set; - } - else - { - rtvec vec = gen_rtvec (2, cc_set, other_set); - m_parallel = gen_rtx_PARALLEL (VOIDmode, vec); - } - validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1); - - // These routines report failures themselves. - if (!recog (attempt, cc_change, ignore_changing_insns (changes)) - || !changes_are_worthwhile (changes) - || !crtl->ssa->verify_insn_changes (changes)) - return false; - - remove_reg_equal_equiv_notes (cc_rtl); - confirm_change_group (); - crtl->ssa->change_insns (changes); - m_parallel = NULL_RTX; - return true; -} - -// Try to optimize the instruction that contains CC_DEF, where CC_DEF describes -// a definition of the CC register by CC_SET. -void -cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set) -{ - // Search the registers used by the CC setter for an easily-substitutable - // def-use chain. 
- for (use_info *other_use : cc_def->insn ()->uses ()) - if (def_info *other_def = other_use->def ()) - if (other_use->regno () != CC_REGNUM - && other_def->ebb () == cc_def->ebb ()) - if (rtx other_set = optimizable_set (other_def->insn ())) - { - rtx dest = SET_DEST (other_set); - if (REG_P (dest) - && REGNO (dest) == other_def->regno () - && REG_NREGS (dest) == 1 - && parallelize_insns (cc_def, cc_set, other_def, other_set)) - return; - } -} - -// Run the pass on the current function. -void -cc_fusion::execute () -{ - // Initialization. - calculate_dominance_info (CDI_DOMINATORS); - df_analyze (); - crtl->ssa = new rtl_ssa::function_info (cfun); - - // Walk through all instructions that set CC. Look for a PTEST instruction - // that we can optimize. - // - // ??? The PTEST test isn't needed for correctness, but it ensures that the - // pass no effect on non-SVE code. - for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM)) - if (rtx cc_set = optimizable_set (def->insn ())) - if (REG_P (SET_DEST (cc_set)) - && REGNO (SET_DEST (cc_set)) == CC_REGNUM - && GET_CODE (SET_SRC (cc_set)) == UNSPEC - && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST) - optimize_cc_setter (def, cc_set); - - // Finalization. - crtl->ssa->perform_pending_updates (); - free_dominance_info (CDI_DOMINATORS); -} - -class pass_cc_fusion : public rtl_opt_pass -{ -public: - pass_cc_fusion (gcc::context *ctxt) - : rtl_opt_pass (pass_data_cc_fusion, ctxt) - {} - - // opt_pass methods: - virtual bool gate (function *) { return TARGET_SVE && optimize >= 2; } - virtual unsigned int execute (function *); -}; - -unsigned int -pass_cc_fusion::execute (function *) -{ - cc_fusion ().execute (); - return 0; -} - -} // end namespace - -// Create a new CC fusion pass instance. 
- -rtl_opt_pass * -make_pass_cc_fusion (gcc::context *ctxt) -{ - return new pass_cc_fusion (ctxt); -} diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def index 9cf9d3e..6a53ff3 100644 --- a/gcc/config/aarch64/aarch64-passes.def +++ b/gcc/config/aarch64/aarch64-passes.def @@ -24,6 +24,5 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation); INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_switch_pstate_sm); INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_late_track_speculation); INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti); -INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion); INSERT_PASS_BEFORE (pass_early_remat, 1, pass_ldp_fusion); INSERT_PASS_BEFORE (pass_peephole2, 1, pass_ldp_fusion); diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index d26e1d5..56efcf2 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -1237,7 +1237,6 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *); rtl_opt_pass *make_pass_track_speculation (gcc::context *); rtl_opt_pass *make_pass_late_track_speculation (gcc::context *); rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt); -rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt); rtl_opt_pass *make_pass_switch_pstate_sm (gcc::context *ctxt); rtl_opt_pass *make_pass_ldp_fusion (gcc::context *); diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64 index 38a8c06..63ca8e9 100644 --- a/gcc/config/aarch64/t-aarch64 +++ b/gcc/config/aarch64/t-aarch64 @@ -190,12 +190,6 @@ aarch-bti-insert.o: $(srcdir)/config/arm/aarch-bti-insert.cc \ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/arm/aarch-bti-insert.cc -aarch64-cc-fusion.o: $(srcdir)/config/aarch64/aarch64-cc-fusion.cc \ - $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \ - $(RTL_SSA_H) tree-pass.h - 
$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ - $(srcdir)/config/aarch64/aarch64-cc-fusion.cc - aarch64-early-ra.o: $(srcdir)/config/aarch64/aarch64-early-ra.cc \ $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \ $(RTL_SSA_H) tree-pass.h diff --git a/gcc/late-combine.cc b/gcc/late-combine.cc index 90d7ef0..770780eb 100644 --- a/gcc/late-combine.cc +++ b/gcc/late-combine.cc @@ -17,9 +17,16 @@ // along with GCC; see the file COPYING3. If not see // <http://www.gnu.org/licenses/>. -// The current purpose of this pass is to substitute definitions into -// all uses, so that the definition can be removed. However, it could -// be extended to handle other combination-related optimizations in future. +// This pass currently has two purposes: +// +// - to substitute definitions into all uses, so that the definition +// can be removed. +// +// - to try to parallelise sets of condition-code registers with a +// related instruction (see combine_cc_setter for details). +// +// However, it could be extended to handle other combination-related +// optimizations in future. // // The pass can run before or after register allocation. When running // before register allocation, it tries to avoid cases that are likely @@ -111,12 +118,18 @@ public: unsigned int execute (function *); private: - rtx optimizable_set (insn_info *); + rtx optimizable_set (set_info *); bool check_register_pressure (insn_info *, rtx); bool check_uses (set_info *, rtx); - bool combine_into_uses (insn_info *, insn_info *); + bool combine_into_uses (set_info *, rtx, insn_info *); + bool parallelize_insns (set_info *, rtx, set_info *, rtx); + bool combine_cc_setter (set_info *, rtx); + bool combine_insn (insn_info *, insn_info *); auto_vec<insn_info *> m_worklist; + + // A spare PARALLEL rtx, or null if none. 
+ rtx m_parallel = NULL_RTX; }; insn_combination::insn_combination (set_info *def, rtx dest, rtx src) @@ -454,11 +467,26 @@ insn_combination::run () return true; } -// See whether INSN is a single_set that we can optimize. Return the -// set if so, otherwise return null. +// DEF is the result of calling single_set_info on its instruction. +// See whether that instruction is a single_set that we can optimize. +// Return the set if so, otherwise return null. rtx -late_combine::optimizable_set (insn_info *insn) +late_combine::optimizable_set (set_info *def) { + // For simplicity, don't try to handle sets of multiple hard registers. + // And for correctness, don't remove any assignments to the stack or + // frame pointers, since that would implicitly change the set of valid + // memory locations between this assignment and the next. + // + // Removing assignments to the hard frame pointer would invalidate + // backtraces. + if (!def->is_reg () + || def->regno () == STACK_POINTER_REGNUM + || def->regno () == FRAME_POINTER_REGNUM + || def->regno () == HARD_FRAME_POINTER_REGNUM) + return NULL_RTX; + + auto *insn = def->insn (); if (!insn->can_be_optimized () || insn->is_asm () || insn->is_call () @@ -467,7 +495,16 @@ late_combine::optimizable_set (insn_info *insn) || !can_move_insn_p (insn)) return NULL_RTX; - return single_set (insn->rtl ()); + rtx set = single_set (insn->rtl ()); + if (!set) + return NULL_RTX; + + // For simplicity, don't try to handle subreg destinations. + rtx dest = SET_DEST (set); + if (!REG_P (dest) || REG_NREGS (dest) != 1 || def->regno () != REGNO (dest)) + return NULL_RTX; + + return set; } // Suppose that we can replace all uses of SET_DEST (SET) with SET_SRC (SET), @@ -643,35 +680,13 @@ late_combine::check_uses (set_info *def, rtx set) return true; } -// Try to remove INSN by substituting a definition into all uses. -// If the optimization moves any instructions before CURSOR, add those -// instructions to the end of m_worklist. 
+// Try to remove DEF's instruction by substituting DEF into all uses. +// SET is the rtx set associated with DEF. If the optimization moves any +// instructions before CURSOR, add those instructions to the end of m_worklist. bool -late_combine::combine_into_uses (insn_info *insn, insn_info *cursor) +late_combine::combine_into_uses (set_info *def, rtx set, insn_info *cursor) { - // For simplicity, don't try to handle sets of multiple hard registers. - // And for correctness, don't remove any assignments to the stack or - // frame pointers, since that would implicitly change the set of valid - // memory locations between this assignment and the next. - // - // Removing assignments to the hard frame pointer would invalidate - // backtraces. - set_info *def = single_set_info (insn); - if (!def - || !def->is_reg () - || def->regno () == STACK_POINTER_REGNUM - || def->regno () == FRAME_POINTER_REGNUM - || def->regno () == HARD_FRAME_POINTER_REGNUM) - return false; - - rtx set = optimizable_set (insn); - if (!set) - return false; - - // For simplicity, don't try to handle subreg destinations. - rtx dest = SET_DEST (set); - if (!REG_P (dest) || def->regno () != REGNO (dest)) - return false; + auto *insn = def->insn (); // Don't prolong the live ranges of allocatable hard registers, or put // them into more complicated instructions. Failing to prevent this @@ -698,6 +713,158 @@ late_combine::combine_into_uses (insn_info *insn, insn_info *cursor) return true; } +// CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise +// a single_set that sets (only) OTHER_DEF. CC_SET is known to set a +// condition-code register and the instruction that contains CC_SET is +// known to use OTHER_DEF. Try to do CC_SET and OTHER_SET in parallel. 
+bool +late_combine::parallelize_insns (set_info *cc_def, rtx cc_set, + set_info *other_def, rtx other_set) +{ + auto attempt = crtl->ssa->new_change_attempt (); + + insn_info *cc_insn = cc_def->insn (); + insn_info *other_insn = other_def->insn (); + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "trying to parallelize insn %d and insn %d\n", + other_insn->uid (), cc_insn->uid ()); + + // Try to substitute OTHER_SET into CC_INSN. + insn_change_watermark rtl_watermark; + rtx_insn *cc_rtl = cc_insn->rtl (); + insn_propagation prop (cc_rtl, SET_DEST (other_set), SET_SRC (other_set)); + if (!prop.apply_to_pattern (&PATTERN (cc_rtl)) + || prop.num_replacements == 0) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "-- failed to substitute all uses of r%d\n", + other_def->regno ()); + return false; + } + + // Restrict the uses to those outside notes. + use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ()); + use_array other_set_uses = remove_note_accesses (attempt, + other_insn->uses ()); + + // Remove the use of the substituted value. + cc_uses = remove_uses_of_def (attempt, cc_uses, other_def); + + // Get the list of uses for the new instruction. + insn_change cc_change (cc_insn); + cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses); + if (!cc_change.new_uses.is_valid ()) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "-- cannot merge uses\n"); + return false; + } + + // The instruction initially defines just two registers. recog can add + // extra clobbers if necessary. + auto_vec<access_info *, 2> new_defs; + new_defs.quick_push (cc_def); + new_defs.quick_push (other_def); + sort_accesses (new_defs); + cc_change.new_defs = def_array (access_array (new_defs)); + + // Make sure there is somewhere that the new instruction could live. 
+ auto other_change = insn_change::delete_insn (other_insn); + insn_change *changes[] = { &other_change, &cc_change }; + cc_change.move_range = cc_insn->ebb ()->insn_range (); + if (!restrict_movement (cc_change, ignore_changing_insns (changes))) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "-- cannot satisfy all definitions and uses\n"); + return false; + } + + // Tentatively install the new pattern. By convention, the CC set + // must be first. + if (m_parallel) + { + XVECEXP (m_parallel, 0, 0) = cc_set; + XVECEXP (m_parallel, 0, 1) = other_set; + } + else + { + rtvec vec = gen_rtvec (2, cc_set, other_set); + m_parallel = gen_rtx_PARALLEL (VOIDmode, vec); + } + validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1); + + // These routines report failures themselves. + if (!recog (attempt, cc_change, ignore_changing_insns (changes)) + || !changes_are_worthwhile (changes) + || !crtl->ssa->verify_insn_changes (changes)) + return false; + + remove_reg_equal_equiv_notes (cc_rtl); + confirm_change_group (); + crtl->ssa->change_insns (changes); + m_parallel = NULL_RTX; + return true; +} + +// CC_SET is a single_set that sets (only) CC_DEF. See whether CC_DEF +// is a definition of a condition-code register and try to optimize it +// with related instructions if so. Return true if something changed. +// +// This function looks for sequences of the form: +// +// A: (set (reg R1) X1) +// B: ...instructions that might change the value of X1... +// C: (set (reg CC) X2) // X2 uses R1 +// +// and tries to change them to: +// +// C': [(set (reg CC) X2') +// (set (reg R1) X1)] +// B: ...instructions that might change the value of X1... +// +// where X2' is the result of replacing R1 with X1 in X2. +// +// Combine can handle this optimization if B doesn't exist and if A and +// C are in the same BB. This pass instead handles cases where B does +// exist and cases where A and C are in different BBs of the same EBB. 
+bool +late_combine::combine_cc_setter (set_info *cc_def, rtx cc_set) +{ + // Check for a set of a CC register. This isn't needed for correctness; + // it's just a way of narrowing the search space. It could be relaxed if + // there are other situations that would benefit from the same optimization. + if (!HARD_REGISTER_NUM_P (cc_def->regno ()) + || GET_MODE_CLASS (cc_def->mode()) != MODE_CC) + return false; + + // Search the registers used by the CC setter for an easily-substitutable + // def-use chain. + for (use_info *other_use : cc_def->insn ()->uses ()) + if (auto *other_def = other_use->def ()) + if (other_use->regno () != cc_def->regno () + && other_def->ebb () == cc_def->ebb () + && other_def == single_set_info (other_def->insn ())) + if (rtx other_set = optimizable_set (other_def)) + if (parallelize_insns (cc_def, cc_set, other_def, other_set)) + return true; + + return false; +} + +// Try to optimize INSN in some way. If the optimization moves any +// instructions before CURSOR, and if further optimizations might be +// possible on those instructions, add them to the end of m_worklist. +bool +late_combine::combine_insn (insn_info *insn, insn_info *cursor) +{ + if (set_info *def = single_set_info (insn)) + if (rtx set = optimizable_set (def)) + return (combine_into_uses (def, set, cursor) + || combine_cc_setter (def, set)); + + return false; +} + // Run the pass on function FN. unsigned int late_combine::execute (function *fn) @@ -715,7 +882,7 @@ late_combine::execute (function *fn) if (!insn->is_artificial ()) { insn_info *prev = insn->prev_nondebug_insn (); - if (combine_into_uses (insn, prev)) + if (combine_insn (insn, prev)) { // Any instructions that get added to the worklist were // previously after PREV. Thus if we were able to move @@ -725,7 +892,7 @@ late_combine::execute (function *fn) // the worklist should be free of backwards dependencies, // even if it isn't necessarily in RPO. 
for (unsigned int i = 0; i < m_worklist.length (); ++i) - combine_into_uses (m_worklist[i], prev); + combine_insn (m_worklist[i], prev); m_worklist.truncate (0); insn = prev; } diff --git a/gcc/rtlanal.cc b/gcc/rtlanal.cc index 19b6645..63a1d08 100644 --- a/gcc/rtlanal.cc +++ b/gcc/rtlanal.cc @@ -5740,7 +5740,8 @@ pattern_cost (rtx pat, bool speed) rtx x = XVECEXP (pat, 0, i); if (GET_CODE (x) == SET) { - if (GET_CODE (SET_SRC (x)) == COMPARE) + if (GET_CODE (SET_SRC (x)) == COMPARE + || GET_MODE_CLASS (GET_MODE (SET_DEST (x))) == MODE_CC) { if (comparison) return 0; |