// Pass to fuse CC operations with other instructions. // Copyright (C) 2021-2023 Free Software Foundation, Inc. // // This file is part of GCC. // // GCC is free software; you can redistribute it and/or modify it under // the terms of the GNU General Public License as published by the Free // Software Foundation; either version 3, or (at your option) any later // version. // // GCC is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. // // You should have received a copy of the GNU General Public License // along with GCC; see the file COPYING3. If not see // . // This pass looks for sequences of the form: // // A: (set (reg R1) X1) // B: ...instructions that might change the value of X1... // C: (set (reg CC) X2) // X2 uses R1 // // and tries to change them to: // // C': [(set (reg CC) X2') // (set (reg R1) X1)] // B: ...instructions that might change the value of X1... // // where X2' is the result of replacing R1 with X1 in X2. // // This sequence occurs in SVE code in two important cases: // // (a) Sometimes, to deal correctly with overflow, we need to increment // an IV after a WHILELO rather than before it. In this case: // - A is a WHILELO, // - B includes an IV increment and // - C is a separate PTEST. // // (b) ACLE code of the form: // // svbool_t ok = svrdffr (); // if (svptest_last (pg, ok)) // ... // // must, for performance reasons, be code-generated as: // // RDFFRS Pok.B, Pg/Z // ...branch on flags result... // // without a separate PTEST of Pok. In this case: // - A is an aarch64_rdffr // - B includes an aarch64_update_ffrt // - C is a separate PTEST // // Combine can handle this optimization if B doesn't exist and if A and // C are in the same BB. This pass instead handles cases where B does // exist and cases where A and C are in different BBs of the same EBB. #define IN_TARGET_CODE 1 #define INCLUDE_ALGORITHM #define INCLUDE_FUNCTIONAL #include "config.h" #include "system.h" #include "coretypes.h" #include "backend.h" #include "rtl.h" #include "df.h" #include "rtl-ssa.h" #include "tree-pass.h" using namespace rtl_ssa; namespace { const pass_data pass_data_cc_fusion = { RTL_PASS, // type "cc_fusion", // name OPTGROUP_NONE, // optinfo_flags TV_NONE, // tv_id 0, // properties_required 0, // properties_provided 0, // properties_destroyed 0, // todo_flags_start TODO_df_finish, // todo_flags_finish }; // Class that represents one run of the pass. class cc_fusion { public: cc_fusion () : m_parallel () {} void execute (); private: rtx optimizable_set (const insn_info *); bool parallelize_insns (def_info *, rtx, def_info *, rtx); void optimize_cc_setter (def_info *, rtx); // A spare PARALLEL rtx, or null if none. rtx m_parallel; }; // See whether INSN is a single_set that we can optimize. Return the // set if so, otherwise return null. rtx cc_fusion::optimizable_set (const insn_info *insn) { if (!insn->can_be_optimized () || insn->is_asm () || insn->has_volatile_refs () || insn->has_pre_post_modify ()) return NULL_RTX; return single_set (insn->rtl ()); } // CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise // a single_set that sets (only) OTHER_DEF. CC_SET is known to set the // CC register and the instruction that contains CC_SET is known to use // OTHER_DEF. Try to do CC_SET and OTHER_SET in parallel. bool cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set, def_info *other_def, rtx other_set) { auto attempt = crtl->ssa->new_change_attempt (); insn_info *cc_insn = cc_def->insn (); insn_info *other_insn = other_def->insn (); if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "trying to parallelize insn %d and insn %d\n", other_insn->uid (), cc_insn->uid ()); // Try to substitute OTHER_SET into CC_INSN. insn_change_watermark rtl_watermark; rtx_insn *cc_rtl = cc_insn->rtl (); insn_propagation prop (cc_rtl, SET_DEST (other_set), SET_SRC (other_set)); if (!prop.apply_to_pattern (&PATTERN (cc_rtl)) || prop.num_replacements == 0) { if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "-- failed to substitute all uses of r%d\n", other_def->regno ()); return false; } // Restrict the uses to those outside notes. use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ()); use_array other_set_uses = remove_note_accesses (attempt, other_insn->uses ()); // Remove the use of the substituted value. access_array_builder uses_builder (attempt); uses_builder.reserve (cc_uses.size ()); for (use_info *use : cc_uses) if (use->def () != other_def) uses_builder.quick_push (use); cc_uses = use_array (uses_builder.finish ()); // Get the list of uses for the new instruction. insn_change cc_change (cc_insn); cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses); if (!cc_change.new_uses.is_valid ()) { if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "-- cannot merge uses\n"); return false; } // The instruction initially defines just two registers. recog can add // extra clobbers if necessary. auto_vec new_defs; new_defs.quick_push (cc_def); new_defs.quick_push (other_def); sort_accesses (new_defs); cc_change.new_defs = def_array (access_array (new_defs)); // Make sure there is somewhere that the new instruction could live. auto other_change = insn_change::delete_insn (other_insn); insn_change *changes[] = { &other_change, &cc_change }; cc_change.move_range = cc_insn->ebb ()->insn_range (); if (!restrict_movement_ignoring (cc_change, insn_is_changing (changes))) { if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "-- cannot satisfy all definitions and uses\n"); return false; } // Tentatively install the new pattern. By convention, the CC set // must be first. if (m_parallel) { XVECEXP (m_parallel, 0, 0) = cc_set; XVECEXP (m_parallel, 0, 1) = other_set; } else { rtvec vec = gen_rtvec (2, cc_set, other_set); m_parallel = gen_rtx_PARALLEL (VOIDmode, vec); } validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1); // These routines report failures themselves. if (!recog_ignoring (attempt, cc_change, insn_is_changing (changes)) || !changes_are_worthwhile (changes) || !crtl->ssa->verify_insn_changes (changes)) return false; remove_reg_equal_equiv_notes (cc_rtl); confirm_change_group (); crtl->ssa->change_insns (changes); m_parallel = NULL_RTX; return true; } // Try to optimize the instruction that contains CC_DEF, where CC_DEF describes // a definition of the CC register by CC_SET. void cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set) { // Search the registers used by the CC setter for an easily-substitutable // def-use chain. for (use_info *other_use : cc_def->insn ()->uses ()) if (def_info *other_def = other_use->def ()) if (other_use->regno () != CC_REGNUM && other_def->ebb () == cc_def->ebb ()) if (rtx other_set = optimizable_set (other_def->insn ())) { rtx dest = SET_DEST (other_set); if (REG_P (dest) && REGNO (dest) == other_def->regno () && REG_NREGS (dest) == 1 && parallelize_insns (cc_def, cc_set, other_def, other_set)) return; } } // Run the pass on the current function. void cc_fusion::execute () { // Initialization. calculate_dominance_info (CDI_DOMINATORS); df_analyze (); crtl->ssa = new rtl_ssa::function_info (cfun); // Walk through all instructions that set CC. Look for a PTEST instruction // that we can optimize. // // ??? The PTEST test isn't needed for correctness, but it ensures that the // pass no effect on non-SVE code. for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM)) if (rtx cc_set = optimizable_set (def->insn ())) if (REG_P (SET_DEST (cc_set)) && REGNO (SET_DEST (cc_set)) == CC_REGNUM && GET_CODE (SET_SRC (cc_set)) == UNSPEC && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST) optimize_cc_setter (def, cc_set); // Finalization. crtl->ssa->perform_pending_updates (); free_dominance_info (CDI_DOMINATORS); } class pass_cc_fusion : public rtl_opt_pass { public: pass_cc_fusion (gcc::context *ctxt) : rtl_opt_pass (pass_data_cc_fusion, ctxt) {} // opt_pass methods: virtual bool gate (function *) { return TARGET_SVE && optimize >= 2; } virtual unsigned int execute (function *); }; unsigned int pass_cc_fusion::execute (function *) { cc_fusion ().execute (); return 0; } } // end namespace // Create a new CC fusion pass instance. rtl_opt_pass * make_pass_cc_fusion (gcc::context *ctxt) { return new pass_cc_fusion (ctxt); }