diff options
-rw-r--r-- | gcc/Makefile.in | 7 | ||||
-rw-r--r-- | gcc/config.gcc | 2 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-cc-fusion.cc | 296 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-passes.def | 1 | ||||
-rw-r--r-- | gcc/config/aarch64/aarch64-protos.h | 1 | ||||
-rw-r--r-- | gcc/config/aarch64/t-aarch64 | 6 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_8.c | 32 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/ptest_1.c | 10 |
8 files changed, 354 insertions, 1 deletions
diff --git a/gcc/Makefile.in b/gcc/Makefile.in index de8af61..a63c5d9c 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1024,6 +1024,13 @@ PLUGIN_H = plugin.h $(GCC_PLUGIN_H) PLUGIN_VERSION_H = plugin-version.h configargs.h CONTEXT_H = context.h GENSUPPORT_H = gensupport.h read-md.h optabs.def +RTL_SSA_H = $(PRETTY_PRINT_H) insn-config.h splay-tree-utils.h \ + $(RECOG_H) $(REGS_H) function-abi.h obstack-utils.h \ + mux-utils.h rtlanal.h memmodel.h $(EMIT_RTL_H) \ + rtl-ssa/accesses.h rtl-ssa/insns.h rtl-ssa/blocks.h \ + rtl-ssa/changes.h rtl-ssa/functions.h rtl-ssa/is-a.inl \ + rtl-ssa/access-utils.h rtl-ssa/insn-utils.h rtl-ssa/movement.h \ + rtl-ssa/change-utils.h rtl-ssa/member-fns.inl # # Now figure out from those variables how to compile and link. diff --git a/gcc/config.gcc b/gcc/config.gcc index 9fb57e9..17fea83 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -321,7 +321,7 @@ aarch64*-*-*) c_target_objs="aarch64-c.o" cxx_target_objs="aarch64-c.o" d_target_objs="aarch64-d.o" - extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch64-bti-insert.o" + extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch64-bti-insert.o aarch64-cc-fusion.o" target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.c \$(srcdir)/config/aarch64/aarch64-sve-builtins.h \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc" target_has_targetm_common=yes ;; diff --git a/gcc/config/aarch64/aarch64-cc-fusion.cc b/gcc/config/aarch64/aarch64-cc-fusion.cc new file mode 100644 index 0000000..09069a2 --- /dev/null +++ b/gcc/config/aarch64/aarch64-cc-fusion.cc @@ -0,0 +1,296 @@ +// Pass to fuse CC operations with other instructions. +// Copyright (C) 2021 Free Software Foundation, Inc. +// +// This file is part of GCC. +// +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. +// +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// This pass looks for sequences of the form: +// +// A: (set (reg R1) X1) +// B: ...instructions that might change the value of X1... +// C: (set (reg CC) X2) // X2 uses R1 +// +// and tries to change them to: +// +// C': [(set (reg CC) X2') +// (set (reg R1) X1)] +// B: ...instructions that might change the value of X1... +// +// where X2' is the result of replacing R1 with X1 in X2. +// +// This sequence occurs in SVE code in two important cases: +// +// (a) Sometimes, to deal correctly with overflow, we need to increment +// an IV after a WHILELO rather than before it. In this case: +// - A is a WHILELO, +// - B includes an IV increment and +// - C is a separate PTEST. +// +// (b) ACLE code of the form: +// +// svbool_t ok = svrdffr (); +// if (svptest_last (pg, ok)) +// ... +// +// must, for performance reasons, be code-generated as: +// +// RDFFRS Pok.B, Pg/Z +// ...branch on flags result... +// +// without a separate PTEST of Pok. In this case: +// - A is an aarch64_rdffr +// - B includes an aarch64_update_ffrt +// - C is a separate PTEST +// +// Combine can handle this optimization if B doesn't exist and if A and +// C are in the same BB. This pass instead handles cases where B does +// exist and cases where A and C are in different BBs of the same EBB. + +#define IN_TARGET_CODE 1 + +#define INCLUDE_ALGORITHM +#define INCLUDE_FUNCTIONAL +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "df.h" +#include "rtl-ssa.h" +#include "tree-pass.h" + +using namespace rtl_ssa; + +namespace { +const pass_data pass_data_cc_fusion = +{ + RTL_PASS, // type + "cc_fusion", // name + OPTGROUP_NONE, // optinfo_flags + TV_NONE, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + TODO_df_finish, // todo_flags_finish +}; + +// Class that represents one run of the pass. +class cc_fusion +{ +public: + cc_fusion () : m_parallel () {} + void execute (); + +private: + rtx optimizable_set (const insn_info *); + bool parallelize_insns (def_info *, rtx, def_info *, rtx); + void optimize_cc_setter (def_info *, rtx); + + // A spare PARALLEL rtx, or null if none. + rtx m_parallel; +}; + +// See whether INSN is a single_set that we can optimize. Return the +// set if so, otherwise return null. +rtx +cc_fusion::optimizable_set (const insn_info *insn) +{ + if (!insn->can_be_optimized () + || insn->is_asm () + || insn->has_volatile_refs () + || insn->has_pre_post_modify ()) + return NULL_RTX; + + return single_set (insn->rtl ()); +} + +// CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise +// a single_set that sets (only) OTHER_DEF. CC_SET is known to set the +// CC register and the instruction that contains CC_SET is known to use +// OTHER_DEF. Try to do CC_SET and OTHER_SET in parallel. +bool +cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set, + def_info *other_def, rtx other_set) +{ + auto attempt = crtl->ssa->new_change_attempt (); + + insn_info *cc_insn = cc_def->insn (); + insn_info *other_insn = other_def->insn (); + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "trying to parallelize insn %d and insn %d\n", + other_insn->uid (), cc_insn->uid ()); + + // Try to substitute OTHER_SET into CC_INSN. + insn_change_watermark rtl_watermark; + rtx_insn *cc_rtl = cc_insn->rtl (); + insn_propagation prop (cc_rtl, SET_DEST (other_set), + SET_SRC (other_set)); + if (!prop.apply_to_pattern (&PATTERN (cc_rtl)) + || prop.num_replacements == 0) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "-- failed to substitute all uses of r%d\n", + other_def->regno ()); + return false; + } + + // Restrict the uses to those outside notes. + use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ()); + use_array other_set_uses = remove_note_accesses (attempt, + other_insn->uses ()); + + // Remove the use of the substituted value. + access_array_builder uses_builder (attempt); + uses_builder.reserve (cc_uses.size ()); + for (use_info *use : cc_uses) + if (use->def () != other_def) + uses_builder.quick_push (use); + cc_uses = use_array (uses_builder.finish ()); + + // Get the list of uses for the new instruction. + insn_change cc_change (cc_insn); + cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses); + if (!cc_change.new_uses.is_valid ()) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "-- cannot merge uses\n"); + return false; + } + + // The instruction initially defines just two registers. recog can add + // extra clobbers if necessary. + auto_vec<access_info *, 2> new_defs; + new_defs.quick_push (cc_def); + new_defs.quick_push (other_def); + sort_accesses (new_defs); + cc_change.new_defs = def_array (access_array (new_defs)); + + // Make sure there is somewhere that the new instruction could live. + auto other_change = insn_change::delete_insn (other_insn); + insn_change *changes[] = { &other_change, &cc_change }; + cc_change.move_range = cc_insn->ebb ()->insn_range (); + if (!restrict_movement_ignoring (cc_change, insn_is_changing (changes))) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "-- cannot satisfy all definitions and uses\n"); + return false; + } + + // Tentatively install the new pattern. By convention, the CC set + // must be first. + if (m_parallel) + { + XVECEXP (m_parallel, 0, 0) = cc_set; + XVECEXP (m_parallel, 0, 1) = other_set; + } + else + { + rtvec vec = gen_rtvec (2, cc_set, other_set); + m_parallel = gen_rtx_PARALLEL (VOIDmode, vec); + } + validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1); + + // These routines report failures themselves. + if (!recog_ignoring (attempt, cc_change, insn_is_changing (changes)) + || !changes_are_worthwhile (changes) + || !crtl->ssa->verify_insn_changes (changes)) + return false; + + remove_reg_equal_equiv_notes (cc_rtl); + confirm_change_group (); + crtl->ssa->change_insns (changes); + m_parallel = NULL_RTX; + return true; +} + +// Try to optimize the instruction that contains CC_DEF, where CC_DEF describes +// a definition of the CC register by CC_SET. +void +cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set) +{ + // Search the registers used by the CC setter for an easily-substitutable + // def-use chain. + for (use_info *other_use : cc_def->insn ()->uses ()) + if (def_info *other_def = other_use->def ()) + if (other_use->regno () != CC_REGNUM + && other_def->ebb () == cc_def->ebb ()) + if (rtx other_set = optimizable_set (other_def->insn ())) + { + rtx dest = SET_DEST (other_set); + if (REG_P (dest) + && REGNO (dest) == other_def->regno () + && REG_NREGS (dest) == 1 + && parallelize_insns (cc_def, cc_set, other_def, other_set)) + return; + } +} + +// Run the pass on the current function. +void +cc_fusion::execute () +{ + // Initialization. + calculate_dominance_info (CDI_DOMINATORS); + df_analyze (); + crtl->ssa = new rtl_ssa::function_info (cfun); + + // Walk through all instructions that set CC. Look for a PTEST instruction + // that we can optimize. + // + // ??? The PTEST test isn't needed for correctness, but it ensures that the + // pass no effect on non-SVE code. + for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM)) + if (rtx cc_set = optimizable_set (def->insn ())) + if (REG_P (SET_DEST (cc_set)) + && REGNO (SET_DEST (cc_set)) == CC_REGNUM + && GET_CODE (SET_SRC (cc_set)) == UNSPEC + && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST) + optimize_cc_setter (def, cc_set); + + // Finalization. + crtl->ssa->perform_pending_updates (); + free_dominance_info (CDI_DOMINATORS); +} + +class pass_cc_fusion : public rtl_opt_pass +{ +public: + pass_cc_fusion (gcc::context *ctxt) + : rtl_opt_pass (pass_data_cc_fusion, ctxt) + {} + + // opt_pass methods: + virtual bool gate (function *) { return TARGET_SVE && optimize >= 2; } + virtual unsigned int execute (function *); +}; + +unsigned int +pass_cc_fusion::execute (function *) +{ + cc_fusion ().execute (); + return 0; +} + +} // end namespace + +// Create a new CC fusion pass instance. + +rtl_opt_pass * +make_pass_cc_fusion (gcc::context *ctxt) +{ + return new pass_cc_fusion (ctxt); +} diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def index 5c2bd52..0b773d2 100644 --- a/gcc/config/aarch64/aarch64-passes.def +++ b/gcc/config/aarch64/aarch64-passes.def @@ -22,3 +22,4 @@ INSERT_PASS_AFTER (pass_regrename, 1, pass_fma_steering); INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation); INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance); INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti); +INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion); diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 2ef6b6d..ff87ced 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -790,6 +790,7 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *); rtl_opt_pass *make_pass_track_speculation (gcc::context *); rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *); rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt); +rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt); poly_uint64 aarch64_regmode_natural_size (machine_mode); diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64 index bb17eed..7e1606c 100644 --- a/gcc/config/aarch64/t-aarch64 +++ b/gcc/config/aarch64/t-aarch64 @@ -158,6 +158,12 @@ aarch64-bti-insert.o: $(srcdir)/config/aarch64/aarch64-bti-insert.c \ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/aarch64/aarch64-bti-insert.c +aarch64-cc-fusion.o: $(srcdir)/config/aarch64/aarch64-cc-fusion.cc \ + $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \ + $(RTL_SSA_H) tree-pass.h + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/aarch64/aarch64-cc-fusion.cc + comma=, MULTILIB_OPTIONS = $(subst $(comma),/, $(patsubst %, mabi=%, $(subst $(comma),$(comma)mabi=,$(TM_MULTILIB_CONFIG)))) MULTILIB_DIRNAMES = $(subst $(comma), ,$(TM_MULTILIB_CONFIG)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_8.c new file mode 100644 index 0000000..b55cc1f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_8.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include <arm_sve.h> + +void +foo (int8_t *ptr1, int8_t *ptr2, svint8_t *res1, svint8_t *res2) +{ + svbool_t pg = svptrue_b8 (); + + svsetffr (); + svint8_t x1 = svldff1 (pg, ptr1); + svbool_t ok1 = svrdffr (); + if (!svptest_last (pg, ok1)) + { + x1 = svsel (ok1, x1, svdup_s8 (0)); + svsetffr (); + } + + svint8_t x2 = svldff1 (pg, ptr2); + svbool_t ok2 = svrdffr (); + if (!svptest_last (pg, ok2)) + x2 = svsel (ok2, x2, svdup_s8 (0)); + + *res1 = x1; + *res2 = x2; +} + +/* { dg-final { scan-assembler-times {\trdffrs\t} 2 } } */ +/* { dg-final { scan-assembler-times {\t(?:b.last|b.nfrst)\t} 2 } } */ +/* { dg-final { scan-assembler-not {\trdffr\t} } } */ +/* { dg-final { scan-assembler-not {\tptest\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ptest_1.c b/gcc/testsuite/gcc.target/aarch64/sve/ptest_1.c new file mode 100644 index 0000000..abc5108 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/ptest_1.c @@ -0,0 +1,10 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +void +f (char *restrict x, char *restrict y, char *restrict z, unsigned long n) +{ + for (unsigned long i = 0; i < n; i += 1) + x[i] = y[i] + z[i]; +} + +/* { dg-final { scan-assembler-not {\tptest\t} } } */ |