aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gcc/Makefile.in7
-rw-r--r--gcc/config.gcc2
-rw-r--r--gcc/config/aarch64/aarch64-cc-fusion.cc296
-rw-r--r--gcc/config/aarch64/aarch64-passes.def1
-rw-r--r--gcc/config/aarch64/aarch64-protos.h1
-rw-r--r--gcc/config/aarch64/t-aarch646
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_8.c32
-rw-r--r--gcc/testsuite/gcc.target/aarch64/sve/ptest_1.c10
8 files changed, 354 insertions, 1 deletions
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index de8af61..a63c5d9c 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1024,6 +1024,13 @@ PLUGIN_H = plugin.h $(GCC_PLUGIN_H)
PLUGIN_VERSION_H = plugin-version.h configargs.h
CONTEXT_H = context.h
GENSUPPORT_H = gensupport.h read-md.h optabs.def
+RTL_SSA_H = $(PRETTY_PRINT_H) insn-config.h splay-tree-utils.h \
+ $(RECOG_H) $(REGS_H) function-abi.h obstack-utils.h \
+ mux-utils.h rtlanal.h memmodel.h $(EMIT_RTL_H) \
+ rtl-ssa/accesses.h rtl-ssa/insns.h rtl-ssa/blocks.h \
+ rtl-ssa/changes.h rtl-ssa/functions.h rtl-ssa/is-a.inl \
+ rtl-ssa/access-utils.h rtl-ssa/insn-utils.h rtl-ssa/movement.h \
+ rtl-ssa/change-utils.h rtl-ssa/member-fns.inl
#
# Now figure out from those variables how to compile and link.
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 9fb57e9..17fea83 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -321,7 +321,7 @@ aarch64*-*-*)
c_target_objs="aarch64-c.o"
cxx_target_objs="aarch64-c.o"
d_target_objs="aarch64-d.o"
- extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch64-bti-insert.o"
+ extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch64-bti-insert.o aarch64-cc-fusion.o"
target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.c \$(srcdir)/config/aarch64/aarch64-sve-builtins.h \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
target_has_targetm_common=yes
;;
diff --git a/gcc/config/aarch64/aarch64-cc-fusion.cc b/gcc/config/aarch64/aarch64-cc-fusion.cc
new file mode 100644
index 0000000..09069a2
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-cc-fusion.cc
@@ -0,0 +1,296 @@
+// Pass to fuse CC operations with other instructions.
+// Copyright (C) 2021 Free Software Foundation, Inc.
+//
+// This file is part of GCC.
+//
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+//
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// This pass looks for sequences of the form:
+//
+// A: (set (reg R1) X1)
+// B: ...instructions that might change the value of X1...
+// C: (set (reg CC) X2) // X2 uses R1
+//
+// and tries to change them to:
+//
+// C': [(set (reg CC) X2')
+// (set (reg R1) X1)]
+// B: ...instructions that might change the value of X1...
+//
+// where X2' is the result of replacing R1 with X1 in X2.
+//
+// This sequence occurs in SVE code in two important cases:
+//
+// (a) Sometimes, to deal correctly with overflow, we need to increment
+// an IV after a WHILELO rather than before it. In this case:
+// - A is a WHILELO,
+// - B includes an IV increment and
+// - C is a separate PTEST.
+//
+// (b) ACLE code of the form:
+//
+// svbool_t ok = svrdffr ();
+// if (svptest_last (pg, ok))
+// ...
+//
+// must, for performance reasons, be code-generated as:
+//
+// RDFFRS Pok.B, Pg/Z
+// ...branch on flags result...
+//
+// without a separate PTEST of Pok. In this case:
+// - A is an aarch64_rdffr
+// - B includes an aarch64_update_ffrt
+// - C is a separate PTEST
+//
+// Combine can handle this optimization if B doesn't exist and if A and
+// C are in the same BB. This pass instead handles cases where B does
+// exist and cases where A and C are in different BBs of the same EBB.
+
+#define IN_TARGET_CODE 1
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "df.h"
+#include "rtl-ssa.h"
+#include "tree-pass.h"
+
+using namespace rtl_ssa;
+
+namespace {
+const pass_data pass_data_cc_fusion =
+{
+ RTL_PASS, // type
+ "cc_fusion", // name
+ OPTGROUP_NONE, // optinfo_flags
+ TV_NONE, // tv_id
+ 0, // properties_required
+ 0, // properties_provided
+ 0, // properties_destroyed
+ 0, // todo_flags_start
+ TODO_df_finish, // todo_flags_finish
+};
+
+// Class that represents one run of the pass.
+class cc_fusion
+{
+public:
+ cc_fusion () : m_parallel () {}
+ void execute ();
+
+private:
+ rtx optimizable_set (const insn_info *);
+ bool parallelize_insns (def_info *, rtx, def_info *, rtx);
+ void optimize_cc_setter (def_info *, rtx);
+
+ // A spare PARALLEL rtx, or null if none.
+ rtx m_parallel;
+};
+
+// See whether INSN is a single_set that we can optimize. Return the
+// set if so, otherwise return null.
+rtx
+cc_fusion::optimizable_set (const insn_info *insn)
+{
+ if (!insn->can_be_optimized ()
+ || insn->is_asm ()
+ || insn->has_volatile_refs ()
+ || insn->has_pre_post_modify ())
+ return NULL_RTX;
+
+ return single_set (insn->rtl ());
+}
+
+// CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise
+// a single_set that sets (only) OTHER_DEF. CC_SET is known to set the
+// CC register and the instruction that contains CC_SET is known to use
+// OTHER_DEF. Try to do CC_SET and OTHER_SET in parallel.
+bool
+cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set,
+ def_info *other_def, rtx other_set)
+{
+ auto attempt = crtl->ssa->new_change_attempt ();
+
+ insn_info *cc_insn = cc_def->insn ();
+ insn_info *other_insn = other_def->insn ();
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "trying to parallelize insn %d and insn %d\n",
+ other_insn->uid (), cc_insn->uid ());
+
+ // Try to substitute OTHER_SET into CC_INSN.
+ insn_change_watermark rtl_watermark;
+ rtx_insn *cc_rtl = cc_insn->rtl ();
+ insn_propagation prop (cc_rtl, SET_DEST (other_set),
+ SET_SRC (other_set));
+ if (!prop.apply_to_pattern (&PATTERN (cc_rtl))
+ || prop.num_replacements == 0)
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "-- failed to substitute all uses of r%d\n",
+ other_def->regno ());
+ return false;
+ }
+
+ // Restrict the uses to those outside notes.
+ use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ());
+ use_array other_set_uses = remove_note_accesses (attempt,
+ other_insn->uses ());
+
+ // Remove the use of the substituted value.
+ access_array_builder uses_builder (attempt);
+ uses_builder.reserve (cc_uses.size ());
+ for (use_info *use : cc_uses)
+ if (use->def () != other_def)
+ uses_builder.quick_push (use);
+ cc_uses = use_array (uses_builder.finish ());
+
+ // Get the list of uses for the new instruction.
+ insn_change cc_change (cc_insn);
+ cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses);
+ if (!cc_change.new_uses.is_valid ())
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "-- cannot merge uses\n");
+ return false;
+ }
+
+ // The instruction initially defines just two registers. recog can add
+ // extra clobbers if necessary.
+ auto_vec<access_info *, 2> new_defs;
+ new_defs.quick_push (cc_def);
+ new_defs.quick_push (other_def);
+ sort_accesses (new_defs);
+ cc_change.new_defs = def_array (access_array (new_defs));
+
+ // Make sure there is somewhere that the new instruction could live.
+ auto other_change = insn_change::delete_insn (other_insn);
+ insn_change *changes[] = { &other_change, &cc_change };
+ cc_change.move_range = cc_insn->ebb ()->insn_range ();
+ if (!restrict_movement_ignoring (cc_change, insn_is_changing (changes)))
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "-- cannot satisfy all definitions and uses\n");
+ return false;
+ }
+
+ // Tentatively install the new pattern. By convention, the CC set
+ // must be first.
+ if (m_parallel)
+ {
+ XVECEXP (m_parallel, 0, 0) = cc_set;
+ XVECEXP (m_parallel, 0, 1) = other_set;
+ }
+ else
+ {
+ rtvec vec = gen_rtvec (2, cc_set, other_set);
+ m_parallel = gen_rtx_PARALLEL (VOIDmode, vec);
+ }
+ validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1);
+
+ // These routines report failures themselves.
+ if (!recog_ignoring (attempt, cc_change, insn_is_changing (changes))
+ || !changes_are_worthwhile (changes)
+ || !crtl->ssa->verify_insn_changes (changes))
+ return false;
+
+ remove_reg_equal_equiv_notes (cc_rtl);
+ confirm_change_group ();
+ crtl->ssa->change_insns (changes);
+ m_parallel = NULL_RTX;
+ return true;
+}
+
+// Try to optimize the instruction that contains CC_DEF, where CC_DEF describes
+// a definition of the CC register by CC_SET.
+void
+cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set)
+{
+ // Search the registers used by the CC setter for an easily-substitutable
+ // def-use chain.
+ for (use_info *other_use : cc_def->insn ()->uses ())
+ if (def_info *other_def = other_use->def ())
+ if (other_use->regno () != CC_REGNUM
+ && other_def->ebb () == cc_def->ebb ())
+ if (rtx other_set = optimizable_set (other_def->insn ()))
+ {
+ rtx dest = SET_DEST (other_set);
+ if (REG_P (dest)
+ && REGNO (dest) == other_def->regno ()
+ && REG_NREGS (dest) == 1
+ && parallelize_insns (cc_def, cc_set, other_def, other_set))
+ return;
+ }
+}
+
+// Run the pass on the current function.
+void
+cc_fusion::execute ()
+{
+ // Initialization.
+ calculate_dominance_info (CDI_DOMINATORS);
+ df_analyze ();
+ crtl->ssa = new rtl_ssa::function_info (cfun);
+
+ // Walk through all instructions that set CC. Look for a PTEST instruction
+ // that we can optimize.
+ //
+ // ??? The PTEST test isn't needed for correctness, but it ensures that the
+ // pass no effect on non-SVE code.
+ for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM))
+ if (rtx cc_set = optimizable_set (def->insn ()))
+ if (REG_P (SET_DEST (cc_set))
+ && REGNO (SET_DEST (cc_set)) == CC_REGNUM
+ && GET_CODE (SET_SRC (cc_set)) == UNSPEC
+ && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST)
+ optimize_cc_setter (def, cc_set);
+
+ // Finalization.
+ crtl->ssa->perform_pending_updates ();
+ free_dominance_info (CDI_DOMINATORS);
+}
+
+class pass_cc_fusion : public rtl_opt_pass
+{
+public:
+ pass_cc_fusion (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_cc_fusion, ctxt)
+ {}
+
+ // opt_pass methods:
+ virtual bool gate (function *) { return TARGET_SVE && optimize >= 2; }
+ virtual unsigned int execute (function *);
+};
+
+unsigned int
+pass_cc_fusion::execute (function *)
+{
+ cc_fusion ().execute ();
+ return 0;
+}
+
+} // end namespace
+
+// Create a new CC fusion pass instance.
+
+rtl_opt_pass *
+make_pass_cc_fusion (gcc::context *ctxt)
+{
+ return new pass_cc_fusion (ctxt);
+}
diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def
index 5c2bd52..0b773d2 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -22,3 +22,4 @@ INSERT_PASS_AFTER (pass_regrename, 1, pass_fma_steering);
INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation);
INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
+INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 2ef6b6d..ff87ced 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -790,6 +790,7 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *);
rtl_opt_pass *make_pass_track_speculation (gcc::context *);
rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *);
rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt);
+rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt);
poly_uint64 aarch64_regmode_natural_size (machine_mode);
diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64
index bb17eed..7e1606c 100644
--- a/gcc/config/aarch64/t-aarch64
+++ b/gcc/config/aarch64/t-aarch64
@@ -158,6 +158,12 @@ aarch64-bti-insert.o: $(srcdir)/config/aarch64/aarch64-bti-insert.c \
$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
$(srcdir)/config/aarch64/aarch64-bti-insert.c
+aarch64-cc-fusion.o: $(srcdir)/config/aarch64/aarch64-cc-fusion.cc \
+ $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \
+ $(RTL_SSA_H) tree-pass.h
+ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+ $(srcdir)/config/aarch64/aarch64-cc-fusion.cc
+
comma=,
MULTILIB_OPTIONS = $(subst $(comma),/, $(patsubst %, mabi=%, $(subst $(comma),$(comma)mabi=,$(TM_MULTILIB_CONFIG))))
MULTILIB_DIRNAMES = $(subst $(comma), ,$(TM_MULTILIB_CONFIG))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_8.c
new file mode 100644
index 0000000..b55cc1f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_8.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <arm_sve.h>
+
+void
+foo (int8_t *ptr1, int8_t *ptr2, svint8_t *res1, svint8_t *res2)
+{
+ svbool_t pg = svptrue_b8 ();
+
+ svsetffr ();
+ svint8_t x1 = svldff1 (pg, ptr1);
+ svbool_t ok1 = svrdffr ();
+ if (!svptest_last (pg, ok1))
+ {
+ x1 = svsel (ok1, x1, svdup_s8 (0));
+ svsetffr ();
+ }
+
+ svint8_t x2 = svldff1 (pg, ptr2);
+ svbool_t ok2 = svrdffr ();
+ if (!svptest_last (pg, ok2))
+ x2 = svsel (ok2, x2, svdup_s8 (0));
+
+ *res1 = x1;
+ *res2 = x2;
+}
+
+/* { dg-final { scan-assembler-times {\trdffrs\t} 2 } } */
+/* { dg-final { scan-assembler-times {\t(?:b.last|b.nfrst)\t} 2 } } */
+/* { dg-final { scan-assembler-not {\trdffr\t} } } */
+/* { dg-final { scan-assembler-not {\tptest\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ptest_1.c b/gcc/testsuite/gcc.target/aarch64/sve/ptest_1.c
new file mode 100644
index 0000000..abc5108
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/ptest_1.c
@@ -0,0 +1,10 @@
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+void
+f (char *restrict x, char *restrict y, char *restrict z, unsigned long n)
+{
+ for (unsigned long i = 0; i < n; i += 1)
+ x[i] = y[i] + z[i];
+}
+
+/* { dg-final { scan-assembler-not {\tptest\t} } } */