1 files changed, 296 insertions, 0 deletions
diff --git a/gcc/config/aarch64/aarch64-cc-fusion.cc b/gcc/config/aarch64/aarch64-cc-fusion.cc
new file mode 100644
index 0000000..09069a2
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-cc-fusion.cc
@@ -0,0 +1,296 @@
+// Pass to fuse CC operations with other instructions.
+// Copyright (C) 2021 Free Software Foundation, Inc.
+//
+// This file is part of GCC.
+//
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+//
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// This pass looks for sequences of the form:
+//
+//    A: (set (reg R1) X1)
+//    B: ...instructions that might change the value of X1...
+//    C: (set (reg CC) X2) // X2 uses R1
+//
+// and tries to change them to:
+//
+//    C': [(set (reg CC) X2')
+//         (set (reg R1) X1)]
+//    B: ...instructions that might change the value of X1...
+//
+// where X2' is the result of replacing R1 with X1 in X2.
+//
+// This sequence occurs in SVE code in two important cases:
+//
+// (a) Sometimes, to deal correctly with overflow, we need to increment
+//     an IV after a WHILELO rather than before it.  In this case:
+//     - A is a WHILELO,
+//     - B includes an IV increment and
+//     - C is a separate PTEST.
+//
+// (b) ACLE code of the form:
+//
+//       svbool_t ok = svrdffr ();
+//       if (svptest_last (pg, ok))
+//         ...
+//
+//     must, for performance reasons, be code-generated as:
+//
+//       RDFFRS Pok.B, Pg/Z
+//       ...branch on flags result...
+//
+//     without a separate PTEST of Pok.  In this case:
+//     - A is an aarch64_rdffr
+//     - B includes an aarch64_update_ffrt
+//     - C is a separate PTEST
+//
+// Combine can handle this optimization if B doesn't exist and if A and
+// C are in the same BB.  This pass instead handles cases where B does
+// exist and cases where A and C are in different BBs of the same EBB.
+
+#define IN_TARGET_CODE 1
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "df.h"
+#include "rtl-ssa.h"
+#include "tree-pass.h"
+
+using namespace rtl_ssa;
+
+namespace {
+const pass_data pass_data_cc_fusion =
+{
+  RTL_PASS, // type
+  "cc_fusion", // name
+  OPTGROUP_NONE, // optinfo_flags
+  TV_NONE, // tv_id
+  0, // properties_required
+  0, // properties_provided
+  0, // properties_destroyed
+  0, // todo_flags_start
+  TODO_df_finish, // todo_flags_finish
+};
+
+// Class that represents one run of the pass.
+class cc_fusion
+{
+public:
+  cc_fusion ()  : m_parallel () {}
+  void execute ();
+
+private:
+  rtx optimizable_set (const insn_info *);
+  bool parallelize_insns (def_info *, rtx, def_info *, rtx);
+  void optimize_cc_setter (def_info *, rtx);
+
+  // A spare PARALLEL rtx, or null if none.
+  rtx m_parallel;
+};
+
+// See whether INSN is a single_set that we can optimize.  Return the
+// set if so, otherwise return null.
+rtx
+cc_fusion::optimizable_set (const insn_info *insn)
+{
+  if (!insn->can_be_optimized ()
+      || insn->is_asm ()
+      || insn->has_volatile_refs ()
+      || insn->has_pre_post_modify ())
+    return NULL_RTX;
+
+  return single_set (insn->rtl ());
+}
+
+// CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise
+// a single_set that sets (only) OTHER_DEF.  CC_SET is known to set the
+// CC register and the instruction that contains CC_SET is known to use
+// OTHER_DEF.  Try to do CC_SET and OTHER_SET in parallel.
+bool
+cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set,
+			      def_info *other_def, rtx other_set)
+{
+  auto attempt = crtl->ssa->new_change_attempt ();
+
+  insn_info *cc_insn = cc_def->insn ();
+  insn_info *other_insn = other_def->insn ();
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "trying to parallelize insn %d and insn %d\n",
+	     other_insn->uid (), cc_insn->uid ());
+
+  // Try to substitute OTHER_SET into CC_INSN.
+  insn_change_watermark rtl_watermark;
+  rtx_insn *cc_rtl = cc_insn->rtl ();
+  insn_propagation prop (cc_rtl, SET_DEST (other_set),
+			 SET_SRC (other_set));
+  if (!prop.apply_to_pattern (&PATTERN (cc_rtl))
+      || prop.num_replacements == 0)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "-- failed to substitute all uses of r%d\n",
+		 other_def->regno ());
+      return false;
+    }
+
+  // Restrict the uses to those outside notes.
+  use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ());
+  use_array other_set_uses = remove_note_accesses (attempt,
+						   other_insn->uses ());
+
+  // Remove the use of the substituted value.
+  access_array_builder uses_builder (attempt);
+  uses_builder.reserve (cc_uses.size ());
+  for (use_info *use : cc_uses)
+    if (use->def () != other_def)
+      uses_builder.quick_push (use);
+  cc_uses = use_array (uses_builder.finish ());
+
+  // Get the list of uses for the new instruction.
+  insn_change cc_change (cc_insn);
+  cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses);
+  if (!cc_change.new_uses.is_valid ())
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "-- cannot merge uses\n");
+      return false;
+    }
+
+  // The instruction initially defines just two registers.  recog can add
+  // extra clobbers if necessary.
+  auto_vec<access_info *, 2> new_defs;
+  new_defs.quick_push (cc_def);
+  new_defs.quick_push (other_def);
+  sort_accesses (new_defs);
+  cc_change.new_defs = def_array (access_array (new_defs));
+
+  // Make sure there is somewhere that the new instruction could live.
+  auto other_change = insn_change::delete_insn (other_insn);
+  insn_change *changes[] = { &other_change, &cc_change };
+  cc_change.move_range = cc_insn->ebb ()->insn_range ();
+  if (!restrict_movement_ignoring (cc_change, insn_is_changing (changes)))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "-- cannot satisfy all definitions and uses\n");
+      return false;
+    }
+
+  // Tentatively install the new pattern.  By convention, the CC set
+  // must be first.
+  if (m_parallel)
+    {
+      XVECEXP (m_parallel, 0, 0) = cc_set;
+      XVECEXP (m_parallel, 0, 1) = other_set;
+    }
+  else
+    {
+      rtvec vec = gen_rtvec (2, cc_set, other_set);
+      m_parallel = gen_rtx_PARALLEL (VOIDmode, vec);
+    }
+  validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1);
+
+  // These routines report failures themselves.
+  if (!recog_ignoring (attempt, cc_change, insn_is_changing (changes))
+      || !changes_are_worthwhile (changes)
+      || !crtl->ssa->verify_insn_changes (changes))
+    return false;
+
+  remove_reg_equal_equiv_notes (cc_rtl);
+  confirm_change_group ();
+  crtl->ssa->change_insns (changes);
+  m_parallel = NULL_RTX;
+  return true;
+}
+
+// Try to optimize the instruction that contains CC_DEF, where CC_DEF describes
+// a definition of the CC register by CC_SET.
+void
+cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set)
+{
+  // Search the registers used by the CC setter for an easily-substitutable
+  // def-use chain.
+  for (use_info *other_use : cc_def->insn ()->uses ())
+    if (def_info *other_def = other_use->def ())
+      if (other_use->regno () != CC_REGNUM
+	  && other_def->ebb () == cc_def->ebb ())
+	if (rtx other_set = optimizable_set (other_def->insn ()))
+	  {
+	    rtx dest = SET_DEST (other_set);
+	    if (REG_P (dest)
+		&& REGNO (dest) == other_def->regno ()
+		&& REG_NREGS (dest) == 1
+		&& parallelize_insns (cc_def, cc_set, other_def, other_set))
+	      return;
+	  }
+}
+
+// Run the pass on the current function.
+void
+cc_fusion::execute ()
+{
+  // Initialization.
+  calculate_dominance_info (CDI_DOMINATORS);
+  df_analyze ();
+  crtl->ssa = new rtl_ssa::function_info (cfun);
+
+  // Walk through all instructions that set CC.  Look for a PTEST instruction
+  // that we can optimize.
+  //
+  // ??? The PTEST test isn't needed for correctness, but it ensures that the
+  // pass no effect on non-SVE code.
+  for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM))
+    if (rtx cc_set = optimizable_set (def->insn ()))
+      if (REG_P (SET_DEST (cc_set))
+	  && REGNO (SET_DEST (cc_set)) == CC_REGNUM
+	  && GET_CODE (SET_SRC (cc_set)) == UNSPEC
+	  && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST)
+	optimize_cc_setter (def, cc_set);
+
+  // Finalization.
+  crtl->ssa->perform_pending_updates ();
+  free_dominance_info (CDI_DOMINATORS);
+}
+
+class pass_cc_fusion : public rtl_opt_pass
+{
+public:
+  pass_cc_fusion (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_cc_fusion, ctxt)
+  {}
+
+  // opt_pass methods:
+  virtual bool gate (function *) { return TARGET_SVE && optimize >= 2; }
+  virtual unsigned int execute (function *);
+};
+
+unsigned int
+pass_cc_fusion::execute (function *)
+{
+  cc_fusion ().execute ();
+  return 0;
+}
+
+} // end namespace
+
+// Create a new CC fusion pass instance.
+
+rtl_opt_pass *
+make_pass_cc_fusion (gcc::context *ctxt)
+{
+  return new pass_cc_fusion (ctxt);
+}