aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTamar Christina <tamar.christina@arm.com>2023-08-04 13:48:35 +0100
committerTamar Christina <tamar.christina@arm.com>2023-08-04 13:48:35 +0100
commit8787b195f014883a2d2454faf2001303dfefef8c (patch)
tree1f7555013b6ac121d5e84266ab9efca02a1db53d
parent0e5205912994fbc43719b43282a62bb35957f8a2 (diff)
downloadgcc-8787b195f014883a2d2454faf2001303dfefef8c.zip
gcc-8787b195f014883a2d2454faf2001303dfefef8c.tar.gz
gcc-8787b195f014883a2d2454faf2001303dfefef8c.tar.bz2
AArch64: update costing for combining vector conditionals
Boolean comparisons have different cost depending on the mode. e.g. for SVE, a && b doesn't require an additional instruction when a or b is predicated, by combining the predicate of the one operation into the second one. At the moment though we only fuse compares, so this update requires one of the operands to be a comparison. Scalars also don't require this because in the non-ifcvt variant the code is a series of branches, and the branch sequences themselves act as natural ANDs. Advanced SIMD however does require an actual AND to combine the boolean values. As such this patch discounts Scalar and SVE boolean operation latency and throughput. With this patch comparison-heavy code prefers SVE as it should, especially in cases with SVE VL == Advanced SIMD VL, where previously the SVE prologue costs would tip it towards Advanced SIMD. gcc/ChangeLog: * config/aarch64/aarch64.cc (aarch64_bool_compound_p): New. (aarch64_adjust_stmt_cost, aarch64_vector_costs::count_ops): Use it.
-rw-r--r--gcc/config/aarch64/aarch64.cc59
1 file changed, 57 insertions, 2 deletions
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 53fbecb..7cd230c 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16447,6 +16447,49 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
return false;
}
+/* Return true if STMT_INFO is the second part of a two-statement boolean AND
+ expression sequence that might be suitable for fusing into a
+ single instruction. If VEC_FLAGS is zero, analyze the operation as
+ a scalar one, otherwise analyze it as an operation on vectors with those
+ VEC_* flags. */
+
+static bool
+aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
+ unsigned int vec_flags)
+{
+ gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
+ if (!assign
+ || gimple_assign_rhs_code (assign) != BIT_AND_EXPR
+ || !STMT_VINFO_VECTYPE (stmt_info)
+ || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info)))
+ return false;
+
+ for (int i = 1; i < 3; ++i)
+ {
+ tree rhs = gimple_op (assign, i);
+
+ if (TREE_CODE (rhs) != SSA_NAME)
+ continue;
+
+ stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
+ if (!def_stmt_info
+ || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
+ continue;
+
+ gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
+ if (!rhs_assign
+ || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign))
+ != tcc_comparison)
+ continue;
+
+ if (vec_flags & VEC_ADVSIMD)
+ return false;
+
+ return true;
+ }
+ return false;
+}
+
/* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
in-loop reduction that SVE supports directly, return its latency in cycles,
otherwise return zero. SVE_COSTS specifies the latencies of the relevant
@@ -16744,10 +16787,16 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
}
gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
- if (assign && !vect_is_reduction (stmt_info))
+ if (assign)
{
/* For MLA we need to reduce the cost since MLA is 1 instruction. */
- if (aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
+ if (!vect_is_reduction (stmt_info)
+ && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
+ return 0;
+
+ /* For vector boolean ANDs with a compare operand we just need
+ one insn. */
+ if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
return 0;
}
@@ -16823,6 +16872,12 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
return;
+ /* Assume that bool AND with compare operands will become a single
+ operation. */
+ if (stmt_info
+ && aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags))
+ return;
+
/* Count the basic operation cost associated with KIND. */
switch (kind)
{