tree-optimization/98855 - redo BB vectorization costing

The following attempts to account for the fact that BB vectorization
regions now can span multiple loop levels and that an unprofitable
inner loop vectorization shouldn't be offset by a profitable
outer loop vectorization to make it overall profitable.

For now I've implemented a heuristic based on the premise that
vectorization should be profitable even if loops may not be entered
or if they iterate any number of times.  Especially the first
assumption then requires that stmts directly belonging to loop A
need to be costed separately from stmts belonging to another loop,
which also simplifies the implementation.

On x86 the added testcase has in the outer loop

  t.c:38:20: note: Cost model analysis for part in loop 1:
    Vector cost: 56
    Scalar cost: 192

and the inner loop

  t.c:38:20: note: Cost model analysis for part in loop 2:
    Vector cost: 132
    Scalar cost: 48

and thus the vectorization is considered not profitable
(note the same would happen in case the 2nd cost were for
a loop outer to the 1st costing).

Future enhancements may consider static knowledge of whether
a loop is always entered, which would allow some inefficiency
in the vectorization of its loop header.  Likewise stmts only
reachable from a loop exit can be treated this way.

2021-02-05  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/98855
	* tree-vectorizer.h (add_stmt_cost): New overload.
	* tree-vect-slp.c (li_cost_vec_cmp): New.
	(vect_bb_slp_scalar_cost): Cost individual loop
	regions separately.  Account for the scalar instance root stmt.

	* g++.dg/vect/slp-pr98855.cc: New testcase.
author: Richard Biener <rguenther@suse.de> 2021-02-05 09:54:00 +0100
committer: Richard Biener <rguenther@suse.de> 2021-02-05 14:03:00 +0100
commit: 63538886d1f7fc7cbf066b4c2d6d7fd4da537259 (patch)
tree: 0890b535b876a493d6284ea5a6132a0a0e4da400 /gcc
parent: 4ede02a5f2af1205434f0e05aaaeff762b24e329 (diff)
download: gcc-63538886d1f7fc7cbf066b4c2d6d7fd4da537259.zip
gcc-63538886d1f7fc7cbf066b4c2d6d7fd4da537259.tar.gz
gcc-63538886d1f7fc7cbf066b4c2d6d7fd4da537259.tar.bz2
3 files changed, 232 insertions, 34 deletions
diff --git a/gcc/testsuite/g++.dg/vect/slp-pr98855.cc b/gcc/testsuite/g++.dg/vect/slp-pr98855.cc
new file mode 100644
index 0000000..0b4e479
--- /dev/null
+++ b/gcc/testsuite/g++.dg/vect/slp-pr98855.cc
@@ -0,0 +1,84 @@
+// { dg-do compile }
+// { dg-additional-options "-fvect-cost-model=cheap" }
+// { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } }
+
+#include <stdint.h>
+#include <stdlib.h>
+
+inline uint32_t make_uint32(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3)
+{ // Packs four bytes into one 32-bit value; i0 ends up in the top byte.
+  return ((static_cast<uint32_t>(i0) << 24) |  // bits 31..24
+	  (static_cast<uint32_t>(i1) << 16) |  // bits 23..16
+	  (static_cast<uint32_t>(i2) <<  8) |  // bits 15..8
+	  (static_cast<uint32_t>(i3)));        // bits 7..0
+}
+
+inline uint32_t load_be(const uint8_t in[], size_t off)
+{ // Returns the OFF-th 32-bit word of IN, first byte most significant.
+  in += off * sizeof(uint32_t);  // OFF counts 32-bit words, not bytes
+  return make_uint32(in[0], in[1], in[2], in[3]);
+}
+
+template<typename T>
+inline void load_be(const uint8_t in[],
+		    T& x0, T& x1, T& x2, T& x3,
+		    T& x4, T& x5, T& x6, T& x7)
+{ // Fills x0..x7 with 32-bit words 0..7 of IN via the scalar load_be.
+  x0 = load_be(in, 0);
+  x1 = load_be(in, 1);
+  x2 = load_be(in, 2);
+  x3 = load_be(in, 3);
+  x4 = load_be(in, 4);
+  x5 = load_be(in, 5);
+  x6 = load_be(in, 6);
+  x7 = load_be(in, 7);
+}
+
+inline void store_be(uint32_t in, uint8_t out[4])
+{ // Writes the byte-swapped value of IN to the four bytes at OUT.
+  uint32_t o = __builtin_bswap32 (in);
+  __builtin_memcpy (out, &o, sizeof (uint32_t));  // copy the swapped word
+}
+
+template<typename T>
+inline void store_be(uint8_t out[], T x0, T x1, T x2, T x3,
+		     T x4, T x5, T x6, T x7)
+{ // Stores x0..x7 at consecutive sizeof(T)-byte offsets of OUT.
+  store_be(x0, out + (0 * sizeof(T)));
+  store_be(x1, out + (1 * sizeof(T)));
+  store_be(x2, out + (2 * sizeof(T)));
+  store_be(x3, out + (3 * sizeof(T)));
+  store_be(x4, out + (4 * sizeof(T)));
+  store_be(x5, out + (5 * sizeof(T)));
+  store_be(x6, out + (6 * sizeof(T)));
+  store_be(x7, out + (7 * sizeof(T)));
+}
+
+#define BLOCK_SIZE 8
+void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, uint32_t *EK)
+{ // Outer loop over groups of four blocks with a 32-round inner loop.
+  const size_t blocks4 = blocks / 4;  // number of outer iterations
+
+  for (size_t i = 0; i < blocks4; i++)
+    { // Per group: load eight words, run the rounds, store them back.
+      uint32_t L0, R0, L1, R1, L2, R2, L3, R3;
+      load_be(in + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3);
+
+      for(size_t r = 0; r != 32; ++r)
+	{ // Lanes 0..3 are independent; L is updated from R, then R from L.
+	  L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*r];
+	  L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*r];
+	  L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*r];
+	  L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*r];
+
+	  R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*r+1];
+	  R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*r+1];
+	  R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*r+1];
+	  R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*r+1];
+	}
+
+      store_be(out + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3);
+    }
+}
+
+// { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 2 "slp1" { target x86_64-*-* i?86-*-* } } }
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 2305bbd..b9f12c3 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -4340,6 +4340,20 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
     }
 }
 
+/* qsort comparator ordering <loop index, cost entry> pairs by loop index.  */
+
+static int
+li_cost_vec_cmp (const void *a_, const void *b_)
+{
+  auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
+  auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
+  if (a->first < b->first)  /* Only the loop index is compared.  */
+    return -1;
+  else if (a->first == b->first)  /* Same loop: entries compare equal.  */
+    return 0;
+  return 1;
+}
+
 /* Check if vectorization of the basic block is profitable for the
    subgraph denoted by SLP_INSTANCES.  */
 
@@ -4352,61 +4366,152 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
 
-  void *vect_target_cost_data = init_cost (NULL);
-
   /* Calculate scalar cost and sum the cost for the vector stmts
      previously collected.  */
-  stmt_vector_for_cost scalar_costs;
-  scalar_costs.create (0);
+  stmt_vector_for_cost scalar_costs = vNULL;
+  stmt_vector_for_cost vector_costs = vNULL;
   hash_set<slp_tree> visited;
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
       auto_vec<bool, 20> life;
       life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
 			      true);
+      if (SLP_INSTANCE_ROOT_STMT (instance))
+	record_stmt_cost (&scalar_costs, 1, scalar_stmt,
+			  SLP_INSTANCE_ROOT_STMT (instance), 0, vect_body);
       vect_bb_slp_scalar_cost (bb_vinfo,
 			       SLP_INSTANCE_TREE (instance),
 			       &life, &scalar_costs, visited);
-      add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
+      vector_costs.safe_splice (instance->cost_vec);
       instance->cost_vec.release ();
     }
   /* Unset visited flag.  */
-  stmt_info_for_cost *si;
-  FOR_EACH_VEC_ELT (scalar_costs, i, si)
-    gimple_set_visited  (si->stmt_info->stmt, false);
+  stmt_info_for_cost *cost;
+  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
+    gimple_set_visited  (cost->stmt_info->stmt, false);
 
-  void *scalar_target_cost_data = init_cost (NULL);
-  add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
-  scalar_costs.release ();
-  unsigned dummy;
-  finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
-  destroy_cost_data (scalar_target_cost_data);
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
+
+  /* When costing non-loop vectorization we need to consider each covered
+     loop independently and make sure vectorization is profitable.  For
+     now we assume a loop may not be entered or may execute an arbitrary
+     number of iterations (???  static information can provide more
+     precise info here) which means we can simply cost each containing
+     loop's stmts separately.  */
+
+  /* First produce cost vectors sorted by loop index.  */
+  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
+    li_scalar_costs (scalar_costs.length ());
+  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
+    li_vector_costs (vector_costs.length ());
+  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
+    {
+      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
+      li_scalar_costs.quick_push (std::make_pair (l, cost));
+    }
+  /* Use a random used loop as fallback in case the first vector_costs
+     entry does not have a stmt_info associated with it.  */
+  unsigned l = li_scalar_costs[0].first;
+  FOR_EACH_VEC_ELT (vector_costs, i, cost)
+    {
+      /* We inherit from the previous COST, invariants, externals and
+	 extracts immediately follow the cost for the related stmt.  */
+      if (cost->stmt_info)
+	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
+      li_vector_costs.quick_push (std::make_pair (l, cost));
+    }
+  li_scalar_costs.qsort (li_cost_vec_cmp);
+  li_vector_costs.qsort (li_cost_vec_cmp);
+
+  /* Now cost the portions individually.  */
+  unsigned vi = 0;
+  unsigned si = 0;
+  do
+    {
+      unsigned sl = li_scalar_costs[si].first;
+      unsigned vl = li_vector_costs[vi].first;
+      if (sl != vl)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Scalar %d and vector %d loop part do not "
+			     "match up, skipping scalar part\n", sl, vl);
+	  /* Skip the scalar part, assuming zero cost on the vector side.  */
+	  do
+	    {
+	      si++;
+	    }
+	  while (si < li_scalar_costs.length ()
+		 && li_scalar_costs[si].first == sl);
+	  continue;
+	}
 
-  /* Complete the target-specific vector cost calculation.  */
-  finish_cost (vect_target_cost_data, &vec_prologue_cost,
-	       &vec_inside_cost, &vec_epilogue_cost);
-  destroy_cost_data (vect_target_cost_data);
+      void *scalar_target_cost_data = init_cost (NULL);
+      do
+	{
+	  add_stmt_cost (bb_vinfo, scalar_target_cost_data,
+			 li_scalar_costs[si].second);
+	  si++;
+	}
+      while (si < li_scalar_costs.length ()
+	     && li_scalar_costs[si].first == sl);
+      unsigned dummy;
+      finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
+      destroy_cost_data (scalar_target_cost_data);
+
+      /* Complete the target-specific vector cost calculation.  */
+      void *vect_target_cost_data = init_cost (NULL);
+      do
+	{
+	  add_stmt_cost (bb_vinfo, vect_target_cost_data,
+			 li_vector_costs[vi].second);
+	  vi++;
+	}
+      while (vi < li_vector_costs.length ()
+	     && li_vector_costs[vi].first == vl);
+      finish_cost (vect_target_cost_data, &vec_prologue_cost,
+		   &vec_inside_cost, &vec_epilogue_cost);
+      destroy_cost_data (vect_target_cost_data);
 
-  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
+      vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
 
-  if (dump_enabled_p ())
+      if (dump_enabled_p ())
+	{
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "Cost model analysis for part in loop %d:\n", sl);
+	  dump_printf (MSG_NOTE, "  Vector cost: %d\n",
+		       vec_inside_cost + vec_outside_cost);
+	  dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
+	}
+
+      /* Vectorization is not profitable if its cost is more than the cost
+	 of the scalar version.  Note that we err on the vector side for equal
+	 cost because the cost estimate is otherwise quite pessimistic
+	 (constant uses are free on the scalar side but cost a load on the
+	 vector side for example).  */
+      if (vec_outside_cost + vec_inside_cost > scalar_cost)
+	{
+	  scalar_costs.release ();
+	  vector_costs.release ();
+	  return false;
+	}
+    }
+  while (si < li_scalar_costs.length ()
+	 && vi < li_vector_costs.length ());
+  if (vi < li_vector_costs.length ())
     {
-      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
-      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
-		   vec_inside_cost);
-      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
-      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
-      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "Excess vector cost for part in loop %d:\n",
+			 li_vector_costs[vi].first);
+      scalar_costs.release ();
+      vector_costs.release ();
+      return false;
     }
 
-  /* Vectorization is profitable if its cost is more than the cost of scalar
-     version.  Note that we err on the vector side for equal cost because
-     the cost estimate is otherwise quite pessimistic (constant uses are
-     free on the scalar side but cost a load on the vector side for
-     example).  */
-  if (vec_outside_cost + vec_inside_cost > scalar_cost)
-    return false;
-
+  scalar_costs.release ();
+  vector_costs.release ();
   return true;
 }
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e564fcf..b861c97 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1481,6 +1481,15 @@ add_stmt_cost (vec_info *vinfo, void *data, int count,
   return cost;
 }
 
+/* Overload of add_stmt_cost that unpacks the recorded cost entry *I.  */
+
+static inline unsigned
+add_stmt_cost (vec_info *vinfo, void *data, stmt_info_for_cost *i)
+{
+  return add_stmt_cost (vinfo, data, i->count, i->kind, i->stmt_info,
+			i->vectype, i->misalign, i->where);
+}
+
 /* Alias targetm.vectorize.finish_cost.  */
 
 static inline void
author	Richard Biener <rguenther@suse.de>	2021-02-05 09:54:00 +0100
committer	Richard Biener <rguenther@suse.de>	2021-02-05 14:03:00 +0100
commit	63538886d1f7fc7cbf066b4c2d6d7fd4da537259 (patch)
tree	0890b535b876a493d6284ea5a6132a0a0e4da400 /gcc
parent	4ede02a5f2af1205434f0e05aaaeff762b24e329 (diff)
download	gcc-63538886d1f7fc7cbf066b4c2d6d7fd4da537259.zip gcc-63538886d1f7fc7cbf066b4c2d6d7fd4da537259.tar.gz gcc-63538886d1f7fc7cbf066b4c2d6d7fd4da537259.tar.bz2