Handle SLP permutations for variable-length vectors

The SLP code currently punts for all variable-length permutes. This patch makes it handle the easy case of N->N permutes in which the number of vector lanes is a multiple of N. Every permute then uses the same mask, and that mask repeats (with a stride) every N elements. The patch uses the same path for constant-length vectors, since it should be slightly cheaper in terms of compile time. 2018-08-24 Richard Sandiford <richard.sandiford@arm.com> gcc/ * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out the case in which the permute needs only a single element and repeats for every vector of the result. Extend that case to handle variable-length vectors. * tree-vect-stmts.c (vectorizable_load): Update accordingly. gcc/testsuite/ * gcc.target/aarch64/sve/slp_perm_1.c: New test. * gcc.target/aarch64/sve/slp_perm_2.c: Likewise. * gcc.target/aarch64/sve/slp_perm_3.c: Likewise. * gcc.target/aarch64/sve/slp_perm_4.c: Likewise. * gcc.target/aarch64/sve/slp_perm_5.c: Likewise. * gcc.target/aarch64/sve/slp_perm_6.c: Likewise. * gcc.target/aarch64/sve/slp_perm_7.c: Likewise. From-SVN: r263832
author: Richard Sandiford <richard.sandiford@arm.com> 2018-08-24 13:05:36 +0000
committer: Richard Sandiford <rsandifo@gcc.gnu.org> 2018-08-24 13:05:36 +0000
commit: ab7e60cec1a6f4185b0428f3a2b3e71df0bae533 (patch)
tree: c666f7a31ae111b41e904e26bb4df9009c6db203 /gcc/tree-vect-slp.c
parent: 1ade64c9d8cd37c8db0a07383189f1719c7164da (diff)
download: gcc-ab7e60cec1a6f4185b0428f3a2b3e71df0bae533.zip
gcc-ab7e60cec1a6f4185b0428f3a2b3e71df0bae533.tar.gz
gcc-ab7e60cec1a6f4185b0428f3a2b3e71df0bae533.tar.bz2
1 files changed, 88 insertions, 62 deletions
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 0a9ce24..0ab7bd8 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -3606,13 +3606,11 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 {
   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   vec_info *vinfo = stmt_info->vinfo;
-  tree mask_element_type = NULL_TREE, mask_type;
   int vec_index = 0;
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
+  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
   unsigned int mask_element;
   machine_mode mode;
-  unsigned HOST_WIDE_INT nunits, const_vf;
 
   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
     return false;
@@ -3620,22 +3618,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
   stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
 
   mode = TYPE_MODE (vectype);
-
-  /* At the moment, all permutations are represented using per-element
-     indices, so we can't cope with variable vector lengths or
-     vectorization factors.  */
-  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
-      || !vf.is_constant (&const_vf))
-    return false;
-
-  /* The generic VEC_PERM_EXPR code always uses an integral type of the
-     same size as the vector element being permuted.  */
-  mask_element_type = lang_hooks.types.type_for_mode
-    (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
-  mask_type = get_vectype_for_scalar_type (mask_element_type);
-  vec_perm_builder mask (nunits, nunits, 1);
-  mask.quick_grow (nunits);
-  vec_perm_indices indices;
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
 
   /* Initialize the vect stmts of NODE to properly insert the generated
      stmts later.  */
@@ -3669,14 +3652,53 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
   bool noop_p = true;
   *n_perms = 0;
 
-  for (unsigned int j = 0; j < const_vf; j++)
+  vec_perm_builder mask;
+  unsigned int nelts_to_build;
+  unsigned int nvectors_per_build;
+  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
+		      && multiple_p (nunits, group_size));
+  if (repeating_p)
     {
-      for (int k = 0; k < group_size; k++)
+      /* A single vector contains a whole number of copies of the node, so:
+	 (a) all permutes can use the same mask; and
+	 (b) the permutes only need a single vector input.  */
+      mask.new_vector (nunits, group_size, 3);
+      nelts_to_build = mask.encoded_nelts ();
+      nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
+    }
+  else
+    {
+      /* We need to construct a separate mask for each vector statement.  */
+      unsigned HOST_WIDE_INT const_nunits, const_vf;
+      if (!nunits.is_constant (&const_nunits)
+	  || !vf.is_constant (&const_vf))
+	return false;
+      mask.new_vector (const_nunits, const_nunits, 1);
+      nelts_to_build = const_vf * group_size;
+      nvectors_per_build = 1;
+    }
+
+  unsigned int count = mask.encoded_nelts ();
+  mask.quick_grow (count);
+  vec_perm_indices indices;
+
+  for (unsigned int j = 0; j < nelts_to_build; j++)
+    {
+      unsigned int iter_num = j / group_size;
+      unsigned int stmt_num = j % group_size;
+      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
+			+ SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
+      if (repeating_p)
 	{
-	  unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
-			    + j * DR_GROUP_SIZE (stmt_info));
-	  vec_index = i / nunits;
-	  mask_element = i % nunits;
+	  first_vec_index = 0;
+	  mask_element = i;
+	}
+      else
+	{
+	  /* Enforced before the loop when !repeating_p.  */
+	  unsigned int const_nunits = nunits.to_constant ();
+	  vec_index = i / const_nunits;
+	  mask_element = i % const_nunits;
 	  if (vec_index == first_vec_index
 	      || first_vec_index == -1)
 	    {
@@ -3686,7 +3708,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 		   || second_vec_index == -1)
 	    {
 	      second_vec_index = vec_index;
-	      mask_element += nunits;
+	      mask_element += const_nunits;
 	    }
 	  else
 	    {
@@ -3702,50 +3724,54 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 	      return false;
 	    }
 
-	  gcc_assert (mask_element < 2 * nunits);
-	  if (mask_element != index)
-	    noop_p = false;
-	  mask[index++] = mask_element;
+	  gcc_assert (mask_element < 2 * const_nunits);
+	}
+
+      if (mask_element != index)
+	noop_p = false;
+      mask[index++] = mask_element;
 
-	  if (index == nunits && !noop_p)
+      if (index == count && !noop_p)
+	{
+	  indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
+	  if (!can_vec_perm_const_p (mode, indices))
 	    {
-	      indices.new_vector (mask, 2, nunits);
-	      if (!can_vec_perm_const_p (mode, indices))
+	      if (dump_enabled_p ())
 		{
-		  if (dump_enabled_p ())
+		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+				   vect_location,
+				   "unsupported vect permute { ");
+		  for (i = 0; i < count; ++i)
 		    {
-		      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-				       vect_location, 
-				       "unsupported vect permute { ");
-		      for (i = 0; i < nunits; ++i)
-			{
-			  dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
-			  dump_printf (MSG_MISSED_OPTIMIZATION, " ");
-			}
-		      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
+		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
+		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
 		    }
-		  gcc_assert (analyze_only);
-		  return false;
+		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
 		}
-
-	      ++*n_perms;
+	      gcc_assert (analyze_only);
+	      return false;
 	    }
 
-	  if (index == nunits)
+	  ++*n_perms;
+	}
+
+      if (index == count)
+	{
+	  if (!analyze_only)
 	    {
-	      if (!analyze_only)
-		{
-		  tree mask_vec = NULL_TREE;
+	      tree mask_vec = NULL_TREE;
 		  
-		  if (! noop_p)
-		    mask_vec = vec_perm_indices_to_tree (mask_type, indices);
+	      if (! noop_p)
+		mask_vec = vect_gen_perm_mask_checked (vectype, indices);
 
-		  if (second_vec_index == -1)
-		    second_vec_index = first_vec_index;
+	      if (second_vec_index == -1)
+		second_vec_index = first_vec_index;
 
+	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
+		{
 		  /* Generate the permute statement if necessary.  */
-		  tree first_vec = dr_chain[first_vec_index];
-		  tree second_vec = dr_chain[second_vec_index];
+		  tree first_vec = dr_chain[first_vec_index + ri];
+		  tree second_vec = dr_chain[second_vec_index + ri];
 		  stmt_vec_info perm_stmt_info;
 		  if (! noop_p)
 		    {
@@ -3771,12 +3797,12 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 		  SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
 		    = perm_stmt_info;
 		}
-
-	      index = 0;
-	      first_vec_index = -1;
-	      second_vec_index = -1;
-	      noop_p = true;
 	    }
+
+	  index = 0;
+	  first_vec_index = -1;
+	  second_vec_index = -1;
+	  noop_p = true;
 	}
     }
author	Richard Sandiford <richard.sandiford@arm.com>	2018-08-24 13:05:36 +0000
committer	Richard Sandiford <rsandifo@gcc.gnu.org>	2018-08-24 13:05:36 +0000
commit	ab7e60cec1a6f4185b0428f3a2b3e71df0bae533 (patch)
tree	c666f7a31ae111b41e904e26bb4df9009c6db203 /gcc/tree-vect-slp.c
parent	1ade64c9d8cd37c8db0a07383189f1719c7164da (diff)
download	gcc-ab7e60cec1a6f4185b0428f3a2b3e71df0bae533.zip gcc-ab7e60cec1a6f4185b0428f3a2b3e71df0bae533.tar.gz gcc-ab7e60cec1a6f4185b0428f3a2b3e71df0bae533.tar.bz2