Diffstat (limited to 'gcc')
-rw-r--r--  gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c |  16
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-46.c         |   2
-rw-r--r--  gcc/tree-vect-slp.cc                       |  51
-rw-r--r--  gcc/tree-vect-stmts.cc                     | 128
4 files changed, 127 insertions, 70 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
index ee12136..8cefa7f 100644
--- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
@@ -24,11 +24,17 @@ void rephase (void)
   struct site *s;
   for(i=0,s=lattice;i<sites_on_node;i++,s++)
     for(dir=0;dir<32;dir++)
-      for(j=0;j<3;j++)for(k=0;k<3;k++)
-        {
-          s->link[dir].e[j][k].real *= s->phase[dir];
-          s->link[dir].e[j][k].imag *= s->phase[dir];
-        }
+      {
+        for(j=0;j<3;j++)
+          for(k=0;k<3;k++)
+            {
+              s->link[dir].e[j][k].real *= s->phase[dir];
+              s->link[dir].e[j][k].imag *= s->phase[dir];
+            }
+        /* Avoid loop vectorizing the outer loop after unrolling
+           the inners.  */
+        __asm__ volatile ("" : : : "memory");
+      }
 }
 
 int main()
diff --git a/gcc/testsuite/gcc.dg/vect/slp-46.c b/gcc/testsuite/gcc.dg/vect/slp-46.c
index 18476a4..79ed0bb 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-46.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-46.c
@@ -94,4 +94,4 @@ main ()
 
   return 0;
 }
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_load_lanes } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index fee992d..8cb1ac1 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1286,15 +1286,19 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
         {
           if (load_p
               && rhs_code != CFN_GATHER_LOAD
-              && rhs_code != CFN_MASK_GATHER_LOAD)
+              && rhs_code != CFN_MASK_GATHER_LOAD
+              /* Not grouped loads are handled as externals for BB
+                 vectorization.  For loop vectorization we can handle
+                 splats the same we handle single element interleaving.  */
+              && (is_a <bb_vec_info> (vinfo)
+                  || stmt_info != first_stmt_info))
             {
               /* Not grouped load.  */
               if (dump_enabled_p ())
                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                  "Build SLP failed: not grouped load %G", stmt);
 
-              /* FORNOW: Not grouped loads are not supported.  */
-              if (is_a <bb_vec_info> (vinfo) && i != 0)
+              if (i != 0)
                 continue;
               /* Fatal mismatch.  */
               matches[0] = false;
@@ -1302,7 +1306,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
         }
 
       /* Not memory operation.  */
-      if (!phi_p
+      if (!load_p
+          && !phi_p
           && rhs_code.is_tree_code ()
           && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
           && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
@@ -1774,7 +1779,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
         return NULL;
 
   /* If the SLP node is a load, terminate the recursion unless masked.  */
-  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
+  if (STMT_VINFO_DATA_REF (stmt_info)
       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
     {
       if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
@@ -1798,8 +1803,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
             {
-              int load_place = vect_get_place_in_interleaving_chain
-                  (load_info, first_stmt_info);
+              int load_place;
+              if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+                load_place = vect_get_place_in_interleaving_chain
+                    (load_info, first_stmt_info);
+              else
+                load_place = 0;
               gcc_assert (load_place != -1);
               load_permutation.safe_push (load_place);
             }
@@ -5439,6 +5448,16 @@ vect_optimize_slp_pass::remove_redundant_permutations ()
                 this_load_permuted = true;
                 break;
               }
+          /* When this isn't a grouped access we know it's single element
+             and contiguous.  */
+          if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
+            {
+              if (!this_load_permuted
+                  && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
+                      || SLP_TREE_LANES (node) == 1))
+                SLP_TREE_LOAD_PERMUTATION (node).release ();
+              continue;
+            }
           stmt_vec_info first_stmt_info
             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
           if (!this_load_permuted
@@ -8129,12 +8148,16 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
   tree vectype = SLP_TREE_VECTYPE (node);
   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
   unsigned int mask_element;
+  unsigned dr_group_size;
   machine_mode mode;
 
   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    return false;
-
-  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+    dr_group_size = 1;
+  else
+    {
+      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      dr_group_size = DR_GROUP_SIZE (stmt_info);
+    }
 
   mode = TYPE_MODE (vectype);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
@@ -8175,7 +8198,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
   unsigned int nelts_to_build;
   unsigned int nvectors_per_build;
   unsigned int in_nlanes;
-  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
+  bool repeating_p = (group_size == dr_group_size
                       && multiple_p (nunits, group_size));
   if (repeating_p)
     {
@@ -8188,7 +8211,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
          it at least one to ensure the later computation for n_perms
          proceed.  */
       nvectors_per_build = nstmts > 0 ? nstmts : 1;
-      in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
+      in_nlanes = dr_group_size * 3;
     }
   else
     {
@@ -8200,7 +8223,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
       mask.new_vector (const_nunits, const_nunits, 1);
       nelts_to_build = const_vf * group_size;
       nvectors_per_build = 1;
-      in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
+      in_nlanes = const_vf * dr_group_size;
     }
   auto_sbitmap used_in_lanes (in_nlanes);
   bitmap_clear (used_in_lanes);
@@ -8214,7 +8237,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
     {
       unsigned int iter_num = j / group_size;
       unsigned int stmt_num = j % group_size;
-      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + perm[stmt_num]);
+      unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
       bitmap_set_bit (used_in_lanes, i);
       if (repeating_p)
         {
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b31971e..d642d3c 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1150,6 +1150,8 @@ vect_model_load_cost (vec_info *vinfo,
           /* If the load is permuted then the alignment is determined by
              the first group element not by the first scalar stmt DR.  */
           stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+          if (!first_stmt_info)
+            first_stmt_info = stmt_info;
           /* Record the cost for the permutation.  */
           unsigned n_perms, n_loads;
           vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
@@ -2203,12 +2205,24 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 {
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
-  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+  stmt_vec_info first_stmt_info;
+  unsigned int group_size;
+  unsigned HOST_WIDE_INT gap;
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    {
+      first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      group_size = DR_GROUP_SIZE (first_stmt_info);
+      gap = DR_GROUP_GAP (first_stmt_info);
+    }
+  else
+    {
+      first_stmt_info = stmt_info;
+      group_size = 1;
+      gap = 0;
+    }
   dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
-  unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
   bool single_element_p = (stmt_info == first_stmt_info
                            && !DR_GROUP_NEXT_ELEMENT (stmt_info));
-  unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
 
   /* True if the vectorized statements would access beyond the last
@@ -2311,11 +2325,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
               *memory_access_type = VMAT_ELEMENTWISE;
             }
         }
-      else
+      else if (cmp == 0 && loop_vinfo)
         {
-          gcc_assert (!loop_vinfo || cmp > 0);
-          *memory_access_type = VMAT_CONTIGUOUS;
+          gcc_assert (vls_type == VLS_LOAD);
+          *memory_access_type = VMAT_INVARIANT;
+          /* Invariant accesses perform only component accesses, alignment
+             is irrelevant for them.  */
+          *alignment_support_scheme = dr_unaligned_supported;
         }
+      else
+        *memory_access_type = VMAT_CONTIGUOUS;
 
   /* When we have a contiguous access across loop iterations
      but the access in the loop doesn't cover the full vector
@@ -2540,7 +2559,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
          is irrelevant for them.  */
       *alignment_support_scheme = dr_unaligned_supported;
     }
-  else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+  else if (STMT_VINFO_GROUPED_ACCESS (stmt_info) || slp_node)
     {
       if (!get_group_load_store_type (vinfo, stmt_info, vectype, slp_node,
                                       masked_p,
@@ -9464,46 +9483,6 @@ vectorizable_load (vec_info *vinfo,
       return false;
     }
 
-  if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
-    {
-      slp_perm = true;
-
-      if (!loop_vinfo)
-        {
-          /* In BB vectorization we may not actually use a loaded vector
-             accessing elements in excess of DR_GROUP_SIZE.  */
-          stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
-          group_info = DR_GROUP_FIRST_ELEMENT (group_info);
-          unsigned HOST_WIDE_INT nunits;
-          unsigned j, k, maxk = 0;
-          FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
-            if (k > maxk)
-              maxk = k;
-          tree vectype = SLP_TREE_VECTYPE (slp_node);
-          if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
-              || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
-            {
-              if (dump_enabled_p ())
-                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                 "BB vectorization with gaps at the end of "
-                                 "a load is not supported\n");
-              return false;
-            }
-        }
-
-      auto_vec<tree> tem;
-      unsigned n_perms;
-      if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
-                                         true, &n_perms))
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                             vect_location,
-                             "unsupported load permutation\n");
-          return false;
-        }
-    }
-
   /* Invalidate assumptions made by dependence analysis when vectorization
      on the unrolled body effectively re-orders stmts.  */
   if (!PURE_SLP_STMT (stmt_info)
@@ -9521,6 +9500,46 @@ vectorizable_load (vec_info *vinfo,
   else
     group_size = 1;
 
+  if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+    {
+      slp_perm = true;
+
+      if (!loop_vinfo)
+        {
+          /* In BB vectorization we may not actually use a loaded vector
+             accessing elements in excess of DR_GROUP_SIZE.  */
+          stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
+          group_info = DR_GROUP_FIRST_ELEMENT (group_info);
+          unsigned HOST_WIDE_INT nunits;
+          unsigned j, k, maxk = 0;
+          FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
+            if (k > maxk)
+              maxk = k;
+          tree vectype = SLP_TREE_VECTYPE (slp_node);
+          if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
+              || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
+            {
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                 "BB vectorization with gaps at the end of "
+                                 "a load is not supported\n");
+              return false;
+            }
+        }
+
+      auto_vec<tree> tem;
+      unsigned n_perms;
+      if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
+                                         true, &n_perms))
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+                             vect_location,
+                             "unsupported load permutation\n");
+          return false;
+        }
+    }
+
   vect_memory_access_type memory_access_type;
   enum dr_alignment_support alignment_support_scheme;
   int misalignment;
@@ -9898,10 +9917,19 @@ vectorizable_load (vec_info *vinfo,
       || (!slp && memory_access_type == VMAT_CONTIGUOUS))
     grouped_load = false;
 
-  if (grouped_load)
+  if (grouped_load
+      || (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()))
     {
-      first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
-      group_size = DR_GROUP_SIZE (first_stmt_info);
+      if (grouped_load)
+        {
+          first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+          group_size = DR_GROUP_SIZE (first_stmt_info);
+        }
+      else
+        {
+          first_stmt_info = stmt_info;
+          group_size = 1;
+        }
       /* For SLP vectorization we directly vectorize a subchain
          without permutation.  */
       if (slp && ! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
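
For context on what the tree-vect-slp.cc change enables: the comment added in vect_build_slp_tree_1 says loop SLP can now handle splats of a non-grouped load the same way as single-element interleaving, instead of failing discovery with "Build SLP failed: not grouped load". A minimal sketch of such a loop follows; it is illustrative only (names and array shapes are hypothetical, not taken from the patch or its testcases):

double x[1024], y[512];

void
splat (void)
{
  for (int i = 0; i < 512; i++)
    {
      /* Both SLP lanes of the two-store group read the same load y[i].
         The load belongs to no interleaving group, so before this
         change SLP discovery rejected it in a loop context.  */
      x[2*i] = y[i];
      x[2*i+1] = y[i];
    }
}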
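Similarly, the new "cmp == 0 && loop_vinfo" arm in get_group_load_store_type classifies zero-step loads in a loop as VMAT_INVARIANT rather than asserting. A hedged sketch of a zero-step access (again with illustrative names):

double a[1024];

void
scale (double *p)
{
  for (int i = 0; i < 1024; i++)
    /* *p does not advance with i, so its data-ref step is zero: the
       vectorizer may hoist a single load and splat it across the
       vector (VMAT_INVARIANT) instead of treating it as contiguous.  */
    a[i] *= *p;
}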