tree-optimization/49955 - BB reduction with odd number of lanes

The following enhances BB reduction vectorization to support vectorizing only a subset of the lanes, keeping the rest as scalar ops. For now we try to make the number of lanes even by leaving alone the "last" lane. That's because SLP discovery with all lanes will fail too soon to give us any hint on which lane to strip; likewise, we don't know what vector modes the target supports, so restricting ourselves to power-of-two or other cases isn't easy.

This is enough to get at the vectorization opportunity for the testcase in the PR - albeit with the chosen lanes not optimal, but at least vectorizable.

	PR tree-optimization/49955
	* tree-vectorizer.h (_slp_instance::remain_stmts): New.
	(SLP_INSTANCE_REMAIN_STMTS): Likewise.
	* tree-vect-slp.cc (vect_free_slp_instance): Release
	SLP_INSTANCE_REMAIN_STMTS.
	(vect_build_slp_instance): Make the number of lanes of
	a BB reduction even.
	(vectorize_slp_instance_root_stmt): Handle unvectorized
	defs of a BB reduction.

	* gfortran.dg/vect/pr49955.f: New testcase.
author: Richard Biener <rguenther@suse.de> 2023-08-07 14:44:20 +0200
committer: Richard Biener <rguenther@suse.de> 2023-08-08 14:38:10 +0200
commit: d9f3ea61fe36e2de3354b90b65ff8245099114c9 (patch)
tree: 8907ce57966ceca77a774965c78cc2df4fd9b7be
parent: 02a015fa320229a057ef721eaf663f3eb22a8ace (diff)
download: gcc-d9f3ea61fe36e2de3354b90b65ff8245099114c9.zip
gcc-d9f3ea61fe36e2de3354b90b65ff8245099114c9.tar.gz
gcc-d9f3ea61fe36e2de3354b90b65ff8245099114c9.tar.bz2
3 files changed, 72 insertions, 1 deletion
diff --git a/gcc/testsuite/gfortran.dg/vect/pr49955.f b/gcc/testsuite/gfortran.dg/vect/pr49955.f
new file mode 100644
index 0000000..a73cd5a
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/vect/pr49955.f
@@ -0,0 +1,38 @@
+! { dg-do compile }
+! { dg-additional-options "-ffast-math -fdump-tree-slp1" }
+
+      subroutine shell(nx,ny,nz,q,dt,cfl,dx,dy,dz,cfll,gm,Pr,Re)
+      implicit none
+      integer nx,ny,nz,i,j,k
+      real*8 cfl,dx,dy,dz,dt
+      real*8 gm,Re,Pr,cfll,t1,t2,t3,t4,t5,t6,t7,t8,mu
+      real*8 q(5,nx,ny,nz)
+
+      if (cfll.ge.cfl) cfll=cfl
+      t8=0.0d0
+
+      do k=1,nz
+         do j=1,ny
+            do i=1,nx
+               t1=q(1,i,j,k)
+               t2=q(2,i,j,k)/t1
+               t3=q(3,i,j,k)/t1
+               t4=q(4,i,j,k)/t1
+               t5=(gm-1.0d0)*(q(5,i,j,k)-0.5d0*t1*(t2*t2+t3*t3+t4*t4))
+               t6=dSQRT(gm*t5/t1)
+               mu=gm*Pr*(gm*t5/t1)**0.75d0*2.0d0/Re/t1
+               t7=((dabs(t2)+t6)/dx+mu/dx**2)**2 +
+     1            ((dabs(t3)+t6)/dy+mu/dy**2)**2 +
+     2            ((dabs(t4)+t6)/dz+mu/dz**2)**2
+               t7=DSQRT(t7)
+               t8=max(t8,t7)
+            enddo
+         enddo
+      enddo
+      dt=cfll / t8
+
+      return
+      end
+
+! We don't have an effective target for reduc_plus_scal optab support
+! { dg-final { scan-tree-dump ".REDUC_PLUS" "slp1" { target x86_64-*-* } } }
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index eab3dcd..070ab3f 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -209,6 +209,7 @@ vect_free_slp_instance (slp_instance instance)
   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
   SLP_INSTANCE_LOADS (instance).release ();
   SLP_INSTANCE_ROOT_STMTS (instance).release ();
+  SLP_INSTANCE_REMAIN_STMTS (instance).release ();
   instance->subgraph_entries.release ();
   instance->cost_vec.release ();
   free (instance);
@@ -3128,6 +3129,16 @@ vect_build_slp_instance (vec_info *vinfo,
 			 "  %G", scalar_stmts[i]->stmt);
     }
 
+  /* When a BB reduction doesn't have an even number of lanes
+     strip it down, treating the remaining lane as scalar.
+     ???  Selecting the optimal set of lanes to vectorize would be nice
+     but SLP build for all lanes will fail quickly because we think
+     we're going to need unrolling.  */
+  auto_vec<stmt_vec_info> remain;
+  if (kind == slp_inst_kind_bb_reduc
+      && (scalar_stmts.length () & 1))
+    remain.safe_push (scalar_stmts.pop ());
+
   /* Build the tree for the SLP instance.  */
   unsigned int group_size = scalar_stmts.length ();
   bool *matches = XALLOCAVEC (bool, group_size);
@@ -3175,6 +3186,10 @@ vect_build_slp_instance (vec_info *vinfo,
 	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
 	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
 	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
+	  if (!remain.is_empty ())
+	    SLP_INSTANCE_REMAIN_STMTS (new_instance) = remain.copy ();
+	  else
+	    SLP_INSTANCE_REMAIN_STMTS (new_instance) = vNULL;
 	  SLP_INSTANCE_KIND (new_instance) = kind;
 	  new_instance->reduc_phis = NULL;
 	  new_instance->cost_vec = vNULL;
@@ -9138,7 +9153,20 @@ vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
 	gcc_unreachable ();
       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
 				      TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
-
+      if (!SLP_INSTANCE_REMAIN_STMTS (instance).is_empty ())
+	{
+	  tree rem_def = NULL_TREE;
+	  for (auto rem : SLP_INSTANCE_REMAIN_STMTS (instance))
+	    if (!rem_def)
+	      rem_def = gimple_get_lhs (rem->stmt);
+	    else
+	      rem_def = gimple_build (&epilogue, reduc_code,
+				      TREE_TYPE (scalar_def),
+				      rem_def, gimple_get_lhs (rem->stmt));
+	  scalar_def = gimple_build (&epilogue, reduc_code,
+				     TREE_TYPE (scalar_def),
+				     scalar_def, rem_def);
+	}
       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index a651614..dea29a7 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -257,6 +257,10 @@ public:
      from, NULL otherwise.  */
   vec<stmt_vec_info> root_stmts;
 
+  /* For slp_inst_kind_bb_reduc the defs that were not vectorized, NULL
+     otherwise.  */
+  vec<stmt_vec_info> remain_stmts;
+
   /* The unrolling factor required to vectorized this SLP instance.  */
   poly_uint64 unrolling_factor;
 
@@ -285,6 +289,7 @@ public:
 #define SLP_INSTANCE_UNROLLING_FACTOR(S)         (S)->unrolling_factor
 #define SLP_INSTANCE_LOADS(S)                    (S)->loads
 #define SLP_INSTANCE_ROOT_STMTS(S)               (S)->root_stmts
+#define SLP_INSTANCE_REMAIN_STMTS(S)             (S)->remain_stmts
 #define SLP_INSTANCE_KIND(S)                     (S)->kind
 
 #define SLP_TREE_CHILDREN(S)                     (S)->children
author	Richard Biener <rguenther@suse.de>	2023-08-07 14:44:20 +0200
committer	Richard Biener <rguenther@suse.de>	2023-08-08 14:38:10 +0200
commit	d9f3ea61fe36e2de3354b90b65ff8245099114c9 (patch)
tree	8907ce57966ceca77a774965c78cc2df4fd9b7be
parent	02a015fa320229a057ef721eaf663f3eb22a8ace (diff)
download	gcc-d9f3ea61fe36e2de3354b90b65ff8245099114c9.zip gcc-d9f3ea61fe36e2de3354b90b65ff8245099114c9.tar.gz gcc-d9f3ea61fe36e2de3354b90b65ff8245099114c9.tar.bz2