tree-optimization/115841 - reduction epilogue placement issue

When emitting the compensation to the vectorized main loop for a vector reduction value to be re-used in the vectorized epilogue, we fail to place it in the correct block when the main loop is known to be entered (no loop_vinfo->main_loop_edge) but the epilogue is not (a loop_vinfo->skip_this_loop_edge). The code currently disregards this situation.

With the recent znver4 cost fix I couldn't trigger this situation with the testcase, but I adjusted it so it could eventually trigger on other targets.

PR tree-optimization/115841
* tree-vect-loop.cc (vect_transform_cycle_phi): Correctly place the partial vector reduction for the accumulator re-use when the main loop cannot be skipped but the epilogue can.
* gcc.dg/vect/pr115841.c: New testcase.
author: Richard Biener <rguenther@suse.de> 2024-07-16 11:53:17 +0200
committer: Richard Biener <rguenth@gcc.gnu.org> 2024-07-16 16:05:11 +0200
commit: 016c947b02e79a5c0c0c2d4ad5cb71aa04db3efd (patch)
tree: 6d817652a52a725b48d67b1327398022a5dcab71 /gcc
parent: cca1229b85f2ad9422773fdb954d0924fa1cd350 (diff)
download: gcc-016c947b02e79a5c0c0c2d4ad5cb71aa04db3efd.zip
gcc-016c947b02e79a5c0c0c2d4ad5cb71aa04db3efd.tar.gz
gcc-016c947b02e79a5c0c0c2d4ad5cb71aa04db3efd.tar.bz2
2 files changed, 46 insertions, 3 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/pr115841.c b/gcc/testsuite/gcc.dg/vect/pr115841.c
new file mode 100644
index 0000000..aa5c660
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115841.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast -fcommon -fvect-cost-model=dynamic --param vect-partial-vector-usage=1" } */
+/* { dg-additional-options "-mavx512vl" { target avx512vl } } */
+
+/* To trigger the bug costing needs to determine that aligning the A170
+   accesses with a prologue is good and there should be a vectorized
+   epilogue with a smaller vector size, re-using the vector accumulator
+   from the vectorized main loop that's statically known to execute
+   but the epilogue loop is not.  */
+
+static unsigned char xl[192];
+unsigned char A170[192*3];
+
+void jerate (unsigned char *, unsigned char *);
+float foo (unsigned n)
+{
+  jerate (xl, A170);
+
+  unsigned i = 32;
+  int kr = 1;
+  float sfn11s = 0.f;
+  float sfn12s = 0.f;
+  do
+    {
+      int krm1 = kr - 1;
+      long j = krm1;
+      float a = (*(float(*)[n])A170)[j];
+      float b = (*(float(*)[n])xl)[j];
+      float c = a * b;
+      float d = c * 6.93149983882904052734375e-1f;
+      float e = (*(float(*)[n])A170)[j+48];
+      float f = (*(float(*)[n])A170)[j+96];
+      float g = d * e;
+      sfn11s = sfn11s + g;
+      float h = f * d;
+      sfn12s = sfn12s + h;
+      kr++;
+    }
+  while (--i != 0);
+  float tem = sfn11s + sfn12s;
+  return tem;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index a64b508..b8124a3 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -9026,14 +9026,15 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
 	  /* And the reduction could be carried out using a different sign.  */
 	  if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
 	    def = gimple_convert (&stmts, vectype_out, def);
-	  if (loop_vinfo->main_loop_edge)
+	  edge e;
+	  if ((e = loop_vinfo->main_loop_edge)
+	      || (e = loop_vinfo->skip_this_loop_edge))
 	    {
 	      /* While we'd like to insert on the edge this will split
 		 blocks and disturb bookkeeping, we also will eventually
 		 need this on the skip edge.  Rely on sinking to
 		 fixup optimal placement and insert in the pred.  */
-	      gimple_stmt_iterator gsi
-		= gsi_last_bb (loop_vinfo->main_loop_edge->src);
+	      gimple_stmt_iterator gsi = gsi_last_bb (e->src);
 	      /* Insert before a cond that eventually skips the
 		 epilogue.  */
 	      if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
author	Richard Biener <rguenther@suse.de>	2024-07-16 11:53:17 +0200
committer	Richard Biener <rguenth@gcc.gnu.org>	2024-07-16 16:05:11 +0200
commit	016c947b02e79a5c0c0c2d4ad5cb71aa04db3efd (patch)
tree	6d817652a52a725b48d67b1327398022a5dcab71 /gcc
parent	cca1229b85f2ad9422773fdb954d0924fa1cd350 (diff)
download	gcc-016c947b02e79a5c0c0c2d4ad5cb71aa04db3efd.zip gcc-016c947b02e79a5c0c0c2d4ad5cb71aa04db3efd.tar.gz gcc-016c947b02e79a5c0c0c2d4ad5cb71aa04db3efd.tar.bz2