tree-optimization/115825 - improve unroll estimates for volatile accesses

The loop unrolling code assumes that one third of all volatile accesses can be possibly optimized away which is of course not true. This leads to excessive unrolling in some cases. The following tracks the number of stmts with side-effects as those are not eliminatable later and only assumes one third of the other stmts can be further optimized. This causes some fallout in the testsuite where we rely on unrolling even when calls are involved. I have XFAILed g++.dg/warn/Warray-bounds-20.C but adjusted the others with a #pragma GCC unroll to mimic previous behavior and retain what the testcase was testing. I've also filed PR117671 for the case where the size estimation fails to honor the stmts we then remove by inserting __builtin_unreachable (). For gcc.dg/tree-ssa/cunroll-2.c the estimate that the code doesn't grow is clearly bogus and we have explicit code to reject unrolling for bodies containing calls so I've adjusted the testcase accordingly. PR tree-optimization/115825 * tree-ssa-loop-ivcanon.cc (loop_size::not_eliminatable_after_peeling): New. (loop_size::last_iteration_not_eliminatable_after_peeling): Likewise. (tree_estimate_loop_size): Count stmts with side-effects as not optimistically eliminatable. (estimated_unrolled_size): Compute the number of stmts that can be optimistically eliminated by followup transforms. (try_unroll_loop_completely): Adjust. * gcc.dg/tree-ssa/cunroll-17.c: New testcase. * gcc.dg/tree-ssa/cunroll-2.c: Adjust to not expect unrolling. * gcc.dg/pr94600-1.c: Force unrolling. * c-c++-common/ubsan/unreachable-3.c: Likewise. * g++.dg/warn/Warray-bounds-20.C: XFAIL cases we rely on unrolling loops created by new expressions and not inlined CTOR invocations.
author: Richard Biener <rguenther@suse.de> 2024-07-10 12:45:02 +0200
committer: Richard Biener <rguenth@gcc.gnu.org> 2024-11-25 08:17:54 +0100
commit: d1cf0d7a0f27fdd55302785f19f07d1c3f162ba5 (patch)
tree: 861ac4d60bd2e15b6ea7f33cf3af20d25b6cff88
parent: c84a8a274af3163a4042bcfd77dd1988bd1eb0ce (diff)
download: gcc-d1cf0d7a0f27fdd55302785f19f07d1c3f162ba5.zip
gcc-d1cf0d7a0f27fdd55302785f19f07d1c3f162ba5.tar.gz
gcc-d1cf0d7a0f27fdd55302785f19f07d1c3f162ba5.tar.bz2
6 files changed, 47 insertions, 12 deletions
diff --git a/gcc/testsuite/c-c++-common/ubsan/unreachable-3.c b/gcc/testsuite/c-c++-common/ubsan/unreachable-3.c
index b7a0d1a..8831a1f 100644
--- a/gcc/testsuite/c-c++-common/ubsan/unreachable-3.c
+++ b/gcc/testsuite/c-c++-common/ubsan/unreachable-3.c
@@ -14,8 +14,9 @@ struct snic {
 void snic_log_q_error(struct snic *snic)
 {
     unsigned int i;
+#pragma GCC unroll 1
     for (i = 0; i < snic->wq_count; i++)
-        ioread32(&snic->wq[i]->error_status);
+      ioread32(&snic->wq[i]->error_status);
 }
 
 /* { dg-final { scan-tree-dump "__builtin___ubsan_handle_builtin_unreachable" "optimized" } } */
diff --git a/gcc/testsuite/g++.dg/warn/Warray-bounds-20.C b/gcc/testsuite/g++.dg/warn/Warray-bounds-20.C
index f4876d8..5fc5529 100644
--- a/gcc/testsuite/g++.dg/warn/Warray-bounds-20.C
+++ b/gcc/testsuite/g++.dg/warn/Warray-bounds-20.C
@@ -53,8 +53,8 @@ void warn_derived_ctor_access_new_alloc ()
 
 void warn_derived_ctor_access_new_array_decl ()
 {
-  char b[sizeof (D1) * 2];    // { dg-message "at offset \\d+ into object 'b' of size 80" "LP64 note" { target lp64 } }
-                              // { dg-message "at offset \\d+ into object 'b' of size 40" "LP64 note" { target ilp32 } .-1 }
+  char b[sizeof (D1) * 2];    // { dg-message "at offset \\d+ into object 'b' of size 80" "LP64 note" { target { lp64 } xfail { lp64 } } }
+                              // { dg-message "at offset \\d+ into object 'b' of size 40" "LP64 note" { target { ilp32 } xfail { ilp32 } } .-1 }
   char *p = b;
   ++p;
   D1 *q = new (p) D1[2];
@@ -63,7 +63,7 @@ void warn_derived_ctor_access_new_array_decl ()
 
 void warn_derived_ctor_access_new_array_alloc ()
 {
-  char *p = new char[sizeof (D1) * 2];            // { dg-message "at offset \\d+ into object of size \\d+ allocated by '\[^\n\r]*operator new\[^\n\r]*" "note" }
+  char *p = new char[sizeof (D1) * 2];            // { dg-message "at offset \\d+ into object of size \\d+ allocated by '\[^\n\r]*operator new\[^\n\r]*" "note" { xfail *-*-* } }
   ++p;
   D1 *q = new (p) D1[2];
   sink (q);
diff --git a/gcc/testsuite/gcc.dg/pr94600-1.c b/gcc/testsuite/gcc.dg/pr94600-1.c
index 149e4f3..d5fb4d1 100644
--- a/gcc/testsuite/gcc.dg/pr94600-1.c
+++ b/gcc/testsuite/gcc.dg/pr94600-1.c
@@ -31,6 +31,7 @@ foo(void)
 {
   __SIZE_TYPE__ i;
   __SIZE_TYPE__ base = 0x000a0000;
+#pragma GCC unroll 5
   for (i = 0; i < (sizeof (a0) / sizeof ((a0)[0])); i++) {
     *(volatile t0 *) (base + 44 + i * 4) = a0[i];
   }
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cunroll-17.c b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-17.c
new file mode 100644
index 0000000..282db99
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-17.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Os -fdump-tree-optimized" } */
+
+char volatile v;
+void for16 (void)
+{
+  for (char i = 16; i > 0; i -= 2)
+    v = i;
+}
+
+/* { dg-final { scan-tree-dump-times " ={v} " 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cunroll-2.c b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-2.c
index b1d1c7d..d1122e0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/cunroll-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-2.c
@@ -13,5 +13,6 @@ test(int c)
 	return;
     }
 }
+
 /* We are not able to get rid of the final conditional because the loop has two exits.  */
-/* { dg-final { scan-tree-dump "loop with 1 iterations completely unrolled" "cunroll"} } */
+/* { dg-final { scan-tree-dump "Not unrolling loop 1: contains call and code would grow" "cunroll"} } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index 0d496d7..9a94d82 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -139,10 +139,15 @@ struct loop_size
      variable where induction variable starts at known constant.)  */
   int eliminated_by_peeling;
 
+  /* Number of instructions that cannot be further optimized in the
+     peeled loop, for example volatile accesses.  */
+  int not_eliminatable_after_peeling;
+
   /* Same statistics for last iteration of loop: it is smaller because
      instructions after exit are not executed.  */
   int last_iteration;
   int last_iteration_eliminated_by_peeling;
+  int last_iteration_not_eliminatable_after_peeling;
 
   /* If some IV computation will become constant.  */
   bool constant_iv;
@@ -267,8 +272,10 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 
   size->overall = 0;
   size->eliminated_by_peeling = 0;
+  size->not_eliminatable_after_peeling = 0;
   size->last_iteration = 0;
   size->last_iteration_eliminated_by_peeling = 0;
+  size->last_iteration_not_eliminatable_after_peeling = 0;
   size->num_pure_calls_on_hot_path = 0;
   size->num_non_pure_calls_on_hot_path = 0;
   size->non_call_stmts_on_hot_path = 0;
@@ -292,6 +299,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 	{
 	  gimple *stmt = gsi_stmt (gsi);
 	  int num = estimate_num_insns (stmt, &eni_size_weights);
+	  bool not_eliminatable_after_peeling = false;
 	  bool likely_eliminated = false;
 	  bool likely_eliminated_last = false;
 	  bool likely_eliminated_peeled = false;
@@ -304,7 +312,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 
 	  /* Look for reasons why we might optimize this stmt away. */
 
-	  if (!gimple_has_side_effects (stmt))
+	  if (gimple_has_side_effects (stmt))
+	    not_eliminatable_after_peeling = true;
+	  else
 	    {
 	      /* Exit conditional.  */
 	      if (exit && body[i] == exit->src
@@ -377,11 +387,15 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 	  size->overall += num;
 	  if (likely_eliminated || likely_eliminated_peeled)
 	    size->eliminated_by_peeling += num;
+	  if (not_eliminatable_after_peeling)
+	    size->not_eliminatable_after_peeling += num;
 	  if (!after_exit)
 	    {
 	      size->last_iteration += num;
 	      if (likely_eliminated || likely_eliminated_last)
 		size->last_iteration_eliminated_by_peeling += num;
+	      if (not_eliminatable_after_peeling)
+		size->last_iteration_not_eliminatable_after_peeling += num;
 	    }
 	  if ((size->overall * 3 / 2 - size->eliminated_by_peeling
 	      - size->last_iteration_eliminated_by_peeling) > upper_bound)
@@ -437,18 +451,24 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
    It is (NUNROLL + 1) * size of loop body with taking into account
    the fact that in last copy everything after exit conditional
    is dead and that some instructions will be eliminated after
-   peeling.  */
+   peeling.  Set *EST_ELIMINATED to the number of stmts that could be
+   optimistically eliminated by followup transforms.  */
 static unsigned HOST_WIDE_INT
 estimated_unrolled_size (struct loop_size *size,
+			 unsigned HOST_WIDE_INT *est_eliminated,
 			 unsigned HOST_WIDE_INT nunroll)
 {
   HOST_WIDE_INT unr_insns = ((nunroll)
   			     * (HOST_WIDE_INT) (size->overall
 			     			- size->eliminated_by_peeling));
-  if (!nunroll)
-    unr_insns = 0;
+  HOST_WIDE_INT not_elim
+    = ((nunroll) * (HOST_WIDE_INT) size->not_eliminatable_after_peeling);
   unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
+  not_elim += size->last_iteration_not_eliminatable_after_peeling;
 
+  /* Testcases rely on rounding up, so do not write as
+     (unr_insns - not_elim) / 3.  */
+  *est_eliminated = unr_insns - not_elim - (unr_insns - not_elim) * 2 / 3;
   return unr_insns;
 }
 
@@ -829,8 +849,9 @@ try_unroll_loop_completely (class loop *loop,
 	    }
 
 	  unsigned HOST_WIDE_INT ninsns = size.overall;
+	  unsigned HOST_WIDE_INT est_eliminated;
 	  unsigned HOST_WIDE_INT unr_insns
-	    = estimated_unrolled_size (&size, n_unroll);
+	    = estimated_unrolled_size (&size, &est_eliminated, n_unroll);
 	  if (dump_file && (dump_flags & TDF_DETAILS))
 	    {
 	      fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
@@ -842,7 +863,7 @@ try_unroll_loop_completely (class loop *loop,
 	     cautious on guessing if the unrolling is going to be
 	     profitable.
 	     Move from estimated_unrolled_size to unroll small loops.  */
-	  if (unr_insns * 2 / 3
+	  if (unr_insns - est_eliminated
 	      /* If there is IV variable that will become constant, we
 		 save one instruction in the loop prologue we do not
 		 account otherwise.  */
@@ -919,7 +940,7 @@ try_unroll_loop_completely (class loop *loop,
 	     2) Big loop after completely unroll may not be vectorized
 	     by BB vectorizer.  */
 	  else if ((cunrolli && !loop->inner
-		    ? unr_insns : unr_insns * 2 / 3)
+		    ? unr_insns : unr_insns - est_eliminated)
 		   > (unsigned) param_max_completely_peeled_insns)
 	    {
 	      if (dump_file && (dump_flags & TDF_DETAILS))
author	Richard Biener <rguenther@suse.de>	2024-07-10 12:45:02 +0200
committer	Richard Biener <rguenth@gcc.gnu.org>	2024-11-25 08:17:54 +0100
commit	d1cf0d7a0f27fdd55302785f19f07d1c3f162ba5 (patch)
tree	861ac4d60bd2e15b6ea7f33cf3af20d25b6cff88
parent	c84a8a274af3163a4042bcfd77dd1988bd1eb0ce (diff)
download	gcc-d1cf0d7a0f27fdd55302785f19f07d1c3f162ba5.zip gcc-d1cf0d7a0f27fdd55302785f19f07d1c3f162ba5.tar.gz gcc-d1cf0d7a0f27fdd55302785f19f07d1c3f162ba5.tar.bz2