author    Tamar Christina <tamar.christina@arm.com>  2024-09-22 13:34:10 +0100
committer Tamar Christina <tamar.christina@arm.com>  2024-09-22 13:34:10 +0100
commit    e84e5d034124c6733d3b36d8623c56090d4d17f7
tree      68b0498b773f821d9c9a630bab3d4717dc9e4715
parent    673822455ed8b68559e13aef149163294490c69e
aarch64: Take into account when VF is higher than known scalar iters
Consider low overhead loops like:

void foo (char *restrict a, int *restrict b, int *restrict c, int n)
{
  for (int i = 0; i < 9; i++)
    {
      int res = c[i];
      int t = b[i];
      if (a[i] != 0)
        res = t;
      c[i] = res;
    }
}

For such loops we use latency-only costing since the loop bound is known and
small.

The current costing however does not consider the case where niters < VF.
So when comparing the scalar vs vector costs it doesn't keep in mind that the
scalar code can't perform VF iterations.  This makes it overestimate the cost
of the scalar loop and we incorrectly vectorize.

This patch takes the minimum of the VF and niters in such cases.

Before the patch we generate:

  note: Original vector body cost = 46
  note: Vector loop iterates at most 1 times
  note: Scalar issue estimate:
  note:   load operations = 2
  note:   store operations = 1
  note:   general operations = 1
  note:   reduction latency = 0
  note:   estimated min cycles per iteration = 1.000000
  note:   estimated cycles per vector iteration (for VF 32) = 32.000000
  note: SVE issue estimate:
  note:   load operations = 5
  note:   store operations = 4
  note:   general operations = 11
  note:   predicate operations = 12
  note:   reduction latency = 0
  note:   estimated min cycles per iteration without predication = 5.500000
  note:   estimated min cycles per iteration for predication = 12.000000
  note:   estimated min cycles per iteration = 12.000000
  note: Low iteration count, so using pure latency costs
  note: Cost model analysis:

vs after:

  note: Original vector body cost = 46
  note: Known loop bounds, capping VF to 9 for analysis
  note: Vector loop iterates at most 1 times
  note: Scalar issue estimate:
  note:   load operations = 2
  note:   store operations = 1
  note:   general operations = 1
  note:   reduction latency = 0
  note:   estimated min cycles per iteration = 1.000000
  note:   estimated cycles per vector iteration (for VF 9) = 9.000000
  note: SVE issue estimate:
  note:   load operations = 5
  note:   store operations = 4
  note:   general operations = 11
  note:   predicate operations = 12
  note:   reduction latency = 0
  note:   estimated min cycles per iteration without predication = 5.500000
  note:   estimated min cycles per iteration for predication = 12.000000
  note:   estimated min cycles per iteration = 12.000000
  note: Increasing body cost to 1472 because the scalar code could issue within the limit imposed by predicate operations
  note: Low iteration count, so using pure latency costs
  note: Cost model analysis:

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (adjust_body_cost): Cap VF for low
	iteration loops.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/asrdiv_4.c: Update bounds.
	* gcc.target/aarch64/sve/cond_asrd_2.c: Likewise.
	* gcc.target/aarch64/sve/cond_uxt_6.c: Likewise.
	* gcc.target/aarch64/sve/cond_uxt_7.c: Likewise.
	* gcc.target/aarch64/sve/cond_uxt_8.c: Likewise.
	* gcc.target/aarch64/sve/miniloop_1.c: Likewise.
	* gcc.target/aarch64/sve/spill_6.c: Likewise.
	* gcc.target/aarch64/sve/sve_iters_low_1.c: New test.
	* gcc.target/aarch64/sve/sve_iters_low_2.c: New test.
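To make the effect of the cap concrete, here is a small standalone sketch (not
part of the patch, and not GCC code) that redoes the scalar-vs-vector
comparison using the figures from the dumps above; all variable names are
purely illustrative:

/* Illustrative only: compare the scalar and vector per-vector-iteration
   estimates once with the raw VF and once with the VF capped at the known
   iteration count, using the numbers from the dumps above.  */
#include <stdio.h>

int
main (void)
{
  double scalar_cycles_per_iter = 1.0;  /* "Scalar issue estimate" above.  */
  double sve_cycles_per_iter = 12.0;    /* "SVE issue estimate" above.  */
  int estimated_vf = 32;                /* VF reported before the patch.  */
  int niters = 9;                       /* known scalar iteration count.  */

  /* Before the patch: the scalar loop is charged for a full VF's worth of
     iterations (32 cycles), so the 12-cycle vector body looks cheaper and
     the loop is vectorized.  */
  printf ("uncapped scalar estimate: %.1f cycles\n",
          scalar_cycles_per_iter * estimated_vf);

  /* After the patch: the VF is capped at niters, the scalar estimate drops
     to 9 cycles and the vector body no longer wins.  */
  int capped_vf = estimated_vf < niters ? estimated_vf : niters;
  printf ("capped scalar estimate:   %.1f cycles\n",
          scalar_cycles_per_iter * capped_vf);
  printf ("vector estimate:          %.1f cycles\n", sve_cycles_per_iter);
  return 0;
}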
Diffstat:
-rw-r--r--  gcc/config/aarch64/aarch64.cc                             13
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c           12
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c        12
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c          8
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c          8
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c          8
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c          2
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/spill_6.c             8
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c    17
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c    20

10 files changed, 79 insertions, 29 deletions
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 92763d4..68913be 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17565,6 +17565,19 @@ adjust_body_cost (loop_vec_info loop_vinfo,
     dump_printf_loc (MSG_NOTE, vect_location,
                      "Original vector body cost = %d\n", body_cost);
+  /* If we know we have a single partial vector iteration, cap the VF
+     to the number of scalar iterations for costing purposes.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      auto niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+      if (niters < estimated_vf && dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location,
+                         "Scalar loop iterates at most %wd times. Capping VF"
+                         " from %d to %wd\n", niters, estimated_vf, niters);
+
+      estimated_vf = MIN (estimated_vf, niters);
+    }
+
   fractional_cost scalar_cycles_per_iter
     = scalar_ops.min_cycles_per_iter () * estimated_vf;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
index 6684fe1..10a96a8 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/asrdiv_4.c
@@ -15,12 +15,12 @@
}
#define TEST_ALL(T) \
- T (int16_t, int8_t, 7) \
- T (int32_t, int8_t, 3) \
- T (int32_t, int16_t, 3) \
- T (int64_t, int8_t, 5) \
- T (int64_t, int16_t, 5) \
- T (int64_t, int32_t, 5)
+ T (int16_t, int8_t, 70) \
+ T (int32_t, int8_t, 30) \
+ T (int32_t, int16_t, 30) \
+ T (int64_t, int8_t, 50) \
+ T (int64_t, int16_t, 50) \
+ T (int64_t, int32_t, 50)
TEST_ALL (DEF_LOOP)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
index e4040ee..db1721e 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_2.c
@@ -14,12 +14,12 @@
}
#define TEST_ALL(T) \
- T (int16_t, int8_t, 7) \
- T (int32_t, int8_t, 3) \
- T (int32_t, int16_t, 3) \
- T (int64_t, int8_t, 5) \
- T (int64_t, int16_t, 5) \
- T (int64_t, int32_t, 5)
+ T (int16_t, int8_t, 70) \
+ T (int32_t, int8_t, 30) \
+ T (int32_t, int16_t, 30) \
+ T (int64_t, int8_t, 50) \
+ T (int64_t, int16_t, 50) \
+ T (int64_t, int32_t, 50)
TEST_ALL (DEF_LOOP)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
index e47276a..b8b3e86 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_6.c
@@ -14,11 +14,11 @@
}
#define TEST_ALL(T) \
- T (int32_t, uint16_t, 0xff, 3) \
+ T (int32_t, uint16_t, 0xff, 30) \
\
- T (int64_t, uint16_t, 0xff, 5) \
- T (int64_t, uint32_t, 0xff, 5) \
- T (int64_t, uint32_t, 0xffff, 5)
+ T (int64_t, uint16_t, 0xff, 50) \
+ T (int64_t, uint32_t, 0xff, 50) \
+ T (int64_t, uint32_t, 0xffff, 50)
TEST_ALL (DEF_LOOP)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
index f49915c..2d02fb7 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_7.c
@@ -14,11 +14,11 @@
}
#define TEST_ALL(T) \
- T (int32_t, uint16_t, 0xff, 3) \
+ T (int32_t, uint16_t, 0xff, 30) \
\
- T (int64_t, uint16_t, 0xff, 5) \
- T (int64_t, uint32_t, 0xff, 5) \
- T (int64_t, uint32_t, 0xffff, 5)
+ T (int64_t, uint16_t, 0xff, 50) \
+ T (int64_t, uint32_t, 0xff, 50) \
+ T (int64_t, uint32_t, 0xffff, 50)
TEST_ALL (DEF_LOOP)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
index 42eb4b2..8fe2455 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_8.c
@@ -14,11 +14,11 @@
}
#define TEST_ALL(T) \
- T (int32_t, uint16_t, 0xff, 3) \
+ T (int32_t, uint16_t, 0xff, 30) \
\
- T (int64_t, uint16_t, 0xff, 5) \
- T (int64_t, uint32_t, 0xff, 5) \
- T (int64_t, uint32_t, 0xffff, 5)
+ T (int64_t, uint16_t, 0xff, 50) \
+ T (int64_t, uint32_t, 0xff, 50) \
+ T (int64_t, uint32_t, 0xffff, 50)
TEST_ALL (DEF_LOOP)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
index 09eb414..cd1fd2b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/miniloop_1.c
@@ -6,7 +6,7 @@ void loop (int * __restrict__ a, int * __restrict__ b, int * __restrict__ c,
int * __restrict__ g, int * __restrict__ h)
{
int i = 0;
- for (i = 0; i < 3; i++)
+ for (i = 0; i < 30; i++)
{
a[i] += i;
b[i] += i;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
index ae9c338..2ff969c 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_6.c
@@ -11,20 +11,20 @@ void consumer (void *);
{ \
if (which) \
{ \
- for (int i = 0; i < 7; ++i) \
+ for (int i = 0; i < 70; ++i) \
x1[i] += VAL; \
consumer (x1); \
- for (int i = 0; i < 7; ++i) \
+ for (int i = 0; i < 70; ++i) \
x2[i] -= VAL; \
consumer (x2); \
} \
else \
{ \
- for (int i = 0; i < 7; ++i) \
+ for (int i = 0; i < 70; ++i) \
x3[i] &= VAL; \
consumer (x3); \
} \
- for (int i = 0; i < 7; ++i) \
+ for (int i = 0; i < 70; ++i) \
x4[i] |= VAL; \
consumer (x4); \
}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
new file mode 100644
index 0000000..952a4b1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n)
+{
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
new file mode 100644
index 0000000..02d10de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sve_iters_low_2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a -Ofast -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c, int n, int stride)
+{
+  if (stride <= 1)
+    return;
+
+  for (int i = 0; i < 9; i++)
+    {
+      int res = c[i];
+      int t = b[i*stride];
+      if (a[i] != 0)
+        res = t;
+      c[i] = res;
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */