Account for the cost of generating loop masks

We didn't take the cost of generating loop masks into account, and so
tended to underestimate the cost of loops that need multiple masks.

2019-11-13  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* tree-vect-loop.c (vect_estimate_min_profitable_iters): Include
	the cost of generating loop masks.

gcc/testsuite/
	* gcc.target/aarch64/sve/mask_struct_store_3.c: Add
	-fno-vect-cost-model.
	* gcc.target/aarch64/sve/mask_struct_store_3_run.c: Likewise.
	* gcc.target/aarch64/sve/peel_ind_2.c: Likewise.
	* gcc.target/aarch64/sve/peel_ind_2_run.c: Likewise.
	* gcc.target/aarch64/sve/peel_ind_3.c: Likewise.
	* gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise.

From-SVN: r278125
author: Richard Sandiford <richard.sandiford@arm.com> 2019-11-13 09:12:17 +0000
committer: Richard Sandiford <rsandifo@gcc.gnu.org> 2019-11-13 09:12:17 +0000
commit: 61e5f2df0345bcc1f7675125922692d727e20603 (patch)
tree: d68f3d4304c78fa570671e44fcc8a57ab66db752 /gcc
parent: 6eed64b96d886da2518d86eae5bc0a5ed66cabe0 (diff)
download: gcc-61e5f2df0345bcc1f7675125922692d727e20603.zip
gcc-61e5f2df0345bcc1f7675125922692d727e20603.tar.gz
gcc-61e5f2df0345bcc1f7675125922692d727e20603.tar.bz2
9 files changed, 48 insertions, 7 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index e7b0433..0470528 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,10 @@
 2019-11-13  Richard Sandiford  <richard.sandiford@arm.com>
 
+	* tree-vect-loop.c (vect_estimate_min_profitable_iters): Include
+	the cost of generating loop masks.
+
+2019-11-13  Richard Sandiford  <richard.sandiford@arm.com>
+
 	* tree-vectorizer.h (vect_apply_runtime_profitability_check_p):
 	New function.
 	* tree-vect-loop-manip.c (vect_loop_versioning): Use it.
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index a253a53..834c17a 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,15 @@
 2019-11-13  Richard Sandiford  <richard.sandiford@arm.com>
 
+	* gcc.target/aarch64/sve/mask_struct_store_3.c: Add
+	-fno-vect-cost-model.
+	* gcc.target/aarch64/sve/mask_struct_store_3_run.c: Likewise.
+	* gcc.target/aarch64/sve/peel_ind_2.c: Likewise.
+	* gcc.target/aarch64/sve/peel_ind_2_run.c: Likewise.
+	* gcc.target/aarch64/sve/peel_ind_3.c: Likewise.
+	* gcc.target/aarch64/sve/peel_ind_3_run.c: Likewise.
+
+2019-11-13  Richard Sandiford  <richard.sandiford@arm.com>
+
 	PR c++/92206
 	* g++.dg/cpp0x/alias-decl-pr92206-1.C: New test.
 	* g++.dg/cpp0x/alias-decl-pr92206-2.C: Likewise.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c
index 001f5be..1765d54 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
 
 #include <stdint.h>
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c
index 31d661b..4dbe033 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_store_3_run.c
@@ -1,5 +1,5 @@
 /* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
 
 #include "mask_struct_store_3.c"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c
index e792cdf..df82d58 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* Pick an arbitrary target for which unaligned accesses are more
    expensive.  */
-/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */
+/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */
 
 #define N 512
 #define START 7
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c
index 9c5ae1b..b978535 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_2_run.c
@@ -1,6 +1,6 @@
 /* { dg-do run { target aarch64_sve_hw } } */
 /* { dg-options "-O3 -mtune=thunderx" } */
-/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256 -fno-vect-cost-model" { target aarch64_sve256_hw } } */
 
 #include "peel_ind_2.c"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c
index 441589e..1707f02 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* Pick an arbitrary target for which unaligned accesses are more
    expensive.  */
-/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx" } */
+/* { dg-options "-O3 -msve-vector-bits=256 -mtune=thunderx -fno-vect-cost-model" } */
 
 #define N 32
 #define MAX_START 8
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c
index 384a38e..9838967 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_3_run.c
@@ -1,6 +1,6 @@
 /* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O3 -mtune=thunderx" } */
-/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-options "-O3 -mtune=thunderx -fno-vect-cost-model" } */
+/* { dg-options "-O3 -mtune=thunderx -msve-vector-bits=256 -fno-vect-cost-model" { target aarch64_sve256_hw } } */
 
 #include "peel_ind_3.c"
 
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 83fb848..005fa30 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -3291,6 +3291,32 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 				  si->kind, si->stmt_info, si->misalign,
 				  vect_epilogue);
 	}
+
+      /* Calculate how many masks we need to generate.  */
+      unsigned int num_masks = 0;
+      rgroup_masks *rgm;
+      unsigned int num_vectors_m1;
+      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
+	if (rgm->mask_type)
+	  num_masks += num_vectors_m1 + 1;
+      gcc_assert (num_masks > 0);
+
+      /* In the worst case, we need to generate each mask in the prologue
+	 and in the loop body.  One of the loop body mask instructions
+	 replaces the comparison in the scalar loop, and since we don't
+	 count the scalar comparison against the scalar body, we shouldn't
+	 count that vector instruction against the vector body either.
+
+	 Sometimes we can use unpacks instead of generating prologue
+	 masks and sometimes the prologue mask will fold to a constant,
+	 so the actual prologue cost might be smaller.  However, it's
+	 simpler and safer to use the worst-case cost; if this ends up
+	 being the tie-breaker between vectorizing or not, then it's
+	 probably better not to vectorize.  */
+      (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
+			    NULL, 0, vect_prologue);
+      (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
+			    NULL, 0, vect_body);
     }
   else if (npeel < 0)
     {
author	Richard Sandiford <richard.sandiford@arm.com>	2019-11-13 09:12:17 +0000
committer	Richard Sandiford <rsandifo@gcc.gnu.org>	2019-11-13 09:12:17 +0000
commit	61e5f2df0345bcc1f7675125922692d727e20603 (patch)
tree	d68f3d4304c78fa570671e44fcc8a57ab66db752 /gcc
parent	6eed64b96d886da2518d86eae5bc0a5ed66cabe0 (diff)
download	gcc-61e5f2df0345bcc1f7675125922692d727e20603.zip gcc-61e5f2df0345bcc1f7675125922692d727e20603.tar.gz gcc-61e5f2df0345bcc1f7675125922692d727e20603.tar.bz2