author    Richard Sandiford <richard.sandiford@arm.com>  2021-01-14 11:36:25 +0000
committer Richard Sandiford <richard.sandiford@arm.com>  2021-01-14 11:36:25 +0000
commit    e45c41988bfd655b1df7cff8fcf111dc6fb732e3 (patch)
tree      85a81838fedfbd72c45587b2ca0f098b73658759
parent    48f8d1d48f2c7c2bc724dee979bcf56957f233cb (diff)
vect: Account for unused IFN_LOAD_LANES results
At the moment, if we use only one vector of an LD4 result, we'll treat
the LD4 as having the cost of a single load.  But all 4 loads and any
associated permutes take place regardless of which results are actually
used.

This patch therefore counts the cost of unused LOAD_LANES results
against the first statement in a group.  An alternative would be to
multiply the ncopies of the first stmt by the group size and treat
other stmts in the group as having zero cost, but I thought that might
be more surprising when reading dumps.

gcc/
	* tree-vect-stmts.c (vect_model_load_cost): Account for unused
	IFN_LOAD_LANES results.

gcc/testsuite/
	* gcc.target/aarch64/sve/cost_model_11.c: New test.
	* gcc.target/aarch64/sve/mask_struct_load_5.c: Use
	-fno-vect-cost-model.
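For illustration only (this sketch is not part of the patch): a loop of the
following shape reads just one element from every group of four consecutive
elements, so a load-lanes (LD4) vectorization strategy would materialise four
vectors per group while only one is actually used.  The function name is
arbitrary; the new cost_model_11.c test below exercises essentially the same
pattern with two such arrays.

/* Hypothetical example: only element 0 of each 4-element group is read,
   so an LD4-style grouped load would produce three unused vectors per
   group; with this patch those unused results are now costed.  */
void
g (long *restrict dst, long *restrict x, long n)
{
  for (long i = 0; i < n; ++i)
    dst[i] = x[i * 4];
}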
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/cost_model_11.c        12
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c    2
-rw-r--r--  gcc/tree-vect-stmts.c                                        24
3 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_11.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_11.c
new file mode 100644
index 0000000..d9f4ccc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_11.c
@@ -0,0 +1,12 @@
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=128" } */
+
+long
+f (long *x, long *y, long *z, long n)
+{
+  long res = 0;
+  for (long i = 0; i < n; ++i)
+    z[i] = x[i * 4] + y[i * 4];
+  return res;
+}
+
+/* { dg-final { scan-assembler-not {\tld4d\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c
index da367e4..2a33ee8 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math --param aarch64-sve-compare-costs=0" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
 
 #include <stdint.h>
 
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 068e498..4d72c4d 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1120,6 +1120,30 @@ vect_model_load_cost (vec_info *vinfo,
      once per group anyhow.  */
   bool first_stmt_p = (first_stmt_info == stmt_info);
 
+  /* An IFN_LOAD_LANES will load all its vector results, regardless of which
+     ones we actually need.  Account for the cost of unused results.  */
+  if (first_stmt_p && !slp_node && memory_access_type == VMAT_LOAD_STORE_LANES)
+    {
+      unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
+      stmt_vec_info next_stmt_info = first_stmt_info;
+      do
+        {
+          gaps -= 1;
+          next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
+        }
+      while (next_stmt_info);
+      if (gaps)
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_NOTE, vect_location,
+                             "vect_model_load_cost: %d unused vectors.\n",
+                             gaps);
+          vect_get_load_cost (vinfo, stmt_info, ncopies * gaps, false,
+                              &inside_cost, &prologue_cost,
+                              cost_vec, cost_vec, true);
+        }
+    }
+
   /* We assume that the cost of a single load-lanes instruction is
      equivalent to the cost of DR_GROUP_SIZE separate loads.  If a grouped
      access is instead being provided by a load-and-permute operation,