author    Richard Sandiford <richard.sandiford@arm.com>    2021-01-14 11:36:25 +0000
committer Richard Sandiford <richard.sandiford@arm.com>    2021-01-14 11:36:25 +0000
commit    e45c41988bfd655b1df7cff8fcf111dc6fb732e3 (patch)
tree      85a81838fedfbd72c45587b2ca0f098b73658759 /gcc
parent    48f8d1d48f2c7c2bc724dee979bcf56957f233cb (diff)
vect: Account for unused IFN_LOAD_LANES results
At the moment, if we use only one vector of an LD4 result,
we'll treat the LD4 as having the cost of a single load.
But all 4 loads and any associated permutes take place
regardless of which results are actually used.
This patch therefore counts the cost of unused LOAD_LANES
results against the first statement in a group. An alternative
would be to multiply the ncopies of the first stmt by the group
size and treat other stmts in the group as having zero cost,
but I thought that might be more surprising when reading dumps.
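As a concrete example (illustrative only; this mirrors the loop in the new
cost_model_11.c test), each x[i * 4] and y[i * 4] access is element 0 of a
4-element interleaving group, so vectorizing it with LD4 would load three
unused vectors per group:

long
f (long *x, long *y, long *z, long n)
{
  long res = 0;
  for (long i = 0; i < n; ++i)
    z[i] = x[i * 4] + y[i * 4];  /* only lane 0 of each group of 4 is used */
  return res;
}

With the unused results costed, the new test checks that ld4d is no longer
emitted for this loop.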
gcc/
* tree-vect-stmts.c (vect_model_load_cost): Account for unused
IFN_LOAD_LANES results.
gcc/testsuite/
* gcc.target/aarch64/sve/cost_model_11.c: New test.
* gcc.target/aarch64/sve/mask_struct_load_5.c: Use
-fno-vect-cost-model.
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/cost_model_11.c       | 12
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c  |  2
-rw-r--r--  gcc/tree-vect-stmts.c                                      | 24
3 files changed, 37 insertions, 1 deletions
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_11.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_11.c
new file mode 100644
index 0000000..d9f4ccc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_11.c
@@ -0,0 +1,12 @@
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=128" } */
+
+long
+f (long *x, long *y, long *z, long n)
+{
+  long res = 0;
+  for (long i = 0; i < n; ++i)
+    z[i] = x[i * 4] + y[i * 4];
+  return res;
+}
+
+/* { dg-final { scan-assembler-not {\tld4d\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c
index da367e4..2a33ee8 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_struct_load_5.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math --param aarch64-sve-compare-costs=0" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math -fno-vect-cost-model" } */
 
 #include <stdint.h>
 
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 068e498..4d72c4d 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1120,6 +1120,30 @@ vect_model_load_cost (vec_info *vinfo,
      once per group anyhow.  */
   bool first_stmt_p = (first_stmt_info == stmt_info);
 
+  /* An IFN_LOAD_LANES will load all its vector results, regardless of which
+     ones we actually need.  Account for the cost of unused results.  */
+  if (first_stmt_p && !slp_node && memory_access_type == VMAT_LOAD_STORE_LANES)
+    {
+      unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
+      stmt_vec_info next_stmt_info = first_stmt_info;
+      do
+        {
+          gaps -= 1;
+          next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
+        }
+      while (next_stmt_info);
+      if (gaps)
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_NOTE, vect_location,
+                             "vect_model_load_cost: %d unused vectors.\n",
+                             gaps);
+          vect_get_load_cost (vinfo, stmt_info, ncopies * gaps, false,
+                              &inside_cost, &prologue_cost,
+                              cost_vec, cost_vec, true);
+        }
+    }
+
   /* We assume that the cost of a single load-lanes instruction is
      equivalent to the cost of DR_GROUP_SIZE separate loads.  If a grouped
      access is instead being provided by a load-and-permute operation,
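To make the new accounting concrete, here is a minimal sketch of the
arithmetic (not GCC code; the function name and parameters are invented for
illustration). The do/while walk over DR_GROUP_NEXT_ELEMENT subtracts one
from DR_GROUP_SIZE for each statement still present in the group, and
whatever remains is charged as additional vector loads against the first
statement:

/* Sketch only: with a group of 4 where a single statement remains,
   gaps == 3, so ncopies * 3 extra vector-load costs are added.  */
unsigned int
extra_load_lanes_cost (unsigned int group_size, unsigned int live_stmts,
                       unsigned int ncopies, unsigned int one_load_cost)
{
  unsigned int gaps = group_size - live_stmts;
  return ncopies * gaps * one_load_cost;
}

In the patch itself the extra cost goes through vect_get_load_cost with a
count of ncopies * gaps, so alignment-dependent costing still applies; the
sketch above only shows the size of the adjustment.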