diff options
author | Richard Biener <rguenther@suse.de> | 2024-10-16 10:09:36 +0200 |
---|---|---|
committer | Richard Biener <rguenth@gcc.gnu.org> | 2024-10-16 13:04:38 +0200 |
commit | 62cdddd4e621a8182c58161188009f1e9b256e1b (patch) | |
tree | d266809386d19a401eec682eabe1ae80325165c3 /gcc | |
parent | ae224de0631a7fcac37ac1384f457f1dc1a487b2 (diff) | |
download | gcc-62cdddd4e621a8182c58161188009f1e9b256e1b.zip gcc-62cdddd4e621a8182c58161188009f1e9b256e1b.tar.gz gcc-62cdddd4e621a8182c58161188009f1e9b256e1b.tar.bz2 |
Enhance gather fallback for PR65518 with SLP
With SLP forced we fail to use gathers for PR65518 on RISC-V, which the
testcase expects, because analysis fails on peeling for gaps not being
effective. The following moves the memory_access_type adjustment before
all of the overrun checking, since using VMAT_ELEMENTWISE means there
is no overrun.
* tree-vect-stmts.cc (get_group_load_store_type): Move
VMAT_ELEMENTWISE fallback for single-element interleaving
of too large groups before overrun checking.
* gcc.dg/vect/pr65518.c: Adjust.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/pr65518.c | 109 | ||||
-rw-r--r-- | gcc/tree-vect-stmts.cc | 58 |
2 files changed, 85 insertions, 82 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/pr65518.c b/gcc/testsuite/gcc.dg/vect/pr65518.c index 189a655..6d85150 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65518.c +++ b/gcc/testsuite/gcc.dg/vect/pr65518.c @@ -1,54 +1,55 @@ -#include "tree-vect.h"
-
-#if VECTOR_BITS > 256
-#define NINTS (VECTOR_BITS / 32)
-#else
-#define NINTS 8
-#endif
-
-#define N (NINTS * 2)
-#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS)
-
-extern void abort (void);
-
-typedef struct giga
-{
- unsigned int g[N];
-} giga;
-
-unsigned long __attribute__((noinline,noclone))
-addfst(giga const *gptr, int num)
-{
- unsigned int retval = 0;
- int i;
- for (i = 0; i < num; i++)
- retval += gptr[i].g[0];
- return retval;
-}
-
-int main ()
-{
- struct giga g[NINTS];
- unsigned int n = 1;
- int i, j;
- check_vect ();
- for (i = 0; i < NINTS; ++i)
- for (j = 0; j < N; ++j)
- {
- g[i].g[j] = n++;
- __asm__ volatile ("");
- }
- if (addfst (g, NINTS) != RESULT)
- abort ();
- return 0;
-}
-
-/* We don't want to vectorize the single-element interleaving in the way
- we currently do that (without ignoring not needed vectors in the
- gap between gptr[0].g[0] and gptr[1].g[0]), because that's very
- sub-optimal and causes memory explosion (even though the cost model
- should reject that in the end). */
-
-/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */
-/* We end up using gathers for the strided load on RISC-V which would be OK. */
-/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */
+#include "tree-vect.h" + +#if VECTOR_BITS > 256 +#define NINTS (VECTOR_BITS / 32) +#else +#define NINTS 8 +#endif + +#define N (NINTS * 2) +#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS) + +extern void abort (void); + +typedef struct giga +{ + unsigned int g[N]; +} giga; + +unsigned long __attribute__((noinline,noclone)) +addfst(giga const *gptr, int num) +{ + unsigned int retval = 0; + int i; + for (i = 0; i < num; i++) + retval += gptr[i].g[0]; + return retval; +} + +int main () +{ + struct giga g[NINTS]; + unsigned int n = 1; + int i, j; + check_vect (); + for (i = 0; i < NINTS; ++i) + for (j = 0; j < N; ++j) + { + g[i].g[j] = n++; + __asm__ volatile (""); + } + if (addfst (g, NINTS) != RESULT) + abort (); + return 0; +} + +/* We don't want to vectorize the single-element interleaving in the way + we currently do that (without ignoring not needed vectors in the + gap between gptr[0].g[0] and gptr[1].g[0]), because that's very + sub-optimal and causes memory explosion (even though the cost model + should reject that in the end). */ + +/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */ +/* We should end up using gathers for the strided load on RISC-V. */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" { target { riscv*-*-* } } } } */ +/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */ diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 9b14b96..6967d50 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -2081,6 +2081,35 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, else *memory_access_type = VMAT_CONTIGUOUS; + /* If this is single-element interleaving with an element + distance that leaves unused vector loads around punt - we + at least create very sub-optimal code in that case (and + blow up memory, see PR65518). 
*/ + if (loop_vinfo + && *memory_access_type == VMAT_CONTIGUOUS + && single_element_p + && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype))) + { + if (SLP_TREE_LANES (slp_node) == 1) + { + *memory_access_type = VMAT_ELEMENTWISE; + overrun_p = false; + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "single-element interleaving not supported " + "for not adjacent vector loads, using " + "elementwise access\n"); + } + else + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "single-element interleaving not supported " + "for not adjacent vector loads\n"); + return false; + } + } + overrun_p = loop_vinfo && gap != 0; if (overrun_p && vls_type != VLS_LOAD) { @@ -2149,6 +2178,7 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, "Peeling for outer loop is not supported\n"); return false; } + /* Peeling for gaps assumes that a single scalar iteration is enough to make sure the last vector iteration doesn't access excess elements. */ @@ -2179,34 +2209,6 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, return false; } } - - /* If this is single-element interleaving with an element - distance that leaves unused vector loads around punt - we - at least create very sub-optimal code in that case (and - blow up memory, see PR65518). 
*/ - if (loop_vinfo - && *memory_access_type == VMAT_CONTIGUOUS - && single_element_p - && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype))) - { - if (SLP_TREE_LANES (slp_node) == 1) - { - *memory_access_type = VMAT_ELEMENTWISE; - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "single-element interleaving not supported " - "for not adjacent vector loads, using " - "elementwise access\n"); - } - else - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "single-element interleaving not supported " - "for not adjacent vector loads\n"); - return false; - } - } } } else |