author     Richard Biener <rguenther@suse.de>   2024-05-27 16:04:35 +0200
committer  Richard Biener <rguenther@suse.de>   2024-05-29 13:05:24 +0200
commit     f46eaad445e680034df51bd0dec4e6c7b1f372a4 (patch)
tree       dd1c04eef158c554d4cf5cb9af6856b8573ca008
parent     1065a7db6f2a69770a85b4d53b9123b090dd1771 (diff)
tree-optimization/115252 - enhance peeling for gaps avoidance
Code generation for contiguous load vectorization can already deal with
generalized avoidance of loading from a gap.  The following extends the
detection of the peeling-for-gaps requirement with that, gets rid of the
old special casing of a half load and makes sure that when we do access
the gap we have peeling for gaps enabled.

        PR tree-optimization/115252
        * tree-vect-stmts.cc (get_group_load_store_type): Enhance
        detecting the number of cases where we can avoid accessing a gap
        during code generation.
        (vectorizable_load): Remove old half-vector peeling for gap
        avoidance which is now redundant.  Add gap-aligned case where
        it's OK to access the gap.  Add assert that we have peeling for
        gaps enabled when we access a gap.

        * gcc.dg/vect/slp-gap-1.c: New testcase.
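For intuition, the new condition in get_group_load_store_type roughly amounts to the scalar arithmetic sketched below.  This is an illustrative standalone C program, not GCC code: it ignores poly_int arithmetic, the alignment and masking guards, and the vector_vector_composition_type availability check, and the numbers in main are assumed values chosen only for illustration (loosely modeled on the pix1 access in the new testcase).

    /* Illustrative sketch only, not GCC code: a scalar analogue of the
       new overrun check.  The real code works on poly_int values via
       can_div_trunc_p/constant_multiple_p and additionally requires a
       suitable vector_vector_composition_type.  */
    #include <stdio.h>

    static int
    gap_overrun_avoidable (unsigned group_size, unsigned vf,
                           unsigned gap, unsigned nunits)
    {
      /* Elements of the last vector that are actually needed.  */
      unsigned remain = (group_size * vf - gap) % nunits;
      /* remain == 0: the vectors we load cover exactly the needed
         elements, so no gap element is touched.  Otherwise a partial
         load of 'remain' elements can be composed into a full vector
         iff remain divides nunits.  */
      return remain == 0 || nunits % remain == 0;
    }

    int
    main (void)
    {
      /* Assumed numbers: 4 of 16 elements used per group (gap 12),
         8-element vectors, unroll factor 1.  The last piece has 4
         elements, 4 divides 8, so no peeling for gaps is required.  */
      printf ("%d\n", gap_overrun_avoidable (16, 1, 12, 8));
      return 0;
    }

When remain is non-zero and divides nunits, vectorizable_load can load just the remaining elements and zero-pad them to a full vector, which is the pattern the new testcase scans for.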
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-gap-1.c | 18
-rw-r--r--  gcc/tree-vect-stmts.cc                | 58
2 files changed, 46 insertions(+), 30 deletions(-)
diff --git a/gcc/testsuite/gcc.dg/vect/slp-gap-1.c b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c
new file mode 100644
index 0000000..36463ca
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+
+typedef unsigned char uint8_t;
+typedef short int16_t;
+void pixel_sub_wxh(int16_t * __restrict diff, uint8_t *pix1, uint8_t *pix2) {
+ for (int y = 0; y < 4; y++) {
+ for (int x = 0; x < 4; x++)
+ diff[x + y * 4] = pix1[x] - pix2[x];
+ pix1 += 16;
+ pix2 += 32;
+ }
+}
+
+/* We can vectorize this without peeling for gaps and thus without epilogue,
+ but the only thing we can reliably scan is the zero-padding trick for the
+ partial loads. */
+/* { dg-final { scan-tree-dump-times "\{_\[0-9\]\+, 0" 6 "vect" { target vect64 } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 4219ad8..935d80f 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2072,16 +2072,22 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
dr_alignment_support alss;
int misalign = dr_misalignment (first_dr_info, vectype);
tree half_vtype;
+ poly_uint64 remain;
+ unsigned HOST_WIDE_INT tem, num;
if (overrun_p
&& !masked_p
&& (((alss = vect_supportable_dr_alignment (vinfo, first_dr_info,
vectype, misalign)))
== dr_aligned
|| alss == dr_unaligned_supported)
- && known_eq (nunits, (group_size - gap) * 2)
- && known_eq (nunits, group_size)
- && (vector_vector_composition_type (vectype, 2, &half_vtype)
- != NULL_TREE))
+ && can_div_trunc_p (group_size
+ * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap,
+ nunits, &tem, &remain)
+ && (known_eq (remain, 0u)
+ || (constant_multiple_p (nunits, remain, &num)
+ && (vector_vector_composition_type (vectype, num,
+ &half_vtype)
+ != NULL_TREE))))
overrun_p = false;

if (overrun_p && !can_overrun_p)
@@ -11513,33 +11519,14 @@ vectorizable_load (vec_info *vinfo,
unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
unsigned int vect_align
= vect_known_alignment_in_bytes (first_dr_info, vectype);
- unsigned int scalar_dr_size
- = vect_get_scalar_dr_size (first_dr_info);
- /* If there's no peeling for gaps but we have a gap
- with slp loads then load the lower half of the
- vector only. See get_group_load_store_type for
- when we apply this optimization. */
- if (slp
- && loop_vinfo
- && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0
- && known_eq (nunits, (group_size - gap) * 2)
- && known_eq (nunits, group_size)
- && gap >= (vect_align / scalar_dr_size))
- {
- tree half_vtype;
- new_vtype
- = vector_vector_composition_type (vectype, 2,
- &half_vtype);
- if (new_vtype != NULL_TREE)
- ltype = half_vtype;
- }
/* Try to use a single smaller load when we are about
to load excess elements compared to the unrolled
- scalar loop.
- ??? This should cover the above case as well. */
- else if (known_gt ((vec_num * j + i + 1) * nunits,
+ scalar loop. */
+ if (known_gt ((vec_num * j + i + 1) * nunits,
(group_size * vf - gap)))
{
+ poly_uint64 remain = ((group_size * vf - gap)
+ - (vec_num * j + i) * nunits);
if (known_ge ((vec_num * j + i + 1) * nunits
- (group_size * vf - gap), nunits))
/* DR will be unused. */
@@ -11551,11 +11538,15 @@ vectorizable_load (vec_info *vinfo,
at least one element is accessed in the
scalar loop. */
;
+ else if (known_gt (vect_align,
+ ((nunits - remain)
+ * vect_get_scalar_dr_size
+ (first_dr_info))))
+ /* Aligned access to the gap area when there's
+ at least one element in it is OK. */
+ ;
else
{
- auto remain
- = ((group_size * vf - gap)
- - (vec_num * j + i) * nunits);
/* remain should now be > 0 and < nunits. */
unsigned num;
if (constant_multiple_p (nunits, remain, &num))
@@ -11569,6 +11560,13 @@ vectorizable_load (vec_info *vinfo,
ltype = ptype;
}
/* Else use multiple loads or a masked load? */
+ /* For loop vectorization we now should have
+ an alternate type or LOOP_VINFO_PEELING_FOR_GAPS
+ set. */
+ if (loop_vinfo)
+ gcc_assert (new_vtype
+ || LOOP_VINFO_PEELING_FOR_GAPS
+ (loop_vinfo));
}
}
tree offset