author      Richard Biener <rguenther@suse.de>    2024-05-27 16:04:35 +0200
committer   Richard Biener <rguenther@suse.de>    2024-05-29 13:05:24 +0200
commit      f46eaad445e680034df51bd0dec4e6c7b1f372a4 (patch)
tree        dd1c04eef158c554d4cf5cb9af6856b8573ca008 /gcc
parent      1065a7db6f2a69770a85b4d53b9123b090dd1771 (diff)
tree-optimization/115252 - enhance peeling for gaps avoidance
Code generation for contiguous load vectorization can already avoid
loading from a gap in the general case.  The following extends the
detection of whether peeling for gaps is required to take advantage
of that, gets rid of the old special-casing of a half load, and makes
sure that peeling for gaps is enabled whenever we do access the gap.
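To illustrate the kind of access pattern involved, here is a minimal sketch (not part of the commit; the function name and constants are made up for illustration, and the new testcase in the diff below is the authoritative example).  Each group of sixteen consecutive elements is only partially used, so a full-width vector load of the last group would read into the unused gap unless the vectorizer either loads a narrower piece or relies on peeling for gaps.

/* Illustrative sketch only.  Of every 16-element group of src just
   elements 0..3 are used; elements 4..15 form the gap.  A full-vector
   load of the final group would read elements the scalar loop never
   touches, which is only safe if the vectorizer loads a smaller,
   zero-padded piece or peels the last iteration (peeling for gaps).  */
void
use_first_quarter (short *restrict dst, const unsigned char *restrict src,
                   int n)
{
  for (int i = 0; i < n; i++)
    {
      for (int j = 0; j < 4; j++)
        dst[i * 4 + j] = src[j];        /* 4 of 16 elements per group */
      src += 16;                        /* group size 16, gap of 12 */
    }
}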
PR tree-optimization/115252
* tree-vect-stmts.cc (get_group_load_store_type): Enhance detection
of the cases where we can avoid accessing a gap during code
generation (see the sketch after this list).
(vectorizable_load): Remove the old half-vector trick for avoiding
peeling for gaps, which is now redundant.  Add the gap-aligned case
where it is OK to access the gap.  Assert that peeling for gaps is
enabled when we do access a gap.
* gcc.dg/vect/slp-gap-1.c: New testcase.
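As a rough sketch of the arithmetic behind the enhanced get_group_load_store_type check (plain integers with made-up values; GCC itself uses poly_uint64 with can_div_trunc_p and constant_multiple_p, and additionally requires suitable alignment support and a vector_vector_composition_type): the vectorized loop really needs group_size * VF - gap scalar elements, the remainder of dividing that by the number of vector lanes is the lane count of the final partial load, and the gap can be avoided without peeling when that remainder is zero or divides the vector evenly.

#include <stdio.h>

/* Simplified model of the new overrun check; values are illustrative and
   the alignment/composition-type requirements are ignored.  Returns 1 when
   the final partial load can avoid the gap without peeling for gaps.  */
static int
gap_avoidable_p (unsigned group_size, unsigned gap, unsigned vf,
                 unsigned nunits)
{
  unsigned needed = group_size * vf - gap; /* elements the loop really uses */
  unsigned remain = needed % nunits;       /* lanes used by the last vector */
  if (remain == 0)
    return 1;                              /* last vector fully used */
  /* A smaller piece works if nunits is a multiple of remain, i.e. the
     vector can be composed from nunits / remain such pieces.  */
  return nunits % remain == 0;
}

int
main (void)
{
  /* E.g. group_size 16, gap 12, VF 4, 16-lane vectors: needed = 52,
     remain = 4, 16 % 4 == 0, so a quarter-vector load padded with zeros
     covers the tail without touching the gap.  */
  printf ("%d\n", gap_avoidable_p (16, 12, 4, 16));
  return 0;
}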
Diffstat (limited to 'gcc')
-rw-r--r--   gcc/testsuite/gcc.dg/vect/slp-gap-1.c   18
-rw-r--r--   gcc/tree-vect-stmts.cc                  58
2 files changed, 46 insertions, 30 deletions
diff --git a/gcc/testsuite/gcc.dg/vect/slp-gap-1.c b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c
new file mode 100644
index 0000000..36463ca
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+
+typedef unsigned char uint8_t;
+typedef short int16_t;
+void pixel_sub_wxh(int16_t * __restrict diff, uint8_t *pix1, uint8_t *pix2) {
+  for (int y = 0; y < 4; y++) {
+    for (int x = 0; x < 4; x++)
+      diff[x + y * 4] = pix1[x] - pix2[x];
+    pix1 += 16;
+    pix2 += 32;
+  }
+}
+
+/* We can vectorize this without peeling for gaps and thus without epilogue,
+   but the only thing we can reliably scan is the zero-padding trick for the
+   partial loads.  */
+/* { dg-final { scan-tree-dump-times "\{_\[0-9\]\+, 0" 6 "vect" { target vect64 } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 4219ad8..935d80f 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2072,16 +2072,22 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
       dr_alignment_support alss;
       int misalign = dr_misalignment (first_dr_info, vectype);
       tree half_vtype;
+      poly_uint64 remain;
+      unsigned HOST_WIDE_INT tem, num;
       if (overrun_p
           && !masked_p
           && (((alss = vect_supportable_dr_alignment (vinfo, first_dr_info,
                                                       vectype, misalign)))
                == dr_aligned
               || alss == dr_unaligned_supported)
-          && known_eq (nunits, (group_size - gap) * 2)
-          && known_eq (nunits, group_size)
-          && (vector_vector_composition_type (vectype, 2, &half_vtype)
-              != NULL_TREE))
+          && can_div_trunc_p (group_size
+                              * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap,
+                              nunits, &tem, &remain)
+          && (known_eq (remain, 0u)
+              || (constant_multiple_p (nunits, remain, &num)
+                  && (vector_vector_composition_type (vectype, num,
+                                                      &half_vtype)
+                      != NULL_TREE))))
         overrun_p = false;
 
       if (overrun_p && !can_overrun_p)
@@ -11513,33 +11519,14 @@ vectorizable_load (vec_info *vinfo,
                     unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
                     unsigned int vect_align
                       = vect_known_alignment_in_bytes (first_dr_info, vectype);
-                    unsigned int scalar_dr_size
-                      = vect_get_scalar_dr_size (first_dr_info);
-                    /* If there's no peeling for gaps but we have a gap
-                       with slp loads then load the lower half of the
-                       vector only.  See get_group_load_store_type for
-                       when we apply this optimization.  */
-                    if (slp
-                        && loop_vinfo
-                        && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0
-                        && known_eq (nunits, (group_size - gap) * 2)
-                        && known_eq (nunits, group_size)
-                        && gap >= (vect_align / scalar_dr_size))
-                      {
-                        tree half_vtype;
-                        new_vtype
-                          = vector_vector_composition_type (vectype, 2,
-                                                            &half_vtype);
-                        if (new_vtype != NULL_TREE)
-                          ltype = half_vtype;
-                      }
                     /* Try to use a single smaller load when we are about
                        to load excess elements compared to the unrolled
-                       scalar loop.
-                       ???  This should cover the above case as well.  */
-                    else if (known_gt ((vec_num * j + i + 1) * nunits,
+                       scalar loop.  */
+                    if (known_gt ((vec_num * j + i + 1) * nunits,
                                        (group_size * vf - gap)))
                       {
+                        poly_uint64 remain = ((group_size * vf - gap)
+                                              - (vec_num * j + i) * nunits);
                         if (known_ge ((vec_num * j + i + 1) * nunits
                                       - (group_size * vf - gap), nunits))
                           /* DR will be unused.  */
@@ -11551,11 +11538,15 @@ vectorizable_load (vec_info *vinfo,
                              at least one element is accessed in the
                              scalar loop.  */
                           ;
+                        else if (known_gt (vect_align,
+                                           ((nunits - remain)
+                                            * vect_get_scalar_dr_size
+                                                (first_dr_info))))
+                          /* Aligned access to the gap area when there's
+                             at least one element in it is OK.  */
+                          ;
                         else
                           {
-                            auto remain
-                              = ((group_size * vf - gap)
-                                 - (vec_num * j + i) * nunits);
                             /* remain should now be > 0 and < nunits.  */
                             unsigned num;
                             if (constant_multiple_p (nunits, remain, &num))
@@ -11569,6 +11560,13 @@ vectorizable_load (vec_info *vinfo,
                                     ltype = ptype;
                                   }
                                 /* Else use multiple loads or a masked load?  */
+                                /* For loop vectorization we now should have
+                                   an alternate type or LOOP_VINFO_PEELING_FOR_GAPS
+                                   set.  */
+                                if (loop_vinfo)
+                                  gcc_assert (new_vtype
+                                              || LOOP_VINFO_PEELING_FOR_GAPS
+                                                   (loop_vinfo));
                           }
                       }
                     tree offset