| author | Tamar Christina <tamar.christina@arm.com> | 2025-04-16 13:09:05 +0100 |
|---|---|---|
| committer | Tamar Christina <tamar.christina@arm.com> | 2025-04-16 13:09:05 +0100 |
| commit | 46ccce1de686c1b437eff43431dc20d20d4687c0 | |
| tree | 3d6bd3f717931c1669431f7effb17e0cc6ea87d7 /gcc/tree-vectorizer.h | |
| parent | 473dde525248a694c0f4e62b31a7fc24b238c5b0 | |
middle-end: Fix incorrect codegen with PFA and VLS [PR119351]
The following example:
#define N 512
#define START 2
#define END 505
int x[N] __attribute__((aligned(32)));
int __attribute__((noipa))
foo (void)
{
  for (signed int i = START; i < END; ++i)
    {
      if (x[i] == 0)
        return i;
    }
  return -1;
}
generates incorrect code with fixed-length SVE because, for an early break, we
need to know which value to start the scalar loop with if we take an early exit.
Historically this meant that we take the first element of every induction.
This is because there is an assumption in place that, even with masked loops,
the masks come from a whilel* instruction.
As such we reduce using a BIT_FIELD_REF <, 0>.
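Purely as an illustration (plain C standing in for the vector code; the lane
values below are invented, not taken from the PR), the assumption can be
pictured like this: a whilel*-style mask always has its active lanes at the
front, so lane 0 of the vector IV, i.e. the BIT_FIELD_REF <, 0> reduction, is
always a valid restart value for the scalar loop.

/* Illustrative sketch only: scalar C model of the whilel* assumption.
   Lane values are invented for demonstration.  */
#include <stdio.h>

int
main (void)
{
  int vect_iv[4] = { 2, 3, 4, 5 };  /* vector IV in the iteration that exits  */
  int mask[4]    = { 1, 1, 1, 0 };  /* whilel*-style mask: active lanes lead  */

  /* The old reduction: BIT_FIELD_REF <vect_iv, 0>.  Safe here because the
     whilel* shape guarantees lane 0 is active.  */
  int restart = vect_iv[0];
  printf ("mask[0] = %d, scalar loop restarts at i = %d\n", mask[0], restart);
  return 0;
}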
When PFA was added this assumption was correct for non-masked loops; however,
we assumed that PFA for VLA wouldn't work for now and disabled it using the
alignment requirement checks. We also expected VLS to peel for alignment using
scalar loops.
However, as this PR shows, for VLS the vectorizer can, and in some
circumstances does, choose to peel using masks by masking the first iteration
of the loop with an additional alignment mask.
When this is done, the first elements of the predicate can be inactive. In this
example element 1 is inactive based on the calculated misalignment, hence the
-1 value in the first vector IV element.
When we reduce using BIT_FIELD_REF we get the wrong value.
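Again as an invented illustration (the concrete lane values are not taken from
the PR), once the first iteration is masked for alignment the leading lanes can
be inactive, and lane 0 of the IV no longer holds a value the scalar loop may
restart from.

/* Illustrative sketch only: an alignment (PFA) mask on the first iteration
   can switch off the leading lanes, so lane 0 of the IV is not a valid
   restart value.  Lane values are invented for demonstration.  */
#include <stdio.h>

int
main (void)
{
  int vect_iv[4]  = { -1, 0, 1, 2 }; /* IV starts before START to reach alignment */
  int pfa_mask[4] = { 0, 0, 0, 1 };  /* leading lanes inactive due to misalignment */

  int restart = vect_iv[0];          /* BIT_FIELD_REF <vect_iv, 0> */
  printf ("lane 0 active? %d, BIT_FIELD_REF gives i = %d (wrong)\n",
          pfa_mask[0], restart);
  return 0;
}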
This patch fixes it by creating a new scalar PHI that keeps track of whether
we are in the first iteration of the loop (with the additional masking) or
whether we have already taken a loop iteration.
The generated sequence:
pre-header:
  bb1:
    i_1 = <number of leading inactive elements>

header:
  bb2:
    i_2 = PHI <i_1(bb1), 0(latch)>
    …

early-exit:
  bb3:
    i_3 = iv_step * i_2 + PHI<vector-iv>
This eliminates the need to do an expensive mask-based reduction.
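A rough scalar model of the adjustment (again with invented values; i_1, i_2
and i_3 correspond to the names in the sequence above): the new scalar PHI
carries the number of leading inactive lanes only while we are in the first,
masked iteration, so on an early exit the restart value is simply lane 0 of
the vector IV advanced by that many steps.

/* Illustrative sketch only: scalar model of the new adjustment.
   n_inactive (i_1), first_iter and the lane value are invented.  */
#include <stdio.h>

int
main (void)
{
  int n_inactive = 3;   /* i_1: leading inactive lanes in the masked iteration */
  int iv_step = 1;      /* step of the induction variable                      */
  int first_iter = 1;   /* are we still in the first (masked) iteration?       */
  int vect_iv0 = -1;    /* lane 0 of the vector IV when the exit is taken      */

  int i_2 = first_iter ? n_inactive : 0;  /* the new scalar PHI                */
  int i_3 = iv_step * i_2 + vect_iv0;     /* adjusted restart value            */
  printf ("scalar loop restarts at i = %d\n", i_3);  /* -1 + 1*3 = 2 == START  */
  return 0;
}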
This fixes gromacs with one OpenMP thread, but with more than one thread there is still an issue.
gcc/ChangeLog:
PR tree-optimization/119351
* tree-vectorizer.h (LOOP_VINFO_MASK_NITERS_PFA_OFFSET,
LOOP_VINFO_NON_LINEAR_IV): New.
(class _loop_vec_info): Add mask_skip_niters_pfa_offset and
nonlinear_iv.
* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize them.
(vect_analyze_scalar_cycles_1): Record non-linear inductions.
(vectorizable_induction): If early break and PFA using masking, create a
new phi which tracks where the scalar code needs to start...
(vectorizable_live_operation): ...and generate the adjustments here.
(vect_use_loop_mask_for_alignment_p): Reject non-linear inductions and
early break needing peeling.
gcc/testsuite/ChangeLog:
PR tree-optimization/119351
* gcc.target/aarch64/sve/peel_ind_10.c: New test.
* gcc.target/aarch64/sve/peel_ind_10_run.c: New test.
* gcc.target/aarch64/sve/peel_ind_5.c: New test.
* gcc.target/aarch64/sve/peel_ind_5_run.c: New test.
* gcc.target/aarch64/sve/peel_ind_6.c: New test.
* gcc.target/aarch64/sve/peel_ind_6_run.c: New test.
* gcc.target/aarch64/sve/peel_ind_7.c: New test.
* gcc.target/aarch64/sve/peel_ind_7_run.c: New test.
* gcc.target/aarch64/sve/peel_ind_8.c: New test.
* gcc.target/aarch64/sve/peel_ind_8_run.c: New test.
* gcc.target/aarch64/sve/peel_ind_9.c: New test.
* gcc.target/aarch64/sve/peel_ind_9_run.c: New test.
Diffstat (limited to 'gcc/tree-vectorizer.h')
-rw-r--r-- | gcc/tree-vectorizer.h | 18 |
1 file changed, 17 insertions, 1 deletion
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 97caf61..01d19c7 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -818,6 +818,11 @@ public:
      elements that should be false in the first mask).  */
   tree mask_skip_niters;
 
+  /* If we are using a loop mask to align memory addresses and we're in an
+     early break loop then this variable contains the number of elements that
+     were skipped during the initial iteration of the loop.  */
+  tree mask_skip_niters_pfa_offset;
+
   /* The type that the loop control IV should be converted to before testing
      which of the VF scalars are active and inactive.
      Only meaningful if LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
@@ -854,6 +859,9 @@ public:
   /* The mask used to check the alignment of pointers or arrays.  */
   int ptr_mask;
 
+  /* Indicates whether the loop has any non-linear IV.  */
+  bool nonlinear_iv;
+
   /* Data Dependence Relations defining address ranges that are candidates
      for a run-time aliasing check.  */
   auto_vec<ddr_p> may_alias_ddrs;
@@ -1064,6 +1072,7 @@ public:
 #define LOOP_VINFO_MASKS(L) (L)->masks
 #define LOOP_VINFO_LENS(L) (L)->lens
 #define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters
+#define LOOP_VINFO_MASK_NITERS_PFA_OFFSET(L) (L)->mask_skip_niters_pfa_offset
 #define LOOP_VINFO_RGROUP_COMPARE_TYPE(L) (L)->rgroup_compare_type
 #define LOOP_VINFO_RGROUP_IV_TYPE(L) (L)->rgroup_iv_type
 #define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style
@@ -1073,6 +1082,7 @@ public:
 #define LOOP_VINFO_DDRS(L) (L)->shared->ddrs
 #define LOOP_VINFO_INT_NITERS(L) (TREE_INT_CST_LOW ((L)->num_iters))
 #define LOOP_VINFO_PEELING_FOR_ALIGNMENT(L) (L)->peeling_for_alignment
+#define LOOP_VINFO_NON_LINEAR_IV(L) (L)->nonlinear_iv
 #define LOOP_VINFO_UNALIGNED_DR(L) (L)->unaligned_dr
 #define LOOP_VINFO_MAY_MISALIGN_STMTS(L) (L)->may_misalign_stmts
 #define LOOP_VINFO_MAY_ALIAS_DDRS(L) (L)->may_alias_ddrs
@@ -2138,8 +2148,14 @@ unlimited_cost_model (loop_p loop)
 inline bool
 vect_use_loop_mask_for_alignment_p (loop_vec_info loop_vinfo)
 {
+  /* With early break vectorization we don't know whether the accesses will stay
+     inside the loop or not.  TODO: The early break adjustment code can be
+     implemented the same way as vectorizable_linear_induction.  However we
+     can't test this today so reject it.  */
   return (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
+          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+          && !(LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
+               && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)));
 }
 
 /* Return the number of vectors of type VECTYPE that are needed to get