author     Pengfei Li <Pengfei.Li2@arm.com>    2025-08-07 11:08:35 +0000
committer  Pengfei Li <Pengfei.Li2@arm.com>    2025-08-07 11:10:10 +0000
commit     eee51f9a4b6e584230f75e4616438bb5ad5935a9 (patch)
tree       5b19bca0a9b0b84b5eaeced1ca649a8023cf53cb /gcc/tree-vect-loop-manip.cc
parent     b7fd1fe6cf3df1eed2a80a9cf8b812522ecba8d8 (diff)
vect: Extend peeling and versioning for alignment to VLA modes
This patch extends the support for peeling and versioning for alignment
from VLS modes to VLA modes. The key change is allowing the DR target
alignment to be a non-constant poly_int. Since the value must be a power
of two, the power-of-two check for variable VFs is deferred to runtime
through loop versioning. The vectorizable check for speculative loads is
also refactored in this patch to handle both constant and variable
target alignment values.

Additional changes for VLA modes include:

1) Peeling

   In VLA modes, we use peeling with masking - using a partial vector
   in the first iteration of the vectorized loop to ensure aligned DRs
   in subsequent iterations. This was already enabled for VLS modes to
   avoid scalar peeling. This patch reuses most of the existing logic
   and fixes a small issue of an incorrect IV offset in the VLA code
   path. It also removes a power-of-two rounding when computing the
   number of iterations to peel, as a power-of-two VF is now guaranteed
   by a new runtime check.

2) Versioning

   The type of the mask for the runtime alignment check is updated to
   poly_int to support variable VFs. After this change, both standalone
   versioning and peeling with versioning are available in VLA modes.
   This patch also introduces another runtime check on the speculative
   read amount, to ensure that all speculative loads remain within the
   current valid memory page. We plan to remove these runtime checks in
   the future by introducing a capped VF - using partial vectors to
   limit the actual VF value at runtime.

3) Speculative read flag

   With a variable target alignment, DRs whose scalar accesses are
   known to be in-bounds would be considered unsupported as unaligned
   accesses. But in fact, speculative reads can be naturally avoided
   for in-bounds DRs as long as partial vectors are used. Therefore,
   this patch clears the speculative flags and sets the "must use
   partial vectors" flag for these cases.

This patch is bootstrapped and regression-tested on x86_64-linux-gnu,
arm-linux-gnueabihf and aarch64-linux-gnu with bootstrap-O3.

gcc/ChangeLog:

	* tree-vect-data-refs.cc (vect_compute_data_ref_alignment): Allow
	DR target alignment to be a poly_int.
	(vect_enhance_data_refs_alignment): Support peeling and versioning
	for VLA modes.
	* tree-vect-loop-manip.cc (get_misalign_in_elems): Remove
	power-of-two rounding in peeling.
	(vect_create_cond_for_align_checks): Update alignment check logic
	for poly_int mask.
	(vect_create_cond_for_vla_spec_read): New runtime checks.
	(vect_loop_versioning): Support new runtime checks.
	* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Add a new
	loop_vinfo field.
	(vectorizable_induction): Fix wrong IV offset issue.
	* tree-vect-stmts.cc (get_load_store_type): Refactor vectorizable
	checks for speculative loads.
	* tree-vectorizer.h (LOOP_VINFO_MAX_SPEC_READ_AMOUNT): New macro
	for new runtime checks.
	(LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ): Likewise.
	(LOOP_REQUIRES_VERSIONING): Update macro for new runtime checks.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/peel_ind_11.c: New test.
	* gcc.target/aarch64/sve/peel_ind_11_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_12.c: New test.
	* gcc.target/aarch64/sve/peel_ind_12_run.c: New test.
	* gcc.target/aarch64/sve/peel_ind_13.c: New test.
	* gcc.target/aarch64/sve/peel_ind_13_run.c: New test.
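To make the new versioning guard concrete, the sketch below is a minimal
scalar model of the two conditions that vect_create_cond_for_vla_spec_read
chains into the guard. The function name and the "amount" and
"min_pagesize" parameters (standing in for LOOP_VINFO_MAX_SPEC_READ_AMOUNT
and param_min_pagesize) are illustrative assumptions, not code from the
patch:

    /* Hypothetical scalar model of the VLA speculative-read guard; not
       part of the patch.  "amount" models the runtime value of
       LOOP_VINFO_MAX_SPEC_READ_AMOUNT and "min_pagesize" models
       param_min_pagesize.  */
    static bool
    vla_spec_read_guard_model (unsigned long amount,
			       unsigned long min_pagesize)
    {
      /* "amount & (amount - 1) == 0": amount is a power of two.  Since
	 amount is a constant power-of-two multiple of the VF, this also
	 proves the runtime VF is a power of two.  */
      bool vf_pow2_p = (amount & (amount - 1)) == 0;

      /* "amount <= min_pagesize": all speculative loads stay within one
	 minimum-size page, so they cannot fault beyond a valid page.  */
      bool fits_in_page_p = amount <= min_pagesize;

      /* The vectorized loop version runs only when both checks pass;
	 otherwise the scalar loop is taken.  */
      return vf_pow2_p && fits_in_page_p;
    }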
Diffstat (limited to 'gcc/tree-vect-loop-manip.cc')
-rw-r--r--  gcc/tree-vect-loop-manip.cc | 71
1 file changed, 63 insertions(+), 8 deletions(-)
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 6c1b26a..566308f 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -2454,10 +2454,7 @@ get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
else
{
tree vla = build_int_cst (type, target_align);
- tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
- fold_build2 (MINUS_EXPR, type,
- build_int_cst (type, 0), vla));
- target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
+ target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla,
build_int_cst (type, 1));
}
@@ -3840,7 +3837,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
const vec<stmt_vec_info> &may_misalign_stmts
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
stmt_vec_info stmt_info;
- int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
+ poly_uint64 mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
tree mask_cst;
unsigned int i;
tree int_ptrsize_type;
@@ -3852,9 +3849,7 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
tree ptrsize_zero;
tree part_cond_expr;
- /* Check that mask is one less than a power of 2, i.e., mask is
- all zeros followed by all ones. */
- gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
+ gcc_assert (known_ne (mask, 0U));
int_ptrsize_type = signed_type_for (ptr_type_node);
@@ -3962,6 +3957,62 @@ vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
chain_cond_expr (cond_expr, part_cond_expr);
}
+/* Function vect_create_cond_for_vla_spec_read.
+
+ Create a conditional expression that represents the run-time checks with
+ max speculative read amount in VLA modes. We check two things:
+ 1) if the max speculative read amount exceeds the min page size
+ 2) if the VF is power-of-2 - done by checking the max read amount instead
+
+ Input:
+ COND_EXPR - input conditional expression. New conditions will be chained
+ with logical AND operation.
+ LOOP_VINFO - field LOOP_VINFO_MAX_SPEC_READ_AMOUNT contains the max
+ possible speculative read amount in VLA modes.
+
+ Output:
+ COND_EXPR - conditional expression.
+
+ The returned COND_EXPR is the conditional expression to be used in the
+ if statement that controls which version of the loop gets executed at
+ runtime. */
+
+static void
+vect_create_cond_for_vla_spec_read (loop_vec_info loop_vinfo, tree *cond_expr)
+{
+ poly_uint64 read_amount_poly = LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo);
+ tree amount = build_int_cst (long_unsigned_type_node, read_amount_poly);
+
+ /* Both the read amount and the VF must be variants, and the read amount must
+ be a constant power-of-2 multiple of the VF. */
+ unsigned HOST_WIDE_INT multiple;
+ gcc_assert (!read_amount_poly.is_constant ()
+ && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
+ && constant_multiple_p (read_amount_poly,
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+ &multiple)
+ && pow2p_hwi (multiple));
+
+ tree cst_ul_zero = build_int_cstu (long_unsigned_type_node, 0U);
+ tree cst_ul_one = build_int_cstu (long_unsigned_type_node, 1U);
+ tree cst_ul_pagesize = build_int_cstu (long_unsigned_type_node,
+ (unsigned long) param_min_pagesize);
+
+ /* Create an expression of "amount & (amount - 1) == 0". */
+ tree amount_m1 = fold_build2 (MINUS_EXPR, long_unsigned_type_node,
+ amount, cst_ul_one);
+ tree amount_and_expr = fold_build2 (BIT_AND_EXPR, long_unsigned_type_node,
+ amount, amount_m1);
+ tree powof2_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
+ amount_and_expr, cst_ul_zero);
+ chain_cond_expr (cond_expr, powof2_cond_expr);
+
+ /* Create an expression of "amount <= cst_ul_pagesize". */
+ tree pagesize_cond_expr = fold_build2 (LE_EXPR, boolean_type_node,
+ amount, cst_ul_pagesize);
+ chain_cond_expr (cond_expr, pagesize_cond_expr);
+}
+
/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
@@ -4087,6 +4138,7 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
gimple_seq gimplify_stmt_list = NULL;
tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
+ bool version_spec_read = LOOP_REQUIRES_VERSIONING_FOR_SPEC_READ (loop_vinfo);
bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
poly_uint64 versioning_threshold
@@ -4145,6 +4197,9 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
&cond_expr_stmt_list);
+ if (version_spec_read)
+ vect_create_cond_for_vla_spec_read (loop_vinfo, &cond_expr);
+
if (version_alias)
{
vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
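As a footnote to the first hunk above: the deleted lines built the mask
from vla & -vla, which isolates the lowest set bit of the runtime
alignment (the largest power of two guaranteed to divide it) before
subtracting one. The new runtime guard makes that rounding redundant. A
scalar illustration with hypothetical helper names, not patch code:

    /* Illustration only: the misalignment mask computed by
       get_misalign_in_elems, before and after this change.  */
    static unsigned long
    mask_before (unsigned long target_align)
    {
      /* target_align & -target_align isolates the lowest set bit,
	 i.e. the largest power of two dividing target_align, and the
	 mask is built from that.  */
      return (target_align & -target_align) - 1;
    }

    static unsigned long
    mask_after (unsigned long target_align)
    {
      /* The new runtime check already guarantees that target_align is
	 a power of two, so the rounding step can be dropped.  */
      return target_align - 1;
    }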