author     Kugan Vivekanandarajah <kuganv@linaro.org>   2019-06-13 03:34:28 +0000
committer  Kugan Vivekanandarajah <kugan@gcc.gnu.org>   2019-06-13 03:34:28 +0000
commit     9b884225bfc609606f9b169b021c4da93feba48e (patch)
tree       28c9e494f42ea0d6f529cb8033d031f8f2b46f04 /gcc/tree-vect-loop.c
parent     fa9863e7d34ecd011ae75083be2ae124e5831b64 (diff)
re PR target/88838 ([SVE] Use 32-bit WHILELO in LP64 mode)
gcc/ChangeLog:
2019-06-13 Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
PR target/88838
* tree-vect-loop-manip.c (vect_set_loop_masks_directly): If the
compare_type does not have Pmode size, create an IV with Pmode size
and truncate its uses (i.e. convert them to the required type); see
the illustrative loop after the ChangeLog.
* tree-vect-loop.c (vect_verify_full_masking): Find IV type.
(vect_iv_limit_for_full_masking): New. Factored out of
vect_set_loop_condition_masked.
* tree-vectorizer.h (LOOP_VINFO_MASK_IV_TYPE): New.
(vect_iv_limit_for_full_masking): Declare.
gcc/testsuite/ChangeLog:
2019-06-13 Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
PR target/88838
* gcc.target/aarch64/pr88838.c: New test.
* gcc.target/aarch64/sve/while_1.c: Adjust.
From-SVN: r272233
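For context, PR target/88838 concerns loops of roughly the following shape
(an illustrative sketch only, not necessarily the exact new test in
gcc.target/aarch64/pr88838.c). Before this change, the fully-masked SVE loop
compared the counter in a Pmode-sized (64-bit) type, producing a 64-bit
WHILELO; with it, the comparison can stay 32-bit while the IV itself is
created with Pmode size and truncated at its uses:

/* Illustrative loop: with a 32-bit element count in LP64 mode, the
   loop guard can now be a 32-bit WHILELO (W registers) rather than
   a 64-bit one (X registers).  */
void
f (int *restrict x, int n)
{
  for (int i = 0; i < n; ++i)
    x[i] = i;
}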
Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r--  gcc/tree-vect-loop.c | 85
1 file changed, 80 insertions(+), 5 deletions(-)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 4942c69..671ef2f 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1030,6 +1030,8 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int min_ni_width;
+  unsigned int max_nscalars_per_iter
+    = vect_get_max_nscalars_per_iter (loop_vinfo);
 
   /* Use a normal loop if there are no statements that need masking.
      This only happens in rare degenerate cases: it means that the loop
@@ -1048,7 +1050,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     max_ni = wi::smin (max_ni, max_back_edges + 1);
 
   /* Account for rgroup masks, in which each bit is replicated N times.  */
-  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
+  max_ni *= max_nscalars_per_iter;
 
   /* Work out how many bits we need to represent the limit.  */
   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
@@ -1056,6 +1058,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   /* Find a scalar mode for which WHILE_ULT is supported.  */
   opt_scalar_int_mode cmp_mode_iter;
   tree cmp_type = NULL_TREE;
+  tree iv_type = NULL_TREE;
+  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+  widest_int iv_precision = UINT_MAX;
+
+  if (iv_limit != -1)
+    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
+                                      UNSIGNED);
+
   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
     {
       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
@@ -1067,10 +1077,32 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
           && can_produce_all_loop_masks_p (loop_vinfo, this_type))
         {
           /* Although we could stop as soon as we find a valid mode,
-             it's often better to continue until we hit Pmode, since the
-             operands to the WHILE are more likely to be reusable in
-             address calculations.  */
-          cmp_type = this_type;
+             there are at least two reasons why that's not always the
+             best choice:
+
+             - An IV that's Pmode or wider is more likely to be reusable
+               in address calculations than an IV that's narrower than
+               Pmode.
+
+             - Doing the comparison in IV_PRECISION or wider allows
+               a natural 0-based IV, whereas using a narrower comparison
+               type requires mitigations against wrap-around.
+
+             Conversely, if the IV limit is variable, doing the comparison
+             in a wider type than the original type can introduce
+             unnecessary extensions, so picking the widest valid mode
+             is not always a good choice either.
+
+             Here we prefer the first IV type that's Pmode or wider,
+             and the first comparison type that's IV_PRECISION or wider.
+             (The comparison type must be no wider than the IV type,
+             to avoid extensions in the vector loop.)
+
+             ??? We might want to try continuing beyond Pmode for ILP32
+             targets if CMP_BITS < IV_PRECISION.  */
+          iv_type = this_type;
+          if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
+            cmp_type = this_type;
           if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
             break;
         }
@@ -1081,6 +1113,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
     return false;
 
   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
   return true;
 }
 
@@ -9014,3 +9047,45 @@ optimize_mask_stores (struct loop *loop)
       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
     }
 }
+
+/* Decide whether it is possible to use a zero-based induction variable
+   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
+   return the value that the induction variable must be able to hold
+   in order to ensure that the loop ends with an all-false mask.
+   Return -1 otherwise.  */
+widest_int
+vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+{
+  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+
+  /* Calculate the value that the induction variable must be able
+     to hit in order to ensure that we end the loop with an all-false mask.
+     This involves adding the maximum number of inactive trailing scalar
+     iterations.  */
+  widest_int iv_limit = -1;
+  if (max_loop_iterations (loop, &iv_limit))
+    {
+      if (niters_skip)
+        {
+          /* Add the maximum number of skipped iterations to the
+             maximum iteration count.  */
+          if (TREE_CODE (niters_skip) == INTEGER_CST)
+            iv_limit += wi::to_widest (niters_skip);
+          else
+            iv_limit += max_vf - 1;
+        }
+      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+        /* Make a conservatively-correct assumption.  */
+        iv_limit += max_vf - 1;
+
+      /* IV_LIMIT is the maximum number of latch iterations, which is also
+         the maximum in-range IV value.  Round this value down to the previous
+         vector alignment boundary and then add an extra full iteration.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
+    }
+  return iv_limit;
+}
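The two pieces of new logic in this diff can be modelled with a standalone
sketch (plain C with hypothetical fixed values; GCC's widest_int, poly_uint64
and mode iteration are replaced by ordinary integers, so this is only a model
of the selection and rounding rules, not the patch itself):

#include <stdio.h>

/* Model of the comparison-type/IV-type selection in
   vect_verify_full_masking: walk the scalar integer widths in
   increasing order, keep replacing the comparison width until it is
   at least IV_PRECISION, and stop at the first width that is at
   least Pmode (assumed to be 64 bits, i.e. LP64).  Every width is
   assumed to support WHILE_ULT and all required loop masks.  */
static void
pick_types (int iv_precision, int pmode_bits,
            int *cmp_bits_out, int *iv_bits_out)
{
  static const int widths[] = { 8, 16, 32, 64 };
  int cmp_bits = 0;

  for (unsigned int i = 0; i < sizeof widths / sizeof widths[0]; i++)
    {
      int this_bits = widths[i];
      *iv_bits_out = this_bits;
      if (cmp_bits == 0 || iv_precision > cmp_bits)
        cmp_bits = this_bits;
      if (this_bits >= pmode_bits)
        break;
    }
  *cmp_bits_out = cmp_bits;
}

/* Model of the final step of vect_iv_limit_for_full_masking: round the
   maximum latch-iteration count down to the previous vector alignment
   boundary, then add one full vector iteration.  */
static long
round_iv_limit (long iv_limit, int vf_align, int max_vf)
{
  return (iv_limit & -(long) vf_align) + max_vf;
}

int
main (void)
{
  int cmp_bits, iv_bits;

  /* A 32-bit iteration count in LP64: expect a 32-bit comparison type
     (hence a 32-bit WHILELO) and a 64-bit (Pmode) IV.  */
  pick_types (32, 64, &cmp_bits, &iv_bits);
  printf ("cmp_bits = %d, iv_bits = %d\n", cmp_bits, iv_bits);

  /* E.g. 1003 latch iterations with a vector alignment of 4 and
     max_vf = 4: (1003 & -4) + 4 = 1000 + 4 = 1004.  */
  printf ("iv_limit = %ld\n", round_iv_limit (1003, 4, 4));
  return 0;
}

Under these assumptions the sketch prints cmp_bits = 32, iv_bits = 64 and
iv_limit = 1004, matching the behaviour the PR title asks for: a 32-bit
WHILELO in LP64 mode.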