aboutsummaryrefslogtreecommitdiff
path: root/gcc/tree-vectorizer.h
diff options
context:
space:
mode:
authorRichard Biener <rguenther@suse.de>2023-06-06 13:05:56 +0200
committerRichard Biener <rguenther@suse.de>2023-06-19 09:28:46 +0200
commit85c39a3cf157916ac494e8e90ad1df2cb6bf5cf2 (patch)
tree76dd656da7dd3f69734d9229d6983ffd088e1dc0 /gcc/tree-vectorizer.h
parentdf03c9a91439b376b55d3a8f1fd7878af5c1c390 (diff)
downloadgcc-85c39a3cf157916ac494e8e90ad1df2cb6bf5cf2.zip
gcc-85c39a3cf157916ac494e8e90ad1df2cb6bf5cf2.tar.gz
gcc-85c39a3cf157916ac494e8e90ad1df2cb6bf5cf2.tar.bz2
AVX512 fully masked vectorization
This implements fully masked vectorization or a masked epilog for AVX512 style masks which single themselves out by representing each lane with a single bit and by using integer modes for the mask (both are much like GCN). AVX512 is also special in that it doesn't have any instruction to compute the mask from a scalar IV like SVE has with while_ult. Instead the masks are produced by vector compares and the loop control retains the scalar IV (mainly to avoid dependences on mask generation, a suitable mask test instruction is available). Like RVV, code generation prefers a decrementing IV, though IVOPTs messes things up in some cases, removing that IV to eliminate it with an incrementing one used for address generation. One of the motivating testcases is from PR108410 which in turn is extracted from x264 where large size vectorization shows issues with small trip loops. Execution time there improves compared to classic AVX512 with AVX2 epilogues for the cases of fewer than 32 iterations. size scalar 128 256 512 512e 512f 1 9.42 11.32 9.35 11.17 15.13 16.89 2 5.72 6.53 6.66 6.66 7.62 8.56 3 4.49 5.10 5.10 5.74 5.08 5.73 4 4.10 4.33 4.29 5.21 3.79 4.25 6 3.78 3.85 3.86 4.76 2.54 2.85 8 3.64 1.89 3.76 4.50 1.92 2.16 12 3.56 2.21 3.75 4.26 1.26 1.42 16 3.36 0.83 1.06 4.16 0.95 1.07 20 3.39 1.42 1.33 4.07 0.75 0.85 24 3.23 0.66 1.72 4.22 0.62 0.70 28 3.18 1.09 2.04 4.20 0.54 0.61 32 3.16 0.47 0.41 0.41 0.47 0.53 34 3.16 0.67 0.61 0.56 0.44 0.50 38 3.19 0.95 0.95 0.82 0.40 0.45 42 3.09 0.58 1.21 1.13 0.36 0.40 'size' specifies the number of actual iterations, 512e is for a masked epilog and 512f for the fully masked loop. From 4 scalar iterations on, the AVX512 masked epilog code is clearly the winner, the fully masked variant is clearly worse and its size benefit is also tiny. This patch does not enable using fully masked loops or masked epilogues by default. More work on cost modeling and vectorization kind selection on x86_64 is necessary for this. 
Implementation-wise this introduces LOOP_VINFO_PARTIAL_VECTORS_STYLE which could be exploited further to unify some of the flags we have right now but there didn't seem to be many easy things to merge, so I'm leaving this for followups. Mask requirements as registered by vect_record_loop_mask are kept in their original form and recorded in a hash_set now instead of being processed to a vector of rgroup_controls. Instead that's now left to the final analysis phase which tries forming the rgroup_controls vector using while_ult and if that fails now tries AVX512 style which needs a different organization and instead fills a hash_map with the relevant info. vect_get_loop_mask now has two implementations, one for each of the two mask styles we then have. I have decided against interweaving vect_set_loop_condition_partial_vectors with conditions to do AVX512 style masking and instead opted to "duplicate" this to vect_set_loop_condition_partial_vectors_avx512. Likewise for vect_verify_full_masking vs vect_verify_full_masking_avx512. The vect_prepare_for_masked_peels hunk might run into issues with SVE, I didn't check yet but using LOOP_VINFO_RGROUP_COMPARE_TYPE looked odd. Bootstrapped and tested on x86_64-unknown-linux-gnu. I've run the testsuite with --param vect-partial-vector-usage=2 with and without -fno-vect-cost-model and filed two bugs, one ICE (PR110221) and one latent wrong-code (PR110237). * tree-vectorizer.h (enum vect_partial_vector_style): New. (_loop_vec_info::partial_vector_style): Likewise. (LOOP_VINFO_PARTIAL_VECTORS_STYLE): Likewise. (rgroup_controls::compare_type): Add. (vec_loop_masks): Change from a typedef to auto_vec<> to a structure. * tree-vect-loop-manip.cc (vect_set_loop_condition_partial_vectors): Adjust. Convert niters_skip to compare_type. (vect_set_loop_condition_partial_vectors_avx512): New function implementing the AVX512 partial vector codegen. 
(vect_set_loop_condition): Dispatch to the correct vect_set_loop_condition_partial_vectors_* function based on LOOP_VINFO_PARTIAL_VECTORS_STYLE. (vect_prepare_for_masked_peels): Compute LOOP_VINFO_MASK_SKIP_NITERS in the original niter type. * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize partial_vector_style. (can_produce_all_loop_masks_p): Adjust. (vect_verify_full_masking): Produce the rgroup_controls vector here. Set LOOP_VINFO_PARTIAL_VECTORS_STYLE on success. (vect_verify_full_masking_avx512): New function implementing verification of AVX512 style masking. (vect_verify_loop_lens): Set LOOP_VINFO_PARTIAL_VECTORS_STYLE. (vect_analyze_loop_2): Also try AVX512 style masking. Adjust condition. (vect_estimate_min_profitable_iters): Implement AVX512 style mask producing cost. (vect_record_loop_mask): Do not build the rgroup_controls vector here but record masks in a hash-set. (vect_get_loop_mask): Implement AVX512 style mask query, complementing the existing while_ult style.
Diffstat (limited to 'gcc/tree-vectorizer.h')
-rw-r--r--gcc/tree-vectorizer.h35
1 files changed, 32 insertions, 3 deletions
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 767a077..a36974c 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -300,6 +300,13 @@ public:
#define SLP_TREE_LANES(S) (S)->lanes
#define SLP_TREE_CODE(S) (S)->code
+enum vect_partial_vector_style {
+ vect_partial_vectors_none,
+ vect_partial_vectors_while_ult,
+ vect_partial_vectors_avx512,
+ vect_partial_vectors_len
+};
+
/* Key for map that records association between
scalar conditions and corresponding loop mask, and
is populated by vect_record_loop_mask. */
@@ -591,12 +598,15 @@ is_a_helper <_bb_vec_info *>::test (vec_info *i)
/* The controls (like masks or lengths) needed by rgroups with nV vectors,
according to the description above. */
struct rgroup_controls {
- /* The largest nS for all rgroups that use these controls. */
+ /* The largest nS for all rgroups that use these controls.
+ For vect_partial_vectors_avx512 this is the constant nscalars_per_iter
+ for all members of the group. */
unsigned int max_nscalars_per_iter;
/* For the largest nS recorded above, the loop controls divide each scalar
into FACTOR equal-sized pieces. This is useful if we need to split
- element-based accesses into byte-based accesses. */
+ element-based accesses into byte-based accesses.
+ For vect_partial_vectors_avx512 this records nV instead. */
unsigned int factor;
/* This is a vector type with MAX_NSCALARS_PER_ITER * VF / nV elements.
@@ -605,6 +615,10 @@ struct rgroup_controls {
specified number of elements; the type of the elements doesn't matter. */
tree type;
+ /* When there is no uniformly used LOOP_VINFO_RGROUP_COMPARE_TYPE this
+ is the rgroup specific type used. */
+ tree compare_type;
+
/* A vector of nV controls, in iteration order. */
vec<tree> controls;
@@ -613,7 +627,17 @@ struct rgroup_controls {
tree bias_adjusted_ctrl;
};
-typedef auto_vec<rgroup_controls> vec_loop_masks;
+struct vec_loop_masks
+{
+ bool is_empty () const { return mask_set.is_empty (); }
+
+ /* Set to record vectype, nvector pairs. */
+ hash_set<pair_hash <nofree_ptr_hash <tree_node>,
+ int_hash<unsigned, 0>>> mask_set;
+
+ /* rgroup_controls used for the partial vector scheme. */
+ auto_vec<rgroup_controls> rgc_vec;
+};
typedef auto_vec<rgroup_controls> vec_loop_lens;
@@ -741,6 +765,10 @@ public:
LOOP_VINFO_USING_PARTIAL_VECTORS_P is true. */
tree rgroup_iv_type;
+ /* The style used for implementing partial vectors when
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P is true. */
+ vect_partial_vector_style partial_vector_style;
+
/* Unknown DRs according to which loop was peeled. */
class dr_vec_info *unaligned_dr;
@@ -914,6 +942,7 @@ public:
#define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters
#define LOOP_VINFO_RGROUP_COMPARE_TYPE(L) (L)->rgroup_compare_type
#define LOOP_VINFO_RGROUP_IV_TYPE(L) (L)->rgroup_iv_type
+#define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style
#define LOOP_VINFO_PTR_MASK(L) (L)->ptr_mask
#define LOOP_VINFO_N_STMTS(L) (L)->shared->n_stmts
#define LOOP_VINFO_LOOP_NEST(L) (L)->shared->loop_nest