From c3a2bc6daaa2d278cb5f323e2df4b8c2af4198ac Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Fri, 19 Mar 2021 14:29:36 +0000 Subject: slp: remove unneeded permute calculation (PR99656) The attached testcase ICEs because as you showed on the PR we have one child which is an internal with a PERM of EVENEVEN and one with TOP. The problem is while we can conceptually merge the permute itself into EVENEVEN, merging the lanes doesn't really make sense. That said, we no longer even require the merged lanes as we create the permutes based on the KIND directly. This patch just removes all of that code. Unfortunately it still won't vectorize with the cost model enabled due to the blend that's created combining the load and the external note: node 0x51f2ce8 (max_nunits=1, refcnt=1) note: op: VEC_PERM_EXPR note: { } note: lane permutation { 0[0] 1[1] } note: children 0x51f23e0 0x51f2578 note: node 0x51f23e0 (max_nunits=2, refcnt=1) note: op template: _16 = REALPART_EXPR <*t1_9(D)>; note: stmt 0 _16 = REALPART_EXPR <*t1_9(D)>; note: stmt 1 _16 = REALPART_EXPR <*t1_9(D)>; note: load permutation { 0 0 } note: node (external) 0x51f2578 (max_nunits=1, refcnt=1) note: { _18, _18 } which costs the cost for the load-and-split and the cost of the external splat, and the one for blending them while in reality it's just a scalar load and insert. The compiler (with the cost model disabled) generates ldr q1, [x19] dup v1.2d, v1.d[0] ldr d0, [x19, 8] fneg d0, d0 ins v1.d[1], v0.d[0] while really it should be ldp d1, d0, [x19] fneg d0, d0 ins v1.d[1], v0.d[0] but that's for another time. gcc/ChangeLog: PR tree-optimization/99656 * tree-vect-slp-patterns.c (linear_loads_p, complex_add_pattern::matches, is_eq_or_top, vect_validate_multiplication, complex_mul_pattern::matches, complex_fms_pattern::matches): Remove complex_perm_kinds_t. * tree-vectorizer.h: (complex_load_perm_t): Removed. (slp_tree_to_load_perm_map_t): Use complex_perm_kinds_t instead of complex_load_perm_t. 
gcc/testsuite/ChangeLog: PR tree-optimization/99656 * gfortran.dg/vect/pr99656.f90: New test. --- gcc/tree-vectorizer.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index b861c97..9861d9e 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2059,13 +2059,8 @@ typedef enum _complex_perm_kinds { PERM_TOP } complex_perm_kinds_t; -/* A pair with a load permute and a corresponding complex_perm_kind which gives - information about the load it represents. */ -typedef std::pair - complex_load_perm_t; - /* Cache from nodes to the load permutation they represent. */ -typedef hash_map +typedef hash_map slp_tree_to_load_perm_map_t; /* Vector pattern matcher base class. All SLP pattern matchers must inherit -- cgit v1.1 From 096f8215d2172ca4177cb26035e748d8f182fc8f Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Mon, 10 May 2021 22:13:27 -0500 Subject: vect: Add costing_for_scalar parameter to init_cost hook rs6000 port function rs6000_density_test wants to differentiate the current cost model is for the scalar version of a loop or block, or the vector version. As Richi suggested, this patch introduces one new parameter costing_for_scalar to init_cost hook to pass down this information explicitly. gcc/ChangeLog: * doc/tm.texi: Regenerated. * target.def (init_cost): Add new parameter costing_for_scalar. * targhooks.c (default_init_cost): Adjust for new parameter. * targhooks.h (default_init_cost): Likewise. * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Likewise. (vect_compute_single_scalar_iteration_cost): Likewise. (vect_analyze_loop_2): Likewise. * tree-vect-slp.c (_bb_vec_info::_bb_vec_info): Likewise. (vect_bb_vectorization_profitable_p): Likewise. * tree-vectorizer.h (init_cost): Likewise. * config/aarch64/aarch64.c (aarch64_init_cost): Likewise. * config/i386/i386.c (ix86_init_cost): Likewise. 
* config/rs6000/rs6000.c (rs6000_init_cost): Likewise. --- gcc/tree-vectorizer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 9861d9e..8d1ffaf 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1455,9 +1455,9 @@ int vect_get_stmt_cost (enum vect_cost_for_stmt type_of_cost) /* Alias targetm.vectorize.init_cost. */ static inline void * -init_cost (class loop *loop_info) +init_cost (class loop *loop_info, bool costing_for_scalar) { - return targetm.vectorize.init_cost (loop_info); + return targetm.vectorize.init_cost (loop_info, costing_for_scalar); } extern void dump_stmt_cost (FILE *, void *, int, enum vect_cost_for_stmt, -- cgit v1.1 From 9c5bd1e9811babe255ddbbdcda1d00ea5997b826 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Wed, 19 May 2021 05:42:51 -0500 Subject: vect: Replace hardcoded inner loop cost factor This patch is to replace the current hardcoded weight factor 50, which is applied by the loop vectorizer to the cost of statements in an inner loop relative to the loop being vectorized, with one newly added member inner_loop_cost_factor in loop vinfo. It also introduces one parameter vect-inner-loop-cost-factor whose default value is 50, and is used to initialize the inner_loop_cost_factor member. The motivation here is that: if targets want to have one unique function to gather some information in each add_stmt_cost call, no matter that it's put before or after the cost tweaking part for inner loop, it may have the need to adjust (expand or shrink) the gathered data as the factor. Now the factor is hardcoded, it's not easily maintained. Bootstrapped/regtested on powerpc64le-linux-gnu P9, x86_64-redhat-linux and aarch64-linux-gnu. gcc/ChangeLog: * doc/invoke.texi (vect-inner-loop-cost-factor): Document new parameter. * params.opt (vect-inner-loop-cost-factor): New. 
* targhooks.c (default_add_stmt_cost): Replace hardcoded factor 50 with LOOP_VINFO_INNER_LOOP_COST_FACTOR, include head file tree-vectorizer.h and its required ones. * config/aarch64/aarch64.c (aarch64_add_stmt_cost): Replace hardcoded factor 50 with LOOP_VINFO_INNER_LOOP_COST_FACTOR. * config/arm/arm.c (arm_add_stmt_cost): Likewise. * config/i386/i386.c (ix86_add_stmt_cost): Likewise. * config/rs6000/rs6000.c (rs6000_add_stmt_cost): Likewise. * tree-vect-loop.c (vect_compute_single_scalar_iteration_cost): Likewise. (_loop_vec_info::_loop_vec_info): Init inner_loop_cost_factor. * tree-vectorizer.h (_loop_vec_info): Add inner_loop_cost_factor. (LOOP_VINFO_INNER_LOOP_COST_FACTOR): New macro. --- gcc/tree-vectorizer.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 8d1ffaf..7dcb4cd 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -689,6 +689,10 @@ public: /* The cost of the vector loop body. */ int vec_inside_cost; + /* The factor used to over weight those statements in an inner loop + relative to the loop being vectorized. */ + unsigned int inner_loop_cost_factor; + /* Is the loop vectorizable? */ bool vectorizable; @@ -807,6 +811,7 @@ public: #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost #define LOOP_VINFO_ORIG_LOOP_INFO(L) (L)->orig_loop_info #define LOOP_VINFO_SIMD_IF_COND(L) (L)->simd_if_cond +#define LOOP_VINFO_INNER_LOOP_COST_FACTOR(L) (L)->inner_loop_cost_factor #define LOOP_VINFO_FULLY_MASKED_P(L) \ (LOOP_VINFO_USING_PARTIAL_VECTORS_P (L) \ -- cgit v1.1 From 4bd2cdb74e26f68a4e9d51e74ace60326c9950d1 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Wed, 2 Jun 2021 13:25:59 +0200 Subject: Make SLP root stmt a vector This fixes a TODO noticed when adding vectorization of BIT_INSERT_EXPRs and what's now useful for vectorization of BB reductions. 
2021-06-08 Richard Biener * tree-vectorizer.h (_slp_instance::root_stmt): Change to... (_slp_instance::root_stmts): ... a vector. (SLP_INSTANCE_ROOT_STMT): Rename to ... (SLP_INSTANCE_ROOT_STMTS): ... this. (slp_root::root): Change to... (slp_root::roots): ... a vector. (slp_root::slp_root): Adjust. * tree-vect-slp.c (_slp_instance::location): Adjust. (vect_free_slp_instance): Release the root stmt vector. (vect_build_slp_instance): Adjust. (vect_analyze_slp): Likewise. (_bb_vec_info::~_bb_vec_info): Likewise. (vect_slp_analyze_operations): Likewise. (vect_bb_vectorization_profitable_p): Likewise. Adjust costs for the root stmt. (vect_slp_check_for_constructors): Gather all BIT_INSERT_EXPRs as root stmts. (vect_slp_analyze_bb_1): Simplify by marking all root stmts as pure_slp. (vectorize_slp_instance_root_stmt): Adjust. (vect_schedule_slp): Likewise. --- gcc/tree-vectorizer.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 7dcb4cd..06d20c7 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -197,7 +197,7 @@ public: /* For vector constructors, the constructor stmt that the SLP tree is built from, NULL otherwise. */ - stmt_vec_info root_stmt; + vec root_stmts; /* The unrolling factor required to vectorized this SLP instance. 
*/ poly_uint64 unrolling_factor; @@ -226,7 +226,7 @@ public: #define SLP_INSTANCE_TREE(S) (S)->root #define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor #define SLP_INSTANCE_LOADS(S) (S)->loads -#define SLP_INSTANCE_ROOT_STMT(S) (S)->root_stmt +#define SLP_INSTANCE_ROOT_STMTS(S) (S)->root_stmts #define SLP_INSTANCE_KIND(S) (S)->kind #define SLP_TREE_CHILDREN(S) (S)->children @@ -861,11 +861,11 @@ loop_vec_info_for_loop (class loop *loop) struct slp_root { slp_root (slp_instance_kind kind_, vec stmts_, - stmt_vec_info root_) - : kind(kind_), stmts(stmts_), root(root_) {} + vec roots_) + : kind(kind_), stmts(stmts_), roots(roots_) {} slp_instance_kind kind; vec stmts; - stmt_vec_info root; + vec roots; }; typedef class _bb_vec_info : public vec_info -- cgit v1.1 From ce670e4faafb296d1f1a7828d20f8c8ba4686797 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Wed, 18 Nov 2020 14:17:34 +0100 Subject: tree-optimization/97832 - handle associatable chains in SLP discovery This makes SLP discovery handle associatable (including mixed plus/minus) chains better by swapping operands across the whole chain. To work this adds caching of the 'matches' lanes for failed SLP discovery attempts, thereby fixing a failed SLP discovery for the slp-pr98855.cc testcase which results in building an operand from scalars as expected. Unfortunately this makes us trip over the cost threshold so I'm XFAILing the testcase for now. For BB vectorization all this doesn't work because we have no way to distinguish good from bad associations as we eventually build operands from scalars and thus not fail in the classical sense. 2021-05-31 Richard Biener PR tree-optimization/97832 * tree-vectorizer.h (_slp_tree::failed): New. * tree-vect-slp.c (_slp_tree::_slp_tree): Initialize failed member. (_slp_tree::~_slp_tree): Free failed. (vect_build_slp_tree): Retain failed nodes and record matches in them, copying that back out when running into a cached fail. Dump start and end of discovery. 
(dt_sort_cmp): New. (vect_build_slp_tree_2): Handle associatable chains together doing more aggressive operand swapping. * gcc.dg/vect/pr97832-1.c: New testcase. * gcc.dg/vect/pr97832-2.c: Likewise. * gcc.dg/vect/pr97832-3.c: Likewise. * g++.dg/vect/slp-pr98855.cc: XFAIL. --- gcc/tree-vectorizer.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 06d20c7..1fb46c6 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -167,6 +167,11 @@ struct _slp_tree { int vertex; + /* If not NULL this is a cached failed SLP discovery attempt with + the lanes that failed during SLP discovery as 'false'. This is + a copy of the matches array. */ + bool *failed; + /* Allocate from slp_tree_pool. */ static void *operator new (size_t); -- cgit v1.1 From 3dfa4fe9f1a089b2b3906c83e22a1b39c49d937c Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Tue, 8 Jun 2021 15:10:45 +0200 Subject: Vectorization of BB reductions This adds a simple reduction vectorization capability to the non-loop vectorizer. Simple meaning it lacks any of the fancy ways to generate the reduction epilogue but only supports those we can handle via a direct internal function reducing a vector to a scalar. One of the main reasons is to avoid massive refactoring at this point but also that more complex epilogue operations are hardly profitable. Mixed sign reductions are for now fend off and I'm not finally settled with whether we want an explicit SLP node for the reduction epilogue operation. Handling mixed signs could be done by multiplying with a { 1, -1, .. } vector. Fend off are also reductions with non-internal operands (constants or register parameters for example). Costing is done by accounting the original scalar participating stmts for the scalar cost and log2 permutes and operations for the vectorized epilogue. 
-- SPEC CPU 2017 FP with rate workload measurements show (picked fastest runs of three) regressions for 507.cactuBSSN_r (1.5%), 508.namd_r (2.5%), 511.povray_r (2.5%), 526.blender_r (0.5) and 527.cam4_r (2.5%) and improvements for 510.parest_r (5%) and 538.imagick_r (1.5%). This is with -Ofast -march=znver2 on a Zen2. Statistics on CPU 2017 shows that the overwhelming number of seeds we find are reductions of two lanes (well - that's basically every associative operation). That means we put a quite high pressure on the SLP discovery process this way. In total we find 583218 seeds we put to SLP discovery out of which 66205 pass that and only 6185 of those make it through code generation checks. 796 of those are discarded because the reduction is part of a larger SLP instance. 4195 of the remaining are deemed not profitable to vectorize and 1194 are finally vectorized. That's a poor 0.2% rate. Of the 583218 seeds 486826 (83%) have two lanes, 60912 have three (10%), 28181 four (5%), 4808 five, 909 six and there are instances up to 120 lanes. There's a set of 54086 candidate seeds we reject because they contain a constant or invariant (not implemented yet) but still have two or more lanes that could be put to SLP discovery. 2021-06-16 Richard Biener PR tree-optimization/54400 * tree-vectorizer.h (enum slp_instance_kind): Add slp_inst_kind_bb_reduc. (reduction_fn_for_scalar_code): Declare. * tree-vect-data-refs.c (vect_slp_analyze_instance_dependence): Check SLP_INSTANCE_KIND instead of looking at the representative. (vect_slp_analyze_instance_alignment): Likewise. * tree-vect-loop.c (reduction_fn_for_scalar_code): Export. * tree-vect-slp.c (vect_slp_linearize_chain): Split out chain linearization from vect_build_slp_tree_2 and generalize for the use of BB reduction vectorization. (vect_build_slp_tree_2): Adjust accordingly. (vect_optimize_slp): Elide permutes at the root of BB reduction instances. (vectorizable_bb_reduc_epilogue): New function. 
(vect_slp_prune_covered_roots): Likewise. (vect_slp_analyze_operations): Use them. (vect_slp_check_for_constructors): Recognize associatable chains for BB reduction vectorization. (vectorize_slp_instance_root_stmt): Generate code for the BB reduction epilogue. * gcc.dg/vect/bb-slp-pr54400.c: New testcase. --- gcc/tree-vectorizer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 1fb46c6..04c20f8 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -190,6 +190,7 @@ enum slp_instance_kind { slp_inst_kind_store, slp_inst_kind_reduc_group, slp_inst_kind_reduc_chain, + slp_inst_kind_bb_reduc, slp_inst_kind_ctor }; @@ -1971,6 +1972,7 @@ extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int, unsigned int); extern gimple_seq vect_gen_len (tree, tree, tree, tree); extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info); +extern bool reduction_fn_for_scalar_code (enum tree_code, internal_fn *); /* Drive for loop transformation stage. */ extern class loop *vect_transform_loop (loop_vec_info, gimple *); -- cgit v1.1 From 0ad9c7087ef3904da89f2db6007b6d28b116087f Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 18 Jun 2021 14:07:00 +0200 Subject: tree-optimization/101120 - fix compile-time issue with SLP groups This places two hacks to avoid an old compile-time issue when vectorizing large permuted SLP groups with gaps where we end up emitting loads and IV adjustments for the gap as well and those have quite a high cost until they are eventually cleaned up. The first hack is to fold the auto-inc style IV updates early in the vectorizer rather than in the next forwprop pass which shortens the SSA use-def chains of the used IV. The second hack is to remove the unused loads after we've picked all that we possibly use. 
2021-06-18 Richard Biener PR tree-optimization/101120 * tree-vect-data-refs.c (bump_vector_ptr): Fold the built increment. * tree-vect-slp.c (vect_transform_slp_perm_load): Add DR chain DCE capability. * tree-vectorizer.h (vect_transform_slp_perm_load): Adjust. * tree-vect-stmts.c (vectorizable_load): Remove unused loads in the DR chain for SLP. --- gcc/tree-vectorizer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 04c20f8..5c71fbc 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2012,7 +2012,7 @@ extern void vect_free_slp_instance (slp_instance); extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, vec, gimple_stmt_iterator *, poly_uint64, bool, unsigned *, - unsigned * = nullptr); + unsigned * = nullptr, bool = false); extern bool vect_slp_analyze_operations (vec_info *); extern void vect_schedule_slp (vec_info *, vec); extern opt_result vect_analyze_slp (vec_info *, unsigned); -- cgit v1.1 From 7a6c31f0f84a7295433ebac09b94fae2d5cc2892 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 31 May 2021 13:19:01 +0200 Subject: Add x86 addsub SLP pattern This adds SLP pattern recognition for the SSE3/AVX [v]addsubp{ds} v0, v1 instructions which compute { v0[0] - v1[0], v0[1] + v1[1], ... } thus subtract, add alternating on lanes, starting with subtract. It adds a corresponding optab and direct internal function, vec_addsub$a3 and renames the existing i386 backend patterns to the new canonical name. The SLP pattern matches the exact alternating lane sequence rather than trying to be clever and anticipating incoming permutes - we could permute the two input vectors to the needed lane alternation, do the addsub and then permute the result vector back but that's only profitable in case the two input or the output permute will vanish - something Tamar's refactoring of SLP pattern recog should make possible. 
2021-06-17 Richard Biener * config/i386/sse.md (avx_addsubv4df3): Rename to vec_addsubv4df3. (avx_addsubv8sf3): Rename to vec_addsubv8sf3. (sse3_addsubv2df3): Rename to vec_addsubv2df3. (sse3_addsubv4sf3): Rename to vec_addsubv4sf3. * config/i386/i386-builtin.def: Adjust. * internal-fn.def (VEC_ADDSUB): New internal optab fn. * optabs.def (vec_addsub_optab): New optab. * tree-vect-slp-patterns.c (class addsub_pattern): New. (slp_patterns): Add addsub_pattern. * tree-vect-slp.c (vect_optimize_slp): Disable propagation across CFN_VEC_ADDSUB. * tree-vectorizer.h (vect_pattern::vect_pattern): Make m_ops optional. * doc/md.texi (vec_addsub3): Document. * gcc.target/i386/vect-addsubv2df.c: New testcase. * gcc.target/i386/vect-addsubv4sf.c: Likewise. * gcc.target/i386/vect-addsubv4df.c: Likewise. * gcc.target/i386/vect-addsubv8sf.c: Likewise. * gcc.target/i386/vect-addsub-2.c: Likewise. * gcc.target/i386/vect-addsub-3.c: Likewise. --- gcc/tree-vectorizer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 5c71fbc..fa28336 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2100,7 +2100,8 @@ class vect_pattern this->m_ifn = ifn; this->m_node = node; this->m_ops.create (0); - this->m_ops.safe_splice (*m_ops); + if (m_ops) + this->m_ops.safe_splice (*m_ops); } public: -- cgit v1.1 From d592920c89973acd8d9f5b1f6b0526036ce63ccb Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 13 Jul 2021 10:17:39 +0100 Subject: vect: Add a vect_phi_initial_value helper function This patch adds a helper function called vect_phi_initial_value for returning the incoming value of a given loop phi. The main reason for adding it is to ensure that the right preheader edge is used when vectorising nested loops. (PHI_ARG_DEF_FROM_EDGE itself doesn't assert that the given edge is for the right block, although I guess that would be good to add separately.) 
gcc/ * tree-vectorizer.h: Include tree-ssa-operands.h. (vect_phi_initial_value): New function. * tree-vect-loop.c (neutral_op_for_slp_reduction): Use it. (get_initial_defs_for_reduction, info_for_reduction): Likewise. (vect_create_epilog_for_reduction, vectorizable_reduction): Likewise. (vect_transform_cycle_phi, vectorizable_induction): Likewise. --- gcc/tree-vectorizer.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index fa28336..e2fd360 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -27,7 +27,7 @@ typedef class _stmt_vec_info *stmt_vec_info; #include "tree-hash-traits.h" #include "target.h" #include "internal-fn.h" - +#include "tree-ssa-operands.h" /* Used for naming of new temporaries. */ enum vect_var_kind { @@ -1369,6 +1369,25 @@ nested_in_vect_loop_p (class loop *loop, stmt_vec_info stmt_info) && (loop->inner == (gimple_bb (stmt_info->stmt))->loop_father)); } +/* PHI is either a scalar reduction phi or a scalar induction phi. + Return the initial value of the variable on entry to the containing + loop. */ + +static inline tree +vect_phi_initial_value (gphi *phi) +{ + basic_block bb = gimple_bb (phi); + edge pe = loop_preheader_edge (bb->loop_father); + gcc_assert (pe->dest == bb); + return PHI_ARG_DEF_FROM_EDGE (phi, pe); +} + +static inline tree +vect_phi_initial_value (stmt_vec_info stmt_info) +{ + return vect_phi_initial_value (as_a (stmt_info->stmt)); +} + /* Return true if STMT_INFO should produce a vector mask type rather than a normal nonmask type. 
*/ -- cgit v1.1 From 1583b8bff0be7e41aa721dde79f90ca0763bd4e2 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Tue, 13 Jul 2021 10:17:43 +0100 Subject: vect: Reuse reduction accumulators between loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for reusing a main loop's reduction accumulator in an epilogue loop. This in turn lets the loops share a single piece of vector->scalar reduction code. The patch has the following restrictions: (1) The epilogue reduction can only operate on a single vector (e.g. ncopies must be 1 for non-SLP reductions, and the group size must be <= the element count for SLP reductions). (2) Both loops must use the same vector mode for their accumulators. This means that the patch is restricted to targets that support --param vect-partial-vector-usage=1. (3) The reduction must be a standard “tree code” reduction. However, these restrictions could be lifted in future. For example, if the main loop operates on 128-bit vectors and the epilogue loop operates on 64-bit vectors, we could in future reduce the 128-bit vector by one stage and use the 64-bit result as the starting point for the epilogue result. The patch tries to handle chained SLP reductions, unchained SLP reductions and non-SLP reductions. It also handles cases in which the epilogue loop is entered directly (rather than via the main loop) and cases in which the epilogue loop can be skipped. vect_get_main_loop_result is a bit more general than the current patch needs. gcc/ * tree-vectorizer.h (vect_reusable_accumulator): New structure. (_loop_vec_info::main_loop_edge): New field. (_loop_vec_info::skip_main_loop_edge): Likewise. (_loop_vec_info::skip_this_loop_edge): Likewise. (_loop_vec_info::reusable_accumulators): Likewise. (_stmt_vec_info::reduc_scalar_results): Likewise. (_stmt_vec_info::reused_accumulator): Likewise. (vect_get_main_loop_result): Declare. 
* tree-vectorizer.c (vec_info::new_stmt_vec_info): Initialize reduc_scalar_inputs. (vec_info::free_stmt_vec_info): Free reduc_scalar_inputs. * tree-vect-loop-manip.c (vect_get_main_loop_result): New function. (vect_do_peeling): Fill an epilogue loop's main_loop_edge, skip_main_loop_edge and skip_this_loop_edge fields. * tree-vect-loop.c (INCLUDE_ALGORITHM): Define. (vect_emit_reduction_init_stmts): New function. (get_initial_def_for_reduction): Use it. (get_initial_defs_for_reduction): Likewise. Change the vinfo parameter to a loop_vec_info. (vect_create_epilog_for_reduction): Store the scalar results in the reduc_info. If an epilogue loop is reusing an accumulator from the main loop, and if the epilogue loop can also be skipped, try to place the reduction code in the join block. Record accumulators that could potentially be reused by epilogue loops. (vect_transform_cycle_phi): When vectorizing epilogue loops, try to reuse accumulators from the main loop. Record the initial value in reduc_info for non-SLP reductions too. gcc/testsuite/ * gcc.target/aarch64/sve/reduc_9.c: New test. * gcc.target/aarch64/sve/reduc_9_run.c: Likewise. * gcc.target/aarch64/sve/reduc_10.c: Likewise. * gcc.target/aarch64/sve/reduc_10_run.c: Likewise. * gcc.target/aarch64/sve/reduc_11.c: Likewise. * gcc.target/aarch64/sve/reduc_11_run.c: Likewise. * gcc.target/aarch64/sve/reduc_12.c: Likewise. * gcc.target/aarch64/sve/reduc_12_run.c: Likewise. * gcc.target/aarch64/sve/reduc_13.c: Likewise. * gcc.target/aarch64/sve/reduc_13_run.c: Likewise. * gcc.target/aarch64/sve/reduc_14.c: Likewise. * gcc.target/aarch64/sve/reduc_14_run.c: Likewise. * gcc.target/aarch64/sve/reduc_15.c: Likewise. * gcc.target/aarch64/sve/reduc_15_run.c: Likewise. 
--- gcc/tree-vectorizer.h | 56 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 6 deletions(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index e2fd360..d825b0c 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -551,6 +551,18 @@ typedef auto_vec vec_loop_lens; typedef auto_vec > drs_init_vec; +/* Information about a reduction accumulator from the main loop that could + conceivably be reused as the input to a reduction in an epilogue loop. */ +struct vect_reusable_accumulator { + /* The final value of the accumulator, which forms the input to the + reduction operation. */ + tree reduc_input; + + /* The stmt_vec_info that describes the reduction (i.e. the one for + which is_reduc_info is true). */ + stmt_vec_info reduc_info; +}; + /*-----------------------------------------------------------------*/ /* Info on vectorized loops. */ /*-----------------------------------------------------------------*/ @@ -588,6 +600,26 @@ public: /* Unrolling factor */ poly_uint64 vectorization_factor; + /* If this loop is an epilogue loop whose main loop can be skipped, + MAIN_LOOP_EDGE is the edge from the main loop to this loop's + preheader. SKIP_MAIN_LOOP_EDGE is then the edge that skips the + main loop and goes straight to this loop's preheader. + + Both fields are null otherwise. */ + edge main_loop_edge; + edge skip_main_loop_edge; + + /* If this loop is an epilogue loop that might be skipped after executing + the main loop, this edge is the one that skips the epilogue. */ + edge skip_this_loop_edge; + + /* The vectorized form of a standard reduction replaces the original + scalar code's final result (a loop-closed SSA PHI) with the result + of a vector-to-scalar reduction operation. After vectorization, + this variable maps these vector-to-scalar results to information + about the reductions that generated them. 
*/ + hash_map reusable_accumulators; + /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR if there is no particular limit. */ unsigned HOST_WIDE_INT max_vectorization_factor; @@ -1186,6 +1218,23 @@ public: /* The vector type for performing the actual reduction. */ tree reduc_vectype; + /* If IS_REDUC_INFO is true and if the vector code is performing + N scalar reductions in parallel, this variable gives the initial + scalar values of those N reductions. */ + vec reduc_initial_values; + + /* If IS_REDUC_INFO is true and if the vector code is performing + N scalar reductions in parallel, this variable gives the vectorized code's + final (scalar) result for each of those N reductions. In other words, + REDUC_SCALAR_RESULTS[I] replaces the original scalar code's loop-closed + SSA PHI for reduction number I. */ + vec reduc_scalar_results; + + /* Only meaningful if IS_REDUC_INFO. If non-null, the reduction is + being performed by an epilogue loop and we have decided to reuse + this accumulator from the main loop. */ + vect_reusable_accumulator *reused_accumulator; + /* Whether we force a single cycle PHI during reduction vectorization. */ bool force_single_cycle; @@ -1382,12 +1431,6 @@ vect_phi_initial_value (gphi *phi) return PHI_ARG_DEF_FROM_EDGE (phi, pe); } -static inline tree -vect_phi_initial_value (stmt_vec_info stmt_info) -{ - return vect_phi_initial_value (as_a (stmt_info->stmt)); -} - /* Return true if STMT_INFO should produce a vector mask type rather than a normal nonmask type. 
*/ @@ -1818,6 +1861,7 @@ class loop *vect_loop_versioning (loop_vec_info, gimple *); extern class loop *vect_do_peeling (loop_vec_info, tree, tree, tree *, tree *, tree *, int, bool, bool, tree *); +extern tree vect_get_main_loop_result (loop_vec_info, tree, tree); extern void vect_prepare_for_masked_peels (loop_vec_info); extern dump_user_location_t find_loop_location (class loop *); extern bool vect_can_advance_ivs_p (loop_vec_info); -- cgit v1.1 From 92acae5047e4b8c5be035f067099942a93e55d0c Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Thu, 15 Jul 2021 11:41:12 +0200 Subject: Streamline vect_gen_while This adjusts the vect_gen_while API to match that of vect_gen_while_not allowing further patches to generate more than one stmt for the while case. 2021-07-15 Richard Biener * tree-vectorizer.h (vect_gen_while): Match up with vect_gen_while_not. * tree-vect-stmts.c (vect_gen_while): Adjust API to that of vect_gen_while_not. (vect_gen_while_not): Adjust. * tree-vect-loop-manip.c (vect_set_loop_controls_directly): Likewise. 
--- gcc/tree-vectorizer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index d825b0c..f7c08ca 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1948,7 +1948,8 @@ extern bool vect_supportable_shift (vec_info *, enum tree_code, tree); extern tree vect_gen_perm_mask_any (tree, const vec_perm_indices &); extern tree vect_gen_perm_mask_checked (tree, const vec_perm_indices &); extern void optimize_mask_stores (class loop*); -extern gcall *vect_gen_while (tree, tree, tree); +extern tree vect_gen_while (gimple_seq *, tree, tree, tree, + const char * = nullptr); extern tree vect_gen_while_not (gimple_seq *, tree, tree, tree); extern opt_result vect_get_vector_types_for_stmt (vec_info *, stmt_vec_info, tree *, -- cgit v1.1 From 650c70a9fe7198394d3bbe4c0b1a7a73dc0bdd4a Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 16 Jul 2021 13:26:20 +0200 Subject: Remove more gimple_expr_type uses This removes a few more uses. 2021-07-16 Richard Biener * gimple-ssa-store-merging.c (verify_symbolic_number_p): Use the type of the LHS. (find_bswap_or_nop_1): Likewise. (find_bswap_or_nop): Likewise. * tree-vectorizer.h (vect_get_smallest_scalar_type): Adjust prototype. * tree-vect-data-refs.c (vect_get_smallest_scalar_type): Remove unused parameters, pass in the scalar type. Fix internal store function handling. * tree-vect-stmts.c (vect_analyze_stmt): Remove assert. (vect_get_vector_types_for_stmt): Move down check for existing vector stmt after we've determined a scalar type. Pass down the used scalar type to vect_get_smallest_scalar_type. * tree-vect-generic.c (expand_vector_condition): Use the type of the LHS. (expand_vector_scalar_condition): Likewise. (expand_vector_operations_1): Likewise. * tree-vect-patterns.c (vect_widened_op_tree): Likewise. (vect_recog_dot_prod_pattern): Likewise. (vect_recog_sad_pattern): Likewise. 
(vect_recog_widen_op_pattern): Likewise. (vect_recog_widen_sum_pattern): Likewise. (vect_recog_mixed_size_cond_pattern): Likewise. --- gcc/tree-vectorizer.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index f7c08ca..d9f0195 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1960,8 +1960,7 @@ extern opt_tree vect_get_mask_type_for_stmt (stmt_vec_info, unsigned int = 0); extern bool vect_can_force_dr_alignment_p (const_tree, poly_uint64); extern enum dr_alignment_support vect_supportable_dr_alignment (vec_info *, dr_vec_info *, bool); -extern tree vect_get_smallest_scalar_type (stmt_vec_info, HOST_WIDE_INT *, - HOST_WIDE_INT *); +extern tree vect_get_smallest_scalar_type (stmt_vec_info, tree); extern opt_result vect_analyze_data_ref_dependences (loop_vec_info, unsigned int *); extern bool vect_slp_analyze_instance_dependence (vec_info *, slp_instance); extern opt_result vect_enhance_data_refs_alignment (loop_vec_info); -- cgit v1.1 From 00dcc88a0ed7bd148ea86d900b6c93574a2e1f26 Mon Sep 17 00:00:00 2001 From: Martin Sebor Date: Tue, 20 Jul 2021 11:14:19 -0600 Subject: Adjust by-value function vec arguments to by-reference. gcc/c-family/ChangeLog: * c-common.c (c_build_shufflevector): Adjust by-value argument to by-const-reference. * c-common.h (c_build_shufflevector): Same. gcc/c/ChangeLog: * c-tree.h (c_build_function_call_vec): Adjust by-value argument to by-const-reference. * c-typeck.c (c_build_function_call_vec): Same. gcc/ChangeLog: * cfgloop.h (single_likely_exit): Adjust by-value argument to by-const-reference. * cfgloopanal.c (single_likely_exit): Same. * cgraph.h (struct cgraph_node): Same. * cgraphclones.c (cgraph_node::create_virtual_clone): Same. * genautomata.c (merge_states): Same. * genextract.c (VEC_char_to_string): Same. * genmatch.c (dt_node::gen_kids_1): Same. (walk_captures): Adjust by-value argument to by-reference. 
* gimple-ssa-store-merging.c (check_no_overlap): Adjust by-value argument to by-const-reference. * gimple.c (gimple_build_call_vec): Same. (gimple_build_call_internal_vec): Same. (gimple_build_switch): Same. (sort_case_labels): Same. (preprocess_case_label_vec_for_gimple): Adjust by-value argument to by-reference. * gimple.h (gimple_build_call_vec): Adjust by-value argument to by-const-reference. (gimple_build_call_internal_vec): Same. (gimple_build_switch): Same. (sort_case_labels): Same. (preprocess_case_label_vec_for_gimple): Adjust by-value argument to by-reference. * haifa-sched.c (calc_priorities): Adjust by-value argument to by-const-reference. (sched_init_luids): Same. (haifa_init_h_i_d): Same. * ipa-cp.c (ipa_get_indirect_edge_target_1): Same. (adjust_callers_for_value_intersection): Adjust by-value argument to by-reference. (find_more_scalar_values_for_callers_subset): Adjust by-value argument to by-const-reference. (find_more_contexts_for_caller_subset): Same. (find_aggregate_values_for_callers_subset): Same. (copy_useful_known_contexts): Same. * ipa-fnsummary.c (remap_edge_summaries): Same. (remap_freqcounting_predicate): Same. * ipa-inline.c (add_new_edges_to_heap): Adjust by-value argument to by-reference. * ipa-predicate.c (predicate::remap_after_inlining): Adjust by-value argument to by-const-reference. * ipa-predicate.h (predicate::remap_after_inlining): Same. * ipa-prop.c (ipa_find_agg_cst_for_param): Same. * ipa-prop.h (ipa_find_agg_cst_for_param): Same. * ira-build.c (ira_loop_tree_body_rev_postorder): Same. * read-rtl.c (add_overload_instance): Same. * rtl.h (native_decode_rtx): Same. (native_decode_vector_rtx): Same. * sched-int.h (sched_init_luids): Same. (haifa_init_h_i_d): Same. * simplify-rtx.c (native_decode_vector_rtx): Same. (native_decode_rtx): Same. * tree-call-cdce.c (gen_shrink_wrap_conditions): Same. (shrink_wrap_one_built_in_call_with_conds): Same. (shrink_wrap_conditional_dead_built_in_calls): Same. 
* tree-data-ref.c (create_runtime_alias_checks): Same. (compute_all_dependences): Same. * tree-data-ref.h (compute_all_dependences): Same. (create_runtime_alias_checks): Same. (index_in_loop_nest): Same. * tree-if-conv.c (mask_exists): Same. * tree-loop-distribution.c (class loop_distribution): Same. (loop_distribution::create_rdg_vertices): Same. (dump_rdg_partitions): Same. (debug_rdg_partitions): Same. (partition_contains_all_rw): Same. (loop_distribution::distribute_loop): Same. * tree-parloops.c (oacc_entry_exit_ok_1): Same. (oacc_entry_exit_single_gang): Same. * tree-ssa-loop-im.c (hoist_memory_references): Same. (loop_suitable_for_sm): Same. * tree-ssa-loop-niter.c (bound_index): Same. * tree-ssa-reassoc.c (update_ops): Same. (swap_ops_for_binary_stmt): Same. (rewrite_expr_tree): Same. (rewrite_expr_tree_parallel): Same. * tree-ssa-sccvn.c (ao_ref_init_from_vn_reference): Same. * tree-ssa-sccvn.h (ao_ref_init_from_vn_reference): Same. * tree-ssa-structalias.c (process_all_all_constraints): Same. (make_constraints_to): Same. (handle_lhs_call): Same. (find_func_aliases_for_builtin_call): Same. (sort_fieldstack): Same. (check_for_overlaps): Same. * tree-vect-loop-manip.c (vect_create_cond_for_align_checks): Same. (vect_create_cond_for_unequal_addrs): Same. (vect_create_cond_for_lower_bounds): Same. (vect_create_cond_for_alias_checks): Same. * tree-vect-slp-patterns.c (vect_validate_multiplication): Same. * tree-vect-slp.c (vect_analyze_slp_instance): Same. (vect_make_slp_decision): Same. (vect_slp_bbs): Same. (duplicate_and_interleave): Same. (vect_transform_slp_perm_load): Same. (vect_schedule_slp): Same. * tree-vectorizer.h (vect_transform_slp_perm_load): Same. (vect_schedule_slp): Same. (duplicate_and_interleave): Same. * tree.c (build_vector_from_ctor): Same. (build_vector): Same. (check_vector_cst): Same. (check_vector_cst_duplicate): Same. (check_vector_cst_fill): Same. (check_vector_cst_stepped): Same. * tree.h (build_vector_from_ctor): Same. 
--- gcc/tree-vectorizer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index d9f0195..deb2247 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2072,12 +2072,12 @@ extern tree cse_and_gimplify_to_preheader (loop_vec_info, tree); extern void vect_slp_init (void); extern void vect_slp_fini (void); extern void vect_free_slp_instance (slp_instance); -extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, vec, +extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, const vec &, gimple_stmt_iterator *, poly_uint64, bool, unsigned *, unsigned * = nullptr, bool = false); extern bool vect_slp_analyze_operations (vec_info *); -extern void vect_schedule_slp (vec_info *, vec); +extern void vect_schedule_slp (vec_info *, const vec &); extern opt_result vect_analyze_slp (vec_info *, unsigned); extern bool vect_make_slp_decision (loop_vec_info); extern void vect_detect_hybrid_slp (loop_vec_info); @@ -2095,7 +2095,7 @@ extern bool can_duplicate_and_interleave_p (vec_info *, unsigned int, tree, unsigned int * = NULL, tree * = NULL, tree * = NULL); extern void duplicate_and_interleave (vec_info *, gimple_seq *, tree, - vec, unsigned int, vec &); + const vec &, unsigned int, vec &); extern int vect_get_place_in_interleaving_chain (stmt_vec_info, stmt_vec_info); extern bool vect_update_shared_vectype (stmt_vec_info, tree); extern slp_tree vect_create_new_slp_node (unsigned, tree_code); -- cgit v1.1 From 783d809f0bb13a9f50139d03c328f59f9e3840c7 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Thu, 5 Aug 2021 14:03:23 +0100 Subject: vect: Move costing helpers from aarch64 code aarch64.c has various routines to test for specific kinds of vector statement cost. The routines aren't really target-specific, so following a suggestion from Richi, this patch moves them to a new section of tree-vectorizer.h. 
gcc/ * tree-vectorizer.h (vect_is_store_elt_extraction, vect_is_reduction) (vect_reduc_type, vect_embedded_comparison_type, vect_comparison_type) (vect_is_extending_load, vect_is_integer_truncation): New functions, moved from aarch64.c but given different names. * config/aarch64/aarch64.c (aarch64_is_store_elt_extraction) (aarch64_is_reduction, aarch64_reduc_type) (aarch64_embedded_comparison_type, aarch64_comparison_type) (aarch64_extending_load_p, aarch64_integer_truncation_p): Delete in favor of the above. Update callers accordingly. --- gcc/tree-vectorizer.h | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index deb2247..686644b4 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2192,4 +2192,108 @@ extern vect_pattern_decl_t slp_patterns[]; /* Number of supported pattern matchers. */ extern size_t num__slp_patterns; +/* ---------------------------------------------------------------------- + Target support routines + ----------------------------------------------------------------------- + The following routines are provided to simplify costing decisions in + target code. Please add more as needed. */ + +/* Return true if an operaton of kind KIND for STMT_INFO represents + the extraction of an element from a vector in preparation for + storing the element to memory. */ +inline bool +vect_is_store_elt_extraction (vect_cost_for_stmt kind, stmt_vec_info stmt_info) +{ + return (kind == vec_to_scalar + && STMT_VINFO_DATA_REF (stmt_info) + && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))); +} + +/* Return true if STMT_INFO represents part of a reduction. 
*/ +inline bool +vect_is_reduction (stmt_vec_info stmt_info) +{ + return (STMT_VINFO_REDUC_DEF (stmt_info) + || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))); +} + +/* If STMT_INFO describes a reduction, return the vect_reduction_type + of the reduction it describes, otherwise return -1. */ +inline int +vect_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info) +{ + if (loop_vec_info loop_vinfo = dyn_cast (vinfo)) + if (STMT_VINFO_REDUC_DEF (stmt_info)) + { + stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info); + return int (STMT_VINFO_REDUC_TYPE (reduc_info)); + } + return -1; +} + +/* If STMT_INFO is a COND_EXPR that includes an embedded comparison, return the + scalar type of the values being compared. Return null otherwise. */ +inline tree +vect_embedded_comparison_type (stmt_vec_info stmt_info) +{ + if (auto *assign = dyn_cast (stmt_info->stmt)) + if (gimple_assign_rhs_code (assign) == COND_EXPR) + { + tree cond = gimple_assign_rhs1 (assign); + if (COMPARISON_CLASS_P (cond)) + return TREE_TYPE (TREE_OPERAND (cond, 0)); + } + return NULL_TREE; +} + +/* If STMT_INFO is a comparison or contains an embedded comparison, return the + scalar type of the values being compared. Return null otherwise. */ +inline tree +vect_comparison_type (stmt_vec_info stmt_info) +{ + if (auto *assign = dyn_cast (stmt_info->stmt)) + if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison) + return TREE_TYPE (gimple_assign_rhs1 (assign)); + return vect_embedded_comparison_type (stmt_info); +} + +/* Return true if STMT_INFO extends the result of a load. */ +inline bool +vect_is_extending_load (class vec_info *vinfo, stmt_vec_info stmt_info) +{ + /* Although this is quite large for an inline function, this part + at least should be inline. 
*/ + gassign *assign = dyn_cast (stmt_info->stmt); + if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) + return false; + + tree rhs = gimple_assign_rhs1 (stmt_info->stmt); + tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign)); + tree rhs_type = TREE_TYPE (rhs); + if (!INTEGRAL_TYPE_P (lhs_type) + || !INTEGRAL_TYPE_P (rhs_type) + || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type)) + return false; + + stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs); + return (def_stmt_info + && STMT_VINFO_DATA_REF (def_stmt_info) + && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info))); +} + +/* Return true if STMT_INFO is an integer truncation. */ +inline bool +vect_is_integer_truncation (stmt_vec_info stmt_info) +{ + gassign *assign = dyn_cast (stmt_info->stmt); + if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) + return false; + + tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign)); + tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign)); + return (INTEGRAL_TYPE_P (lhs_type) + && INTEGRAL_TYPE_P (rhs_type) + && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type)); +} + #endif /* GCC_TREE_VECTORIZER_H */ -- cgit v1.1 From a3d3e8c362c2d850543eb2e2631128e1efc368f0 Mon Sep 17 00:00:00 2001 From: Martin Sebor Date: Thu, 5 Aug 2021 19:50:35 -0600 Subject: Adjust by-value function vec arguments to by-reference. gcc/c/ChangeLog: * c-parser.c (c_parser_declaration_or_fndef): Adjust by-value function vec arguments to by-reference. (c_finish_omp_declare_simd): Same. (c_parser_compound_statement_nostart): Same. (c_parser_for_statement): Same. (c_parser_objc_methodprotolist): Same. (c_parser_oacc_routine): Same. (c_parser_omp_for_loop): Same. (c_parser_omp_declare_simd): Same. gcc/ChangeLog: * dominance.c (prune_bbs_to_update_dominators): Adjust by-value vec arguments to by-reference. (iterate_fix_dominators): Same. * dominance.h (iterate_fix_dominators): Same. * ipa-prop.h: Call auto_vec::to_vec_legacy. 
* tree-data-ref.c (dump_data_dependence_relation): Adjust by-value vec arguments to by-reference. (debug_data_dependence_relation): Same. (dump_data_dependence_relations): Same. * tree-data-ref.h (debug_data_dependence_relation): Same. (dump_data_dependence_relations): Same. * tree-predcom.c (dump_chains): Same. (initialize_root_vars_lm): Same. (determine_unroll_factor): Same. (replace_phis_by_defined_names): Same. (insert_init_seqs): Same. (pcom_worker::tree_predictive_commoning_loop): Call auto_vec::to_vec_legacy. * tree-ssa-pre.c (insert_into_preds_of_block): Adjust by-value vec arguments to by-reference. * tree-ssa-threadbackward.c (populate_worklist): Same. (back_threader::resolve_def): Same. * tree-vect-data-refs.c (vect_check_nonzero_value): Same. (vect_enhance_data_refs_alignment): Same. (vect_check_lower_bound): Same. (vect_prune_runtime_alias_test_list): Same. (vect_permute_store_chain): Same. * tree-vect-slp-patterns.c (vect_normalize_conj_loc): Same. * tree-vect-stmts.c (vect_create_vectorized_demotion_stmts): Same. * tree-vectorizer.h (vect_permute_store_chain): Same. * vec.c (test_init): New function. (vec_c_tests): Call new function. * vec.h (vec): Declare ctors, dtor, and assignment. (auto_vec::vec_to_legacy): New function. (vec::copy): Adjust initialization. 
--- gcc/tree-vectorizer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 686644b4..5571b3c 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1990,8 +1990,8 @@ extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT); extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool); extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT); extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool); -extern void vect_permute_store_chain (vec_info *, - vec ,unsigned int, stmt_vec_info, +extern void vect_permute_store_chain (vec_info *, vec &, + unsigned int, stmt_vec_info, gimple_stmt_iterator *, vec *); extern tree vect_setup_realignment (vec_info *, stmt_vec_info, gimple_stmt_iterator *, -- cgit v1.1 From f31da42e047e8018ca6ad9809273bc7efb6ffcaf Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 6 Aug 2021 14:39:05 +0200 Subject: tree-optimization/101801 - remove vect_worthwhile_without_simd_p This removes the cost part of vect_worthwhile_without_simd_p, retaining only the correctness bits. The reason is that the cost heuristics do not properly account for SLP, and the check whether "without simd" applies misfires for AVX512 mask vectors at the moment, leading to missed vectorizations there. Any costing decision should take place in the cost modeling; no single stmt is to disable all vectorization on its own. 2021-08-06 Richard Biener PR tree-optimization/101801 * tree-vectorizer.h (vect_worthwhile_without_simd_p): Rename... (vect_can_vectorize_without_simd_p): ... to this. * tree-vect-loop.c (vect_worthwhile_without_simd_p): Rename... (vect_can_vectorize_without_simd_p): ... to this and fold in vect_min_worthwhile_factor. (vect_min_worthwhile_factor): Remove. (vectorizable_reduction): Adjust and remove the cost part. * tree-vect-stmts.c (vectorizable_shift): Likewise. 
(vectorizable_operation): Likewise. --- gcc/tree-vectorizer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 5571b3c..de0ecf8 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2061,7 +2061,7 @@ extern bool vectorizable_lc_phi (loop_vec_info, stmt_vec_info, gimple **, slp_tree); extern bool vectorizable_phi (vec_info *, stmt_vec_info, gimple **, slp_tree, stmt_vector_for_cost *); -extern bool vect_worthwhile_without_simd_p (vec_info *, tree_code); +extern bool vect_can_vectorize_without_simd_p (tree_code); extern int vect_get_known_peeling_cost (loop_vec_info, int, int *, stmt_vector_for_cost *, stmt_vector_for_cost *, -- cgit v1.1 From 19d1a529fa9f78e7ec7be38d423c90e00cec8f8c Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 9 Aug 2021 11:42:47 +0200 Subject: tree-optimization/101801 - rework generic vector vectorization more This builds on top of the vect_worthwhile_without_simd_p refactoring done earlier. It was wrong in dropping the apparent double checks for operation support since the optab check can happen with an integer vector emulation mode and thus succeed but vector lowering might not actually support the operation on word_mode. The following patch adds a vect_emulated_vector_p helper and re-instantiates the check where it was previously. It also adds appropriate costing of the scalar stmts emitted by vector lowering to vectorizable_operation which should be the only place such operations are synthesized. I've also cared for the case where the vector mode is supported but the operation is not (though I think this will be unlikely given we're talking about plus, minus and negate). This fixes the observed FAIL of gcc.dg/tree-ssa/gen-vect-11b.c with -m32 where we end up vectorizing a multiplication that ends up being torn down to scalars again by vector lowering. 
I'm not super happy about all the other places where we're now and previously feeding scalar modes to optab checks where we want to know whether we can vectorize sth but well. 2021-09-08 Richard Biener PR tree-optimization/101801 PR tree-optimization/101819 * tree-vectorizer.h (vect_emulated_vector_p): Declare. * tree-vect-loop.c (vect_emulated_vector_p): New function. (vectorizable_reduction): Re-instantiate a check for emulated operations. * tree-vect-stmts.c (vectorizable_shift): Likewise. (vectorizable_operation): Likewise. Cost emulated vector operations according to the scalar sequence synthesized by vector lowering. --- gcc/tree-vectorizer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index de0ecf8..9c2c29d 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2061,6 +2061,7 @@ extern bool vectorizable_lc_phi (loop_vec_info, stmt_vec_info, gimple **, slp_tree); extern bool vectorizable_phi (vec_info *, stmt_vec_info, gimple **, slp_tree, stmt_vector_for_cost *); +extern bool vect_emulated_vector_p (tree); extern bool vect_can_vectorize_without_simd_p (tree_code); extern int vect_get_known_peeling_cost (loop_vec_info, int, int *, stmt_vector_for_cost *, -- cgit v1.1 From 9216ee6d1195d48388f825cf1b072e570129cbbe Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Tue, 24 Aug 2021 12:25:25 +0200 Subject: tree-optimization/100089 - avoid leaving scalar if-converted code around This avoids leaving scalar if-converted code around for the case of BB vectorizing an if-converted loop body when using the very-cheap cost model. In this case we scan not vectorized scalar stmts in the basic-block vectorized for COND_EXPRs and force the vectorization to be marked as not profitable. The patch also makes sure to always consider all BB vectorization subgraphs together for costing purposes when vectorizing an if-converted loop body. 
2021-08-24 Richard Biener PR tree-optimization/100089 * tree-vectorizer.h (vect_slp_bb): Rename to ... (vect_slp_if_converted_bb): ... this and get the original loop as new argument. * tree-vectorizer.c (try_vectorize_loop_1): Revert previous fix, pass original loop to vect_slp_if_converted_bb. * tree-vect-slp.c (vect_bb_vectorization_profitable_p): If orig_loop was passed scan the not vectorized stmts for COND_EXPRs and force not profitable if found. (vect_slp_region): Pass down all SLP instances to costing if orig_loop was specified. (vect_slp_bbs): Pass through orig_loop. (vect_slp_bb): Rename to ... (vect_slp_if_converted_bb): ... this and get the original loop as new argument. (vect_slp_function): Adjust. --- gcc/tree-vectorizer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 9c2c29d..72e018e 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -2087,7 +2087,7 @@ extern void vect_gather_slp_loads (vec_info *); extern void vect_get_slp_defs (slp_tree, vec *); extern void vect_get_slp_defs (vec_info *, slp_tree, vec > *, unsigned n = -1U); -extern bool vect_slp_bb (basic_block); +extern bool vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop); extern bool vect_slp_function (function *); extern stmt_vec_info vect_find_last_scalar_stmt_in_slp (slp_tree); extern stmt_vec_info vect_find_first_scalar_stmt_in_slp (slp_tree); -- cgit v1.1 From 153766ec8351d55cfe8bd6d69bdfc0c2cef71e56 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Tue, 31 Aug 2021 10:28:40 +0200 Subject: tree-optimization/102139 - fix SLP DR base alignment When doing whole-function SLP we have to make sure the recorded base alignments we compute as the maximum alignment seen for a base anywhere in the function is actually valid at the point we want to make use of it. 
To make this work we now record the stmt the alignment was derived from in addition to the DR's innermost behavior and we use a dominance check to verify the recorded info is valid when doing BB vectorization. For this to work for groups inside a BB that are separated by a call that might not return we now store the DR analysis group-id permanently and use that for an additional check when the DRs are in the same BB. 2021-08-31 Richard Biener PR tree-optimization/102139 * tree-vectorizer.h (vec_base_alignments): Adjust hash-map type to record a std::pair of the stmt-info and the innermost loop behavior. (dr_vec_info::group): New member. * tree-vect-data-refs.c (vect_record_base_alignment): Adjust. (vect_compute_data_ref_alignment): Verify the recorded base alignment can be used. (data_ref_pair): Remove. (dr_group_sort_cmp): Adjust. (vect_analyze_data_ref_accesses): Store the group-ID in the dr_vec_info and operate on a vector of dr_vec_infos. * gcc.dg/torture/pr102139.c: New testcase. --- gcc/tree-vectorizer.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'gcc/tree-vectorizer.h') diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 72e018e..7453d2a 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -106,10 +106,11 @@ struct stmt_info_for_cost { typedef vec stmt_vector_for_cost; -/* Maps base addresses to an innermost_loop_behavior that gives the maximum - known alignment for that base. */ +/* Maps base addresses to an innermost_loop_behavior and the stmt it was + derived from that gives the maximum known alignment for that base. */ typedef hash_map vec_base_alignments; + std::pair > + vec_base_alignments; /************************************************************************ SLP @@ -1059,6 +1060,9 @@ public: data_reference *dr; /* The statement that contains the data reference. */ stmt_vec_info stmt; + /* The analysis group this DR belongs to when doing BB vectorization. 
+ DRs of the same group belong to the same conditional execution context. */ + unsigned group; /* The misalignment in bytes of the reference, or -1 if not known. */ int misalignment; /* The byte alignment that we'd ideally like the reference to have, -- cgit v1.1