author     Richard Biener <rguenther@suse.de>  2020-09-04 15:33:19 +0200
committer  Richard Biener <rguenther@suse.de>  2020-09-07 09:47:36 +0200
commit     095d42feed09f880f835ed74d0aa7b1ad7abd03c (patch)
tree       ac7563d6514c5e7d246968efe8c9653d7b79fbf0 /gcc/tree-vect-loop.c
parent     d30869a8d4886aee4020be3b28b15b1b15c8d9ad (diff)
code generate live lanes in basic-block vectorization
The following adds the capability to code-generate live lanes in
basic-block vectorization using lane extracts from vector stmts rather
than keeping the original scalar code around for those.  This
eventually makes previously unprofitable vectorizations profitable (the
live scalar code was appropriately costed before, and so are the lane
extracts now); apart from the cost model effects this patch doesn't add
or remove any basic-block vectorization capabilities.

The patch re/ab-uses STMT_VINFO_LIVE_P in basic-block vectorization
mode to tell whether a live lane is vectorized or whether it is
provided by means of keeping the scalar code live.

The patch is also a first step towards vectorizing sequences of stmts
that do not end up in stores or vector constructors.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

2020-09-04  Richard Biener  <rguenther@suse.de>

        * tree-vectorizer.h (vectorizable_live_operation): Adjust.
        * tree-vect-loop.c (vectorizable_live_operation): Vectorize
        live lanes out of basic-block vectorization nodes.
        * tree-vect-slp.c (vect_bb_slp_mark_live_stmts): New function.
        (vect_slp_analyze_operations): Analyze live lanes and their
        vectorization possibility after the whole SLP graph is final.
        (vect_bb_slp_scalar_cost): Adjust for vectorized live lanes.
        * tree-vect-stmts.c (can_vectorize_live_stmts): Adjust.
        (vect_transform_stmt): Call can_vectorize_live_stmts also for
        basic-block vectorization.

        * gcc.dg/vect/bb-slp-46.c: New testcase.
        * gcc.dg/vect/bb-slp-47.c: Likewise.
        * gcc.dg/vect/bb-slp-32.c: Adjust.
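To illustrate the effect, consider a kernel of the kind the new
testcases exercise.  This is a hypothetical sketch in the spirit of
gcc.dg/vect/bb-slp-46.c, not its literal contents:

    /* Hypothetical example; four stores form a basic-block SLP group
       and one lane of the group is also used as a scalar.  */
    double x[4], y[4];

    double
    foo (void)
    {
      double tem = x[0] + y[0];	/* Lane 0 of the SLP store group...  */
      x[0] = tem;
      x[1] = x[1] + y[1];
      x[2] = x[2] + y[2];
      x[3] = x[3] + y[3];
      return tem;		/* ...is also live here.  */
    }

Before this patch the scalar addition computing tem had to be kept
alongside the vectorized store group; now vectorizable_live_operation
emits a lane extract from the vector statement instead and the scalar
computation can go away.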
Diffstat (limited to 'gcc/tree-vect-loop.c')
-rw-r--r--  gcc/tree-vect-loop.c  243
1 file changed, 149 insertions(+), 94 deletions(-)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 9799b3d..a0c3c5c 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -8012,14 +8012,14 @@ vectorizable_induction (loop_vec_info loop_vinfo,
it can be supported. */
bool
-vectorizable_live_operation (loop_vec_info loop_vinfo,
+vectorizable_live_operation (vec_info *vinfo,
stmt_vec_info stmt_info,
gimple_stmt_iterator *gsi,
slp_tree slp_node, slp_instance slp_node_instance,
int slp_index, bool vec_stmt_p,
- stmt_vector_for_cost *)
+ stmt_vector_for_cost *cost_vec)
{
- class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
imm_use_iterator imm_iter;
tree lhs, lhs_type, bitsize, vec_bitsize;
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
@@ -8064,10 +8064,6 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
return true;
}
- /* FORNOW. CHECKME. */
- if (nested_in_vect_loop_p (loop, stmt_info))
- return false;
-
/* If STMT is not relevant and it is a simple assignment and its inputs are
invariant then it can remain in place, unvectorized. The original last
scalar value that it computes will be used. */
@@ -8090,12 +8086,11 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
{
gcc_assert (slp_index >= 0);
- int num_scalar = SLP_TREE_LANES (slp_node);
- int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-
/* Get the last occurrence of the scalar index from the concatenation of
all the slp vectors. Calculate which slp vector it is and the index
within. */
+ int num_scalar = SLP_TREE_LANES (slp_node);
+ int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
/* Calculate which vector contains the result, and which lane of
@@ -8113,7 +8108,7 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
if (!vec_stmt_p)
{
/* No transformation required. */
- if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
{
if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
OPTIMIZE_FOR_SPEED))
@@ -8150,14 +8145,20 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
1, vectype, NULL);
}
}
+ /* ??? Enable for loop costing as well. */
+ if (!loop_vinfo)
+ record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
+ 0, vect_epilogue);
return true;
}
/* Use the lhs of the original scalar statement. */
gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
+ "stmt %G", stmt);
- lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
- : gimple_get_lhs (stmt);
+ lhs = gimple_get_lhs (stmt);
lhs_type = TREE_TYPE (lhs);
bitsize = vector_element_bits_tree (vectype);
@@ -8165,16 +8166,14 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
/* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
tree vec_lhs, bitstart;
+ gimple *vec_stmt;
if (slp_node)
{
- gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+ gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
/* Get the correct slp vectorized stmt. */
- gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
- if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
- vec_lhs = gimple_phi_result (phi);
- else
- vec_lhs = gimple_get_lhs (vec_stmt);
+ vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
+ vec_lhs = gimple_get_lhs (vec_stmt);
/* Get entry to use. */
bitstart = bitsize_int (vec_index);
@@ -8183,102 +8182,158 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
else
{
/* For multiple copies, get the last copy. */
- vec_lhs = gimple_get_lhs (STMT_VINFO_VEC_STMTS (stmt_info).last ());
+ vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
+ vec_lhs = gimple_get_lhs (vec_stmt);
/* Get the last lane in the vector. */
bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
}
- /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
- requirement, insert one phi node for it. It looks like:
- loop;
- BB:
- # lhs' = PHI <lhs>
- ==>
- loop;
- BB:
- # vec_lhs' = PHI <vec_lhs>
- new_tree = lane_extract <vec_lhs', ...>;
- lhs' = new_tree; */
+ if (loop_vinfo)
+ {
+ /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
+ requirement, insert one phi node for it. It looks like:
+ loop;
+ BB:
+ # lhs' = PHI <lhs>
+ ==>
+ loop;
+ BB:
+ # vec_lhs' = PHI <vec_lhs>
+ new_tree = lane_extract <vec_lhs', ...>;
+ lhs' = new_tree; */
+
+ class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ basic_block exit_bb = single_exit (loop)->dest;
+ gcc_assert (single_pred_p (exit_bb));
+
+ tree vec_lhs_phi = copy_ssa_name (vec_lhs);
+ gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
+ SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
+
+ gimple_seq stmts = NULL;
+ tree new_tree;
+ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ {
+ /* Emit:
- basic_block exit_bb = single_exit (loop)->dest;
- gcc_assert (single_pred_p (exit_bb));
+ SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
- tree vec_lhs_phi = copy_ssa_name (vec_lhs);
- gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
- SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
+ where VEC_LHS is the vectorized live-out result and MASK is
+ the loop mask for the final iteration. */
+ gcc_assert (ncopies == 1 && !slp_node);
+ tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
+ tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+ 1, vectype, 0);
+ tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+ mask, vec_lhs_phi);
- gimple_seq stmts = NULL;
- tree new_tree;
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
- {
- /* Emit:
+ /* Convert the extracted vector element to the scalar type. */
+ new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+ }
+ else
+ {
+ tree bftype = TREE_TYPE (vectype);
+ if (VECTOR_BOOLEAN_TYPE_P (vectype))
+ bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
+ new_tree = build3 (BIT_FIELD_REF, bftype,
+ vec_lhs_phi, bitsize, bitstart);
+ new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
+ &stmts, true, NULL_TREE);
+ }
- SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+ if (stmts)
+ {
+ gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
+ gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
- where VEC_LHS is the vectorized live-out result and MASK is
- the loop mask for the final iteration. */
- gcc_assert (ncopies == 1 && !slp_node);
- tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
- tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
- vectype, 0);
- tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
- mask, vec_lhs_phi);
+ /* Remove existing phi from lhs and create one copy from new_tree. */
+ tree lhs_phi = NULL_TREE;
+ gimple_stmt_iterator gsi;
+ for (gsi = gsi_start_phis (exit_bb);
+ !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple *phi = gsi_stmt (gsi);
+ if ((gimple_phi_arg_def (phi, 0) == lhs))
+ {
+ remove_phi_node (&gsi, false);
+ lhs_phi = gimple_phi_result (phi);
+ gimple *copy = gimple_build_assign (lhs_phi, new_tree);
+ gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
+ break;
+ }
+ }
+ }
- /* Convert the extracted vector element to the required scalar type. */
- new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+ /* Replace use of lhs with newly computed result. If the use stmt is a
+ single arg PHI, just replace all uses of PHI result. It's necessary
+ because lcssa PHI defining lhs may be before newly inserted stmt. */
+ use_operand_p use_p;
+ FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+ if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
+ && !is_gimple_debug (use_stmt))
+ {
+ if (gimple_code (use_stmt) == GIMPLE_PHI
+ && gimple_phi_num_args (use_stmt) == 1)
+ {
+ replace_uses_by (gimple_phi_result (use_stmt), new_tree);
+ }
+ else
+ {
+ FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+ SET_USE (use_p, new_tree);
+ }
+ update_stmt (use_stmt);
+ }
}
else
{
+ /* For basic-block vectorization simply insert the lane-extraction. */
tree bftype = TREE_TYPE (vectype);
if (VECTOR_BOOLEAN_TYPE_P (vectype))
bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
- new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
+ tree new_tree = build3 (BIT_FIELD_REF, bftype,
+ vec_lhs, bitsize, bitstart);
+ gimple_seq stmts = NULL;
new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
&stmts, true, NULL_TREE);
- }
- if (stmts)
- {
- gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
- gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
+ gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
- /* Remove existing phi from lhs and create one copy from new_tree. */
- tree lhs_phi = NULL_TREE;
- gimple_stmt_iterator gsi;
- for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
- {
- gimple *phi = gsi_stmt (gsi);
- if ((gimple_phi_arg_def (phi, 0) == lhs))
- {
- remove_phi_node (&gsi, false);
- lhs_phi = gimple_phi_result (phi);
- gimple *copy = gimple_build_assign (lhs_phi, new_tree);
- gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
- break;
- }
- }
- }
-
- /* Replace use of lhs with newly computed result. If the use stmt is a
- single arg PHI, just replace all uses of PHI result. It's necessary
- because lcssa PHI defining lhs may be before newly inserted stmt. */
- use_operand_p use_p;
- FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
- if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
- && !is_gimple_debug (use_stmt))
- {
- if (gimple_code (use_stmt) == GIMPLE_PHI
- && gimple_phi_num_args (use_stmt) == 1)
- {
- replace_uses_by (gimple_phi_result (use_stmt), new_tree);
- }
- else
- {
- FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
- SET_USE (use_p, new_tree);
- }
- update_stmt (use_stmt);
+ /* Replace use of lhs with newly computed result. If the use stmt is a
+ single arg PHI, just replace all uses of PHI result. It's necessary
+ because lcssa PHI defining lhs may be before newly inserted stmt. */
+ use_operand_p use_p;
+ stmt_vec_info use_stmt_info;
+ FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+ if (!is_gimple_debug (use_stmt)
+ && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
+ || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
+ {
+ /* ??? This can happen when the live lane ends up being
+ used in a vector construction code-generated by an
+ external SLP node (and code-generation for that already
+ happened). See gcc.dg/vect/bb-slp-47.c.
+ Doing this is what would happen if that vector CTOR
+ were not code-generated yet so it is not too bad.
+ ??? In fact we'd likely want to avoid this situation
+ in the first place. */
+ if (gimple_code (use_stmt) != GIMPLE_PHI
+ && !vect_stmt_dominates_stmt_p (gsi_stmt (*gsi), use_stmt))
+ {
+ gcc_assert (is_gimple_assign (use_stmt)
+ && gimple_assign_rhs_code (use_stmt) == CONSTRUCTOR);
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "Using original scalar computation for "
+ "live lane because use preceeds vector "
+ "def\n");
+ continue;
+ }
+ FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+ SET_USE (use_p, new_tree);
+ update_stmt (use_stmt);
+ }
}
return true;
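In GIMPLE terms, the basic-block path added above produces code of
roughly the following shape (a hand-written sketch assuming a four-lane
double vector; not actual dump output):

    /* Before: the live lane retains its scalar definition.  */
    tem_1 = _2 + _3;
    x[0] = tem_1;
    ...
    return tem_1;

    /* After: a lane extract replaces the scalar definition, inserted
       at the point the vector statement was code-generated.  */
    vect__4 = vect__2 + vect__3;
    MEM <vector(4) double> [(double *)&x] = vect__4;
    tem_1 = BIT_FIELD_REF <vect__4, 64, 0>;
    return tem_1;

The BIT_FIELD_REF operands follow the code above: the second operand is
the element size in bits (bitsize) and the third the bit position of
the extracted lane (bitstart).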