aboutsummaryrefslogtreecommitdiff
path: root/gcc/tree-vect-loop.cc
diff options
context:
space:
mode:
authorFeng Xue <fxue@os.amperecomputing.com>2024-05-29 17:22:36 +0800
committerFeng Xue <fxue@os.amperecomputing.com>2024-07-17 21:54:05 +0800
commit178cc419512f7e358f88dfe2336625aa99cd7438 (patch)
tree518eacbefd2e5457b33c4d3a0cc87c3943ff6306 /gcc/tree-vect-loop.cc
parent8b59fa9d8ca25bdf0792390a8bdeae151532a530 (diff)
downloadgcc-178cc419512f7e358f88dfe2336625aa99cd7438.zip
gcc-178cc419512f7e358f88dfe2336625aa99cd7438.tar.gz
gcc-178cc419512f7e358f88dfe2336625aa99cd7438.tar.bz2
vect: Support multiple lane-reducing operations for loop reduction [PR114440]
For lane-reducing operation (dot-prod/widen-sum/sad) in loop reduction, the current vectorizer could only handle the pattern if the reduction chain does not contain any other operation, no matter whether the other is normal or lane-reducing. This patch removes some constraints in reduction analysis to allow multiple arbitrary lane-reducing operations with mixed input vectypes in a loop reduction chain. For example: int sum = 1; for (i) { sum += d0[i] * d1[i]; // dot-prod <vector(16) char> sum += w[i]; // widen-sum <vector(16) char> sum += abs(s0[i] - s1[i]); // sad <vector(8) short> } The vector size is 128-bit, vectorization factor is 16. Reduction statements would be transformed as: vector<4> int sum_v0 = { 0, 0, 0, 1 }; vector<4> int sum_v1 = { 0, 0, 0, 0 }; vector<4> int sum_v2 = { 0, 0, 0, 0 }; vector<4> int sum_v3 = { 0, 0, 0, 0 }; for (i / 16) { sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0); sum_v1 = sum_v1; // copy sum_v2 = sum_v2; // copy sum_v3 = sum_v3; // copy sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0); sum_v1 = sum_v1; // copy sum_v2 = sum_v2; // copy sum_v3 = sum_v3; // copy sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0); sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1); sum_v2 = sum_v2; // copy sum_v3 = sum_v3; // copy } sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1 2024-03-22 Feng Xue <fxue@os.amperecomputing.com> gcc/ PR tree-optimization/114440 * tree-vectorizer.h (vectorizable_lane_reducing): New function declaration. * tree-vect-stmts.cc (vect_analyze_stmt): Call new function vectorizable_lane_reducing to analyze lane-reducing operation. * tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation code related to emulated_mixed_dot_prod. (vectorizable_lane_reducing): New function. (vectorizable_reduction): Allow multiple lane-reducing operations in loop reduction. Move some original lane-reducing related code to vectorizable_lane_reducing. 
(vect_transform_reduction): Adjust comments with updated example. gcc/testsuite/ PR tree-optimization/114440 * gcc.dg/vect/vect-reduc-chain-1.c * gcc.dg/vect/vect-reduc-chain-2.c * gcc.dg/vect/vect-reduc-chain-3.c * gcc.dg/vect/vect-reduc-chain-dot-slp-1.c * gcc.dg/vect/vect-reduc-chain-dot-slp-2.c * gcc.dg/vect/vect-reduc-chain-dot-slp-3.c * gcc.dg/vect/vect-reduc-chain-dot-slp-4.c * gcc.dg/vect/vect-reduc-dot-slp-1.c
Diffstat (limited to 'gcc/tree-vect-loop.cc')
-rw-r--r--gcc/tree-vect-loop.cc241
1 files changed, 171 insertions, 70 deletions
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 9c5c305..1c3dbf4 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -5328,8 +5328,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
if (!gimple_extract_op (orig_stmt_info->stmt, &op))
gcc_unreachable ();
- bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
-
if (reduction_type == EXTRACT_LAST_REDUCTION)
/* No extra instructions are needed in the prologue. The loop body
operations are costed in vectorizable_condition. */
@@ -5364,12 +5362,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
initial result of the data reduction, initial value of the index
reduction. */
prologue_stmts = 4;
- else if (emulated_mixed_dot_prod)
- /* We need the initial reduction value and two invariants:
- one that contains the minimum signed value and one that
- contains half of its negative. */
- prologue_stmts = 3;
else
+ /* We need the initial reduction value. */
prologue_stmts = 1;
prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
scalar_to_vec, stmt_info, 0,
@@ -7482,6 +7476,142 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
}
}
+/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
+ the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC,
+ and the analysis is for slp if SLP_NODE is not NULL.
+
+ For a lane-reducing operation, the loop reduction path that it lies in,
+ may contain normal operation, or other lane-reducing operation of different
+ input type size, for example:
+
+ int sum = 0;
+ for (i)
+ {
+ ...
+ sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
+ sum += w[i]; // widen-sum <vector(16) char>
+ sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
+ sum += n[i]; // normal <vector(4) int>
+ ...
+ }
+
+ Vectorization factor is essentially determined by operation whose input
+ vectype has the most lanes ("vector(16) char" in the example), while we
+ need to choose input vectype with the least lanes ("vector(4) int" in the
+ example) to determine effective number of vector reduction PHIs. */
+
+bool
+vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+ slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+ gimple *stmt = stmt_info->stmt;
+
+ if (!lane_reducing_stmt_p (stmt))
+ return false;
+
+ tree type = TREE_TYPE (gimple_assign_lhs (stmt));
+
+ if (!INTEGRAL_TYPE_P (type))
+ return false;
+
+ /* Do not try to vectorize bit-precision reductions. */
+ if (!type_has_mode_precision_p (type))
+ return false;
+
+ stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
+
+ /* TODO: Support lane-reducing operation that does not directly participate
+ in loop reduction. */
+ if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
+ return false;
+
+ /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
+ recognized. */
+ gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
+ gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
+
+ for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
+ {
+ stmt_vec_info def_stmt_info;
+ slp_tree slp_op;
+ tree op;
+ tree vectype;
+ enum vect_def_type dt;
+
+ if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
+ &slp_op, &dt, &vectype, &def_stmt_info))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "use not simple.\n");
+ return false;
+ }
+
+ if (!vectype)
+ {
+ vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
+ slp_op);
+ if (!vectype)
+ return false;
+ }
+
+ if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for invariants\n");
+ return false;
+ }
+
+ if (i == STMT_VINFO_REDUC_IDX (stmt_info))
+ continue;
+
+ /* There should be at most one cycle def in the stmt. */
+ if (VECTORIZABLE_CYCLE_DEF (dt))
+ return false;
+ }
+
+ tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
+
+ gcc_assert (vectype_in);
+
+ /* Compute number of effective vector statements for costing. */
+ unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node,
+ vectype_in);
+ gcc_assert (ncopies_for_cost >= 1);
+
+ if (vect_is_emulated_mixed_dot_prod (stmt_info))
+ {
+ /* We need extra two invariants: one that contains the minimum signed
+ value and one that contains half of its negative. */
+ int prologue_stmts = 2;
+ unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
+ scalar_to_vec, stmt_info, 0,
+ vect_prologue);
+ if (dump_enabled_p ())
+ dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
+ "extra prologue_cost = %d .\n", cost);
+
+ /* Three dot-products and a subtraction. */
+ ncopies_for_cost *= 4;
+ }
+
+ record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info,
+ 0, vect_body);
+
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ enum tree_code code = gimple_assign_rhs_code (stmt);
+ vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
+ slp_node, code, type,
+ vectype_in);
+ }
+
+ /* Transform via vect_transform_reduction. */
+ STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+ return true;
+}
+
/* Function vectorizable_reduction.
Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -7721,7 +7851,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
}
/* For lane-reducing operation vectorizable analysis needs the
- reduction PHI information */
+ reduction PHI information. */
STMT_VINFO_REDUC_DEF (def) = phi_info;
/* Each lane-reducing operation has its own input vectype, while
@@ -7815,18 +7945,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
if (!type_has_mode_precision_p (op.type))
return false;
- /* For lane-reducing ops we're reducing the number of reduction PHIs
- which means the only use of that may be in the lane-reducing operation. */
- if (lane_reducing
- && reduc_chain_length != 1
- && !only_slp_reduc_chain)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "lane-reducing reduction with extra stmts.\n");
- return false;
- }
-
/* Lane-reducing ops also never can be used in a SLP reduction group
since we'll mix lanes belonging to different reductions. But it's
OK to use them in a reduction chain or when the reduction group
@@ -8366,14 +8484,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
&& loop_vinfo->suggested_unroll_factor == 1)
single_defuse_cycle = true;
- if (single_defuse_cycle || lane_reducing)
+ if (single_defuse_cycle && !lane_reducing)
{
gcc_assert (op.code != COND_EXPR);
- /* 4. Supportable by target? */
- bool ok = true;
-
- /* 4.1. check support for the operation in the loop
+ /* 4. check support for the operation in the loop
This isn't necessary for the lane reduction codes, since they
can only be produced by pattern matching, and it's up to the
@@ -8382,14 +8497,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
mixed-sign dot-products can be implemented using signed
dot-products. */
machine_mode vec_mode = TYPE_MODE (vectype_in);
- if (!lane_reducing
- && !directly_supported_p (op.code, vectype_in, optab_vector))
+ if (!directly_supported_p (op.code, vectype_in, optab_vector))
{
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "op not supported by target.\n");
if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
|| !vect_can_vectorize_without_simd_p (op.code))
- ok = false;
+ single_defuse_cycle = false;
else
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "proceeding using word mode.\n");
@@ -8402,16 +8516,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
dump_printf (MSG_NOTE, "using word mode not possible.\n");
return false;
}
-
- /* lane-reducing operations have to go through vect_transform_reduction.
- For the other cases try without the single cycle optimization. */
- if (!ok)
- {
- if (lane_reducing)
- return false;
- else
- single_defuse_cycle = false;
- }
}
if (dump_enabled_p () && single_defuse_cycle)
dump_printf_loc (MSG_NOTE, vect_location,
@@ -8419,22 +8523,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
"multiple vectors to one in the loop body\n");
STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
- /* If the reduction stmt is one of the patterns that have lane
- reduction embedded we cannot handle the case of ! single_defuse_cycle. */
- if ((ncopies > 1 && ! single_defuse_cycle)
- && lane_reducing)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "multi def-use cycle not possible for lane-reducing "
- "reduction operation\n");
- return false;
- }
+ /* For lane-reducing operation, the below processing related to single
+ defuse-cycle will be done in its own vectorizable function. One more
+ thing to note is that the operation must not be involved in fold-left
+ reduction. */
+ single_defuse_cycle &= !lane_reducing;
if (slp_node
- && !(!single_defuse_cycle
- && !lane_reducing
- && reduction_type != FOLD_LEFT_REDUCTION))
+ && (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION))
for (i = 0; i < (int) op.num_ops; i++)
if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
{
@@ -8447,28 +8543,20 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
reduction_type, ncopies, cost_vec);
/* Cost the reduction op inside the loop if transformed via
- vect_transform_reduction. Otherwise this is costed by the
- separate vectorizable_* routines. */
- if (single_defuse_cycle || lane_reducing)
- {
- int factor = 1;
- if (vect_is_emulated_mixed_dot_prod (stmt_info))
- /* Three dot-products and a subtraction. */
- factor = 4;
- record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
- stmt_info, 0, vect_body);
- }
+ vect_transform_reduction for non-lane-reducing operation. Otherwise
+ this is costed by the separate vectorizable_* routines. */
+ if (single_defuse_cycle)
+ record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
if (dump_enabled_p ()
&& reduction_type == FOLD_LEFT_REDUCTION)
dump_printf_loc (MSG_NOTE, vect_location,
"using an in-order (fold-left) reduction.\n");
STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
- /* All but single defuse-cycle optimized, lane-reducing and fold-left
- reductions go through their own vectorizable_* routines. */
- if (!single_defuse_cycle
- && !lane_reducing
- && reduction_type != FOLD_LEFT_REDUCTION)
+
+ /* All but single defuse-cycle optimized and fold-left reductions go
+ through their own vectorizable_* routines. */
+ if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
{
stmt_vec_info tem
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
@@ -8746,13 +8834,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
And vector reduction PHIs are always generated to the full extent, no
matter lane-reducing op exists or not. If some copies or PHIs are
actually superfluous, they would be cleaned up by passes after
- vectorization. An example for single-lane slp is given as below.
+ vectorization. An example for single-lane slp, lane-reducing ops
+ with mixed input vectypes in a reduction chain, is given as below.
Similarly, this handling is applicable for multiple-lane slp as well.
int sum = 1;
for (i)
{
sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
+ sum += w[i]; // widen-sum <vector(16) char>
+ sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
}
The vector size is 128-bit, vectorization factor is 16. Reduction
@@ -8769,9 +8860,19 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
sum_v1 = sum_v1; // copy
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
+
+ sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
+ sum_v1 = sum_v1; // copy
+ sum_v2 = sum_v2; // copy
+ sum_v3 = sum_v3; // copy
+
+ sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
+ sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
+ sum_v2 = sum_v2; // copy
+ sum_v3 = sum_v3; // copy
}
- sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0
+ sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1
*/
unsigned effec_ncopies = vec_oprnds[0].length ();
unsigned total_ncopies = vec_oprnds[reduc_index].length ();