diff options
author | Feng Xue <fxue@os.amperecomputing.com> | 2024-05-29 17:22:36 +0800 |
---|---|---|
committer | Feng Xue <fxue@os.amperecomputing.com> | 2024-07-17 21:54:05 +0800 |
commit | 178cc419512f7e358f88dfe2336625aa99cd7438 (patch) | |
tree | 518eacbefd2e5457b33c4d3a0cc87c3943ff6306 /gcc/tree-vect-loop.cc | |
parent | 8b59fa9d8ca25bdf0792390a8bdeae151532a530 (diff) | |
download | gcc-178cc419512f7e358f88dfe2336625aa99cd7438.zip gcc-178cc419512f7e358f88dfe2336625aa99cd7438.tar.gz gcc-178cc419512f7e358f88dfe2336625aa99cd7438.tar.bz2 |
vect: Support multiple lane-reducing operations for loop reduction [PR114440]
For a lane-reducing operation (dot-prod/widen-sum/sad) in a loop reduction, the current
vectorizer could only handle the pattern if the reduction chain does not
contain any other operation, no matter whether that operation is normal or lane-reducing.
This patch removes some constraints in reduction analysis to allow multiple
arbitrary lane-reducing operations with mixed input vectypes in a loop
reduction chain. For example:
int sum = 1;
for (i)
{
sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
sum += w[i]; // widen-sum <vector(16) char>
sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
}
The vector size is 128-bit, and the vectorization factor is 16. Reduction statements
would be transformed as:
vector<4> int sum_v0 = { 0, 0, 0, 1 };
vector<4> int sum_v1 = { 0, 0, 0, 0 };
vector<4> int sum_v2 = { 0, 0, 0, 0 };
vector<4> int sum_v3 = { 0, 0, 0, 0 };
for (i / 16)
{
sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
sum_v1 = sum_v1; // copy
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
sum_v1 = sum_v1; // copy
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
}
sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1
2024-03-22 Feng Xue <fxue@os.amperecomputing.com>
gcc/
PR tree-optimization/114440
* tree-vectorizer.h (vectorizable_lane_reducing): New function
declaration.
* tree-vect-stmts.cc (vect_analyze_stmt): Call new function
vectorizable_lane_reducing to analyze lane-reducing operation.
* tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation
code related to emulated_mixed_dot_prod.
(vectorizable_lane_reducing): New function.
(vectorizable_reduction): Allow multiple lane-reducing operations in
loop reduction. Move some original lane-reducing related code to
vectorizable_lane_reducing.
(vect_transform_reduction): Adjust comments with updated example.
gcc/testsuite/
PR tree-optimization/114440
* gcc.dg/vect/vect-reduc-chain-1.c
* gcc.dg/vect/vect-reduc-chain-2.c
* gcc.dg/vect/vect-reduc-chain-3.c
* gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
* gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
* gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
* gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
* gcc.dg/vect/vect-reduc-dot-slp-1.c
Diffstat (limited to 'gcc/tree-vect-loop.cc')
-rw-r--r-- | gcc/tree-vect-loop.cc | 241 |
1 files changed, 171 insertions, 70 deletions
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 9c5c305..1c3dbf4 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -5328,8 +5328,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, if (!gimple_extract_op (orig_stmt_info->stmt, &op)) gcc_unreachable (); - bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); - if (reduction_type == EXTRACT_LAST_REDUCTION) /* No extra instructions are needed in the prologue. The loop body operations are costed in vectorizable_condition. */ @@ -5364,12 +5362,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, initial result of the data reduction, initial value of the index reduction. */ prologue_stmts = 4; - else if (emulated_mixed_dot_prod) - /* We need the initial reduction value and two invariants: - one that contains the minimum signed value and one that - contains half of its negative. */ - prologue_stmts = 3; else + /* We need the initial reduction value. */ prologue_stmts = 1; prologue_cost += record_stmt_cost (cost_vec, prologue_stmts, scalar_to_vec, stmt_info, 0, @@ -7482,6 +7476,142 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo, } } +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in + the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC, + and the analysis is for slp if SLP_NODE is not NULL. + + For a lane-reducing operation, the loop reduction path that it lies in, + may contain normal operation, or other lane-reducing operation of different + input type size, an example as: + + int sum = 0; + for (i) + { + ... + sum += d0[i] * d1[i]; // dot-prod <vector(16) char> + sum += w[i]; // widen-sum <vector(16) char> + sum += abs(s0[i] - s1[i]); // sad <vector(8) short> + sum += n[i]; // normal <vector(4) int> + ... 
+ } + + Vectorization factor is essentially determined by operation whose input + vectype has the most lanes ("vector(16) char" in the example), while we + need to choose input vectype with the least lanes ("vector(4) int" in the + example) to determine effective number of vector reduction PHIs. */ + +bool +vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, + slp_tree slp_node, stmt_vector_for_cost *cost_vec) +{ + gimple *stmt = stmt_info->stmt; + + if (!lane_reducing_stmt_p (stmt)) + return false; + + tree type = TREE_TYPE (gimple_assign_lhs (stmt)); + + if (!INTEGRAL_TYPE_P (type)) + return false; + + /* Do not try to vectorize bit-precision reductions. */ + if (!type_has_mode_precision_p (type)) + return false; + + stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)); + + /* TODO: Support lane-reducing operation that does not directly participate + in loop reduction. */ + if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0) + return false; + + /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not + recoginized. 
*/ + gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def); + gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION); + + for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++) + { + stmt_vec_info def_stmt_info; + slp_tree slp_op; + tree op; + tree vectype; + enum vect_def_type dt; + + if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op, + &slp_op, &dt, &vectype, &def_stmt_info)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "use not simple.\n"); + return false; + } + + if (!vectype) + { + vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op), + slp_op); + if (!vectype) + return false; + } + + if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "incompatible vector types for invariants\n"); + return false; + } + + if (i == STMT_VINFO_REDUC_IDX (stmt_info)) + continue; + + /* There should be at most one cycle def in the stmt. */ + if (VECTORIZABLE_CYCLE_DEF (dt)) + return false; + } + + tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info); + + gcc_assert (vectype_in); + + /* Compute number of effective vector statements for costing. */ + unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node, + vectype_in); + gcc_assert (ncopies_for_cost >= 1); + + if (vect_is_emulated_mixed_dot_prod (stmt_info)) + { + /* We need extra two invariants: one that contains the minimum signed + value and one that contains half of its negative. */ + int prologue_stmts = 2; + unsigned cost = record_stmt_cost (cost_vec, prologue_stmts, + scalar_to_vec, stmt_info, 0, + vect_prologue); + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "vectorizable_lane_reducing: " + "extra prologue_cost = %d .\n", cost); + + /* Three dot-products and a subtraction. 
*/ + ncopies_for_cost *= 4; + } + + record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info, + 0, vect_body); + + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) + { + enum tree_code code = gimple_assign_rhs_code (stmt); + vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info, + slp_node, code, type, + vectype_in); + } + + /* Transform via vect_transform_reduction. */ + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; + return true; +} + /* Function vectorizable_reduction. Check if STMT_INFO performs a reduction operation that can be vectorized. @@ -7721,7 +7851,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, } /* For lane-reducing operation vectorizable analysis needs the - reduction PHI information */ + reduction PHI information. */ STMT_VINFO_REDUC_DEF (def) = phi_info; /* Each lane-reducing operation has its own input vectype, while @@ -7815,18 +7945,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, if (!type_has_mode_precision_p (op.type)) return false; - /* For lane-reducing ops we're reducing the number of reduction PHIs - which means the only use of that may be in the lane-reducing operation. */ - if (lane_reducing - && reduc_chain_length != 1 - && !only_slp_reduc_chain) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "lane-reducing reduction with extra stmts.\n"); - return false; - } - /* Lane-reducing ops also never can be used in a SLP reduction group since we'll mix lanes belonging to different reductions. But it's OK to use them in a reduction chain or when the reduction group @@ -8366,14 +8484,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo, && loop_vinfo->suggested_unroll_factor == 1) single_defuse_cycle = true; - if (single_defuse_cycle || lane_reducing) + if (single_defuse_cycle && !lane_reducing) { gcc_assert (op.code != COND_EXPR); - /* 4. Supportable by target? */ - bool ok = true; - - /* 4.1. check support for the operation in the loop + /* 4. 
check support for the operation in the loop This isn't necessary for the lane reduction codes, since they can only be produced by pattern matching, and it's up to the @@ -8382,14 +8497,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo, mixed-sign dot-products can be implemented using signed dot-products. */ machine_mode vec_mode = TYPE_MODE (vectype_in); - if (!lane_reducing - && !directly_supported_p (op.code, vectype_in, optab_vector)) + if (!directly_supported_p (op.code, vectype_in, optab_vector)) { if (dump_enabled_p ()) dump_printf (MSG_NOTE, "op not supported by target.\n"); if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) || !vect_can_vectorize_without_simd_p (op.code)) - ok = false; + single_defuse_cycle = false; else if (dump_enabled_p ()) dump_printf (MSG_NOTE, "proceeding using word mode.\n"); @@ -8402,16 +8516,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, dump_printf (MSG_NOTE, "using word mode not possible.\n"); return false; } - - /* lane-reducing operations have to go through vect_transform_reduction. - For the other cases try without the single cycle optimization. */ - if (!ok) - { - if (lane_reducing) - return false; - else - single_defuse_cycle = false; - } } if (dump_enabled_p () && single_defuse_cycle) dump_printf_loc (MSG_NOTE, vect_location, @@ -8419,22 +8523,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo, "multiple vectors to one in the loop body\n"); STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; - /* If the reduction stmt is one of the patterns that have lane - reduction embedded we cannot handle the case of ! single_defuse_cycle. */ - if ((ncopies > 1 && ! 
single_defuse_cycle) - && lane_reducing) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "multi def-use cycle not possible for lane-reducing " - "reduction operation\n"); - return false; - } + /* For lane-reducing operation, the below processing related to single + defuse-cycle will be done in its own vectorizable function. One more + thing to note is that the operation must not be involved in fold-left + reduction. */ + single_defuse_cycle &= !lane_reducing; if (slp_node - && !(!single_defuse_cycle - && !lane_reducing - && reduction_type != FOLD_LEFT_REDUCTION)) + && (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)) for (i = 0; i < (int) op.num_ops; i++) if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i])) { @@ -8447,28 +8543,20 @@ vectorizable_reduction (loop_vec_info loop_vinfo, vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn, reduction_type, ncopies, cost_vec); /* Cost the reduction op inside the loop if transformed via - vect_transform_reduction. Otherwise this is costed by the - separate vectorizable_* routines. */ - if (single_defuse_cycle || lane_reducing) - { - int factor = 1; - if (vect_is_emulated_mixed_dot_prod (stmt_info)) - /* Three dot-products and a subtraction. */ - factor = 4; - record_stmt_cost (cost_vec, ncopies * factor, vector_stmt, - stmt_info, 0, vect_body); - } + vect_transform_reduction for non-lane-reducing operation. Otherwise + this is costed by the separate vectorizable_* routines. */ + if (single_defuse_cycle) + record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body); if (dump_enabled_p () && reduction_type == FOLD_LEFT_REDUCTION) dump_printf_loc (MSG_NOTE, vect_location, "using an in-order (fold-left) reduction.\n"); STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; - /* All but single defuse-cycle optimized, lane-reducing and fold-left - reductions go through their own vectorizable_* routines. 
*/ - if (!single_defuse_cycle - && !lane_reducing - && reduction_type != FOLD_LEFT_REDUCTION) + + /* All but single defuse-cycle optimized and fold-left reductions go + through their own vectorizable_* routines. */ + if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION) { stmt_vec_info tem = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); @@ -8746,13 +8834,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo, And vector reduction PHIs are always generated to the full extent, no matter lane-reducing op exists or not. If some copies or PHIs are actually superfluous, they would be cleaned up by passes after - vectorization. An example for single-lane slp is given as below. + vectorization. An example for single-lane slp, lane-reducing ops + with mixed input vectypes in a reduction chain, is given as below. Similarly, this handling is applicable for multiple-lane slp as well. int sum = 1; for (i) { sum += d0[i] * d1[i]; // dot-prod <vector(16) char> + sum += w[i]; // widen-sum <vector(16) char> + sum += abs(s0[i] - s1[i]); // sad <vector(8) short> } The vector size is 128-bit,vectorization factor is 16. Reduction @@ -8769,9 +8860,19 @@ vect_transform_reduction (loop_vec_info loop_vinfo, sum_v1 = sum_v1; // copy sum_v2 = sum_v2; // copy sum_v3 = sum_v3; // copy + + sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0); + sum_v1 = sum_v1; // copy + sum_v2 = sum_v2; // copy + sum_v3 = sum_v3; // copy + + sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0); + sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1); + sum_v2 = sum_v2; // copy + sum_v3 = sum_v3; // copy } - sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1 */ unsigned effec_ncopies = vec_oprnds[0].length (); unsigned total_ncopies = vec_oprnds[reduc_index].length (); |