author     Richard Sandiford <richard.sandiford@linaro.org>  2018-01-13 17:59:00 +0000
committer  Richard Sandiford <rsandifo@gcc.gnu.org>  2018-01-13 17:59:00 +0000
commit     0972596e6d2573a2c7e922c66b017974ed03ad89 (patch)
tree       6480f2b68dc621ec33ca6970ec346cd38bc3a6a6 /gcc/tree-vect-loop.c
parent     7cfb4d93595da03abb4e6414758dc98eb7532b34 (diff)
Add support for reductions in fully-masked loops
This patch removes the restriction that fully-masked loops cannot
have reductions.  The key requirement is that the reduction
accumulator must not pick up any values associated with inactive
lanes; the patch adds a set of conditional binary operations for
enforcing that.
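Conceptually, each conditional operation takes a governing mask and
applies the underlying operation in active lanes only, passing the
accumulator operand through unchanged in inactive lanes.  Below is a
minimal scalar model of that per-lane rule, assuming IFN_COND_ADD-style
semantics; the names cond_add_lanes and NLANES are illustrative and not
part of the patch.

/* Scalar model of a conditional vector add, as used to mask a
   reduction: res[i] = mask[i] ? a[i] + b[i] : a[i].  Inactive lanes
   keep the accumulator value in a[], so they add nothing.  */

#include <stdio.h>

#define NLANES 4	/* illustrative lane count */

static void
cond_add_lanes (const int *mask, const double *a, const double *b,
		double *res)
{
  for (int i = 0; i < NLANES; i++)
    res[i] = mask[i] ? a[i] + b[i] : a[i];
}

int
main (void)
{
  double accum[NLANES] = { 0.0, 0.0, 0.0, 0.0 };
  double vals[NLANES] = { 1.0, 2.0, 3.0, 4.0 };
  int mask[NLANES] = { 1, 1, 1, 0 };	/* lane 3 inactive */

  cond_add_lanes (mask, accum, vals, accum);

  double sum = 0.0;
  for (int i = 0; i < NLANES; i++)
    sum += accum[i];
  printf ("%g\n", sum);	/* prints 6, not 10 */
  return 0;
}

Because an inactive lane keeps its old accumulator value, the final
reduction over the lanes sees no contribution from it.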
2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* doc/md.texi (cond_add@var{mode}, cond_sub@var{mode})
	(cond_and@var{mode}, cond_ior@var{mode}, cond_xor@var{mode})
	(cond_smin@var{mode}, cond_smax@var{mode}, cond_umin@var{mode})
	(cond_umax@var{mode}): Document.
	* optabs.def (cond_add_optab, cond_sub_optab, cond_and_optab)
	(cond_ior_optab, cond_xor_optab, cond_smin_optab, cond_smax_optab)
	(cond_umin_optab, cond_umax_optab): New optabs.
	* internal-fn.def (COND_ADD, COND_SUB, COND_MIN, COND_MAX, COND_AND)
	(COND_IOR, COND_XOR): New internal functions.
	* internal-fn.h (get_conditional_internal_fn): Declare.
	* internal-fn.c (cond_binary_direct): New macro.
	(expand_cond_binary_optab_fn): Likewise.
	(direct_cond_binary_optab_supported_p): Likewise.
	(get_conditional_internal_fn): New function.
	* tree-vect-loop.c (vectorizable_reduction): Handle fully-masked
	loops.  Cope with reduction statements that are vectorized as
	calls rather than assignments.
	* config/aarch64/aarch64-sve.md (cond_<optab><mode>): New insns.
	* config/aarch64/iterators.md (UNSPEC_COND_ADD, UNSPEC_COND_SUB)
	(UNSPEC_COND_SMAX, UNSPEC_COND_UMAX, UNSPEC_COND_SMIN)
	(UNSPEC_COND_UMIN, UNSPEC_COND_AND, UNSPEC_COND_ORR)
	(UNSPEC_COND_EOR): New unspecs.
	(optab): Add mappings for them.
	(SVE_COND_INT_OP, SVE_COND_FP_OP): New int iterators.
	(sve_int_op, sve_fp_op): New int attributes.

gcc/testsuite/
	* gcc.dg/vect/pr60482.c: Remove XFAIL for variable-length vectors.
	* gcc.target/aarch64/sve/reduc_1.c: Expect the loop operations
	to be predicated.
	* gcc.target/aarch64/sve/slp_5.c: Check for a fully-masked loop.
	* gcc.target/aarch64/sve/slp_7.c: Likewise.
	* gcc.target/aarch64/sve/reduc_5.c: New test.
	* gcc.target/aarch64/sve/slp_13.c: Likewise.
	* gcc.target/aarch64/sve/slp_13_run.c: Likewise.
Co-Authored-By: Alan Hayward <alan.hayward@arm.com>
Co-Authored-By: David Sherwood <david.sherwood@arm.com>
From-SVN: r256626
Diffstat (limited to 'gcc/tree-vect-loop.c')
 gcc/tree-vect-loop.c | 88
 1 file changed, 65 insertions(+), 23 deletions(-)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 5deb780..15d36b2 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -6893,19 +6893,42 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
       return false;
     }
 
+  if (slp_node)
+    vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+  else
+    vec_num = 1;
+
+  internal_fn cond_fn = get_conditional_internal_fn (code);
+  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+
   if (!vec_stmt) /* transformation not required.  */
     {
-      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
-        {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "can't use a fully-masked loop due to "
-                             "reduction operation.\n");
-          LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
-        }
-
       if (first_p)
         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
+      if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+        {
+          if (cond_fn == IFN_LAST
+              || !direct_internal_fn_supported_p (cond_fn, vectype_in,
+                                                  OPTIMIZE_FOR_SPEED))
+            {
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                 "can't use a fully-masked loop because no"
+                                 " conditional operation is available.\n");
+              LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+            }
+          else if (reduc_index == -1)
+            {
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                 "can't use a fully-masked loop for chained"
+                                 " reductions.\n");
+              LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+            }
+          else
+            vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
+                                   vectype_in);
+        }
       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
       return true;
     }
@@ -6919,16 +6942,15 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   if (code == COND_EXPR)
     gcc_assert (ncopies == 1);
 
+  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+
   /* Create the destination vector  */
   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
   prev_stmt_info = NULL;
   prev_phi_info = NULL;
-  if (slp_node)
-    vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-  else
+  if (!slp_node)
     {
-      vec_num = 1;
       vec_oprnds0.create (1);
       vec_oprnds1.create (1);
       if (op_type == ternary_op)
@@ -7002,19 +7024,19 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
           gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
 
           if (single_defuse_cycle && reduc_index == 0)
-            vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
+            vec_oprnds0[0] = gimple_get_lhs (new_stmt);
           else
             vec_oprnds0[0]
               = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
           if (single_defuse_cycle && reduc_index == 1)
-            vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
+            vec_oprnds1[0] = gimple_get_lhs (new_stmt);
           else
             vec_oprnds1[0]
               = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
           if (op_type == ternary_op)
             {
               if (single_defuse_cycle && reduc_index == 2)
-                vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
+                vec_oprnds2[0] = gimple_get_lhs (new_stmt);
               else
                 vec_oprnds2[0]
                   = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
@@ -7025,13 +7047,33 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
         {
           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-          if (op_type == ternary_op)
-            vop[2] = vec_oprnds2[i];
+          if (masked_loop_p)
+            {
+              /* Make sure that the reduction accumulator is vop[0].  */
+              if (reduc_index == 1)
+                {
+                  gcc_assert (commutative_tree_code (code));
+                  std::swap (vop[0], vop[1]);
+                }
+              tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+                                              vectype_in, i * ncopies + j);
+              gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
+                                                        vop[0], vop[1]);
+              new_temp = make_ssa_name (vec_dest, call);
+              gimple_call_set_lhs (call, new_temp);
+              gimple_call_set_nothrow (call, true);
+              new_stmt = call;
+            }
+          else
+            {
+              if (op_type == ternary_op)
+                vop[2] = vec_oprnds2[i];
 
-          new_temp = make_ssa_name (vec_dest, new_stmt);
-          new_stmt = gimple_build_assign (new_temp, code,
-                                          vop[0], vop[1], vop[2]);
-          vect_finish_stmt_generation (stmt, new_stmt, gsi);
+              new_temp = make_ssa_name (vec_dest, new_stmt);
+              new_stmt = gimple_build_assign (new_temp, code,
+                                              vop[0], vop[1], vop[2]);
+            }
+          vect_finish_stmt_generation (stmt, new_stmt, gsi);
 
           if (slp_node)
             {
@@ -7056,7 +7098,7 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   /* Finalize the reduction-phi (set its arguments) and create the
      epilog reduction code.  */
   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
-    vect_defs[0] = gimple_assign_lhs (*vec_stmt);
+    vect_defs[0] = gimple_get_lhs (*vec_stmt);
 
   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
                                     epilog_copies, reduc_fn, phis,
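For intuition about the masked code path in the hunks above, the
function below is a hand-written C analogue of a fully-masked sum
reduction: each vector iteration derives a loop mask from the
remaining trip count (the role vect_get_loop_mask plays on targets
such as SVE) and updates the accumulator through the conditional-add
rule, so lanes past the end of the array never enter the sum.  This is
only a sketch; VF and masked_sum are assumed names, not code from the
patch.

#include <stddef.h>

#define VF 4	/* assumed vectorization factor */

/* Hand-written analogue of a fully-masked vector sum reduction.  */
static double
masked_sum (const double *x, size_t n)
{
  double accum[VF] = { 0.0 };

  for (size_t base = 0; base < n; base += VF)
    for (size_t lane = 0; lane < VF; lane++)
      {
	/* The loop mask: the lane is active while base + lane < n.  */
	int active = base + lane < n;

	/* IFN_COND_ADD-style update: inactive lanes keep accum.  */
	accum[lane] = active ? accum[lane] + x[base + lane] : accum[lane];
      }

  /* Final reduction of the vector accumulator, corresponding to the
     epilogue code built by vect_create_epilog_for_reduction.  */
  double sum = 0.0;
  for (size_t lane = 0; lane < VF; lane++)
    sum += accum[lane];
  return sum;
}

With n == 7 and VF == 4, the first iteration runs with all four lanes
active and the second with three, so no scalar tail loop is needed for
the leftover elements.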