diff options
author | Dorit Nuzman <dorit@il.ibm.com> | 2006-01-19 10:24:00 +0000 |
---|---|---|
committer | Dorit Nuzman <dorit@gcc.gnu.org> | 2006-01-19 10:24:00 +0000 |
commit | 20f0622174914622858a91a712e57ace4a44d793 (patch) | |
tree | ea695128e2e8754b43a2c74530a77e2dab617567 /gcc/tree-vect-transform.c | |
parent | 681f47f25ded25effb19bc87d3f1f659f0354746 (diff) | |
download | gcc-20f0622174914622858a91a712e57ace4a44d793.zip gcc-20f0622174914622858a91a712e57ace4a44d793.tar.gz gcc-20f0622174914622858a91a712e57ace4a44d793.tar.bz2 |
Makefile.in (tree-vect-patterns.o): Add rule for new file.
* Makefile.in (tree-vect-patterns.o): Add rule for new file.
* tree-vect-analyze.c (vect_determine_vectorization_factor): Use
existing STMT_VINFO_VECTYPE if available.
(vect_mark_relevant): Add special handling for stmts that are
marked as STMT_VINFO_IN_PATTERN_P.
(vect_analyze_loop): Call vect_pattern_recog.
* tree-vectorizer.c (new_stmt_vec_info): Initialize new fields.
* tree-vectorizer.h (in_pattern_p, related_stmt): New fields in
stmt_info.
(STMT_VINFO_IN_PATTERN_P, STMT_VINFO_RELATED_STMT): New macros.
(vect_recog_func_ptr): New function-pointer type.
* tree-vect-patterns.c: New file.
(vect_recog_widen_sum_pattern, vect_recog_widen_mult_pattern):
(vect_recog_dot_prod_pattern, vect_pattern_recog):
(vect_pattern_recog_1): New functions.
(vect_pattern_recog_funcs): New array of function pointers.
* tree-vectorizer.h (ternary_op): New enum value.
* tree-vect-transform.c (vect_create_epilog_for_reduction): Added
declaration. Revised documentation. Removed redundant dump prints.
Removed redundant argument. Added support for reduction patterns.
(vectorizable_reduction): Added support for reduction patterns.
(vect_transform_stmt): Added support for patterns.
* expr.c (expand_expr_real_1): Added case for DOT_PROD_EXPR.
* genopinit.c (udot_prod_optab, sdot_prod_optab): Initialize.
* optabs.c (optab_for_tree_code): Added case for DOT_PROD_EXPR.
(expand_widen_pattern_expr): New function.
(init_optabs): Initialize new optabs udot_prod_optab,
sdot_prod_optab.
* optabs.h (OTI_sdot_prod, OTI_udot_prod): New.
(sdot_prod_optab, udot_prod_optab): Define new optabs.
(expand_widen_pattern_expr): New function declaration.
* tree.def (DOT_PROD_EXPR, WIDEN_SUM_EXPR, WIDEN_MULT_EXPR): New
tree-codes.
* tree-inline.c (estimate_num_insns_1): Added cases for new
tree-codes DOT_PROD_EXPR, WIDEN_SUM_EXPR, WIDEN_MULT_EXPR.
* tree-pretty-print.c (dump_generic_node): Likewise.
(op_prio): Likewise.
(op_symbol): Added cases for WIDEN_SUM_EXPR, WIDEN_MULT_EXPR.
* tree-ssa-operands.c (get_expr_operands): Added case for
DOT_PROD_EXPR.
* tree-vect-patterns.c (widened_name_p): New function.
(vect_recog_dot_prod_pattern): Added function implementation.
* tree-vect-transform.c (get_initial_def_for_reduction): Added
cases for DOT_PROD_EXPR, WIDEN_SUM_EXPR.
* config/rs6000/altivec.md (udot_prod<mode>, sdot_prodv8hi): New.
* config/i386/sse.md (sdot_prodv8hi, udot_prodv4si): New.
* expr.c (expand_expr_real_1): Added case for WIDEN_SUM_EXPR.
* genopinit.c (widen_ssum_optab, widen_usum_optab): Initialize.
* optabs.c (optab_for_tree_code): Added case for WIDEN_SUM_EXPR.
(init_optabs): Initialize new optabs widen_ssum_optab,
widen_usum_optab.
* optabs.h (OTI_widen_ssum, OTI_widen_usum): New.
(widen_ssum_optab, widen_usum_optab): Define new optabs.
* tree-vect-generic.c: (expand_vector_operations_1): Check type of
use instead of type of def.
* tree-vect-patterns.c (vect_recog_widen_sum_pattern): Added
function implementation.
* config/rs6000/altivec.md (widen_usum<mode>, widen_ssumv16qi,
widen_ssumv8hi): New.
* doc/tm.texi (ssum_widen, usum_widen, sdot_prod, udot_prod): New
patterns.
From-SVN: r109954
Diffstat (limited to 'gcc/tree-vect-transform.c')
-rw-r--r-- | gcc/tree-vect-transform.c | 422 |
1 files changed, 294 insertions, 128 deletions
diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c index 42090f7..db0573c 100644 --- a/gcc/tree-vect-transform.c +++ b/gcc/tree-vect-transform.c @@ -1,5 +1,5 @@ /* Transformation Utilities for Loop Vectorization. - Copyright (C) 2003,2004,2005 Free Software Foundation, Inc. + Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc. Contributed by Dorit Naishlos <dorit@il.ibm.com> This file is part of GCC. @@ -59,6 +59,7 @@ static void vect_finish_stmt_generation (tree stmt, tree vec_stmt, block_stmt_iterator *bsi); static bool vect_is_simple_cond (tree, loop_vec_info); static void update_vuses_to_preheader (tree, struct loop*); +static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree); static tree get_initial_def_for_reduction (tree, tree, tree *); /* Utility function dealing with loop peeling (not peeling itself). */ @@ -656,6 +657,8 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def) switch (code) { + case WIDEN_SUM_EXPR: + case DOT_PROD_EXPR: case PLUS_EXPR: if (INTEGRAL_TYPE_P (type)) def = build_int_cst (type, 0); @@ -711,66 +714,66 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def) } -/* Function vect_create_epilog_for_reduction: +/* Function vect_create_epilog_for_reduction Create code at the loop-epilog to finalize the result of a reduction - computation. + computation. - LOOP_EXIT_VECT_DEF is a vector of partial results. We need to "reduce" it - into a single result, by applying the operation REDUC_CODE on the - partial-results-vector. For this, we need to create a new phi node at the - loop exit to preserve loop-closed form, as illustrated below. - - STMT is the original scalar reduction stmt that is being vectorized. - REDUCTION_OP is the scalar reduction-variable. + VECT_DEF is a vector of partial results. + REDUC_CODE is the tree-code for the epilog reduction. + STMT is the scalar reduction stmt that is being vectorized. REDUCTION_PHI is the phi-node that carries the reduction computation. - This function also sets the arguments for the REDUCTION_PHI: - The loop-entry argument is the (vectorized) initial-value of REDUCTION_OP. - The loop-latch argument is VECT_DEF - the vector of partial sums. - This function transforms this: + This function: + 1. Creates the reduction def-use cycle: sets the the arguments for + REDUCTION_PHI: + The loop-entry argument is the vectorized initial-value of the reduction. + The loop-latch argument is VECT_DEF - the vector of partial sums. + 2. "Reduces" the vector of partial results VECT_DEF into a single result, + by applying the operation specified by REDUC_CODE if available, or by + other means (whole-vector shifts or a scalar loop). + The function also creates a new phi node at the loop exit to preserve + loop-closed form, as illustrated below. + + The flow at the entry to this function: loop: - vec_def = phi <null, null> # REDUCTION_PHI - .... - VECT_DEF = ... - + vec_def = phi <null, null> # REDUCTION_PHI + VECT_DEF = vector_stmt # vectorized form of STMT + s_loop = scalar_stmt # (scalar) STMT loop_exit: - s_out0 = phi <s_loop> # EXIT_PHI - + s_out0 = phi <s_loop> # (scalar) EXIT_PHI use <s_out0> use <s_out0> - Into: + The above is transformed by this function into: loop: - vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI - .... - VECT_DEF = ... - + vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI + VECT_DEF = vector_stmt # vectorized form of STMT + s_loop = scalar_stmt # (scalar) STMT loop_exit: - s_out0 = phi <s_loop> # EXIT_PHI - v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI - - v_out2 = reduc_expr <v_out1> + s_out0 = phi <s_loop> # (scalar) EXIT_PHI + v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI + v_out2 = reduce <v_out1> s_out3 = extract_field <v_out2, 0> - - use <s_out3> - use <s_out3> + s_out4 = adjust_result <s_out3> + use <s_out4> + use <s_out4> */ static void -vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, +vect_create_epilog_for_reduction (tree vect_def, tree stmt, enum tree_code reduc_code, tree reduction_phi) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - enum machine_mode mode = TYPE_MODE (vectype); + tree vectype; + enum machine_mode mode; loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block exit_bb; - tree scalar_dest = TREE_OPERAND (stmt, 0); - tree scalar_type = TREE_TYPE (scalar_dest); + tree scalar_dest; + tree scalar_type; tree new_phi; block_stmt_iterator exit_bsi; tree vec_dest; @@ -786,7 +789,16 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, imm_use_iterator imm_iter; use_operand_p use_p; bool extract_scalar_result; + tree reduction_op; + tree orig_stmt; + tree operation = TREE_OPERAND (stmt, 1); + int op_type; + op_type = TREE_CODE_LENGTH (TREE_CODE (operation)); + reduction_op = TREE_OPERAND (operation, op_type-1); + vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); + mode = TYPE_MODE (vectype); + /*** 1. Create the reduction def-use cycle ***/ /* 1.1 set the loop-entry arg of the reduction-phi: */ @@ -797,7 +809,6 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, &scalar_initial_def); add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop)); - /* 1.2 set the loop-latch arg for the reduction-phi: */ add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop)); @@ -810,7 +821,32 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, } - /*** 2. Create epilog code ***/ + /*** 2. Create epilog code + The reduction epilog code operates across the elements of the vector + of partial results computed by the vectorized loop. + The reduction epilog code consists of: + step 1: compute the scalar result in a vector (v_out2) + step 2: extract the scalar result (s_out3) from the vector (v_out2) + step 3: adjust the scalar result (s_out3) if needed. + + Step 1 can be accomplished using one the following three schemes: + (scheme 1) using reduc_code, if available. + (scheme 2) using whole-vector shifts, if available. + (scheme 3) using a scalar loop. In this case steps 1+2 above are + combined. + + The overall epilog code looks like this: + + s_out0 = phi <s_loop> # original EXIT_PHI + v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI + v_out2 = reduce <v_out1> # step 1 + s_out3 = extract_field <v_out2, 0> # step 2 + s_out4 = adjust_result <s_out3> # step 3 + + (step 3 is optional, and step2 1 and 2 may be combined). + Lastly, the uses of s_out0 are replaced by s_out4. + + ***/ /* 2.1 Create new loop-exit-phi to preserve loop-closed form: v_out1 = phi <v_loop> */ @@ -818,15 +854,39 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, exit_bb = loop->single_exit->dest; new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb); SET_PHI_ARG_DEF (new_phi, loop->single_exit->dest_idx, vect_def); - exit_bsi = bsi_start (exit_bb); - + /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 + (i.e. when reduc_code is not available) and in the final adjusment code + (if needed). Also get the original scalar reduction variable as + defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it + represents a reduction pattern), the tree-code and scalar-def are + taken from the original stmt that the pattern-stmt (STMT) replaces. + Otherwise (it is a regular reduction) - the tree-code and scalar-def + are taken from STMT. */ + + orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); + if (!orig_stmt) + { + /* Regular reduction */ + orig_stmt = stmt; + } + else + { + /* Reduction pattern */ + stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt); + gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)); + gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); + } + code = TREE_CODE (TREE_OPERAND (orig_stmt, 1)); + scalar_dest = TREE_OPERAND (orig_stmt, 0); + scalar_type = TREE_TYPE (scalar_dest); new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); bitsize = TYPE_SIZE (scalar_type); bytesize = TYPE_SIZE_UNIT (scalar_type); - /* 2.2 Create the reduction code. */ + /* 2.3 Create the reduction code, using one of the three schemes described + above. */ if (reduc_code < NUM_TREE_CODES) { @@ -849,16 +909,11 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, { enum tree_code shift_code = 0; bool have_whole_vector_shift = true; - enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1)); /* CHECKME */ int bit_offset; int element_bitsize = tree_low_cst (bitsize, 1); int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); tree vec_temp; - /* The result of the reduction is expected to be at the least - significant bits of the vector. This is merely convention, - as it's the extraction later that really matters, and that - is also under our control. */ if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing) shift_code = VEC_RSHIFT_EXPR; else @@ -881,7 +936,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, if (have_whole_vector_shift) { - /*** Case 2: + /*** Case 2: Create: for (offset = VS/2; offset >= element_size; offset/=2) { Create: va' = vec_shift <va, offset> @@ -905,17 +960,12 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, new_name = make_ssa_name (vec_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_name; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); - epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, build2 (code, vectype, new_name, new_temp)); new_temp = make_ssa_name (vec_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); } extract_scalar_result = true; @@ -924,10 +974,11 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, { tree rhs; - /*** Case 3: - Create: + /*** Case 3: Create: s = extract_field <v_out2, 0> - for (offset=element_size; offset<vector_size; offset+=element_size;) + for (offset = element_size; + offset < vector_size; + offset += element_size;) { Create: s' = extract_field <v_out2, offset> Create: s = op <s, s'> @@ -938,18 +989,13 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, vec_temp = PHI_RESULT (new_phi); vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); - rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, bitsize_zero_node); - BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type); - epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, - rhs); + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, rhs); new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); for (bit_offset = element_bitsize; bit_offset < vec_size_in_bits; @@ -965,25 +1011,19 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, new_name = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_name; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); - epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, build2 (code, scalar_type, new_name, new_temp)); new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); } extract_scalar_result = false; } } - - /* 2.3 Extract the final scalar result. Create: + /* 2.4 Extract the final scalar result. Create: s_out3 = extract_field <v_out2, bitpos> */ if (extract_scalar_result) @@ -993,7 +1033,6 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "extract scalar result"); - /* The result is in the low order bits. */ if (BYTES_BIG_ENDIAN) bitpos = size_binop (MULT_EXPR, bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), @@ -1007,17 +1046,14 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); } - /* 2.4 Adjust the final result by the initial value of the reduction - variable. (when such adjustment is not needed, then + variable. (When such adjustment is not needed, then 'scalar_initial_def' is zero). Create: - s_out = scalar_expr <s_out, scalar_initial_def> */ + s_out4 = scalar_expr <s_out3, scalar_initial_def> */ if (scalar_initial_def) { @@ -1026,18 +1062,13 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); } + /* 2.6 Replace uses of s_out0 with uses of s_out3 */ - /* 2.5 Replace uses of s_out0 with uses of s_out3 */ - - /* Find the loop-closed-use at the loop exit of the original - scalar result. (The reduction result is expected to have - two immediate uses - one at the latch block, and one at the - loop exit). */ + /* Find the loop-closed-use at the loop exit of the original scalar result. + (The reduction result is expected to have two immediate uses - one at the + latch block, and one at the loop exit). */ exit_phi = NULL; FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) { @@ -1047,9 +1078,10 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, break; } } - + /* We expect to have found an exit_phi because of loop-closed-ssa form. */ + gcc_assert (exit_phi); + /* Replace the uses: */ orig_name = PHI_RESULT (exit_phi); - FOR_EACH_IMM_USE_SAFE (use_p, imm_iter, orig_name) SET_USE (use_p, new_temp); } @@ -1060,33 +1092,69 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, Check if STMT performs a reduction operation that can be vectorized. If VEC_STMT is also passed, vectorize the STMT: create a vectorized stmt to replace it, put it in VEC_STMT, and insert it at BSI. - Return FALSE if not a vectorizable STMT, TRUE otherwise. */ + Return FALSE if not a vectorizable STMT, TRUE otherwise. + + This function also handles reduction idioms (patterns) that have been + recognized in advance during vect_pattern_recog. In this case, STMT may be + of this form: + X = pattern_expr (arg0, arg1, ..., X) + and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original + sequence that had been detected and replaced by the pattern-stmt (STMT). + + In some cases of reduction patterns, the type of the reduction variable X is + different than the type of the other arguments of STMT. + In such cases, the vectype that is used when transforming STMT into a vector + stmt is different than the vectype that is used to determine the + vectorization factor, because it consists of a different number of elements + than the actual number of elements that are being operated upon in parallel. + + For example, consider an accumulation of shorts into an int accumulator. + On some targets it's possible to vectorize this pattern operating on 8 + shorts at a time (hence, the vectype for purposes of determining the + vectorization factor should be V8HI); on the other hand, the vectype that + is used to create the vector form is actually V4SI (the type of the result). + + Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that + indicates what is the actual level of parallelism (V8HI in the example), so + that the right vectorization factor would be derived. This vectype + corresponds to the type of arguments to the reduction stmt, and should *NOT* + be used to create the vectorized stmt. The right vectype for the vectorized + stmt is obtained from the type of the result X: + get_vectype_for_scalar_type (TREE_TYPE (X)) + + This means that, contrary to "regular" reductions (or "regular" stmts in + general), the following equation: + STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) + does *NOT* necessarily hold for reduction patterns. */ bool vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) { tree vec_dest; tree scalar_dest; - tree op0, op1; - tree loop_vec_def; + tree op; + tree loop_vec_def0, loop_vec_def1; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree operation; - enum tree_code code, reduc_code = 0; + enum tree_code code, orig_code, epilog_reduc_code = 0; enum machine_mode vec_mode; int op_type; optab optab, reduc_optab; tree new_temp; - tree def0, def1, def_stmt0, def_stmt1; - enum vect_def_type dt0, dt1; + tree def, def_stmt; + enum vect_def_type dt; tree new_phi; tree scalar_type; - bool is_simple_use0; - bool is_simple_use1; + bool is_simple_use; + tree orig_stmt; + stmt_vec_info orig_stmt_info; + tree expr = NULL_TREE; + int i; - /* Is vectorizable reduction? */ + /* 1. Is vectorizable reduction? */ /* Not supportable if the reduction variable is used in the loop. */ if (STMT_VINFO_RELEVANT_P (stmt_info)) @@ -1095,43 +1163,68 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (!STMT_VINFO_LIVE_P (stmt_info)) return false; - /* Make sure it was already recognized as a reduction pattern. */ + /* Make sure it was already recognized as a reduction computation. */ if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def) return false; + /* 2. Has this been recognized as a reduction pattern? + + Check if STMT represents a pattern that has been recognized + in earlier analysis stages. For stmts that represent a pattern, + the STMT_VINFO_RELATED_STMT field records the last stmt in + the original sequence that constitutes the pattern. */ + + orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); + if (orig_stmt) + { + orig_stmt_info = vinfo_for_stmt (orig_stmt); + gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt); + gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); + gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); + } + + /* 3. Check the operands of the operation. The first operands are defined + inside the loop body. The last operand is the reduction variable, + which is defined by the loop-header-phi. */ + gcc_assert (TREE_CODE (stmt) == MODIFY_EXPR); operation = TREE_OPERAND (stmt, 1); code = TREE_CODE (operation); op_type = TREE_CODE_LENGTH (code); - if (op_type != binary_op) + if (op_type != binary_op && op_type != ternary_op) return false; - - op0 = TREE_OPERAND (operation, 0); - op1 = TREE_OPERAND (operation, 1); scalar_dest = TREE_OPERAND (stmt, 0); scalar_type = TREE_TYPE (scalar_dest); - /* Check the first operand. It is expected to be defined inside the loop. */ - is_simple_use0 = - vect_is_simple_use (op0, loop_vinfo, &def_stmt0, &def0, &dt0); - is_simple_use1 = - vect_is_simple_use (op1, loop_vinfo, &def_stmt1, &def1, &dt1); - - gcc_assert (is_simple_use0); - gcc_assert (is_simple_use1); - gcc_assert (dt0 == vect_loop_def); - gcc_assert (dt1 == vect_reduction_def); - gcc_assert (TREE_CODE (def_stmt1) == PHI_NODE); - gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt1)); + /* All uses but the last are expected to be defined in the loop. + The last use is the reduction variable. */ + for (i = 0; i < op_type-1; i++) + { + op = TREE_OPERAND (operation, i); + is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); + gcc_assert (is_simple_use); + gcc_assert (dt == vect_loop_def || dt == vect_invariant_def || + dt == vect_constant_def); + } - if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt1))) - return false; + op = TREE_OPERAND (operation, i); + is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); + gcc_assert (is_simple_use); + gcc_assert (dt == vect_reduction_def); + gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); + if (orig_stmt) + gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt)); + else + gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt)); + + if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt))) + return false; - /* Supportable by target? */ + /* 4. Supportable by target? */ - /* check support for the operation in the loop */ + /* 4.1. check support for the operation in the loop */ optab = optab_for_tree_code (code, vectype); if (!optab) { @@ -1162,21 +1255,69 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) return false; } - /* check support for the epilog operation */ - if (!reduction_code_for_scalar_code (code, &reduc_code)) + /* 4.2. Check support for the epilog operation. + + If STMT represents a reduction pattern, then the type of the + reduction variable may be different than the type of the rest + of the arguments. For example, consider the case of accumulation + of shorts into an int accumulator; The original code: + S1: int_a = (int) short_a; + orig_stmt-> S2: int_acc = plus <int_a ,int_acc>; + + was replaced with: + STMT: int_acc = widen_sum <short_a, int_acc> + + This means that: + 1. The tree-code that is used to create the vector operation in the + epilog code (that reduces the partial results) is not the + tree-code of STMT, but is rather the tree-code of the original + stmt from the pattern that STMT is replacing. I.e, in the example + above we want to use 'widen_sum' in the loop, but 'plus' in the + epilog. + 2. The type (mode) we use to check available target support + for the vector operation to be created in the *epilog*, is + determined by the type of the reduction variable (in the example + above we'd check this: plus_optab[vect_int_mode]). + However the type (mode) we use to check available target support + for the vector operation to be created *inside the loop*, is + determined by the type of the other arguments to STMT (in the + example we'd check this: widen_sum_optab[vect_short_mode]). + + This is contrary to "regular" reductions, in which the types of all + the arguments are the same as the type of the reduction variable. + For "regular" reductions we can therefore use the same vector type + (and also the same tree-code) when generating the epilog code and + when generating the code inside the loop. */ + + if (orig_stmt) + { + /* This is a reduction pattern: get the vectype from the type of the + reduction variable, and get the tree-code from orig_stmt. */ + orig_code = TREE_CODE (TREE_OPERAND (orig_stmt, 1)); + vectype = get_vectype_for_scalar_type (TREE_TYPE (def)); + vec_mode = TYPE_MODE (vectype); + } + else + { + /* Regular reduction: use the same vectype and tree-code as used for + the vector code inside the loop can be used for the epilog code. */ + orig_code = code; + } + + if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code)) return false; - reduc_optab = optab_for_tree_code (reduc_code, vectype); + reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype); if (!reduc_optab) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "no optab for reduction."); - reduc_code = NUM_TREE_CODES; + epilog_reduc_code = NUM_TREE_CODES; } if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "reduc op not supported by target."); - reduc_code = NUM_TREE_CODES; + epilog_reduc_code = NUM_TREE_CODES; } if (!vec_stmt) /* transformation not required. */ @@ -1193,25 +1334,31 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* Create the destination vector */ vec_dest = vect_create_destination_var (scalar_dest, vectype); - /* Create the reduction-phi that defines the reduction-operand. */ new_phi = create_phi_node (vec_dest, loop->header); - /* Prepare the operand that is defined inside the loop body */ - loop_vec_def = vect_get_vec_def_for_operand (op0, stmt, NULL); + op = TREE_OPERAND (operation, 0); + loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL); + if (op_type == binary_op) + expr = build2 (code, vectype, loop_vec_def0, PHI_RESULT (new_phi)); + else if (op_type == ternary_op) + { + op = TREE_OPERAND (operation, 1); + loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL); + expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1, + PHI_RESULT (new_phi)); + } /* Create the vectorized operation that computes the partial results */ - *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, - build2 (code, vectype, loop_vec_def, PHI_RESULT (new_phi))); + *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, expr); new_temp = make_ssa_name (vec_dest, *vec_stmt); TREE_OPERAND (*vec_stmt, 0) = new_temp; vect_finish_stmt_generation (stmt, *vec_stmt, bsi); - /* Finalize the reduction-phi (set it's arguments) and create the epilog reduction code. */ - vect_create_epilog_for_reduction (new_temp, stmt, op1, reduc_code, new_phi); + vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi); return true; } @@ -2040,6 +2187,7 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi) bool is_store = false; tree vec_stmt = NULL_TREE; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + tree orig_stmt_in_pattern; bool done; if (STMT_VINFO_RELEVANT_P (stmt_info)) @@ -2078,7 +2226,25 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi) gcc_unreachable (); } + gcc_assert (vec_stmt); STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt; + orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info); + if (orig_stmt_in_pattern) + { + stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern); + if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)) + { + gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); + + /* STMT was inserted by the vectorizer to replace a computation + idiom. ORIG_STMT_IN_PATTERN is a stmt in the original + sequence that computed this idiom. We need to record a pointer + to VEC_STMT in the stmt_info of ORIG_STMT_IN_PATTERN. See more + detail in the documentation of vect_pattern_recog. */ + + STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt; + } + } } if (STMT_VINFO_LIVE_P (stmt_info)) |