From 20f0622174914622858a91a712e57ace4a44d793 Mon Sep 17 00:00:00 2001 From: Dorit Nuzman Date: Thu, 19 Jan 2006 10:24:00 +0000 Subject: Makefile.in (tree-vect-patterns.o): Add rule for new file. * Makefile.in (tree-vect-patterns.o): Add rule for new file. * tree-vect-analyze.c (vect_determine_vectorization_factor): Use existing STMT_VINFO_VECTYPE if available. (vect_mark_relevant): Add special handling for stmts that are marked as STMT_VINFO_IN_PATTERN_P. (vect_analyze_loop): Call vect_pattern_recog. * tree-vectorizer.c (new_stmt_vec_info): Initialize new fields. * tree-vectorizer.h (in_pattern_p, related_stmt): New fields in stmt_info. (STMT_VINFO_IN_PATTERN_P, STMT_VINFO_RELATED_STMT): New macros. (vect_recog_func_ptr): New function-pointer type. * tree-vect-patterns.c: New file. (vect_recog_widen_sum_pattern, vect_recog_widen_mult_pattern): (vect_recog_dot_prod_pattern, vect_pattern_recog): (vect_pattern_recog_1): New functions. (vect_pattern_recog_funcs): New array of function pointers. * tree-vectorizer.h (ternary_op): New enum value. * tree-vect-transform.c (vect_create_epilog_for_reduction): Added declaration. Revised documentation. Removed redundant dump prints. Removed redundant argument. Added support for reduction patterns. (vectorizable_reduction): Added support for reduction patterns. (vect_transform_stmt): Added support for patterns. * expr.c (expand_expr_real_1): Added case for DOT_PROD_EXPR. * genopinit.c (udot_prod_optab, sdot_prod_optab): Initialize. * optabs.c (optab_for_tree_code): Added case for DOT_PROD_EXPR. (expand_widen_pattern_expr): New function. (init_optabs): Initialize new optabs udot_prod_optab, sdot_prod_optab. * optabs.h (OTI_sdot_prod, OTI_udot_prod): New. (sdot_prod_optab, udot_prod_optab): Define new optabs. (expand_widen_pattern_expr): New function declaration. * tree.def (DOT_PROD_EXPR, WIDEN_SUM_EXPR, WIDEN_MULT_EXPR): New tree-codes. 
* tree-inline.c (estimate_num_insns_1): Added cases for new tree-codes DOT_PROD_EXPR, WIDEN_SUM_EXPR, WIDEN_MULT_EXPR. * tree-pretty-print.c (dump_generic_node): Likewise. (op_prio): Likewise. (op_symbol): Added cases for WIDEN_SUM_EXPR, WIDEN_MULT_EXPR. * tree-ssa-operands.c (get_expr_operands): Added case for DOT_PROD_EXPR. * tree-vect-patterns.c (widened_name_p): New function. (vect_recog_dot_prod_pattern): Added function implementation. * tree-vect-transform.c (get_initial_def_for_reduction): Added cases for DOT_PROD_EXPR, WIDEN_SUM_EXPR. * config/rs6000/altivec.md (udot_prod, sdot_prodv8hi): New. * config/i386/sse.md (sdot_prodv8hi, udot_prodv4si): New. * expr.c (expand_expr_real_1): Added case for WIDEN_SUM_EXPR. * genopinit.c (widen_ssum_optab, widen_usum_optab): Initialize. * optabs.c (optab_for_tree_code): Added case for WIDEN_SUM_EXPR. (init_optabs): Initialize new optabs widen_ssum_optab, widen_usum_optab. * optabs.h (OTI_widen_ssum, OTI_widen_usum): New. (widen_ssum_optab, widen_usum_optab): Define new optabs. * tree-vect-generic.c: (expand_vector_operations_1): Check type of use instead of type of def. * tree-vect-patterns.c (vect_recog_widen_sum_pattern): Added function implementation. * config/rs6000/altivec.md (widen_usum, widen_ssumv16qi, widen_ssumv8hi): New. * doc/tm.texi (ssum_widen, usum_widen, sdot_prod, udot_prod): New patterns. 
From-SVN: r109954 --- gcc/ChangeLog | 68 +++ gcc/Makefile.in | 5 + gcc/config/i386/sse.md | 44 +- gcc/config/rs6000/altivec.md | 73 ++- gcc/doc/md.texi | 21 +- gcc/expr.c | 25 + gcc/genopinit.c | 6 +- gcc/optabs.c | 159 +++++ gcc/optabs.h | 19 +- gcc/testsuite/ChangeLog | 19 + gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c | 70 +++ gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8.c | 111 ++++ gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16.c | 77 +++ gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8.c | 101 ++++ gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1.c | 60 ++ gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c | 67 +++ gcc/testsuite/gcc.dg/vect/vect.exp | 10 +- .../gcc.dg/vect/wrapv-vect-reduc-dot-s8.c | 108 ++++ .../gcc.dg/vect/wrapv-vect-reduc-pattern-2.c | 59 ++ gcc/testsuite/lib/target-supports.exp | 106 ++++ gcc/tree-inline.c | 6 +- gcc/tree-pretty-print.c | 24 +- gcc/tree-ssa-operands.c | 3 +- gcc/tree-vect-analyze.c | 75 ++- gcc/tree-vect-generic.c | 7 +- gcc/tree-vect-patterns.c | 637 +++++++++++++++++++++ gcc/tree-vect-transform.c | 422 +++++++++----- gcc/tree-vectorizer.c | 4 +- gcc/tree-vectorizer.h | 35 +- gcc/tree.def | 31 +- 30 files changed, 2284 insertions(+), 168 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8.c create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16.c create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8.c create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c create mode 100644 gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8.c create mode 100755 gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-pattern-2.c create mode 100644 gcc/tree-vect-patterns.c (limited to 'gcc') diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 6fd7bfd..510e1a7 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,71 @@ +2006-01-19 Dorit Nuzman + + * Makefile.in 
(tree-vect-patterns.o): Add rule for new file. + * tree-vect-analyze.c (vect_determine_vectorization_factor): Use + existing STMT_VINFO_VECTYPE if available. + (vect_mark_relevant): Add special handling for stmts that are + marked as STMT_VINFO_IN_PATTERN_P. + (vect_analyze_loop): Call vect_pattern_recog. + * tree-vectorizer.c (new_stmt_vec_info): Initialize new fields. + * tree-vectorizer.h (in_pattern_p, related_stmt): New fields in + stmt_info. + (STMT_VINFO_IN_PATTERN_P, STMT_VINFO_RELATED_STMT): New macros. + (vect_recog_func_ptr): New function-pointer type. + * tree-vect-patterns.c: New file. + (vect_recog_widen_sum_pattern, vect_recog_widen_mult_pattern): + (vect_recog_dot_prod_pattern, vect_pattern_recog): + (vect_pattern_recog_1): New functions. + (vect_pattern_recog_funcs): New array of function pointers. + + * tree-vectorizer.h (ternary_op): New enum value. + * tree-vect-transform.c (vect_create_epilog_for_reduction): Added + declaration. Revised documentation. Removed redundant dump prints. + Removed redundant argument. Added support for reduction patterns. + (vectorizable_reduction): Added support for reduction patterns. + (vect_transform_stmt): Added support for patterns. + + * expr.c (expand_expr_real_1): Added case for DOT_PROD_EXPR. + * genopinit.c (udot_prod_optab, sdot_prod_optab): Initialize. + * optabs.c (optab_for_tree_code): Added case for DOT_PROD_EXPR. + (expand_widen_pattern_expr): New function. + (init_optabs): Initialize new optabs udot_prod_optab, + sdot_prod_optab. + * optabs.h (OTI_sdot_prod, OTI_udot_prod): New. + (sdot_prod_optab, udot_prod_optab): Define new optabs. + (expand_widen_pattern_expr): New function declaration. + * tree.def (DOT_PROD_EXPR, WIDEN_SUM_EXPR, WIDEN_MULT_EXPR): New + tree-codes. + * tree-inline.c (estimate_num_insns_1): Added cases for new + tree-codes DOT_PROD_EXPR, WIDEN_SUM_EXPR, WIDEN_MULT_EXPR. + * tree-pretty-print.c (dump_generic_node): Likewise. + (op_prio): Likewise. 
+ (op_symbol): Added cases for WIDEN_SUM_EXPR, WIDEN_MULT_EXPR. + * tree-ssa-operands.c (get_expr_operands): Added case for + DOT_PROD_EXPR. + * tree-vect-patterns.c (widened_name_p): New function. + (vect_recog_dot_prod_pattern): Added function implementation. + * tree-vect-transform.c (get_initial_def_for_reduction): Added + cases for DOT_PROD_EXPR, WIDEN_SUM_EXPR. + * config/rs6000/altivec.md (udot_prod, sdot_prodv8hi): New. + * config/i386/sse.md (sdot_prodv8hi, udot_prodv4si): New. + + * expr.c (expand_expr_real_1): Added case for WIDEN_SUM_EXPR. + * genopinit.c (widen_ssum_optab, widen_usum_optab): Initialize. + * optabs.c (optab_for_tree_code): Added case for WIDEN_SUM_EXPR. + (init_optabs): Initialize new optabs widen_ssum_optab, + widen_usum_optab. + * optabs.h (OTI_widen_ssum, OTI_widen_usum): New. + (widen_ssum_optab, widen_usum_optab): Define new optabs. + * tree-vect-generic.c: (expand_vector_operations_1): Check type of + use instead of type of def. + * tree-vect-patterns.c (vect_recog_widen_sum_pattern): Added + function implementation. + * config/rs6000/altivec.md (widen_usum, widen_ssumv16qi, + widen_ssumv8hi): New. + + * doc/tm.texi (ssum_widen, usum_widen, sdot_prod, udot_prod): New + patterns. 
+ 2006-01-19 Richard Sandiford PR c/25805 diff --git a/gcc/Makefile.in b/gcc/Makefile.in index a8bd984..e1c6a1d 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -967,6 +967,7 @@ OBJS-common = \ tree-vect-generic.o tree-ssa-loop.o tree-ssa-loop-niter.o \ tree-ssa-loop-manip.o tree-ssa-threadupdate.o \ tree-vectorizer.o tree-vect-analyze.o tree-vect-transform.o \ + tree-vect-patterns.o \ tree-ssa-loop-ivcanon.o tree-ssa-propagate.o tree-ssa-address.o \ tree-ssa-math-opts.o \ tree-ssa-loop-ivopts.o tree-if-conv.o tree-ssa-loop-unswitch.o \ @@ -2065,6 +2066,10 @@ tree-vect-analyze.o: tree-vect-analyze.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ $(TM_H) $(GGC_H) $(OPTABS_H) $(TREE_H) $(BASIC_BLOCK_H) \ $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \ tree-vectorizer.h $(TREE_DATA_REF_H) $(SCEV_H) $(EXPR_H) tree-chrec.h +tree-vect-patterns.o: tree-vect-patterns.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \ + $(TM_H) errors.h $(GGC_H) $(OPTABS_H) $(TREE_H) $(RTL_H) $(BASIC_BLOCK_H) \ + diagnostic.h $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) cfgloop.h \ + tree-vectorizer.h tree-data-ref.h $(EXPR_H) tree-vect-transform.o: tree-vect-transform.c $(CONFIG_H) $(SYSTEM_H) \ coretypes.h $(TM_H) $(GGC_H) $(OPTABS_H) $(RECOG_H) $(TREE_H) $(RTL_H) \ $(BASIC_BLOCK_H) $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) \ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index bd943f7..88c7adf 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1,5 +1,5 @@ ;; GCC machine description for SSE instructions -;; Copyright (C) 2005 +;; Copyright (C) 2005, 2006 ;; Free Software Foundation, Inc. ;; ;; This file is part of GCC. 
@@ -2700,6 +2700,48 @@ DONE; }) +(define_expand "sdot_prodv8hi" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V8HI 1 "nonimmediate_operand" "") + (match_operand:V8HI 2 "nonimmediate_operand" "") + (match_operand:V4SI 3 "register_operand" "")] + "TARGET_SSE2" +{ + rtx t = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_pmaddwd (t, operands[1], operands[2])); + emit_insn (gen_addv4si3 (operands[0], operands[3], t)); + DONE; +}) + +(define_expand "udot_prodv4si" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "") + (match_operand:V2DI 3 "register_operand" "")] + "TARGET_SSE2" +{ + rtx t1, t2, t3, t4; + + t1 = gen_reg_rtx (V2DImode); + emit_insn (gen_sse2_umulv2siv2di3 (t1, operands[1], operands[2])); + emit_insn (gen_addv2di3 (t1, t1, operands[3])); + + t2 = gen_reg_rtx (V4SImode); + t3 = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2), + gen_lowpart (TImode, operands[1]), + GEN_INT (32))); + emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3), + gen_lowpart (TImode, operands[2]), + GEN_INT (32))); + + t4 = gen_reg_rtx (V2DImode); + emit_insn (gen_sse2_umulv2siv2di3 (t4, t2, t3)); + + emit_insn (gen_addv2di3 (operands[0], t1, t4)); + DONE; +}) + (define_insn "ashr3" [(set (match_operand:SSEMODE24 0 "register_operand" "=x") (ashiftrt:SSEMODE24 diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 26ec2be..d4bf08e 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1,5 +1,5 @@ ;; AltiVec patterns. -;; Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. +;; Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. ;; Contributed by Aldy Hernandez (aldy@quesejoda.com) ;; This file is part of GCC. 
@@ -2150,6 +2150,77 @@ DONE; }") +(define_expand "udot_prod" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (plus:V4SI (match_operand:V4SI 3 "register_operand" "v") + (unspec:V4SI [(match_operand:VIshort 1 "register_operand" "v") + (match_operand:VIshort 2 "register_operand" "v")] + UNSPEC_VMSUMU)))] + "TARGET_ALTIVEC" + " +{ + emit_insn (gen_altivec_vmsumum (operands[0], operands[1], operands[2], operands[3])); + DONE; +}") + +(define_expand "sdot_prodv8hi" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (plus:V4SI (match_operand:V4SI 3 "register_operand" "v") + (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v") + (match_operand:V8HI 2 "register_operand" "v")] + UNSPEC_VMSUMSHM)))] + "TARGET_ALTIVEC" + " +{ + emit_insn (gen_altivec_vmsumshm (operands[0], operands[1], operands[2], operands[3])); + DONE; +}") + +(define_expand "widen_usum3" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (plus:V4SI (match_operand:V4SI 2 "register_operand" "v") + (unspec:V4SI [(match_operand:VIshort 1 "register_operand" "v")] + UNSPEC_VMSUMU)))] + "TARGET_ALTIVEC" + " +{ + rtx vones = gen_reg_rtx (GET_MODE (operands[1])); + + emit_insn (gen_altivec_vspltis (vones, const1_rtx)); + emit_insn (gen_altivec_vmsumum (operands[0], operands[1], vones, operands[2])); + DONE; +}") + +(define_expand "widen_ssumv16qi3" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (plus:V4SI (match_operand:V4SI 2 "register_operand" "v") + (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v")] + UNSPEC_VMSUMM)))] + "TARGET_ALTIVEC" + " +{ + rtx vones = gen_reg_rtx (V16QImode); + + emit_insn (gen_altivec_vspltisb (vones, const1_rtx)); + emit_insn (gen_altivec_vmsummbm (operands[0], operands[1], vones, operands[2])); + DONE; +}") + +(define_expand "widen_ssumv8hi3" + [(set (match_operand:V4SI 0 "register_operand" "=v") + (plus:V4SI (match_operand:V4SI 2 "register_operand" "v") + (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")] + UNSPEC_VMSUMSHM)))] 
+ "TARGET_ALTIVEC" + " +{ + rtx vones = gen_reg_rtx (V8HImode); + + emit_insn (gen_altivec_vspltish (vones, const1_rtx)); + emit_insn (gen_altivec_vmsumshm (operands[0], operands[1], vones, operands[2])); + DONE; +}") + (define_expand "negv4sf2" [(use (match_operand:V4SF 0 "register_operand" "")) (use (match_operand:V4SF 1 "register_operand" ""))] diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index 90efcc3..b6dd838 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -1,5 +1,5 @@ @c Copyright (C) 1988, 1989, 1992, 1993, 1994, 1996, 1998, 1999, 2000, 2001, -@c 2002, 2003, 2004, 2005 Free Software Foundation, Inc. +@c 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. @c This is part of the GCC manual. @c For copying conditions, see the file gcc.texi. @@ -3099,6 +3099,25 @@ Compute the sum of the unsigned elements of a vector. The vector is operand 1, and the scalar result is stored in the least significant bits of operand 0 (also a vector). The output and input vector should have the same modes. +@cindex @code{sdot_prod@var{m}} instruction pattern +@item @samp{sdot_prod@var{m}} +@cindex @code{udot_prod@var{m}} instruction pattern +@item @samp{udot_prod@var{m}} +Compute the sum of the products of two signed/unsigned elements. +Operand 1 and operand 2 are of the same mode. Their product, which is of a +wider mode, is computed and added to operand 3. Operand 3 is of a mode equal or +wider than the mode of the product. The result is placed in operand 0, which +is of the same mode as operand 3. + +@cindex @code{ssum_widen@var{m3}} instruction pattern +@item @samp{ssum_widen@var{m3}} +@cindex @code{usum_widen@var{m3}} instruction pattern +@item @samp{usum_widen@var{m3}} +Operands 0 and 2 are of the same mode, which is wider than the mode of +operand 1. Add operand 1 to operand 2 and place the widened result in +operand 0. (This is used express accumulation of elements into an accumulator +of a wider mode.) 
+ @cindex @code{vec_shl_@var{m}} instruction pattern @cindex @code{vec_shr_@var{m}} instruction pattern @item @samp{vec_shl_@var{m}}, @samp{vec_shr_@var{m}} diff --git a/gcc/expr.c b/gcc/expr.c index 92048ff..b15b43c 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -8553,6 +8553,31 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode, return temp; } + case DOT_PROD_EXPR: + { + tree oprnd0 = TREE_OPERAND (exp, 0); + tree oprnd1 = TREE_OPERAND (exp, 1); + tree oprnd2 = TREE_OPERAND (exp, 2); + rtx op2; + + expand_operands (oprnd0, oprnd1, NULL_RTX, &op0, &op1, 0); + op2 = expand_expr (oprnd2, NULL_RTX, VOIDmode, 0); + target = expand_widen_pattern_expr (exp, op0, op1, op2, + target, unsignedp); + return target; + } + + case WIDEN_SUM_EXPR: + { + tree oprnd0 = TREE_OPERAND (exp, 0); + tree oprnd1 = TREE_OPERAND (exp, 1); + + expand_operands (oprnd0, oprnd1, NULL_RTX, &op0, &op1, 0); + target = expand_widen_pattern_expr (exp, op0, NULL_RTX, op1, + target, unsignedp); + return target; + } + case REDUC_MAX_EXPR: case REDUC_MIN_EXPR: case REDUC_PLUS_EXPR: diff --git a/gcc/genopinit.c b/gcc/genopinit.c index ec8076b..d958220 100644 --- a/gcc/genopinit.c +++ b/gcc/genopinit.c @@ -1,6 +1,6 @@ /* Generate code to initialize optabs from machine description. Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, - 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. This file is part of GCC. 
@@ -203,6 +203,10 @@ static const char * const optabs[] = "vec_realign_load_optab->handlers[$A].insn_code = CODE_FOR_$(vec_realign_load_$a$)", "vcond_gen_code[$A] = CODE_FOR_$(vcond$a$)", "vcondu_gen_code[$A] = CODE_FOR_$(vcondu$a$)", + "ssum_widen_optab->handlers[$A].insn_code = CODE_FOR_$(widen_ssum$I$a3$)", + "usum_widen_optab->handlers[$A].insn_code = CODE_FOR_$(widen_usum$I$a3$)", + "udot_prod_optab->handlers[$A].insn_code = CODE_FOR_$(udot_prod$I$a$)", + "sdot_prod_optab->handlers[$A].insn_code = CODE_FOR_$(sdot_prod$I$a$)", "reduc_smax_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_smax_$a$)", "reduc_umax_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_umax_$a$)", "reduc_smin_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_smin_$a$)", diff --git a/gcc/optabs.c b/gcc/optabs.c index 5a87ac0..da5251c 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -294,6 +294,12 @@ optab_for_tree_code (enum tree_code code, tree type) case REALIGN_LOAD_EXPR: return vec_realign_load_optab; + case WIDEN_SUM_EXPR: + return TYPE_UNSIGNED (type) ? usum_widen_optab : ssum_widen_optab; + + case DOT_PROD_EXPR: + return TYPE_UNSIGNED (type) ? udot_prod_optab : sdot_prod_optab; + case REDUC_MAX_EXPR: return TYPE_UNSIGNED (type) ? reduc_umax_optab : reduc_smax_optab; @@ -337,6 +343,154 @@ optab_for_tree_code (enum tree_code code, tree type) } +/* Expand vector widening operations. + + There are two different classes of operations handled here: + 1) Operations whose result is wider than all the arguments to the operation. + Examples: VEC_UNPACK_HI/LO_EXPR, VEC_WIDEN_MULT_HI/LO_EXPR + In this case OP0 and optionally OP1 would be initialized, + but WIDE_OP wouldn't (not relevant for this case). + 2) Operations whose result is of the same size as the last argument to the + operation, but wider than all the other arguments to the operation. + Examples: WIDEN_SUM_EXPR, VEC_DOT_PROD_EXPR. + In the case WIDE_OP, OP0 and optionally OP1 would be initialized. 
+ + E.g, when called to expand the following operations, this is how + the arguments will be initialized: + nops OP0 OP1 WIDE_OP + widening-sum 2 oprnd0 - oprnd1 + widening-dot-product 3 oprnd0 oprnd1 oprnd2 + widening-mult 2 oprnd0 oprnd1 - + type-promotion (vec-unpack) 1 oprnd0 - - */ + +rtx +expand_widen_pattern_expr (tree exp, rtx op0, rtx op1, rtx wide_op, rtx target, + int unsignedp) +{ + tree oprnd0, oprnd1, oprnd2; + enum machine_mode wmode = 0, tmode0, tmode1 = 0; + optab widen_pattern_optab; + int icode; + enum machine_mode xmode0, xmode1 = 0, wxmode = 0; + rtx temp; + rtx pat; + rtx xop0, xop1, wxop; + int nops = TREE_CODE_LENGTH (TREE_CODE (exp)); + + oprnd0 = TREE_OPERAND (exp, 0); + tmode0 = TYPE_MODE (TREE_TYPE (oprnd0)); + widen_pattern_optab = + optab_for_tree_code (TREE_CODE (exp), TREE_TYPE (oprnd0)); + icode = (int) widen_pattern_optab->handlers[(int) tmode0].insn_code; + gcc_assert (icode != CODE_FOR_nothing); + xmode0 = insn_data[icode].operand[1].mode; + + if (nops >= 2) + { + oprnd1 = TREE_OPERAND (exp, 1); + tmode1 = TYPE_MODE (TREE_TYPE (oprnd1)); + xmode1 = insn_data[icode].operand[2].mode; + } + + /* The last operand is of a wider mode than the rest of the operands. */ + if (nops == 2) + { + wmode = tmode1; + wxmode = xmode1; + } + else if (nops == 3) + { + gcc_assert (tmode1 == tmode0); + gcc_assert (op1); + oprnd2 = TREE_OPERAND (exp, 2); + wmode = TYPE_MODE (TREE_TYPE (oprnd2)); + wxmode = insn_data[icode].operand[3].mode; + } + + if (!wide_op) + wmode = wxmode = insn_data[icode].operand[0].mode; + + if (!target + || ! (*insn_data[icode].operand[0].predicate) (target, wmode)) + temp = gen_reg_rtx (wmode); + else + temp = target; + + xop0 = op0; + xop1 = op1; + wxop = wide_op; + + /* In case the insn wants input operands in modes different from + those of the actual operands, convert the operands. 
It would + seem that we don't need to convert CONST_INTs, but we do, so + that they're properly zero-extended, sign-extended or truncated + for their mode. */ + + if (GET_MODE (op0) != xmode0 && xmode0 != VOIDmode) + xop0 = convert_modes (xmode0, + GET_MODE (op0) != VOIDmode + ? GET_MODE (op0) + : tmode0, + xop0, unsignedp); + + if (op1) + if (GET_MODE (op1) != xmode1 && xmode1 != VOIDmode) + xop1 = convert_modes (xmode1, + GET_MODE (op1) != VOIDmode + ? GET_MODE (op1) + : tmode1, + xop1, unsignedp); + + if (wide_op) + if (GET_MODE (wide_op) != wxmode && wxmode != VOIDmode) + wxop = convert_modes (wxmode, + GET_MODE (wide_op) != VOIDmode + ? GET_MODE (wide_op) + : wmode, + wxop, unsignedp); + + /* Now, if insn's predicates don't allow our operands, put them into + pseudo regs. */ + + if (! (*insn_data[icode].operand[1].predicate) (xop0, xmode0) + && xmode0 != VOIDmode) + xop0 = copy_to_mode_reg (xmode0, xop0); + + if (op1) + { + if (! (*insn_data[icode].operand[2].predicate) (xop1, xmode1) + && xmode1 != VOIDmode) + xop1 = copy_to_mode_reg (xmode1, xop1); + + if (wide_op) + { + if (! (*insn_data[icode].operand[3].predicate) (wxop, wxmode) + && wxmode != VOIDmode) + wxop = copy_to_mode_reg (wxmode, wxop); + + pat = GEN_FCN (icode) (temp, xop0, xop1, wxop); + } + else + pat = GEN_FCN (icode) (temp, xop0, xop1); + } + else + { + if (wide_op) + { + if (! (*insn_data[icode].operand[2].predicate) (wxop, wxmode) + && wxmode != VOIDmode) + wxop = copy_to_mode_reg (wxmode, wxop); + + pat = GEN_FCN (icode) (temp, xop0, wxop); + } + else + pat = GEN_FCN (icode) (temp, xop0); + } + + emit_insn (pat); + return temp; +} + /* Generate code to perform an operation specified by TERNARY_OPTAB on operands OP0, OP1 and OP2, with result having machine-mode MODE. 
@@ -5139,6 +5293,11 @@ init_optabs (void) reduc_splus_optab = init_optab (UNKNOWN); reduc_uplus_optab = init_optab (UNKNOWN); + ssum_widen_optab = init_optab (UNKNOWN); + usum_widen_optab = init_optab (UNKNOWN); + sdot_prod_optab = init_optab (UNKNOWN); + udot_prod_optab = init_optab (UNKNOWN); + vec_extract_optab = init_optab (UNKNOWN); vec_set_optab = init_optab (UNKNOWN); vec_init_optab = init_optab (UNKNOWN); diff --git a/gcc/optabs.h b/gcc/optabs.h index 78cf53b..58fb690 100644 --- a/gcc/optabs.h +++ b/gcc/optabs.h @@ -1,5 +1,6 @@ /* Definitions for code generation pass of GNU compiler. - Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006 + Free Software Foundation, Inc. This file is part of GCC. @@ -241,6 +242,14 @@ enum optab_index OTI_reduc_splus, OTI_reduc_uplus, + /* Summation, with result machine mode one or more wider than args. */ + OTI_ssum_widen, + OTI_usum_widen, + + /* Dot product, with result machine mode one or more wider than args. */ + OTI_sdot_prod, + OTI_udot_prod, + /* Set specified field of vector operand. */ OTI_vec_set, /* Extract specified field of vector operand. */ @@ -367,6 +376,11 @@ extern GTY(()) optab optab_table[OTI_MAX]; #define reduc_umin_optab (optab_table[OTI_reduc_umin]) #define reduc_splus_optab (optab_table[OTI_reduc_splus]) #define reduc_uplus_optab (optab_table[OTI_reduc_uplus]) + +#define ssum_widen_optab (optab_table[OTI_ssum_widen]) +#define usum_widen_optab (optab_table[OTI_usum_widen]) +#define sdot_prod_optab (optab_table[OTI_sdot_prod]) +#define udot_prod_optab (optab_table[OTI_udot_prod]) #define vec_set_optab (optab_table[OTI_vec_set]) #define vec_extract_optab (optab_table[OTI_vec_extract]) @@ -495,6 +509,9 @@ extern enum insn_code sync_lock_release[NUM_MACHINE_MODES]; /* Define functions given in optabs.c. 
*/ +extern rtx expand_widen_pattern_expr (tree exp, rtx op0, rtx op1, rtx wide_op, + rtx target, int unsignedp); + extern rtx expand_ternary_op (enum machine_mode mode, optab ternary_optab, rtx op0, rtx op1, rtx op2, rtx target, int unsignedp); diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 5a5e470..c54f6b0 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,22 @@ +2006-01-19 Dorit Nuzman + + * lib/target-suports.exp (check_effective_target_vect_sdot_qi): New. + (check_effective_target_vect_udot_qi): New. + (check_effective_target_vect_sdot_hi): New. + (check_effective_target_vect_udot_hi): New. + * gcc.dg/vect/vect.exp: Use dump-details, and compile testcases + prefixed with "wrapv-" with -fwrapv. + * gcc.dg/vect/wrapv-vect-reduc-dot-s8.c: New. + * gcc.dg/vect/vect-reduc-dot-u8.c: New. + * gcc.dg/vect/vect-reduc-dot-u16.c: New. + * gcc.dg/vect/vect-reduc-dot-s8.c: New. + * gcc.dg/vect/vect-reduc-dot-s16.c: New. + + * lib/target-suports.exp (check_effective_target_vect_widen_sum): New. + * gcc.dg/vect/vect-reduc-pattern-1.c: New. + * gcc.dg/vect/vect-reduc-pattern-2.c: New. + * gcc.dg/vect/wrapv-vect-reduc-pattern-2.c: New. + 2006-01-19 Volker Reichelt PR c++/16829 diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c new file mode 100644 index 0000000..ddffc10 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c @@ -0,0 +1,70 @@ +/* { dg-require-effective-target vect_int } */ + +#include +#include "tree-vect.h" + +#define N 64 + +#define DOT1 43680 +#define DOT2 43680 + +signed short X[N] __attribute__ ((__aligned__(16))); +signed short Y[N] __attribute__ ((__aligned__(16))); + +/* short->short->int dot product. + Not detected as a dot-product pattern. + Currently fails to be vectorized due to presence of type conversions. */ +int +foo1(int len) { + int i; + int result = 0; + short prod; + + for (i=0; iint->int dot product. 
+ Detected as a dot-product pattern. + Vectorized on targets that support dot-product for signed shorts. */ +int +foo2(int len) { + int i; + int result = 0; + + for (i=0; i +#include "tree-vect.h" + +#define N 64 + +#define DOT1 43680 +#define DOT2 -21856 +#define DOT3 43680 + +signed char X[N] __attribute__ ((__aligned__(16))); +signed char Y[N] __attribute__ ((__aligned__(16))); + +/* char->short->int dot product. + The dot-product pattern should be detected. + Vectorizable on vect_sdot_qi targets (targets that support dot-product of + signed chars). + + In the future could also be vectorized as widening-mult + widening-summation, + or with type-conversion support. + */ +int +foo1(int len) { + int i; + int result = 0; + short prod; + + for (i=0; ishort->short dot product. + The dot-product pattern should be detected. + The reduction is currently not vectorized becaus of the signed->unsigned->signed + casts, since this patch: + + 2005-12-26 Kazu Hirata + + PR tree-optimization/25125 + + When the dot-product is detected, the loop should be vectorized on vect_sdot_qi + targets (targets that support dot-product of signed char). + This test would currently fail to vectorize on targets that support + dot-product of chars when the accumulator is int. + + In the future could also be vectorized as widening-mult + summation, + or with type-conversion support. + */ +short +foo2(int len) { + int i; + short result = 0; + + for (i=0; iint->int dot product. + Not detected as a dot-product pattern. + Currently fails to be vectorized due to presence of type conversions. */ +int +foo3(int len) { + int i; + int result = 0; + + for (i=0; i +#include "tree-vect.h" + +#define N 64 + +#define DOT1 43680 +#define DOT2 43680 + +unsigned short X[N] __attribute__ ((__aligned__(16))); +unsigned short Y[N] __attribute__ ((__aligned__(16))); + +/* short->short->int dot product. + Not detected as a dot-product pattern. + Not vectorized due to presence of type-conversions. 
*/ +unsigned int +foo1(int len) { + int i; + unsigned int result = 0; + unsigned short prod; + + for (i=0; iint->int dot product. + Currently not detected as a dot-product pattern: the multiplication + promotes the ushorts to int, and then the product is promoted to unsigned + int for the addition. Which results in an int->unsigned int cast, which + since no bits are modified in the cast should be trivially vectorizable. */ +unsigned int +foo2(int len) { + int i; + unsigned int result = 0; + + for (i=0; i +#include "tree-vect.h" + +#define N 64 + +#define DOT1 43680 +#define DOT2 43680 +#define DOT3 43680 + +unsigned char X[N] __attribute__ ((__aligned__(16))); +unsigned char Y[N] __attribute__ ((__aligned__(16))); + +/* char->short->int dot product. + Detected as a dot-product pattern. + Should be vectorized on targets that support dot-product for unsigned chars. + */ +unsigned int +foo1(int len) { + int i; + unsigned int result = 0; + unsigned short prod; + + for (i=0; ishort->short dot product. + Detected as a dot-product pattern. + Should be vectorized on targets that support dot-product for unsigned chars. + This test currently fails to vectorize on targets that support dot-product + of chars only when the accumulator is int. + */ +unsigned short +foo2(int len) { + int i; + unsigned short result = 0; + + for (i=0; iint->int dot product. + Not detected as a dot-product. + Doesn't get vectorized due to presence of type converisons. */ +unsigned int +foo3(int len) { + int i; + unsigned int result = 0; + + for (i=0; i +#include "tree-vect.h" + +#define N 16 +#define SH_SUM 210 +#define CH_SUM 120 + +int main1 () +{ + int i; + unsigned short udata_sh[N] = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28}; + unsigned char udata_ch[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + unsigned int intsum = 0; + unsigned short shortsum = 0; + + /* widenning sum: sum shorts into int. 
*/ + for (i = 0; i < N; i++){ + intsum += udata_sh[i]; + } + + /* check results: */ + if (intsum != SH_SUM) + abort (); + + /* widenning sum: sum chars into int. */ + intsum = 0; + for (i = 0; i < N; i++){ + intsum += udata_ch[i]; + } + + /* check results: */ + if (intsum != CH_SUM) + abort (); + + /* widenning sum: sum chars into short. + pattern detected, but not vectorized yet. */ + for (i = 0; i < N; i++){ + shortsum += udata_ch[i]; + } + + /* check results: */ + if (shortsum != CH_SUM) + abort (); + + return 0; +} + +int main (void) +{ + check_vect (); + + return main1 (); +} + +/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 3 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_widen_sum } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c new file mode 100644 index 0000000..5423c43 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c @@ -0,0 +1,67 @@ +/* { dg-require-effective-target vect_int } */ + +#include +#include "tree-vect.h" + +#define N 16 +#define SH_SUM 210 +#define CH_SUM 120 + +int main1 () +{ + int i; + signed short data_sh[N] = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28}; + signed char data_ch[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + signed int intsum = 0; + signed short shortsum = 0; + + /* widenning sum: sum shorts into int. */ + for (i = 0; i < N; i++){ + intsum += data_sh[i]; + } + + /* check results: */ + if (intsum != SH_SUM) + abort (); + + /* widenning sum: sum chars into int. */ + intsum = 0; + for (i = 0; i < N; i++){ + intsum += data_ch[i]; + } + + /* check results: */ + if (intsum != CH_SUM) + abort (); + + /* widenning sum: sum chars into short. 
+ The widening-summation pattern is currently not detected because of this + patch: + + 2005-12-26 Kazu Hirata + + PR tree-optimization/25125 + */ + for (i = 0; i < N; i++){ + shortsum += data_ch[i]; + } + + /* check results: */ + if (shortsum != CH_SUM) + abort (); + + return 0; +} + +int main (void) +{ + check_vect (); + + return main1 (); +} + +/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 3 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 2 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_widen_sum } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp index bfa6dce..9cf78ff 100644 --- a/gcc/testsuite/gcc.dg/vect/vect.exp +++ b/gcc/testsuite/gcc.dg/vect/vect.exp @@ -1,4 +1,4 @@ -# Copyright (C) 1997, 2004 Free Software Foundation, Inc. +# Copyright (C) 1997, 2004, 2005, 2006 Free Software Foundation, Inc. # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -78,7 +78,7 @@ dg-init dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/nodump-*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS -lappend DEFAULT_VECTCFLAGS "-ftree-vectorizer-verbose=4" "-fdump-tree-vect-stats" +lappend DEFAULT_VECTCFLAGS "-fdump-tree-vect-details" # Main loop. 
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/pr*.\[cS\]]] \ @@ -96,6 +96,12 @@ lappend DEFAULT_VECTCFLAGS "-ffast-math" dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-vect*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS +# -fwrapv tests +set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS +lappend DEFAULT_VECTCFLAGS "-fwrapv" +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wrapv-vect*.\[cS\]]] \ + "" $DEFAULT_VECTCFLAGS + # -ftrapv tests set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS lappend DEFAULT_VECTCFLAGS "-ftrapv" diff --git a/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8.c b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8.c new file mode 100644 index 0000000..b11b9c7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8.c @@ -0,0 +1,108 @@ +/* { dg-require-effective-target vect_int } */ + +#include +#include "tree-vect.h" + +#define N 64 + +#define DOT1 43680 +#define DOT2 -21856 +#define DOT3 43680 + +signed char X[N] __attribute__ ((__aligned__(16))); +signed char Y[N] __attribute__ ((__aligned__(16))); + +/* char->short->int dot product. + The dot-product pattern should be detected. + Vectorizable on vect_sdot_qi targets (targets that support dot-product of + signed chars). + + In the future could also be vectorized as widening-mult + widening-summation, + or with type-conversion support. + */ +int +foo1(int len) { + int i; + int result = 0; + short prod; + + for (i=0; ishort->short dot product. + The dot-product pattern should be detected. + Should be vectorized on vect_sdot_qi targets (targets that support + dot-product of signed char). + This test currently fails to vectorize on targets that support + dot-product of chars when the accumulator is int. + + In the future could also be vectorized as widening-mult + summation, + or with type-conversion support. + */ +short +foo2(int len) { + int i; + short result = 0; + + for (i=0; iint->int dot product. + Not detected as a dot-product pattern. 
+ Currently fails to be vectorized due to presence of type conversions. */ +int +foo3(int len) { + int i; + int result = 0; + + for (i=0; i +#include "tree-vect.h" + +#define N 16 +#define SH_SUM 210 +#define CH_SUM 120 + +int main1 () +{ + int i; + signed short data_sh[N] = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28}; + signed char data_ch[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + signed int intsum = 0; + signed short shortsum = 0; + + /* widenning sum: sum shorts into int. */ + for (i = 0; i < N; i++){ + intsum += data_sh[i]; + } + + /* check results: */ + if (intsum != SH_SUM) + abort (); + + /* widenning sum: sum chars into int. */ + intsum = 0; + for (i = 0; i < N; i++){ + intsum += data_ch[i]; + } + + /* check results: */ + if (intsum != CH_SUM) + abort (); + + /* widenning sum: sum chars into short. */ + for (i = 0; i < N; i++){ + shortsum += data_ch[i]; + } + + /* check results: */ + if (shortsum != CH_SUM) + abort (); + + return 0; +} + +int main (void) +{ + check_vect (); + + return main1 (); +} + +/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 3 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_widen_sum } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index cadef96..05a180e 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -1364,6 +1364,112 @@ proc check_effective_target_vect_no_bitwise { } { return $et_vect_no_bitwise_saved } +# Return 1 if the target plus current options supports a vector +# widening summation, 0 otherwise. +# +# This won't change for different subtargets so cache the result. 
+ +proc check_effective_target_vect_widen_sum { } { + global et_vect_widen_sum + + if [info exists et_vect_widen_sum_saved] { + verbose "check_effective_target_vect_widen_sum: using cached result" 2 + } else { + set et_vect_widen_sum_saved 0 + if { [istarget powerpc*-*-*] + || [istarget ia64-*-*] } { + set et_vect_widen_sum_saved 1 + } + } + verbose "check_effective_target_vect_widen_sum: returning $et_vect_widen_sum_saved" 2 + return $et_vect_widen_sum_saved +} + +# Return 1 if the target plus current options supports a vector +# dot-product of signed chars, 0 otherwise. +# +# This won't change for different subtargets so cache the result. + +proc check_effective_target_vect_sdot_qi { } { + global et_vect_sdot_qi + + if [info exists et_vect_sdot_qi_saved] { + verbose "check_effective_target_vect_sdot_qi: using cached result" 2 + } else { + set et_vect_sdot_qi_saved 0 + if { [istarget ia64-*-*] } { + set et_vect_sdot_qi_saved 1 + } + } + verbose "check_effective_target_vect_sdot_qi: returning $et_vect_sdot_qi_saved" 2 + return $et_vect_sdot_qi_saved +} + +# Return 1 if the target plus current options supports a vector +# dot-product of unsigned chars, 0 otherwise. +# +# This won't change for different subtargets so cache the result. + +proc check_effective_target_vect_udot_qi { } { + global et_vect_udot_qi + + if [info exists et_vect_udot_qi_saved] { + verbose "check_effective_target_vect_udot_qi: using cached result" 2 + } else { + set et_vect_udot_qi_saved 0 + if { [istarget powerpc*-*-*] + || [istarget ia64-*-*] } { + set et_vect_udot_qi_saved 1 + } + } + verbose "check_effective_target_vect_udot_qi: returning $et_vect_udot_qi_saved" 2 + return $et_vect_udot_qi_saved +} + +# Return 1 if the target plus current options supports a vector +# dot-product of signed shorts, 0 otherwise. +# +# This won't change for different subtargets so cache the result. 
+ +proc check_effective_target_vect_sdot_hi { } { + global et_vect_sdot_hi + + if [info exists et_vect_sdot_hi_saved] { + verbose "check_effective_target_vect_sdot_hi: using cached result" 2 + } else { + set et_vect_sdot_hi_saved 0 + if { [istarget powerpc*-*-*] + || [istarget i?86-*-*] + || [istarget x86_64-*-*] + || [istarget ia64-*-*] } { + set et_vect_sdot_hi_saved 1 + } + } + verbose "check_effective_target_vect_sdot_hi: returning $et_vect_sdot_hi_saved" 2 + return $et_vect_sdot_hi_saved +} + +# Return 1 if the target plus current options supports a vector +# dot-product of unsigned shorts, 0 otherwise. +# +# This won't change for different subtargets so cache the result. + +proc check_effective_target_vect_udot_hi { } { + global et_vect_udot_hi + + if [info exists et_vect_udot_hi_saved] { + verbose "check_effective_target_vect_udot_hi: using cached result" 2 + } else { + set et_vect_udot_hi_saved 0 + if { [istarget powerpc*-*-*] } { + set et_vect_udot_hi_saved 1 + } + } + verbose "check_effective_target_vect_udot_hi: returning $et_vect_udot_hi_saved" 2 + return $et_vect_udot_hi_saved +} + + # Return 1 if the target plus current options does not support a vector # alignment mechanism, 0 otherwise. # diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c index 7030b92..9a93427 100644 --- a/gcc/tree-inline.c +++ b/gcc/tree-inline.c @@ -1,5 +1,5 @@ /* Tree inlining. - Copyright 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. Contributed by Alexandre Oliva This file is part of GCC. 
@@ -1728,6 +1728,10 @@ estimate_num_insns_1 (tree *tp, int *walk_subtrees, void *data) case REDUC_MAX_EXPR: case REDUC_MIN_EXPR: case REDUC_PLUS_EXPR: + case WIDEN_SUM_EXPR: + case DOT_PROD_EXPR: + + case WIDEN_MULT_EXPR: case RESX_EXPR: *count += 1; diff --git a/gcc/tree-pretty-print.c b/gcc/tree-pretty-print.c index d7e3391..4e50b8d 100644 --- a/gcc/tree-pretty-print.c +++ b/gcc/tree-pretty-print.c @@ -1,5 +1,6 @@ /* Pretty formatting of GENERIC trees in C syntax. - Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006 + Free Software Foundation, Inc. Adapted from c-pretty-print.c by Diego Novillo This file is part of GCC. @@ -1168,6 +1169,8 @@ dump_generic_node (pretty_printer *buffer, tree node, int spc, int flags, break; /* Binary arithmetic and logic expressions. */ + case WIDEN_SUM_EXPR: + case WIDEN_MULT_EXPR: case MULT_EXPR: case PLUS_EXPR: case MINUS_EXPR: @@ -1686,6 +1689,16 @@ dump_generic_node (pretty_printer *buffer, tree node, int spc, int flags, pp_string (buffer, " > "); break; + case DOT_PROD_EXPR: + pp_string (buffer, " DOT_PROD_EXPR < "); + dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false); + pp_string (buffer, " , "); + dump_generic_node (buffer, TREE_OPERAND (node, 1), spc, flags, false); + pp_string (buffer, " , "); + dump_generic_node (buffer, TREE_OPERAND (node, 2), spc, flags, false); + pp_string (buffer, " > "); + break; + case OMP_PARALLEL: pp_string (buffer, "#pragma omp parallel"); dump_omp_clauses (buffer, OMP_PARALLEL_CLAUSES (node), spc, flags); @@ -2105,10 +2118,13 @@ op_prio (tree op) case RROTATE_EXPR: return 11; + case WIDEN_SUM_EXPR: case PLUS_EXPR: case MINUS_EXPR: return 12; + case WIDEN_MULT_EXPR: + case DOT_PROD_EXPR: case MULT_EXPR: case TRUNC_DIV_EXPR: case CEIL_DIV_EXPR: @@ -2263,6 +2279,12 @@ op_symbol_1 (enum tree_code code) case REDUC_PLUS_EXPR: return "r+"; + case WIDEN_SUM_EXPR: + return "w+"; + + case WIDEN_MULT_EXPR: + 
return "w*"; + case NEGATE_EXPR: case MINUS_EXPR: return "-"; diff --git a/gcc/tree-ssa-operands.c b/gcc/tree-ssa-operands.c index 57cfedc..e3b95e7 100644 --- a/gcc/tree-ssa-operands.c +++ b/gcc/tree-ssa-operands.c @@ -1,5 +1,5 @@ /* SSA operands management for trees. - Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc. This file is part of GCC. @@ -1273,6 +1273,7 @@ get_expr_operands (tree stmt, tree *expr_p, int flags) return; } + case DOT_PROD_EXPR: case REALIGN_LOAD_EXPR: { get_expr_operands (stmt, &TREE_OPERAND (expr, 0), flags); diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c index ab749fb..c5882d4 100644 --- a/gcc/tree-vect-analyze.c +++ b/gcc/tree-vect-analyze.c @@ -1,5 +1,5 @@ /* Analysis Utilities for Loop Vectorization. - Copyright (C) 2003,2004,2005 Free Software Foundation, Inc. + Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc. Contributed by Dorit Naishlos This file is part of GCC. 
@@ -142,35 +142,46 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo) return false; } - if (STMT_VINFO_DATA_REF (stmt_info)) - scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); - else if (TREE_CODE (stmt) == MODIFY_EXPR) - scalar_type = TREE_TYPE (TREE_OPERAND (stmt, 0)); - else - scalar_type = TREE_TYPE (stmt); + if (STMT_VINFO_VECTYPE (stmt_info)) + { + vectype = STMT_VINFO_VECTYPE (stmt_info); + scalar_type = TREE_TYPE (vectype); + } + else + { + if (STMT_VINFO_DATA_REF (stmt_info)) + scalar_type = + TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); + else if (TREE_CODE (stmt) == MODIFY_EXPR) + scalar_type = TREE_TYPE (TREE_OPERAND (stmt, 0)); + else + scalar_type = TREE_TYPE (stmt); - if (vect_print_dump_info (REPORT_DETAILS)) - { - fprintf (vect_dump, "get vectype for scalar type: "); - print_generic_expr (vect_dump, scalar_type, TDF_SLIM); - } + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "get vectype for scalar type: "); + print_generic_expr (vect_dump, scalar_type, TDF_SLIM); + } - vectype = get_vectype_for_scalar_type (scalar_type); - if (!vectype) - { - if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) - { - fprintf (vect_dump, "not vectorized: unsupported data-type "); - print_generic_expr (vect_dump, scalar_type, TDF_SLIM); - } - return false; + vectype = get_vectype_for_scalar_type (scalar_type); + if (!vectype) + { + if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) + { + fprintf (vect_dump, + "not vectorized: unsupported data-type "); + print_generic_expr (vect_dump, scalar_type, TDF_SLIM); + } + return false; + } + STMT_VINFO_VECTYPE (stmt_info) = vectype; } + if (vect_print_dump_info (REPORT_DETAILS)) { fprintf (vect_dump, "vectype: "); print_generic_expr (vect_dump, vectype, TDF_SLIM); } - STMT_VINFO_VECTYPE (stmt_info) = vectype; nunits = TYPE_VECTOR_SUBPARTS (vectype); if (vect_print_dump_info (REPORT_DETAILS)) @@ -1439,6 +1450,24 @@ vect_mark_relevant (VEC(tree,heap) 
**worklist, tree stmt, if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "mark relevant %d, live %d.",relevant_p, live_p); + if (STMT_VINFO_IN_PATTERN_P (stmt_info)) + { + tree pattern_stmt; + + /* This is the last stmt in a sequence that was detected as a + pattern that can potentially be vectorized. Don't mark the stmt + as relevant/live because it's not going to vectorized. + Instead mark the pattern-stmt that replaces it. */ + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "last stmt in pattern. don't mark relevant/live."); + pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info); + stmt_info = vinfo_for_stmt (pattern_stmt); + gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt); + save_relevant_p = STMT_VINFO_RELEVANT_P (stmt_info); + save_live_p = STMT_VINFO_LIVE_P (stmt_info); + stmt = pattern_stmt; + } + STMT_VINFO_LIVE_P (stmt_info) |= live_p; STMT_VINFO_RELEVANT_P (stmt_info) |= relevant_p; @@ -2002,6 +2031,8 @@ vect_analyze_loop (struct loop *loop) vect_analyze_scalar_cycles (loop_vinfo); + vect_pattern_recog (loop_vinfo); + /* Data-flow analysis to detect stmts that do not need to be vectorized. */ ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c index cc834e4f..dd58cb9 100644 --- a/gcc/tree-vect-generic.c +++ b/gcc/tree-vect-generic.c @@ -1,5 +1,5 @@ /* Lower vector operations to scalar operations. - Copyright (C) 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2004, 2005, 2006 Free Software Foundation, Inc. This file is part of GCC. @@ -411,6 +411,11 @@ expand_vector_operations_1 (block_stmt_iterator *bsi) gcc_assert (code != CONVERT_EXPR); op = optab_for_tree_code (code, type); + /* For widening vector operations, the relevant type is of the arguments, + not the widened result. */ + if (code == WIDEN_SUM_EXPR) + type = TREE_TYPE (TREE_OPERAND (rhs, 0)); + /* Optabs will try converting a negation into a subtraction, so look for it as well. 
TODO: negation of floating-point vectors might be turned into an exclusive OR toggling the sign bit. */ diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c new file mode 100644 index 0000000..0ef76fe --- /dev/null +++ b/gcc/tree-vect-patterns.c @@ -0,0 +1,637 @@ +/* Analysis Utilities for Loop Vectorization. + Copyright (C) 2006 Free Software Foundation, Inc. + Contributed by Dorit Nuzman + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING. If not, write to the Free +Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301, USA. 
*/ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "ggc.h" +#include "tree.h" + +#include "target.h" +#include "basic-block.h" +#include "diagnostic.h" +#include "tree-flow.h" +#include "tree-dump.h" +#include "timevar.h" +#include "cfgloop.h" +#include "expr.h" +#include "optabs.h" +#include "params.h" +#include "tree-data-ref.h" +#include "tree-vectorizer.h" +#include "recog.h" +#include "toplev.h" + +/* Funcion prototypes */ +static void vect_pattern_recog_1 + (tree (* ) (tree, tree *, tree *), block_stmt_iterator); +static bool widened_name_p (tree, tree, tree *, tree *); + +/* Pattern recognition functions */ +static tree vect_recog_widen_sum_pattern (tree, tree *, tree *); +static tree vect_recog_widen_mult_pattern (tree, tree *, tree *); +static tree vect_recog_dot_prod_pattern (tree, tree *, tree *); +static vect_recog_func_ptr vect_vect_recog_func_ptrs[NUM_PATTERNS] = { + vect_recog_widen_mult_pattern, + vect_recog_widen_sum_pattern, + vect_recog_dot_prod_pattern}; + + +/* Function widened_name_p + + Check whether NAME, an ssa-name used in USE_STMT, + is a result of a type-promotion, such that: + DEF_STMT: NAME = NOP (name0) + where the type of name0 (HALF_TYPE) is smaller than the type of NAME. +*/ + +static bool +widened_name_p (tree name, tree use_stmt, tree *half_type, tree *def_stmt) +{ + tree dummy; + loop_vec_info loop_vinfo; + stmt_vec_info stmt_vinfo; + tree expr; + tree type = TREE_TYPE (name); + tree oprnd0; + enum vect_def_type dt; + tree def; + + stmt_vinfo = vinfo_for_stmt (use_stmt); + loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); + + if (!vect_is_simple_use (name, loop_vinfo, def_stmt, &def, &dt)) + return false; + + if (dt != vect_loop_def + && dt != vect_invariant_def && dt != vect_constant_def) + return false; + + if (! 
*def_stmt) + return false; + + if (TREE_CODE (*def_stmt) != MODIFY_EXPR) + return false; + + expr = TREE_OPERAND (*def_stmt, 1); + if (TREE_CODE (expr) != NOP_EXPR) + return false; + + oprnd0 = TREE_OPERAND (expr, 0); + + *half_type = TREE_TYPE (oprnd0); + if (!INTEGRAL_TYPE_P (type) || !INTEGRAL_TYPE_P (*half_type) + || (TYPE_UNSIGNED (type) != TYPE_UNSIGNED (*half_type)) + || (TYPE_PRECISION (type) < (TYPE_PRECISION (*half_type) * 2))) + return false; + + if (!vect_is_simple_use (oprnd0, loop_vinfo, &dummy, &dummy, &dt)) + return false; + + if (dt != vect_invariant_def && dt != vect_constant_def + && dt != vect_loop_def) + return false; + + return true; +} + + +/* Function vect_recog_dot_prod_pattern + + Try to find the following pattern: + + type x_t, y_t; + TYPE1 prod; + TYPE2 sum = init; + loop: + sum_0 = phi + S1 x_t = ... + S2 y_t = ... + S3 x_T = (TYPE1) x_t; + S4 y_T = (TYPE1) y_t; + S5 prod = x_T * y_T; + [S6 prod = (TYPE2) prod; #optional] + S7 sum_1 = prod + sum_0; + + where 'TYPE1' is exactly double the size of type 'type', and 'TYPE2' is the + same size of 'TYPE1' or bigger. This is a sepcial case of a reduction + computation. + + Input: + + * LAST_STMT: A stmt from which the pattern search begins. In the example, + when this function is called with S7, the pattern {S3,S4,S5,S6,S7} will be + detected. + + Output: + + * TYPE_IN: The type of the input arguments to the pattern. + + * TYPE_OUT: The type of the output of this pattern. + + * Return value: A new stmt that will be used to replace the sequence of + stmts that constitute the pattern. 
In this case it will be: + WIDEN_DOT_PRODUCT +*/ + +static tree +vect_recog_dot_prod_pattern (tree last_stmt, tree *type_in, tree *type_out) +{ + tree stmt, expr; + tree oprnd0, oprnd1; + tree oprnd00, oprnd01; + stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt); + tree type, half_type; + tree pattern_expr; + tree prod_type; + + if (TREE_CODE (last_stmt) != MODIFY_EXPR) + return NULL; + + expr = TREE_OPERAND (last_stmt, 1); + type = TREE_TYPE (expr); + + /* Look for the following pattern + DX = (TYPE1) X; + DY = (TYPE1) Y; + DPROD = DX * DY; + DDPROD = (TYPE2) DPROD; + sum_1 = DDPROD + sum_0; + In which + - DX is double the size of X + - DY is double the size of Y + - DX, DY, DPROD all have the same type + - sum is the same size of DPROD or bigger + - sum has been recognized as a reduction variable. + + This is equivalent to: + DPROD = X w* Y; #widen mult + sum_1 = DPROD w+ sum_0; #widen summation + or + DPROD = X w* Y; #widen mult + sum_1 = DPROD + sum_0; #summation + */ + + /* Starting from LAST_STMT, follow the defs of its uses in search + of the above pattern. */ + + if (TREE_CODE (expr) != PLUS_EXPR) + return NULL; + + if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)) + { + /* Has been detected as widening-summation? 
*/ + + stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo); + expr = TREE_OPERAND (stmt, 1); + type = TREE_TYPE (expr); + if (TREE_CODE (expr) != WIDEN_SUM_EXPR) + return NULL; + oprnd0 = TREE_OPERAND (expr, 0); + oprnd1 = TREE_OPERAND (expr, 1); + half_type = TREE_TYPE (oprnd0); + } + else + { + tree def_stmt; + + if (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def) + return NULL; + oprnd0 = TREE_OPERAND (expr, 0); + oprnd1 = TREE_OPERAND (expr, 1); + if (TYPE_MAIN_VARIANT (TREE_TYPE (oprnd0)) != TYPE_MAIN_VARIANT (type) + || TYPE_MAIN_VARIANT (TREE_TYPE (oprnd1)) != TYPE_MAIN_VARIANT (type)) + return NULL; + stmt = last_stmt; + + if (widened_name_p (oprnd0, stmt, &half_type, &def_stmt)) + { + stmt = def_stmt; + expr = TREE_OPERAND (stmt, 1); + oprnd0 = TREE_OPERAND (expr, 0); + } + else + half_type = type; + } + + /* So far so good. Since last_stmt was detected as a (summation) reduction, + we know that oprnd1 is the reduction variable (defined by a loop-header + phi), and oprnd0 is an ssa-name defined by a stmt in the loop body. + Left to check that oprnd0 is defined by a (widen_)mult_expr */ + + prod_type = half_type; + stmt = SSA_NAME_DEF_STMT (oprnd0); + gcc_assert (stmt); + stmt_vinfo = vinfo_for_stmt (stmt); + gcc_assert (stmt_vinfo); + gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_loop_def); + expr = TREE_OPERAND (stmt, 1); + if (TREE_CODE (expr) != MULT_EXPR) + return NULL; + if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)) + { + /* Has been detected as a widening multiplication? 
*/ + + stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo); + expr = TREE_OPERAND (stmt, 1); + if (TREE_CODE (expr) != WIDEN_MULT_EXPR) + return NULL; + stmt_vinfo = vinfo_for_stmt (stmt); + gcc_assert (stmt_vinfo); + gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_loop_def); + oprnd00 = TREE_OPERAND (expr, 0); + oprnd01 = TREE_OPERAND (expr, 1); + } + else + { + tree half_type0, half_type1; + tree def_stmt; + tree oprnd0, oprnd1; + + oprnd0 = TREE_OPERAND (expr, 0); + oprnd1 = TREE_OPERAND (expr, 1); + if (TYPE_MAIN_VARIANT (TREE_TYPE (oprnd0)) + != TYPE_MAIN_VARIANT (prod_type) + || TYPE_MAIN_VARIANT (TREE_TYPE (oprnd1)) + != TYPE_MAIN_VARIANT (prod_type)) + return NULL; + if (!widened_name_p (oprnd0, stmt, &half_type0, &def_stmt)) + return NULL; + oprnd00 = TREE_OPERAND (TREE_OPERAND (def_stmt, 1), 0); + if (!widened_name_p (oprnd1, stmt, &half_type1, &def_stmt)) + return NULL; + oprnd01 = TREE_OPERAND (TREE_OPERAND (def_stmt, 1), 0); + if (TYPE_MAIN_VARIANT (half_type0) != TYPE_MAIN_VARIANT (half_type1)) + return NULL; + if (TYPE_PRECISION (prod_type) != TYPE_PRECISION (half_type0) * 2) + return NULL; + } + + half_type = TREE_TYPE (oprnd00); + *type_in = half_type; + *type_out = type; + + /* Pattern detected. Create a stmt to be used to replace the pattern: */ + pattern_expr = build3 (DOT_PROD_EXPR, type, oprnd00, oprnd01, oprnd1); + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "vect_recog_dot_prod_pattern: detected: "); + print_generic_expr (vect_dump, pattern_expr, TDF_SLIM); + } + return pattern_expr; +} + + +/* Function vect_recog_widen_mult_pattern + + Try to find the following pattern: + + type a_t, b_t; + TYPE a_T, b_T, prod_T; + + S1 a_t = ; + S2 b_t = ; + S3 a_T = (TYPE) a_t; + S4 b_T = (TYPE) b_t; + S5 prod_T = a_T * b_T; + + where type 'TYPE' is at least double the size of type 'type'. + + Input: + + * LAST_STMT: A stmt from which the pattern search begins. 
In the example, + when this function is called with S5, the pattern {S3,S4,S5} is be detected. + + Output: + + * TYPE_IN: The type of the input arguments to the pattern. + + * TYPE_OUT: The type of the output of this pattern. + + * Return value: A new stmt that will be used to replace the sequence of + stmts that constitute the pattern. In this case it will be: + WIDEN_MULT +*/ + +static tree +vect_recog_widen_mult_pattern (tree last_stmt ATTRIBUTE_UNUSED, + tree *type_in ATTRIBUTE_UNUSED, + tree *type_out ATTRIBUTE_UNUSED) +{ + /* Yet to be implemented. */ + return NULL; +} + + +/* Function vect_recog_widen_sum_pattern + + Try to find the following pattern: + + type x_t; + TYPE x_T, sum = init; + loop: + sum_0 = phi + S1 x_t = *p; + S2 x_T = (TYPE) x_t; + S3 sum_1 = x_T + sum_0; + + where type 'TYPE' is at least double the size of type 'type', i.e - we're + summing elements of type 'type' into an accumulator of type 'TYPE'. This is + a sepcial case of a reduction computation. + + Input: + + * LAST_STMT: A stmt from which the pattern search begins. In the example, + when this function is called with S3, the pattern {S2,S3} will be detected. + + Output: + + * TYPE_IN: The type of the input arguments to the pattern. + + * TYPE_OUT: The type of the output of this pattern. + + * Return value: A new stmt that will be used to replace the sequence of + stmts that constitute the pattern. In this case it will be: + WIDEN_SUM +*/ + +static tree +vect_recog_widen_sum_pattern (tree last_stmt, tree *type_in, tree *type_out) +{ + tree stmt, expr; + tree oprnd0, oprnd1; + stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt); + tree type, half_type; + tree pattern_expr; + + if (TREE_CODE (last_stmt) != MODIFY_EXPR) + return NULL; + + expr = TREE_OPERAND (last_stmt, 1); + type = TREE_TYPE (expr); + + /* Look for the following pattern + DX = (TYPE) X; + sum_1 = DX + sum_0; + In which DX is at least double the size of X, and sum_1 has been + recognized as a reduction variable. 
+ */ + + /* Starting from LAST_STMT, follow the defs of its uses in search + of the above pattern. */ + + if (TREE_CODE (expr) != PLUS_EXPR) + return NULL; + + if (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def) + return NULL; + + oprnd0 = TREE_OPERAND (expr, 0); + oprnd1 = TREE_OPERAND (expr, 1); + if (TYPE_MAIN_VARIANT (TREE_TYPE (oprnd0)) != TYPE_MAIN_VARIANT (type) + || TYPE_MAIN_VARIANT (TREE_TYPE (oprnd1)) != TYPE_MAIN_VARIANT (type)) + return NULL; + + /* So far so good. Since last_stmt was detected as a (summation) reduction, + we know that oprnd1 is the reduction variable (defined by a loop-header + phi), and oprnd0 is an ssa-name defined by a stmt in the loop body. + Left to check that oprnd0 is defined by a cast from type 'type' to type + 'TYPE'. */ + + if (!widened_name_p (oprnd0, last_stmt, &half_type, &stmt)) + return NULL; + + oprnd0 = TREE_OPERAND (TREE_OPERAND (stmt, 1), 0); + *type_in = half_type; + *type_out = type; + + /* Pattern detected. Create a stmt to be used to replace the pattern: */ + pattern_expr = build2 (WIDEN_SUM_EXPR, type, oprnd0, oprnd1); + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "vect_recog_widen_sum_pattern: detected: "); + print_generic_expr (vect_dump, pattern_expr, TDF_SLIM); + } + return pattern_expr; +} + + +/* Function vect_pattern_recog_1 + + Input: + PATTERN_RECOG_FUNC: A pointer to a function that detects a certain + computation pattern. + STMT: A stmt from which the pattern search should start. + + If PATTERN_RECOG_FUNC successfully detected the pattern, it creates an + expression that computes the same functionality and can be used to + replace the sequence of stmts that are involved in the pattern. + + Output: + This function checks if the expression returned by PATTERN_RECOG_FUNC is + supported in vector form by the target. We use 'TYPE_IN' to obtain the + relevant vector type. 
If 'TYPE_IN' is already a vector type, then this + indicates that target support had already been checked by PATTERN_RECOG_FUNC. + If 'TYPE_OUT' is also returned by PATTERN_RECOG_FUNC, we check that it fits + to the available target pattern. + + This function also does some bookeeping, as explained in the documentation + for vect_recog_pattern. */ + +static void +vect_pattern_recog_1 ( + tree (* vect_recog_func_ptr) (tree, tree *, tree *), + block_stmt_iterator si) +{ + tree stmt = bsi_stmt (si); + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + stmt_vec_info pattern_stmt_info; + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + tree pattern_expr; + tree pattern_vectype; + tree type_in, type_out; + tree pattern_type; + enum tree_code code; + tree var, var_name; + stmt_ann_t ann; + + pattern_expr = (* vect_recog_func_ptr) (stmt, &type_in, &type_out); + if (!pattern_expr) + return; + + if (VECTOR_MODE_P (TYPE_MODE (type_in))) + { + /* No need to check target support (already checked by the pattern + recognition function). */ + pattern_vectype = type_in; + } + else + { + enum tree_code vec_mode; + enum insn_code icode; + optab optab; + + /* Check target support */ + pattern_vectype = get_vectype_for_scalar_type (type_in); + optab = optab_for_tree_code (TREE_CODE (pattern_expr), pattern_vectype); + vec_mode = TYPE_MODE (pattern_vectype); + if (!optab + || (icode = optab->handlers[(int) vec_mode].insn_code) == + CODE_FOR_nothing + || (type_out + && (insn_data[icode].operand[0].mode != + TYPE_MODE (get_vectype_for_scalar_type (type_out))))) + return; + } + + /* Found a vectorizable pattern. */ + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "pattern recognized: "); + print_generic_expr (vect_dump, pattern_expr, TDF_SLIM); + } + + /* Mark the stmts that are involved in the pattern, + create a new stmt to express the pattern and insert it. 
*/ + code = TREE_CODE (pattern_expr); + pattern_type = TREE_TYPE (pattern_expr); + var = create_tmp_var (pattern_type, "patt"); + add_referenced_tmp_var (var); + var_name = make_ssa_name (var, NULL_TREE); + pattern_expr = build2 (MODIFY_EXPR, void_type_node, var_name, pattern_expr); + SSA_NAME_DEF_STMT (var_name) = pattern_expr; + bsi_insert_before (&si, pattern_expr, BSI_SAME_STMT); + ann = stmt_ann (pattern_expr); + set_stmt_info ((tree_ann_t)ann, new_stmt_vec_info (pattern_expr, loop_vinfo)); + pattern_stmt_info = vinfo_for_stmt (pattern_expr); + + STMT_VINFO_RELATED_STMT (pattern_stmt_info) = stmt; + STMT_VINFO_DEF_TYPE (pattern_stmt_info) = STMT_VINFO_DEF_TYPE (stmt_info); + STMT_VINFO_VECTYPE (pattern_stmt_info) = pattern_vectype; + STMT_VINFO_IN_PATTERN_P (stmt_info) = true; + STMT_VINFO_RELATED_STMT (stmt_info) = pattern_expr; + + return; +} + + +/* Function vect_pattern_recog + + Input: + LOOP_VINFO - a struct_loop_info of a loop in which we want to look for + computation idioms. + + Output - for each computation idiom that is detected we insert a new stmt + that provides the same functionality and that can be vectorized. We + also record some information in the struct_stmt_info of the relevant + stmts, as explained below: + + At the entry to this function we have the following stmts, with the + following initial value in the STMT_VINFO fields: + + stmt in_pattern_p related_stmt vec_stmt + S1: a_i = .... - - - + S2: a_2 = ..use(a_i).. - - - + S3: a_1 = ..use(a_2).. - - - + S4: a_0 = ..use(a_1).. - - - + S5: ... = ..use(a_0).. - - - + + Say the sequence {S1,S2,S3,S4} was detected as a pattern that can be + represented by a single stmt. We then: + - create a new stmt S6 that will replace the pattern. + - insert the new stmt S6 before the last stmt in the pattern + - fill in the STMT_VINFO fields as follows: + + in_pattern_p related_stmt vec_stmt + S1: a_i = .... - - - + S2: a_2 = ..use(a_i).. - - - + S3: a_1 = ..use(a_2).. - - - + > S6: a_new = .... 
- S4 - + S4: a_0 = ..use(a_1).. true S6 - + S5: ... = ..use(a_0).. - - - + + (the last stmt in the pattern (S4) and the new pattern stmt (S6) point + to each other through the RELATED_STMT field). + + S6 will be marked as relevant in vect_mark_stmts_to_be_vectorized instead + of S4 because it will replace all its uses. Stmts {S1,S2,S3} will + remain irrelevant unless used by stmts other than S4. + + If vectorization succeeds, vect_transform_stmt will skip over {S1,S2,S3} + (because they are marked as irrelevant). It will vectorize S6, and record + a pointer to the new vector stmt VS6 both from S6 (as usual), and also + from S4. We do that so that when we get to vectorizing stmts that use the + def of S4 (like S5 that uses a_0), we'll know where to take the relevant + vector-def from. S4 will be skipped, and S5 will be vectorized as usual: + + in_pattern_p related_stmt vec_stmt + S1: a_i = .... - - - + S2: a_2 = ..use(a_i).. - - - + S3: a_1 = ..use(a_2).. - - - + > VS6: va_new = .... - - - + S6: a_new = .... - S4 VS6 + S4: a_0 = ..use(a_1).. true S6 VS6 + > VS5: ... = ..vuse(va_new).. - - - + S5: ... = ..use(a_0).. - - - + + DCE could then get rid of {S1,S2,S3,S4,S5,S6} (if their defs are not used + elsewhere), and we'll end up with: + + VS6: va_new = .... + VS5: ... = ..vuse(va_new).. + + If vectorization does not succeed, DCE will clean S6 away (its def is + not used), and we'll end up with the original sequence. 
+*/ + +void +vect_pattern_recog (loop_vec_info loop_vinfo) +{ + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); + unsigned int nbbs = loop->num_nodes; + block_stmt_iterator si; + tree stmt; + unsigned int i, j; + tree (* vect_recog_func_ptr) (tree, tree *, tree *); + + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "=== vect_pattern_recog ==="); + + /* Scan through the loop stmts, applying the pattern recognition + functions starting at each stmt visited: */ + for (i = 0; i < nbbs; i++) + { + basic_block bb = bbs[i]; + for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si)) + { + stmt = bsi_stmt (si); + + /* Scan over all generic vect_recog_xxx_pattern functions. */ + for (j = 0; j < NUM_PATTERNS; j++) + { + vect_recog_func_ptr = vect_vect_recog_func_ptrs[j]; + vect_pattern_recog_1 (vect_recog_func_ptr, si); + } + } + } +} diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c index 42090f7..db0573c 100644 --- a/gcc/tree-vect-transform.c +++ b/gcc/tree-vect-transform.c @@ -1,5 +1,5 @@ /* Transformation Utilities for Loop Vectorization. - Copyright (C) 2003,2004,2005 Free Software Foundation, Inc. + Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc. Contributed by Dorit Naishlos This file is part of GCC. @@ -59,6 +59,7 @@ static void vect_finish_stmt_generation (tree stmt, tree vec_stmt, block_stmt_iterator *bsi); static bool vect_is_simple_cond (tree, loop_vec_info); static void update_vuses_to_preheader (tree, struct loop*); +static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree); static tree get_initial_def_for_reduction (tree, tree, tree *); /* Utility function dealing with loop peeling (not peeling itself). 
*/ @@ -656,6 +657,8 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def) switch (code) { + case WIDEN_SUM_EXPR: + case DOT_PROD_EXPR: case PLUS_EXPR: if (INTEGRAL_TYPE_P (type)) def = build_int_cst (type, 0); @@ -711,66 +714,66 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def) } -/* Function vect_create_epilog_for_reduction: +/* Function vect_create_epilog_for_reduction Create code at the loop-epilog to finalize the result of a reduction - computation. + computation. - LOOP_EXIT_VECT_DEF is a vector of partial results. We need to "reduce" it - into a single result, by applying the operation REDUC_CODE on the - partial-results-vector. For this, we need to create a new phi node at the - loop exit to preserve loop-closed form, as illustrated below. - - STMT is the original scalar reduction stmt that is being vectorized. - REDUCTION_OP is the scalar reduction-variable. + VECT_DEF is a vector of partial results. + REDUC_CODE is the tree-code for the epilog reduction. + STMT is the scalar reduction stmt that is being vectorized. REDUCTION_PHI is the phi-node that carries the reduction computation. - This function also sets the arguments for the REDUCTION_PHI: - The loop-entry argument is the (vectorized) initial-value of REDUCTION_OP. - The loop-latch argument is VECT_DEF - the vector of partial sums. - This function transforms this: + This function: + 1. Creates the reduction def-use cycle: sets the arguments for + REDUCTION_PHI: + The loop-entry argument is the vectorized initial-value of the reduction. + The loop-latch argument is VECT_DEF - the vector of partial sums. + 2. "Reduces" the vector of partial results VECT_DEF into a single result, + by applying the operation specified by REDUC_CODE if available, or by + other means (whole-vector shifts or a scalar loop). + The function also creates a new phi node at the loop exit to preserve + loop-closed form, as illustrated below. 
+ + The flow at the entry to this function: loop: - vec_def = phi # REDUCTION_PHI - .... - VECT_DEF = ... - + vec_def = phi # REDUCTION_PHI + VECT_DEF = vector_stmt # vectorized form of STMT + s_loop = scalar_stmt # (scalar) STMT loop_exit: - s_out0 = phi # EXIT_PHI - + s_out0 = phi # (scalar) EXIT_PHI use use - Into: + The above is transformed by this function into: loop: - vec_def = phi # REDUCTION_PHI - .... - VECT_DEF = ... - + vec_def = phi # REDUCTION_PHI + VECT_DEF = vector_stmt # vectorized form of STMT + s_loop = scalar_stmt # (scalar) STMT loop_exit: - s_out0 = phi # EXIT_PHI - v_out1 = phi # NEW_EXIT_PHI - - v_out2 = reduc_expr + s_out0 = phi # (scalar) EXIT_PHI + v_out1 = phi # NEW_EXIT_PHI + v_out2 = reduce s_out3 = extract_field - - use - use + s_out4 = adjust_result + use + use */ static void -vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, +vect_create_epilog_for_reduction (tree vect_def, tree stmt, enum tree_code reduc_code, tree reduction_phi) { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); - tree vectype = STMT_VINFO_VECTYPE (stmt_info); - enum machine_mode mode = TYPE_MODE (vectype); + tree vectype; + enum machine_mode mode; loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block exit_bb; - tree scalar_dest = TREE_OPERAND (stmt, 0); - tree scalar_type = TREE_TYPE (scalar_dest); + tree scalar_dest; + tree scalar_type; tree new_phi; block_stmt_iterator exit_bsi; tree vec_dest; @@ -786,7 +789,16 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, imm_use_iterator imm_iter; use_operand_p use_p; bool extract_scalar_result; + tree reduction_op; + tree orig_stmt; + tree operation = TREE_OPERAND (stmt, 1); + int op_type; + op_type = TREE_CODE_LENGTH (TREE_CODE (operation)); + reduction_op = TREE_OPERAND (operation, op_type-1); + vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); + mode = TYPE_MODE (vectype); 
+ /*** 1. Create the reduction def-use cycle ***/ /* 1.1 set the loop-entry arg of the reduction-phi: */ @@ -797,7 +809,6 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, &scalar_initial_def); add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop)); - /* 1.2 set the loop-latch arg for the reduction-phi: */ add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop)); @@ -810,7 +821,32 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, } - /*** 2. Create epilog code ***/ + /*** 2. Create epilog code + The reduction epilog code operates across the elements of the vector + of partial results computed by the vectorized loop. + The reduction epilog code consists of: + step 1: compute the scalar result in a vector (v_out2) + step 2: extract the scalar result (s_out3) from the vector (v_out2) + step 3: adjust the scalar result (s_out3) if needed. + + Step 1 can be accomplished using one of the following three schemes: + (scheme 1) using reduc_code, if available. + (scheme 2) using whole-vector shifts, if available. + (scheme 3) using a scalar loop. In this case steps 1+2 above are + combined. + + The overall epilog code looks like this: + + s_out0 = phi # original EXIT_PHI + v_out1 = phi # NEW_EXIT_PHI + v_out2 = reduce # step 1 + s_out3 = extract_field # step 2 + s_out4 = adjust_result # step 3 + + (step 3 is optional, and steps 1 and 2 may be combined). + Lastly, the uses of s_out0 are replaced by s_out4. + + ***/ /* 2.1 Create new loop-exit-phi to preserve loop-closed form: v_out1 = phi */ @@ -818,15 +854,39 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, exit_bb = loop->single_exit->dest; new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb); SET_PHI_ARG_DEF (new_phi, loop->single_exit->dest_idx, vect_def); - exit_bsi = bsi_start (exit_bb); - + /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 + (i.e. 
when reduc_code is not available) and in the final adjustment code + (if needed). Also get the original scalar reduction variable as + defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it + represents a reduction pattern), the tree-code and scalar-def are + taken from the original stmt that the pattern-stmt (STMT) replaces. + Otherwise (it is a regular reduction) - the tree-code and scalar-def + are taken from STMT. */ + + orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); + if (!orig_stmt) + { + /* Regular reduction */ + orig_stmt = stmt; + } + else + { + /* Reduction pattern */ + stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt); + gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)); + gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); + } + code = TREE_CODE (TREE_OPERAND (orig_stmt, 1)); + scalar_dest = TREE_OPERAND (orig_stmt, 0); + scalar_type = TREE_TYPE (scalar_dest); new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); bitsize = TYPE_SIZE (scalar_type); bytesize = TYPE_SIZE_UNIT (scalar_type); - /* 2.2 Create the reduction code. */ + /* 2.3 Create the reduction code, using one of the three schemes described + above. */ if (reduc_code < NUM_TREE_CODES) { @@ -849,16 +909,11 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, { enum tree_code shift_code = 0; bool have_whole_vector_shift = true; - enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1)); /* CHECKME */ int bit_offset; int element_bitsize = tree_low_cst (bitsize, 1); int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); tree vec_temp; - /* The result of the reduction is expected to be at the least - significant bits of the vector. This is merely convention, - as it's the extraction later that really matters, and that - is also under our control. 
*/ if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing) shift_code = VEC_RSHIFT_EXPR; else @@ -881,7 +936,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, if (have_whole_vector_shift) { - /*** Case 2: + /*** Case 2: Create: for (offset = VS/2; offset >= element_size; offset/=2) { Create: va' = vec_shift @@ -905,17 +960,12 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, new_name = make_ssa_name (vec_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_name; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); - epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, build2 (code, vectype, new_name, new_temp)); new_temp = make_ssa_name (vec_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); } extract_scalar_result = true; @@ -924,10 +974,11 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, { tree rhs; - /*** Case 3: - Create: + /*** Case 3: Create: s = extract_field - for (offset=element_size; offset Create: s = op @@ -938,18 +989,13 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, vec_temp = PHI_RESULT (new_phi); vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); - rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, bitsize_zero_node); - BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type); - epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, - rhs); + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, rhs); new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info 
(REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); for (bit_offset = element_bitsize; bit_offset < vec_size_in_bits; @@ -965,25 +1011,19 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, new_name = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_name; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); - epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, build2 (code, scalar_type, new_name, new_temp)); new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); } extract_scalar_result = false; } } - - /* 2.3 Extract the final scalar result. Create: + /* 2.4 Extract the final scalar result. Create: s_out3 = extract_field */ if (extract_scalar_result) @@ -993,7 +1033,6 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "extract scalar result"); - /* The result is in the low order bits. */ if (BYTES_BIG_ENDIAN) bitpos = size_binop (MULT_EXPR, bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), @@ -1007,17 +1046,14 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); } - /* 2.4 Adjust the final result by the initial value of the reduction - variable. (when such adjustment is not needed, then + variable. (When such adjustment is not needed, then 'scalar_initial_def' is zero). 
Create: - s_out = scalar_expr */ + s_out4 = scalar_expr */ if (scalar_initial_def) { @@ -1026,18 +1062,13 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); TREE_OPERAND (epilog_stmt, 0) = new_temp; bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - - if (vect_print_dump_info (REPORT_DETAILS)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); } + /* 2.6 Replace uses of s_out0 with uses of s_out3 */ - /* 2.5 Replace uses of s_out0 with uses of s_out3 */ - - /* Find the loop-closed-use at the loop exit of the original - scalar result. (The reduction result is expected to have - two immediate uses - one at the latch block, and one at the - loop exit). */ + /* Find the loop-closed-use at the loop exit of the original scalar result. + (The reduction result is expected to have two immediate uses - one at the + latch block, and one at the loop exit). */ exit_phi = NULL; FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) { @@ -1047,9 +1078,10 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, break; } } - + /* We expect to have found an exit_phi because of loop-closed-ssa form. */ + gcc_assert (exit_phi); + /* Replace the uses: */ orig_name = PHI_RESULT (exit_phi); - FOR_EACH_IMM_USE_SAFE (use_p, imm_iter, orig_name) SET_USE (use_p, new_temp); } @@ -1060,33 +1092,69 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, Check if STMT performs a reduction operation that can be vectorized. If VEC_STMT is also passed, vectorize the STMT: create a vectorized stmt to replace it, put it in VEC_STMT, and insert it at BSI. - Return FALSE if not a vectorizable STMT, TRUE otherwise. */ + Return FALSE if not a vectorizable STMT, TRUE otherwise. + + This function also handles reduction idioms (patterns) that have been + recognized in advance during vect_pattern_recog. 
In this case, STMT may be + of this form: + X = pattern_expr (arg0, arg1, ..., X) + and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original + sequence that had been detected and replaced by the pattern-stmt (STMT). + + In some cases of reduction patterns, the type of the reduction variable X is + different than the type of the other arguments of STMT. + In such cases, the vectype that is used when transforming STMT into a vector + stmt is different than the vectype that is used to determine the + vectorization factor, because it consists of a different number of elements + than the actual number of elements that are being operated upon in parallel. + + For example, consider an accumulation of shorts into an int accumulator. + On some targets it's possible to vectorize this pattern operating on 8 + shorts at a time (hence, the vectype for purposes of determining the + vectorization factor should be V8HI); on the other hand, the vectype that + is used to create the vector form is actually V4SI (the type of the result). + + Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that + indicates what is the actual level of parallelism (V8HI in the example), so + that the right vectorization factor would be derived. This vectype + corresponds to the type of arguments to the reduction stmt, and should *NOT* + be used to create the vectorized stmt. The right vectype for the vectorized + stmt is obtained from the type of the result X: + get_vectype_for_scalar_type (TREE_TYPE (X)) + + This means that, contrary to "regular" reductions (or "regular" stmts in + general), the following equation: + STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) + does *NOT* necessarily hold for reduction patterns. 
*/ bool vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) { tree vec_dest; tree scalar_dest; - tree op0, op1; - tree loop_vec_def; + tree op; + tree loop_vec_def0, loop_vec_def1; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = STMT_VINFO_VECTYPE (stmt_info); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree operation; - enum tree_code code, reduc_code = 0; + enum tree_code code, orig_code, epilog_reduc_code = 0; enum machine_mode vec_mode; int op_type; optab optab, reduc_optab; tree new_temp; - tree def0, def1, def_stmt0, def_stmt1; - enum vect_def_type dt0, dt1; + tree def, def_stmt; + enum vect_def_type dt; tree new_phi; tree scalar_type; - bool is_simple_use0; - bool is_simple_use1; + bool is_simple_use; + tree orig_stmt; + stmt_vec_info orig_stmt_info; + tree expr = NULL_TREE; + int i; - /* Is vectorizable reduction? */ + /* 1. Is vectorizable reduction? */ /* Not supportable if the reduction variable is used in the loop. */ if (STMT_VINFO_RELEVANT_P (stmt_info)) @@ -1095,43 +1163,68 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (!STMT_VINFO_LIVE_P (stmt_info)) return false; - /* Make sure it was already recognized as a reduction pattern. */ + /* Make sure it was already recognized as a reduction computation. */ if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def) return false; + /* 2. Has this been recognized as a reduction pattern? + + Check if STMT represents a pattern that has been recognized + in earlier analysis stages. For stmts that represent a pattern, + the STMT_VINFO_RELATED_STMT field records the last stmt in + the original sequence that constitutes the pattern. 
*/ + + orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); + if (orig_stmt) + { + orig_stmt_info = vinfo_for_stmt (orig_stmt); + gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt); + gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); + gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); + } + + /* 3. Check the operands of the operation. The first operands are defined + inside the loop body. The last operand is the reduction variable, + which is defined by the loop-header-phi. */ + gcc_assert (TREE_CODE (stmt) == MODIFY_EXPR); operation = TREE_OPERAND (stmt, 1); code = TREE_CODE (operation); op_type = TREE_CODE_LENGTH (code); - if (op_type != binary_op) + if (op_type != binary_op && op_type != ternary_op) return false; - - op0 = TREE_OPERAND (operation, 0); - op1 = TREE_OPERAND (operation, 1); scalar_dest = TREE_OPERAND (stmt, 0); scalar_type = TREE_TYPE (scalar_dest); - /* Check the first operand. It is expected to be defined inside the loop. */ - is_simple_use0 = - vect_is_simple_use (op0, loop_vinfo, &def_stmt0, &def0, &dt0); - is_simple_use1 = - vect_is_simple_use (op1, loop_vinfo, &def_stmt1, &def1, &dt1); - - gcc_assert (is_simple_use0); - gcc_assert (is_simple_use1); - gcc_assert (dt0 == vect_loop_def); - gcc_assert (dt1 == vect_reduction_def); - gcc_assert (TREE_CODE (def_stmt1) == PHI_NODE); - gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt1)); + /* All uses but the last are expected to be defined in the loop. + The last use is the reduction variable. 
*/ + for (i = 0; i < op_type-1; i++) + { + op = TREE_OPERAND (operation, i); + is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); + gcc_assert (is_simple_use); + gcc_assert (dt == vect_loop_def || dt == vect_invariant_def || + dt == vect_constant_def); + } - if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt1))) - return false; + op = TREE_OPERAND (operation, i); + is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); + gcc_assert (is_simple_use); + gcc_assert (dt == vect_reduction_def); + gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); + if (orig_stmt) + gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt)); + else + gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt)); + + if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt))) + return false; - /* Supportable by target? */ + /* 4. Supportable by target? */ - /* check support for the operation in the loop */ + /* 4.1. check support for the operation in the loop */ optab = optab_for_tree_code (code, vectype); if (!optab) { @@ -1162,21 +1255,69 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) return false; } - /* check support for the epilog operation */ - if (!reduction_code_for_scalar_code (code, &reduc_code)) + /* 4.2. Check support for the epilog operation. + + If STMT represents a reduction pattern, then the type of the + reduction variable may be different than the type of the rest + of the arguments. For example, consider the case of accumulation + of shorts into an int accumulator; The original code: + S1: int_a = (int) short_a; + orig_stmt-> S2: int_acc = plus ; + + was replaced with: + STMT: int_acc = widen_sum + + This means that: + 1. The tree-code that is used to create the vector operation in the + epilog code (that reduces the partial results) is not the + tree-code of STMT, but is rather the tree-code of the original + stmt from the pattern that STMT is replacing. 
I.e, in the example + above we want to use 'widen_sum' in the loop, but 'plus' in the + epilog. + 2. The type (mode) we use to check available target support + for the vector operation to be created in the *epilog*, is + determined by the type of the reduction variable (in the example + above we'd check this: plus_optab[vect_int_mode]). + However the type (mode) we use to check available target support + for the vector operation to be created *inside the loop*, is + determined by the type of the other arguments to STMT (in the + example we'd check this: widen_sum_optab[vect_short_mode]). + + This is contrary to "regular" reductions, in which the types of all + the arguments are the same as the type of the reduction variable. + For "regular" reductions we can therefore use the same vector type + (and also the same tree-code) when generating the epilog code and + when generating the code inside the loop. */ + + if (orig_stmt) + { + /* This is a reduction pattern: get the vectype from the type of the + reduction variable, and get the tree-code from orig_stmt. */ + orig_code = TREE_CODE (TREE_OPERAND (orig_stmt, 1)); + vectype = get_vectype_for_scalar_type (TREE_TYPE (def)); + vec_mode = TYPE_MODE (vectype); + } + else + { + /* Regular reduction: use the same vectype and tree-code as used for + the vector code inside the loop can be used for the epilog code. 
*/ + orig_code = code; + } + + if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code)) return false; - reduc_optab = optab_for_tree_code (reduc_code, vectype); + reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype); if (!reduc_optab) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "no optab for reduction."); - reduc_code = NUM_TREE_CODES; + epilog_reduc_code = NUM_TREE_CODES; } if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "reduc op not supported by target."); - reduc_code = NUM_TREE_CODES; + epilog_reduc_code = NUM_TREE_CODES; } if (!vec_stmt) /* transformation not required. */ @@ -1193,25 +1334,31 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* Create the destination vector */ vec_dest = vect_create_destination_var (scalar_dest, vectype); - /* Create the reduction-phi that defines the reduction-operand. */ new_phi = create_phi_node (vec_dest, loop->header); - /* Prepare the operand that is defined inside the loop body */ - loop_vec_def = vect_get_vec_def_for_operand (op0, stmt, NULL); + op = TREE_OPERAND (operation, 0); + loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL); + if (op_type == binary_op) + expr = build2 (code, vectype, loop_vec_def0, PHI_RESULT (new_phi)); + else if (op_type == ternary_op) + { + op = TREE_OPERAND (operation, 1); + loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL); + expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1, + PHI_RESULT (new_phi)); + } /* Create the vectorized operation that computes the partial results */ - *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, - build2 (code, vectype, loop_vec_def, PHI_RESULT (new_phi))); + *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, expr); new_temp = make_ssa_name (vec_dest, *vec_stmt); TREE_OPERAND (*vec_stmt, 0) = new_temp; vect_finish_stmt_generation (stmt, *vec_stmt, bsi); - /* 
Finalize the reduction-phi (set it's arguments) and create the epilog reduction code. */ - vect_create_epilog_for_reduction (new_temp, stmt, op1, reduc_code, new_phi); + vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi); return true; } @@ -2040,6 +2187,7 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi) bool is_store = false; tree vec_stmt = NULL_TREE; stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + tree orig_stmt_in_pattern; bool done; if (STMT_VINFO_RELEVANT_P (stmt_info)) @@ -2078,7 +2226,25 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi) gcc_unreachable (); } + gcc_assert (vec_stmt); STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt; + orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info); + if (orig_stmt_in_pattern) + { + stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern); + if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)) + { + gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); + + /* STMT was inserted by the vectorizer to replace a computation + idiom. ORIG_STMT_IN_PATTERN is a stmt in the original + sequence that computed this idiom. We need to record a pointer + to VEC_STMT in the stmt_info of ORIG_STMT_IN_PATTERN. See more + detail in the documentation of vect_pattern_recog. */ + + STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt; + } + } } if (STMT_VINFO_LIVE_P (stmt_info)) diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index d4c6989..f03a2a2 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -1,5 +1,5 @@ /* Loop Vectorization - Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc. Contributed by Dorit Naishlos This file is part of GCC. 
@@ -1361,6 +1361,8 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo) STMT_VINFO_LIVE_P (res) = 0; STMT_VINFO_VECTYPE (res) = NULL; STMT_VINFO_VEC_STMT (res) = NULL; + STMT_VINFO_IN_PATTERN_P (res) = false; + STMT_VINFO_RELATED_STMT (res) = NULL; STMT_VINFO_DATA_REF (res) = NULL; if (TREE_CODE (stmt) == PHI_NODE) STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type; diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 4f7fd95..c5b1378 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1,5 +1,5 @@ /* Loop Vectorization - Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc. Contributed by Dorit Naishlos This file is part of GCC. @@ -43,10 +43,11 @@ enum vect_var_kind { vect_scalar_var }; -/* Defines type of operation: unary or binary. */ +/* Defines type of operation. */ enum operation_type { unary_op = 1, - binary_op + binary_op, + ternary_op }; /* Define type of available alignment support. */ @@ -204,6 +205,20 @@ typedef struct _stmt_vec_info { /* Information about the data-ref (access function, etc). */ struct data_reference *data_ref_info; + /* Stmt is part of some pattern (computation idiom) */ + bool in_pattern_p; + + /* Used for various bookkeeping purposes, generally holding a pointer to + some other stmt S that is in some way "related" to this stmt. + Current use of this field is: + If this stmt is part of a pattern (i.e. the field 'in_pattern_p' is + true): S is the "pattern stmt" that represents (and replaces) the + sequence of stmts that constitutes the pattern. Similarly, the + related_stmt of the "pattern stmt" points back to this stmt (which is + the last stmt in the original sequence of stmts that constitutes the + pattern). */ + tree related_stmt; + /* List of datarefs that are known to have the same alignment as the dataref of this stmt. 
*/ VEC(dr_p,heap) *same_align_refs; @@ -222,6 +237,8 @@ typedef struct _stmt_vec_info { #define STMT_VINFO_VECTYPE(S) (S)->vectype #define STMT_VINFO_VEC_STMT(S) (S)->vectorized_stmt #define STMT_VINFO_DATA_REF(S) (S)->data_ref_info +#define STMT_VINFO_IN_PATTERN_P(S) (S)->in_pattern_p +#define STMT_VINFO_RELATED_STMT(S) (S)->related_stmt #define STMT_VINFO_SAME_ALIGN_REFS(S) (S)->same_align_refs #define STMT_VINFO_DEF_TYPE(S) (S)->def_type @@ -312,7 +329,6 @@ extern bool vect_can_force_dr_alignment_p (tree, unsigned int); extern enum dr_alignment_support vect_supportable_dr_alignment (struct data_reference *); extern bool reduction_code_for_scalar_code (enum tree_code, enum tree_code *); - /* Creation and deletion of loop and stmt info structs. */ extern loop_vec_info new_loop_vec_info (struct loop *loop); extern void destroy_loop_vec_info (loop_vec_info); @@ -320,10 +336,21 @@ extern stmt_vec_info new_stmt_vec_info (tree stmt, loop_vec_info); /* Main driver. */ extern void vectorize_loops (struct loops *); + /** In tree-vect-analyze.c **/ /* Driver for analysis stage. */ extern loop_vec_info vect_analyze_loop (struct loop *); + +/** In tree-vect-patterns.c **/ +/* Pattern recognition functions. + Additional pattern recognition functions can (and will) be added + in the future. */ +typedef tree (* vect_recog_func_ptr) (tree, tree *, tree *); +#define NUM_PATTERNS 3 +void vect_pattern_recog (loop_vec_info); + + /** In tree-vect-transform.c **/ extern bool vectorizable_load (tree, block_stmt_iterator *, tree *); extern bool vectorizable_store (tree, block_stmt_iterator *, tree *); diff --git a/gcc/tree.def b/gcc/tree.def index 9e7e5b0..3cd03fd 100644 --- a/gcc/tree.def +++ b/gcc/tree.def @@ -1,7 +1,7 @@ /* This file contains the definitions and documentation for the tree codes used in GCC. - Copyright (C) 1987, 1988, 1993, 1995, 1997, 1998, 2000, 2001, 2004, 2005 - Free Software Foundation, Inc. 
+ Copyright (C) 1987, 1988, 1993, 1995, 1997, 1998, 2000, 2001, 2004, 2005, + 2006 Free Software Foundation, Inc. This file is part of GCC. @@ -1073,6 +1073,33 @@ DEFTREECODE (REDUC_MAX_EXPR, "reduc_max_expr", tcc_unary, 1) DEFTREECODE (REDUC_MIN_EXPR, "reduc_min_expr", tcc_unary, 1) DEFTREECODE (REDUC_PLUS_EXPR, "reduc_plus_expr", tcc_unary, 1) +/* Widening dot-product. + The first two arguments are of type t1. + The third argument and the result are of type t2, such that t2 is at least + twice the size of t1. DOT_PROD_EXPR(arg1,arg2,arg3) is equivalent to: + tmp = WIDEN_MULT_EXPR(arg1, arg2); + arg3 = PLUS_EXPR (tmp, arg3); + or: + tmp = WIDEN_MULT_EXPR(arg1, arg2); + arg3 = WIDEN_SUM_EXPR (tmp, arg3); */ +DEFTREECODE (DOT_PROD_EXPR, "dot_prod_expr", tcc_expression, 3) + +/* Widening summation. + The first argument is of type t1. + The second argument is of type t2, such that t2 is at least twice + the size of t1. The type of the entire expression is also t2. + WIDEN_SUM_EXPR is equivalent to first widening (promoting) + the first argument from type t1 to type t2, and then summing it + with the second argument. */ +DEFTREECODE (WIDEN_SUM_EXPR, "widen_sum_expr", tcc_binary, 2) + +/* Widening multiplication. + The two arguments are of type t1. + The result is of type t2, such that t2 is at least twice + the size of t1. WIDEN_MULT_EXPR is equivalent to first widening (promoting) + the arguments from type t1 to type t2, and then multiplying them. */ +DEFTREECODE (WIDEN_MULT_EXPR, "widen_mult_expr", tcc_binary, 2) + /* Whole vector left/right shift in bits. Operand 0 is a vector to be shifted. Operand 1 is an integer shift amount in bits. */ -- cgit v1.1