aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--gcc/ChangeLog68
-rw-r--r--gcc/Makefile.in5
-rw-r--r--gcc/config/i386/sse.md44
-rw-r--r--gcc/config/rs6000/altivec.md73
-rw-r--r--gcc/doc/md.texi21
-rw-r--r--gcc/expr.c25
-rw-r--r--gcc/genopinit.c6
-rw-r--r--gcc/optabs.c159
-rw-r--r--gcc/optabs.h19
-rw-r--r--gcc/testsuite/ChangeLog19
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c70
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8.c111
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16.c77
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8.c101
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1.c60
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c67
-rw-r--r--gcc/testsuite/gcc.dg/vect/vect.exp10
-rw-r--r--gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8.c108
-rwxr-xr-xgcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-pattern-2.c59
-rw-r--r--gcc/testsuite/lib/target-supports.exp106
-rw-r--r--gcc/tree-inline.c6
-rw-r--r--gcc/tree-pretty-print.c24
-rw-r--r--gcc/tree-ssa-operands.c3
-rw-r--r--gcc/tree-vect-analyze.c75
-rw-r--r--gcc/tree-vect-generic.c7
-rw-r--r--gcc/tree-vect-patterns.c637
-rw-r--r--gcc/tree-vect-transform.c422
-rw-r--r--gcc/tree-vectorizer.c4
-rw-r--r--gcc/tree-vectorizer.h35
-rw-r--r--gcc/tree.def31
30 files changed, 2284 insertions, 168 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 6fd7bfd..510e1a7 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,71 @@
+2006-01-19 Dorit Nuzman <dorit@il.ibm.com>
+
+ * Makefile.in (tree-vect-patterns.o): Add rule for new file.
+ * tree-vect-analyze.c (vect_determine_vectorization_factor): Use
+ existing STMT_VINFO_VECTYPE if available.
+ (vect_mark_relevant): Add special handling for stmts that are
+ marked as STMT_VINFO_IN_PATTERN_P.
+ (vect_analyze_loop): Call vect_pattern_recog.
+ * tree-vectorizer.c (new_stmt_vec_info): Initialize new fields.
+ * tree-vectorizer.h (in_pattern_p, related_stmt): New fields in
+ stmt_info.
+ (STMT_VINFO_IN_PATTERN_P, STMT_VINFO_RELATED_STMT): New macros.
+ (vect_recog_func_ptr): New function-pointer type.
+ * tree-vect-patterns.c: New file.
+ (vect_recog_widen_sum_pattern, vect_recog_widen_mult_pattern):
+ (vect_recog_dot_prod_pattern, vect_pattern_recog):
+ (vect_pattern_recog_1): New functions.
+ (vect_pattern_recog_funcs): New array of function pointers.
+
+ * tree-vectorizer.h (ternary_op): New enum value.
+ * tree-vect-transform.c (vect_create_epilog_for_reduction): Added
+ declaration. Revised documentation. Removed redundant dump prints.
+ Removed redundant argument. Added support for reduction patterns.
+ (vectorizable_reduction): Added support for reduction patterns.
+ (vect_transform_stmt): Added support for patterns.
+
+ * expr.c (expand_expr_real_1): Added case for DOT_PROD_EXPR.
+ * genopinit.c (udot_prod_optab, sdot_prod_optab): Initialize.
+ * optabs.c (optab_for_tree_code): Added case for DOT_PROD_EXPR.
+ (expand_widen_pattern_expr): New function.
+ (init_optabs): Initialize new optabs udot_prod_optab,
+ sdot_prod_optab.
+ * optabs.h (OTI_sdot_prod, OTI_udot_prod): New.
+ (sdot_prod_optab, udot_prod_optab): Define new optabs.
+ (expand_widen_pattern_expr): New function declaration.
+ * tree.def (DOT_PROD_EXPR, WIDEN_SUM_EXPR, WIDEN_MULT_EXPR): New
+ tree-codes.
+ * tree-inline.c (estimate_num_insns_1): Added cases for new
+ tree-codes DOT_PROD_EXPR, WIDEN_SUM_EXPR, WIDEN_MULT_EXPR.
+ * tree-pretty-print.c (dump_generic_node): Likewise.
+ (op_prio): Likewise.
+ (op_symbol): Added cases for WIDEN_SUM_EXPR, WIDEN_MULT_EXPR.
+ * tree-ssa-operands.c (get_expr_operands): Added case for
+ DOT_PROD_EXPR.
+ * tree-vect-patterns.c (widened_name_p): New function.
+ (vect_recog_dot_prod_pattern): Added function implementation.
+ * tree-vect-transform.c (get_initial_def_for_reduction): Added
+ cases for DOT_PROD_EXPR, WIDEN_SUM_EXPR.
+ * config/rs6000/altivec.md (udot_prod<mode>, sdot_prodv8hi): New.
+ * config/i386/sse.md (sdot_prodv8hi, udot_prodv4si): New.
+
+ * expr.c (expand_expr_real_1): Added case for WIDEN_SUM_EXPR.
+ * genopinit.c (widen_ssum_optab, widen_usum_optab): Initialize.
+ * optabs.c (optab_for_tree_code): Added case for WIDEN_SUM_EXPR.
+ (init_optabs): Initialize new optabs widen_ssum_optab,
+ widen_usum_optab.
+ * optabs.h (OTI_widen_ssum, OTI_widen_usum): New.
+ (widen_ssum_optab, widen_usum_optab): Define new optabs.
+ * tree-vect-generic.c: (expand_vector_operations_1): Check type of
+ use instead of type of def.
+ * tree-vect-patterns.c (vect_recog_widen_sum_pattern): Added
+ function implementation.
+ * config/rs6000/altivec.md (widen_usum<mode>, widen_ssumv16qi,
+ widen_ssumv8hi): New.
+
+ * doc/md.texi (ssum_widen, usum_widen, sdot_prod, udot_prod): New
+ patterns.
+
2006-01-19 Richard Sandiford <richard@codesourcery.com>
PR c/25805
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index a8bd984..e1c6a1d 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -967,6 +967,7 @@ OBJS-common = \
tree-vect-generic.o tree-ssa-loop.o tree-ssa-loop-niter.o \
tree-ssa-loop-manip.o tree-ssa-threadupdate.o \
tree-vectorizer.o tree-vect-analyze.o tree-vect-transform.o \
+ tree-vect-patterns.o \
tree-ssa-loop-ivcanon.o tree-ssa-propagate.o tree-ssa-address.o \
tree-ssa-math-opts.o \
tree-ssa-loop-ivopts.o tree-if-conv.o tree-ssa-loop-unswitch.o \
@@ -2065,6 +2066,10 @@ tree-vect-analyze.o: tree-vect-analyze.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
$(TM_H) $(GGC_H) $(OPTABS_H) $(TREE_H) $(BASIC_BLOCK_H) \
$(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) $(CFGLOOP_H) \
tree-vectorizer.h $(TREE_DATA_REF_H) $(SCEV_H) $(EXPR_H) tree-chrec.h
+tree-vect-patterns.o: tree-vect-patterns.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
+ $(TM_H) errors.h $(GGC_H) $(OPTABS_H) $(TREE_H) $(RTL_H) $(BASIC_BLOCK_H) \
+ diagnostic.h $(TREE_FLOW_H) $(TREE_DUMP_H) $(TIMEVAR_H) cfgloop.h \
+ tree-vectorizer.h tree-data-ref.h $(EXPR_H)
tree-vect-transform.o: tree-vect-transform.c $(CONFIG_H) $(SYSTEM_H) \
coretypes.h $(TM_H) $(GGC_H) $(OPTABS_H) $(RECOG_H) $(TREE_H) $(RTL_H) \
$(BASIC_BLOCK_H) $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TREE_DUMP_H) \
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bd943f7..88c7adf 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1,5 +1,5 @@
;; GCC machine description for SSE instructions
-;; Copyright (C) 2005
+;; Copyright (C) 2005, 2006
;; Free Software Foundation, Inc.
;;
;; This file is part of GCC.
@@ -2700,6 +2700,48 @@
DONE;
})
+(define_expand "sdot_prodv8hi"
+ [(match_operand:V4SI 0 "register_operand" "")
+ (match_operand:V8HI 1 "nonimmediate_operand" "")
+ (match_operand:V8HI 2 "nonimmediate_operand" "")
+ (match_operand:V4SI 3 "register_operand" "")]
+ "TARGET_SSE2"
+{
+ rtx t = gen_reg_rtx (V4SImode);
+ emit_insn (gen_sse2_pmaddwd (t, operands[1], operands[2]));
+ emit_insn (gen_addv4si3 (operands[0], operands[3], t));
+ DONE;
+})
+
+(define_expand "udot_prodv4si"
+ [(match_operand:V2DI 0 "register_operand" "")
+ (match_operand:V4SI 1 "register_operand" "")
+ (match_operand:V4SI 2 "register_operand" "")
+ (match_operand:V2DI 3 "register_operand" "")]
+ "TARGET_SSE2"
+{
+ rtx t1, t2, t3, t4;
+
+ t1 = gen_reg_rtx (V2DImode);
+ emit_insn (gen_sse2_umulv2siv2di3 (t1, operands[1], operands[2]));
+ emit_insn (gen_addv2di3 (t1, t1, operands[3]));
+
+ t2 = gen_reg_rtx (V4SImode);
+ t3 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2),
+ gen_lowpart (TImode, operands[1]),
+ GEN_INT (32)));
+ emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3),
+ gen_lowpart (TImode, operands[2]),
+ GEN_INT (32)));
+
+ t4 = gen_reg_rtx (V2DImode);
+ emit_insn (gen_sse2_umulv2siv2di3 (t4, t2, t3));
+
+ emit_insn (gen_addv2di3 (operands[0], t1, t4));
+ DONE;
+})
+
(define_insn "ashr<mode>3"
[(set (match_operand:SSEMODE24 0 "register_operand" "=x")
(ashiftrt:SSEMODE24
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 26ec2be..d4bf08e 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -1,5 +1,5 @@
;; AltiVec patterns.
-;; Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+;; Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
;; Contributed by Aldy Hernandez (aldy@quesejoda.com)
;; This file is part of GCC.
@@ -2150,6 +2150,77 @@
DONE;
}")
+(define_expand "udot_prod<mode>"
+ [(set (match_operand:V4SI 0 "register_operand" "=v")
+ (plus:V4SI (match_operand:V4SI 3 "register_operand" "v")
+ (unspec:V4SI [(match_operand:VIshort 1 "register_operand" "v")
+ (match_operand:VIshort 2 "register_operand" "v")]
+ UNSPEC_VMSUMU)))]
+ "TARGET_ALTIVEC"
+ "
+{
+ emit_insn (gen_altivec_vmsumu<VI_char>m (operands[0], operands[1], operands[2], operands[3]));
+ DONE;
+}")
+
+(define_expand "sdot_prodv8hi"
+ [(set (match_operand:V4SI 0 "register_operand" "=v")
+ (plus:V4SI (match_operand:V4SI 3 "register_operand" "v")
+ (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")
+ (match_operand:V8HI 2 "register_operand" "v")]
+ UNSPEC_VMSUMSHM)))]
+ "TARGET_ALTIVEC"
+ "
+{
+ emit_insn (gen_altivec_vmsumshm (operands[0], operands[1], operands[2], operands[3]));
+ DONE;
+}")
+
+(define_expand "widen_usum<mode>3"
+ [(set (match_operand:V4SI 0 "register_operand" "=v")
+ (plus:V4SI (match_operand:V4SI 2 "register_operand" "v")
+ (unspec:V4SI [(match_operand:VIshort 1 "register_operand" "v")]
+ UNSPEC_VMSUMU)))]
+ "TARGET_ALTIVEC"
+ "
+{
+ rtx vones = gen_reg_rtx (GET_MODE (operands[1]));
+
+ emit_insn (gen_altivec_vspltis<VI_char> (vones, const1_rtx));
+ emit_insn (gen_altivec_vmsumu<VI_char>m (operands[0], operands[1], vones, operands[2]));
+ DONE;
+}")
+
+(define_expand "widen_ssumv16qi3"
+ [(set (match_operand:V4SI 0 "register_operand" "=v")
+ (plus:V4SI (match_operand:V4SI 2 "register_operand" "v")
+ (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v")]
+ UNSPEC_VMSUMM)))]
+ "TARGET_ALTIVEC"
+ "
+{
+ rtx vones = gen_reg_rtx (V16QImode);
+
+ emit_insn (gen_altivec_vspltisb (vones, const1_rtx));
+ emit_insn (gen_altivec_vmsummbm (operands[0], operands[1], vones, operands[2]));
+ DONE;
+}")
+
+(define_expand "widen_ssumv8hi3"
+ [(set (match_operand:V4SI 0 "register_operand" "=v")
+ (plus:V4SI (match_operand:V4SI 2 "register_operand" "v")
+ (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v")]
+ UNSPEC_VMSUMSHM)))]
+ "TARGET_ALTIVEC"
+ "
+{
+ rtx vones = gen_reg_rtx (V8HImode);
+
+ emit_insn (gen_altivec_vspltish (vones, const1_rtx));
+ emit_insn (gen_altivec_vmsumshm (operands[0], operands[1], vones, operands[2]));
+ DONE;
+}")
+
(define_expand "negv4sf2"
[(use (match_operand:V4SF 0 "register_operand" ""))
(use (match_operand:V4SF 1 "register_operand" ""))]
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 90efcc3..b6dd838 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -1,5 +1,5 @@
@c Copyright (C) 1988, 1989, 1992, 1993, 1994, 1996, 1998, 1999, 2000, 2001,
-@c 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+@c 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
@c This is part of the GCC manual.
@c For copying conditions, see the file gcc.texi.
@@ -3099,6 +3099,25 @@ Compute the sum of the unsigned elements of a vector. The vector is operand 1,
and the scalar result is stored in the least significant bits of operand 0
(also a vector). The output and input vector should have the same modes.
+@cindex @code{sdot_prod@var{m}} instruction pattern
+@item @samp{sdot_prod@var{m}}
+@cindex @code{udot_prod@var{m}} instruction pattern
+@item @samp{udot_prod@var{m}}
+Compute the sum of the products of two signed/unsigned elements.
+Operand 1 and operand 2 are of the same mode. Their product, which is of a
+wider mode, is computed and added to operand 3. Operand 3 is of a mode equal or
+wider than the mode of the product. The result is placed in operand 0, which
+is of the same mode as operand 3.
+
+@cindex @code{ssum_widen@var{m3}} instruction pattern
+@item @samp{ssum_widen@var{m3}}
+@cindex @code{usum_widen@var{m3}} instruction pattern
+@item @samp{usum_widen@var{m3}}
+Operands 0 and 2 are of the same mode, which is wider than the mode of
+operand 1. Add operand 1 to operand 2 and place the widened result in
+operand 0. (This is used to express accumulation of elements into an accumulator
+of a wider mode.)
+
@cindex @code{vec_shl_@var{m}} instruction pattern
@cindex @code{vec_shr_@var{m}} instruction pattern
@item @samp{vec_shl_@var{m}}, @samp{vec_shr_@var{m}}
diff --git a/gcc/expr.c b/gcc/expr.c
index 92048ff..b15b43c 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -8553,6 +8553,31 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode,
return temp;
}
+ case DOT_PROD_EXPR:
+ {
+ tree oprnd0 = TREE_OPERAND (exp, 0);
+ tree oprnd1 = TREE_OPERAND (exp, 1);
+ tree oprnd2 = TREE_OPERAND (exp, 2);
+ rtx op2;
+
+ expand_operands (oprnd0, oprnd1, NULL_RTX, &op0, &op1, 0);
+ op2 = expand_expr (oprnd2, NULL_RTX, VOIDmode, 0);
+ target = expand_widen_pattern_expr (exp, op0, op1, op2,
+ target, unsignedp);
+ return target;
+ }
+
+ case WIDEN_SUM_EXPR:
+ {
+ tree oprnd0 = TREE_OPERAND (exp, 0);
+ tree oprnd1 = TREE_OPERAND (exp, 1);
+
+ expand_operands (oprnd0, oprnd1, NULL_RTX, &op0, &op1, 0);
+ target = expand_widen_pattern_expr (exp, op0, NULL_RTX, op1,
+ target, unsignedp);
+ return target;
+ }
+
case REDUC_MAX_EXPR:
case REDUC_MIN_EXPR:
case REDUC_PLUS_EXPR:
diff --git a/gcc/genopinit.c b/gcc/genopinit.c
index ec8076b..d958220 100644
--- a/gcc/genopinit.c
+++ b/gcc/genopinit.c
@@ -1,6 +1,6 @@
/* Generate code to initialize optabs from machine description.
Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
- 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
This file is part of GCC.
@@ -203,6 +203,10 @@ static const char * const optabs[] =
"vec_realign_load_optab->handlers[$A].insn_code = CODE_FOR_$(vec_realign_load_$a$)",
"vcond_gen_code[$A] = CODE_FOR_$(vcond$a$)",
"vcondu_gen_code[$A] = CODE_FOR_$(vcondu$a$)",
+ "ssum_widen_optab->handlers[$A].insn_code = CODE_FOR_$(widen_ssum$I$a3$)",
+ "usum_widen_optab->handlers[$A].insn_code = CODE_FOR_$(widen_usum$I$a3$)",
+ "udot_prod_optab->handlers[$A].insn_code = CODE_FOR_$(udot_prod$I$a$)",
+ "sdot_prod_optab->handlers[$A].insn_code = CODE_FOR_$(sdot_prod$I$a$)",
"reduc_smax_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_smax_$a$)",
"reduc_umax_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_umax_$a$)",
"reduc_smin_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_smin_$a$)",
diff --git a/gcc/optabs.c b/gcc/optabs.c
index 5a87ac0..da5251c 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -294,6 +294,12 @@ optab_for_tree_code (enum tree_code code, tree type)
case REALIGN_LOAD_EXPR:
return vec_realign_load_optab;
+ case WIDEN_SUM_EXPR:
+ return TYPE_UNSIGNED (type) ? usum_widen_optab : ssum_widen_optab;
+
+ case DOT_PROD_EXPR:
+ return TYPE_UNSIGNED (type) ? udot_prod_optab : sdot_prod_optab;
+
case REDUC_MAX_EXPR:
return TYPE_UNSIGNED (type) ? reduc_umax_optab : reduc_smax_optab;
@@ -337,6 +343,154 @@ optab_for_tree_code (enum tree_code code, tree type)
}
+/* Expand vector widening operations.
+
+ There are two different classes of operations handled here:
+ 1) Operations whose result is wider than all the arguments to the operation.
+ Examples: VEC_UNPACK_HI/LO_EXPR, VEC_WIDEN_MULT_HI/LO_EXPR
+ In this case OP0 and optionally OP1 would be initialized,
+ but WIDE_OP wouldn't (not relevant for this case).
+ 2) Operations whose result is of the same size as the last argument to the
+ operation, but wider than all the other arguments to the operation.
+ Examples: WIDEN_SUM_EXPR, VEC_DOT_PROD_EXPR.
+ In the case WIDE_OP, OP0 and optionally OP1 would be initialized.
+
+ E.g, when called to expand the following operations, this is how
+ the arguments will be initialized:
+ nops OP0 OP1 WIDE_OP
+ widening-sum 2 oprnd0 - oprnd1
+ widening-dot-product 3 oprnd0 oprnd1 oprnd2
+ widening-mult 2 oprnd0 oprnd1 -
+ type-promotion (vec-unpack) 1 oprnd0 - - */
+
+rtx
+expand_widen_pattern_expr (tree exp, rtx op0, rtx op1, rtx wide_op, rtx target,
+ int unsignedp)
+{
+ tree oprnd0, oprnd1, oprnd2;
+ enum machine_mode wmode = 0, tmode0, tmode1 = 0;
+ optab widen_pattern_optab;
+ int icode;
+ enum machine_mode xmode0, xmode1 = 0, wxmode = 0;
+ rtx temp;
+ rtx pat;
+ rtx xop0, xop1, wxop;
+ int nops = TREE_CODE_LENGTH (TREE_CODE (exp));
+
+ oprnd0 = TREE_OPERAND (exp, 0);
+ tmode0 = TYPE_MODE (TREE_TYPE (oprnd0));
+ widen_pattern_optab =
+ optab_for_tree_code (TREE_CODE (exp), TREE_TYPE (oprnd0));
+ icode = (int) widen_pattern_optab->handlers[(int) tmode0].insn_code;
+ gcc_assert (icode != CODE_FOR_nothing);
+ xmode0 = insn_data[icode].operand[1].mode;
+
+ if (nops >= 2)
+ {
+ oprnd1 = TREE_OPERAND (exp, 1);
+ tmode1 = TYPE_MODE (TREE_TYPE (oprnd1));
+ xmode1 = insn_data[icode].operand[2].mode;
+ }
+
+ /* The last operand is of a wider mode than the rest of the operands. */
+ if (nops == 2)
+ {
+ wmode = tmode1;
+ wxmode = xmode1;
+ }
+ else if (nops == 3)
+ {
+ gcc_assert (tmode1 == tmode0);
+ gcc_assert (op1);
+ oprnd2 = TREE_OPERAND (exp, 2);
+ wmode = TYPE_MODE (TREE_TYPE (oprnd2));
+ wxmode = insn_data[icode].operand[3].mode;
+ }
+
+ if (!wide_op)
+ wmode = wxmode = insn_data[icode].operand[0].mode;
+
+ if (!target
+ || ! (*insn_data[icode].operand[0].predicate) (target, wmode))
+ temp = gen_reg_rtx (wmode);
+ else
+ temp = target;
+
+ xop0 = op0;
+ xop1 = op1;
+ wxop = wide_op;
+
+ /* In case the insn wants input operands in modes different from
+ those of the actual operands, convert the operands. It would
+ seem that we don't need to convert CONST_INTs, but we do, so
+ that they're properly zero-extended, sign-extended or truncated
+ for their mode. */
+
+ if (GET_MODE (op0) != xmode0 && xmode0 != VOIDmode)
+ xop0 = convert_modes (xmode0,
+ GET_MODE (op0) != VOIDmode
+ ? GET_MODE (op0)
+ : tmode0,
+ xop0, unsignedp);
+
+ if (op1)
+ if (GET_MODE (op1) != xmode1 && xmode1 != VOIDmode)
+ xop1 = convert_modes (xmode1,
+ GET_MODE (op1) != VOIDmode
+ ? GET_MODE (op1)
+ : tmode1,
+ xop1, unsignedp);
+
+ if (wide_op)
+ if (GET_MODE (wide_op) != wxmode && wxmode != VOIDmode)
+ wxop = convert_modes (wxmode,
+ GET_MODE (wide_op) != VOIDmode
+ ? GET_MODE (wide_op)
+ : wmode,
+ wxop, unsignedp);
+
+ /* Now, if insn's predicates don't allow our operands, put them into
+ pseudo regs. */
+
+ if (! (*insn_data[icode].operand[1].predicate) (xop0, xmode0)
+ && xmode0 != VOIDmode)
+ xop0 = copy_to_mode_reg (xmode0, xop0);
+
+ if (op1)
+ {
+ if (! (*insn_data[icode].operand[2].predicate) (xop1, xmode1)
+ && xmode1 != VOIDmode)
+ xop1 = copy_to_mode_reg (xmode1, xop1);
+
+ if (wide_op)
+ {
+ if (! (*insn_data[icode].operand[3].predicate) (wxop, wxmode)
+ && wxmode != VOIDmode)
+ wxop = copy_to_mode_reg (wxmode, wxop);
+
+ pat = GEN_FCN (icode) (temp, xop0, xop1, wxop);
+ }
+ else
+ pat = GEN_FCN (icode) (temp, xop0, xop1);
+ }
+ else
+ {
+ if (wide_op)
+ {
+ if (! (*insn_data[icode].operand[2].predicate) (wxop, wxmode)
+ && wxmode != VOIDmode)
+ wxop = copy_to_mode_reg (wxmode, wxop);
+
+ pat = GEN_FCN (icode) (temp, xop0, wxop);
+ }
+ else
+ pat = GEN_FCN (icode) (temp, xop0);
+ }
+
+ emit_insn (pat);
+ return temp;
+}
+
/* Generate code to perform an operation specified by TERNARY_OPTAB
on operands OP0, OP1 and OP2, with result having machine-mode MODE.
@@ -5139,6 +5293,11 @@ init_optabs (void)
reduc_splus_optab = init_optab (UNKNOWN);
reduc_uplus_optab = init_optab (UNKNOWN);
+ ssum_widen_optab = init_optab (UNKNOWN);
+ usum_widen_optab = init_optab (UNKNOWN);
+ sdot_prod_optab = init_optab (UNKNOWN);
+ udot_prod_optab = init_optab (UNKNOWN);
+
vec_extract_optab = init_optab (UNKNOWN);
vec_set_optab = init_optab (UNKNOWN);
vec_init_optab = init_optab (UNKNOWN);
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 78cf53b..58fb690 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -1,5 +1,6 @@
/* Definitions for code generation pass of GNU compiler.
- Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006
+ Free Software Foundation, Inc.
This file is part of GCC.
@@ -241,6 +242,14 @@ enum optab_index
OTI_reduc_splus,
OTI_reduc_uplus,
+ /* Summation, with result machine mode one or more wider than args. */
+ OTI_ssum_widen,
+ OTI_usum_widen,
+
+ /* Dot product, with result machine mode one or more wider than args. */
+ OTI_sdot_prod,
+ OTI_udot_prod,
+
/* Set specified field of vector operand. */
OTI_vec_set,
/* Extract specified field of vector operand. */
@@ -367,6 +376,11 @@ extern GTY(()) optab optab_table[OTI_MAX];
#define reduc_umin_optab (optab_table[OTI_reduc_umin])
#define reduc_splus_optab (optab_table[OTI_reduc_splus])
#define reduc_uplus_optab (optab_table[OTI_reduc_uplus])
+
+#define ssum_widen_optab (optab_table[OTI_ssum_widen])
+#define usum_widen_optab (optab_table[OTI_usum_widen])
+#define sdot_prod_optab (optab_table[OTI_sdot_prod])
+#define udot_prod_optab (optab_table[OTI_udot_prod])
#define vec_set_optab (optab_table[OTI_vec_set])
#define vec_extract_optab (optab_table[OTI_vec_extract])
@@ -495,6 +509,9 @@ extern enum insn_code sync_lock_release[NUM_MACHINE_MODES];
/* Define functions given in optabs.c. */
+extern rtx expand_widen_pattern_expr (tree exp, rtx op0, rtx op1, rtx wide_op,
+ rtx target, int unsignedp);
+
extern rtx expand_ternary_op (enum machine_mode mode, optab ternary_optab,
rtx op0, rtx op1, rtx op2, rtx target,
int unsignedp);
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 5a5e470..c54f6b0 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,22 @@
+2006-01-19 Dorit Nuzman <dorit@il.ibm.com>
+
+ * lib/target-supports.exp (check_effective_target_vect_sdot_qi): New.
+ (check_effective_target_vect_udot_qi): New.
+ (check_effective_target_vect_sdot_hi): New.
+ (check_effective_target_vect_udot_hi): New.
+ * gcc.dg/vect/vect.exp: Use dump-details, and compile testcases
+ prefixed with "wrapv-" with -fwrapv.
+ * gcc.dg/vect/wrapv-vect-reduc-dot-s8.c: New.
+ * gcc.dg/vect/vect-reduc-dot-u8.c: New.
+ * gcc.dg/vect/vect-reduc-dot-u16.c: New.
+ * gcc.dg/vect/vect-reduc-dot-s8.c: New.
+ * gcc.dg/vect/vect-reduc-dot-s16.c: New.
+
+ * lib/target-supports.exp (check_effective_target_vect_widen_sum): New.
+ * gcc.dg/vect/vect-reduc-pattern-1.c: New.
+ * gcc.dg/vect/vect-reduc-pattern-2.c: New.
+ * gcc.dg/vect/wrapv-vect-reduc-pattern-2.c: New.
+
2006-01-19 Volker Reichelt <reichelt@igpm.rwth-aachen.de>
PR c++/16829
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c
new file mode 100644
index 0000000..ddffc10
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s16.c
@@ -0,0 +1,70 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+#define DOT1 43680
+#define DOT2 43680
+
+signed short X[N] __attribute__ ((__aligned__(16)));
+signed short Y[N] __attribute__ ((__aligned__(16)));
+
+/* short->short->int dot product.
+ Not detected as a dot-product pattern.
+ Currently fails to be vectorized due to presence of type conversions. */
+int
+foo1(int len) {
+ int i;
+ int result = 0;
+ short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[i] * Y[i];
+ result += prod;
+ }
+ return result;
+}
+
+/* short->int->int dot product.
+ Detected as a dot-product pattern.
+ Vectorized on targets that support dot-product for signed shorts. */
+int
+foo2(int len) {
+ int i;
+ int result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+
+int main (void)
+{
+ int i, dot1, dot2;
+
+ check_vect ();
+
+ for (i=0; i<N; i++) {
+ X[i] = i;
+ Y[i] = 64-i;
+ }
+
+ dot1 = foo1 (N);
+ if (dot1 != DOT1)
+ abort ();
+
+ dot2 = foo2 (N);
+ if (dot2 != DOT2)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_sdot_hi } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8.c
new file mode 100644
index 0000000..8e5d480
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-s8.c
@@ -0,0 +1,111 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+#define DOT1 43680
+#define DOT2 -21856
+#define DOT3 43680
+
+signed char X[N] __attribute__ ((__aligned__(16)));
+signed char Y[N] __attribute__ ((__aligned__(16)));
+
+/* char->short->int dot product.
+ The dot-product pattern should be detected.
+ Vectorizable on vect_sdot_qi targets (targets that support dot-product of
+ signed chars).
+
+ In the future could also be vectorized as widening-mult + widening-summation,
+ or with type-conversion support.
+ */
+int
+foo1(int len) {
+ int i;
+ int result = 0;
+ short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[i] * Y[i];
+ result += prod;
+ }
+ return result;
+}
+
+/* char->short->short dot product.
+ The dot-product pattern should be detected.
+ The reduction is currently not vectorized because of the signed->unsigned->signed
+ casts, since this patch:
+
+ 2005-12-26 Kazu Hirata <kazu@codesourcery.com>
+
+ PR tree-optimization/25125
+
+ When the dot-product is detected, the loop should be vectorized on vect_sdot_qi
+ targets (targets that support dot-product of signed char).
+ This test would currently fail to vectorize on targets that support
+ dot-product of chars when the accumulator is int.
+
+ In the future could also be vectorized as widening-mult + summation,
+ or with type-conversion support.
+ */
+short
+foo2(int len) {
+ int i;
+ short result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+/* char->int->int dot product.
+ Not detected as a dot-product pattern.
+ Currently fails to be vectorized due to presence of type conversions. */
+int
+foo3(int len) {
+ int i;
+ int result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+int main (void)
+{
+ int i, dot1, dot3;
+ short dot2;
+
+ check_vect ();
+
+ for (i=0; i<N; i++) {
+ X[i] = i;
+ Y[i] = 64-i;
+ }
+
+ dot1 = foo1 (N);
+ if (dot1 != DOT1)
+ abort ();
+
+ dot2 = foo2 (N);
+ if (dot2 != DOT2)
+ abort ();
+
+ dot3 = foo3 (N);
+ if (dot3 != DOT3)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" } } */
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_sdot_qi } } } */
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16.c
new file mode 100644
index 0000000..03db7e0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u16.c
@@ -0,0 +1,77 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+#define DOT1 43680
+#define DOT2 43680
+
+unsigned short X[N] __attribute__ ((__aligned__(16)));
+unsigned short Y[N] __attribute__ ((__aligned__(16)));
+
+/* short->short->int dot product.
+ Not detected as a dot-product pattern.
+ Not vectorized due to presence of type-conversions. */
+unsigned int
+foo1(int len) {
+ int i;
+ unsigned int result = 0;
+ unsigned short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[i] * Y[i];
+ result += prod;
+ }
+ return result;
+}
+
+/* short->int->int dot product.
+ Currently not detected as a dot-product pattern: the multiplication
+ promotes the ushorts to int, and then the product is promoted to unsigned
+ int for the addition. Which results in an int->unsigned int cast, which
+ since no bits are modified in the cast should be trivially vectorizable. */
+unsigned int
+foo2(int len) {
+ int i;
+ unsigned int result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+
+int main (void)
+{
+ unsigned int dot1, dot2;
+ int i;
+
+ check_vect ();
+
+ for (i=0; i<N; i++) {
+ X[i] = i;
+ Y[i] = 64-i;
+ }
+
+ dot1 = foo1 (N);
+ if (dot1 != DOT1)
+ abort ();
+
+ dot2 = foo2 (N);
+ if (dot2 != DOT2)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */
+
+/* Once the dot-product pattern is detected in the second loop, we expect
+ that loop to be vectorized on vect_udot_hi targets (targets that support
+ dot-product of unsigned shorts). */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8.c
new file mode 100644
index 0000000..ad68bc7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-dot-u8.c
@@ -0,0 +1,101 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+#define DOT1 43680
+#define DOT2 43680
+#define DOT3 43680
+
+unsigned char X[N] __attribute__ ((__aligned__(16)));
+unsigned char Y[N] __attribute__ ((__aligned__(16)));
+
+/* char->short->int dot product.
+ Detected as a dot-product pattern.
+ Should be vectorized on targets that support dot-product for unsigned chars.
+ */
+unsigned int
+foo1(int len) {
+ int i;
+ unsigned int result = 0;
+ unsigned short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[i] * Y[i];
+ result += prod;
+ }
+ return result;
+}
+
+/* char->short->short dot product.
+ Detected as a dot-product pattern.
+ Should be vectorized on targets that support dot-product for unsigned chars.
+ This test currently fails to vectorize on targets that support dot-product
+ of chars only when the accumulator is int.
+ */
+unsigned short
+foo2(int len) {
+ int i;
+ unsigned short result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (unsigned short)(X[i] * Y[i]);
+ }
+ return result;
+}
+
+/* char->int->int dot product.
+ Not detected as a dot-product.
+ Doesn't get vectorized due to presence of type conversions. */
+unsigned int
+foo3(int len) {
+ int i;
+ unsigned int result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+int main (void)
+{
+ unsigned int dot1, dot3;
+ unsigned short dot2;
+ int i;
+
+ check_vect ();
+
+ for (i=0; i<N; i++) {
+ X[i] = i;
+ Y[i] = 64-i;
+ }
+
+ dot1 = foo1 (N);
+ if (dot1 != DOT1)
+ abort ();
+
+ dot2 = foo2 (N);
+ if (dot2 != DOT2)
+ abort ();
+
+ dot3 = foo3 (N);
+ if (dot3 != DOT3)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 2 "vect" } } */
+
+/* When the vectorizer is enhanced to vectorize foo2 (accumulation into short) for
+ targets that support accumulation into int (powerpc, ia64) we'd have:
+dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_udot_qi } }
+*/
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_udot_qi } } } */
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1.c
new file mode 100644
index 0000000..61f1da1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-1.c
@@ -0,0 +1,60 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define SH_SUM 210
+#define CH_SUM 120
+
+int main1 ()
+{
+ int i;
+ unsigned short udata_sh[N] = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28};
+ unsigned char udata_ch[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+ unsigned int intsum = 0;
+ unsigned short shortsum = 0;
+
+ /* widening sum: sum shorts into int. */
+ for (i = 0; i < N; i++){
+ intsum += udata_sh[i];
+ }
+
+ /* check results: */
+ if (intsum != SH_SUM)
+ abort ();
+
+ /* widening sum: sum chars into int. */
+ intsum = 0;
+ for (i = 0; i < N; i++){
+ intsum += udata_ch[i];
+ }
+
+ /* check results: */
+ if (intsum != CH_SUM)
+ abort ();
+
+ /* widening sum: sum chars into short.
+ pattern detected, but not vectorized yet. */
+ for (i = 0; i < N; i++){
+ shortsum += udata_ch[i];
+ }
+
+ /* check results: */
+ if (shortsum != CH_SUM)
+ abort ();
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 3 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_widen_sum } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c
new file mode 100644
index 0000000..5423c43
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-pattern-2.c
@@ -0,0 +1,67 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define SH_SUM 210
+#define CH_SUM 120
+
+int main1 ()
+{
+ int i;
+ signed short data_sh[N] = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28};
+ signed char data_ch[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+ signed int intsum = 0;
+ signed short shortsum = 0;
+
+ /* widening sum: sum shorts into int. */
+ for (i = 0; i < N; i++){
+ intsum += data_sh[i];
+ }
+
+ /* check results: */
+ if (intsum != SH_SUM)
+ abort ();
+
+ /* widening sum: sum chars into int. */
+ intsum = 0;
+ for (i = 0; i < N; i++){
+ intsum += data_ch[i];
+ }
+
+ /* check results: */
+ if (intsum != CH_SUM)
+ abort ();
+
+ /* widening sum: sum chars into short.
+ The widening-summation pattern is currently not detected because of this
+ patch:
+
+ 2005-12-26 Kazu Hirata <kazu@codesourcery.com>
+
+ PR tree-optimization/25125
+ */
+ for (i = 0; i < N; i++){
+ shortsum += data_ch[i];
+ }
+
+ /* check results: */
+ if (shortsum != CH_SUM)
+ abort ();
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 3 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_widen_sum } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp
index bfa6dce..9cf78ff 100644
--- a/gcc/testsuite/gcc.dg/vect/vect.exp
+++ b/gcc/testsuite/gcc.dg/vect/vect.exp
@@ -1,4 +1,4 @@
-# Copyright (C) 1997, 2004 Free Software Foundation, Inc.
+# Copyright (C) 1997, 2004, 2005, 2006 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -78,7 +78,7 @@ dg-init
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/nodump-*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
-lappend DEFAULT_VECTCFLAGS "-ftree-vectorizer-verbose=4" "-fdump-tree-vect-stats"
+lappend DEFAULT_VECTCFLAGS "-fdump-tree-vect-details"
# Main loop.
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/pr*.\[cS\]]] \
@@ -96,6 +96,12 @@ lappend DEFAULT_VECTCFLAGS "-ffast-math"
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-vect*.\[cS\]]] \
"" $DEFAULT_VECTCFLAGS
+# -fwrapv tests
+set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
+lappend DEFAULT_VECTCFLAGS "-fwrapv"
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/wrapv-vect*.\[cS\]]] \
+ "" $DEFAULT_VECTCFLAGS
+
# -ftrapv tests
set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
lappend DEFAULT_VECTCFLAGS "-ftrapv"
diff --git a/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8.c b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8.c
new file mode 100644
index 0000000..b11b9c7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-dot-s8.c
@@ -0,0 +1,108 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+#define DOT1 43680
+#define DOT2 -21856
+#define DOT3 43680
+
+signed char X[N] __attribute__ ((__aligned__(16)));
+signed char Y[N] __attribute__ ((__aligned__(16)));
+
+/* char->short->int dot product.
+ The dot-product pattern should be detected.
+ Vectorizable on vect_sdot_qi targets (targets that support dot-product of
+ signed chars).
+
+ In the future could also be vectorized as widening-mult + widening-summation,
+ or with type-conversion support.
+ */
+int
+foo1(int len) {
+ int i;
+ int result = 0;
+ short prod;
+
+ for (i=0; i<len; i++) {
+ prod = X[i] * Y[i];
+ result += prod;
+ }
+ return result;
+}
+
+/* char->short->short dot product.
+ The dot-product pattern should be detected.
+ Should be vectorized on vect_sdot_qi targets (targets that support
+ dot-product of signed char).
+ This test currently fails to vectorize on targets that support
+ dot-product of chars when the accumulator is int.
+
+ In the future could also be vectorized as widening-mult + summation,
+ or with type-conversion support.
+ */
+short
+foo2(int len) {
+ int i;
+ short result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+/* char->int->int dot product.
+ Not detected as a dot-product pattern.
+ Currently fails to be vectorized due to presence of type conversions. */
+int
+foo3(int len) {
+ int i;
+ int result = 0;
+
+ for (i=0; i<len; i++) {
+ result += (X[i] * Y[i]);
+ }
+ return result;
+}
+
+int main (void)
+{
+ int i, dot1, dot3;
+ short dot2;
+
+ check_vect ();
+
+ for (i=0; i<N; i++) {
+ X[i] = i;
+ Y[i] = 64-i;
+ }
+
+ dot1 = foo1 (N);
+ if (dot1 != DOT1)
+ abort ();
+
+ dot2 = foo2 (N);
+ if (dot2 != DOT2)
+ abort ();
+
+ dot3 = foo3 (N);
+ if (dot3 != DOT3)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 2 "vect" } } */
+
+/* When vectorizer is enhanced to vectorize foo2 (accumulation into short) for targets
+ that support accumulation into int (ia64) we'd have:
+dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_sdot_qi } }
+*/
+/* In the meantime expect: */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_sdot_qi } } } */
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-pattern-2.c b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-pattern-2.c
new file mode 100755
index 0000000..6c844ea
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/wrapv-vect-reduc-pattern-2.c
@@ -0,0 +1,59 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 16
+#define SH_SUM 210
+#define CH_SUM 120
+
+int main1 ()
+{
+ int i;
+ signed short data_sh[N] = {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28};
+ signed char data_ch[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+ signed int intsum = 0;
+ signed short shortsum = 0;
+
+ /* widening sum: sum shorts into int. */
+ for (i = 0; i < N; i++){
+ intsum += data_sh[i];
+ }
+
+ /* check results: */
+ if (intsum != SH_SUM)
+ abort ();
+
+ /* widening sum: sum chars into int. */
+ intsum = 0;
+ for (i = 0; i < N; i++){
+ intsum += data_ch[i];
+ }
+
+ /* check results: */
+ if (intsum != CH_SUM)
+ abort ();
+
+ /* widening sum: sum chars into short. */
+ for (i = 0; i < N; i++){
+ shortsum += data_ch[i];
+ }
+
+ /* check results: */
+ if (shortsum != CH_SUM)
+ abort ();
+
+ return 0;
+}
+
+int main (void)
+{
+ check_vect ();
+
+ return main1 ();
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_sum_pattern: detected" 3 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_widen_sum } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index cadef96..05a180e 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1364,6 +1364,112 @@ proc check_effective_target_vect_no_bitwise { } {
return $et_vect_no_bitwise_saved
}
+# Return 1 if the target plus current options supports a vector
+# widening summation, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_widen_sum { } {
+ global et_vect_widen_sum
+
+ if [info exists et_vect_widen_sum_saved] {
+ verbose "check_effective_target_vect_widen_sum: using cached result" 2
+ } else {
+ set et_vect_widen_sum_saved 0
+ if { [istarget powerpc*-*-*]
+ || [istarget ia64-*-*] } {
+ set et_vect_widen_sum_saved 1
+ }
+ }
+ verbose "check_effective_target_vect_widen_sum: returning $et_vect_widen_sum_saved" 2
+ return $et_vect_widen_sum_saved
+}
+
+# Return 1 if the target plus current options supports a vector
+# dot-product of signed chars, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_sdot_qi { } {
+ global et_vect_sdot_qi
+
+ if [info exists et_vect_sdot_qi_saved] {
+ verbose "check_effective_target_vect_sdot_qi: using cached result" 2
+ } else {
+ set et_vect_sdot_qi_saved 0
+ if { [istarget ia64-*-*] } {
+ set et_vect_sdot_qi_saved 1
+ }
+ }
+ verbose "check_effective_target_vect_sdot_qi: returning $et_vect_sdot_qi_saved" 2
+ return $et_vect_sdot_qi_saved
+}
+
+# Return 1 if the target plus current options supports a vector
+# dot-product of unsigned chars, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_udot_qi { } {
+ global et_vect_udot_qi
+
+ if [info exists et_vect_udot_qi_saved] {
+ verbose "check_effective_target_vect_udot_qi: using cached result" 2
+ } else {
+ set et_vect_udot_qi_saved 0
+ if { [istarget powerpc*-*-*]
+ || [istarget ia64-*-*] } {
+ set et_vect_udot_qi_saved 1
+ }
+ }
+ verbose "check_effective_target_vect_udot_qi: returning $et_vect_udot_qi_saved" 2
+ return $et_vect_udot_qi_saved
+}
+
+# Return 1 if the target plus current options supports a vector
+# dot-product of signed shorts, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_sdot_hi { } {
+ global et_vect_sdot_hi
+
+ if [info exists et_vect_sdot_hi_saved] {
+ verbose "check_effective_target_vect_sdot_hi: using cached result" 2
+ } else {
+ set et_vect_sdot_hi_saved 0
+ if { [istarget powerpc*-*-*]
+ || [istarget i?86-*-*]
+ || [istarget x86_64-*-*]
+ || [istarget ia64-*-*] } {
+ set et_vect_sdot_hi_saved 1
+ }
+ }
+ verbose "check_effective_target_vect_sdot_hi: returning $et_vect_sdot_hi_saved" 2
+ return $et_vect_sdot_hi_saved
+}
+
+# Return 1 if the target plus current options supports a vector
+# dot-product of unsigned shorts, 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_udot_hi { } {
+ global et_vect_udot_hi
+
+ if [info exists et_vect_udot_hi_saved] {
+ verbose "check_effective_target_vect_udot_hi: using cached result" 2
+ } else {
+ set et_vect_udot_hi_saved 0
+ if { [istarget powerpc*-*-*] } {
+ set et_vect_udot_hi_saved 1
+ }
+ }
+ verbose "check_effective_target_vect_udot_hi: returning $et_vect_udot_hi_saved" 2
+ return $et_vect_udot_hi_saved
+}
+
+
# Return 1 if the target plus current options does not support a vector
# alignment mechanism, 0 otherwise.
#
diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c
index 7030b92..9a93427 100644
--- a/gcc/tree-inline.c
+++ b/gcc/tree-inline.c
@@ -1,5 +1,5 @@
/* Tree inlining.
- Copyright 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
Contributed by Alexandre Oliva <aoliva@redhat.com>
This file is part of GCC.
@@ -1728,6 +1728,10 @@ estimate_num_insns_1 (tree *tp, int *walk_subtrees, void *data)
case REDUC_MAX_EXPR:
case REDUC_MIN_EXPR:
case REDUC_PLUS_EXPR:
+ case WIDEN_SUM_EXPR:
+ case DOT_PROD_EXPR:
+
+ case WIDEN_MULT_EXPR:
case RESX_EXPR:
*count += 1;
diff --git a/gcc/tree-pretty-print.c b/gcc/tree-pretty-print.c
index d7e3391..4e50b8d 100644
--- a/gcc/tree-pretty-print.c
+++ b/gcc/tree-pretty-print.c
@@ -1,5 +1,6 @@
/* Pretty formatting of GENERIC trees in C syntax.
- Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006
+ Free Software Foundation, Inc.
Adapted from c-pretty-print.c by Diego Novillo <dnovillo@redhat.com>
This file is part of GCC.
@@ -1168,6 +1169,8 @@ dump_generic_node (pretty_printer *buffer, tree node, int spc, int flags,
break;
/* Binary arithmetic and logic expressions. */
+ case WIDEN_SUM_EXPR:
+ case WIDEN_MULT_EXPR:
case MULT_EXPR:
case PLUS_EXPR:
case MINUS_EXPR:
@@ -1686,6 +1689,16 @@ dump_generic_node (pretty_printer *buffer, tree node, int spc, int flags,
pp_string (buffer, " > ");
break;
+ case DOT_PROD_EXPR:
+ pp_string (buffer, " DOT_PROD_EXPR < ");
+ dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false);
+ pp_string (buffer, " , ");
+ dump_generic_node (buffer, TREE_OPERAND (node, 1), spc, flags, false);
+ pp_string (buffer, " , ");
+ dump_generic_node (buffer, TREE_OPERAND (node, 2), spc, flags, false);
+ pp_string (buffer, " > ");
+ break;
+
case OMP_PARALLEL:
pp_string (buffer, "#pragma omp parallel");
dump_omp_clauses (buffer, OMP_PARALLEL_CLAUSES (node), spc, flags);
@@ -2105,10 +2118,13 @@ op_prio (tree op)
case RROTATE_EXPR:
return 11;
+ case WIDEN_SUM_EXPR:
case PLUS_EXPR:
case MINUS_EXPR:
return 12;
+ case WIDEN_MULT_EXPR:
+ case DOT_PROD_EXPR:
case MULT_EXPR:
case TRUNC_DIV_EXPR:
case CEIL_DIV_EXPR:
@@ -2263,6 +2279,12 @@ op_symbol_1 (enum tree_code code)
case REDUC_PLUS_EXPR:
return "r+";
+ case WIDEN_SUM_EXPR:
+ return "w+";
+
+ case WIDEN_MULT_EXPR:
+ return "w*";
+
case NEGATE_EXPR:
case MINUS_EXPR:
return "-";
diff --git a/gcc/tree-ssa-operands.c b/gcc/tree-ssa-operands.c
index 57cfedc..e3b95e7 100644
--- a/gcc/tree-ssa-operands.c
+++ b/gcc/tree-ssa-operands.c
@@ -1,5 +1,5 @@
/* SSA operands management for trees.
- Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
This file is part of GCC.
@@ -1273,6 +1273,7 @@ get_expr_operands (tree stmt, tree *expr_p, int flags)
return;
}
+ case DOT_PROD_EXPR:
case REALIGN_LOAD_EXPR:
{
get_expr_operands (stmt, &TREE_OPERAND (expr, 0), flags);
diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c
index ab749fb..c5882d4 100644
--- a/gcc/tree-vect-analyze.c
+++ b/gcc/tree-vect-analyze.c
@@ -1,5 +1,5 @@
/* Analysis Utilities for Loop Vectorization.
- Copyright (C) 2003,2004,2005 Free Software Foundation, Inc.
+ Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>
This file is part of GCC.
@@ -142,35 +142,46 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
return false;
}
- if (STMT_VINFO_DATA_REF (stmt_info))
- scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
- else if (TREE_CODE (stmt) == MODIFY_EXPR)
- scalar_type = TREE_TYPE (TREE_OPERAND (stmt, 0));
- else
- scalar_type = TREE_TYPE (stmt);
+ if (STMT_VINFO_VECTYPE (stmt_info))
+ {
+ vectype = STMT_VINFO_VECTYPE (stmt_info);
+ scalar_type = TREE_TYPE (vectype);
+ }
+ else
+ {
+ if (STMT_VINFO_DATA_REF (stmt_info))
+ scalar_type =
+ TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
+ else if (TREE_CODE (stmt) == MODIFY_EXPR)
+ scalar_type = TREE_TYPE (TREE_OPERAND (stmt, 0));
+ else
+ scalar_type = TREE_TYPE (stmt);
- if (vect_print_dump_info (REPORT_DETAILS))
- {
- fprintf (vect_dump, "get vectype for scalar type: ");
- print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
- }
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "get vectype for scalar type: ");
+ print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+ }
- vectype = get_vectype_for_scalar_type (scalar_type);
- if (!vectype)
- {
- if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
- {
- fprintf (vect_dump, "not vectorized: unsupported data-type ");
- print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
- }
- return false;
+ vectype = get_vectype_for_scalar_type (scalar_type);
+ if (!vectype)
+ {
+ if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))
+ {
+ fprintf (vect_dump,
+ "not vectorized: unsupported data-type ");
+ print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+ }
+ return false;
+ }
+ STMT_VINFO_VECTYPE (stmt_info) = vectype;
}
+
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "vectype: ");
print_generic_expr (vect_dump, vectype, TDF_SLIM);
}
- STMT_VINFO_VECTYPE (stmt_info) = vectype;
nunits = TYPE_VECTOR_SUBPARTS (vectype);
if (vect_print_dump_info (REPORT_DETAILS))
@@ -1439,6 +1450,24 @@ vect_mark_relevant (VEC(tree,heap) **worklist, tree stmt,
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "mark relevant %d, live %d.",relevant_p, live_p);
+ if (STMT_VINFO_IN_PATTERN_P (stmt_info))
+ {
+ tree pattern_stmt;
+
+ /* This is the last stmt in a sequence that was detected as a
+ pattern that can potentially be vectorized. Don't mark the stmt
+ as relevant/live because it's not going to be vectorized.
+ Instead mark the pattern-stmt that replaces it. */
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "last stmt in pattern. don't mark relevant/live.");
+ pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+ stmt_info = vinfo_for_stmt (pattern_stmt);
+ gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == stmt);
+ save_relevant_p = STMT_VINFO_RELEVANT_P (stmt_info);
+ save_live_p = STMT_VINFO_LIVE_P (stmt_info);
+ stmt = pattern_stmt;
+ }
+
STMT_VINFO_LIVE_P (stmt_info) |= live_p;
STMT_VINFO_RELEVANT_P (stmt_info) |= relevant_p;
@@ -2002,6 +2031,8 @@ vect_analyze_loop (struct loop *loop)
vect_analyze_scalar_cycles (loop_vinfo);
+ vect_pattern_recog (loop_vinfo);
+
/* Data-flow analysis to detect stmts that do not need to be vectorized. */
ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c
index cc834e4f..dd58cb9 100644
--- a/gcc/tree-vect-generic.c
+++ b/gcc/tree-vect-generic.c
@@ -1,5 +1,5 @@
/* Lower vector operations to scalar operations.
- Copyright (C) 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2004, 2005, 2006 Free Software Foundation, Inc.
This file is part of GCC.
@@ -411,6 +411,11 @@ expand_vector_operations_1 (block_stmt_iterator *bsi)
gcc_assert (code != CONVERT_EXPR);
op = optab_for_tree_code (code, type);
+ /* For widening vector operations, the relevant type is that of the arguments,
+ not the widened result. */
+ if (code == WIDEN_SUM_EXPR)
+ type = TREE_TYPE (TREE_OPERAND (rhs, 0));
+
/* Optabs will try converting a negation into a subtraction, so
look for it as well. TODO: negation of floating-point vectors
might be turned into an exclusive OR toggling the sign bit. */
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
new file mode 100644
index 0000000..0ef76fe
--- /dev/null
+++ b/gcc/tree-vect-patterns.c
@@ -0,0 +1,637 @@
+/* Analysis Utilities for Loop Vectorization.
+ Copyright (C) 2006 Free Software Foundation, Inc.
+ Contributed by Dorit Nuzman <dorit@il.ibm.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING. If not, write to the Free
+Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301, USA. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "ggc.h"
+#include "tree.h"
+
+#include "target.h"
+#include "basic-block.h"
+#include "diagnostic.h"
+#include "tree-flow.h"
+#include "tree-dump.h"
+#include "timevar.h"
+#include "cfgloop.h"
+#include "expr.h"
+#include "optabs.h"
+#include "params.h"
+#include "tree-data-ref.h"
+#include "tree-vectorizer.h"
+#include "recog.h"
+#include "toplev.h"
+
+/* Function prototypes */
+static void vect_pattern_recog_1
+ (tree (* ) (tree, tree *, tree *), block_stmt_iterator);
+static bool widened_name_p (tree, tree, tree *, tree *);
+
+/* Pattern recognition functions */
+static tree vect_recog_widen_sum_pattern (tree, tree *, tree *);
+static tree vect_recog_widen_mult_pattern (tree, tree *, tree *);
+static tree vect_recog_dot_prod_pattern (tree, tree *, tree *);
+static vect_recog_func_ptr vect_vect_recog_func_ptrs[NUM_PATTERNS] = {
+ vect_recog_widen_mult_pattern,
+ vect_recog_widen_sum_pattern,
+ vect_recog_dot_prod_pattern};
+
+
+/* Function widened_name_p
+
+ Check whether NAME, an ssa-name used in USE_STMT,
+ is a result of a type-promotion, such that:
+ DEF_STMT: NAME = NOP (name0)
+ where the type of name0 (HALF_TYPE) is smaller than the type of NAME.
+*/
+
+static bool
+widened_name_p (tree name, tree use_stmt, tree *half_type, tree *def_stmt)
+{
+ tree dummy;
+ loop_vec_info loop_vinfo;
+ stmt_vec_info stmt_vinfo;
+ tree expr;
+ tree type = TREE_TYPE (name);
+ tree oprnd0;
+ enum vect_def_type dt;
+ tree def;
+
+ stmt_vinfo = vinfo_for_stmt (use_stmt);
+ loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
+
+ if (!vect_is_simple_use (name, loop_vinfo, def_stmt, &def, &dt))
+ return false;
+
+ if (dt != vect_loop_def
+ && dt != vect_invariant_def && dt != vect_constant_def)
+ return false;
+
+ if (! *def_stmt)
+ return false;
+
+ if (TREE_CODE (*def_stmt) != MODIFY_EXPR)
+ return false;
+
+ expr = TREE_OPERAND (*def_stmt, 1);
+ if (TREE_CODE (expr) != NOP_EXPR)
+ return false;
+
+ oprnd0 = TREE_OPERAND (expr, 0);
+
+ *half_type = TREE_TYPE (oprnd0);
+ if (!INTEGRAL_TYPE_P (type) || !INTEGRAL_TYPE_P (*half_type)
+ || (TYPE_UNSIGNED (type) != TYPE_UNSIGNED (*half_type))
+ || (TYPE_PRECISION (type) < (TYPE_PRECISION (*half_type) * 2)))
+ return false;
+
+ if (!vect_is_simple_use (oprnd0, loop_vinfo, &dummy, &dummy, &dt))
+ return false;
+
+ if (dt != vect_invariant_def && dt != vect_constant_def
+ && dt != vect_loop_def)
+ return false;
+
+ return true;
+}
+
+
+/* Function vect_recog_dot_prod_pattern
+
+ Try to find the following pattern:
+
+ type x_t, y_t;
+ TYPE1 prod;
+ TYPE2 sum = init;
+ loop:
+ sum_0 = phi <init, sum_1>
+ S1 x_t = ...
+ S2 y_t = ...
+ S3 x_T = (TYPE1) x_t;
+ S4 y_T = (TYPE1) y_t;
+ S5 prod = x_T * y_T;
+ [S6 prod = (TYPE2) prod; #optional]
+ S7 sum_1 = prod + sum_0;
+
+ where 'TYPE1' is exactly double the size of type 'type', and 'TYPE2' is the
+ same size of 'TYPE1' or bigger. This is a special case of a reduction
+ computation.
+
+ Input:
+
+ * LAST_STMT: A stmt from which the pattern search begins. In the example,
+ when this function is called with S7, the pattern {S3,S4,S5,S6,S7} will be
+ detected.
+
+ Output:
+
+ * TYPE_IN: The type of the input arguments to the pattern.
+
+ * TYPE_OUT: The type of the output of this pattern.
+
+ * Return value: A new stmt that will be used to replace the sequence of
+ stmts that constitute the pattern. In this case it will be:
+ WIDEN_DOT_PRODUCT <x_t, y_t, sum_0>
+*/
+
+static tree
+vect_recog_dot_prod_pattern (tree last_stmt, tree *type_in, tree *type_out)
+{
+ tree stmt, expr;
+ tree oprnd0, oprnd1;
+ tree oprnd00, oprnd01;
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
+ tree type, half_type;
+ tree pattern_expr;
+ tree prod_type;
+
+ if (TREE_CODE (last_stmt) != MODIFY_EXPR)
+ return NULL;
+
+ expr = TREE_OPERAND (last_stmt, 1);
+ type = TREE_TYPE (expr);
+
+ /* Look for the following pattern
+ DX = (TYPE1) X;
+ DY = (TYPE1) Y;
+ DPROD = DX * DY;
+ DDPROD = (TYPE2) DPROD;
+ sum_1 = DDPROD + sum_0;
+ In which
+ - DX is double the size of X
+ - DY is double the size of Y
+ - DX, DY, DPROD all have the same type
+ - sum is the same size of DPROD or bigger
+ - sum has been recognized as a reduction variable.
+
+ This is equivalent to:
+ DPROD = X w* Y; #widen mult
+ sum_1 = DPROD w+ sum_0; #widen summation
+ or
+ DPROD = X w* Y; #widen mult
+ sum_1 = DPROD + sum_0; #summation
+ */
+
+ /* Starting from LAST_STMT, follow the defs of its uses in search
+ of the above pattern. */
+
+ if (TREE_CODE (expr) != PLUS_EXPR)
+ return NULL;
+
+ if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
+ {
+ /* Has been detected as widening-summation? */
+
+ stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+ expr = TREE_OPERAND (stmt, 1);
+ type = TREE_TYPE (expr);
+ if (TREE_CODE (expr) != WIDEN_SUM_EXPR)
+ return NULL;
+ oprnd0 = TREE_OPERAND (expr, 0);
+ oprnd1 = TREE_OPERAND (expr, 1);
+ half_type = TREE_TYPE (oprnd0);
+ }
+ else
+ {
+ tree def_stmt;
+
+ if (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def)
+ return NULL;
+ oprnd0 = TREE_OPERAND (expr, 0);
+ oprnd1 = TREE_OPERAND (expr, 1);
+ if (TYPE_MAIN_VARIANT (TREE_TYPE (oprnd0)) != TYPE_MAIN_VARIANT (type)
+ || TYPE_MAIN_VARIANT (TREE_TYPE (oprnd1)) != TYPE_MAIN_VARIANT (type))
+ return NULL;
+ stmt = last_stmt;
+
+ if (widened_name_p (oprnd0, stmt, &half_type, &def_stmt))
+ {
+ stmt = def_stmt;
+ expr = TREE_OPERAND (stmt, 1);
+ oprnd0 = TREE_OPERAND (expr, 0);
+ }
+ else
+ half_type = type;
+ }
+
+ /* So far so good. Since last_stmt was detected as a (summation) reduction,
+ we know that oprnd1 is the reduction variable (defined by a loop-header
+ phi), and oprnd0 is an ssa-name defined by a stmt in the loop body.
+ Left to check that oprnd0 is defined by a (widen_)mult_expr */
+
+ prod_type = half_type;
+ stmt = SSA_NAME_DEF_STMT (oprnd0);
+ gcc_assert (stmt);
+ stmt_vinfo = vinfo_for_stmt (stmt);
+ gcc_assert (stmt_vinfo);
+ gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_loop_def);
+ expr = TREE_OPERAND (stmt, 1);
+ if (TREE_CODE (expr) != MULT_EXPR)
+ return NULL;
+ if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
+ {
+ /* Has been detected as a widening multiplication? */
+
+ stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+ expr = TREE_OPERAND (stmt, 1);
+ if (TREE_CODE (expr) != WIDEN_MULT_EXPR)
+ return NULL;
+ stmt_vinfo = vinfo_for_stmt (stmt);
+ gcc_assert (stmt_vinfo);
+ gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_loop_def);
+ oprnd00 = TREE_OPERAND (expr, 0);
+ oprnd01 = TREE_OPERAND (expr, 1);
+ }
+ else
+ {
+ tree half_type0, half_type1;
+ tree def_stmt;
+ tree oprnd0, oprnd1;
+
+ oprnd0 = TREE_OPERAND (expr, 0);
+ oprnd1 = TREE_OPERAND (expr, 1);
+ if (TYPE_MAIN_VARIANT (TREE_TYPE (oprnd0))
+ != TYPE_MAIN_VARIANT (prod_type)
+ || TYPE_MAIN_VARIANT (TREE_TYPE (oprnd1))
+ != TYPE_MAIN_VARIANT (prod_type))
+ return NULL;
+ if (!widened_name_p (oprnd0, stmt, &half_type0, &def_stmt))
+ return NULL;
+ oprnd00 = TREE_OPERAND (TREE_OPERAND (def_stmt, 1), 0);
+ if (!widened_name_p (oprnd1, stmt, &half_type1, &def_stmt))
+ return NULL;
+ oprnd01 = TREE_OPERAND (TREE_OPERAND (def_stmt, 1), 0);
+ if (TYPE_MAIN_VARIANT (half_type0) != TYPE_MAIN_VARIANT (half_type1))
+ return NULL;
+ if (TYPE_PRECISION (prod_type) != TYPE_PRECISION (half_type0) * 2)
+ return NULL;
+ }
+
+ half_type = TREE_TYPE (oprnd00);
+ *type_in = half_type;
+ *type_out = type;
+
+ /* Pattern detected. Create a stmt to be used to replace the pattern: */
+ pattern_expr = build3 (DOT_PROD_EXPR, type, oprnd00, oprnd01, oprnd1);
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "vect_recog_dot_prod_pattern: detected: ");
+ print_generic_expr (vect_dump, pattern_expr, TDF_SLIM);
+ }
+ return pattern_expr;
+}
+
+
+/* Function vect_recog_widen_mult_pattern
+
+ Try to find the following pattern:
+
+ type a_t, b_t;
+ TYPE a_T, b_T, prod_T;
+
+ S1 a_t = ;
+ S2 b_t = ;
+ S3 a_T = (TYPE) a_t;
+ S4 b_T = (TYPE) b_t;
+ S5 prod_T = a_T * b_T;
+
+ where type 'TYPE' is at least double the size of type 'type'.
+
+ Input:
+
+ * LAST_STMT: A stmt from which the pattern search begins. In the example,
+ when this function is called with S5, the pattern {S3,S4,S5} will be detected.
+
+ Output:
+
+ * TYPE_IN: The type of the input arguments to the pattern.
+
+ * TYPE_OUT: The type of the output of this pattern.
+
+ * Return value: A new stmt that will be used to replace the sequence of
+ stmts that constitute the pattern. In this case it will be:
+ WIDEN_MULT <a_t, b_t>
+*/
+
+static tree
+vect_recog_widen_mult_pattern (tree last_stmt ATTRIBUTE_UNUSED,
+ tree *type_in ATTRIBUTE_UNUSED,
+ tree *type_out ATTRIBUTE_UNUSED)
+{
+ /* Yet to be implemented. */
+ return NULL;
+}
+
+
+/* Function vect_recog_widen_sum_pattern
+
+ Try to find the following pattern:
+
+ type x_t;
+ TYPE x_T, sum = init;
+ loop:
+ sum_0 = phi <init, sum_1>
+ S1 x_t = *p;
+ S2 x_T = (TYPE) x_t;
+ S3 sum_1 = x_T + sum_0;
+
+ where type 'TYPE' is at least double the size of type 'type', i.e - we're
+ summing elements of type 'type' into an accumulator of type 'TYPE'. This is
+ a special case of a reduction computation.
+
+ Input:
+
+ * LAST_STMT: A stmt from which the pattern search begins. In the example,
+ when this function is called with S3, the pattern {S2,S3} will be detected.
+
+ Output:
+
+ * TYPE_IN: The type of the input arguments to the pattern.
+
+ * TYPE_OUT: The type of the output of this pattern.
+
+ * Return value: A new stmt that will be used to replace the sequence of
+ stmts that constitute the pattern. In this case it will be:
+ WIDEN_SUM <x_t, sum_0>
+*/
+
+static tree
+vect_recog_widen_sum_pattern (tree last_stmt, tree *type_in, tree *type_out)
+{
+ tree stmt, expr;
+ tree oprnd0, oprnd1;
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
+ tree type, half_type;
+ tree pattern_expr;
+
+ if (TREE_CODE (last_stmt) != MODIFY_EXPR)
+ return NULL;
+
+ expr = TREE_OPERAND (last_stmt, 1);
+ type = TREE_TYPE (expr);
+
+ /* Look for the following pattern
+ DX = (TYPE) X;
+ sum_1 = DX + sum_0;
+ In which DX is at least double the size of X, and sum_1 has been
+ recognized as a reduction variable.
+ */
+
+ /* Starting from LAST_STMT, follow the defs of its uses in search
+ of the above pattern. */
+
+ if (TREE_CODE (expr) != PLUS_EXPR)
+ return NULL;
+
+ if (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def)
+ return NULL;
+
+ oprnd0 = TREE_OPERAND (expr, 0);
+ oprnd1 = TREE_OPERAND (expr, 1);
+ if (TYPE_MAIN_VARIANT (TREE_TYPE (oprnd0)) != TYPE_MAIN_VARIANT (type)
+ || TYPE_MAIN_VARIANT (TREE_TYPE (oprnd1)) != TYPE_MAIN_VARIANT (type))
+ return NULL;
+
+ /* So far so good. Since last_stmt was detected as a (summation) reduction,
+ we know that oprnd1 is the reduction variable (defined by a loop-header
+ phi), and oprnd0 is an ssa-name defined by a stmt in the loop body.
+ Left to check that oprnd0 is defined by a cast from type 'type' to type
+ 'TYPE'. */
+
+ if (!widened_name_p (oprnd0, last_stmt, &half_type, &stmt))
+ return NULL;
+
+ oprnd0 = TREE_OPERAND (TREE_OPERAND (stmt, 1), 0);
+ *type_in = half_type;
+ *type_out = type;
+
+ /* Pattern detected. Create a stmt to be used to replace the pattern: */
+ pattern_expr = build2 (WIDEN_SUM_EXPR, type, oprnd0, oprnd1);
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "vect_recog_widen_sum_pattern: detected: ");
+ print_generic_expr (vect_dump, pattern_expr, TDF_SLIM);
+ }
+ return pattern_expr;
+}
+
+
+/* Function vect_pattern_recog_1
+
+ Input:
+ PATTERN_RECOG_FUNC: A pointer to a function that detects a certain
+ computation pattern.
+ STMT: A stmt from which the pattern search should start.
+
+ If PATTERN_RECOG_FUNC successfully detected the pattern, it creates an
+ expression that computes the same functionality and can be used to
+ replace the sequence of stmts that are involved in the pattern.
+
+ Output:
+ This function checks if the expression returned by PATTERN_RECOG_FUNC is
+ supported in vector form by the target. We use 'TYPE_IN' to obtain the
+ relevant vector type. If 'TYPE_IN' is already a vector type, then this
+ indicates that target support had already been checked by PATTERN_RECOG_FUNC.
+ If 'TYPE_OUT' is also returned by PATTERN_RECOG_FUNC, we check that it fits
+ to the available target pattern.
+
+ This function also does some bookkeeping, as explained in the documentation
+ for vect_pattern_recog. */
+
+static void
+vect_pattern_recog_1 (
+ tree (* vect_recog_func_ptr) (tree, tree *, tree *),
+ block_stmt_iterator si)
+{
+ tree stmt = bsi_stmt (si);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ stmt_vec_info pattern_stmt_info;
+ loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+ tree pattern_expr;
+ tree pattern_vectype;
+ tree type_in, type_out;
+ tree pattern_type;
+ enum tree_code code;
+ tree var, var_name;
+ stmt_ann_t ann;
+
+ pattern_expr = (* vect_recog_func_ptr) (stmt, &type_in, &type_out);
+ if (!pattern_expr)
+ return;
+
+ if (VECTOR_MODE_P (TYPE_MODE (type_in)))
+ {
+ /* No need to check target support (already checked by the pattern
+ recognition function). */
+ pattern_vectype = type_in;
+ }
+ else
+ {
+ enum tree_code vec_mode;
+ enum insn_code icode;
+ optab optab;
+
+ /* Check target support */
+ pattern_vectype = get_vectype_for_scalar_type (type_in);
+ optab = optab_for_tree_code (TREE_CODE (pattern_expr), pattern_vectype);
+ vec_mode = TYPE_MODE (pattern_vectype);
+ if (!optab
+ || (icode = optab->handlers[(int) vec_mode].insn_code) ==
+ CODE_FOR_nothing
+ || (type_out
+ && (insn_data[icode].operand[0].mode !=
+ TYPE_MODE (get_vectype_for_scalar_type (type_out)))))
+ return;
+ }
+
+ /* Found a vectorizable pattern. */
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "pattern recognized: ");
+ print_generic_expr (vect_dump, pattern_expr, TDF_SLIM);
+ }
+
+ /* Mark the stmts that are involved in the pattern,
+ create a new stmt to express the pattern and insert it. */
+ code = TREE_CODE (pattern_expr);
+ pattern_type = TREE_TYPE (pattern_expr);
+ var = create_tmp_var (pattern_type, "patt");
+ add_referenced_tmp_var (var);
+ var_name = make_ssa_name (var, NULL_TREE);
+ pattern_expr = build2 (MODIFY_EXPR, void_type_node, var_name, pattern_expr);
+ SSA_NAME_DEF_STMT (var_name) = pattern_expr;
+ bsi_insert_before (&si, pattern_expr, BSI_SAME_STMT);
+ ann = stmt_ann (pattern_expr);
+ set_stmt_info ((tree_ann_t)ann, new_stmt_vec_info (pattern_expr, loop_vinfo));
+ pattern_stmt_info = vinfo_for_stmt (pattern_expr);
+
+ STMT_VINFO_RELATED_STMT (pattern_stmt_info) = stmt;
+ STMT_VINFO_DEF_TYPE (pattern_stmt_info) = STMT_VINFO_DEF_TYPE (stmt_info);
+ STMT_VINFO_VECTYPE (pattern_stmt_info) = pattern_vectype;
+ STMT_VINFO_IN_PATTERN_P (stmt_info) = true;
+ STMT_VINFO_RELATED_STMT (stmt_info) = pattern_expr;
+
+ return;
+}
+
+
+/* Function vect_pattern_recog
+
+ Input:
+ LOOP_VINFO - a struct_loop_info of a loop in which we want to look for
+ computation idioms.
+
+ Output - for each computation idiom that is detected we insert a new stmt
+ that provides the same functionality and that can be vectorized. We
+ also record some information in the struct_stmt_info of the relevant
+ stmts, as explained below:
+
+ At the entry to this function we have the following stmts, with the
+ following initial value in the STMT_VINFO fields:
+
+ stmt in_pattern_p related_stmt vec_stmt
+ S1: a_i = .... - - -
+ S2: a_2 = ..use(a_i).. - - -
+ S3: a_1 = ..use(a_2).. - - -
+ S4: a_0 = ..use(a_1).. - - -
+ S5: ... = ..use(a_0).. - - -
+
+ Say the sequence {S1,S2,S3,S4} was detected as a pattern that can be
+ represented by a single stmt. We then:
+ - create a new stmt S6 that will replace the pattern.
+ - insert the new stmt S6 before the last stmt in the pattern
+ - fill in the STMT_VINFO fields as follows:
+
+ in_pattern_p related_stmt vec_stmt
+ S1: a_i = .... - - -
+ S2: a_2 = ..use(a_i).. - - -
+ S3: a_1 = ..use(a_2).. - - -
+ > S6: a_new = .... - S4 -
+ S4: a_0 = ..use(a_1).. true S6 -
+ S5: ... = ..use(a_0).. - - -
+
+ (the last stmt in the pattern (S4) and the new pattern stmt (S6) point
+ to each other through the RELATED_STMT field).
+
+ S6 will be marked as relevant in vect_mark_stmts_to_be_vectorized instead
+ of S4 because it will replace all its uses. Stmts {S1,S2,S3} will
+ remain irrelevant unless used by stmts other than S4.
+
+ If vectorization succeeds, vect_transform_stmt will skip over {S1,S2,S3}
+ (because they are marked as irrelevant). It will vectorize S6, and record
+ a pointer to the new vector stmt VS6 both from S6 (as usual), and also
+ from S4. We do that so that when we get to vectorizing stmts that use the
+ def of S4 (like S5 that uses a_0), we'll know where to take the relevant
+ vector-def from. S4 will be skipped, and S5 will be vectorized as usual:
+
+ in_pattern_p related_stmt vec_stmt
+ S1: a_i = .... - - -
+ S2: a_2 = ..use(a_i).. - - -
+ S3: a_1 = ..use(a_2).. - - -
+ > VS6: va_new = .... - - -
+ S6: a_new = .... - S4 VS6
+ S4: a_0 = ..use(a_1).. true S6 VS6
+ > VS5: ... = ..vuse(va_new).. - - -
+ S5: ... = ..use(a_0).. - - -
+
+ DCE could then get rid of {S1,S2,S3,S4,S5,S6} (if their defs are not used
+ elsewhere), and we'll end up with:
+
+ VS6: va_new = ....
+ VS5: ... = ..vuse(va_new)..
+
+ If vectorization does not succeed, DCE will clean S6 away (its def is
+ not used), and we'll end up with the original sequence.
+*/
+
+void
+vect_pattern_recog (loop_vec_info loop_vinfo)
+{
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
+ unsigned int nbbs = loop->num_nodes;
+ block_stmt_iterator si;
+ tree stmt;
+ unsigned int i, j;
+ tree (* vect_recog_func_ptr) (tree, tree *, tree *);
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "=== vect_pattern_recog ===");
+
+ /* Scan through the loop stmts, applying the pattern recognition
+ functions starting at each stmt visited: */
+ for (i = 0; i < nbbs; i++)
+ {
+ basic_block bb = bbs[i];
+ for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
+ {
+ stmt = bsi_stmt (si);
+
+ /* Scan over all generic vect_recog_xxx_pattern functions. */
+ for (j = 0; j < NUM_PATTERNS; j++)
+ {
+ vect_recog_func_ptr = vect_vect_recog_func_ptrs[j];
+ vect_pattern_recog_1 (vect_recog_func_ptr, si);
+ }
+ }
+ }
+}
diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c
index 42090f7..db0573c 100644
--- a/gcc/tree-vect-transform.c
+++ b/gcc/tree-vect-transform.c
@@ -1,5 +1,5 @@
/* Transformation Utilities for Loop Vectorization.
- Copyright (C) 2003,2004,2005 Free Software Foundation, Inc.
+ Copyright (C) 2003,2004,2005,2006 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>
This file is part of GCC.
@@ -59,6 +59,7 @@ static void vect_finish_stmt_generation
(tree stmt, tree vec_stmt, block_stmt_iterator *bsi);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void update_vuses_to_preheader (tree, struct loop*);
+static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);
/* Utility function dealing with loop peeling (not peeling itself). */
@@ -656,6 +657,8 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
switch (code)
{
+ case WIDEN_SUM_EXPR:
+ case DOT_PROD_EXPR:
case PLUS_EXPR:
if (INTEGRAL_TYPE_P (type))
def = build_int_cst (type, 0);
@@ -711,66 +714,66 @@ get_initial_def_for_reduction (tree stmt, tree init_val, tree *scalar_def)
}
-/* Function vect_create_epilog_for_reduction:
+/* Function vect_create_epilog_for_reduction
Create code at the loop-epilog to finalize the result of a reduction
- computation.
+ computation.
- LOOP_EXIT_VECT_DEF is a vector of partial results. We need to "reduce" it
- into a single result, by applying the operation REDUC_CODE on the
- partial-results-vector. For this, we need to create a new phi node at the
- loop exit to preserve loop-closed form, as illustrated below.
-
- STMT is the original scalar reduction stmt that is being vectorized.
- REDUCTION_OP is the scalar reduction-variable.
+ VECT_DEF is a vector of partial results.
+ REDUC_CODE is the tree-code for the epilog reduction.
+ STMT is the scalar reduction stmt that is being vectorized.
REDUCTION_PHI is the phi-node that carries the reduction computation.
- This function also sets the arguments for the REDUCTION_PHI:
- The loop-entry argument is the (vectorized) initial-value of REDUCTION_OP.
- The loop-latch argument is VECT_DEF - the vector of partial sums.
- This function transforms this:
+ This function:
+ 1. Creates the reduction def-use cycle: sets the arguments for
+ REDUCTION_PHI:
+ The loop-entry argument is the vectorized initial-value of the reduction.
+ The loop-latch argument is VECT_DEF - the vector of partial sums.
+ 2. "Reduces" the vector of partial results VECT_DEF into a single result,
+ by applying the operation specified by REDUC_CODE if available, or by
+ other means (whole-vector shifts or a scalar loop).
+ The function also creates a new phi node at the loop exit to preserve
+ loop-closed form, as illustrated below.
+
+ The flow at the entry to this function:
loop:
- vec_def = phi <null, null> # REDUCTION_PHI
- ....
- VECT_DEF = ...
-
+ vec_def = phi <null, null> # REDUCTION_PHI
+ VECT_DEF = vector_stmt # vectorized form of STMT
+ s_loop = scalar_stmt # (scalar) STMT
loop_exit:
- s_out0 = phi <s_loop> # EXIT_PHI
-
+ s_out0 = phi <s_loop> # (scalar) EXIT_PHI
use <s_out0>
use <s_out0>
- Into:
+ The above is transformed by this function into:
loop:
- vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
- ....
- VECT_DEF = ...
-
+ vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
+ VECT_DEF = vector_stmt # vectorized form of STMT
+ s_loop = scalar_stmt # (scalar) STMT
loop_exit:
- s_out0 = phi <s_loop> # EXIT_PHI
- v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
-
- v_out2 = reduc_expr <v_out1>
+ s_out0 = phi <s_loop> # (scalar) EXIT_PHI
+ v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
+ v_out2 = reduce <v_out1>
s_out3 = extract_field <v_out2, 0>
-
- use <s_out3>
- use <s_out3>
+ s_out4 = adjust_result <s_out3>
+ use <s_out4>
+ use <s_out4>
*/
static void
-vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
+vect_create_epilog_for_reduction (tree vect_def, tree stmt,
enum tree_code reduc_code, tree reduction_phi)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
- tree vectype = STMT_VINFO_VECTYPE (stmt_info);
- enum machine_mode mode = TYPE_MODE (vectype);
+ tree vectype;
+ enum machine_mode mode;
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
basic_block exit_bb;
- tree scalar_dest = TREE_OPERAND (stmt, 0);
- tree scalar_type = TREE_TYPE (scalar_dest);
+ tree scalar_dest;
+ tree scalar_type;
tree new_phi;
block_stmt_iterator exit_bsi;
tree vec_dest;
@@ -786,7 +789,16 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
imm_use_iterator imm_iter;
use_operand_p use_p;
bool extract_scalar_result;
+ tree reduction_op;
+ tree orig_stmt;
+ tree operation = TREE_OPERAND (stmt, 1);
+ int op_type;
+ op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
+ reduction_op = TREE_OPERAND (operation, op_type-1);
+ vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
+ mode = TYPE_MODE (vectype);
+
/*** 1. Create the reduction def-use cycle ***/
/* 1.1 set the loop-entry arg of the reduction-phi: */
@@ -797,7 +809,6 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
&scalar_initial_def);
add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop));
-
/* 1.2 set the loop-latch arg for the reduction-phi: */
add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop));
@@ -810,7 +821,32 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
}
- /*** 2. Create epilog code ***/
+ /*** 2. Create epilog code
+ The reduction epilog code operates across the elements of the vector
+ of partial results computed by the vectorized loop.
+ The reduction epilog code consists of:
+ step 1: compute the scalar result in a vector (v_out2)
+ step 2: extract the scalar result (s_out3) from the vector (v_out2)
+ step 3: adjust the scalar result (s_out3) if needed.
+
+ Step 1 can be accomplished using one of the following three schemes:
+ (scheme 1) using reduc_code, if available.
+ (scheme 2) using whole-vector shifts, if available.
+ (scheme 3) using a scalar loop. In this case steps 1+2 above are
+ combined.
+
+ The overall epilog code looks like this:
+
+ s_out0 = phi <s_loop> # original EXIT_PHI
+ v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
+ v_out2 = reduce <v_out1> # step 1
+ s_out3 = extract_field <v_out2, 0> # step 2
+ s_out4 = adjust_result <s_out3> # step 3
+
+ (step 3 is optional, and steps 1 and 2 may be combined).
+ Lastly, the uses of s_out0 are replaced by s_out4.
+
+ ***/
/* 2.1 Create new loop-exit-phi to preserve loop-closed form:
v_out1 = phi <v_loop> */
@@ -818,15 +854,39 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
exit_bb = loop->single_exit->dest;
new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
SET_PHI_ARG_DEF (new_phi, loop->single_exit->dest_idx, vect_def);
-
exit_bsi = bsi_start (exit_bb);
-
+ /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
+ (i.e. when reduc_code is not available) and in the final adjustment code
+ (if needed). Also get the original scalar reduction variable as
+ defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
+ represents a reduction pattern), the tree-code and scalar-def are
+ taken from the original stmt that the pattern-stmt (STMT) replaces.
+ Otherwise (it is a regular reduction) - the tree-code and scalar-def
+ are taken from STMT. */
+
+ orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+ if (!orig_stmt)
+ {
+ /* Regular reduction */
+ orig_stmt = stmt;
+ }
+ else
+ {
+ /* Reduction pattern */
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
+ gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
+ gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
+ }
+ code = TREE_CODE (TREE_OPERAND (orig_stmt, 1));
+ scalar_dest = TREE_OPERAND (orig_stmt, 0);
+ scalar_type = TREE_TYPE (scalar_dest);
new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
bitsize = TYPE_SIZE (scalar_type);
bytesize = TYPE_SIZE_UNIT (scalar_type);
- /* 2.2 Create the reduction code. */
+ /* 2.3 Create the reduction code, using one of the three schemes described
+ above. */
if (reduc_code < NUM_TREE_CODES)
{
@@ -849,16 +909,11 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
{
enum tree_code shift_code = 0;
bool have_whole_vector_shift = true;
- enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1)); /* CHECKME */
int bit_offset;
int element_bitsize = tree_low_cst (bitsize, 1);
int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
tree vec_temp;
- /* The result of the reduction is expected to be at the least
- significant bits of the vector. This is merely convention,
- as it's the extraction later that really matters, and that
- is also under our control. */
if (vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing)
shift_code = VEC_RSHIFT_EXPR;
else
@@ -881,7 +936,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
if (have_whole_vector_shift)
{
- /*** Case 2:
+ /*** Case 2: Create:
for (offset = VS/2; offset >= element_size; offset/=2)
{
Create: va' = vec_shift <va, offset>
@@ -905,17 +960,12 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
new_name = make_ssa_name (vec_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_name;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
-
epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
build2 (code, vectype, new_name, new_temp));
new_temp = make_ssa_name (vec_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_temp;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
}
extract_scalar_result = true;
@@ -924,10 +974,11 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
{
tree rhs;
- /*** Case 3:
- Create:
+ /*** Case 3: Create:
s = extract_field <v_out2, 0>
- for (offset=element_size; offset<vector_size; offset+=element_size;)
+ for (offset = element_size;
+ offset < vector_size;
+ offset += element_size;)
{
Create: s' = extract_field <v_out2, offset>
Create: s = op <s, s'>
@@ -938,18 +989,13 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
vec_temp = PHI_RESULT (new_phi);
vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
-
rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
bitsize_zero_node);
-
BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type);
- epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
- rhs);
+ epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, rhs);
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_temp;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
for (bit_offset = element_bitsize;
bit_offset < vec_size_in_bits;
@@ -965,25 +1011,19 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_name;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
-
epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest,
build2 (code, scalar_type, new_name, new_temp));
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_temp;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
}
extract_scalar_result = false;
}
}
-
- /* 2.3 Extract the final scalar result. Create:
+ /* 2.4 Extract the final scalar result. Create:
s_out3 = extract_field <v_out2, bitpos> */
if (extract_scalar_result)
@@ -993,7 +1033,6 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "extract scalar result");
- /* The result is in the low order bits. */
if (BYTES_BIG_ENDIAN)
bitpos = size_binop (MULT_EXPR,
bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
@@ -1007,17 +1046,14 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_temp;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
}
-
/* 2.4 Adjust the final result by the initial value of the reduction
- variable. (when such adjustment is not needed, then
+ variable. (When such adjustment is not needed, then
'scalar_initial_def' is zero).
Create:
- s_out = scalar_expr <s_out, scalar_initial_def> */
+ s_out4 = scalar_expr <s_out3, scalar_initial_def> */
if (scalar_initial_def)
{
@@ -1026,18 +1062,13 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
TREE_OPERAND (epilog_stmt, 0) = new_temp;
bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT);
-
- if (vect_print_dump_info (REPORT_DETAILS))
- print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM);
}
+ /* 2.6 Replace uses of s_out0 with uses of s_out3 */
- /* 2.5 Replace uses of s_out0 with uses of s_out3 */
-
- /* Find the loop-closed-use at the loop exit of the original
- scalar result. (The reduction result is expected to have
- two immediate uses - one at the latch block, and one at the
- loop exit). */
+ /* Find the loop-closed-use at the loop exit of the original scalar result.
+ (The reduction result is expected to have two immediate uses - one at the
+ latch block, and one at the loop exit). */
exit_phi = NULL;
FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
{
@@ -1047,9 +1078,10 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
break;
}
}
-
+ /* We expect to have found an exit_phi because of loop-closed-ssa form. */
+ gcc_assert (exit_phi);
+ /* Replace the uses: */
orig_name = PHI_RESULT (exit_phi);
-
FOR_EACH_IMM_USE_SAFE (use_p, imm_iter, orig_name)
SET_USE (use_p, new_temp);
}
@@ -1060,33 +1092,69 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op,
Check if STMT performs a reduction operation that can be vectorized.
If VEC_STMT is also passed, vectorize the STMT: create a vectorized
stmt to replace it, put it in VEC_STMT, and insert it at BSI.
- Return FALSE if not a vectorizable STMT, TRUE otherwise. */
+ Return FALSE if not a vectorizable STMT, TRUE otherwise.
+
+ This function also handles reduction idioms (patterns) that have been
+ recognized in advance during vect_pattern_recog. In this case, STMT may be
+ of this form:
+ X = pattern_expr (arg0, arg1, ..., X)
+ and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
+ sequence that had been detected and replaced by the pattern-stmt (STMT).
+
+ In some cases of reduction patterns, the type of the reduction variable X is
+ different than the type of the other arguments of STMT.
+ In such cases, the vectype that is used when transforming STMT into a vector
+ stmt is different than the vectype that is used to determine the
+ vectorization factor, because it consists of a different number of elements
+ than the actual number of elements that are being operated upon in parallel.
+
+ For example, consider an accumulation of shorts into an int accumulator.
+ On some targets it's possible to vectorize this pattern operating on 8
+ shorts at a time (hence, the vectype for purposes of determining the
+ vectorization factor should be V8HI); on the other hand, the vectype that
+ is used to create the vector form is actually V4SI (the type of the result).
+
+ Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
+ indicates what is the actual level of parallelism (V8HI in the example), so
+ that the right vectorization factor would be derived. This vectype
+ corresponds to the type of arguments to the reduction stmt, and should *NOT*
+ be used to create the vectorized stmt. The right vectype for the vectorized
+ stmt is obtained from the type of the result X:
+ get_vectype_for_scalar_type (TREE_TYPE (X))
+
+ This means that, contrary to "regular" reductions (or "regular" stmts in
+ general), the following equation:
+ STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
+ does *NOT* necessarily hold for reduction patterns. */
bool
vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
{
tree vec_dest;
tree scalar_dest;
- tree op0, op1;
- tree loop_vec_def;
+ tree op;
+ tree loop_vec_def0, loop_vec_def1;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
tree operation;
- enum tree_code code, reduc_code = 0;
+ enum tree_code code, orig_code, epilog_reduc_code = 0;
enum machine_mode vec_mode;
int op_type;
optab optab, reduc_optab;
tree new_temp;
- tree def0, def1, def_stmt0, def_stmt1;
- enum vect_def_type dt0, dt1;
+ tree def, def_stmt;
+ enum vect_def_type dt;
tree new_phi;
tree scalar_type;
- bool is_simple_use0;
- bool is_simple_use1;
+ bool is_simple_use;
+ tree orig_stmt;
+ stmt_vec_info orig_stmt_info;
+ tree expr = NULL_TREE;
+ int i;
- /* Is vectorizable reduction? */
+ /* 1. Is vectorizable reduction? */
/* Not supportable if the reduction variable is used in the loop. */
if (STMT_VINFO_RELEVANT_P (stmt_info))
@@ -1095,43 +1163,68 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
if (!STMT_VINFO_LIVE_P (stmt_info))
return false;
- /* Make sure it was already recognized as a reduction pattern. */
+ /* Make sure it was already recognized as a reduction computation. */
if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)
return false;
+ /* 2. Has this been recognized as a reduction pattern?
+
+ Check if STMT represents a pattern that has been recognized
+ in earlier analysis stages. For stmts that represent a pattern,
+ the STMT_VINFO_RELATED_STMT field records the last stmt in
+ the original sequence that constitutes the pattern. */
+
+ orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+ if (orig_stmt)
+ {
+ orig_stmt_info = vinfo_for_stmt (orig_stmt);
+ gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
+ gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
+ gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
+ }
+
+ /* 3. Check the operands of the operation. The first operands are defined
+ inside the loop body. The last operand is the reduction variable,
+ which is defined by the loop-header-phi. */
+
gcc_assert (TREE_CODE (stmt) == MODIFY_EXPR);
operation = TREE_OPERAND (stmt, 1);
code = TREE_CODE (operation);
op_type = TREE_CODE_LENGTH (code);
- if (op_type != binary_op)
+ if (op_type != binary_op && op_type != ternary_op)
return false;
-
- op0 = TREE_OPERAND (operation, 0);
- op1 = TREE_OPERAND (operation, 1);
scalar_dest = TREE_OPERAND (stmt, 0);
scalar_type = TREE_TYPE (scalar_dest);
- /* Check the first operand. It is expected to be defined inside the loop. */
- is_simple_use0 =
- vect_is_simple_use (op0, loop_vinfo, &def_stmt0, &def0, &dt0);
- is_simple_use1 =
- vect_is_simple_use (op1, loop_vinfo, &def_stmt1, &def1, &dt1);
-
- gcc_assert (is_simple_use0);
- gcc_assert (is_simple_use1);
- gcc_assert (dt0 == vect_loop_def);
- gcc_assert (dt1 == vect_reduction_def);
- gcc_assert (TREE_CODE (def_stmt1) == PHI_NODE);
- gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt1));
+ /* All uses but the last are expected to be defined in the loop.
+ The last use is the reduction variable. */
+ for (i = 0; i < op_type-1; i++)
+ {
+ op = TREE_OPERAND (operation, i);
+ is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
+ gcc_assert (is_simple_use);
+ gcc_assert (dt == vect_loop_def || dt == vect_invariant_def ||
+ dt == vect_constant_def);
+ }
- if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt1)))
- return false;
+ op = TREE_OPERAND (operation, i);
+ is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt);
+ gcc_assert (is_simple_use);
+ gcc_assert (dt == vect_reduction_def);
+ gcc_assert (TREE_CODE (def_stmt) == PHI_NODE);
+ if (orig_stmt)
+ gcc_assert (orig_stmt == vect_is_simple_reduction (loop, def_stmt));
+ else
+ gcc_assert (stmt == vect_is_simple_reduction (loop, def_stmt));
+
+ if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
+ return false;
- /* Supportable by target? */
+ /* 4. Supportable by target? */
- /* check support for the operation in the loop */
+ /* 4.1. check support for the operation in the loop */
optab = optab_for_tree_code (code, vectype);
if (!optab)
{
@@ -1162,21 +1255,69 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
return false;
}
- /* check support for the epilog operation */
- if (!reduction_code_for_scalar_code (code, &reduc_code))
+ /* 4.2. Check support for the epilog operation.
+
+ If STMT represents a reduction pattern, then the type of the
+ reduction variable may be different than the type of the rest
+ of the arguments. For example, consider the case of accumulation
+ of shorts into an int accumulator; The original code:
+ S1: int_a = (int) short_a;
+ orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
+
+ was replaced with:
+ STMT: int_acc = widen_sum <short_a, int_acc>
+
+ This means that:
+ 1. The tree-code that is used to create the vector operation in the
+ epilog code (that reduces the partial results) is not the
+ tree-code of STMT, but is rather the tree-code of the original
+ stmt from the pattern that STMT is replacing. I.e, in the example
+ above we want to use 'widen_sum' in the loop, but 'plus' in the
+ epilog.
+ 2. The type (mode) we use to check available target support
+ for the vector operation to be created in the *epilog*, is
+ determined by the type of the reduction variable (in the example
+ above we'd check this: plus_optab[vect_int_mode]).
+ However the type (mode) we use to check available target support
+ for the vector operation to be created *inside the loop*, is
+ determined by the type of the other arguments to STMT (in the
+ example we'd check this: widen_sum_optab[vect_short_mode]).
+
+ This is contrary to "regular" reductions, in which the types of all
+ the arguments are the same as the type of the reduction variable.
+ For "regular" reductions we can therefore use the same vector type
+ (and also the same tree-code) when generating the epilog code and
+ when generating the code inside the loop. */
+
+ if (orig_stmt)
+ {
+ /* This is a reduction pattern: get the vectype from the type of the
+ reduction variable, and get the tree-code from orig_stmt. */
+ orig_code = TREE_CODE (TREE_OPERAND (orig_stmt, 1));
+ vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
+ vec_mode = TYPE_MODE (vectype);
+ }
+ else
+ {
+ /* Regular reduction: the same vectype and tree-code as used for
+ the vector code inside the loop can be used for the epilog code. */
+ orig_code = code;
+ }
+
+ if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
return false;
- reduc_optab = optab_for_tree_code (reduc_code, vectype);
+ reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
if (!reduc_optab)
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "no optab for reduction.");
- reduc_code = NUM_TREE_CODES;
+ epilog_reduc_code = NUM_TREE_CODES;
}
if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing)
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "reduc op not supported by target.");
- reduc_code = NUM_TREE_CODES;
+ epilog_reduc_code = NUM_TREE_CODES;
}
if (!vec_stmt) /* transformation not required. */
@@ -1193,25 +1334,31 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
/* Create the destination vector */
vec_dest = vect_create_destination_var (scalar_dest, vectype);
-
/* Create the reduction-phi that defines the reduction-operand. */
new_phi = create_phi_node (vec_dest, loop->header);
-
/* Prepare the operand that is defined inside the loop body */
- loop_vec_def = vect_get_vec_def_for_operand (op0, stmt, NULL);
+ op = TREE_OPERAND (operation, 0);
+ loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL);
+ if (op_type == binary_op)
+ expr = build2 (code, vectype, loop_vec_def0, PHI_RESULT (new_phi));
+ else if (op_type == ternary_op)
+ {
+ op = TREE_OPERAND (operation, 1);
+ loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL);
+ expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
+ PHI_RESULT (new_phi));
+ }
/* Create the vectorized operation that computes the partial results */
- *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest,
- build2 (code, vectype, loop_vec_def, PHI_RESULT (new_phi)));
+ *vec_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, expr);
new_temp = make_ssa_name (vec_dest, *vec_stmt);
TREE_OPERAND (*vec_stmt, 0) = new_temp;
vect_finish_stmt_generation (stmt, *vec_stmt, bsi);
-
   /* Finalize the reduction-phi (set its arguments) and create the
epilog reduction code. */
- vect_create_epilog_for_reduction (new_temp, stmt, op1, reduc_code, new_phi);
+ vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi);
return true;
}
@@ -2040,6 +2187,7 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi)
bool is_store = false;
tree vec_stmt = NULL_TREE;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ tree orig_stmt_in_pattern;
bool done;
if (STMT_VINFO_RELEVANT_P (stmt_info))
@@ -2078,7 +2226,25 @@ vect_transform_stmt (tree stmt, block_stmt_iterator *bsi)
gcc_unreachable ();
}
+ gcc_assert (vec_stmt);
STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt;
+ orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info);
+ if (orig_stmt_in_pattern)
+ {
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern);
+ if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo))
+ {
+ gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
+
+ /* STMT was inserted by the vectorizer to replace a computation
+ idiom. ORIG_STMT_IN_PATTERN is a stmt in the original
+ sequence that computed this idiom. We need to record a pointer
+ to VEC_STMT in the stmt_info of ORIG_STMT_IN_PATTERN. See more
+ detail in the documentation of vect_pattern_recog. */
+
+ STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt;
+ }
+ }
}
if (STMT_VINFO_LIVE_P (stmt_info))
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index d4c6989..f03a2a2 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1,5 +1,5 @@
/* Loop Vectorization
- Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>
This file is part of GCC.
@@ -1361,6 +1361,8 @@ new_stmt_vec_info (tree stmt, loop_vec_info loop_vinfo)
STMT_VINFO_LIVE_P (res) = 0;
STMT_VINFO_VECTYPE (res) = NULL;
STMT_VINFO_VEC_STMT (res) = NULL;
+ STMT_VINFO_IN_PATTERN_P (res) = false;
+ STMT_VINFO_RELATED_STMT (res) = NULL;
STMT_VINFO_DATA_REF (res) = NULL;
if (TREE_CODE (stmt) == PHI_NODE)
STMT_VINFO_DEF_TYPE (res) = vect_unknown_def_type;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 4f7fd95..c5b1378 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1,5 +1,5 @@
/* Loop Vectorization
- Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>
This file is part of GCC.
@@ -43,10 +43,11 @@ enum vect_var_kind {
vect_scalar_var
};
-/* Defines type of operation: unary or binary. */
+/* Defines type of operation. */
enum operation_type {
unary_op = 1,
- binary_op
+ binary_op,
+ ternary_op
};
/* Define type of available alignment support. */
@@ -204,6 +205,20 @@ typedef struct _stmt_vec_info {
/* Information about the data-ref (access function, etc). */
struct data_reference *data_ref_info;
+  /* Stmt is part of some pattern (computation idiom).  */
+ bool in_pattern_p;
+
+  /* Used for various bookkeeping purposes, generally holding a pointer to
+ some other stmt S that is in some way "related" to this stmt.
+ Current use of this field is:
+ If this stmt is part of a pattern (i.e. the field 'in_pattern_p' is
+ true): S is the "pattern stmt" that represents (and replaces) the
+ sequence of stmts that constitutes the pattern. Similarly, the
+ related_stmt of the "pattern stmt" points back to this stmt (which is
+ the last stmt in the original sequence of stmts that constitutes the
+ pattern). */
+ tree related_stmt;
+
/* List of datarefs that are known to have the same alignment as the dataref
of this stmt. */
VEC(dr_p,heap) *same_align_refs;
@@ -222,6 +237,8 @@ typedef struct _stmt_vec_info {
#define STMT_VINFO_VECTYPE(S) (S)->vectype
#define STMT_VINFO_VEC_STMT(S) (S)->vectorized_stmt
#define STMT_VINFO_DATA_REF(S) (S)->data_ref_info
+#define STMT_VINFO_IN_PATTERN_P(S) (S)->in_pattern_p
+#define STMT_VINFO_RELATED_STMT(S) (S)->related_stmt
#define STMT_VINFO_SAME_ALIGN_REFS(S) (S)->same_align_refs
#define STMT_VINFO_DEF_TYPE(S) (S)->def_type
@@ -312,7 +329,6 @@ extern bool vect_can_force_dr_alignment_p (tree, unsigned int);
extern enum dr_alignment_support vect_supportable_dr_alignment
(struct data_reference *);
extern bool reduction_code_for_scalar_code (enum tree_code, enum tree_code *);
-
/* Creation and deletion of loop and stmt info structs. */
extern loop_vec_info new_loop_vec_info (struct loop *loop);
extern void destroy_loop_vec_info (loop_vec_info);
@@ -320,10 +336,21 @@ extern stmt_vec_info new_stmt_vec_info (tree stmt, loop_vec_info);
/* Main driver. */
extern void vectorize_loops (struct loops *);
+
/** In tree-vect-analyze.c **/
/* Driver for analysis stage. */
extern loop_vec_info vect_analyze_loop (struct loop *);
+
+/** In tree-vect-patterns.c **/
+/* Pattern recognition functions.
+ Additional pattern recognition functions can (and will) be added
+ in the future. */
+typedef tree (* vect_recog_func_ptr) (tree, tree *, tree *);
+#define NUM_PATTERNS 3
+void vect_pattern_recog (loop_vec_info);
+
+
/** In tree-vect-transform.c **/
extern bool vectorizable_load (tree, block_stmt_iterator *, tree *);
extern bool vectorizable_store (tree, block_stmt_iterator *, tree *);
diff --git a/gcc/tree.def b/gcc/tree.def
index 9e7e5b0..3cd03fd 100644
--- a/gcc/tree.def
+++ b/gcc/tree.def
@@ -1,7 +1,7 @@
/* This file contains the definitions and documentation for the
tree codes used in GCC.
- Copyright (C) 1987, 1988, 1993, 1995, 1997, 1998, 2000, 2001, 2004, 2005
- Free Software Foundation, Inc.
+ Copyright (C) 1987, 1988, 1993, 1995, 1997, 1998, 2000, 2001, 2004, 2005,
+ 2006 Free Software Foundation, Inc.
This file is part of GCC.
@@ -1073,6 +1073,33 @@ DEFTREECODE (REDUC_MAX_EXPR, "reduc_max_expr", tcc_unary, 1)
DEFTREECODE (REDUC_MIN_EXPR, "reduc_min_expr", tcc_unary, 1)
DEFTREECODE (REDUC_PLUS_EXPR, "reduc_plus_expr", tcc_unary, 1)
+/* Widening dot-product.
+ The first two arguments are of type t1.
+ The third argument and the result are of type t2, such that t2 is at least
+ twice the size of t1. DOT_PROD_EXPR(arg1,arg2,arg3) is equivalent to:
+ tmp = WIDEN_MULT_EXPR(arg1, arg2);
+ arg3 = PLUS_EXPR (tmp, arg3);
+ or:
+ tmp = WIDEN_MULT_EXPR(arg1, arg2);
+ arg3 = WIDEN_SUM_EXPR (tmp, arg3); */
+DEFTREECODE (DOT_PROD_EXPR, "dot_prod_expr", tcc_expression, 3)
+
+/* Widening summation.
+ The first argument is of type t1.
+ The second argument is of type t2, such that t2 is at least twice
+ the size of t1. The type of the entire expression is also t2.
+ WIDEN_SUM_EXPR is equivalent to first widening (promoting)
+ the first argument from type t1 to type t2, and then summing it
+ with the second argument. */
+DEFTREECODE (WIDEN_SUM_EXPR, "widen_sum_expr", tcc_binary, 2)
+
+/* Widening multiplication.
+ The two arguments are of type t1.
+ The result is of type t2, such that t2 is at least twice
+ the size of t1. WIDEN_MULT_EXPR is equivalent to first widening (promoting)
+ the arguments from type t1 to type t2, and then multiplying them. */
+DEFTREECODE (WIDEN_MULT_EXPR, "widen_mult_expr", tcc_binary, 2)
+
/* Whole vector left/right shift in bits.
Operand 0 is a vector to be shifted.
Operand 1 is an integer shift amount in bits. */