diff options
author | Dorit Nuzman <dorit@il.ibm.com> | 2005-06-21 09:02:00 +0000 |
---|---|---|
committer | Dorit Nuzman <dorit@gcc.gnu.org> | 2005-06-21 09:02:00 +0000 |
commit | a6b46ba2c84f81e70811e13581c99350cdc76400 (patch) | |
tree | 9b2edf4d87ca9a2741f8f8a846bf5e277d74ab28 /gcc | |
parent | a3a2067ac5b2a5ce0b8439d42167df5694d2bb5b (diff) | |
download | gcc-a6b46ba2c84f81e70811e13581c99350cdc76400.zip gcc-a6b46ba2c84f81e70811e13581c99350cdc76400.tar.gz gcc-a6b46ba2c84f81e70811e13581c99350cdc76400.tar.bz2 |
genopinit.c (vec_shl_optab, [...]): Initialize new optabs.
* genopinit.c (vec_shl_optab, vec_shr_optab): Initialize new optabs.
(reduc_plus_optab): Removed. Replaced with...
(reduc_splus_optab, reduc_uplus_optab): Initialize new optabs.
* optabs.c (optab_for_tree_code): Return reduc_splus_optab or
reduc_uplus_optab instead of reduc_plus_optab.
(expand_vec_shift_expr): New function.
(init_optabs): Initialize new optabs. Remove initialization of
reduc_plus_optab.
(optab_for_tree_code): Return vec_shl_optab/vec_shr_optab
for VEC_LSHIFT_EXPR/VEC_RSHIFT_EXPR.
* optabs.h (OTI_reduc_plus): Removed. Replaced with...
(OTI_reduc_splus, OTI_reduc_uplus): New.
(reduc_plus_optab): Removed. Replaced with...
(reduc_splus_optab, reduc_uplus_optab): New optabs.
(vec_shl_optab, vec_shr_optab): New optabs.
(expand_vec_shift_expr): New function declaration.
* tree.def (VEC_LSHIFT_EXPR, VEC_RSHIFT_EXPR): New tree-codes.
* tree-inline.c (estimate_num_insns_1): Handle new tree-codes.
* expr.c (expand_expr_real_1): Handle new tree-codes.
* tree-pretty-print.c (dump_generic_node, op_symbol, op_prio): Likewise.
* tree-vect-generic.c (expand_vector_operations_1): Add assert.
* tree-vect-transform.c (vect_create_epilog_for_reduction): Add two
alternatives for generating reduction epilog code.
(vectorizable_reduction): Don't fail if direct reduction support is
not available.
(vectorizable_target_reduction_pattern): Likewise.
* config/rs6000/altivec.md (reduc_smax_v4si, reduc_smax_v4sf,
reduc_umax_v4si, reduc_smin_v4si, reduc_smin_v4sf, reduc_umin_v4si,
reduc_plus_v4si, reduc_plus_v4sf): Removed.
(vec_shl_<mode>, vec_shr_<mode>, altivec_vsumsws_nomode,
reduc_splus_<mode>, reduc_uplus_v16qi): New.
From-SVN: r101231
Diffstat (limited to 'gcc')
25 files changed, 882 insertions, 215 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 0113201..0cc4cf9 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,40 @@ +2005-06-21 Dorit Nuzman <dorit@il.ibm.com> + + * genopinit.c (vec_shl_optab, vec_shr_optab): Initialize new optabs. + (reduc_plus_optab): Removed. Replcaed with... + (reduc_splus_optab, reduc_uplus_optab): Initialize new optabs. + * optabs.c (optab_for_tree_code): Return reduc_splus_optab or + reduc_uplus_optab instead of reduc_plus_optab. + (expand_vec_shift_expr): New function. + (init_optabs): Initialize new optabs. Remove initialization of + reduc_plus_optab. + (optab_for_tree_code): Return vec_shl_optab/vec_shr_optab + for VEC_LSHIFT_EXPR/VEC_RSHIFT_EXPR. + * optabs.h (OTI_reduc_plus): Removed. Replaced with... + (OTI_reduc_splus, OTI_reduc_uplus): New. + (reduc_plus_optab): Removed. Replcaed with... + (reduc_splus_optab, reduc_uplus_optab): New optabs. + (vec_shl_optab, vec_shr_optab): New optabs. + (expand_vec_shift_expr): New function declaration. + + * tree.def (VEC_LSHIFT_EXPR, VEC_RSHIFT_EXPR): New tree-codes. + * tree-inline.c (estimate_num_insns_1): Handle new tree-codes. + * expr.c (expand_expr_real_1): Handle new tree-codes. + * tree-pretty-print.c (dump_generic_node, op_symbol, op_prio): Likewise. + * tree-vect-generic.c (expand_vector_operations_1): Add assert. + + * tree-vect-transform.c (vect_create_epilog_for_reduction): Add two + alternatives for generating reduction epilog code. + (vectorizable_reduction): Don't fail of direct reduction support is + not available. + (vectorizable_target_reduction_pattern): Likewise. + + * config/rs6000/altivec.md (reduc_smax_v4si, reduc_smax_v4sf, + reduc_umax_v4si, reduc_smin_v4si, reduc_smin_v4sf, reduc_umin_v4si, + reduc_plus_v4si, reduc_plus_v4sf): Removed. + (vec_shl_<mode>, vec_shr_<mode>, altivec_vsumsws_nomode, + reduc_splus_<mode>, reduc_uplus_v16qi): New. + 2005-06-20 Daniel Berlin <dberlin@dberlin.org> * c-typeck.c (build_function_call): Set fundecl = function again. 
diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 3b20447..7bfd5d9 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1825,157 +1825,100 @@ operands[3] = gen_reg_rtx (GET_MODE (operands[0])); }) -;; Reduction - -(define_expand "reduc_smax_v4si" - [(set (match_operand:V4SI 0 "register_operand" "=v") - (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")] 217))] +;; Vector shift left in bits. Currently supported ony for shift +;; amounts that can be expressed as byte shifts (divisible by 8). +;; General shift amounts can be supported using vslo + vsl. We're +;; not expecting to see these yet (the vectorizer currently +;; generates only shifts divisible by byte_size). +(define_expand "vec_shl_<mode>" + [(set (match_operand:V 0 "register_operand" "=v") + (unspec:V [(match_operand:V 1 "register_operand" "v") + (match_operand:QI 2 "reg_or_short_operand" "")] 219 ))] "TARGET_ALTIVEC" " -{ - rtx vtmp1 = gen_reg_rtx (V4SImode); - rtx vtmp2 = gen_reg_rtx (V4SImode); - rtx vtmp3 = gen_reg_rtx (V4SImode); - - emit_insn (gen_altivec_vsldoi_v4si (vtmp1, operands[1], operands[1], - gen_rtx_CONST_INT (SImode, 8))); - emit_insn (gen_smaxv4si3 (vtmp2, operands[1], vtmp1)); - emit_insn (gen_altivec_vsldoi_v4si (vtmp3, vtmp2, vtmp2, - gen_rtx_CONST_INT (SImode, 4))); - emit_insn (gen_smaxv4si3 (operands[0], vtmp2, vtmp3)); +{ + rtx bitshift = operands[2]; + rtx byteshift = gen_reg_rtx (QImode); + HOST_WIDE_INT bitshift_val; + HOST_WIDE_INT byteshift_val; + + if (! 
CONSTANT_P (bitshift)) + FAIL; + bitshift_val = INTVAL (bitshift); + if (bitshift_val & 0x7) + FAIL; + byteshift_val = bitshift_val >> 3; + byteshift = gen_rtx_CONST_INT (QImode, byteshift_val); + emit_insn (gen_altivec_vsldoi_<mode> (operands[0], operands[1], operands[1], + byteshift)); DONE; }") -(define_expand "reduc_smax_v4sf" - [(set (match_operand:V4SF 0 "register_operand" "=v") - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")] 217))] +;; Vector shift left in bits. Currently supported ony for shift +;; amounts that can be expressed as byte shifts (divisible by 8). +;; General shift amounts can be supported using vsro + vsr. We're +;; not expecting to see these yet (the vectorizer currently +;; generates only shifts divisible by byte_size). +(define_expand "vec_shr_<mode>" + [(set (match_operand:V 0 "register_operand" "=v") + (unspec:V [(match_operand:V 1 "register_operand" "v") + (match_operand:QI 2 "reg_or_short_operand" "")] 219 ))] "TARGET_ALTIVEC" " -{ - rtx vtmp1 = gen_reg_rtx (V4SFmode); - rtx vtmp2 = gen_reg_rtx (V4SFmode); - rtx vtmp3 = gen_reg_rtx (V4SFmode); - - emit_insn (gen_altivec_vsldoi_v4sf (vtmp1, operands[1], operands[1], - gen_rtx_CONST_INT (SImode, 8))); - emit_insn (gen_smaxv4sf3 (vtmp2, operands[1], vtmp1)); - emit_insn (gen_altivec_vsldoi_v4sf (vtmp3, vtmp2, vtmp2, - gen_rtx_CONST_INT (SImode, 4))); - emit_insn (gen_smaxv4sf3 (operands[0], vtmp2, vtmp3)); +{ + rtx bitshift = operands[2]; + rtx byteshift = gen_reg_rtx (QImode); + HOST_WIDE_INT bitshift_val; + HOST_WIDE_INT byteshift_val; + + if (! 
CONSTANT_P (bitshift)) + FAIL; + bitshift_val = INTVAL (bitshift); + if (bitshift_val & 0x7) + FAIL; + byteshift_val = 16 - (bitshift_val >> 3); + byteshift = gen_rtx_CONST_INT (QImode, byteshift_val); + emit_insn (gen_altivec_vsldoi_<mode> (operands[0], operands[1], operands[1], + byteshift)); DONE; }") -(define_expand "reduc_umax_v4si" - [(set (match_operand:V4SI 0 "register_operand" "=v") - (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")] 217))] +(define_insn "altivec_vsumsws_nomode" + [(set (match_operand 0 "register_operand" "=v") + (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") + (match_operand:V4SI 2 "register_operand" "v")] 135)) + (set (reg:SI 110) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))] "TARGET_ALTIVEC" - " -{ - rtx vtmp1 = gen_reg_rtx (V4SImode); - rtx vtmp2 = gen_reg_rtx (V4SImode); - rtx vtmp3 = gen_reg_rtx (V4SImode); - - emit_insn (gen_altivec_vsldoi_v4si (vtmp1, operands[1], operands[1], - gen_rtx_CONST_INT (SImode, 8))); - emit_insn (gen_umaxv4si3 (vtmp2, operands[1], vtmp1)); - emit_insn (gen_altivec_vsldoi_v4si (vtmp3, vtmp2, vtmp2, - gen_rtx_CONST_INT (SImode, 4))); - emit_insn (gen_umaxv4si3 (operands[0], vtmp2, vtmp3)); - DONE; -}") + "vsumsws %0,%1,%2" + [(set_attr "type" "veccomplex")]) -(define_expand "reduc_smin_v4si" - [(set (match_operand:V4SI 0 "register_operand" "=v") - (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")] 217))] +(define_expand "reduc_splus_<mode>" + [(set (match_operand:VIshort 0 "register_operand" "=v") + (unspec:VIshort [(match_operand:VIshort 1 "register_operand" "v")] 217))] "TARGET_ALTIVEC" " { + rtx vzero = gen_reg_rtx (V4SImode); rtx vtmp1 = gen_reg_rtx (V4SImode); - rtx vtmp2 = gen_reg_rtx (V4SImode); - rtx vtmp3 = gen_reg_rtx (V4SImode); - - emit_insn (gen_altivec_vsldoi_v4si (vtmp1, operands[1], operands[1], - gen_rtx_CONST_INT (SImode, 8))); - emit_insn (gen_sminv4si3 (vtmp2, operands[1], vtmp1)); - emit_insn (gen_altivec_vsldoi_v4si (vtmp3, vtmp2, vtmp2, - 
gen_rtx_CONST_INT (SImode, 4))); - emit_insn (gen_sminv4si3 (operands[0], vtmp2, vtmp3)); - DONE; -}") -(define_expand "reduc_smin_v4sf" - [(set (match_operand:V4SF 0 "register_operand" "=v") - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")] 217))] - "TARGET_ALTIVEC" - " -{ - rtx vtmp1 = gen_reg_rtx (V4SFmode); - rtx vtmp2 = gen_reg_rtx (V4SFmode); - rtx vtmp3 = gen_reg_rtx (V4SFmode); - - emit_insn (gen_altivec_vsldoi_v4sf (vtmp1, operands[1], operands[1], - gen_rtx_CONST_INT (SImode, 8))); - emit_insn (gen_sminv4sf3 (vtmp2, operands[1], vtmp1)); - emit_insn (gen_altivec_vsldoi_v4sf (vtmp3, vtmp2, vtmp2, - gen_rtx_CONST_INT (SImode, 4))); - emit_insn (gen_sminv4sf3 (operands[0], vtmp2, vtmp3)); + emit_insn (gen_altivec_vspltisw (vzero, const0_rtx)); + emit_insn (gen_altivec_vsum4s<VI_char>s (vtmp1, operands[1], vzero)); + emit_insn (gen_altivec_vsumsws_nomode (operands[0], vtmp1, vzero)); DONE; }") -(define_expand "reduc_umin_v4si" - [(set (match_operand:V4SI 0 "register_operand" "=v") - (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")] 217))] +(define_expand "reduc_uplus_v16qi" + [(set (match_operand:V16QI 0 "register_operand" "=v") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")] 217))] "TARGET_ALTIVEC" " { + rtx vzero = gen_reg_rtx (V4SImode); rtx vtmp1 = gen_reg_rtx (V4SImode); - rtx vtmp2 = gen_reg_rtx (V4SImode); - rtx vtmp3 = gen_reg_rtx (V4SImode); - - emit_insn (gen_altivec_vsldoi_v4si (vtmp1, operands[1], operands[1], - gen_rtx_CONST_INT (SImode, 8))); - emit_insn (gen_uminv4si3 (vtmp2, operands[1], vtmp1)); - emit_insn (gen_altivec_vsldoi_v4si (vtmp3, vtmp2, vtmp2, - gen_rtx_CONST_INT (SImode, 4))); - emit_insn (gen_uminv4si3 (operands[0], vtmp2, vtmp3)); - DONE; -}") -(define_expand "reduc_plus_v4si" - [(set (match_operand:V4SI 0 "register_operand" "=v") - (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")] 217))] - "TARGET_ALTIVEC" - " -{ - rtx vtmp1 = gen_reg_rtx (V4SImode); - rtx vtmp2 = gen_reg_rtx 
(V4SImode); - rtx vtmp3 = gen_reg_rtx (V4SImode); - - emit_insn (gen_altivec_vsldoi_v4si (vtmp1, operands[1], operands[1], - gen_rtx_CONST_INT (SImode, 8))); - emit_insn (gen_addv4si3 (vtmp2, operands[1], vtmp1)); - emit_insn (gen_altivec_vsldoi_v4si (vtmp3, vtmp2, vtmp2, - gen_rtx_CONST_INT (SImode, 4))); - emit_insn (gen_addv4si3 (operands[0], vtmp2, vtmp3)); - DONE; -}") - -(define_expand "reduc_plus_v4sf" - [(set (match_operand:V4SF 0 "register_operand" "=v") - (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")] 217))] - "TARGET_ALTIVEC" - " -{ - rtx vtmp1 = gen_reg_rtx (V4SFmode); - rtx vtmp2 = gen_reg_rtx (V4SFmode); - rtx vtmp3 = gen_reg_rtx (V4SFmode); - - emit_insn (gen_altivec_vsldoi_v4sf (vtmp1, operands[1], operands[1], - gen_rtx_CONST_INT (SImode, 8))); - emit_insn (gen_addv4sf3 (vtmp2, operands[1], vtmp1)); - emit_insn (gen_altivec_vsldoi_v4sf (vtmp3, vtmp2, vtmp2, - gen_rtx_CONST_INT (SImode, 4))); - emit_insn (gen_addv4sf3 (operands[0], vtmp2, vtmp3)); + emit_insn (gen_altivec_vspltisw (vzero, const0_rtx)); + emit_insn (gen_altivec_vsum4ubs (vtmp1, operands[1], vzero)); + emit_insn (gen_altivec_vsumsws_nomode (operands[0], vtmp1, vzero)); DONE; }") @@ -8367,6 +8367,13 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode, return temp; } + case VEC_LSHIFT_EXPR: + case VEC_RSHIFT_EXPR: + { + target = expand_vec_shift_expr (exp, target); + return target; + } + default: return lang_hooks.expand_expr (exp, original_target, tmode, modifier, alt_rtl); diff --git a/gcc/genopinit.c b/gcc/genopinit.c index eea084d..19a7f7c 100644 --- a/gcc/genopinit.c +++ b/gcc/genopinit.c @@ -196,6 +196,8 @@ static const char * const optabs[] = "vec_set_optab->handlers[$A].insn_code = CODE_FOR_$(vec_set$a$)", "vec_extract_optab->handlers[$A].insn_code = CODE_FOR_$(vec_extract$a$)", "vec_init_optab->handlers[$A].insn_code = CODE_FOR_$(vec_init$a$)", + "vec_shl_optab->handlers[$A].insn_code = CODE_FOR_$(vec_shl_$a$)", + 
"vec_shr_optab->handlers[$A].insn_code = CODE_FOR_$(vec_shr_$a$)", "vec_realign_load_optab->handlers[$A].insn_code = CODE_FOR_$(vec_realign_load_$a$)", "vcond_gen_code[$A] = CODE_FOR_$(vcond$a$)", "vcondu_gen_code[$A] = CODE_FOR_$(vcondu$a$)", @@ -203,7 +205,8 @@ static const char * const optabs[] = "reduc_umax_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_umax_$a$)", "reduc_smin_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_smin_$a$)", "reduc_umin_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_umin_$a$)", - "reduc_plus_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_plus_$a$)" + "reduc_splus_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_splus_$a$)" , + "reduc_uplus_optab->handlers[$A].insn_code = CODE_FOR_$(reduc_uplus_$a$)" }; static void gen_insn (rtx); diff --git a/gcc/optabs.c b/gcc/optabs.c index cd4f2cb..2202727 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -301,7 +301,13 @@ optab_for_tree_code (enum tree_code code, tree type) return TYPE_UNSIGNED (type) ? reduc_umin_optab : reduc_smin_optab; case REDUC_PLUS_EXPR: - return reduc_plus_optab; + return TYPE_UNSIGNED (type) ? reduc_uplus_optab : reduc_splus_optab; + + case VEC_LSHIFT_EXPR: + return vec_shl_optab; + + case VEC_RSHIFT_EXPR: + return vec_shr_optab; default: break; @@ -443,6 +449,61 @@ force_expand_binop (enum machine_mode mode, optab binoptab, return true; } +/* Generate insns for VEC_LSHIFT_EXPR, VEC_RSHIFT_EXPR. 
*/ + +rtx +expand_vec_shift_expr (tree vec_shift_expr, rtx target) +{ + enum insn_code icode; + rtx rtx_op1, rtx_op2; + enum machine_mode mode1; + enum machine_mode mode2; + enum machine_mode mode = TYPE_MODE (TREE_TYPE (vec_shift_expr)); + tree vec_oprnd = TREE_OPERAND (vec_shift_expr, 0); + tree shift_oprnd = TREE_OPERAND (vec_shift_expr, 1); + optab shift_optab; + rtx pat; + + switch (TREE_CODE (vec_shift_expr)) + { + case VEC_RSHIFT_EXPR: + shift_optab = vec_shr_optab; + break; + case VEC_LSHIFT_EXPR: + shift_optab = vec_shl_optab; + break; + default: + gcc_unreachable (); + } + + icode = (int) shift_optab->handlers[(int) mode].insn_code; + gcc_assert (icode != CODE_FOR_nothing); + + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + + rtx_op1 = expand_expr (vec_oprnd, NULL_RTX, VOIDmode, EXPAND_NORMAL); + if (!(*insn_data[icode].operand[1].predicate) (rtx_op1, mode1) + && mode1 != VOIDmode) + rtx_op1 = force_reg (mode1, rtx_op1); + + rtx_op2 = expand_expr (shift_oprnd, NULL_RTX, VOIDmode, EXPAND_NORMAL); + if (!(*insn_data[icode].operand[2].predicate) (rtx_op2, mode2) + && mode2 != VOIDmode) + rtx_op2 = force_reg (mode2, rtx_op2); + + if (!target + || ! (*insn_data[icode].operand[0].predicate) (target, mode)) + target = gen_reg_rtx (mode); + + /* Emit instruction */ + pat = GEN_FCN (icode) (target, rtx_op1, rtx_op2); + gcc_assert (pat); + emit_insn (pat); + + return target; +} + /* This subroutine of expand_doubleword_shift handles the cases in which the effective shift value is >= BITS_PER_WORD. 
The arguments and return value are the same as for the parent routine, except that SUPERWORD_OP1 @@ -5074,11 +5135,14 @@ init_optabs (void) reduc_umax_optab = init_optab (UNKNOWN); reduc_smin_optab = init_optab (UNKNOWN); reduc_umin_optab = init_optab (UNKNOWN); - reduc_plus_optab = init_optab (UNKNOWN); + reduc_splus_optab = init_optab (UNKNOWN); + reduc_uplus_optab = init_optab (UNKNOWN); vec_extract_optab = init_optab (UNKNOWN); vec_set_optab = init_optab (UNKNOWN); vec_init_optab = init_optab (UNKNOWN); + vec_shl_optab = init_optab (UNKNOWN); + vec_shr_optab = init_optab (UNKNOWN); vec_realign_load_optab = init_optab (UNKNOWN); movmisalign_optab = init_optab (UNKNOWN); diff --git a/gcc/optabs.h b/gcc/optabs.h index 2495fed..91afce3 100644 --- a/gcc/optabs.h +++ b/gcc/optabs.h @@ -236,7 +236,8 @@ enum optab_index OTI_reduc_umax, OTI_reduc_smin, OTI_reduc_umin, - OTI_reduc_plus, + OTI_reduc_splus, + OTI_reduc_uplus, /* Set specified field of vector operand. */ OTI_vec_set, @@ -244,6 +245,9 @@ enum optab_index OTI_vec_extract, /* Initialize vector operand. */ OTI_vec_init, + /* Whole vector shift. The shift amount is in bits. */ + OTI_vec_shl, + OTI_vec_shr, /* Extract specified elements from vectors, for vector load. 
*/ OTI_vec_realign_load, @@ -358,11 +362,14 @@ extern GTY(()) optab optab_table[OTI_MAX]; #define reduc_umax_optab (optab_table[OTI_reduc_umax]) #define reduc_smin_optab (optab_table[OTI_reduc_smin]) #define reduc_umin_optab (optab_table[OTI_reduc_umin]) -#define reduc_plus_optab (optab_table[OTI_reduc_plus]) +#define reduc_splus_optab (optab_table[OTI_reduc_splus]) +#define reduc_uplus_optab (optab_table[OTI_reduc_uplus]) #define vec_set_optab (optab_table[OTI_vec_set]) #define vec_extract_optab (optab_table[OTI_vec_extract]) #define vec_init_optab (optab_table[OTI_vec_init]) +#define vec_shl_optab (optab_table[OTI_vec_shl]) +#define vec_shr_optab (optab_table[OTI_vec_shr]) #define vec_realign_load_optab (optab_table[OTI_vec_realign_load]) #define powi_optab (optab_table[OTI_powi]) @@ -575,4 +582,7 @@ bool expand_vec_cond_expr_p (tree, enum machine_mode); /* Generate code for VEC_COND_EXPR. */ extern rtx expand_vec_cond_expr (tree, rtx); +/* Generate code for VEC_LSHIFT_EXPR and VEC_RSHIFT_EXPR. */ +extern rtx expand_vec_shift_expr (tree, rtx); + #endif /* GCC_OPTABS_H */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index e452f4cd..a526fb1 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,21 @@ +2005-06-21 Dorit Nuzman <dorit@il.ibm.com> + + * lib/target-supports.exp (check_effective_target_vect_reduction): + Remove. + * gcc.dg/vect/vect.exp: Run tests with additional flags separately. + * gcc.dg/vect/vect-reduc-1.c: Vectorizable on all relevant platforms - + remove vect_reduction target keyword. Also avoid two returns in main. + * gcc.dg/vect/vect-reduc-3.c: Likewise. + * gcc.dg/vect/vect-reduc-2.c: Likewise. Also initialize diff to 0. + * gcc.dg/vect/vect-reduc-1short.c: New test. + * gcc.dg/vect/vect-reduc-1char.c: New test. + * gcc.dg/vect/vect-reduc-2short.c: New test. + * gcc.dg/vect/vect-reduc-2char.c: New test. + * gcc.dg/vect/vect-reduc-6.c: New test. + * gcc.dg/vect/trapv-vect-reduc-4.c: New test. 
+ * gcc.dg/vect/fast-math-vect-reduc-5.c: New test. + * gcc.dg/vect/fast-math-vect-reduc-7.c: New test + 2005-06-21 Tobias Schl"uter <tobias.schlueter@physik.uni-muenchen.de> Paul Thomas <pault@gcc.gnu.org> diff --git a/gcc/testsuite/gcc.dg/vect/fast-math-vect-reduc-5.c b/gcc/testsuite/gcc.dg/vect/fast-math-vect-reduc-5.c new file mode 100644 index 0000000..dd84f5c --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/fast-math-vect-reduc-5.c @@ -0,0 +1,53 @@ +/* { dg-require-effective-target vect_float } */ + +/* need -funsafe-math-optimizations to vectorize the summation. + also need -ffinite-math-only to create the min/max expr. */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 16 +#define DIFF 242 + +int main1 (float x, float max_result) +{ + int i; + float b[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + float c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + float diff = 2; + float max = x; + float min = 10; + + for (i = 0; i < N; i++) { + diff += (b[i] - c[i]); + } + + for (i = 0; i < N; i++) { + max = max < c[i] ? c[i] : max; + } + + for (i = 0; i < N; i++) { + min = min > c[i] ? c[i] : min; + } + + /* check results: */ + if (diff != DIFF) + abort (); + if (max != max_result) + abort (); + if (min != 0) + abort (); + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (100, 100); + main1 (0, 15); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/fast-math-vect-reduc-7.c b/gcc/testsuite/gcc.dg/vect/fast-math-vect-reduc-7.c new file mode 100644 index 0000000..797b1a7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/fast-math-vect-reduc-7.c @@ -0,0 +1,53 @@ +/* { dg-require-effective-target vect_double } */ + +/* need -funsafe-math-optimizations to vectorize the summation. + also need -ffinite-math-only to create the min/max expr. 
*/ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 16 +#define DIFF 242 + +int main1 (double x, double max_result) +{ + int i; + double b[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + double c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + double diff = 2; + double max = x; + double min = 10; + + for (i = 0; i < N; i++) { + diff += (b[i] - c[i]); + } + + for (i = 0; i < N; i++) { + max = max < c[i] ? c[i] : max; + } + + for (i = 0; i < N; i++) { + min = min > c[i] ? c[i] : min; + } + + /* check results: */ + if (diff != DIFF) + abort (); + if (max != max_result) + abort (); + if (min != 0) + abort (); + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (100, 100); + main1 (0, 15); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c b/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c new file mode 100644 index 0000000..2129717 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c @@ -0,0 +1,49 @@ +/* { dg-require-effective-target vect_int } */ +/* { dg-do compile } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 16 +#define DIFF 242 + +int main1 (int x, int max_result) +{ + int i; + int b[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + int c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + int diff = 2; + int max = x; + int min = 10; + + for (i = 0; i < N; i++) { + diff += (b[i] - c[i]); + } + + for (i = 0; i < N; i++) { + max = max < c[i] ? c[i] : max; + } + + for (i = 0; i < N; i++) { + min = min > c[i] ? 
c[i] : min; + } + + /* check results: */ + if (diff != DIFF) + abort (); + if (max != max_result) + abort (); + if (min != 0) + abort (); +} + +int main (void) +{ + check_vect (); + + main1 (100, 100); + main1 (0, 15); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-1.c index cb29357..bc87a5c 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-1.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-1.c @@ -47,9 +47,9 @@ int main (void) { check_vect (); - return main1 (100, 100); - return main1 (0, 15); + main1 (100, 100); + main1 (0, 15); } -/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail {! vect_reduction} } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail i?86-*-* x86_64-*-* } } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-1char.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-1char.c new file mode 100644 index 0000000..e85fa4a --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-1char.c @@ -0,0 +1,51 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 16 +#define DIFF 242 + +int main1 (unsigned char x, unsigned char max_result) +{ + int i; + unsigned char ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned char uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + unsigned char udiff = 2; + unsigned char umax = x; + unsigned char umin = 10; + + for (i = 0; i < N; i++) { + udiff += (unsigned char)(ub[i] - uc[i]); + } + + for (i = 0; i < N; i++) { + umax = umax < uc[i] ? uc[i] : umax; + } + + for (i = 0; i < N; i++) { + umin = umin > uc[i] ? 
uc[i] : umin; + } + + /* check results: */ + if (udiff != DIFF) + abort (); + if (umax != max_result) + abort (); + if (umin != 0) + abort (); + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (100, 100); + main1 (0, 15); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-1short.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-1short.c new file mode 100644 index 0000000..bd116be --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-1short.c @@ -0,0 +1,51 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 16 +#define DIFF 242 + +int main1 (unsigned short x, unsigned short max_result) +{ + int i; + unsigned short ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + unsigned short uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + unsigned short udiff = 2; + unsigned short umax = x; + unsigned short umin = 10; + + for (i = 0; i < N; i++) { + udiff += (unsigned short)(ub[i] - uc[i]); + } + + for (i = 0; i < N; i++) { + umax = umax < uc[i] ? uc[i] : umax; + } + + for (i = 0; i < N; i++) { + umin = umin > uc[i] ? 
uc[i] : umin; + } + + /* check results: */ + if (udiff != DIFF) + abort (); + if (umax != max_result) + abort (); + if (umin != 0) + abort (); + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (100, 100); + main1 (0, 15); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail i?86-*-* x86_64-*-* } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-2.c index e44d3f3..ca1a3da 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-2.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-2.c @@ -1,11 +1,10 @@ - /* { dg-require-effective-target vect_int } */ #include <stdarg.h> #include "tree-vect.h" #define N 16 -#define DIFF 242 +#define DIFF 240 /* Test vectorization of reduction of signed-int. */ @@ -14,7 +13,7 @@ int main1 (int x, int max_result) int i; int b[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; int c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; - int diff = 2; + int diff = 0; int max = x; int min = 10; @@ -45,9 +44,10 @@ int main (void) { check_vect (); - return main1 (100, 100); - return main1 (0, 15); + main1 (100, 100); + main1 (0, 15); + return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail {! 
vect_reduction} } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail i?86-*-* x86_64-*-* } } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c new file mode 100644 index 0000000..eddc2cf --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-2char.c @@ -0,0 +1,51 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 16 +#define DIFF 121 + +int main1 (char x, char max_result) +{ + int i; + char b[N] = {0,2,3,6,8,10,12,14,16,18,20,22,24,26,28,30}; + char c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + signed char diff = 2; + char max = x; + char min = 10; + + for (i = 0; i < N; i++) { + diff += (b[i] - c[i]); + } + + for (i = 0; i < N; i++) { + max = max < c[i] ? c[i] : max; + } + + for (i = 0; i < N; i++) { + min = min > c[i] ? c[i] : min; + } + + /* check results: */ + if (diff != DIFF) + abort (); + if (max != max_result) + abort (); + if (min != 0) + abort (); + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (100, 100); + main1 (0, 15); + return 0 ; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { xfail i?86-*-* x86_64-*-* } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c new file mode 100644 index 0000000..f0880aa --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-2short.c @@ -0,0 +1,51 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 16 +#define DIFF 242 + +int main1 (short x, short max_result) +{ + int i; + short b[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + short c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + short diff = 2; + short max = x; + short min = 10; + + for (i = 0; i < N; i++) { + diff += (b[i] - 
c[i]); + } + for (i = 0; i < N; i++) { + max = max < c[i] ? c[i] : max; + } + + for (i = 0; i < N; i++) { + min = min > c[i] ? c[i] : min; + } + + /* check results: */ + if (diff != DIFF) + abort (); + if (max != max_result) + abort (); + if (min != 0) + abort (); + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (100, 100); + main1 (0, 15); + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-3.c index 8937254..0011837 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-reduc-3.c +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-3.c @@ -4,12 +4,11 @@ #include "tree-vect.h" #define N 16 -#define DIFF 240 /* Test vectorization of reduction of unsigned-int in the presence of unknown-loop-bound. */ -int main1 (int n) +int main1 (int n, int res) { int i; unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; @@ -22,7 +21,7 @@ int main1 (int n) } /* check results: */ - if (udiff != DIFF) + if (udiff != res) abort (); return 0; @@ -32,9 +31,10 @@ int main (void) { check_vect (); - return main1 (N); - return main1 (N-1); + main1 (N, 240); + main1 (N-1, 210); + return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail {! 
vect_reduction} } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c new file mode 100644 index 0000000..4e4f155 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-6.c @@ -0,0 +1,51 @@ +/* { dg-require-effective-target vect_float } */ + +#include <stdarg.h> +#include "tree-vect.h" + +#define N 16 +#define DIFF 242 + +int main1 (float x, float max_result) +{ + int i; + float b[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; + float c[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + float diff = 2; + float max = x; + float min = 10; + + for (i = 0; i < N; i++) { + diff += (b[i] - c[i]); + } + + for (i = 0; i < N; i++) { + max = max < c[i] ? c[i] : max; + } + + for (i = 0; i < N; i++) { + min = min > c[i] ? c[i] : min; + } + + /* check results: */ + if (diff != DIFF) + abort (); + if (max != max_result) + abort (); + if (min != 0) + abort (); + + return 0; +} + +int main (void) +{ + check_vect (); + + main1 (100 ,100); + main1 (0, 15); + return 0; +} + +/* need -ffast-math to vectorizer these loops. */ +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp index 3f52ed6..6ab7e3d 100644 --- a/gcc/testsuite/gcc.dg/vect/vect.exp +++ b/gcc/testsuite/gcc.dg/vect/vect.exp @@ -76,7 +76,25 @@ if [istarget "powerpc*-*-*"] { dg-init # Main loop. 
-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cS\]]] \ +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/pr*.\[cS\]]] \ + "" $DEFAULT_VECTCFLAGS +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/vect-*.\[cS\]]] \ + "" $DEFAULT_VECTCFLAGS + +#### Tests with special options +global SAVED_DEFAULT_VECTCFLAGS +set SAVED_DEFAULT_VECTCFLAGS $DEFAULT_VECTCFLAGS + +# -ffast-math tests +set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS +lappend DEFAULT_VECTCFLAGS "-ffast-math" +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/fast-math-vect*.\[cS\]]] \ + "" $DEFAULT_VECTCFLAGS + +# -ftrapv tests +set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS +lappend DEFAULT_VECTCFLAGS "-ftrapv" +dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/trapv-vect*.\[cS\]]] \ "" $DEFAULT_VECTCFLAGS # Clean up. diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 4facec6..0378169 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -988,23 +988,6 @@ proc check_effective_target_vect_int_mult { } { return $et_vect_int_mult_saved } -# Return 1 if the target supports vector reduction - -proc check_effective_target_vect_reduction { } { - global et_vect_reduction_saved - - if [info exists et_vect_reduction_saved] { - verbose "check_effective_target_vect_reduction: using cached result" 2 - } else { - set et_vect_reduction_saved 0 - if { [istarget powerpc*-*-*] } { - set et_vect_reduction_saved 1 - } - } - verbose "check_effective_target_vect_reduction: returning $et_vect_reduction_saved" 2 - return $et_vect_reduction_saved -} - # Return 1 if the target supports atomic operations on "int" and "long". 
proc check_effective_target_sync_int_long { } { diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c index 7fa4350..ee30ccc 100644 --- a/gcc/tree-inline.c +++ b/gcc/tree-inline.c @@ -1692,6 +1692,8 @@ estimate_num_insns_1 (tree *tp, int *walk_subtrees, void *data) case RSHIFT_EXPR: case LROTATE_EXPR: case RROTATE_EXPR: + case VEC_LSHIFT_EXPR: + case VEC_RSHIFT_EXPR: case BIT_IOR_EXPR: case BIT_XOR_EXPR: diff --git a/gcc/tree-pretty-print.c b/gcc/tree-pretty-print.c index 1922be7..04cc8fa 100644 --- a/gcc/tree-pretty-print.c +++ b/gcc/tree-pretty-print.c @@ -1043,6 +1043,8 @@ dump_generic_node (pretty_printer *buffer, tree node, int spc, int flags, case RSHIFT_EXPR: case LROTATE_EXPR: case RROTATE_EXPR: + case VEC_LSHIFT_EXPR: + case VEC_RSHIFT_EXPR: case BIT_IOR_EXPR: case BIT_XOR_EXPR: case BIT_AND_EXPR: @@ -1838,6 +1840,8 @@ op_prio (tree op) case REDUC_MAX_EXPR: case REDUC_MIN_EXPR: case REDUC_PLUS_EXPR: + case VEC_LSHIFT_EXPR: + case VEC_RSHIFT_EXPR: return 16; case SAVE_EXPR: @@ -1925,6 +1929,12 @@ op_symbol (tree op) case RSHIFT_EXPR: return ">>"; + case VEC_LSHIFT_EXPR: + return "v<<"; + + case VEC_RSHIFT_EXPR: + return "v>>"; + case PLUS_EXPR: return "+"; diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c index 2da1ed2..fc75222 100644 --- a/gcc/tree-vect-generic.c +++ b/gcc/tree-vect-generic.c @@ -448,6 +448,7 @@ expand_vector_operations_1 (block_stmt_iterator *bsi) compute_type = TREE_TYPE (type); } + gcc_assert (code != VEC_LSHIFT_EXPR && code != VEC_RSHIFT_EXPR); rhs = expand_vector_operation (bsi, type, compute_type, rhs, code); if (lang_hooks.types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (rhs))) *p_rhs = rhs; diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c index 2b4d1d7..a4417d4 100644 --- a/gcc/tree-vect-transform.c +++ b/gcc/tree-vect-transform.c @@ -834,6 +834,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, { stmt_vec_info stmt_info = vinfo_for_stmt (stmt); tree vectype = 
STMT_VINFO_VECTYPE (stmt_info); + enum machine_mode mode = TYPE_MODE (vectype); loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); basic_block exit_bb; @@ -843,15 +844,18 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, block_stmt_iterator exit_bsi; tree vec_dest; tree new_temp; + tree new_name; tree epilog_stmt; tree new_scalar_dest, exit_phi; - tree bitsize, bitpos; + tree bitsize, bitpos, bytesize; enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1)); tree scalar_initial_def; tree vec_initial_def; tree orig_name; imm_use_iterator imm_iter; use_operand_p use_p; + bool extract_scalar_result; + bool adjust_in_epilog; /*** 1. Create the reduction def-use cycle ***/ @@ -888,63 +892,214 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, exit_bsi = bsi_start (exit_bb); - /* 2.2 Create: - v_out2 = reduc_expr <v_out1> - s_out3 = extract_field <v_out2, 0> */ + new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); + bitsize = TYPE_SIZE (scalar_type); + bytesize = TYPE_SIZE_UNIT (scalar_type); - vec_dest = vect_create_destination_var (scalar_dest, vectype); - epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, - build1 (reduc_code, vectype, PHI_RESULT (new_phi))); - new_temp = make_ssa_name (vec_dest, epilog_stmt); - TREE_OPERAND (epilog_stmt, 0) = new_temp; - bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + /* 2.2 Create the reduction code. 
*/ - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + if (reduc_code < NUM_TREE_CODES) { - fprintf (vect_dump, "transform reduction: created epilog code:"); - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); - } + /*** Case 1: Create: + v_out2 = reduc_expr <v_out1> */ - new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); - bitsize = TYPE_SIZE (scalar_type); + if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + fprintf (vect_dump, "Reduce using direct vector reduction."); - /* The result is in the low order bits. */ - if (BITS_BIG_ENDIAN) - bitpos = size_binop (MULT_EXPR, - bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), - TYPE_SIZE (scalar_type)); + vec_dest = vect_create_destination_var (scalar_dest, vectype); + epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, + build1 (reduc_code, vectype, PHI_RESULT (new_phi))); + new_temp = make_ssa_name (vec_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_temp; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + + extract_scalar_result = true; + adjust_in_epilog = true; + } else - bitpos = bitsize_zero_node; + { + enum tree_code shift_code; + bool have_whole_vector_shift = true; + enum tree_code code = TREE_CODE (TREE_OPERAND (stmt, 1)); /* CHECKME */ + int bit_offset; + int element_bitsize = tree_low_cst (bitsize, 1); + int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); + tree vec_temp; + + /* The result of the reduction is expected to be at the LSB bits + of the vector. For big-endian targets this means at the right + end of the vector. For little-endian targets this means at the + left end of the vector. 
*/ + + if (BITS_BIG_ENDIAN + && vec_shr_optab->handlers[mode].insn_code != CODE_FOR_nothing) + shift_code = VEC_RSHIFT_EXPR; + else if (!BITS_BIG_ENDIAN + && vec_shl_optab->handlers[mode].insn_code != CODE_FOR_nothing) + shift_code = VEC_LSHIFT_EXPR; + else + have_whole_vector_shift = false; + + if (have_whole_vector_shift) + { + /*** Case 2: + for (offset = VS/2; offset >= element_size; offset/=2) + { + Create: va' = vec_shift <va, offset> + Create: va = vop <va, va'> + } */ + + if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + fprintf (vect_dump, "Reduce using vector shifts"); + + vec_dest = vect_create_destination_var (scalar_dest, vectype); + new_temp = PHI_RESULT (new_phi); + + for (bit_offset = vec_size_in_bits/2; + bit_offset >= element_bitsize; + bit_offset /= 2) + { + tree bitpos = size_int (bit_offset); + + epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, + build2 (shift_code, vectype, new_temp, bitpos)); + new_name = make_ssa_name (vec_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_name; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + + + epilog_stmt = build2 (MODIFY_EXPR, vectype, vec_dest, + build2 (code, vectype, new_name, new_temp)); + new_temp = make_ssa_name (vec_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_temp; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + } + + extract_scalar_result = true; + adjust_in_epilog = true; + } + else + { + /*** Case 3: + Create: s = init; + for (offset=0; offset<vector_size; offset+=element_size;) + { + Create: s' = extract_field <v_out2, offset> + Create: s = op <s, s'> + } */ + + if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + fprintf (vect_dump, "Reduce using scalar code. 
"); + + vec_temp = PHI_RESULT (new_phi); + vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); + + /* first iteration is peeled out when possible to minimize + the number of operations we generate: */ + if (code == PLUS_EXPR + && (integer_zerop (scalar_initial_def) + || real_zerop (scalar_initial_def))) + { + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, + build3 (BIT_FIELD_REF, scalar_type, + vec_temp, bitsize, bitsize_zero_node)); + new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_temp; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + + bit_offset = element_bitsize; + } + else + { + new_temp = scalar_initial_def; + bit_offset = 0; + } - epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, - build3 (BIT_FIELD_REF, scalar_type, - new_temp, bitsize, bitpos)); - new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); - TREE_OPERAND (epilog_stmt, 0) = new_temp; - bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + for (; + bit_offset < vec_size_in_bits; + bit_offset += element_bitsize) + { + tree bitpos = bitsize_int (bit_offset); + + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, + build3 (BIT_FIELD_REF, scalar_type, + vec_temp, bitsize, bitpos)); + new_name = make_ssa_name (new_scalar_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_name; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + + + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, + build2 (code, scalar_type, new_name, new_temp)); + new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_temp; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + if (vect_print_dump_info 
(REPORT_DETAILS, UNKNOWN_LOC)) + print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + } + + extract_scalar_result = false; + adjust_in_epilog = false; + } + } - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + /* 2.3 Extract the final scalar result. Create: + s_out3 = extract_field <v_out2, bitpos> */ - /* 2.3 Adjust the final result by the initial value of the reduction - variable. (when such adjustment is not needed, then - 'scalar_initial_def' is zero). + if (extract_scalar_result) + { + if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + fprintf (vect_dump, "extract scalar result"); - Create: - s_out = scalar_expr <s_out, scalar_initial_def> */ + /* The result is in the low order bits. */ + if (BITS_BIG_ENDIAN) + bitpos = size_binop (MULT_EXPR, + bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), + TYPE_SIZE (scalar_type)); + else + bitpos = bitsize_zero_node; + + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, + build3 (BIT_FIELD_REF, scalar_type, + new_temp, bitsize, bitpos)); + new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_temp; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + } - epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, - build2 (code, scalar_type, new_temp, scalar_initial_def)); - new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); - TREE_OPERAND (epilog_stmt, 0) = new_temp; - bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); - if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) - print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + /* 2.4 Adjust the final result by the initial value of the reduction + variable. (when such adjustment is not needed, then + 'scalar_initial_def' is zero). 
- - /* 2.4 Replace uses of s_out0 with uses of s_out3 */ + Create: + s_out = scalar_expr <s_out, scalar_initial_def> */ + + if (adjust_in_epilog) + { + epilog_stmt = build2 (MODIFY_EXPR, scalar_type, new_scalar_dest, + build2 (code, scalar_type, new_temp, scalar_initial_def)); + new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); + TREE_OPERAND (epilog_stmt, 0) = new_temp; + bsi_insert_after (&exit_bsi, epilog_stmt, BSI_NEW_STMT); + + if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) + print_generic_expr (vect_dump, epilog_stmt, TDF_SLIM); + } + + + /* 2.5 Replace uses of s_out0 with uses of s_out3 */ /* Find the loop-closed-use at the loop exit of the original scalar result. (The reduction result is expected to have @@ -954,10 +1109,10 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, tree reduction_op, FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) { if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p)))) - { - exit_phi = USE_STMT (use_p); - break; - } + { + exit_phi = USE_STMT (use_p); + break; + } } orig_name = PHI_RESULT (exit_phi); @@ -1067,13 +1222,13 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) { if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) fprintf (vect_dump, "no optab for reduction."); - return false; + reduc_code = NUM_TREE_CODES; } if (reduc_optab->handlers[(int) vec_mode].insn_code == CODE_FOR_nothing) { if (vect_print_dump_info (REPORT_DETAILS, UNKNOWN_LOC)) - fprintf (vect_dump, "op not supported by target."); - return false; + fprintf (vect_dump, "reduc op not supported by target."); + reduc_code = NUM_TREE_CODES; } if (!vec_stmt) /* transformation not required. 
*/ diff --git a/gcc/tree.def b/gcc/tree.def index 2b8c280..26a8703 100644 --- a/gcc/tree.def +++ b/gcc/tree.def @@ -957,6 +957,12 @@ DEFTREECODE (REDUC_MAX_EXPR, "reduc_max_expr", tcc_unary, 1) DEFTREECODE (REDUC_MIN_EXPR, "reduc_min_expr", tcc_unary, 1) DEFTREECODE (REDUC_PLUS_EXPR, "reduc_plus_expr", tcc_unary, 1) +/* Whole vector left/right shift in bits. + Operand 0 is a vector to be shifted. + Operand 1 is an integer shift amount in bits. */ +DEFTREECODE (VEC_LSHIFT_EXPR, "vec_lshift_expr", tcc_binary, 2) +DEFTREECODE (VEC_RSHIFT_EXPR, "vec_rshift_expr", tcc_binary, 2) + /* Local variables: mode:c |