-rw-r--r--   gcc/ChangeLog          | 14
-rw-r--r--   gcc/config/i386/i386.c | 77
-rw-r--r--   gcc/config/i386/sse.md | 29
3 files changed, 79 insertions, 41 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 3719552..2eb5259 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,19 @@
 2012-06-26  Richard Henderson  <rth@redhat.com>
 
+	* config/i386/i386.c (bdesc_args): Update.  Change
+	IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI to OPTION_MASK_ISA_SSE2.
+	(IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI): New.
+	(ix86_builtin_mul_widen_even): Use it.
+	(ix86_builtin_mul_widen_odd): Relax SMUL_ODD from sse4 to sse2.
+	(ix86_expand_mul_widen_evenodd): Handle signed for sse2.
+	* config/i386/sse.md (vec_widen_<s>mult_hi_<V124_AVX2>): Allow
+	for all SSE2.
+	(vec_widen_<s>mult_lo_<V124_AVX2>): Likewise.
+	(vec_widen_<s>mult_odd_<VI4_AVX2>): Likewise.  Relax from V124_AVX2.
+	(vec_widen_smult_even_v4si): New.
+
+2012-06-26  Richard Henderson  <rth@redhat.com>
+
 	* config/i386/sse.md (mul<VI8_AVX2>3): Change from insn_and_split
 	to expander; move guts to ...
 	* config/i386/i386.c (ix86_expand_sse2_mulvxdi3): ... here.  Add
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 5cf230f..b96fc6e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -25758,6 +25758,7 @@ enum ix86_builtins
   IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI,
   IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI,
   IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI,
+  IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI,
   IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI,
   IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI,
 
@@ -26620,7 +26621,9 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_vw_umul_even_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_smult_even_v4si, "__builtin_ia32_vw_smul_even_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_odd_v4si, "__builtin_ia32_vw_umul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_smult_odd_v4si, "__builtin_ia32_vw_smul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
 
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
@@ -26747,7 +26750,6 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
-  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_vec_widen_smult_odd_v4si, "__builtin_ia32_vw_smul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
   { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
 
   /* SSE4.1 */
@@ -31067,18 +31069,10 @@ ix86_builtin_mul_widen_even (tree type)
   switch (TYPE_MODE (type))
     {
     case V4SImode:
-      if (uns_p)
-	{
-	  if (!TARGET_SSE2)
-	    return NULL;
-	  code = IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI;
-	}
-      else
-	{
-	  if (!TARGET_SSE4_1)
-	    return NULL;
-	  code = IX86_BUILTIN_PMULDQ128;
-	}
+      if (!TARGET_SSE2)
+	return NULL;
+      code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI
+	      : IX86_BUILTIN_VEC_WIDEN_SMUL_EVEN_V4SI);
       break;
 
     case V8SImode:
@@ -31103,18 +31097,10 @@ ix86_builtin_mul_widen_odd (tree type)
   switch (TYPE_MODE (type))
     {
     case V4SImode:
-      if (uns_p)
-	{
-	  if (!TARGET_SSE2)
-	    return NULL;
-	  code = IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI;
-	}
-      else
-	{
-	  if (!TARGET_SSE4_1)
-	    return NULL;
-	  code = IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI;
-	}
+      if (!TARGET_SSE2)
+	return NULL;
+      code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI
+	      : IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI);
       break;
 
     case V8SImode:
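[Annotation, not part of the commit: the even/odd builtins selected above
have simple lane semantics.  A plain-C reference sketch follows, assuming
V4SI is four little-endian int32 lanes; the _ref names are illustrative,
not GCC's.  PMULUDQ natively multiplies the even lanes (0 and 2), which is
why the unsigned even case maps straight onto CODE_FOR_sse2_umulv2siv2di3
and why the odd cases first shift each operand right by the unit bitsize.]

    #include <stdint.h>

    /* vec_widen_smult_even_v4si: lanes 0 and 2, widened to 64 bits.  */
    static void
    widen_smult_even_v4si_ref (int64_t d[2],
                               const int32_t a[4], const int32_t b[4])
    {
      d[0] = (int64_t) a[0] * b[0];
      d[1] = (int64_t) a[2] * b[2];
    }

    /* vec_widen_smult_odd_v4si: lanes 1 and 3, widened to 64 bits.  */
    static void
    widen_smult_odd_v4si_ref (int64_t d[2],
                              const int32_t a[4], const int32_t b[4])
    {
      d[0] = (int64_t) a[1] * b[1];
      d[1] = (int64_t) a[3] * b[3];
    }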
@@ -38774,12 +38760,12 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
 	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
 	  return;
 	}
+
+      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
       op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
-			  GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
-			  1, OPTAB_DIRECT);
+			  x, NULL, 1, OPTAB_DIRECT);
       op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
-			  GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
-			  1, OPTAB_DIRECT);
+			  x, NULL, 1, OPTAB_DIRECT);
       op1 = gen_lowpart (mode, op1);
       op2 = gen_lowpart (mode, op2);
     }
@@ -38801,7 +38787,38 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
 	  x = gen_xop_pmacsdql (dest, op1, op2, x);
 	}
       else
-	gcc_unreachable ();
+	{
+	  rtx s1, s2, t0, t1, t2;
+
+	  /* The easiest way to implement this without PMULDQ is to go through
+	     the motions as if we are performing a full 64-bit multiply.  With
+	     the exception that we need to do less shuffling of the elements.  */
+
+	  /* Compute the sign-extension, aka highparts, of the two operands.  */
+	  s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+				    op1, pc_rtx, pc_rtx);
+	  s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+				    op2, pc_rtx, pc_rtx);
+
+	  /* Multiply LO(A) * HI(B), and vice-versa.  */
+	  t1 = gen_reg_rtx (wmode);
+	  t2 = gen_reg_rtx (wmode);
+	  emit_insn (gen_sse2_umulv2siv2di3 (t1, s1, op2));
+	  emit_insn (gen_sse2_umulv2siv2di3 (t2, s2, op1));
+
+	  /* Multiply LO(A) * LO(B).  */
+	  t0 = gen_reg_rtx (wmode);
+	  emit_insn (gen_sse2_umulv2siv2di3 (t0, op1, op2));
+
+	  /* Combine and shift the highparts into place.  */
+	  t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
+	  t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
+			     1, OPTAB_DIRECT);
+
+	  /* Combine high and low parts.  */
+	  force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
+	  return;
+	}
 
   emit_insn (x);
 }
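[Annotation, not part of the commit: the new SSE2-only signed path above
rests on the identity a*b = A*B - 2^32*(sa*B + sb*A) (mod 2^64), where A and
B are the operands reinterpreted as unsigned and sa, sb are 1 for negative
inputs.  The PCMPGTD masks s1/s2 are 0 or 0xffffffff, i.e. -1 (mod 2^32), so
multiplying them by the other operand and shifting left 32 supplies exactly
the two correction terms.  A one-lane scalar sketch, with umul32 standing in
for a single PMULUDQ lane; helper names are illustrative only.]

    #include <assert.h>
    #include <stdint.h>

    /* One PMULUDQ lane: unsigned 32x32->64 multiply.  */
    static uint64_t
    umul32 (uint32_t a, uint32_t b)
    {
      return (uint64_t) a * b;
    }

    /* Signed 32x32->64 multiply built only from umul32 and all-ones sign
       masks, mirroring the s1/s2/t0/t1/t2 steps emitted above.  */
    static int64_t
    smul32_via_umul (int32_t a, int32_t b)
    {
      uint32_t s1 = a < 0 ? 0xffffffffu : 0;   /* PCMPGTD: 0 > a mask */
      uint32_t s2 = b < 0 ? 0xffffffffu : 0;   /* PCMPGTD: 0 > b mask */
      uint64_t t1 = umul32 (s1, (uint32_t) b); /* correction for a's sign */
      uint64_t t2 = umul32 (s2, (uint32_t) a); /* correction for b's sign */
      uint64_t t0 = umul32 ((uint32_t) a, (uint32_t) b); /* LO(A)*LO(B) */
      /* The sum mod 2^64 is the two's-complement pattern of a*b; the final
         cast reinterprets it as signed (well defined under GCC).  */
      return (int64_t) (t0 + ((t1 + t2) << 32));
    }

    int
    main (void)
    {
      assert (smul32_via_umul (-7, 9) == -63);
      assert (smul32_via_umul (-40000, 50000) == -2000000000LL);
      assert (smul32_via_umul (INT32_MIN, INT32_MIN)
              == (int64_t) INT32_MIN * INT32_MIN);
      return 0;
    }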
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 81e7dc0..754b8b4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5607,9 +5607,7 @@
    (any_extend:<sseunpackmode>
      (match_operand:VI124_AVX2 1 "register_operand"))
    (match_operand:VI124_AVX2 2 "register_operand")]
-  ; Note that SSE2 does not have signed SI multiply
-  "TARGET_XOP || TARGET_SSE4_1
-   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
+  "TARGET_SSE2"
 {
   ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
 			      <u_bool>, true);
@@ -5621,23 +5619,32 @@
    (any_extend:<sseunpackmode>
      (match_operand:VI124_AVX2 1 "register_operand"))
    (match_operand:VI124_AVX2 2 "register_operand")]
-  ; Note that SSE2 does not have signed SI multiply
-  "TARGET_XOP || TARGET_SSE4_1
-   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
+  "TARGET_SSE2"
 {
   ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
 			      <u_bool>, false);
   DONE;
 })
 
+;; Most widen_<s>mult_even_<mode> can be handled directly from other
+;; named patterns, but signed V4SI needs special help for plain SSE2.
+(define_expand "vec_widen_smult_even_v4si"
+  [(match_operand:V2DI 0 "register_operand")
+   (match_operand:V4SI 1 "register_operand")
+   (match_operand:V4SI 2 "register_operand")]
+  "TARGET_SSE2"
+{
+  ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
+				 false, false);
+  DONE;
+})
+
 (define_expand "vec_widen_<s>mult_odd_<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
    (any_extend:<sseunpackmode>
-     (match_operand:VI124_AVX2 1 "register_operand"))
-   (match_operand:VI124_AVX2 2 "register_operand")]
-  ; Note that SSE2 does not have signed SI multiply
-  "TARGET_AVX || TARGET_XOP || TARGET_SSE4_1
-   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
+     (match_operand:VI4_AVX2 1 "register_operand"))
+   (match_operand:VI4_AVX2 2 "register_operand")]
+  "TARGET_SSE2"
 {
   ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
 				 <u_bool>, true);
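[Annotation, not part of the commit: a hypothetical smoke test for the two
new signed builtins, assuming they are callable from C as declared in
bdesc_args above (V2DI_FTYPE_V4SI_V4SI).  These are internal, undocumented
builtins, so the sketch below is for gcc -msse2 against this tree rather
than a supported interface.]

    typedef int       v4si __attribute__ ((vector_size (16)));
    typedef long long v2di __attribute__ ((vector_size (16)));

    /* Lanes 0 and 2 of a and b, multiplied into two 64-bit lanes.  */
    v2di
    smul_even (v4si a, v4si b)
    {
      return __builtin_ia32_vw_smul_even_v4si (a, b);
    }

    /* Lanes 1 and 3 of a and b, multiplied into two 64-bit lanes.  */
    v2di
    smul_odd (v4si a, v4si b)
    {
      return __builtin_ia32_vw_smul_odd_v4si (a, b);
    }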