author	Richard Henderson <rth@redhat.com>	2012-06-23 09:42:19 -0700
committer	Richard Henderson <rth@gcc.gnu.org>	2012-06-23 09:42:19 -0700
commit	2d542a9f780fce317221636bfad0581d2e227733 (patch)
tree	6e30fb45a07f23eaba80b3c9ca95f5e5f6048ada
parent	7b5321188b4011e2ce3b6d56cf26d6dde054419d (diff)
re PR target/53749 (ice in expand_shift_1)
PR target/53749
* config/i386/i386.c (ix86_rtx_costs): Add reasonable costs for
V*QImode shifts and multiply.
(ix86_expand_vecop_qihi): Support shifts.
* config/i386/i386.md (any_shift): New code iterator.
* config/i386/sse.md (ashlv16qi3): Merge ...
(<any_shiftrt>v16qi3): ... into ...
(<any_shift><VI1_AVX2>3): ... here.  Use ix86_expand_vecop_qihi
to support SSE and AVX.

From-SVN: r188909
-rw-r--r--	gcc/ChangeLog	10
-rw-r--r--	gcc/config/i386/i386.c	109
-rw-r--r--	gcc/config/i386/i386.md	3
-rw-r--r--	gcc/config/i386/sse.md	76
4 files changed, 123 insertions, 75 deletions
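
For context (not part of the commit): a minimal GNU C vector-extension sketch of the byte-vector shifts whose expansion and rtx costs change below. It is not the reproducer attached to PR 53749, and the function names are illustrative only.

/* Compile with e.g. -O2 -msse2; use a 32-byte vector_size and -mavx2
   for the V32QImode variants.  */
typedef unsigned char v16qi __attribute__ ((vector_size (16)));

v16qi
shl3 (v16qi x)
{
  return x << 3;	/* now expands via ashlv16qi3 on plain SSE2 */
}

v16qi
lshr3 (v16qi x)
{
  return x >> 3;	/* lshrv16qi3 */
}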
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 75f5c0a..5c65a39 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,15 @@
2012-06-23 Richard Henderson <rth@redhat.com>
+ PR target/53749
+ * config/i386/i386.c (ix86_rtx_costs): Add reasonable costs for
+ V*QImode shifts and multiply.
+ (ix86_expand_vecop_qihi): Support shifts.
+ * config/i386/i386.md (any_shift): New code iterator.
+ * config/i386/sse.md (ashlv16qi3): Merge ...
+ (<any_shiftrt>v16qi3): ... into ...
+ (<any_shift><VI1_AVX2>3): ... here. Use ix86_expand_vecop_qihi
+ to support SSE and AVX.
+
* config/i386/i386.c (ix86_expand_sse_unpack): Split operands[]
parameter into src and dest.
* config/i386/sse.md (vec_unpacku_hi_<V124_AVX2>): Update call.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7ae2060..fc30632 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -31938,9 +31938,10 @@ ix86_set_reg_reg_cost (enum machine_mode mode)
scanned. In either case, *TOTAL contains the cost result. */
static bool
-ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
+ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
bool speed)
{
+ enum rtx_code code = (enum rtx_code) code_i;
enum rtx_code outer_code = (enum rtx_code) outer_code_i;
enum machine_mode mode = GET_MODE (x);
const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
@@ -32045,7 +32046,31 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
/* ??? Should be SSE vector operation cost. */
/* At least for published AMD latencies, this really is the same
as the latency for a simple fpu operation like fabs. */
- *total = cost->fabs;
+ /* V*QImode is emulated with 1-11 insns. */
+ if (mode == V16QImode || mode == V32QImode)
+ {
+ int count;
+ if (TARGET_XOP && mode == V16QImode)
+ {
+ /* For XOP we use vpshab, which requires a broadcast of the
+ value to the variable shift insn. For constants this
+ means a V16Q const in mem; even when we can perform the
+ shift with one insn set the cost to prefer paddb. */
+ if (CONSTANT_P (XEXP (x, 1)))
+ {
+ *total = (cost->fabs
+ + rtx_cost (XEXP (x, 0), code, 0, speed)
+ + (speed ? 2 : COSTS_N_BYTES (16)));
+ return true;
+ }
+ count = 3;
+ }
+ else
+ count = TARGET_SSSE3 ? 7 : 11;
+ *total = cost->fabs * count;
+ }
+ else
+ *total = cost->fabs;
return false;
}
if (GET_MODE_SIZE (mode) < UNITS_PER_WORD)
@@ -32119,9 +32144,15 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
}
else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
{
+ /* V*QImode is emulated with 7-13 insns. */
+ if (mode == V16QImode || mode == V32QImode)
+ {
+ int extra = TARGET_XOP ? 5 : TARGET_SSSE3 ? 6 : 11;
+ *total = cost->fmul * 2 + cost->fabs * extra;
+ }
/* Without sse4.1, we don't have PMULLD; it's emulated with 7
insns, including two PMULUDQ. */
- if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
+ else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
*total = cost->fmul * 2 + cost->fabs * 5;
else
*total = cost->fmul;
@@ -38448,44 +38479,66 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
rtx (*gen_ih) (rtx, rtx, rtx);
rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
struct expand_vec_perm_d d;
- bool ok;
+ bool ok, full_interleave;
+ bool uns_p = false;
int i;
- if (qimode == V16QImode)
+ switch (qimode)
{
+ case V16QImode:
himode = V8HImode;
gen_il = gen_vec_interleave_lowv16qi;
gen_ih = gen_vec_interleave_highv16qi;
- }
- else if (qimode == V32QImode)
- {
+ break;
+ case V32QImode:
himode = V16HImode;
gen_il = gen_avx2_interleave_lowv32qi;
gen_ih = gen_avx2_interleave_highv32qi;
+ break;
+ default:
+ gcc_unreachable ();
}
- else
- gcc_unreachable ();
- /* Unpack data such that we've got a source byte in each low byte of
- each word. We don't care what goes into the high byte of each word.
- Rather than trying to get zero in there, most convenient is to let
- it be a copy of the low byte. */
- op1_l = gen_reg_rtx (qimode);
- op1_h = gen_reg_rtx (qimode);
- emit_insn (gen_il (op1_l, op1, op1));
- emit_insn (gen_ih (op1_h, op1, op1));
+ op2_l = op2_h = op2;
+ switch (code)
+ {
+ case MULT:
+ /* Unpack data such that we've got a source byte in each low byte of
+ each word. We don't care what goes into the high byte of each word.
+ Rather than trying to get zero in there, most convenient is to let
+ it be a copy of the low byte. */
+ op2_l = gen_reg_rtx (qimode);
+ op2_h = gen_reg_rtx (qimode);
+ emit_insn (gen_il (op2_l, op2, op2));
+ emit_insn (gen_ih (op2_h, op2, op2));
+ /* FALLTHRU */
- op2_l = gen_reg_rtx (qimode);
- op2_h = gen_reg_rtx (qimode);
- emit_insn (gen_il (op2_l, op2, op2));
- emit_insn (gen_ih (op2_h, op2, op2));
+ op1_l = gen_reg_rtx (qimode);
+ op1_h = gen_reg_rtx (qimode);
+ emit_insn (gen_il (op1_l, op1, op1));
+ emit_insn (gen_ih (op1_h, op1, op1));
+ full_interleave = qimode == V16QImode;
+ break;
+
+ case ASHIFT:
+ case LSHIFTRT:
+ uns_p = true;
+ /* FALLTHRU */
+ case ASHIFTRT:
+ op1_l = gen_reg_rtx (himode);
+ op1_h = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
+ ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
+ full_interleave = true;
+ break;
+ default:
+ gcc_unreachable ();
+ }
/* Perform the operation. */
- res_l = expand_simple_binop (himode, code, gen_lowpart (himode, op1_l),
- gen_lowpart (himode, op2_l), NULL_RTX,
+ res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
1, OPTAB_DIRECT);
- res_h = expand_simple_binop (himode, code, gen_lowpart (himode, op1_h),
- gen_lowpart (himode, op2_h), NULL_RTX,
+ res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
1, OPTAB_DIRECT);
gcc_assert (res_l && res_h);
@@ -38498,11 +38551,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
d.one_operand_p = false;
d.testing_p = false;
- if (qimode == V16QImode)
+ if (full_interleave)
{
/* For SSE2, we used a full interleave, so the desired
results are in the even elements. */
- for (i = 0; i < 16; ++i)
+ for (i = 0; i < 32; ++i)
d.perm[i] = i * 2;
}
else
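
Illustration (not part of the commit): roughly what the shift path of ix86_expand_vecop_qihi amounts to for a logical right shift of 16 unsigned bytes on plain SSE2, written with intrinsics for readability; the helper name and the fixed count of 3 are assumptions for the sketch. The real expander emits RTL, unpacks via ix86_expand_sse_unpack, and recovers the even result bytes through the vec_perm machinery rather than a pack, so this only shows the widen-shift-narrow idea.

#include <emmintrin.h>

/* There is no psrlb, so the expander works in the wider V8HImode.  */
static __m128i
lshr3_v16qi_sse2 (__m128i x)
{
  __m128i zero = _mm_setzero_si128 ();
  /* Zero-extend the low and high byte halves into 16-bit lanes
     (logical shifts unpack unsigned, arithmetic shifts signed).  */
  __m128i lo = _mm_unpacklo_epi8 (x, zero);
  __m128i hi = _mm_unpackhi_epi8 (x, zero);
  /* Shift each 16-bit lane; psrlw does exist.  */
  lo = _mm_srli_epi16 (lo, 3);
  hi = _mm_srli_epi16 (hi, 3);
  /* The wanted bytes are the even bytes of lo/hi; their high halves
     are zero here, so a pack recovers the shifted byte vector.  */
  return _mm_packus_epi16 (lo, hi);
}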
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 879b87b..da2f4b2 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -711,6 +711,9 @@
;; Mapping of shift-right operators
(define_code_iterator any_shiftrt [lshiftrt ashiftrt])
+;; Mapping of all shift operators
+(define_code_iterator any_shift [ashift lshiftrt ashiftrt])
+
;; Base name for define_insn
(define_code_attr shift_insn
[(ashift "ashl") (lshiftrt "lshr") (ashiftrt "ashr")])
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c7c6392..691256d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -10550,60 +10550,42 @@
(set_attr "prefix_extra" "2")
(set_attr "mode" "TI")])
-;; SSE2 doesn't have some shift variants, so define versions for XOP
-(define_expand "ashlv16qi3"
- [(set (match_operand:V16QI 0 "register_operand")
- (ashift:V16QI
- (match_operand:V16QI 1 "register_operand")
- (match_operand:SI 2 "nonmemory_operand")))]
- "TARGET_XOP"
-{
- rtx reg = gen_reg_rtx (V16QImode);
- rtx par;
- int i;
-
- par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
- for (i = 0; i < 16; i++)
- XVECEXP (par, 0, i) = operands[2];
-
- emit_insn (gen_vec_initv16qi (reg, par));
- emit_insn (gen_xop_shav16qi3 (operands[0], operands[1], reg));
- DONE;
-})
-
-(define_expand "<shift_insn>v16qi3"
- [(set (match_operand:V16QI 0 "register_operand")
- (any_shiftrt:V16QI
- (match_operand:V16QI 1 "register_operand")
+(define_expand "<shift_insn><mode>3"
+ [(set (match_operand:VI1_AVX2 0 "register_operand")
+ (any_shift:VI1_AVX2
+ (match_operand:VI1_AVX2 1 "register_operand")
(match_operand:SI 2 "nonmemory_operand")))]
- "TARGET_XOP"
+ "TARGET_SSE2"
{
- rtx reg = gen_reg_rtx (V16QImode);
- rtx par;
- bool negate = false;
- rtx (*shift_insn)(rtx, rtx, rtx);
- int i;
-
- if (CONST_INT_P (operands[2]))
- operands[2] = GEN_INT (-INTVAL (operands[2]));
- else
- negate = true;
+ if (TARGET_XOP && <MODE>mode == V16QImode)
+ {
+ bool negate = false;
+ rtx (*gen) (rtx, rtx, rtx);
+ rtx tmp, par;
+ int i;
- par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
- for (i = 0; i < 16; i++)
- XVECEXP (par, 0, i) = operands[2];
+ if (<CODE> != ASHIFT)
+ {
+ if (CONST_INT_P (operands[2]))
+ operands[2] = GEN_INT (-INTVAL (operands[2]));
+ else
+ negate = true;
+ }
+ par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
+ for (i = 0; i < 16; i++)
+ XVECEXP (par, 0, i) = operands[2];
- emit_insn (gen_vec_initv16qi (reg, par));
+ tmp = gen_reg_rtx (V16QImode);
+ emit_insn (gen_vec_initv16qi (tmp, par));
- if (negate)
- emit_insn (gen_negv16qi2 (reg, reg));
+ if (negate)
+ emit_insn (gen_negv16qi2 (tmp, tmp));
- if (<CODE> == LSHIFTRT)
- shift_insn = gen_xop_shlv16qi3;
+ gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
+ emit_insn (gen (operands[0], operands[1], tmp));
+ }
else
- shift_insn = gen_xop_shav16qi3;
-
- emit_insn (shift_insn (operands[0], operands[1], reg));
+ ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
DONE;
})
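
Illustration (not part of the commit): what the TARGET_XOP arm of the merged expander amounts to for an arithmetic right shift by a variable count, assuming the xopintrin.h intrinsic names; the helper name is illustrative. vpshab shifts each byte by the signed count in the corresponding byte of its second operand, so a right shift is a broadcast of the negated count, which is what the GEN_INT (-INTVAL (...)) / gen_negv16qi2 logic above arranges.

#include <x86intrin.h>	/* provides _mm_sha_epi8 when built with -mxop */

static __m128i
ashrv16qi_xop (__m128i x, int count)
{
  /* Broadcast the negated count: this stands in for the vec_init of
     the PARALLEL plus the negation in the non-ASHIFT cases.  */
  __m128i neg = _mm_set1_epi8 ((char) -count);
  return _mm_sha_epi8 (x, neg);		/* vpshab */
}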