aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJan Hubicka <hubicka@ucw.cz>2017-11-17 18:53:49 +0100
committerJan Hubicka <hubicka@gcc.gnu.org>2017-11-17 17:53:49 +0000
commit274c2d3f146e70e7205a2472c6f4d9a93380a80b (patch)
treef8ed3b7bb47a28a02c067a893caf7a57f28877bd
parent650fe7323c11e5e116e34d88028909b1221e2cc8 (diff)
downloadgcc-274c2d3f146e70e7205a2472c6f4d9a93380a80b.zip
gcc-274c2d3f146e70e7205a2472c6f4d9a93380a80b.tar.gz
gcc-274c2d3f146e70e7205a2472c6f4d9a93380a80b.tar.bz2
i386.c (ix86_multiplication_cost, [...]): Break out from ...
* i386.c (ix86_multiplication_cost, ix86_division_cost, ix86_shift_rotate_cost): Break out from ... (ix86_rtx_costs): ... here. (ix86_add_stmt_cost): Use rtx cost machinery to compute cost of vector operations. * gcc.target/i386/xop-imul64-vector.c: Disable vectorizer costmodel. From-SVN: r254889
-rw-r--r--gcc/ChangeLog8
-rw-r--r--gcc/config/i386/i386.c422
-rw-r--r--gcc/testsuite/ChangeLog4
-rw-r--r--gcc/testsuite/gcc.target/i386/xop-imul64-vector.c2
4 files changed, 306 insertions, 130 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 3864ff2..9d96f95 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,5 +1,13 @@
2017-11-17 Jan Hubicka <hubicka@ucw.cz>
+ * i386.c (ix86_multiplication_cost, ix86_division_cost,
+ ix86_shift_rotate_cost): Break out from ...
+ (ix86_rtx_costs): ... here.
+ (ix86_add_stmt_cost): Use rtx cost machinery to compute cost of
+ vector operations.
+
+2017-11-17 Jan Hubicka <hubicka@ucw.cz>
+
* predict.c (determine_unlikely_bbs): Set cgraph node count to 0
when entry block was promoted unlikely.
(estimate_bb_frequencies): Increase frequency scale.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index a536669..835c73d 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -39285,6 +39285,168 @@ ix86_vec_cost (machine_mode mode, int cost, bool parallel)
return cost;
}
+/* Return cost of multiplication in MODE. */
+
+static int
+ix86_multiplication_cost (const struct processor_costs *cost,
+ enum machine_mode mode)
+{
+ machine_mode inner_mode = mode;
+ if (VECTOR_MODE_P (mode))
+ inner_mode = GET_MODE_INNER (mode);
+
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ return inner_mode == DFmode ? cost->mulsd : cost->mulss;
+ else if (X87_FLOAT_MODE_P (mode))
+ return cost->fmul;
+ else if (FLOAT_MODE_P (mode))
+ return ix86_vec_cost (mode,
+ inner_mode == DFmode
+ ? cost->mulsd : cost->mulss, true);
+ else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* V*QImode is emulated with 7-13 insns. */
+ if (mode == V16QImode || mode == V32QImode)
+ {
+ int extra = 11;
+ if (TARGET_XOP && mode == V16QImode)
+ extra = 5;
+ else if (TARGET_SSSE3)
+ extra = 6;
+ return ix86_vec_cost (mode,
+ cost->mulss * 2 + cost->sse_op * extra,
+ true);
+ }
+ /* V*DImode is emulated with 5-8 insns. */
+ else if (mode == V2DImode || mode == V4DImode)
+ {
+ if (TARGET_XOP && mode == V2DImode)
+ return ix86_vec_cost (mode,
+ cost->mulss * 2 + cost->sse_op * 3,
+ true);
+ else
+ return ix86_vec_cost (mode,
+ cost->mulss * 3 + cost->sse_op * 5,
+ true);
+ }
+ /* Without sse4.1, we don't have PMULLD; it's emulated with 7
+ insns, including two PMULUDQ. */
+ else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
+ return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
+ true);
+ else
+ return ix86_vec_cost (mode, cost->mulss, true);
+ }
+ else
+ return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
+}
+
+/* Return cost of division in MODE. */
+
+static int
+ix86_division_cost (const struct processor_costs *cost,
+ enum machine_mode mode)
+{
+ machine_mode inner_mode = mode;
+ if (VECTOR_MODE_P (mode))
+ inner_mode = GET_MODE_INNER (mode);
+
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ return inner_mode == DFmode ? cost->divsd : cost->divss;
+ else if (X87_FLOAT_MODE_P (mode))
+ return cost->fdiv;
+ else if (FLOAT_MODE_P (mode))
+ return ix86_vec_cost (mode,
+ inner_mode == DFmode ? cost->divsd : cost->divss,
+ true);
+ else
+ return cost->divide[MODE_INDEX (mode)];
+}
+
+/* Return cost of shift in MODE.
+ If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
+   AND_IN_OP1 specifies whether op1 is the result of an AND, and
+   SHIFT_AND_TRUNCATE whether op1 is a SUBREG of such an AND.
+
+ SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */
+
+static int
+ix86_shift_rotate_cost (const struct processor_costs *cost,
+ enum machine_mode mode, bool constant_op1,
+ HOST_WIDE_INT op1_val,
+ bool speed,
+ bool and_in_op1,
+ bool shift_and_truncate,
+ bool *skip_op0, bool *skip_op1)
+{
+ if (skip_op0)
+ *skip_op0 = *skip_op1 = false;
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* V*QImode is emulated with 1-11 insns. */
+ if (mode == V16QImode || mode == V32QImode)
+ {
+ int count = 11;
+ if (TARGET_XOP && mode == V16QImode)
+ {
+ /* For XOP we use vpshab, which requires a broadcast of the
+ value to the variable shift insn. For constants this
+ means a V16Q const in mem; even when we can perform the
+ shift with one insn set the cost to prefer paddb. */
+ if (constant_op1)
+ {
+ if (skip_op1)
+ *skip_op1 = true;
+ return ix86_vec_cost (mode,
+ cost->sse_op
+ + (speed
+ ? 2
+ : COSTS_N_BYTES
+ (GET_MODE_UNIT_SIZE (mode))), true);
+ }
+ count = 3;
+ }
+ else if (TARGET_SSSE3)
+ count = 7;
+ return ix86_vec_cost (mode, cost->sse_op * count, true);
+ }
+ else
+ return ix86_vec_cost (mode, cost->sse_op, true);
+ }
+ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+ {
+ if (constant_op1)
+ {
+ if (op1_val > 32)
+ return cost->shift_const + COSTS_N_INSNS (2);
+ else
+ return cost->shift_const * 2;
+ }
+ else
+ {
+ if (and_in_op1)
+ return cost->shift_var * 2;
+ else
+ return cost->shift_var * 6 + COSTS_N_INSNS (2);
+ }
+ }
+ else
+ {
+ if (constant_op1)
+ return cost->shift_const;
+ else if (shift_and_truncate)
+ {
+ if (skip_op0)
+ *skip_op0 = *skip_op1 = true;
+ /* Return the cost after shift-and truncation. */
+ return cost->shift_var;
+ }
+ else
+ return cost->shift_var;
+ }
+ return cost->shift_const;
+}
+
/* Compute a (partial) cost for rtx X. Return true if the complete
cost has been computed, and false if subexpressions should be
scanned. In either case, *TOTAL contains the cost result. */
@@ -39298,9 +39460,6 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
enum rtx_code outer_code = (enum rtx_code) outer_code_i;
const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
int src_cost;
- machine_mode inner_mode = mode;
- if (VECTOR_MODE_P (mode))
- inner_mode = GET_MODE_INNER (mode);
switch (code)
{
@@ -39428,68 +39587,22 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
case ASHIFTRT:
case LSHIFTRT:
case ROTATERT:
- if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
- {
- /* ??? Should be SSE vector operation cost. */
- /* At least for published AMD latencies, this really is the same
- as the latency for a simple fpu operation like fabs. */
- /* V*QImode is emulated with 1-11 insns. */
- if (mode == V16QImode || mode == V32QImode)
- {
- int count = 11;
- if (TARGET_XOP && mode == V16QImode)
- {
- /* For XOP we use vpshab, which requires a broadcast of the
- value to the variable shift insn. For constants this
- means a V16Q const in mem; even when we can perform the
- shift with one insn set the cost to prefer paddb. */
- if (CONSTANT_P (XEXP (x, 1)))
- {
- *total = ix86_vec_cost (mode,
- cost->sse_op
- + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
- + (speed ? 2 : COSTS_N_BYTES (16)), true);
- return true;
- }
- count = 3;
- }
- else if (TARGET_SSSE3)
- count = 7;
- *total = ix86_vec_cost (mode, cost->sse_op * count, true);
- }
- else
- *total = ix86_vec_cost (mode, cost->sse_op, true);
- }
- else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
- {
- if (CONST_INT_P (XEXP (x, 1)))
- {
- if (INTVAL (XEXP (x, 1)) > 32)
- *total = cost->shift_const + COSTS_N_INSNS (2);
- else
- *total = cost->shift_const * 2;
- }
- else
- {
- if (GET_CODE (XEXP (x, 1)) == AND)
- *total = cost->shift_var * 2;
- else
- *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
- }
- }
- else
- {
- if (CONST_INT_P (XEXP (x, 1)))
- *total = cost->shift_const;
- else if (SUBREG_P (XEXP (x, 1))
- && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
- {
- /* Return the cost after shift-and truncation. */
- *total = cost->shift_var;
- return true;
- }
- else
- *total = cost->shift_var;
+ bool skip_op0, skip_op1;
+ *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
+ CONST_INT_P (XEXP (x, 1))
+ ? INTVAL (XEXP (x, 1)) : -1,
+ speed,
+ GET_CODE (XEXP (x, 1)) == AND,
+ SUBREG_P (XEXP (x, 1))
+ && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
+ &skip_op0, &skip_op1);
+ if (skip_op0 || skip_op1)
+ {
+ if (!skip_op0)
+ *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+ if (!skip_op1)
+ *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
+ return true;
}
return false;
@@ -39519,59 +39632,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
}
case MULT:
- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- {
- *total = inner_mode == DFmode ? cost->mulsd : cost->mulss;
- return false;
- }
- else if (X87_FLOAT_MODE_P (mode))
- {
- *total = cost->fmul;
- return false;
- }
- else if (FLOAT_MODE_P (mode))
- {
- *total = ix86_vec_cost (mode,
- inner_mode == DFmode
- ? cost->mulsd : cost->mulss, true);
- return false;
- }
- else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
- {
- /* V*QImode is emulated with 7-13 insns. */
- if (mode == V16QImode || mode == V32QImode)
- {
- int extra = 11;
- if (TARGET_XOP && mode == V16QImode)
- extra = 5;
- else if (TARGET_SSSE3)
- extra = 6;
- *total = ix86_vec_cost (mode,
- cost->mulss * 2 + cost->sse_op * extra,
- true);
- }
- /* V*DImode is emulated with 5-8 insns. */
- else if (mode == V2DImode || mode == V4DImode)
- {
- if (TARGET_XOP && mode == V2DImode)
- *total = ix86_vec_cost (mode,
- cost->mulss * 2 + cost->sse_op * 3,
- true);
- else
- *total = ix86_vec_cost (mode,
- cost->mulss * 3 + cost->sse_op * 5,
- true);
- }
- /* Without sse4.1, we don't have PMULLD; it's emulated with 7
- insns, including two PMULUDQ. */
- else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
- *total = ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
- true);
- else
- *total = ix86_vec_cost (mode, cost->mulss, true);
- return false;
- }
- else
+ if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
{
rtx op0 = XEXP (x, 0);
rtx op1 = XEXP (x, 1);
@@ -39616,21 +39677,14 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
return true;
}
+ *total = ix86_multiplication_cost (cost, mode);
+ return false;
case DIV:
case UDIV:
case MOD:
case UMOD:
- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- *total = inner_mode == DFmode ? cost->divsd : cost->divss;
- else if (X87_FLOAT_MODE_P (mode))
- *total = cost->fdiv;
- else if (FLOAT_MODE_P (mode))
- *total = ix86_vec_cost (mode,
- inner_mode == DFmode ? cost->divsd : cost->divss,
- true);
- else
- *total = cost->divide[MODE_INDEX (mode)];
+ *total = ix86_division_cost (cost, mode);
return false;
case PLUS:
@@ -48827,7 +48881,117 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
unsigned retval = 0;
tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
- int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ int stmt_cost = - 1;
+
+ if ((kind == vector_stmt || kind == scalar_stmt)
+ && stmt_info
+ && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
+ {
+ tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
+ bool fp = false;
+ machine_mode mode = TImode;
+
+ if (vectype != NULL)
+ {
+ fp = FLOAT_TYPE_P (vectype);
+ mode = TYPE_MODE (vectype);
+ }
+ /*machine_mode inner_mode = mode;
+ if (VECTOR_MODE_P (mode))
+ inner_mode = GET_MODE_INNER (mode);*/
+
+ switch (subcode)
+ {
+ case PLUS_EXPR:
+ case POINTER_PLUS_EXPR:
+ case MINUS_EXPR:
+ if (kind == scalar_stmt)
+ {
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ stmt_cost = ix86_cost->addss;
+ else if (X87_FLOAT_MODE_P (mode))
+ stmt_cost = ix86_cost->fadd;
+ else
+ stmt_cost = ix86_cost->add;
+ }
+ else
+ stmt_cost = ix86_vec_cost (mode,
+ fp ? ix86_cost->addss
+ : ix86_cost->sse_op,
+ true);
+ break;
+
+ case MULT_EXPR:
+ case WIDEN_MULT_EXPR:
+ case MULT_HIGHPART_EXPR:
+ stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
+ break;
+ case FMA_EXPR:
+ stmt_cost = ix86_vec_cost (mode,
+ mode == SFmode ? ix86_cost->fmass
+ : ix86_cost->fmasd,
+ true);
+ break;
+ case NEGATE_EXPR:
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ stmt_cost = ix86_cost->sse_op;
+ else if (X87_FLOAT_MODE_P (mode))
+ stmt_cost = ix86_cost->fchs;
+ else if (VECTOR_MODE_P (mode))
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
+ else
+ stmt_cost = ix86_cost->add;
+ break;
+ case TRUNC_DIV_EXPR:
+ case CEIL_DIV_EXPR:
+ case FLOOR_DIV_EXPR:
+ case ROUND_DIV_EXPR:
+ case TRUNC_MOD_EXPR:
+ case CEIL_MOD_EXPR:
+ case FLOOR_MOD_EXPR:
+ case RDIV_EXPR:
+ case ROUND_MOD_EXPR:
+ case EXACT_DIV_EXPR:
+ stmt_cost = ix86_division_cost (ix86_cost, mode);
+ break;
+
+ case RSHIFT_EXPR:
+ case LSHIFT_EXPR:
+ case LROTATE_EXPR:
+ case RROTATE_EXPR:
+ {
+ tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
+ stmt_cost = ix86_shift_rotate_cost
+ (ix86_cost, mode,
+ TREE_CODE (op2) == INTEGER_CST,
+ cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
+ true, false, false, NULL, NULL);
+ }
+ break;
+ case NOP_EXPR:
+ stmt_cost = 0;
+ break;
+
+ case BIT_IOR_EXPR:
+ case ABS_EXPR:
+ case MIN_EXPR:
+ case MAX_EXPR:
+ case BIT_XOR_EXPR:
+ case BIT_AND_EXPR:
+ case BIT_NOT_EXPR:
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ stmt_cost = ix86_cost->sse_op;
+ else if (VECTOR_MODE_P (mode))
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
+ else
+ stmt_cost = ix86_cost->add;
+ break;
+ default:
+ break;
+ }
+ }
+ if (stmt_cost == -1)
+ stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
/* Penalize DFmode vector operations for Bonnell. */
if (TARGET_BONNELL && kind == vector_stmt
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 149ed11..c75df82 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,5 +1,9 @@
2017-11-17 Jan Hubicka <hubicka@ucw.cz>
+ * gcc.target/i386/xop-imul64-vector.c: Disable vectorizer costmodel.
+
+2017-11-17 Jan Hubicka <hubicka@ucw.cz>
+
* gcc.dg/tree-ssa/dump-2.c: Fixup template for profile precision
changes.
* gcc.dg/tree-ssa/pr77445-2.c: Fixup template for profile precision
diff --git a/gcc/testsuite/gcc.target/i386/xop-imul64-vector.c b/gcc/testsuite/gcc.target/i386/xop-imul64-vector.c
index fc8c880..43a27a5 100644
--- a/gcc/testsuite/gcc.target/i386/xop-imul64-vector.c
+++ b/gcc/testsuite/gcc.target/i386/xop-imul64-vector.c
@@ -3,7 +3,7 @@
/* { dg-do compile } */
/* { dg-require-effective-target lp64 } */
-/* { dg-options "-O2 -mxop -mno-avx2 -ftree-vectorize" } */
+/* { dg-options "-O2 -mxop -mno-avx2 -ftree-vectorize -fno-vect-cost-model" } */
extern void exit (int);