Diffstat (limited to 'gcc/config/i386/i386.cc')
-rw-r--r--	gcc/config/i386/i386.cc	191
1 file changed, 153 insertions(+), 38 deletions(-)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 28603c2..78df3d9 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25257,32 +25257,6 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
return new ix86_vector_costs (vinfo, costing_for_scalar);
}
-/* Return cost of statement doing FP conversion. */
-
-static unsigned
-fp_conversion_stmt_cost (machine_mode mode, gimple *stmt, bool scalar_p)
-{
- int outer_size
- = tree_to_uhwi
- (TYPE_SIZE
- (TREE_TYPE (gimple_assign_lhs (stmt))));
- int inner_size
- = tree_to_uhwi
- (TYPE_SIZE
- (TREE_TYPE (gimple_assign_rhs1 (stmt))));
- int stmt_cost = vec_fp_conversion_cost
- (ix86_tune_cost, GET_MODE_BITSIZE (mode));
- /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end
- up doing two conversions and packing them. */
- if (!scalar_p && inner_size > outer_size)
- {
- int n = inner_size / outer_size;
- stmt_cost = stmt_cost * n
- + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
- }
- return stmt_cost;
-}
-
unsigned
ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, slp_tree node,
@@ -25326,7 +25300,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
else if (X87_FLOAT_MODE_P (mode))
stmt_cost = ix86_cost->fadd;
else
- stmt_cost = ix86_cost->add;
+ stmt_cost = ix86_cost->add;
}
else
stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss
@@ -25381,7 +25355,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
(subcode == RSHIFT_EXPR
&& !TYPE_UNSIGNED (TREE_TYPE (op1)))
? ASHIFTRT : LSHIFTRT, mode,
- TREE_CODE (op2) == INTEGER_CST,
+ TREE_CODE (op2) == INTEGER_CST,
cst_and_fits_in_hwi (op2)
? int_cst_value (op2) : -1,
false, false, NULL, NULL);
@@ -25390,30 +25364,152 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
case NOP_EXPR:
/* Only sign-conversions are free. */
if (tree_nop_conversion_p
- (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
+ (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
stmt_cost = 0;
else if (fp)
- stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt,
- scalar_p);
+ stmt_cost = vec_fp_conversion_cost
+ (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+ break;
+
+ case COND_EXPR:
+ {
+ /* SSE2 conditional move sequence is:
+ pcmpgtd %xmm5, %xmm0 (accounted separately)
+ pand %xmm0, %xmm2
+ pandn %xmm1, %xmm0
+ por %xmm2, %xmm0
+ while SSE4 uses cmp + blend
+ and AVX512 masked moves.
+
+ The condition is accounted separately since we usually have
+ p = a < b
+ c = p ? x : y
+ and we will account the first statement as setcc. The exception is when
+ p is loaded from memory as bool, in which case we will not account
+ the compare, but there is no way to check for this. */
+
+ int ninsns = TARGET_SSE4_1 ? 1 : 3;
+
+ /* If one of the parameters is 0 or -1, the sequence will be simplified:
+ (if_true & mask) | (if_false & ~mask) -> if_true & mask */
+ if (ninsns > 1
+ && (zerop (gimple_assign_rhs2 (stmt_info->stmt))
+ || zerop (gimple_assign_rhs3 (stmt_info->stmt))
+ || integer_minus_onep
+ (gimple_assign_rhs2 (stmt_info->stmt))
+ || integer_minus_onep
+ (gimple_assign_rhs3 (stmt_info->stmt))))
+ ninsns = 1;
+
+ if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ stmt_cost = ninsns * ix86_cost->sse_op;
+ else if (X87_FLOAT_MODE_P (mode))
+ /* x87 requires a conditional branch. We don't have a cost for
+ that. */
+ ;
+ else if (VECTOR_MODE_P (mode))
+ stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
+ else
+ /* compare (accounted separately) + cmov. */
+ stmt_cost = ix86_cost->add;
+ }
break;
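For illustration, a minimal standalone sketch of the COND_EXPR costing above (plain C++, not part of the patch; the flag names and the sse_op value of 1 are assumptions):

#include <cstdio>

/* Mirrors the ninsns selection above with made-up inputs.  */
static int
cond_expr_vec_cost (bool have_sse4_1, bool arm_is_0_or_m1, int sse_op)
{
  /* SSE2 needs pand + pandn + por (the compare is costed separately);
     SSE4.1 and later use a single blend.  */
  int ninsns = have_sse4_1 ? 1 : 3;
  /* (if_true & mask) | (if_false & ~mask) collapses to a single AND
     when either arm is 0 or -1.  */
  if (ninsns > 1 && arm_is_0_or_m1)
    ninsns = 1;
  return ninsns * sse_op;
}

int
main ()
{
  std::printf ("%d %d %d\n",
	       cond_expr_vec_cost (false, false, 1),   /* 3 */
	       cond_expr_vec_cost (false, true, 1),    /* 1 */
	       cond_expr_vec_cost (true, false, 1));   /* 1 */
}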
- case BIT_IOR_EXPR:
- case ABS_EXPR:
- case ABSU_EXPR:
case MIN_EXPR:
case MAX_EXPR:
+ if (fp)
+ {
+ if (X87_FLOAT_MODE_P (mode))
+ /* x87 requires a conditional branch. We don't have a cost for
+ that. */
+ ;
+ else
+ /* minss */
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ }
+ else
+ {
+ if (VECTOR_MODE_P (mode))
+ {
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ /* vpmin was introduced in SSE3.
+ SSE2 needs pcmpgtd + pand + pandn + pxor.
+ If one of the parameters is 0 or -1, the sequence is simplified
+ to pcmpgtd + pand. */
+ if (!TARGET_SSSE3)
+ {
+ if (zerop (gimple_assign_rhs2 (stmt_info->stmt))
+ || integer_minus_onep
+ (gimple_assign_rhs2 (stmt_info->stmt)))
+ stmt_cost *= 2;
+ else
+ stmt_cost *= 4;
+ }
+ }
+ else
+ /* cmp + cmov. */
+ stmt_cost = ix86_cost->add * 2;
+ }
+ break;
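Again purely illustrative (assumed sse_op of 1; not patch code), the integer MIN_EXPR/MAX_EXPR multiplier works out as:

#include <cstdio>

static int
int_minmax_vec_cost (bool have_ssse3, bool arm_is_0_or_m1, int sse_op)
{
  int cost = sse_op;                    /* native pmin/pmax  */
  if (!have_ssse3)
    /* pcmpgtd + pand, or the full pcmpgtd + pand + pandn + pxor blend.  */
    cost *= arm_is_0_or_m1 ? 2 : 4;
  return cost;
}

int
main ()
{
  std::printf ("%d %d %d\n",
	       int_minmax_vec_cost (true, false, 1),   /* 1 */
	       int_minmax_vec_cost (false, true, 1),   /* 2 */
	       int_minmax_vec_cost (false, false, 1)); /* 4 */
}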
+
+ case ABS_EXPR:
+ case ABSU_EXPR:
+ if (fp)
+ {
+ if (X87_FLOAT_MODE_P (mode))
+ /* fabs. */
+ stmt_cost = ix86_cost->fabs;
+ else
+ /* andps/andpd clearing the sign bit. */
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ }
+ else
+ {
+ if (VECTOR_MODE_P (mode))
+ {
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ /* pabs was introduced in SSSE3.
+ Without it we need psrad + pxor + psub. */
+ if (!TARGET_SSSE3)
+ stmt_cost *= 3;
+ }
+ else
+ /* neg + cmov. */
+ stmt_cost = ix86_cost->add * 2;
+ }
+ break;
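The corresponding integer ABS_EXPR/ABSU_EXPR factor, again as a standalone sketch with an assumed sse_op of 1:

#include <cstdio>

static int
int_abs_vec_cost (bool have_ssse3, int sse_op)
{
  /* pabs with SSSE3, otherwise sign mask + xor + subtract.  */
  return have_ssse3 ? sse_op : 3 * sse_op;
}

int
main ()
{
  std::printf ("%d %d\n",
	       int_abs_vec_cost (true, 1),    /* 1 */
	       int_abs_vec_cost (false, 1));  /* 3 */
}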
+
+ case BIT_IOR_EXPR:
case BIT_XOR_EXPR:
case BIT_AND_EXPR:
case BIT_NOT_EXPR:
- if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
- stmt_cost = ix86_cost->sse_op;
- else if (VECTOR_MODE_P (mode))
+ gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)
+ && !X87_FLOAT_MODE_P (mode));
+ if (VECTOR_MODE_P (mode))
stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
else
stmt_cost = ix86_cost->add;
break;
+
default:
+ if (truth_value_p (subcode))
+ {
+ if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ /* CMPccS? instructions are cheap, so use sse_op. While they
+ produce a mask which may need to be turned into 0/1 by an and,
+ expect that this will be optimized away in the common case. */
+ stmt_cost = ix86_cost->sse_op;
+ else if (X87_FLOAT_MODE_P (mode))
+ /* fcmp + setcc. */
+ stmt_cost = ix86_cost->fadd + ix86_cost->add;
+ else if (VECTOR_MODE_P (mode))
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ else
+ /* setcc. */
+ stmt_cost = ix86_cost->add;
+ break;
+ }
break;
}
}
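A sketch of the new default (truth-valued) path, with made-up cost-table entries (sse_op 1, fadd 3, add 1); the real values come from ix86_cost and, for the vector case, go through ix86_vec_cost:

#include <cstdio>

/* Which of the four branches above applies, in priority order.  */
static int
truth_op_cost (bool sse_scalar_fp, bool x87_fp, bool vector,
	       int sse_op, int fadd, int add)
{
  if (sse_scalar_fp)
    return sse_op;        /* cmpss/cmpsd producing a mask  */
  if (x87_fp)
    return fadd + add;    /* fcmp + setcc  */
  if (vector)
    return sse_op;        /* vector compare  */
  return add;             /* setcc  */
}

int
main ()
{
  std::printf ("%d %d\n",
	       truth_op_cost (false, true, false, 1, 3, 1),   /* 4 */
	       truth_op_cost (false, false, false, 1, 3, 1)); /* 1 */
}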
@@ -25439,7 +25535,26 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
if (kind == vec_promote_demote
&& fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
- stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt, scalar_p);
+ {
+ int outer_size
+ = tree_to_uhwi
+ (TYPE_SIZE
+ (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt))));
+ int inner_size
+ = tree_to_uhwi
+ (TYPE_SIZE
+ (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))));
+ stmt_cost = vec_fp_conversion_cost
+ (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+ /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end
+ up doing two conversions and packing them. */
+ if (inner_size > outer_size)
+ {
+ int n = inner_size / outer_size;
+ stmt_cost = stmt_cost * n
+ + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
+ }
+ }
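A worked example of the packing adjustment above (standalone sketch; the conversion cost of 4 and sse_op of 1 are assumptions, the real values come from ix86_tune_cost): demoting a 64-bit element type to a 32-bit one gives n = 2, i.e. two conversions plus one pack.

#include <cstdio>

int
main ()
{
  int outer_size = 32;                     /* float lhs element, in bits  */
  int inner_size = 64;                     /* double rhs element, in bits  */
  int conv_cost = 4, sse_op = 1;           /* assumed costs  */
  int cost = conv_cost;
  if (inner_size > outer_size)
    {
      int n = inner_size / outer_size;     /* 2 conversions  */
      cost = cost * n + (n - 1) * sse_op;  /* plus 1 pack  */
    }
  std::printf ("%d\n", cost);              /* 9  */
}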
/* If we do elementwise loads into a vector then we are bound by
latency and execution resources for the many scalar loads