Diffstat (limited to 'gcc/config/i386/i386.cc'):
 gcc/config/i386/i386.cc | 215
 1 file changed, 201 insertions(+), 14 deletions(-)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 4f8380c4..78df3d9 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -100,6 +100,7 @@ along with GCC; see the file COPYING3. If not see
#include "i386-features.h"
#include "function-abi.h"
#include "rtl-error.h"
+#include "gimple-pretty-print.h"
/* This file should be included last. */
#include "target-def.h"
@@ -458,6 +459,9 @@ int ix86_arch_specified;
indirect thunk pushes the return address onto stack, destroying
red-zone.
+ NB: Don't use red-zone for functions with no_caller_saved_registers
+ and 32 GPRs, since the 128-byte red-zone is too small to save 31 GPRs.
+
TODO: If we can reserve the first 2 WORDs, for PUSH and, another
for CALL, in red-zone, we can allow local indirect jumps with
indirect thunk. */
@@ -467,6 +471,9 @@ ix86_using_red_zone (void)
{
return (TARGET_RED_ZONE
&& !TARGET_64BIT_MS_ABI
+ && (!TARGET_APX_EGPR
+ || (cfun->machine->call_saved_registers
+ != TYPE_NO_CALLER_SAVED_REGISTERS))
&& (!cfun->machine->has_local_indirect_jump
|| cfun->machine->indirect_branch_type == indirect_branch_keep));
}
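
To make the red-zone arithmetic concrete, a minimal sketch (not part of
the patch; the function name is hypothetical): with APX there are 32
GPRs, and a no_caller_saved_registers function may have to save all of
them except %rsp, i.e. 31 * 8 = 248 bytes, which cannot fit in the
128-byte red zone.

    /* Built with -mapxf: red-zone use is now disabled for this function.  */
    __attribute__ ((no_caller_saved_registers))
    void handler (void)
    {
      /* Body may clobber any GPR; all 31 must be saved and restored.  */
    }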
@@ -21810,6 +21817,25 @@ ix86_insn_cost (rtx_insn *insn, bool speed)
return insn_cost + pattern_cost (PATTERN (insn), speed);
}
+/* Return cost of SSE/AVX FP->FP conversion (extensions and truncates). */
+
+static int
+vec_fp_conversion_cost (const struct processor_costs *cost, int size)
+{
+ if (size < 128)
+ return cost->cvtss2sd;
+ else if (size < 256)
+ {
+ if (TARGET_SSE_SPLIT_REGS)
+ return cost->cvtss2sd * size / 64;
+ return cost->cvtss2sd;
+ }
+ if (size < 512)
+ return cost->vcvtps2pd256;
+ else
+ return cost->vcvtps2pd512;
+}
+
/* Compute a (partial) cost for rtx X. Return true if the complete
cost has been computed, and false if subexpressions should be
scanned. In either case, *TOTAL contains the cost result. */
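
A rough sketch of how the new helper buckets by bit size (illustrative
only; `cost' stands for any processor_costs table):

    vec_fp_conversion_cost (cost, 32);   /* scalar: cvtss2sd */
    vec_fp_conversion_cost (cost, 128);  /* V4SF: cvtss2sd, doubled on
					    TARGET_SSE_SPLIT_REGS targets */
    vec_fp_conversion_cost (cost, 256);  /* V8SF: vcvtps2pd256 */
    vec_fp_conversion_cost (cost, 512);  /* V16SF: vcvtps2pd512 */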
@@ -22473,17 +22499,18 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
return false;
case FLOAT_EXTEND:
+ /* x87 represents all values extended to 80 bits. */
if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
*total = 0;
else
- *total = ix86_vec_cost (mode, cost->addss);
+ *total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
return false;
case FLOAT_TRUNCATE:
if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
*total = cost->fadd;
else
- *total = ix86_vec_cost (mode, cost->addss);
+ *total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
return false;
case ABS:
@@ -23158,6 +23185,12 @@ x86_print_call_or_nop (FILE *file, const char *target)
if (flag_nop_mcount || !strcmp (target, "nop"))
/* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
+ else if (!TARGET_PECOFF && flag_pic)
+ {
+ gcc_assert (flag_plt);
+
+ fprintf (file, "1:\tcall\t%s@PLT\n", target);
+ }
else
fprintf (file, "1:\tcall\t%s\n", target);
}
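
Hypothetical assembler output (not from the patch) for -pg -fpic on
x86-64, where the profiler call is now routed through the PLT:

    1:	call	mcount@PLT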
@@ -23321,7 +23354,7 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
break;
case CM_SMALL_PIC:
case CM_MEDIUM_PIC:
- if (!ix86_direct_extern_access)
+ if (!flag_plt)
{
if (ASSEMBLER_DIALECT == ASM_INTEL)
fprintf (file, "1:\tcall\t[QWORD PTR %s@GOTPCREL[rip]]\n",
@@ -23352,7 +23385,9 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
"\tleal\t%sP%d@GOTOFF(%%ebx), %%" PROFILE_COUNT_REGISTER "\n",
LPREFIX, labelno);
#endif
- if (ASSEMBLER_DIALECT == ASM_INTEL)
+ if (flag_plt)
+ x86_print_call_or_nop (file, mcount_name);
+ else if (ASSEMBLER_DIALECT == ASM_INTEL)
fprintf (file, "1:\tcall\t[DWORD PTR %s@GOT[ebx]]\n", mcount_name);
else
fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
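
The two 32-bit PIC forms this now selects between, sketched in AT&T
syntax (illustrative only):

    1:	call	mcount@PLT		# flag_plt: direct call via the PLT
    1:	call	*mcount@GOT(%ebx)	# -fno-plt: indirect through the GOT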
@@ -24669,7 +24704,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
switch (type_of_cost)
{
case scalar_stmt:
- return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
+ return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
case scalar_load:
/* load/store costs are relative to register move which is 2. Recompute
@@ -24740,7 +24775,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
return ix86_cost->cond_not_taken_branch_cost;
case vec_perm:
+ return ix86_vec_cost (mode, ix86_cost->sse_op);
+
case vec_promote_demote:
+ if (fp)
+ return vec_fp_conversion_cost (ix86_tune_cost, GET_MODE_BITSIZE (mode));
return ix86_vec_cost (mode, ix86_cost->sse_op);
case vec_construct:
@@ -25261,7 +25300,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
else if (X87_FLOAT_MODE_P (mode))
stmt_cost = ix86_cost->fadd;
else
- stmt_cost = ix86_cost->add;
+ stmt_cost = ix86_cost->add;
}
else
stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss
@@ -25316,7 +25355,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
(subcode == RSHIFT_EXPR
&& !TYPE_UNSIGNED (TREE_TYPE (op1)))
? ASHIFTRT : LSHIFTRT, mode,
- TREE_CODE (op2) == INTEGER_CST,
+ TREE_CODE (op2) == INTEGER_CST,
cst_and_fits_in_hwi (op2)
? int_cst_value (op2) : -1,
false, false, NULL, NULL);
@@ -25325,27 +25364,152 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
case NOP_EXPR:
/* Only sign-conversions are free. */
if (tree_nop_conversion_p
- (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
+ (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
stmt_cost = 0;
+ else if (fp)
+ stmt_cost = vec_fp_conversion_cost
+ (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+ break;
+
+ case COND_EXPR:
+ {
+ /* SSE2 conditional move sequence is:
+ pcmpgtd %xmm5, %xmm0 (accounted separately)
+ pand %xmm0, %xmm2
+ pandn %xmm1, %xmm0
+ por %xmm2, %xmm0
+ while SSE4.1 uses cmp + blend
+ and AVX512 uses masked moves.
+
+ The condition is accounted separately since we usually have
+ p = a < b
+ c = p ? x : y
+ and we will account the first statement as setcc. The exception
+ is when p is loaded from memory as a bool; then we will not
+ account the compare, but there is no way to check for this. */
+
+ int ninsns = TARGET_SSE4_1 ? 1 : 3;
+
+ /* If one of the parameters is 0 or -1, the sequence is simplified:
+ (if_true & mask) | (if_false & ~mask) -> if_true & mask */
+ if (ninsns > 1
+ && (zerop (gimple_assign_rhs2 (stmt_info->stmt))
+ || zerop (gimple_assign_rhs3 (stmt_info->stmt))
+ || integer_minus_onep
+ (gimple_assign_rhs2 (stmt_info->stmt))
+ || integer_minus_onep
+ (gimple_assign_rhs3 (stmt_info->stmt))))
+ ninsns = 1;
+
+ if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ stmt_cost = ninsns * ix86_cost->sse_op;
+ else if (X87_FLOAT_MODE_P (mode))
+ /* x87 requires a conditional branch; we don't have a cost
+ for that. */
+ ;
+ else if (VECTOR_MODE_P (mode))
+ stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
+ else
+ /* compare (accounted separately) + cmov. */
+ stmt_cost = ix86_cost->add;
+ }
break;
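
The three-instruction SSE2 fallback costed above is the classic
mask-select idiom; a minimal intrinsics sketch (hypothetical helper,
not part of GCC; the pcmpgtd that builds MASK is costed separately):

    #include <emmintrin.h>

    /* c = mask ? if_true : if_false, as pand + pandn + por.  */
    static inline __m128i
    sse2_select (__m128i mask, __m128i if_true, __m128i if_false)
    {
      return _mm_or_si128 (_mm_and_si128 (mask, if_true),
			   _mm_andnot_si128 (mask, if_false));
    }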
- case BIT_IOR_EXPR:
- case ABS_EXPR:
- case ABSU_EXPR:
case MIN_EXPR:
case MAX_EXPR:
+ if (fp)
+ {
+ if (X87_FLOAT_MODE_P (mode))
+ /* x87 requires a conditional branch; we don't have a cost
+ for that. */
+ ;
+ else
+ /* minss */
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ }
+ else
+ {
+ if (VECTOR_MODE_P (mode))
+ {
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ /* vpmin was introduced in SSE3.
+ SSE2 needs pcmpgtd + pand + pandn + pxor.
+ If one of the parameters is 0 or -1, the sequence is simplified
+ to pcmpgtd + pand. */
+ if (!TARGET_SSSE3)
+ {
+ if (zerop (gimple_assign_rhs2 (stmt_info->stmt))
+ || integer_minus_onep
+ (gimple_assign_rhs2 (stmt_info->stmt)))
+ stmt_cost *= 2;
+ else
+ stmt_cost *= 4;
+ }
+ }
+ else
+ /* cmp + cmov. */
+ stmt_cost = ix86_cost->add * 2;
+ }
+ break;
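
For reference, the four-instruction SSE2 fallback behind the
`stmt_cost *= 4' above, as a hypothetical intrinsics helper:

    #include <emmintrin.h>

    /* Signed 32-bit min without SSE4.1: pcmpgtd + pand + pandn + por.  */
    static inline __m128i
    sse2_min_epi32 (__m128i a, __m128i b)
    {
      __m128i gt = _mm_cmpgt_epi32 (a, b);	    /* -1 where a > b */
      return _mm_or_si128 (_mm_and_si128 (gt, b),   /* b where a > b */
			   _mm_andnot_si128 (gt, a)); /* a elsewhere */
    }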
+
+ case ABS_EXPR:
+ case ABSU_EXPR:
+ if (fp)
+ {
+ if (X87_FLOAT_MODE_P (mode))
+ /* fabs. */
+ stmt_cost = ix86_cost->fabs;
+ else
+ /* andss of sign bit. */
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ }
+ else
+ {
+ if (VECTOR_MODE_P (mode))
+ {
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ /* vabs was introduced in SSSE3.
+ Without it we use psrad + pxor + psub. */
+ if (!TARGET_SSSE3)
+ stmt_cost *= 3;
+ }
+ else
+ /* neg + cmov. */
+ stmt_cost = ix86_cost->add * 2;
+ }
+ break;
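
Likewise, the three-instruction pre-SSSE3 sequence behind
`stmt_cost *= 3', sketched as a hypothetical helper:

    #include <emmintrin.h>

    /* 32-bit abs without pabsd: psrad + pxor + psub.  */
    static inline __m128i
    sse2_abs_epi32 (__m128i a)
    {
      __m128i sign = _mm_srai_epi32 (a, 31);  /* 0 or -1 per lane */
      /* (a ^ sign) - sign negates exactly the negative lanes.  */
      return _mm_sub_epi32 (_mm_xor_si128 (a, sign), sign);
    }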
+
+ case BIT_IOR_EXPR:
case BIT_XOR_EXPR:
case BIT_AND_EXPR:
case BIT_NOT_EXPR:
- if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
- stmt_cost = ix86_cost->sse_op;
- else if (VECTOR_MODE_P (mode))
+ gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)
+ && !X87_FLOAT_MODE_P (mode));
+ if (VECTOR_MODE_P (mode))
stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
else
stmt_cost = ix86_cost->add;
break;
+
default:
+ if (truth_value_p (subcode))
+ {
+ if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ /* CMPccS? instructions are cheap, so use sse_op. While they
+ produce a mask which may need to be turned into 0/1 by an and,
+ expect that this will be optimized away in the common case. */
+ stmt_cost = ix86_cost->sse_op;
+ else if (X87_FLOAT_MODE_P (mode))
+ /* fcmp + setcc. */
+ stmt_cost = ix86_cost->fadd + ix86_cost->add;
+ else if (VECTOR_MODE_P (mode))
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ else
+ /* setcc. */
+ stmt_cost = ix86_cost->add;
+ break;
+ }
break;
}
}
@@ -25369,6 +25533,29 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
break;
}
+ if (kind == vec_promote_demote
+ && fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
+ {
+ int outer_size
+ = tree_to_uhwi
+ (TYPE_SIZE
+ (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt))));
+ int inner_size
+ = tree_to_uhwi
+ (TYPE_SIZE
+ (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))));
+ stmt_cost = vec_fp_conversion_cost
+ (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+ /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we
+ will end up doing multiple conversions and packing them. */
+ if (inner_size > outer_size)
+ {
+ int n = inner_size / outer_size;
+ stmt_cost = stmt_cost * n
+ + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
+ }
+ }
+
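
A worked example of the demotion branch: for a double to float
truncation statement, inner_size is 64 and outer_size is 32, so n = 2
and the cost becomes 2 * vec_fp_conversion_cost plus 1 * sse_op, i.e.
two cvtpd2ps conversions whose results are packed into one vector.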
/* If we do elementwise loads into a vector then we are bound by
latency and execution resources for the many scalar loads
(AGU and load ports). Try to account for this by scaling the