Diffstat (limited to 'gcc/config/i386')
-rw-r--r--  gcc/config/i386/i386-expand.cc      50
-rw-r--r--  gcc/config/i386/i386-options.cc      4
-rw-r--r--  gcc/config/i386/i386.cc            164
-rw-r--r--  gcc/config/i386/i386.h               6
-rw-r--r--  gcc/config/i386/i386.md              4
-rw-r--r--  gcc/config/i386/predicates.md       14
-rw-r--r--  gcc/config/i386/sse.md              10
-rw-r--r--  gcc/config/i386/x86-tune-costs.h   123
-rw-r--r--  gcc/config/i386/x86-tune-sched.cc   15
9 files changed, 337 insertions(+), 53 deletions(-)
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index cdfd94d..a314800 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -4138,6 +4138,10 @@ ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
return false;
mode = GET_MODE (dest);
+ if (immediate_operand (if_false, mode))
+ if_false = force_reg (mode, if_false);
+ if (immediate_operand (if_true, mode))
+ if_true = force_reg (mode, if_true);
/* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
but MODE may be a vector mode and thus not appropriate. */
@@ -4687,6 +4691,8 @@ ix86_expand_fp_movcc (rtx operands[])
compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
}
+ operands[2] = force_reg (mode, operands[2]);
+ operands[3] = force_reg (mode, operands[3]);
emit_insn (gen_rtx_SET (operands[0],
gen_rtx_IF_THEN_ELSE (mode, compare_op,
operands[2], operands[3])));
@@ -19256,8 +19262,6 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
e1 = gen_reg_rtx (mode);
x1 = gen_reg_rtx (mode);
- /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
-
b = force_reg (mode, b);
/* x0 = rcp(b) estimate */
@@ -19270,20 +19274,42 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
UNSPEC_RCP)));
- /* e0 = x0 * b */
- emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
+ unsigned vector_size = GET_MODE_SIZE (mode);
+
+ /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
+ Newton-Raphson step implemented with 2 FMAs.  */
+ if (TARGET_FMA
+ || (TARGET_AVX512F && vector_size == 64)
+ || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
+ {
+ /* e0 = x0 * a */
+ emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
+ /* e1 = e0 * b - a */
+ emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b,
+ gen_rtx_NEG (mode, a))));
+ /* res = - e1 * x0 + e0 */
+ emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode,
+ gen_rtx_NEG (mode, e1),
+ x0, e0)));
+ }
+ else
+ /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
+ {
+ /* e0 = x0 * b */
+ emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
- /* e0 = x0 * e0 */
- emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
+ /* e1 = x0 + x0 */
+ emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
- /* e1 = x0 + x0 */
- emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
+ /* e0 = x0 * e0 */
+ emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
- /* x1 = e1 - e0 */
- emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
+ /* x1 = e1 - e0 */
+ emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
- /* res = a * x1 */
- emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
+ /* res = a * x1 */
+ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
+ }
}
/* Output code to perform a Newton-Raphson approximation of a
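
Note on the two sequences above (illustrative only, not part of the patch): both perform one Newton-Raphson refinement of the hardware reciprocal estimate, computing approximately a * rcp(b) * (2 - b * rcp(b)). A scalar C++ sketch, with rcp() standing in for the RCPSS estimate:

    #include <cmath>

    static float rcp (float b) { return 1.0f / b; }  /* stand-in for RCPSS */

    /* FMA variant: e0 = x0*a; e1 = e0*b - a; res = -e1*x0 + e0.  */
    float swdiv_fma (float a, float b)
    {
      float x0 = rcp (b);
      float e0 = x0 * a;
      float e1 = std::fma (e0, b, -a);
      return std::fma (-e1, x0, e0);
    }

    /* Multiply/add variant: a / b ~= a * ((x0 + x0) - b * x0 * x0).  */
    float swdiv_mul (float a, float b)
    {
      float x0 = rcp (b);
      float e0 = x0 * b;
      float e1 = x0 + x0;
      e0 = x0 * e0;
      float x1 = e1 - e0;
      return a * x1;
    }

Both expand algebraically to a * x0 * (2 - b * x0); the FMA form keeps the residual (e0*b - a) in a single fused step, which is why it is gated on TARGET_FMA or the AVX512 variants checked above.
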
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index a9fac01..964449f 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2828,8 +2828,8 @@ ix86_option_override_internal (bool main_args_p,
if (flag_nop_mcount)
error ("%<-mnop-mcount%> is not compatible with this target");
#endif
- if (flag_nop_mcount && flag_pic)
- error ("%<-mnop-mcount%> is not implemented for %<-fPIC%>");
+ if (flag_nop_mcount && flag_pic && !flag_plt)
+ error ("%<-mnop-mcount%> is not implemented for %<-fno-plt%>");
/* Accept -msseregparm only if at least SSE support is enabled. */
if (TARGET_SSEREGPARM_P (opts->x_target_flags)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index b172f71..aef4145 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -100,6 +100,7 @@ along with GCC; see the file COPYING3. If not see
#include "i386-features.h"
#include "function-abi.h"
#include "rtl-error.h"
+#include "gimple-pretty-print.h"
/* This file should be included last. */
#include "target-def.h"
@@ -21816,6 +21817,25 @@ ix86_insn_cost (rtx_insn *insn, bool speed)
return insn_cost + pattern_cost (PATTERN (insn), speed);
}
+/* Return cost of SSE/AVX FP->FP conversion (extensions and truncates). */
+
+static int
+vec_fp_conversion_cost (const struct processor_costs *cost, int size)
+{
+ if (size < 128)
+ return cost->cvtss2sd;
+ else if (size < 256)
+ {
+ if (TARGET_SSE_SPLIT_REGS)
+ return cost->cvtss2sd * size / 64;
+ return cost->cvtss2sd;
+ }
+ if (size < 512)
+ return cost->vcvtps2pd256;
+ else
+ return cost->vcvtps2pd512;
+}
+
/* Compute a (partial) cost for rtx X. Return true if the complete
cost has been computed, and false if subexpressions should be
scanned. In either case, *TOTAL contains the cost result. */
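
For orientation, an illustrative summary (not part of the patch) of what the helper above returns, using the generic_cost values added later in this patch and assuming TARGET_SSE_SPLIT_REGS is false:

    /* Hypothetical calls to vec_fp_conversion_cost (&generic_cost, bitsize):
         bitsize  64 -> cvtss2sd     = COSTS_N_INSNS (3)   (scalar conversion)
         bitsize 128 -> cvtss2sd     = COSTS_N_INSNS (3)
         bitsize 256 -> vcvtps2pd256 = COSTS_N_INSNS (4)
         bitsize 512 -> vcvtps2pd512 = COSTS_N_INSNS (5)
       With TARGET_SSE_SPLIT_REGS the 128-bit case is scaled by size / 64.  */
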
@@ -22479,17 +22499,18 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
return false;
case FLOAT_EXTEND:
+ /* x87 represents all values extended to 80bit. */
if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
*total = 0;
else
- *total = ix86_vec_cost (mode, cost->addss);
+ *total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
return false;
case FLOAT_TRUNCATE:
if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
*total = cost->fadd;
else
- *total = ix86_vec_cost (mode, cost->addss);
+ *total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
return false;
case ABS:
@@ -23164,6 +23185,12 @@ x86_print_call_or_nop (FILE *file, const char *target)
if (flag_nop_mcount || !strcmp (target, "nop"))
/* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
+ else if (!TARGET_PECOFF && flag_pic)
+ {
+ gcc_assert (flag_plt);
+
+ fprintf (file, "1:\tcall\t%s@PLT\n", target);
+ }
else
fprintf (file, "1:\tcall\t%s\n", target);
}
@@ -23327,7 +23354,7 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
break;
case CM_SMALL_PIC:
case CM_MEDIUM_PIC:
- if (!ix86_direct_extern_access)
+ if (!flag_plt)
{
if (ASSEMBLER_DIALECT == ASM_INTEL)
fprintf (file, "1:\tcall\t[QWORD PTR %s@GOTPCREL[rip]]\n",
@@ -23358,7 +23385,9 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
"\tleal\t%sP%d@GOTOFF(%%ebx), %%" PROFILE_COUNT_REGISTER "\n",
LPREFIX, labelno);
#endif
- if (ASSEMBLER_DIALECT == ASM_INTEL)
+ if (flag_plt)
+ x86_print_call_or_nop (file, mcount_name);
+ else if (ASSEMBLER_DIALECT == ASM_INTEL)
fprintf (file, "1:\tcall\t[DWORD PTR %s@GOT[ebx]]\n", mcount_name);
else
fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
@@ -24675,7 +24704,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
switch (type_of_cost)
{
case scalar_stmt:
- return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
+ return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
case scalar_load:
/* load/store costs are relative to register move which is 2. Recompute
@@ -24746,7 +24775,11 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
return ix86_cost->cond_not_taken_branch_cost;
case vec_perm:
+ return ix86_vec_cost (mode, ix86_cost->sse_op);
+
case vec_promote_demote:
+ if (fp)
+ return vec_fp_conversion_cost (ix86_tune_cost, mode);
return ix86_vec_cost (mode, ix86_cost->sse_op);
case vec_construct:
@@ -25267,7 +25300,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
else if (X87_FLOAT_MODE_P (mode))
stmt_cost = ix86_cost->fadd;
else
- stmt_cost = ix86_cost->add;
+ stmt_cost = ix86_cost->add;
}
else
stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss
@@ -25322,7 +25355,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
(subcode == RSHIFT_EXPR
&& !TYPE_UNSIGNED (TREE_TYPE (op1)))
? ASHIFTRT : LSHIFTRT, mode,
- TREE_CODE (op2) == INTEGER_CST,
+ TREE_CODE (op2) == INTEGER_CST,
cst_and_fits_in_hwi (op2)
? int_cst_value (op2) : -1,
false, false, NULL, NULL);
@@ -25331,22 +25364,102 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
case NOP_EXPR:
/* Only sign-conversions are free. */
if (tree_nop_conversion_p
- (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
+ (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
stmt_cost = 0;
+ else if (fp)
+ stmt_cost = vec_fp_conversion_cost
+ (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+ break;
+
+ case COND_EXPR:
+ {
+ /* SSE2 conditional move sequence is:
+ pcmpgtd %xmm5, %xmm0
+ pand %xmm0, %xmm2
+ pandn %xmm1, %xmm0
+ por %xmm2, %xmm0
+ while SSE4 uses cmp + blend
+ and AVX512 masked moves. */
+
+ int ninsns = TARGET_SSE4_1 ? 2 : 4;
+
+ if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ stmt_cost = ninsns * ix86_cost->sse_op;
+ else if (X87_FLOAT_MODE_P (mode))
+ /* x87 requires conditional branch. We don't have cost for
+ that. */
+ ;
+ else if (VECTOR_MODE_P (mode))
+ stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
+ else
+ /* compare + cmov. */
+ stmt_cost = ix86_cost->add * 2;
+ }
break;
- case BIT_IOR_EXPR:
- case ABS_EXPR:
- case ABSU_EXPR:
case MIN_EXPR:
case MAX_EXPR:
+ if (fp)
+ {
+ if (X87_FLOAT_MODE_P (mode))
+ /* x87 requires conditional branch. We don't have cost for
+ that. */
+ ;
+ else
+ /* minss */
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ }
+ else
+ {
+ if (VECTOR_MODE_P (mode))
+ {
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ /* vpmin was introduced in SSE3.
+ SSE2 needs pcmpgtd + pand + pandn + pxor. */
+ if (!TARGET_SSSE3)
+ stmt_cost *= 4;
+ }
+ else
+ /* cmp + cmov. */
+ stmt_cost = ix86_cost->add * 2;
+ }
+ break;
+
+ case ABS_EXPR:
+ case ABSU_EXPR:
+ if (fp)
+ {
+ if (X87_FLOAT_MODE_P (mode))
+ /* fabs. */
+ stmt_cost = ix86_cost->fabs;
+ else
+ /* andss of sign bit. */
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ }
+ else
+ {
+ if (VECTOR_MODE_P (mode))
+ {
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ /* vabs was introduced in SSSE3.
+ SSE2 needs psrad + pxor + psub. */
+ if (!TARGET_SSSE3)
+ stmt_cost *= 3;
+ }
+ else
+ /* neg + cmov. */
+ stmt_cost = ix86_cost->add * 2;
+ }
+ break;
+
+ case BIT_IOR_EXPR:
case BIT_XOR_EXPR:
case BIT_AND_EXPR:
case BIT_NOT_EXPR:
- if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
- stmt_cost = ix86_cost->sse_op;
- else if (VECTOR_MODE_P (mode))
+ gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)
+ && !X87_FLOAT_MODE_P (mode));
+ if (VECTOR_MODE_P (mode))
stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
else
stmt_cost = ix86_cost->add;
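
The ninsns = 2 vs 4 distinction above corresponds to the two blend idioms. An illustrative intrinsics sketch (not part of the patch) of selecting between two V4SI vectors on a mask computed by a greater-than compare:

    #include <emmintrin.h>   // SSE2
    #include <smmintrin.h>   // SSE4.1

    // SSE2: 4 instructions (pcmpgtd + pand + pandn + por).
    __m128i select_sse2 (__m128i x, __m128i y, __m128i a, __m128i b)
    {
      __m128i m = _mm_cmpgt_epi32 (x, y);
      __m128i t = _mm_and_si128 (m, a);
      __m128i f = _mm_andnot_si128 (m, b);
      return _mm_or_si128 (t, f);
    }

    // SSE4.1: 2 instructions (pcmpgtd + pblendvb).
    __m128i select_sse41 (__m128i x, __m128i y, __m128i a, __m128i b)
    {
      __m128i m = _mm_cmpgt_epi32 (x, y);
      return _mm_blendv_epi8 (b, a, m);
    }

On AVX-512 targets the same selection is a compare into a mask register followed by a masked move, which is why the cost stays at a small constant multiple of sse_op.
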
@@ -25375,6 +25488,29 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
break;
}
+ if (kind == vec_promote_demote
+ && fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
+ {
+ int outer_size
+ = tree_to_uhwi
+ (TYPE_SIZE
+ (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt))));
+ int inner_size
+ = tree_to_uhwi
+ (TYPE_SIZE
+ (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))));
+ stmt_cost = vec_fp_conversion_cost
+ (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+ /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end
+ up doing two conversions and packing them. */
+ if (inner_size > outer_size)
+ {
+ int n = inner_size / outer_size;
+ stmt_cost = stmt_cost * n
+ + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
+ }
+ }
+
/* If we do elementwise loads into a vector then we are bound by
latency and execution resources for the many scalar loads
(AGU and load ports). Try to account for this by scaling the
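
A worked example of the VEC_PACK_TRUNC adjustment just above (illustrative numbers only): demoting double to float has inner_size = 64 and outer_size = 32, so n = 2 and the statement is costed as two narrowing conversions plus one pack of the halves:

    /* inner_size = 64 (double), outer_size = 32 (float)  =>  n = 2
       stmt_cost = 2 * vec_fp_conversion_cost (ix86_tune_cost, bitsize)
                   + 1 * ix86_vec_cost (mode, ix86_cost->sse_op);  */
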
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8507243..18aa42d 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -207,6 +207,12 @@ struct processor_costs {
const int divsd; /* cost of DIVSD instructions. */
const int sqrtss; /* cost of SQRTSS instructions. */
const int sqrtsd; /* cost of SQRTSD instructions. */
+ const int cvtss2sd; /* cost of SSE FP conversions,
+ such as CVTSS2SD. */
+ const int vcvtps2pd256; /* cost of 256bit packed FP conversions,
+ such as VCVTPD2PS with the larger reg in ymm. */
+ const int vcvtps2pd512; /* cost of 512bit packed FP conversions,
+ such as VCVTPD2PS with the larger reg in zmm. */
const int reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp;
/* Specify reassociation width for integer,
fp, vector integer and vector fp
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d6b2f29..e170da3 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -26592,8 +26592,8 @@
[(set (match_operand:X87MODEF 0 "register_operand")
(if_then_else:X87MODEF
(match_operand 1 "comparison_operator")
- (match_operand:X87MODEF 2 "register_operand")
- (match_operand:X87MODEF 3 "register_operand")))]
+ (match_operand:X87MODEF 2 "nonimm_or_0_or_1s_operand")
+ (match_operand:X87MODEF 3 "nonimm_or_0_operand")))]
"(TARGET_80387 && TARGET_CMOVE)
|| (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
"if (ix86_expand_fp_movcc (operands)) DONE; else FAIL;")
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 3d3848c..4b23e18 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1267,6 +1267,14 @@
(match_operand 0 "vector_memory_operand")
(match_code "const_vector")))
+; Return true when OP is register_operand, vector_memory_operand,
+; const_vector zero or const_vector all ones.
+(define_predicate "vector_or_0_or_1s_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_operand 0 "vector_memory_operand")
+ (match_operand 0 "const0_operand")
+ (match_operand 0 "int_float_vector_all_ones_operand")))
+
(define_predicate "bcst_mem_operand"
(and (match_code "vec_duplicate")
(and (match_test "TARGET_AVX512F")
@@ -1333,6 +1341,12 @@
(ior (match_operand 0 "nonimmediate_operand")
(match_operand 0 "const0_operand")))
+; Return true when OP is a nonimmediate or zero or all ones.
+(define_predicate "nonimm_or_0_or_1s_operand"
+ (ior (match_operand 0 "nonimmediate_operand")
+ (match_operand 0 "const0_operand")
+ (match_operand 0 "int_float_vector_all_ones_operand")))
+
;; Return true for RTX codes that force SImode address.
(define_predicate "SImode_address_operand"
(match_code "subreg,zero_extend,and"))
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b280676..20b35a1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5142,7 +5142,7 @@
(define_expand "vcond_mask_<mode><sseintvecmodelower>"
[(set (match_operand:VI_256_AVX2 0 "register_operand")
(vec_merge:VI_256_AVX2
- (match_operand:VI_256_AVX2 1 "nonimmediate_operand")
+ (match_operand:VI_256_AVX2 1 "nonimm_or_0_or_1s_operand")
(match_operand:VI_256_AVX2 2 "nonimm_or_0_operand")
(match_operand:<sseintvecmode> 3 "register_operand")))]
"TARGET_AVX"
@@ -5155,7 +5155,7 @@
(define_expand "vcond_mask_<mode><sseintvecmodelower>"
[(set (match_operand:VI_128 0 "register_operand")
(vec_merge:VI_128
- (match_operand:VI_128 1 "vector_operand")
+ (match_operand:VI_128 1 "vector_or_0_or_1s_operand")
(match_operand:VI_128 2 "nonimm_or_0_operand")
(match_operand:<sseintvecmode> 3 "register_operand")))]
"TARGET_SSE2"
@@ -5168,7 +5168,7 @@
(define_expand "vcond_mask_v1tiv1ti"
[(set (match_operand:V1TI 0 "register_operand")
(vec_merge:V1TI
- (match_operand:V1TI 1 "vector_operand")
+ (match_operand:V1TI 1 "vector_or_0_or_1s_operand")
(match_operand:V1TI 2 "nonimm_or_0_operand")
(match_operand:V1TI 3 "register_operand")))]
"TARGET_SSE2"
@@ -5181,7 +5181,7 @@
(define_expand "vcond_mask_<mode><sseintvecmodelower>"
[(set (match_operand:VF_256 0 "register_operand")
(vec_merge:VF_256
- (match_operand:VF_256 1 "nonimmediate_operand")
+ (match_operand:VF_256 1 "nonimm_or_0_or_1s_operand")
(match_operand:VF_256 2 "nonimm_or_0_operand")
(match_operand:<sseintvecmode> 3 "register_operand")))]
"TARGET_AVX"
@@ -5194,7 +5194,7 @@
(define_expand "vcond_mask_<mode><sseintvecmodelower>"
[(set (match_operand:VF_128 0 "register_operand")
(vec_merge:VF_128
- (match_operand:VF_128 1 "vector_operand")
+ (match_operand:VF_128 1 "vector_or_0_or_1s_operand")
(match_operand:VF_128 2 "nonimm_or_0_operand")
(match_operand:<sseintvecmode> 3 "register_operand")))]
"TARGET_SSE"
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 7c8cb73..cddcf61 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -121,16 +121,19 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
COSTS_N_BYTES (2), /* cost of FCHS instruction. */
COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
- COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
- COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
- COSTS_N_BYTES (2), /* cost of MULSS instruction. */
- COSTS_N_BYTES (2), /* cost of MULSD instruction. */
- COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
- COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
- COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
- COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
- COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
- COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
+ COSTS_N_BYTES (4), /* cost of cheap SSE instruction. */
+ COSTS_N_BYTES (4), /* cost of ADDSS/SD SUBSS/SD insns. */
+ COSTS_N_BYTES (4), /* cost of MULSS instruction. */
+ COSTS_N_BYTES (4), /* cost of MULSD instruction. */
+ COSTS_N_BYTES (4), /* cost of FMA SS instruction. */
+ COSTS_N_BYTES (4), /* cost of FMA SD instruction. */
+ COSTS_N_BYTES (4), /* cost of DIVSS instruction. */
+ COSTS_N_BYTES (4), /* cost of DIVSD instruction. */
+ COSTS_N_BYTES (4), /* cost of SQRTSS instruction. */
+ COSTS_N_BYTES (4), /* cost of SQRTSD instruction. */
+ COSTS_N_BYTES (4), /* cost of CVTSS2SD etc. */
+ COSTS_N_BYTES (4), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_BYTES (6), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
ix86_size_memcpy,
ix86_size_memset,
@@ -243,6 +246,9 @@ struct processor_costs i386_cost = { /* 386 specific costs */
COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (27), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (54), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (108), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
i386_memcpy,
i386_memset,
@@ -356,6 +362,9 @@ struct processor_costs i486_cost = { /* 486 specific costs */
COSTS_N_INSNS (74), /* cost of DIVSD instruction. */
COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (8), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (16), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (32), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
i486_memcpy,
i486_memset,
@@ -467,6 +476,9 @@ struct processor_costs pentium_cost = {
COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentium_memcpy,
pentium_memset,
@@ -571,6 +583,9 @@ struct processor_costs lakemont_cost = {
COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (5), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (10), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (20), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentium_memcpy,
pentium_memset,
@@ -690,6 +705,9 @@ struct processor_costs pentiumpro_cost = {
COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentiumpro_memcpy,
pentiumpro_memset,
@@ -800,6 +818,9 @@ struct processor_costs geode_cost = {
COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (6), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (12), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (24), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
geode_memcpy,
geode_memset,
@@ -913,6 +934,9 @@ struct processor_costs k6_cost = {
COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (4), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (8), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
k6_memcpy,
k6_memset,
@@ -1027,6 +1051,9 @@ struct processor_costs athlon_cost = {
COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (8), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (16), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
athlon_memcpy,
athlon_memset,
@@ -1150,6 +1177,9 @@ struct processor_costs k8_cost = {
COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (8), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (16), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
k8_memcpy,
k8_memset,
@@ -1281,6 +1311,9 @@ struct processor_costs amdfam10_cost = {
COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (8), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (16), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
amdfam10_memcpy,
amdfam10_memset,
@@ -1405,6 +1438,9 @@ const struct processor_costs bdver_cost = {
COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (7), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (14), /* cost of 512bit VCVTPS2PD etc. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
bdver_memcpy,
bdver_memset,
@@ -1553,6 +1589,10 @@ struct processor_costs znver1_cost = {
COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ /* Real latency is 4, but for split regs multiply cost of half op by 2. */
+ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
/* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
and it can execute 2 integer additions and 2 multiplications thus
reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests
@@ -1712,6 +1752,9 @@ struct processor_costs znver2_cost = {
COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (10), /* cost of 512bit VCVTPS2PD etc. */
/* Zen can execute 4 integer operations per cycle. FP operations
take 3 cycles and it can execute 2 integer additions and 2
multiplications thus reassociation may make sense up to with of 6.
@@ -1847,6 +1890,9 @@ struct processor_costs znver3_cost = {
COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (10), /* cost of 512bit VCVTPS2PD etc. */
/* Zen can execute 4 integer operations per cycle. FP operations
take 3 cycles and it can execute 2 integer additions and 2
multiplications thus reassociation may make sense up to with of 6.
@@ -1984,6 +2030,10 @@ struct processor_costs znver4_cost = {
COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */
+ /* Real latency is 6, but for split regs multiply cost of half op by 2. */
+ COSTS_N_INSNS (10), /* cost of 512bit VCVTPS2PD etc. */
/* Zen can execute 4 integer operations per cycle. FP operations
take 3 cycles and it can execute 2 integer additions and 2
multiplications thus reassociation may make sense up to with of 6.
@@ -2120,7 +2170,7 @@ struct processor_costs znver5_cost = {
COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
/* ADDSS has throughput 2 and latency 2
(in some cases when source is another addition). */
- COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
+ COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
/* MULSS has throughput 2 and latency 3. */
COSTS_N_INSNS (3), /* cost of MULSS instruction. */
COSTS_N_INSNS (3), /* cost of MULSD instruction. */
@@ -2135,6 +2185,9 @@ struct processor_costs znver5_cost = {
COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
/* DIVSD has throughtput 0.13 and latency 20. */
COSTS_N_INSNS (20), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (5), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (5), /* cost of 512bit VCVTPS2PD etc. */
/* Zen5 can execute:
- integer ops: 6 per cycle, at most 3 multiplications.
latency 1 for additions, 3 for multiplications (pipelined)
@@ -2274,6 +2327,9 @@ struct processor_costs skylake_cost = {
COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (2), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (4), /* cost of 512bit VCVTPS2PD etc. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
skylake_memcpy,
skylake_memset,
@@ -2403,6 +2459,9 @@ struct processor_costs icelake_cost = {
COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (2), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (2), /* cost of 512bit VCVTPS2PD etc. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
icelake_memcpy,
icelake_memset,
@@ -2526,6 +2585,9 @@ struct processor_costs alderlake_cost = {
COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (2), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (2), /* cost of 512bit VCVTPS2PD etc. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
alderlake_memcpy,
alderlake_memset,
@@ -2642,6 +2704,9 @@ const struct processor_costs btver1_cost = {
COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (7), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (14), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
btver1_memcpy,
btver1_memset,
@@ -2755,6 +2820,9 @@ const struct processor_costs btver2_cost = {
COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (4), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (7), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (14), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
btver2_memcpy,
btver2_memset,
@@ -2867,6 +2935,9 @@ struct processor_costs pentium4_cost = {
COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (10), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (20), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (40), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentium4_memcpy,
pentium4_memset,
@@ -2982,6 +3053,9 @@ struct processor_costs nocona_cost = {
COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (10), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (20), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (40), /* cost of 512bit VCVTPS2PD etc. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
nocona_memcpy,
nocona_memset,
@@ -3095,6 +3169,9 @@ struct processor_costs atom_cost = {
COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (6), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (12), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (24), /* cost of 512bit VCVTPS2PD etc. */
2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
atom_memcpy,
atom_memset,
@@ -3208,6 +3285,9 @@ struct processor_costs slm_cost = {
COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
slm_memcpy,
slm_memset,
@@ -3335,6 +3415,9 @@ struct processor_costs tremont_cost = {
COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
tremont_memcpy,
tremont_memset,
@@ -3448,6 +3531,9 @@ struct processor_costs intel_cost = {
COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (8), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (16), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (32), /* cost of 512bit VCVTPS2PD etc. */
1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
intel_memcpy,
intel_memset,
@@ -3566,6 +3652,9 @@ struct processor_costs lujiazui_cost = {
COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (60), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
lujiazui_memcpy,
lujiazui_memset,
@@ -3682,6 +3771,9 @@ struct processor_costs yongfeng_cost = {
COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
yongfeng_memcpy,
yongfeng_memset,
@@ -3798,6 +3890,9 @@ struct processor_costs shijidadao_cost = {
COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
COSTS_N_INSNS (11), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (6), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (12), /* cost of 512bit VCVTPS2PD etc. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
shijidadao_memcpy,
shijidadao_memset,
@@ -3922,6 +4017,9 @@ struct processor_costs generic_cost = {
COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (3), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (4), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (5), /* cost of 512bit VCVTPS2PD etc. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
generic_memcpy,
generic_memset,
@@ -4051,6 +4149,9 @@ struct processor_costs core_cost = {
COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
+ COSTS_N_INSNS (2), /* cost of CVTSS2SD etc. */
+ COSTS_N_INSNS (2), /* cost of 256bit VCVTPS2PD etc. */
+ COSTS_N_INSNS (2), /* cost of 512bit VCVTPS2PD etc. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
core_memcpy,
core_memset,
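
A quick consistency note on the split-register entries above (derived purely from the table comments): when a wide conversion executes as two half-width halves, the entry is the half-width cost doubled rather than the measured latency.

    /* znver1: cvtss2sd = COSTS_N_INSNS (3); 256-bit conversions split into two
       128-bit halves, so vcvtps2pd256 = 2 * 3 = COSTS_N_INSNS (6) although the
       measured latency is 4 (see the znver1_cost comment).
       znver4: vcvtps2pd256 = COSTS_N_INSNS (5); 512-bit conversions split, so
       vcvtps2pd512 = 2 * 5 = COSTS_N_INSNS (10), real latency 6.  */
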
diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc
index 685a83c..15d3d91 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -81,6 +81,14 @@ ix86_issue_rate (void)
case PROCESSOR_YONGFENG:
case PROCESSOR_SHIJIDADAO:
case PROCESSOR_GENERIC:
+ /* For znver5 the decoder can handle 4 or 8 instructions per cycle,
+ the op cache 12 instructions/cycle, dispatch 8 instructions,
+ integer rename 8 instructions and FP rename 6 instructions.
+
+ The scheduler, which does not model the out-of-order nature of the CPU,
+ is not going to be able to use more than 4 instructions since that
+ is the limit of the decoders. */
+ case PROCESSOR_ZNVER5:
return 4;
case PROCESSOR_ICELAKE_CLIENT:
@@ -91,13 +99,6 @@ ix86_issue_rate (void)
return 5;
case PROCESSOR_SAPPHIRERAPIDS:
- /* For znver5 decoder can handle 4 or 8 instructions per cycle,
- op cache 12 instruction/cycle, dispatch 8 instructions
- integer rename 8 instructions and Fp 6 instructions.
-
- The scheduler, without understanding out of order nature of the CPU
- is unlikely going to be able to fill all of these. */
- case PROCESSOR_ZNVER5:
return 6;
default: