Diffstat (limited to 'gcc/config/i386/i386.cc')
-rw-r--r--  gcc/config/i386/i386.cc  1490
1 file changed, 1211 insertions, 279 deletions
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 28603c2..313522b 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -335,6 +335,14 @@ static int const x86_64_ms_abi_int_parameter_registers[4] =
CX_REG, DX_REG, R8_REG, R9_REG
};
+/* Similar to Clang's preserve_none function parameter passing.
+ NB: Use DI_REG and SI_REG, see ix86_function_value_regno_p. */
+
+static int const x86_64_preserve_none_int_parameter_registers[6] =
+{
+ R12_REG, R13_REG, R14_REG, R15_REG, DI_REG, SI_REG
+};
+
static int const x86_64_int_return_registers[4] =
{
AX_REG, DX_REG, DI_REG, SI_REG
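A minimal usage sketch (not taken from the patch; the attribute spelling matches the lookup_attribute calls added later in this file): with preserve_none, the first integer arguments of a 64-bit function are passed in the registers listed in x86_64_preserve_none_int_parameter_registers.

__attribute__((preserve_none)) long
pn_callback (long a, long b, long c)
{
  /* Assuming the table above: a arrives in %r12, b in %r13, c in %r14.  */
  return a + b + c;
}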
@@ -460,7 +468,8 @@ int ix86_arch_specified;
red-zone.
NB: Don't use red-zone for functions with no_caller_saved_registers
- and 32 GPRs since 128-byte red-zone is too small for 31 GPRs.
+ and 32 GPRs or 16 XMM registers since 128-byte red-zone is too small
+ for 31 GPRs or 15 GPRs + 16 XMM registers.
TODO: If we can reserve the first 2 WORDs, for PUSH and, another
for CALL, in red-zone, we can allow local indirect jumps with
@@ -471,7 +480,7 @@ ix86_using_red_zone (void)
{
return (TARGET_RED_ZONE
&& !TARGET_64BIT_MS_ABI
- && (!TARGET_APX_EGPR
+ && ((!TARGET_APX_EGPR && !TARGET_SSE)
|| (cfun->machine->call_saved_registers
!= TYPE_NO_CALLER_SAVED_REGISTERS))
&& (!cfun->machine->has_local_indirect_jump
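A rough sizing check behind the comment above, assuming 8-byte GPR saves and 16-byte XMM saves: with APX, 31 saved GPRs need 31 x 8 = 248 bytes; without APX but with SSE, 15 GPRs plus 16 XMM registers need 15 x 8 + 16 x 16 = 376 bytes. Both exceed the 128-byte red zone, hence the added !TARGET_SSE condition.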
@@ -898,6 +907,18 @@ x86_64_elf_unique_section (tree decl, int reloc)
default_unique_section (decl, reloc);
}
+/* Return true if TYPE has no_callee_saved_registers or preserve_none
+ attribute. */
+
+bool
+ix86_type_no_callee_saved_registers_p (const_tree type)
+{
+ return (lookup_attribute ("no_callee_saved_registers",
+ TYPE_ATTRIBUTES (type)) != NULL
+ || lookup_attribute ("preserve_none",
+ TYPE_ATTRIBUTES (type)) != NULL);
+}
+
#ifdef COMMON_ASM_OP
#ifndef LARGECOMM_SECTION_ASM_OP
@@ -1019,11 +1040,10 @@ ix86_function_ok_for_sibcall (tree decl, tree exp)
/* Sibling call isn't OK if callee has no callee-saved registers
and the calling function has callee-saved registers. */
- if (cfun->machine->call_saved_registers != TYPE_NO_CALLEE_SAVED_REGISTERS
- && (cfun->machine->call_saved_registers
- != TYPE_NO_CALLEE_SAVED_REGISTERS_EXCEPT_BP)
- && lookup_attribute ("no_callee_saved_registers",
- TYPE_ATTRIBUTES (type)))
+ if ((cfun->machine->call_saved_registers
+ != TYPE_NO_CALLEE_SAVED_REGISTERS)
+ && cfun->machine->call_saved_registers != TYPE_PRESERVE_NONE
+ && ix86_type_no_callee_saved_registers_p (type))
return false;
/* If outgoing reg parm stack space changes, we cannot do sibcall. */
@@ -1188,10 +1208,16 @@ ix86_comp_type_attributes (const_tree type1, const_tree type2)
!= ix86_function_regparm (type2, NULL))
return 0;
- if (lookup_attribute ("no_callee_saved_registers",
- TYPE_ATTRIBUTES (type1))
- != lookup_attribute ("no_callee_saved_registers",
- TYPE_ATTRIBUTES (type2)))
+ if (ix86_type_no_callee_saved_registers_p (type1)
+ != ix86_type_no_callee_saved_registers_p (type2))
+ return 0;
+
+ /* The preserve_none attribute uses a different calling convention and
+ is only for 64-bit. */
+ if (TARGET_64BIT
+ && (lookup_attribute ("preserve_none", TYPE_ATTRIBUTES (type1))
+ != lookup_attribute ("preserve_none",
+ TYPE_ATTRIBUTES (type2))))
return 0;
return 1;
@@ -1553,7 +1579,10 @@ ix86_function_arg_regno_p (int regno)
if (call_abi == SYSV_ABI && regno == AX_REG)
return true;
- if (call_abi == MS_ABI)
+ if (cfun
+ && cfun->machine->call_saved_registers == TYPE_PRESERVE_NONE)
+ parm_regs = x86_64_preserve_none_int_parameter_registers;
+ else if (call_abi == MS_ABI)
parm_regs = x86_64_ms_abi_int_parameter_registers;
else
parm_regs = x86_64_int_parameter_registers;
@@ -1716,6 +1745,19 @@ ix86_asm_output_function_label (FILE *out_file, const char *fname,
}
}
+/* Output a user-defined label. In AT&T syntax, registers are prefixed
+ with %, so labels require no punctuation. In Intel syntax, registers
+ are unprefixed, so labels may clash with registers or other operators,
+ and require quoting. */
+void
+ix86_asm_output_labelref (FILE *file, const char *prefix, const char *label)
+{
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ fprintf (file, "%s%s", prefix, label);
+ else
+ fprintf (file, "\"%s%s\"", prefix, label);
+}
+
/* Implementation of call abi switching target hook. Specific to FNDECL
the specific call register sets are set. See also
ix86_conditional_register_usage for more details. */
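To illustrate the quoting that ix86_asm_output_labelref performs (an illustrative sketch, not output copied from GCC; the exact operand syntax around the label may differ):

int eax;   /* a global whose name collides with a register name */

/* AT&T output can reference it unquoted, e.g. eax(%rip), while Intel
   syntax needs the quoted form "eax" so the assembler does not read
   it as the register.  */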
@@ -1795,8 +1837,7 @@ ix86_init_pic_reg (void)
add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
}
- seq = get_insns ();
- end_sequence ();
+ seq = end_sequence ();
entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
insert_insn_on_edge (seq, entry_edge);
@@ -1823,6 +1864,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
memset (cum, 0, sizeof (*cum));
+ tree preserve_none_type;
if (fndecl)
{
target = cgraph_node::get (fndecl);
@@ -1831,12 +1873,24 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
target = target->function_symbol ();
local_info_node = cgraph_node::local_info_node (target->decl);
cum->call_abi = ix86_function_abi (target->decl);
+ preserve_none_type = TREE_TYPE (target->decl);
}
else
- cum->call_abi = ix86_function_abi (fndecl);
+ {
+ cum->call_abi = ix86_function_abi (fndecl);
+ preserve_none_type = TREE_TYPE (fndecl);
+ }
}
else
- cum->call_abi = ix86_function_type_abi (fntype);
+ {
+ cum->call_abi = ix86_function_type_abi (fntype);
+ preserve_none_type = fntype;
+ }
+ cum->preserve_none_abi
+ = (preserve_none_type
+ && (lookup_attribute ("preserve_none",
+ TYPE_ATTRIBUTES (preserve_none_type))
+ != nullptr));
cum->caller = caller;
@@ -1998,8 +2052,7 @@ type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
&& GET_MODE_INNER (mode) == innermode)
{
- if (size == 64 && (!TARGET_AVX512F || !TARGET_EVEX512)
- && !TARGET_IAMCU)
+ if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
{
static bool warnedavx512f;
static bool warnedavx512f_ret;
@@ -3410,9 +3463,15 @@ function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
break;
}
+ const int *parm_regs;
+ if (cum->preserve_none_abi)
+ parm_regs = x86_64_preserve_none_int_parameter_registers;
+ else
+ parm_regs = x86_64_int_parameter_registers;
+
return construct_container (mode, orig_mode, type, 0, cum->nregs,
cum->sse_nregs,
- &x86_64_int_parameter_registers [cum->regno],
+ &parm_regs[cum->regno],
cum->sse_regno);
}
@@ -4422,7 +4481,7 @@ ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
/* AVX512F values are returned in ZMM0 if available. */
if (size == 64)
- return !TARGET_AVX512F || !TARGET_EVEX512;
+ return !TARGET_AVX512F;
}
if (mode == XFmode)
@@ -4577,6 +4636,12 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
if (max > X86_64_REGPARM_MAX)
max = X86_64_REGPARM_MAX;
+ const int *parm_regs;
+ if (cum->preserve_none_abi)
+ parm_regs = x86_64_preserve_none_int_parameter_registers;
+ else
+ parm_regs = x86_64_int_parameter_registers;
+
for (i = cum->regno; i < max; i++)
{
mem = gen_rtx_MEM (word_mode,
@@ -4584,8 +4649,7 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
MEM_NOTRAP_P (mem) = 1;
set_mem_alias_set (mem, set);
emit_move_insn (mem,
- gen_rtx_REG (word_mode,
- x86_64_int_parameter_registers[i]));
+ gen_rtx_REG (word_mode, parm_regs[i]));
}
if (ix86_varargs_fpr_size)
@@ -4739,8 +4803,7 @@ ix86_va_start (tree valist, rtx nextarg)
start_sequence ();
emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
- seq = get_insns ();
- end_sequence ();
+ seq = end_sequence ();
push_topmost_sequence ();
emit_insn_after (seq, entry_of_function ());
@@ -5180,6 +5243,27 @@ ix86_check_movabs (rtx insn, int opnum)
return volatile_ok || !MEM_VOLATILE_P (mem);
}
+/* Return true if the SET at XVECEXP index IDX of INSN satisfies the
+ MOVS operand constraints. */
+bool
+ix86_check_movs (rtx insn, int idx)
+{
+ rtx pat = PATTERN (insn);
+ gcc_assert (GET_CODE (pat) == PARALLEL);
+
+ rtx set = XVECEXP (pat, 0, idx);
+ gcc_assert (GET_CODE (set) == SET);
+
+ rtx dst = SET_DEST (set);
+ gcc_assert (MEM_P (dst));
+
+ rtx src = SET_SRC (set);
+ gcc_assert (MEM_P (src));
+
+ return (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst))
+ && (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src))
+ || Pmode == word_mode));
+}
+
/* Return false if INSN contains a MEM with a non-default address space. */
bool
ix86_check_no_addr_space (rtx insn)
@@ -5356,7 +5440,7 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode)
switch (GET_MODE_SIZE (mode))
{
case 64:
- if (TARGET_AVX512F && TARGET_EVEX512)
+ if (TARGET_AVX512F)
return 2;
break;
case 32:
@@ -5409,10 +5493,8 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
{
if (TARGET_AVX512VL)
return "vpxord\t%x0, %x0, %x0";
- else if (TARGET_EVEX512)
- return "vpxord\t%g0, %g0, %g0";
else
- gcc_unreachable ();
+ return "vpxord\t%g0, %g0, %g0";
}
return "vpxor\t%x0, %x0, %x0";
@@ -5428,19 +5510,15 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
{
if (TARGET_AVX512VL)
return "vxorpd\t%x0, %x0, %x0";
- else if (TARGET_EVEX512)
- return "vxorpd\t%g0, %g0, %g0";
else
- gcc_unreachable ();
+ return "vxorpd\t%g0, %g0, %g0";
}
else
{
if (TARGET_AVX512VL)
return "vpxorq\t%x0, %x0, %x0";
- else if (TARGET_EVEX512)
- return "vpxorq\t%g0, %g0, %g0";
else
- gcc_unreachable ();
+ return "vpxorq\t%g0, %g0, %g0";
}
}
return "vxorpd\t%x0, %x0, %x0";
@@ -5457,19 +5535,15 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
{
if (TARGET_AVX512VL)
return "vxorps\t%x0, %x0, %x0";
- else if (TARGET_EVEX512)
- return "vxorps\t%g0, %g0, %g0";
else
- gcc_unreachable ();
+ return "vxorps\t%g0, %g0, %g0";
}
else
{
if (TARGET_AVX512VL)
return "vpxord\t%x0, %x0, %x0";
- else if (TARGET_EVEX512)
- return "vpxord\t%g0, %g0, %g0";
else
- gcc_unreachable ();
+ return "vpxord\t%g0, %g0, %g0";
}
}
return "vxorps\t%x0, %x0, %x0";
@@ -5490,7 +5564,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
case MODE_XI:
case MODE_V8DF:
case MODE_V16SF:
- gcc_assert (TARGET_AVX512F && TARGET_EVEX512);
+ gcc_assert (TARGET_AVX512F);
return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
case MODE_OI:
@@ -5506,10 +5580,8 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
{
if (TARGET_AVX512VL)
return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
- else if (TARGET_EVEX512)
- return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
else
- gcc_unreachable ();
+ return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
}
return (TARGET_AVX
? "vpcmpeqd\t%0, %0, %0"
@@ -5523,7 +5595,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
{
if (GET_MODE_SIZE (mode) == 64)
{
- gcc_assert (TARGET_AVX512F && TARGET_EVEX512);
+ gcc_assert (TARGET_AVX512F);
return "vpcmpeqd\t%t0, %t0, %t0";
}
else if (GET_MODE_SIZE (mode) == 32)
@@ -5535,7 +5607,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
}
else if (vector_all_ones_zero_extend_quarter_operand (x, mode))
{
- gcc_assert (TARGET_AVX512F && TARGET_EVEX512);
+ gcc_assert (TARGET_AVX512F);
return "vpcmpeqd\t%x0, %x0, %x0";
}
@@ -5646,8 +5718,6 @@ ix86_get_ssemov (rtx *operands, unsigned size,
|| memory_operand (operands[1], mode))
gcc_unreachable ();
size = 64;
- /* We need TARGET_EVEX512 to move into zmm register. */
- gcc_assert (TARGET_EVEX512);
switch (type)
{
case opcode_int:
@@ -5686,7 +5756,7 @@ ix86_get_ssemov (rtx *operands, unsigned size,
: "%vmovaps");
else
opcode = (misaligned_p
- ? (TARGET_AVX512BW
+ ? (TARGET_AVX512BW && evex_reg_p
? "vmovdqu16"
: "%vmovdqu")
: "%vmovdqa");
@@ -5728,7 +5798,7 @@ ix86_get_ssemov (rtx *operands, unsigned size,
: "%vmovaps");
else
opcode = (misaligned_p
- ? (TARGET_AVX512BW
+ ? (TARGET_AVX512BW && evex_reg_p
? "vmovdqu8"
: "%vmovdqu")
: "%vmovdqa");
@@ -5748,7 +5818,7 @@ ix86_get_ssemov (rtx *operands, unsigned size,
: "%vmovaps");
else
opcode = (misaligned_p
- ? (TARGET_AVX512BW
+ ? (TARGET_AVX512BW && evex_reg_p
? "vmovdqu16"
: "%vmovdqu")
: "%vmovdqa");
@@ -6456,7 +6526,7 @@ output_set_got (rtx dest, rtx label)
xops[0] = dest;
- if (TARGET_VXWORKS_RTP && flag_pic)
+ if (TARGET_VXWORKS_GOTTPIC && TARGET_VXWORKS_RTP && flag_pic)
{
/* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
xops[2] = gen_rtx_MEM (Pmode,
@@ -6701,9 +6771,7 @@ ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
|| !frame_pointer_needed));
case TYPE_NO_CALLEE_SAVED_REGISTERS:
- return false;
-
- case TYPE_NO_CALLEE_SAVED_REGISTERS_EXCEPT_BP:
+ case TYPE_PRESERVE_NONE:
if (regno != HARD_FRAME_POINTER_REGNUM)
return false;
break;
@@ -6780,7 +6848,9 @@ ix86_nsaved_sseregs (void)
int nregs = 0;
int regno;
- if (!TARGET_64BIT_MS_ABI)
+ if (!TARGET_64BIT_MS_ABI
+ && (cfun->machine->call_saved_registers
+ != TYPE_NO_CALLER_SAVED_REGISTERS))
return 0;
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
@@ -6888,6 +6958,26 @@ ix86_pro_and_epilogue_can_use_push2pop2 (int nregs)
&& (nregs + aligned) >= 3;
}
+/* Check if push/pop should be used to save/restore registers. */
+static bool
+save_regs_using_push_pop (HOST_WIDE_INT to_allocate)
+{
+ return ((!to_allocate && cfun->machine->frame.nregs <= 1)
+ || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
+ /* If static stack checking is enabled and done with probes,
+ the registers need to be saved before allocating the frame. */
+ || flag_stack_check == STATIC_BUILTIN_STACK_CHECK
+ /* If stack clash probing needs a loop, then it needs a
+ scratch register. But the returned register is only guaranteed
+ to be safe to use after register saves are complete. So if
+ stack clash protections are enabled and the allocated frame is
+ larger than the probe interval, then use pushes to save
+ callee saved registers. */
+ || (flag_stack_clash_protection
+ && !ix86_target_stack_probe ()
+ && to_allocate > get_probe_interval ()));
+}
+
/* Fill structure ix86_frame about frame of currently computed function. */
static void
@@ -6968,12 +7058,18 @@ ix86_compute_frame_layout (void)
gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
gcc_assert (preferred_alignment <= stack_alignment_needed);
- /* The only ABI saving SSE regs should be 64-bit ms_abi. */
- gcc_assert (TARGET_64BIT || !frame->nsseregs);
+ /* The only ABI saving SSE regs should be 64-bit ms_abi or with
+ no_caller_saved_registers attribute. */
+ gcc_assert (TARGET_64BIT
+ || (cfun->machine->call_saved_registers
+ == TYPE_NO_CALLER_SAVED_REGISTERS)
+ || !frame->nsseregs);
if (TARGET_64BIT && m->call_ms2sysv)
{
gcc_assert (stack_alignment_needed >= 16);
- gcc_assert (!frame->nsseregs);
+ gcc_assert ((cfun->machine->call_saved_registers
+ == TYPE_NO_CALLER_SAVED_REGISTERS)
+ || !frame->nsseregs);
}
/* For SEH we have to limit the amount of code movement into the prologue.
@@ -7172,20 +7268,7 @@ ix86_compute_frame_layout (void)
/* Size prologue needs to allocate. */
to_allocate = offset - frame->sse_reg_save_offset;
- if ((!to_allocate && frame->nregs <= 1)
- || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
- /* If static stack checking is enabled and done with probes,
- the registers need to be saved before allocating the frame. */
- || flag_stack_check == STATIC_BUILTIN_STACK_CHECK
- /* If stack clash probing needs a loop, then it needs a
- scratch register. But the returned register is only guaranteed
- to be safe to use after register saves are complete. So if
- stack clash protections are enabled and the allocated frame is
- larger than the probe interval, then use pushes to save
- callee saved registers. */
- || (flag_stack_clash_protection
- && !ix86_target_stack_probe ()
- && to_allocate > get_probe_interval ()))
+ if (save_regs_using_push_pop (to_allocate))
frame->save_regs_using_mov = false;
if (ix86_using_red_zone ()
@@ -7643,7 +7726,9 @@ ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
{
- ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
+ /* Skip registers already processed by shrink-wrap separate. */
+ if (!cfun->machine->reg_is_wrapped_separately[regno])
+ ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
cfa_offset -= UNITS_PER_WORD;
}
}
@@ -7736,8 +7821,15 @@ pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
add_frame_related_expr = true;
}
- insn = emit_insn (gen_pro_epilogue_adjust_stack_add
- (Pmode, dest, src, addend));
+ /* Shrink-wrap separate may insert the prologue between TEST and JMP.
+ In order not to affect EFLAGS, emit the add without clobbering it. */
+ if (crtl->shrink_wrapped_separate)
+ insn = emit_insn (gen_pro_epilogue_adjust_stack_add_nocc
+ (Pmode, dest, src, addend));
+ else
+ insn = emit_insn (gen_pro_epilogue_adjust_stack_add
+ (Pmode, dest, src, addend));
+
if (style >= 0)
ix86_add_queued_cfa_restore_notes (insn);
@@ -7921,6 +8013,15 @@ ix86_update_stack_boundary (void)
if (ix86_tls_descriptor_calls_expanded_in_cfun
&& crtl->preferred_stack_boundary < 128)
crtl->preferred_stack_boundary = 128;
+
+ /* For 32-bit MS ABI, both the incoming and preferred stack boundaries
+ are 32 bits, but if force_align_arg_pointer is specified, it should
+ prefer 128 bits for a backward-compatibility reason, which is also
+ what the doc suggests. */
+ if (lookup_attribute ("force_align_arg_pointer",
+ TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl)))
+ && crtl->preferred_stack_boundary < 128)
+ crtl->preferred_stack_boundary = 128;
}
/* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
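A short sketch of the case the new check targets (illustrative only; the function name is made up): a 32-bit ms_abi callback, whose incoming stack may be only 4-byte aligned, asks for realignment with force_align_arg_pointer and now gets the 128-bit preferred boundary the documentation describes.

__attribute__((force_align_arg_pointer)) void
legacy_callback (void)
{
  double buf[4] = { 0 };   /* aligned SSE spills/stores are safe after realignment */
  (void) buf;
}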
@@ -7951,8 +8052,7 @@ ix86_get_drap_rtx (void)
start_sequence ();
drap_vreg = copy_to_reg (arg_ptr);
- seq = get_insns ();
- end_sequence ();
+ seq = end_sequence ();
insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
if (!optimize)
@@ -8473,6 +8573,128 @@ output_probe_stack_range (rtx reg, rtx end)
return "";
}
+/* Data passed to ix86_update_stack_alignment. */
+struct stack_access_data
+{
+ /* The stack access register. */
+ const_rtx reg;
+ /* Pointer to stack alignment. */
+ unsigned int *stack_alignment;
+};
+
+/* Update the maximum stack slot alignment from memory alignment in PAT. */
+
+static void
+ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
+{
+ /* This insn may reference a stack slot. Update the maximum stack slot
+ alignment if the memory is referenced by the stack access register. */
+ stack_access_data *p = (stack_access_data *) data;
+
+ subrtx_iterator::array_type array;
+ FOR_EACH_SUBRTX (iter, array, pat, ALL)
+ {
+ auto op = *iter;
+ if (MEM_P (op))
+ {
+ if (reg_mentioned_p (p->reg, XEXP (op, 0)))
+ {
+ unsigned int alignment = MEM_ALIGN (op);
+
+ if (alignment > *p->stack_alignment)
+ *p->stack_alignment = alignment;
+ break;
+ }
+ else
+ iter.skip_subrtxes ();
+ }
+ }
+}
+
+/* Helper function for ix86_find_all_reg_uses. */
+
+static void
+ix86_find_all_reg_uses_1 (HARD_REG_SET &regset,
+ rtx set, unsigned int regno,
+ auto_bitmap &worklist)
+{
+ rtx dest = SET_DEST (set);
+
+ if (!REG_P (dest))
+ return;
+
+ /* Reject non-Pmode modes. */
+ if (GET_MODE (dest) != Pmode)
+ return;
+
+ unsigned int dst_regno = REGNO (dest);
+
+ if (TEST_HARD_REG_BIT (regset, dst_regno))
+ return;
+
+ const_rtx src = SET_SRC (set);
+
+ subrtx_iterator::array_type array;
+ FOR_EACH_SUBRTX (iter, array, src, ALL)
+ {
+ auto op = *iter;
+
+ if (MEM_P (op))
+ iter.skip_subrtxes ();
+
+ if (REG_P (op) && REGNO (op) == regno)
+ {
+ /* Add this register to register set. */
+ add_to_hard_reg_set (&regset, Pmode, dst_regno);
+ bitmap_set_bit (worklist, dst_regno);
+ break;
+ }
+ }
+}
+
+/* Find all registers defined from register REGNO. */
+
+static void
+ix86_find_all_reg_uses (HARD_REG_SET &regset,
+ unsigned int regno, auto_bitmap &worklist)
+{
+ for (df_ref ref = DF_REG_USE_CHAIN (regno);
+ ref != NULL;
+ ref = DF_REF_NEXT_REG (ref))
+ {
+ if (DF_REF_IS_ARTIFICIAL (ref))
+ continue;
+
+ rtx_insn *insn = DF_REF_INSN (ref);
+
+ if (!NONJUMP_INSN_P (insn))
+ continue;
+
+ unsigned int ref_regno = DF_REF_REGNO (ref);
+
+ rtx set = single_set (insn);
+ if (set)
+ {
+ ix86_find_all_reg_uses_1 (regset, set,
+ ref_regno, worklist);
+ continue;
+ }
+
+ rtx pat = PATTERN (insn);
+ if (GET_CODE (pat) != PARALLEL)
+ continue;
+
+ for (int i = 0; i < XVECLEN (pat, 0); i++)
+ {
+ rtx exp = XVECEXP (pat, 0, i);
+
+ if (GET_CODE (exp) == SET)
+ ix86_find_all_reg_uses_1 (regset, exp,
+ ref_regno, worklist);
+ }
+ }
+}
+
/* Set stack_frame_required to false if stack frame isn't required.
Update STACK_ALIGNMENT to the largest alignment, in bits, of stack
slot used if stack frame is required and CHECK_STACK_SLOT is true. */
@@ -8491,10 +8713,6 @@ ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
add_to_hard_reg_set (&set_up_by_prologue, Pmode,
HARD_FRAME_POINTER_REGNUM);
- /* The preferred stack alignment is the minimum stack alignment. */
- if (stack_alignment > crtl->preferred_stack_boundary)
- stack_alignment = crtl->preferred_stack_boundary;
-
bool require_stack_frame = false;
FOR_EACH_BB_FN (bb, cfun)
@@ -8506,27 +8724,67 @@ ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
set_up_by_prologue))
{
require_stack_frame = true;
-
- if (check_stack_slot)
- {
- /* Find the maximum stack alignment. */
- subrtx_iterator::array_type array;
- FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
- if (MEM_P (*iter)
- && (reg_mentioned_p (stack_pointer_rtx,
- *iter)
- || reg_mentioned_p (frame_pointer_rtx,
- *iter)))
- {
- unsigned int alignment = MEM_ALIGN (*iter);
- if (alignment > stack_alignment)
- stack_alignment = alignment;
- }
- }
+ break;
}
}
cfun->machine->stack_frame_required = require_stack_frame;
+
+ /* Stop if we don't need to check stack slot. */
+ if (!check_stack_slot)
+ return;
+
+ /* The preferred stack alignment is the minimum stack alignment. */
+ if (stack_alignment > crtl->preferred_stack_boundary)
+ stack_alignment = crtl->preferred_stack_boundary;
+
+ HARD_REG_SET stack_slot_access;
+ CLEAR_HARD_REG_SET (stack_slot_access);
+
+ /* A stack slot can be accessed by the stack pointer, the frame pointer,
+ or registers defined from the stack pointer or the frame pointer. */
+ auto_bitmap worklist;
+
+ add_to_hard_reg_set (&stack_slot_access, Pmode, STACK_POINTER_REGNUM);
+ bitmap_set_bit (worklist, STACK_POINTER_REGNUM);
+
+ if (frame_pointer_needed)
+ {
+ add_to_hard_reg_set (&stack_slot_access, Pmode,
+ HARD_FRAME_POINTER_REGNUM);
+ bitmap_set_bit (worklist, HARD_FRAME_POINTER_REGNUM);
+ }
+
+ unsigned int regno;
+
+ do
+ {
+ regno = bitmap_clear_first_set_bit (worklist);
+ ix86_find_all_reg_uses (stack_slot_access, regno, worklist);
+ }
+ while (!bitmap_empty_p (worklist));
+
+ hard_reg_set_iterator hrsi;
+ stack_access_data data;
+
+ data.stack_alignment = &stack_alignment;
+
+ EXECUTE_IF_SET_IN_HARD_REG_SET (stack_slot_access, 0, regno, hrsi)
+ for (df_ref ref = DF_REG_USE_CHAIN (regno);
+ ref != NULL;
+ ref = DF_REF_NEXT_REG (ref))
+ {
+ if (DF_REF_IS_ARTIFICIAL (ref))
+ continue;
+
+ rtx_insn *insn = DF_REF_INSN (ref);
+
+ if (!NONJUMP_INSN_P (insn))
+ continue;
+
+ data.reg = DF_REF_REG (ref);
+ note_stores (insn, ix86_update_stack_alignment, &data);
+ }
}
/* Finalize stack_realign_needed and frame_pointer_needed flags, which
@@ -9036,11 +9294,22 @@ ix86_expand_prologue (void)
doing this if we have to probe the stack; at least on x86_64 the
stack probe can turn into a call that clobbers a red zone location. */
else if (ix86_using_red_zone ()
- && (! TARGET_STACK_PROBE
- || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
+ && (! TARGET_STACK_PROBE
+ || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
{
+ HOST_WIDE_INT allocate_offset;
+ if (crtl->shrink_wrapped_separate)
+ {
+ allocate_offset = m->fs.sp_offset - frame.stack_pointer_offset;
+
+ /* Adjust the total offset at the beginning of the function. */
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (allocate_offset), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
+ m->fs.sp_offset = cfun->machine->frame.stack_pointer_offset;
+ }
+
ix86_emit_save_regs_using_mov (frame.reg_save_offset);
- cfun->machine->red_zone_used = true;
int_registers_saved = true;
}
}
@@ -9618,30 +9887,35 @@ ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
{
- rtx reg = gen_rtx_REG (word_mode, regno);
- rtx mem;
- rtx_insn *insn;
-
- mem = choose_baseaddr (cfa_offset, NULL);
- mem = gen_frame_mem (word_mode, mem);
- insn = emit_move_insn (reg, mem);
- if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
+ /* Skip registers already processed by shrink-wrap separate. */
+ if (!cfun->machine->reg_is_wrapped_separately[regno])
{
- /* Previously we'd represented the CFA as an expression
- like *(%ebp - 8). We've just popped that value from
- the stack, which means we need to reset the CFA to
- the drap register. This will remain until we restore
- the stack pointer. */
- add_reg_note (insn, REG_CFA_DEF_CFA, reg);
- RTX_FRAME_RELATED_P (insn) = 1;
+ rtx reg = gen_rtx_REG (word_mode, regno);
+ rtx mem;
+ rtx_insn *insn;
- /* This means that the DRAP register is valid for addressing. */
- m->fs.drap_valid = true;
- }
- else
- ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
+ mem = choose_baseaddr (cfa_offset, NULL);
+ mem = gen_frame_mem (word_mode, mem);
+ insn = emit_move_insn (reg, mem);
+ if (m->fs.cfa_reg == crtl->drap_reg
+ && regno == REGNO (crtl->drap_reg))
+ {
+ /* Previously we'd represented the CFA as an expression
+ like *(%ebp - 8). We've just popped that value from
+ the stack, which means we need to reset the CFA to
+ the drap register. This will remain until we restore
+ the stack pointer. */
+ add_reg_note (insn, REG_CFA_DEF_CFA, reg);
+ RTX_FRAME_RELATED_P (insn) = 1;
+
+ /* DRAP register is valid for addressing. */
+ m->fs.drap_valid = true;
+ }
+ else
+ ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
+ }
cfa_offset -= UNITS_PER_WORD;
}
}
@@ -9920,10 +10194,11 @@ ix86_expand_epilogue (int style)
less work than reloading sp and popping the register. */
else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
restore_regs_via_mov = true;
- else if (TARGET_EPILOGUE_USING_MOVE
- && cfun->machine->use_fast_prologue_epilogue
- && (frame.nregs > 1
- || m->fs.sp_offset != reg_save_offset))
+ else if (crtl->shrink_wrapped_separate
+ || (TARGET_EPILOGUE_USING_MOVE
+ && cfun->machine->use_fast_prologue_epilogue
+ && (frame.nregs > 1
+ || m->fs.sp_offset != reg_save_offset)))
restore_regs_via_mov = true;
else if (frame_pointer_needed
&& !frame.nregs
@@ -9937,6 +10212,9 @@ ix86_expand_epilogue (int style)
else
restore_regs_via_mov = false;
+ if (crtl->shrink_wrapped_separate)
+ gcc_assert (restore_regs_via_mov);
+
if (restore_regs_via_mov || frame.nsseregs)
{
/* Ensure that the entire register save area is addressable via
@@ -9989,6 +10267,7 @@ ix86_expand_epilogue (int style)
gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
gcc_assert (!crtl->drap_reg);
gcc_assert (!frame.nregs);
+ gcc_assert (!crtl->shrink_wrapped_separate);
}
else if (restore_regs_via_mov)
{
@@ -10003,6 +10282,8 @@ ix86_expand_epilogue (int style)
rtx sa = EH_RETURN_STACKADJ_RTX;
rtx_insn *insn;
+ gcc_assert (!crtl->shrink_wrapped_separate);
+
/* Stack realignment doesn't work with eh_return. */
if (crtl->stack_realign_needed)
sorry ("Stack realignment not supported with "
@@ -11184,6 +11465,9 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
x = XVECEXP (x, 0, 0);
return (GET_CODE (x) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
+ case UNSPEC_SECREL32:
+ x = XVECEXP (x, 0, 0);
+ return GET_CODE (x) == SYMBOL_REF;
default:
return false;
}
@@ -11231,7 +11515,7 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
case E_OImode:
case E_XImode:
if (!standard_sse_constant_p (x, mode)
- && GET_MODE_SIZE (TARGET_AVX512F && TARGET_EVEX512
+ && GET_MODE_SIZE (TARGET_AVX512F
? XImode
: (TARGET_AVX
? OImode
@@ -11320,6 +11604,9 @@ legitimate_pic_operand_p (rtx x)
x = XVECEXP (inner, 0, 0);
return (GET_CODE (x) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
+ case UNSPEC_SECREL32:
+ x = XVECEXP (inner, 0, 0);
+ return GET_CODE (x) == SYMBOL_REF;
case UNSPEC_MACHOPIC_OFFSET:
return legitimate_pic_address_disp_p (x);
default:
@@ -11500,6 +11787,9 @@ legitimate_pic_address_disp_p (rtx disp)
disp = XVECEXP (disp, 0, 0);
return (GET_CODE (disp) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
+ case UNSPEC_SECREL32:
+ disp = XVECEXP (disp, 0, 0);
+ return GET_CODE (disp) == SYMBOL_REF;
}
return false;
@@ -11777,6 +12067,7 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict,
case UNSPEC_INDNTPOFF:
case UNSPEC_NTPOFF:
case UNSPEC_DTPOFF:
+ case UNSPEC_SECREL32:
break;
default:
@@ -11802,7 +12093,8 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict,
|| GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
|| !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
|| (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
- && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
+ && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF
+ && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_SECREL32))
/* Non-constant pic memory reference. */
return false;
}
@@ -11953,7 +12245,7 @@ legitimize_pic_address (rtx orig, rtx reg)
else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
/* We can't always use @GOTOFF for text labels
on VxWorks, see gotoff_operand. */
- || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
+ || (TARGET_VXWORKS_VAROFF && GET_CODE (addr) == LABEL_REF))
{
#if TARGET_PECOFF
rtx tmp = legitimize_pe_coff_symbol (addr, true);
@@ -12126,6 +12418,24 @@ get_thread_pointer (machine_mode tp_mode, bool to_reg)
return tp;
}
+/* Construct the SYMBOL_REF for the _tls_index symbol. */
+
+static GTY(()) rtx ix86_tls_index_symbol;
+
+#if TARGET_WIN32_TLS
+static rtx
+ix86_tls_index (void)
+{
+ if (!ix86_tls_index_symbol)
+ ix86_tls_index_symbol = gen_rtx_SYMBOL_REF (SImode, "_tls_index");
+
+ if (flag_pic)
+ return gen_rtx_CONST (Pmode, gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_index_symbol), UNSPEC_PCREL));
+ else
+ return ix86_tls_index_symbol;
+}
+#endif
+
/* Construct the SYMBOL_REF for the tls_get_addr function. */
static GTY(()) rtx ix86_tls_symbol;
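For orientation, the TARGET_WIN32_TLS lowering added above turns an ordinary TLS access into: load _tls_index, fetch the thread's TLS-array pointer from the TEB slot (gs:[88] on 64-bit, fs:[44] on 32-bit), index it by UNITS_PER_WORD, and add the variable's @secrel32 offset. A variable it would apply to (illustrative sketch, not part of the patch):

static __thread int counter;

int
bump (void)
{
  return ++counter;   /* expands via _tls_index / @secrel32 as described above */
}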
@@ -12184,6 +12494,26 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
machine_mode tp_mode = Pmode;
int type;
+#if TARGET_WIN32_TLS
+ off = gen_const_mem (SImode, ix86_tls_index ());
+ set_mem_alias_set (off, GOT_ALIAS_SET);
+
+ tp = gen_const_mem (Pmode, GEN_INT (TARGET_64BIT ? 88 : 44));
+ set_mem_addr_space (tp, DEFAULT_TLS_SEG_REG);
+
+ if (TARGET_64BIT)
+ off = convert_to_mode (Pmode, off, 1);
+
+ base = force_reg (Pmode, off);
+ tp = copy_to_mode_reg (Pmode, tp);
+
+ tp = gen_const_mem (Pmode, gen_rtx_PLUS (Pmode, tp, gen_rtx_MULT (Pmode, base, GEN_INT (UNITS_PER_WORD))));
+ set_mem_alias_set (tp, GOT_ALIAS_SET);
+
+ base = force_reg (Pmode, tp);
+
+ return gen_rtx_PLUS (Pmode, base, gen_rtx_CONST (Pmode, gen_rtx_UNSPEC (SImode, gen_rtvec (1, x), UNSPEC_SECREL32)));
+#else
/* Fall back to global dynamic model if tool chain cannot support local
dynamic. */
if (TARGET_SUN_TLS && !TARGET_64BIT
@@ -12232,13 +12562,13 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
if (TARGET_64BIT)
{
rtx rax = gen_rtx_REG (Pmode, AX_REG);
+ rtx rdi = gen_rtx_REG (Pmode, DI_REG);
rtx_insn *insns;
start_sequence ();
emit_call_insn
- (gen_tls_global_dynamic_64 (Pmode, rax, x, caddr));
- insns = get_insns ();
- end_sequence ();
+ (gen_tls_global_dynamic_64 (Pmode, rax, x, caddr, rdi));
+ insns = end_sequence ();
if (GET_MODE (x) != Pmode)
x = gen_rtx_ZERO_EXTEND (Pmode, x);
@@ -12286,14 +12616,14 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
if (TARGET_64BIT)
{
rtx rax = gen_rtx_REG (Pmode, AX_REG);
+ rtx rdi = gen_rtx_REG (Pmode, DI_REG);
rtx_insn *insns;
rtx eqv;
start_sequence ();
emit_call_insn
- (gen_tls_local_dynamic_base_64 (Pmode, rax, caddr));
- insns = get_insns ();
- end_sequence ();
+ (gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi));
+ insns = end_sequence ();
/* Attach a unique REG_EQUAL, to allow the RTL optimizers to
share the LD_BASE result with other LD model accesses. */
@@ -12406,6 +12736,7 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
}
return dest;
+#endif
}
/* Return true if the TLS address requires insn using integer registers.
@@ -12875,6 +13206,9 @@ output_pic_addr_const (FILE *file, rtx x, int code)
case UNSPEC_INDNTPOFF:
fputs ("@indntpoff", file);
break;
+ case UNSPEC_SECREL32:
+ fputs ("@secrel32", file);
+ break;
#if TARGET_MACHO
case UNSPEC_MACHOPIC_OFFSET:
putc ('-', file);
@@ -12900,7 +13234,11 @@ i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
{
fputs (ASM_LONG, file);
output_addr_const (file, x);
+#if TARGET_WIN32_TLS
+ fputs ("@secrel32", file);
+#else
fputs ("@dtpoff", file);
+#endif
switch (size)
{
case 4:
@@ -13134,7 +13472,7 @@ ix86_delegitimize_address_1 (rtx x, bool base_term_p)
else if (base_term_p
&& pic_offset_table_rtx
&& !TARGET_MACHO
- && !TARGET_VXWORKS_RTP)
+ && !TARGET_VXWORKS_VAROFF)
{
rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
@@ -13559,10 +13897,11 @@ print_reg (rtx x, int code, FILE *file)
H -- print a memory address offset by 8; used for sse high-parts
Y -- print condition for XOP pcom* instruction.
V -- print naked full integer register name without %.
+ v -- print segment override prefix
+ -- print a branch hint as 'cs' or 'ds' prefix
; -- print a semicolon (after prefixes due to bug in older gas).
~ -- print "i" if TARGET_AVX2, "f" otherwise.
- ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
+ ^ -- print addr32 prefix if Pmode != word_mode
M -- print addr32 prefix for TARGET_X32 with VSIB address.
! -- print NOTRACK prefix for jxx/call/ret instructions if required.
N -- print maskz if it's constant 0 operand.
@@ -14064,6 +14403,28 @@ ix86_print_operand (FILE *file, rtx x, int code)
return;
+ case 'v':
+ if (MEM_P (x))
+ {
+ switch (MEM_ADDR_SPACE (x))
+ {
+ case ADDR_SPACE_GENERIC:
+ break;
+ case ADDR_SPACE_SEG_FS:
+ fputs ("fs ", file);
+ break;
+ case ADDR_SPACE_SEG_GS:
+ fputs ("gs ", file);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ }
+ else
+ output_operand_lossage ("operand is not a memory reference, "
+ "invalid operand code 'v'");
+ return;
+
case '*':
if (ASSEMBLER_DIALECT == ASM_ATT)
putc ('*', file);
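A sketch of where the new 'v' operand modifier matters (illustrative; whether a given copy is actually expanded to movs with a segment prefix depends on target flags and optimization level): accesses through the x86 named address spaces carry ADDR_SPACE_SEG_FS or ADDR_SPACE_SEG_GS, and 'v' prints the matching "fs " or "gs " prefix in the insn template.

typedef struct { char buf[64]; } blob;

void
copy_from_gs (blob *dst, const __seg_gs blob *src)
{
  *dst = *src;   /* may become rep movs with a gs segment prefix */
}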
@@ -14138,7 +14499,7 @@ ix86_print_operand (FILE *file, rtx x, int code)
return;
case '^':
- if (TARGET_64BIT && Pmode != word_mode)
+ if (Pmode != word_mode)
fputs ("addr32 ", file);
return;
@@ -14653,6 +15014,10 @@ i386_asm_output_addr_const_extra (FILE *file, rtx x)
output_addr_const (file, op);
fputs ("@indntpoff", file);
break;
+ case UNSPEC_SECREL32:
+ output_addr_const (file, op);
+ fputs ("@secrel32", file);
+ break;
#if TARGET_MACHO
case UNSPEC_MACHOPIC_OFFSET:
output_addr_const (file, op);
@@ -15507,7 +15872,7 @@ ix86_output_addr_diff_elt (FILE *file, int value, int rel)
gcc_assert (!TARGET_64BIT);
#endif
/* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
- if (TARGET_64BIT || TARGET_VXWORKS_RTP)
+ if (TARGET_64BIT || TARGET_VXWORKS_VAROFF)
fprintf (file, "%s%s%d-%s%d\n",
directive, LPREFIX, value, LPREFIX, rel);
#if TARGET_MACHO
@@ -17905,9 +18270,14 @@ ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
if (cum->decl && !TREE_PUBLIC (cum->decl))
return;
- const_tree ctx = get_ultimate_context (cum->decl);
- if (ctx != NULL_TREE
- && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
+ tree decl = cum->decl;
+ if (!decl)
+ /* If we don't know the target, look at the current TU. */
+ decl = current_function_decl;
+
+ const_tree ctx = get_ultimate_context (decl);
+ if (ctx == NULL_TREE
+ || !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
return;
/* If the actual size of the type is zero, then there is no change
@@ -20044,14 +20414,10 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
{
bool si;
enum ix86_builtins code;
- const machine_mode mode = TYPE_MODE (TREE_TYPE (vectype));
if (!TARGET_AVX512F)
return NULL_TREE;
- if (!TARGET_EVEX512 && GET_MODE_SIZE (mode) == 64)
- return NULL_TREE;
-
if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 2u)
? !TARGET_USE_SCATTER_2PARTS
: (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
@@ -20794,7 +21160,11 @@ ix86_can_change_mode_class (machine_mode from, machine_mode to,
return true;
/* x87 registers can't do subreg at all, as all values are reformatted
- to extended precision. */
+ to extended precision.
+
+ ??? middle-end queries mode changes for ALL_REGS and this makes
+ vec_series_lowpart_p always return false. We probably should
+ restrict this to modes supported by i387 and check if it is enabled. */
if (MAYBE_FLOAT_CLASS_P (regclass))
return false;
@@ -21169,7 +21539,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
- any of 512-bit wide vector mode
- any scalar mode. */
if (TARGET_AVX512F
- && ((VALID_AVX512F_REG_OR_XI_MODE (mode) && TARGET_EVEX512)
+ && ((VALID_AVX512F_REG_OR_XI_MODE (mode))
|| VALID_AVX512F_SCALAR_MODE (mode)))
return true;
@@ -21340,19 +21710,20 @@ ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
return mode1 == SFmode;
/* If MODE2 is only appropriate for an SSE register, then tie with
- any other mode acceptable to SSE registers. */
- if (GET_MODE_SIZE (mode2) == 64
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 64
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
- if (GET_MODE_SIZE (mode2) == 32
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 32
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
- if (GET_MODE_SIZE (mode2) == 16
+ any vector modes or scalar floating point modes acceptable to SSE
+ registers, excluding scalar integer modes with SUBREG:
+ (subreg:QI (reg:TI 99) 0)
+ (subreg:HI (reg:TI 99) 0)
+ (subreg:SI (reg:TI 99) 0)
+ (subreg:DI (reg:TI 99) 0)
+ to avoid an unnecessary move from an SSE register to an integer register.
+ */
+ if (GET_MODE_SIZE (mode2) >= 16
+ && (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2)
+ || ((VECTOR_MODE_P (mode1) || SCALAR_FLOAT_MODE_P (mode1))
+ && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)))
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 16
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
+ return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1);
/* If MODE2 is appropriate for an MMX register, then tie
with any other mode acceptable to MMX registers. */
@@ -21410,7 +21781,7 @@ ix86_set_reg_reg_cost (machine_mode mode)
case MODE_VECTOR_INT:
case MODE_VECTOR_FLOAT:
- if ((TARGET_AVX512F && TARGET_EVEX512 && VALID_AVX512F_REG_MODE (mode))
+ if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
|| (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
|| (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
|| (TARGET_SSE && VALID_SSE_REG_MODE (mode))
@@ -21471,7 +21842,7 @@ ix86_widen_mult_cost (const struct processor_costs *cost,
/* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
require extra 4 mul, 4 add, 4 cmp and 2 shift. */
if (!TARGET_SSE4_1 && !uns_p)
- extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
+ extra_cost = (cost->mulss + cost->sse_op + cost->sse_op) * 4
+ cost->sse_op * 2;
/* Fallthru. */
case V4DImode:
@@ -21521,11 +21892,11 @@ ix86_multiplication_cost (const struct processor_costs *cost,
else if (TARGET_AVX2)
nops += 2;
else if (TARGET_XOP)
- extra += cost->sse_load[2];
+ extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
else
{
nops += 1;
- extra += cost->sse_load[2];
+ extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
}
goto do_qimode;
@@ -21544,13 +21915,13 @@ ix86_multiplication_cost (const struct processor_costs *cost,
{
nmults += 1;
nops += 2;
- extra += cost->sse_load[2];
+ extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
}
else
{
nmults += 1;
nops += 4;
- extra += cost->sse_load[2];
+ extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
}
goto do_qimode;
@@ -21563,14 +21934,16 @@ ix86_multiplication_cost (const struct processor_costs *cost,
{
nmults += 1;
nops += 4;
- extra += cost->sse_load[3] * 2;
+ /* 2 loads, so no division by 2. */
+ extra += COSTS_N_INSNS (cost->sse_load[3]);
}
goto do_qimode;
case V64QImode:
nmults = 2;
nops = 9;
- extra = cost->sse_load[3] * 2 + cost->sse_load[4] * 2;
+ /* 2 loads of each size, so no division by 2. */
+ extra = COSTS_N_INSNS (cost->sse_load[3] + cost->sse_load[4]);
do_qimode:
return ix86_vec_cost (mode, cost->mulss * nmults
@@ -21663,7 +22036,7 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
/* Use vpbroadcast. */
extra = cost->sse_op;
else
- extra = cost->sse_load[2];
+ extra = COSTS_N_INSNS (cost->sse_load[2]) / 2;
if (constant_op1)
{
@@ -21694,7 +22067,7 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
shift with one insn set the cost to prefer paddb. */
if (constant_op1)
{
- extra = cost->sse_load[2];
+ extra = COSTS_N_INSNS (cost->sse_load[2]) / 2;
return ix86_vec_cost (mode, cost->sse_op) + extra;
}
else
@@ -21709,7 +22082,9 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
/* Use vpbroadcast. */
extra = cost->sse_op;
else
- extra = (mode == V16QImode) ? cost->sse_load[2] : cost->sse_load[3];
+ extra = COSTS_N_INSNS (mode == V16QImode
+ ? cost->sse_load[2]
+ : cost->sse_load[3]) / 2;
if (constant_op1)
{
@@ -21836,6 +22211,15 @@ vec_fp_conversion_cost (const struct processor_costs *cost, int size)
return cost->vcvtps2pd512;
}
+/* Return true if X is UNSPEC with UNSPEC_PCMP or UNSPEC_UNSIGNED_PCMP. */
+
+static bool
+unspec_pcmp_p (rtx x)
+{
+ return GET_CODE (x) == UNSPEC
+ && (XINT (x, 1) == UNSPEC_PCMP || XINT (x, 1) == UNSPEC_UNSIGNED_PCMP);
+}
+
/* Compute a (partial) cost for rtx X. Return true if the complete
cost has been computed, and false if subexpressions should be
scanned. In either case, *TOTAL contains the cost result. */
@@ -21853,9 +22237,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
/* Handling different vternlog variants. */
if ((GET_MODE_SIZE (mode) == 64
- ? (TARGET_AVX512F && TARGET_EVEX512)
+ ? TARGET_AVX512F
: (TARGET_AVX512VL
- || (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)))
+ || (TARGET_AVX512F && !TARGET_PREFER_AVX256)))
&& GET_MODE_SIZE (mode) >= 16
&& outer_code_i == SET
&& ternlog_operand (x, mode))
@@ -22204,8 +22588,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
{
/* (ior (not ...) ...) can be a single insn in AVX512. */
if (GET_CODE (XEXP (x, 0)) == NOT && TARGET_AVX512F
- && ((TARGET_EVEX512
- && GET_MODE_SIZE (mode) == 64)
+ && (GET_MODE_SIZE (mode) == 64
|| (TARGET_AVX512VL
&& (GET_MODE_SIZE (mode) == 32
|| GET_MODE_SIZE (mode) == 16))))
@@ -22296,8 +22679,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
/* (and (not ...) (not ...)) can be a single insn in AVX512. */
if (GET_CODE (right) == NOT && TARGET_AVX512F
- && ((TARGET_EVEX512
- && GET_MODE_SIZE (mode) == 64)
+ && (GET_MODE_SIZE (mode) == 64
|| (TARGET_AVX512VL
&& (GET_MODE_SIZE (mode) == 32
|| GET_MODE_SIZE (mode) == 16))))
@@ -22367,8 +22749,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
{
/* (not (xor ...)) can be a single insn in AVX512. */
if (GET_CODE (XEXP (x, 0)) == XOR && TARGET_AVX512F
- && ((TARGET_EVEX512
- && GET_MODE_SIZE (mode) == 64)
+ && (GET_MODE_SIZE (mode) == 64
|| (TARGET_AVX512VL
&& (GET_MODE_SIZE (mode) == 32
|| GET_MODE_SIZE (mode) == 16))))
@@ -22512,6 +22893,27 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
else
*total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
return false;
+ case FLOAT:
+ case UNSIGNED_FLOAT:
+ if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ /* TODO: We do not have cost tables for x87. */
+ *total = cost->fadd;
+ else if (VECTOR_MODE_P (mode))
+ *total = ix86_vec_cost (mode, cost->cvtpi2ps);
+ else
+ *total = cost->cvtsi2ss;
+ return false;
+
+ case FIX:
+ case UNSIGNED_FIX:
+ if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ /* TODO: We do not have cost tables for x87. */
+ *total = cost->fadd;
+ else if (VECTOR_MODE_P (mode))
+ *total = ix86_vec_cost (mode, cost->cvtps2pi);
+ else
+ *total = cost->cvtss2si;
+ return false;
case ABS:
/* SSE requires memory load for the constant operand. It may make
@@ -22571,13 +22973,41 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
}
return false;
- case VEC_SELECT:
case VEC_CONCAT:
/* ??? Assume all of these vector manipulation patterns are
recognizable. In which case they all pretty much have the
- same cost. */
+ same cost.
+ ??? We should still recurse when computing the cost. */
*total = cost->sse_op;
return true;
+
+ case VEC_SELECT:
+ /* Special case extracting lower part from the vector.
+ This by itself needs no code, and most SSE/AVX instructions have
+ packed and single forms where the single form may be represented
+ by such VEC_SELECT.
+
+ Use cost 1 (despite the fact that functionally equivalent SUBREG has
+ cost 0). Making VEC_SELECT completely free, for example instructs CSE
+ to forward propagate VEC_SELECT into
+
+ (set (reg eax) (reg src))
+
+ which then prevents fwprop and combining. See e.g.
+ gcc.target/i386/pr91103-1.c.
+
+ ??? rtvec_series_p test should be, for valid patterns, equivalent to
+ vec_series_lowpart_p but is not, since the latter calls
+ can_change_mode_class on ALL_REGS and this returns false since x87 does
+ not support subregs at all. */
+ if (rtvec_series_p (XVEC (XEXP (x, 1), 0), 0))
+ *total = rtx_cost (XEXP (x, 0), GET_MODE (XEXP (x, 0)),
+ outer_code, opno, speed) + 1;
+ else
+ /* ??? We should still recurse when computing the cost. */
+ *total = cost->sse_op;
+ return true;
+
case VEC_DUPLICATE:
*total = rtx_cost (XEXP (x, 0),
GET_MODE (XEXP (x, 0)),
@@ -22590,13 +23020,87 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
case VEC_MERGE:
mask = XEXP (x, 2);
+ /* Scalar versions of SSE instructions may be represented as:
+
+ (vec_merge (vec_duplicate (operation ....))
+ (register or memory)
+ (const_int 1))
+
+ In this case vec_merge and vec_duplicate are free.
+ Just recurse into operation and second operand. */
+ if (mask == const1_rtx
+ && GET_CODE (XEXP (x, 0)) == VEC_DUPLICATE)
+ {
+ *total = rtx_cost (XEXP (XEXP (x, 0), 0), mode,
+ outer_code, opno, speed)
+ + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
+ return true;
+ }
/* This is masked instruction, assume the same cost,
as nonmasked variant. */
- if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
- *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
+ else if (TARGET_AVX512F
+ && (register_operand (mask, GET_MODE (mask))
+ /* Redundant cleanup of high bits for kmask with VL=2/4,
+ i.e. (vec_merge op0, op1, (and op3 15)). */
+ || (GET_CODE (mask) == AND
+ && register_operand (XEXP (mask, 0), GET_MODE (mask))
+ && CONST_INT_P (XEXP (mask, 1))
+ && ((INTVAL (XEXP (mask, 1)) == 3
+ && GET_MODE_NUNITS (mode) == 2)
+ || (INTVAL (XEXP (mask, 1)) == 15
+ && GET_MODE_NUNITS (mode) == 4)))))
+ {
+ *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
+ + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
+ return true;
+ }
+ /* Combination of the two above:
+
+ (vec_merge (vec_merge (vec_duplicate (operation ...))
+ (register or memory)
+ (reg:QI mask))
+ (register or memory)
+ (const_int 1))
+
+ i.e. avx512fp16_vcvtss2sh_mask. */
+ else if (TARGET_AVX512F
+ && mask == const1_rtx
+ && GET_CODE (XEXP (x, 0)) == VEC_MERGE
+ && GET_CODE (XEXP (XEXP (x, 0), 0)) == VEC_DUPLICATE
+ && register_operand (XEXP (XEXP (x, 0), 2),
+ GET_MODE (XEXP (XEXP (x, 0), 2))))
+ {
+ *total = rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
+ mode, outer_code, opno, speed)
+ + rtx_cost (XEXP (XEXP (x, 0), 1),
+ mode, outer_code, opno, speed)
+ + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
+ return true;
+ }
+ /* vcmp. */
+ else if (unspec_pcmp_p (mask)
+ || (GET_CODE (mask) == NOT
+ && unspec_pcmp_p (XEXP (mask, 0))))
+ {
+ rtx uns = GET_CODE (mask) == NOT ? XEXP (mask, 0) : mask;
+ rtx unsop0 = XVECEXP (uns, 0, 0);
+ /* Make (subreg:V4SI (not:V16QI (reg:V16QI ..)) 0)
+ cost the same as register.
+ This is used by avx_cmp<mode>3_ltint_not. */
+ if (GET_CODE (unsop0) == SUBREG)
+ unsop0 = XEXP (unsop0, 0);
+ if (GET_CODE (unsop0) == NOT)
+ unsop0 = XEXP (unsop0, 0);
+ *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
+ + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
+ + rtx_cost (unsop0, mode, UNSPEC, opno, speed)
+ + rtx_cost (XVECEXP (uns, 0, 1), mode, UNSPEC, opno, speed)
+ + cost->sse_op;
+ return true;
+ }
else
*total = cost->sse_op;
- return true;
+ return false;
case MEM:
/* CONST_VECTOR_DUPLICATE_P in constant_pool is just broadcast.
@@ -22613,7 +23117,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
}
/* An insn that accesses memory is slightly more expensive
- than one that does not. */
+ than one that does not. */
if (speed)
{
*total += 1;
@@ -22854,7 +23358,9 @@ x86_this_parameter (tree function)
{
const int *parm_regs;
- if (ix86_function_type_abi (type) == MS_ABI)
+ if (lookup_attribute ("preserve_none", TYPE_ATTRIBUTES (type)))
+ parm_regs = x86_64_preserve_none_int_parameter_registers;
+ else if (ix86_function_type_abi (type) == MS_ABI)
parm_regs = x86_64_ms_abi_int_parameter_registers;
else
parm_regs = x86_64_int_parameter_registers;
@@ -23180,19 +23686,21 @@ x86_field_alignment (tree type, int computed)
/* Print call to TARGET to FILE. */
static void
-x86_print_call_or_nop (FILE *file, const char *target)
+x86_print_call_or_nop (FILE *file, const char *target,
+ const char *label)
{
if (flag_nop_mcount || !strcmp (target, "nop"))
/* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
- fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
+ fprintf (file, "%s" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n",
+ label);
else if (!TARGET_PECOFF && flag_pic)
{
gcc_assert (flag_plt);
- fprintf (file, "1:\tcall\t%s@PLT\n", target);
+ fprintf (file, "%s\tcall\t%s@PLT\n", label, target);
}
else
- fprintf (file, "1:\tcall\t%s\n", target);
+ fprintf (file, "%s\tcall\t%s\n", label, target);
}
static bool
@@ -23277,6 +23785,13 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
const char *mcount_name = MCOUNT_NAME;
+ bool fentry_section_p
+ = (flag_record_mcount
+ || lookup_attribute ("fentry_section",
+ DECL_ATTRIBUTES (current_function_decl)));
+
+ const char *label = fentry_section_p ? "1:" : "";
+
if (current_fentry_name (&mcount_name))
;
else if (fentry_name)
@@ -23312,11 +23827,12 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
reg = legacy_reg;
}
if (ASSEMBLER_DIALECT == ASM_INTEL)
- fprintf (file, "1:\tmovabs\t%s, OFFSET FLAT:%s\n"
- "\tcall\t%s\n", reg, mcount_name, reg);
+ fprintf (file, "%s\tmovabs\t%s, OFFSET FLAT:%s\n"
+ "\tcall\t%s\n", label, reg, mcount_name,
+ reg);
else
- fprintf (file, "1:\tmovabsq\t$%s, %%%s\n\tcall\t*%%%s\n",
- mcount_name, reg, reg);
+ fprintf (file, "%s\tmovabsq\t$%s, %%%s\n\tcall\t*%%%s\n",
+ label, mcount_name, reg, reg);
break;
case CM_LARGE_PIC:
#ifdef NO_PROFILE_COUNTERS
@@ -23357,21 +23873,21 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
if (!flag_plt)
{
if (ASSEMBLER_DIALECT == ASM_INTEL)
- fprintf (file, "1:\tcall\t[QWORD PTR %s@GOTPCREL[rip]]\n",
- mcount_name);
+ fprintf (file, "%s\tcall\t[QWORD PTR %s@GOTPCREL[rip]]\n",
+ label, mcount_name);
else
- fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n",
- mcount_name);
+ fprintf (file, "%s\tcall\t*%s@GOTPCREL(%%rip)\n",
+ label, mcount_name);
break;
}
/* fall through */
default:
- x86_print_call_or_nop (file, mcount_name);
+ x86_print_call_or_nop (file, mcount_name, label);
break;
}
}
else
- x86_print_call_or_nop (file, mcount_name);
+ x86_print_call_or_nop (file, mcount_name, label);
}
else if (flag_pic)
{
@@ -23386,11 +23902,13 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
LPREFIX, labelno);
#endif
if (flag_plt)
- x86_print_call_or_nop (file, mcount_name);
+ x86_print_call_or_nop (file, mcount_name, label);
else if (ASSEMBLER_DIALECT == ASM_INTEL)
- fprintf (file, "1:\tcall\t[DWORD PTR %s@GOT[ebx]]\n", mcount_name);
+ fprintf (file, "%s\tcall\t[DWORD PTR %s@GOT[ebx]]\n",
+ label, mcount_name);
else
- fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
+ fprintf (file, "%s\tcall\t*%s@GOT(%%ebx)\n",
+ label, mcount_name);
}
else
{
@@ -23403,12 +23921,10 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
fprintf (file, "\tmovl\t$%sP%d, %%" PROFILE_COUNT_REGISTER "\n",
LPREFIX, labelno);
#endif
- x86_print_call_or_nop (file, mcount_name);
+ x86_print_call_or_nop (file, mcount_name, label);
}
- if (flag_record_mcount
- || lookup_attribute ("fentry_section",
- DECL_ATTRIBUTES (current_function_decl)))
+ if (fentry_section_p)
{
const char *sname = "__mcount_loc";
@@ -24167,7 +24683,7 @@ ix86_vector_mode_supported_p (machine_mode mode)
return true;
if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
return true;
- if (TARGET_AVX512F && TARGET_EVEX512 && VALID_AVX512F_REG_MODE (mode))
+ if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
return true;
if ((TARGET_MMX || TARGET_MMX_WITH_SSE)
&& VALID_MMX_REG_MODE (mode))
@@ -24415,8 +24931,7 @@ ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> & /*inputs*/,
}
}
- rtx_insn *seq = get_insns ();
- end_sequence ();
+ rtx_insn *seq = end_sequence ();
if (saw_asm_flag)
return seq;
@@ -24792,12 +25307,18 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
/* One vinserti128 for combining two SSE vectors for AVX256. */
else if (GET_MODE_BITSIZE (mode) == 256)
return ((n - 2) * ix86_cost->sse_op
- + ix86_vec_cost (mode, ix86_cost->addss));
+ + ix86_vec_cost (mode, ix86_cost->sse_op));
/* One vinserti64x4 and two vinserti128 for combining SSE
and AVX256 vectors to AVX512. */
else if (GET_MODE_BITSIZE (mode) == 512)
- return ((n - 4) * ix86_cost->sse_op
- + 3 * ix86_vec_cost (mode, ix86_cost->addss));
+ {
+ machine_mode half_mode
+ = mode_for_vector (GET_MODE_INNER (mode),
+ GET_MODE_NUNITS (mode) / 2).require ();
+ return ((n - 4) * ix86_cost->sse_op
+ + 2 * ix86_vec_cost (half_mode, ix86_cost->sse_op)
+ + ix86_vec_cost (mode, ix86_cost->sse_op));
+ }
gcc_unreachable ();
}
@@ -24965,7 +25486,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
switch (mode)
{
case E_QImode:
- if (TARGET_AVX512BW && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
return V64QImode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V32QImode;
@@ -24973,7 +25494,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
return V16QImode;
case E_HImode:
- if (TARGET_AVX512BW && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
return V32HImode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V16HImode;
@@ -24981,7 +25502,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
return V8HImode;
case E_SImode:
- if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
return V16SImode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V8SImode;
@@ -24989,7 +25510,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
return V4SImode;
case E_DImode:
- if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
return V8DImode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V4DImode;
@@ -25003,16 +25524,15 @@ ix86_preferred_simd_mode (scalar_mode mode)
{
if (TARGET_PREFER_AVX128)
return V8HFmode;
- else if (TARGET_PREFER_AVX256 || !TARGET_EVEX512)
+ else if (TARGET_PREFER_AVX256)
return V16HFmode;
}
- if (TARGET_EVEX512)
- return V32HFmode;
+ return V32HFmode;
}
return word_mode;
case E_BFmode:
- if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
return V32BFmode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V16BFmode;
@@ -25020,7 +25540,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
return V8BFmode;
case E_SFmode:
- if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
return V16SFmode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V8SFmode;
@@ -25028,7 +25548,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
return V4SFmode;
case E_DFmode:
- if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
return V8DFmode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V4DFmode;
@@ -25048,13 +25568,13 @@ ix86_preferred_simd_mode (scalar_mode mode)
static unsigned int
ix86_autovectorize_vector_modes (vector_modes *modes, bool all)
{
- if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
{
modes->safe_push (V64QImode);
modes->safe_push (V32QImode);
modes->safe_push (V16QImode);
}
- else if (TARGET_AVX512F && TARGET_EVEX512 && all)
+ else if (TARGET_AVX512F && all)
{
modes->safe_push (V32QImode);
modes->safe_push (V16QImode);
@@ -25092,7 +25612,7 @@ ix86_get_mask_mode (machine_mode data_mode)
unsigned elem_size = vector_size / nunits;
/* Scalar mask case. */
- if ((TARGET_AVX512F && TARGET_EVEX512 && vector_size == 64)
+ if ((TARGET_AVX512F && vector_size == 64)
|| (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16))
/* AVX512FP16 only supports vector comparison
to kmask for _Float16. */
@@ -25257,32 +25777,6 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
return new ix86_vector_costs (vinfo, costing_for_scalar);
}
-/* Return cost of statement doing FP conversion. */
-
-static unsigned
-fp_conversion_stmt_cost (machine_mode mode, gimple *stmt, bool scalar_p)
-{
- int outer_size
- = tree_to_uhwi
- (TYPE_SIZE
- (TREE_TYPE (gimple_assign_lhs (stmt))));
- int inner_size
- = tree_to_uhwi
- (TYPE_SIZE
- (TREE_TYPE (gimple_assign_rhs1 (stmt))));
- int stmt_cost = vec_fp_conversion_cost
- (ix86_tune_cost, GET_MODE_BITSIZE (mode));
- /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end
- up doing two conversions and packing them. */
- if (!scalar_p && inner_size > outer_size)
- {
- int n = inner_size / outer_size;
- stmt_cost = stmt_cost * n
- + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
- }
- return stmt_cost;
-}
-
unsigned
ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, slp_tree node,
@@ -25304,6 +25798,14 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
if (scalar_p)
mode = TYPE_MODE (TREE_TYPE (vectype));
}
+ /* When we are costing a scalar stmt use the scalar stmt to get at the
+ type of the operation. */
+ else if (scalar_p && stmt_info)
+ if (tree lhs = gimple_get_lhs (stmt_info->stmt))
+ {
+ fp = FLOAT_TYPE_P (TREE_TYPE (lhs));
+ mode = TYPE_MODE (TREE_TYPE (lhs));
+ }
if ((kind == vector_stmt || kind == scalar_stmt)
&& stmt_info
@@ -25326,7 +25828,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
else if (X87_FLOAT_MODE_P (mode))
stmt_cost = ix86_cost->fadd;
else
- stmt_cost = ix86_cost->add;
+ stmt_cost = ix86_cost->add;
}
else
stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss
@@ -25381,7 +25883,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
(subcode == RSHIFT_EXPR
&& !TYPE_UNSIGNED (TREE_TYPE (op1)))
? ASHIFTRT : LSHIFTRT, mode,
- TREE_CODE (op2) == INTEGER_CST,
+ TREE_CODE (op2) == INTEGER_CST,
cst_and_fits_in_hwi (op2)
? int_cst_value (op2) : -1,
false, false, NULL, NULL);
@@ -25390,30 +25892,174 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
case NOP_EXPR:
/* Only sign-conversions are free. */
if (tree_nop_conversion_p
- (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
+ (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
stmt_cost = 0;
else if (fp)
- stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt,
- scalar_p);
+ stmt_cost = vec_fp_conversion_cost
+ (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+ break;
+
+ case FLOAT_EXPR:
+ if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ stmt_cost = ix86_cost->cvtsi2ss;
+ else if (X87_FLOAT_MODE_P (mode))
+ /* TODO: We do not have cost tables for x87. */
+ stmt_cost = ix86_cost->fadd;
+ else
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtpi2ps);
+ break;
+
+ case FIX_TRUNC_EXPR:
+ if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ stmt_cost = ix86_cost->cvtss2si;
+ else if (X87_FLOAT_MODE_P (mode))
+ /* TODO: We do not have cost tables for x87. */
+ stmt_cost = ix86_cost->fadd;
+ else
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtps2pi);
+ break;
+
+ case COND_EXPR:
+ {
+ /* SSE2 conditional move sequence is:
+ pcmpgtd %xmm5, %xmm0 (accounted separately)
+ pand %xmm0, %xmm2
+ pandn %xmm1, %xmm0
+ por %xmm2, %xmm0
+ while SSE4 uses cmp + blend
+ and AVX512 masked moves.
+
+ The condition is accounted separately since we usually have
+ p = a < b
+ c = p ? x : y
+ and we will account first statement as setcc. Exception is when
+ p is loaded from memory as bool and then we will not account
+ the compare, but there is no way to check for this. */
+
+ int ninsns = TARGET_SSE4_1 ? 1 : 3;
+
+ /* If one of the parameters is 0 or -1, the sequence will be simplified:
+ (if_true & mask) | (if_false & ~mask) -> if_true & mask */
+ if (ninsns > 1
+ && (zerop (gimple_assign_rhs2 (stmt_info->stmt))
+ || zerop (gimple_assign_rhs3 (stmt_info->stmt))
+ || integer_minus_onep
+ (gimple_assign_rhs2 (stmt_info->stmt))
+ || integer_minus_onep
+ (gimple_assign_rhs3 (stmt_info->stmt))))
+ ninsns = 1;
+
+ if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ stmt_cost = ninsns * ix86_cost->sse_op;
+ else if (X87_FLOAT_MODE_P (mode))
+ /* x87 requires conditional branch. We don't have cost for
+ that. */
+ ;
+ else if (VECTOR_MODE_P (mode))
+ stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
+ else
+ /* compare (accounted separately) + cmov. */
+ stmt_cost = ix86_cost->add;
+ }
break;
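As a rough illustration of the 3-instruction SSE2 blend being priced above (a sketch in SSE2 intrinsics; the helper name is invented and nothing here is part of the patch):

  #include <emmintrin.h>

  /* p ? a : b for four int32 lanes, given MASK = all-ones/all-zeros per
     lane from a prior compare (which is costed separately).  This is the
     pand + pandn + por sequence; SSE4.1 replaces it with one blend and
     AVX-512 with a masked move, hence ninsns == 1 there.  */
  static inline __m128i
  blend_epi32 (__m128i mask, __m128i a, __m128i b)
  {
    return _mm_or_si128 (_mm_and_si128 (mask, a),
                         _mm_andnot_si128 (mask, b));
  }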
- case BIT_IOR_EXPR:
- case ABS_EXPR:
- case ABSU_EXPR:
case MIN_EXPR:
case MAX_EXPR:
+ if (fp)
+ {
+ if (X87_FLOAT_MODE_P (mode)
+ && !SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ /* x87 requires conditional branch. We don't have cost for
+ that. */
+ ;
+ else
+ /* minss */
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ }
+ else
+ {
+ if (VECTOR_MODE_P (mode))
+ {
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ /* vpmin was introduced in SSE3.
+ SSE2 needs pcmpgtd + pand + pandn + por.
+ If one of the parameters is 0 or -1, the sequence is simplified
+ to pcmpgtd + pand. */
+ if (!TARGET_SSSE3)
+ {
+ if (zerop (gimple_assign_rhs2 (stmt_info->stmt))
+ || integer_minus_onep
+ (gimple_assign_rhs2 (stmt_info->stmt)))
+ stmt_cost *= 2;
+ else
+ stmt_cost *= 4;
+ }
+ }
+ else
+ /* cmp + cmov. */
+ stmt_cost = ix86_cost->add * 2;
+ }
+ break;
+
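The 4x multiplier above corresponds to open-coding an integer vector min/max as compare-and-blend when no pmin/pmax form is available; a minimal sketch with SSE2 intrinsics (illustrative names, not part of the patch):

  #include <emmintrin.h>

  /* min (a, b) for four signed int32 lanes without pminsd:
     pcmpgtd + pand + pandn + por, i.e. four sse_ops.  If B is known to
     be zero the blend collapses to pcmpgtd + pand, matching the *2 case.  */
  static inline __m128i
  min_epi32_sse2 (__m128i a, __m128i b)
  {
    __m128i lt = _mm_cmpgt_epi32 (b, a);          /* a < b */
    return _mm_or_si128 (_mm_and_si128 (lt, a),
                         _mm_andnot_si128 (lt, b));
  }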
+ case ABS_EXPR:
+ case ABSU_EXPR:
+ if (fp)
+ {
+ if (X87_FLOAT_MODE_P (mode)
+ && !SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ /* fabs. */
+ stmt_cost = ix86_cost->fabs;
+ else
+ /* andss of sign bit. */
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ }
+ else
+ {
+ if (VECTOR_MODE_P (mode))
+ {
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ /* vpabs was introduced in SSSE3.
+ Without it, SSE2 needs psrad + pxor + psub. */
+ if (!TARGET_SSSE3)
+ stmt_cost *= 3;
+ }
+ else
+ /* neg + cmov. */
+ stmt_cost = ix86_cost->add * 2;
+ }
+ break;
+
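Likewise, the 3x multiplier for pre-SSSE3 vector abs reflects the shift/xor/subtract idiom; a small sketch in SSE2 intrinsics (illustrative, not part of the patch):

  #include <emmintrin.h>

  /* |x| for four int32 lanes without pabsd: psrad + pxor + psubd,
     using abs (x) = (x ^ (x >> 31)) - (x >> 31).  */
  static inline __m128i
  abs_epi32_sse2 (__m128i x)
  {
    __m128i sign = _mm_srai_epi32 (x, 31);
    return _mm_sub_epi32 (_mm_xor_si128 (x, sign), sign);
  }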
+ case BIT_IOR_EXPR:
case BIT_XOR_EXPR:
case BIT_AND_EXPR:
case BIT_NOT_EXPR:
- if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
- stmt_cost = ix86_cost->sse_op;
- else if (VECTOR_MODE_P (mode))
+ gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)
+ && !X87_FLOAT_MODE_P (mode));
+ if (VECTOR_MODE_P (mode))
stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
else
stmt_cost = ix86_cost->add;
break;
+
default:
+ if (truth_value_p (subcode))
+ {
+ if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ /* CMPccS? instructions are cheap, so use sse_op. While they
+ produce a mask which may need to be turned into 0/1 by an and,
+ expect that this will be optimized away in the common case. */
+ stmt_cost = ix86_cost->sse_op;
+ else if (X87_FLOAT_MODE_P (mode))
+ /* fcmp + setcc. */
+ stmt_cost = ix86_cost->fadd + ix86_cost->add;
+ else if (VECTOR_MODE_P (mode))
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ else
+ /* setcc. */
+ stmt_cost = ix86_cost->add;
+ break;
+ }
break;
}
}
@@ -25437,9 +26083,36 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
break;
}
- if (kind == vec_promote_demote
- && fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
- stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt, scalar_p);
+ if (kind == vec_promote_demote)
+ {
+ int outer_size
+ = tree_to_uhwi
+ (TYPE_SIZE
+ (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt))));
+ int inner_size
+ = tree_to_uhwi
+ (TYPE_SIZE
+ (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))));
+ bool inner_fp = FLOAT_TYPE_P
+ (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)));
+
+ if (fp && inner_fp)
+ stmt_cost = vec_fp_conversion_cost
+ (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+ else if (fp && !inner_fp)
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtpi2ps);
+ else if (!fp && inner_fp)
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtps2pi);
+ else
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+ /* VEC_PACK_TRUNC_EXPR and similar demote operations: If inner size is
+ greater than outer size we will end up doing two conversions and
+ packing them. We always pack pairs; if the size difference is greater
+ it is split into multiple demote operations. */
+ if (inner_size > outer_size)
+ stmt_cost = stmt_cost * 2
+ + ix86_vec_cost (mode, ix86_cost->sse_op);
+ }
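A worked example of the demote pricing above (illustrative, not from the patch): narrowing from 64-bit to 32-bit float elements has inner_size = 64 and outer_size = 32, so the statement is charged two conversions plus one packing sse_op. A sketch of that shape with invented parameter names:

  /* Illustrative mirror of the promote/demote cost computed above.  */
  static int
  promote_demote_cost (int per_conversion_cost, int pack_cost,
                       int inner_size, int outer_size)
  {
    int cost = per_conversion_cost;
    if (inner_size > outer_size)        /* demote, e.g. 64 -> 32 bits */
      cost = cost * 2 + pack_cost;      /* two converts + one pack */
    return cost;
  }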
/* If we do elementwise loads into a vector then we are bound by
latency and execution resources for the many scalar loads
@@ -25511,7 +26184,22 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
else
{
m_num_gpr_needed[where]++;
- stmt_cost += ix86_cost->sse_to_integer;
+
+ int cost = COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2;
+
+ /* For integer construction, the number of actual GPR -> XMM
+ moves will be somewhere between 0 and n.
+ We do not have a very good idea of the actual number, since
+ the source may be a constant, memory, or a chain of
+ instructions that will later be converted by the
+ scalar-to-vector pass. */
+ if (kind == vec_construct
+ && GET_MODE_BITSIZE (mode) == 256)
+ cost *= 2;
+ else if (kind == vec_construct
+ && GET_MODE_BITSIZE (mode) == 512)
+ cost *= 3;
+ stmt_cost += cost;
}
}
}
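To make the scaling concrete (hypothetical tuning value, not from any real cost table): with integer_to_sse = 4 the per-element add-on is COSTS_N_INSNS (4) / 2 = 8, doubled to 16 for a 256-bit construct and tripled to 24 for a 512-bit one. A minimal sketch of that arithmetic:

  /* Illustrative only: per-element GPR -> XMM penalty added above,
     assuming a made-up integer_to_sse value of 4 and the usual
     COSTS_N_INSNS scale of 4 units per insn.  */
  static int
  gpr_to_xmm_penalty (int vector_bits)
  {
    int cost = (4 * 4) / 2;   /* COSTS_N_INSNS (integer_to_sse) / 2 = 8 */
    if (vector_bits == 256)
      cost *= 2;              /* 16 */
    else if (vector_bits == 512)
      cost *= 3;              /* 24 */
    return cost;
  }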
@@ -25603,14 +26291,10 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
/* When X86_TUNE_AVX512_TWO_EPILOGUES is enabled arrange for both
a AVX2 and a SSE epilogue for AVX512 vectorized loops. */
if (loop_vinfo
+ && LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32
&& ix86_tune_features[X86_TUNE_AVX512_TWO_EPILOGUES])
- {
- if (GET_MODE_SIZE (loop_vinfo->vector_mode) == 64)
- m_suggested_epilogue_mode = V32QImode;
- else if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
- && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32)
- m_suggested_epilogue_mode = V16QImode;
- }
+ m_suggested_epilogue_mode = V16QImode;
/* When a 128bit SSE vectorized epilogue still has a VF of 16 or larger
enable a 64bit SSE epilogue. */
if (loop_vinfo
@@ -25619,6 +26303,65 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
&& LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () >= 16)
m_suggested_epilogue_mode = V8QImode;
+ /* When X86_TUNE_AVX512_MASKED_EPILOGUES is enabled try to use
+ a masked epilogue if that doesn't seem detrimental. */
+ if (loop_vinfo
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () > 2
+ && ix86_tune_features[X86_TUNE_AVX512_MASKED_EPILOGUES]
+ && !OPTION_SET_P (param_vect_partial_vector_usage))
+ {
+ bool avoid = false;
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+ {
+ unsigned int peel_niter
+ = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+ peel_niter += 1;
+ /* When we know the number of scalar iterations of the epilogue,
+ avoid masking when a single vector epilogue iteration handles
+ it in full. */
+ if (pow2p_hwi ((LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter)
+ % LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ()))
+ avoid = true;
+ }
+ if (!avoid && loop_outer (loop_outer (LOOP_VINFO_LOOP (loop_vinfo))))
+ for (auto ddr : LOOP_VINFO_DDRS (loop_vinfo))
+ {
+ if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+ ;
+ else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
+ ;
+ else
+ {
+ int loop_depth
+ = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
+ DDR_LOOP_NEST (ddr));
+ if (DDR_NUM_DIST_VECTS (ddr) == 1
+ && DDR_DIST_VECTS (ddr)[0][loop_depth] == 0)
+ {
+ /* Avoid the case when there's an outer loop that might
+ traverse a multi-dimensional array with the inner
+ loop just executing the masked epilogue with a
+ read-write where the next outer iteration might
+ read from the masked part of the previous write,
+ with 'n' filling only half a vector.
+ for (j = 0; j < m; ++j)
+ for (i = 0; i < n; ++i)
+ a[j][i] = c * a[j][i]; */
+ avoid = true;
+ break;
+ }
+ }
+ }
+ if (!avoid)
+ {
+ m_suggested_epilogue_mode = loop_vinfo->vector_mode;
+ m_masked_epilogue = 1;
+ }
+ }
+
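A worked example of the remainder test above (illustrative numbers): with a known trip count of 100, no peeling, and a vectorization factor of 16, the epilogue has 100 % 16 = 4 scalar iterations; 4 is a power of two, so a single unmasked 4-lane epilogue iteration handles it in full and masking is avoided. With 99 iterations the remainder is 3, not a power of two, so the masked AVX-512 epilogue is suggested instead, provided the data-dependence scan above does not veto it.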
vector_costs::finish_cost (scalar_costs);
}
@@ -25738,7 +26481,7 @@ ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
{
/* If the function isn't exported, we can pick up just one ISA
for the clones. */
- if (TARGET_AVX512F && TARGET_EVEX512)
+ if (TARGET_AVX512F)
clonei->vecsize_mangle = 'e';
else if (TARGET_AVX2)
clonei->vecsize_mangle = 'd';
@@ -25830,17 +26573,17 @@ ix86_simd_clone_usable (struct cgraph_node *node, machine_mode)
return -1;
if (!TARGET_AVX)
return 0;
- return (TARGET_AVX512F && TARGET_EVEX512) ? 3 : TARGET_AVX2 ? 2 : 1;
+ return TARGET_AVX512F ? 3 : TARGET_AVX2 ? 2 : 1;
case 'c':
if (!TARGET_AVX)
return -1;
- return (TARGET_AVX512F && TARGET_EVEX512) ? 2 : TARGET_AVX2 ? 1 : 0;
+ return TARGET_AVX512F ? 2 : TARGET_AVX2 ? 1 : 0;
case 'd':
if (!TARGET_AVX2)
return -1;
- return (TARGET_AVX512F && TARGET_EVEX512) ? 1 : 0;
+ return TARGET_AVX512F ? 1 : 0;
case 'e':
- if (!TARGET_AVX512F || !TARGET_EVEX512)
+ if (!TARGET_AVX512F)
return -1;
return 0;
default:
@@ -27512,6 +28255,195 @@ ix86_cannot_copy_insn_p (rtx_insn *insn)
#undef TARGET_DOCUMENTATION_NAME
#define TARGET_DOCUMENTATION_NAME "x86"
+/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
+sbitmap
+ix86_get_separate_components (void)
+{
+ HOST_WIDE_INT offset, to_allocate;
+ sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
+ bitmap_clear (components);
+ struct machine_function *m = cfun->machine;
+
+ offset = m->frame.stack_pointer_offset;
+ to_allocate = offset - m->frame.sse_reg_save_offset;
+
+ /* Shrink wrap separate uses MOV, which means APX PPX cannot be used.
+ Experiments show that APX PPX can speed up the prologue. If the function
+ does not exit early during actual execution, then using APX PPX is faster.
+ If the function always exits early during actual execution, then shrink
+ wrap separate reduces the number of MOV (PUSH/POP) instructions actually
+ executed, thus speeding up execution.
+ foo:
+ movl $1, %eax
+ testq %rdi, %rdi
+ jne .L60
+ ret ---> early return.
+ .L60:
+ subq $88, %rsp ---> belongs to the prologue.
+ xorl %eax, %eax
+ movq %rbx, 40(%rsp) ---> belongs to the prologue.
+ movq 8(%rdi), %rbx
+ movq %rbp, 48(%rsp) ---> belongs to the prologue.
+ movq %rdi, %rbp
+ testq %rbx, %rbx
+ jne .L61
+ movq 40(%rsp), %rbx
+ movq 48(%rsp), %rbp
+ addq $88, %rsp
+ ret
+ .L61:
+ movq %r12, 56(%rsp) ---> belongs to the prologue.
+ movq %r13, 64(%rsp) ---> belongs to the prologue.
+ movq %r14, 72(%rsp) ---> belongs to the prologue.
+ ... ...
+
+ Disable shrink wrap separate when PPX is enabled. */
+ if ((TARGET_APX_PPX && !crtl->calls_eh_return)
+ || cfun->machine->func_type != TYPE_NORMAL
+ || TARGET_SEH
+ || crtl->stack_realign_needed
+ || m->call_ms2sysv)
+ return components;
+
+ /* Since shrink wrapping separate uses MOV instead of PUSH/POP,
+ disable it when registers would be saved with PUSH/POP. */
+ if (save_regs_using_push_pop (to_allocate))
+ return components;
+
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ /* Skip registers with large offsets, where a pseudo may be needed. */
+ if (IN_RANGE (offset, -0x8000, 0x7fff))
+ bitmap_set_bit (components, regno);
+ offset += UNITS_PER_WORD;
+ }
+
+ /* Don't mess with the following registers. */
+ if (frame_pointer_needed)
+ bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
+
+ if (crtl->drap_reg)
+ bitmap_clear_bit (components, REGNO (crtl->drap_reg));
+
+ if (pic_offset_table_rtx)
+ bitmap_clear_bit (components, REAL_PIC_OFFSET_TABLE_REGNUM);
+
+ return components;
+}
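For a concrete picture of the component selection above (illustrative numbers only): with frame.stack_pointer_offset = 96 and rbx, rbp and r12 to be saved, the loop tests offsets 96, 104 and 112; all fit in a signed 16-bit displacement, so all three registers become separately wrappable components, after which the hard frame pointer, DRAP and PIC registers are cleared again if they are in use.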
+
+/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
+sbitmap
+ix86_components_for_bb (basic_block bb)
+{
+ bitmap in = DF_LIVE_IN (bb);
+ bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
+ bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
+
+ sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
+ bitmap_clear (components);
+
+ function_abi_aggregator callee_abis;
+ rtx_insn *insn;
+ FOR_BB_INSNS (bb, insn)
+ if (CALL_P (insn))
+ callee_abis.note_callee_abi (insn_callee_abi (insn));
+ HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
+
+ /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (!fixed_regs[regno]
+ && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
+ || bitmap_bit_p (in, regno)
+ || bitmap_bit_p (gen, regno)
+ || bitmap_bit_p (kill, regno)))
+ bitmap_set_bit (components, regno);
+
+ return components;
+}
+
+/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS. */
+void
+ix86_disqualify_components (sbitmap, edge, sbitmap, bool)
+{
+ /* Nothing to do for x86. */
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
+void
+ix86_emit_prologue_components (sbitmap components)
+{
+ HOST_WIDE_INT cfa_offset;
+ struct machine_function *m = cfun->machine;
+
+ cfa_offset = m->frame.reg_save_offset + m->fs.sp_offset
+ - m->frame.stack_pointer_offset;
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ if (bitmap_bit_p (components, regno))
+ ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
+ cfa_offset -= UNITS_PER_WORD;
+ }
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
+void
+ix86_emit_epilogue_components (sbitmap components)
+{
+ HOST_WIDE_INT cfa_offset;
+ struct machine_function *m = cfun->machine;
+ cfa_offset = m->frame.reg_save_offset + m->fs.sp_offset
+ - m->frame.stack_pointer_offset;
+
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ if (bitmap_bit_p (components, regno))
+ {
+ rtx reg = gen_rtx_REG (word_mode, regno);
+ rtx mem;
+ rtx_insn *insn;
+
+ mem = choose_baseaddr (cfa_offset, NULL);
+ mem = gen_frame_mem (word_mode, mem);
+ insn = emit_move_insn (reg, mem);
+
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_CFA_RESTORE, reg);
+ }
+ cfa_offset -= UNITS_PER_WORD;
+ }
+}
+
+/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
+void
+ix86_set_handled_components (sbitmap components)
+{
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (bitmap_bit_p (components, regno))
+ {
+ cfun->machine->reg_is_wrapped_separately[regno] = true;
+ cfun->machine->use_fast_prologue_epilogue = true;
+ cfun->machine->frame.save_regs_using_mov = true;
+ }
+}
+
+#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
+#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS ix86_get_separate_components
+#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
+#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB ix86_components_for_bb
+#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
+#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS ix86_disqualify_components
+#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
+ ix86_emit_prologue_components
+#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
+ ix86_emit_epilogue_components
+#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
+#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS ix86_set_handled_components
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-i386.h"