Diffstat (limited to 'gcc/config/i386/i386.cc')
-rw-r--r--  gcc/config/i386/i386.cc | 960
1 files changed, 685 insertions, 275 deletions
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 9c24a92..65e04d3 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -335,6 +335,14 @@ static int const x86_64_ms_abi_int_parameter_registers[4] =
CX_REG, DX_REG, R8_REG, R9_REG
};
+/* Similar to Clang's preserve_none function parameter passing.
+ NB: Use DI_REG and SI_REG, see ix86_function_value_regno_p. */
+
+static int const x86_64_preserve_none_int_parameter_registers[6] =
+{
+ R12_REG, R13_REG, R14_REG, R15_REG, DI_REG, SI_REG
+};
+
static int const x86_64_int_return_registers[4] =
{
AX_REG, DX_REG, DI_REG, SI_REG
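
As a hedged illustration of the convention the new array encodes (assuming
GCC's usual attribute spelling), a preserve_none function takes its first
integer arguments in r12, r13, r14, r15, rdi and rsi rather than in the
standard SysV order:

    /* Minimal sketch; the register comments follow
       x86_64_preserve_none_int_parameter_registers above.  */
    __attribute__ ((preserve_none)) long
    sum3 (long a /* r12 */, long b /* r13 */, long c /* r14 */)
    {
      return a + b + c;
    }
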
@@ -460,7 +468,8 @@ int ix86_arch_specified;
red-zone.
NB: Don't use red-zone for functions with no_caller_saved_registers
- and 32 GPRs since 128-byte red-zone is too small for 31 GPRs.
+ and 32 GPRs or 16 XMM registers since 128-byte red-zone is too small
+ for 31 GPRs or 15 GPRs + 16 XMM registers.
TODO: If we can reserve the first 2 WORDs, for PUSH and, another
for CALL, in red-zone, we can allow local indirect jumps with
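
For reference, the arithmetic behind the updated comment: with APX there are
32 GPRs, so a no_caller_saved_registers function may need to save 31 of them,
31 * 8 = 248 bytes; with SSE enabled it may instead need 15 GPRs plus 16 XMM
registers, 15 * 8 + 16 * 16 = 376 bytes. Either figure exceeds the 128-byte
red zone, which is why ix86_using_red_zone below now also checks !TARGET_SSE.
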
@@ -471,7 +480,7 @@ ix86_using_red_zone (void)
{
return (TARGET_RED_ZONE
&& !TARGET_64BIT_MS_ABI
- && (!TARGET_APX_EGPR
+ && ((!TARGET_APX_EGPR && !TARGET_SSE)
|| (cfun->machine->call_saved_registers
!= TYPE_NO_CALLER_SAVED_REGISTERS))
&& (!cfun->machine->has_local_indirect_jump
@@ -898,6 +907,18 @@ x86_64_elf_unique_section (tree decl, int reloc)
default_unique_section (decl, reloc);
}
+/* Return true if TYPE has no_callee_saved_registers or preserve_none
+ attribute. */
+
+bool
+ix86_type_no_callee_saved_registers_p (const_tree type)
+{
+ return (lookup_attribute ("no_callee_saved_registers",
+ TYPE_ATTRIBUTES (type)) != NULL
+ || lookup_attribute ("preserve_none",
+ TYPE_ATTRIBUTES (type)) != NULL);
+}
+
#ifdef COMMON_ASM_OP
#ifndef LARGECOMM_SECTION_ASM_OP
@@ -1019,11 +1040,10 @@ ix86_function_ok_for_sibcall (tree decl, tree exp)
/* Sibling call isn't OK if callee has no callee-saved registers
and the calling function has callee-saved registers. */
- if (cfun->machine->call_saved_registers != TYPE_NO_CALLEE_SAVED_REGISTERS
- && (cfun->machine->call_saved_registers
- != TYPE_NO_CALLEE_SAVED_REGISTERS_EXCEPT_BP)
- && lookup_attribute ("no_callee_saved_registers",
- TYPE_ATTRIBUTES (type)))
+ if ((cfun->machine->call_saved_registers
+ != TYPE_NO_CALLEE_SAVED_REGISTERS)
+ && cfun->machine->call_saved_registers != TYPE_PRESERVE_NONE
+ && ix86_type_no_callee_saved_registers_p (type))
return false;
/* If outgoing reg parm stack space changes, we cannot do sibcall. */
@@ -1188,10 +1208,16 @@ ix86_comp_type_attributes (const_tree type1, const_tree type2)
!= ix86_function_regparm (type2, NULL))
return 0;
- if (lookup_attribute ("no_callee_saved_registers",
- TYPE_ATTRIBUTES (type1))
- != lookup_attribute ("no_callee_saved_registers",
- TYPE_ATTRIBUTES (type2)))
+ if (ix86_type_no_callee_saved_registers_p (type1)
+ != ix86_type_no_callee_saved_registers_p (type2))
+ return 0;
+
+  /* The preserve_none attribute uses a different calling convention, which
+     is supported only for 64-bit. */
+ if (TARGET_64BIT
+ && (lookup_attribute ("preserve_none", TYPE_ATTRIBUTES (type1))
+ != lookup_attribute ("preserve_none",
+ TYPE_ATTRIBUTES (type2))))
return 0;
return 1;
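
A hedged sketch of what the ix86_comp_type_attributes change means at the
source level: on x86-64, two function types that differ only in preserve_none
now compare as incompatible, since calls through a mismatched pointer would
pass arguments in the wrong registers (example names are hypothetical):

    typedef void plain_fn (void);
    typedef void pn_fn (void) __attribute__ ((preserve_none));
    /* Converting between plain_fn * and pn_fn * is expected to be diagnosed
       as an incompatible pointer conversion on 64-bit targets.  */
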
@@ -1553,7 +1579,10 @@ ix86_function_arg_regno_p (int regno)
if (call_abi == SYSV_ABI && regno == AX_REG)
return true;
- if (call_abi == MS_ABI)
+ if (cfun
+ && cfun->machine->call_saved_registers == TYPE_PRESERVE_NONE)
+ parm_regs = x86_64_preserve_none_int_parameter_registers;
+ else if (call_abi == MS_ABI)
parm_regs = x86_64_ms_abi_int_parameter_registers;
else
parm_regs = x86_64_int_parameter_registers;
@@ -1716,6 +1745,19 @@ ix86_asm_output_function_label (FILE *out_file, const char *fname,
}
}
+/* Output a user-defined label. In AT&T syntax, registers are prefixed
+ with %, so labels require no punctuation. In Intel syntax, registers
+ are unprefixed, so labels may clash with registers or other operators,
+ and require quoting. */
+void
+ix86_asm_output_labelref (FILE *file, const char *prefix, const char *label)
+{
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ fprintf (file, "%s%s", prefix, label);
+ else
+ fprintf (file, "\"%s%s\"", prefix, label);
+}
+
/* Implementation of call abi switching target hook. Specific to FNDECL
the specific call register sets are set. See also
ix86_conditional_register_usage for more details. */
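
To make the quoting rule concrete, a hedged example of a user symbol that
collides with a register name in Intel syntax (the exact operand spelling in
the emitted assembly is illustrative only):

    /* Under -masm=intel the reference to this variable must be printed as
       the quoted label "eax"; unquoted it would parse as the register.  */
    int eax = 42;

    int
    get_eax (void)
    {
      return eax;
    }
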
@@ -1795,8 +1837,7 @@ ix86_init_pic_reg (void)
add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
}
- seq = get_insns ();
- end_sequence ();
+ seq = end_sequence ();
entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
insert_insn_on_edge (seq, entry_edge);
@@ -1823,6 +1864,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
memset (cum, 0, sizeof (*cum));
+ tree preserve_none_type;
if (fndecl)
{
target = cgraph_node::get (fndecl);
@@ -1831,12 +1873,24 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
target = target->function_symbol ();
local_info_node = cgraph_node::local_info_node (target->decl);
cum->call_abi = ix86_function_abi (target->decl);
+ preserve_none_type = TREE_TYPE (target->decl);
}
else
- cum->call_abi = ix86_function_abi (fndecl);
+ {
+ cum->call_abi = ix86_function_abi (fndecl);
+ preserve_none_type = TREE_TYPE (fndecl);
+ }
}
else
- cum->call_abi = ix86_function_type_abi (fntype);
+ {
+ cum->call_abi = ix86_function_type_abi (fntype);
+ preserve_none_type = fntype;
+ }
+ cum->preserve_none_abi
+ = (preserve_none_type
+ && (lookup_attribute ("preserve_none",
+ TYPE_ATTRIBUTES (preserve_none_type))
+ != nullptr));
cum->caller = caller;
@@ -1998,8 +2052,7 @@ type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
&& GET_MODE_INNER (mode) == innermode)
{
- if (size == 64 && (!TARGET_AVX512F || !TARGET_EVEX512)
- && !TARGET_IAMCU)
+ if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
{
static bool warnedavx512f;
static bool warnedavx512f_ret;
@@ -3410,9 +3463,15 @@ function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
break;
}
+ const int *parm_regs;
+ if (cum->preserve_none_abi)
+ parm_regs = x86_64_preserve_none_int_parameter_registers;
+ else
+ parm_regs = x86_64_int_parameter_registers;
+
return construct_container (mode, orig_mode, type, 0, cum->nregs,
cum->sse_nregs,
- &x86_64_int_parameter_registers [cum->regno],
+ &parm_regs[cum->regno],
cum->sse_regno);
}
@@ -4422,7 +4481,7 @@ ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
/* AVX512F values are returned in ZMM0 if available. */
if (size == 64)
- return !TARGET_AVX512F || !TARGET_EVEX512;
+ return !TARGET_AVX512F;
}
if (mode == XFmode)
@@ -4577,6 +4636,12 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
if (max > X86_64_REGPARM_MAX)
max = X86_64_REGPARM_MAX;
+ const int *parm_regs;
+ if (cum->preserve_none_abi)
+ parm_regs = x86_64_preserve_none_int_parameter_registers;
+ else
+ parm_regs = x86_64_int_parameter_registers;
+
for (i = cum->regno; i < max; i++)
{
mem = gen_rtx_MEM (word_mode,
@@ -4584,8 +4649,7 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
MEM_NOTRAP_P (mem) = 1;
set_mem_alias_set (mem, set);
emit_move_insn (mem,
- gen_rtx_REG (word_mode,
- x86_64_int_parameter_registers[i]));
+ gen_rtx_REG (word_mode, parm_regs[i]));
}
if (ix86_varargs_fpr_size)
@@ -4739,8 +4803,7 @@ ix86_va_start (tree valist, rtx nextarg)
start_sequence ();
emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
- seq = get_insns ();
- end_sequence ();
+ seq = end_sequence ();
push_topmost_sequence ();
emit_insn_after (seq, entry_of_function ());
@@ -5377,7 +5440,7 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode)
switch (GET_MODE_SIZE (mode))
{
case 64:
- if (TARGET_AVX512F && TARGET_EVEX512)
+ if (TARGET_AVX512F)
return 2;
break;
case 32:
@@ -5430,10 +5493,8 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
{
if (TARGET_AVX512VL)
return "vpxord\t%x0, %x0, %x0";
- else if (TARGET_EVEX512)
- return "vpxord\t%g0, %g0, %g0";
else
- gcc_unreachable ();
+ return "vpxord\t%g0, %g0, %g0";
}
return "vpxor\t%x0, %x0, %x0";
@@ -5449,19 +5510,15 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
{
if (TARGET_AVX512VL)
return "vxorpd\t%x0, %x0, %x0";
- else if (TARGET_EVEX512)
- return "vxorpd\t%g0, %g0, %g0";
else
- gcc_unreachable ();
+ return "vxorpd\t%g0, %g0, %g0";
}
else
{
if (TARGET_AVX512VL)
return "vpxorq\t%x0, %x0, %x0";
- else if (TARGET_EVEX512)
- return "vpxorq\t%g0, %g0, %g0";
else
- gcc_unreachable ();
+ return "vpxorq\t%g0, %g0, %g0";
}
}
return "vxorpd\t%x0, %x0, %x0";
@@ -5478,19 +5535,15 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
{
if (TARGET_AVX512VL)
return "vxorps\t%x0, %x0, %x0";
- else if (TARGET_EVEX512)
- return "vxorps\t%g0, %g0, %g0";
else
- gcc_unreachable ();
+ return "vxorps\t%g0, %g0, %g0";
}
else
{
if (TARGET_AVX512VL)
return "vpxord\t%x0, %x0, %x0";
- else if (TARGET_EVEX512)
- return "vpxord\t%g0, %g0, %g0";
else
- gcc_unreachable ();
+ return "vpxord\t%g0, %g0, %g0";
}
}
return "vxorps\t%x0, %x0, %x0";
@@ -5511,7 +5564,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
case MODE_XI:
case MODE_V8DF:
case MODE_V16SF:
- gcc_assert (TARGET_AVX512F && TARGET_EVEX512);
+ gcc_assert (TARGET_AVX512F);
return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
case MODE_OI:
@@ -5527,10 +5580,8 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
{
if (TARGET_AVX512VL)
return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
- else if (TARGET_EVEX512)
- return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
else
- gcc_unreachable ();
+ return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
}
return (TARGET_AVX
? "vpcmpeqd\t%0, %0, %0"
@@ -5544,7 +5595,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
{
if (GET_MODE_SIZE (mode) == 64)
{
- gcc_assert (TARGET_AVX512F && TARGET_EVEX512);
+ gcc_assert (TARGET_AVX512F);
return "vpcmpeqd\t%t0, %t0, %t0";
}
else if (GET_MODE_SIZE (mode) == 32)
@@ -5556,7 +5607,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
}
else if (vector_all_ones_zero_extend_quarter_operand (x, mode))
{
- gcc_assert (TARGET_AVX512F && TARGET_EVEX512);
+ gcc_assert (TARGET_AVX512F);
return "vpcmpeqd\t%x0, %x0, %x0";
}
@@ -5667,8 +5718,6 @@ ix86_get_ssemov (rtx *operands, unsigned size,
|| memory_operand (operands[1], mode))
gcc_unreachable ();
size = 64;
- /* We need TARGET_EVEX512 to move into zmm register. */
- gcc_assert (TARGET_EVEX512);
switch (type)
{
case opcode_int:
@@ -5707,7 +5756,7 @@ ix86_get_ssemov (rtx *operands, unsigned size,
: "%vmovaps");
else
opcode = (misaligned_p
- ? (TARGET_AVX512BW
+ ? (TARGET_AVX512BW && evex_reg_p
? "vmovdqu16"
: "%vmovdqu")
: "%vmovdqa");
@@ -5749,7 +5798,7 @@ ix86_get_ssemov (rtx *operands, unsigned size,
: "%vmovaps");
else
opcode = (misaligned_p
- ? (TARGET_AVX512BW
+ ? (TARGET_AVX512BW && evex_reg_p
? "vmovdqu8"
: "%vmovdqu")
: "%vmovdqa");
@@ -5769,7 +5818,7 @@ ix86_get_ssemov (rtx *operands, unsigned size,
: "%vmovaps");
else
opcode = (misaligned_p
- ? (TARGET_AVX512BW
+ ? (TARGET_AVX512BW && evex_reg_p
? "vmovdqu16"
: "%vmovdqu")
: "%vmovdqa");
@@ -5930,7 +5979,7 @@ symbolic_reference_mentioned_p (rtx op)
const char *fmt;
int i;
- if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
+ if (SYMBOL_REF_P (op) || LABEL_REF_P (op))
return true;
fmt = GET_RTX_FORMAT (GET_CODE (op));
@@ -6477,7 +6526,7 @@ output_set_got (rtx dest, rtx label)
xops[0] = dest;
- if (TARGET_VXWORKS_RTP && flag_pic)
+ if (TARGET_VXWORKS_GOTTPIC && TARGET_VXWORKS_RTP && flag_pic)
{
/* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
xops[2] = gen_rtx_MEM (Pmode,
@@ -6722,9 +6771,7 @@ ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
|| !frame_pointer_needed));
case TYPE_NO_CALLEE_SAVED_REGISTERS:
- return false;
-
- case TYPE_NO_CALLEE_SAVED_REGISTERS_EXCEPT_BP:
+ case TYPE_PRESERVE_NONE:
if (regno != HARD_FRAME_POINTER_REGNUM)
return false;
break;
@@ -6801,7 +6848,9 @@ ix86_nsaved_sseregs (void)
int nregs = 0;
int regno;
- if (!TARGET_64BIT_MS_ABI)
+ if (!TARGET_64BIT_MS_ABI
+ && (cfun->machine->call_saved_registers
+ != TYPE_NO_CALLER_SAVED_REGISTERS))
return 0;
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
@@ -6909,6 +6958,26 @@ ix86_pro_and_epilogue_can_use_push2pop2 (int nregs)
&& (nregs + aligned) >= 3;
}
+/* Check if push/pop should be used to save/restore registers. */
+static bool
+save_regs_using_push_pop (HOST_WIDE_INT to_allocate)
+{
+ return ((!to_allocate && cfun->machine->frame.nregs <= 1)
+ || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
+ /* If static stack checking is enabled and done with probes,
+ the registers need to be saved before allocating the frame. */
+ || flag_stack_check == STATIC_BUILTIN_STACK_CHECK
+ /* If stack clash probing needs a loop, then it needs a
+ scratch register. But the returned register is only guaranteed
+ to be safe to use after register saves are complete. So if
+ stack clash protections are enabled and the allocated frame is
+ larger than the probe interval, then use pushes to save
+ callee saved registers. */
+ || (flag_stack_clash_protection
+ && !ix86_target_stack_probe ()
+ && to_allocate > get_probe_interval ()));
+}
+
/* Fill structure ix86_frame about frame of currently computed function. */
static void
@@ -6989,12 +7058,18 @@ ix86_compute_frame_layout (void)
gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
gcc_assert (preferred_alignment <= stack_alignment_needed);
- /* The only ABI saving SSE regs should be 64-bit ms_abi. */
- gcc_assert (TARGET_64BIT || !frame->nsseregs);
+ /* The only ABI saving SSE regs should be 64-bit ms_abi or with
+     no_caller_saved_registers attribute.  */
+ gcc_assert (TARGET_64BIT
+ || (cfun->machine->call_saved_registers
+ == TYPE_NO_CALLER_SAVED_REGISTERS)
+ || !frame->nsseregs);
if (TARGET_64BIT && m->call_ms2sysv)
{
gcc_assert (stack_alignment_needed >= 16);
- gcc_assert (!frame->nsseregs);
+ gcc_assert ((cfun->machine->call_saved_registers
+ == TYPE_NO_CALLER_SAVED_REGISTERS)
+ || !frame->nsseregs);
}
/* For SEH we have to limit the amount of code movement into the prologue.
@@ -7193,20 +7268,7 @@ ix86_compute_frame_layout (void)
/* Size prologue needs to allocate. */
to_allocate = offset - frame->sse_reg_save_offset;
- if ((!to_allocate && frame->nregs <= 1)
- || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
- /* If static stack checking is enabled and done with probes,
- the registers need to be saved before allocating the frame. */
- || flag_stack_check == STATIC_BUILTIN_STACK_CHECK
- /* If stack clash probing needs a loop, then it needs a
- scratch register. But the returned register is only guaranteed
- to be safe to use after register saves are complete. So if
- stack clash protections are enabled and the allocated frame is
- larger than the probe interval, then use pushes to save
- callee saved registers. */
- || (flag_stack_clash_protection
- && !ix86_target_stack_probe ()
- && to_allocate > get_probe_interval ()))
+ if (save_regs_using_push_pop (to_allocate))
frame->save_regs_using_mov = false;
if (ix86_using_red_zone ()
@@ -7664,7 +7726,9 @@ ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
{
- ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
+	/* Skip registers already processed by shrink wrap separate.  */
+ if (!cfun->machine->reg_is_wrapped_separately[regno])
+ ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
cfa_offset -= UNITS_PER_WORD;
}
}
@@ -7757,8 +7821,15 @@ pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
add_frame_related_expr = true;
}
- insn = emit_insn (gen_pro_epilogue_adjust_stack_add
- (Pmode, dest, src, addend));
+  /* Shrink wrap separate may insert prologue instructions between a TEST and
+     its JMP.  In order not to affect EFlags, emit the add without the flags
+     register clobber.  */
+ if (crtl->shrink_wrapped_separate)
+ insn = emit_insn (gen_pro_epilogue_adjust_stack_add_nocc
+ (Pmode, dest, src, addend));
+ else
+ insn = emit_insn (gen_pro_epilogue_adjust_stack_add
+ (Pmode, dest, src, addend));
+
if (style >= 0)
ix86_add_queued_cfa_restore_notes (insn);
@@ -7981,8 +8052,7 @@ ix86_get_drap_rtx (void)
start_sequence ();
drap_vreg = copy_to_reg (arg_ptr);
- seq = get_insns ();
- end_sequence ();
+ seq = end_sequence ();
insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
if (!optimize)
@@ -9224,11 +9294,22 @@ ix86_expand_prologue (void)
doing this if we have to probe the stack; at least on x86_64 the
stack probe can turn into a call that clobbers a red zone location. */
else if (ix86_using_red_zone ()
- && (! TARGET_STACK_PROBE
- || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
+ && (! TARGET_STACK_PROBE
+ || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
{
+ HOST_WIDE_INT allocate_offset;
+ if (crtl->shrink_wrapped_separate)
+ {
+ allocate_offset = m->fs.sp_offset - frame.stack_pointer_offset;
+
+ /* Adjust the total offset at the beginning of the function. */
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (allocate_offset), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
+ m->fs.sp_offset = cfun->machine->frame.stack_pointer_offset;
+ }
+
ix86_emit_save_regs_using_mov (frame.reg_save_offset);
- cfun->machine->red_zone_used = true;
int_registers_saved = true;
}
}
@@ -9806,30 +9887,35 @@ ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
{
- rtx reg = gen_rtx_REG (word_mode, regno);
- rtx mem;
- rtx_insn *insn;
- mem = choose_baseaddr (cfa_offset, NULL);
- mem = gen_frame_mem (word_mode, mem);
- insn = emit_move_insn (reg, mem);
-
- if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
+      /* Skip registers already processed by shrink wrap separate.  */
+ if (!cfun->machine->reg_is_wrapped_separately[regno])
{
- /* Previously we'd represented the CFA as an expression
- like *(%ebp - 8). We've just popped that value from
- the stack, which means we need to reset the CFA to
- the drap register. This will remain until we restore
- the stack pointer. */
- add_reg_note (insn, REG_CFA_DEF_CFA, reg);
- RTX_FRAME_RELATED_P (insn) = 1;
+ rtx reg = gen_rtx_REG (word_mode, regno);
+ rtx mem;
+ rtx_insn *insn;
- /* This means that the DRAP register is valid for addressing. */
- m->fs.drap_valid = true;
- }
- else
- ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
+ mem = choose_baseaddr (cfa_offset, NULL);
+ mem = gen_frame_mem (word_mode, mem);
+ insn = emit_move_insn (reg, mem);
+
+ if (m->fs.cfa_reg == crtl->drap_reg
+ && regno == REGNO (crtl->drap_reg))
+ {
+ /* Previously we'd represented the CFA as an expression
+ like *(%ebp - 8). We've just popped that value from
+ the stack, which means we need to reset the CFA to
+ the drap register. This will remain until we restore
+ the stack pointer. */
+ add_reg_note (insn, REG_CFA_DEF_CFA, reg);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ /* DRAP register is valid for addressing. */
+ m->fs.drap_valid = true;
+ }
+ else
+ ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
+ }
cfa_offset -= UNITS_PER_WORD;
}
}
@@ -10108,10 +10194,11 @@ ix86_expand_epilogue (int style)
less work than reloading sp and popping the register. */
else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
restore_regs_via_mov = true;
- else if (TARGET_EPILOGUE_USING_MOVE
- && cfun->machine->use_fast_prologue_epilogue
- && (frame.nregs > 1
- || m->fs.sp_offset != reg_save_offset))
+ else if (crtl->shrink_wrapped_separate
+ || (TARGET_EPILOGUE_USING_MOVE
+ && cfun->machine->use_fast_prologue_epilogue
+ && (frame.nregs > 1
+ || m->fs.sp_offset != reg_save_offset)))
restore_regs_via_mov = true;
else if (frame_pointer_needed
&& !frame.nregs
@@ -10125,6 +10212,9 @@ ix86_expand_epilogue (int style)
else
restore_regs_via_mov = false;
+ if (crtl->shrink_wrapped_separate)
+ gcc_assert (restore_regs_via_mov);
+
if (restore_regs_via_mov || frame.nsseregs)
{
/* Ensure that the entire register save area is addressable via
@@ -10177,6 +10267,7 @@ ix86_expand_epilogue (int style)
gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
gcc_assert (!crtl->drap_reg);
gcc_assert (!frame.nregs);
+ gcc_assert (!crtl->shrink_wrapped_separate);
}
else if (restore_regs_via_mov)
{
@@ -10191,6 +10282,8 @@ ix86_expand_epilogue (int style)
rtx sa = EH_RETURN_STACKADJ_RTX;
rtx_insn *insn;
+ gcc_assert (!crtl->shrink_wrapped_separate);
+
/* Stack realignment doesn't work with eh_return. */
if (crtl->stack_realign_needed)
sorry ("Stack realignment not supported with "
@@ -10631,8 +10724,7 @@ split_stack_prologue_scratch_regno (void)
static GTY(()) rtx split_stack_fn;
-/* A SYMBOL_REF for the more stack function when using the large
- model. */
+/* A SYMBOL_REF for the more stack function when using the large model. */
static GTY(()) rtx split_stack_fn_large;
@@ -11320,7 +11412,7 @@ ix86_force_load_from_GOT_p (rtx x, bool call_p)
&& (!flag_pic || this_is_asm_operands)
&& ix86_cmodel != CM_LARGE
&& ix86_cmodel != CM_LARGE_PIC
- && GET_CODE (x) == SYMBOL_REF
+ && SYMBOL_REF_P (x)
&& ((!call_p
&& (!ix86_direct_extern_access
|| (SYMBOL_REF_DECL (x)
@@ -11366,23 +11458,23 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
case UNSPEC_TPOFF:
case UNSPEC_NTPOFF:
x = XVECEXP (x, 0, 0);
- return (GET_CODE (x) == SYMBOL_REF
+ return (SYMBOL_REF_P (x)
&& SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
case UNSPEC_DTPOFF:
x = XVECEXP (x, 0, 0);
- return (GET_CODE (x) == SYMBOL_REF
+ return (SYMBOL_REF_P (x)
&& SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
case UNSPEC_SECREL32:
x = XVECEXP (x, 0, 0);
- return GET_CODE (x) == SYMBOL_REF;
+ return SYMBOL_REF_P (x);
default:
return false;
}
/* We must have drilled down to a symbol. */
- if (GET_CODE (x) == LABEL_REF)
+ if (LABEL_REF_P (x))
return true;
- if (GET_CODE (x) != SYMBOL_REF)
+ if (!SYMBOL_REF_P (x))
return false;
/* FALLTHRU */
@@ -11422,7 +11514,7 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
case E_OImode:
case E_XImode:
if (!standard_sse_constant_p (x, mode)
- && GET_MODE_SIZE (TARGET_AVX512F && TARGET_EVEX512
+ && GET_MODE_SIZE (TARGET_AVX512F
? XImode
: (TARGET_AVX
? OImode
@@ -11509,11 +11601,11 @@ legitimate_pic_operand_p (rtx x)
return TARGET_64BIT;
case UNSPEC_TPOFF:
x = XVECEXP (inner, 0, 0);
- return (GET_CODE (x) == SYMBOL_REF
+ return (SYMBOL_REF_P (x)
&& SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
case UNSPEC_SECREL32:
x = XVECEXP (inner, 0, 0);
- return GET_CODE (x) == SYMBOL_REF;
+ return SYMBOL_REF_P (x);
case UNSPEC_MACHOPIC_OFFSET:
return legitimate_pic_address_disp_p (x);
default:
@@ -11564,7 +11656,7 @@ legitimate_pic_address_disp_p (rtx disp)
if (INTVAL (op1) >= 16*1024*1024
|| INTVAL (op1) < -16*1024*1024)
break;
- if (GET_CODE (op0) == LABEL_REF)
+ if (LABEL_REF_P (op0))
return true;
if (GET_CODE (op0) == CONST
&& GET_CODE (XEXP (op0, 0)) == UNSPEC
@@ -11573,7 +11665,7 @@ legitimate_pic_address_disp_p (rtx disp)
if (GET_CODE (op0) == UNSPEC
&& XINT (op0, 1) == UNSPEC_PCREL)
return true;
- if (GET_CODE (op0) != SYMBOL_REF)
+ if (!SYMBOL_REF_P (op0))
break;
/* FALLTHRU */
@@ -11638,8 +11730,8 @@ legitimate_pic_address_disp_p (rtx disp)
&& XINT (disp, 1) != UNSPEC_PLTOFF))
return false;
- if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
- && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
+ if (!SYMBOL_REF_P (XVECEXP (disp, 0, 0))
+ && !LABEL_REF_P (XVECEXP (disp, 0, 0)))
return false;
return true;
}
@@ -11667,14 +11759,14 @@ legitimate_pic_address_disp_p (rtx disp)
/* We need to check for both symbols and labels because VxWorks loads
text labels with @GOT rather than @GOTOFF. See gotoff_operand for
details. */
- return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
- || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
+ return (SYMBOL_REF_P (XVECEXP (disp, 0, 0))
+ || LABEL_REF_P (XVECEXP (disp, 0, 0)));
case UNSPEC_GOTOFF:
/* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
While ABI specify also 32bit relocation but we don't produce it in
small PIC model at all. */
- if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
- || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
+ if ((SYMBOL_REF_P (XVECEXP (disp, 0, 0))
+ || LABEL_REF_P (XVECEXP (disp, 0, 0)))
&& !TARGET_64BIT)
return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
return false;
@@ -11684,19 +11776,19 @@ legitimate_pic_address_disp_p (rtx disp)
if (saw_plus)
return false;
disp = XVECEXP (disp, 0, 0);
- return (GET_CODE (disp) == SYMBOL_REF
+ return (SYMBOL_REF_P (disp)
&& SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
case UNSPEC_NTPOFF:
disp = XVECEXP (disp, 0, 0);
- return (GET_CODE (disp) == SYMBOL_REF
+ return (SYMBOL_REF_P (disp)
&& SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
case UNSPEC_DTPOFF:
disp = XVECEXP (disp, 0, 0);
- return (GET_CODE (disp) == SYMBOL_REF
+ return (SYMBOL_REF_P (disp)
&& SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
case UNSPEC_SECREL32:
disp = XVECEXP (disp, 0, 0);
- return GET_CODE (disp) == SYMBOL_REF;
+ return SYMBOL_REF_P (disp);
}
return false;
@@ -12038,11 +12130,11 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict,
that never results in lea, this seems to be easier and
correct fix for crash to disable this test. */
}
- else if (GET_CODE (disp) != LABEL_REF
+ else if (!LABEL_REF_P (disp)
&& !CONST_INT_P (disp)
&& (GET_CODE (disp) != CONST
|| !ix86_legitimate_constant_p (Pmode, disp))
- && (GET_CODE (disp) != SYMBOL_REF
+ && (!SYMBOL_REF_P (disp)
|| !ix86_legitimate_constant_p (Pmode, disp)))
/* Displacement is not constant. */
return false;
@@ -12149,10 +12241,10 @@ legitimize_pic_address (rtx orig, rtx reg)
else
new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
}
- else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
+ else if ((SYMBOL_REF_P (addr) && SYMBOL_REF_TLS_MODEL (addr) == 0)
/* We can't always use @GOTOFF for text labels
on VxWorks, see gotoff_operand. */
- || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
+ || (TARGET_VXWORKS_VAROFF && LABEL_REF_P (addr)))
{
#if TARGET_PECOFF
rtx tmp = legitimize_pe_coff_symbol (addr, true);
@@ -12287,8 +12379,8 @@ legitimize_pic_address (rtx orig, rtx reg)
/* For %rip addressing, we have to use
just disp32, not base nor index. */
if (TARGET_64BIT
- && (GET_CODE (base) == SYMBOL_REF
- || GET_CODE (base) == LABEL_REF))
+ && (SYMBOL_REF_P (base)
+ || LABEL_REF_P (base)))
base = force_reg (mode, base);
if (GET_CODE (new_rtx) == PLUS
&& CONSTANT_P (XEXP (new_rtx, 1)))
@@ -12350,6 +12442,28 @@ static GTY(()) rtx ix86_tls_symbol;
static rtx
ix86_tls_get_addr (void)
{
+ if (cfun->machine->call_saved_registers
+ == TYPE_NO_CALLER_SAVED_REGISTERS)
+ {
+ /* __tls_get_addr doesn't preserve vector registers. When a
+ function with no_caller_saved_registers attribute calls
+ __tls_get_addr, YMM and ZMM registers will be clobbered.
+ Issue an error and suggest -mtls-dialect=gnu2 in this case. */
+ if (cfun->machine->func_type == TYPE_NORMAL)
+ error (G_("%<-mtls-dialect=gnu2%> must be used with a function"
+ " with the %<no_caller_saved_registers%> attribute"));
+ else
+ error (cfun->machine->func_type == TYPE_EXCEPTION
+ ? G_("%<-mtls-dialect=gnu2%> must be used with an"
+ " exception service routine")
+ : G_("%<-mtls-dialect=gnu2%> must be used with an"
+ " interrupt service routine"));
+ /* Don't issue the same error twice. */
+ cfun->machine->func_type = TYPE_NORMAL;
+ cfun->machine->call_saved_registers
+ = TYPE_DEFAULT_CALL_SAVED_REGISTERS;
+ }
+
if (!ix86_tls_symbol)
{
const char *sym
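
A hedged sketch of code that would now hit the new diagnostic when built with
-fpic -mtls-dialect=gnu, since global- and local-dynamic TLS accesses go
through __tls_get_addr:

    extern __thread int counter;

    /* __tls_get_addr clobbers YMM and ZMM registers, which this attribute
       promises to preserve, hence the error suggesting -mtls-dialect=gnu2.  */
    __attribute__ ((no_caller_saved_registers)) int
    bump (void)
    {
      return ++counter;
    }
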
@@ -12469,13 +12583,13 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
if (TARGET_64BIT)
{
rtx rax = gen_rtx_REG (Pmode, AX_REG);
+ rtx rdi = gen_rtx_REG (Pmode, DI_REG);
rtx_insn *insns;
start_sequence ();
emit_call_insn
- (gen_tls_global_dynamic_64 (Pmode, rax, x, caddr));
- insns = get_insns ();
- end_sequence ();
+ (gen_tls_global_dynamic_64 (Pmode, rax, x, caddr, rdi));
+ insns = end_sequence ();
if (GET_MODE (x) != Pmode)
x = gen_rtx_ZERO_EXTEND (Pmode, x);
@@ -12523,14 +12637,14 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
if (TARGET_64BIT)
{
rtx rax = gen_rtx_REG (Pmode, AX_REG);
+ rtx rdi = gen_rtx_REG (Pmode, DI_REG);
rtx_insn *insns;
rtx eqv;
start_sequence ();
emit_call_insn
- (gen_tls_local_dynamic_base_64 (Pmode, rax, caddr));
- insns = get_insns ();
- end_sequence ();
+ (gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi));
+ insns = end_sequence ();
/* Attach a unique REG_EQUAL, to allow the RTL optimizers to
share the LD_BASE result with other LD model accesses. */
@@ -12790,12 +12904,12 @@ ix86_legitimize_address (rtx x, rtx, machine_mode mode)
bool changed = false;
unsigned log;
- log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
+ log = SYMBOL_REF_P (x) ? SYMBOL_REF_TLS_MODEL (x) : 0;
if (log)
return legitimize_tls_address (x, (enum tls_model) log, false);
if (GET_CODE (x) == CONST
&& GET_CODE (XEXP (x, 0)) == PLUS
- && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
+ && SYMBOL_REF_P (XEXP (XEXP (x, 0), 0))
&& (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
{
rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
@@ -13212,7 +13326,7 @@ ix86_delegitimize_tls_address (rtx orig_x)
if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
return orig_x;
x = XVECEXP (unspec, 0, 0);
- gcc_assert (GET_CODE (x) == SYMBOL_REF);
+ gcc_assert (SYMBOL_REF_P (x));
if (unspec != XEXP (addr.disp, 0))
x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
if (addr.index)
@@ -13379,7 +13493,7 @@ ix86_delegitimize_address_1 (rtx x, bool base_term_p)
else if (base_term_p
&& pic_offset_table_rtx
&& !TARGET_MACHO
- && !TARGET_VXWORKS_RTP)
+ && !TARGET_VXWORKS_VAROFF)
{
rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
@@ -14576,7 +14690,7 @@ ix86_print_operand (FILE *file, rtx x, int code)
/* We have patterns that allow zero sets of memory, for instance.
In 64-bit mode, we should probably support all 8-byte vectors,
since we can in fact encode that into an immediate. */
- if (GET_CODE (x) == CONST_VECTOR)
+ if (CONST_VECTOR_P (x))
{
if (x != CONST0_RTX (GET_MODE (x)))
output_operand_lossage ("invalid vector immediate");
@@ -14606,8 +14720,8 @@ ix86_print_operand (FILE *file, rtx x, int code)
if (ASSEMBLER_DIALECT == ASM_ATT)
putc ('$', file);
}
- else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
- || GET_CODE (x) == LABEL_REF)
+ else if (GET_CODE (x) == CONST || SYMBOL_REF_P (x)
+ || LABEL_REF_P (x))
{
if (ASSEMBLER_DIALECT == ASM_ATT)
putc ('$', file);
@@ -14702,8 +14816,8 @@ ix86_print_operand_address_as (FILE *file, rtx addr,
&& CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
symbol = XEXP (XEXP (disp, 0), 0);
- if (GET_CODE (symbol) == LABEL_REF
- || (GET_CODE (symbol) == SYMBOL_REF
+ if (LABEL_REF_P (symbol)
+ || (SYMBOL_REF_P (symbol)
&& SYMBOL_REF_TLS_MODEL (symbol) == 0))
base = pc_rtx;
}
@@ -14791,7 +14905,7 @@ ix86_print_operand_address_as (FILE *file, rtx addr,
{
if (flag_pic)
output_pic_addr_const (file, disp, 0);
- else if (GET_CODE (disp) == LABEL_REF)
+ else if (LABEL_REF_P (disp))
output_asm_label (disp);
else
output_addr_const (file, disp);
@@ -14827,7 +14941,7 @@ ix86_print_operand_address_as (FILE *file, rtx addr,
if (flag_pic)
output_pic_addr_const (file, disp, 0);
- else if (GET_CODE (disp) == LABEL_REF)
+ else if (LABEL_REF_P (disp))
output_asm_label (disp);
else if (CONST_INT_P (disp))
offset = disp;
@@ -15779,7 +15893,7 @@ ix86_output_addr_diff_elt (FILE *file, int value, int rel)
gcc_assert (!TARGET_64BIT);
#endif
/* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
- if (TARGET_64BIT || TARGET_VXWORKS_RTP)
+ if (TARGET_64BIT || TARGET_VXWORKS_VAROFF)
fprintf (file, "%s%s%d-%s%d\n",
directive, LPREFIX, value, LPREFIX, rel);
#if TARGET_MACHO
@@ -16611,6 +16725,10 @@ ix86_convert_const_vector_to_integer (rtx op, machine_mode mode)
val = wi::insert (val, wv, innermode_bits * i, innermode_bits);
}
break;
+ case E_V1SImode:
+ case E_V1DImode:
+ op = CONST_VECTOR_ELT (op, 0);
+ return INTVAL (op);
case E_V2HFmode:
case E_V2BFmode:
case E_V4HFmode:
@@ -17586,8 +17704,8 @@ ix86_rip_relative_addr_p (struct ix86_address *parts)
&& CONST_INT_P (XEXP (symbol, 1)))
symbol = XEXP (symbol, 0);
- if (GET_CODE (symbol) == LABEL_REF
- || (GET_CODE (symbol) == SYMBOL_REF
+ if (LABEL_REF_P (symbol)
+ || (SYMBOL_REF_P (symbol)
&& SYMBOL_REF_TLS_MODEL (symbol) == 0)
|| (GET_CODE (symbol) == UNSPEC
&& (XINT (symbol, 1) == UNSPEC_GOTPCREL
@@ -19911,7 +20029,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
tree utype, ures, vce;
utype = unsigned_type_for (TREE_TYPE (arg0));
/* PABSB/W/D/Q store the unsigned result in dst, use ABSU_EXPR
- instead of ABS_EXPR to hanlde overflow case(TYPE_MIN). */
+ instead of ABS_EXPR to handle overflow case(TYPE_MIN). */
ures = gimple_build (&stmts, ABSU_EXPR, utype, arg0);
gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
loc = gimple_location (stmt);
@@ -20321,14 +20439,10 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
{
bool si;
enum ix86_builtins code;
- const machine_mode mode = TYPE_MODE (TREE_TYPE (vectype));
if (!TARGET_AVX512F)
return NULL_TREE;
- if (!TARGET_EVEX512 && GET_MODE_SIZE (mode) == 64)
- return NULL_TREE;
-
if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 2u)
? !TARGET_USE_SCATTER_2PARTS
: (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
@@ -21399,8 +21513,7 @@ ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
/* Register pair for mask registers. */
if (mode == P2QImode || mode == P2HImode)
return 2;
- if (mode == V64SFmode || mode == V64SImode)
- return 4;
+
return 1;
}
@@ -21450,7 +21563,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
- any of 512-bit wide vector mode
- any scalar mode. */
if (TARGET_AVX512F
- && ((VALID_AVX512F_REG_OR_XI_MODE (mode) && TARGET_EVEX512)
+ && ((VALID_AVX512F_REG_OR_XI_MODE (mode))
|| VALID_AVX512F_SCALAR_MODE (mode)))
return true;
@@ -21692,7 +21805,7 @@ ix86_set_reg_reg_cost (machine_mode mode)
case MODE_VECTOR_INT:
case MODE_VECTOR_FLOAT:
- if ((TARGET_AVX512F && TARGET_EVEX512 && VALID_AVX512F_REG_MODE (mode))
+ if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
|| (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
|| (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
|| (TARGET_SSE && VALID_SSE_REG_MODE (mode))
@@ -21753,7 +21866,7 @@ ix86_widen_mult_cost (const struct processor_costs *cost,
/* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
require extra 4 mul, 4 add, 4 cmp and 2 shift. */
if (!TARGET_SSE4_1 && !uns_p)
- extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
+ extra_cost = (cost->mulss + cost->sse_op + cost->sse_op) * 4
+ cost->sse_op * 2;
/* Fallthru. */
case V4DImode:
@@ -21803,11 +21916,11 @@ ix86_multiplication_cost (const struct processor_costs *cost,
else if (TARGET_AVX2)
nops += 2;
else if (TARGET_XOP)
- extra += cost->sse_load[2];
+ extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
else
{
nops += 1;
- extra += cost->sse_load[2];
+ extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
}
goto do_qimode;
@@ -21826,13 +21939,13 @@ ix86_multiplication_cost (const struct processor_costs *cost,
{
nmults += 1;
nops += 2;
- extra += cost->sse_load[2];
+ extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
}
else
{
nmults += 1;
nops += 4;
- extra += cost->sse_load[2];
+ extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
}
goto do_qimode;
@@ -21845,14 +21958,16 @@ ix86_multiplication_cost (const struct processor_costs *cost,
{
nmults += 1;
nops += 4;
- extra += cost->sse_load[3] * 2;
+ /* 2 loads, so no division by 2. */
+ extra += COSTS_N_INSNS (cost->sse_load[3]);
}
goto do_qimode;
case V64QImode:
nmults = 2;
nops = 9;
- extra = cost->sse_load[3] * 2 + cost->sse_load[4] * 2;
+ /* 2 loads of each size, so no division by 2. */
+ extra = COSTS_N_INSNS (cost->sse_load[3] + cost->sse_load[4]);
do_qimode:
return ix86_vec_cost (mode, cost->mulss * nmults
@@ -21945,7 +22060,7 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
/* Use vpbroadcast. */
extra = cost->sse_op;
else
- extra = cost->sse_load[2];
+ extra = COSTS_N_INSNS (cost->sse_load[2]) / 2;
if (constant_op1)
{
@@ -21976,7 +22091,7 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
shift with one insn set the cost to prefer paddb. */
if (constant_op1)
{
- extra = cost->sse_load[2];
+ extra = COSTS_N_INSNS (cost->sse_load[2]) / 2;
return ix86_vec_cost (mode, cost->sse_op) + extra;
}
else
@@ -21991,7 +22106,9 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
/* Use vpbroadcast. */
extra = cost->sse_op;
else
- extra = (mode == V16QImode) ? cost->sse_load[2] : cost->sse_load[3];
+ extra = COSTS_N_INSNS (mode == V16QImode
+ ? cost->sse_load[2]
+ : cost->sse_load[3]) / 2;
if (constant_op1)
{
@@ -22144,9 +22261,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
/* Handling different vternlog variants. */
if ((GET_MODE_SIZE (mode) == 64
- ? (TARGET_AVX512F && TARGET_EVEX512)
+ ? TARGET_AVX512F
: (TARGET_AVX512VL
- || (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)))
+ || (TARGET_AVX512F && !TARGET_PREFER_AVX256)))
&& GET_MODE_SIZE (mode) >= 16
&& outer_code_i == SET
&& ternlog_operand (x, mode))
@@ -22495,8 +22612,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
{
/* (ior (not ...) ...) can be a single insn in AVX512. */
if (GET_CODE (XEXP (x, 0)) == NOT && TARGET_AVX512F
- && ((TARGET_EVEX512
- && GET_MODE_SIZE (mode) == 64)
+ && (GET_MODE_SIZE (mode) == 64
|| (TARGET_AVX512VL
&& (GET_MODE_SIZE (mode) == 32
|| GET_MODE_SIZE (mode) == 16))))
@@ -22587,8 +22703,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
/* (and (not ...) (not ...)) can be a single insn in AVX512. */
if (GET_CODE (right) == NOT && TARGET_AVX512F
- && ((TARGET_EVEX512
- && GET_MODE_SIZE (mode) == 64)
+ && (GET_MODE_SIZE (mode) == 64
|| (TARGET_AVX512VL
&& (GET_MODE_SIZE (mode) == 32
|| GET_MODE_SIZE (mode) == 16))))
@@ -22658,8 +22773,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
{
/* (not (xor ...)) can be a single insn in AVX512. */
if (GET_CODE (XEXP (x, 0)) == XOR && TARGET_AVX512F
- && ((TARGET_EVEX512
- && GET_MODE_SIZE (mode) == 64)
+ && (GET_MODE_SIZE (mode) == 64
|| (TARGET_AVX512VL
&& (GET_MODE_SIZE (mode) == 32
|| GET_MODE_SIZE (mode) == 16))))
@@ -22948,7 +23062,17 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
}
/* This is masked instruction, assume the same cost,
as nonmasked variant. */
- else if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
+ else if (TARGET_AVX512F
+ && (register_operand (mask, GET_MODE (mask))
+		    /* Redundant cleanup of high bits for kmask with VL=2/4,
+		       i.e. (vec_merge op0, op1, (and op3 15)).  */
+ || (GET_CODE (mask) == AND
+ && register_operand (XEXP (mask, 0), GET_MODE (mask))
+ && CONST_INT_P (XEXP (mask, 1))
+ && ((INTVAL (XEXP (mask, 1)) == 3
+ && GET_MODE_NUNITS (mode) == 2)
+ || (INTVAL (XEXP (mask, 1)) == 15
+ && GET_MODE_NUNITS (mode) == 4)))))
{
*total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
+ rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed);
@@ -22987,7 +23111,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
/* Make (subreg:V4SI (not:V16QI (reg:V16QI ..)) 0)
cost the same as register.
This is used by avx_cmp<mode>3_ltint_not. */
- if (GET_CODE (unsop0) == SUBREG)
+ if (SUBREG_P (unsop0))
unsop0 = XEXP (unsop0, 0);
if (GET_CODE (unsop0) == NOT)
unsop0 = XEXP (unsop0, 0);
@@ -23029,7 +23153,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
So current solution is make constant disp as cheap as possible. */
if (GET_CODE (addr) == PLUS
&& x86_64_immediate_operand (XEXP (addr, 1), Pmode)
- /* Only hanlde (reg + disp) since other forms of addr are mostly LEA,
+ /* Only handle (reg + disp) since other forms of addr are mostly LEA,
there's no additional cost for the plus of disp. */
&& register_operand (XEXP (addr, 0), Pmode))
{
@@ -23258,7 +23382,9 @@ x86_this_parameter (tree function)
{
const int *parm_regs;
- if (ix86_function_type_abi (type) == MS_ABI)
+ if (lookup_attribute ("preserve_none", TYPE_ATTRIBUTES (type)))
+ parm_regs = x86_64_preserve_none_int_parameter_registers;
+ else if (ix86_function_type_abi (type) == MS_ABI)
parm_regs = x86_64_ms_abi_int_parameter_registers;
else
parm_regs = x86_64_int_parameter_registers;
@@ -23584,19 +23710,21 @@ x86_field_alignment (tree type, int computed)
/* Print call to TARGET to FILE. */
static void
-x86_print_call_or_nop (FILE *file, const char *target)
+x86_print_call_or_nop (FILE *file, const char *target,
+ const char *label)
{
if (flag_nop_mcount || !strcmp (target, "nop"))
/* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
- fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
+ fprintf (file, "%s" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n",
+ label);
else if (!TARGET_PECOFF && flag_pic)
{
gcc_assert (flag_plt);
- fprintf (file, "1:\tcall\t%s@PLT\n", target);
+ fprintf (file, "%s\tcall\t%s@PLT\n", label, target);
}
else
- fprintf (file, "1:\tcall\t%s\n", target);
+ fprintf (file, "%s\tcall\t%s\n", label, target);
}
static bool
@@ -23681,6 +23809,13 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
const char *mcount_name = MCOUNT_NAME;
+ bool fentry_section_p
+ = (flag_record_mcount
+ || lookup_attribute ("fentry_section",
+ DECL_ATTRIBUTES (current_function_decl)));
+
+ const char *label = fentry_section_p ? "1:" : "";
+
if (current_fentry_name (&mcount_name))
;
else if (fentry_name)
@@ -23716,11 +23851,12 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
reg = legacy_reg;
}
if (ASSEMBLER_DIALECT == ASM_INTEL)
- fprintf (file, "1:\tmovabs\t%s, OFFSET FLAT:%s\n"
- "\tcall\t%s\n", reg, mcount_name, reg);
+ fprintf (file, "%s\tmovabs\t%s, OFFSET FLAT:%s\n"
+ "\tcall\t%s\n", label, reg, mcount_name,
+ reg);
else
- fprintf (file, "1:\tmovabsq\t$%s, %%%s\n\tcall\t*%%%s\n",
- mcount_name, reg, reg);
+ fprintf (file, "%s\tmovabsq\t$%s, %%%s\n\tcall\t*%%%s\n",
+ label, mcount_name, reg, reg);
break;
case CM_LARGE_PIC:
#ifdef NO_PROFILE_COUNTERS
@@ -23761,21 +23897,21 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
if (!flag_plt)
{
if (ASSEMBLER_DIALECT == ASM_INTEL)
- fprintf (file, "1:\tcall\t[QWORD PTR %s@GOTPCREL[rip]]\n",
- mcount_name);
+ fprintf (file, "%s\tcall\t[QWORD PTR %s@GOTPCREL[rip]]\n",
+ label, mcount_name);
else
- fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n",
- mcount_name);
+ fprintf (file, "%s\tcall\t*%s@GOTPCREL(%%rip)\n",
+ label, mcount_name);
break;
}
/* fall through */
default:
- x86_print_call_or_nop (file, mcount_name);
+ x86_print_call_or_nop (file, mcount_name, label);
break;
}
}
else
- x86_print_call_or_nop (file, mcount_name);
+ x86_print_call_or_nop (file, mcount_name, label);
}
else if (flag_pic)
{
@@ -23790,11 +23926,13 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
LPREFIX, labelno);
#endif
if (flag_plt)
- x86_print_call_or_nop (file, mcount_name);
+ x86_print_call_or_nop (file, mcount_name, label);
else if (ASSEMBLER_DIALECT == ASM_INTEL)
- fprintf (file, "1:\tcall\t[DWORD PTR %s@GOT[ebx]]\n", mcount_name);
+ fprintf (file, "%s\tcall\t[DWORD PTR %s@GOT[ebx]]\n",
+ label, mcount_name);
else
- fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
+ fprintf (file, "%s\tcall\t*%s@GOT(%%ebx)\n",
+ label, mcount_name);
}
else
{
@@ -23807,12 +23945,10 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
fprintf (file, "\tmovl\t$%sP%d, %%" PROFILE_COUNT_REGISTER "\n",
LPREFIX, labelno);
#endif
- x86_print_call_or_nop (file, mcount_name);
+ x86_print_call_or_nop (file, mcount_name, label);
}
- if (flag_record_mcount
- || lookup_attribute ("fentry_section",
- DECL_ATTRIBUTES (current_function_decl)))
+ if (fentry_section_p)
{
const char *sname = "__mcount_loc";
@@ -24571,7 +24707,7 @@ ix86_vector_mode_supported_p (machine_mode mode)
return true;
if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
return true;
- if (TARGET_AVX512F && TARGET_EVEX512 && VALID_AVX512F_REG_MODE (mode))
+ if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
return true;
if ((TARGET_MMX || TARGET_MMX_WITH_SSE)
&& VALID_MMX_REG_MODE (mode))
@@ -24673,6 +24809,12 @@ static void map_egpr_constraints (vec<const char *> &constraints)
buf.safe_push (cur[j + 1]);
j++;
break;
+ case '{':
+ do
+ {
+ buf.safe_push (cur[j]);
+ } while (cur[j++] != '}');
+ break;
default:
buf.safe_push (cur[j]);
break;
@@ -24819,8 +24961,7 @@ ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> & /*inputs*/,
}
}
- rtx_insn *seq = get_insns ();
- end_sequence ();
+ rtx_insn *seq = end_sequence ();
if (saw_asm_flag)
return seq;
@@ -25091,20 +25232,14 @@ asm_preferred_eh_data_format (int code, int global)
return DW_EH_PE_absptr;
}
-/* Implement targetm.vectorize.builtin_vectorization_cost. */
+/* Worker for ix86_builtin_vectorization_cost and the fallback calls
+ from ix86_vector_costs::add_stmt_cost. */
static int
-ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype, int)
+ix86_default_vector_cost (enum vect_cost_for_stmt type_of_cost,
+ machine_mode mode)
{
- bool fp = false;
- machine_mode mode = TImode;
+ bool fp = FLOAT_MODE_P (mode);
int index;
- if (vectype != NULL)
- {
- fp = FLOAT_TYPE_P (vectype);
- mode = TYPE_MODE (vectype);
- }
-
switch (type_of_cost)
{
case scalar_stmt:
@@ -25163,14 +25298,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
COSTS_N_INSNS
(ix86_cost->gather_static
+ ix86_cost->gather_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ * GET_MODE_NUNITS (mode)) / 2);
case vector_scatter_store:
return ix86_vec_cost (mode,
COSTS_N_INSNS
(ix86_cost->scatter_static
+ ix86_cost->scatter_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ * GET_MODE_NUNITS (mode)) / 2);
case cond_branch_taken:
return ix86_cost->cond_taken_branch_cost;
@@ -25188,7 +25323,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
case vec_construct:
{
- int n = TYPE_VECTOR_SUBPARTS (vectype);
+ int n = GET_MODE_NUNITS (mode);
/* N - 1 element inserts into an SSE vector, the possible
GPR -> XMM move is accounted for in add_stmt_cost. */
if (GET_MODE_BITSIZE (mode) <= 128)
@@ -25196,12 +25331,18 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
/* One vinserti128 for combining two SSE vectors for AVX256. */
else if (GET_MODE_BITSIZE (mode) == 256)
return ((n - 2) * ix86_cost->sse_op
- + ix86_vec_cost (mode, ix86_cost->addss));
+ + ix86_vec_cost (mode, ix86_cost->sse_op));
/* One vinserti64x4 and two vinserti128 for combining SSE
and AVX256 vectors to AVX512. */
else if (GET_MODE_BITSIZE (mode) == 512)
- return ((n - 4) * ix86_cost->sse_op
- + 3 * ix86_vec_cost (mode, ix86_cost->addss));
+ {
+ machine_mode half_mode
+ = mode_for_vector (GET_MODE_INNER (mode),
+ GET_MODE_NUNITS (mode) / 2).require ();
+ return ((n - 4) * ix86_cost->sse_op
+ + 2 * ix86_vec_cost (half_mode, ix86_cost->sse_op)
+ + ix86_vec_cost (mode, ix86_cost->sse_op));
+ }
gcc_unreachable ();
}
@@ -25210,6 +25351,17 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
}
}
+/* Implement targetm.vectorize.builtin_vectorization_cost. */
+static int
+ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+ tree vectype, int)
+{
+ machine_mode mode = TImode;
+ if (vectype != NULL)
+ mode = TYPE_MODE (vectype);
+ return ix86_default_vector_cost (type_of_cost, mode);
+}
+
/* This function returns the calling abi specific va_list type node.
It returns the FNDECL specific va_list type. */
@@ -25369,7 +25521,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
switch (mode)
{
case E_QImode:
- if (TARGET_AVX512BW && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
return V64QImode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V32QImode;
@@ -25377,7 +25529,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
return V16QImode;
case E_HImode:
- if (TARGET_AVX512BW && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
return V32HImode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V16HImode;
@@ -25385,7 +25537,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
return V8HImode;
case E_SImode:
- if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
return V16SImode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V8SImode;
@@ -25393,7 +25545,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
return V4SImode;
case E_DImode:
- if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
return V8DImode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V4DImode;
@@ -25407,16 +25559,15 @@ ix86_preferred_simd_mode (scalar_mode mode)
{
if (TARGET_PREFER_AVX128)
return V8HFmode;
- else if (TARGET_PREFER_AVX256 || !TARGET_EVEX512)
+ else if (TARGET_PREFER_AVX256)
return V16HFmode;
}
- if (TARGET_EVEX512)
- return V32HFmode;
+ return V32HFmode;
}
return word_mode;
case E_BFmode:
- if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
return V32BFmode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V16BFmode;
@@ -25424,7 +25575,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
return V8BFmode;
case E_SFmode:
- if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
return V16SFmode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V8SFmode;
@@ -25432,7 +25583,7 @@ ix86_preferred_simd_mode (scalar_mode mode)
return V4SFmode;
case E_DFmode:
- if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
return V8DFmode;
else if (TARGET_AVX && !TARGET_PREFER_AVX128)
return V4DFmode;
@@ -25452,13 +25603,13 @@ ix86_preferred_simd_mode (scalar_mode mode)
static unsigned int
ix86_autovectorize_vector_modes (vector_modes *modes, bool all)
{
- if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+ if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
{
modes->safe_push (V64QImode);
modes->safe_push (V32QImode);
modes->safe_push (V16QImode);
}
- else if (TARGET_AVX512F && TARGET_EVEX512 && all)
+ else if (TARGET_AVX512F && all)
{
modes->safe_push (V32QImode);
modes->safe_push (V16QImode);
@@ -25496,7 +25647,7 @@ ix86_get_mask_mode (machine_mode data_mode)
unsigned elem_size = vector_size / nunits;
/* Scalar mask case. */
- if ((TARGET_AVX512F && TARGET_EVEX512 && vector_size == 64)
+ if ((TARGET_AVX512F && vector_size == 64)
|| (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16))
/* AVX512FP16 only supports vector comparison
to kmask for _Float16. */
@@ -25664,7 +25815,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
unsigned
ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, slp_tree node,
- tree vectype, int misalign,
+ tree vectype, int,
vect_cost_model_location where)
{
unsigned retval = 0;
@@ -25682,6 +25833,14 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
if (scalar_p)
mode = TYPE_MODE (TREE_TYPE (vectype));
}
+ /* When we are costing a scalar stmt use the scalar stmt to get at the
+ type of the operation. */
+ else if (scalar_p && stmt_info)
+ if (tree lhs = gimple_get_lhs (stmt_info->stmt))
+ {
+ fp = FLOAT_TYPE_P (TREE_TYPE (lhs));
+ mode = TYPE_MODE (TREE_TYPE (lhs));
+ }
if ((kind == vector_stmt || kind == scalar_stmt)
&& stmt_info
@@ -25995,32 +26154,24 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
(AGU and load ports). Try to account for this by scaling the
construction cost by the number of elements involved. */
if ((kind == vec_construct || kind == vec_to_scalar)
- && ((stmt_info
- && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
- || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
- && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+ && ((node
+ && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
+ || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP
+ && SLP_TREE_LANES (node) == 1))
+ && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
+ (SLP_TREE_REPRESENTATIVE (node))))
!= INTEGER_CST))
- || (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
- == VMAT_GATHER_SCATTER)))
- || (node
- && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
- || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP
- && SLP_TREE_LANES (node) == 1))
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
- (SLP_TREE_REPRESENTATIVE (node))))
- != INTEGER_CST))
- || (SLP_TREE_MEMORY_ACCESS_TYPE (node)
- == VMAT_GATHER_SCATTER)))))
- {
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ || (SLP_TREE_MEMORY_ACCESS_TYPE (node)
+ == VMAT_GATHER_SCATTER)))))
+ {
+ stmt_cost = ix86_default_vector_cost (kind, mode);
stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
}
else if ((kind == vec_construct || kind == scalar_to_vec)
&& node
&& SLP_TREE_DEF_TYPE (node) == vect_external_def)
{
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ stmt_cost = ix86_default_vector_cost (kind, mode);
unsigned i;
tree op;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
@@ -26060,7 +26211,22 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
else
{
m_num_gpr_needed[where]++;
- stmt_cost += ix86_cost->sse_to_integer;
+
+ int cost = COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2;
+
+ /* For integer construction, the number of actual GPR -> XMM
+ moves will be somewhere between 0 and n.
+	       We do not have a very good idea of the actual number, since
+	       the source may be a constant, memory, or a chain of
+	       instructions that will later be converted by the
+	       scalar-to-vector pass.  */
+ if (kind == vec_construct
+ && GET_MODE_BITSIZE (mode) == 256)
+ cost *= 2;
+ else if (kind == vec_construct
+ && GET_MODE_BITSIZE (mode) == 512)
+ cost *= 3;
+ stmt_cost += cost;
}
}
}
@@ -26069,7 +26235,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
TREE_VISITED (op) = 0;
}
if (stmt_cost == -1)
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ stmt_cost = ix86_default_vector_cost (kind, mode);
if (kind == vec_perm && vectype
&& GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
@@ -26152,14 +26318,10 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
/* When X86_TUNE_AVX512_TWO_EPILOGUES is enabled arrange for both
a AVX2 and a SSE epilogue for AVX512 vectorized loops. */
if (loop_vinfo
+ && LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32
&& ix86_tune_features[X86_TUNE_AVX512_TWO_EPILOGUES])
- {
- if (GET_MODE_SIZE (loop_vinfo->vector_mode) == 64)
- m_suggested_epilogue_mode = V32QImode;
- else if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
- && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32)
- m_suggested_epilogue_mode = V16QImode;
- }
+ m_suggested_epilogue_mode = V16QImode;
/* When a 128bit SSE vectorized epilogue still has a VF of 16 or larger
enable a 64bit SSE epilogue. */
if (loop_vinfo
@@ -26168,6 +26330,65 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
&& LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () >= 16)
m_suggested_epilogue_mode = V8QImode;
+  /* When X86_TUNE_AVX512_MASKED_EPILOGUES is enabled, try to use
+ a masked epilogue if that doesn't seem detrimental. */
+ if (loop_vinfo
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+ && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () > 2
+ && ix86_tune_features[X86_TUNE_AVX512_MASKED_EPILOGUES]
+ && !OPTION_SET_P (param_vect_partial_vector_usage))
+ {
+ bool avoid = false;
+ if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+ {
+ unsigned int peel_niter
+ = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+ peel_niter += 1;
+ /* When we know the number of scalar iterations of the epilogue,
+ avoid masking when a single vector epilog iteration handles
+ it in full. */
+ if (pow2p_hwi ((LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter)
+ % LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ()))
+ avoid = true;
+ }
+ if (!avoid && loop_outer (loop_outer (LOOP_VINFO_LOOP (loop_vinfo))))
+ for (auto ddr : LOOP_VINFO_DDRS (loop_vinfo))
+ {
+ if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+ ;
+ else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
+ ;
+ else
+ {
+ int loop_depth
+ = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
+ DDR_LOOP_NEST (ddr));
+ if (DDR_NUM_DIST_VECTS (ddr) == 1
+ && DDR_DIST_VECTS (ddr)[0][loop_depth] == 0)
+ {
+ /* Avoid the case where an outer loop traverses a
+ multi-dimensional array and the inner loop only
+ executes the masked epilogue for a read-write
+ access: the next outer iteration could then read
+ from the masked-out part of the previous write,
+ e.g. with 'n' filling only half a vector.
+ for (j = 0; j < m; ++j)
+ for (i = 0; i < n; ++i)
+ a[j][i] = c * a[j][i]; */
+ avoid = true;
+ break;
+ }
+ }
+ }
+ if (!avoid)
+ {
+ m_suggested_epilogue_mode = loop_vinfo->vector_mode;
+ m_masked_epilogue = 1;
+ }
+ }
+
vector_costs::finish_cost (scalar_costs);
}
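(A worked example of the masked-epilogue heuristic above; the numbers and the command line are illustrative, not from the patch.) With a main-loop vectorization factor of 16, no peeling, and 24 known scalar iterations, the epilogue is left with 24 % 16 = 8 iterations; 8 is a power of two, so unmasked vector epilogues handle it exactly and masking is skipped. The heuristic also yields to an explicit user setting: assuming the usual spelling of the vectorizer parameter, something like

    gcc -O3 -march=x86-64-v4 --param vect-partial-vector-usage=0 ...

keeps masked epilogues off regardless of X86_TUNE_AVX512_MASKED_EPILOGUES, while a value of 2 lets the vectorizer consider partial vectors for main loops as well.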
@@ -26287,7 +26508,7 @@ ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
{
/* If the function isn't exported, we can pick up just one ISA
for the clones. */
- if (TARGET_AVX512F && TARGET_EVEX512)
+ if (TARGET_AVX512F)
clonei->vecsize_mangle = 'e';
else if (TARGET_AVX2)
clonei->vecsize_mangle = 'd';
@@ -26379,17 +26600,17 @@ ix86_simd_clone_usable (struct cgraph_node *node, machine_mode)
return -1;
if (!TARGET_AVX)
return 0;
- return (TARGET_AVX512F && TARGET_EVEX512) ? 3 : TARGET_AVX2 ? 2 : 1;
+ return TARGET_AVX512F ? 3 : TARGET_AVX2 ? 2 : 1;
case 'c':
if (!TARGET_AVX)
return -1;
- return (TARGET_AVX512F && TARGET_EVEX512) ? 2 : TARGET_AVX2 ? 1 : 0;
+ return TARGET_AVX512F ? 2 : TARGET_AVX2 ? 1 : 0;
case 'd':
if (!TARGET_AVX2)
return -1;
- return (TARGET_AVX512F && TARGET_EVEX512) ? 1 : 0;
+ return TARGET_AVX512F ? 1 : 0;
case 'e':
- if (!TARGET_AVX512F || !TARGET_EVEX512)
+ if (!TARGET_AVX512F)
return -1;
return 0;
default:
@@ -26604,7 +26825,7 @@ ix86_reloc_rw_mask (void)
static bool
symbolic_base_address_p (rtx addr)
{
- if (GET_CODE (addr) == SYMBOL_REF)
+ if (SYMBOL_REF_P (addr))
return true;
if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_GOTOFF)
@@ -28061,6 +28282,195 @@ ix86_cannot_copy_insn_p (rtx_insn *insn)
#undef TARGET_DOCUMENTATION_NAME
#define TARGET_DOCUMENTATION_NAME "x86"
+/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
+sbitmap
+ix86_get_separate_components (void)
+{
+ HOST_WIDE_INT offset, to_allocate;
+ sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
+ bitmap_clear (components);
+ struct machine_function *m = cfun->machine;
+
+ offset = m->frame.stack_pointer_offset;
+ to_allocate = offset - m->frame.sse_reg_save_offset;
+
+ /* Shrink wrap separate uses MOV, which means APX PPX cannot be used.
+ Experiments show that APX PPX can speed up the prologue: if the function
+ does not exit early during actual execution, then using APX PPX is faster.
+ If the function always exits early during actual execution, then shrink
+ wrap separate reduces the number of MOV (PUSH/POP) instructions actually
+ executed, thus speeding up execution, as in:
+ foo:
+ movl $1, %eax
+ testq %rdi, %rdi
+ jne .L60
+ ret ---> early return.
+ .L60:
+ subq $88, %rsp ---> belongs to the prologue.
+ xorl %eax, %eax
+ movq %rbx, 40(%rsp) ---> belongs to the prologue.
+ movq 8(%rdi), %rbx
+ movq %rbp, 48(%rsp) ---> belongs to the prologue.
+ movq %rdi, %rbp
+ testq %rbx, %rbx
+ jne .L61
+ movq 40(%rsp), %rbx
+ movq 48(%rsp), %rbp
+ addq $88, %rsp
+ ret
+ .L61:
+ movq %r12, 56(%rsp) ---> belongs to the prologue.
+ movq %r13, 64(%rsp) ---> belongs to the prologue.
+ movq %r14, 72(%rsp) ---> belongs to the prologue.
+ ... ...
+
+ Disable shrink wrap separate when PPX is enabled. */
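+ /* Separate shrink wrapping is also unsuitable for non-normal
+ functions (e.g. interrupt handlers), SEH, stack realignment, and
+ the ms2sysv save/restore stubs, all checked below. */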
+ if ((TARGET_APX_PPX && !crtl->calls_eh_return)
+ || cfun->machine->func_type != TYPE_NORMAL
+ || TARGET_SEH
+ || crtl->stack_realign_needed
+ || m->call_ms2sysv)
+ return components;
+
+ /* Since shrink wrapping separate uses MOV instead of PUSH/POP,
+ disable shrink wrap separate when MOV is prohibited. */
+ if (save_regs_using_push_pop (to_allocate))
+ return components;
+
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ /* Skip registers with large offsets, where a pseudo may be needed. */
+ if (IN_RANGE (offset, -0x8000, 0x7fff))
+ bitmap_set_bit (components, regno);
+ offset += UNITS_PER_WORD;
+ }
+
+ /* Don't mess with the following registers. */
+ if (frame_pointer_needed)
+ bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
+
+ if (crtl->drap_reg)
+ bitmap_clear_bit (components, REGNO (crtl->drap_reg));
+
+ if (pic_offset_table_rtx)
+ bitmap_clear_bit (components, REAL_PIC_OFFSET_TABLE_REGNUM);
+
+ return components;
+}
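(An illustrative C sketch, not part of the patch; the type and function names are made up.) A function of the shape below produces the early-return pattern shown in the comment above: the hot early exit touches no callee-saved registers, so emitting their MOV saves as separate components keeps them out of the fast path.

    struct node { struct node *next; long val; };

    long
    foo (struct node *p)
    {
      if (!p)
        return 1;                /* hot path: early return, no saves executed */
      long sum = 0;
      for (struct node *q = p->next; q; q = q->next)
        sum += q->val;           /* cold path: likely needs callee-saved registers */
      return sum;
    }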
+
+/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
+sbitmap
+ix86_components_for_bb (basic_block bb)
+{
+ bitmap in = DF_LIVE_IN (bb);
+ bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
+ bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
+
+ sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
+ bitmap_clear (components);
+
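+ /* Calls in this block may clobber registers that the current
+ function's ABI treats as call-saved; those registers also need
+ their save component active here. */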
+ function_abi_aggregator callee_abis;
+ rtx_insn *insn;
+ FOR_BB_INSNS (bb, insn)
+ if (CALL_P (insn))
+ callee_abis.note_callee_abi (insn_callee_abi (insn));
+ HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
+
+ /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (!fixed_regs[regno]
+ && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
+ || bitmap_bit_p (in, regno)
+ || bitmap_bit_p (gen, regno)
+ || bitmap_bit_p (kill, regno)))
+ bitmap_set_bit (components, regno);
+
+ return components;
+}
+
+/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS. */
+void
+ix86_disqualify_components (sbitmap, edge, sbitmap, bool)
+{
+ /* Nothing to do for x86. */
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
+void
+ix86_emit_prologue_components (sbitmap components)
+{
+ HOST_WIDE_INT cfa_offset;
+ struct machine_function *m = cfun->machine;
+
+ cfa_offset = m->frame.reg_save_offset + m->fs.sp_offset
+ - m->frame.stack_pointer_offset;
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ if (bitmap_bit_p (components, regno))
+ ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
+ cfa_offset -= UNITS_PER_WORD;
+ }
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
+void
+ix86_emit_epilogue_components (sbitmap components)
+{
+ HOST_WIDE_INT cfa_offset;
+ struct machine_function *m = cfun->machine;
+ cfa_offset = m->frame.reg_save_offset + m->fs.sp_offset
+ - m->frame.stack_pointer_offset;
+
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ if (bitmap_bit_p (components, regno))
+ {
+ rtx reg = gen_rtx_REG (word_mode, regno);
+ rtx mem;
+ rtx_insn *insn;
+
+ mem = choose_baseaddr (cfa_offset, NULL);
+ mem = gen_frame_mem (word_mode, mem);
+ insn = emit_move_insn (reg, mem);
+
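+ /* Mark the restore as frame related and note it so the CFI
+ records this register as restored to its entry value. */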
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_CFA_RESTORE, reg);
+ }
+ cfa_offset -= UNITS_PER_WORD;
+ }
+}
+
+/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
+void
+ix86_set_handled_components (sbitmap components)
+{
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (bitmap_bit_p (components, regno))
+ {
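+ /* The save and restore of this register are now emitted by the
+ hooks above; the normal prologue and epilogue must skip it and
+ use MOV-based saves for the registers they still handle. */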
+ cfun->machine->reg_is_wrapped_separately[regno] = true;
+ cfun->machine->use_fast_prologue_epilogue = true;
+ cfun->machine->frame.save_regs_using_mov = true;
+ }
+}
+
+#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
+#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS ix86_get_separate_components
+#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
+#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB ix86_components_for_bb
+#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
+#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS ix86_disqualify_components
+#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
+ ix86_emit_prologue_components
+#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
+ ix86_emit_epilogue_components
+#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
+#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS ix86_set_handled_components
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-i386.h"