Diffstat (limited to 'gcc/config/i386/i386.cc')
-rw-r--r-- | gcc/config/i386/i386.cc | 1490 |
1 files changed, 1211 insertions, 279 deletions
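The headline functional change in this diff is a new preserve_none calling convention, modeled on Clang's attribute of the same name: integer arguments are passed in r12, r13, r14, r15, rdi, rsi (see the new x86_64_preserve_none_int_parameter_registers array below), and the attribute is only honored for 64-bit code. A minimal usage sketch follows; the attribute spelling comes from the patch itself, while the VM-style functions and the register comments are illustrative assumptions, not part of the patch:

/* Sketch only: exercises the preserve_none convention added by this patch.
   The attribute name matches the lookups added below; the functions are
   hypothetical.  Compilers without the patch merely warn about an unknown
   attribute.  */

struct vm_state { long acc; };

__attribute__((preserve_none))
static void op_halt (struct vm_state *st, long imm)
{
  st->acc += imm;              /* first two args expected in r12 and r13 */
}

__attribute__((preserve_none))
static void op_add (struct vm_state *st, long imm)
{
  st->acc += imm;
  op_halt (st, 0);             /* no callee-saved registers to restore,
                                  so this can be a cheap sibling call */
}

int main (void)
{
  struct vm_state st = { 0 };
  op_add (&st, 42);
  return st.acc == 42 ? 0 : 1;
}

Because preserve_none changes the parameter-passing registers, the patch also teaches the type-compatibility and sibcall checks about it, mirroring the existing no_callee_saved_registers handling, as the first hunks below show.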
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 28603c2..313522b 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -335,6 +335,14 @@ static int const x86_64_ms_abi_int_parameter_registers[4] = CX_REG, DX_REG, R8_REG, R9_REG }; +/* Similar as Clang's preserve_none function parameter passing. + NB: Use DI_REG and SI_REG, see ix86_function_value_regno_p. */ + +static int const x86_64_preserve_none_int_parameter_registers[6] = +{ + R12_REG, R13_REG, R14_REG, R15_REG, DI_REG, SI_REG +}; + static int const x86_64_int_return_registers[4] = { AX_REG, DX_REG, DI_REG, SI_REG @@ -460,7 +468,8 @@ int ix86_arch_specified; red-zone. NB: Don't use red-zone for functions with no_caller_saved_registers - and 32 GPRs since 128-byte red-zone is too small for 31 GPRs. + and 32 GPRs or 16 XMM registers since 128-byte red-zone is too small + for 31 GPRs or 15 GPRs + 16 XMM registers. TODO: If we can reserve the first 2 WORDs, for PUSH and, another for CALL, in red-zone, we can allow local indirect jumps with @@ -471,7 +480,7 @@ ix86_using_red_zone (void) { return (TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI - && (!TARGET_APX_EGPR + && ((!TARGET_APX_EGPR && !TARGET_SSE) || (cfun->machine->call_saved_registers != TYPE_NO_CALLER_SAVED_REGISTERS)) && (!cfun->machine->has_local_indirect_jump @@ -898,6 +907,18 @@ x86_64_elf_unique_section (tree decl, int reloc) default_unique_section (decl, reloc); } +/* Return true if TYPE has no_callee_saved_registers or preserve_none + attribute. */ + +bool +ix86_type_no_callee_saved_registers_p (const_tree type) +{ + return (lookup_attribute ("no_callee_saved_registers", + TYPE_ATTRIBUTES (type)) != NULL + || lookup_attribute ("preserve_none", + TYPE_ATTRIBUTES (type)) != NULL); +} + #ifdef COMMON_ASM_OP #ifndef LARGECOMM_SECTION_ASM_OP @@ -1019,11 +1040,10 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) /* Sibling call isn't OK if callee has no callee-saved registers and the calling function has callee-saved registers. */ - if (cfun->machine->call_saved_registers != TYPE_NO_CALLEE_SAVED_REGISTERS - && (cfun->machine->call_saved_registers - != TYPE_NO_CALLEE_SAVED_REGISTERS_EXCEPT_BP) - && lookup_attribute ("no_callee_saved_registers", - TYPE_ATTRIBUTES (type))) + if ((cfun->machine->call_saved_registers + != TYPE_NO_CALLEE_SAVED_REGISTERS) + && cfun->machine->call_saved_registers != TYPE_PRESERVE_NONE + && ix86_type_no_callee_saved_registers_p (type)) return false; /* If outgoing reg parm stack space changes, we cannot do sibcall. */ @@ -1188,10 +1208,16 @@ ix86_comp_type_attributes (const_tree type1, const_tree type2) != ix86_function_regparm (type2, NULL)) return 0; - if (lookup_attribute ("no_callee_saved_registers", - TYPE_ATTRIBUTES (type1)) - != lookup_attribute ("no_callee_saved_registers", - TYPE_ATTRIBUTES (type2))) + if (ix86_type_no_callee_saved_registers_p (type1) + != ix86_type_no_callee_saved_registers_p (type2)) + return 0; + + /* preserve_none attribute uses a different calling convention is + only for 64-bit. 
*/ + if (TARGET_64BIT + && (lookup_attribute ("preserve_none", TYPE_ATTRIBUTES (type1)) + != lookup_attribute ("preserve_none", + TYPE_ATTRIBUTES (type2)))) return 0; return 1; @@ -1553,7 +1579,10 @@ ix86_function_arg_regno_p (int regno) if (call_abi == SYSV_ABI && regno == AX_REG) return true; - if (call_abi == MS_ABI) + if (cfun + && cfun->machine->call_saved_registers == TYPE_PRESERVE_NONE) + parm_regs = x86_64_preserve_none_int_parameter_registers; + else if (call_abi == MS_ABI) parm_regs = x86_64_ms_abi_int_parameter_registers; else parm_regs = x86_64_int_parameter_registers; @@ -1716,6 +1745,19 @@ ix86_asm_output_function_label (FILE *out_file, const char *fname, } } +/* Output a user-defined label. In AT&T syntax, registers are prefixed + with %, so labels require no punctuation. In Intel syntax, registers + are unprefixed, so labels may clash with registers or other operators, + and require quoting. */ +void +ix86_asm_output_labelref (FILE *file, const char *prefix, const char *label) +{ + if (ASSEMBLER_DIALECT == ASM_ATT) + fprintf (file, "%s%s", prefix, label); + else + fprintf (file, "\"%s%s\"", prefix, label); +} + /* Implementation of call abi switching target hook. Specific to FNDECL the specific call register sets are set. See also ix86_conditional_register_usage for more details. */ @@ -1795,8 +1837,7 @@ ix86_init_pic_reg (void) add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); } - seq = get_insns (); - end_sequence (); + seq = end_sequence (); entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)); insert_insn_on_edge (seq, entry_edge); @@ -1823,6 +1864,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ memset (cum, 0, sizeof (*cum)); + tree preserve_none_type; if (fndecl) { target = cgraph_node::get (fndecl); @@ -1831,12 +1873,24 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ target = target->function_symbol (); local_info_node = cgraph_node::local_info_node (target->decl); cum->call_abi = ix86_function_abi (target->decl); + preserve_none_type = TREE_TYPE (target->decl); } else - cum->call_abi = ix86_function_abi (fndecl); + { + cum->call_abi = ix86_function_abi (fndecl); + preserve_none_type = TREE_TYPE (fndecl); + } } else - cum->call_abi = ix86_function_type_abi (fntype); + { + cum->call_abi = ix86_function_type_abi (fntype); + preserve_none_type = fntype; + } + cum->preserve_none_abi + = (preserve_none_type + && (lookup_attribute ("preserve_none", + TYPE_ATTRIBUTES (preserve_none_type)) + != nullptr)); cum->caller = caller; @@ -1998,8 +2052,7 @@ type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum, if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type) && GET_MODE_INNER (mode) == innermode) { - if (size == 64 && (!TARGET_AVX512F || !TARGET_EVEX512) - && !TARGET_IAMCU) + if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU) { static bool warnedavx512f; static bool warnedavx512f_ret; @@ -3410,9 +3463,15 @@ function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, break; } + const int *parm_regs; + if (cum->preserve_none_abi) + parm_regs = x86_64_preserve_none_int_parameter_registers; + else + parm_regs = x86_64_int_parameter_registers; + return construct_container (mode, orig_mode, type, 0, cum->nregs, cum->sse_nregs, - &x86_64_int_parameter_registers [cum->regno], + &parm_regs[cum->regno], cum->sse_regno); } @@ -4422,7 +4481,7 @@ ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) /* AVX512F values are returned in ZMM0 if available. 
*/ if (size == 64) - return !TARGET_AVX512F || !TARGET_EVEX512; + return !TARGET_AVX512F; } if (mode == XFmode) @@ -4577,6 +4636,12 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) if (max > X86_64_REGPARM_MAX) max = X86_64_REGPARM_MAX; + const int *parm_regs; + if (cum->preserve_none_abi) + parm_regs = x86_64_preserve_none_int_parameter_registers; + else + parm_regs = x86_64_int_parameter_registers; + for (i = cum->regno; i < max; i++) { mem = gen_rtx_MEM (word_mode, @@ -4584,8 +4649,7 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) MEM_NOTRAP_P (mem) = 1; set_mem_alias_set (mem, set); emit_move_insn (mem, - gen_rtx_REG (word_mode, - x86_64_int_parameter_registers[i])); + gen_rtx_REG (word_mode, parm_regs[i])); } if (ix86_varargs_fpr_size) @@ -4739,8 +4803,7 @@ ix86_va_start (tree valist, rtx nextarg) start_sequence (); emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno)); - seq = get_insns (); - end_sequence (); + seq = end_sequence (); push_topmost_sequence (); emit_insn_after (seq, entry_of_function ()); @@ -5180,6 +5243,27 @@ ix86_check_movabs (rtx insn, int opnum) return volatile_ok || !MEM_VOLATILE_P (mem); } +/* Return true if XVECEXP idx of INSN satisfies MOVS arguments. */ +bool +ix86_check_movs (rtx insn, int idx) +{ + rtx pat = PATTERN (insn); + gcc_assert (GET_CODE (pat) == PARALLEL); + + rtx set = XVECEXP (pat, 0, idx); + gcc_assert (GET_CODE (set) == SET); + + rtx dst = SET_DEST (set); + gcc_assert (MEM_P (dst)); + + rtx src = SET_SRC (set); + gcc_assert (MEM_P (src)); + + return (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst)) + && (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)) + || Pmode == word_mode)); +} + /* Return false if INSN contains a MEM with a non-default address space. */ bool ix86_check_no_addr_space (rtx insn) @@ -5356,7 +5440,7 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) switch (GET_MODE_SIZE (mode)) { case 64: - if (TARGET_AVX512F && TARGET_EVEX512) + if (TARGET_AVX512F) return 2; break; case 32: @@ -5409,10 +5493,8 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) { if (TARGET_AVX512VL) return "vpxord\t%x0, %x0, %x0"; - else if (TARGET_EVEX512) - return "vpxord\t%g0, %g0, %g0"; else - gcc_unreachable (); + return "vpxord\t%g0, %g0, %g0"; } return "vpxor\t%x0, %x0, %x0"; @@ -5428,19 +5510,15 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) { if (TARGET_AVX512VL) return "vxorpd\t%x0, %x0, %x0"; - else if (TARGET_EVEX512) - return "vxorpd\t%g0, %g0, %g0"; else - gcc_unreachable (); + return "vxorpd\t%g0, %g0, %g0"; } else { if (TARGET_AVX512VL) return "vpxorq\t%x0, %x0, %x0"; - else if (TARGET_EVEX512) - return "vpxorq\t%g0, %g0, %g0"; else - gcc_unreachable (); + return "vpxorq\t%g0, %g0, %g0"; } } return "vxorpd\t%x0, %x0, %x0"; @@ -5457,19 +5535,15 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) { if (TARGET_AVX512VL) return "vxorps\t%x0, %x0, %x0"; - else if (TARGET_EVEX512) - return "vxorps\t%g0, %g0, %g0"; else - gcc_unreachable (); + return "vxorps\t%g0, %g0, %g0"; } else { if (TARGET_AVX512VL) return "vpxord\t%x0, %x0, %x0"; - else if (TARGET_EVEX512) - return "vpxord\t%g0, %g0, %g0"; else - gcc_unreachable (); + return "vpxord\t%g0, %g0, %g0"; } } return "vxorps\t%x0, %x0, %x0"; @@ -5490,7 +5564,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) case MODE_XI: case MODE_V8DF: case MODE_V16SF: - gcc_assert (TARGET_AVX512F && TARGET_EVEX512); + gcc_assert (TARGET_AVX512F); return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; case MODE_OI: @@ -5506,10 +5580,8 @@ 
standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) { if (TARGET_AVX512VL) return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"; - else if (TARGET_EVEX512) - return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; else - gcc_unreachable (); + return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; } return (TARGET_AVX ? "vpcmpeqd\t%0, %0, %0" @@ -5523,7 +5595,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) { if (GET_MODE_SIZE (mode) == 64) { - gcc_assert (TARGET_AVX512F && TARGET_EVEX512); + gcc_assert (TARGET_AVX512F); return "vpcmpeqd\t%t0, %t0, %t0"; } else if (GET_MODE_SIZE (mode) == 32) @@ -5535,7 +5607,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) } else if (vector_all_ones_zero_extend_quarter_operand (x, mode)) { - gcc_assert (TARGET_AVX512F && TARGET_EVEX512); + gcc_assert (TARGET_AVX512F); return "vpcmpeqd\t%x0, %x0, %x0"; } @@ -5646,8 +5718,6 @@ ix86_get_ssemov (rtx *operands, unsigned size, || memory_operand (operands[1], mode)) gcc_unreachable (); size = 64; - /* We need TARGET_EVEX512 to move into zmm register. */ - gcc_assert (TARGET_EVEX512); switch (type) { case opcode_int: @@ -5686,7 +5756,7 @@ ix86_get_ssemov (rtx *operands, unsigned size, : "%vmovaps"); else opcode = (misaligned_p - ? (TARGET_AVX512BW + ? (TARGET_AVX512BW && evex_reg_p ? "vmovdqu16" : "%vmovdqu") : "%vmovdqa"); @@ -5728,7 +5798,7 @@ ix86_get_ssemov (rtx *operands, unsigned size, : "%vmovaps"); else opcode = (misaligned_p - ? (TARGET_AVX512BW + ? (TARGET_AVX512BW && evex_reg_p ? "vmovdqu8" : "%vmovdqu") : "%vmovdqa"); @@ -5748,7 +5818,7 @@ ix86_get_ssemov (rtx *operands, unsigned size, : "%vmovaps"); else opcode = (misaligned_p - ? (TARGET_AVX512BW + ? (TARGET_AVX512BW && evex_reg_p ? "vmovdqu16" : "%vmovdqu") : "%vmovdqa"); @@ -6456,7 +6526,7 @@ output_set_got (rtx dest, rtx label) xops[0] = dest; - if (TARGET_VXWORKS_RTP && flag_pic) + if (TARGET_VXWORKS_GOTTPIC && TARGET_VXWORKS_RTP && flag_pic) { /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */ xops[2] = gen_rtx_MEM (Pmode, @@ -6701,9 +6771,7 @@ ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined) || !frame_pointer_needed)); case TYPE_NO_CALLEE_SAVED_REGISTERS: - return false; - - case TYPE_NO_CALLEE_SAVED_REGISTERS_EXCEPT_BP: + case TYPE_PRESERVE_NONE: if (regno != HARD_FRAME_POINTER_REGNUM) return false; break; @@ -6780,7 +6848,9 @@ ix86_nsaved_sseregs (void) int nregs = 0; int regno; - if (!TARGET_64BIT_MS_ABI) + if (!TARGET_64BIT_MS_ABI + && (cfun->machine->call_saved_registers + != TYPE_NO_CALLER_SAVED_REGISTERS)) return 0; for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) @@ -6888,6 +6958,26 @@ ix86_pro_and_epilogue_can_use_push2pop2 (int nregs) && (nregs + aligned) >= 3; } +/* Check if push/pop should be used to save/restore registers. */ +static bool +save_regs_using_push_pop (HOST_WIDE_INT to_allocate) +{ + return ((!to_allocate && cfun->machine->frame.nregs <= 1) + || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) + /* If static stack checking is enabled and done with probes, + the registers need to be saved before allocating the frame. */ + || flag_stack_check == STATIC_BUILTIN_STACK_CHECK + /* If stack clash probing needs a loop, then it needs a + scratch register. But the returned register is only guaranteed + to be safe to use after register saves are complete. 
So if + stack clash protections are enabled and the allocated frame is + larger than the probe interval, then use pushes to save + callee saved registers. */ + || (flag_stack_clash_protection + && !ix86_target_stack_probe () + && to_allocate > get_probe_interval ())); +} + /* Fill structure ix86_frame about frame of currently computed function. */ static void @@ -6968,12 +7058,18 @@ ix86_compute_frame_layout (void) gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT); gcc_assert (preferred_alignment <= stack_alignment_needed); - /* The only ABI saving SSE regs should be 64-bit ms_abi. */ - gcc_assert (TARGET_64BIT || !frame->nsseregs); + /* The only ABI saving SSE regs should be 64-bit ms_abi or with + no_caller_saved_registers attribue. */ + gcc_assert (TARGET_64BIT + || (cfun->machine->call_saved_registers + == TYPE_NO_CALLER_SAVED_REGISTERS) + || !frame->nsseregs); if (TARGET_64BIT && m->call_ms2sysv) { gcc_assert (stack_alignment_needed >= 16); - gcc_assert (!frame->nsseregs); + gcc_assert ((cfun->machine->call_saved_registers + == TYPE_NO_CALLER_SAVED_REGISTERS) + || !frame->nsseregs); } /* For SEH we have to limit the amount of code movement into the prologue. @@ -7172,20 +7268,7 @@ ix86_compute_frame_layout (void) /* Size prologue needs to allocate. */ to_allocate = offset - frame->sse_reg_save_offset; - if ((!to_allocate && frame->nregs <= 1) - || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) - /* If static stack checking is enabled and done with probes, - the registers need to be saved before allocating the frame. */ - || flag_stack_check == STATIC_BUILTIN_STACK_CHECK - /* If stack clash probing needs a loop, then it needs a - scratch register. But the returned register is only guaranteed - to be safe to use after register saves are complete. So if - stack clash protections are enabled and the allocated frame is - larger than the probe interval, then use pushes to save - callee saved registers. */ - || (flag_stack_clash_protection - && !ix86_target_stack_probe () - && to_allocate > get_probe_interval ())) + if (save_regs_using_push_pop (to_allocate)) frame->save_regs_using_mov = false; if (ix86_using_red_zone () @@ -7643,7 +7726,9 @@ ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset) for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) { - ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); + /* Skip registers, already processed by shrink wrap separate. */ + if (!cfun->machine->reg_is_wrapped_separately[regno]) + ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); cfa_offset -= UNITS_PER_WORD; } } @@ -7736,8 +7821,15 @@ pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, add_frame_related_expr = true; } - insn = emit_insn (gen_pro_epilogue_adjust_stack_add - (Pmode, dest, src, addend)); + /* Shrink wrap separate may insert prologue between TEST and JMP. In order + not to affect EFlags, emit add without reg clobbering. 
*/ + if (crtl->shrink_wrapped_separate) + insn = emit_insn (gen_pro_epilogue_adjust_stack_add_nocc + (Pmode, dest, src, addend)); + else + insn = emit_insn (gen_pro_epilogue_adjust_stack_add + (Pmode, dest, src, addend)); + if (style >= 0) ix86_add_queued_cfa_restore_notes (insn); @@ -7921,6 +8013,15 @@ ix86_update_stack_boundary (void) if (ix86_tls_descriptor_calls_expanded_in_cfun && crtl->preferred_stack_boundary < 128) crtl->preferred_stack_boundary = 128; + + /* For 32-bit MS ABI, both the incoming and preferred stack boundaries + are 32 bits, but if force_align_arg_pointer is specified, it should + prefer 128 bits for a backward-compatibility reason, which is also + what the doc suggests. */ + if (lookup_attribute ("force_align_arg_pointer", + TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))) + && crtl->preferred_stack_boundary < 128) + crtl->preferred_stack_boundary = 128; } /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is @@ -7951,8 +8052,7 @@ ix86_get_drap_rtx (void) start_sequence (); drap_vreg = copy_to_reg (arg_ptr); - seq = get_insns (); - end_sequence (); + seq = end_sequence (); insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ())); if (!optimize) @@ -8473,6 +8573,128 @@ output_probe_stack_range (rtx reg, rtx end) return ""; } +/* Data passed to ix86_update_stack_alignment. */ +struct stack_access_data +{ + /* The stack access register. */ + const_rtx reg; + /* Pointer to stack alignment. */ + unsigned int *stack_alignment; +}; + +/* Update the maximum stack slot alignment from memory alignment in PAT. */ + +static void +ix86_update_stack_alignment (rtx, const_rtx pat, void *data) +{ + /* This insn may reference stack slot. Update the maximum stack slot + alignment if the memory is referenced by the stack access register. */ + stack_access_data *p = (stack_access_data *) data; + + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, pat, ALL) + { + auto op = *iter; + if (MEM_P (op)) + { + if (reg_mentioned_p (p->reg, XEXP (op, 0))) + { + unsigned int alignment = MEM_ALIGN (op); + + if (alignment > *p->stack_alignment) + *p->stack_alignment = alignment; + break; + } + else + iter.skip_subrtxes (); + } + } +} + +/* Helper function for ix86_find_all_reg_uses. */ + +static void +ix86_find_all_reg_uses_1 (HARD_REG_SET ®set, + rtx set, unsigned int regno, + auto_bitmap &worklist) +{ + rtx dest = SET_DEST (set); + + if (!REG_P (dest)) + return; + + /* Reject non-Pmode modes. */ + if (GET_MODE (dest) != Pmode) + return; + + unsigned int dst_regno = REGNO (dest); + + if (TEST_HARD_REG_BIT (regset, dst_regno)) + return; + + const_rtx src = SET_SRC (set); + + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, src, ALL) + { + auto op = *iter; + + if (MEM_P (op)) + iter.skip_subrtxes (); + + if (REG_P (op) && REGNO (op) == regno) + { + /* Add this register to register set. */ + add_to_hard_reg_set (®set, Pmode, dst_regno); + bitmap_set_bit (worklist, dst_regno); + break; + } + } +} + +/* Find all registers defined with register REGNO. 
*/ + +static void +ix86_find_all_reg_uses (HARD_REG_SET ®set, + unsigned int regno, auto_bitmap &worklist) +{ + for (df_ref ref = DF_REG_USE_CHAIN (regno); + ref != NULL; + ref = DF_REF_NEXT_REG (ref)) + { + if (DF_REF_IS_ARTIFICIAL (ref)) + continue; + + rtx_insn *insn = DF_REF_INSN (ref); + + if (!NONJUMP_INSN_P (insn)) + continue; + + unsigned int ref_regno = DF_REF_REGNO (ref); + + rtx set = single_set (insn); + if (set) + { + ix86_find_all_reg_uses_1 (regset, set, + ref_regno, worklist); + continue; + } + + rtx pat = PATTERN (insn); + if (GET_CODE (pat) != PARALLEL) + continue; + + for (int i = 0; i < XVECLEN (pat, 0); i++) + { + rtx exp = XVECEXP (pat, 0, i); + + if (GET_CODE (exp) == SET) + ix86_find_all_reg_uses_1 (regset, exp, + ref_regno, worklist); + } + } +} + /* Set stack_frame_required to false if stack frame isn't required. Update STACK_ALIGNMENT to the largest alignment, in bits, of stack slot used if stack frame is required and CHECK_STACK_SLOT is true. */ @@ -8491,10 +8713,6 @@ ix86_find_max_used_stack_alignment (unsigned int &stack_alignment, add_to_hard_reg_set (&set_up_by_prologue, Pmode, HARD_FRAME_POINTER_REGNUM); - /* The preferred stack alignment is the minimum stack alignment. */ - if (stack_alignment > crtl->preferred_stack_boundary) - stack_alignment = crtl->preferred_stack_boundary; - bool require_stack_frame = false; FOR_EACH_BB_FN (bb, cfun) @@ -8506,27 +8724,67 @@ ix86_find_max_used_stack_alignment (unsigned int &stack_alignment, set_up_by_prologue)) { require_stack_frame = true; - - if (check_stack_slot) - { - /* Find the maximum stack alignment. */ - subrtx_iterator::array_type array; - FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL) - if (MEM_P (*iter) - && (reg_mentioned_p (stack_pointer_rtx, - *iter) - || reg_mentioned_p (frame_pointer_rtx, - *iter))) - { - unsigned int alignment = MEM_ALIGN (*iter); - if (alignment > stack_alignment) - stack_alignment = alignment; - } - } + break; } } cfun->machine->stack_frame_required = require_stack_frame; + + /* Stop if we don't need to check stack slot. */ + if (!check_stack_slot) + return; + + /* The preferred stack alignment is the minimum stack alignment. */ + if (stack_alignment > crtl->preferred_stack_boundary) + stack_alignment = crtl->preferred_stack_boundary; + + HARD_REG_SET stack_slot_access; + CLEAR_HARD_REG_SET (stack_slot_access); + + /* Stack slot can be accessed by stack pointer, frame pointer or + registers defined by stack pointer or frame pointer. 
*/ + auto_bitmap worklist; + + add_to_hard_reg_set (&stack_slot_access, Pmode, STACK_POINTER_REGNUM); + bitmap_set_bit (worklist, STACK_POINTER_REGNUM); + + if (frame_pointer_needed) + { + add_to_hard_reg_set (&stack_slot_access, Pmode, + HARD_FRAME_POINTER_REGNUM); + bitmap_set_bit (worklist, HARD_FRAME_POINTER_REGNUM); + } + + unsigned int regno; + + do + { + regno = bitmap_clear_first_set_bit (worklist); + ix86_find_all_reg_uses (stack_slot_access, regno, worklist); + } + while (!bitmap_empty_p (worklist)); + + hard_reg_set_iterator hrsi; + stack_access_data data; + + data.stack_alignment = &stack_alignment; + + EXECUTE_IF_SET_IN_HARD_REG_SET (stack_slot_access, 0, regno, hrsi) + for (df_ref ref = DF_REG_USE_CHAIN (regno); + ref != NULL; + ref = DF_REF_NEXT_REG (ref)) + { + if (DF_REF_IS_ARTIFICIAL (ref)) + continue; + + rtx_insn *insn = DF_REF_INSN (ref); + + if (!NONJUMP_INSN_P (insn)) + continue; + + data.reg = DF_REF_REG (ref); + note_stores (insn, ix86_update_stack_alignment, &data); + } } /* Finalize stack_realign_needed and frame_pointer_needed flags, which @@ -9036,11 +9294,22 @@ ix86_expand_prologue (void) doing this if we have to probe the stack; at least on x86_64 the stack probe can turn into a call that clobbers a red zone location. */ else if (ix86_using_red_zone () - && (! TARGET_STACK_PROBE - || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) + && (! TARGET_STACK_PROBE + || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) { + HOST_WIDE_INT allocate_offset; + if (crtl->shrink_wrapped_separate) + { + allocate_offset = m->fs.sp_offset - frame.stack_pointer_offset; + + /* Adjust the total offset at the beginning of the function. */ + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (allocate_offset), -1, + m->fs.cfa_reg == stack_pointer_rtx); + m->fs.sp_offset = cfun->machine->frame.stack_pointer_offset; + } + ix86_emit_save_regs_using_mov (frame.reg_save_offset); - cfun->machine->red_zone_used = true; int_registers_saved = true; } } @@ -9618,30 +9887,35 @@ ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset, for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) { - rtx reg = gen_rtx_REG (word_mode, regno); - rtx mem; - rtx_insn *insn; - - mem = choose_baseaddr (cfa_offset, NULL); - mem = gen_frame_mem (word_mode, mem); - insn = emit_move_insn (reg, mem); - if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg)) + /* Skip registers, already processed by shrink wrap separate. */ + if (!cfun->machine->reg_is_wrapped_separately[regno]) { - /* Previously we'd represented the CFA as an expression - like *(%ebp - 8). We've just popped that value from - the stack, which means we need to reset the CFA to - the drap register. This will remain until we restore - the stack pointer. */ - add_reg_note (insn, REG_CFA_DEF_CFA, reg); - RTX_FRAME_RELATED_P (insn) = 1; + rtx reg = gen_rtx_REG (word_mode, regno); + rtx mem; + rtx_insn *insn; - /* This means that the DRAP register is valid for addressing. */ - m->fs.drap_valid = true; - } - else - ix86_add_cfa_restore_note (NULL, reg, cfa_offset); + mem = choose_baseaddr (cfa_offset, NULL); + mem = gen_frame_mem (word_mode, mem); + insn = emit_move_insn (reg, mem); + if (m->fs.cfa_reg == crtl->drap_reg + && regno == REGNO (crtl->drap_reg)) + { + /* Previously we'd represented the CFA as an expression + like *(%ebp - 8). 
We've just popped that value from + the stack, which means we need to reset the CFA to + the drap register. This will remain until we restore + the stack pointer. */ + add_reg_note (insn, REG_CFA_DEF_CFA, reg); + RTX_FRAME_RELATED_P (insn) = 1; + + /* DRAP register is valid for addressing. */ + m->fs.drap_valid = true; + } + else + ix86_add_cfa_restore_note (NULL, reg, cfa_offset); + } cfa_offset -= UNITS_PER_WORD; } } @@ -9920,10 +10194,11 @@ ix86_expand_epilogue (int style) less work than reloading sp and popping the register. */ else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1) restore_regs_via_mov = true; - else if (TARGET_EPILOGUE_USING_MOVE - && cfun->machine->use_fast_prologue_epilogue - && (frame.nregs > 1 - || m->fs.sp_offset != reg_save_offset)) + else if (crtl->shrink_wrapped_separate + || (TARGET_EPILOGUE_USING_MOVE + && cfun->machine->use_fast_prologue_epilogue + && (frame.nregs > 1 + || m->fs.sp_offset != reg_save_offset))) restore_regs_via_mov = true; else if (frame_pointer_needed && !frame.nregs @@ -9937,6 +10212,9 @@ ix86_expand_epilogue (int style) else restore_regs_via_mov = false; + if (crtl->shrink_wrapped_separate) + gcc_assert (restore_regs_via_mov); + if (restore_regs_via_mov || frame.nsseregs) { /* Ensure that the entire register save area is addressable via @@ -9989,6 +10267,7 @@ ix86_expand_epilogue (int style) gcc_assert (m->fs.sp_offset == UNITS_PER_WORD); gcc_assert (!crtl->drap_reg); gcc_assert (!frame.nregs); + gcc_assert (!crtl->shrink_wrapped_separate); } else if (restore_regs_via_mov) { @@ -10003,6 +10282,8 @@ ix86_expand_epilogue (int style) rtx sa = EH_RETURN_STACKADJ_RTX; rtx_insn *insn; + gcc_assert (!crtl->shrink_wrapped_separate); + /* Stack realignment doesn't work with eh_return. */ if (crtl->stack_realign_needed) sorry ("Stack realignment not supported with " @@ -11184,6 +11465,9 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x) x = XVECEXP (x, 0, 0); return (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC); + case UNSPEC_SECREL32: + x = XVECEXP (x, 0, 0); + return GET_CODE (x) == SYMBOL_REF; default: return false; } @@ -11231,7 +11515,7 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x) case E_OImode: case E_XImode: if (!standard_sse_constant_p (x, mode) - && GET_MODE_SIZE (TARGET_AVX512F && TARGET_EVEX512 + && GET_MODE_SIZE (TARGET_AVX512F ? XImode : (TARGET_AVX ? 
OImode @@ -11320,6 +11604,9 @@ legitimate_pic_operand_p (rtx x) x = XVECEXP (inner, 0, 0); return (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); + case UNSPEC_SECREL32: + x = XVECEXP (inner, 0, 0); + return GET_CODE (x) == SYMBOL_REF; case UNSPEC_MACHOPIC_OFFSET: return legitimate_pic_address_disp_p (x); default: @@ -11500,6 +11787,9 @@ legitimate_pic_address_disp_p (rtx disp) disp = XVECEXP (disp, 0, 0); return (GET_CODE (disp) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC); + case UNSPEC_SECREL32: + disp = XVECEXP (disp, 0, 0); + return GET_CODE (disp) == SYMBOL_REF; } return false; @@ -11777,6 +12067,7 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict, case UNSPEC_INDNTPOFF: case UNSPEC_NTPOFF: case UNSPEC_DTPOFF: + case UNSPEC_SECREL32: break; default: @@ -11802,7 +12093,8 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict, || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC || !CONST_INT_P (XEXP (XEXP (disp, 0), 1)) || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF - && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF)) + && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF + && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_SECREL32)) /* Non-constant pic memory reference. */ return false; } @@ -11953,7 +12245,7 @@ legitimize_pic_address (rtx orig, rtx reg) else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0) /* We can't always use @GOTOFF for text labels on VxWorks, see gotoff_operand. */ - || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF)) + || (TARGET_VXWORKS_VAROFF && GET_CODE (addr) == LABEL_REF)) { #if TARGET_PECOFF rtx tmp = legitimize_pe_coff_symbol (addr, true); @@ -12126,6 +12418,24 @@ get_thread_pointer (machine_mode tp_mode, bool to_reg) return tp; } +/* Construct the SYMBOL_REF for the _tls_index symbol. */ + +static GTY(()) rtx ix86_tls_index_symbol; + +#if TARGET_WIN32_TLS +static rtx +ix86_tls_index (void) +{ + if (!ix86_tls_index_symbol) + ix86_tls_index_symbol = gen_rtx_SYMBOL_REF (SImode, "_tls_index"); + + if (flag_pic) + return gen_rtx_CONST (Pmode, gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_index_symbol), UNSPEC_PCREL)); + else + return ix86_tls_index_symbol; +} +#endif + /* Construct the SYMBOL_REF for the tls_get_addr function. */ static GTY(()) rtx ix86_tls_symbol; @@ -12184,6 +12494,26 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) machine_mode tp_mode = Pmode; int type; +#if TARGET_WIN32_TLS + off = gen_const_mem (SImode, ix86_tls_index ()); + set_mem_alias_set (off, GOT_ALIAS_SET); + + tp = gen_const_mem (Pmode, GEN_INT (TARGET_64BIT ? 88 : 44)); + set_mem_addr_space (tp, DEFAULT_TLS_SEG_REG); + + if (TARGET_64BIT) + off = convert_to_mode (Pmode, off, 1); + + base = force_reg (Pmode, off); + tp = copy_to_mode_reg (Pmode, tp); + + tp = gen_const_mem (Pmode, gen_rtx_PLUS (Pmode, tp, gen_rtx_MULT (Pmode, base, GEN_INT (UNITS_PER_WORD)))); + set_mem_alias_set (tp, GOT_ALIAS_SET); + + base = force_reg (Pmode, tp); + + return gen_rtx_PLUS (Pmode, base, gen_rtx_CONST (Pmode, gen_rtx_UNSPEC (SImode, gen_rtvec (1, x), UNSPEC_SECREL32))); +#else /* Fall back to global dynamic model if tool chain cannot support local dynamic. 
*/ if (TARGET_SUN_TLS && !TARGET_64BIT @@ -12232,13 +12562,13 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) if (TARGET_64BIT) { rtx rax = gen_rtx_REG (Pmode, AX_REG); + rtx rdi = gen_rtx_REG (Pmode, DI_REG); rtx_insn *insns; start_sequence (); emit_call_insn - (gen_tls_global_dynamic_64 (Pmode, rax, x, caddr)); - insns = get_insns (); - end_sequence (); + (gen_tls_global_dynamic_64 (Pmode, rax, x, caddr, rdi)); + insns = end_sequence (); if (GET_MODE (x) != Pmode) x = gen_rtx_ZERO_EXTEND (Pmode, x); @@ -12286,14 +12616,14 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) if (TARGET_64BIT) { rtx rax = gen_rtx_REG (Pmode, AX_REG); + rtx rdi = gen_rtx_REG (Pmode, DI_REG); rtx_insn *insns; rtx eqv; start_sequence (); emit_call_insn - (gen_tls_local_dynamic_base_64 (Pmode, rax, caddr)); - insns = get_insns (); - end_sequence (); + (gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi)); + insns = end_sequence (); /* Attach a unique REG_EQUAL, to allow the RTL optimizers to share the LD_BASE result with other LD model accesses. */ @@ -12406,6 +12736,7 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) } return dest; +#endif } /* Return true if the TLS address requires insn using integer registers. @@ -12875,6 +13206,9 @@ output_pic_addr_const (FILE *file, rtx x, int code) case UNSPEC_INDNTPOFF: fputs ("@indntpoff", file); break; + case UNSPEC_SECREL32: + fputs ("@secrel32", file); + break; #if TARGET_MACHO case UNSPEC_MACHOPIC_OFFSET: putc ('-', file); @@ -12900,7 +13234,11 @@ i386_output_dwarf_dtprel (FILE *file, int size, rtx x) { fputs (ASM_LONG, file); output_addr_const (file, x); +#if TARGET_WIN32_TLS + fputs ("@secrel32", file); +#else fputs ("@dtpoff", file); +#endif switch (size) { case 4: @@ -13134,7 +13472,7 @@ ix86_delegitimize_address_1 (rtx x, bool base_term_p) else if (base_term_p && pic_offset_table_rtx && !TARGET_MACHO - && !TARGET_VXWORKS_RTP) + && !TARGET_VXWORKS_VAROFF) { rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp); @@ -13559,10 +13897,11 @@ print_reg (rtx x, int code, FILE *file) H -- print a memory address offset by 8; used for sse high-parts Y -- print condition for XOP pcom* instruction. V -- print naked full integer register name without %. + v -- print segment override prefix + -- print a branch hint as 'cs' or 'ds' prefix ; -- print a semicolon (after prefixes due to bug in older gas). ~ -- print "i" if TARGET_AVX2, "f" otherwise. - ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode + ^ -- print addr32 prefix if Pmode != word_mode M -- print addr32 prefix for TARGET_X32 with VSIB address. ! -- print NOTRACK prefix for jxx/call/ret instructions if required. N -- print maskz if it's constant 0 operand. 
@@ -14064,6 +14403,28 @@ ix86_print_operand (FILE *file, rtx x, int code) return; + case 'v': + if (MEM_P (x)) + { + switch (MEM_ADDR_SPACE (x)) + { + case ADDR_SPACE_GENERIC: + break; + case ADDR_SPACE_SEG_FS: + fputs ("fs ", file); + break; + case ADDR_SPACE_SEG_GS: + fputs ("gs ", file); + break; + default: + gcc_unreachable (); + } + } + else + output_operand_lossage ("operand is not a memory reference, " + "invalid operand code 'v'"); + return; + case '*': if (ASSEMBLER_DIALECT == ASM_ATT) putc ('*', file); @@ -14138,7 +14499,7 @@ ix86_print_operand (FILE *file, rtx x, int code) return; case '^': - if (TARGET_64BIT && Pmode != word_mode) + if (Pmode != word_mode) fputs ("addr32 ", file); return; @@ -14653,6 +15014,10 @@ i386_asm_output_addr_const_extra (FILE *file, rtx x) output_addr_const (file, op); fputs ("@indntpoff", file); break; + case UNSPEC_SECREL32: + output_addr_const (file, op); + fputs ("@secrel32", file); + break; #if TARGET_MACHO case UNSPEC_MACHOPIC_OFFSET: output_addr_const (file, op); @@ -15507,7 +15872,7 @@ ix86_output_addr_diff_elt (FILE *file, int value, int rel) gcc_assert (!TARGET_64BIT); #endif /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */ - if (TARGET_64BIT || TARGET_VXWORKS_RTP) + if (TARGET_64BIT || TARGET_VXWORKS_VAROFF) fprintf (file, "%s%s%d-%s%d\n", directive, LPREFIX, value, LPREFIX, rel); #if TARGET_MACHO @@ -17905,9 +18270,14 @@ ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type) if (cum->decl && !TREE_PUBLIC (cum->decl)) return; - const_tree ctx = get_ultimate_context (cum->decl); - if (ctx != NULL_TREE - && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx)) + tree decl = cum->decl; + if (!decl) + /* If we don't know the target, look at the current TU. */ + decl = current_function_decl; + + const_tree ctx = get_ultimate_context (decl); + if (ctx == NULL_TREE + || !TRANSLATION_UNIT_WARN_EMPTY_P (ctx)) return; /* If the actual size of the type is zero, then there is no change @@ -20044,14 +20414,10 @@ ix86_vectorize_builtin_scatter (const_tree vectype, { bool si; enum ix86_builtins code; - const machine_mode mode = TYPE_MODE (TREE_TYPE (vectype)); if (!TARGET_AVX512F) return NULL_TREE; - if (!TARGET_EVEX512 && GET_MODE_SIZE (mode) == 64) - return NULL_TREE; - if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 2u) ? !TARGET_USE_SCATTER_2PARTS : (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u) @@ -20794,7 +21160,11 @@ ix86_can_change_mode_class (machine_mode from, machine_mode to, return true; /* x87 registers can't do subreg at all, as all values are reformatted - to extended precision. */ + to extended precision. + + ??? middle-end queries mode changes for ALL_REGS and this makes + vec_series_lowpart_p to always return false. We probably should + restrict this to modes supported by i387 and check if it is enabled. */ if (MAYBE_FLOAT_CLASS_P (regclass)) return false; @@ -21169,7 +21539,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) - any of 512-bit wide vector mode - any scalar mode. */ if (TARGET_AVX512F - && ((VALID_AVX512F_REG_OR_XI_MODE (mode) && TARGET_EVEX512) + && ((VALID_AVX512F_REG_OR_XI_MODE (mode)) || VALID_AVX512F_SCALAR_MODE (mode))) return true; @@ -21340,19 +21710,20 @@ ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2) return mode1 == SFmode; /* If MODE2 is only appropriate for an SSE register, then tie with - any other mode acceptable to SSE registers. 
*/ - if (GET_MODE_SIZE (mode2) == 64 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 64 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - if (GET_MODE_SIZE (mode2) == 32 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 32 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - if (GET_MODE_SIZE (mode2) == 16 + any vector modes or scalar floating point modes acceptable to SSE + registers, excluding scalar integer modes with SUBREG: + (subreg:QI (reg:TI 99) 0)) + (subreg:HI (reg:TI 99) 0)) + (subreg:SI (reg:TI 99) 0)) + (subreg:DI (reg:TI 99) 0)) + to avoid unnecessary move from SSE register to integer register. + */ + if (GET_MODE_SIZE (mode2) >= 16 + && (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2) + || ((VECTOR_MODE_P (mode1) || SCALAR_FLOAT_MODE_P (mode1)) + && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2))) && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 16 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); + return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1); /* If MODE2 is appropriate for an MMX register, then tie with any other mode acceptable to MMX registers. */ @@ -21410,7 +21781,7 @@ ix86_set_reg_reg_cost (machine_mode mode) case MODE_VECTOR_INT: case MODE_VECTOR_FLOAT: - if ((TARGET_AVX512F && TARGET_EVEX512 && VALID_AVX512F_REG_MODE (mode)) + if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) || (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) || (TARGET_SSE && VALID_SSE_REG_MODE (mode)) @@ -21471,7 +21842,7 @@ ix86_widen_mult_cost (const struct processor_costs *cost, /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend, require extra 4 mul, 4 add, 4 cmp and 2 shift. */ if (!TARGET_SSE4_1 && !uns_p) - extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4 + extra_cost = (cost->mulss + cost->sse_op + cost->sse_op) * 4 + cost->sse_op * 2; /* Fallthru. */ case V4DImode: @@ -21521,11 +21892,11 @@ ix86_multiplication_cost (const struct processor_costs *cost, else if (TARGET_AVX2) nops += 2; else if (TARGET_XOP) - extra += cost->sse_load[2]; + extra += COSTS_N_INSNS (cost->sse_load[2]) / 2; else { nops += 1; - extra += cost->sse_load[2]; + extra += COSTS_N_INSNS (cost->sse_load[2]) / 2; } goto do_qimode; @@ -21544,13 +21915,13 @@ ix86_multiplication_cost (const struct processor_costs *cost, { nmults += 1; nops += 2; - extra += cost->sse_load[2]; + extra += COSTS_N_INSNS (cost->sse_load[2]) / 2; } else { nmults += 1; nops += 4; - extra += cost->sse_load[2]; + extra += COSTS_N_INSNS (cost->sse_load[2]) / 2; } goto do_qimode; @@ -21563,14 +21934,16 @@ ix86_multiplication_cost (const struct processor_costs *cost, { nmults += 1; nops += 4; - extra += cost->sse_load[3] * 2; + /* 2 loads, so no division by 2. */ + extra += COSTS_N_INSNS (cost->sse_load[3]); } goto do_qimode; case V64QImode: nmults = 2; nops = 9; - extra = cost->sse_load[3] * 2 + cost->sse_load[4] * 2; + /* 2 loads of each size, so no division by 2. */ + extra = COSTS_N_INSNS (cost->sse_load[3] + cost->sse_load[4]); do_qimode: return ix86_vec_cost (mode, cost->mulss * nmults @@ -21663,7 +22036,7 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, /* Use vpbroadcast. */ extra = cost->sse_op; else - extra = cost->sse_load[2]; + extra = COSTS_N_INSNS (cost->sse_load[2]) / 2; if (constant_op1) { @@ -21694,7 +22067,7 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, shift with one insn set the cost to prefer paddb. 
*/ if (constant_op1) { - extra = cost->sse_load[2]; + extra = COSTS_N_INSNS (cost->sse_load[2]) / 2; return ix86_vec_cost (mode, cost->sse_op) + extra; } else @@ -21709,7 +22082,9 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, /* Use vpbroadcast. */ extra = cost->sse_op; else - extra = (mode == V16QImode) ? cost->sse_load[2] : cost->sse_load[3]; + extra = COSTS_N_INSNS (mode == V16QImode + ? cost->sse_load[2] + : cost->sse_load[3]) / 2; if (constant_op1) { @@ -21836,6 +22211,15 @@ vec_fp_conversion_cost (const struct processor_costs *cost, int size) return cost->vcvtps2pd512; } +/* Return true of X is UNSPEC with UNSPEC_PCMP or UNSPEC_UNSIGNED_PCMP. */ + +static bool +unspec_pcmp_p (rtx x) +{ + return GET_CODE (x) == UNSPEC + && (XINT (x, 1) == UNSPEC_PCMP || XINT (x, 1) == UNSPEC_UNSIGNED_PCMP); +} + /* Compute a (partial) cost for rtx X. Return true if the complete cost has been computed, and false if subexpressions should be scanned. In either case, *TOTAL contains the cost result. */ @@ -21853,9 +22237,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, /* Handling different vternlog variants. */ if ((GET_MODE_SIZE (mode) == 64 - ? (TARGET_AVX512F && TARGET_EVEX512) + ? TARGET_AVX512F : (TARGET_AVX512VL - || (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256))) + || (TARGET_AVX512F && !TARGET_PREFER_AVX256))) && GET_MODE_SIZE (mode) >= 16 && outer_code_i == SET && ternlog_operand (x, mode)) @@ -22204,8 +22588,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, { /* (ior (not ...) ...) can be a single insn in AVX512. */ if (GET_CODE (XEXP (x, 0)) == NOT && TARGET_AVX512F - && ((TARGET_EVEX512 - && GET_MODE_SIZE (mode) == 64) + && (GET_MODE_SIZE (mode) == 64 || (TARGET_AVX512VL && (GET_MODE_SIZE (mode) == 32 || GET_MODE_SIZE (mode) == 16)))) @@ -22296,8 +22679,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, /* (and (not ...) (not ...)) can be a single insn in AVX512. */ if (GET_CODE (right) == NOT && TARGET_AVX512F - && ((TARGET_EVEX512 - && GET_MODE_SIZE (mode) == 64) + && (GET_MODE_SIZE (mode) == 64 || (TARGET_AVX512VL && (GET_MODE_SIZE (mode) == 32 || GET_MODE_SIZE (mode) == 16)))) @@ -22367,8 +22749,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, { /* (not (xor ...)) can be a single insn in AVX512. */ if (GET_CODE (XEXP (x, 0)) == XOR && TARGET_AVX512F - && ((TARGET_EVEX512 - && GET_MODE_SIZE (mode) == 64) + && (GET_MODE_SIZE (mode) == 64 || (TARGET_AVX512VL && (GET_MODE_SIZE (mode) == 32 || GET_MODE_SIZE (mode) == 16)))) @@ -22512,6 +22893,27 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, else *total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode)); return false; + case FLOAT: + case UNSIGNED_FLOAT: + if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + /* TODO: We do not have cost tables for x87. */ + *total = cost->fadd; + else if (VECTOR_MODE_P (mode)) + *total = ix86_vec_cost (mode, cost->cvtpi2ps); + else + *total = cost->cvtsi2ss; + return false; + + case FIX: + case UNSIGNED_FIX: + if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + /* TODO: We do not have cost tables for x87. */ + *total = cost->fadd; + else if (VECTOR_MODE_P (mode)) + *total = ix86_vec_cost (mode, cost->cvtps2pi); + else + *total = cost->cvtss2si; + return false; case ABS: /* SSE requires memory load for the constant operand. 
It may make @@ -22571,13 +22973,41 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } return false; - case VEC_SELECT: case VEC_CONCAT: /* ??? Assume all of these vector manipulation patterns are recognizable. In which case they all pretty much have the - same cost. */ + same cost. + ??? We should still recruse when computing cost. */ *total = cost->sse_op; return true; + + case VEC_SELECT: + /* Special case extracting lower part from the vector. + This by itself needs to code and most of SSE/AVX instructions have + packed and single forms where the single form may be represented + by such VEC_SELECT. + + Use cost 1 (despite the fact that functionally equivalent SUBREG has + cost 0). Making VEC_SELECT completely free, for example instructs CSE + to forward propagate VEC_SELECT into + + (set (reg eax) (reg src)) + + which then prevents fwprop and combining. See i.e. + gcc.target/i386/pr91103-1.c. + + ??? rtvec_series_p test should be, for valid patterns, equivalent to + vec_series_lowpart_p but is not, since the latter calls + can_cange_mode_class on ALL_REGS and this return false since x87 does + not support subregs at all. */ + if (rtvec_series_p (XVEC (XEXP (x, 1), 0), 0)) + *total = rtx_cost (XEXP (x, 0), GET_MODE (XEXP (x, 0)), + outer_code, opno, speed) + 1; + else + /* ??? We should still recruse when computing cost. */ + *total = cost->sse_op; + return true; + case VEC_DUPLICATE: *total = rtx_cost (XEXP (x, 0), GET_MODE (XEXP (x, 0)), @@ -22590,13 +23020,87 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case VEC_MERGE: mask = XEXP (x, 2); + /* Scalar versions of SSE instructions may be represented as: + + (vec_merge (vec_duplicate (operation ....)) + (register or memory) + (const_int 1)) + + In this case vec_merge and vec_duplicate is for free. + Just recurse into operation and second operand. */ + if (mask == const1_rtx + && GET_CODE (XEXP (x, 0)) == VEC_DUPLICATE) + { + *total = rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed) + + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed); + return true; + } /* This is masked instruction, assume the same cost, as nonmasked variant. */ - if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask))) - *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed); + else if (TARGET_AVX512F + && (register_operand (mask, GET_MODE (mask)) + /* Redunduant clean up of high bits for kmask with VL=2/4 + .i.e (vec_merge op0, op1, (and op3 15)). */ + || (GET_CODE (mask) == AND + && register_operand (XEXP (mask, 0), GET_MODE (mask)) + && CONST_INT_P (XEXP (mask, 1)) + && ((INTVAL (XEXP (mask, 1)) == 3 + && GET_MODE_NUNITS (mode) == 2) + || (INTVAL (XEXP (mask, 1)) == 15 + && GET_MODE_NUNITS (mode) == 4))))) + { + *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) + + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed); + return true; + } + /* Combination of the two above: + + (vec_merge (vec_merge (vec_duplicate (operation ...)) + (register or memory) + (reg:QI mask)) + (register or memory) + (const_int 1)) + + i.e. avx512fp16_vcvtss2sh_mask. 
*/ + else if (TARGET_AVX512F + && mask == const1_rtx + && GET_CODE (XEXP (x, 0)) == VEC_MERGE + && GET_CODE (XEXP (XEXP (x, 0), 0)) == VEC_DUPLICATE + && register_operand (XEXP (XEXP (x, 0), 2), + GET_MODE (XEXP (XEXP (x, 0), 2)))) + { + *total = rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), + mode, outer_code, opno, speed) + + rtx_cost (XEXP (XEXP (x, 0), 1), + mode, outer_code, opno, speed) + + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed); + return true; + } + /* vcmp. */ + else if (unspec_pcmp_p (mask) + || (GET_CODE (mask) == NOT + && unspec_pcmp_p (XEXP (mask, 0)))) + { + rtx uns = GET_CODE (mask) == NOT ? XEXP (mask, 0) : mask; + rtx unsop0 = XVECEXP (uns, 0, 0); + /* Make (subreg:V4SI (not:V16QI (reg:V16QI ..)) 0) + cost the same as register. + This is used by avx_cmp<mode>3_ltint_not. */ + if (GET_CODE (unsop0) == SUBREG) + unsop0 = XEXP (unsop0, 0); + if (GET_CODE (unsop0) == NOT) + unsop0 = XEXP (unsop0, 0); + *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) + + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed) + + rtx_cost (unsop0, mode, UNSPEC, opno, speed) + + rtx_cost (XVECEXP (uns, 0, 1), mode, UNSPEC, opno, speed) + + cost->sse_op; + return true; + } else *total = cost->sse_op; - return true; + return false; case MEM: /* CONST_VECTOR_DUPLICATE_P in constant_pool is just broadcast. @@ -22613,7 +23117,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } /* An insn that accesses memory is slightly more expensive - than one that does not. */ + than one that does not. */ if (speed) { *total += 1; @@ -22854,7 +23358,9 @@ x86_this_parameter (tree function) { const int *parm_regs; - if (ix86_function_type_abi (type) == MS_ABI) + if (lookup_attribute ("preserve_none", TYPE_ATTRIBUTES (type))) + parm_regs = x86_64_preserve_none_int_parameter_registers; + else if (ix86_function_type_abi (type) == MS_ABI) parm_regs = x86_64_ms_abi_int_parameter_registers; else parm_regs = x86_64_int_parameter_registers; @@ -23180,19 +23686,21 @@ x86_field_alignment (tree type, int computed) /* Print call to TARGET to FILE. */ static void -x86_print_call_or_nop (FILE *file, const char *target) +x86_print_call_or_nop (FILE *file, const char *target, + const char *label) { if (flag_nop_mcount || !strcmp (target, "nop")) /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ - fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); + fprintf (file, "%s" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n", + label); else if (!TARGET_PECOFF && flag_pic) { gcc_assert (flag_plt); - fprintf (file, "1:\tcall\t%s@PLT\n", target); + fprintf (file, "%s\tcall\t%s@PLT\n", label, target); } else - fprintf (file, "1:\tcall\t%s\n", target); + fprintf (file, "%s\tcall\t%s\n", label, target); } static bool @@ -23277,6 +23785,13 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) const char *mcount_name = MCOUNT_NAME; + bool fentry_section_p + = (flag_record_mcount + || lookup_attribute ("fentry_section", + DECL_ATTRIBUTES (current_function_decl))); + + const char *label = fentry_section_p ? 
"1:" : ""; + if (current_fentry_name (&mcount_name)) ; else if (fentry_name) @@ -23312,11 +23827,12 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) reg = legacy_reg; } if (ASSEMBLER_DIALECT == ASM_INTEL) - fprintf (file, "1:\tmovabs\t%s, OFFSET FLAT:%s\n" - "\tcall\t%s\n", reg, mcount_name, reg); + fprintf (file, "%s\tmovabs\t%s, OFFSET FLAT:%s\n" + "\tcall\t%s\n", label, reg, mcount_name, + reg); else - fprintf (file, "1:\tmovabsq\t$%s, %%%s\n\tcall\t*%%%s\n", - mcount_name, reg, reg); + fprintf (file, "%s\tmovabsq\t$%s, %%%s\n\tcall\t*%%%s\n", + label, mcount_name, reg, reg); break; case CM_LARGE_PIC: #ifdef NO_PROFILE_COUNTERS @@ -23357,21 +23873,21 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) if (!flag_plt) { if (ASSEMBLER_DIALECT == ASM_INTEL) - fprintf (file, "1:\tcall\t[QWORD PTR %s@GOTPCREL[rip]]\n", - mcount_name); + fprintf (file, "%s\tcall\t[QWORD PTR %s@GOTPCREL[rip]]\n", + label, mcount_name); else - fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", - mcount_name); + fprintf (file, "%s\tcall\t*%s@GOTPCREL(%%rip)\n", + label, mcount_name); break; } /* fall through */ default: - x86_print_call_or_nop (file, mcount_name); + x86_print_call_or_nop (file, mcount_name, label); break; } } else - x86_print_call_or_nop (file, mcount_name); + x86_print_call_or_nop (file, mcount_name, label); } else if (flag_pic) { @@ -23386,11 +23902,13 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) LPREFIX, labelno); #endif if (flag_plt) - x86_print_call_or_nop (file, mcount_name); + x86_print_call_or_nop (file, mcount_name, label); else if (ASSEMBLER_DIALECT == ASM_INTEL) - fprintf (file, "1:\tcall\t[DWORD PTR %s@GOT[ebx]]\n", mcount_name); + fprintf (file, "%s\tcall\t[DWORD PTR %s@GOT[ebx]]\n", + label, mcount_name); else - fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name); + fprintf (file, "%s\tcall\t*%s@GOT(%%ebx)\n", + label, mcount_name); } else { @@ -23403,12 +23921,10 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) fprintf (file, "\tmovl\t$%sP%d, %%" PROFILE_COUNT_REGISTER "\n", LPREFIX, labelno); #endif - x86_print_call_or_nop (file, mcount_name); + x86_print_call_or_nop (file, mcount_name, label); } - if (flag_record_mcount - || lookup_attribute ("fentry_section", - DECL_ATTRIBUTES (current_function_decl))) + if (fentry_section_p) { const char *sname = "__mcount_loc"; @@ -24167,7 +24683,7 @@ ix86_vector_mode_supported_p (machine_mode mode) return true; if (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) return true; - if (TARGET_AVX512F && TARGET_EVEX512 && VALID_AVX512F_REG_MODE (mode)) + if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) return true; if ((TARGET_MMX || TARGET_MMX_WITH_SSE) && VALID_MMX_REG_MODE (mode)) @@ -24415,8 +24931,7 @@ ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> & /*inputs*/, } } - rtx_insn *seq = get_insns (); - end_sequence (); + rtx_insn *seq = end_sequence (); if (saw_asm_flag) return seq; @@ -24792,12 +25307,18 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, /* One vinserti128 for combining two SSE vectors for AVX256. */ else if (GET_MODE_BITSIZE (mode) == 256) return ((n - 2) * ix86_cost->sse_op - + ix86_vec_cost (mode, ix86_cost->addss)); + + ix86_vec_cost (mode, ix86_cost->sse_op)); /* One vinserti64x4 and two vinserti128 for combining SSE and AVX256 vectors to AVX512. 
*/ else if (GET_MODE_BITSIZE (mode) == 512) - return ((n - 4) * ix86_cost->sse_op - + 3 * ix86_vec_cost (mode, ix86_cost->addss)); + { + machine_mode half_mode + = mode_for_vector (GET_MODE_INNER (mode), + GET_MODE_NUNITS (mode) / 2).require (); + return ((n - 4) * ix86_cost->sse_op + + 2 * ix86_vec_cost (half_mode, ix86_cost->sse_op) + + ix86_vec_cost (mode, ix86_cost->sse_op)); + } gcc_unreachable (); } @@ -24965,7 +25486,7 @@ ix86_preferred_simd_mode (scalar_mode mode) switch (mode) { case E_QImode: - if (TARGET_AVX512BW && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512BW && !TARGET_PREFER_AVX256) return V64QImode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V32QImode; @@ -24973,7 +25494,7 @@ ix86_preferred_simd_mode (scalar_mode mode) return V16QImode; case E_HImode: - if (TARGET_AVX512BW && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512BW && !TARGET_PREFER_AVX256) return V32HImode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V16HImode; @@ -24981,7 +25502,7 @@ ix86_preferred_simd_mode (scalar_mode mode) return V8HImode; case E_SImode: - if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) return V16SImode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V8SImode; @@ -24989,7 +25510,7 @@ ix86_preferred_simd_mode (scalar_mode mode) return V4SImode; case E_DImode: - if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) return V8DImode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V4DImode; @@ -25003,16 +25524,15 @@ ix86_preferred_simd_mode (scalar_mode mode) { if (TARGET_PREFER_AVX128) return V8HFmode; - else if (TARGET_PREFER_AVX256 || !TARGET_EVEX512) + else if (TARGET_PREFER_AVX256) return V16HFmode; } - if (TARGET_EVEX512) - return V32HFmode; + return V32HFmode; } return word_mode; case E_BFmode: - if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) return V32BFmode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V16BFmode; @@ -25020,7 +25540,7 @@ ix86_preferred_simd_mode (scalar_mode mode) return V8BFmode; case E_SFmode: - if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) return V16SFmode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V8SFmode; @@ -25028,7 +25548,7 @@ ix86_preferred_simd_mode (scalar_mode mode) return V4SFmode; case E_DFmode: - if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) return V8DFmode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V4DFmode; @@ -25048,13 +25568,13 @@ ix86_preferred_simd_mode (scalar_mode mode) static unsigned int ix86_autovectorize_vector_modes (vector_modes *modes, bool all) { - if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) { modes->safe_push (V64QImode); modes->safe_push (V32QImode); modes->safe_push (V16QImode); } - else if (TARGET_AVX512F && TARGET_EVEX512 && all) + else if (TARGET_AVX512F && all) { modes->safe_push (V32QImode); modes->safe_push (V16QImode); @@ -25092,7 +25612,7 @@ ix86_get_mask_mode (machine_mode data_mode) unsigned elem_size = vector_size / nunits; /* Scalar mask case. 
*/ - if ((TARGET_AVX512F && TARGET_EVEX512 && vector_size == 64) + if ((TARGET_AVX512F && vector_size == 64) || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)) /* AVX512FP16 only supports vector comparison to kmask for _Float16. */ @@ -25257,32 +25777,6 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar) return new ix86_vector_costs (vinfo, costing_for_scalar); } -/* Return cost of statement doing FP conversion. */ - -static unsigned -fp_conversion_stmt_cost (machine_mode mode, gimple *stmt, bool scalar_p) -{ - int outer_size - = tree_to_uhwi - (TYPE_SIZE - (TREE_TYPE (gimple_assign_lhs (stmt)))); - int inner_size - = tree_to_uhwi - (TYPE_SIZE - (TREE_TYPE (gimple_assign_rhs1 (stmt)))); - int stmt_cost = vec_fp_conversion_cost - (ix86_tune_cost, GET_MODE_BITSIZE (mode)); - /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end - up doing two conversions and packing them. */ - if (!scalar_p && inner_size > outer_size) - { - int n = inner_size / outer_size; - stmt_cost = stmt_cost * n - + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op); - } - return stmt_cost; -} - unsigned ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_vec_info stmt_info, slp_tree node, @@ -25304,6 +25798,14 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, if (scalar_p) mode = TYPE_MODE (TREE_TYPE (vectype)); } + /* When we are costing a scalar stmt use the scalar stmt to get at the + type of the operation. */ + else if (scalar_p && stmt_info) + if (tree lhs = gimple_get_lhs (stmt_info->stmt)) + { + fp = FLOAT_TYPE_P (TREE_TYPE (lhs)); + mode = TYPE_MODE (TREE_TYPE (lhs)); + } if ((kind == vector_stmt || kind == scalar_stmt) && stmt_info @@ -25326,7 +25828,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, else if (X87_FLOAT_MODE_P (mode)) stmt_cost = ix86_cost->fadd; else - stmt_cost = ix86_cost->add; + stmt_cost = ix86_cost->add; } else stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss @@ -25381,7 +25883,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, (subcode == RSHIFT_EXPR && !TYPE_UNSIGNED (TREE_TYPE (op1))) ? ASHIFTRT : LSHIFTRT, mode, - TREE_CODE (op2) == INTEGER_CST, + TREE_CODE (op2) == INTEGER_CST, cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1, false, false, NULL, NULL); @@ -25390,30 +25892,174 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, case NOP_EXPR: /* Only sign-conversions are free. */ if (tree_nop_conversion_p - (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)), + (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)), TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)))) stmt_cost = 0; else if (fp) - stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt, - scalar_p); + stmt_cost = vec_fp_conversion_cost + (ix86_tune_cost, GET_MODE_BITSIZE (mode)); + break; + + case FLOAT_EXPR: + if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + stmt_cost = ix86_cost->cvtsi2ss; + else if (X87_FLOAT_MODE_P (mode)) + /* TODO: We do not have cost tables for x87. */ + stmt_cost = ix86_cost->fadd; + else + stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtpi2ps); + break; + + case FIX_TRUNC_EXPR: + if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + stmt_cost = ix86_cost->cvtss2si; + else if (X87_FLOAT_MODE_P (mode)) + /* TODO: We do not have cost tables for x87. 
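
A minimal sketch (not from the patch) of the statement kinds the new FLOAT_EXPR and FIX_TRUNC_EXPR cases cost: int-to-float conversions are charged the cvtsi2ss/cvtpi2ps table entries and float-to-int truncations the cvtss2si/cvtps2pi entries, instead of a generic operation cost. The function names are made up.

void
to_float (float *restrict d, const int *restrict s, int n)
{
  for (int i = 0; i < n; i++)
    d[i] = (float) s[i];	/* FLOAT_EXPR */
}

void
to_int (int *restrict d, const float *restrict s, int n)
{
  for (int i = 0; i < n; i++)
    d[i] = (int) s[i];		/* FIX_TRUNC_EXPR */
}
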
*/
+	stmt_cost = ix86_cost->fadd;
+      else
+	stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtps2pi);
+      break;
+
+    case COND_EXPR:
+      {
+	/* SSE2 conditional move sequence is:
+	    pcmpgtd %xmm5, %xmm0 (accounted separately)
+	    pand %xmm0, %xmm2
+	    pandn %xmm1, %xmm0
+	    por %xmm2, %xmm0
+	   while SSE4 uses cmp + blend
+	   and AVX512 masked moves.
+
+	   The condition is accounted separately since we usually have
+	     p = a < b
+	     c = p ? x : y
+	   and we will account the first statement as setcc.  Exception is
+	   when p is loaded from memory as bool and then we will not account
+	   the compare, but there is no way to check for this.  */
+
+	int ninsns = TARGET_SSE4_1 ? 1 : 3;
+
+	/* If one of the parameters is 0 or -1 the sequence will be simplified:
+	   (if_true & mask) | (if_false & ~mask) -> if_true & mask  */
+	if (ninsns > 1
+	    && (zerop (gimple_assign_rhs2 (stmt_info->stmt))
+		|| zerop (gimple_assign_rhs3 (stmt_info->stmt))
+		|| integer_minus_onep
+		     (gimple_assign_rhs2 (stmt_info->stmt))
+		|| integer_minus_onep
+		     (gimple_assign_rhs3 (stmt_info->stmt))))
+	  ninsns = 1;
+
+	if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+	  stmt_cost = ninsns * ix86_cost->sse_op;
+	else if (X87_FLOAT_MODE_P (mode))
+	  /* x87 requires conditional branch.  We don't have cost for
+	     that.  */
+	  ;
+	else if (VECTOR_MODE_P (mode))
+	  stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
+	else
+	  /* compare (accounted separately) + cmov.  */
+	  stmt_cost = ix86_cost->add;
+      }
       break;
-    case BIT_IOR_EXPR:
-    case ABS_EXPR:
-    case ABSU_EXPR:
     case MIN_EXPR:
     case MAX_EXPR:
+      if (fp)
+	{
+	  if (X87_FLOAT_MODE_P (mode)
+	      && !SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+	    /* x87 requires conditional branch.  We don't have cost for
+	       that.  */
+	    ;
+	  else
+	    /* minss  */
+	    stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+	}
+      else
+	{
+	  if (VECTOR_MODE_P (mode))
+	    {
+	      stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+	      /* vpmin was introduced in SSE3.
+		 SSE2 needs pcmpgtd + pand + pandn + pxor.
+		 If one of the parameters is 0 or -1 the sequence is
+		 simplified to pcmpgtd + pand.  */
+	      if (!TARGET_SSSE3)
+		{
+		  if (zerop (gimple_assign_rhs2 (stmt_info->stmt))
+		      || integer_minus_onep
+			   (gimple_assign_rhs2 (stmt_info->stmt)))
+		    stmt_cost *= 2;
+		  else
+		    stmt_cost *= 4;
+		}
+	    }
+	  else
+	    /* cmp + cmov.  */
+	    stmt_cost = ix86_cost->add * 2;
+	}
+      break;
+
+    case ABS_EXPR:
+    case ABSU_EXPR:
+      if (fp)
+	{
+	  if (X87_FLOAT_MODE_P (mode)
+	      && !SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+	    /* fabs.  */
+	    stmt_cost = ix86_cost->fabs;
+	  else
+	    /* andss of sign bit.  */
+	    stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+	}
+      else
+	{
+	  if (VECTOR_MODE_P (mode))
+	    {
+	      stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+	      /* vabs was introduced in SSE3.
+		 SSE3 uses psrad + pxor + psub.  */
+	      if (!TARGET_SSSE3)
+		stmt_cost *= 3;
+	    }
+	  else
+	    /* neg + cmov.  */
+	    stmt_cost = ix86_cost->add * 2;
+	}
+      break;
+
+    case BIT_IOR_EXPR:
     case BIT_XOR_EXPR:
     case BIT_AND_EXPR:
     case BIT_NOT_EXPR:
-      if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
-	stmt_cost = ix86_cost->sse_op;
-      else if (VECTOR_MODE_P (mode))
+      gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)
+		  && !X87_FLOAT_MODE_P (mode));
+      if (VECTOR_MODE_P (mode))
 	stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
       else
 	stmt_cost = ix86_cost->add;
       break;
+    default:
+      if (truth_value_p (subcode))
+	{
+	  if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+	    /* CMPccS? instructions are cheap, so use sse_op.  While they
+	       produce a mask which may need to be turned to 0/1 by an and,
+	       expect that this will be optimized away in a common case.
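
A minimal sketch (not from the patch) of scalar sources that reach the new COND_EXPR, MIN_EXPR/MAX_EXPR and ABS_EXPR cases: when vectorized, the select maps to pand/pandn/por on plain SSE2 or a single blend with SSE4.1, the minimum to a pmin-style instruction (or cmp + cmov in the scalar case), and the absolute value to pabs with SSSE3. The function name is made up.

int
select_min_abs (int a, int b, int x, int y)
{
  int sel = a < b ? x : y;	/* COND_EXPR */
  int mn = x < y ? x : y;	/* MIN_EXPR */
  int av = a < 0 ? -a : a;	/* ABS_EXPR */
  return sel + mn + av;
}
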
*/ + stmt_cost = ix86_cost->sse_op; + else if (X87_FLOAT_MODE_P (mode)) + /* fcmp + setcc. */ + stmt_cost = ix86_cost->fadd + ix86_cost->add; + else if (VECTOR_MODE_P (mode)) + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); + else + /* setcc. */ + stmt_cost = ix86_cost->add; + break; + } break; } } @@ -25437,9 +26083,36 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, break; } - if (kind == vec_promote_demote - && fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)))) - stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt, scalar_p); + if (kind == vec_promote_demote) + { + int outer_size + = tree_to_uhwi + (TYPE_SIZE + (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)))); + int inner_size + = tree_to_uhwi + (TYPE_SIZE + (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)))); + bool inner_fp = FLOAT_TYPE_P + (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))); + + if (fp && inner_fp) + stmt_cost = vec_fp_conversion_cost + (ix86_tune_cost, GET_MODE_BITSIZE (mode)); + else if (fp && !inner_fp) + stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtpi2ps); + else if (!fp && inner_fp) + stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtps2pi); + else + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); + /* VEC_PACK_TRUNC_EXPR and similar demote operations: If outer size is + greater than inner size we will end up doing two conversions and + packing them. We always pack pairs; if the size difference is greater + it is split into multiple demote operations. */ + if (inner_size > outer_size) + stmt_cost = stmt_cost * 2 + + ix86_vec_cost (mode, ix86_cost->sse_op); + } /* If we do elementwise loads into a vector then we are bound by latency and execution resources for the many scalar loads @@ -25511,7 +26184,22 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, else { m_num_gpr_needed[where]++; - stmt_cost += ix86_cost->sse_to_integer; + + int cost = COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2; + + /* For integer construction, the number of actual GPR -> XMM + moves will be somewhere between 0 and n. + We do not have very good idea about actual number, since + the source may be a constant, memory or a chain of + instructions that will be later converted by + scalar-to-vector pass. */ + if (kind == vec_construct + && GET_MODE_BITSIZE (mode) == 256) + cost *= 2; + else if (kind == vec_construct + && GET_MODE_BITSIZE (mode) == 512) + cost *= 3; + stmt_cost += cost; } } } @@ -25603,14 +26291,10 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) /* When X86_TUNE_AVX512_TWO_EPILOGUES is enabled arrange for both a AVX2 and a SSE epilogue for AVX512 vectorized loops. */ if (loop_vinfo + && LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32 && ix86_tune_features[X86_TUNE_AVX512_TWO_EPILOGUES]) - { - if (GET_MODE_SIZE (loop_vinfo->vector_mode) == 64) - m_suggested_epilogue_mode = V32QImode; - else if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) - && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32) - m_suggested_epilogue_mode = V16QImode; - } + m_suggested_epilogue_mode = V16QImode; /* When a 128bit SSE vectorized epilogue still has a VF of 16 or larger enable a 64bit SSE epilogue. */ if (loop_vinfo @@ -25619,6 +26303,65 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () >= 16) m_suggested_epilogue_mode = V8QImode; + /* When X86_TUNE_AVX512_MASKED_EPILOGUES is enabled try to use + a masked epilogue if that doesn't seem detrimental. 
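
A minimal example (not from the patch) of the loops the X86_TUNE_AVX512_MASKED_EPILOGUES tuning targets: when the heuristic below accepts the loop, the n % VF remainder iterations are executed by one masked AVX512 vector iteration instead of a scalar epilogue loop. The function name is made up.

void
scale (double *restrict a, double c, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = c * a[i];
}
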
*/ + if (loop_vinfo + && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () > 2 + && ix86_tune_features[X86_TUNE_AVX512_MASKED_EPILOGUES] + && !OPTION_SET_P (param_vect_partial_vector_usage)) + { + bool avoid = false; + if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) + { + unsigned int peel_niter + = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) + peel_niter += 1; + /* When we know the number of scalar iterations of the epilogue, + avoid masking when a single vector epilog iteration handles + it in full. */ + if (pow2p_hwi ((LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter) + % LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())) + avoid = true; + } + if (!avoid && loop_outer (loop_outer (LOOP_VINFO_LOOP (loop_vinfo)))) + for (auto ddr : LOOP_VINFO_DDRS (loop_vinfo)) + { + if (DDR_ARE_DEPENDENT (ddr) == chrec_known) + ; + else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) + ; + else + { + int loop_depth + = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num, + DDR_LOOP_NEST (ddr)); + if (DDR_NUM_DIST_VECTS (ddr) == 1 + && DDR_DIST_VECTS (ddr)[0][loop_depth] == 0) + { + /* Avoid the case when there's an outer loop that might + traverse a multi-dimensional array with the inner + loop just executing the masked epilogue with a + read-write where the next outer iteration might + read from the masked part of the previous write, + 'n' filling half a vector. + for (j = 0; j < m; ++j) + for (i = 0; i < n; ++i) + a[j][i] = c * a[j][i]; */ + avoid = true; + break; + } + } + } + if (!avoid) + { + m_suggested_epilogue_mode = loop_vinfo->vector_mode; + m_masked_epilogue = 1; + } + } + vector_costs::finish_cost (scalar_costs); } @@ -25738,7 +26481,7 @@ ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, { /* If the function isn't exported, we can pick up just one ISA for the clones. */ - if (TARGET_AVX512F && TARGET_EVEX512) + if (TARGET_AVX512F) clonei->vecsize_mangle = 'e'; else if (TARGET_AVX2) clonei->vecsize_mangle = 'd'; @@ -25830,17 +26573,17 @@ ix86_simd_clone_usable (struct cgraph_node *node, machine_mode) return -1; if (!TARGET_AVX) return 0; - return (TARGET_AVX512F && TARGET_EVEX512) ? 3 : TARGET_AVX2 ? 2 : 1; + return TARGET_AVX512F ? 3 : TARGET_AVX2 ? 2 : 1; case 'c': if (!TARGET_AVX) return -1; - return (TARGET_AVX512F && TARGET_EVEX512) ? 2 : TARGET_AVX2 ? 1 : 0; + return TARGET_AVX512F ? 2 : TARGET_AVX2 ? 1 : 0; case 'd': if (!TARGET_AVX2) return -1; - return (TARGET_AVX512F && TARGET_EVEX512) ? 1 : 0; + return TARGET_AVX512F ? 1 : 0; case 'e': - if (!TARGET_AVX512F || !TARGET_EVEX512) + if (!TARGET_AVX512F) return -1; return 0; default: @@ -27512,6 +28255,195 @@ ix86_cannot_copy_insn_p (rtx_insn *insn) #undef TARGET_DOCUMENTATION_NAME #define TARGET_DOCUMENTATION_NAME "x86" +/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */ +sbitmap +ix86_get_separate_components (void) +{ + HOST_WIDE_INT offset, to_allocate; + sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER); + bitmap_clear (components); + struct machine_function *m = cfun->machine; + + offset = m->frame.stack_pointer_offset; + to_allocate = offset - m->frame.sse_reg_save_offset; + + /* Shrink wrap separate uses MOV, which means APX PPX cannot be used. + Experiments show that APX PPX can speed up the prologue. If the function + does not exit early during actual execution, then using APX PPX is faster. 
+ If the function always exits early during actual execution, then shrink + wrap separate reduces the number of MOV (PUSH/POP) instructions actually + executed, thus speeding up execution. + foo: + movl $1, %eax + testq %rdi, %rdi + jne.L60 + ret ---> early return. + .L60: + subq $88, %rsp ---> belong to prologue. + xorl %eax, %eax + movq %rbx, 40 (%rsp) ---> belong to prologue. + movq 8 (%rdi), %rbx + movq %rbp, 48 (%rsp) ---> belong to prologue. + movq %rdi, %rbp + testq %rbx, %rbx + jne.L61 + movq 40 (%rsp), %rbx + movq 48 (%rsp), %rbp + addq $88, %rsp + ret + .L61: + movq %r12, 56 (%rsp) ---> belong to prologue. + movq %r13, 64 (%rsp) ---> belong to prologue. + movq %r14, 72 (%rsp) ---> belong to prologue. + ... ... + + Disable shrink wrap separate when PPX is enabled. */ + if ((TARGET_APX_PPX && !crtl->calls_eh_return) + || cfun->machine->func_type != TYPE_NORMAL + || TARGET_SEH + || crtl->stack_realign_needed + || m->call_ms2sysv) + return components; + + /* Since shrink wrapping separate uses MOV instead of PUSH/POP. + Disable shrink wrap separate when MOV is prohibited. */ + if (save_regs_using_push_pop (to_allocate)) + return components; + + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + /* Skip registers with large offsets, where a pseudo may be needed. */ + if (IN_RANGE (offset, -0x8000, 0x7fff)) + bitmap_set_bit (components, regno); + offset += UNITS_PER_WORD; + } + + /* Don't mess with the following registers. */ + if (frame_pointer_needed) + bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM); + + if (crtl->drap_reg) + bitmap_clear_bit (components, REGNO (crtl->drap_reg)); + + if (pic_offset_table_rtx) + bitmap_clear_bit (components, REAL_PIC_OFFSET_TABLE_REGNUM); + + return components; +} + +/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */ +sbitmap +ix86_components_for_bb (basic_block bb) +{ + bitmap in = DF_LIVE_IN (bb); + bitmap gen = &DF_LIVE_BB_INFO (bb)->gen; + bitmap kill = &DF_LIVE_BB_INFO (bb)->kill; + + sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER); + bitmap_clear (components); + + function_abi_aggregator callee_abis; + rtx_insn *insn; + FOR_BB_INSNS (bb, insn) + if (CALL_P (insn)) + callee_abis.note_callee_abi (insn_callee_abi (insn)); + HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi); + + /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */ + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (!fixed_regs[regno] + && (TEST_HARD_REG_BIT (extra_caller_saves, regno) + || bitmap_bit_p (in, regno) + || bitmap_bit_p (gen, regno) + || bitmap_bit_p (kill, regno))) + bitmap_set_bit (components, regno); + + return components; +} + +/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS. */ +void +ix86_disqualify_components (sbitmap, edge, sbitmap, bool) +{ + /* Nothing to do for x86. */ +} + +/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. 
*/ +void +ix86_emit_prologue_components (sbitmap components) +{ + HOST_WIDE_INT cfa_offset; + struct machine_function *m = cfun->machine; + + cfa_offset = m->frame.reg_save_offset + m->fs.sp_offset + - m->frame.stack_pointer_offset; + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + if (bitmap_bit_p (components, regno)) + ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); + cfa_offset -= UNITS_PER_WORD; + } +} + +/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */ +void +ix86_emit_epilogue_components (sbitmap components) +{ + HOST_WIDE_INT cfa_offset; + struct machine_function *m = cfun->machine; + cfa_offset = m->frame.reg_save_offset + m->fs.sp_offset + - m->frame.stack_pointer_offset; + + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + if (bitmap_bit_p (components, regno)) + { + rtx reg = gen_rtx_REG (word_mode, regno); + rtx mem; + rtx_insn *insn; + + mem = choose_baseaddr (cfa_offset, NULL); + mem = gen_frame_mem (word_mode, mem); + insn = emit_move_insn (reg, mem); + + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_CFA_RESTORE, reg); + } + cfa_offset -= UNITS_PER_WORD; + } +} + +/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */ +void +ix86_set_handled_components (sbitmap components) +{ + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (bitmap_bit_p (components, regno)) + { + cfun->machine->reg_is_wrapped_separately[regno] = true; + cfun->machine->use_fast_prologue_epilogue = true; + cfun->machine->frame.save_regs_using_mov = true; + } +} + +#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS +#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS ix86_get_separate_components +#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB +#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB ix86_components_for_bb +#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS +#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS ix86_disqualify_components +#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS +#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \ + ix86_emit_prologue_components +#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS +#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \ + ix86_emit_epilogue_components +#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS +#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS ix86_set_handled_components + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-i386.h" |
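
A minimal example (not from the patch) of the early-return shape that the shrink-wrapping hooks above target: the callee-saved registers are needed only on the slow path, so their MOV saves and restores can be emitted there as separate components, leaving the early return with no prologue work. The names are made up for the illustration.

extern long process (long *, long, long);

long
maybe_process (long *p, long key)
{
  if (p == 0)
    return -1;			/* fast path: no register saves executed */

  /* Values live across the calls need several callee-saved registers;
     their saves are emitted on this path only.  */
  long a = process (p, key, 0);
  long b = process (p, key, a);
  return a + b;
}
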