Diffstat (limited to 'gcc/config/i386/i386.cc')
-rw-r--r-- | gcc/config/i386/i386.cc | 1671
1 file changed, 1315 insertions, 356 deletions
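The main functional addition in the patch below is a preserve_none calling convention modelled on Clang's attribute: integer arguments are passed in r12, r13, r14, r15, rdi, rsi (see x86_64_preserve_none_int_parameter_registers) and, aside from the hard frame pointer, no registers are preserved across the call. A minimal usage sketch, assuming the GNU attribute spelling wired up by this patch and 64-bit code; the function names and arithmetic are illustrative only, not part of the commit:

/* Hypothetical example; the argument registers follow
   x86_64_preserve_none_int_parameter_registers from the diff.  */
__attribute__ ((preserve_none)) long
callee (long a, long b, long c, long d, long e, long f)
{
  /* a..f are expected to arrive in r12, r13, r14, r15, rdi, rsi.  */
  return a + b + c + d + e + f;
}

long
caller (void)
{
  /* The caller has to treat every register except the frame pointer
     as clobbered, so values live across this call must be spilled.  */
  return callee (1, 2, 3, 4, 5, 6);
}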
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 28603c2..65e04d3 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -335,6 +335,14 @@ static int const x86_64_ms_abi_int_parameter_registers[4] = CX_REG, DX_REG, R8_REG, R9_REG }; +/* Similar as Clang's preserve_none function parameter passing. + NB: Use DI_REG and SI_REG, see ix86_function_value_regno_p. */ + +static int const x86_64_preserve_none_int_parameter_registers[6] = +{ + R12_REG, R13_REG, R14_REG, R15_REG, DI_REG, SI_REG +}; + static int const x86_64_int_return_registers[4] = { AX_REG, DX_REG, DI_REG, SI_REG @@ -460,7 +468,8 @@ int ix86_arch_specified; red-zone. NB: Don't use red-zone for functions with no_caller_saved_registers - and 32 GPRs since 128-byte red-zone is too small for 31 GPRs. + and 32 GPRs or 16 XMM registers since 128-byte red-zone is too small + for 31 GPRs or 15 GPRs + 16 XMM registers. TODO: If we can reserve the first 2 WORDs, for PUSH and, another for CALL, in red-zone, we can allow local indirect jumps with @@ -471,7 +480,7 @@ ix86_using_red_zone (void) { return (TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI - && (!TARGET_APX_EGPR + && ((!TARGET_APX_EGPR && !TARGET_SSE) || (cfun->machine->call_saved_registers != TYPE_NO_CALLER_SAVED_REGISTERS)) && (!cfun->machine->has_local_indirect_jump @@ -898,6 +907,18 @@ x86_64_elf_unique_section (tree decl, int reloc) default_unique_section (decl, reloc); } +/* Return true if TYPE has no_callee_saved_registers or preserve_none + attribute. */ + +bool +ix86_type_no_callee_saved_registers_p (const_tree type) +{ + return (lookup_attribute ("no_callee_saved_registers", + TYPE_ATTRIBUTES (type)) != NULL + || lookup_attribute ("preserve_none", + TYPE_ATTRIBUTES (type)) != NULL); +} + #ifdef COMMON_ASM_OP #ifndef LARGECOMM_SECTION_ASM_OP @@ -1019,11 +1040,10 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) /* Sibling call isn't OK if callee has no callee-saved registers and the calling function has callee-saved registers. */ - if (cfun->machine->call_saved_registers != TYPE_NO_CALLEE_SAVED_REGISTERS - && (cfun->machine->call_saved_registers - != TYPE_NO_CALLEE_SAVED_REGISTERS_EXCEPT_BP) - && lookup_attribute ("no_callee_saved_registers", - TYPE_ATTRIBUTES (type))) + if ((cfun->machine->call_saved_registers + != TYPE_NO_CALLEE_SAVED_REGISTERS) + && cfun->machine->call_saved_registers != TYPE_PRESERVE_NONE + && ix86_type_no_callee_saved_registers_p (type)) return false; /* If outgoing reg parm stack space changes, we cannot do sibcall. */ @@ -1188,10 +1208,16 @@ ix86_comp_type_attributes (const_tree type1, const_tree type2) != ix86_function_regparm (type2, NULL)) return 0; - if (lookup_attribute ("no_callee_saved_registers", - TYPE_ATTRIBUTES (type1)) - != lookup_attribute ("no_callee_saved_registers", - TYPE_ATTRIBUTES (type2))) + if (ix86_type_no_callee_saved_registers_p (type1) + != ix86_type_no_callee_saved_registers_p (type2)) + return 0; + + /* preserve_none attribute uses a different calling convention is + only for 64-bit. 
*/ + if (TARGET_64BIT + && (lookup_attribute ("preserve_none", TYPE_ATTRIBUTES (type1)) + != lookup_attribute ("preserve_none", + TYPE_ATTRIBUTES (type2)))) return 0; return 1; @@ -1553,7 +1579,10 @@ ix86_function_arg_regno_p (int regno) if (call_abi == SYSV_ABI && regno == AX_REG) return true; - if (call_abi == MS_ABI) + if (cfun + && cfun->machine->call_saved_registers == TYPE_PRESERVE_NONE) + parm_regs = x86_64_preserve_none_int_parameter_registers; + else if (call_abi == MS_ABI) parm_regs = x86_64_ms_abi_int_parameter_registers; else parm_regs = x86_64_int_parameter_registers; @@ -1716,6 +1745,19 @@ ix86_asm_output_function_label (FILE *out_file, const char *fname, } } +/* Output a user-defined label. In AT&T syntax, registers are prefixed + with %, so labels require no punctuation. In Intel syntax, registers + are unprefixed, so labels may clash with registers or other operators, + and require quoting. */ +void +ix86_asm_output_labelref (FILE *file, const char *prefix, const char *label) +{ + if (ASSEMBLER_DIALECT == ASM_ATT) + fprintf (file, "%s%s", prefix, label); + else + fprintf (file, "\"%s%s\"", prefix, label); +} + /* Implementation of call abi switching target hook. Specific to FNDECL the specific call register sets are set. See also ix86_conditional_register_usage for more details. */ @@ -1795,8 +1837,7 @@ ix86_init_pic_reg (void) add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); } - seq = get_insns (); - end_sequence (); + seq = end_sequence (); entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)); insert_insn_on_edge (seq, entry_edge); @@ -1823,6 +1864,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ memset (cum, 0, sizeof (*cum)); + tree preserve_none_type; if (fndecl) { target = cgraph_node::get (fndecl); @@ -1831,12 +1873,24 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ target = target->function_symbol (); local_info_node = cgraph_node::local_info_node (target->decl); cum->call_abi = ix86_function_abi (target->decl); + preserve_none_type = TREE_TYPE (target->decl); } else - cum->call_abi = ix86_function_abi (fndecl); + { + cum->call_abi = ix86_function_abi (fndecl); + preserve_none_type = TREE_TYPE (fndecl); + } } else - cum->call_abi = ix86_function_type_abi (fntype); + { + cum->call_abi = ix86_function_type_abi (fntype); + preserve_none_type = fntype; + } + cum->preserve_none_abi + = (preserve_none_type + && (lookup_attribute ("preserve_none", + TYPE_ATTRIBUTES (preserve_none_type)) + != nullptr)); cum->caller = caller; @@ -1998,8 +2052,7 @@ type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum, if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type) && GET_MODE_INNER (mode) == innermode) { - if (size == 64 && (!TARGET_AVX512F || !TARGET_EVEX512) - && !TARGET_IAMCU) + if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU) { static bool warnedavx512f; static bool warnedavx512f_ret; @@ -3410,9 +3463,15 @@ function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, break; } + const int *parm_regs; + if (cum->preserve_none_abi) + parm_regs = x86_64_preserve_none_int_parameter_registers; + else + parm_regs = x86_64_int_parameter_registers; + return construct_container (mode, orig_mode, type, 0, cum->nregs, cum->sse_nregs, - &x86_64_int_parameter_registers [cum->regno], + &parm_regs[cum->regno], cum->sse_regno); } @@ -4422,7 +4481,7 @@ ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) /* AVX512F values are returned in ZMM0 if available. 
*/ if (size == 64) - return !TARGET_AVX512F || !TARGET_EVEX512; + return !TARGET_AVX512F; } if (mode == XFmode) @@ -4577,6 +4636,12 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) if (max > X86_64_REGPARM_MAX) max = X86_64_REGPARM_MAX; + const int *parm_regs; + if (cum->preserve_none_abi) + parm_regs = x86_64_preserve_none_int_parameter_registers; + else + parm_regs = x86_64_int_parameter_registers; + for (i = cum->regno; i < max; i++) { mem = gen_rtx_MEM (word_mode, @@ -4584,8 +4649,7 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) MEM_NOTRAP_P (mem) = 1; set_mem_alias_set (mem, set); emit_move_insn (mem, - gen_rtx_REG (word_mode, - x86_64_int_parameter_registers[i])); + gen_rtx_REG (word_mode, parm_regs[i])); } if (ix86_varargs_fpr_size) @@ -4739,8 +4803,7 @@ ix86_va_start (tree valist, rtx nextarg) start_sequence (); emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno)); - seq = get_insns (); - end_sequence (); + seq = end_sequence (); push_topmost_sequence (); emit_insn_after (seq, entry_of_function ()); @@ -5180,6 +5243,27 @@ ix86_check_movabs (rtx insn, int opnum) return volatile_ok || !MEM_VOLATILE_P (mem); } +/* Return true if XVECEXP idx of INSN satisfies MOVS arguments. */ +bool +ix86_check_movs (rtx insn, int idx) +{ + rtx pat = PATTERN (insn); + gcc_assert (GET_CODE (pat) == PARALLEL); + + rtx set = XVECEXP (pat, 0, idx); + gcc_assert (GET_CODE (set) == SET); + + rtx dst = SET_DEST (set); + gcc_assert (MEM_P (dst)); + + rtx src = SET_SRC (set); + gcc_assert (MEM_P (src)); + + return (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst)) + && (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)) + || Pmode == word_mode)); +} + /* Return false if INSN contains a MEM with a non-default address space. */ bool ix86_check_no_addr_space (rtx insn) @@ -5356,7 +5440,7 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) switch (GET_MODE_SIZE (mode)) { case 64: - if (TARGET_AVX512F && TARGET_EVEX512) + if (TARGET_AVX512F) return 2; break; case 32: @@ -5409,10 +5493,8 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) { if (TARGET_AVX512VL) return "vpxord\t%x0, %x0, %x0"; - else if (TARGET_EVEX512) - return "vpxord\t%g0, %g0, %g0"; else - gcc_unreachable (); + return "vpxord\t%g0, %g0, %g0"; } return "vpxor\t%x0, %x0, %x0"; @@ -5428,19 +5510,15 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) { if (TARGET_AVX512VL) return "vxorpd\t%x0, %x0, %x0"; - else if (TARGET_EVEX512) - return "vxorpd\t%g0, %g0, %g0"; else - gcc_unreachable (); + return "vxorpd\t%g0, %g0, %g0"; } else { if (TARGET_AVX512VL) return "vpxorq\t%x0, %x0, %x0"; - else if (TARGET_EVEX512) - return "vpxorq\t%g0, %g0, %g0"; else - gcc_unreachable (); + return "vpxorq\t%g0, %g0, %g0"; } } return "vxorpd\t%x0, %x0, %x0"; @@ -5457,19 +5535,15 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) { if (TARGET_AVX512VL) return "vxorps\t%x0, %x0, %x0"; - else if (TARGET_EVEX512) - return "vxorps\t%g0, %g0, %g0"; else - gcc_unreachable (); + return "vxorps\t%g0, %g0, %g0"; } else { if (TARGET_AVX512VL) return "vpxord\t%x0, %x0, %x0"; - else if (TARGET_EVEX512) - return "vpxord\t%g0, %g0, %g0"; else - gcc_unreachable (); + return "vpxord\t%g0, %g0, %g0"; } } return "vxorps\t%x0, %x0, %x0"; @@ -5490,7 +5564,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) case MODE_XI: case MODE_V8DF: case MODE_V16SF: - gcc_assert (TARGET_AVX512F && TARGET_EVEX512); + gcc_assert (TARGET_AVX512F); return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; case MODE_OI: @@ -5506,10 +5580,8 @@ 
standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) { if (TARGET_AVX512VL) return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"; - else if (TARGET_EVEX512) - return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; else - gcc_unreachable (); + return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; } return (TARGET_AVX ? "vpcmpeqd\t%0, %0, %0" @@ -5523,7 +5595,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) { if (GET_MODE_SIZE (mode) == 64) { - gcc_assert (TARGET_AVX512F && TARGET_EVEX512); + gcc_assert (TARGET_AVX512F); return "vpcmpeqd\t%t0, %t0, %t0"; } else if (GET_MODE_SIZE (mode) == 32) @@ -5535,7 +5607,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) } else if (vector_all_ones_zero_extend_quarter_operand (x, mode)) { - gcc_assert (TARGET_AVX512F && TARGET_EVEX512); + gcc_assert (TARGET_AVX512F); return "vpcmpeqd\t%x0, %x0, %x0"; } @@ -5646,8 +5718,6 @@ ix86_get_ssemov (rtx *operands, unsigned size, || memory_operand (operands[1], mode)) gcc_unreachable (); size = 64; - /* We need TARGET_EVEX512 to move into zmm register. */ - gcc_assert (TARGET_EVEX512); switch (type) { case opcode_int: @@ -5686,7 +5756,7 @@ ix86_get_ssemov (rtx *operands, unsigned size, : "%vmovaps"); else opcode = (misaligned_p - ? (TARGET_AVX512BW + ? (TARGET_AVX512BW && evex_reg_p ? "vmovdqu16" : "%vmovdqu") : "%vmovdqa"); @@ -5728,7 +5798,7 @@ ix86_get_ssemov (rtx *operands, unsigned size, : "%vmovaps"); else opcode = (misaligned_p - ? (TARGET_AVX512BW + ? (TARGET_AVX512BW && evex_reg_p ? "vmovdqu8" : "%vmovdqu") : "%vmovdqa"); @@ -5748,7 +5818,7 @@ ix86_get_ssemov (rtx *operands, unsigned size, : "%vmovaps"); else opcode = (misaligned_p - ? (TARGET_AVX512BW + ? (TARGET_AVX512BW && evex_reg_p ? "vmovdqu16" : "%vmovdqu") : "%vmovdqa"); @@ -5909,7 +5979,7 @@ symbolic_reference_mentioned_p (rtx op) const char *fmt; int i; - if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF) + if (SYMBOL_REF_P (op) || LABEL_REF_P (op)) return true; fmt = GET_RTX_FORMAT (GET_CODE (op)); @@ -6456,7 +6526,7 @@ output_set_got (rtx dest, rtx label) xops[0] = dest; - if (TARGET_VXWORKS_RTP && flag_pic) + if (TARGET_VXWORKS_GOTTPIC && TARGET_VXWORKS_RTP && flag_pic) { /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */ xops[2] = gen_rtx_MEM (Pmode, @@ -6701,9 +6771,7 @@ ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined) || !frame_pointer_needed)); case TYPE_NO_CALLEE_SAVED_REGISTERS: - return false; - - case TYPE_NO_CALLEE_SAVED_REGISTERS_EXCEPT_BP: + case TYPE_PRESERVE_NONE: if (regno != HARD_FRAME_POINTER_REGNUM) return false; break; @@ -6780,7 +6848,9 @@ ix86_nsaved_sseregs (void) int nregs = 0; int regno; - if (!TARGET_64BIT_MS_ABI) + if (!TARGET_64BIT_MS_ABI + && (cfun->machine->call_saved_registers + != TYPE_NO_CALLER_SAVED_REGISTERS)) return 0; for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) @@ -6888,6 +6958,26 @@ ix86_pro_and_epilogue_can_use_push2pop2 (int nregs) && (nregs + aligned) >= 3; } +/* Check if push/pop should be used to save/restore registers. */ +static bool +save_regs_using_push_pop (HOST_WIDE_INT to_allocate) +{ + return ((!to_allocate && cfun->machine->frame.nregs <= 1) + || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) + /* If static stack checking is enabled and done with probes, + the registers need to be saved before allocating the frame. 
*/ + || flag_stack_check == STATIC_BUILTIN_STACK_CHECK + /* If stack clash probing needs a loop, then it needs a + scratch register. But the returned register is only guaranteed + to be safe to use after register saves are complete. So if + stack clash protections are enabled and the allocated frame is + larger than the probe interval, then use pushes to save + callee saved registers. */ + || (flag_stack_clash_protection + && !ix86_target_stack_probe () + && to_allocate > get_probe_interval ())); +} + /* Fill structure ix86_frame about frame of currently computed function. */ static void @@ -6968,12 +7058,18 @@ ix86_compute_frame_layout (void) gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT); gcc_assert (preferred_alignment <= stack_alignment_needed); - /* The only ABI saving SSE regs should be 64-bit ms_abi. */ - gcc_assert (TARGET_64BIT || !frame->nsseregs); + /* The only ABI saving SSE regs should be 64-bit ms_abi or with + no_caller_saved_registers attribue. */ + gcc_assert (TARGET_64BIT + || (cfun->machine->call_saved_registers + == TYPE_NO_CALLER_SAVED_REGISTERS) + || !frame->nsseregs); if (TARGET_64BIT && m->call_ms2sysv) { gcc_assert (stack_alignment_needed >= 16); - gcc_assert (!frame->nsseregs); + gcc_assert ((cfun->machine->call_saved_registers + == TYPE_NO_CALLER_SAVED_REGISTERS) + || !frame->nsseregs); } /* For SEH we have to limit the amount of code movement into the prologue. @@ -7172,20 +7268,7 @@ ix86_compute_frame_layout (void) /* Size prologue needs to allocate. */ to_allocate = offset - frame->sse_reg_save_offset; - if ((!to_allocate && frame->nregs <= 1) - || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) - /* If static stack checking is enabled and done with probes, - the registers need to be saved before allocating the frame. */ - || flag_stack_check == STATIC_BUILTIN_STACK_CHECK - /* If stack clash probing needs a loop, then it needs a - scratch register. But the returned register is only guaranteed - to be safe to use after register saves are complete. So if - stack clash protections are enabled and the allocated frame is - larger than the probe interval, then use pushes to save - callee saved registers. */ - || (flag_stack_clash_protection - && !ix86_target_stack_probe () - && to_allocate > get_probe_interval ())) + if (save_regs_using_push_pop (to_allocate)) frame->save_regs_using_mov = false; if (ix86_using_red_zone () @@ -7643,7 +7726,9 @@ ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset) for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) { - ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); + /* Skip registers, already processed by shrink wrap separate. */ + if (!cfun->machine->reg_is_wrapped_separately[regno]) + ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); cfa_offset -= UNITS_PER_WORD; } } @@ -7736,8 +7821,15 @@ pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, add_frame_related_expr = true; } - insn = emit_insn (gen_pro_epilogue_adjust_stack_add - (Pmode, dest, src, addend)); + /* Shrink wrap separate may insert prologue between TEST and JMP. In order + not to affect EFlags, emit add without reg clobbering. 
*/ + if (crtl->shrink_wrapped_separate) + insn = emit_insn (gen_pro_epilogue_adjust_stack_add_nocc + (Pmode, dest, src, addend)); + else + insn = emit_insn (gen_pro_epilogue_adjust_stack_add + (Pmode, dest, src, addend)); + if (style >= 0) ix86_add_queued_cfa_restore_notes (insn); @@ -7921,6 +8013,15 @@ ix86_update_stack_boundary (void) if (ix86_tls_descriptor_calls_expanded_in_cfun && crtl->preferred_stack_boundary < 128) crtl->preferred_stack_boundary = 128; + + /* For 32-bit MS ABI, both the incoming and preferred stack boundaries + are 32 bits, but if force_align_arg_pointer is specified, it should + prefer 128 bits for a backward-compatibility reason, which is also + what the doc suggests. */ + if (lookup_attribute ("force_align_arg_pointer", + TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))) + && crtl->preferred_stack_boundary < 128) + crtl->preferred_stack_boundary = 128; } /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is @@ -7951,8 +8052,7 @@ ix86_get_drap_rtx (void) start_sequence (); drap_vreg = copy_to_reg (arg_ptr); - seq = get_insns (); - end_sequence (); + seq = end_sequence (); insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ())); if (!optimize) @@ -8473,6 +8573,128 @@ output_probe_stack_range (rtx reg, rtx end) return ""; } +/* Data passed to ix86_update_stack_alignment. */ +struct stack_access_data +{ + /* The stack access register. */ + const_rtx reg; + /* Pointer to stack alignment. */ + unsigned int *stack_alignment; +}; + +/* Update the maximum stack slot alignment from memory alignment in PAT. */ + +static void +ix86_update_stack_alignment (rtx, const_rtx pat, void *data) +{ + /* This insn may reference stack slot. Update the maximum stack slot + alignment if the memory is referenced by the stack access register. */ + stack_access_data *p = (stack_access_data *) data; + + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, pat, ALL) + { + auto op = *iter; + if (MEM_P (op)) + { + if (reg_mentioned_p (p->reg, XEXP (op, 0))) + { + unsigned int alignment = MEM_ALIGN (op); + + if (alignment > *p->stack_alignment) + *p->stack_alignment = alignment; + break; + } + else + iter.skip_subrtxes (); + } + } +} + +/* Helper function for ix86_find_all_reg_uses. */ + +static void +ix86_find_all_reg_uses_1 (HARD_REG_SET ®set, + rtx set, unsigned int regno, + auto_bitmap &worklist) +{ + rtx dest = SET_DEST (set); + + if (!REG_P (dest)) + return; + + /* Reject non-Pmode modes. */ + if (GET_MODE (dest) != Pmode) + return; + + unsigned int dst_regno = REGNO (dest); + + if (TEST_HARD_REG_BIT (regset, dst_regno)) + return; + + const_rtx src = SET_SRC (set); + + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, src, ALL) + { + auto op = *iter; + + if (MEM_P (op)) + iter.skip_subrtxes (); + + if (REG_P (op) && REGNO (op) == regno) + { + /* Add this register to register set. */ + add_to_hard_reg_set (®set, Pmode, dst_regno); + bitmap_set_bit (worklist, dst_regno); + break; + } + } +} + +/* Find all registers defined with register REGNO. 
*/ + +static void +ix86_find_all_reg_uses (HARD_REG_SET ®set, + unsigned int regno, auto_bitmap &worklist) +{ + for (df_ref ref = DF_REG_USE_CHAIN (regno); + ref != NULL; + ref = DF_REF_NEXT_REG (ref)) + { + if (DF_REF_IS_ARTIFICIAL (ref)) + continue; + + rtx_insn *insn = DF_REF_INSN (ref); + + if (!NONJUMP_INSN_P (insn)) + continue; + + unsigned int ref_regno = DF_REF_REGNO (ref); + + rtx set = single_set (insn); + if (set) + { + ix86_find_all_reg_uses_1 (regset, set, + ref_regno, worklist); + continue; + } + + rtx pat = PATTERN (insn); + if (GET_CODE (pat) != PARALLEL) + continue; + + for (int i = 0; i < XVECLEN (pat, 0); i++) + { + rtx exp = XVECEXP (pat, 0, i); + + if (GET_CODE (exp) == SET) + ix86_find_all_reg_uses_1 (regset, exp, + ref_regno, worklist); + } + } +} + /* Set stack_frame_required to false if stack frame isn't required. Update STACK_ALIGNMENT to the largest alignment, in bits, of stack slot used if stack frame is required and CHECK_STACK_SLOT is true. */ @@ -8491,10 +8713,6 @@ ix86_find_max_used_stack_alignment (unsigned int &stack_alignment, add_to_hard_reg_set (&set_up_by_prologue, Pmode, HARD_FRAME_POINTER_REGNUM); - /* The preferred stack alignment is the minimum stack alignment. */ - if (stack_alignment > crtl->preferred_stack_boundary) - stack_alignment = crtl->preferred_stack_boundary; - bool require_stack_frame = false; FOR_EACH_BB_FN (bb, cfun) @@ -8506,27 +8724,67 @@ ix86_find_max_used_stack_alignment (unsigned int &stack_alignment, set_up_by_prologue)) { require_stack_frame = true; - - if (check_stack_slot) - { - /* Find the maximum stack alignment. */ - subrtx_iterator::array_type array; - FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL) - if (MEM_P (*iter) - && (reg_mentioned_p (stack_pointer_rtx, - *iter) - || reg_mentioned_p (frame_pointer_rtx, - *iter))) - { - unsigned int alignment = MEM_ALIGN (*iter); - if (alignment > stack_alignment) - stack_alignment = alignment; - } - } + break; } } cfun->machine->stack_frame_required = require_stack_frame; + + /* Stop if we don't need to check stack slot. */ + if (!check_stack_slot) + return; + + /* The preferred stack alignment is the minimum stack alignment. */ + if (stack_alignment > crtl->preferred_stack_boundary) + stack_alignment = crtl->preferred_stack_boundary; + + HARD_REG_SET stack_slot_access; + CLEAR_HARD_REG_SET (stack_slot_access); + + /* Stack slot can be accessed by stack pointer, frame pointer or + registers defined by stack pointer or frame pointer. 
*/ + auto_bitmap worklist; + + add_to_hard_reg_set (&stack_slot_access, Pmode, STACK_POINTER_REGNUM); + bitmap_set_bit (worklist, STACK_POINTER_REGNUM); + + if (frame_pointer_needed) + { + add_to_hard_reg_set (&stack_slot_access, Pmode, + HARD_FRAME_POINTER_REGNUM); + bitmap_set_bit (worklist, HARD_FRAME_POINTER_REGNUM); + } + + unsigned int regno; + + do + { + regno = bitmap_clear_first_set_bit (worklist); + ix86_find_all_reg_uses (stack_slot_access, regno, worklist); + } + while (!bitmap_empty_p (worklist)); + + hard_reg_set_iterator hrsi; + stack_access_data data; + + data.stack_alignment = &stack_alignment; + + EXECUTE_IF_SET_IN_HARD_REG_SET (stack_slot_access, 0, regno, hrsi) + for (df_ref ref = DF_REG_USE_CHAIN (regno); + ref != NULL; + ref = DF_REF_NEXT_REG (ref)) + { + if (DF_REF_IS_ARTIFICIAL (ref)) + continue; + + rtx_insn *insn = DF_REF_INSN (ref); + + if (!NONJUMP_INSN_P (insn)) + continue; + + data.reg = DF_REF_REG (ref); + note_stores (insn, ix86_update_stack_alignment, &data); + } } /* Finalize stack_realign_needed and frame_pointer_needed flags, which @@ -9036,11 +9294,22 @@ ix86_expand_prologue (void) doing this if we have to probe the stack; at least on x86_64 the stack probe can turn into a call that clobbers a red zone location. */ else if (ix86_using_red_zone () - && (! TARGET_STACK_PROBE - || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) + && (! TARGET_STACK_PROBE + || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) { + HOST_WIDE_INT allocate_offset; + if (crtl->shrink_wrapped_separate) + { + allocate_offset = m->fs.sp_offset - frame.stack_pointer_offset; + + /* Adjust the total offset at the beginning of the function. */ + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (allocate_offset), -1, + m->fs.cfa_reg == stack_pointer_rtx); + m->fs.sp_offset = cfun->machine->frame.stack_pointer_offset; + } + ix86_emit_save_regs_using_mov (frame.reg_save_offset); - cfun->machine->red_zone_used = true; int_registers_saved = true; } } @@ -9618,30 +9887,35 @@ ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset, for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) { - rtx reg = gen_rtx_REG (word_mode, regno); - rtx mem; - rtx_insn *insn; - mem = choose_baseaddr (cfa_offset, NULL); - mem = gen_frame_mem (word_mode, mem); - insn = emit_move_insn (reg, mem); - - if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg)) + /* Skip registers, already processed by shrink wrap separate. */ + if (!cfun->machine->reg_is_wrapped_separately[regno]) { - /* Previously we'd represented the CFA as an expression - like *(%ebp - 8). We've just popped that value from - the stack, which means we need to reset the CFA to - the drap register. This will remain until we restore - the stack pointer. */ - add_reg_note (insn, REG_CFA_DEF_CFA, reg); - RTX_FRAME_RELATED_P (insn) = 1; + rtx reg = gen_rtx_REG (word_mode, regno); + rtx mem; + rtx_insn *insn; - /* This means that the DRAP register is valid for addressing. */ - m->fs.drap_valid = true; - } - else - ix86_add_cfa_restore_note (NULL, reg, cfa_offset); + mem = choose_baseaddr (cfa_offset, NULL); + mem = gen_frame_mem (word_mode, mem); + insn = emit_move_insn (reg, mem); + if (m->fs.cfa_reg == crtl->drap_reg + && regno == REGNO (crtl->drap_reg)) + { + /* Previously we'd represented the CFA as an expression + like *(%ebp - 8). 
We've just popped that value from + the stack, which means we need to reset the CFA to + the drap register. This will remain until we restore + the stack pointer. */ + add_reg_note (insn, REG_CFA_DEF_CFA, reg); + RTX_FRAME_RELATED_P (insn) = 1; + + /* DRAP register is valid for addressing. */ + m->fs.drap_valid = true; + } + else + ix86_add_cfa_restore_note (NULL, reg, cfa_offset); + } cfa_offset -= UNITS_PER_WORD; } } @@ -9920,10 +10194,11 @@ ix86_expand_epilogue (int style) less work than reloading sp and popping the register. */ else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1) restore_regs_via_mov = true; - else if (TARGET_EPILOGUE_USING_MOVE - && cfun->machine->use_fast_prologue_epilogue - && (frame.nregs > 1 - || m->fs.sp_offset != reg_save_offset)) + else if (crtl->shrink_wrapped_separate + || (TARGET_EPILOGUE_USING_MOVE + && cfun->machine->use_fast_prologue_epilogue + && (frame.nregs > 1 + || m->fs.sp_offset != reg_save_offset))) restore_regs_via_mov = true; else if (frame_pointer_needed && !frame.nregs @@ -9937,6 +10212,9 @@ ix86_expand_epilogue (int style) else restore_regs_via_mov = false; + if (crtl->shrink_wrapped_separate) + gcc_assert (restore_regs_via_mov); + if (restore_regs_via_mov || frame.nsseregs) { /* Ensure that the entire register save area is addressable via @@ -9989,6 +10267,7 @@ ix86_expand_epilogue (int style) gcc_assert (m->fs.sp_offset == UNITS_PER_WORD); gcc_assert (!crtl->drap_reg); gcc_assert (!frame.nregs); + gcc_assert (!crtl->shrink_wrapped_separate); } else if (restore_regs_via_mov) { @@ -10003,6 +10282,8 @@ ix86_expand_epilogue (int style) rtx sa = EH_RETURN_STACKADJ_RTX; rtx_insn *insn; + gcc_assert (!crtl->shrink_wrapped_separate); + /* Stack realignment doesn't work with eh_return. */ if (crtl->stack_realign_needed) sorry ("Stack realignment not supported with " @@ -10443,8 +10724,7 @@ split_stack_prologue_scratch_regno (void) static GTY(()) rtx split_stack_fn; -/* A SYMBOL_REF for the more stack function when using the large - model. */ +/* A SYMBOL_REF for the more stack function when using the large model. */ static GTY(()) rtx split_stack_fn_large; @@ -11132,7 +11412,7 @@ ix86_force_load_from_GOT_p (rtx x, bool call_p) && (!flag_pic || this_is_asm_operands) && ix86_cmodel != CM_LARGE && ix86_cmodel != CM_LARGE_PIC - && GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_P (x) && ((!call_p && (!ix86_direct_extern_access || (SYMBOL_REF_DECL (x) @@ -11178,20 +11458,23 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x) case UNSPEC_TPOFF: case UNSPEC_NTPOFF: x = XVECEXP (x, 0, 0); - return (GET_CODE (x) == SYMBOL_REF + return (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); case UNSPEC_DTPOFF: x = XVECEXP (x, 0, 0); - return (GET_CODE (x) == SYMBOL_REF + return (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC); + case UNSPEC_SECREL32: + x = XVECEXP (x, 0, 0); + return SYMBOL_REF_P (x); default: return false; } /* We must have drilled down to a symbol. */ - if (GET_CODE (x) == LABEL_REF) + if (LABEL_REF_P (x)) return true; - if (GET_CODE (x) != SYMBOL_REF) + if (!SYMBOL_REF_P (x)) return false; /* FALLTHRU */ @@ -11231,7 +11514,7 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x) case E_OImode: case E_XImode: if (!standard_sse_constant_p (x, mode) - && GET_MODE_SIZE (TARGET_AVX512F && TARGET_EVEX512 + && GET_MODE_SIZE (TARGET_AVX512F ? XImode : (TARGET_AVX ? 
OImode @@ -11318,8 +11601,11 @@ legitimate_pic_operand_p (rtx x) return TARGET_64BIT; case UNSPEC_TPOFF: x = XVECEXP (inner, 0, 0); - return (GET_CODE (x) == SYMBOL_REF + return (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); + case UNSPEC_SECREL32: + x = XVECEXP (inner, 0, 0); + return SYMBOL_REF_P (x); case UNSPEC_MACHOPIC_OFFSET: return legitimate_pic_address_disp_p (x); default: @@ -11370,7 +11656,7 @@ legitimate_pic_address_disp_p (rtx disp) if (INTVAL (op1) >= 16*1024*1024 || INTVAL (op1) < -16*1024*1024) break; - if (GET_CODE (op0) == LABEL_REF) + if (LABEL_REF_P (op0)) return true; if (GET_CODE (op0) == CONST && GET_CODE (XEXP (op0, 0)) == UNSPEC @@ -11379,7 +11665,7 @@ legitimate_pic_address_disp_p (rtx disp) if (GET_CODE (op0) == UNSPEC && XINT (op0, 1) == UNSPEC_PCREL) return true; - if (GET_CODE (op0) != SYMBOL_REF) + if (!SYMBOL_REF_P (op0)) break; /* FALLTHRU */ @@ -11444,8 +11730,8 @@ legitimate_pic_address_disp_p (rtx disp) && XINT (disp, 1) != UNSPEC_PLTOFF)) return false; - if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF - && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF) + if (!SYMBOL_REF_P (XVECEXP (disp, 0, 0)) + && !LABEL_REF_P (XVECEXP (disp, 0, 0))) return false; return true; } @@ -11473,14 +11759,14 @@ legitimate_pic_address_disp_p (rtx disp) /* We need to check for both symbols and labels because VxWorks loads text labels with @GOT rather than @GOTOFF. See gotoff_operand for details. */ - return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF - || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF); + return (SYMBOL_REF_P (XVECEXP (disp, 0, 0)) + || LABEL_REF_P (XVECEXP (disp, 0, 0))); case UNSPEC_GOTOFF: /* Refuse GOTOFF in 64bit mode since it is always 64bit when used. While ABI specify also 32bit relocation but we don't produce it in small PIC model at all. */ - if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF - || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF) + if ((SYMBOL_REF_P (XVECEXP (disp, 0, 0)) + || LABEL_REF_P (XVECEXP (disp, 0, 0))) && !TARGET_64BIT) return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode); return false; @@ -11490,16 +11776,19 @@ legitimate_pic_address_disp_p (rtx disp) if (saw_plus) return false; disp = XVECEXP (disp, 0, 0); - return (GET_CODE (disp) == SYMBOL_REF + return (SYMBOL_REF_P (disp) && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC); case UNSPEC_NTPOFF: disp = XVECEXP (disp, 0, 0); - return (GET_CODE (disp) == SYMBOL_REF + return (SYMBOL_REF_P (disp) && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC); case UNSPEC_DTPOFF: disp = XVECEXP (disp, 0, 0); - return (GET_CODE (disp) == SYMBOL_REF + return (SYMBOL_REF_P (disp) && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC); + case UNSPEC_SECREL32: + disp = XVECEXP (disp, 0, 0); + return SYMBOL_REF_P (disp); } return false; @@ -11777,6 +12066,7 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict, case UNSPEC_INDNTPOFF: case UNSPEC_NTPOFF: case UNSPEC_DTPOFF: + case UNSPEC_SECREL32: break; default: @@ -11802,7 +12092,8 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict, || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC || !CONST_INT_P (XEXP (XEXP (disp, 0), 1)) || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF - && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF)) + && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF + && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_SECREL32)) /* Non-constant pic memory reference. 
*/ return false; } @@ -11839,11 +12130,11 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool strict, that never results in lea, this seems to be easier and correct fix for crash to disable this test. */ } - else if (GET_CODE (disp) != LABEL_REF + else if (!LABEL_REF_P (disp) && !CONST_INT_P (disp) && (GET_CODE (disp) != CONST || !ix86_legitimate_constant_p (Pmode, disp)) - && (GET_CODE (disp) != SYMBOL_REF + && (!SYMBOL_REF_P (disp) || !ix86_legitimate_constant_p (Pmode, disp))) /* Displacement is not constant. */ return false; @@ -11950,10 +12241,10 @@ legitimize_pic_address (rtx orig, rtx reg) else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); } - else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0) + else if ((SYMBOL_REF_P (addr) && SYMBOL_REF_TLS_MODEL (addr) == 0) /* We can't always use @GOTOFF for text labels on VxWorks, see gotoff_operand. */ - || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF)) + || (TARGET_VXWORKS_VAROFF && LABEL_REF_P (addr))) { #if TARGET_PECOFF rtx tmp = legitimize_pe_coff_symbol (addr, true); @@ -12088,8 +12379,8 @@ legitimize_pic_address (rtx orig, rtx reg) /* For %rip addressing, we have to use just disp32, not base nor index. */ if (TARGET_64BIT - && (GET_CODE (base) == SYMBOL_REF - || GET_CODE (base) == LABEL_REF)) + && (SYMBOL_REF_P (base) + || LABEL_REF_P (base))) base = force_reg (mode, base); if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1))) @@ -12126,6 +12417,24 @@ get_thread_pointer (machine_mode tp_mode, bool to_reg) return tp; } +/* Construct the SYMBOL_REF for the _tls_index symbol. */ + +static GTY(()) rtx ix86_tls_index_symbol; + +#if TARGET_WIN32_TLS +static rtx +ix86_tls_index (void) +{ + if (!ix86_tls_index_symbol) + ix86_tls_index_symbol = gen_rtx_SYMBOL_REF (SImode, "_tls_index"); + + if (flag_pic) + return gen_rtx_CONST (Pmode, gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_index_symbol), UNSPEC_PCREL)); + else + return ix86_tls_index_symbol; +} +#endif + /* Construct the SYMBOL_REF for the tls_get_addr function. */ static GTY(()) rtx ix86_tls_symbol; @@ -12133,6 +12442,28 @@ static GTY(()) rtx ix86_tls_symbol; static rtx ix86_tls_get_addr (void) { + if (cfun->machine->call_saved_registers + == TYPE_NO_CALLER_SAVED_REGISTERS) + { + /* __tls_get_addr doesn't preserve vector registers. When a + function with no_caller_saved_registers attribute calls + __tls_get_addr, YMM and ZMM registers will be clobbered. + Issue an error and suggest -mtls-dialect=gnu2 in this case. */ + if (cfun->machine->func_type == TYPE_NORMAL) + error (G_("%<-mtls-dialect=gnu2%> must be used with a function" + " with the %<no_caller_saved_registers%> attribute")); + else + error (cfun->machine->func_type == TYPE_EXCEPTION + ? G_("%<-mtls-dialect=gnu2%> must be used with an" + " exception service routine") + : G_("%<-mtls-dialect=gnu2%> must be used with an" + " interrupt service routine")); + /* Don't issue the same error twice. */ + cfun->machine->func_type = TYPE_NORMAL; + cfun->machine->call_saved_registers + = TYPE_DEFAULT_CALL_SAVED_REGISTERS; + } + if (!ix86_tls_symbol) { const char *sym @@ -12184,6 +12515,26 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) machine_mode tp_mode = Pmode; int type; +#if TARGET_WIN32_TLS + off = gen_const_mem (SImode, ix86_tls_index ()); + set_mem_alias_set (off, GOT_ALIAS_SET); + + tp = gen_const_mem (Pmode, GEN_INT (TARGET_64BIT ? 
88 : 44)); + set_mem_addr_space (tp, DEFAULT_TLS_SEG_REG); + + if (TARGET_64BIT) + off = convert_to_mode (Pmode, off, 1); + + base = force_reg (Pmode, off); + tp = copy_to_mode_reg (Pmode, tp); + + tp = gen_const_mem (Pmode, gen_rtx_PLUS (Pmode, tp, gen_rtx_MULT (Pmode, base, GEN_INT (UNITS_PER_WORD)))); + set_mem_alias_set (tp, GOT_ALIAS_SET); + + base = force_reg (Pmode, tp); + + return gen_rtx_PLUS (Pmode, base, gen_rtx_CONST (Pmode, gen_rtx_UNSPEC (SImode, gen_rtvec (1, x), UNSPEC_SECREL32))); +#else /* Fall back to global dynamic model if tool chain cannot support local dynamic. */ if (TARGET_SUN_TLS && !TARGET_64BIT @@ -12232,13 +12583,13 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) if (TARGET_64BIT) { rtx rax = gen_rtx_REG (Pmode, AX_REG); + rtx rdi = gen_rtx_REG (Pmode, DI_REG); rtx_insn *insns; start_sequence (); emit_call_insn - (gen_tls_global_dynamic_64 (Pmode, rax, x, caddr)); - insns = get_insns (); - end_sequence (); + (gen_tls_global_dynamic_64 (Pmode, rax, x, caddr, rdi)); + insns = end_sequence (); if (GET_MODE (x) != Pmode) x = gen_rtx_ZERO_EXTEND (Pmode, x); @@ -12286,14 +12637,14 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) if (TARGET_64BIT) { rtx rax = gen_rtx_REG (Pmode, AX_REG); + rtx rdi = gen_rtx_REG (Pmode, DI_REG); rtx_insn *insns; rtx eqv; start_sequence (); emit_call_insn - (gen_tls_local_dynamic_base_64 (Pmode, rax, caddr)); - insns = get_insns (); - end_sequence (); + (gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi)); + insns = end_sequence (); /* Attach a unique REG_EQUAL, to allow the RTL optimizers to share the LD_BASE result with other LD model accesses. */ @@ -12406,6 +12757,7 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) } return dest; +#endif } /* Return true if the TLS address requires insn using integer registers. @@ -12552,12 +12904,12 @@ ix86_legitimize_address (rtx x, rtx, machine_mode mode) bool changed = false; unsigned log; - log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0; + log = SYMBOL_REF_P (x) ? 
SYMBOL_REF_TLS_MODEL (x) : 0; if (log) return legitimize_tls_address (x, (enum tls_model) log, false); if (GET_CODE (x) == CONST && GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF + && SYMBOL_REF_P (XEXP (XEXP (x, 0), 0)) && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0)))) { rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0), @@ -12875,6 +13227,9 @@ output_pic_addr_const (FILE *file, rtx x, int code) case UNSPEC_INDNTPOFF: fputs ("@indntpoff", file); break; + case UNSPEC_SECREL32: + fputs ("@secrel32", file); + break; #if TARGET_MACHO case UNSPEC_MACHOPIC_OFFSET: putc ('-', file); @@ -12900,7 +13255,11 @@ i386_output_dwarf_dtprel (FILE *file, int size, rtx x) { fputs (ASM_LONG, file); output_addr_const (file, x); +#if TARGET_WIN32_TLS + fputs ("@secrel32", file); +#else fputs ("@dtpoff", file); +#endif switch (size) { case 4: @@ -12967,7 +13326,7 @@ ix86_delegitimize_tls_address (rtx orig_x) if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF) return orig_x; x = XVECEXP (unspec, 0, 0); - gcc_assert (GET_CODE (x) == SYMBOL_REF); + gcc_assert (SYMBOL_REF_P (x)); if (unspec != XEXP (addr.disp, 0)) x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1)); if (addr.index) @@ -13134,7 +13493,7 @@ ix86_delegitimize_address_1 (rtx x, bool base_term_p) else if (base_term_p && pic_offset_table_rtx && !TARGET_MACHO - && !TARGET_VXWORKS_RTP) + && !TARGET_VXWORKS_VAROFF) { rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp); @@ -13559,10 +13918,11 @@ print_reg (rtx x, int code, FILE *file) H -- print a memory address offset by 8; used for sse high-parts Y -- print condition for XOP pcom* instruction. V -- print naked full integer register name without %. + v -- print segment override prefix + -- print a branch hint as 'cs' or 'ds' prefix ; -- print a semicolon (after prefixes due to bug in older gas). ~ -- print "i" if TARGET_AVX2, "f" otherwise. - ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode + ^ -- print addr32 prefix if Pmode != word_mode M -- print addr32 prefix for TARGET_X32 with VSIB address. ! -- print NOTRACK prefix for jxx/call/ret instructions if required. N -- print maskz if it's constant 0 operand. @@ -14064,6 +14424,28 @@ ix86_print_operand (FILE *file, rtx x, int code) return; + case 'v': + if (MEM_P (x)) + { + switch (MEM_ADDR_SPACE (x)) + { + case ADDR_SPACE_GENERIC: + break; + case ADDR_SPACE_SEG_FS: + fputs ("fs ", file); + break; + case ADDR_SPACE_SEG_GS: + fputs ("gs ", file); + break; + default: + gcc_unreachable (); + } + } + else + output_operand_lossage ("operand is not a memory reference, " + "invalid operand code 'v'"); + return; + case '*': if (ASSEMBLER_DIALECT == ASM_ATT) putc ('*', file); @@ -14138,7 +14520,7 @@ ix86_print_operand (FILE *file, rtx x, int code) return; case '^': - if (TARGET_64BIT && Pmode != word_mode) + if (Pmode != word_mode) fputs ("addr32 ", file); return; @@ -14308,7 +14690,7 @@ ix86_print_operand (FILE *file, rtx x, int code) /* We have patterns that allow zero sets of memory, for instance. In 64-bit mode, we should probably support all 8-byte vectors, since we can in fact encode that into an immediate. 
*/ - if (GET_CODE (x) == CONST_VECTOR) + if (CONST_VECTOR_P (x)) { if (x != CONST0_RTX (GET_MODE (x))) output_operand_lossage ("invalid vector immediate"); @@ -14338,8 +14720,8 @@ ix86_print_operand (FILE *file, rtx x, int code) if (ASSEMBLER_DIALECT == ASM_ATT) putc ('$', file); } - else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF - || GET_CODE (x) == LABEL_REF) + else if (GET_CODE (x) == CONST || SYMBOL_REF_P (x) + || LABEL_REF_P (x)) { if (ASSEMBLER_DIALECT == ASM_ATT) putc ('$', file); @@ -14434,8 +14816,8 @@ ix86_print_operand_address_as (FILE *file, rtx addr, && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) symbol = XEXP (XEXP (disp, 0), 0); - if (GET_CODE (symbol) == LABEL_REF - || (GET_CODE (symbol) == SYMBOL_REF + if (LABEL_REF_P (symbol) + || (SYMBOL_REF_P (symbol) && SYMBOL_REF_TLS_MODEL (symbol) == 0)) base = pc_rtx; } @@ -14523,7 +14905,7 @@ ix86_print_operand_address_as (FILE *file, rtx addr, { if (flag_pic) output_pic_addr_const (file, disp, 0); - else if (GET_CODE (disp) == LABEL_REF) + else if (LABEL_REF_P (disp)) output_asm_label (disp); else output_addr_const (file, disp); @@ -14559,7 +14941,7 @@ ix86_print_operand_address_as (FILE *file, rtx addr, if (flag_pic) output_pic_addr_const (file, disp, 0); - else if (GET_CODE (disp) == LABEL_REF) + else if (LABEL_REF_P (disp)) output_asm_label (disp); else if (CONST_INT_P (disp)) offset = disp; @@ -14653,6 +15035,10 @@ i386_asm_output_addr_const_extra (FILE *file, rtx x) output_addr_const (file, op); fputs ("@indntpoff", file); break; + case UNSPEC_SECREL32: + output_addr_const (file, op); + fputs ("@secrel32", file); + break; #if TARGET_MACHO case UNSPEC_MACHOPIC_OFFSET: output_addr_const (file, op); @@ -15507,7 +15893,7 @@ ix86_output_addr_diff_elt (FILE *file, int value, int rel) gcc_assert (!TARGET_64BIT); #endif /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */ - if (TARGET_64BIT || TARGET_VXWORKS_RTP) + if (TARGET_64BIT || TARGET_VXWORKS_VAROFF) fprintf (file, "%s%s%d-%s%d\n", directive, LPREFIX, value, LPREFIX, rel); #if TARGET_MACHO @@ -16339,6 +16725,10 @@ ix86_convert_const_vector_to_integer (rtx op, machine_mode mode) val = wi::insert (val, wv, innermode_bits * i, innermode_bits); } break; + case E_V1SImode: + case E_V1DImode: + op = CONST_VECTOR_ELT (op, 0); + return INTVAL (op); case E_V2HFmode: case E_V2BFmode: case E_V4HFmode: @@ -17314,8 +17704,8 @@ ix86_rip_relative_addr_p (struct ix86_address *parts) && CONST_INT_P (XEXP (symbol, 1))) symbol = XEXP (symbol, 0); - if (GET_CODE (symbol) == LABEL_REF - || (GET_CODE (symbol) == SYMBOL_REF + if (LABEL_REF_P (symbol) + || (SYMBOL_REF_P (symbol) && SYMBOL_REF_TLS_MODEL (symbol) == 0) || (GET_CODE (symbol) == UNSPEC && (XINT (symbol, 1) == UNSPEC_GOTPCREL @@ -17905,9 +18295,14 @@ ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type) if (cum->decl && !TREE_PUBLIC (cum->decl)) return; - const_tree ctx = get_ultimate_context (cum->decl); - if (ctx != NULL_TREE - && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx)) + tree decl = cum->decl; + if (!decl) + /* If we don't know the target, look at the current TU. 
*/ + decl = current_function_decl; + + const_tree ctx = get_ultimate_context (decl); + if (ctx == NULL_TREE + || !TRANSLATION_UNIT_WARN_EMPTY_P (ctx)) return; /* If the actual size of the type is zero, then there is no change @@ -19634,7 +20029,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) tree utype, ures, vce; utype = unsigned_type_for (TREE_TYPE (arg0)); /* PABSB/W/D/Q store the unsigned result in dst, use ABSU_EXPR - instead of ABS_EXPR to hanlde overflow case(TYPE_MIN). */ + instead of ABS_EXPR to handle overflow case(TYPE_MIN). */ ures = gimple_build (&stmts, ABSU_EXPR, utype, arg0); gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); loc = gimple_location (stmt); @@ -20044,14 +20439,10 @@ ix86_vectorize_builtin_scatter (const_tree vectype, { bool si; enum ix86_builtins code; - const machine_mode mode = TYPE_MODE (TREE_TYPE (vectype)); if (!TARGET_AVX512F) return NULL_TREE; - if (!TARGET_EVEX512 && GET_MODE_SIZE (mode) == 64) - return NULL_TREE; - if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 2u) ? !TARGET_USE_SCATTER_2PARTS : (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u) @@ -20794,7 +21185,11 @@ ix86_can_change_mode_class (machine_mode from, machine_mode to, return true; /* x87 registers can't do subreg at all, as all values are reformatted - to extended precision. */ + to extended precision. + + ??? middle-end queries mode changes for ALL_REGS and this makes + vec_series_lowpart_p to always return false. We probably should + restrict this to modes supported by i387 and check if it is enabled. */ if (MAYBE_FLOAT_CLASS_P (regclass)) return false; @@ -21118,8 +21513,7 @@ ix86_hard_regno_nregs (unsigned int regno, machine_mode mode) /* Register pair for mask registers. */ if (mode == P2QImode || mode == P2HImode) return 2; - if (mode == V64SFmode || mode == V64SImode) - return 4; + return 1; } @@ -21169,7 +21563,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) - any of 512-bit wide vector mode - any scalar mode. */ if (TARGET_AVX512F - && ((VALID_AVX512F_REG_OR_XI_MODE (mode) && TARGET_EVEX512) + && ((VALID_AVX512F_REG_OR_XI_MODE (mode)) || VALID_AVX512F_SCALAR_MODE (mode))) return true; @@ -21340,19 +21734,20 @@ ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2) return mode1 == SFmode; /* If MODE2 is only appropriate for an SSE register, then tie with - any other mode acceptable to SSE registers. */ - if (GET_MODE_SIZE (mode2) == 64 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 64 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - if (GET_MODE_SIZE (mode2) == 32 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 32 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - if (GET_MODE_SIZE (mode2) == 16 + any vector modes or scalar floating point modes acceptable to SSE + registers, excluding scalar integer modes with SUBREG: + (subreg:QI (reg:TI 99) 0)) + (subreg:HI (reg:TI 99) 0)) + (subreg:SI (reg:TI 99) 0)) + (subreg:DI (reg:TI 99) 0)) + to avoid unnecessary move from SSE register to integer register. 
+ */ + if (GET_MODE_SIZE (mode2) >= 16 + && (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2) + || ((VECTOR_MODE_P (mode1) || SCALAR_FLOAT_MODE_P (mode1)) + && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2))) && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 16 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); + return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1); /* If MODE2 is appropriate for an MMX register, then tie with any other mode acceptable to MMX registers. */ @@ -21410,7 +21805,7 @@ ix86_set_reg_reg_cost (machine_mode mode) case MODE_VECTOR_INT: case MODE_VECTOR_FLOAT: - if ((TARGET_AVX512F && TARGET_EVEX512 && VALID_AVX512F_REG_MODE (mode)) + if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) || (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) || (TARGET_SSE && VALID_SSE_REG_MODE (mode)) @@ -21471,7 +21866,7 @@ ix86_widen_mult_cost (const struct processor_costs *cost, /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend, require extra 4 mul, 4 add, 4 cmp and 2 shift. */ if (!TARGET_SSE4_1 && !uns_p) - extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4 + extra_cost = (cost->mulss + cost->sse_op + cost->sse_op) * 4 + cost->sse_op * 2; /* Fallthru. */ case V4DImode: @@ -21521,11 +21916,11 @@ ix86_multiplication_cost (const struct processor_costs *cost, else if (TARGET_AVX2) nops += 2; else if (TARGET_XOP) - extra += cost->sse_load[2]; + extra += COSTS_N_INSNS (cost->sse_load[2]) / 2; else { nops += 1; - extra += cost->sse_load[2]; + extra += COSTS_N_INSNS (cost->sse_load[2]) / 2; } goto do_qimode; @@ -21544,13 +21939,13 @@ ix86_multiplication_cost (const struct processor_costs *cost, { nmults += 1; nops += 2; - extra += cost->sse_load[2]; + extra += COSTS_N_INSNS (cost->sse_load[2]) / 2; } else { nmults += 1; nops += 4; - extra += cost->sse_load[2]; + extra += COSTS_N_INSNS (cost->sse_load[2]) / 2; } goto do_qimode; @@ -21563,14 +21958,16 @@ ix86_multiplication_cost (const struct processor_costs *cost, { nmults += 1; nops += 4; - extra += cost->sse_load[3] * 2; + /* 2 loads, so no division by 2. */ + extra += COSTS_N_INSNS (cost->sse_load[3]); } goto do_qimode; case V64QImode: nmults = 2; nops = 9; - extra = cost->sse_load[3] * 2 + cost->sse_load[4] * 2; + /* 2 loads of each size, so no division by 2. */ + extra = COSTS_N_INSNS (cost->sse_load[3] + cost->sse_load[4]); do_qimode: return ix86_vec_cost (mode, cost->mulss * nmults @@ -21663,7 +22060,7 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, /* Use vpbroadcast. */ extra = cost->sse_op; else - extra = cost->sse_load[2]; + extra = COSTS_N_INSNS (cost->sse_load[2]) / 2; if (constant_op1) { @@ -21694,7 +22091,7 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, shift with one insn set the cost to prefer paddb. */ if (constant_op1) { - extra = cost->sse_load[2]; + extra = COSTS_N_INSNS (cost->sse_load[2]) / 2; return ix86_vec_cost (mode, cost->sse_op) + extra; } else @@ -21709,7 +22106,9 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, /* Use vpbroadcast. */ extra = cost->sse_op; else - extra = (mode == V16QImode) ? cost->sse_load[2] : cost->sse_load[3]; + extra = COSTS_N_INSNS (mode == V16QImode + ? cost->sse_load[2] + : cost->sse_load[3]) / 2; if (constant_op1) { @@ -21836,6 +22235,15 @@ vec_fp_conversion_cost (const struct processor_costs *cost, int size) return cost->vcvtps2pd512; } +/* Return true of X is UNSPEC with UNSPEC_PCMP or UNSPEC_UNSIGNED_PCMP. 
*/ + +static bool +unspec_pcmp_p (rtx x) +{ + return GET_CODE (x) == UNSPEC + && (XINT (x, 1) == UNSPEC_PCMP || XINT (x, 1) == UNSPEC_UNSIGNED_PCMP); +} + /* Compute a (partial) cost for rtx X. Return true if the complete cost has been computed, and false if subexpressions should be scanned. In either case, *TOTAL contains the cost result. */ @@ -21853,9 +22261,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, /* Handling different vternlog variants. */ if ((GET_MODE_SIZE (mode) == 64 - ? (TARGET_AVX512F && TARGET_EVEX512) + ? TARGET_AVX512F : (TARGET_AVX512VL - || (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256))) + || (TARGET_AVX512F && !TARGET_PREFER_AVX256))) && GET_MODE_SIZE (mode) >= 16 && outer_code_i == SET && ternlog_operand (x, mode)) @@ -22204,8 +22612,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, { /* (ior (not ...) ...) can be a single insn in AVX512. */ if (GET_CODE (XEXP (x, 0)) == NOT && TARGET_AVX512F - && ((TARGET_EVEX512 - && GET_MODE_SIZE (mode) == 64) + && (GET_MODE_SIZE (mode) == 64 || (TARGET_AVX512VL && (GET_MODE_SIZE (mode) == 32 || GET_MODE_SIZE (mode) == 16)))) @@ -22296,8 +22703,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, /* (and (not ...) (not ...)) can be a single insn in AVX512. */ if (GET_CODE (right) == NOT && TARGET_AVX512F - && ((TARGET_EVEX512 - && GET_MODE_SIZE (mode) == 64) + && (GET_MODE_SIZE (mode) == 64 || (TARGET_AVX512VL && (GET_MODE_SIZE (mode) == 32 || GET_MODE_SIZE (mode) == 16)))) @@ -22367,8 +22773,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, { /* (not (xor ...)) can be a single insn in AVX512. */ if (GET_CODE (XEXP (x, 0)) == XOR && TARGET_AVX512F - && ((TARGET_EVEX512 - && GET_MODE_SIZE (mode) == 64) + && (GET_MODE_SIZE (mode) == 64 || (TARGET_AVX512VL && (GET_MODE_SIZE (mode) == 32 || GET_MODE_SIZE (mode) == 16)))) @@ -22512,6 +22917,27 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, else *total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode)); return false; + case FLOAT: + case UNSIGNED_FLOAT: + if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + /* TODO: We do not have cost tables for x87. */ + *total = cost->fadd; + else if (VECTOR_MODE_P (mode)) + *total = ix86_vec_cost (mode, cost->cvtpi2ps); + else + *total = cost->cvtsi2ss; + return false; + + case FIX: + case UNSIGNED_FIX: + if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + /* TODO: We do not have cost tables for x87. */ + *total = cost->fadd; + else if (VECTOR_MODE_P (mode)) + *total = ix86_vec_cost (mode, cost->cvtps2pi); + else + *total = cost->cvtss2si; + return false; case ABS: /* SSE requires memory load for the constant operand. It may make @@ -22571,13 +22997,41 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } return false; - case VEC_SELECT: case VEC_CONCAT: /* ??? Assume all of these vector manipulation patterns are recognizable. In which case they all pretty much have the - same cost. */ + same cost. + ??? We should still recruse when computing cost. */ *total = cost->sse_op; return true; + + case VEC_SELECT: + /* Special case extracting lower part from the vector. + This by itself needs to code and most of SSE/AVX instructions have + packed and single forms where the single form may be represented + by such VEC_SELECT. + + Use cost 1 (despite the fact that functionally equivalent SUBREG has + cost 0). 
Making VEC_SELECT completely free, for example instructs CSE + to forward propagate VEC_SELECT into + + (set (reg eax) (reg src)) + + which then prevents fwprop and combining. See i.e. + gcc.target/i386/pr91103-1.c. + + ??? rtvec_series_p test should be, for valid patterns, equivalent to + vec_series_lowpart_p but is not, since the latter calls + can_cange_mode_class on ALL_REGS and this return false since x87 does + not support subregs at all. */ + if (rtvec_series_p (XVEC (XEXP (x, 1), 0), 0)) + *total = rtx_cost (XEXP (x, 0), GET_MODE (XEXP (x, 0)), + outer_code, opno, speed) + 1; + else + /* ??? We should still recruse when computing cost. */ + *total = cost->sse_op; + return true; + case VEC_DUPLICATE: *total = rtx_cost (XEXP (x, 0), GET_MODE (XEXP (x, 0)), @@ -22590,13 +23044,87 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case VEC_MERGE: mask = XEXP (x, 2); + /* Scalar versions of SSE instructions may be represented as: + + (vec_merge (vec_duplicate (operation ....)) + (register or memory) + (const_int 1)) + + In this case vec_merge and vec_duplicate is for free. + Just recurse into operation and second operand. */ + if (mask == const1_rtx + && GET_CODE (XEXP (x, 0)) == VEC_DUPLICATE) + { + *total = rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed) + + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed); + return true; + } /* This is masked instruction, assume the same cost, as nonmasked variant. */ - if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask))) - *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed); + else if (TARGET_AVX512F + && (register_operand (mask, GET_MODE (mask)) + /* Redunduant clean up of high bits for kmask with VL=2/4 + .i.e (vec_merge op0, op1, (and op3 15)). */ + || (GET_CODE (mask) == AND + && register_operand (XEXP (mask, 0), GET_MODE (mask)) + && CONST_INT_P (XEXP (mask, 1)) + && ((INTVAL (XEXP (mask, 1)) == 3 + && GET_MODE_NUNITS (mode) == 2) + || (INTVAL (XEXP (mask, 1)) == 15 + && GET_MODE_NUNITS (mode) == 4))))) + { + *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) + + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed); + return true; + } + /* Combination of the two above: + + (vec_merge (vec_merge (vec_duplicate (operation ...)) + (register or memory) + (reg:QI mask)) + (register or memory) + (const_int 1)) + + i.e. avx512fp16_vcvtss2sh_mask. */ + else if (TARGET_AVX512F + && mask == const1_rtx + && GET_CODE (XEXP (x, 0)) == VEC_MERGE + && GET_CODE (XEXP (XEXP (x, 0), 0)) == VEC_DUPLICATE + && register_operand (XEXP (XEXP (x, 0), 2), + GET_MODE (XEXP (XEXP (x, 0), 2)))) + { + *total = rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), + mode, outer_code, opno, speed) + + rtx_cost (XEXP (XEXP (x, 0), 1), + mode, outer_code, opno, speed) + + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed); + return true; + } + /* vcmp. */ + else if (unspec_pcmp_p (mask) + || (GET_CODE (mask) == NOT + && unspec_pcmp_p (XEXP (mask, 0)))) + { + rtx uns = GET_CODE (mask) == NOT ? XEXP (mask, 0) : mask; + rtx unsop0 = XVECEXP (uns, 0, 0); + /* Make (subreg:V4SI (not:V16QI (reg:V16QI ..)) 0) + cost the same as register. + This is used by avx_cmp<mode>3_ltint_not. 
*/ + if (SUBREG_P (unsop0)) + unsop0 = XEXP (unsop0, 0); + if (GET_CODE (unsop0) == NOT) + unsop0 = XEXP (unsop0, 0); + *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) + + rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed) + + rtx_cost (unsop0, mode, UNSPEC, opno, speed) + + rtx_cost (XVECEXP (uns, 0, 1), mode, UNSPEC, opno, speed) + + cost->sse_op; + return true; + } else *total = cost->sse_op; - return true; + return false; case MEM: /* CONST_VECTOR_DUPLICATE_P in constant_pool is just broadcast. @@ -22613,7 +23141,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } /* An insn that accesses memory is slightly more expensive - than one that does not. */ + than one that does not. */ if (speed) { *total += 1; @@ -22625,7 +23153,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, So current solution is make constant disp as cheap as possible. */ if (GET_CODE (addr) == PLUS && x86_64_immediate_operand (XEXP (addr, 1), Pmode) - /* Only hanlde (reg + disp) since other forms of addr are mostly LEA, + /* Only handle (reg + disp) since other forms of addr are mostly LEA, there's no additional cost for the plus of disp. */ && register_operand (XEXP (addr, 0), Pmode)) { @@ -22854,7 +23382,9 @@ x86_this_parameter (tree function) { const int *parm_regs; - if (ix86_function_type_abi (type) == MS_ABI) + if (lookup_attribute ("preserve_none", TYPE_ATTRIBUTES (type))) + parm_regs = x86_64_preserve_none_int_parameter_registers; + else if (ix86_function_type_abi (type) == MS_ABI) parm_regs = x86_64_ms_abi_int_parameter_registers; else parm_regs = x86_64_int_parameter_registers; @@ -23180,19 +23710,21 @@ x86_field_alignment (tree type, int computed) /* Print call to TARGET to FILE. */ static void -x86_print_call_or_nop (FILE *file, const char *target) +x86_print_call_or_nop (FILE *file, const char *target, + const char *label) { if (flag_nop_mcount || !strcmp (target, "nop")) /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ - fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); + fprintf (file, "%s" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n", + label); else if (!TARGET_PECOFF && flag_pic) { gcc_assert (flag_plt); - fprintf (file, "1:\tcall\t%s@PLT\n", target); + fprintf (file, "%s\tcall\t%s@PLT\n", label, target); } else - fprintf (file, "1:\tcall\t%s\n", target); + fprintf (file, "%s\tcall\t%s\n", label, target); } static bool @@ -23277,6 +23809,13 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) const char *mcount_name = MCOUNT_NAME; + bool fentry_section_p + = (flag_record_mcount + || lookup_attribute ("fentry_section", + DECL_ATTRIBUTES (current_function_decl))); + + const char *label = fentry_section_p ? 
"1:" : ""; + if (current_fentry_name (&mcount_name)) ; else if (fentry_name) @@ -23312,11 +23851,12 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) reg = legacy_reg; } if (ASSEMBLER_DIALECT == ASM_INTEL) - fprintf (file, "1:\tmovabs\t%s, OFFSET FLAT:%s\n" - "\tcall\t%s\n", reg, mcount_name, reg); + fprintf (file, "%s\tmovabs\t%s, OFFSET FLAT:%s\n" + "\tcall\t%s\n", label, reg, mcount_name, + reg); else - fprintf (file, "1:\tmovabsq\t$%s, %%%s\n\tcall\t*%%%s\n", - mcount_name, reg, reg); + fprintf (file, "%s\tmovabsq\t$%s, %%%s\n\tcall\t*%%%s\n", + label, mcount_name, reg, reg); break; case CM_LARGE_PIC: #ifdef NO_PROFILE_COUNTERS @@ -23357,21 +23897,21 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) if (!flag_plt) { if (ASSEMBLER_DIALECT == ASM_INTEL) - fprintf (file, "1:\tcall\t[QWORD PTR %s@GOTPCREL[rip]]\n", - mcount_name); + fprintf (file, "%s\tcall\t[QWORD PTR %s@GOTPCREL[rip]]\n", + label, mcount_name); else - fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", - mcount_name); + fprintf (file, "%s\tcall\t*%s@GOTPCREL(%%rip)\n", + label, mcount_name); break; } /* fall through */ default: - x86_print_call_or_nop (file, mcount_name); + x86_print_call_or_nop (file, mcount_name, label); break; } } else - x86_print_call_or_nop (file, mcount_name); + x86_print_call_or_nop (file, mcount_name, label); } else if (flag_pic) { @@ -23386,11 +23926,13 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) LPREFIX, labelno); #endif if (flag_plt) - x86_print_call_or_nop (file, mcount_name); + x86_print_call_or_nop (file, mcount_name, label); else if (ASSEMBLER_DIALECT == ASM_INTEL) - fprintf (file, "1:\tcall\t[DWORD PTR %s@GOT[ebx]]\n", mcount_name); + fprintf (file, "%s\tcall\t[DWORD PTR %s@GOT[ebx]]\n", + label, mcount_name); else - fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name); + fprintf (file, "%s\tcall\t*%s@GOT(%%ebx)\n", + label, mcount_name); } else { @@ -23403,12 +23945,10 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) fprintf (file, "\tmovl\t$%sP%d, %%" PROFILE_COUNT_REGISTER "\n", LPREFIX, labelno); #endif - x86_print_call_or_nop (file, mcount_name); + x86_print_call_or_nop (file, mcount_name, label); } - if (flag_record_mcount - || lookup_attribute ("fentry_section", - DECL_ATTRIBUTES (current_function_decl))) + if (fentry_section_p) { const char *sname = "__mcount_loc"; @@ -24167,7 +24707,7 @@ ix86_vector_mode_supported_p (machine_mode mode) return true; if (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) return true; - if (TARGET_AVX512F && TARGET_EVEX512 && VALID_AVX512F_REG_MODE (mode)) + if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) return true; if ((TARGET_MMX || TARGET_MMX_WITH_SSE) && VALID_MMX_REG_MODE (mode)) @@ -24269,6 +24809,12 @@ static void map_egpr_constraints (vec<const char *> &constraints) buf.safe_push (cur[j + 1]); j++; break; + case '{': + do + { + buf.safe_push (cur[j]); + } while (cur[j++] != '}'); + break; default: buf.safe_push (cur[j]); break; @@ -24415,8 +24961,7 @@ ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> & /*inputs*/, } } - rtx_insn *seq = get_insns (); - end_sequence (); + rtx_insn *seq = end_sequence (); if (saw_asm_flag) return seq; @@ -24687,20 +25232,14 @@ asm_preferred_eh_data_format (int code, int global) return DW_EH_PE_absptr; } -/* Implement targetm.vectorize.builtin_vectorization_cost. */ +/* Worker for ix86_builtin_vectorization_cost and the fallback calls + from ix86_vector_costs::add_stmt_cost. 
*/ static int -ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, - tree vectype, int) +ix86_default_vector_cost (enum vect_cost_for_stmt type_of_cost, + machine_mode mode) { - bool fp = false; - machine_mode mode = TImode; + bool fp = FLOAT_MODE_P (mode); int index; - if (vectype != NULL) - { - fp = FLOAT_TYPE_P (vectype); - mode = TYPE_MODE (vectype); - } - switch (type_of_cost) { case scalar_stmt: @@ -24759,14 +25298,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, COSTS_N_INSNS (ix86_cost->gather_static + ix86_cost->gather_per_elt - * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + * GET_MODE_NUNITS (mode)) / 2); case vector_scatter_store: return ix86_vec_cost (mode, COSTS_N_INSNS (ix86_cost->scatter_static + ix86_cost->scatter_per_elt - * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + * GET_MODE_NUNITS (mode)) / 2); case cond_branch_taken: return ix86_cost->cond_taken_branch_cost; @@ -24784,7 +25323,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, case vec_construct: { - int n = TYPE_VECTOR_SUBPARTS (vectype); + int n = GET_MODE_NUNITS (mode); /* N - 1 element inserts into an SSE vector, the possible GPR -> XMM move is accounted for in add_stmt_cost. */ if (GET_MODE_BITSIZE (mode) <= 128) @@ -24792,12 +25331,18 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, /* One vinserti128 for combining two SSE vectors for AVX256. */ else if (GET_MODE_BITSIZE (mode) == 256) return ((n - 2) * ix86_cost->sse_op - + ix86_vec_cost (mode, ix86_cost->addss)); + + ix86_vec_cost (mode, ix86_cost->sse_op)); /* One vinserti64x4 and two vinserti128 for combining SSE and AVX256 vectors to AVX512. */ else if (GET_MODE_BITSIZE (mode) == 512) - return ((n - 4) * ix86_cost->sse_op - + 3 * ix86_vec_cost (mode, ix86_cost->addss)); + { + machine_mode half_mode + = mode_for_vector (GET_MODE_INNER (mode), + GET_MODE_NUNITS (mode) / 2).require (); + return ((n - 4) * ix86_cost->sse_op + + 2 * ix86_vec_cost (half_mode, ix86_cost->sse_op) + + ix86_vec_cost (mode, ix86_cost->sse_op)); + } gcc_unreachable (); } @@ -24806,6 +25351,17 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, } } +/* Implement targetm.vectorize.builtin_vectorization_cost. */ +static int +ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, + tree vectype, int) +{ + machine_mode mode = TImode; + if (vectype != NULL) + mode = TYPE_MODE (vectype); + return ix86_default_vector_cost (type_of_cost, mode); +} + /* This function returns the calling abi specific va_list type node. It returns the FNDECL specific va_list type. 
*/ @@ -24965,7 +25521,7 @@ ix86_preferred_simd_mode (scalar_mode mode) switch (mode) { case E_QImode: - if (TARGET_AVX512BW && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512BW && !TARGET_PREFER_AVX256) return V64QImode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V32QImode; @@ -24973,7 +25529,7 @@ ix86_preferred_simd_mode (scalar_mode mode) return V16QImode; case E_HImode: - if (TARGET_AVX512BW && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512BW && !TARGET_PREFER_AVX256) return V32HImode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V16HImode; @@ -24981,7 +25537,7 @@ ix86_preferred_simd_mode (scalar_mode mode) return V8HImode; case E_SImode: - if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) return V16SImode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V8SImode; @@ -24989,7 +25545,7 @@ ix86_preferred_simd_mode (scalar_mode mode) return V4SImode; case E_DImode: - if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) return V8DImode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V4DImode; @@ -25003,16 +25559,15 @@ ix86_preferred_simd_mode (scalar_mode mode) { if (TARGET_PREFER_AVX128) return V8HFmode; - else if (TARGET_PREFER_AVX256 || !TARGET_EVEX512) + else if (TARGET_PREFER_AVX256) return V16HFmode; } - if (TARGET_EVEX512) - return V32HFmode; + return V32HFmode; } return word_mode; case E_BFmode: - if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) return V32BFmode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V16BFmode; @@ -25020,7 +25575,7 @@ ix86_preferred_simd_mode (scalar_mode mode) return V8BFmode; case E_SFmode: - if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) return V16SFmode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V8SFmode; @@ -25028,7 +25583,7 @@ ix86_preferred_simd_mode (scalar_mode mode) return V4SFmode; case E_DFmode: - if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) return V8DFmode; else if (TARGET_AVX && !TARGET_PREFER_AVX128) return V4DFmode; @@ -25048,13 +25603,13 @@ ix86_preferred_simd_mode (scalar_mode mode) static unsigned int ix86_autovectorize_vector_modes (vector_modes *modes, bool all) { - if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + if (TARGET_AVX512F && !TARGET_PREFER_AVX256) { modes->safe_push (V64QImode); modes->safe_push (V32QImode); modes->safe_push (V16QImode); } - else if (TARGET_AVX512F && TARGET_EVEX512 && all) + else if (TARGET_AVX512F && all) { modes->safe_push (V32QImode); modes->safe_push (V16QImode); @@ -25092,7 +25647,7 @@ ix86_get_mask_mode (machine_mode data_mode) unsigned elem_size = vector_size / nunits; /* Scalar mask case. */ - if ((TARGET_AVX512F && TARGET_EVEX512 && vector_size == 64) + if ((TARGET_AVX512F && vector_size == 64) || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)) /* AVX512FP16 only supports vector comparison to kmask for _Float16. */ @@ -25257,36 +25812,10 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar) return new ix86_vector_costs (vinfo, costing_for_scalar); } -/* Return cost of statement doing FP conversion. 
*/ - -static unsigned -fp_conversion_stmt_cost (machine_mode mode, gimple *stmt, bool scalar_p) -{ - int outer_size - = tree_to_uhwi - (TYPE_SIZE - (TREE_TYPE (gimple_assign_lhs (stmt)))); - int inner_size - = tree_to_uhwi - (TYPE_SIZE - (TREE_TYPE (gimple_assign_rhs1 (stmt)))); - int stmt_cost = vec_fp_conversion_cost - (ix86_tune_cost, GET_MODE_BITSIZE (mode)); - /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end - up doing two conversions and packing them. */ - if (!scalar_p && inner_size > outer_size) - { - int n = inner_size / outer_size; - stmt_cost = stmt_cost * n - + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op); - } - return stmt_cost; -} - unsigned ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_vec_info stmt_info, slp_tree node, - tree vectype, int misalign, + tree vectype, int, vect_cost_model_location where) { unsigned retval = 0; @@ -25304,6 +25833,14 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, if (scalar_p) mode = TYPE_MODE (TREE_TYPE (vectype)); } + /* When we are costing a scalar stmt use the scalar stmt to get at the + type of the operation. */ + else if (scalar_p && stmt_info) + if (tree lhs = gimple_get_lhs (stmt_info->stmt)) + { + fp = FLOAT_TYPE_P (TREE_TYPE (lhs)); + mode = TYPE_MODE (TREE_TYPE (lhs)); + } if ((kind == vector_stmt || kind == scalar_stmt) && stmt_info @@ -25326,7 +25863,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, else if (X87_FLOAT_MODE_P (mode)) stmt_cost = ix86_cost->fadd; else - stmt_cost = ix86_cost->add; + stmt_cost = ix86_cost->add; } else stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss @@ -25381,7 +25918,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, (subcode == RSHIFT_EXPR && !TYPE_UNSIGNED (TREE_TYPE (op1))) ? ASHIFTRT : LSHIFTRT, mode, - TREE_CODE (op2) == INTEGER_CST, + TREE_CODE (op2) == INTEGER_CST, cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1, false, false, NULL, NULL); @@ -25390,30 +25927,174 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, case NOP_EXPR: /* Only sign-conversions are free. */ if (tree_nop_conversion_p - (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)), + (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)), TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)))) stmt_cost = 0; else if (fp) - stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt, - scalar_p); + stmt_cost = vec_fp_conversion_cost + (ix86_tune_cost, GET_MODE_BITSIZE (mode)); + break; + + case FLOAT_EXPR: + if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + stmt_cost = ix86_cost->cvtsi2ss; + else if (X87_FLOAT_MODE_P (mode)) + /* TODO: We do not have cost tables for x87. */ + stmt_cost = ix86_cost->fadd; + else + stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtpi2ps); + break; + + case FIX_TRUNC_EXPR: + if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + stmt_cost = ix86_cost->cvtss2si; + else if (X87_FLOAT_MODE_P (mode)) + /* TODO: We do not have cost tables for x87. */ + stmt_cost = ix86_cost->fadd; + else + stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtps2pi); + break; + + case COND_EXPR: + { + /* SSE2 conditinal move sequence is: + pcmpgtd %xmm5, %xmm0 (accounted separately) + pand %xmm0, %xmm2 + pandn %xmm1, %xmm0 + por %xmm2, %xmm0 + while SSE4 uses cmp + blend + and AVX512 masked moves. + + The condition is accounted separately since we usually have + p = a < b + c = p ? x : y + and we will account first statement as setcc. 
Exception is when + p is loaded from memory as bool and then we will not acocunt + the compare, but there is no way to check for this. */ + + int ninsns = TARGET_SSE4_1 ? 1 : 3; + + /* If one of parameters is 0 or -1 the sequence will be simplified: + (if_true & mask) | (if_false & ~mask) -> if_true & mask */ + if (ninsns > 1 + && (zerop (gimple_assign_rhs2 (stmt_info->stmt)) + || zerop (gimple_assign_rhs3 (stmt_info->stmt)) + || integer_minus_onep + (gimple_assign_rhs2 (stmt_info->stmt)) + || integer_minus_onep + (gimple_assign_rhs3 (stmt_info->stmt)))) + ninsns = 1; + + if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + stmt_cost = ninsns * ix86_cost->sse_op; + else if (X87_FLOAT_MODE_P (mode)) + /* x87 requires conditional branch. We don't have cost for + that. */ + ; + else if (VECTOR_MODE_P (mode)) + stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op); + else + /* compare (accounted separately) + cmov. */ + stmt_cost = ix86_cost->add; + } break; - case BIT_IOR_EXPR: - case ABS_EXPR: - case ABSU_EXPR: case MIN_EXPR: case MAX_EXPR: + if (fp) + { + if (X87_FLOAT_MODE_P (mode) + && !SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + /* x87 requires conditional branch. We don't have cost for + that. */ + ; + else + /* minss */ + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); + } + else + { + if (VECTOR_MODE_P (mode)) + { + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); + /* vpmin was introduced in SSE3. + SSE2 needs pcmpgtd + pand + pandn + pxor. + If one of parameters is 0 or -1 the sequence is simplified + to pcmpgtd + pand. */ + if (!TARGET_SSSE3) + { + if (zerop (gimple_assign_rhs2 (stmt_info->stmt)) + || integer_minus_onep + (gimple_assign_rhs2 (stmt_info->stmt))) + stmt_cost *= 2; + else + stmt_cost *= 4; + } + } + else + /* cmp + cmov. */ + stmt_cost = ix86_cost->add * 2; + } + break; + + case ABS_EXPR: + case ABSU_EXPR: + if (fp) + { + if (X87_FLOAT_MODE_P (mode) + && !SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + /* fabs. */ + stmt_cost = ix86_cost->fabs; + else + /* andss of sign bit. */ + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); + } + else + { + if (VECTOR_MODE_P (mode)) + { + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); + /* vabs was introduced in SSE3. + SSE3 uses psrat + pxor + psub. */ + if (!TARGET_SSSE3) + stmt_cost *= 3; + } + else + /* neg + cmov. */ + stmt_cost = ix86_cost->add * 2; + } + break; + + case BIT_IOR_EXPR: case BIT_XOR_EXPR: case BIT_AND_EXPR: case BIT_NOT_EXPR: - if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) - stmt_cost = ix86_cost->sse_op; - else if (VECTOR_MODE_P (mode)) + gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode) + && !X87_FLOAT_MODE_P (mode)); + if (VECTOR_MODE_P (mode)) stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); else stmt_cost = ix86_cost->add; break; + default: + if (truth_value_p (subcode)) + { + if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)) + /* CMPccS? insructions are cheap, so use sse_op. While they + produce a mask which may need to be turned to 0/1 by and, + expect that this will be optimized away in a common case. */ + stmt_cost = ix86_cost->sse_op; + else if (X87_FLOAT_MODE_P (mode)) + /* fcmp + setcc. */ + stmt_cost = ix86_cost->fadd + ix86_cost->add; + else if (VECTOR_MODE_P (mode)) + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); + else + /* setcc. 
*/ + stmt_cost = ix86_cost->add; + break; + } break; } } @@ -25437,41 +26118,60 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, break; } - if (kind == vec_promote_demote - && fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)))) - stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt, scalar_p); + if (kind == vec_promote_demote) + { + int outer_size + = tree_to_uhwi + (TYPE_SIZE + (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)))); + int inner_size + = tree_to_uhwi + (TYPE_SIZE + (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)))); + bool inner_fp = FLOAT_TYPE_P + (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))); + + if (fp && inner_fp) + stmt_cost = vec_fp_conversion_cost + (ix86_tune_cost, GET_MODE_BITSIZE (mode)); + else if (fp && !inner_fp) + stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtpi2ps); + else if (!fp && inner_fp) + stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtps2pi); + else + stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op); + /* VEC_PACK_TRUNC_EXPR and similar demote operations: If outer size is + greater than inner size we will end up doing two conversions and + packing them. We always pack pairs; if the size difference is greater + it is split into multiple demote operations. */ + if (inner_size > outer_size) + stmt_cost = stmt_cost * 2 + + ix86_vec_cost (mode, ix86_cost->sse_op); + } /* If we do elementwise loads into a vector then we are bound by latency and execution resources for the many scalar loads (AGU and load ports). Try to account for this by scaling the construction cost by the number of elements involved. */ if ((kind == vec_construct || kind == vec_to_scalar) - && ((stmt_info - && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type - || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type) - && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE - && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) + && ((node + && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE + || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP + && SLP_TREE_LANES (node) == 1)) + && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF + (SLP_TREE_REPRESENTATIVE (node)))) != INTEGER_CST)) - || (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) - == VMAT_GATHER_SCATTER))) - || (node - && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE - || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP - && SLP_TREE_LANES (node) == 1)) - && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF - (SLP_TREE_REPRESENTATIVE (node)))) - != INTEGER_CST)) - || (SLP_TREE_MEMORY_ACCESS_TYPE (node) - == VMAT_GATHER_SCATTER))))) - { - stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + || (SLP_TREE_MEMORY_ACCESS_TYPE (node) + == VMAT_GATHER_SCATTER))))) + { + stmt_cost = ix86_default_vector_cost (kind, mode); stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); } else if ((kind == vec_construct || kind == scalar_to_vec) && node && SLP_TREE_DEF_TYPE (node) == vect_external_def) { - stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + stmt_cost = ix86_default_vector_cost (kind, mode); unsigned i; tree op; FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) @@ -25511,7 +26211,22 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, else { m_num_gpr_needed[where]++; - stmt_cost += ix86_cost->sse_to_integer; + + int cost = COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2; + + /* For integer construction, the number of actual GPR -> XMM + moves will be somewhere between 0 and n. 
+ We do not have very good idea about actual number, since + the source may be a constant, memory or a chain of + instructions that will be later converted by + scalar-to-vector pass. */ + if (kind == vec_construct + && GET_MODE_BITSIZE (mode) == 256) + cost *= 2; + else if (kind == vec_construct + && GET_MODE_BITSIZE (mode) == 512) + cost *= 3; + stmt_cost += cost; } } } @@ -25520,7 +26235,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, TREE_VISITED (op) = 0; } if (stmt_cost == -1) - stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + stmt_cost = ix86_default_vector_cost (kind, mode); if (kind == vec_perm && vectype && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32) @@ -25603,14 +26318,10 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) /* When X86_TUNE_AVX512_TWO_EPILOGUES is enabled arrange for both a AVX2 and a SSE epilogue for AVX512 vectorized loops. */ if (loop_vinfo + && LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32 && ix86_tune_features[X86_TUNE_AVX512_TWO_EPILOGUES]) - { - if (GET_MODE_SIZE (loop_vinfo->vector_mode) == 64) - m_suggested_epilogue_mode = V32QImode; - else if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) - && GET_MODE_SIZE (loop_vinfo->vector_mode) == 32) - m_suggested_epilogue_mode = V16QImode; - } + m_suggested_epilogue_mode = V16QImode; /* When a 128bit SSE vectorized epilogue still has a VF of 16 or larger enable a 64bit SSE epilogue. */ if (loop_vinfo @@ -25619,6 +26330,65 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () >= 16) m_suggested_epilogue_mode = V8QImode; + /* When X86_TUNE_AVX512_MASKED_EPILOGUES is enabled try to use + a masked epilogue if that doesn't seem detrimental. */ + if (loop_vinfo + && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant () > 2 + && ix86_tune_features[X86_TUNE_AVX512_MASKED_EPILOGUES] + && !OPTION_SET_P (param_vect_partial_vector_usage)) + { + bool avoid = false; + if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0) + { + unsigned int peel_niter + = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); + if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) + peel_niter += 1; + /* When we know the number of scalar iterations of the epilogue, + avoid masking when a single vector epilog iteration handles + it in full. */ + if (pow2p_hwi ((LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter) + % LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())) + avoid = true; + } + if (!avoid && loop_outer (loop_outer (LOOP_VINFO_LOOP (loop_vinfo)))) + for (auto ddr : LOOP_VINFO_DDRS (loop_vinfo)) + { + if (DDR_ARE_DEPENDENT (ddr) == chrec_known) + ; + else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) + ; + else + { + int loop_depth + = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num, + DDR_LOOP_NEST (ddr)); + if (DDR_NUM_DIST_VECTS (ddr) == 1 + && DDR_DIST_VECTS (ddr)[0][loop_depth] == 0) + { + /* Avoid the case when there's an outer loop that might + traverse a multi-dimensional array with the inner + loop just executing the masked epilogue with a + read-write where the next outer iteration might + read from the masked part of the previous write, + 'n' filling half a vector. 
+ for (j = 0; j < m; ++j) + for (i = 0; i < n; ++i) + a[j][i] = c * a[j][i]; */ + avoid = true; + break; + } + } + } + if (!avoid) + { + m_suggested_epilogue_mode = loop_vinfo->vector_mode; + m_masked_epilogue = 1; + } + } + vector_costs::finish_cost (scalar_costs); } @@ -25738,7 +26508,7 @@ ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, { /* If the function isn't exported, we can pick up just one ISA for the clones. */ - if (TARGET_AVX512F && TARGET_EVEX512) + if (TARGET_AVX512F) clonei->vecsize_mangle = 'e'; else if (TARGET_AVX2) clonei->vecsize_mangle = 'd'; @@ -25830,17 +26600,17 @@ ix86_simd_clone_usable (struct cgraph_node *node, machine_mode) return -1; if (!TARGET_AVX) return 0; - return (TARGET_AVX512F && TARGET_EVEX512) ? 3 : TARGET_AVX2 ? 2 : 1; + return TARGET_AVX512F ? 3 : TARGET_AVX2 ? 2 : 1; case 'c': if (!TARGET_AVX) return -1; - return (TARGET_AVX512F && TARGET_EVEX512) ? 2 : TARGET_AVX2 ? 1 : 0; + return TARGET_AVX512F ? 2 : TARGET_AVX2 ? 1 : 0; case 'd': if (!TARGET_AVX2) return -1; - return (TARGET_AVX512F && TARGET_EVEX512) ? 1 : 0; + return TARGET_AVX512F ? 1 : 0; case 'e': - if (!TARGET_AVX512F || !TARGET_EVEX512) + if (!TARGET_AVX512F) return -1; return 0; default: @@ -26055,7 +26825,7 @@ ix86_reloc_rw_mask (void) static bool symbolic_base_address_p (rtx addr) { - if (GET_CODE (addr) == SYMBOL_REF) + if (SYMBOL_REF_P (addr)) return true; if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_GOTOFF) @@ -27512,6 +28282,195 @@ ix86_cannot_copy_insn_p (rtx_insn *insn) #undef TARGET_DOCUMENTATION_NAME #define TARGET_DOCUMENTATION_NAME "x86" +/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */ +sbitmap +ix86_get_separate_components (void) +{ + HOST_WIDE_INT offset, to_allocate; + sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER); + bitmap_clear (components); + struct machine_function *m = cfun->machine; + + offset = m->frame.stack_pointer_offset; + to_allocate = offset - m->frame.sse_reg_save_offset; + + /* Shrink wrap separate uses MOV, which means APX PPX cannot be used. + Experiments show that APX PPX can speed up the prologue. If the function + does not exit early during actual execution, then using APX PPX is faster. + If the function always exits early during actual execution, then shrink + wrap separate reduces the number of MOV (PUSH/POP) instructions actually + executed, thus speeding up execution. + foo: + movl $1, %eax + testq %rdi, %rdi + jne.L60 + ret ---> early return. + .L60: + subq $88, %rsp ---> belong to prologue. + xorl %eax, %eax + movq %rbx, 40 (%rsp) ---> belong to prologue. + movq 8 (%rdi), %rbx + movq %rbp, 48 (%rsp) ---> belong to prologue. + movq %rdi, %rbp + testq %rbx, %rbx + jne.L61 + movq 40 (%rsp), %rbx + movq 48 (%rsp), %rbp + addq $88, %rsp + ret + .L61: + movq %r12, 56 (%rsp) ---> belong to prologue. + movq %r13, 64 (%rsp) ---> belong to prologue. + movq %r14, 72 (%rsp) ---> belong to prologue. + ... ... + + Disable shrink wrap separate when PPX is enabled. */ + if ((TARGET_APX_PPX && !crtl->calls_eh_return) + || cfun->machine->func_type != TYPE_NORMAL + || TARGET_SEH + || crtl->stack_realign_needed + || m->call_ms2sysv) + return components; + + /* Since shrink wrapping separate uses MOV instead of PUSH/POP. + Disable shrink wrap separate when MOV is prohibited. 
*/ + if (save_regs_using_push_pop (to_allocate)) + return components; + + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + /* Skip registers with large offsets, where a pseudo may be needed. */ + if (IN_RANGE (offset, -0x8000, 0x7fff)) + bitmap_set_bit (components, regno); + offset += UNITS_PER_WORD; + } + + /* Don't mess with the following registers. */ + if (frame_pointer_needed) + bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM); + + if (crtl->drap_reg) + bitmap_clear_bit (components, REGNO (crtl->drap_reg)); + + if (pic_offset_table_rtx) + bitmap_clear_bit (components, REAL_PIC_OFFSET_TABLE_REGNUM); + + return components; +} + +/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */ +sbitmap +ix86_components_for_bb (basic_block bb) +{ + bitmap in = DF_LIVE_IN (bb); + bitmap gen = &DF_LIVE_BB_INFO (bb)->gen; + bitmap kill = &DF_LIVE_BB_INFO (bb)->kill; + + sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER); + bitmap_clear (components); + + function_abi_aggregator callee_abis; + rtx_insn *insn; + FOR_BB_INSNS (bb, insn) + if (CALL_P (insn)) + callee_abis.note_callee_abi (insn_callee_abi (insn)); + HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi); + + /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */ + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (!fixed_regs[regno] + && (TEST_HARD_REG_BIT (extra_caller_saves, regno) + || bitmap_bit_p (in, regno) + || bitmap_bit_p (gen, regno) + || bitmap_bit_p (kill, regno))) + bitmap_set_bit (components, regno); + + return components; +} + +/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS. */ +void +ix86_disqualify_components (sbitmap, edge, sbitmap, bool) +{ + /* Nothing to do for x86. */ +} + +/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */ +void +ix86_emit_prologue_components (sbitmap components) +{ + HOST_WIDE_INT cfa_offset; + struct machine_function *m = cfun->machine; + + cfa_offset = m->frame.reg_save_offset + m->fs.sp_offset + - m->frame.stack_pointer_offset; + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + if (bitmap_bit_p (components, regno)) + ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); + cfa_offset -= UNITS_PER_WORD; + } +} + +/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */ +void +ix86_emit_epilogue_components (sbitmap components) +{ + HOST_WIDE_INT cfa_offset; + struct machine_function *m = cfun->machine; + cfa_offset = m->frame.reg_save_offset + m->fs.sp_offset + - m->frame.stack_pointer_offset; + + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + if (bitmap_bit_p (components, regno)) + { + rtx reg = gen_rtx_REG (word_mode, regno); + rtx mem; + rtx_insn *insn; + + mem = choose_baseaddr (cfa_offset, NULL); + mem = gen_frame_mem (word_mode, mem); + insn = emit_move_insn (reg, mem); + + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_CFA_RESTORE, reg); + } + cfa_offset -= UNITS_PER_WORD; + } +} + +/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. 
*/
+void
+ix86_set_handled_components (sbitmap components)
+{
+  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    if (bitmap_bit_p (components, regno))
+      {
+	cfun->machine->reg_is_wrapped_separately[regno] = true;
+	cfun->machine->use_fast_prologue_epilogue = true;
+	cfun->machine->frame.save_regs_using_mov = true;
+      }
+}
+
+#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
+#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS ix86_get_separate_components
+#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
+#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB ix86_components_for_bb
+#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
+#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS ix86_disqualify_components
+#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
+  ix86_emit_prologue_components
+#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
+  ix86_emit_epilogue_components
+#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
+#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS ix86_set_handled_components
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-i386.h"
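As a concrete illustration of where the separate shrink-wrapping hooks above pay off (a hypothetical sketch, not part of the patch): a value that lives across a call is typically assigned a callee-saved register, and with -fshrink-wrap-separate the corresponding save/restore becomes a separate component emitted only on the paths that actually clobber it, so an early-return fast path executes no prologue stores. The function and type names below are illustrative only.

/* Hypothetical C test case, not from the patch.  Because process ()
   is called inside the loop, P and S must live across the call and
   are likely assigned callee-saved registers; their saves can then
   be shrink-wrapped onto the loop path, leaving the "p == NULL"
   early return free of register saves, much like the foo example
   quoted in ix86_get_separate_components.  */
extern long process (long);

struct node { struct node *next; long val; };

long
sum_list (struct node *p)
{
  if (!p)
    return 0;			/* fast path: no callee-saved saves executed */

  long s = 0;
  for (; p; p = p->next)
    s += process (p->val);	/* call forces values into callee-saved regs */
  return s;
}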