Diffstat (limited to 'gcc/config/i386/i386-features.cc')
-rw-r--r-- | gcc/config/i386/i386-features.cc | 1130
1 file changed, 936 insertions, 194 deletions
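The patch below renames and generalizes the old remove_redundant_vector_load ("rrvl") pass into a combined "x86_cse" pass: besides redundant vector constants and broadcasts, it now also merges repeated 64-bit TLS accesses (UNSPEC_TLS_GD, UNSPEC_TLS_LD_BASE, UNSPEC_TLSDESC) by emitting a single TLS call at the nearest common dominator and reusing its result. As a rough, source-level illustration of the kind of redundancy targeted (not taken from the patch; the function and variable names are invented), the C sketch below references the same __thread variable in two different basic blocks, so each arm would otherwise carry its own __tls_get_addr or TLSDESC call sequence under -fpic:

/* Illustrative sketch only; names are invented.  Compile with
   -O2 -fpic -mtls-dialect=gnu (or -mtls-dialect=gnu2 for the
   TLSDESC variant).  */
extern __thread int tls_counter;

int
use_tls_twice (int flag, int x)
{
  /* Each arm references the same TLS symbol, so each basic block gets
     its own TLS address computation ending in a call.  The new pass is
     intended to place one call in the dominating block and let both
     arms, and the final read, reuse the resulting address.  */
  if (flag)
    tls_counter += x;
  else
    tls_counter -= x;
  return tls_counter;
}

Whether both calls actually survive to this RTL pass depends on earlier optimizations; the cross-block case above is the scenario the dominator-based placement in the patch is designed for.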
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index c131577..0608dd2 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3085,21 +3085,68 @@ ix86_rpad_gate () && optimize_function_for_speed_p (cfun)); } +enum x86_cse_kind +{ + X86_CSE_CONST0_VECTOR, + X86_CSE_CONSTM1_VECTOR, + X86_CSE_VEC_DUP, + X86_CSE_TLS_GD, + X86_CSE_TLS_LD_BASE, + X86_CSE_TLSDESC +}; + +struct redundant_pattern +{ + /* Bitmap of basic blocks with broadcast instructions. */ + auto_bitmap bbs; + /* Bitmap of broadcast instructions. */ + auto_bitmap insns; + /* The broadcast inner scalar. */ + rtx val; + /* The actual redundant source value for UNSPEC_TLSDESC. */ + rtx tlsdesc_val; + /* The inner scalar mode. */ + machine_mode mode; + /* The instruction which sets the inner scalar. Nullptr if the inner + scalar is applied to the whole function, instead of within the same + block. */ + rtx_insn *def_insn; + /* The widest broadcast source. */ + rtx broadcast_source; + /* The widest broadcast register. */ + rtx broadcast_reg; + /* The basic block of the broadcast instruction. */ + basic_block bb; + /* The number of broadcast instructions with the same inner scalar. */ + unsigned HOST_WIDE_INT count; + /* The threshold of broadcast instructions with the same inner + scalar. */ + unsigned int threshold; + /* The widest broadcast size in bytes. */ + unsigned int size; + /* Load kind. */ + x86_cse_kind kind; +}; + /* Generate a vector set, DEST = SRC, at entry of the nearest dominator for basic block map BBS, which is in the fake loop that contains the whole function, so that there is only a single vector set in the - whole function. If not nullptr, INNER_SCALAR is the inner scalar of - SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)). */ + whole function. If not nullptr, LOAD is a pointer to the load. */ static void ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs, - rtx inner_scalar = nullptr) + redundant_pattern *load = nullptr) { basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); - while (bb->loop_father->latch - != EXIT_BLOCK_PTR_FOR_FN (cfun)) - bb = get_immediate_dominator (CDI_DOMINATORS, - bb->loop_father->header); + /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop + to avoid extra spills. */ + if (!load || load->kind != X86_CSE_VEC_DUP) + { + while (bb->loop_father->latch + != EXIT_BLOCK_PTR_FOR_FN (cfun)) + bb = get_immediate_dominator (CDI_DOMINATORS, + bb->loop_father->header); + } rtx set = gen_rtx_SET (dest, src); @@ -3141,8 +3188,14 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs, } } - if (inner_scalar) + if (load && load->kind == X86_CSE_VEC_DUP) { + /* Get the source from LOAD as (reg:SI 99) in + + (vec_duplicate:V4SI (reg:SI 99)) + + */ + rtx inner_scalar = load->val; /* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */ rtx reg = XEXP (src, 0); if ((REG_P (inner_scalar) || MEM_P (inner_scalar)) @@ -3226,7 +3279,7 @@ remove_partial_avx_dependency (void) break; } - /* Only hanlde conversion here. */ + /* Only handle conversion here. */ machine_mode src_mode = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode; switch (src_mode) @@ -3489,44 +3542,6 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const, } } -enum x86_cse_kind -{ - X86_CSE_CONST0_VECTOR, - X86_CSE_CONSTM1_VECTOR, - X86_CSE_VEC_DUP -}; - -struct redundant_load -{ - /* Bitmap of basic blocks with broadcast instructions. */ - auto_bitmap bbs; - /* Bitmap of broadcast instructions. 
*/ - auto_bitmap insns; - /* The broadcast inner scalar. */ - rtx val; - /* The inner scalar mode. */ - machine_mode mode; - /* The instruction which sets the inner scalar. Nullptr if the inner - scalar is applied to the whole function, instead of within the same - block. */ - rtx_insn *def_insn; - /* The widest broadcast source. */ - rtx broadcast_source; - /* The widest broadcast register. */ - rtx broadcast_reg; - /* The basic block of the broadcast instruction. */ - basic_block bb; - /* The number of broadcast instructions with the same inner scalar. */ - unsigned HOST_WIDE_INT count; - /* The threshold of broadcast instructions with the same inner - scalar. */ - unsigned int threshold; - /* The widest broadcast size in bytes. */ - unsigned int size; - /* Load kind. */ - x86_cse_kind kind; -}; - /* Return the inner scalar if OP is a broadcast, else return nullptr. */ static rtx @@ -3629,6 +3644,8 @@ ix86_broadcast_inner (rtx op, machine_mode mode, Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an integer constant. */ op = src; + if (mode != GET_MODE (reg)) + op = gen_int_mode (INTVAL (src), mode); *insn_p = nullptr; } else @@ -3669,25 +3686,719 @@ ix86_broadcast_inner (rtx op, machine_mode mode, return op; } -/* At entry of the nearest common dominator for basic blocks with vector - CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest - vector set instruction for all CONST0_RTX and integer CONSTM1_RTX - uses. +/* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC and + put the updated instruction in UPDATED_TLS_INSNS. */ - NB: We want to generate only a single widest vector set to cover the - whole function. The LCM algorithm isn't appropriate here since it - may place a vector set inside the loop. */ +static void +replace_tls_call (rtx src, auto_bitmap &tls_call_insns, + auto_bitmap &updated_tls_insns) +{ + bitmap_iterator bi; + unsigned int id; -static unsigned int -remove_redundant_vector_load (void) + EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi) + { + rtx_insn *insn = DF_INSN_UID_GET (id)->insn; + + /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are + allowed. */ + if (!CALL_P (insn)) + { + attr_tls64 tls64 = get_attr_tls64 (insn); + if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE) + gcc_unreachable (); + } + + rtx pat = PATTERN (insn); + gcc_assert (GET_CODE (pat) == PARALLEL); + rtx set = XVECEXP (pat, 0, 0); + gcc_assert (GET_CODE (set) == SET); + rtx dest = SET_DEST (set); + + set = gen_rtx_SET (dest, src); + rtx_insn *set_insn = emit_insn_after (set, insn); + if (recog_memoized (set_insn) < 0) + gcc_unreachable (); + + /* Put SET_INSN in UPDATED_TLS_INSNS. */ + bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn)); + + if (dump_file) + { + fprintf (dump_file, "\nReplace:\n\n"); + print_rtl_single (dump_file, insn); + fprintf (dump_file, "\nwith:\n\n"); + print_rtl_single (dump_file, set_insn); + fprintf (dump_file, "\n"); + } + + /* Delete the CALL insn. */ + delete_insn (insn); + + df_insn_rescan (set_insn); + } +} + +/* Return the basic block which dominates all basic blocks which set + hard register REGNO used in basic block BB. */ + +static basic_block +ix86_get_dominator_for_reg (unsigned int regno, basic_block bb) +{ + basic_block set_bb; + auto_bitmap set_bbs; + + /* Get all BBs which set REGNO and dominate the current BB from all + DEFs of REGNO. 
*/ + for (df_ref def = DF_REG_DEF_CHAIN (regno); + def; + def = DF_REF_NEXT_REG (def)) + if (!DF_REF_IS_ARTIFICIAL (def) + && !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER) + && !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER)) + { + set_bb = DF_REF_BB (def); + if (dominated_by_p (CDI_DOMINATORS, bb, set_bb)) + bitmap_set_bit (set_bbs, set_bb->index); + } + + bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs); + return bb; +} + +/* Mark FLAGS register as live in DATA, a bitmap of live caller-saved + registers, if DEST is FLAGS register. */ + +static void +ix86_check_flags_reg (rtx dest, const_rtx, void *data) +{ + auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data; + if (REG_P (dest) && REGNO (dest) == FLAGS_REG) + bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG); +} + +/* Emit a TLS_SET instruction of KIND in basic block BB. Store the + insertion point in *BEFORE_P for emit_insn_before or in *AFTER_P + for emit_insn_after. UPDATED_GNU_TLS_INSNS contains instructions + which replace the GNU TLS instructions. UPDATED_GNU2_TLS_INSNS + contains instructions which replace the GNU2 TLS instructions. */ + +static rtx_insn * +ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb, + rtx_insn **before_p, rtx_insn **after_p, + auto_bitmap &updated_gnu_tls_insns, + auto_bitmap &updated_gnu2_tls_insns) +{ + rtx_insn *tls_insn; + + do + { + rtx_insn *insn = BB_HEAD (bb); + while (insn && !NONDEBUG_INSN_P (insn)) + { + if (insn == BB_END (bb)) + { + /* This must be the beginning basic block: + + (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG) + + or a basic block with only a label: + + (code_label 78 11 77 3 14 (nil) [1 uses]) + (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK) + + or a basic block with only a debug marker: + + (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG) + (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil)) + + */ + gcc_assert (DEBUG_INSN_P (insn) + || (NOTE_P (insn) + && ((NOTE_KIND (insn) + == NOTE_INSN_FUNCTION_BEG) + || (NOTE_KIND (insn) + == NOTE_INSN_BASIC_BLOCK)))); + insn = NULL; + break; + } + insn = NEXT_INSN (insn); + } + + /* TLS_GD and TLS_LD_BASE instructions are normal functions which + clobber caller-saved registers. TLSDESC instructions only + clobber FLAGS. If any registers clobbered by TLS instructions + are live in this basic block, we must insert TLS instructions + after all live registers clobbered are dead. */ + + auto_bitmap live_caller_saved_regs; + bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb); + + if (bitmap_bit_p (in, FLAGS_REG)) + bitmap_set_bit (live_caller_saved_regs, FLAGS_REG); + + unsigned int i; + + /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE + instructions. 
*/ + if (kind != X86_CSE_TLSDESC) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (call_used_regs[i] + && !fixed_regs[i] + && bitmap_bit_p (in, i)) + bitmap_set_bit (live_caller_saved_regs, i); + + if (bitmap_empty_p (live_caller_saved_regs)) + { + if (insn == BB_HEAD (bb)) + { + *before_p = insn; + tls_insn = emit_insn_before (tls_set, insn); + } + else + { + /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the + beginning basic block: + + (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG) + + or after NOTE_INSN_BASIC_BLOCK in a basic block with + only a label: + + (code_label 78 11 77 3 14 (nil) [1 uses]) + (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK) + + or after debug marker in a basic block with only a + debug marker: + + (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG) + (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil)) + + */ + insn = insn ? PREV_INSN (insn) : BB_END (bb); + *after_p = insn; + tls_insn = emit_insn_after (tls_set, insn); + } + return tls_insn; + } + + bool repeat = false; + + /* Search for REG_DEAD notes in this basic block. */ + FOR_BB_INSNS (bb, insn) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + + /* NB: Conditional jump is the only instruction which reads + flags register and changes control flow. We can never + place the TLS call after unconditional jump. */ + if (JUMP_P (insn)) + { + /* This must be a conditional jump. */ + rtx label = JUMP_LABEL (insn); + if (label == nullptr + || ANY_RETURN_P (label) + || !(LABEL_P (label) || SYMBOL_REF_P (label))) + gcc_unreachable (); + + /* Place the call before all FLAGS_REG setting BBs since + we can't place a call before nor after a conditional + jump. */ + bb = ix86_get_dominator_for_reg (FLAGS_REG, bb); + + /* Start over again. */ + repeat = true; + break; + } + + if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn))) + { + /* Insert the __tls_get_addr call before INSN which + replaces a __tls_get_addr call. */ + *before_p = insn; + tls_insn = emit_insn_before (tls_set, insn); + return tls_insn; + } + + if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn))) + { + /* Mark FLAGS register as dead since FLAGS register + would be clobbered by the GNU2 TLS instruction. */ + bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG); + continue; + } + + /* Check if FLAGS register is live. */ + note_stores (insn, ix86_check_flags_reg, + &live_caller_saved_regs); + + rtx link; + for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) + if (REG_NOTE_KIND (link) == REG_DEAD + && REG_P (XEXP (link, 0))) + { + /* Mark the live caller-saved register as dead. */ + for (i = REGNO (XEXP (link, 0)); + i < END_REGNO (XEXP (link, 0)); + i++) + if (i < FIRST_PSEUDO_REGISTER) + bitmap_clear_bit (live_caller_saved_regs, i); + + if (bitmap_empty_p (live_caller_saved_regs)) + { + *after_p = insn; + tls_insn = emit_insn_after (tls_set, insn); + return tls_insn; + } + } + } + + /* NB: Start over again for conditional jump. */ + if (repeat) + continue; + + gcc_assert (!bitmap_empty_p (live_caller_saved_regs)); + + /* If any live caller-saved registers aren't dead at the end of + this basic block, get the basic block which dominates all + basic blocks which set the remaining live registers. 
*/ + auto_bitmap set_bbs; + bitmap_iterator bi; + unsigned int id; + EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi) + { + basic_block set_bb = ix86_get_dominator_for_reg (id, bb); + bitmap_set_bit (set_bbs, set_bb->index); + } + bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs); + } + while (true); +} + +/* Generate a TLS call of KIND with VAL and copy the call result to DEST, + at entry of the nearest dominator for basic block map BBS, which is in + the fake loop that contains the whole function, so that there is only + a single TLS CALL of KIND with VAL in the whole function. + UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS + instructions. UPDATED_GNU2_TLS_INSNS contains instructions which + replace the GNU2 TLS instructions. If TLSDESC_SET isn't nullptr, + insert it before the TLS call. */ + +static void +ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind, + auto_bitmap &bbs, + auto_bitmap &updated_gnu_tls_insns, + auto_bitmap &updated_gnu2_tls_insns, + rtx tlsdesc_set = nullptr) +{ + basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); + while (bb->loop_father->latch + != EXIT_BLOCK_PTR_FOR_FN (cfun)) + bb = get_immediate_dominator (CDI_DOMINATORS, + bb->loop_father->header); + + rtx rax = nullptr, rdi; + rtx eqv = nullptr; + rtx caddr; + rtx set; + rtx clob; + rtx symbol; + rtx tls; + + switch (kind) + { + case X86_CSE_TLS_GD: + rax = gen_rtx_REG (Pmode, AX_REG); + rdi = gen_rtx_REG (Pmode, DI_REG); + caddr = ix86_tls_get_addr (); + + symbol = XVECEXP (val, 0, 0); + tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi); + + if (GET_MODE (symbol) != Pmode) + symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol); + eqv = symbol; + break; + + case X86_CSE_TLS_LD_BASE: + rax = gen_rtx_REG (Pmode, AX_REG); + rdi = gen_rtx_REG (Pmode, DI_REG); + caddr = ix86_tls_get_addr (); + + tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi); + + /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers + to share the LD_BASE result with other LD model accesses. */ + eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), + UNSPEC_TLS_LD_BASE); + + break; + + case X86_CSE_TLSDESC: + set = gen_rtx_SET (dest, val); + clob = gen_rtx_CLOBBER (VOIDmode, + gen_rtx_REG (CCmode, FLAGS_REG)); + tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob)); + break; + + default: + gcc_unreachable (); + } + + /* Emit the TLS CALL insn. */ + rtx_insn *before = nullptr; + rtx_insn *after = nullptr; + rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before, + &after, + updated_gnu_tls_insns, + updated_gnu2_tls_insns); + + rtx_insn *tlsdesc_insn = nullptr; + if (tlsdesc_set) + { + rtx dest = copy_rtx (SET_DEST (tlsdesc_set)); + rtx src = copy_rtx (SET_SRC (tlsdesc_set)); + tlsdesc_set = gen_rtx_SET (dest, src); + tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn); + } + + if (kind != X86_CSE_TLSDESC) + { + RTL_CONST_CALL_P (tls_insn) = 1; + + /* Indicate that this function can't jump to non-local gotos. 
*/ + make_reg_eh_region_note_nothrow_nononlocal (tls_insn); + } + + if (recog_memoized (tls_insn) < 0) + gcc_unreachable (); + + if (dump_file) + { + if (after) + { + fprintf (dump_file, "\nPlace:\n\n"); + if (tlsdesc_insn) + print_rtl_single (dump_file, tlsdesc_insn); + print_rtl_single (dump_file, tls_insn); + fprintf (dump_file, "\nafter:\n\n"); + print_rtl_single (dump_file, after); + fprintf (dump_file, "\n"); + } + else + { + fprintf (dump_file, "\nPlace:\n\n"); + if (tlsdesc_insn) + print_rtl_single (dump_file, tlsdesc_insn); + print_rtl_single (dump_file, tls_insn); + fprintf (dump_file, "\nbefore:\n\n"); + print_rtl_single (dump_file, before); + fprintf (dump_file, "\n"); + } + } + + if (kind != X86_CSE_TLSDESC) + { + /* Copy RAX to DEST. */ + set = gen_rtx_SET (dest, rax); + rtx_insn *set_insn = emit_insn_after (set, tls_insn); + set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest); + if (dump_file) + { + fprintf (dump_file, "\nPlace:\n\n"); + print_rtl_single (dump_file, set_insn); + fprintf (dump_file, "\nafter:\n\n"); + print_rtl_single (dump_file, tls_insn); + fprintf (dump_file, "\n"); + } + } +} + +namespace { + +const pass_data pass_data_x86_cse = +{ + RTL_PASS, /* type */ + "x86_cse", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_x86_cse : public rtl_opt_pass +{ +public: + pass_x86_cse (gcc::context *ctxt) + : rtl_opt_pass (pass_data_x86_cse, ctxt) + {} + + /* opt_pass methods: */ + bool gate (function *fun) final override + { + return (TARGET_SSE2 + && optimize + && optimize_function_for_speed_p (fun)); + } + + unsigned int execute (function *) final override + { + return x86_cse (); + } + +private: + /* The redundant source value. */ + rtx val; + /* The actual redundant source value for UNSPEC_TLSDESC. */ + rtx tlsdesc_val; + /* The instruction which defines the redundant value. */ + rtx_insn *def_insn; + /* Mode of the destination of the candidate redundant instruction. */ + machine_mode mode; + /* Mode of the source of the candidate redundant instruction. */ + machine_mode scalar_mode; + /* The classification of the candidate redundant instruction. */ + x86_cse_kind kind; + + unsigned int x86_cse (void); + bool candidate_gnu_tls_p (rtx_insn *, attr_tls64); + bool candidate_gnu2_tls_p (rtx, attr_tls64); + bool candidate_vector_p (rtx); + rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx); +}; // class pass_x86_cse + +/* Return the instruction which sets REG from TLS_SYMBOL. */ + +rtx_insn * +pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg, + const_rtx tls_symbol) +{ + rtx_insn *set_insn = nullptr; + for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg)); + ref; + ref = DF_REF_NEXT_REG (ref)) + { + if (DF_REF_IS_ARTIFICIAL (ref)) + return nullptr; + + set_insn = DF_REF_INSN (ref); + if (get_attr_tls64 (set_insn) != TLS64_LEA) + return nullptr; + + rtx tls_set = PATTERN (set_insn); + rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0); + if (!rtx_equal_p (tls_symbol, tls_src)) + return nullptr; + } + + return set_insn; +} + +/* Return true and output def_insn, val, mode, scalar_mode and kind if + INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. 
*/ + +bool +pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64) +{ + if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p) + return false; + + /* Record the redundant TLS CALLs for 64-bit: + + (parallel [ + (set (reg:DI 0 ax) + (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr"))) + (const_int 0 [0]))) + (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50]) + (reg/f:DI 7 sp)] UNSPEC_TLS_GD) + (clobber (reg:DI 5 di))]) + + + and + + (parallel [ + (set (reg:DI 0 ax) + (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr"))) + (const_int 0 [0]))) + (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)]) + + */ + + rtx pat = PATTERN (insn); + rtx set = XVECEXP (pat, 0, 0); + gcc_assert (GET_CODE (set) == SET); + rtx dest = SET_DEST (set); + scalar_mode = mode = GET_MODE (dest); + val = XVECEXP (pat, 0, 1); + gcc_assert (GET_CODE (val) == UNSPEC); + + if (tls64 == TLS64_GD) + kind = X86_CSE_TLS_GD; + else + kind = X86_CSE_TLS_LD_BASE; + + def_insn = nullptr; + return true; +} + +/* Return true and output def_insn, val, mode, scalar_mode and kind if + SET is UNSPEC_TLSDESC. */ + +bool +pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64) +{ + if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p) + return false; + + rtx tls_symbol; + rtx_insn *set_insn; + rtx src = SET_SRC (set); + val = src; + tlsdesc_val = src; + kind = X86_CSE_TLSDESC; + + if (tls64 == TLS64_COMBINE) + { + /* Record 64-bit TLS64_COMBINE: + + (set (reg/f:DI 104) + (plus:DI (unspec:DI [ + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) + (reg:DI 114) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC) + (const:DI (unspec:DI [ + (symbol_ref:DI ("e") [flags 0x1a]) + ] UNSPEC_DTPOFF)))) + + (set (reg/f:DI 104) + (plus:DI (unspec:DI [ + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) + (unspec:DI [ + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) + ] UNSPEC_TLSDESC) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC) + (const:DI (unspec:DI [ + (symbol_ref:DI ("e") [flags 0x1a]) + ] UNSPEC_DTPOFF)))) + */ + + scalar_mode = mode = GET_MODE (src); + + /* Since the first operand of PLUS in the source TLS_COMBINE + pattern is unused, use the second operand of PLUS: + + (const:DI (unspec:DI [ + (symbol_ref:DI ("e") [flags 0x1a]) + ] UNSPEC_DTPOFF)) + + as VAL to check if 2 TLS_COMBINE patterns have the same + source. */ + val = XEXP (src, 1); + gcc_assert (GET_CODE (val) == CONST + && GET_CODE (XEXP (val, 0)) == UNSPEC + && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF + && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0))); + def_insn = nullptr; + return true; + } + + /* Record 64-bit TLS_CALL: + + (set (reg:DI 101) + (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50]) + (reg:DI 112) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC)) + + */ + + gcc_assert (GET_CODE (src) == UNSPEC); + tls_symbol = XVECEXP (src, 0, 0); + src = XVECEXP (src, 0, 1); + scalar_mode = mode = GET_MODE (src); + gcc_assert (REG_P (src)); + + /* All definitions of reg:DI 129 in + + (set (reg:DI 110) + (unspec:DI [(symbol_ref:DI ("foo")) + (reg:DI 129) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC)) + + should have the same source as in + + (set (reg:DI 129) + (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC)) + + */ + + set_insn = tls_set_insn_from_symbol (src, tls_symbol); + if (!set_insn) + return false; + + /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source. */ + val = tls_symbol; + def_insn = set_insn; + return true; +} + +/* Return true and output def_insn, val, mode, scalar_mode and kind if + INSN is a vector broadcast instruction. 
*/ + +bool +pass_x86_cse::candidate_vector_p (rtx set) +{ + rtx src = SET_SRC (set); + rtx dest = SET_DEST (set); + mode = GET_MODE (dest); + /* Skip non-vector instruction. */ + if (!VECTOR_MODE_P (mode)) + return false; + + /* Skip non-vector load instruction. */ + if (!REG_P (dest) && !SUBREG_P (dest)) + return false; + + val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind, + &def_insn); + return val ? true : false; +} + +/* At entry of the nearest common dominator for basic blocks with + + 1. Vector CONST0_RTX patterns. + 2. Vector CONSTM1_RTX patterns. + 3. Vector broadcast patterns. + 4. UNSPEC_TLS_GD patterns. + 5. UNSPEC_TLS_LD_BASE patterns. + 6. UNSPEC_TLSDESC patterns. + + generate a single pattern whose destination is used to replace the + source in all identical patterns. + + NB: We want to generate a pattern, which is executed only once, to + cover the whole function. The LCM algorithm isn't appropriate here + since it may place a pattern inside the loop. */ + +unsigned int +pass_x86_cse::x86_cse (void) { timevar_push (TV_MACH_DEP); - auto_vec<redundant_load *> loads; - redundant_load *load; + auto_vec<redundant_pattern *> loads; + redundant_pattern *load; basic_block bb; rtx_insn *insn; unsigned int i; + auto_bitmap updated_gnu_tls_insns; + auto_bitmap updated_gnu2_tls_insns; df_set_flags (DF_DEFER_INSN_RESCAN); @@ -3700,61 +4411,74 @@ remove_redundant_vector_load (void) if (!NONDEBUG_INSN_P (insn)) continue; + bool matched = false; + /* Remove redundant pattens if there are more than 2 of + them. */ + unsigned int threshold = 2; + rtx set = single_set (insn); - if (!set) + if (!set && !CALL_P (insn)) continue; - /* Record single set vector instruction with CONST0_RTX and - CONSTM1_RTX source. Record basic blocks with CONST0_RTX and - CONSTM1_RTX. Count CONST0_RTX and CONSTM1_RTX. Record the - maximum size of CONST0_RTX and CONSTM1_RTX. */ + tlsdesc_val = nullptr; - rtx dest = SET_DEST (set); - machine_mode mode = GET_MODE (dest); - /* Skip non-vector instruction. */ - if (!VECTOR_MODE_P (mode)) - continue; + attr_tls64 tls64 = get_attr_tls64 (insn); + switch (tls64) + { + case TLS64_GD: + case TLS64_LD_BASE: + /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. */ + if (candidate_gnu_tls_p (insn, tls64)) + break; + continue; - rtx src = SET_SRC (set); - /* Skip non-vector load instruction. */ - if (!REG_P (dest) && !SUBREG_P (dest)) - continue; + case TLS64_CALL: + case TLS64_COMBINE: + /* Verify UNSPEC_TLSDESC. */ + if (candidate_gnu2_tls_p (set, tls64)) + break; + continue; - rtx_insn *def_insn; - machine_mode scalar_mode; - x86_cse_kind kind; - rtx val = ix86_broadcast_inner (src, mode, &scalar_mode, - &kind, &def_insn); - if (!val) - continue; + case TLS64_LEA: + /* Skip TLS64_LEA. */ + continue; - /* Remove redundant register loads if there are more than 2 - loads will be used. */ - unsigned int threshold = 2; + case TLS64_NONE: + if (!set) + continue; - /* Check if there is a matching redundant vector load. */ - bool matched = false; + /* Check for vector broadcast. */ + if (candidate_vector_p (set)) + break; + continue; + } + + /* Check if there is a matching redundant load. */ FOR_EACH_VEC_ELT (loads, i, load) if (load->val && load->kind == kind && load->mode == scalar_mode && (load->bb == bb - || kind < X86_CSE_VEC_DUP + || kind != X86_CSE_VEC_DUP /* Non all 0s/1s vector load must be in the same basic block if it is in a recursive call. */ || !recursive_call_p) && rtx_equal_p (load->val, val)) { - /* Record vector instruction. */ + /* Record instruction. 
*/ bitmap_set_bit (load->insns, INSN_UID (insn)); /* Record the maximum vector size. */ - if (load->size < GET_MODE_SIZE (mode)) + if (kind <= X86_CSE_VEC_DUP + && load->size < GET_MODE_SIZE (mode)) load->size = GET_MODE_SIZE (mode); /* Record the basic block. */ bitmap_set_bit (load->bbs, bb->index); + + /* Increment the count. */ load->count++; + matched = true; break; } @@ -3762,10 +4486,17 @@ remove_redundant_vector_load (void) if (matched) continue; - /* We see this vector broadcast the first time. */ - load = new redundant_load; + /* We see this instruction the first time. Record the + redundant source value, its mode, the destination size, + instruction which defines the redundant source value, + instruction basic block and the instruction kind. */ + load = new redundant_pattern; load->val = copy_rtx (val); + if (tlsdesc_val) + load->tlsdesc_val = copy_rtx (tlsdesc_val); + else + load->tlsdesc_val = nullptr; load->mode = scalar_mode; load->size = GET_MODE_SIZE (mode); load->def_insn = def_insn; @@ -3782,49 +4513,64 @@ remove_redundant_vector_load (void) } bool replaced = false; - rtx reg, broadcast_source, broadcast_reg; FOR_EACH_VEC_ELT (loads, i, load) if (load->count >= load->threshold) { - machine_mode mode = ix86_get_vector_cse_mode (load->size, - load->mode); - broadcast_reg = gen_reg_rtx (mode); - if (load->def_insn) - { - /* Replace redundant vector loads with a single vector load - in the same basic block. */ - reg = load->val; - if (load->mode != GET_MODE (reg)) - reg = gen_rtx_SUBREG (load->mode, reg, 0); - broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); - replace_vector_const (mode, broadcast_reg, load->insns, - load->mode); - } - else + machine_mode mode; + rtx reg, broadcast_source, broadcast_reg; + replaced = true; + switch (load->kind) { - /* This is a constant integer/double vector. If the - inner scalar is 0 or -1, set vector to CONST0_RTX - or CONSTM1_RTX directly. */ - rtx reg; - switch (load->kind) + case X86_CSE_TLS_GD: + case X86_CSE_TLS_LD_BASE: + case X86_CSE_TLSDESC: + broadcast_reg = gen_reg_rtx (load->mode); + replace_tls_call (broadcast_reg, load->insns, + (load->kind == X86_CSE_TLSDESC + ? updated_gnu2_tls_insns + : updated_gnu_tls_insns)); + load->broadcast_reg = broadcast_reg; + break; + + case X86_CSE_CONST0_VECTOR: + case X86_CSE_CONSTM1_VECTOR: + case X86_CSE_VEC_DUP: + mode = ix86_get_vector_cse_mode (load->size, load->mode); + broadcast_reg = gen_reg_rtx (mode); + if (load->def_insn) { - case X86_CSE_CONST0_VECTOR: - broadcast_source = CONST0_RTX (mode); - break; - case X86_CSE_CONSTM1_VECTOR: - broadcast_source = CONSTM1_RTX (mode); - break; - default: - reg = gen_reg_rtx (load->mode); + /* Replace redundant vector loads with a single vector + load in the same basic block. */ + reg = load->val; + if (load->mode != GET_MODE (reg)) + reg = gen_rtx_SUBREG (load->mode, reg, 0); broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); - break; } + else + /* This is a constant integer/double vector. If the + inner scalar is 0 or -1, set vector to CONST0_RTX + or CONSTM1_RTX directly. 
*/ + switch (load->kind) + { + case X86_CSE_CONST0_VECTOR: + broadcast_source = CONST0_RTX (mode); + break; + case X86_CSE_CONSTM1_VECTOR: + broadcast_source = CONSTM1_RTX (mode); + break; + case X86_CSE_VEC_DUP: + reg = gen_reg_rtx (load->mode); + broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); + break; + default: + gcc_unreachable (); + } replace_vector_const (mode, broadcast_reg, load->insns, load->mode); + load->broadcast_source = broadcast_source; + load->broadcast_reg = broadcast_reg; + break; } - load->broadcast_source = broadcast_source; - load->broadcast_reg = broadcast_reg; - replaced = true; } if (replaced) @@ -3839,43 +4585,75 @@ remove_redundant_vector_load (void) FOR_EACH_VEC_ELT (loads, i, load) if (load->count >= load->threshold) { + rtx set; if (load->def_insn) - { - /* Insert a broadcast after the original scalar - definition. */ - rtx set = gen_rtx_SET (load->broadcast_reg, - load->broadcast_source); - insn = emit_insn_after (set, load->def_insn); - - if (cfun->can_throw_non_call_exceptions) - { - /* Handle REG_EH_REGION note in DEF_INSN. */ - rtx note = find_reg_note (load->def_insn, - REG_EH_REGION, nullptr); - if (note) - { - control_flow_insns.safe_push (load->def_insn); - add_reg_note (insn, REG_EH_REGION, - XEXP (note, 0)); - } - } + switch (load->kind) + { + case X86_CSE_TLSDESC: + ix86_place_single_tls_call (load->broadcast_reg, + load->tlsdesc_val, + load->kind, + load->bbs, + updated_gnu_tls_insns, + updated_gnu2_tls_insns, + PATTERN (load->def_insn)); + break; + case X86_CSE_VEC_DUP: + /* Insert a broadcast after the original scalar + definition. */ + set = gen_rtx_SET (load->broadcast_reg, + load->broadcast_source); + insn = emit_insn_after (set, load->def_insn); + + if (cfun->can_throw_non_call_exceptions) + { + /* Handle REG_EH_REGION note in DEF_INSN. */ + rtx note = find_reg_note (load->def_insn, + REG_EH_REGION, nullptr); + if (note) + { + control_flow_insns.safe_push (load->def_insn); + add_reg_note (insn, REG_EH_REGION, + XEXP (note, 0)); + } + } - if (dump_file) - { - fprintf (dump_file, "\nAdd:\n\n"); - print_rtl_single (dump_file, insn); - fprintf (dump_file, "\nafter:\n\n"); - print_rtl_single (dump_file, load->def_insn); - fprintf (dump_file, "\n"); - } - } + if (dump_file) + { + fprintf (dump_file, "\nAdd:\n\n"); + print_rtl_single (dump_file, insn); + fprintf (dump_file, "\nafter:\n\n"); + print_rtl_single (dump_file, load->def_insn); + fprintf (dump_file, "\n"); + } + break; + default: + gcc_unreachable (); + } else - ix86_place_single_vector_set (load->broadcast_reg, - load->broadcast_source, - load->bbs, - (load->kind == X86_CSE_VEC_DUP - ? load->val - : nullptr)); + switch (load->kind) + { + case X86_CSE_TLS_GD: + case X86_CSE_TLS_LD_BASE: + case X86_CSE_TLSDESC: + ix86_place_single_tls_call (load->broadcast_reg, + (load->kind == X86_CSE_TLSDESC + ? 
load->tlsdesc_val + : load->val), + load->kind, + load->bbs, + updated_gnu_tls_insns, + updated_gnu2_tls_insns); + break; + case X86_CSE_CONST0_VECTOR: + case X86_CSE_CONSTM1_VECTOR: + case X86_CSE_VEC_DUP: + ix86_place_single_vector_set (load->broadcast_reg, + load->broadcast_source, + load->bbs, + load); + break; + } } loop_optimizer_finalize (); @@ -3905,48 +4683,12 @@ remove_redundant_vector_load (void) return 0; } -namespace { - -const pass_data pass_data_remove_redundant_vector_load = -{ - RTL_PASS, /* type */ - "rrvl", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - 0, /* todo_flags_finish */ -}; - -class pass_remove_redundant_vector_load : public rtl_opt_pass -{ -public: - pass_remove_redundant_vector_load (gcc::context *ctxt) - : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt) - {} - - /* opt_pass methods: */ - bool gate (function *fun) final override - { - return (TARGET_SSE2 - && optimize - && optimize_function_for_speed_p (fun)); - } - - unsigned int execute (function *) final override - { - return remove_redundant_vector_load (); - } -}; // class pass_remove_redundant_vector_load - } // anon namespace rtl_opt_pass * -make_pass_remove_redundant_vector_load (gcc::context *ctxt) +make_pass_x86_cse (gcc::context *ctxt) { - return new pass_remove_redundant_vector_load (ctxt); + return new pass_x86_cse (ctxt); } /* Convert legacy instructions that clobbers EFLAGS to APX_NF |