Diffstat (limited to 'gcc/config/i386/i386-features.cc')
-rw-r--r--  gcc/config/i386/i386-features.cc | 1130
1 file changed, 936 insertions, 194 deletions
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c131577..0608dd2 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3085,21 +3085,68 @@ ix86_rpad_gate ()
&& optimize_function_for_speed_p (cfun));
}
+enum x86_cse_kind
+{
+ X86_CSE_CONST0_VECTOR,
+ X86_CSE_CONSTM1_VECTOR,
+ X86_CSE_VEC_DUP,
+ X86_CSE_TLS_GD,
+ X86_CSE_TLS_LD_BASE,
+ X86_CSE_TLSDESC
+};
+
+struct redundant_pattern
+{
+ /* Bitmap of basic blocks with broadcast instructions. */
+ auto_bitmap bbs;
+ /* Bitmap of broadcast instructions. */
+ auto_bitmap insns;
+ /* The broadcast inner scalar. */
+ rtx val;
+ /* The actual redundant source value for UNSPEC_TLSDESC. */
+ rtx tlsdesc_val;
+ /* The inner scalar mode. */
+ machine_mode mode;
+ /* The instruction which sets the inner scalar. Nullptr if the inner
+ scalar is applied to the whole function, instead of within the same
+ block. */
+ rtx_insn *def_insn;
+ /* The widest broadcast source. */
+ rtx broadcast_source;
+ /* The widest broadcast register. */
+ rtx broadcast_reg;
+ /* The basic block of the broadcast instruction. */
+ basic_block bb;
+ /* The number of broadcast instructions with the same inner scalar. */
+ unsigned HOST_WIDE_INT count;
+ /* The threshold of broadcast instructions with the same inner
+ scalar. */
+ unsigned int threshold;
+ /* The widest broadcast size in bytes. */
+ unsigned int size;
+ /* Load kind. */
+ x86_cse_kind kind;
+};
+
/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
for basic block map BBS, which is in the fake loop that contains the
whole function, so that there is only a single vector set in the
- whole function. If not nullptr, INNER_SCALAR is the inner scalar of
- SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)). */
+ whole function. If not nullptr, LOAD is a pointer to the load. */
static void
ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
- rtx inner_scalar = nullptr)
+ redundant_pattern *load = nullptr)
{
basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
- while (bb->loop_father->latch
- != EXIT_BLOCK_PTR_FOR_FN (cfun))
- bb = get_immediate_dominator (CDI_DOMINATORS,
- bb->loop_father->header);
+ /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop
+ to avoid extra spills. */
+ if (!load || load->kind != X86_CSE_VEC_DUP)
+ {
+ while (bb->loop_father->latch
+ != EXIT_BLOCK_PTR_FOR_FN (cfun))
+ bb = get_immediate_dominator (CDI_DOMINATORS,
+ bb->loop_father->header);
+ }
rtx set = gen_rtx_SET (dest, src);
@@ -3141,8 +3188,14 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
}
}
- if (inner_scalar)
+ if (load && load->kind == X86_CSE_VEC_DUP)
{
+ /* Get the source from LOAD as (reg:SI 99) in
+
+ (vec_duplicate:V4SI (reg:SI 99))
+
+ */
+ rtx inner_scalar = load->val;
/* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */
rtx reg = XEXP (src, 0);
if ((REG_P (inner_scalar) || MEM_P (inner_scalar))
@@ -3226,7 +3279,7 @@ remove_partial_avx_dependency (void)
break;
}
- /* Only hanlde conversion here. */
+ /* Only handle conversion here. */
machine_mode src_mode
= convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
switch (src_mode)
@@ -3489,44 +3542,6 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
}
}
-enum x86_cse_kind
-{
- X86_CSE_CONST0_VECTOR,
- X86_CSE_CONSTM1_VECTOR,
- X86_CSE_VEC_DUP
-};
-
-struct redundant_load
-{
- /* Bitmap of basic blocks with broadcast instructions. */
- auto_bitmap bbs;
- /* Bitmap of broadcast instructions. */
- auto_bitmap insns;
- /* The broadcast inner scalar. */
- rtx val;
- /* The inner scalar mode. */
- machine_mode mode;
- /* The instruction which sets the inner scalar. Nullptr if the inner
- scalar is applied to the whole function, instead of within the same
- block. */
- rtx_insn *def_insn;
- /* The widest broadcast source. */
- rtx broadcast_source;
- /* The widest broadcast register. */
- rtx broadcast_reg;
- /* The basic block of the broadcast instruction. */
- basic_block bb;
- /* The number of broadcast instructions with the same inner scalar. */
- unsigned HOST_WIDE_INT count;
- /* The threshold of broadcast instructions with the same inner
- scalar. */
- unsigned int threshold;
- /* The widest broadcast size in bytes. */
- unsigned int size;
- /* Load kind. */
- x86_cse_kind kind;
-};
-
/* Return the inner scalar if OP is a broadcast, else return nullptr. */
static rtx
@@ -3629,6 +3644,8 @@ ix86_broadcast_inner (rtx op, machine_mode mode,
Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an
integer constant. */
op = src;
+ if (mode != GET_MODE (reg))
+ op = gen_int_mode (INTVAL (src), mode);
*insn_p = nullptr;
}
else
@@ -3669,25 +3686,719 @@ ix86_broadcast_inner (rtx op, machine_mode mode,
return op;
}
-/* At entry of the nearest common dominator for basic blocks with vector
- CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
- vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
- uses.
+/* Replace each CALL instruction in TLS_CALL_INSNS with a SET from SRC
+ and put the updated instruction in UPDATED_TLS_INSNS. */
- NB: We want to generate only a single widest vector set to cover the
- whole function. The LCM algorithm isn't appropriate here since it
- may place a vector set inside the loop. */
+static void
+replace_tls_call (rtx src, auto_bitmap &tls_call_insns,
+ auto_bitmap &updated_tls_insns)
+{
+ bitmap_iterator bi;
+ unsigned int id;
-static unsigned int
-remove_redundant_vector_load (void)
+ EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi)
+ {
+ rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
+
+ /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are
+ allowed. */
+ if (!CALL_P (insn))
+ {
+ attr_tls64 tls64 = get_attr_tls64 (insn);
+ if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE)
+ gcc_unreachable ();
+ }
+
+ rtx pat = PATTERN (insn);
+ gcc_assert (GET_CODE (pat) == PARALLEL);
+ rtx set = XVECEXP (pat, 0, 0);
+ gcc_assert (GET_CODE (set) == SET);
+ rtx dest = SET_DEST (set);
+
+ set = gen_rtx_SET (dest, src);
+ rtx_insn *set_insn = emit_insn_after (set, insn);
+ if (recog_memoized (set_insn) < 0)
+ gcc_unreachable ();
+
+ /* Put SET_INSN in UPDATED_TLS_INSNS. */
+ bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn));
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nReplace:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\nwith:\n\n");
+ print_rtl_single (dump_file, set_insn);
+ fprintf (dump_file, "\n");
+ }
+
+ /* Delete the CALL insn. */
+ delete_insn (insn);
+
+ df_insn_rescan (set_insn);
+ }
+}
+
+/* Return the basic block which dominates all basic blocks which set
+ hard register REGNO used in basic block BB. */
+
+static basic_block
+ix86_get_dominator_for_reg (unsigned int regno, basic_block bb)
+{
+ basic_block set_bb;
+ auto_bitmap set_bbs;
+
+ /* Get all BBs which set REGNO and dominate the current BB from all
+ DEFs of REGNO. */
+ for (df_ref def = DF_REG_DEF_CHAIN (regno);
+ def;
+ def = DF_REF_NEXT_REG (def))
+ if (!DF_REF_IS_ARTIFICIAL (def)
+ && !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER)
+ && !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER))
+ {
+ set_bb = DF_REF_BB (def);
+ if (dominated_by_p (CDI_DOMINATORS, bb, set_bb))
+ bitmap_set_bit (set_bbs, set_bb->index);
+ }
+
+ bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
+ return bb;
+}
+
+/* Mark FLAGS register as live in DATA, a bitmap of live caller-saved
+ registers, if DEST is FLAGS register. */
+
+static void
+ix86_check_flags_reg (rtx dest, const_rtx, void *data)
+{
+ auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data;
+ if (REG_P (dest) && REGNO (dest) == FLAGS_REG)
+ bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG);
+}
+
+/* Emit a TLS_SET instruction of KIND in basic block BB. Store the
+ insertion point in *BEFORE_P for emit_insn_before or in *AFTER_P
+ for emit_insn_after. UPDATED_GNU_TLS_INSNS contains instructions
+ which replace the GNU TLS instructions. UPDATED_GNU2_TLS_INSNS
+ contains instructions which replace the GNU2 TLS instructions. */
+
+static rtx_insn *
+ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb,
+ rtx_insn **before_p, rtx_insn **after_p,
+ auto_bitmap &updated_gnu_tls_insns,
+ auto_bitmap &updated_gnu2_tls_insns)
+{
+ rtx_insn *tls_insn;
+
+ do
+ {
+ rtx_insn *insn = BB_HEAD (bb);
+ while (insn && !NONDEBUG_INSN_P (insn))
+ {
+ if (insn == BB_END (bb))
+ {
+ /* This must be the beginning basic block:
+
+ (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)
+
+ or a basic block with only a label:
+
+ (code_label 78 11 77 3 14 (nil) [1 uses])
+ (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
+
+ or a basic block with only a debug marker:
+
+ (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
+ (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
+
+ */
+ gcc_assert (DEBUG_INSN_P (insn)
+ || (NOTE_P (insn)
+ && ((NOTE_KIND (insn)
+ == NOTE_INSN_FUNCTION_BEG)
+ || (NOTE_KIND (insn)
+ == NOTE_INSN_BASIC_BLOCK))));
+ insn = NULL;
+ break;
+ }
+ insn = NEXT_INSN (insn);
+ }
+
+ /* TLS_GD and TLS_LD_BASE instructions are normal function calls
+ which clobber caller-saved registers. TLSDESC instructions only
+ clobber FLAGS. If any registers clobbered by the TLS instruction
+ are live in this basic block, we must insert the TLS instruction
+ after all such live registers have become dead. */
+
+ auto_bitmap live_caller_saved_regs;
+ bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb);
+
+ if (bitmap_bit_p (in, FLAGS_REG))
+ bitmap_set_bit (live_caller_saved_regs, FLAGS_REG);
+
+ unsigned int i;
+
+ /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE
+ instructions. */
+ if (kind != X86_CSE_TLSDESC)
+ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ if (call_used_regs[i]
+ && !fixed_regs[i]
+ && bitmap_bit_p (in, i))
+ bitmap_set_bit (live_caller_saved_regs, i);
+
+ if (bitmap_empty_p (live_caller_saved_regs))
+ {
+ if (insn == BB_HEAD (bb))
+ {
+ *before_p = insn;
+ tls_insn = emit_insn_before (tls_set, insn);
+ }
+ else
+ {
+ /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the
+ beginning basic block:
+
+ (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)
+
+ or after NOTE_INSN_BASIC_BLOCK in a basic block with
+ only a label:
+
+ (code_label 78 11 77 3 14 (nil) [1 uses])
+ (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
+
+ or after debug marker in a basic block with only a
+ debug marker:
+
+ (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
+ (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
+
+ */
+ insn = insn ? PREV_INSN (insn) : BB_END (bb);
+ *after_p = insn;
+ tls_insn = emit_insn_after (tls_set, insn);
+ }
+ return tls_insn;
+ }
+
+ bool repeat = false;
+
+ /* Search for REG_DEAD notes in this basic block. */
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+
+ /* NB: Conditional jump is the only instruction which reads
+ flags register and changes control flow. We can never
+ place the TLS call after unconditional jump. */
+ if (JUMP_P (insn))
+ {
+ /* This must be a conditional jump. */
+ rtx label = JUMP_LABEL (insn);
+ if (label == nullptr
+ || ANY_RETURN_P (label)
+ || !(LABEL_P (label) || SYMBOL_REF_P (label)))
+ gcc_unreachable ();
+
+ /* Place the call before all FLAGS_REG-setting BBs since
+ we can place a call neither before nor after a conditional
+ jump. */
+ bb = ix86_get_dominator_for_reg (FLAGS_REG, bb);
+
+ /* Start over again. */
+ repeat = true;
+ break;
+ }
+
+ if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn)))
+ {
+ /* Insert the __tls_get_addr call before INSN which
+ replaces a __tls_get_addr call. */
+ *before_p = insn;
+ tls_insn = emit_insn_before (tls_set, insn);
+ return tls_insn;
+ }
+
+ if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn)))
+ {
+ /* Mark FLAGS register as dead since FLAGS register
+ would be clobbered by the GNU2 TLS instruction. */
+ bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG);
+ continue;
+ }
+
+ /* Check if FLAGS register is live. */
+ note_stores (insn, ix86_check_flags_reg,
+ &live_caller_saved_regs);
+
+ rtx link;
+ for (link = REG_NOTES (insn); link; link = XEXP (link, 1))
+ if (REG_NOTE_KIND (link) == REG_DEAD
+ && REG_P (XEXP (link, 0)))
+ {
+ /* Mark the live caller-saved register as dead. */
+ for (i = REGNO (XEXP (link, 0));
+ i < END_REGNO (XEXP (link, 0));
+ i++)
+ if (i < FIRST_PSEUDO_REGISTER)
+ bitmap_clear_bit (live_caller_saved_regs, i);
+
+ if (bitmap_empty_p (live_caller_saved_regs))
+ {
+ *after_p = insn;
+ tls_insn = emit_insn_after (tls_set, insn);
+ return tls_insn;
+ }
+ }
+ }
+
+ /* NB: Start over again for conditional jump. */
+ if (repeat)
+ continue;
+
+ gcc_assert (!bitmap_empty_p (live_caller_saved_regs));
+
+ /* If any live caller-saved registers aren't dead at the end of
+ this basic block, get the basic block which dominates all
+ basic blocks which set the remaining live registers. */
+ auto_bitmap set_bbs;
+ bitmap_iterator bi;
+ unsigned int id;
+ EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi)
+ {
+ basic_block set_bb = ix86_get_dominator_for_reg (id, bb);
+ bitmap_set_bit (set_bbs, set_bb->index);
+ }
+ bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
+ }
+ while (true);
+}
+
+/* Generate a TLS call of KIND with VAL and copy the call result to DEST,
+ at entry of the nearest dominator for basic block map BBS, which is in
+ the fake loop that contains the whole function, so that there is only
+ a single TLS CALL of KIND with VAL in the whole function.
+ UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS
+ instructions. UPDATED_GNU2_TLS_INSNS contains instructions which
+ replace the GNU2 TLS instructions. If TLSDESC_SET isn't nullptr,
+ insert it before the TLS call. */
+
+static void
+ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind,
+ auto_bitmap &bbs,
+ auto_bitmap &updated_gnu_tls_insns,
+ auto_bitmap &updated_gnu2_tls_insns,
+ rtx tlsdesc_set = nullptr)
+{
+ basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
+ while (bb->loop_father->latch
+ != EXIT_BLOCK_PTR_FOR_FN (cfun))
+ bb = get_immediate_dominator (CDI_DOMINATORS,
+ bb->loop_father->header);
+
+ rtx rax = nullptr, rdi;
+ rtx eqv = nullptr;
+ rtx caddr;
+ rtx set;
+ rtx clob;
+ rtx symbol;
+ rtx tls;
+
+ switch (kind)
+ {
+ case X86_CSE_TLS_GD:
+ rax = gen_rtx_REG (Pmode, AX_REG);
+ rdi = gen_rtx_REG (Pmode, DI_REG);
+ caddr = ix86_tls_get_addr ();
+
+ symbol = XVECEXP (val, 0, 0);
+ tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi);
+
+ if (GET_MODE (symbol) != Pmode)
+ symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol);
+ eqv = symbol;
+ break;
+
+ case X86_CSE_TLS_LD_BASE:
+ rax = gen_rtx_REG (Pmode, AX_REG);
+ rdi = gen_rtx_REG (Pmode, DI_REG);
+ caddr = ix86_tls_get_addr ();
+
+ tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi);
+
+ /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers
+ to share the LD_BASE result with other LD model accesses. */
+ eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
+ UNSPEC_TLS_LD_BASE);
+
+ break;
+
+ case X86_CSE_TLSDESC:
+ set = gen_rtx_SET (dest, val);
+ clob = gen_rtx_CLOBBER (VOIDmode,
+ gen_rtx_REG (CCmode, FLAGS_REG));
+ tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Emit the TLS CALL insn. */
+ rtx_insn *before = nullptr;
+ rtx_insn *after = nullptr;
+ rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before,
+ &after,
+ updated_gnu_tls_insns,
+ updated_gnu2_tls_insns);
+
+ rtx_insn *tlsdesc_insn = nullptr;
+ if (tlsdesc_set)
+ {
+ rtx dest = copy_rtx (SET_DEST (tlsdesc_set));
+ rtx src = copy_rtx (SET_SRC (tlsdesc_set));
+ tlsdesc_set = gen_rtx_SET (dest, src);
+ tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn);
+ }
+
+ if (kind != X86_CSE_TLSDESC)
+ {
+ RTL_CONST_CALL_P (tls_insn) = 1;
+
+ /* Indicate that this function can't jump to non-local gotos. */
+ make_reg_eh_region_note_nothrow_nononlocal (tls_insn);
+ }
+
+ if (recog_memoized (tls_insn) < 0)
+ gcc_unreachable ();
+
+ if (dump_file)
+ {
+ if (after)
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ if (tlsdesc_insn)
+ print_rtl_single (dump_file, tlsdesc_insn);
+ print_rtl_single (dump_file, tls_insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, after);
+ fprintf (dump_file, "\n");
+ }
+ else
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ if (tlsdesc_insn)
+ print_rtl_single (dump_file, tlsdesc_insn);
+ print_rtl_single (dump_file, tls_insn);
+ fprintf (dump_file, "\nbefore:\n\n");
+ print_rtl_single (dump_file, before);
+ fprintf (dump_file, "\n");
+ }
+ }
+
+ if (kind != X86_CSE_TLSDESC)
+ {
+ /* Copy RAX to DEST. */
+ set = gen_rtx_SET (dest, rax);
+ rtx_insn *set_insn = emit_insn_after (set, tls_insn);
+ set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest);
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ print_rtl_single (dump_file, set_insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, tls_insn);
+ fprintf (dump_file, "\n");
+ }
+ }
+}
+
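As an illustration of the transform this enables (a hypothetical example, not taken from the patch): with -fPIC and the general-dynamic TLS model, each access to an extern __thread variable can expand into its own __tls_get_addr call; the pass keeps a single call at the nearest common dominator of all uses and copies its result into the register that replaces the redundant calls.

    extern __thread int counter;   /* hypothetical TLS variable */

    int
    bump (int n)
    {
      if (n > 0)
        counter += n;    /* may expand to a __tls_get_addr call */
      return counter;    /* may expand to another call for the same symbol */
    }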
+namespace {
+
+const pass_data pass_data_x86_cse =
+{
+ RTL_PASS, /* type */
+ "x86_cse", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_MACH_DEP, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_x86_cse : public rtl_opt_pass
+{
+public:
+ pass_x86_cse (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_x86_cse, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ bool gate (function *fun) final override
+ {
+ return (TARGET_SSE2
+ && optimize
+ && optimize_function_for_speed_p (fun));
+ }
+
+ unsigned int execute (function *) final override
+ {
+ return x86_cse ();
+ }
+
+private:
+ /* The redundant source value. */
+ rtx val;
+ /* The actual redundant source value for UNSPEC_TLSDESC. */
+ rtx tlsdesc_val;
+ /* The instruction which defines the redundant value. */
+ rtx_insn *def_insn;
+ /* Mode of the destination of the candidate redundant instruction. */
+ machine_mode mode;
+ /* Mode of the source of the candidate redundant instruction. */
+ machine_mode scalar_mode;
+ /* The classification of the candidate redundant instruction. */
+ x86_cse_kind kind;
+
+ unsigned int x86_cse (void);
+ bool candidate_gnu_tls_p (rtx_insn *, attr_tls64);
+ bool candidate_gnu2_tls_p (rtx, attr_tls64);
+ bool candidate_vector_p (rtx);
+ rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx);
+}; // class pass_x86_cse
+
+/* Return the instruction which sets REG from TLS_SYMBOL. */
+
+rtx_insn *
+pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg,
+ const_rtx tls_symbol)
+{
+ rtx_insn *set_insn = nullptr;
+ for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
+ ref;
+ ref = DF_REF_NEXT_REG (ref))
+ {
+ if (DF_REF_IS_ARTIFICIAL (ref))
+ return nullptr;
+
+ set_insn = DF_REF_INSN (ref);
+ if (get_attr_tls64 (set_insn) != TLS64_LEA)
+ return nullptr;
+
+ rtx tls_set = PATTERN (set_insn);
+ rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0);
+ if (!rtx_equal_p (tls_symbol, tls_src))
+ return nullptr;
+ }
+
+ return set_insn;
+}
+
+/* Return true and output def_insn, val, mode, scalar_mode and kind if
+ INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. */
+
+bool
+pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64)
+{
+ if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
+ return false;
+
+ /* Record the redundant TLS CALLs for 64-bit:
+
+ (parallel [
+ (set (reg:DI 0 ax)
+ (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
+ (const_int 0 [0])))
+ (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
+ (reg/f:DI 7 sp)] UNSPEC_TLS_GD)
+ (clobber (reg:DI 5 di))])
+
+
+ and
+
+ (parallel [
+ (set (reg:DI 0 ax)
+ (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
+ (const_int 0 [0])))
+ (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)])
+
+ */
+
+ rtx pat = PATTERN (insn);
+ rtx set = XVECEXP (pat, 0, 0);
+ gcc_assert (GET_CODE (set) == SET);
+ rtx dest = SET_DEST (set);
+ scalar_mode = mode = GET_MODE (dest);
+ val = XVECEXP (pat, 0, 1);
+ gcc_assert (GET_CODE (val) == UNSPEC);
+
+ if (tls64 == TLS64_GD)
+ kind = X86_CSE_TLS_GD;
+ else
+ kind = X86_CSE_TLS_LD_BASE;
+
+ def_insn = nullptr;
+ return true;
+}
+
+/* Return true and output def_insn, val, mode, scalar_mode and kind if
+ SET is UNSPEC_TLSDESC. */
+
+bool
+pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64)
+{
+ if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
+ return false;
+
+ rtx tls_symbol;
+ rtx_insn *set_insn;
+ rtx src = SET_SRC (set);
+ val = src;
+ tlsdesc_val = src;
+ kind = X86_CSE_TLSDESC;
+
+ if (tls64 == TLS64_COMBINE)
+ {
+ /* Record 64-bit TLS64_COMBINE:
+
+ (set (reg/f:DI 104)
+ (plus:DI (unspec:DI [
+ (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
+ (reg:DI 114)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
+ (const:DI (unspec:DI [
+ (symbol_ref:DI ("e") [flags 0x1a])
+ ] UNSPEC_DTPOFF))))
+
+ (set (reg/f:DI 104)
+ (plus:DI (unspec:DI [
+ (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
+ (unspec:DI [
+ (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
+ ] UNSPEC_TLSDESC)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
+ (const:DI (unspec:DI [
+ (symbol_ref:DI ("e") [flags 0x1a])
+ ] UNSPEC_DTPOFF))))
+ */
+
+ scalar_mode = mode = GET_MODE (src);
+
+ /* Since the first operand of PLUS in the source TLS_COMBINE
+ pattern is unused, use the second operand of PLUS:
+
+ (const:DI (unspec:DI [
+ (symbol_ref:DI ("e") [flags 0x1a])
+ ] UNSPEC_DTPOFF))
+
+ as VAL to check if 2 TLS_COMBINE patterns have the same
+ source. */
+ val = XEXP (src, 1);
+ gcc_assert (GET_CODE (val) == CONST
+ && GET_CODE (XEXP (val, 0)) == UNSPEC
+ && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF
+ && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0)));
+ def_insn = nullptr;
+ return true;
+ }
+
+ /* Record 64-bit TLS_CALL:
+
+ (set (reg:DI 101)
+ (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
+ (reg:DI 112)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
+
+ */
+
+ gcc_assert (GET_CODE (src) == UNSPEC);
+ tls_symbol = XVECEXP (src, 0, 0);
+ src = XVECEXP (src, 0, 1);
+ scalar_mode = mode = GET_MODE (src);
+ gcc_assert (REG_P (src));
+
+ /* All definitions of reg:DI 129 in
+
+ (set (reg:DI 110)
+ (unspec:DI [(symbol_ref:DI ("foo"))
+ (reg:DI 129)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
+
+ should have the same source as in
+
+ (set (reg:DI 129)
+ (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC))
+
+ */
+
+ set_insn = tls_set_insn_from_symbol (src, tls_symbol);
+ if (!set_insn)
+ return false;
+
+ /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source. */
+ val = tls_symbol;
+ def_insn = set_insn;
+ return true;
+}
+
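A similar sketch for the GNU2 dialect (hypothetical, assuming -fPIC -mtls-dialect=gnu2): each access can use its own TLS descriptor call sequence, which only clobbers FLAGS, and the pass likewise keeps one UNSPEC_TLSDESC pattern whose destination replaces the redundant ones.

    extern __thread int cache;   /* hypothetical TLS variable */

    int
    lookup (int key)
    {
      if (cache == key)    /* may use a tlsdesc call for `cache' */
        return 0;
      cache = key;         /* may use another tlsdesc call for `cache' */
      return 1;
    }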
+/* Return true and output def_insn, val, mode, scalar_mode and kind if
+ SET is a vector broadcast pattern. */
+
+bool
+pass_x86_cse::candidate_vector_p (rtx set)
+{
+ rtx src = SET_SRC (set);
+ rtx dest = SET_DEST (set);
+ mode = GET_MODE (dest);
+ /* Skip non-vector instruction. */
+ if (!VECTOR_MODE_P (mode))
+ return false;
+
+ /* Skip non-vector load instruction. */
+ if (!REG_P (dest) && !SUBREG_P (dest))
+ return false;
+
+ val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind,
+ &def_insn);
+ return val ? true : false;
+}
+
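For the vector side, a minimal sketch of the redundancy being removed (hypothetical, assuming -O2 -mavx2): both intrinsics broadcast the same scalar, so a single broadcast into the widest (256-bit) register can serve both stores, with the narrower use taking the low part of that register.

    #include <immintrin.h>

    __m128i a;
    __m256i b;

    void
    set_both (int x)    /* hypothetical function */
    {
      a = _mm_set1_epi32 (x);      /* 128-bit broadcast of x */
      b = _mm256_set1_epi32 (x);   /* 256-bit broadcast of x */
    }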
+/* At entry of the nearest common dominator for basic blocks with
+
+ 1. Vector CONST0_RTX patterns.
+ 2. Vector CONSTM1_RTX patterns.
+ 3. Vector broadcast patterns.
+ 4. UNSPEC_TLS_GD patterns.
+ 5. UNSPEC_TLS_LD_BASE patterns.
+ 6. UNSPEC_TLSDESC patterns.
+
+ generate a single pattern whose destination is used to replace the
+ source in all identical patterns.
+
+ NB: We want to generate a pattern, which is executed only once, to
+ cover the whole function. The LCM algorithm isn't appropriate here
+ since it may place a pattern inside the loop. */
+
+unsigned int
+pass_x86_cse::x86_cse (void)
{
timevar_push (TV_MACH_DEP);
- auto_vec<redundant_load *> loads;
- redundant_load *load;
+ auto_vec<redundant_pattern *> loads;
+ redundant_pattern *load;
basic_block bb;
rtx_insn *insn;
unsigned int i;
+ auto_bitmap updated_gnu_tls_insns;
+ auto_bitmap updated_gnu2_tls_insns;
df_set_flags (DF_DEFER_INSN_RESCAN);
@@ -3700,61 +4411,74 @@ remove_redundant_vector_load (void)
if (!NONDEBUG_INSN_P (insn))
continue;
+ bool matched = false;
+ /* Remove redundant patterns if there are more than 2 of
+ them. */
+ unsigned int threshold = 2;
+
rtx set = single_set (insn);
- if (!set)
+ if (!set && !CALL_P (insn))
continue;
- /* Record single set vector instruction with CONST0_RTX and
- CONSTM1_RTX source. Record basic blocks with CONST0_RTX and
- CONSTM1_RTX. Count CONST0_RTX and CONSTM1_RTX. Record the
- maximum size of CONST0_RTX and CONSTM1_RTX. */
+ tlsdesc_val = nullptr;
- rtx dest = SET_DEST (set);
- machine_mode mode = GET_MODE (dest);
- /* Skip non-vector instruction. */
- if (!VECTOR_MODE_P (mode))
- continue;
+ attr_tls64 tls64 = get_attr_tls64 (insn);
+ switch (tls64)
+ {
+ case TLS64_GD:
+ case TLS64_LD_BASE:
+ /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. */
+ if (candidate_gnu_tls_p (insn, tls64))
+ break;
+ continue;
- rtx src = SET_SRC (set);
- /* Skip non-vector load instruction. */
- if (!REG_P (dest) && !SUBREG_P (dest))
- continue;
+ case TLS64_CALL:
+ case TLS64_COMBINE:
+ /* Verify UNSPEC_TLSDESC. */
+ if (candidate_gnu2_tls_p (set, tls64))
+ break;
+ continue;
- rtx_insn *def_insn;
- machine_mode scalar_mode;
- x86_cse_kind kind;
- rtx val = ix86_broadcast_inner (src, mode, &scalar_mode,
- &kind, &def_insn);
- if (!val)
- continue;
+ case TLS64_LEA:
+ /* Skip TLS64_LEA. */
+ continue;
- /* Remove redundant register loads if there are more than 2
- loads will be used. */
- unsigned int threshold = 2;
+ case TLS64_NONE:
+ if (!set)
+ continue;
- /* Check if there is a matching redundant vector load. */
- bool matched = false;
+ /* Check for vector broadcast. */
+ if (candidate_vector_p (set))
+ break;
+ continue;
+ }
+
+ /* Check if there is a matching redundant load. */
FOR_EACH_VEC_ELT (loads, i, load)
if (load->val
&& load->kind == kind
&& load->mode == scalar_mode
&& (load->bb == bb
- || kind < X86_CSE_VEC_DUP
+ || kind != X86_CSE_VEC_DUP
/* Non all 0s/1s vector load must be in the same
basic block if it is in a recursive call. */
|| !recursive_call_p)
&& rtx_equal_p (load->val, val))
{
- /* Record vector instruction. */
+ /* Record instruction. */
bitmap_set_bit (load->insns, INSN_UID (insn));
/* Record the maximum vector size. */
- if (load->size < GET_MODE_SIZE (mode))
+ if (kind <= X86_CSE_VEC_DUP
+ && load->size < GET_MODE_SIZE (mode))
load->size = GET_MODE_SIZE (mode);
/* Record the basic block. */
bitmap_set_bit (load->bbs, bb->index);
+
+ /* Increment the count. */
load->count++;
+
matched = true;
break;
}
@@ -3762,10 +4486,17 @@ remove_redundant_vector_load (void)
if (matched)
continue;
- /* We see this vector broadcast the first time. */
- load = new redundant_load;
+ /* We see this instruction the first time. Record the
+ redundant source value, its mode, the destination size,
+ instruction which defines the redundant source value,
+ instruction basic block and the instruction kind. */
+ load = new redundant_pattern;
load->val = copy_rtx (val);
+ if (tlsdesc_val)
+ load->tlsdesc_val = copy_rtx (tlsdesc_val);
+ else
+ load->tlsdesc_val = nullptr;
load->mode = scalar_mode;
load->size = GET_MODE_SIZE (mode);
load->def_insn = def_insn;
@@ -3782,49 +4513,64 @@ remove_redundant_vector_load (void)
}
bool replaced = false;
- rtx reg, broadcast_source, broadcast_reg;
FOR_EACH_VEC_ELT (loads, i, load)
if (load->count >= load->threshold)
{
- machine_mode mode = ix86_get_vector_cse_mode (load->size,
- load->mode);
- broadcast_reg = gen_reg_rtx (mode);
- if (load->def_insn)
- {
- /* Replace redundant vector loads with a single vector load
- in the same basic block. */
- reg = load->val;
- if (load->mode != GET_MODE (reg))
- reg = gen_rtx_SUBREG (load->mode, reg, 0);
- broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
- replace_vector_const (mode, broadcast_reg, load->insns,
- load->mode);
- }
- else
+ machine_mode mode;
+ rtx reg, broadcast_source, broadcast_reg;
+ replaced = true;
+ switch (load->kind)
{
- /* This is a constant integer/double vector. If the
- inner scalar is 0 or -1, set vector to CONST0_RTX
- or CONSTM1_RTX directly. */
- rtx reg;
- switch (load->kind)
+ case X86_CSE_TLS_GD:
+ case X86_CSE_TLS_LD_BASE:
+ case X86_CSE_TLSDESC:
+ broadcast_reg = gen_reg_rtx (load->mode);
+ replace_tls_call (broadcast_reg, load->insns,
+ (load->kind == X86_CSE_TLSDESC
+ ? updated_gnu2_tls_insns
+ : updated_gnu_tls_insns));
+ load->broadcast_reg = broadcast_reg;
+ break;
+
+ case X86_CSE_CONST0_VECTOR:
+ case X86_CSE_CONSTM1_VECTOR:
+ case X86_CSE_VEC_DUP:
+ mode = ix86_get_vector_cse_mode (load->size, load->mode);
+ broadcast_reg = gen_reg_rtx (mode);
+ if (load->def_insn)
{
- case X86_CSE_CONST0_VECTOR:
- broadcast_source = CONST0_RTX (mode);
- break;
- case X86_CSE_CONSTM1_VECTOR:
- broadcast_source = CONSTM1_RTX (mode);
- break;
- default:
- reg = gen_reg_rtx (load->mode);
+ /* Replace redundant vector loads with a single vector
+ load in the same basic block. */
+ reg = load->val;
+ if (load->mode != GET_MODE (reg))
+ reg = gen_rtx_SUBREG (load->mode, reg, 0);
broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
- break;
}
+ else
+ /* This is a constant integer/double vector. If the
+ inner scalar is 0 or -1, set vector to CONST0_RTX
+ or CONSTM1_RTX directly. */
+ switch (load->kind)
+ {
+ case X86_CSE_CONST0_VECTOR:
+ broadcast_source = CONST0_RTX (mode);
+ break;
+ case X86_CSE_CONSTM1_VECTOR:
+ broadcast_source = CONSTM1_RTX (mode);
+ break;
+ case X86_CSE_VEC_DUP:
+ reg = gen_reg_rtx (load->mode);
+ broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
+ break;
+ default:
+ gcc_unreachable ();
+ }
replace_vector_const (mode, broadcast_reg, load->insns,
load->mode);
+ load->broadcast_source = broadcast_source;
+ load->broadcast_reg = broadcast_reg;
+ break;
}
- load->broadcast_source = broadcast_source;
- load->broadcast_reg = broadcast_reg;
- replaced = true;
}
if (replaced)
@@ -3839,43 +4585,75 @@ remove_redundant_vector_load (void)
FOR_EACH_VEC_ELT (loads, i, load)
if (load->count >= load->threshold)
{
+ rtx set;
if (load->def_insn)
- {
- /* Insert a broadcast after the original scalar
- definition. */
- rtx set = gen_rtx_SET (load->broadcast_reg,
- load->broadcast_source);
- insn = emit_insn_after (set, load->def_insn);
-
- if (cfun->can_throw_non_call_exceptions)
- {
- /* Handle REG_EH_REGION note in DEF_INSN. */
- rtx note = find_reg_note (load->def_insn,
- REG_EH_REGION, nullptr);
- if (note)
- {
- control_flow_insns.safe_push (load->def_insn);
- add_reg_note (insn, REG_EH_REGION,
- XEXP (note, 0));
- }
- }
+ switch (load->kind)
+ {
+ case X86_CSE_TLSDESC:
+ ix86_place_single_tls_call (load->broadcast_reg,
+ load->tlsdesc_val,
+ load->kind,
+ load->bbs,
+ updated_gnu_tls_insns,
+ updated_gnu2_tls_insns,
+ PATTERN (load->def_insn));
+ break;
+ case X86_CSE_VEC_DUP:
+ /* Insert a broadcast after the original scalar
+ definition. */
+ set = gen_rtx_SET (load->broadcast_reg,
+ load->broadcast_source);
+ insn = emit_insn_after (set, load->def_insn);
+
+ if (cfun->can_throw_non_call_exceptions)
+ {
+ /* Handle REG_EH_REGION note in DEF_INSN. */
+ rtx note = find_reg_note (load->def_insn,
+ REG_EH_REGION, nullptr);
+ if (note)
+ {
+ control_flow_insns.safe_push (load->def_insn);
+ add_reg_note (insn, REG_EH_REGION,
+ XEXP (note, 0));
+ }
+ }
- if (dump_file)
- {
- fprintf (dump_file, "\nAdd:\n\n");
- print_rtl_single (dump_file, insn);
- fprintf (dump_file, "\nafter:\n\n");
- print_rtl_single (dump_file, load->def_insn);
- fprintf (dump_file, "\n");
- }
- }
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nAdd:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, load->def_insn);
+ fprintf (dump_file, "\n");
+ }
+ break;
+ default:
+ gcc_unreachable ();
+ }
else
- ix86_place_single_vector_set (load->broadcast_reg,
- load->broadcast_source,
- load->bbs,
- (load->kind == X86_CSE_VEC_DUP
- ? load->val
- : nullptr));
+ switch (load->kind)
+ {
+ case X86_CSE_TLS_GD:
+ case X86_CSE_TLS_LD_BASE:
+ case X86_CSE_TLSDESC:
+ ix86_place_single_tls_call (load->broadcast_reg,
+ (load->kind == X86_CSE_TLSDESC
+ ? load->tlsdesc_val
+ : load->val),
+ load->kind,
+ load->bbs,
+ updated_gnu_tls_insns,
+ updated_gnu2_tls_insns);
+ break;
+ case X86_CSE_CONST0_VECTOR:
+ case X86_CSE_CONSTM1_VECTOR:
+ case X86_CSE_VEC_DUP:
+ ix86_place_single_vector_set (load->broadcast_reg,
+ load->broadcast_source,
+ load->bbs,
+ load);
+ break;
+ }
}
loop_optimizer_finalize ();
@@ -3905,48 +4683,12 @@ remove_redundant_vector_load (void)
return 0;
}
-namespace {
-
-const pass_data pass_data_remove_redundant_vector_load =
-{
- RTL_PASS, /* type */
- "rrvl", /* name */
- OPTGROUP_NONE, /* optinfo_flags */
- TV_MACH_DEP, /* tv_id */
- 0, /* properties_required */
- 0, /* properties_provided */
- 0, /* properties_destroyed */
- 0, /* todo_flags_start */
- 0, /* todo_flags_finish */
-};
-
-class pass_remove_redundant_vector_load : public rtl_opt_pass
-{
-public:
- pass_remove_redundant_vector_load (gcc::context *ctxt)
- : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt)
- {}
-
- /* opt_pass methods: */
- bool gate (function *fun) final override
- {
- return (TARGET_SSE2
- && optimize
- && optimize_function_for_speed_p (fun));
- }
-
- unsigned int execute (function *) final override
- {
- return remove_redundant_vector_load ();
- }
-}; // class pass_remove_redundant_vector_load
-
} // anon namespace
rtl_opt_pass *
-make_pass_remove_redundant_vector_load (gcc::context *ctxt)
+make_pass_x86_cse (gcc::context *ctxt)
{
- return new pass_remove_redundant_vector_load (ctxt);
+ return new pass_x86_cse (ctxt);
}
/* Convert legacy instructions that clobbers EFLAGS to APX_NF