Diffstat (limited to 'gcc/config/i386')
-rw-r--r--  gcc/config/i386/i386-expand.cc      129
-rw-r--r--  gcc/config/i386/i386-features.cc   1130
-rw-r--r--  gcc/config/i386/i386-modes.def        2
-rw-r--r--  gcc/config/i386/i386-options.cc      45
-rw-r--r--  gcc/config/i386/i386-passes.def       2
-rw-r--r--  gcc/config/i386/i386-protos.h         5
-rw-r--r--  gcc/config/i386/i386.cc             305
-rw-r--r--  gcc/config/i386/i386.h               59
-rw-r--r--  gcc/config/i386/i386.md             449
-rw-r--r--  gcc/config/i386/i386.opt              4
-rw-r--r--  gcc/config/i386/predicates.md        17
-rw-r--r--  gcc/config/i386/sse.md              135
-rw-r--r--  gcc/config/i386/x86-tune-costs.h    192
13 files changed, 1904 insertions, 570 deletions
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 09aa9b1..3278f1f 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -3151,7 +3151,7 @@ ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
}
/* Expand floating point op0 <=> op1, i.e.
- dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
+ dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : -128. */
void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
@@ -3264,7 +3264,7 @@ ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
if (l2)
{
emit_label (l2);
- emit_move_insn (dest, op2 == const0_rtx ? const2_rtx : op2);
+ emit_move_insn (dest, op2 == const0_rtx ? GEN_INT (-128) : op2);
}
emit_label (lend);
}
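Reviewer note: the revised mapping is easiest to read as a scalar sketch (illustrative only; the actual expansion emits compares, branches and conditional moves). The unordered case now produces -128 instead of 2.

/* Hedged sketch of the <=> result encoding after this change.  */
int
fp_spaceship (double a, double b)
{
  if (a == b)
    return 0;
  if (a < b)
    return -1;
  if (a > b)
    return 1;
  return -128;	/* unordered: at least one operand is a NaN.  */
}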
@@ -8241,8 +8241,10 @@ expand_cpymem_epilogue (rtx destmem, rtx srcmem,
unsigned HOST_WIDE_INT countval = UINTVAL (count);
unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
unsigned int destalign = MEM_ALIGN (destmem);
+ cfun->machine->by_pieces_in_use = true;
move_by_pieces (destmem, srcmem, epilogue_size, destalign,
RETURN_BEGIN);
+ cfun->machine->by_pieces_in_use = false;
return;
}
if (max_size > 8)
@@ -8405,8 +8407,8 @@ expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
/* Callback routine for store_by_pieces. Return the RTL of a register
containing GET_MODE_SIZE (MODE) bytes in the RTL register op_p which
- is a word or a word vector register. If PREV_P isn't nullptr, it
- has the RTL info from the previous iteration. */
+ is an integer or a word vector register. If PREV_P isn't nullptr,
+ it has the RTL info from the previous iteration. */
static rtx
setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
@@ -8435,10 +8437,6 @@ setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
rtx op = (rtx) op_p;
machine_mode op_mode = GET_MODE (op);
- gcc_assert (op_mode == word_mode
- || (VECTOR_MODE_P (op_mode)
- && GET_MODE_INNER (op_mode) == word_mode));
-
if (VECTOR_MODE_P (mode))
{
gcc_assert (GET_MODE_INNER (mode) == QImode);
@@ -8460,16 +8458,17 @@ setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
return tmp;
}
- target = gen_reg_rtx (word_mode);
if (VECTOR_MODE_P (op_mode))
{
+ gcc_assert (GET_MODE_INNER (op_mode) == word_mode);
+ target = gen_reg_rtx (word_mode);
op = gen_rtx_SUBREG (word_mode, op, 0);
emit_move_insn (target, op);
}
else
target = op;
- if (mode == word_mode)
+ if (mode == GET_MODE (target))
return target;
rtx tmp = gen_reg_rtx (mode);
@@ -8490,9 +8489,11 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
unsigned HOST_WIDE_INT countval = UINTVAL (count);
unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
unsigned int destalign = MEM_ALIGN (destmem);
+ cfun->machine->by_pieces_in_use = true;
store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
vec_value ? vec_value : value, destalign, true,
RETURN_BEGIN);
+ cfun->machine->by_pieces_in_use = false;
return;
}
if (max_size > 32)
@@ -9574,8 +9575,9 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
case vector_loop:
need_zero_guard = true;
unroll_factor = 4;
- /* Get the vector mode to move MOVE_MAX bytes. */
- nunits = MOVE_MAX / GET_MODE_SIZE (word_mode);
+ /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes. */
+ nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
+ nunits /= GET_MODE_SIZE (word_mode);
if (nunits > 1)
{
move_mode = mode_for_vector (word_mode, nunits).require ();
@@ -27033,6 +27035,109 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
return target;
}
+/* GF2P8AFFINEQB matrices to implement shift and rotate.  */
+
+static const uint64_t matrix_ashift[8] =
+{
+ 0,
+ 0x0001020408102040, /* 1 l */
+ 0x0000010204081020, /* 2 l */
+ 0x0000000102040810, /* 3 l */
+ 0x0000000001020408, /* 4 l */
+ 0x0000000000010204, /* 5 l */
+ 0x0000000000000102, /* 6 l */
+ 0x0000000000000001 /* 7 l */
+};
+
+static const uint64_t matrix_lshiftrt[8] =
+{
+ 0,
+ 0x0204081020408000, /* 1 r */
+ 0x0408102040800000, /* 2 r */
+ 0x0810204080000000, /* 3 r */
+ 0x1020408000000000, /* 4 r */
+ 0x2040800000000000, /* 5 r */
+ 0x4080000000000000, /* 6 r */
+ 0x8000000000000000 /* 7 r */
+};
+
+static const uint64_t matrix_ashiftrt[8] =
+{
+ 0,
+ 0x0204081020408080, /* 1 r */
+ 0x0408102040808080, /* 2 r */
+ 0x0810204080808080, /* 3 r */
+ 0x1020408080808080, /* 4 r */
+ 0x2040808080808080, /* 5 r */
+ 0x4080808080808080, /* 6 r */
+ 0x8080808080808080 /* 7 r */
+};
+
+static const uint64_t matrix_rotate[8] =
+{
+ 0,
+ 0x8001020408102040, /* 1 rol8 */
+ 0x4080010204081020, /* 2 rol8 */
+ 0x2040800102040810, /* 3 rol8 */
+ 0x1020408001020408, /* 4 rol8 */
+ 0x0810204080010204, /* 5 rol8 */
+ 0x0408102040800102, /* 6 rol8 */
+ 0x0204081020408001 /* 7 rol8 */
+};
+
+static const uint64_t matrix_rotatert[8] =
+{
+ 0,
+ 0x0204081020408001, /* 1 ror8 */
+ 0x0408102040800102, /* 2 ror8 */
+ 0x0810204080010204, /* 3 ror8 */
+ 0x1020408001020408, /* 4 ror8 */
+ 0x2040800102040810, /* 5 ror8 */
+ 0x4080010204081020, /* 6 ror8 */
+ 0x8001020408102040 /* 7 ror8 */
+};
+
+/* Return an rtx to load the 64-bit GF2P8AFFINE GF(2) matrix implementing
+   a shift for CODE with shift count COUNT into a register with the same
+   vector mode as SRC.  */
+
+rtx
+ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code)
+{
+ machine_mode mode = GET_MODE (src);
+ const uint64_t *matrix;
+ unsigned shift = INTVAL (count) & 7;
+ gcc_assert (shift > 0 && shift < 8);
+
+ switch (code)
+ {
+ case ASHIFT:
+ matrix = matrix_ashift;
+ break;
+ case ASHIFTRT:
+ matrix = matrix_ashiftrt;
+ break;
+ case LSHIFTRT:
+ matrix = matrix_lshiftrt;
+ break;
+ case ROTATE:
+ matrix = matrix_rotate;
+ break;
+ case ROTATERT:
+ matrix = matrix_rotatert;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ int nelts = GET_MODE_NUNITS (mode);
+ rtvec vec = rtvec_alloc (nelts);
+ uint64_t ma = matrix[shift];
+ for (int i = 0; i < nelts; i++)
+ RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode);
+
+ return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec));
+}
+
/* Trunc a vector to a narrow vector, like v4di -> v4si. */
void
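Reviewer note: these tables can be cross-checked against the definition of the byte-wise GF(2) affine transform used by GF2P8AFFINEQB, where result bit i is the parity of (matrix byte (7 - i) AND data byte); this is the same little-endian byte order that the loop in ix86_vgf2p8affine_shift_matrix replicates across the vector. A minimal standalone sketch (illustrative, not GCC code):

#include <assert.h>
#include <stdint.h>

/* Byte-wise GF(2) affine transform as performed by GF2P8AFFINEQB:
   result bit i is the parity of (matrix byte (7 - i) & data byte).  */
static uint8_t
gf2p8affine_byte (uint64_t matrix, uint8_t b)
{
  uint8_t r = 0;
  for (int i = 0; i < 8; i++)
    {
      uint8_t row = (matrix >> ((7 - i) * 8)) & 0xff;
      r |= __builtin_parity (row & b) << i;
    }
  return r;
}

int
main (void)
{
  for (int b = 0; b < 256; b++)
    {
      /* matrix_ashift[1]: every byte shifted left by 1.  */
      assert (gf2p8affine_byte (0x0001020408102040, b) == (uint8_t) (b << 1));
      /* matrix_rotate[1]: every byte rotated left by 1.  */
      assert (gf2p8affine_byte (0x8001020408102040, b)
	      == (uint8_t) ((b << 1) | (b >> 7)));
    }
  return 0;
}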
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c131577..0608dd2 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3085,21 +3085,68 @@ ix86_rpad_gate ()
&& optimize_function_for_speed_p (cfun));
}
+enum x86_cse_kind
+{
+ X86_CSE_CONST0_VECTOR,
+ X86_CSE_CONSTM1_VECTOR,
+ X86_CSE_VEC_DUP,
+ X86_CSE_TLS_GD,
+ X86_CSE_TLS_LD_BASE,
+ X86_CSE_TLSDESC
+};
+
+struct redundant_pattern
+{
+ /* Bitmap of basic blocks with broadcast instructions. */
+ auto_bitmap bbs;
+ /* Bitmap of broadcast instructions. */
+ auto_bitmap insns;
+ /* The broadcast inner scalar. */
+ rtx val;
+ /* The actual redundant source value for UNSPEC_TLSDESC. */
+ rtx tlsdesc_val;
+ /* The inner scalar mode. */
+ machine_mode mode;
+ /* The instruction which sets the inner scalar. Nullptr if the inner
+ scalar is applied to the whole function, instead of within the same
+ block. */
+ rtx_insn *def_insn;
+ /* The widest broadcast source. */
+ rtx broadcast_source;
+ /* The widest broadcast register. */
+ rtx broadcast_reg;
+ /* The basic block of the broadcast instruction. */
+ basic_block bb;
+ /* The number of broadcast instructions with the same inner scalar. */
+ unsigned HOST_WIDE_INT count;
+ /* The threshold of broadcast instructions with the same inner
+ scalar. */
+ unsigned int threshold;
+ /* The widest broadcast size in bytes. */
+ unsigned int size;
+ /* Load kind. */
+ x86_cse_kind kind;
+};
+
/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
for basic block map BBS, which is in the fake loop that contains the
whole function, so that there is only a single vector set in the
- whole function. If not nullptr, INNER_SCALAR is the inner scalar of
- SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)). */
+ whole function. If not nullptr, LOAD is a pointer to the load. */
static void
ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
- rtx inner_scalar = nullptr)
+ redundant_pattern *load = nullptr)
{
basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
- while (bb->loop_father->latch
- != EXIT_BLOCK_PTR_FOR_FN (cfun))
- bb = get_immediate_dominator (CDI_DOMINATORS,
- bb->loop_father->header);
+ /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop
+ to avoid extra spills. */
+ if (!load || load->kind != X86_CSE_VEC_DUP)
+ {
+ while (bb->loop_father->latch
+ != EXIT_BLOCK_PTR_FOR_FN (cfun))
+ bb = get_immediate_dominator (CDI_DOMINATORS,
+ bb->loop_father->header);
+ }
rtx set = gen_rtx_SET (dest, src);
@@ -3141,8 +3188,14 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
}
}
- if (inner_scalar)
+ if (load && load->kind == X86_CSE_VEC_DUP)
{
+ /* Get the source from LOAD as (reg:SI 99) in
+
+ (vec_duplicate:V4SI (reg:SI 99))
+
+ */
+ rtx inner_scalar = load->val;
/* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */
rtx reg = XEXP (src, 0);
if ((REG_P (inner_scalar) || MEM_P (inner_scalar))
@@ -3226,7 +3279,7 @@ remove_partial_avx_dependency (void)
break;
}
- /* Only hanlde conversion here. */
+ /* Only handle conversion here. */
machine_mode src_mode
= convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
switch (src_mode)
@@ -3489,44 +3542,6 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const,
}
}
-enum x86_cse_kind
-{
- X86_CSE_CONST0_VECTOR,
- X86_CSE_CONSTM1_VECTOR,
- X86_CSE_VEC_DUP
-};
-
-struct redundant_load
-{
- /* Bitmap of basic blocks with broadcast instructions. */
- auto_bitmap bbs;
- /* Bitmap of broadcast instructions. */
- auto_bitmap insns;
- /* The broadcast inner scalar. */
- rtx val;
- /* The inner scalar mode. */
- machine_mode mode;
- /* The instruction which sets the inner scalar. Nullptr if the inner
- scalar is applied to the whole function, instead of within the same
- block. */
- rtx_insn *def_insn;
- /* The widest broadcast source. */
- rtx broadcast_source;
- /* The widest broadcast register. */
- rtx broadcast_reg;
- /* The basic block of the broadcast instruction. */
- basic_block bb;
- /* The number of broadcast instructions with the same inner scalar. */
- unsigned HOST_WIDE_INT count;
- /* The threshold of broadcast instructions with the same inner
- scalar. */
- unsigned int threshold;
- /* The widest broadcast size in bytes. */
- unsigned int size;
- /* Load kind. */
- x86_cse_kind kind;
-};
-
/* Return the inner scalar if OP is a broadcast, else return nullptr. */
static rtx
@@ -3629,6 +3644,8 @@ ix86_broadcast_inner (rtx op, machine_mode mode,
Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an
integer constant. */
op = src;
+ if (mode != GET_MODE (reg))
+ op = gen_int_mode (INTVAL (src), mode);
*insn_p = nullptr;
}
else
@@ -3669,25 +3686,719 @@ ix86_broadcast_inner (rtx op, machine_mode mode,
return op;
}
-/* At entry of the nearest common dominator for basic blocks with vector
- CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
- vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
- uses.
+/* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC and
+ put the updated instruction in UPDATED_TLS_INSNS. */
- NB: We want to generate only a single widest vector set to cover the
- whole function. The LCM algorithm isn't appropriate here since it
- may place a vector set inside the loop. */
+static void
+replace_tls_call (rtx src, auto_bitmap &tls_call_insns,
+ auto_bitmap &updated_tls_insns)
+{
+ bitmap_iterator bi;
+ unsigned int id;
-static unsigned int
-remove_redundant_vector_load (void)
+ EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi)
+ {
+ rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
+
+ /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are
+ allowed. */
+ if (!CALL_P (insn))
+ {
+ attr_tls64 tls64 = get_attr_tls64 (insn);
+ if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE)
+ gcc_unreachable ();
+ }
+
+ rtx pat = PATTERN (insn);
+ gcc_assert (GET_CODE (pat) == PARALLEL);
+ rtx set = XVECEXP (pat, 0, 0);
+ gcc_assert (GET_CODE (set) == SET);
+ rtx dest = SET_DEST (set);
+
+ set = gen_rtx_SET (dest, src);
+ rtx_insn *set_insn = emit_insn_after (set, insn);
+ if (recog_memoized (set_insn) < 0)
+ gcc_unreachable ();
+
+ /* Put SET_INSN in UPDATED_TLS_INSNS. */
+ bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn));
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nReplace:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\nwith:\n\n");
+ print_rtl_single (dump_file, set_insn);
+ fprintf (dump_file, "\n");
+ }
+
+ /* Delete the CALL insn. */
+ delete_insn (insn);
+
+ df_insn_rescan (set_insn);
+ }
+}
+
+/* Return the basic block which dominates all basic blocks which set
+ hard register REGNO used in basic block BB. */
+
+static basic_block
+ix86_get_dominator_for_reg (unsigned int regno, basic_block bb)
+{
+ basic_block set_bb;
+ auto_bitmap set_bbs;
+
+ /* Get all BBs which set REGNO and dominate the current BB from all
+ DEFs of REGNO. */
+ for (df_ref def = DF_REG_DEF_CHAIN (regno);
+ def;
+ def = DF_REF_NEXT_REG (def))
+ if (!DF_REF_IS_ARTIFICIAL (def)
+ && !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER)
+ && !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER))
+ {
+ set_bb = DF_REF_BB (def);
+ if (dominated_by_p (CDI_DOMINATORS, bb, set_bb))
+ bitmap_set_bit (set_bbs, set_bb->index);
+ }
+
+ bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
+ return bb;
+}
+
+/* Mark FLAGS register as live in DATA, a bitmap of live caller-saved
+ registers, if DEST is FLAGS register. */
+
+static void
+ix86_check_flags_reg (rtx dest, const_rtx, void *data)
+{
+ auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data;
+ if (REG_P (dest) && REGNO (dest) == FLAGS_REG)
+ bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG);
+}
+
+/* Emit a TLS_SET instruction of KIND in basic block BB. Store the
+ insertion point in *BEFORE_P for emit_insn_before or in *AFTER_P
+ for emit_insn_after. UPDATED_GNU_TLS_INSNS contains instructions
+ which replace the GNU TLS instructions. UPDATED_GNU2_TLS_INSNS
+ contains instructions which replace the GNU2 TLS instructions. */
+
+static rtx_insn *
+ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb,
+ rtx_insn **before_p, rtx_insn **after_p,
+ auto_bitmap &updated_gnu_tls_insns,
+ auto_bitmap &updated_gnu2_tls_insns)
+{
+ rtx_insn *tls_insn;
+
+ do
+ {
+ rtx_insn *insn = BB_HEAD (bb);
+ while (insn && !NONDEBUG_INSN_P (insn))
+ {
+ if (insn == BB_END (bb))
+ {
+ /* This must be the beginning basic block:
+
+ (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)
+
+ or a basic block with only a label:
+
+ (code_label 78 11 77 3 14 (nil) [1 uses])
+ (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
+
+ or a basic block with only a debug marker:
+
+ (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
+ (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
+
+ */
+ gcc_assert (DEBUG_INSN_P (insn)
+ || (NOTE_P (insn)
+ && ((NOTE_KIND (insn)
+ == NOTE_INSN_FUNCTION_BEG)
+ || (NOTE_KIND (insn)
+ == NOTE_INSN_BASIC_BLOCK))));
+ insn = NULL;
+ break;
+ }
+ insn = NEXT_INSN (insn);
+ }
+
+      /* TLS_GD and TLS_LD_BASE instructions are normal function calls
+	 which clobber caller-saved registers.  TLSDESC instructions
+	 only clobber FLAGS.  If any registers clobbered by TLS
+	 instructions are live in this basic block, we must insert the
+	 TLS instructions after all such live registers become dead.  */
+
+ auto_bitmap live_caller_saved_regs;
+ bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb);
+
+ if (bitmap_bit_p (in, FLAGS_REG))
+ bitmap_set_bit (live_caller_saved_regs, FLAGS_REG);
+
+ unsigned int i;
+
+ /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE
+ instructions. */
+ if (kind != X86_CSE_TLSDESC)
+ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ if (call_used_regs[i]
+ && !fixed_regs[i]
+ && bitmap_bit_p (in, i))
+ bitmap_set_bit (live_caller_saved_regs, i);
+
+ if (bitmap_empty_p (live_caller_saved_regs))
+ {
+ if (insn == BB_HEAD (bb))
+ {
+ *before_p = insn;
+ tls_insn = emit_insn_before (tls_set, insn);
+ }
+ else
+ {
+ /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the
+ beginning basic block:
+
+ (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)
+
+ or after NOTE_INSN_BASIC_BLOCK in a basic block with
+ only a label:
+
+ (code_label 78 11 77 3 14 (nil) [1 uses])
+ (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
+
+ or after debug marker in a basic block with only a
+ debug marker:
+
+ (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
+ (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
+ (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
+
+ */
+ insn = insn ? PREV_INSN (insn) : BB_END (bb);
+ *after_p = insn;
+ tls_insn = emit_insn_after (tls_set, insn);
+ }
+ return tls_insn;
+ }
+
+ bool repeat = false;
+
+ /* Search for REG_DEAD notes in this basic block. */
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+
+	  /* NB: A conditional jump is the only instruction which reads
+	     the flags register and changes control flow.  We can never
+	     place the TLS call after an unconditional jump.  */
+ if (JUMP_P (insn))
+ {
+ /* This must be a conditional jump. */
+ rtx label = JUMP_LABEL (insn);
+ if (label == nullptr
+ || ANY_RETURN_P (label)
+ || !(LABEL_P (label) || SYMBOL_REF_P (label)))
+ gcc_unreachable ();
+
+	      /* Place the call before all FLAGS_REG setting BBs since
+		 we can't place a call either before or after a
+		 conditional jump.  */
+ bb = ix86_get_dominator_for_reg (FLAGS_REG, bb);
+
+ /* Start over again. */
+ repeat = true;
+ break;
+ }
+
+ if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn)))
+ {
+ /* Insert the __tls_get_addr call before INSN which
+ replaces a __tls_get_addr call. */
+ *before_p = insn;
+ tls_insn = emit_insn_before (tls_set, insn);
+ return tls_insn;
+ }
+
+ if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn)))
+ {
+ /* Mark FLAGS register as dead since FLAGS register
+ would be clobbered by the GNU2 TLS instruction. */
+ bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG);
+ continue;
+ }
+
+ /* Check if FLAGS register is live. */
+ note_stores (insn, ix86_check_flags_reg,
+ &live_caller_saved_regs);
+
+ rtx link;
+ for (link = REG_NOTES (insn); link; link = XEXP (link, 1))
+ if (REG_NOTE_KIND (link) == REG_DEAD
+ && REG_P (XEXP (link, 0)))
+ {
+ /* Mark the live caller-saved register as dead. */
+ for (i = REGNO (XEXP (link, 0));
+ i < END_REGNO (XEXP (link, 0));
+ i++)
+ if (i < FIRST_PSEUDO_REGISTER)
+ bitmap_clear_bit (live_caller_saved_regs, i);
+
+ if (bitmap_empty_p (live_caller_saved_regs))
+ {
+ *after_p = insn;
+ tls_insn = emit_insn_after (tls_set, insn);
+ return tls_insn;
+ }
+ }
+ }
+
+ /* NB: Start over again for conditional jump. */
+ if (repeat)
+ continue;
+
+ gcc_assert (!bitmap_empty_p (live_caller_saved_regs));
+
+ /* If any live caller-saved registers aren't dead at the end of
+ this basic block, get the basic block which dominates all
+ basic blocks which set the remaining live registers. */
+ auto_bitmap set_bbs;
+ bitmap_iterator bi;
+ unsigned int id;
+ EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi)
+ {
+ basic_block set_bb = ix86_get_dominator_for_reg (id, bb);
+ bitmap_set_bit (set_bbs, set_bb->index);
+ }
+ bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
+ }
+ while (true);
+}
+
+/* Generate a TLS call of KIND with VAL and copy the call result to DEST,
+ at entry of the nearest dominator for basic block map BBS, which is in
+ the fake loop that contains the whole function, so that there is only
+ a single TLS CALL of KIND with VAL in the whole function.
+ UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS
+ instructions. UPDATED_GNU2_TLS_INSNS contains instructions which
+ replace the GNU2 TLS instructions. If TLSDESC_SET isn't nullptr,
+ insert it before the TLS call. */
+
+static void
+ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind,
+ auto_bitmap &bbs,
+ auto_bitmap &updated_gnu_tls_insns,
+ auto_bitmap &updated_gnu2_tls_insns,
+ rtx tlsdesc_set = nullptr)
+{
+ basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
+ while (bb->loop_father->latch
+ != EXIT_BLOCK_PTR_FOR_FN (cfun))
+ bb = get_immediate_dominator (CDI_DOMINATORS,
+ bb->loop_father->header);
+
+ rtx rax = nullptr, rdi;
+ rtx eqv = nullptr;
+ rtx caddr;
+ rtx set;
+ rtx clob;
+ rtx symbol;
+ rtx tls;
+
+ switch (kind)
+ {
+ case X86_CSE_TLS_GD:
+ rax = gen_rtx_REG (Pmode, AX_REG);
+ rdi = gen_rtx_REG (Pmode, DI_REG);
+ caddr = ix86_tls_get_addr ();
+
+ symbol = XVECEXP (val, 0, 0);
+ tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi);
+
+ if (GET_MODE (symbol) != Pmode)
+ symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol);
+ eqv = symbol;
+ break;
+
+ case X86_CSE_TLS_LD_BASE:
+ rax = gen_rtx_REG (Pmode, AX_REG);
+ rdi = gen_rtx_REG (Pmode, DI_REG);
+ caddr = ix86_tls_get_addr ();
+
+ tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi);
+
+ /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers
+ to share the LD_BASE result with other LD model accesses. */
+ eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
+ UNSPEC_TLS_LD_BASE);
+
+ break;
+
+ case X86_CSE_TLSDESC:
+ set = gen_rtx_SET (dest, val);
+ clob = gen_rtx_CLOBBER (VOIDmode,
+ gen_rtx_REG (CCmode, FLAGS_REG));
+ tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Emit the TLS CALL insn. */
+ rtx_insn *before = nullptr;
+ rtx_insn *after = nullptr;
+ rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before,
+ &after,
+ updated_gnu_tls_insns,
+ updated_gnu2_tls_insns);
+
+ rtx_insn *tlsdesc_insn = nullptr;
+ if (tlsdesc_set)
+ {
+ rtx dest = copy_rtx (SET_DEST (tlsdesc_set));
+ rtx src = copy_rtx (SET_SRC (tlsdesc_set));
+ tlsdesc_set = gen_rtx_SET (dest, src);
+ tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn);
+ }
+
+ if (kind != X86_CSE_TLSDESC)
+ {
+ RTL_CONST_CALL_P (tls_insn) = 1;
+
+ /* Indicate that this function can't jump to non-local gotos. */
+ make_reg_eh_region_note_nothrow_nononlocal (tls_insn);
+ }
+
+ if (recog_memoized (tls_insn) < 0)
+ gcc_unreachable ();
+
+ if (dump_file)
+ {
+ if (after)
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ if (tlsdesc_insn)
+ print_rtl_single (dump_file, tlsdesc_insn);
+ print_rtl_single (dump_file, tls_insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, after);
+ fprintf (dump_file, "\n");
+ }
+ else
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ if (tlsdesc_insn)
+ print_rtl_single (dump_file, tlsdesc_insn);
+ print_rtl_single (dump_file, tls_insn);
+ fprintf (dump_file, "\nbefore:\n\n");
+ print_rtl_single (dump_file, before);
+ fprintf (dump_file, "\n");
+ }
+ }
+
+ if (kind != X86_CSE_TLSDESC)
+ {
+ /* Copy RAX to DEST. */
+ set = gen_rtx_SET (dest, rax);
+ rtx_insn *set_insn = emit_insn_after (set, tls_insn);
+ set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest);
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nPlace:\n\n");
+ print_rtl_single (dump_file, set_insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, tls_insn);
+ fprintf (dump_file, "\n");
+ }
+ }
+}
+
+namespace {
+
+const pass_data pass_data_x86_cse =
+{
+ RTL_PASS, /* type */
+ "x86_cse", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_MACH_DEP, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_x86_cse : public rtl_opt_pass
+{
+public:
+ pass_x86_cse (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_x86_cse, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ bool gate (function *fun) final override
+ {
+ return (TARGET_SSE2
+ && optimize
+ && optimize_function_for_speed_p (fun));
+ }
+
+ unsigned int execute (function *) final override
+ {
+ return x86_cse ();
+ }
+
+private:
+ /* The redundant source value. */
+ rtx val;
+ /* The actual redundant source value for UNSPEC_TLSDESC. */
+ rtx tlsdesc_val;
+ /* The instruction which defines the redundant value. */
+ rtx_insn *def_insn;
+ /* Mode of the destination of the candidate redundant instruction. */
+ machine_mode mode;
+ /* Mode of the source of the candidate redundant instruction. */
+ machine_mode scalar_mode;
+ /* The classification of the candidate redundant instruction. */
+ x86_cse_kind kind;
+
+ unsigned int x86_cse (void);
+ bool candidate_gnu_tls_p (rtx_insn *, attr_tls64);
+ bool candidate_gnu2_tls_p (rtx, attr_tls64);
+ bool candidate_vector_p (rtx);
+ rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx);
+}; // class pass_x86_cse
+
+/* Return the instruction which sets REG from TLS_SYMBOL. */
+
+rtx_insn *
+pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg,
+ const_rtx tls_symbol)
+{
+ rtx_insn *set_insn = nullptr;
+ for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
+ ref;
+ ref = DF_REF_NEXT_REG (ref))
+ {
+ if (DF_REF_IS_ARTIFICIAL (ref))
+ return nullptr;
+
+ set_insn = DF_REF_INSN (ref);
+ if (get_attr_tls64 (set_insn) != TLS64_LEA)
+ return nullptr;
+
+ rtx tls_set = PATTERN (set_insn);
+ rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0);
+ if (!rtx_equal_p (tls_symbol, tls_src))
+ return nullptr;
+ }
+
+ return set_insn;
+}
+
+/* Return true and output def_insn, val, mode, scalar_mode and kind if
+ INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. */
+
+bool
+pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64)
+{
+ if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
+ return false;
+
+ /* Record the redundant TLS CALLs for 64-bit:
+
+ (parallel [
+ (set (reg:DI 0 ax)
+ (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
+ (const_int 0 [0])))
+ (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
+ (reg/f:DI 7 sp)] UNSPEC_TLS_GD)
+ (clobber (reg:DI 5 di))])
+
+
+ and
+
+ (parallel [
+ (set (reg:DI 0 ax)
+ (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
+ (const_int 0 [0])))
+ (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)])
+
+ */
+
+ rtx pat = PATTERN (insn);
+ rtx set = XVECEXP (pat, 0, 0);
+ gcc_assert (GET_CODE (set) == SET);
+ rtx dest = SET_DEST (set);
+ scalar_mode = mode = GET_MODE (dest);
+ val = XVECEXP (pat, 0, 1);
+ gcc_assert (GET_CODE (val) == UNSPEC);
+
+ if (tls64 == TLS64_GD)
+ kind = X86_CSE_TLS_GD;
+ else
+ kind = X86_CSE_TLS_LD_BASE;
+
+ def_insn = nullptr;
+ return true;
+}
+
+/* Return true and output def_insn, val, mode, scalar_mode and kind if
+ SET is UNSPEC_TLSDESC. */
+
+bool
+pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64)
+{
+ if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
+ return false;
+
+ rtx tls_symbol;
+ rtx_insn *set_insn;
+ rtx src = SET_SRC (set);
+ val = src;
+ tlsdesc_val = src;
+ kind = X86_CSE_TLSDESC;
+
+ if (tls64 == TLS64_COMBINE)
+ {
+ /* Record 64-bit TLS64_COMBINE:
+
+ (set (reg/f:DI 104)
+ (plus:DI (unspec:DI [
+ (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
+ (reg:DI 114)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
+ (const:DI (unspec:DI [
+ (symbol_ref:DI ("e") [flags 0x1a])
+ ] UNSPEC_DTPOFF))))
+
+ (set (reg/f:DI 104)
+ (plus:DI (unspec:DI [
+ (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
+ (unspec:DI [
+ (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
+ ] UNSPEC_TLSDESC)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
+ (const:DI (unspec:DI [
+ (symbol_ref:DI ("e") [flags 0x1a])
+ ] UNSPEC_DTPOFF))))
+ */
+
+ scalar_mode = mode = GET_MODE (src);
+
+ /* Since the first operand of PLUS in the source TLS_COMBINE
+ pattern is unused, use the second operand of PLUS:
+
+ (const:DI (unspec:DI [
+ (symbol_ref:DI ("e") [flags 0x1a])
+ ] UNSPEC_DTPOFF))
+
+ as VAL to check if 2 TLS_COMBINE patterns have the same
+ source. */
+ val = XEXP (src, 1);
+ gcc_assert (GET_CODE (val) == CONST
+ && GET_CODE (XEXP (val, 0)) == UNSPEC
+ && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF
+ && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0)));
+ def_insn = nullptr;
+ return true;
+ }
+
+ /* Record 64-bit TLS_CALL:
+
+ (set (reg:DI 101)
+ (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
+ (reg:DI 112)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
+
+ */
+
+ gcc_assert (GET_CODE (src) == UNSPEC);
+ tls_symbol = XVECEXP (src, 0, 0);
+ src = XVECEXP (src, 0, 1);
+ scalar_mode = mode = GET_MODE (src);
+ gcc_assert (REG_P (src));
+
+ /* All definitions of reg:DI 129 in
+
+ (set (reg:DI 110)
+ (unspec:DI [(symbol_ref:DI ("foo"))
+ (reg:DI 129)
+ (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
+
+ should have the same source as in
+
+ (set (reg:DI 129)
+ (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC))
+
+ */
+
+ set_insn = tls_set_insn_from_symbol (src, tls_symbol);
+ if (!set_insn)
+ return false;
+
+ /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source. */
+ val = tls_symbol;
+ def_insn = set_insn;
+ return true;
+}
+
+/* Return true and output def_insn, val, mode, scalar_mode and kind if
+   SET is a vector broadcast pattern.  */
+
+bool
+pass_x86_cse::candidate_vector_p (rtx set)
+{
+ rtx src = SET_SRC (set);
+ rtx dest = SET_DEST (set);
+ mode = GET_MODE (dest);
+ /* Skip non-vector instruction. */
+ if (!VECTOR_MODE_P (mode))
+ return false;
+
+ /* Skip non-vector load instruction. */
+ if (!REG_P (dest) && !SUBREG_P (dest))
+ return false;
+
+ val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind,
+ &def_insn);
+ return val ? true : false;
+}
+
+/* At entry of the nearest common dominator for basic blocks with
+
+ 1. Vector CONST0_RTX patterns.
+ 2. Vector CONSTM1_RTX patterns.
+ 3. Vector broadcast patterns.
+ 4. UNSPEC_TLS_GD patterns.
+ 5. UNSPEC_TLS_LD_BASE patterns.
+ 6. UNSPEC_TLSDESC patterns.
+
+ generate a single pattern whose destination is used to replace the
+ source in all identical patterns.
+
+ NB: We want to generate a pattern, which is executed only once, to
+ cover the whole function. The LCM algorithm isn't appropriate here
+ since it may place a pattern inside the loop. */
+
+unsigned int
+pass_x86_cse::x86_cse (void)
{
timevar_push (TV_MACH_DEP);
- auto_vec<redundant_load *> loads;
- redundant_load *load;
+ auto_vec<redundant_pattern *> loads;
+ redundant_pattern *load;
basic_block bb;
rtx_insn *insn;
unsigned int i;
+ auto_bitmap updated_gnu_tls_insns;
+ auto_bitmap updated_gnu2_tls_insns;
df_set_flags (DF_DEFER_INSN_RESCAN);
@@ -3700,61 +4411,74 @@ remove_redundant_vector_load (void)
if (!NONDEBUG_INSN_P (insn))
continue;
+ bool matched = false;
+	/* Remove redundant patterns if there are 2 or more of
+	   them.  */
+ unsigned int threshold = 2;
+
rtx set = single_set (insn);
- if (!set)
+ if (!set && !CALL_P (insn))
continue;
- /* Record single set vector instruction with CONST0_RTX and
- CONSTM1_RTX source. Record basic blocks with CONST0_RTX and
- CONSTM1_RTX. Count CONST0_RTX and CONSTM1_RTX. Record the
- maximum size of CONST0_RTX and CONSTM1_RTX. */
+ tlsdesc_val = nullptr;
- rtx dest = SET_DEST (set);
- machine_mode mode = GET_MODE (dest);
- /* Skip non-vector instruction. */
- if (!VECTOR_MODE_P (mode))
- continue;
+ attr_tls64 tls64 = get_attr_tls64 (insn);
+ switch (tls64)
+ {
+ case TLS64_GD:
+ case TLS64_LD_BASE:
+ /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. */
+ if (candidate_gnu_tls_p (insn, tls64))
+ break;
+ continue;
- rtx src = SET_SRC (set);
- /* Skip non-vector load instruction. */
- if (!REG_P (dest) && !SUBREG_P (dest))
- continue;
+ case TLS64_CALL:
+ case TLS64_COMBINE:
+ /* Verify UNSPEC_TLSDESC. */
+ if (candidate_gnu2_tls_p (set, tls64))
+ break;
+ continue;
- rtx_insn *def_insn;
- machine_mode scalar_mode;
- x86_cse_kind kind;
- rtx val = ix86_broadcast_inner (src, mode, &scalar_mode,
- &kind, &def_insn);
- if (!val)
- continue;
+ case TLS64_LEA:
+ /* Skip TLS64_LEA. */
+ continue;
- /* Remove redundant register loads if there are more than 2
- loads will be used. */
- unsigned int threshold = 2;
+ case TLS64_NONE:
+ if (!set)
+ continue;
- /* Check if there is a matching redundant vector load. */
- bool matched = false;
+ /* Check for vector broadcast. */
+ if (candidate_vector_p (set))
+ break;
+ continue;
+ }
+
+ /* Check if there is a matching redundant load. */
FOR_EACH_VEC_ELT (loads, i, load)
if (load->val
&& load->kind == kind
&& load->mode == scalar_mode
&& (load->bb == bb
- || kind < X86_CSE_VEC_DUP
+ || kind != X86_CSE_VEC_DUP
/* Non all 0s/1s vector load must be in the same
basic block if it is in a recursive call. */
|| !recursive_call_p)
&& rtx_equal_p (load->val, val))
{
- /* Record vector instruction. */
+ /* Record instruction. */
bitmap_set_bit (load->insns, INSN_UID (insn));
/* Record the maximum vector size. */
- if (load->size < GET_MODE_SIZE (mode))
+ if (kind <= X86_CSE_VEC_DUP
+ && load->size < GET_MODE_SIZE (mode))
load->size = GET_MODE_SIZE (mode);
/* Record the basic block. */
bitmap_set_bit (load->bbs, bb->index);
+
+ /* Increment the count. */
load->count++;
+
matched = true;
break;
}
@@ -3762,10 +4486,17 @@ remove_redundant_vector_load (void)
if (matched)
continue;
- /* We see this vector broadcast the first time. */
- load = new redundant_load;
+      /* We see this instruction for the first time.  Record the
+	 redundant source value, its mode, the destination size, the
+	 instruction which defines the redundant source value, the
+	 instruction's basic block and the instruction kind.  */
+ load = new redundant_pattern;
load->val = copy_rtx (val);
+ if (tlsdesc_val)
+ load->tlsdesc_val = copy_rtx (tlsdesc_val);
+ else
+ load->tlsdesc_val = nullptr;
load->mode = scalar_mode;
load->size = GET_MODE_SIZE (mode);
load->def_insn = def_insn;
@@ -3782,49 +4513,64 @@ remove_redundant_vector_load (void)
}
bool replaced = false;
- rtx reg, broadcast_source, broadcast_reg;
FOR_EACH_VEC_ELT (loads, i, load)
if (load->count >= load->threshold)
{
- machine_mode mode = ix86_get_vector_cse_mode (load->size,
- load->mode);
- broadcast_reg = gen_reg_rtx (mode);
- if (load->def_insn)
- {
- /* Replace redundant vector loads with a single vector load
- in the same basic block. */
- reg = load->val;
- if (load->mode != GET_MODE (reg))
- reg = gen_rtx_SUBREG (load->mode, reg, 0);
- broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
- replace_vector_const (mode, broadcast_reg, load->insns,
- load->mode);
- }
- else
+ machine_mode mode;
+ rtx reg, broadcast_source, broadcast_reg;
+ replaced = true;
+ switch (load->kind)
{
- /* This is a constant integer/double vector. If the
- inner scalar is 0 or -1, set vector to CONST0_RTX
- or CONSTM1_RTX directly. */
- rtx reg;
- switch (load->kind)
+ case X86_CSE_TLS_GD:
+ case X86_CSE_TLS_LD_BASE:
+ case X86_CSE_TLSDESC:
+ broadcast_reg = gen_reg_rtx (load->mode);
+ replace_tls_call (broadcast_reg, load->insns,
+ (load->kind == X86_CSE_TLSDESC
+ ? updated_gnu2_tls_insns
+ : updated_gnu_tls_insns));
+ load->broadcast_reg = broadcast_reg;
+ break;
+
+ case X86_CSE_CONST0_VECTOR:
+ case X86_CSE_CONSTM1_VECTOR:
+ case X86_CSE_VEC_DUP:
+ mode = ix86_get_vector_cse_mode (load->size, load->mode);
+ broadcast_reg = gen_reg_rtx (mode);
+ if (load->def_insn)
{
- case X86_CSE_CONST0_VECTOR:
- broadcast_source = CONST0_RTX (mode);
- break;
- case X86_CSE_CONSTM1_VECTOR:
- broadcast_source = CONSTM1_RTX (mode);
- break;
- default:
- reg = gen_reg_rtx (load->mode);
+ /* Replace redundant vector loads with a single vector
+ load in the same basic block. */
+ reg = load->val;
+ if (load->mode != GET_MODE (reg))
+ reg = gen_rtx_SUBREG (load->mode, reg, 0);
broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
- break;
}
+ else
+ /* This is a constant integer/double vector. If the
+ inner scalar is 0 or -1, set vector to CONST0_RTX
+ or CONSTM1_RTX directly. */
+ switch (load->kind)
+ {
+ case X86_CSE_CONST0_VECTOR:
+ broadcast_source = CONST0_RTX (mode);
+ break;
+ case X86_CSE_CONSTM1_VECTOR:
+ broadcast_source = CONSTM1_RTX (mode);
+ break;
+ case X86_CSE_VEC_DUP:
+ reg = gen_reg_rtx (load->mode);
+ broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
+ break;
+ default:
+ gcc_unreachable ();
+ }
replace_vector_const (mode, broadcast_reg, load->insns,
load->mode);
+ load->broadcast_source = broadcast_source;
+ load->broadcast_reg = broadcast_reg;
+ break;
}
- load->broadcast_source = broadcast_source;
- load->broadcast_reg = broadcast_reg;
- replaced = true;
}
if (replaced)
@@ -3839,43 +4585,75 @@ remove_redundant_vector_load (void)
FOR_EACH_VEC_ELT (loads, i, load)
if (load->count >= load->threshold)
{
+ rtx set;
if (load->def_insn)
- {
- /* Insert a broadcast after the original scalar
- definition. */
- rtx set = gen_rtx_SET (load->broadcast_reg,
- load->broadcast_source);
- insn = emit_insn_after (set, load->def_insn);
-
- if (cfun->can_throw_non_call_exceptions)
- {
- /* Handle REG_EH_REGION note in DEF_INSN. */
- rtx note = find_reg_note (load->def_insn,
- REG_EH_REGION, nullptr);
- if (note)
- {
- control_flow_insns.safe_push (load->def_insn);
- add_reg_note (insn, REG_EH_REGION,
- XEXP (note, 0));
- }
- }
+ switch (load->kind)
+ {
+ case X86_CSE_TLSDESC:
+ ix86_place_single_tls_call (load->broadcast_reg,
+ load->tlsdesc_val,
+ load->kind,
+ load->bbs,
+ updated_gnu_tls_insns,
+ updated_gnu2_tls_insns,
+ PATTERN (load->def_insn));
+ break;
+ case X86_CSE_VEC_DUP:
+ /* Insert a broadcast after the original scalar
+ definition. */
+ set = gen_rtx_SET (load->broadcast_reg,
+ load->broadcast_source);
+ insn = emit_insn_after (set, load->def_insn);
+
+ if (cfun->can_throw_non_call_exceptions)
+ {
+ /* Handle REG_EH_REGION note in DEF_INSN. */
+ rtx note = find_reg_note (load->def_insn,
+ REG_EH_REGION, nullptr);
+ if (note)
+ {
+ control_flow_insns.safe_push (load->def_insn);
+ add_reg_note (insn, REG_EH_REGION,
+ XEXP (note, 0));
+ }
+ }
- if (dump_file)
- {
- fprintf (dump_file, "\nAdd:\n\n");
- print_rtl_single (dump_file, insn);
- fprintf (dump_file, "\nafter:\n\n");
- print_rtl_single (dump_file, load->def_insn);
- fprintf (dump_file, "\n");
- }
- }
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nAdd:\n\n");
+ print_rtl_single (dump_file, insn);
+ fprintf (dump_file, "\nafter:\n\n");
+ print_rtl_single (dump_file, load->def_insn);
+ fprintf (dump_file, "\n");
+ }
+ break;
+ default:
+ gcc_unreachable ();
+ }
else
- ix86_place_single_vector_set (load->broadcast_reg,
- load->broadcast_source,
- load->bbs,
- (load->kind == X86_CSE_VEC_DUP
- ? load->val
- : nullptr));
+ switch (load->kind)
+ {
+ case X86_CSE_TLS_GD:
+ case X86_CSE_TLS_LD_BASE:
+ case X86_CSE_TLSDESC:
+ ix86_place_single_tls_call (load->broadcast_reg,
+ (load->kind == X86_CSE_TLSDESC
+ ? load->tlsdesc_val
+ : load->val),
+ load->kind,
+ load->bbs,
+ updated_gnu_tls_insns,
+ updated_gnu2_tls_insns);
+ break;
+ case X86_CSE_CONST0_VECTOR:
+ case X86_CSE_CONSTM1_VECTOR:
+ case X86_CSE_VEC_DUP:
+ ix86_place_single_vector_set (load->broadcast_reg,
+ load->broadcast_source,
+ load->bbs,
+ load);
+ break;
+ }
}
loop_optimizer_finalize ();
@@ -3905,48 +4683,12 @@ remove_redundant_vector_load (void)
return 0;
}
-namespace {
-
-const pass_data pass_data_remove_redundant_vector_load =
-{
- RTL_PASS, /* type */
- "rrvl", /* name */
- OPTGROUP_NONE, /* optinfo_flags */
- TV_MACH_DEP, /* tv_id */
- 0, /* properties_required */
- 0, /* properties_provided */
- 0, /* properties_destroyed */
- 0, /* todo_flags_start */
- 0, /* todo_flags_finish */
-};
-
-class pass_remove_redundant_vector_load : public rtl_opt_pass
-{
-public:
- pass_remove_redundant_vector_load (gcc::context *ctxt)
- : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt)
- {}
-
- /* opt_pass methods: */
- bool gate (function *fun) final override
- {
- return (TARGET_SSE2
- && optimize
- && optimize_function_for_speed_p (fun));
- }
-
- unsigned int execute (function *) final override
- {
- return remove_redundant_vector_load ();
- }
-}; // class pass_remove_redundant_vector_load
-
} // anon namespace
rtl_opt_pass *
-make_pass_remove_redundant_vector_load (gcc::context *ctxt)
+make_pass_x86_cse (gcc::context *ctxt)
{
- return new pass_remove_redundant_vector_load (ctxt);
+ return new pass_x86_cse (ctxt);
}
/* Convert legacy instructions that clobber EFLAGS to APX_NF
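Reviewer note: to see what the TLS side of the renamed x86_cse pass catches, consider general-dynamic accesses in separate basic blocks. Each access expands to its own __tls_get_addr call, and the pass now keeps a single call at the nearest common dominator. A hedged example for -O2 -fPIC -mtls-dialect=gnu on x86-64; exact codegen depends on flags:

extern __thread int counter;

int
bump (int flag)
{
  if (flag)
    return ++counter;	/* was: its own __tls_get_addr call */
  return --counter;	/* was: a second, redundant call */
}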
diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
index 2fedbeb..c2db305 100644
--- a/gcc/config/i386/i386-modes.def
+++ b/gcc/config/i386/i386-modes.def
@@ -91,7 +91,6 @@ VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */
VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */
VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */
VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */
-VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */
VECTOR_MODE (FLOAT, HF, 2); /* V2HF */
VECTOR_MODE (FLOAT, BF, 2); /* V2BF */
VECTOR_MODE (FLOAT, HF, 6); /* V6HF */
@@ -102,7 +101,6 @@ VECTOR_MODE (INT, QI, 2); /* V2QI */
VECTOR_MODE (INT, QI, 12); /* V12QI */
VECTOR_MODE (INT, QI, 14); /* V14QI */
VECTOR_MODE (INT, HI, 6); /* V6HI */
-VECTOR_MODE (INT, SI, 64); /* V64SI */
INT_MODE (OI, 32);
INT_MODE (XI, 64);
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index ca6bb83..abb5dd7 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1172,6 +1172,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
OPT_mrecip,
MASK_RECIP),
+ IX86_ATTR_YES ("80387",
+ OPT_m80387,
+ MASK_80387),
+
IX86_ATTR_IX86_YES ("general-regs-only",
OPT_mgeneral_regs_only,
OPTION_MASK_GENERAL_REGS_ONLY),
@@ -1281,6 +1285,8 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
else if (type == ix86_opt_yes || type == ix86_opt_no)
{
+ opts_set->x_target_flags |= mask;
+
if (type == ix86_opt_no)
opt_set_p = !opt_set_p;
@@ -3556,6 +3562,10 @@ ix86_set_current_function (tree fndecl)
isa = "AVX";
else if (cfun->machine->func_type != TYPE_NORMAL)
isa = "SSE";
+ else if (TARGET_MMX)
+ isa = "MMX/3Dnow";
+ else if (TARGET_80387)
+ isa = "80387";
else
isa = NULL;
}
@@ -3615,6 +3625,18 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
return NULL_TREE;
}
+ if (TARGET_64BIT)
+ {
+ /* Do not warn when emulating the MS ABI. */
+ if ((TREE_CODE (*node) != FUNCTION_TYPE
+ && TREE_CODE (*node) != METHOD_TYPE)
+ || ix86_function_type_abi (*node) != MS_ABI)
+ warning (OPT_Wattributes, "%qE attribute ignored",
+ name);
+ *no_add_attrs = true;
+ return NULL_TREE;
+ }
+
/* Can combine regparm with all attributes but fastcall, and thiscall. */
if (is_attribute_p ("regparm", name))
{
@@ -3627,7 +3649,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
{
- error ("regparam and thiscall attributes are not compatible");
+ error ("regparm and thiscall attributes are not compatible");
}
cst = TREE_VALUE (args);
@@ -3648,19 +3670,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
return NULL_TREE;
}
- if (TARGET_64BIT)
- {
- /* Do not warn when emulating the MS ABI. */
- if ((TREE_CODE (*node) != FUNCTION_TYPE
- && TREE_CODE (*node) != METHOD_TYPE)
- || ix86_function_type_abi (*node) != MS_ABI)
- warning (OPT_Wattributes, "%qE attribute ignored",
- name);
- *no_add_attrs = true;
- return NULL_TREE;
- }
-
- /* Can combine fastcall with stdcall (redundant) and sseregparm. */
+ /* Can combine fastcall with sseregparm. */
if (is_attribute_p ("fastcall", name))
{
if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
@@ -3681,8 +3691,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
}
}
- /* Can combine stdcall with fastcall (redundant), regparm and
- sseregparm. */
+ /* Can combine stdcall with regparm and sseregparm. */
else if (is_attribute_p ("stdcall", name))
{
if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
@@ -3732,6 +3741,10 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
{
error ("cdecl and thiscall attributes are not compatible");
}
+ if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
+ {
+ error ("regparm and thiscall attributes are not compatible");
+ }
}
/* Can combine sseregparm with all attributes. */
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 06f0288..553b46d 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -35,6 +35,6 @@ along with GCC; see the file COPYING3. If not see
PR116174. */
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops);
- INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_redundant_vector_load);
+ INSERT_PASS_AFTER (pass_late_combine, 1, pass_x86_cse);
INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency);
INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 69bc0ee..bdb8bb9 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -290,6 +290,7 @@ extern rtx ix86_tls_module_base (void);
extern bool ix86_gpr_tls_address_pattern_p (rtx);
extern bool ix86_tls_address_pattern_p (rtx);
extern rtx ix86_rewrite_tls_address (rtx);
+extern rtx ix86_tls_get_addr (void);
extern void ix86_expand_vector_init (bool, rtx, rtx);
extern void ix86_expand_vector_set (bool, rtx, rtx, int);
@@ -430,8 +431,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
(gcc::context *);
extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
(gcc::context *);
-extern rtl_opt_pass *make_pass_remove_redundant_vector_load
- (gcc::context *);
+extern rtl_opt_pass *make_pass_x86_cse (gcc::context *);
extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *);
extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
@@ -448,3 +448,4 @@ extern void ix86_set_handled_components (sbitmap);
/* In i386-expand.cc. */
bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
HOST_WIDE_INT*);
+rtx ix86_vgf2p8affine_shift_matrix (rtx, rtx, enum rtx_code);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 4682db85..471be3e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -11382,6 +11382,23 @@ ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
return cost;
}
+
+/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P. */
+
+bool
+ix86_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
+ unsigned int align,
+ enum by_pieces_operation op,
+ bool speed_p)
+{
+ /* Return true when we are currently expanding memcpy/memset epilogue
+ with move_by_pieces or store_by_pieces. */
+ if (cfun->machine->by_pieces_in_use)
+ return true;
+
+ return default_use_by_pieces_infrastructure_p (size, align, op,
+ speed_p);
+}
/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
this is used to form addresses to local data when -fPIC is in
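Reviewer note: a hedged illustration of the new hook above. expand_cpymem_epilogue and expand_setmem_epilogue set by_pieces_in_use around their move_by_pieces/store_by_pieces calls, so for a copy like the one below the tail bytes left after the main loop are expanded by-pieces even where the default size heuristic would decline:

#include <string.h>

void
copy_tail (char *dst, const char *src)
{
  /* Assumed scenario: the vector loop copies the bulk, and the
     remaining countval % max_size bytes go through move_by_pieces
     while cfun->machine->by_pieces_in_use is true.  */
  memcpy (dst, src, 61);
}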
@@ -12439,9 +12456,31 @@ ix86_tls_index (void)
static GTY(()) rtx ix86_tls_symbol;
-static rtx
+rtx
ix86_tls_get_addr (void)
{
+ if (cfun->machine->call_saved_registers
+ == TYPE_NO_CALLER_SAVED_REGISTERS)
+ {
+ /* __tls_get_addr doesn't preserve vector registers. When a
+ function with no_caller_saved_registers attribute calls
+ __tls_get_addr, YMM and ZMM registers will be clobbered.
+ Issue an error and suggest -mtls-dialect=gnu2 in this case. */
+ if (cfun->machine->func_type == TYPE_NORMAL)
+ error (G_("%<-mtls-dialect=gnu2%> must be used with a function"
+ " with the %<no_caller_saved_registers%> attribute"));
+ else
+ error (cfun->machine->func_type == TYPE_EXCEPTION
+ ? G_("%<-mtls-dialect=gnu2%> must be used with an"
+ " exception service routine")
+ : G_("%<-mtls-dialect=gnu2%> must be used with an"
+ " interrupt service routine"));
+ /* Don't issue the same error twice. */
+ cfun->machine->func_type = TYPE_NORMAL;
+ cfun->machine->call_saved_registers
+ = TYPE_DEFAULT_CALL_SAVED_REGISTERS;
+ }
+
if (!ix86_tls_symbol)
{
const char *sym
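Reviewer note: the new diagnostic fires when a function that must preserve all registers reaches __tls_get_addr, which does not preserve vector registers. A hypothetical test case; compiled with -O2 -fPIC -mtls-dialect=gnu this is now rejected with a suggestion to use -mtls-dialect=gnu2:

extern __thread int tls_var;

__attribute__ ((no_caller_saved_registers))
int
read_tls (void)
{
  return tls_var;	/* error: -mtls-dialect=gnu2 required here */
}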
@@ -20007,7 +20046,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
tree utype, ures, vce;
utype = unsigned_type_for (TREE_TYPE (arg0));
/* PABSB/W/D/Q store the unsigned result in dst, use ABSU_EXPR
- instead of ABS_EXPR to hanlde overflow case(TYPE_MIN). */
+ instead of ABS_EXPR to handle overflow case(TYPE_MIN). */
ures = gimple_build (&stmts, ABSU_EXPR, utype, arg0);
gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
loc = gimple_location (stmt);
@@ -21491,8 +21530,7 @@ ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
/* Register pair for mask registers. */
if (mode == P2QImode || mode == P2HImode)
return 2;
- if (mode == V64SFmode || mode == V64SImode)
- return 4;
+
return 1;
}
@@ -22081,6 +22119,15 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
}
/* FALLTHRU */
case V32QImode:
+ if (TARGET_GFNI && constant_op1)
+ {
+	  /* Use vgf2p8affine.  One extra load for the mask, but in a loop
+	     with enough registers it will be moved out.  So for now don't
+	     account for the constant mask load.  This is not quite right
+	     for non-loop vectorization.  */
+ extra = 0;
+ return ix86_vec_cost (mode, cost->sse_op) + extra;
+ }
if (TARGET_AVX2)
/* Use vpbroadcast. */
extra = cost->sse_op;
@@ -22115,6 +22162,11 @@ ix86_shift_rotate_cost (const struct processor_costs *cost,
count = 9;
return ix86_vec_cost (mode, cost->sse_op * count) + extra;
+ case V64QImode:
+ /* Ignore the mask load for GF2P8AFFINEQB. */
+ extra = 0;
+ return ix86_vec_cost (mode, cost->sse_op) + extra;
+
case V2DImode:
case V4DImode:
/* V*DImode arithmetic right shift is emulated. */
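Reviewer note: the GFNI rows above model a constant per-byte shift as one vgf2p8affineqb plus a hoistable constant matrix load. A hedged example of a loop where this cost applies (exact codegen depends on the enabled ISA):

void
srl3 (unsigned char *restrict dst, const unsigned char *restrict src)
{
  /* With e.g. -O2 -mgfni -mavx2 this can vectorize to a V32QImode
     vgf2p8affineqb using the LSHIFTRT matrix for count 3, instead of
     the multi-instruction emulation of QImode shifts.  */
  for (int i = 0; i < 1024; i++)
    dst[i] = src[i] >> 3;
}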
@@ -23132,7 +23184,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
So current solution is make constant disp as cheap as possible. */
if (GET_CODE (addr) == PLUS
&& x86_64_immediate_operand (XEXP (addr, 1), Pmode)
- /* Only hanlde (reg + disp) since other forms of addr are mostly LEA,
+ /* Only handle (reg + disp) since other forms of addr are mostly LEA,
there's no additional cost for the plus of disp. */
&& register_operand (XEXP (addr, 0), Pmode))
{
@@ -25211,20 +25263,14 @@ asm_preferred_eh_data_format (int code, int global)
return DW_EH_PE_absptr;
}
-/* Implement targetm.vectorize.builtin_vectorization_cost. */
+/* Worker for ix86_builtin_vectorization_cost and the fallback calls
+ from ix86_vector_costs::add_stmt_cost. */
static int
-ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype, int)
+ix86_default_vector_cost (enum vect_cost_for_stmt type_of_cost,
+ machine_mode mode)
{
- bool fp = false;
- machine_mode mode = TImode;
+ bool fp = FLOAT_MODE_P (mode);
int index;
- if (vectype != NULL)
- {
- fp = FLOAT_TYPE_P (vectype);
- mode = TYPE_MODE (vectype);
- }
-
switch (type_of_cost)
{
case scalar_stmt:
@@ -25283,14 +25329,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
COSTS_N_INSNS
(ix86_cost->gather_static
+ ix86_cost->gather_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ * GET_MODE_NUNITS (mode)) / 2);
case vector_scatter_store:
return ix86_vec_cost (mode,
COSTS_N_INSNS
(ix86_cost->scatter_static
+ ix86_cost->scatter_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ * GET_MODE_NUNITS (mode)) / 2);
case cond_branch_taken:
return ix86_cost->cond_taken_branch_cost;
@@ -25308,7 +25354,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
case vec_construct:
{
- int n = TYPE_VECTOR_SUBPARTS (vectype);
+ int n = GET_MODE_NUNITS (mode);
/* N - 1 element inserts into an SSE vector, the possible
GPR -> XMM move is accounted for in add_stmt_cost. */
if (GET_MODE_BITSIZE (mode) <= 128)
@@ -25336,6 +25382,17 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
}
}
+/* Implement targetm.vectorize.builtin_vectorization_cost. */
+static int
+ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+ tree vectype, int)
+{
+ machine_mode mode = TImode;
+ if (vectype != NULL)
+ mode = TYPE_MODE (vectype);
+ return ix86_default_vector_cost (type_of_cost, mode);
+}
+
/* This function returns the calling abi specific va_list type node.
It returns the FNDECL specific va_list type. */
@@ -25768,15 +25825,20 @@ private:
unsigned m_num_sse_needed[3];
/* Number of 256-bit vector permutation. */
unsigned m_num_avx256_vec_perm[3];
+  /* Number of reductions for FMA/DOT_PROD_EXPR/SAD_EXPR.  */
+ unsigned m_num_reduc[X86_REDUC_LAST];
+ /* Don't do unroll if m_prefer_unroll is false, default is true. */
+ bool m_prefer_unroll;
};
ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
: vector_costs (vinfo, costing_for_scalar),
m_num_gpr_needed (),
m_num_sse_needed (),
- m_num_avx256_vec_perm ()
-{
-}
+ m_num_avx256_vec_perm (),
+ m_num_reduc (),
+ m_prefer_unroll (true)
+{}
/* Implement targetm.vectorize.create_costs. */
@@ -25789,7 +25851,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
unsigned
ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, slp_tree node,
- tree vectype, int misalign,
+ tree vectype, int,
vect_cost_model_location where)
{
unsigned retval = 0;
@@ -26073,6 +26135,125 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
}
}
+ /* Record number of load/store/gather/scatter in vectorized body. */
+ if (where == vect_body && !m_costing_for_scalar)
+ {
+ switch (kind)
+ {
+ /* Emulated gather/scatter or any scalarization. */
+ case scalar_load:
+ case scalar_stmt:
+ case scalar_store:
+ case vector_gather_load:
+ case vector_scatter_store:
+ m_prefer_unroll = false;
+ break;
+
+ case vector_stmt:
+	    /* Count the number of reduction FMAs and "real" DOT_PROD_EXPRs;
+	       unrolling in the vectorizer will enable partial sums.  */
+ unroll in the vectorizer will enable partial sum. */
+ if (stmt_info
+ && vect_is_reduction (stmt_info)
+ && stmt_info->stmt)
+ {
+ /* Handle __builtin_fma. */
+ if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA)
+ {
+ m_num_reduc[X86_REDUC_FMA] += count;
+ break;
+ }
+
+ if (!is_gimple_assign (stmt_info->stmt))
+ break;
+
+ tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
+ machine_mode inner_mode = GET_MODE_INNER (mode);
+ tree rhs1, rhs2;
+ bool native_vnni_p = true;
+ gimple* def;
+ machine_mode mode_rhs;
+ switch (subcode)
+ {
+ case PLUS_EXPR:
+ case MINUS_EXPR:
+ if (!fp || !flag_associative_math
+ || flag_fp_contract_mode != FP_CONTRACT_FAST)
+ break;
+
+ /* FMA condition for different modes. */
+ if (((inner_mode == DFmode || inner_mode == SFmode)
+ && !TARGET_FMA && !TARGET_AVX512VL)
+ || (inner_mode == HFmode && !TARGET_AVX512FP16)
+ || (inner_mode == BFmode && !TARGET_AVX10_2))
+ break;
+
+ /* MULT_EXPR + PLUS_EXPR/MINUS_EXPR is transformed
+ to FMA/FNMA after vectorization. */
+ rhs1 = gimple_assign_rhs1 (stmt_info->stmt);
+ rhs2 = gimple_assign_rhs2 (stmt_info->stmt);
+ if (subcode == PLUS_EXPR
+ && TREE_CODE (rhs1) == SSA_NAME
+ && (def = SSA_NAME_DEF_STMT (rhs1), true)
+ && is_gimple_assign (def)
+ && gimple_assign_rhs_code (def) == MULT_EXPR)
+ m_num_reduc[X86_REDUC_FMA] += count;
+ else if (TREE_CODE (rhs2) == SSA_NAME
+ && (def = SSA_NAME_DEF_STMT (rhs2), true)
+ && is_gimple_assign (def)
+ && gimple_assign_rhs_code (def) == MULT_EXPR)
+ m_num_reduc[X86_REDUC_FMA] += count;
+ break;
+
+	      /* Vectorizer lane_reducing_op_p supports DOT_PROD_EXPR,
+		 WIDEN_SUM_EXPR and SAD_EXPR; the x86 backend only
+		 supports SAD_EXPR (usad{v16qi,v32qi,v64qi}) and
+		 DOT_PROD_EXPR.  */
+ case DOT_PROD_EXPR:
+ rhs1 = gimple_assign_rhs1 (stmt_info->stmt);
+ mode_rhs = TYPE_MODE (TREE_TYPE (rhs1));
+ if (mode_rhs == QImode)
+ {
+ rhs2 = gimple_assign_rhs2 (stmt_info->stmt);
+ signop signop1_p = TYPE_SIGN (TREE_TYPE (rhs1));
+ signop signop2_p = TYPE_SIGN (TREE_TYPE (rhs2));
+
+ /* vpdpbusd. */
+ if (signop1_p != signop2_p)
+ native_vnni_p
+ = (GET_MODE_SIZE (mode) == 64
+ ? TARGET_AVX512VNNI
+ : ((TARGET_AVX512VNNI && TARGET_AVX512VL)
+ || TARGET_AVXVNNI));
+ else
+ /* vpdpbssd. */
+ native_vnni_p
+ = (GET_MODE_SIZE (mode) == 64
+ ? TARGET_AVX10_2
+ : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2));
+ }
+ m_num_reduc[X86_REDUC_DOT_PROD] += count;
+
+	      /* Avoid unrolling and partial sums for emulated
+		 DOT_PROD_EXPR.  */
+ if (!native_vnni_p)
+ m_num_reduc[X86_REDUC_DOT_PROD] += 3 * count;
+ break;
+
+ case SAD_EXPR:
+ m_num_reduc[X86_REDUC_SAD] += count;
+ break;
+
+ default:
+ break;
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+ }
+
combined_fn cfn;
if ((kind == vector_stmt || kind == scalar_stmt)
&& stmt_info
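
For reference, a sketch of user loops that exercise the new counting above
(illustrative only; the function names and flags are this note's assumptions,
not part of the patch):

  /* With -O3 -mavx512vnni -ffast-math, the first reduction is contracted
     to FMA and counted as X86_REDUC_FMA; the second becomes a
     DOT_PROD_EXPR with mixed-sign QImode operands (vpdpbusd) and is
     counted as X86_REDUC_DOT_PROD.  */
  float fma_reduc (const float *a, const float *b, int n)
  {
    float s = 0.0f;
    for (int i = 0; i < n; i++)
      s += a[i] * b[i];
    return s;
  }

  int dot_reduc (const unsigned char *a, const signed char *b, int n)
  {
    int s = 0;
    for (int i = 0; i < n; i++)
      s += a[i] * b[i];
    return s;
  }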
@@ -26128,32 +26309,23 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
(AGU and load ports). Try to account for this by scaling the
construction cost by the number of elements involved. */
if ((kind == vec_construct || kind == vec_to_scalar)
- && ((stmt_info
- && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
- || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
- && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+ && ((node
+ && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
+ || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP
+ && SLP_TREE_LANES (node) == 1))
+ && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
+ (SLP_TREE_REPRESENTATIVE (node))))
!= INTEGER_CST))
- || (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)
- == VMAT_GATHER_SCATTER)))
- || (node
- && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
- || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP
- && SLP_TREE_LANES (node) == 1))
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
- (SLP_TREE_REPRESENTATIVE (node))))
- != INTEGER_CST))
- || (SLP_TREE_MEMORY_ACCESS_TYPE (node)
- == VMAT_GATHER_SCATTER)))))
- {
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ || mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))))))
+ {
+ stmt_cost = ix86_default_vector_cost (kind, mode);
stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
}
else if ((kind == vec_construct || kind == scalar_to_vec)
&& node
&& SLP_TREE_DEF_TYPE (node) == vect_external_def)
{
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ stmt_cost = ix86_default_vector_cost (kind, mode);
unsigned i;
tree op;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
@@ -26217,7 +26389,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
TREE_VISITED (op) = 0;
}
if (stmt_cost == -1)
- stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ stmt_cost = ix86_default_vector_cost (kind, mode);
if (kind == vec_perm && vectype
&& GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
@@ -26288,6 +26460,41 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
&& (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())
> ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
m_costs[vect_body] = INT_MAX;
+
+ bool any_reduc_p = false;
+ for (int i = 0; i != X86_REDUC_LAST; i++)
+ if (m_num_reduc[i])
+ {
+ any_reduc_p = true;
+ break;
+ }
+
+ if (any_reduc_p
+ /* Not much gain for loops with gathers and scatters. */
+ && m_prefer_unroll
+ && !LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+ {
+ unsigned unroll_factor
+ = OPTION_SET_P (ix86_vect_unroll_limit)
+ ? ix86_vect_unroll_limit
+ : ix86_cost->vect_unroll_limit;
+
+ if (unroll_factor > 1)
+ {
+ for (int i = 0; i != X86_REDUC_LAST; i++)
+ {
+ if (m_num_reduc[i])
+ {
+ unsigned tmp = CEIL (ix86_cost->reduc_lat_mult_thr[i],
+ m_num_reduc[i]);
+ unroll_factor = MIN (unroll_factor, tmp);
+ }
+ }
+
+ m_suggested_unroll_factor = 1 << ceil_log2 (unroll_factor);
+ }
+ }
}
ix86_vect_estimate_reg_pressure ();
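
To make the unroll computation concrete, a worked example using the znver4
numbers added later in this patch ({8, 8, 6} with vect_unroll_limit 4): for a
loop body containing three FMA reduction statements,

  unroll_factor = MIN (4 /* vect_unroll_limit */,
                       CEIL (8 /* reduc_lat_mult_thr[X86_REDUC_FMA] */, 3))
                = 3
  m_suggested_unroll_factor = 1 << ceil_log2 (3) = 4

Note that the final power-of-two rounding is upward, so the suggestion can
exceed the smallest per-kind quotient.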
@@ -27171,9 +27378,9 @@ ix86_memtag_can_tag_addresses ()
return ix86_lam_type != lam_none && TARGET_LP64;
}
-/* Implement TARGET_MEMTAG_TAG_SIZE. */
+/* Implement TARGET_MEMTAG_TAG_BITSIZE. */
unsigned char
-ix86_memtag_tag_size ()
+ix86_memtag_tag_bitsize ()
{
return IX86_HWASAN_TAG_SIZE;
}
@@ -27744,6 +27951,10 @@ static const scoped_attribute_specs *const ix86_attribute_table[] =
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost
+#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
+ ix86_use_by_pieces_infrastructure_p
+
#undef TARGET_OVERLAP_OP_BY_PIECES_P
#define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true
@@ -28147,8 +28358,8 @@ ix86_libgcc_floating_mode_supported_p
#undef TARGET_MEMTAG_UNTAGGED_POINTER
#define TARGET_MEMTAG_UNTAGGED_POINTER ix86_memtag_untagged_pointer
-#undef TARGET_MEMTAG_TAG_SIZE
-#define TARGET_MEMTAG_TAG_SIZE ix86_memtag_tag_size
+#undef TARGET_MEMTAG_TAG_BITSIZE
+#define TARGET_MEMTAG_TAG_BITSIZE ix86_memtag_tag_bitsize
#undef TARGET_GEN_CCMP_FIRST
#define TARGET_GEN_CCMP_FIRST ix86_gen_ccmp_first
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 791f3b9..ac0ce68 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -102,6 +102,15 @@ struct stringop_algs
#define COSTS_N_BYTES(N) ((N) * 2)
#endif
+
+enum ix86_reduc_unroll_factor {
+ X86_REDUC_FMA,
+ X86_REDUC_DOT_PROD,
+ X86_REDUC_SAD,
+
+ X86_REDUC_LAST
+};
+
/* Define the specific costs for a given cpu. NB: hard_register is used
by TARGET_REGISTER_MOVE_COST and TARGET_MEMORY_MOVE_COST to compute
hard register move costs by register allocator. Relative costs of
@@ -225,6 +234,13 @@ struct processor_costs {
to number of instructions executed in
parallel. See also
ix86_reassociation_width. */
+ const unsigned reduc_lat_mult_thr[X86_REDUC_LAST];
+ /* Latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ const unsigned vect_unroll_limit; /* Limit how much the autovectorizer
+ may unroll a loop. */
struct stringop_algs *memcpy, *memset;
const int cond_taken_branch_cost; /* Cost of taken branch for vectorizer
cost model. */
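
A note on the units of reduc_lat_mult_thr (an inference from the numbers in
x86-tune-costs.h below, not something the patch states): each entry looks like
instruction latency multiplied by per-cycle throughput (a 4-cycle FMA issuing
on two pipes gives the 8 seen in the skylake and znver3/znver4 rows), so
dividing it by the number of reduction statements in the loop body estimates
how many parallel partial sums are needed to hide the latency chain.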
@@ -644,7 +660,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
{"cpu_64", "%{" OPT_ARCH64 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \
{"arch", "%{!march=*:-march=%(VALUE)}"}, \
{"arch_32", "%{" OPT_ARCH32 ":%{!march=*:-march=%(VALUE)}}"}, \
- {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"},
+ {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"}, \
+ {"tls", "%{!mtls-dialect=*:-mtls-dialect=%(VALUE)}"},
/* Specs for the compiler proper */
@@ -2477,9 +2494,9 @@ constexpr wide_int_bitmask PTA_DIAMONDRAPIDS = PTA_GRANITERAPIDS_D
| PTA_MOVRS | PTA_AMX_MOVRS | PTA_USER_MSR;
constexpr wide_int_bitmask PTA_BDVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
- | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3
- | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
- | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE;
+ | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_POPCNT | PTA_LZCNT
+ | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL
+ | PTA_AVX | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE;
constexpr wide_int_bitmask PTA_BDVER2 = PTA_BDVER1 | PTA_BMI | PTA_TBM
| PTA_F16C | PTA_FMA;
constexpr wide_int_bitmask PTA_BDVER3 = PTA_BDVER2 | PTA_XSAVEOPT
@@ -2487,13 +2504,13 @@ constexpr wide_int_bitmask PTA_BDVER3 = PTA_BDVER2 | PTA_XSAVEOPT
constexpr wide_int_bitmask PTA_BDVER4 = PTA_BDVER3 | PTA_AVX2 | PTA_BMI2
| PTA_RDRND | PTA_MOVBE | PTA_MWAITX;
-constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2
- | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
- | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2 | PTA_BMI | PTA_BMI2
- | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT
- | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
- | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES | PTA_SHA | PTA_LZCNT
- | PTA_POPCNT;
+constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
+ | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_POPCNT | PTA_LZCNT
+ | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL
+ | PTA_AVX | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
+ | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE
+ | PTA_MWAITX | PTA_ADX | PTA_RDSEED | PTA_CLZERO | PTA_CLFLUSHOPT
+ | PTA_XSAVEC | PTA_XSAVES | PTA_SHA;
constexpr wide_int_bitmask PTA_ZNVER2 = PTA_ZNVER1 | PTA_CLWB | PTA_RDPID
| PTA_WBNOINVD;
constexpr wide_int_bitmask PTA_ZNVER3 = PTA_ZNVER2 | PTA_VAES | PTA_VPCLMULQDQ
@@ -2506,19 +2523,19 @@ constexpr wide_int_bitmask PTA_ZNVER5 = PTA_ZNVER4 | PTA_AVXVNNI
| PTA_MOVDIRI | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_PREFETCHI;
constexpr wide_int_bitmask PTA_BTVER1 = PTA_64BIT | PTA_MMX | PTA_SSE
- | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16
- | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE;
+ | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_LZCNT | PTA_POPCNT
+ | PTA_ABM | PTA_CX16 | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE;
constexpr wide_int_bitmask PTA_BTVER2 = PTA_BTVER1 | PTA_SSE4_1 | PTA_SSE4_2
| PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_BMI | PTA_F16C | PTA_MOVBE
| PTA_XSAVEOPT;
constexpr wide_int_bitmask PTA_LUJIAZUI = PTA_64BIT | PTA_MMX | PTA_SSE
- | PTA_SSE2 | PTA_SSE3 | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
- | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_BMI | PTA_BMI2 | PTA_PRFCHW
- | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE
- | PTA_ADX | PTA_RDSEED | PTA_POPCNT;
+ | PTA_SSE2 | PTA_SSE3 | PTA_CX16 | PTA_LZCNT | PTA_POPCNT | PTA_ABM
+ | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_BMI
+ | PTA_BMI2 | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
+ | PTA_RDRND | PTA_MOVBE | PTA_ADX | PTA_RDSEED;
constexpr wide_int_bitmask PTA_YONGFENG = PTA_LUJIAZUI | PTA_AVX | PTA_AVX2
- | PTA_F16C | PTA_FMA | PTA_SHA | PTA_LZCNT;
+ | PTA_F16C | PTA_FMA | PTA_SHA;
#ifndef GENERATOR_FILE
@@ -2865,6 +2882,9 @@ struct GTY(()) machine_function {
approximation. */
BOOL_BITFIELD tls_descriptor_call_expanded_p : 1;
+ /* True if a TLS descriptor call is expanded more than once. */
+ BOOL_BITFIELD tls_descriptor_call_multiple_p : 1;
+
/* If true, the current function has a STATIC_CHAIN is placed on the
stack below the return address. */
BOOL_BITFIELD static_chain_on_stack : 1;
@@ -2934,6 +2954,9 @@ struct GTY(()) machine_function {
/* True if this is a recursive function. */
BOOL_BITFIELD recursive_function : 1;
+ /* True if a by_pieces operation is currently being expanded. */
+ BOOL_BITFIELD by_pieces_in_use : 1;
+
/* The largest alignment, in bytes, of stack slot actually used. */
unsigned int max_used_stack_alignment;
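
The new by_pieces_in_use flag presumably pairs with the
TARGET_USE_BY_PIECES_INFRASTRUCTURE_P hook registered earlier in this diff,
letting ix86_use_by_pieces_infrastructure_p tell whether a by-pieces expansion
is already in progress; that reading is inferred from the two visible pieces
rather than stated by the patch.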
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index eb52699..cea6c15 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -901,6 +901,10 @@
(define_attr "avx_partial_xmm_update" "false,true"
(const_string "false"))
+;; Define attribute to indicate 64-bit TLS insns.
+(define_attr "tls64" "gd,ld_base,call,combine,lea,none"
+ (const_string "none"))
+
;; Define attribute to classify add/sub insns that consumes carry flag (CF)
(define_attr "use_carry" "0,1" (const_string "0"))
@@ -1618,10 +1622,8 @@
(compare
(match_operand:QI 0 "nonimmediate_operand" "QBn")
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)))]
"ix86_match_ccmode (insn, CCmode)"
"cmp{b}\t{%h1, %0|%0, %h1}"
[(set_attr "addr" "gpr8")
@@ -1632,10 +1634,8 @@
[(set (reg FLAGS_REG)
(compare
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "const0_operand")))]
"ix86_match_ccmode (insn, CCNOmode)"
"test{b}\t%h0, %h0"
@@ -1657,10 +1657,8 @@
[(set (reg FLAGS_REG)
(compare
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "general_operand" "QnBn")))]
"ix86_match_ccmode (insn, CCmode)"
"cmp{b}\t{%1, %h0|%h0, %1}"
@@ -1672,15 +1670,11 @@
[(set (reg FLAGS_REG)
(compare
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))]
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)))]
"ix86_match_ccmode (insn, CCmode)"
"cmp{b}\t{%h1, %h0|%h0, %h1}"
[(set_attr "type" "icmp")
@@ -2968,7 +2962,8 @@
(match_operand:SWI248 1 "const_int_operand"))]
"optimize_insn_for_size_p () && optimize_size > 1
&& operands[1] != const0_rtx
- && operands[1] != constm1_rtx
+ && (operands[1] != constm1_rtx
+ || (<MODE>mode == DImode && LEGACY_INT_REG_P (operands[0])))
&& IN_RANGE (INTVAL (operands[1]), -128, 127)
&& !ix86_red_zone_used
&& REGNO (operands[0]) != SP_REG"
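
The relaxed condition admits -1 for DImode in legacy registers, presumably on
encoding-size grounds: push $-1 (2 bytes) plus pop %reg (1 byte for a legacy
register) totals 3 bytes, beating the 4-byte REX-prefixed orq $-1, %reg,
whereas the 32-bit orl $-1, %reg is already 3 bytes, so -1 stays excluded
elsewhere. This is an inference from the instruction encodings, not a claim
from the patch.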
@@ -3479,10 +3474,8 @@
[(set (strict_low_part
(match_operand:QI 0 "register_operand" "+Q"))
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"mov{b}\t{%h1, %0|%0, %h1}"
[(set_attr "type" "imov")
@@ -3565,10 +3558,8 @@
(define_insn "*extzvqi"
[(set (match_operand:QI 0 "nonimmediate_operand" "=QBn,?R")
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q,Q")]) 0))]
""
{
switch (get_attr_type (insn))
@@ -3689,10 +3680,8 @@
(match_operand 0 "int248_register_operand" "+Q")
(const_int 8)
(const_int 8))
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]))]
""
"mov{b}\t{%h1, %h0|%h0, %h1}"
[(set_attr "type" "imov")
@@ -5259,10 +5248,8 @@
[(set (match_operand:SWI24 0 "register_operand" "=R")
(sign_extend:SWI24
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))]
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)))]
""
"movs{b<SWI24:imodesuffix>|x}\t{%h1, %0|%0, %h1}"
[(set_attr "type" "imovx")
@@ -7008,10 +6995,8 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+Q,&Q"))
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)
(match_operand:QI 1 "nonimmediate_operand" "0,!qm")))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
@@ -7025,8 +7010,8 @@
[(set (strict_low_part (match_dup 0))
(plus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -7037,29 +7022,25 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+&Q"))
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"#"
"&& reload_completed"
[(set (strict_low_part (match_dup 0))
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0))
(parallel
[(set (strict_low_part (match_dup 0))
(plus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -7474,10 +7455,8 @@
[(set (match_operand:QI 0 "nonimmediate_operand" "=QBn")
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "nonimmediate_operand" "0")))
(clobber (reg:CC FLAGS_REG))]
""
@@ -7490,29 +7469,25 @@
[(set (match_operand:QI 0 "register_operand" "=&Q")
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"#"
"&& reload_completed"
[(set (match_dup 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0))
(parallel
[(set (match_dup 0)
(plus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -7542,10 +7517,8 @@
(subreg:SWI248
(plus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -7580,8 +7553,8 @@
(subreg:SWI248
(plus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -7601,15 +7574,11 @@
(subreg:SWI248
(plusminus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "<comm>0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "<comm>0,!Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)) 0))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
""
"@
@@ -7628,11 +7597,11 @@
(subreg:SWI248
(plusminus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -8229,10 +8198,8 @@
(minus:QI
(match_operand:QI 1 "nonimmediate_operand" "0,!qm")
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"@
@@ -8246,8 +8213,8 @@
(minus:QI
(match_dup 0)
(subreg:QI
- (match_op_dup 3
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -8257,30 +8224,26 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+&Q"))
(minus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"#"
"&& reload_completed"
[(set (strict_low_part (match_dup 0))
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0))
(parallel
[(set (strict_low_part (match_dup 0))
(minus:QI
(match_dup 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -8331,10 +8294,8 @@
(minus:QI
(match_operand:QI 1 "nonimmediate_operand" "0")
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"sub{b}\t{%h2, %0|%0, %h2}"
@@ -8346,30 +8307,26 @@
[(set (match_operand:QI 0 "register_operand" "=&Q")
(minus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"#"
"&& reload_completed"
[(set (match_dup 0)
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0))
(parallel
[(set (match_dup 0)
(minus:QI
(match_dup 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -8384,10 +8341,8 @@
(subreg:SWI248
(minus:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -8406,8 +8361,8 @@
(subreg:SWI248
(minus:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -12355,10 +12310,8 @@
(compare
(and:QI
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "general_operand" "QnBn"))
(const_int 0)))]
"ix86_match_ccmode (insn, CCNOmode)"
@@ -12372,15 +12325,11 @@
(compare
(and:QI
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 0 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 0 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0))
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0))
(const_int 0)))]
"ix86_match_ccmode (insn, CCNOmode)"
"test{b}\t{%h1, %h0|%h0, %h1}"
@@ -12969,10 +12918,8 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+Q,&Q"))
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)
(match_operand:QI 1 "nonimmediate_operand" "0,!qm")))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
@@ -12986,8 +12933,8 @@
[(set (strict_low_part (match_dup 0))
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -12998,29 +12945,25 @@
[(set (strict_low_part (match_operand:QI 0 "register_operand" "+&Q"))
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
"!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)"
"#"
"&& reload_completed"
[(set (strict_low_part (match_dup 0))
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0))
(parallel
[(set (strict_low_part (match_dup 0))
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -13223,10 +13166,8 @@
[(set (match_operand:QI 0 "nonimmediate_operand" "=QBn")
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)
(match_operand:QI 1 "nonimmediate_operand" "0")))
(clobber (reg:CC FLAGS_REG))]
""
@@ -13239,29 +13180,25 @@
[(set (match_operand:QI 0 "register_operand" "=&Q")
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q")
- (const_int 8)
- (const_int 8)]) 0)))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q")]) 0)))
(clobber (reg:CC FLAGS_REG))]
""
"#"
"&& reload_completed"
[(set (match_dup 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0))
(parallel
[(set (match_dup 0)
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 0)))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -13291,10 +13228,8 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -13313,8 +13248,8 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -13328,10 +13263,8 @@
(match_operator 5 "compare_operator"
[(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "general_operand" "QnBn,QnBn"))
(const_int 0)]))
(set (zero_extract:SWI248
@@ -13341,8 +13274,8 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))]
"ix86_match_ccmode (insn, CCNOmode)"
"@
@@ -13358,9 +13291,9 @@
[(set (match_dup 4)
(match_op_dup 5
[(any_logic:QI
- (subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (subreg:QI
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2))
(const_int 0)]))
(set (zero_extract:SWI248
@@ -13368,8 +13301,8 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 1) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 1) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))])]
""
[(set_attr "addr" "gpr8")
@@ -13385,15 +13318,11 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "%0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "%0,!Q")]) 0)
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand" "Q,Q")
- (const_int 8)
- (const_int 8)]) 0)) 0))
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
""
"@
@@ -13412,11 +13341,11 @@
(subreg:SWI248
(any_logic:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(subreg:QI
- (match_op_dup 4
- [(match_dup 2) (const_int 8) (const_int 8)]) 0)) 0))
+ (zero_extract:SWI248
+ (match_dup 2) (const_int 8) (const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "alu")
@@ -13428,12 +13357,10 @@
(match_operand 0 "int248_register_operand" "+Q,&Q")
(const_int 8)
(const_int 8))
- (match_operator:SWI248 3 "extract_operator"
+ (match_operator:SWI248 3 "extract_high_operator"
[(any_logic
(match_operand 1 "int248_register_operand" "%0,!Q")
- (match_operand 2 "int248_register_operand" "Q,Q"))
- (const_int 8)
- (const_int 8)]))
+ (match_operand 2 "int248_register_operand" "Q,Q"))]))
(clobber (reg:CC FLAGS_REG))]
"GET_MODE (operands[1]) == GET_MODE (operands[2])"
"@
@@ -13449,9 +13376,9 @@
(parallel
[(set (zero_extract:SWI248
(match_dup 0) (const_int 8) (const_int 8))
- (match_op_dup 3
- [(any_logic (match_dup 4) (match_dup 2))
- (const_int 8) (const_int 8)]))
+ (zero_extract:SWI248
+ (any_logic (match_dup 4) (match_dup 2))
+ (const_int 8) (const_int 8)))
(clobber (reg:CC FLAGS_REG))])]
"operands[4] = gen_lowpart (GET_MODE (operands[1]), operands[0]);"
[(set_attr "type" "alu")
@@ -14696,10 +14623,8 @@
(subreg:SWI248
(neg:QI
(subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)) 0))
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)) 0))
(clobber (reg:CC FLAGS_REG))]
""
"@
@@ -14717,8 +14642,8 @@
(subreg:SWI248
(neg:QI
(subreg:QI
- (match_op_dup 2
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)) 0))
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
[(set_attr "type" "negnot")
@@ -15350,13 +15275,9 @@
(match_operand 0 "int248_register_operand" "+Q,&Q")
(const_int 8)
(const_int 8))
- (subreg:SWI248
- (not:QI
- (subreg:QI
- (match_operator:SWI248 2 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)) 0))]
+ (not:SWI248
+ (match_operator:SWI248 2 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")])))]
""
"@
not{b}\t%h0
@@ -15369,11 +15290,8 @@
(match_dup 1) (const_int 8) (const_int 8)))
(set (zero_extract:SWI248
(match_dup 0) (const_int 8) (const_int 8))
- (subreg:SWI248
- (not:QI
- (subreg:QI
- (match_op_dup 2
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)) 0))]
+ (not:SWI248
+ (zero_extract:SWI248 (match_dup 0) (const_int 8) (const_int 8))))]
""
[(set_attr "type" "negnot")
(set_attr "mode" "QI")])
@@ -16720,10 +16638,8 @@
(subreg:SWI248
(ashift:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "nonmemory_operand" "cI,cI")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -16757,8 +16673,8 @@
(subreg:SWI248
(ashift:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -18004,10 +17920,8 @@
(subreg:SWI248
(any_shiftrt:QI
(subreg:QI
- (match_operator:SWI248 3 "extract_operator"
- [(match_operand 1 "int248_register_operand" "0,!Q")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 3 "extract_high_operator"
+ [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)
(match_operand:QI 2 "nonmemory_operand" "cI,cI")) 0))
(clobber (reg:CC FLAGS_REG))]
""
@@ -18033,8 +17947,8 @@
(subreg:SWI248
(any_shiftrt:QI
(subreg:QI
- (match_op_dup 3
- [(match_dup 0) (const_int 8) (const_int 8)]) 0)
+ (zero_extract:SWI248
+ (match_dup 0) (const_int 8) (const_int 8)) 0)
(match_dup 2)) 0))
(clobber (reg:CC FLAGS_REG))])]
""
@@ -18388,17 +18302,17 @@
(any_rotate:SWI
(match_operand:SWI 1 "const_int_operand")
(subreg:QI
- (and
- (match_operand 2 "int248_register_operand")
- (match_operand 3 "const_int_operand")) 0)))]
+ (match_operator 4 "and_operator"
+ [(match_operand 2 "int248_register_operand")
+ (match_operand 3 "const_int_operand")]) 0)))]
"(INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode) - 1))
== GET_MODE_BITSIZE (<MODE>mode) - 1"
- [(set (match_dup 4) (match_dup 1))
+ [(set (match_dup 5) (match_dup 1))
(set (match_dup 0)
- (any_rotate:SWI (match_dup 4)
+ (any_rotate:SWI (match_dup 5)
(subreg:QI
- (and:SI (match_dup 2) (match_dup 3)) 0)))]
- "operands[4] = gen_reg_rtx (<MODE>mode);")
+ (match_op_dup 4 [(match_dup 2) (match_dup 3)]) 0)))]
+ "operands[5] = gen_reg_rtx (<MODE>mode);")
(define_insn_and_split "*<insn><mode>3_mask_1"
[(set (match_operand:SWI 0 "nonimmediate_operand")
@@ -23243,6 +23157,7 @@
return "call\t{*%p2@GOTPCREL(%%rip)|[QWORD PTR %p2@GOTPCREL[rip]]}";
}
[(set_attr "type" "multi")
+ (set_attr "tls64" "gd")
(set (attr "length")
(symbol_ref "TARGET_X32 ? 15 : 16"))])
@@ -23281,7 +23196,11 @@
UNSPEC_TLS_GD)
(clobber (match_operand:P 3 "register_operand"))])]
"TARGET_64BIT"
- "ix86_tls_descriptor_calls_expanded_in_cfun = true;")
+{
+ if (ix86_tls_descriptor_calls_expanded_in_cfun)
+ cfun->machine->tls_descriptor_call_multiple_p = true;
+ ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
(define_insn "*tls_local_dynamic_base_32_gnu"
[(set (match_operand:SI 0 "register_operand" "=a")
@@ -23343,6 +23262,7 @@
return "call\t{*%p1@GOTPCREL(%%rip)|[QWORD PTR %p1@GOTPCREL[rip]]}";
}
[(set_attr "type" "multi")
+ (set_attr "tls64" "ld_base")
(set_attr "length" "12")])
(define_insn "*tls_local_dynamic_base_64_largepic"
@@ -23376,7 +23296,11 @@
(unspec:P [(reg:P SP_REG)] UNSPEC_TLS_LD_BASE)
(clobber (match_operand:P 2 "register_operand"))])]
"TARGET_64BIT"
- "ix86_tls_descriptor_calls_expanded_in_cfun = true;")
+{
+ if (ix86_tls_descriptor_calls_expanded_in_cfun)
+ cfun->machine->tls_descriptor_call_multiple_p = true;
+ ix86_tls_descriptor_calls_expanded_in_cfun = true;
+})
;; Local dynamic of a single variable is a lose. Show combine how
;; to convert that back to global dynamic.
@@ -23570,6 +23494,8 @@
"TARGET_64BIT && TARGET_GNU2_TLS"
{
operands[2] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : operands[0];
+ if (ix86_tls_descriptor_calls_expanded_in_cfun)
+ cfun->machine->tls_descriptor_call_multiple_p = true;
ix86_tls_descriptor_calls_expanded_in_cfun = true;
})
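
Taken together, the TLS hunks above only record facts:
tls_descriptor_call_multiple_p is set once a second TLS descriptor call is
expanded in the same function, and the new tls64 insn attribute classifies the
parts of the 64-bit TLS sequences (gd, ld_base, lea, call, combine).
Presumably the consumer is the new pass machinery in
i386-features.cc/i386-passes.def listed in this patch's diffstat; nothing in
these hunks acts on the flag or the attribute yet.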
@@ -23581,6 +23507,7 @@
"lea%z0\t{%E1@TLSDESC(%%rip), %0|%0, %E1@TLSDESC[rip]}"
[(set_attr "type" "lea")
(set_attr "mode" "<MODE>")
+ (set_attr "tls64" "lea")
(set_attr "length" "7")
(set_attr "length_address" "4")])
@@ -23594,6 +23521,7 @@
"TARGET_64BIT && TARGET_GNU2_TLS"
"call\t{*%a1@TLSCALL(%2)|[QWORD PTR [%2+%a1@TLSCALL]]}"
[(set_attr "type" "call")
+ (set_attr "tls64" "call")
(set_attr "length" "2")
(set_attr "length_address" "0")])
@@ -23615,7 +23543,8 @@
{
operands[4] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : operands[0];
emit_insn (gen_tls_dynamic_gnu2_64 (ptr_mode, operands[4], operands[1]));
-})
+}
+ [(set_attr "tls64" "combine")])
(define_split
[(match_operand 0 "tls_address_pattern")]
@@ -28251,10 +28180,8 @@
(match_operator 1 "compare_operator"
[(and:QI
(subreg:QI
- (match_operator:SWI248 4 "extract_operator"
- [(match_operand 2 "int248_register_operand")
- (const_int 8)
- (const_int 8)]) 0)
+ (match_operator:SWI248 4 "extract_high_operator"
+ [(match_operand 2 "int248_register_operand")]) 0)
(match_operand 3 "const_int_operand"))
(const_int 0)]))]
"! TARGET_PARTIAL_REG_STALL
@@ -28266,9 +28193,9 @@
(match_op_dup 1
[(and:QI
(subreg:QI
- (match_op_dup 4 [(match_dup 2)
- (const_int 8)
- (const_int 8)]) 0)
+ (zero_extract:SWI248 (match_dup 2)
+ (const_int 8)
+ (const_int 8)) 0)
(match_dup 3))
(const_int 0)]))
(set (zero_extract:SWI248 (match_dup 2)
@@ -28277,9 +28204,9 @@
(subreg:SWI248
(and:QI
(subreg:QI
- (match_op_dup 4 [(match_dup 2)
- (const_int 8)
- (const_int 8)]) 0)
+ (zero_extract:SWI248 (match_dup 2)
+ (const_int 8)
+ (const_int 8)) 0)
(match_dup 3)) 0))])])
;; Don't do logical operations with memory inputs.
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index c93c0b1..6bda22f 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1246,6 +1246,10 @@ munroll-only-small-loops
Target Var(ix86_unroll_only_small_loops) Init(0) Optimization
Enable conservative small loop unrolling.
+-param=ix86-vect-unroll-limit=
+Target Joined UInteger Var(ix86_vect_unroll_limit) Init(4) Param
+Limit how much the autovectorizer may unroll a loop.
+
mlam=
Target RejectNegative Joined Enum(lam_type) Var(ix86_lam_type) Init(lam_none)
-mlam=[none|u48|u57] Instrument meta data position in user data pointers.
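
Example use of the new knob (hypothetical command line):

  gcc -O3 -march=znver4 --param=ix86-vect-unroll-limit=1 foo.c

A value of 1 disables the reduction-driven unrolling added in finish_cost,
since the suggestion is only made when the resulting factor exceeds 1; larger
values bound the suggested factor from above.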
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index b2d2eec..5dbe444 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1319,6 +1319,9 @@
(ior (match_operand 0 "nonimmediate_operand")
(match_test "const_vec_duplicate_p (op)")))
+(define_predicate "const_vec_dup_operand"
+ (match_test "const_vec_duplicate_p (op)"))
+
;; Return true when OP is either register operand, or any
;; CONST_VECTOR.
(define_predicate "reg_or_const_vector_operand"
@@ -1714,10 +1717,14 @@
(define_predicate "div_operator"
(match_code "div"))
-;; Return true if this is a and, ior or xor operation.
+;; Return true if this is an and, ior or xor operation.
(define_predicate "logic_operator"
(match_code "and,ior,xor"))
+;; Return true if this is an and operation.
+(define_predicate "and_operator"
+ (match_code "and"))
+
;; Return true if this is a plus, minus, and, ior or xor operation.
(define_predicate "plusminuslogic_operator"
(match_code "plus,minus,and,ior,xor"))
@@ -1740,8 +1747,12 @@
(define_predicate "compare_operator"
(match_code "compare"))
-(define_predicate "extract_operator"
- (match_code "zero_extract,sign_extract"))
+(define_predicate "extract_high_operator"
+ (match_code "zero_extract,sign_extract,ashiftrt,lshiftrt")
+{
+ return (const8_operand (XEXP (op, 1), VOIDmode)
+ && (BINARY_P (op) || const8_operand (XEXP (op, 2), VOIDmode)));
+})
;; Return true if OP is a memory operand, aligned to
;; less than its natural alignment.
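
In other words, besides the (zero_extract (reg) (const_int 8) (const_int 8))
and sign_extract forms the old extract_operator accepted, the renamed
predicate also matches a high-byte access written as a shift, e.g.
(lshiftrt:HI (reg:HI ...) (const_int 8)); BINARY_P is true for the two-operand
shift codes, so the third-operand check is skipped for them.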
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d88c3d6..73906b8 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -326,6 +326,9 @@
(define_mode_iterator VI1_AVX512VL
[V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")])
+(define_mode_iterator VI1_AVX512_3264
+ [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX")])
+
;; All vector modes
(define_mode_iterator V
[(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
@@ -21729,6 +21732,19 @@
(const_string "orig")))
(set_attr "mode" "TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")])
+;; Eliminate redundancy caused by
+;; /* Special case TImode to 128-bit vector conversions via V2DI. */
+;; in ix86_expand_vector_move.
+
+(define_split
+ [(set (match_operand:V2DI 0 "register_operand")
+ (vec_concat:V2DI
+ (subreg:DI (match_operand:TI 1 "register_operand") 0)
+ (subreg:DI (match_dup 1) 8)))]
+ "TARGET_SSE2 && ix86_pre_reload_split ()"
+ [(set (match_dup 0)
+ (subreg:V2DI (match_dup 1) 0))])
+
(define_insn "*vec_concatv2di_0"
[(set (match_operand:V2DI 0 "register_operand" "=v,v ,x")
(vec_concat:V2DI
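
A sketch of source that can reach the new split (an assumption about how the
bitcast is expanded, not taken from the patch):

  /* Punning an __int128 to a 128-bit vector may go through
     ix86_expand_vector_move's TImode special case, producing the
     (vec_concat:V2DI (subreg:DI ...) (subreg:DI ...)) form that the
     split above folds back into a plain V2DI subreg.  */
  typedef long long v2di __attribute__ ((vector_size (16)));

  v2di ti_to_v2di (__int128 x)
  {
    union { __int128 t; v2di v; } u;
    u.t = x;
    return u.v;
  }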
@@ -26546,9 +26562,9 @@
;; XOP packed rotate instructions
(define_expand "rotl<mode>3"
- [(set (match_operand:VI_128 0 "register_operand")
- (rotate:VI_128
- (match_operand:VI_128 1 "nonimmediate_operand")
+ [(set (match_operand:VI248_128 0 "register_operand")
+ (rotate:VI248_128
+ (match_operand:VI248_128 1 "nonimmediate_operand")
(match_operand:SI 2 "general_operand")))]
"TARGET_XOP"
{
@@ -26577,9 +26593,9 @@
})
(define_expand "rotr<mode>3"
- [(set (match_operand:VI_128 0 "register_operand")
- (rotatert:VI_128
- (match_operand:VI_128 1 "nonimmediate_operand")
+ [(set (match_operand:VI248_128 0 "register_operand")
+ (rotatert:VI248_128
+ (match_operand:VI248_128 1 "nonimmediate_operand")
(match_operand:SI 2 "general_operand")))]
"TARGET_XOP"
{
@@ -26951,31 +26967,122 @@
int i;
if (<CODE> != ASHIFT)
- {
- if (CONST_INT_P (operands[2]))
- operands[2] = GEN_INT (-INTVAL (operands[2]));
- else
- negate = true;
- }
+ {
+ if (CONST_INT_P (operands[2]))
+ operands[2] = GEN_INT (-INTVAL (operands[2]));
+ else
+ negate = true;
+ }
par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
tmp = lowpart_subreg (QImode, operands[2], SImode);
for (i = 0; i < 16; i++)
- XVECEXP (par, 0, i) = tmp;
+ XVECEXP (par, 0, i) = tmp;
tmp = gen_reg_rtx (V16QImode);
emit_insn (gen_vec_initv16qiqi (tmp, par));
if (negate)
- emit_insn (gen_negv16qi2 (tmp, tmp));
+ emit_insn (gen_negv16qi2 (tmp, tmp));
gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
emit_insn (gen (operands[0], operands[1], tmp));
}
+ else if (TARGET_GFNI && CONST_INT_P (operands[2])
+ && (<MODE_SIZE> == 64
+ || !(INTVAL (operands[2]) == 7 && <CODE> == ASHIFTRT)))
+ {
+ rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2],
+ <CODE>);
+ emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+ const0_rtx));
+ }
else
ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
DONE;
})
+(define_expand "cond_<insn><mode>"
+ [(set (match_operand:VI1_AVX512VL 0 "register_operand")
+ (vec_merge:VI1_AVX512VL
+ (any_shift:VI1_AVX512VL
+ (match_operand:VI1_AVX512VL 2 "register_operand")
+ (match_operand:VI1_AVX512VL 3 "const_vec_dup_operand"))
+ (match_operand:VI1_AVX512VL 4 "nonimm_or_0_operand")
+ (match_operand:<avx512fmaskmode> 1 "register_operand")))]
+ "TARGET_GFNI && TARGET_AVX512F"
+{
+ rtx count = XVECEXP (operands[3], 0, 0);
+ rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], count, <CODE>);
+ emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[2], matrix,
+ const0_rtx, operands[4],
+ operands[1]));
+ DONE;
+})
+
+(define_expand "<insn><mode>3"
+ [(set (match_operand:VI1_AVX512_3264 0 "register_operand")
+ (any_rotate:VI1_AVX512_3264
+ (match_operand:VI1_AVX512_3264 1 "register_operand")
+ (match_operand:SI 2 "const_int_operand")))]
+ "TARGET_GFNI"
+{
+ rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+ emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+ const0_rtx));
+ DONE;
+})
+
+(define_expand "<insn>v16qi3"
+ [(set (match_operand:V16QI 0 "register_operand")
+ (any_rotate:V16QI
+ (match_operand:V16QI 1 "nonimmediate_operand")
+ (match_operand:SI 2 "general_operand")))]
+ "TARGET_GFNI || TARGET_XOP"
+{
+ /* Handle the V16QI XOP case to avoid a conflict with the other expand. */
+ if (TARGET_XOP)
+ {
+ if (! const_0_to_7_operand (operands[2], SImode))
+ {
+ rtvec vs = rtvec_alloc (16);
+ rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+ rtx reg = gen_reg_rtx (V16QImode);
+ rtx op2 = operands[2];
+ int i;
+
+ if (GET_MODE (op2) != QImode)
+ {
+ op2 = gen_reg_rtx (QImode);
+ convert_move (op2, operands[2], false);
+ }
+
+ for (i = 0; i < 16; i++)
+ RTVEC_ELT (vs, i) = op2;
+
+ emit_insn (gen_vec_initv16qiqi (reg, par));
+ if (<CODE> == ROTATERT)
+ {
+ rtx neg = gen_reg_rtx (V16QImode);
+ emit_insn (gen_negv16qi2 (neg, reg));
+ reg = neg;
+ }
+ emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], reg));
+ DONE;
+ }
+ }
+ else if (TARGET_GFNI && CONST_INT_P (operands[2]))
+ {
+ rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+ emit_insn (gen_vgf2p8affineqb_v16qi (operands[0],
+ force_reg (V16QImode, operands[1]),
+ matrix, const0_rtx));
+ DONE;
+ }
+ else
+ FAIL;
+})
+
(define_expand "ashrv2di3"
[(set (match_operand:V2DI 0 "register_operand")
(ashiftrt:V2DI
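
The GFNI expanders above all rely on one trick: vgf2p8affineqb multiplies each
byte, viewed as an 8-bit vector over GF(2), by an 8x8 bit matrix, so any
byte-granular shift or rotate by a constant becomes a single instruction once
ix86_vgf2p8affine_shift_matrix encodes the right matrix. A scalar model of the
idea (illustrative only; the bit and column ordering here is a local
convention, not necessarily the hardware encoding):

  #include <stdint.h>

  /* Affine transform over GF(2): result = M*x + b, where col[i] is the
     matrix column selected by input bit i.  */
  static uint8_t
  gf2_affine_byte (const uint8_t col[8], uint8_t x, uint8_t b)
  {
    uint8_t r = b;
    for (int i = 0; i < 8; i++)
      if (x & (1u << i))
        r ^= col[i];
    return r;
  }

  /* Matrix for a logical left shift by N: input bit i feeds output
     bit i + N and nothing else.  */
  static void
  shl_matrix (uint8_t col[8], int n)
  {
    for (int i = 0; i < 8; i++)
      col[i] = (i + n < 8) ? (uint8_t) (1u << (i + n)) : 0;
  }

A rotate wraps the index modulo 8 instead of dropping it, and an arithmetic
right shift replicates the sign column; the ASHIFTRT-by-7 carve-out for
128/256-bit modes above is presumably because broadcasting the sign bit is
already cheap there (e.g. a byte compare against zero), an inference rather
than something the patch states.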
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index c8603b9..1649ea2 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -141,6 +141,12 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
COSTS_N_BYTES (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
ix86_size_memcpy,
ix86_size_memset,
COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
@@ -261,6 +267,12 @@ struct processor_costs i386_cost = { /* 386 specific costs */
COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
i386_memcpy,
i386_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -382,6 +394,12 @@ struct processor_costs i486_cost = { /* 486 specific costs */
COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
i486_memcpy,
i486_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -501,6 +519,12 @@ struct processor_costs pentium_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentium_memcpy,
pentium_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -613,6 +637,12 @@ struct processor_costs lakemont_cost = {
COSTS_N_INSNS (5), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentium_memcpy,
pentium_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -740,6 +770,12 @@ struct processor_costs pentiumpro_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentiumpro_memcpy,
pentiumpro_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -858,6 +894,12 @@ struct processor_costs geode_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
geode_memcpy,
geode_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -979,6 +1021,12 @@ struct processor_costs k6_cost = {
COSTS_N_INSNS (2), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (2), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
k6_memcpy,
k6_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -1101,6 +1149,12 @@ struct processor_costs athlon_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
athlon_memcpy,
athlon_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -1232,6 +1286,12 @@ struct processor_costs k8_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
k8_memcpy,
k8_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -1371,6 +1431,12 @@ struct processor_costs amdfam10_cost = {
COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
amdfam10_memcpy,
amdfam10_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -1503,6 +1569,12 @@ const struct processor_costs bdver_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
bdver_memcpy,
bdver_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1668,6 +1740,12 @@ struct processor_costs znver1_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {5, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver1_memcpy,
znver1_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1836,6 +1914,12 @@ struct processor_costs znver2_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {10, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -1979,6 +2063,12 @@ struct processor_costs znver3_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 6}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2125,6 +2215,12 @@ struct processor_costs znver4_cost = {
plus/minus operations per cycle but only one multiply. This is adjusted
in ix86_reassociation_width. */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 6}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2287,6 +2383,12 @@ struct processor_costs znver5_cost = {
We increase width to 6 for multiplications
in ix86_reassociation_width. */
6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 6}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
znver2_memcpy,
znver2_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2422,6 +2524,12 @@ struct processor_costs skylake_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
skylake_memcpy,
skylake_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -2559,6 +2667,12 @@ struct processor_costs icelake_cost = {
COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 10, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
icelake_memcpy,
icelake_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -2690,6 +2804,12 @@ struct processor_costs alderlake_cost = {
COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
alderlake_memcpy,
alderlake_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -2814,6 +2934,12 @@ const struct processor_costs btver1_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
btver1_memcpy,
btver1_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -2935,6 +3061,12 @@ const struct processor_costs btver2_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
btver2_memcpy,
btver2_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -3055,6 +3187,12 @@ struct processor_costs pentium4_cost = {
COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentium4_memcpy,
pentium4_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3178,6 +3316,12 @@ struct processor_costs nocona_cost = {
COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
nocona_memcpy,
nocona_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3299,6 +3443,12 @@ struct processor_costs atom_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 2, /* Limit how much the autovectorizer
+ may unroll a loop. */
atom_memcpy,
atom_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3420,6 +3570,12 @@ struct processor_costs slm_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
slm_memcpy,
slm_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3555,6 +3711,12 @@ struct processor_costs tremont_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
tremont_memcpy,
tremont_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -3681,6 +3843,12 @@ struct processor_costs lujiazui_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
lujiazui_memcpy,
lujiazui_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -3805,6 +3973,12 @@ struct processor_costs yongfeng_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
yongfeng_memcpy,
yongfeng_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3929,6 +4103,12 @@ struct processor_costs shijidadao_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
shijidadao_memcpy,
shijidadao_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -4078,6 +4258,12 @@ struct processor_costs generic_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
generic_memcpy,
generic_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -4215,6 +4401,12 @@ struct processor_costs core_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR;
+ used to determine the unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
core_memcpy,
core_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
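
Each hunk above pairs a {FMA, DOT_PROD_EXPR, SAD_EXPR} triple of latency-times-throughput
products with a cap on how far the autovectorizer may unroll. As a minimal sketch of how
such values could drive an unroll decision -- the function name and the power-of-two
search below are assumptions for illustration, not the patch's actual heuristic, which
lives elsewhere in this patch:

/* Sketch only: pick the largest power-of-two unroll factor that both
   stays within the per-CPU limit and does not exceed the latency *
   throughput product of the dominating reduction operation, so enough
   independent copies are in flight to hide its latency.  */
static unsigned
sketch_unroll_factor (unsigned lat_x_tput, unsigned limit)
{
  unsigned uf = 1;
  while (uf * 2 <= lat_x_tput && uf * 2 <= limit)
    uf *= 2;
  return uf;
}

Under this reading, znver4's FMA entry gives sketch_unroll_factor (8, 4) == 4 (clamped by
the limit), while btver1's {1, 1, 1} with limit 1 gives 1, i.e. no extra unrolling.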