author | Jerry DeLisle <jvdelisle@gcc.gnu.org> | 2025-09-02 15:58:26 -0700
---|---|---
committer | Jerry DeLisle <jvdelisle@gcc.gnu.org> | 2025-09-02 15:58:26 -0700
commit | 071b4126c613881f4cb25b4e5c39032964827f88 (patch) |
tree | 7ed805786566918630d1d617b1ed8f7310f5fd8e /gcc/config/i386 |
parent | 845d23f3ea08ba873197c275a8857eee7edad996 (diff) |
parent | caa1c2f42691d68af4d894a5c3e700ecd2dba080 (diff) |
Merge branch 'master' into gfortran-test (devel/gfortran-test)
Diffstat (limited to 'gcc/config/i386')
-rw-r--r-- | gcc/config/i386/i386-expand.cc | 129
-rw-r--r-- | gcc/config/i386/i386-features.cc | 1130
-rw-r--r-- | gcc/config/i386/i386-modes.def | 2
-rw-r--r-- | gcc/config/i386/i386-options.cc | 45
-rw-r--r-- | gcc/config/i386/i386-passes.def | 2
-rw-r--r-- | gcc/config/i386/i386-protos.h | 5
-rw-r--r-- | gcc/config/i386/i386.cc | 305
-rw-r--r-- | gcc/config/i386/i386.h | 59
-rw-r--r-- | gcc/config/i386/i386.md | 449
-rw-r--r-- | gcc/config/i386/i386.opt | 4
-rw-r--r-- | gcc/config/i386/predicates.md | 17
-rw-r--r-- | gcc/config/i386/sse.md | 135
-rw-r--r-- | gcc/config/i386/x86-tune-costs.h | 192
13 files changed, 1904 insertions, 570 deletions
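Editorial note: the i386-expand.cc hunk below adds GF2P8AFFINEQB bit-matrix tables (matrix_ashift, matrix_lshiftrt, matrix_ashiftrt, matrix_rotate, matrix_rotatert) that implement per-byte shifts and rotates as an affine transform over GF(2). As a reading aid, here is a minimal stand-alone C model of the byte transform with imm8 = 0, where destination bit i of each result byte is the parity of (matrix byte [7-i] AND source byte); it cross-checks a few of the matrices from the patch. The helper name gf2p8affine_byte is made up for this sketch and is not part of the patch.

```c
/* Software model of GF2P8AFFINEQB (imm8 = 0): dst.bit[i] is the
   parity of matrix byte [7-i] ANDed with the source byte.  */
#include <assert.h>
#include <stdint.h>

static uint8_t
gf2p8affine_byte (uint64_t matrix, uint8_t x)
{
  uint8_t r = 0;
  for (int i = 0; i < 8; i++)
    {
      uint8_t row = (matrix >> ((7 - i) * 8)) & 0xff;
      r |= (uint8_t) (__builtin_parity (row & x) << i);
    }
  return r;
}

int
main (void)
{
  for (int v = 0; v < 256; v++)
    {
      uint8_t b = v;
      /* matrix_ashift[2]: shift each byte left by 2.  */
      assert (gf2p8affine_byte (0x0000010204081020, b) == (uint8_t) (b << 2));
      /* matrix_lshiftrt[3]: logical shift right by 3.  */
      assert (gf2p8affine_byte (0x0810204080000000, b) == b >> 3);
      /* matrix_ashiftrt[1]: arithmetic shift right by 1.  */
      assert (gf2p8affine_byte (0x0204081020408080, b)
	      == (uint8_t) ((int8_t) b >> 1));
      /* matrix_rotate[1]: rotate each byte left by 1.  */
      assert (gf2p8affine_byte (0x8001020408102040, b)
	      == (uint8_t) ((b << 1) | (b >> 7)));
    }
  return 0;
}
```

This also explains why ix86_vgf2p8affine_shift_matrix broadcasts the matrix qword bytewise ((ma >> ((i % 8) * 8)) & 0xff) across the whole vector: every byte lane uses the same 8x8 bit matrix.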
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 09aa9b1..3278f1f 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -3151,7 +3151,7 @@ ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) } /* Expand floating point op0 <=> op1, i.e. - dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */ + dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : -128. */ void ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2) @@ -3264,7 +3264,7 @@ ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2) if (l2) { emit_label (l2); - emit_move_insn (dest, op2 == const0_rtx ? const2_rtx : op2); + emit_move_insn (dest, op2 == const0_rtx ? GEN_INT (-128) : op2); } emit_label (lend); } @@ -8241,8 +8241,10 @@ expand_cpymem_epilogue (rtx destmem, rtx srcmem, unsigned HOST_WIDE_INT countval = UINTVAL (count); unsigned HOST_WIDE_INT epilogue_size = countval % max_size; unsigned int destalign = MEM_ALIGN (destmem); + cfun->machine->by_pieces_in_use = true; move_by_pieces (destmem, srcmem, epilogue_size, destalign, RETURN_BEGIN); + cfun->machine->by_pieces_in_use = false; return; } if (max_size > 8) @@ -8405,8 +8407,8 @@ expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, /* Callback routine for store_by_pieces. Return the RTL of a register containing GET_MODE_SIZE (MODE) bytes in the RTL register op_p which - is a word or a word vector register. If PREV_P isn't nullptr, it - has the RTL info from the previous iteration. */ + is an integer or a word vector register. If PREV_P isn't nullptr, + it has the RTL info from the previous iteration. */ static rtx setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT, @@ -8435,10 +8437,6 @@ setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT, rtx op = (rtx) op_p; machine_mode op_mode = GET_MODE (op); - gcc_assert (op_mode == word_mode - || (VECTOR_MODE_P (op_mode) - && GET_MODE_INNER (op_mode) == word_mode)); - if (VECTOR_MODE_P (mode)) { gcc_assert (GET_MODE_INNER (mode) == QImode); @@ -8460,16 +8458,17 @@ setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT, return tmp; } - target = gen_reg_rtx (word_mode); if (VECTOR_MODE_P (op_mode)) { + gcc_assert (GET_MODE_INNER (op_mode) == word_mode); + target = gen_reg_rtx (word_mode); op = gen_rtx_SUBREG (word_mode, op, 0); emit_move_insn (target, op); } else target = op; - if (mode == word_mode) + if (mode == GET_MODE (target)) return target; rtx tmp = gen_reg_rtx (mode); @@ -8490,9 +8489,11 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, unsigned HOST_WIDE_INT countval = UINTVAL (count); unsigned HOST_WIDE_INT epilogue_size = countval % max_size; unsigned int destalign = MEM_ALIGN (destmem); + cfun->machine->by_pieces_in_use = true; store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val, vec_value ? vec_value : value, destalign, true, RETURN_BEGIN); + cfun->machine->by_pieces_in_use = false; return; } if (max_size > 32) @@ -9574,8 +9575,9 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, case vector_loop: need_zero_guard = true; unroll_factor = 4; - /* Get the vector mode to move MOVE_MAX bytes. */ - nunits = MOVE_MAX / GET_MODE_SIZE (word_mode); + /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes. */ + nunits = issetmem ? 
STORE_MAX_PIECES : MOVE_MAX; + nunits /= GET_MODE_SIZE (word_mode); if (nunits > 1) { move_mode = mode_for_vector (word_mode, nunits).require (); @@ -27033,6 +27035,109 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, return target; } +/* GF2P8AFFINEQB matrixes to implement shift and rotate. */ + +static const uint64_t matrix_ashift[8] = +{ + 0, + 0x0001020408102040, /* 1 l */ + 0x0000010204081020, /* 2 l */ + 0x0000000102040810, /* 3 l */ + 0x0000000001020408, /* 4 l */ + 0x0000000000010204, /* 5 l */ + 0x0000000000000102, /* 6 l */ + 0x0000000000000001 /* 7 l */ +}; + +static const uint64_t matrix_lshiftrt[8] = +{ + 0, + 0x0204081020408000, /* 1 r */ + 0x0408102040800000, /* 2 r */ + 0x0810204080000000, /* 3 r */ + 0x1020408000000000, /* 4 r */ + 0x2040800000000000, /* 5 r */ + 0x4080000000000000, /* 6 r */ + 0x8000000000000000 /* 7 r */ +}; + +static const uint64_t matrix_ashiftrt[8] = +{ + 0, + 0x0204081020408080, /* 1 r */ + 0x0408102040808080, /* 2 r */ + 0x0810204080808080, /* 3 r */ + 0x1020408080808080, /* 4 r */ + 0x2040808080808080, /* 5 r */ + 0x4080808080808080, /* 6 r */ + 0x8080808080808080 /* 7 r */ +}; + +static const uint64_t matrix_rotate[8] = +{ + 0, + 0x8001020408102040, /* 1 rol8 */ + 0x4080010204081020, /* 2 rol8 */ + 0x2040800102040810, /* 3 rol8 */ + 0x1020408001020408, /* 4 rol8 */ + 0x0810204080010204, /* 5 rol8 */ + 0x0408102040800102, /* 6 rol8 */ + 0x0204081020408001 /* 7 rol8 */ +}; + +static const uint64_t matrix_rotatert[8] = +{ + 0, + 0x0204081020408001, /* 1 ror8 */ + 0x0408102040800102, /* 2 ror8 */ + 0x0810204080010204, /* 3 ror8 */ + 0x1020408001020408, /* 4 ror8 */ + 0x2040800102040810, /* 5 ror8 */ + 0x4080010204081020, /* 6 ror8 */ + 0x8001020408102040 /* 7 ror8 */ +}; + +/* Return rtx to load a 64bit GF2P8AFFINE GP(2) matrix implementing a shift + for CODE and shift count COUNT into register with vector of size of SRC. */ + +rtx +ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code) +{ + machine_mode mode = GET_MODE (src); + const uint64_t *matrix; + unsigned shift = INTVAL (count) & 7; + gcc_assert (shift > 0 && shift < 8); + + switch (code) + { + case ASHIFT: + matrix = matrix_ashift; + break; + case ASHIFTRT: + matrix = matrix_ashiftrt; + break; + case LSHIFTRT: + matrix = matrix_lshiftrt; + break; + case ROTATE: + matrix = matrix_rotate; + break; + case ROTATERT: + matrix = matrix_rotatert; + break; + default: + gcc_unreachable (); + } + + int nelts = GET_MODE_NUNITS (mode); + rtvec vec = rtvec_alloc (nelts); + uint64_t ma = matrix[shift]; + for (int i = 0; i < nelts; i++) + RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode); + + return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec)); +} + /* Trunc a vector to a narrow vector, like v4di -> v4si. */ void diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index c131577..0608dd2 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3085,21 +3085,68 @@ ix86_rpad_gate () && optimize_function_for_speed_p (cfun)); } +enum x86_cse_kind +{ + X86_CSE_CONST0_VECTOR, + X86_CSE_CONSTM1_VECTOR, + X86_CSE_VEC_DUP, + X86_CSE_TLS_GD, + X86_CSE_TLS_LD_BASE, + X86_CSE_TLSDESC +}; + +struct redundant_pattern +{ + /* Bitmap of basic blocks with broadcast instructions. */ + auto_bitmap bbs; + /* Bitmap of broadcast instructions. */ + auto_bitmap insns; + /* The broadcast inner scalar. */ + rtx val; + /* The actual redundant source value for UNSPEC_TLSDESC. 
*/ + rtx tlsdesc_val; + /* The inner scalar mode. */ + machine_mode mode; + /* The instruction which sets the inner scalar. Nullptr if the inner + scalar is applied to the whole function, instead of within the same + block. */ + rtx_insn *def_insn; + /* The widest broadcast source. */ + rtx broadcast_source; + /* The widest broadcast register. */ + rtx broadcast_reg; + /* The basic block of the broadcast instruction. */ + basic_block bb; + /* The number of broadcast instructions with the same inner scalar. */ + unsigned HOST_WIDE_INT count; + /* The threshold of broadcast instructions with the same inner + scalar. */ + unsigned int threshold; + /* The widest broadcast size in bytes. */ + unsigned int size; + /* Load kind. */ + x86_cse_kind kind; +}; + /* Generate a vector set, DEST = SRC, at entry of the nearest dominator for basic block map BBS, which is in the fake loop that contains the whole function, so that there is only a single vector set in the - whole function. If not nullptr, INNER_SCALAR is the inner scalar of - SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)). */ + whole function. If not nullptr, LOAD is a pointer to the load. */ static void ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs, - rtx inner_scalar = nullptr) + redundant_pattern *load = nullptr) { basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); - while (bb->loop_father->latch - != EXIT_BLOCK_PTR_FOR_FN (cfun)) - bb = get_immediate_dominator (CDI_DOMINATORS, - bb->loop_father->header); + /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop + to avoid extra spills. */ + if (!load || load->kind != X86_CSE_VEC_DUP) + { + while (bb->loop_father->latch + != EXIT_BLOCK_PTR_FOR_FN (cfun)) + bb = get_immediate_dominator (CDI_DOMINATORS, + bb->loop_father->header); + } rtx set = gen_rtx_SET (dest, src); @@ -3141,8 +3188,14 @@ ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs, } } - if (inner_scalar) + if (load && load->kind == X86_CSE_VEC_DUP) { + /* Get the source from LOAD as (reg:SI 99) in + + (vec_duplicate:V4SI (reg:SI 99)) + + */ + rtx inner_scalar = load->val; /* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */ rtx reg = XEXP (src, 0); if ((REG_P (inner_scalar) || MEM_P (inner_scalar)) @@ -3226,7 +3279,7 @@ remove_partial_avx_dependency (void) break; } - /* Only hanlde conversion here. */ + /* Only handle conversion here. */ machine_mode src_mode = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode; switch (src_mode) @@ -3489,44 +3542,6 @@ replace_vector_const (machine_mode vector_mode, rtx vector_const, } } -enum x86_cse_kind -{ - X86_CSE_CONST0_VECTOR, - X86_CSE_CONSTM1_VECTOR, - X86_CSE_VEC_DUP -}; - -struct redundant_load -{ - /* Bitmap of basic blocks with broadcast instructions. */ - auto_bitmap bbs; - /* Bitmap of broadcast instructions. */ - auto_bitmap insns; - /* The broadcast inner scalar. */ - rtx val; - /* The inner scalar mode. */ - machine_mode mode; - /* The instruction which sets the inner scalar. Nullptr if the inner - scalar is applied to the whole function, instead of within the same - block. */ - rtx_insn *def_insn; - /* The widest broadcast source. */ - rtx broadcast_source; - /* The widest broadcast register. */ - rtx broadcast_reg; - /* The basic block of the broadcast instruction. */ - basic_block bb; - /* The number of broadcast instructions with the same inner scalar. */ - unsigned HOST_WIDE_INT count; - /* The threshold of broadcast instructions with the same inner - scalar. 
*/ - unsigned int threshold; - /* The widest broadcast size in bytes. */ - unsigned int size; - /* Load kind. */ - x86_cse_kind kind; -}; - /* Return the inner scalar if OP is a broadcast, else return nullptr. */ static rtx @@ -3629,6 +3644,8 @@ ix86_broadcast_inner (rtx op, machine_mode mode, Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an integer constant. */ op = src; + if (mode != GET_MODE (reg)) + op = gen_int_mode (INTVAL (src), mode); *insn_p = nullptr; } else @@ -3669,25 +3686,719 @@ ix86_broadcast_inner (rtx op, machine_mode mode, return op; } -/* At entry of the nearest common dominator for basic blocks with vector - CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest - vector set instruction for all CONST0_RTX and integer CONSTM1_RTX - uses. +/* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC and + put the updated instruction in UPDATED_TLS_INSNS. */ - NB: We want to generate only a single widest vector set to cover the - whole function. The LCM algorithm isn't appropriate here since it - may place a vector set inside the loop. */ +static void +replace_tls_call (rtx src, auto_bitmap &tls_call_insns, + auto_bitmap &updated_tls_insns) +{ + bitmap_iterator bi; + unsigned int id; -static unsigned int -remove_redundant_vector_load (void) + EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi) + { + rtx_insn *insn = DF_INSN_UID_GET (id)->insn; + + /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are + allowed. */ + if (!CALL_P (insn)) + { + attr_tls64 tls64 = get_attr_tls64 (insn); + if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE) + gcc_unreachable (); + } + + rtx pat = PATTERN (insn); + gcc_assert (GET_CODE (pat) == PARALLEL); + rtx set = XVECEXP (pat, 0, 0); + gcc_assert (GET_CODE (set) == SET); + rtx dest = SET_DEST (set); + + set = gen_rtx_SET (dest, src); + rtx_insn *set_insn = emit_insn_after (set, insn); + if (recog_memoized (set_insn) < 0) + gcc_unreachable (); + + /* Put SET_INSN in UPDATED_TLS_INSNS. */ + bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn)); + + if (dump_file) + { + fprintf (dump_file, "\nReplace:\n\n"); + print_rtl_single (dump_file, insn); + fprintf (dump_file, "\nwith:\n\n"); + print_rtl_single (dump_file, set_insn); + fprintf (dump_file, "\n"); + } + + /* Delete the CALL insn. */ + delete_insn (insn); + + df_insn_rescan (set_insn); + } +} + +/* Return the basic block which dominates all basic blocks which set + hard register REGNO used in basic block BB. */ + +static basic_block +ix86_get_dominator_for_reg (unsigned int regno, basic_block bb) +{ + basic_block set_bb; + auto_bitmap set_bbs; + + /* Get all BBs which set REGNO and dominate the current BB from all + DEFs of REGNO. */ + for (df_ref def = DF_REG_DEF_CHAIN (regno); + def; + def = DF_REF_NEXT_REG (def)) + if (!DF_REF_IS_ARTIFICIAL (def) + && !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER) + && !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER)) + { + set_bb = DF_REF_BB (def); + if (dominated_by_p (CDI_DOMINATORS, bb, set_bb)) + bitmap_set_bit (set_bbs, set_bb->index); + } + + bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs); + return bb; +} + +/* Mark FLAGS register as live in DATA, a bitmap of live caller-saved + registers, if DEST is FLAGS register. 
*/ + +static void +ix86_check_flags_reg (rtx dest, const_rtx, void *data) +{ + auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data; + if (REG_P (dest) && REGNO (dest) == FLAGS_REG) + bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG); +} + +/* Emit a TLS_SET instruction of KIND in basic block BB. Store the + insertion point in *BEFORE_P for emit_insn_before or in *AFTER_P + for emit_insn_after. UPDATED_GNU_TLS_INSNS contains instructions + which replace the GNU TLS instructions. UPDATED_GNU2_TLS_INSNS + contains instructions which replace the GNU2 TLS instructions. */ + +static rtx_insn * +ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb, + rtx_insn **before_p, rtx_insn **after_p, + auto_bitmap &updated_gnu_tls_insns, + auto_bitmap &updated_gnu2_tls_insns) +{ + rtx_insn *tls_insn; + + do + { + rtx_insn *insn = BB_HEAD (bb); + while (insn && !NONDEBUG_INSN_P (insn)) + { + if (insn == BB_END (bb)) + { + /* This must be the beginning basic block: + + (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG) + + or a basic block with only a label: + + (code_label 78 11 77 3 14 (nil) [1 uses]) + (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK) + + or a basic block with only a debug marker: + + (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG) + (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil)) + + */ + gcc_assert (DEBUG_INSN_P (insn) + || (NOTE_P (insn) + && ((NOTE_KIND (insn) + == NOTE_INSN_FUNCTION_BEG) + || (NOTE_KIND (insn) + == NOTE_INSN_BASIC_BLOCK)))); + insn = NULL; + break; + } + insn = NEXT_INSN (insn); + } + + /* TLS_GD and TLS_LD_BASE instructions are normal functions which + clobber caller-saved registers. TLSDESC instructions only + clobber FLAGS. If any registers clobbered by TLS instructions + are live in this basic block, we must insert TLS instructions + after all live registers clobbered are dead. */ + + auto_bitmap live_caller_saved_regs; + bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb); + + if (bitmap_bit_p (in, FLAGS_REG)) + bitmap_set_bit (live_caller_saved_regs, FLAGS_REG); + + unsigned int i; + + /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE + instructions. */ + if (kind != X86_CSE_TLSDESC) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (call_used_regs[i] + && !fixed_regs[i] + && bitmap_bit_p (in, i)) + bitmap_set_bit (live_caller_saved_regs, i); + + if (bitmap_empty_p (live_caller_saved_regs)) + { + if (insn == BB_HEAD (bb)) + { + *before_p = insn; + tls_insn = emit_insn_before (tls_set, insn); + } + else + { + /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the + beginning basic block: + + (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG) + + or after NOTE_INSN_BASIC_BLOCK in a basic block with + only a label: + + (code_label 78 11 77 3 14 (nil) [1 uses]) + (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK) + + or after debug marker in a basic block with only a + debug marker: + + (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG) + (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil)) + + */ + insn = insn ? PREV_INSN (insn) : BB_END (bb); + *after_p = insn; + tls_insn = emit_insn_after (tls_set, insn); + } + return tls_insn; + } + + bool repeat = false; + + /* Search for REG_DEAD notes in this basic block. 
*/ + FOR_BB_INSNS (bb, insn) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + + /* NB: Conditional jump is the only instruction which reads + flags register and changes control flow. We can never + place the TLS call after unconditional jump. */ + if (JUMP_P (insn)) + { + /* This must be a conditional jump. */ + rtx label = JUMP_LABEL (insn); + if (label == nullptr + || ANY_RETURN_P (label) + || !(LABEL_P (label) || SYMBOL_REF_P (label))) + gcc_unreachable (); + + /* Place the call before all FLAGS_REG setting BBs since + we can't place a call before nor after a conditional + jump. */ + bb = ix86_get_dominator_for_reg (FLAGS_REG, bb); + + /* Start over again. */ + repeat = true; + break; + } + + if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn))) + { + /* Insert the __tls_get_addr call before INSN which + replaces a __tls_get_addr call. */ + *before_p = insn; + tls_insn = emit_insn_before (tls_set, insn); + return tls_insn; + } + + if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn))) + { + /* Mark FLAGS register as dead since FLAGS register + would be clobbered by the GNU2 TLS instruction. */ + bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG); + continue; + } + + /* Check if FLAGS register is live. */ + note_stores (insn, ix86_check_flags_reg, + &live_caller_saved_regs); + + rtx link; + for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) + if (REG_NOTE_KIND (link) == REG_DEAD + && REG_P (XEXP (link, 0))) + { + /* Mark the live caller-saved register as dead. */ + for (i = REGNO (XEXP (link, 0)); + i < END_REGNO (XEXP (link, 0)); + i++) + if (i < FIRST_PSEUDO_REGISTER) + bitmap_clear_bit (live_caller_saved_regs, i); + + if (bitmap_empty_p (live_caller_saved_regs)) + { + *after_p = insn; + tls_insn = emit_insn_after (tls_set, insn); + return tls_insn; + } + } + } + + /* NB: Start over again for conditional jump. */ + if (repeat) + continue; + + gcc_assert (!bitmap_empty_p (live_caller_saved_regs)); + + /* If any live caller-saved registers aren't dead at the end of + this basic block, get the basic block which dominates all + basic blocks which set the remaining live registers. */ + auto_bitmap set_bbs; + bitmap_iterator bi; + unsigned int id; + EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi) + { + basic_block set_bb = ix86_get_dominator_for_reg (id, bb); + bitmap_set_bit (set_bbs, set_bb->index); + } + bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs); + } + while (true); +} + +/* Generate a TLS call of KIND with VAL and copy the call result to DEST, + at entry of the nearest dominator for basic block map BBS, which is in + the fake loop that contains the whole function, so that there is only + a single TLS CALL of KIND with VAL in the whole function. + UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS + instructions. UPDATED_GNU2_TLS_INSNS contains instructions which + replace the GNU2 TLS instructions. If TLSDESC_SET isn't nullptr, + insert it before the TLS call. 
*/ + +static void +ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind, + auto_bitmap &bbs, + auto_bitmap &updated_gnu_tls_insns, + auto_bitmap &updated_gnu2_tls_insns, + rtx tlsdesc_set = nullptr) +{ + basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); + while (bb->loop_father->latch + != EXIT_BLOCK_PTR_FOR_FN (cfun)) + bb = get_immediate_dominator (CDI_DOMINATORS, + bb->loop_father->header); + + rtx rax = nullptr, rdi; + rtx eqv = nullptr; + rtx caddr; + rtx set; + rtx clob; + rtx symbol; + rtx tls; + + switch (kind) + { + case X86_CSE_TLS_GD: + rax = gen_rtx_REG (Pmode, AX_REG); + rdi = gen_rtx_REG (Pmode, DI_REG); + caddr = ix86_tls_get_addr (); + + symbol = XVECEXP (val, 0, 0); + tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi); + + if (GET_MODE (symbol) != Pmode) + symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol); + eqv = symbol; + break; + + case X86_CSE_TLS_LD_BASE: + rax = gen_rtx_REG (Pmode, AX_REG); + rdi = gen_rtx_REG (Pmode, DI_REG); + caddr = ix86_tls_get_addr (); + + tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi); + + /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers + to share the LD_BASE result with other LD model accesses. */ + eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), + UNSPEC_TLS_LD_BASE); + + break; + + case X86_CSE_TLSDESC: + set = gen_rtx_SET (dest, val); + clob = gen_rtx_CLOBBER (VOIDmode, + gen_rtx_REG (CCmode, FLAGS_REG)); + tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob)); + break; + + default: + gcc_unreachable (); + } + + /* Emit the TLS CALL insn. */ + rtx_insn *before = nullptr; + rtx_insn *after = nullptr; + rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before, + &after, + updated_gnu_tls_insns, + updated_gnu2_tls_insns); + + rtx_insn *tlsdesc_insn = nullptr; + if (tlsdesc_set) + { + rtx dest = copy_rtx (SET_DEST (tlsdesc_set)); + rtx src = copy_rtx (SET_SRC (tlsdesc_set)); + tlsdesc_set = gen_rtx_SET (dest, src); + tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn); + } + + if (kind != X86_CSE_TLSDESC) + { + RTL_CONST_CALL_P (tls_insn) = 1; + + /* Indicate that this function can't jump to non-local gotos. */ + make_reg_eh_region_note_nothrow_nononlocal (tls_insn); + } + + if (recog_memoized (tls_insn) < 0) + gcc_unreachable (); + + if (dump_file) + { + if (after) + { + fprintf (dump_file, "\nPlace:\n\n"); + if (tlsdesc_insn) + print_rtl_single (dump_file, tlsdesc_insn); + print_rtl_single (dump_file, tls_insn); + fprintf (dump_file, "\nafter:\n\n"); + print_rtl_single (dump_file, after); + fprintf (dump_file, "\n"); + } + else + { + fprintf (dump_file, "\nPlace:\n\n"); + if (tlsdesc_insn) + print_rtl_single (dump_file, tlsdesc_insn); + print_rtl_single (dump_file, tls_insn); + fprintf (dump_file, "\nbefore:\n\n"); + print_rtl_single (dump_file, before); + fprintf (dump_file, "\n"); + } + } + + if (kind != X86_CSE_TLSDESC) + { + /* Copy RAX to DEST. 
*/ + set = gen_rtx_SET (dest, rax); + rtx_insn *set_insn = emit_insn_after (set, tls_insn); + set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest); + if (dump_file) + { + fprintf (dump_file, "\nPlace:\n\n"); + print_rtl_single (dump_file, set_insn); + fprintf (dump_file, "\nafter:\n\n"); + print_rtl_single (dump_file, tls_insn); + fprintf (dump_file, "\n"); + } + } +} + +namespace { + +const pass_data pass_data_x86_cse = +{ + RTL_PASS, /* type */ + "x86_cse", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_x86_cse : public rtl_opt_pass +{ +public: + pass_x86_cse (gcc::context *ctxt) + : rtl_opt_pass (pass_data_x86_cse, ctxt) + {} + + /* opt_pass methods: */ + bool gate (function *fun) final override + { + return (TARGET_SSE2 + && optimize + && optimize_function_for_speed_p (fun)); + } + + unsigned int execute (function *) final override + { + return x86_cse (); + } + +private: + /* The redundant source value. */ + rtx val; + /* The actual redundant source value for UNSPEC_TLSDESC. */ + rtx tlsdesc_val; + /* The instruction which defines the redundant value. */ + rtx_insn *def_insn; + /* Mode of the destination of the candidate redundant instruction. */ + machine_mode mode; + /* Mode of the source of the candidate redundant instruction. */ + machine_mode scalar_mode; + /* The classification of the candidate redundant instruction. */ + x86_cse_kind kind; + + unsigned int x86_cse (void); + bool candidate_gnu_tls_p (rtx_insn *, attr_tls64); + bool candidate_gnu2_tls_p (rtx, attr_tls64); + bool candidate_vector_p (rtx); + rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx); +}; // class pass_x86_cse + +/* Return the instruction which sets REG from TLS_SYMBOL. */ + +rtx_insn * +pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg, + const_rtx tls_symbol) +{ + rtx_insn *set_insn = nullptr; + for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg)); + ref; + ref = DF_REF_NEXT_REG (ref)) + { + if (DF_REF_IS_ARTIFICIAL (ref)) + return nullptr; + + set_insn = DF_REF_INSN (ref); + if (get_attr_tls64 (set_insn) != TLS64_LEA) + return nullptr; + + rtx tls_set = PATTERN (set_insn); + rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0); + if (!rtx_equal_p (tls_symbol, tls_src)) + return nullptr; + } + + return set_insn; +} + +/* Return true and output def_insn, val, mode, scalar_mode and kind if + INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. 
*/ + +bool +pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64) +{ + if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p) + return false; + + /* Record the redundant TLS CALLs for 64-bit: + + (parallel [ + (set (reg:DI 0 ax) + (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr"))) + (const_int 0 [0]))) + (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50]) + (reg/f:DI 7 sp)] UNSPEC_TLS_GD) + (clobber (reg:DI 5 di))]) + + + and + + (parallel [ + (set (reg:DI 0 ax) + (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr"))) + (const_int 0 [0]))) + (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)]) + + */ + + rtx pat = PATTERN (insn); + rtx set = XVECEXP (pat, 0, 0); + gcc_assert (GET_CODE (set) == SET); + rtx dest = SET_DEST (set); + scalar_mode = mode = GET_MODE (dest); + val = XVECEXP (pat, 0, 1); + gcc_assert (GET_CODE (val) == UNSPEC); + + if (tls64 == TLS64_GD) + kind = X86_CSE_TLS_GD; + else + kind = X86_CSE_TLS_LD_BASE; + + def_insn = nullptr; + return true; +} + +/* Return true and output def_insn, val, mode, scalar_mode and kind if + SET is UNSPEC_TLSDESC. */ + +bool +pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64) +{ + if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p) + return false; + + rtx tls_symbol; + rtx_insn *set_insn; + rtx src = SET_SRC (set); + val = src; + tlsdesc_val = src; + kind = X86_CSE_TLSDESC; + + if (tls64 == TLS64_COMBINE) + { + /* Record 64-bit TLS64_COMBINE: + + (set (reg/f:DI 104) + (plus:DI (unspec:DI [ + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) + (reg:DI 114) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC) + (const:DI (unspec:DI [ + (symbol_ref:DI ("e") [flags 0x1a]) + ] UNSPEC_DTPOFF)))) + + (set (reg/f:DI 104) + (plus:DI (unspec:DI [ + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) + (unspec:DI [ + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) + ] UNSPEC_TLSDESC) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC) + (const:DI (unspec:DI [ + (symbol_ref:DI ("e") [flags 0x1a]) + ] UNSPEC_DTPOFF)))) + */ + + scalar_mode = mode = GET_MODE (src); + + /* Since the first operand of PLUS in the source TLS_COMBINE + pattern is unused, use the second operand of PLUS: + + (const:DI (unspec:DI [ + (symbol_ref:DI ("e") [flags 0x1a]) + ] UNSPEC_DTPOFF)) + + as VAL to check if 2 TLS_COMBINE patterns have the same + source. */ + val = XEXP (src, 1); + gcc_assert (GET_CODE (val) == CONST + && GET_CODE (XEXP (val, 0)) == UNSPEC + && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF + && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0))); + def_insn = nullptr; + return true; + } + + /* Record 64-bit TLS_CALL: + + (set (reg:DI 101) + (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50]) + (reg:DI 112) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC)) + + */ + + gcc_assert (GET_CODE (src) == UNSPEC); + tls_symbol = XVECEXP (src, 0, 0); + src = XVECEXP (src, 0, 1); + scalar_mode = mode = GET_MODE (src); + gcc_assert (REG_P (src)); + + /* All definitions of reg:DI 129 in + + (set (reg:DI 110) + (unspec:DI [(symbol_ref:DI ("foo")) + (reg:DI 129) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC)) + + should have the same source as in + + (set (reg:DI 129) + (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC)) + + */ + + set_insn = tls_set_insn_from_symbol (src, tls_symbol); + if (!set_insn) + return false; + + /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source. */ + val = tls_symbol; + def_insn = set_insn; + return true; +} + +/* Return true and output def_insn, val, mode, scalar_mode and kind if + INSN is a vector broadcast instruction. 
*/ + +bool +pass_x86_cse::candidate_vector_p (rtx set) +{ + rtx src = SET_SRC (set); + rtx dest = SET_DEST (set); + mode = GET_MODE (dest); + /* Skip non-vector instruction. */ + if (!VECTOR_MODE_P (mode)) + return false; + + /* Skip non-vector load instruction. */ + if (!REG_P (dest) && !SUBREG_P (dest)) + return false; + + val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind, + &def_insn); + return val ? true : false; +} + +/* At entry of the nearest common dominator for basic blocks with + + 1. Vector CONST0_RTX patterns. + 2. Vector CONSTM1_RTX patterns. + 3. Vector broadcast patterns. + 4. UNSPEC_TLS_GD patterns. + 5. UNSPEC_TLS_LD_BASE patterns. + 6. UNSPEC_TLSDESC patterns. + + generate a single pattern whose destination is used to replace the + source in all identical patterns. + + NB: We want to generate a pattern, which is executed only once, to + cover the whole function. The LCM algorithm isn't appropriate here + since it may place a pattern inside the loop. */ + +unsigned int +pass_x86_cse::x86_cse (void) { timevar_push (TV_MACH_DEP); - auto_vec<redundant_load *> loads; - redundant_load *load; + auto_vec<redundant_pattern *> loads; + redundant_pattern *load; basic_block bb; rtx_insn *insn; unsigned int i; + auto_bitmap updated_gnu_tls_insns; + auto_bitmap updated_gnu2_tls_insns; df_set_flags (DF_DEFER_INSN_RESCAN); @@ -3700,61 +4411,74 @@ remove_redundant_vector_load (void) if (!NONDEBUG_INSN_P (insn)) continue; + bool matched = false; + /* Remove redundant pattens if there are more than 2 of + them. */ + unsigned int threshold = 2; + rtx set = single_set (insn); - if (!set) + if (!set && !CALL_P (insn)) continue; - /* Record single set vector instruction with CONST0_RTX and - CONSTM1_RTX source. Record basic blocks with CONST0_RTX and - CONSTM1_RTX. Count CONST0_RTX and CONSTM1_RTX. Record the - maximum size of CONST0_RTX and CONSTM1_RTX. */ + tlsdesc_val = nullptr; - rtx dest = SET_DEST (set); - machine_mode mode = GET_MODE (dest); - /* Skip non-vector instruction. */ - if (!VECTOR_MODE_P (mode)) - continue; + attr_tls64 tls64 = get_attr_tls64 (insn); + switch (tls64) + { + case TLS64_GD: + case TLS64_LD_BASE: + /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. */ + if (candidate_gnu_tls_p (insn, tls64)) + break; + continue; - rtx src = SET_SRC (set); - /* Skip non-vector load instruction. */ - if (!REG_P (dest) && !SUBREG_P (dest)) - continue; + case TLS64_CALL: + case TLS64_COMBINE: + /* Verify UNSPEC_TLSDESC. */ + if (candidate_gnu2_tls_p (set, tls64)) + break; + continue; - rtx_insn *def_insn; - machine_mode scalar_mode; - x86_cse_kind kind; - rtx val = ix86_broadcast_inner (src, mode, &scalar_mode, - &kind, &def_insn); - if (!val) - continue; + case TLS64_LEA: + /* Skip TLS64_LEA. */ + continue; - /* Remove redundant register loads if there are more than 2 - loads will be used. */ - unsigned int threshold = 2; + case TLS64_NONE: + if (!set) + continue; - /* Check if there is a matching redundant vector load. */ - bool matched = false; + /* Check for vector broadcast. */ + if (candidate_vector_p (set)) + break; + continue; + } + + /* Check if there is a matching redundant load. */ FOR_EACH_VEC_ELT (loads, i, load) if (load->val && load->kind == kind && load->mode == scalar_mode && (load->bb == bb - || kind < X86_CSE_VEC_DUP + || kind != X86_CSE_VEC_DUP /* Non all 0s/1s vector load must be in the same basic block if it is in a recursive call. */ || !recursive_call_p) && rtx_equal_p (load->val, val)) { - /* Record vector instruction. */ + /* Record instruction. 
*/ bitmap_set_bit (load->insns, INSN_UID (insn)); /* Record the maximum vector size. */ - if (load->size < GET_MODE_SIZE (mode)) + if (kind <= X86_CSE_VEC_DUP + && load->size < GET_MODE_SIZE (mode)) load->size = GET_MODE_SIZE (mode); /* Record the basic block. */ bitmap_set_bit (load->bbs, bb->index); + + /* Increment the count. */ load->count++; + matched = true; break; } @@ -3762,10 +4486,17 @@ remove_redundant_vector_load (void) if (matched) continue; - /* We see this vector broadcast the first time. */ - load = new redundant_load; + /* We see this instruction the first time. Record the + redundant source value, its mode, the destination size, + instruction which defines the redundant source value, + instruction basic block and the instruction kind. */ + load = new redundant_pattern; load->val = copy_rtx (val); + if (tlsdesc_val) + load->tlsdesc_val = copy_rtx (tlsdesc_val); + else + load->tlsdesc_val = nullptr; load->mode = scalar_mode; load->size = GET_MODE_SIZE (mode); load->def_insn = def_insn; @@ -3782,49 +4513,64 @@ remove_redundant_vector_load (void) } bool replaced = false; - rtx reg, broadcast_source, broadcast_reg; FOR_EACH_VEC_ELT (loads, i, load) if (load->count >= load->threshold) { - machine_mode mode = ix86_get_vector_cse_mode (load->size, - load->mode); - broadcast_reg = gen_reg_rtx (mode); - if (load->def_insn) - { - /* Replace redundant vector loads with a single vector load - in the same basic block. */ - reg = load->val; - if (load->mode != GET_MODE (reg)) - reg = gen_rtx_SUBREG (load->mode, reg, 0); - broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); - replace_vector_const (mode, broadcast_reg, load->insns, - load->mode); - } - else + machine_mode mode; + rtx reg, broadcast_source, broadcast_reg; + replaced = true; + switch (load->kind) { - /* This is a constant integer/double vector. If the - inner scalar is 0 or -1, set vector to CONST0_RTX - or CONSTM1_RTX directly. */ - rtx reg; - switch (load->kind) + case X86_CSE_TLS_GD: + case X86_CSE_TLS_LD_BASE: + case X86_CSE_TLSDESC: + broadcast_reg = gen_reg_rtx (load->mode); + replace_tls_call (broadcast_reg, load->insns, + (load->kind == X86_CSE_TLSDESC + ? updated_gnu2_tls_insns + : updated_gnu_tls_insns)); + load->broadcast_reg = broadcast_reg; + break; + + case X86_CSE_CONST0_VECTOR: + case X86_CSE_CONSTM1_VECTOR: + case X86_CSE_VEC_DUP: + mode = ix86_get_vector_cse_mode (load->size, load->mode); + broadcast_reg = gen_reg_rtx (mode); + if (load->def_insn) { - case X86_CSE_CONST0_VECTOR: - broadcast_source = CONST0_RTX (mode); - break; - case X86_CSE_CONSTM1_VECTOR: - broadcast_source = CONSTM1_RTX (mode); - break; - default: - reg = gen_reg_rtx (load->mode); + /* Replace redundant vector loads with a single vector + load in the same basic block. */ + reg = load->val; + if (load->mode != GET_MODE (reg)) + reg = gen_rtx_SUBREG (load->mode, reg, 0); broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); - break; } + else + /* This is a constant integer/double vector. If the + inner scalar is 0 or -1, set vector to CONST0_RTX + or CONSTM1_RTX directly. 
*/ + switch (load->kind) + { + case X86_CSE_CONST0_VECTOR: + broadcast_source = CONST0_RTX (mode); + break; + case X86_CSE_CONSTM1_VECTOR: + broadcast_source = CONSTM1_RTX (mode); + break; + case X86_CSE_VEC_DUP: + reg = gen_reg_rtx (load->mode); + broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); + break; + default: + gcc_unreachable (); + } replace_vector_const (mode, broadcast_reg, load->insns, load->mode); + load->broadcast_source = broadcast_source; + load->broadcast_reg = broadcast_reg; + break; } - load->broadcast_source = broadcast_source; - load->broadcast_reg = broadcast_reg; - replaced = true; } if (replaced) @@ -3839,43 +4585,75 @@ remove_redundant_vector_load (void) FOR_EACH_VEC_ELT (loads, i, load) if (load->count >= load->threshold) { + rtx set; if (load->def_insn) - { - /* Insert a broadcast after the original scalar - definition. */ - rtx set = gen_rtx_SET (load->broadcast_reg, - load->broadcast_source); - insn = emit_insn_after (set, load->def_insn); - - if (cfun->can_throw_non_call_exceptions) - { - /* Handle REG_EH_REGION note in DEF_INSN. */ - rtx note = find_reg_note (load->def_insn, - REG_EH_REGION, nullptr); - if (note) - { - control_flow_insns.safe_push (load->def_insn); - add_reg_note (insn, REG_EH_REGION, - XEXP (note, 0)); - } - } + switch (load->kind) + { + case X86_CSE_TLSDESC: + ix86_place_single_tls_call (load->broadcast_reg, + load->tlsdesc_val, + load->kind, + load->bbs, + updated_gnu_tls_insns, + updated_gnu2_tls_insns, + PATTERN (load->def_insn)); + break; + case X86_CSE_VEC_DUP: + /* Insert a broadcast after the original scalar + definition. */ + set = gen_rtx_SET (load->broadcast_reg, + load->broadcast_source); + insn = emit_insn_after (set, load->def_insn); + + if (cfun->can_throw_non_call_exceptions) + { + /* Handle REG_EH_REGION note in DEF_INSN. */ + rtx note = find_reg_note (load->def_insn, + REG_EH_REGION, nullptr); + if (note) + { + control_flow_insns.safe_push (load->def_insn); + add_reg_note (insn, REG_EH_REGION, + XEXP (note, 0)); + } + } - if (dump_file) - { - fprintf (dump_file, "\nAdd:\n\n"); - print_rtl_single (dump_file, insn); - fprintf (dump_file, "\nafter:\n\n"); - print_rtl_single (dump_file, load->def_insn); - fprintf (dump_file, "\n"); - } - } + if (dump_file) + { + fprintf (dump_file, "\nAdd:\n\n"); + print_rtl_single (dump_file, insn); + fprintf (dump_file, "\nafter:\n\n"); + print_rtl_single (dump_file, load->def_insn); + fprintf (dump_file, "\n"); + } + break; + default: + gcc_unreachable (); + } else - ix86_place_single_vector_set (load->broadcast_reg, - load->broadcast_source, - load->bbs, - (load->kind == X86_CSE_VEC_DUP - ? load->val - : nullptr)); + switch (load->kind) + { + case X86_CSE_TLS_GD: + case X86_CSE_TLS_LD_BASE: + case X86_CSE_TLSDESC: + ix86_place_single_tls_call (load->broadcast_reg, + (load->kind == X86_CSE_TLSDESC + ? 
load->tlsdesc_val + : load->val), + load->kind, + load->bbs, + updated_gnu_tls_insns, + updated_gnu2_tls_insns); + break; + case X86_CSE_CONST0_VECTOR: + case X86_CSE_CONSTM1_VECTOR: + case X86_CSE_VEC_DUP: + ix86_place_single_vector_set (load->broadcast_reg, + load->broadcast_source, + load->bbs, + load); + break; + } } loop_optimizer_finalize (); @@ -3905,48 +4683,12 @@ remove_redundant_vector_load (void) return 0; } -namespace { - -const pass_data pass_data_remove_redundant_vector_load = -{ - RTL_PASS, /* type */ - "rrvl", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - 0, /* todo_flags_finish */ -}; - -class pass_remove_redundant_vector_load : public rtl_opt_pass -{ -public: - pass_remove_redundant_vector_load (gcc::context *ctxt) - : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt) - {} - - /* opt_pass methods: */ - bool gate (function *fun) final override - { - return (TARGET_SSE2 - && optimize - && optimize_function_for_speed_p (fun)); - } - - unsigned int execute (function *) final override - { - return remove_redundant_vector_load (); - } -}; // class pass_remove_redundant_vector_load - } // anon namespace rtl_opt_pass * -make_pass_remove_redundant_vector_load (gcc::context *ctxt) +make_pass_x86_cse (gcc::context *ctxt) { - return new pass_remove_redundant_vector_load (ctxt); + return new pass_x86_cse (ctxt); } /* Convert legacy instructions that clobbers EFLAGS to APX_NF diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def index 2fedbeb..c2db305 100644 --- a/gcc/config/i386/i386-modes.def +++ b/gcc/config/i386/i386-modes.def @@ -91,7 +91,6 @@ VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */ VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF V2TF */ VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF V4TF */ VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF V8TF */ -VECTOR_MODES (FLOAT, 256); /* V128HF V64SF V32DF V16TF */ VECTOR_MODE (FLOAT, HF, 2); /* V2HF */ VECTOR_MODE (FLOAT, BF, 2); /* V2BF */ VECTOR_MODE (FLOAT, HF, 6); /* V6HF */ @@ -102,7 +101,6 @@ VECTOR_MODE (INT, QI, 2); /* V2QI */ VECTOR_MODE (INT, QI, 12); /* V12QI */ VECTOR_MODE (INT, QI, 14); /* V14QI */ VECTOR_MODE (INT, HI, 6); /* V6HI */ -VECTOR_MODE (INT, SI, 64); /* V64SI */ INT_MODE (OI, 32); INT_MODE (XI, 64); diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index ca6bb83..abb5dd7 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -1172,6 +1172,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], OPT_mrecip, MASK_RECIP), + IX86_ATTR_YES ("80387", + OPT_m80387, + MASK_80387), + IX86_ATTR_IX86_YES ("general-regs-only", OPT_mgeneral_regs_only, OPTION_MASK_GENERAL_REGS_ONLY), @@ -1281,6 +1285,8 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], else if (type == ix86_opt_yes || type == ix86_opt_no) { + opts_set->x_target_flags |= mask; + if (type == ix86_opt_no) opt_set_p = !opt_set_p; @@ -3556,6 +3562,10 @@ ix86_set_current_function (tree fndecl) isa = "AVX"; else if (cfun->machine->func_type != TYPE_NORMAL) isa = "SSE"; + else if (TARGET_MMX) + isa = "MMX/3Dnow"; + else if (TARGET_80387) + isa = "80387"; else isa = NULL; } @@ -3615,6 +3625,18 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, return NULL_TREE; } + if (TARGET_64BIT) + { + /* Do not warn when emulating the MS ABI. 
*/ + if ((TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE) + || ix86_function_type_abi (*node) != MS_ABI) + warning (OPT_Wattributes, "%qE attribute ignored", + name); + *no_add_attrs = true; + return NULL_TREE; + } + /* Can combine regparm with all attributes but fastcall, and thiscall. */ if (is_attribute_p ("regparm", name)) { @@ -3627,7 +3649,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) { - error ("regparam and thiscall attributes are not compatible"); + error ("regparm and thiscall attributes are not compatible"); } cst = TREE_VALUE (args); @@ -3648,19 +3670,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, return NULL_TREE; } - if (TARGET_64BIT) - { - /* Do not warn when emulating the MS ABI. */ - if ((TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE) - || ix86_function_type_abi (*node) != MS_ABI) - warning (OPT_Wattributes, "%qE attribute ignored", - name); - *no_add_attrs = true; - return NULL_TREE; - } - - /* Can combine fastcall with stdcall (redundant) and sseregparm. */ + /* Can combine fastcall with sseregparm. */ if (is_attribute_p ("fastcall", name)) { if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) @@ -3681,8 +3691,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, } } - /* Can combine stdcall with fastcall (redundant), regparm and - sseregparm. */ + /* Can combine stdcall with regparm and sseregparm. */ else if (is_attribute_p ("stdcall", name)) { if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) @@ -3732,6 +3741,10 @@ ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, { error ("cdecl and thiscall attributes are not compatible"); } + if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node))) + { + error ("regparm and thiscall attributes are not compatible"); + } } /* Can combine sseregparm with all attributes. */ diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def index 06f0288..553b46d 100644 --- a/gcc/config/i386/i386-passes.def +++ b/gcc/config/i386/i386-passes.def @@ -35,6 +35,6 @@ along with GCC; see the file COPYING3. If not see PR116174. 
*/ INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); - INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_redundant_vector_load); + INSERT_PASS_AFTER (pass_late_combine, 1, pass_x86_cse); INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency); INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 69bc0ee..bdb8bb9 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -290,6 +290,7 @@ extern rtx ix86_tls_module_base (void); extern bool ix86_gpr_tls_address_pattern_p (rtx); extern bool ix86_tls_address_pattern_p (rtx); extern rtx ix86_rewrite_tls_address (rtx); +extern rtx ix86_tls_get_addr (void); extern void ix86_expand_vector_init (bool, rtx, rtx); extern void ix86_expand_vector_set (bool, rtx, rtx, int); @@ -430,8 +431,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area (gcc::context *); extern rtl_opt_pass *make_pass_remove_partial_avx_dependency (gcc::context *); -extern rtl_opt_pass *make_pass_remove_redundant_vector_load - (gcc::context *); +extern rtl_opt_pass *make_pass_x86_cse (gcc::context *); extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *); extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); @@ -448,3 +448,4 @@ extern void ix86_set_handled_components (sbitmap); /* In i386-expand.cc. */ bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*, HOST_WIDE_INT*); +rtx ix86_vgf2p8affine_shift_matrix (rtx, rtx, enum rtx_code); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 4682db85..471be3e 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -11382,6 +11382,23 @@ ix86_address_cost (rtx x, machine_mode, addr_space_t, bool) return cost; } + +/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P. */ + +bool +ix86_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, + unsigned int align, + enum by_pieces_operation op, + bool speed_p) +{ + /* Return true when we are currently expanding memcpy/memset epilogue + with move_by_pieces or store_by_pieces. */ + if (cfun->machine->by_pieces_in_use) + return true; + + return default_use_by_pieces_infrastructure_p (size, align, op, + speed_p); +} /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as this is used for to form addresses to local data when -fPIC is in @@ -12439,9 +12456,31 @@ ix86_tls_index (void) static GTY(()) rtx ix86_tls_symbol; -static rtx +rtx ix86_tls_get_addr (void) { + if (cfun->machine->call_saved_registers + == TYPE_NO_CALLER_SAVED_REGISTERS) + { + /* __tls_get_addr doesn't preserve vector registers. When a + function with no_caller_saved_registers attribute calls + __tls_get_addr, YMM and ZMM registers will be clobbered. + Issue an error and suggest -mtls-dialect=gnu2 in this case. */ + if (cfun->machine->func_type == TYPE_NORMAL) + error (G_("%<-mtls-dialect=gnu2%> must be used with a function" + " with the %<no_caller_saved_registers%> attribute")); + else + error (cfun->machine->func_type == TYPE_EXCEPTION + ? G_("%<-mtls-dialect=gnu2%> must be used with an" + " exception service routine") + : G_("%<-mtls-dialect=gnu2%> must be used with an" + " interrupt service routine")); + /* Don't issue the same error twice. 
*/ + cfun->machine->func_type = TYPE_NORMAL; + cfun->machine->call_saved_registers + = TYPE_DEFAULT_CALL_SAVED_REGISTERS; + } + if (!ix86_tls_symbol) { const char *sym @@ -20007,7 +20046,7 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) tree utype, ures, vce; utype = unsigned_type_for (TREE_TYPE (arg0)); /* PABSB/W/D/Q store the unsigned result in dst, use ABSU_EXPR - instead of ABS_EXPR to hanlde overflow case(TYPE_MIN). */ + instead of ABS_EXPR to handle overflow case(TYPE_MIN). */ ures = gimple_build (&stmts, ABSU_EXPR, utype, arg0); gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); loc = gimple_location (stmt); @@ -21491,8 +21530,7 @@ ix86_hard_regno_nregs (unsigned int regno, machine_mode mode) /* Register pair for mask registers. */ if (mode == P2QImode || mode == P2HImode) return 2; - if (mode == V64SFmode || mode == V64SImode) - return 4; + return 1; } @@ -22081,6 +22119,15 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, } /* FALLTHRU */ case V32QImode: + if (TARGET_GFNI && constant_op1) + { + /* Use vgf2p8affine. One extra load for the mask, but in a loop + with enough registers it will be moved out. So for now don't + account the constant mask load. This is not quite right + for non loop vectorization. */ + extra = 0; + return ix86_vec_cost (mode, cost->sse_op) + extra; + } if (TARGET_AVX2) /* Use vpbroadcast. */ extra = cost->sse_op; @@ -22115,6 +22162,11 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, count = 9; return ix86_vec_cost (mode, cost->sse_op * count) + extra; + case V64QImode: + /* Ignore the mask load for GF2P8AFFINEQB. */ + extra = 0; + return ix86_vec_cost (mode, cost->sse_op) + extra; + case V2DImode: case V4DImode: /* V*DImode arithmetic right shift is emulated. */ @@ -23132,7 +23184,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, So current solution is make constant disp as cheap as possible. */ if (GET_CODE (addr) == PLUS && x86_64_immediate_operand (XEXP (addr, 1), Pmode) - /* Only hanlde (reg + disp) since other forms of addr are mostly LEA, + /* Only handle (reg + disp) since other forms of addr are mostly LEA, there's no additional cost for the plus of disp. */ && register_operand (XEXP (addr, 0), Pmode)) { @@ -25211,20 +25263,14 @@ asm_preferred_eh_data_format (int code, int global) return DW_EH_PE_absptr; } -/* Implement targetm.vectorize.builtin_vectorization_cost. */ +/* Worker for ix86_builtin_vectorization_cost and the fallback calls + from ix86_vector_costs::add_stmt_cost. 
*/ static int -ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, - tree vectype, int) +ix86_default_vector_cost (enum vect_cost_for_stmt type_of_cost, + machine_mode mode) { - bool fp = false; - machine_mode mode = TImode; + bool fp = FLOAT_MODE_P (mode); int index; - if (vectype != NULL) - { - fp = FLOAT_TYPE_P (vectype); - mode = TYPE_MODE (vectype); - } - switch (type_of_cost) { case scalar_stmt: @@ -25283,14 +25329,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, COSTS_N_INSNS (ix86_cost->gather_static + ix86_cost->gather_per_elt - * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + * GET_MODE_NUNITS (mode)) / 2); case vector_scatter_store: return ix86_vec_cost (mode, COSTS_N_INSNS (ix86_cost->scatter_static + ix86_cost->scatter_per_elt - * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + * GET_MODE_NUNITS (mode)) / 2); case cond_branch_taken: return ix86_cost->cond_taken_branch_cost; @@ -25308,7 +25354,7 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, case vec_construct: { - int n = TYPE_VECTOR_SUBPARTS (vectype); + int n = GET_MODE_NUNITS (mode); /* N - 1 element inserts into an SSE vector, the possible GPR -> XMM move is accounted for in add_stmt_cost. */ if (GET_MODE_BITSIZE (mode) <= 128) @@ -25336,6 +25382,17 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, } } +/* Implement targetm.vectorize.builtin_vectorization_cost. */ +static int +ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, + tree vectype, int) +{ + machine_mode mode = TImode; + if (vectype != NULL) + mode = TYPE_MODE (vectype); + return ix86_default_vector_cost (type_of_cost, mode); +} + /* This function returns the calling abi specific va_list type node. It returns the FNDECL specific va_list type. */ @@ -25768,15 +25825,20 @@ private: unsigned m_num_sse_needed[3]; /* Number of 256-bit vector permutation. */ unsigned m_num_avx256_vec_perm[3]; + /* Number of reductions for FMA/DOT_PROD_EXPR/SAD_EXPR */ + unsigned m_num_reduc[X86_REDUC_LAST]; + /* Don't do unroll if m_prefer_unroll is false, default is true. */ + bool m_prefer_unroll; }; ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar) : vector_costs (vinfo, costing_for_scalar), m_num_gpr_needed (), m_num_sse_needed (), - m_num_avx256_vec_perm () -{ -} + m_num_avx256_vec_perm (), + m_num_reduc (), + m_prefer_unroll (true) +{} /* Implement targetm.vectorize.create_costs. */ @@ -25789,7 +25851,7 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar) unsigned ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_vec_info stmt_info, slp_tree node, - tree vectype, int misalign, + tree vectype, int, vect_cost_model_location where) { unsigned retval = 0; @@ -26073,6 +26135,125 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, } } + /* Record number of load/store/gather/scatter in vectorized body. */ + if (where == vect_body && !m_costing_for_scalar) + { + switch (kind) + { + /* Emulated gather/scatter or any scalarization. */ + case scalar_load: + case scalar_stmt: + case scalar_store: + case vector_gather_load: + case vector_scatter_store: + m_prefer_unroll = false; + break; + + case vector_stmt: + case vec_to_scalar: + /* Count number of reduction FMA and "real" DOT_PROD_EXPR, + unroll in the vectorizer will enable partial sum. */ + if (stmt_info + && vect_is_reduction (stmt_info) + && stmt_info->stmt) + { + /* Handle __builtin_fma. 
*/ + if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA) + { + m_num_reduc[X86_REDUC_FMA] += count; + break; + } + + if (!is_gimple_assign (stmt_info->stmt)) + break; + + tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt); + machine_mode inner_mode = GET_MODE_INNER (mode); + tree rhs1, rhs2; + bool native_vnni_p = true; + gimple* def; + machine_mode mode_rhs; + switch (subcode) + { + case PLUS_EXPR: + case MINUS_EXPR: + if (!fp || !flag_associative_math + || flag_fp_contract_mode != FP_CONTRACT_FAST) + break; + + /* FMA condition for different modes. */ + if (((inner_mode == DFmode || inner_mode == SFmode) + && !TARGET_FMA && !TARGET_AVX512VL) + || (inner_mode == HFmode && !TARGET_AVX512FP16) + || (inner_mode == BFmode && !TARGET_AVX10_2)) + break; + + /* MULT_EXPR + PLUS_EXPR/MINUS_EXPR is transformed + to FMA/FNMA after vectorization. */ + rhs1 = gimple_assign_rhs1 (stmt_info->stmt); + rhs2 = gimple_assign_rhs2 (stmt_info->stmt); + if (subcode == PLUS_EXPR + && TREE_CODE (rhs1) == SSA_NAME + && (def = SSA_NAME_DEF_STMT (rhs1), true) + && is_gimple_assign (def) + && gimple_assign_rhs_code (def) == MULT_EXPR) + m_num_reduc[X86_REDUC_FMA] += count; + else if (TREE_CODE (rhs2) == SSA_NAME + && (def = SSA_NAME_DEF_STMT (rhs2), true) + && is_gimple_assign (def) + && gimple_assign_rhs_code (def) == MULT_EXPR) + m_num_reduc[X86_REDUC_FMA] += count; + break; + + /* Vectorizer lane_reducing_op_p supports DOT_PROX_EXPR, + WIDEN_SUM_EXPR and SAD_EXPR, x86 backend only supports + SAD_EXPR (usad{v16qi,v32qi,v64qi}) and DOT_PROD_EXPR. */ + case DOT_PROD_EXPR: + rhs1 = gimple_assign_rhs1 (stmt_info->stmt); + mode_rhs = TYPE_MODE (TREE_TYPE (rhs1)); + if (mode_rhs == QImode) + { + rhs2 = gimple_assign_rhs2 (stmt_info->stmt); + signop signop1_p = TYPE_SIGN (TREE_TYPE (rhs1)); + signop signop2_p = TYPE_SIGN (TREE_TYPE (rhs2)); + + /* vpdpbusd. */ + if (signop1_p != signop2_p) + native_vnni_p + = (GET_MODE_SIZE (mode) == 64 + ? TARGET_AVX512VNNI + : ((TARGET_AVX512VNNI && TARGET_AVX512VL) + || TARGET_AVXVNNI)); + else + /* vpdpbssd. */ + native_vnni_p + = (GET_MODE_SIZE (mode) == 64 + ? TARGET_AVX10_2 + : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2)); + } + m_num_reduc[X86_REDUC_DOT_PROD] += count; + + /* Dislike to do unroll and partial sum for + emulated DOT_PROD_EXPR. */ + if (!native_vnni_p) + m_num_reduc[X86_REDUC_DOT_PROD] += 3 * count; + break; + + case SAD_EXPR: + m_num_reduc[X86_REDUC_SAD] += count; + break; + + default: + break; + } + } + + default: + break; + } + } + + combined_fn cfn; if ((kind == vector_stmt || kind == scalar_stmt) && stmt_info @@ -26128,32 +26309,23 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, (AGU and load ports). Try to account for this by scaling the construction cost by the number of elements involved. 
*/ if ((kind == vec_construct || kind == vec_to_scalar) - && ((stmt_info - && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type - || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type) - && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE - && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) + && ((node + && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE + || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP + && SLP_TREE_LANES (node) == 1)) + && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF + (SLP_TREE_REPRESENTATIVE (node)))) != INTEGER_CST)) - || (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) - == VMAT_GATHER_SCATTER))) - || (node - && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE - || (SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP - && SLP_TREE_LANES (node) == 1)) - && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF - (SLP_TREE_REPRESENTATIVE (node)))) - != INTEGER_CST)) - || (SLP_TREE_MEMORY_ACCESS_TYPE (node) - == VMAT_GATHER_SCATTER))))) - { - stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + || mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node)))))) + { + stmt_cost = ix86_default_vector_cost (kind, mode); stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); } else if ((kind == vec_construct || kind == scalar_to_vec) && node && SLP_TREE_DEF_TYPE (node) == vect_external_def) { - stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + stmt_cost = ix86_default_vector_cost (kind, mode); unsigned i; tree op; FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) @@ -26217,7 +26389,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, TREE_VISITED (op) = 0; } if (stmt_cost == -1) - stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + stmt_cost = ix86_default_vector_cost (kind, mode); if (kind == vec_perm && vectype && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32) @@ -26288,6 +26460,41 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ()) > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo)))) m_costs[vect_body] = INT_MAX; + + bool any_reduc_p = false; + for (int i = 0; i != X86_REDUC_LAST; i++) + if (m_num_reduc[i]) + { + any_reduc_p = true; + break; + } + + if (any_reduc_p + /* Not much gain for loop with gather and scatter. */ + && m_prefer_unroll + && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + { + unsigned unroll_factor + = OPTION_SET_P (ix86_vect_unroll_limit) + ? ix86_vect_unroll_limit + : ix86_cost->vect_unroll_limit; + + if (unroll_factor > 1) + { + for (int i = 0 ; i != X86_REDUC_LAST; i++) + { + if (m_num_reduc[i]) + { + unsigned tmp = CEIL (ix86_cost->reduc_lat_mult_thr[i], + m_num_reduc[i]); + unroll_factor = MIN (unroll_factor, tmp); + } + } + + m_suggested_unroll_factor = 1 << ceil_log2 (unroll_factor); + } + } + } ix86_vect_estimate_reg_pressure (); @@ -27171,9 +27378,9 @@ ix86_memtag_can_tag_addresses () return ix86_lam_type != lam_none && TARGET_LP64; } -/* Implement TARGET_MEMTAG_TAG_SIZE. */ +/* Implement TARGET_MEMTAG_TAG_BITSIZE. 
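+   Renamed from TARGET_MEMTAG_TAG_SIZE; the new name makes explicit
+   that the returned IX86_HWASAN_TAG_SIZE value is a width in bits.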
*/ unsigned char -ix86_memtag_tag_size () +ix86_memtag_tag_bitsize () { return IX86_HWASAN_TAG_SIZE; } @@ -27744,6 +27951,10 @@ static const scoped_attribute_specs *const ix86_attribute_table[] = #undef TARGET_ADDRESS_COST #define TARGET_ADDRESS_COST ix86_address_cost +#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P +#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \ + ix86_use_by_pieces_infrastructure_p + #undef TARGET_OVERLAP_OP_BY_PIECES_P #define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true @@ -28147,8 +28358,8 @@ ix86_libgcc_floating_mode_supported_p #undef TARGET_MEMTAG_UNTAGGED_POINTER #define TARGET_MEMTAG_UNTAGGED_POINTER ix86_memtag_untagged_pointer -#undef TARGET_MEMTAG_TAG_SIZE -#define TARGET_MEMTAG_TAG_SIZE ix86_memtag_tag_size +#undef TARGET_MEMTAG_TAG_BITSIZE +#define TARGET_MEMTAG_TAG_BITSIZE ix86_memtag_tag_bitsize #undef TARGET_GEN_CCMP_FIRST #define TARGET_GEN_CCMP_FIRST ix86_gen_ccmp_first diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 791f3b9..ac0ce68 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -102,6 +102,15 @@ struct stringop_algs #define COSTS_N_BYTES(N) ((N) * 2) #endif + +enum ix86_reduc_unroll_factor{ + X86_REDUC_FMA, + X86_REDUC_DOT_PROD, + X86_REDUC_SAD, + + X86_REDUC_LAST +}; + /* Define the specific costs for a given cpu. NB: hard_register is used by TARGET_REGISTER_MOVE_COST and TARGET_MEMORY_MOVE_COST to compute hard register move costs by register allocator. Relative costs of @@ -225,6 +234,13 @@ struct processor_costs { to number of instructions executed in parallel. See also ix86_reassociation_width. */ + const unsigned reduc_lat_mult_thr[X86_REDUC_LAST]; + /* Latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + const unsigned vect_unroll_limit; /* Limit how much the autovectorizer + may unroll a loop. */ struct stringop_algs *memcpy, *memset; const int cond_taken_branch_cost; /* Cost of taken branch for vectorizer cost model. 
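				   (The reduc_lat_mult_thr[] and
				   vect_unroll_limit fields above feed
				   ix86_vector_costs::finish_cost.
				   Worked example, assuming a znver4-style
				   entry {8, 8, 6} with limit 4: two FMA
				   reductions in a loop body give
				   CEIL (8, 2) = 4, so a 4x unroll is
				   suggested; eight give CEIL (8, 8) = 1,
				   i.e. no unrolling.)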
*/ @@ -644,7 +660,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); {"cpu_64", "%{" OPT_ARCH64 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \ {"arch", "%{!march=*:-march=%(VALUE)}"}, \ {"arch_32", "%{" OPT_ARCH32 ":%{!march=*:-march=%(VALUE)}}"}, \ - {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"}, + {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"}, \ + {"tls", "%{!mtls-dialect=*:-mtls-dialect=%(VALUE)}"}, /* Specs for the compiler proper */ @@ -2477,9 +2494,9 @@ constexpr wide_int_bitmask PTA_DIAMONDRAPIDS = PTA_GRANITERAPIDS_D | PTA_MOVRS | PTA_AMX_MOVRS | PTA_USER_MSR; constexpr wide_int_bitmask PTA_BDVER1 = PTA_64BIT | PTA_MMX | PTA_SSE - | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 - | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4 - | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE; + | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_POPCNT | PTA_LZCNT + | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL + | PTA_AVX | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE; constexpr wide_int_bitmask PTA_BDVER2 = PTA_BDVER1 | PTA_BMI | PTA_TBM | PTA_F16C | PTA_FMA; constexpr wide_int_bitmask PTA_BDVER3 = PTA_BDVER2 | PTA_XSAVEOPT @@ -2487,13 +2504,13 @@ constexpr wide_int_bitmask PTA_BDVER3 = PTA_BDVER2 | PTA_XSAVEOPT constexpr wide_int_bitmask PTA_BDVER4 = PTA_BDVER3 | PTA_AVX2 | PTA_BMI2 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX; -constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 - | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 - | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2 | PTA_BMI | PTA_BMI2 - | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT - | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED - | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES | PTA_SHA | PTA_LZCNT - | PTA_POPCNT; +constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE + | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_POPCNT | PTA_LZCNT + | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL + | PTA_AVX | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW + | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE + | PTA_MWAITX | PTA_ADX | PTA_RDSEED | PTA_CLZERO | PTA_CLFLUSHOPT + | PTA_XSAVEC | PTA_XSAVES | PTA_SHA; constexpr wide_int_bitmask PTA_ZNVER2 = PTA_ZNVER1 | PTA_CLWB | PTA_RDPID | PTA_WBNOINVD; constexpr wide_int_bitmask PTA_ZNVER3 = PTA_ZNVER2 | PTA_VAES | PTA_VPCLMULQDQ @@ -2506,19 +2523,19 @@ constexpr wide_int_bitmask PTA_ZNVER5 = PTA_ZNVER4 | PTA_AVXVNNI | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_PREFETCHI; constexpr wide_int_bitmask PTA_BTVER1 = PTA_64BIT | PTA_MMX | PTA_SSE - | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 - | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE; + | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_LZCNT | PTA_POPCNT + | PTA_ABM | PTA_CX16 | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE; constexpr wide_int_bitmask PTA_BTVER2 = PTA_BTVER1 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_XSAVEOPT; constexpr wide_int_bitmask PTA_LUJIAZUI = PTA_64BIT | PTA_MMX | PTA_SSE - | PTA_SSE2 | PTA_SSE3 | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 - | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_BMI | PTA_BMI2 | PTA_PRFCHW - | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE - 
| PTA_ADX | PTA_RDSEED | PTA_POPCNT; + | PTA_SSE2 | PTA_SSE3 | PTA_CX16 | PTA_LZCNT | PTA_POPCNT | PTA_ABM + | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_BMI + | PTA_BMI2 | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE + | PTA_RDRND | PTA_MOVBE | PTA_ADX | PTA_RDSEED; constexpr wide_int_bitmask PTA_YONGFENG = PTA_LUJIAZUI | PTA_AVX | PTA_AVX2 - | PTA_F16C | PTA_FMA | PTA_SHA | PTA_LZCNT; + | PTA_F16C | PTA_FMA | PTA_SHA; #ifndef GENERATOR_FILE @@ -2865,6 +2882,9 @@ struct GTY(()) machine_function { approximation. */ BOOL_BITFIELD tls_descriptor_call_expanded_p : 1; + /* True if TLS descriptor is called more than once. */ + BOOL_BITFIELD tls_descriptor_call_multiple_p : 1; + /* If true, the current function has a STATIC_CHAIN is placed on the stack below the return address. */ BOOL_BITFIELD static_chain_on_stack : 1; @@ -2934,6 +2954,9 @@ struct GTY(()) machine_function { /* True if this is a recursive function. */ BOOL_BITFIELD recursive_function : 1; + /* True if by_pieces op is currently in use. */ + BOOL_BITFIELD by_pieces_in_use : 1; + /* The largest alignment, in bytes, of stack slot actually used. */ unsigned int max_used_stack_alignment; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index eb52699..cea6c15 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -901,6 +901,10 @@ (define_attr "avx_partial_xmm_update" "false,true" (const_string "false")) +;; Define attribute to indicate 64-bit TLS insns. +(define_attr "tls64" "gd,ld_base,call,combine,lea,none" + (const_string "none")) + ;; Define attribute to classify add/sub insns that consumes carry flag (CF) (define_attr "use_carry" "0,1" (const_string "0")) @@ -1618,10 +1622,8 @@ (compare (match_operand:QI 0 "nonimmediate_operand" "QBn") (subreg:QI - (match_operator:SWI248 2 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0)))] + (match_operator:SWI248 2 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q")]) 0)))] "ix86_match_ccmode (insn, CCmode)" "cmp{b}\t{%h1, %0|%0, %h1}" [(set_attr "addr" "gpr8") @@ -1632,10 +1634,8 @@ [(set (reg FLAGS_REG) (compare (subreg:QI - (match_operator:SWI248 2 "extract_operator" - [(match_operand 0 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 2 "extract_high_operator" + [(match_operand 0 "int248_register_operand" "Q")]) 0) (match_operand:QI 1 "const0_operand")))] "ix86_match_ccmode (insn, CCNOmode)" "test{b}\t%h0, %h0" @@ -1657,10 +1657,8 @@ [(set (reg FLAGS_REG) (compare (subreg:QI - (match_operator:SWI248 2 "extract_operator" - [(match_operand 0 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 2 "extract_high_operator" + [(match_operand 0 "int248_register_operand" "Q")]) 0) (match_operand:QI 1 "general_operand" "QnBn")))] "ix86_match_ccmode (insn, CCmode)" "cmp{b}\t{%1, %h0|%h0, %1}" @@ -1672,15 +1670,11 @@ [(set (reg FLAGS_REG) (compare (subreg:QI - (match_operator:SWI248 2 "extract_operator" - [(match_operand 0 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 2 "extract_high_operator" + [(match_operand 0 "int248_register_operand" "Q")]) 0) (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0)))] + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q")]) 0)))] "ix86_match_ccmode 
(insn, CCmode)" "cmp{b}\t{%h1, %h0|%h0, %h1}" [(set_attr "type" "icmp") @@ -2968,7 +2962,8 @@ (match_operand:SWI248 1 "const_int_operand"))] "optimize_insn_for_size_p () && optimize_size > 1 && operands[1] != const0_rtx - && operands[1] != constm1_rtx + && (operands[1] != constm1_rtx + || (<MODE>mode == DImode && LEGACY_INT_REG_P (operands[0]))) && IN_RANGE (INTVAL (operands[1]), -128, 127) && !ix86_red_zone_used && REGNO (operands[0]) != SP_REG" @@ -3479,10 +3474,8 @@ [(set (strict_low_part (match_operand:QI 0 "register_operand" "+Q")) (subreg:QI - (match_operator:SWI248 2 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0))] + (match_operator:SWI248 2 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q")]) 0))] "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)" "mov{b}\t{%h1, %0|%0, %h1}" [(set_attr "type" "imov") @@ -3565,10 +3558,8 @@ (define_insn "*extzvqi" [(set (match_operand:QI 0 "nonimmediate_operand" "=QBn,?R") (subreg:QI - (match_operator:SWI248 2 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q,Q") - (const_int 8) - (const_int 8)]) 0))] + (match_operator:SWI248 2 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q,Q")]) 0))] "" { switch (get_attr_type (insn)) @@ -3689,10 +3680,8 @@ (match_operand 0 "int248_register_operand" "+Q") (const_int 8) (const_int 8)) - (match_operator:SWI248 2 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]))] + (match_operator:SWI248 2 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q")]))] "" "mov{b}\t{%h1, %h0|%h0, %h1}" [(set_attr "type" "imov") @@ -5259,10 +5248,8 @@ [(set (match_operand:SWI24 0 "register_operand" "=R") (sign_extend:SWI24 (subreg:QI - (match_operator:SWI248 2 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0)))] + (match_operator:SWI248 2 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q")]) 0)))] "" "movs{b<SWI24:imodesuffix>|x}\t{%h1, %0|%0, %h1}" [(set_attr "type" "imovx") @@ -7008,10 +6995,8 @@ [(set (strict_low_part (match_operand:QI 0 "register_operand" "+Q,&Q")) (plus:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q,Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q,Q")]) 0) (match_operand:QI 1 "nonimmediate_operand" "0,!qm"))) (clobber (reg:CC FLAGS_REG))] "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)" @@ -7025,8 +7010,8 @@ [(set (strict_low_part (match_dup 0)) (plus:QI (subreg:QI - (match_op_dup 3 - [(match_dup 2) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 2) (const_int 8) (const_int 8)) 0) (match_dup 0))) (clobber (reg:CC FLAGS_REG))])] "" @@ -7037,29 +7022,25 @@ [(set (strict_low_part (match_operand:QI 0 "register_operand" "+&Q")) (plus:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q")]) 0) (subreg:QI - (match_operator:SWI248 4 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0))) + (match_operator:SWI248 4 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q")]) 0))) 
(clobber (reg:CC FLAGS_REG))] "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)" "#" "&& reload_completed" [(set (strict_low_part (match_dup 0)) (subreg:QI - (match_op_dup 4 - [(match_dup 2) (const_int 8) (const_int 8)]) 0)) + (zero_extract:SWI248 + (match_dup 2) (const_int 8) (const_int 8)) 0)) (parallel [(set (strict_low_part (match_dup 0)) (plus:QI (subreg:QI - (match_op_dup 3 - [(match_dup 1) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 1) (const_int 8) (const_int 8)) 0) (match_dup 0))) (clobber (reg:CC FLAGS_REG))])] "" @@ -7474,10 +7455,8 @@ [(set (match_operand:QI 0 "nonimmediate_operand" "=QBn") (plus:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q")]) 0) (match_operand:QI 1 "nonimmediate_operand" "0"))) (clobber (reg:CC FLAGS_REG))] "" @@ -7490,29 +7469,25 @@ [(set (match_operand:QI 0 "register_operand" "=&Q") (plus:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q")]) 0) (subreg:QI - (match_operator:SWI248 4 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0))) + (match_operator:SWI248 4 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q")]) 0))) (clobber (reg:CC FLAGS_REG))] "" "#" "&& reload_completed" [(set (match_dup 0) (subreg:QI - (match_op_dup 4 - [(match_dup 2) (const_int 8) (const_int 8)]) 0)) + (zero_extract:SWI248 + (match_dup 2) (const_int 8) (const_int 8)) 0)) (parallel [(set (match_dup 0) (plus:QI (subreg:QI - (match_op_dup 3 - [(match_dup 1) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 1) (const_int 8) (const_int 8)) 0) (match_dup 0))) (clobber (reg:CC FLAGS_REG))])] "" @@ -7542,10 +7517,8 @@ (subreg:SWI248 (plus:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "0,!Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "0,!Q")]) 0) (match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0)) (clobber (reg:CC FLAGS_REG))] "" @@ -7580,8 +7553,8 @@ (subreg:SWI248 (plus:QI (subreg:QI - (match_op_dup 3 - [(match_dup 0) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 0) (const_int 8) (const_int 8)) 0) (match_dup 2)) 0)) (clobber (reg:CC FLAGS_REG))])] "" @@ -7601,15 +7574,11 @@ (subreg:SWI248 (plusminus:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "<comm>0,!Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "<comm>0,!Q")]) 0) (subreg:QI - (match_operator:SWI248 4 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q,Q") - (const_int 8) - (const_int 8)]) 0)) 0)) + (match_operator:SWI248 4 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)) 0)) (clobber (reg:CC FLAGS_REG))] "" "@ @@ -7628,11 +7597,11 @@ (subreg:SWI248 (plusminus:QI (subreg:QI - (match_op_dup 3 - [(match_dup 0) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 0) (const_int 8) (const_int 8)) 0) (subreg:QI - 
(match_op_dup 4 - [(match_dup 2) (const_int 8) (const_int 8)]) 0)) 0)) + (zero_extract:SWI248 + (match_dup 2) (const_int 8) (const_int 8)) 0)) 0)) (clobber (reg:CC FLAGS_REG))])] "" [(set_attr "type" "alu") @@ -8229,10 +8198,8 @@ (minus:QI (match_operand:QI 1 "nonimmediate_operand" "0,!qm") (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q,Q") - (const_int 8) - (const_int 8)]) 0))) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q,Q")]) 0))) (clobber (reg:CC FLAGS_REG))] "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)" "@ @@ -8246,8 +8213,8 @@ (minus:QI (match_dup 0) (subreg:QI - (match_op_dup 3 - [(match_dup 2) (const_int 8) (const_int 8)]) 0))) + (zero_extract:SWI248 + (match_dup 2) (const_int 8) (const_int 8)) 0))) (clobber (reg:CC FLAGS_REG))])] "" [(set_attr "type" "alu") @@ -8257,30 +8224,26 @@ [(set (strict_low_part (match_operand:QI 0 "register_operand" "+&Q")) (minus:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q")]) 0) (subreg:QI - (match_operator:SWI248 4 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0))) + (match_operator:SWI248 4 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q")]) 0))) (clobber (reg:CC FLAGS_REG))] "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)" "#" "&& reload_completed" [(set (strict_low_part (match_dup 0)) (subreg:QI - (match_op_dup 3 - [(match_dup 1) (const_int 8) (const_int 8)]) 0)) + (zero_extract:SWI248 + (match_dup 1) (const_int 8) (const_int 8)) 0)) (parallel [(set (strict_low_part (match_dup 0)) (minus:QI (match_dup 0) (subreg:QI - (match_op_dup 4 - [(match_dup 2) (const_int 8) (const_int 8)]) 0))) + (zero_extract:SWI248 + (match_dup 2) (const_int 8) (const_int 8)) 0))) (clobber (reg:CC FLAGS_REG))])] "" [(set_attr "type" "alu") @@ -8331,10 +8294,8 @@ (minus:QI (match_operand:QI 1 "nonimmediate_operand" "0") (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0))) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q")]) 0))) (clobber (reg:CC FLAGS_REG))] "" "sub{b}\t{%h2, %0|%0, %h2}" @@ -8346,30 +8307,26 @@ [(set (match_operand:QI 0 "register_operand" "=&Q") (minus:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q")]) 0) (subreg:QI - (match_operator:SWI248 4 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0))) + (match_operator:SWI248 4 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q")]) 0))) (clobber (reg:CC FLAGS_REG))] "" "#" "&& reload_completed" [(set (match_dup 0) (subreg:QI - (match_op_dup 3 - [(match_dup 1) (const_int 8) (const_int 8)]) 0)) + (zero_extract:SWI248 + (match_dup 1) (const_int 8) (const_int 8)) 0)) (parallel [(set (match_dup 0) (minus:QI (match_dup 0) (subreg:QI - (match_op_dup 4 - [(match_dup 2) (const_int 8) (const_int 8)]) 0))) + (zero_extract:SWI248 + (match_dup 2) (const_int 8) (const_int 8)) 0))) 
(clobber (reg:CC FLAGS_REG))])] "" [(set_attr "type" "alu") @@ -8384,10 +8341,8 @@ (subreg:SWI248 (minus:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "0,!Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "0,!Q")]) 0) (match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0)) (clobber (reg:CC FLAGS_REG))] "" @@ -8406,8 +8361,8 @@ (subreg:SWI248 (minus:QI (subreg:QI - (match_op_dup 3 - [(match_dup 0) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 0) (const_int 8) (const_int 8)) 0) (match_dup 2)) 0)) (clobber (reg:CC FLAGS_REG))])] "" @@ -12355,10 +12310,8 @@ (compare (and:QI (subreg:QI - (match_operator:SWI248 2 "extract_operator" - [(match_operand 0 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 2 "extract_high_operator" + [(match_operand 0 "int248_register_operand" "Q")]) 0) (match_operand:QI 1 "general_operand" "QnBn")) (const_int 0)))] "ix86_match_ccmode (insn, CCNOmode)" @@ -12372,15 +12325,11 @@ (compare (and:QI (subreg:QI - (match_operator:SWI248 2 "extract_operator" - [(match_operand 0 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 2 "extract_high_operator" + [(match_operand 0 "int248_register_operand" "Q")]) 0) (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0)) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q")]) 0)) (const_int 0)))] "ix86_match_ccmode (insn, CCNOmode)" "test{b}\t{%h1, %h0|%h0, %h1}" @@ -12969,10 +12918,8 @@ [(set (strict_low_part (match_operand:QI 0 "register_operand" "+Q,&Q")) (any_logic:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q,Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q,Q")]) 0) (match_operand:QI 1 "nonimmediate_operand" "0,!qm"))) (clobber (reg:CC FLAGS_REG))] "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)" @@ -12986,8 +12933,8 @@ [(set (strict_low_part (match_dup 0)) (any_logic:QI (subreg:QI - (match_op_dup 3 - [(match_dup 2) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 2) (const_int 8) (const_int 8)) 0) (match_dup 0))) (clobber (reg:CC FLAGS_REG))])] "" @@ -12998,29 +12945,25 @@ [(set (strict_low_part (match_operand:QI 0 "register_operand" "+&Q")) (any_logic:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q")]) 0) (subreg:QI - (match_operator:SWI248 4 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0))) + (match_operator:SWI248 4 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q")]) 0))) (clobber (reg:CC FLAGS_REG))] "!TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)" "#" "&& reload_completed" [(set (strict_low_part (match_dup 0)) (subreg:QI - (match_op_dup 4 - [(match_dup 2) (const_int 8) (const_int 8)]) 0)) + (zero_extract:SWI248 + (match_dup 2) (const_int 8) (const_int 8)) 0)) (parallel [(set (strict_low_part (match_dup 0)) (any_logic:QI (subreg:QI - (match_op_dup 3 - 
[(match_dup 1) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 1) (const_int 8) (const_int 8)) 0) (match_dup 0))) (clobber (reg:CC FLAGS_REG))])] "" @@ -13223,10 +13166,8 @@ [(set (match_operand:QI 0 "nonimmediate_operand" "=QBn") (any_logic:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q")]) 0) (match_operand:QI 1 "nonimmediate_operand" "0"))) (clobber (reg:CC FLAGS_REG))] "" @@ -13239,29 +13180,25 @@ [(set (match_operand:QI 0 "register_operand" "=&Q") (any_logic:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "Q")]) 0) (subreg:QI - (match_operator:SWI248 4 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q") - (const_int 8) - (const_int 8)]) 0))) + (match_operator:SWI248 4 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q")]) 0))) (clobber (reg:CC FLAGS_REG))] "" "#" "&& reload_completed" [(set (match_dup 0) (subreg:QI - (match_op_dup 4 - [(match_dup 2) (const_int 8) (const_int 8)]) 0)) + (zero_extract:SWI248 + (match_dup 2) (const_int 8) (const_int 8)) 0)) (parallel [(set (match_dup 0) (any_logic:QI (subreg:QI - (match_op_dup 3 - [(match_dup 1) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 1) (const_int 8) (const_int 8)) 0) (match_dup 0))) (clobber (reg:CC FLAGS_REG))])] "" @@ -13291,10 +13228,8 @@ (subreg:SWI248 (any_logic:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "0,!Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "0,!Q")]) 0) (match_operand:QI 2 "general_operand" "QnBn,QnBn")) 0)) (clobber (reg:CC FLAGS_REG))] "" @@ -13313,8 +13248,8 @@ (subreg:SWI248 (any_logic:QI (subreg:QI - (match_op_dup 3 - [(match_dup 0) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 0) (const_int 8) (const_int 8)) 0) (match_dup 2)) 0)) (clobber (reg:CC FLAGS_REG))])] "" @@ -13328,10 +13263,8 @@ (match_operator 5 "compare_operator" [(any_logic:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "0,!Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "0,!Q")]) 0) (match_operand:QI 2 "general_operand" "QnBn,QnBn")) (const_int 0)])) (set (zero_extract:SWI248 @@ -13341,8 +13274,8 @@ (subreg:SWI248 (any_logic:QI (subreg:QI - (match_op_dup 3 - [(match_dup 0) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 0) (const_int 8) (const_int 8)) 0) (match_dup 2)) 0))] "ix86_match_ccmode (insn, CCNOmode)" "@ @@ -13358,9 +13291,9 @@ [(set (match_dup 4) (match_op_dup 5 [(any_logic:QI - (subreg:QI - (match_op_dup 3 - [(match_dup 0) (const_int 8) (const_int 8)]) 0) + (subreg:QI + (zero_extract:SWI248 + (match_dup 0) (const_int 8) (const_int 8)) 0) (match_dup 2)) (const_int 0)])) (set (zero_extract:SWI248 @@ -13368,8 +13301,8 @@ (subreg:SWI248 (any_logic:QI (subreg:QI - (match_op_dup 3 - [(match_dup 1) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 1) (const_int 8) (const_int 8)) 0) (match_dup 2)) 0))])] "" 
[(set_attr "addr" "gpr8") @@ -13385,15 +13318,11 @@ (subreg:SWI248 (any_logic:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "%0,!Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "%0,!Q")]) 0) (subreg:QI - (match_operator:SWI248 4 "extract_operator" - [(match_operand 2 "int248_register_operand" "Q,Q") - (const_int 8) - (const_int 8)]) 0)) 0)) + (match_operator:SWI248 4 "extract_high_operator" + [(match_operand 2 "int248_register_operand" "Q,Q")]) 0)) 0)) (clobber (reg:CC FLAGS_REG))] "" "@ @@ -13412,11 +13341,11 @@ (subreg:SWI248 (any_logic:QI (subreg:QI - (match_op_dup 3 - [(match_dup 0) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 0) (const_int 8) (const_int 8)) 0) (subreg:QI - (match_op_dup 4 - [(match_dup 2) (const_int 8) (const_int 8)]) 0)) 0)) + (zero_extract:SWI248 + (match_dup 2) (const_int 8) (const_int 8)) 0)) 0)) (clobber (reg:CC FLAGS_REG))])] "" [(set_attr "type" "alu") @@ -13428,12 +13357,10 @@ (match_operand 0 "int248_register_operand" "+Q,&Q") (const_int 8) (const_int 8)) - (match_operator:SWI248 3 "extract_operator" + (match_operator:SWI248 3 "extract_high_operator" [(any_logic (match_operand 1 "int248_register_operand" "%0,!Q") - (match_operand 2 "int248_register_operand" "Q,Q")) - (const_int 8) - (const_int 8)])) + (match_operand 2 "int248_register_operand" "Q,Q"))])) (clobber (reg:CC FLAGS_REG))] "GET_MODE (operands[1]) == GET_MODE (operands[2])" "@ @@ -13449,9 +13376,9 @@ (parallel [(set (zero_extract:SWI248 (match_dup 0) (const_int 8) (const_int 8)) - (match_op_dup 3 - [(any_logic (match_dup 4) (match_dup 2)) - (const_int 8) (const_int 8)])) + (zero_extract:SWI248 + (any_logic (match_dup 4) (match_dup 2)) + (const_int 8) (const_int 8))) (clobber (reg:CC FLAGS_REG))])] "operands[4] = gen_lowpart (GET_MODE (operands[1]), operands[0]);" [(set_attr "type" "alu") @@ -14696,10 +14623,8 @@ (subreg:SWI248 (neg:QI (subreg:QI - (match_operator:SWI248 2 "extract_operator" - [(match_operand 1 "int248_register_operand" "0,!Q") - (const_int 8) - (const_int 8)]) 0)) 0)) + (match_operator:SWI248 2 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "0,!Q")]) 0)) 0)) (clobber (reg:CC FLAGS_REG))] "" "@ @@ -14717,8 +14642,8 @@ (subreg:SWI248 (neg:QI (subreg:QI - (match_op_dup 2 - [(match_dup 0) (const_int 8) (const_int 8)]) 0)) 0)) + (zero_extract:SWI248 + (match_dup 0) (const_int 8) (const_int 8)) 0)) 0)) (clobber (reg:CC FLAGS_REG))])] "" [(set_attr "type" "negnot") @@ -15350,13 +15275,9 @@ (match_operand 0 "int248_register_operand" "+Q,&Q") (const_int 8) (const_int 8)) - (subreg:SWI248 - (not:QI - (subreg:QI - (match_operator:SWI248 2 "extract_operator" - [(match_operand 1 "int248_register_operand" "0,!Q") - (const_int 8) - (const_int 8)]) 0)) 0))] + (not:SWI248 + (match_operator:SWI248 2 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "0,!Q")])))] "" "@ not{b}\t%h0 @@ -15369,11 +15290,8 @@ (match_dup 1) (const_int 8) (const_int 8))) (set (zero_extract:SWI248 (match_dup 0) (const_int 8) (const_int 8)) - (subreg:SWI248 - (not:QI - (subreg:QI - (match_op_dup 2 - [(match_dup 0) (const_int 8) (const_int 8)]) 0)) 0))] + (not:SWI248 + (zero_extract:SWI248 (match_dup 0) (const_int 8) (const_int 8))))] "" [(set_attr "type" "negnot") (set_attr "mode" "QI")]) @@ -16720,10 +16638,8 @@ (subreg:SWI248 (ashift:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 
"int248_register_operand" "0,!Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "0,!Q")]) 0) (match_operand:QI 2 "nonmemory_operand" "cI,cI")) 0)) (clobber (reg:CC FLAGS_REG))] "" @@ -16757,8 +16673,8 @@ (subreg:SWI248 (ashift:QI (subreg:QI - (match_op_dup 3 - [(match_dup 0) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 0) (const_int 8) (const_int 8)) 0) (match_dup 2)) 0)) (clobber (reg:CC FLAGS_REG))])] "" @@ -18004,10 +17920,8 @@ (subreg:SWI248 (any_shiftrt:QI (subreg:QI - (match_operator:SWI248 3 "extract_operator" - [(match_operand 1 "int248_register_operand" "0,!Q") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 3 "extract_high_operator" + [(match_operand 1 "int248_register_operand" "0,!Q")]) 0) (match_operand:QI 2 "nonmemory_operand" "cI,cI")) 0)) (clobber (reg:CC FLAGS_REG))] "" @@ -18033,8 +17947,8 @@ (subreg:SWI248 (any_shiftrt:QI (subreg:QI - (match_op_dup 3 - [(match_dup 0) (const_int 8) (const_int 8)]) 0) + (zero_extract:SWI248 + (match_dup 0) (const_int 8) (const_int 8)) 0) (match_dup 2)) 0)) (clobber (reg:CC FLAGS_REG))])] "" @@ -18388,17 +18302,17 @@ (any_rotate:SWI (match_operand:SWI 1 "const_int_operand") (subreg:QI - (and - (match_operand 2 "int248_register_operand") - (match_operand 3 "const_int_operand")) 0)))] + (match_operator 4 "and_operator" + [(match_operand 2 "int248_register_operand") + (match_operand 3 "const_int_operand")]) 0)))] "(INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode) - 1)) == GET_MODE_BITSIZE (<MODE>mode) - 1" - [(set (match_dup 4) (match_dup 1)) + [(set (match_dup 5) (match_dup 1)) (set (match_dup 0) - (any_rotate:SWI (match_dup 4) + (any_rotate:SWI (match_dup 5) (subreg:QI - (and:SI (match_dup 2) (match_dup 3)) 0)))] - "operands[4] = gen_reg_rtx (<MODE>mode);") + (match_op_dup 4 [(match_dup 2) (match_dup 3)]) 0)))] + "operands[5] = gen_reg_rtx (<MODE>mode);") (define_insn_and_split "*<insn><mode>3_mask_1" [(set (match_operand:SWI 0 "nonimmediate_operand") @@ -23243,6 +23157,7 @@ return "call\t{*%p2@GOTPCREL(%%rip)|[QWORD PTR %p2@GOTPCREL[rip]]}"; } [(set_attr "type" "multi") + (set_attr "tls64" "gd") (set (attr "length") (symbol_ref "TARGET_X32 ? 15 : 16"))]) @@ -23281,7 +23196,11 @@ UNSPEC_TLS_GD) (clobber (match_operand:P 3 "register_operand"))])] "TARGET_64BIT" - "ix86_tls_descriptor_calls_expanded_in_cfun = true;") +{ + if (ix86_tls_descriptor_calls_expanded_in_cfun) + cfun->machine->tls_descriptor_call_multiple_p = true; + ix86_tls_descriptor_calls_expanded_in_cfun = true; +}) (define_insn "*tls_local_dynamic_base_32_gnu" [(set (match_operand:SI 0 "register_operand" "=a") @@ -23343,6 +23262,7 @@ return "call\t{*%p1@GOTPCREL(%%rip)|[QWORD PTR %p1@GOTPCREL[rip]]}"; } [(set_attr "type" "multi") + (set_attr "tls64" "ld_base") (set_attr "length" "12")]) (define_insn "*tls_local_dynamic_base_64_largepic" @@ -23376,7 +23296,11 @@ (unspec:P [(reg:P SP_REG)] UNSPEC_TLS_LD_BASE) (clobber (match_operand:P 2 "register_operand"))])] "TARGET_64BIT" - "ix86_tls_descriptor_calls_expanded_in_cfun = true;") +{ + if (ix86_tls_descriptor_calls_expanded_in_cfun) + cfun->machine->tls_descriptor_call_multiple_p = true; + ix86_tls_descriptor_calls_expanded_in_cfun = true; +}) ;; Local dynamic of a single variable is a lose. Show combine how ;; to convert that back to global dynamic. @@ -23570,6 +23494,8 @@ "TARGET_64BIT && TARGET_GNU2_TLS" { operands[2] = can_create_pseudo_p () ? 
gen_reg_rtx (ptr_mode) : operands[0]; + if (ix86_tls_descriptor_calls_expanded_in_cfun) + cfun->machine->tls_descriptor_call_multiple_p = true; ix86_tls_descriptor_calls_expanded_in_cfun = true; }) @@ -23581,6 +23507,7 @@ "lea%z0\t{%E1@TLSDESC(%%rip), %0|%0, %E1@TLSDESC[rip]}" [(set_attr "type" "lea") (set_attr "mode" "<MODE>") + (set_attr "tls64" "lea") (set_attr "length" "7") (set_attr "length_address" "4")]) @@ -23594,6 +23521,7 @@ "TARGET_64BIT && TARGET_GNU2_TLS" "call\t{*%a1@TLSCALL(%2)|[QWORD PTR [%2+%a1@TLSCALL]]}" [(set_attr "type" "call") + (set_attr "tls64" "call") (set_attr "length" "2") (set_attr "length_address" "0")]) @@ -23615,7 +23543,8 @@ { operands[4] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : operands[0]; emit_insn (gen_tls_dynamic_gnu2_64 (ptr_mode, operands[4], operands[1])); -}) +} + [(set_attr "tls64" "combine")]) (define_split [(match_operand 0 "tls_address_pattern")] @@ -28251,10 +28180,8 @@ (match_operator 1 "compare_operator" [(and:QI (subreg:QI - (match_operator:SWI248 4 "extract_operator" - [(match_operand 2 "int248_register_operand") - (const_int 8) - (const_int 8)]) 0) + (match_operator:SWI248 4 "extract_high_operator" + [(match_operand 2 "int248_register_operand")]) 0) (match_operand 3 "const_int_operand")) (const_int 0)]))] "! TARGET_PARTIAL_REG_STALL @@ -28266,9 +28193,9 @@ (match_op_dup 1 [(and:QI (subreg:QI - (match_op_dup 4 [(match_dup 2) - (const_int 8) - (const_int 8)]) 0) + (zero_extract:SWI248 (match_dup 2) + (const_int 8) + (const_int 8)) 0) (match_dup 3)) (const_int 0)])) (set (zero_extract:SWI248 (match_dup 2) @@ -28277,9 +28204,9 @@ (subreg:SWI248 (and:QI (subreg:QI - (match_op_dup 4 [(match_dup 2) - (const_int 8) - (const_int 8)]) 0) + (zero_extract:SWI248 (match_dup 2) + (const_int 8) + (const_int 8)) 0) (match_dup 3)) 0))])]) ;; Don't do logical operations with memory inputs. diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index c93c0b1..6bda22f 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1246,6 +1246,10 @@ munroll-only-small-loops Target Var(ix86_unroll_only_small_loops) Init(0) Optimization Enable conservative small loop unrolling. +-param=ix86-vect-unroll-limit= +Target Joined UInteger Var(ix86_vect_unroll_limit) Init(4) Param +Limit how much the autovectorizer may unroll a loop. + mlam= Target RejectNegative Joined Enum(lam_type) Var(ix86_lam_type) Init(lam_none) -mlam=[none|u48|u57] Instrument meta data position in user data pointers. diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index b2d2eec..5dbe444 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1319,6 +1319,9 @@ (ior (match_operand 0 "nonimmediate_operand") (match_test "const_vec_duplicate_p (op)"))) +(define_predicate "const_vec_dup_operand" + (match_test "const_vec_duplicate_p (op)")) + ;; Return true when OP is either register operand, or any ;; CONST_VECTOR. (define_predicate "reg_or_const_vector_operand" @@ -1714,10 +1717,14 @@ (define_predicate "div_operator" (match_code "div")) -;; Return true if this is a and, ior or xor operation. +;; Return true if this is an and, ior or xor operation. (define_predicate "logic_operator" (match_code "and,ior,xor")) +;; Return true if this is an and operation. +(define_predicate "and_operator" + (match_code "and")) + ;; Return true if this is a plus, minus, and, ior or xor operation. 
(define_predicate "plusminuslogic_operator" (match_code "plus,minus,and,ior,xor")) @@ -1740,8 +1747,12 @@ (define_predicate "compare_operator" (match_code "compare")) -(define_predicate "extract_operator" - (match_code "zero_extract,sign_extract")) +(define_predicate "extract_high_operator" + (match_code "zero_extract,sign_extract,ashiftrt,lshiftrt") +{ + return (const8_operand (XEXP (op, 1), VOIDmode) + && (BINARY_P (op) || const8_operand (XEXP (op, 2), VOIDmode))); +}) ;; Return true if OP is a memory operand, aligned to ;; less than its natural alignment. diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index d88c3d6..73906b8 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -326,6 +326,9 @@ (define_mode_iterator VI1_AVX512VL [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")]) +(define_mode_iterator VI1_AVX512_3264 + [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX")]) + ;; All vector modes (define_mode_iterator V [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI @@ -21729,6 +21732,19 @@ (const_string "orig"))) (set_attr "mode" "TI,TI,TI,TI,TI,TI,V4SF,V2SF,V2SF")]) +;; Eliminate redundancy caused by +;; /* Special case TImode to 128-bit vector conversions via V2DI. */ +;; in ix86_expand_vector_move + +(define_split + [(set (match_operand:V2DI 0 "register_operand") + (vec_concat:V2DI + (subreg:DI (match_operand:TI 1 "register_operand") 0) + (subreg:DI (match_dup 1) 8)))] + "TARGET_SSE2 && ix86_pre_reload_split ()" + [(set (match_dup 0) + (subreg:V2DI (match_dup 1) 0))]) + (define_insn "*vec_concatv2di_0" [(set (match_operand:V2DI 0 "register_operand" "=v,v ,x") (vec_concat:V2DI @@ -26546,9 +26562,9 @@ ;; XOP packed rotate instructions (define_expand "rotl<mode>3" - [(set (match_operand:VI_128 0 "register_operand") - (rotate:VI_128 - (match_operand:VI_128 1 "nonimmediate_operand") + [(set (match_operand:VI248_128 0 "register_operand") + (rotate:VI248_128 + (match_operand:VI248_128 1 "nonimmediate_operand") (match_operand:SI 2 "general_operand")))] "TARGET_XOP" { @@ -26577,9 +26593,9 @@ }) (define_expand "rotr<mode>3" - [(set (match_operand:VI_128 0 "register_operand") - (rotatert:VI_128 - (match_operand:VI_128 1 "nonimmediate_operand") + [(set (match_operand:VI248_128 0 "register_operand") + (rotatert:VI248_128 + (match_operand:VI248_128 1 "nonimmediate_operand") (match_operand:SI 2 "general_operand")))] "TARGET_XOP" { @@ -26951,31 +26967,122 @@ int i; if (<CODE> != ASHIFT) - { - if (CONST_INT_P (operands[2])) - operands[2] = GEN_INT (-INTVAL (operands[2])); - else - negate = true; - } + { + if (CONST_INT_P (operands[2])) + operands[2] = GEN_INT (-INTVAL (operands[2])); + else + negate = true; + } par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16)); tmp = lowpart_subreg (QImode, operands[2], SImode); for (i = 0; i < 16; i++) - XVECEXP (par, 0, i) = tmp; + XVECEXP (par, 0, i) = tmp; tmp = gen_reg_rtx (V16QImode); emit_insn (gen_vec_initv16qiqi (tmp, par)); if (negate) - emit_insn (gen_negv16qi2 (tmp, tmp)); + emit_insn (gen_negv16qi2 (tmp, tmp)); gen = (<CODE> == LSHIFTRT ? 
gen_xop_shlv16qi3 : gen_xop_shav16qi3);
       emit_insn (gen (operands[0], operands[1], tmp));
     }
+  else if (TARGET_GFNI && CONST_INT_P (operands[2])
+	   && (<MODE_SIZE> == 64
+	       || !(INTVAL (operands[2]) == 7 && <CODE> == ASHIFTRT)))
+    {
+      rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2],
+						   <CODE>);
+      emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+					    const0_rtx));
+    }
   else
     ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
   DONE;
 })
 
+(define_expand "cond_<insn><mode>"
+  [(set (match_operand:VI1_AVX512VL 0 "register_operand")
+	(vec_merge:VI1_AVX512VL
+	  (any_shift:VI1_AVX512VL
+	    (match_operand:VI1_AVX512VL 2 "register_operand")
+	    (match_operand:VI1_AVX512VL 3 "const_vec_dup_operand"))
+	  (match_operand:VI1_AVX512VL 4 "nonimm_or_0_operand")
+	  (match_operand:<avx512fmaskmode> 1 "register_operand")))]
+  "TARGET_GFNI && TARGET_AVX512F"
+{
+  rtx count = XVECEXP (operands[3], 0, 0);
+  rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], count, <CODE>);
+  emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[2], matrix,
+					     const0_rtx, operands[4],
+					     operands[1]));
+  DONE;
+})
+
+(define_expand "<insn><mode>3"
+  [(set (match_operand:VI1_AVX512_3264 0 "register_operand")
+	(any_rotate:VI1_AVX512_3264
+	  (match_operand:VI1_AVX512_3264 1 "register_operand")
+	  (match_operand:SI 2 "const_int_operand")))]
+  "TARGET_GFNI"
+{
+  rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+  emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix,
+					const0_rtx));
+  DONE;
+})
+
+(define_expand "<insn>v16qi3"
+  [(set (match_operand:V16QI 0 "register_operand")
+	(any_rotate:V16QI
+	  (match_operand:V16QI 1 "nonimmediate_operand")
+	  (match_operand:SI 2 "general_operand")))]
+  "TARGET_GFNI || TARGET_XOP"
+{
+  /* Handle the V16QI XOP case to avoid a conflict with the other expand.  */
+  if (TARGET_XOP)
+    {
+      if (! const_0_to_7_operand (operands[2], SImode))
+	{
+	  rtvec vs = rtvec_alloc (16);
+	  rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+	  rtx reg = gen_reg_rtx (V16QImode);
+	  rtx op2 = operands[2];
+	  int i;
+
+	  if (GET_MODE (op2) != QImode)
+	    {
+	      op2 = gen_reg_rtx (QImode);
+	      convert_move (op2, operands[2], false);
+	    }
+
+	  for (i = 0; i < 16; i++)
+	    RTVEC_ELT (vs, i) = op2;
+
+	  emit_insn (gen_vec_initv16qiqi (reg, par));
+	  if (<CODE> == ROTATERT)
+	    {
+	      rtx neg = gen_reg_rtx (V16QImode);
+	      emit_insn (gen_negv16qi2 (neg, reg));
+	      reg = neg;
+	    }
+	  emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], reg));
+	  DONE;
+	}
+    }
+  else if (TARGET_GFNI && CONST_INT_P (operands[2]))
+    {
+      rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>);
+      emit_insn (gen_vgf2p8affineqb_v16qi (operands[0],
+					   force_reg (V16QImode, operands[1]),
+					   matrix, const0_rtx));
+      DONE;
+    }
+  else
+    FAIL;
+})
+
 (define_expand "ashrv2di3"
   [(set (match_operand:V2DI 0 "register_operand")
 	(ashiftrt:V2DI
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index c8603b9..1649ea2 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -141,6 +141,12 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   COSTS_N_BYTES (4),			/* cost of CVT(T)PS2PI instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  {1, 1, 1},				/* latency times throughput of
+					   FMA/DOT_PROD_EXPR/SAD_EXPR,
+					   it's used to determine unroll
+					   factor in the vectorizer.
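+					   All 1 here: together with the
+					   vect_unroll_limit of 1 below,
+					   the size table never suggests
+					   unrolling.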
*/ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ ix86_size_memcpy, ix86_size_memset, COSTS_N_BYTES (1), /* cond_taken_branch_cost. */ @@ -261,6 +267,12 @@ struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ i386_memcpy, i386_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -382,6 +394,12 @@ struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ i486_memcpy, i486_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -501,6 +519,12 @@ struct processor_costs pentium_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ pentium_memcpy, pentium_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -613,6 +637,12 @@ struct processor_costs lakemont_cost = { COSTS_N_INSNS (5), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ pentium_memcpy, pentium_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -740,6 +770,12 @@ struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ pentiumpro_memcpy, pentiumpro_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -858,6 +894,12 @@ struct processor_costs geode_cost = { COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ geode_memcpy, geode_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -979,6 +1021,12 @@ struct processor_costs k6_cost = { COSTS_N_INSNS (2), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (2), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. 
*/ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ k6_memcpy, k6_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -1101,6 +1149,12 @@ struct processor_costs athlon_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ athlon_memcpy, athlon_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -1232,6 +1286,12 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ k8_memcpy, k8_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -1371,6 +1431,12 @@ struct processor_costs amdfam10_cost = { COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ amdfam10_memcpy, amdfam10_memset, COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ @@ -1503,6 +1569,12 @@ const struct processor_costs bdver_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ bdver_memcpy, bdver_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -1668,6 +1740,12 @@ struct processor_costs znver1_cost = { plus/minus operations per cycle but only one multiply. This is adjusted in ix86_reassociation_width. */ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {5, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver1_memcpy, znver1_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -1836,6 +1914,12 @@ struct processor_costs znver2_cost = { plus/minus operations per cycle but only one multiply. This is adjusted in ix86_reassociation_width. */ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {10, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -1979,6 +2063,12 @@ struct processor_costs znver3_cost = { plus/minus operations per cycle but only one multiply. This is adjusted in ix86_reassociation_width. */ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. 
*/ + {8, 1, 6}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -2125,6 +2215,12 @@ struct processor_costs znver4_cost = { plus/minus operations per cycle but only one multiply. This is adjusted in ix86_reassociation_width. */ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 6}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -2287,6 +2383,12 @@ struct processor_costs znver5_cost = { We increase width to 6 for multiplications in ix86_reassociation_width. */ 6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 6}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -2422,6 +2524,12 @@ struct processor_costs skylake_cost = { COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ skylake_memcpy, skylake_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -2559,6 +2667,12 @@ struct processor_costs icelake_cost = { COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 10, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ icelake_memcpy, icelake_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -2690,6 +2804,12 @@ struct processor_costs alderlake_cost = { COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ alderlake_memcpy, alderlake_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -2814,6 +2934,12 @@ const struct processor_costs btver1_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ btver1_memcpy, btver1_memset, COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ @@ -2935,6 +3061,12 @@ const struct processor_costs btver2_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. 
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
btver2_memcpy,
btver2_memset,
COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
@@ -3055,6 +3187,12 @@ struct processor_costs pentium4_cost = {
COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
pentium4_memcpy,
pentium4_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3178,6 +3316,12 @@ struct processor_costs nocona_cost = {
COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {1, 1, 1}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
nocona_memcpy,
nocona_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3299,6 +3443,12 @@ struct processor_costs atom_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 2, /* Limit how much the autovectorizer
+ may unroll a loop. */
atom_memcpy,
atom_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3420,6 +3570,12 @@ struct processor_costs slm_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
slm_memcpy,
slm_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3555,6 +3711,12 @@ struct processor_costs tremont_cost = {
COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
tremont_memcpy,
tremont_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -3681,6 +3843,12 @@ struct processor_costs lujiazui_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
lujiazui_memcpy,
lujiazui_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -3805,6 +3973,12 @@ struct processor_costs yongfeng_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
yongfeng_memcpy,
yongfeng_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -3929,6 +4103,12 @@ struct processor_costs shijidadao_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
shijidadao_memcpy,
shijidadao_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
@@ -4078,6 +4258,12 @@ struct processor_costs generic_cost = {
COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */
1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 8, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 4, /* Limit how much the autovectorizer
+ may unroll a loop. */
generic_memcpy,
generic_memset,
COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
@@ -4215,6 +4401,12 @@ struct processor_costs core_cost = {
COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */
COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
+ {8, 1, 3}, /* latency times throughput of
+ FMA/DOT_PROD_EXPR/SAD_EXPR,
+ it's used to determine unroll
+ factor in the vectorizer. */
+ 1, /* Limit how much the autovectorizer
+ may unroll a loop. */
core_memcpy,
core_memset,
COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
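
Every hunk above follows the same pattern: each per-CPU cost table gains a
three-element array giving latency times throughput for FMA, DOT_PROD_EXPR
and SAD_EXPR, plus a cap on how far the autovectorizer may unroll a loop.
As a rough sketch of how such fields could be consumed, consider the C
fragment below; every identifier in it (costs_sketch,
vect_latency_throughput, vect_unroll_limit, suggested_unroll_factor) is a
hypothetical illustration, not the actual name used in i386.h or in the
i386 vectorizer hooks.

/* Sketch only: hypothetical stand-ins for the fields added above.  */
enum vect_op { VECT_FMA = 0, VECT_DOT_PROD = 1, VECT_SAD = 2 };

struct costs_sketch
{
  /* Latency times throughput per operation kind; roughly how many
     independent operations must be in flight to keep the unit busy.  */
  unsigned vect_latency_throughput[3];
  /* Hard cap on vectorizer loop unrolling.  */
  unsigned vect_unroll_limit;
};

/* If a vectorized loop body contains NSTMTS statements of kind OP,
   unrolling by latency*throughput / NSTMTS gives the scheduler enough
   independent work to hide the latency; the per-CPU limit clamps the
   result, and a degenerate quotient falls back to no unrolling.  */
static unsigned
suggested_unroll_factor (const struct costs_sketch *c,
                         enum vect_op op, unsigned nstmts)
{
  unsigned factor = nstmts ? c->vect_latency_throughput[op] / nstmts : 1;
  if (factor == 0)
    factor = 1;
  return factor < c->vect_unroll_limit ? factor : c->vect_unroll_limit;
}

Read this way, znver2's {10, 1, 3} with a limit of 4 would unroll a
single-FMA loop body by 4 (10 clamped to the limit), while the all-ones
entries with a limit of 1 (k6, btver1/2, pentium4, nocona) leave such
loops un-unrolled on targets where the extra parallelism cannot pay off.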