Diffstat (limited to 'gcc/config')
89 files changed, 4157 insertions, 1662 deletions
diff --git a/gcc/config/aarch64/aarch64-cc-fusion.cc b/gcc/config/aarch64/aarch64-cc-fusion.cc deleted file mode 100644 index cea54de..0000000 --- a/gcc/config/aarch64/aarch64-cc-fusion.cc +++ /dev/null @@ -1,297 +0,0 @@ -// Pass to fuse CC operations with other instructions. -// Copyright (C) 2021-2025 Free Software Foundation, Inc. -// -// This file is part of GCC. -// -// GCC is free software; you can redistribute it and/or modify it under -// the terms of the GNU General Public License as published by the Free -// Software Foundation; either version 3, or (at your option) any later -// version. -// -// GCC is distributed in the hope that it will be useful, but WITHOUT ANY -// WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License -// along with GCC; see the file COPYING3. If not see -// <http://www.gnu.org/licenses/>. - -// This pass looks for sequences of the form: -// -// A: (set (reg R1) X1) -// B: ...instructions that might change the value of X1... -// C: (set (reg CC) X2) // X2 uses R1 -// -// and tries to change them to: -// -// C': [(set (reg CC) X2') -// (set (reg R1) X1)] -// B: ...instructions that might change the value of X1... -// -// where X2' is the result of replacing R1 with X1 in X2. -// -// This sequence occurs in SVE code in two important cases: -// -// (a) Sometimes, to deal correctly with overflow, we need to increment -// an IV after a WHILELO rather than before it. In this case: -// - A is a WHILELO, -// - B includes an IV increment and -// - C is a separate PTEST. -// -// (b) ACLE code of the form: -// -// svbool_t ok = svrdffr (); -// if (svptest_last (pg, ok)) -// ... -// -// must, for performance reasons, be code-generated as: -// -// RDFFRS Pok.B, Pg/Z -// ...branch on flags result... -// -// without a separate PTEST of Pok. In this case: -// - A is an aarch64_rdffr -// - B includes an aarch64_update_ffrt -// - C is a separate PTEST -// -// Combine can handle this optimization if B doesn't exist and if A and -// C are in the same BB. This pass instead handles cases where B does -// exist and cases where A and C are in different BBs of the same EBB. - -#define IN_TARGET_CODE 1 - -#define INCLUDE_ALGORITHM -#define INCLUDE_FUNCTIONAL -#define INCLUDE_ARRAY -#include "config.h" -#include "system.h" -#include "coretypes.h" -#include "backend.h" -#include "rtl.h" -#include "df.h" -#include "rtl-ssa.h" -#include "tree-pass.h" - -using namespace rtl_ssa; - -namespace { -const pass_data pass_data_cc_fusion = -{ - RTL_PASS, // type - "cc_fusion", // name - OPTGROUP_NONE, // optinfo_flags - TV_NONE, // tv_id - 0, // properties_required - 0, // properties_provided - 0, // properties_destroyed - 0, // todo_flags_start - TODO_df_finish, // todo_flags_finish -}; - -// Class that represents one run of the pass. -class cc_fusion -{ -public: - cc_fusion () : m_parallel () {} - void execute (); - -private: - rtx optimizable_set (const insn_info *); - bool parallelize_insns (def_info *, rtx, def_info *, rtx); - void optimize_cc_setter (def_info *, rtx); - - // A spare PARALLEL rtx, or null if none. - rtx m_parallel; -}; - -// See whether INSN is a single_set that we can optimize. Return the -// set if so, otherwise return null. 
-rtx -cc_fusion::optimizable_set (const insn_info *insn) -{ - if (!insn->can_be_optimized () - || insn->is_asm () - || insn->has_volatile_refs () - || insn->has_pre_post_modify ()) - return NULL_RTX; - - return single_set (insn->rtl ()); -} - -// CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise -// a single_set that sets (only) OTHER_DEF. CC_SET is known to set the -// CC register and the instruction that contains CC_SET is known to use -// OTHER_DEF. Try to do CC_SET and OTHER_SET in parallel. -bool -cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set, - def_info *other_def, rtx other_set) -{ - auto attempt = crtl->ssa->new_change_attempt (); - - insn_info *cc_insn = cc_def->insn (); - insn_info *other_insn = other_def->insn (); - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "trying to parallelize insn %d and insn %d\n", - other_insn->uid (), cc_insn->uid ()); - - // Try to substitute OTHER_SET into CC_INSN. - insn_change_watermark rtl_watermark; - rtx_insn *cc_rtl = cc_insn->rtl (); - insn_propagation prop (cc_rtl, SET_DEST (other_set), - SET_SRC (other_set)); - if (!prop.apply_to_pattern (&PATTERN (cc_rtl)) - || prop.num_replacements == 0) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "-- failed to substitute all uses of r%d\n", - other_def->regno ()); - return false; - } - - // Restrict the uses to those outside notes. - use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ()); - use_array other_set_uses = remove_note_accesses (attempt, - other_insn->uses ()); - - // Remove the use of the substituted value. - access_array_builder uses_builder (attempt); - uses_builder.reserve (cc_uses.size ()); - for (use_info *use : cc_uses) - if (use->def () != other_def) - uses_builder.quick_push (use); - cc_uses = use_array (uses_builder.finish ()); - - // Get the list of uses for the new instruction. - insn_change cc_change (cc_insn); - cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses); - if (!cc_change.new_uses.is_valid ()) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "-- cannot merge uses\n"); - return false; - } - - // The instruction initially defines just two registers. recog can add - // extra clobbers if necessary. - auto_vec<access_info *, 2> new_defs; - new_defs.quick_push (cc_def); - new_defs.quick_push (other_def); - sort_accesses (new_defs); - cc_change.new_defs = def_array (access_array (new_defs)); - - // Make sure there is somewhere that the new instruction could live. - auto other_change = insn_change::delete_insn (other_insn); - insn_change *changes[] = { &other_change, &cc_change }; - cc_change.move_range = cc_insn->ebb ()->insn_range (); - if (!restrict_movement (cc_change, ignore_changing_insns (changes))) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "-- cannot satisfy all definitions and uses\n"); - return false; - } - - // Tentatively install the new pattern. By convention, the CC set - // must be first. - if (m_parallel) - { - XVECEXP (m_parallel, 0, 0) = cc_set; - XVECEXP (m_parallel, 0, 1) = other_set; - } - else - { - rtvec vec = gen_rtvec (2, cc_set, other_set); - m_parallel = gen_rtx_PARALLEL (VOIDmode, vec); - } - validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1); - - // These routines report failures themselves. 
- if (!recog (attempt, cc_change, ignore_changing_insns (changes)) - || !changes_are_worthwhile (changes) - || !crtl->ssa->verify_insn_changes (changes)) - return false; - - remove_reg_equal_equiv_notes (cc_rtl); - confirm_change_group (); - crtl->ssa->change_insns (changes); - m_parallel = NULL_RTX; - return true; -} - -// Try to optimize the instruction that contains CC_DEF, where CC_DEF describes -// a definition of the CC register by CC_SET. -void -cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set) -{ - // Search the registers used by the CC setter for an easily-substitutable - // def-use chain. - for (use_info *other_use : cc_def->insn ()->uses ()) - if (def_info *other_def = other_use->def ()) - if (other_use->regno () != CC_REGNUM - && other_def->ebb () == cc_def->ebb ()) - if (rtx other_set = optimizable_set (other_def->insn ())) - { - rtx dest = SET_DEST (other_set); - if (REG_P (dest) - && REGNO (dest) == other_def->regno () - && REG_NREGS (dest) == 1 - && parallelize_insns (cc_def, cc_set, other_def, other_set)) - return; - } -} - -// Run the pass on the current function. -void -cc_fusion::execute () -{ - // Initialization. - calculate_dominance_info (CDI_DOMINATORS); - df_analyze (); - crtl->ssa = new rtl_ssa::function_info (cfun); - - // Walk through all instructions that set CC. Look for a PTEST instruction - // that we can optimize. - // - // ??? The PTEST test isn't needed for correctness, but it ensures that the - // pass no effect on non-SVE code. - for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM)) - if (rtx cc_set = optimizable_set (def->insn ())) - if (REG_P (SET_DEST (cc_set)) - && REGNO (SET_DEST (cc_set)) == CC_REGNUM - && GET_CODE (SET_SRC (cc_set)) == UNSPEC - && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST) - optimize_cc_setter (def, cc_set); - - // Finalization. - crtl->ssa->perform_pending_updates (); - free_dominance_info (CDI_DOMINATORS); -} - -class pass_cc_fusion : public rtl_opt_pass -{ -public: - pass_cc_fusion (gcc::context *ctxt) - : rtl_opt_pass (pass_data_cc_fusion, ctxt) - {} - - // opt_pass methods: - virtual bool gate (function *) { return TARGET_SVE && optimize >= 2; } - virtual unsigned int execute (function *); -}; - -unsigned int -pass_cc_fusion::execute (function *) -{ - cc_fusion ().execute (); - return 0; -} - -} // end namespace - -// Create a new CC fusion pass instance. 
- -rtl_opt_pass * -make_pass_cc_fusion (gcc::context *ctxt) -{ - return new pass_cc_fusion (ctxt); -} diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def index 9cf9d3e..6a53ff3 100644 --- a/gcc/config/aarch64/aarch64-passes.def +++ b/gcc/config/aarch64/aarch64-passes.def @@ -24,6 +24,5 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation); INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_switch_pstate_sm); INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_late_track_speculation); INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti); -INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion); INSERT_PASS_BEFORE (pass_early_remat, 1, pass_ldp_fusion); INSERT_PASS_BEFORE (pass_peephole2, 1, pass_ldp_fusion); diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 36bd885..56efcf2 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -1098,6 +1098,7 @@ bool aarch64_legitimate_address_p (machine_mode, rtx, bool, aarch64_addr_query_type = ADDR_QUERY_M); machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx); rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx); +rtx aarch64_gen_compare_split_imm24 (rtx, rtx, rtx); bool aarch64_maxmin_plus_const (rtx_code, rtx *, bool); rtx aarch64_load_tp (rtx); @@ -1236,7 +1237,6 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *); rtl_opt_pass *make_pass_track_speculation (gcc::context *); rtl_opt_pass *make_pass_late_track_speculation (gcc::context *); rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt); -rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt); rtl_opt_pass *make_pass_switch_pstate_sm (gcc::context *ctxt); rtl_opt_pass *make_pass_ldp_fusion (gcc::context *); @@ -1281,4 +1281,7 @@ extern bool aarch64_gcs_enabled (); extern unsigned aarch64_data_alignment (const_tree exp, unsigned align); extern unsigned aarch64_stack_alignment (const_tree exp, unsigned align); +extern rtx aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x, + rtx_code_label *label); + #endif /* GCC_AARCH64_PROTOS_H */ diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 8b75c3d..c111dc2 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -6731,7 +6731,7 @@ (SAT_TRUNC:<VNARROWQ> (<TRUNC_SHIFT>:SD_HSDI (match_operand:SD_HSDI 1 "register_operand" "w") - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>"))))] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>"))))] "TARGET_SIMD" "<shrn_op>shrn\t%<vn2>0<Vmntype>, %<v>1<Vmtype>, %2" [(set_attr "type" "neon_shift_imm_narrow_q")] @@ -6753,7 +6753,7 @@ (ALL_TRUNC:<VNARROWQ> (<TRUNC_SHIFT>:VQN (match_operand:VQN 1 "register_operand") - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>"))))] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>"))))] "TARGET_SIMD" { operands[2] = aarch64_simd_gen_const_vector_dup (<MODE>mode, @@ -6784,7 +6784,7 @@ (<TRUNCEXTEND>:<DWI> (match_operand:SD_HSDI 1 "register_operand" "w")) (match_operand:<DWI> 3 "aarch64_int_rnd_operand")) - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>"))))] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>"))))] "TARGET_SIMD && aarch64_const_vec_rnd_cst_p (operands[3], operands[2])" "<shrn_op>rshrn\t%<vn2>0<Vmntype>, %<v>1<Vmtype>, %2" @@ -6799,7 +6799,7 @@ (<TRUNCEXTEND>:<V2XWIDE> (match_operand:SD_HSDI 1 "register_operand")) (match_dup 
3)) - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>"))))] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>"))))] "TARGET_SIMD" { /* Use this expander to create the rounding constant vector, which is @@ -6819,7 +6819,7 @@ (<TRUNCEXTEND>:<V2XWIDE> (match_operand:VQN 1 "register_operand")) (match_dup 3)) - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>"))))] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>"))))] "TARGET_SIMD" { if (<CODE> == TRUNCATE @@ -6861,7 +6861,7 @@ (smax:SD_HSDI (ashiftrt:SD_HSDI (match_operand:SD_HSDI 1 "register_operand" "w") - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>")) + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")) (const_int 0)) (const_int <half_mask>)))] "TARGET_SIMD" @@ -6872,7 +6872,7 @@ (define_expand "aarch64_sqshrun_n<mode>" [(match_operand:<VNARROWQ> 0 "register_operand") (match_operand:SD_HSDI 1 "register_operand") - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>")] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")] "TARGET_SIMD" { rtx dst = gen_reg_rtx (<MODE>mode); @@ -6890,7 +6890,7 @@ (smax:VQN (ashiftrt:VQN (match_operand:VQN 1 "register_operand") - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>")) + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")) (match_dup 3)) (match_dup 4))))] "TARGET_SIMD" @@ -6932,7 +6932,7 @@ (sign_extend:<DWI> (match_operand:SD_HSDI 1 "register_operand" "w")) (match_operand:<DWI> 3 "aarch64_int_rnd_operand")) - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>")) + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")) (const_int 0)) (const_int <half_mask>)))] "TARGET_SIMD @@ -6944,7 +6944,7 @@ (define_expand "aarch64_sqrshrun_n<mode>" [(match_operand:<VNARROWQ> 0 "register_operand") (match_operand:SD_HSDI 1 "register_operand") - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>")] + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")] "TARGET_SIMD" { int prec = GET_MODE_UNIT_PRECISION (<DWI>mode); @@ -6967,7 +6967,7 @@ (sign_extend:<V2XWIDE> (match_operand:VQN 1 "register_operand")) (match_dup 3)) - (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<ve_mode>")) + (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")) (match_dup 4)) (match_dup 5))))] "TARGET_SIMD" diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md index 6b1a747..0123ea0 100644 --- a/gcc/config/aarch64/aarch64-sme.md +++ b/gcc/config/aarch64/aarch64-sme.md @@ -400,7 +400,8 @@ auto label = gen_label_rtx (); auto tpidr2 = gen_rtx_REG (DImode, R16_REGNUM); emit_insn (gen_aarch64_read_tpidr2 (tpidr2)); - auto jump = emit_likely_jump_insn (gen_aarch64_cbznedi1 (tpidr2, label)); + auto pat = aarch64_gen_compare_zero_and_branch (NE, tpidr2, label); + auto jump = emit_likely_jump_insn (pat); JUMP_LABEL (jump) = label; aarch64_restore_za (operands[0]); diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 2dbaf4a..ef9c165 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -975,19 +975,24 @@ aarch64_cb_rhs (rtx_code op_code, rtx rhs) { case EQ: case NE: - case GT: - case GTU: case LT: case LTU: + case GE: + case GEU: + /* EQ/NE range is 0 .. 63. + LT/LTU range is 0 .. 63. + GE/GEU range is 1 .. 64 => GT x - 1, but also supports 0 via XZR. + So the intersection is 0 .. 63. 
*/ return IN_RANGE (rhs_val, 0, 63); - case GE: /* CBGE: signed greater than or equal */ - case GEU: /* CBHS: unsigned greater than or equal */ - return IN_RANGE (rhs_val, 1, 64); - - case LE: /* CBLE: signed less than or equal */ - case LEU: /* CBLS: unsigned less than or equal */ - return IN_RANGE (rhs_val, -1, 62); + case GT: + case GTU: + case LE: + case LEU: + /* GT/GTU range is 0 .. 63 + LE/LEU range is -1 .. 62 => LT x + 1. + So the intersection is 0 .. 62. */ + return IN_RANGE (rhs_val, 0, 62); default: return false; @@ -2882,10 +2887,47 @@ aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y, return aarch64_gen_compare_reg (code, x, y); } +/* Split IMM into two 12-bit halves, producing an EQ/NE comparison vs X. + TMP may be a scratch. This optimizes a sequence from + mov x0, #imm1 + movk x0, #imm2, lsl 16 -- x0 contains CST + cmp x1, x0 + into the shorter: + sub tmp, x1, #(CST & 0xfff000) + subs tmp, tmp, #(CST & 0x000fff) +*/ +rtx +aarch64_gen_compare_split_imm24 (rtx x, rtx imm, rtx tmp) +{ + HOST_WIDE_INT lo_imm = UINTVAL (imm) & 0xfff; + HOST_WIDE_INT hi_imm = UINTVAL (imm) & 0xfff000; + enum machine_mode mode = GET_MODE (x); + + if (GET_CODE (tmp) == SCRATCH) + tmp = gen_reg_rtx (mode); + + emit_insn (gen_add3_insn (tmp, x, GEN_INT (-hi_imm))); + /* TODO: We don't need the gpr result of the second insn. */ + switch (mode) + { + case SImode: + tmp = gen_addsi3_compare0 (tmp, tmp, GEN_INT (-lo_imm)); + break; + case DImode: + tmp = gen_adddi3_compare0 (tmp, tmp, GEN_INT (-lo_imm)); + break; + default: + abort (); + } + emit_insn (tmp); + + return gen_rtx_REG (CC_NZmode, CC_REGNUM); +} + /* Generate conditional branch to LABEL, comparing X to 0 using CODE. Return the jump instruction. */ -static rtx +rtx aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x, rtx_code_label *label) { @@ -14380,41 +14422,57 @@ aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed) if (GET_CODE (op1) == PC || GET_CODE (op2) == PC) { /* Conditional branch. */ - if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC) + enum machine_mode cmpmode = GET_MODE (inner); + if (GET_MODE_CLASS (cmpmode) == MODE_CC) return true; - else + + if (comparator == const0_rtx) { - if (cmpcode == NE || cmpcode == EQ) + switch (cmpcode) { - if (comparator == const0_rtx) - { - /* TBZ/TBNZ/CBZ/CBNZ. */ - if (GET_CODE (inner) == ZERO_EXTRACT) - /* TBZ/TBNZ. */ - *cost += rtx_cost (XEXP (inner, 0), VOIDmode, - ZERO_EXTRACT, 0, speed); - else - /* CBZ/CBNZ. */ - *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed); - - return true; - } - if (register_operand (inner, VOIDmode) - && aarch64_imm24 (comparator, VOIDmode)) + case NE: + case EQ: + if (cmpmode != SImode && cmpmode != DImode) + break; + if (GET_CODE (inner) == ZERO_EXTRACT) { - /* SUB and SUBS. */ - *cost += COSTS_N_INSNS (2); - if (speed) - *cost += extra_cost->alu.arith * 2; + /* TBZ/TBNZ. */ + *cost += rtx_cost (XEXP (inner, 0), VOIDmode, + ZERO_EXTRACT, 0, speed); return true; } + /* FALLTHRU */ + + case LT: + case GE: + /* CBZ/CBNZ/TBZ/TBNZ. */ + *cost += rtx_cost (inner, cmpmode, cmpcode, 0, speed); + return true; + + default: + break; } - else if (cmpcode == LT || cmpcode == GE) - { - /* TBZ/TBNZ. */ - if (comparator == const0_rtx) - return true; - } + } + + if ((cmpcode == NE || cmpcode == EQ) + && (cmpmode == SImode || cmpmode == DImode) + && aarch64_split_imm24 (comparator, cmpmode)) + { + /* SUB and SUBS. 
*/ + *cost += rtx_cost (inner, cmpmode, cmpcode, 0, speed); + *cost += COSTS_N_INSNS (2); + if (speed) + *cost += extra_cost->alu.arith * 2; + return true; + } + + if (TARGET_CMPBR) + { + *cost += rtx_cost (inner, cmpmode, cmpcode, 0, speed); + if ((cmpmode != SImode && cmpmode != DImode) + || !aarch64_cb_rhs (cmpcode, comparator)) + *cost += rtx_cost (comparator, cmpmode, cmpcode, 1, speed); + return true; } } else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC) @@ -16999,6 +17057,14 @@ private: or vector loop. There is one entry for each tuning option of interest. */ auto_vec<aarch64_vec_op_count, 2> m_ops; + + /* When doing inner-loop vectorization the constraints on the data-refs in the + outer-loop could limit the inner loop references. i.e. the outerloop can + force the inner-loop to do a load and splat which will result in the loop + being entirely scalar as all lanes work on a duplicate. Currently we don't + support unrolling of the inner loop independently from the outerloop during + outer-loop vectorization which tends to lead to pipeline bubbles. */ + bool m_loop_fully_scalar_dup = false; }; aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo, @@ -17320,13 +17386,14 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info, static bool aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info, - unsigned int vec_flags) + slp_tree node, unsigned int vec_flags) { gassign *assign = dyn_cast<gassign *> (stmt_info->stmt); if (!assign + || !node || gimple_assign_rhs_code (assign) != BIT_AND_EXPR - || !STMT_VINFO_VECTYPE (stmt_info) - || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info))) + || !SLP_TREE_VECTYPE (node) + || !VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))) return false; for (int i = 1; i < 3; ++i) @@ -17361,10 +17428,11 @@ aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info, instructions. */ static unsigned int aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, + slp_tree node, stmt_vec_info stmt_info, const sve_vec_cost *sve_costs) { - switch (vect_reduc_type (vinfo, stmt_info)) + switch (vect_reduc_type (vinfo, node)) { case EXTRACT_LAST_REDUCTION: return sve_costs->clast_cost; @@ -17404,7 +17472,9 @@ aarch64_sve_in_loop_reduction_latency (vec_info *vinfo, - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the SVE implementation. */ static unsigned int -aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info, +aarch64_in_loop_reduction_latency (vec_info *vinfo, + slp_tree node, + stmt_vec_info stmt_info, unsigned int vec_flags) { const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs; @@ -17417,7 +17487,8 @@ aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info, if (sve_costs) { unsigned int latency - = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs); + = aarch64_sve_in_loop_reduction_latency (vinfo, node, + stmt_info, sve_costs); if (latency) return latency; } @@ -17493,7 +17564,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, if (kind == scalar_load && node && sve_costs - && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) + && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) { unsigned int nunits = vect_nunits_for_cost (vectype); /* Test for VNx2 modes, which have 64-bit containers. 
*/ @@ -17507,7 +17578,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, if (kind == scalar_store && node && sve_costs - && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) + && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) return sve_costs->scatter_store_elt_cost; /* Detect cases in which vec_to_scalar represents an in-loop reduction. */ @@ -17516,7 +17587,8 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, && sve_costs) { unsigned int latency - = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs); + = aarch64_sve_in_loop_reduction_latency (vinfo, node, + stmt_info, sve_costs); if (latency) return latency; } @@ -17665,7 +17737,7 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind, /* For vector boolean ANDs with a compare operand we just need one insn. */ - if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags)) + if (aarch64_bool_compound_p (vinfo, stmt_info, node, vec_flags)) return 0; } @@ -17698,13 +17770,12 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind, with the single accumulator being read and written multiple times. */ static bool -aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info) +aarch64_force_single_cycle (vec_info *vinfo, slp_tree node) { - if (!STMT_VINFO_REDUC_DEF (stmt_info)) + auto reduc_info = info_for_reduction (as_a <loop_vec_info> (vinfo), node); + if (!reduc_info) return false; - - auto reduc_info = info_for_reduction (vinfo, stmt_info); - return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info); + return VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info); } /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost @@ -17728,8 +17799,10 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, && vect_is_reduction (stmt_info)) { unsigned int base - = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags); - if (aarch64_force_single_cycle (m_vinfo, stmt_info)) + = aarch64_in_loop_reduction_latency (m_vinfo, node, + stmt_info, m_vec_flags); + if (m_costing_for_scalar + || aarch64_force_single_cycle (m_vinfo, node)) /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector, and then accumulate that, but at the moment the loop-carried dependency includes all copies. */ @@ -17746,7 +17819,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, /* Assume that bool AND with compare operands will become a single operation. */ - if (aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags)) + if (aarch64_bool_compound_p (m_vinfo, stmt_info, node, m_vec_flags)) return; } @@ -17763,7 +17836,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, if (stmt_info && kind == vec_to_scalar && (m_vec_flags & VEC_ADVSIMD) - && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) + && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) { auto dr = STMT_VINFO_DATA_REF (stmt_info); tree dr_ref = DR_REF (dr); @@ -17842,7 +17915,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, have only accounted for one. */ if (stmt_info && (kind == vector_stmt || kind == vec_to_scalar) - && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION) + && vect_reduc_type (m_vinfo, node) == COND_REDUCTION) ops->general_ops += count; /* Count the predicate operations needed by an SVE comparison. 
*/ @@ -17878,7 +17951,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, if (stmt_info && sve_issue && (kind == scalar_load || kind == scalar_store) - && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) + && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) { unsigned int pairs = CEIL (count, 2); ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs; @@ -17987,6 +18060,17 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, tree vectype, int misalign, vect_cost_model_location where) { + /* When costing for scalars, vectype will be NULL; so look up the type via + stmt_info's statement. */ + if (m_costing_for_scalar && stmt_info) + { + gcc_assert (!vectype); + /* This won't work for e.g. gconds or other statements without a lhs, + but those only work on GPR anyway and this is the best we can do. */ + if (tree lhs = gimple_get_lhs (STMT_VINFO_STMT (stmt_info))) + vectype = TREE_TYPE (lhs); + } + fractional_cost stmt_cost = aarch64_builtin_vectorization_cost (kind, vectype, misalign); @@ -18002,6 +18086,28 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, analyze_loop_vinfo (loop_vinfo); m_analyzed_vinfo = true; + if (in_inner_loop_p) + m_loop_fully_scalar_dup = true; + } + + /* Detect whether the loop is working on fully duplicated lanes. This would + only be possible with inner loop vectorization since otherwise we wouldn't + try to vectorize. */ + if (in_inner_loop_p + && node + && m_loop_fully_scalar_dup + && SLP_TREE_LANES (node) == 1 + && !SLP_TREE_CHILDREN (node).exists ()) + { + /* Check if load is a duplicate. */ + if (gimple_vuse (stmt_info->stmt) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_INVARIANT) + ; + else if (SLP_TREE_DEF_TYPE (node) == vect_constant_def + || SLP_TREE_DEF_TYPE (node) == vect_external_def) + ; + else + m_loop_fully_scalar_dup = false; } /* Apply the heuristic described above m_stp_sequence_cost. */ @@ -18036,7 +18142,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, && node && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)) - && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) + && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) { const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve; if (sve_costs) @@ -18368,8 +18474,19 @@ adjust_body_cost (loop_vec_info loop_vinfo, if (m_vec_flags & VEC_ANY_SVE) threshold = CEIL (threshold, aarch64_estimated_sve_vq ()); - if (m_num_vector_iterations >= 1 - && m_num_vector_iterations < threshold) + /* Increase the cost of the vector code if it looks like the vector code has + limited throughput due to outer-loop vectorization. */ + if (m_loop_fully_scalar_dup) + { + body_cost *= estimated_vf; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Increasing body cost to %d because vector code has" + " low throughput of per iteration due to splats\n", + body_cost); + } + else if (m_num_vector_iterations >= 1 + && m_num_vector_iterations < threshold) { if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -31808,7 +31925,7 @@ aarch64_expand_reversed_crc_using_pmull (scalar_mode crc_mode, /* Expand the spaceship optab for floating-point operands. - If the result is compared against (-1, 0, 1 , 2), expand into + If the result is compared against (-1, 0, 1, -128), expand into fcmpe + conditional branch insns. 
Otherwise (the result is just stored as an integer), expand into @@ -31847,7 +31964,7 @@ aarch64_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx hint) emit_jump (end_label); emit_label (un_label); - emit_move_insn (dest, const2_rtx); + emit_move_insn (dest, GEN_INT (-128)); emit_jump (end_label); emit_label (gt_label); diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 096c853..2b3610c 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -410,8 +410,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED /* CSSC instructions are enabled through +cssc. */ #define TARGET_CSSC AARCH64_HAVE_ISA (CSSC) -/* CB<cc> instructions are enabled through +cmpbr. */ -#define TARGET_CMPBR AARCH64_HAVE_ISA (CMPBR) +/* CB<cc> instructions are enabled through +cmpbr, + but are incompatible with -mtrack-speculation. */ +#define TARGET_CMPBR (AARCH64_HAVE_ISA (CMPBR) && !aarch64_track_speculation) /* Make sure this is always defined so we don't have to check for ifdefs but rather use normal ifs. */ diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index dc2be81..6e215c4 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -441,6 +441,16 @@ ; must not operate on inactive inputs if doing so could induce a fault. (SVE_STRICT_GP 1)]) +;; These constants are used as a const_int in MTE instructions +(define_constants + [; 0xf0ff... + ; Tag mask for the 4-bit tag stored in the top 8 bits of a pointer. + (MEMTAG_TAG_MASK -1080863910568919041) + + ; 0x00ff... + ; Tag mask 56-bit address used by subp instruction. + (MEMTAG_ADDR_MASK 72057594037927935)]) + (include "constraints.md") (include "predicates.md") (include "iterators.md") @@ -725,8 +735,8 @@ (BRANCH_LEN_N_32KiB -32768) ;; +/- 1KiB. Used by CBB<cond>, CBH<cond>, CB<cond>. - (BRANCH_LEN_P_1Kib 1020) - (BRANCH_LEN_N_1Kib -1024) + (BRANCH_LEN_P_1KiB 1020) + (BRANCH_LEN_N_1KiB -1024) ] ) @@ -804,7 +814,7 @@ ) ;; For an EQ/NE comparison against zero, emit `CBZ`/`CBNZ` -(define_insn "aarch64_cbz<optab><mode>1" +(define_insn "*aarch64_cbz<optab><mode>" [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r") (const_int 0)) (label_ref (match_operand 1)) @@ -838,27 +848,13 @@ [(set (pc) (if_then_else (LTGE (match_operand:ALLI 0 "register_operand" "r") (const_int 0)) (label_ref (match_operand 1)) - (pc))) - (clobber (reg:CC CC_REGNUM))] + (pc)))] "!aarch64_track_speculation" { - if (get_attr_length (insn) == 8) - { - if (get_attr_far_branch (insn) == FAR_BRANCH_YES) - return aarch64_gen_far_branch (operands, 1, "Ltb", - "<inv_tb>\\t%<w>0, <sizem1>, "); - else - { - char buf[64]; - uint64_t val = ((uint64_t) 1) - << (GET_MODE_SIZE (<MODE>mode) * BITS_PER_UNIT - 1); - sprintf (buf, "tst\t%%<w>0, %" PRId64, val); - output_asm_insn (buf, operands); - return "<bcond>\t%l1"; - } - } - else + if (get_attr_length (insn) == 4) return "<tbz>\t%<w>0, <sizem1>, %l1"; + return aarch64_gen_far_branch (operands, 1, "Ltb", + "<inv_tb>\\t%<w>0, <sizem1>, "); } [(set_attr "type" "branch") (set (attr "length") @@ -870,44 +866,44 @@ (const_int 8))) (set (attr "far_branch") (if_then_else (and (ge (minus (match_dup 1) (pc)) - (const_int BRANCH_LEN_N_1MiB)) + (const_int BRANCH_LEN_N_32KiB)) (lt (minus (match_dup 1) (pc)) - (const_int BRANCH_LEN_P_1MiB))) + (const_int BRANCH_LEN_P_32KiB))) (const_string "no") (const_string "yes")))] ) ;; Emit a `CB<cond> (register)` or `CB<cond> (immediate)` instruction. 
;; The immediate range depends on the comparison code. -;; Comparisons against immediates outside this range fall back to -;; CMP + B<cond>. -(define_insn "aarch64_cb<INT_CMP:code><GPI:mode>" - [(set (pc) (if_then_else (INT_CMP - (match_operand:GPI 0 "register_operand" "r") - (match_operand:GPI 1 "nonmemory_operand" - "r<INT_CMP:cmpbr_imm_constraint>")) - (label_ref (match_operand 2)) - (pc)))] - "TARGET_CMPBR && aarch64_cb_rhs (<INT_CMP:CODE>, operands[1])" +(define_insn "*aarch64_cb<code><mode>" + [(set (pc) (if_then_else + (INT_CMP + (match_operand:GPI 0 "register_operand" "r") + (match_operand:GPI 1 + "aarch64_reg_<cmpbr_imm_constraint>_operand" + "r<cmpbr_imm_constraint>")) + (label_ref (match_operand 2)) + (pc)))] + "TARGET_CMPBR" { - return (get_attr_far_branch (insn) == FAR_BRANCH_NO) - ? "cb<INT_CMP:cmp_op>\\t%<w>0, %<w>1, %l2" - : aarch64_gen_far_branch (operands, 2, "L", - "cb<INT_CMP:inv_cmp_op>\\t%<w>0, %<w>1, "); + if (get_attr_length (insn) == 4) + return "cb<cmp_op>\t%<w>0, %<w>1, %l2"; + return aarch64_gen_far_branch (operands, 2, "L", + "cb<inv_cmp_op>\t%<w>0, %<w>1, "); } [(set_attr "type" "branch") (set (attr "length") (if_then_else (and (ge (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_N_1Kib)) + (const_int BRANCH_LEN_N_1KiB)) (lt (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_P_1Kib))) + (const_int BRANCH_LEN_P_1KiB))) (const_int 4) (const_int 8))) (set (attr "far_branch") (if_then_else (and (ge (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_N_1Kib)) + (const_int BRANCH_LEN_N_1KiB)) (lt (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_P_1Kib))) + (const_int BRANCH_LEN_P_1KiB))) (const_string "no") (const_string "yes")))] ) @@ -929,16 +925,16 @@ [(set_attr "type" "branch") (set (attr "length") (if_then_else (and (ge (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_N_1Kib)) + (const_int BRANCH_LEN_N_1KiB)) (lt (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_P_1Kib))) + (const_int BRANCH_LEN_P_1KiB))) (const_int 4) (const_int 8))) (set (attr "far_branch") (if_then_else (and (ge (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_N_1Kib)) + (const_int BRANCH_LEN_N_1KiB)) (lt (minus (match_dup 2) (pc)) - (const_int BRANCH_LEN_P_1Kib))) + (const_int BRANCH_LEN_P_1KiB))) (const_string "no") (const_string "yes")))] ) @@ -978,37 +974,24 @@ (const_string "yes")))] ) -;; For a 24-bit immediate CST we can optimize the compare for equality -;; and branch sequence from: -;; mov x0, #imm1 -;; movk x0, #imm2, lsl 16 /* x0 contains CST. */ -;; cmp x1, x0 -;; b<ne,eq> .Label -;; into the shorter: -;; sub x0, x1, #(CST & 0xfff000) -;; subs x0, x0, #(CST & 0x000fff) -;; b<ne,eq> .Label +;; For a 24-bit immediate CST we can optimize the compare for equality. 
(define_insn_and_split "*aarch64_bcond_wide_imm<GPI:mode>" - [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r") - (match_operand:GPI 1 "aarch64_imm24" "n")) - (label_ref:P (match_operand 2)) - (pc)))] - "!aarch64_move_imm (INTVAL (operands[1]), <GPI:MODE>mode) - && !aarch64_plus_operand (operands[1], <GPI:MODE>mode) - && !reload_completed" + [(set (pc) (if_then_else + (match_operator 0 "aarch64_equality_operator" + [(match_operand:GPI 1 "register_operand" "r") + (match_operand:GPI 2 "aarch64_split_imm24" "n")]) + (label_ref (match_operand 3)) + (pc))) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:GPI 4 "=r"))] + "" "#" - "&& true" + "" [(const_int 0)] { - HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff; - HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000; - rtx tmp = gen_reg_rtx (<GPI:MODE>mode); - emit_insn (gen_add<GPI:mode>3 (tmp, operands[0], GEN_INT (-hi_imm))); - emit_insn (gen_add<GPI:mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); - rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM); - rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <GPI:MODE>mode, - cc_reg, const0_rtx); - emit_jump_insn (gen_aarch64_bcond (cmp_rtx, cc_reg, operands[2])); + rtx cc_reg = aarch64_gen_compare_split_imm24 (operands[1], operands[2], + operands[4]); + emit_jump_insn (gen_aarch64_bcond (operands[0], cc_reg, operands[3])); DONE; } ) @@ -1413,16 +1396,16 @@ /* Save GCS with code like mov x16, 1 chkfeat x16 - tbnz x16, 0, .L_done + cbnz x16, .L_done mrs tmp, gcspr_el0 str tmp, [%0, 8] .L_done: */ - rtx done_label = gen_label_rtx (); + auto done_label = gen_label_rtx (); rtx r16 = gen_rtx_REG (DImode, R16_REGNUM); emit_move_insn (r16, const1_rtx); emit_insn (gen_aarch64_chkfeat ()); - emit_insn (gen_tbranch_neqi3 (r16, const0_rtx, done_label)); + emit_jump_insn (aarch64_gen_compare_zero_and_branch (NE, r16, done_label)); rtx gcs_slot = adjust_address (operands[0], Pmode, GET_MODE_SIZE (Pmode)); rtx gcs = gen_reg_rtx (Pmode); emit_insn (gen_aarch64_load_gcspr (gcs)); @@ -1445,7 +1428,7 @@ /* Restore GCS with code like mov x16, 1 chkfeat x16 - tbnz x16, 0, .L_done + cbnz x16, .L_done ldr tmp1, [%1, 8] mrs tmp2, gcspr_el0 subs tmp2, tmp1, tmp2 @@ -1456,12 +1439,12 @@ b.ne .L_loop .L_done: */ - rtx loop_label = gen_label_rtx (); - rtx done_label = gen_label_rtx (); + auto loop_label = gen_label_rtx (); + auto done_label = gen_label_rtx (); rtx r16 = gen_rtx_REG (DImode, R16_REGNUM); emit_move_insn (r16, const1_rtx); emit_insn (gen_aarch64_chkfeat ()); - emit_insn (gen_tbranch_neqi3 (r16, const0_rtx, done_label)); + emit_jump_insn (aarch64_gen_compare_zero_and_branch (NE, r16, done_label)); rtx gcs_slot = adjust_address (operands[1], Pmode, GET_MODE_SIZE (Pmode)); rtx gcs_old = gen_reg_rtx (Pmode); emit_move_insn (gcs_old, gcs_slot); @@ -4524,7 +4507,7 @@ [(set_attr "type" "fcmp<stype>")] ) -(define_insn "*cmp_swp_<shift>_reg<mode>" +(define_insn "cmp_swp_<shift>_reg<mode>" [(set (reg:CC_SWP CC_REGNUM) (compare:CC_SWP (ASHIFT:GPI (match_operand:GPI 0 "register_operand" "r") @@ -4651,39 +4634,24 @@ [(set_attr "type" "csel")] ) -;; For a 24-bit immediate CST we can optimize the compare for equality -;; and branch sequence from: -;; mov x0, #imm1 -;; movk x0, #imm2, lsl 16 /* x0 contains CST. */ -;; cmp x1, x0 -;; cset x2, <ne,eq> -;; into the shorter: -;; sub x0, x1, #(CST & 0xfff000) -;; subs x0, x0, #(CST & 0x000fff) -;; cset x2, <ne, eq>. +;; For a 24-bit immediate CST we can optimize the compare for equality. 
(define_insn_and_split "*compare_cstore<mode>_insn" [(set (match_operand:GPI 0 "register_operand" "=r") - (EQL:GPI (match_operand:GPI 1 "register_operand" "r") - (match_operand:GPI 2 "aarch64_imm24" "n"))) - (clobber (reg:CC CC_REGNUM))] - "!aarch64_move_imm (INTVAL (operands[2]), <MODE>mode) - && !aarch64_plus_operand (operands[2], <MODE>mode) - && !reload_completed" + (match_operator:GPI 1 "aarch64_equality_operator" + [(match_operand:GPI 2 "register_operand" "r") + (match_operand:GPI 3 "aarch64_split_imm24" "n")])) + (clobber (reg:CC CC_REGNUM)) + (clobber (match_scratch:GPI 4 "=r"))] + "" "#" - "&& true" + "" [(const_int 0)] { - HOST_WIDE_INT lo_imm = UINTVAL (operands[2]) & 0xfff; - HOST_WIDE_INT hi_imm = UINTVAL (operands[2]) & 0xfff000; - rtx tmp = gen_reg_rtx (<MODE>mode); - emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (-hi_imm))); - emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); - rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM); - rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx); - emit_insn (gen_aarch64_cstore<mode> (operands[0], cmp_rtx, cc_reg)); + rtx cc_reg = aarch64_gen_compare_split_imm24 (operands[2], operands[3], + operands[4]); + emit_insn (gen_aarch64_cstore<mode> (operands[0], operands[1], cc_reg)); DONE; } - [(set_attr "type" "csel")] ) ;; zero_extend version of the above @@ -4813,15 +4781,21 @@ (match_operand:ALLI 3 "register_operand")))] "" { - rtx ccreg; enum rtx_code code = GET_CODE (operands[1]); - if (code == UNEQ || code == LTGT) FAIL; - ccreg = aarch64_gen_compare_reg (code, XEXP (operands[1], 0), - XEXP (operands[1], 1)); - operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx); + rtx ccreg = XEXP (operands[1], 0); + enum machine_mode ccmode = GET_MODE (ccreg); + if (GET_MODE_CLASS (ccmode) == MODE_CC) + gcc_assert (XEXP (operands[1], 1) == const0_rtx); + else if (ccmode == QImode || ccmode == HImode) + FAIL; + else + { + ccreg = aarch64_gen_compare_reg (code, ccreg, XEXP (operands[1], 1)); + operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx); + } } ) @@ -7716,6 +7690,22 @@ } ) +(define_expand "isinf<mode>2" + [(match_operand:SI 0 "register_operand") + (match_operand:GPF 1 "register_operand")] + "TARGET_FLOAT" +{ + rtx op = force_lowpart_subreg (<V_INT_EQUIV>mode, operands[1], <MODE>mode); + rtx tmp = gen_reg_rtx (<V_INT_EQUIV>mode); + emit_move_insn (tmp, GEN_INT (HOST_WIDE_INT_M1U << (<mantissa_bits> + 1))); + rtx cc_reg = gen_rtx_REG (CC_SWPmode, CC_REGNUM); + emit_insn (gen_cmp_swp_lsl_reg<v_int_equiv> (op, GEN_INT (1), tmp)); + rtx cmp = gen_rtx_fmt_ee (EQ, SImode, cc_reg, const0_rtx); + emit_insn (gen_aarch64_cstoresi (operands[0], cmp, cc_reg)); + DONE; +} +) + ;; ------------------------------------------------------------------- ;; Reload support ;; ------------------------------------------------------------------- @@ -8566,7 +8556,7 @@ [(set (match_operand:DI 0 "register_operand" "=rk") (ior:DI (and:DI (match_operand:DI 1 "register_operand" "rk") - (const_int -1080863910568919041)) ;; 0xf0ff... + (const_int MEMTAG_TAG_MASK)) (ashift:DI (unspec:QI [(match_operand:DI 2 "register_operand" "r")] UNSPEC_GEN_TAG_RND) (const_int 56))))] @@ -8609,9 +8599,9 @@ [(set (match_operand:DI 0 "register_operand" "=r") (minus:DI (and:DI (match_operand:DI 1 "register_operand" "rk") - (const_int 72057594037927935)) ;; 0x00ff... + (const_int MEMTAG_ADDR_MASK)) (and:DI (match_operand:DI 2 "register_operand" "rk") - (const_int 72057594037927935))))] ;; 0x00ff... 
+ (const_int MEMTAG_ADDR_MASK))))] "TARGET_MEMTAG" "subp\\t%0, %1, %2" [(set_attr "type" "memtag")] @@ -8621,7 +8611,7 @@ (define_insn "ldg" [(set (match_operand:DI 0 "register_operand" "+r") (ior:DI - (and:DI (match_dup 0) (const_int -1080863910568919041)) ;; 0xf0ff... + (and:DI (match_dup 0) (const_int MEMTAG_TAG_MASK)) (ashift:DI (mem:QI (unspec:DI [(and:DI (plus:DI (match_operand:DI 1 "register_operand" "rk") diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index dc1925d..7b9e558 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -312,15 +312,9 @@ (define_constraint "Uc1" "@internal - A constraint that matches the integers 1...64." + A constraint that matches the integers 0...62." (and (match_code "const_int") - (match_test "IN_RANGE (ival, 1, 64)"))) - -(define_constraint "Uc2" - "@internal - A constraint that matches the integers -1...62." - (and (match_code "const_int") - (match_test "IN_RANGE (ival, -1, 62)"))) + (match_test "IN_RANGE (ival, 0, 62)"))) (define_constraint "Up3" "@internal diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 68b080d..7a6ea0d 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -1340,6 +1340,8 @@ (define_mode_attr half_mask [(HI "255") (SI "65535") (DI "4294967295")]) +(define_mode_attr mantissa_bits [(SF "23") (DF "52")]) + ;; For constraints used in scalar immediate vector moves (define_mode_attr hq [(HI "h") (QI "q")]) @@ -2203,7 +2205,8 @@ (SI "si")]) ;; Like ve_mode but for the half-width modes. -(define_mode_attr vn_mode [(V8HI "qi") (V4SI "hi") (V2DI "si")]) +(define_mode_attr vn_mode [(V8HI "qi") (V4SI "hi") (V2DI "si") (DI "si") + (SI "hi") (HI "qi")]) ;; Vm for lane instructions is restricted to FP_LO_REGS. (define_mode_attr vwx [(V4HI "x") (V8HI "x") (HI "x") @@ -2986,19 +2989,15 @@ (define_code_iterator INT_CMP [lt le eq ne ge gt ltu leu geu gtu]) +;; Inverse comparisons must have the same constraint so that +;; branches can be redirected during late compilation. (define_code_attr cmpbr_imm_constraint [ - (eq "Uc0") - (ne "Uc0") - (gt "Uc0") - (gtu "Uc0") - (lt "Uc0") - (ltu "Uc0") - - (ge "Uc1") - (geu "Uc1") - - (le "Uc2") - (leu "Uc2") + (eq "Uc0") (ne "Uc0") + (lt "Uc0") (ge "Uc0") + (ltu "Uc0") (geu "Uc0") + + (gt "Uc1") (le "Uc1") + (gtu "Uc1") (leu "Uc1") ]) (define_code_attr fix_trunc_optab [(fix "fix_trunc") diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md index 4d5d57f..42304ce 100644 --- a/gcc/config/aarch64/predicates.md +++ b/gcc/config/aarch64/predicates.md @@ -286,10 +286,15 @@ (and (match_code "const_int") (match_test "UINTVAL (op) <= 7"))) -;; An immediate that fits into 24 bits. -(define_predicate "aarch64_imm24" - (and (match_code "const_int") - (match_test "IN_RANGE (UINTVAL (op), 0, 0xffffff)"))) +;; An immediate that fits into 24 bits, but needs splitting. 
+(define_predicate "aarch64_split_imm24" + (match_code "const_int") +{ + unsigned HOST_WIDE_INT i = UINTVAL (op); + return (IN_RANGE (i, 0, 0xffffff) + && !aarch64_move_imm (i, mode) + && !aarch64_uimm12_shift (i)); +}) (define_predicate "aarch64_mem_pair_offset" (and (match_code "const_int") @@ -1084,3 +1089,13 @@ (define_special_predicate "aarch64_ptrue_all_operand" (and (match_code "const_vector") (match_test "aarch64_ptrue_all_mode (op) == mode"))) + +(define_predicate "aarch64_reg_Uc0_operand" + (ior (match_operand 0 "register_operand") + (and (match_code "const_int") + (match_test "satisfies_constraint_Uc0 (op)")))) + +(define_predicate "aarch64_reg_Uc1_operand" + (ior (match_operand 0 "register_operand") + (and (match_code "const_int") + (match_test "satisfies_constraint_Uc1 (op)")))) diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64 index 38a8c06..63ca8e9 100644 --- a/gcc/config/aarch64/t-aarch64 +++ b/gcc/config/aarch64/t-aarch64 @@ -190,12 +190,6 @@ aarch-bti-insert.o: $(srcdir)/config/arm/aarch-bti-insert.cc \ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/arm/aarch-bti-insert.cc -aarch64-cc-fusion.o: $(srcdir)/config/aarch64/aarch64-cc-fusion.cc \ - $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \ - $(RTL_SSA_H) tree-pass.h - $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ - $(srcdir)/config/aarch64/aarch64-cc-fusion.cc - aarch64-early-ra.o: $(srcdir)/config/aarch64/aarch64-early-ra.cc \ $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \ $(RTL_SSA_H) tree-pass.h diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index d119464..8f7e537 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -66,9 +66,9 @@ ;; I signed 12-bit immediate (for ARCompact) ;; K unsigned 3-bit immediate (for ARCompact) ;; L unsigned 6-bit immediate (for ARCompact) -;; M unsinged 5-bit immediate (for ARCompact) -;; O unsinged 7-bit immediate (for ARCompact) -;; P unsinged 8-bit immediate (for ARCompact) +;; M unsigned 5-bit immediate (for ARCompact) +;; O unsigned 7-bit immediate (for ARCompact) +;; P unsigned 8-bit immediate (for ARCompact) ;; N constant '1' (for ARCompact) diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index 537a3e2..422ae54 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -13026,7 +13026,7 @@ "arm_coproc_builtin_available (VUNSPEC_<MCRR>)" { arm_const_bounds (operands[0], 0, 16); - arm_const_bounds (operands[1], 0, 8); + arm_const_bounds (operands[1], 0, 16); arm_const_bounds (operands[3], 0, (1 << 5)); return "<mcrr>\\tp%c0, %1, %Q2, %R2, CR%c3"; } @@ -13041,7 +13041,7 @@ "arm_coproc_builtin_available (VUNSPEC_<MRRC>)" { arm_const_bounds (operands[1], 0, 16); - arm_const_bounds (operands[2], 0, 8); + arm_const_bounds (operands[2], 0, 16); arm_const_bounds (operands[3], 0, (1 << 5)); return "<mrrc>\\tp%c1, %2, %Q0, %R0, CR%c3"; } diff --git a/gcc/config/avr/specs.h b/gcc/config/avr/specs.h index ff269bf..c95c758 100644 --- a/gcc/config/avr/specs.h +++ b/gcc/config/avr/specs.h @@ -57,7 +57,7 @@ along with GCC; see the file COPYING3. If not see "%(asm_errata_skip) " #define LINK_RELAX_SPEC \ - "%{mrelax:--relax} " + "%{!r:%{mrelax:--relax}} " #undef LINK_SPEC #define LINK_SPEC \ diff --git a/gcc/config/cris/cris.h b/gcc/config/cris/cris.h index 1681c79..f356679 100644 --- a/gcc/config/cris/cris.h +++ b/gcc/config/cris/cris.h @@ -171,7 +171,7 @@ extern int cris_cpu_version; /* For the cris-*-elf subtarget. 
*/ #define CRIS_ASM_SUBTARGET_SPEC \ - "--em=criself %{!march=*:%{!mcpu=*:" CRIS_DEFAULT_ASM_ARCH_OPTION "}}" + "--emulation=criself %{!march=*:%{!mcpu=*:" CRIS_DEFAULT_ASM_ARCH_OPTION "}}" /* FIXME: We should propagate the -melf option to make the criself "emulation" unless a linker script is provided (-T*), but I don't know diff --git a/gcc/config/darwin-sections.def b/gcc/config/darwin-sections.def index 44adcc6..76587c2 100644 --- a/gcc/config/darwin-sections.def +++ b/gcc/config/darwin-sections.def @@ -215,3 +215,10 @@ DEF_SECTION (objc2_method_names_section, 0, DEF_SECTION (objc2_method_types_section, 0, ".section __TEXT, __objc_methtype, cstring_literals", 1) + +/* ASAN sections. */ + +DEF_SECTION (asan_string_section, 0, ".section __TEXT, __asan_cstring", 0) +DEF_SECTION (asan_globals_section, 0, ".section __DATA, __asan_globals", 0) +DEF_SECTION (asan_liveness_section, 0, + ".section __DATA,__asan_liveness,regular,live_support", 0) diff --git a/gcc/config/darwin.cc b/gcc/config/darwin.cc index be2daed..75ac356 100644 --- a/gcc/config/darwin.cc +++ b/gcc/config/darwin.cc @@ -49,6 +49,7 @@ along with GCC; see the file COPYING3. If not see #include "optabs.h" #include "flags.h" #include "opts.h" +#include "asan.h" /* Fix and Continue. @@ -1298,6 +1299,39 @@ darwin_encode_section_info (tree decl, rtx rtl, int first) SYMBOL_FLAG_EXTERNAL. */ default_encode_section_info (decl, rtl, first); + if (CONSTANT_CLASS_P (decl)) + { + bool is_str = TREE_CODE (decl) == STRING_CST; + rtx sym_ref = XEXP (rtl, 0); + + /* Unless this is a string cst or we are in an anchored section we have + nothing more to do here. */ + if (!is_str && !SYMBOL_REF_HAS_BLOCK_INFO_P (sym_ref)) + return; + + tree sym_decl = SYMBOL_REF_DECL (sym_ref); + const char *name = XSTR (sym_ref, 0); + gcc_checking_assert (strncmp ("*lC", name, 3) == 0); + + char *buf; + if (is_str) + { + bool for_asan = (flag_sanitize & SANITIZE_ADDRESS) + && asan_protect_global (CONST_CAST_TREE (decl)); + /* When we are generating code for sanitized strings, the string + internal symbols are made visible in the object. */ + buf = xasprintf ("*%c.str.%s", for_asan ? 'l' : 'L', &name[3]); + } + else + /* Lets identify anchored constants with a different prefix, for the + sake of inspection only. */ + buf = xasprintf ("*LaC%s", &name[3]); + if (sym_decl) + DECL_NAME (sym_decl) = get_identifier (buf); + XSTR (sym_ref, 0) = ggc_strdup (buf); + free (buf); + } + if (! VAR_OR_FUNCTION_DECL_P (decl)) return; @@ -1683,6 +1717,17 @@ machopic_select_section (tree decl, ro = TREE_READONLY (decl) || TREE_CONSTANT (decl) ; + /* Trump categorize_decl_for_section () for ASAN stuff - the Darwin + categorisations are special. 
*/ + if (flag_sanitize & SANITIZE_ADDRESS) + { + if (TREE_CODE (decl) == STRING_CST + && asan_protect_global (CONST_CAST_TREE (decl))) + { + return darwin_sections[asan_string_section]; + } + } + switch (categorize_decl_for_section (decl, reloc)) { case SECCAT_TEXT: @@ -1699,7 +1744,12 @@ machopic_select_section (tree decl, break; case SECCAT_RODATA_MERGE_STR_INIT: - base_section = darwin_mergeable_string_section (DECL_INITIAL (decl), align); + if ((flag_sanitize & SANITIZE_ADDRESS) + && asan_protect_global (CONST_CAST_TREE (decl))) + /* or !flag_merge_constants */ + return darwin_sections[asan_string_section]; + else + return darwin_mergeable_string_section (DECL_INITIAL (decl), align); break; case SECCAT_RODATA_MERGE_CONST: @@ -3297,11 +3347,16 @@ darwin_use_anchors_for_symbol_p (const_rtx symbol) { if (DARWIN_SECTION_ANCHORS && flag_section_anchors) { - section *sect; - /* If the section contains a zero-sized object it's ineligible. */ - sect = SYMBOL_REF_BLOCK (symbol)->sect; - /* This should have the effect of disabling anchors for vars that follow - any zero-sized one, in a given section. */ + tree decl = SYMBOL_REF_DECL (symbol); + /* If the symbol would be linker-visible, then it can split at that + so we must disallow. This is more strict than the default impl. + TODO: add other cases. */ + if (decl && DECL_P (decl) + && (TREE_PUBLIC (decl) || !DECL_ARTIFICIAL (decl))) + return false; + + /* We mark sections containing unsuitable entries. */ + section *sect = SYMBOL_REF_BLOCK (symbol)->sect; if (sect->common.flags & SECTION_NO_ANCHOR) return false; diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index 9b9a3fe..c3e28e2 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -287,6 +287,19 @@ extern GTY(()) int darwin_ms_struct; #define DARWIN_RDYNAMIC "%{rdynamic:%nrdynamic is not supported}" #endif +#if LD64_HAS_NO_DEDUPLICATE +/* What we want is "when the optimization level is debug OR when it is + a compile & link job with implied O0 optimization". */ +#define DARWIN_LD_NO_DEDUPLICATE \ + "%{O0|O1|O|Og: -no_deduplicate} \ + %{!O*:\ + %{.c|.cc|.C|.cpp|.cp|.c++|.cxx|.CPP|.m|.mm|.s|.S|.i|.ii|.mi|.mii|\ + .f|.for|.ftn|.fpp|.f90|.f95|.f03|.f08|.f77|.F|.F90|.F95|.F03|.F08|\ + .d|.mod: -no_deduplicate }} " +#else +#define DARWIN_LD_NO_DEDUPLICATE "" +#endif + #if LD64_HAS_MACOS_VERSION_MIN # define DARWIN_PLATFORM_ID \ "%{mmacosx-version-min=*:-macos_version_min %*} " @@ -403,10 +416,14 @@ extern GTY(()) int darwin_ms_struct; %(linker)" \ DARWIN_LD_DEMANGLE \ LINK_PLUGIN_SPEC \ + DARWIN_LD_NO_DEDUPLICATE \ "%{flto*:%<fcompare-debug*} \ %{flto} %{fno-lto} %{flto=*} \ - %l " \ + %{static}%{!static:%{!dynamic:-dynamic}} \ + %{force_cpusubtype_ALL:-arch %(darwin_arch)} \ + %{!force_cpusubtype_ALL:-arch %(darwin_subarch)} "\ DARWIN_PLATFORM_ID \ + " %l " \ LINK_COMPRESS_DEBUG_SPEC \ "%X %{s} %{t} %{Z} %{u*} \ %{e*} %{r} \ @@ -493,9 +510,8 @@ extern GTY(()) int darwin_ms_struct; Note that options taking arguments may appear multiple times on a command line with different arguments each time, so put a * after their names so all of them get passed. 
*/ -#define LINK_SPEC \ - "%{static}%{!static:%{!dynamic:-dynamic}} \ - %:remove-outfile(-ldl) \ +#define LINK_SPEC \ + "%:remove-outfile(-ldl) \ %:remove-outfile(-lm) \ %:remove-outfile(-lpthread) \ %{fgnu-runtime: %{static|static-libgcc: \ @@ -511,9 +527,7 @@ extern GTY(()) int darwin_ms_struct; %{static|static-libgm2:%:replace-outfile(-lm2iso libm2iso.a%s)}\ %{static|static-libgm2:%:replace-outfile(-lm2min libm2min.a%s)}\ %{static|static-libgm2:%:replace-outfile(-lm2log libm2log.a%s)}\ - %{static|static-libgm2:%:replace-outfile(-lm2cor libm2cor.a%s)}\ - %{force_cpusubtype_ALL:-arch %(darwin_arch)} \ - %{!force_cpusubtype_ALL:-arch %(darwin_subarch)} "\ + %{static|static-libgm2:%:replace-outfile(-lm2cor libm2cor.a%s)} "\ LINK_SYSROOT_SPEC \ "%{!multiply_defined*:%{shared-libgcc: \ %:version-compare(< 10.5 mmacosx-version-min= -multiply_defined) \ @@ -1005,6 +1019,8 @@ extern GTY(()) section * darwin_sections[NUM_DARWIN_SECTIONS]; sprintf (LABEL, "*%s%ld", "lASAN", (long)(NUM));\ else if (strcmp ("LTRAMP", PREFIX) == 0) \ sprintf (LABEL, "*%s%ld", "lTRAMP", (long)(NUM));\ + else if (strncmp ("LANCHOR", PREFIX, 7) == 0) \ + sprintf (LABEL, "*%s%ld", "lANCHOR", (long)(NUM));\ else \ sprintf (LABEL, "*%s%ld", PREFIX, (long)(NUM)); \ } while (0) diff --git a/gcc/config/h8300/addsub.md b/gcc/config/h8300/addsub.md index 32eba9d..f153625 100644 --- a/gcc/config/h8300/addsub.md +++ b/gcc/config/h8300/addsub.md @@ -271,7 +271,7 @@ (match_operand:QHSI 2 "register_operand" "r")) (match_dup 1))) (set (match_operand:QHSI 0 "register_operand" "=r") - (plus (match_dup 1) (match_dup 2))) + (plus:QHSI (match_dup 1) (match_dup 2))) (clobber (reg:CC CC_REG))] "" { diff --git a/gcc/config/h8300/jumpcall.md b/gcc/config/h8300/jumpcall.md index 4e63408..44847e4 100644 --- a/gcc/config/h8300/jumpcall.md +++ b/gcc/config/h8300/jumpcall.md @@ -156,7 +156,7 @@ "#" "&& reload_completed" [(set (reg:CCZ CC_REG) - (eq (zero_extract:HSI (match_dup 1) (const_int 1) (match_dup 2)) + (eq:CCZ (zero_extract:HSI (match_dup 1) (const_int 1) (match_dup 2)) (const_int 0))) (set (pc) (if_then_else (match_op_dup 3 [(reg:CCZ CC_REG) (const_int 0)]) @@ -181,7 +181,7 @@ (lshiftrt:SI (match_dup 1) (const_int 16)))) (clobber (reg:CC CC_REG))]) (set (reg:CCZ CC_REG) - (eq (zero_extract:SI (match_dup 4) (const_int 1) (match_dup 2)) + (eq:CCZ (zero_extract:SI (match_dup 4) (const_int 1) (match_dup 2)) (const_int 0))) (set (pc) (if_then_else (match_op_dup 3 [(reg:CCZ CC_REG) (const_int 0)]) @@ -288,7 +288,7 @@ }) (define_insn "call_insn_<mode>" - [(call (mem:QI (match_operand 0 "call_insn_operand" "Cr")) + [(call (mem:QI (match_operand:P 0 "call_insn_operand" "Cr")) (match_operand:P 1 "general_operand" "g"))] "!SIBLING_CALL_P (insn)" { @@ -326,7 +326,7 @@ (define_insn "call_value_insn_<mode>" [(set (match_operand 0 "" "=r") - (call (mem:QI (match_operand 1 "call_insn_operand" "Cr")) + (call (mem:QI (match_operand:P 1 "call_insn_operand" "Cr")) (match_operand:P 2 "general_operand" "g")))] "!SIBLING_CALL_P (insn)" { @@ -358,7 +358,7 @@ }) (define_insn "sibcall_insn_<mode>" - [(call (mem:QI (match_operand 0 "call_insn_operand" "Cr")) + [(call (mem:QI (match_operand:P 0 "call_insn_operand" "Cr")) (match_operand:P 1 "general_operand" "g"))] "SIBLING_CALL_P (insn)" { @@ -396,7 +396,7 @@ (define_insn "sibcall_value_insn_<mode>" [(set (match_operand 0 "" "=r") - (call (mem:QI (match_operand 1 "call_insn_operand" "Cr")) + (call (mem:QI (match_operand:P 1 "call_insn_operand" "Cr")) (match_operand:P 2 "general_operand" "g")))] "SIBLING_CALL_P 
(insn)" { diff --git a/gcc/config/h8300/testcompare.md b/gcc/config/h8300/testcompare.md index 694c9e6..3b43381 100644 --- a/gcc/config/h8300/testcompare.md +++ b/gcc/config/h8300/testcompare.md @@ -28,7 +28,7 @@ ;; (define_insn "" [(set (reg:CCZ CC_REG) - (eq (zero_extract:HSI (match_operand:HSI 0 "register_operand" "r") + (eq:CCZ (zero_extract:HSI (match_operand:HSI 0 "register_operand" "r") (const_int 1) (match_operand 1 "const_int_operand" "n")) (const_int 0)))] @@ -54,7 +54,7 @@ (define_insn "*tsthi_upper" [(set (reg:CCZN CC_REG) - (compare (and:HI (match_operand:HI 0 "register_operand" "r") + (compare:CCZN (and:HI (match_operand:HI 0 "register_operand" "r") (const_int -256)) (const_int 0)))] "reload_completed" @@ -63,7 +63,7 @@ (define_insn "*tsthi_upper_z" [(set (reg:CCZ CC_REG) - (compare (and:HI (match_operand:HI 0 "register_operand" "r") + (compare:CCZ (and:HI (match_operand:HI 0 "register_operand" "r") (const_int -256)) (const_int 0)))] "reload_completed" @@ -72,7 +72,7 @@ (define_insn "*tstsi_upper" [(set (reg:CCZN CC_REG) - (compare (and:SI (match_operand:SI 0 "register_operand" "r") + (compare:CCZN (and:SI (match_operand:SI 0 "register_operand" "r") (const_int -65536)) (const_int 0)))] "reload_completed" @@ -81,7 +81,7 @@ (define_insn "*cmp<mode>_c" [(set (reg:CCC CC_REG) - (ltu (match_operand:QHSI 0 "h8300_dst_operand" "rQ") + (ltu:CCC (match_operand:QHSI 0 "h8300_dst_operand" "rQ") (match_operand:QHSI 1 "h8300_src_operand" "rQi")))] "reload_completed" { @@ -97,7 +97,7 @@ (define_insn "*cmpqi_z" [(set (reg:CCZ CC_REG) - (eq (match_operand:QI 0 "h8300_dst_operand" "rQ") + (eq:CCZ (match_operand:QI 0 "h8300_dst_operand" "rQ") (match_operand:QI 1 "h8300_src_operand" "rQi")))] "reload_completed" { return "cmp.b %X1,%X0"; } @@ -105,7 +105,7 @@ (define_insn "*cmphi_z" [(set (reg:CCZ CC_REG) - (eq (match_operand:HI 0 "h8300_dst_operand" "rQ") + (eq:CCZ (match_operand:HI 0 "h8300_dst_operand" "rQ") (match_operand:HI 1 "h8300_src_operand" "rQi")))] "reload_completed" { return "cmp.w %T1,%T0"; } @@ -113,7 +113,7 @@ (define_insn "*cmpsi_z" [(set (reg:CCZ CC_REG) - (eq (match_operand:SI 0 "h8300_dst_operand" "rQ") + (eq:CCZ (match_operand:SI 0 "h8300_dst_operand" "rQ") (match_operand:SI 1 "h8300_src_operand" "rQi")))] "reload_completed" { return "cmp.l %S1,%S0"; } @@ -121,7 +121,7 @@ (define_insn "*cmpqi" [(set (reg:CC CC_REG) - (compare (match_operand:QI 0 "h8300_dst_operand" "rQ") + (compare:CC (match_operand:QI 0 "h8300_dst_operand" "rQ") (match_operand:QI 1 "h8300_src_operand" "rQi")))] "reload_completed" "cmp.b %X1,%X0" @@ -129,7 +129,7 @@ (define_insn "*cmphi" [(set (reg:CC CC_REG) - (compare (match_operand:HI 0 "h8300_dst_operand" "rU,rQ") + (compare:CC (match_operand:HI 0 "h8300_dst_operand" "rU,rQ") (match_operand:HI 1 "h8300_src_operand" "P3>X,rQi")))] "reload_completed" { @@ -150,7 +150,7 @@ (define_insn "cmpsi" [(set (reg:CC CC_REG) - (compare (match_operand:SI 0 "h8300_dst_operand" "r,rQ") + (compare:CC (match_operand:SI 0 "h8300_dst_operand" "r,rQ") (match_operand:SI 1 "h8300_src_operand" "P3>X,rQi")))] "reload_completed" { @@ -176,7 +176,7 @@ (define_peephole2 [(match_scratch:QHSI 1 "r") (set (reg:CC CC_REG) - (compare (match_operand:QHSI 0 "memory_operand" "") + (compare:CC (match_operand:QHSI 0 "memory_operand" "") (const_int 0)))] "!mode_dependent_address_p (XEXP (operands[0], 0), MEM_ADDR_SPACE (operands[0]))" [(parallel [(set (reg:CCZN CC_REG) (compare:CCZN (match_dup 0) (const_int 0))) @@ -187,7 +187,7 @@ (define_peephole2 [(match_scratch:QHSI 1 "r") (set (reg:CC 
CC_REG) - (compare (match_operand:QHSI 0 "memory_operand" "") + (compare:CC (match_operand:QHSI 0 "memory_operand" "") (const_int 0)))] "mode_dependent_address_p (XEXP (operands[0], 0), MEM_ADDR_SPACE (operands[0]))" [(parallel [(set (match_dup 1) (match_dup 0)) (clobber (reg:CC CC_REG))]) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 12cec61..3278f1f 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -3151,7 +3151,7 @@ ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) } /* Expand floating point op0 <=> op1, i.e. - dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */ + dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : -128. */ void ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2) @@ -3264,7 +3264,7 @@ ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2) if (l2) { emit_label (l2); - emit_move_insn (dest, op2 == const0_rtx ? const2_rtx : op2); + emit_move_insn (dest, op2 == const0_rtx ? GEN_INT (-128) : op2); } emit_label (lend); } @@ -8241,8 +8241,10 @@ expand_cpymem_epilogue (rtx destmem, rtx srcmem, unsigned HOST_WIDE_INT countval = UINTVAL (count); unsigned HOST_WIDE_INT epilogue_size = countval % max_size; unsigned int destalign = MEM_ALIGN (destmem); + cfun->machine->by_pieces_in_use = true; move_by_pieces (destmem, srcmem, epilogue_size, destalign, RETURN_BEGIN); + cfun->machine->by_pieces_in_use = false; return; } if (max_size > 8) @@ -8405,8 +8407,8 @@ expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, /* Callback routine for store_by_pieces. Return the RTL of a register containing GET_MODE_SIZE (MODE) bytes in the RTL register op_p which - is a word or a word vector register. If PREV_P isn't nullptr, it - has the RTL info from the previous iteration. */ + is an integer or a word vector register. If PREV_P isn't nullptr, + it has the RTL info from the previous iteration. */ static rtx setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT, @@ -8435,10 +8437,6 @@ setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT, rtx op = (rtx) op_p; machine_mode op_mode = GET_MODE (op); - gcc_assert (op_mode == word_mode - || (VECTOR_MODE_P (op_mode) - && GET_MODE_INNER (op_mode) == word_mode)); - if (VECTOR_MODE_P (mode)) { gcc_assert (GET_MODE_INNER (mode) == QImode); @@ -8460,16 +8458,17 @@ setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT, return tmp; } - target = gen_reg_rtx (word_mode); if (VECTOR_MODE_P (op_mode)) { + gcc_assert (GET_MODE_INNER (op_mode) == word_mode); + target = gen_reg_rtx (word_mode); op = gen_rtx_SUBREG (word_mode, op, 0); emit_move_insn (target, op); } else target = op; - if (mode == word_mode) + if (mode == GET_MODE (target)) return target; rtx tmp = gen_reg_rtx (mode); @@ -8490,9 +8489,11 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, unsigned HOST_WIDE_INT countval = UINTVAL (count); unsigned HOST_WIDE_INT epilogue_size = countval % max_size; unsigned int destalign = MEM_ALIGN (destmem); + cfun->machine->by_pieces_in_use = true; store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val, vec_value ? vec_value : value, destalign, true, RETURN_BEGIN); + cfun->machine->by_pieces_in_use = false; return; } if (max_size > 32) @@ -27034,6 +27035,109 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, return target; } +/* GF2P8AFFINEQB matrixes to implement shift and rotate. 
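For illustration only (this helper and its test values are mine, not part of the patch): the shift and rotate matrices defined just below can be sanity-checked with a scalar model of the per-byte GF(2) affine transform, using the byte layout that ix86_vgf2p8affine_shift_matrix (further down) expands into QImode vector elements, with imm8 = 0 as passed via const0_rtx. A minimal sketch, assuming the instruction's usual bit ordering:

#include <stdint.h>
#include <assert.h>

/* Scalar model of one byte lane of GF2P8AFFINEQB with imm8 == 0:
   result bit I is the parity of (matrix byte [7 - I] AND the source byte).  */
static uint8_t
gf2p8affine_byte (uint64_t matrix, uint8_t src)
{
  uint8_t out = 0;
  for (int i = 0; i < 8; i++)
    {
      uint8_t row = (matrix >> ((7 - i) * 8)) & 0xff;
      out |= (uint8_t) (__builtin_parity (row & src) << i);
    }
  return out;
}

int
main (void)
{
  for (int b = 0; b < 256; b++)
    {
      /* matrix_ashift[3] should behave as "<< 3" on every byte value.  */
      assert (gf2p8affine_byte (0x0000000102040810ULL, (uint8_t) b)
              == (uint8_t) (b << 3));
      /* matrix_rotate[1] should behave as a rotate left by 1.  */
      assert (gf2p8affine_byte (0x8001020408102040ULL, (uint8_t) b)
              == (uint8_t) ((b << 1) | (b >> 7)));
    }
  return 0;
}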
*/ + +static const uint64_t matrix_ashift[8] = +{ + 0, + 0x0001020408102040, /* 1 l */ + 0x0000010204081020, /* 2 l */ + 0x0000000102040810, /* 3 l */ + 0x0000000001020408, /* 4 l */ + 0x0000000000010204, /* 5 l */ + 0x0000000000000102, /* 6 l */ + 0x0000000000000001 /* 7 l */ +}; + +static const uint64_t matrix_lshiftrt[8] = +{ + 0, + 0x0204081020408000, /* 1 r */ + 0x0408102040800000, /* 2 r */ + 0x0810204080000000, /* 3 r */ + 0x1020408000000000, /* 4 r */ + 0x2040800000000000, /* 5 r */ + 0x4080000000000000, /* 6 r */ + 0x8000000000000000 /* 7 r */ +}; + +static const uint64_t matrix_ashiftrt[8] = +{ + 0, + 0x0204081020408080, /* 1 r */ + 0x0408102040808080, /* 2 r */ + 0x0810204080808080, /* 3 r */ + 0x1020408080808080, /* 4 r */ + 0x2040808080808080, /* 5 r */ + 0x4080808080808080, /* 6 r */ + 0x8080808080808080 /* 7 r */ +}; + +static const uint64_t matrix_rotate[8] = +{ + 0, + 0x8001020408102040, /* 1 rol8 */ + 0x4080010204081020, /* 2 rol8 */ + 0x2040800102040810, /* 3 rol8 */ + 0x1020408001020408, /* 4 rol8 */ + 0x0810204080010204, /* 5 rol8 */ + 0x0408102040800102, /* 6 rol8 */ + 0x0204081020408001 /* 7 rol8 */ +}; + +static const uint64_t matrix_rotatert[8] = +{ + 0, + 0x0204081020408001, /* 1 ror8 */ + 0x0408102040800102, /* 2 ror8 */ + 0x0810204080010204, /* 3 ror8 */ + 0x1020408001020408, /* 4 ror8 */ + 0x2040800102040810, /* 5 ror8 */ + 0x4080010204081020, /* 6 ror8 */ + 0x8001020408102040 /* 7 ror8 */ +}; + +/* Return rtx to load a 64bit GF2P8AFFINE GP(2) matrix implementing a shift + for CODE and shift count COUNT into register with vector of size of SRC. */ + +rtx +ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code) +{ + machine_mode mode = GET_MODE (src); + const uint64_t *matrix; + unsigned shift = INTVAL (count) & 7; + gcc_assert (shift > 0 && shift < 8); + + switch (code) + { + case ASHIFT: + matrix = matrix_ashift; + break; + case ASHIFTRT: + matrix = matrix_ashiftrt; + break; + case LSHIFTRT: + matrix = matrix_lshiftrt; + break; + case ROTATE: + matrix = matrix_rotate; + break; + case ROTATERT: + matrix = matrix_rotatert; + break; + default: + gcc_unreachable (); + } + + int nelts = GET_MODE_NUNITS (mode); + rtvec vec = rtvec_alloc (nelts); + uint64_t ma = matrix[shift]; + for (int i = 0; i < nelts; i++) + RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode); + + return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec)); +} + /* Trunc a vector to a narrow vector, like v4di -> v4si. */ void diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 9941e61..0608dd2 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -3089,10 +3089,13 @@ enum x86_cse_kind { X86_CSE_CONST0_VECTOR, X86_CSE_CONSTM1_VECTOR, - X86_CSE_VEC_DUP + X86_CSE_VEC_DUP, + X86_CSE_TLS_GD, + X86_CSE_TLS_LD_BASE, + X86_CSE_TLSDESC }; -struct redundant_load +struct redundant_pattern { /* Bitmap of basic blocks with broadcast instructions. */ auto_bitmap bbs; @@ -3100,6 +3103,8 @@ struct redundant_load auto_bitmap insns; /* The broadcast inner scalar. */ rtx val; + /* The actual redundant source value for UNSPEC_TLSDESC. */ + rtx tlsdesc_val; /* The inner scalar mode. */ machine_mode mode; /* The instruction which sets the inner scalar. 
Nullptr if the inner @@ -3130,7 +3135,7 @@ struct redundant_load static void ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs, - redundant_load *load = nullptr) + redundant_pattern *load = nullptr) { basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop @@ -3639,6 +3644,8 @@ ix86_broadcast_inner (rtx op, machine_mode mode, Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an integer constant. */ op = src; + if (mode != GET_MODE (reg)) + op = gen_int_mode (INTVAL (src), mode); *insn_p = nullptr; } else @@ -3679,25 +3686,719 @@ ix86_broadcast_inner (rtx op, machine_mode mode, return op; } -/* At entry of the nearest common dominator for basic blocks with vector - CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest - vector set instruction for all CONST0_RTX and integer CONSTM1_RTX - uses. +/* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC and + put the updated instruction in UPDATED_TLS_INSNS. */ - NB: We want to generate only a single widest vector set to cover the - whole function. The LCM algorithm isn't appropriate here since it - may place a vector set inside the loop. */ +static void +replace_tls_call (rtx src, auto_bitmap &tls_call_insns, + auto_bitmap &updated_tls_insns) +{ + bitmap_iterator bi; + unsigned int id; -static unsigned int -remove_redundant_vector_load (void) + EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi) + { + rtx_insn *insn = DF_INSN_UID_GET (id)->insn; + + /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are + allowed. */ + if (!CALL_P (insn)) + { + attr_tls64 tls64 = get_attr_tls64 (insn); + if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE) + gcc_unreachable (); + } + + rtx pat = PATTERN (insn); + gcc_assert (GET_CODE (pat) == PARALLEL); + rtx set = XVECEXP (pat, 0, 0); + gcc_assert (GET_CODE (set) == SET); + rtx dest = SET_DEST (set); + + set = gen_rtx_SET (dest, src); + rtx_insn *set_insn = emit_insn_after (set, insn); + if (recog_memoized (set_insn) < 0) + gcc_unreachable (); + + /* Put SET_INSN in UPDATED_TLS_INSNS. */ + bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn)); + + if (dump_file) + { + fprintf (dump_file, "\nReplace:\n\n"); + print_rtl_single (dump_file, insn); + fprintf (dump_file, "\nwith:\n\n"); + print_rtl_single (dump_file, set_insn); + fprintf (dump_file, "\n"); + } + + /* Delete the CALL insn. */ + delete_insn (insn); + + df_insn_rescan (set_insn); + } +} + +/* Return the basic block which dominates all basic blocks which set + hard register REGNO used in basic block BB. */ + +static basic_block +ix86_get_dominator_for_reg (unsigned int regno, basic_block bb) +{ + basic_block set_bb; + auto_bitmap set_bbs; + + /* Get all BBs which set REGNO and dominate the current BB from all + DEFs of REGNO. */ + for (df_ref def = DF_REG_DEF_CHAIN (regno); + def; + def = DF_REF_NEXT_REG (def)) + if (!DF_REF_IS_ARTIFICIAL (def) + && !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER) + && !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER)) + { + set_bb = DF_REF_BB (def); + if (dominated_by_p (CDI_DOMINATORS, bb, set_bb)) + bitmap_set_bit (set_bbs, set_bb->index); + } + + bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs); + return bb; +} + +/* Mark FLAGS register as live in DATA, a bitmap of live caller-saved + registers, if DEST is FLAGS register. 
*/ + +static void +ix86_check_flags_reg (rtx dest, const_rtx, void *data) +{ + auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data; + if (REG_P (dest) && REGNO (dest) == FLAGS_REG) + bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG); +} + +/* Emit a TLS_SET instruction of KIND in basic block BB. Store the + insertion point in *BEFORE_P for emit_insn_before or in *AFTER_P + for emit_insn_after. UPDATED_GNU_TLS_INSNS contains instructions + which replace the GNU TLS instructions. UPDATED_GNU2_TLS_INSNS + contains instructions which replace the GNU2 TLS instructions. */ + +static rtx_insn * +ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb, + rtx_insn **before_p, rtx_insn **after_p, + auto_bitmap &updated_gnu_tls_insns, + auto_bitmap &updated_gnu2_tls_insns) +{ + rtx_insn *tls_insn; + + do + { + rtx_insn *insn = BB_HEAD (bb); + while (insn && !NONDEBUG_INSN_P (insn)) + { + if (insn == BB_END (bb)) + { + /* This must be the beginning basic block: + + (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG) + + or a basic block with only a label: + + (code_label 78 11 77 3 14 (nil) [1 uses]) + (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK) + + or a basic block with only a debug marker: + + (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG) + (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil)) + + */ + gcc_assert (DEBUG_INSN_P (insn) + || (NOTE_P (insn) + && ((NOTE_KIND (insn) + == NOTE_INSN_FUNCTION_BEG) + || (NOTE_KIND (insn) + == NOTE_INSN_BASIC_BLOCK)))); + insn = NULL; + break; + } + insn = NEXT_INSN (insn); + } + + /* TLS_GD and TLS_LD_BASE instructions are normal functions which + clobber caller-saved registers. TLSDESC instructions only + clobber FLAGS. If any registers clobbered by TLS instructions + are live in this basic block, we must insert TLS instructions + after all live registers clobbered are dead. */ + + auto_bitmap live_caller_saved_regs; + bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb); + + if (bitmap_bit_p (in, FLAGS_REG)) + bitmap_set_bit (live_caller_saved_regs, FLAGS_REG); + + unsigned int i; + + /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE + instructions. */ + if (kind != X86_CSE_TLSDESC) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (call_used_regs[i] + && !fixed_regs[i] + && bitmap_bit_p (in, i)) + bitmap_set_bit (live_caller_saved_regs, i); + + if (bitmap_empty_p (live_caller_saved_regs)) + { + if (insn == BB_HEAD (bb)) + { + *before_p = insn; + tls_insn = emit_insn_before (tls_set, insn); + } + else + { + /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the + beginning basic block: + + (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG) + + or after NOTE_INSN_BASIC_BLOCK in a basic block with + only a label: + + (code_label 78 11 77 3 14 (nil) [1 uses]) + (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK) + + or after debug marker in a basic block with only a + debug marker: + + (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK) + (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG) + (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil)) + + */ + insn = insn ? PREV_INSN (insn) : BB_END (bb); + *after_p = insn; + tls_insn = emit_insn_after (tls_set, insn); + } + return tls_insn; + } + + bool repeat = false; + + /* Search for REG_DEAD notes in this basic block. 
*/ + FOR_BB_INSNS (bb, insn) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + + /* NB: Conditional jump is the only instruction which reads + flags register and changes control flow. We can never + place the TLS call after unconditional jump. */ + if (JUMP_P (insn)) + { + /* This must be a conditional jump. */ + rtx label = JUMP_LABEL (insn); + if (label == nullptr + || ANY_RETURN_P (label) + || !(LABEL_P (label) || SYMBOL_REF_P (label))) + gcc_unreachable (); + + /* Place the call before all FLAGS_REG setting BBs since + we can't place a call before nor after a conditional + jump. */ + bb = ix86_get_dominator_for_reg (FLAGS_REG, bb); + + /* Start over again. */ + repeat = true; + break; + } + + if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn))) + { + /* Insert the __tls_get_addr call before INSN which + replaces a __tls_get_addr call. */ + *before_p = insn; + tls_insn = emit_insn_before (tls_set, insn); + return tls_insn; + } + + if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn))) + { + /* Mark FLAGS register as dead since FLAGS register + would be clobbered by the GNU2 TLS instruction. */ + bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG); + continue; + } + + /* Check if FLAGS register is live. */ + note_stores (insn, ix86_check_flags_reg, + &live_caller_saved_regs); + + rtx link; + for (link = REG_NOTES (insn); link; link = XEXP (link, 1)) + if (REG_NOTE_KIND (link) == REG_DEAD + && REG_P (XEXP (link, 0))) + { + /* Mark the live caller-saved register as dead. */ + for (i = REGNO (XEXP (link, 0)); + i < END_REGNO (XEXP (link, 0)); + i++) + if (i < FIRST_PSEUDO_REGISTER) + bitmap_clear_bit (live_caller_saved_regs, i); + + if (bitmap_empty_p (live_caller_saved_regs)) + { + *after_p = insn; + tls_insn = emit_insn_after (tls_set, insn); + return tls_insn; + } + } + } + + /* NB: Start over again for conditional jump. */ + if (repeat) + continue; + + gcc_assert (!bitmap_empty_p (live_caller_saved_regs)); + + /* If any live caller-saved registers aren't dead at the end of + this basic block, get the basic block which dominates all + basic blocks which set the remaining live registers. */ + auto_bitmap set_bbs; + bitmap_iterator bi; + unsigned int id; + EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi) + { + basic_block set_bb = ix86_get_dominator_for_reg (id, bb); + bitmap_set_bit (set_bbs, set_bb->index); + } + bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs); + } + while (true); +} + +/* Generate a TLS call of KIND with VAL and copy the call result to DEST, + at entry of the nearest dominator for basic block map BBS, which is in + the fake loop that contains the whole function, so that there is only + a single TLS CALL of KIND with VAL in the whole function. + UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS + instructions. UPDATED_GNU2_TLS_INSNS contains instructions which + replace the GNU2 TLS instructions. If TLSDESC_SET isn't nullptr, + insert it before the TLS call. 
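A hedged, source-level illustration of what the TLS side of the pass is aiming at (the example and options are mine, not from the patch): with -fPIC and the global-dynamic model, each access below would otherwise materialize its own __tls_get_addr or TLS-descriptor call; the placement logic described above is meant to leave a single call at a dominating point and let every access reuse its result.

/* Assumed compilation: -O2 -fPIC -mtls-dialect=gnu (or gnu2).  */
extern __thread int counter;

int
tally (const int *p, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s += p[i] > 0 ? counter : -counter;   /* a TLS access on either branch  */
  return s + counter;                     /* and another one after the loop */
}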
*/ + +static void +ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind, + auto_bitmap &bbs, + auto_bitmap &updated_gnu_tls_insns, + auto_bitmap &updated_gnu2_tls_insns, + rtx tlsdesc_set = nullptr) +{ + basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); + while (bb->loop_father->latch + != EXIT_BLOCK_PTR_FOR_FN (cfun)) + bb = get_immediate_dominator (CDI_DOMINATORS, + bb->loop_father->header); + + rtx rax = nullptr, rdi; + rtx eqv = nullptr; + rtx caddr; + rtx set; + rtx clob; + rtx symbol; + rtx tls; + + switch (kind) + { + case X86_CSE_TLS_GD: + rax = gen_rtx_REG (Pmode, AX_REG); + rdi = gen_rtx_REG (Pmode, DI_REG); + caddr = ix86_tls_get_addr (); + + symbol = XVECEXP (val, 0, 0); + tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi); + + if (GET_MODE (symbol) != Pmode) + symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol); + eqv = symbol; + break; + + case X86_CSE_TLS_LD_BASE: + rax = gen_rtx_REG (Pmode, AX_REG); + rdi = gen_rtx_REG (Pmode, DI_REG); + caddr = ix86_tls_get_addr (); + + tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi); + + /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers + to share the LD_BASE result with other LD model accesses. */ + eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), + UNSPEC_TLS_LD_BASE); + + break; + + case X86_CSE_TLSDESC: + set = gen_rtx_SET (dest, val); + clob = gen_rtx_CLOBBER (VOIDmode, + gen_rtx_REG (CCmode, FLAGS_REG)); + tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob)); + break; + + default: + gcc_unreachable (); + } + + /* Emit the TLS CALL insn. */ + rtx_insn *before = nullptr; + rtx_insn *after = nullptr; + rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before, + &after, + updated_gnu_tls_insns, + updated_gnu2_tls_insns); + + rtx_insn *tlsdesc_insn = nullptr; + if (tlsdesc_set) + { + rtx dest = copy_rtx (SET_DEST (tlsdesc_set)); + rtx src = copy_rtx (SET_SRC (tlsdesc_set)); + tlsdesc_set = gen_rtx_SET (dest, src); + tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn); + } + + if (kind != X86_CSE_TLSDESC) + { + RTL_CONST_CALL_P (tls_insn) = 1; + + /* Indicate that this function can't jump to non-local gotos. */ + make_reg_eh_region_note_nothrow_nononlocal (tls_insn); + } + + if (recog_memoized (tls_insn) < 0) + gcc_unreachable (); + + if (dump_file) + { + if (after) + { + fprintf (dump_file, "\nPlace:\n\n"); + if (tlsdesc_insn) + print_rtl_single (dump_file, tlsdesc_insn); + print_rtl_single (dump_file, tls_insn); + fprintf (dump_file, "\nafter:\n\n"); + print_rtl_single (dump_file, after); + fprintf (dump_file, "\n"); + } + else + { + fprintf (dump_file, "\nPlace:\n\n"); + if (tlsdesc_insn) + print_rtl_single (dump_file, tlsdesc_insn); + print_rtl_single (dump_file, tls_insn); + fprintf (dump_file, "\nbefore:\n\n"); + print_rtl_single (dump_file, before); + fprintf (dump_file, "\n"); + } + } + + if (kind != X86_CSE_TLSDESC) + { + /* Copy RAX to DEST. 
*/ + set = gen_rtx_SET (dest, rax); + rtx_insn *set_insn = emit_insn_after (set, tls_insn); + set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest); + if (dump_file) + { + fprintf (dump_file, "\nPlace:\n\n"); + print_rtl_single (dump_file, set_insn); + fprintf (dump_file, "\nafter:\n\n"); + print_rtl_single (dump_file, tls_insn); + fprintf (dump_file, "\n"); + } + } +} + +namespace { + +const pass_data pass_data_x86_cse = +{ + RTL_PASS, /* type */ + "x86_cse", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_x86_cse : public rtl_opt_pass +{ +public: + pass_x86_cse (gcc::context *ctxt) + : rtl_opt_pass (pass_data_x86_cse, ctxt) + {} + + /* opt_pass methods: */ + bool gate (function *fun) final override + { + return (TARGET_SSE2 + && optimize + && optimize_function_for_speed_p (fun)); + } + + unsigned int execute (function *) final override + { + return x86_cse (); + } + +private: + /* The redundant source value. */ + rtx val; + /* The actual redundant source value for UNSPEC_TLSDESC. */ + rtx tlsdesc_val; + /* The instruction which defines the redundant value. */ + rtx_insn *def_insn; + /* Mode of the destination of the candidate redundant instruction. */ + machine_mode mode; + /* Mode of the source of the candidate redundant instruction. */ + machine_mode scalar_mode; + /* The classification of the candidate redundant instruction. */ + x86_cse_kind kind; + + unsigned int x86_cse (void); + bool candidate_gnu_tls_p (rtx_insn *, attr_tls64); + bool candidate_gnu2_tls_p (rtx, attr_tls64); + bool candidate_vector_p (rtx); + rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx); +}; // class pass_x86_cse + +/* Return the instruction which sets REG from TLS_SYMBOL. */ + +rtx_insn * +pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg, + const_rtx tls_symbol) +{ + rtx_insn *set_insn = nullptr; + for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg)); + ref; + ref = DF_REF_NEXT_REG (ref)) + { + if (DF_REF_IS_ARTIFICIAL (ref)) + return nullptr; + + set_insn = DF_REF_INSN (ref); + if (get_attr_tls64 (set_insn) != TLS64_LEA) + return nullptr; + + rtx tls_set = PATTERN (set_insn); + rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0); + if (!rtx_equal_p (tls_symbol, tls_src)) + return nullptr; + } + + return set_insn; +} + +/* Return true and output def_insn, val, mode, scalar_mode and kind if + INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. 
*/ + +bool +pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64) +{ + if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p) + return false; + + /* Record the redundant TLS CALLs for 64-bit: + + (parallel [ + (set (reg:DI 0 ax) + (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr"))) + (const_int 0 [0]))) + (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50]) + (reg/f:DI 7 sp)] UNSPEC_TLS_GD) + (clobber (reg:DI 5 di))]) + + + and + + (parallel [ + (set (reg:DI 0 ax) + (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr"))) + (const_int 0 [0]))) + (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)]) + + */ + + rtx pat = PATTERN (insn); + rtx set = XVECEXP (pat, 0, 0); + gcc_assert (GET_CODE (set) == SET); + rtx dest = SET_DEST (set); + scalar_mode = mode = GET_MODE (dest); + val = XVECEXP (pat, 0, 1); + gcc_assert (GET_CODE (val) == UNSPEC); + + if (tls64 == TLS64_GD) + kind = X86_CSE_TLS_GD; + else + kind = X86_CSE_TLS_LD_BASE; + + def_insn = nullptr; + return true; +} + +/* Return true and output def_insn, val, mode, scalar_mode and kind if + SET is UNSPEC_TLSDESC. */ + +bool +pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64) +{ + if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p) + return false; + + rtx tls_symbol; + rtx_insn *set_insn; + rtx src = SET_SRC (set); + val = src; + tlsdesc_val = src; + kind = X86_CSE_TLSDESC; + + if (tls64 == TLS64_COMBINE) + { + /* Record 64-bit TLS64_COMBINE: + + (set (reg/f:DI 104) + (plus:DI (unspec:DI [ + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) + (reg:DI 114) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC) + (const:DI (unspec:DI [ + (symbol_ref:DI ("e") [flags 0x1a]) + ] UNSPEC_DTPOFF)))) + + (set (reg/f:DI 104) + (plus:DI (unspec:DI [ + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) + (unspec:DI [ + (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10]) + ] UNSPEC_TLSDESC) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC) + (const:DI (unspec:DI [ + (symbol_ref:DI ("e") [flags 0x1a]) + ] UNSPEC_DTPOFF)))) + */ + + scalar_mode = mode = GET_MODE (src); + + /* Since the first operand of PLUS in the source TLS_COMBINE + pattern is unused, use the second operand of PLUS: + + (const:DI (unspec:DI [ + (symbol_ref:DI ("e") [flags 0x1a]) + ] UNSPEC_DTPOFF)) + + as VAL to check if 2 TLS_COMBINE patterns have the same + source. */ + val = XEXP (src, 1); + gcc_assert (GET_CODE (val) == CONST + && GET_CODE (XEXP (val, 0)) == UNSPEC + && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF + && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0))); + def_insn = nullptr; + return true; + } + + /* Record 64-bit TLS_CALL: + + (set (reg:DI 101) + (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50]) + (reg:DI 112) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC)) + + */ + + gcc_assert (GET_CODE (src) == UNSPEC); + tls_symbol = XVECEXP (src, 0, 0); + src = XVECEXP (src, 0, 1); + scalar_mode = mode = GET_MODE (src); + gcc_assert (REG_P (src)); + + /* All definitions of reg:DI 129 in + + (set (reg:DI 110) + (unspec:DI [(symbol_ref:DI ("foo")) + (reg:DI 129) + (reg/f:DI 7 sp)] UNSPEC_TLSDESC)) + + should have the same source as in + + (set (reg:DI 129) + (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC)) + + */ + + set_insn = tls_set_insn_from_symbol (src, tls_symbol); + if (!set_insn) + return false; + + /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source. */ + val = tls_symbol; + def_insn = set_insn; + return true; +} + +/* Return true and output def_insn, val, mode, scalar_mode and kind if + INSN is a vector broadcast instruction. 
*/ + +bool +pass_x86_cse::candidate_vector_p (rtx set) +{ + rtx src = SET_SRC (set); + rtx dest = SET_DEST (set); + mode = GET_MODE (dest); + /* Skip non-vector instruction. */ + if (!VECTOR_MODE_P (mode)) + return false; + + /* Skip non-vector load instruction. */ + if (!REG_P (dest) && !SUBREG_P (dest)) + return false; + + val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind, + &def_insn); + return val ? true : false; +} + +/* At entry of the nearest common dominator for basic blocks with + + 1. Vector CONST0_RTX patterns. + 2. Vector CONSTM1_RTX patterns. + 3. Vector broadcast patterns. + 4. UNSPEC_TLS_GD patterns. + 5. UNSPEC_TLS_LD_BASE patterns. + 6. UNSPEC_TLSDESC patterns. + + generate a single pattern whose destination is used to replace the + source in all identical patterns. + + NB: We want to generate a pattern, which is executed only once, to + cover the whole function. The LCM algorithm isn't appropriate here + since it may place a pattern inside the loop. */ + +unsigned int +pass_x86_cse::x86_cse (void) { timevar_push (TV_MACH_DEP); - auto_vec<redundant_load *> loads; - redundant_load *load; + auto_vec<redundant_pattern *> loads; + redundant_pattern *load; basic_block bb; rtx_insn *insn; unsigned int i; + auto_bitmap updated_gnu_tls_insns; + auto_bitmap updated_gnu2_tls_insns; df_set_flags (DF_DEFER_INSN_RESCAN); @@ -3710,61 +4411,74 @@ remove_redundant_vector_load (void) if (!NONDEBUG_INSN_P (insn)) continue; + bool matched = false; + /* Remove redundant pattens if there are more than 2 of + them. */ + unsigned int threshold = 2; + rtx set = single_set (insn); - if (!set) + if (!set && !CALL_P (insn)) continue; - /* Record single set vector instruction with CONST0_RTX and - CONSTM1_RTX source. Record basic blocks with CONST0_RTX and - CONSTM1_RTX. Count CONST0_RTX and CONSTM1_RTX. Record the - maximum size of CONST0_RTX and CONSTM1_RTX. */ + tlsdesc_val = nullptr; - rtx dest = SET_DEST (set); - machine_mode mode = GET_MODE (dest); - /* Skip non-vector instruction. */ - if (!VECTOR_MODE_P (mode)) - continue; + attr_tls64 tls64 = get_attr_tls64 (insn); + switch (tls64) + { + case TLS64_GD: + case TLS64_LD_BASE: + /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. */ + if (candidate_gnu_tls_p (insn, tls64)) + break; + continue; - rtx src = SET_SRC (set); - /* Skip non-vector load instruction. */ - if (!REG_P (dest) && !SUBREG_P (dest)) - continue; + case TLS64_CALL: + case TLS64_COMBINE: + /* Verify UNSPEC_TLSDESC. */ + if (candidate_gnu2_tls_p (set, tls64)) + break; + continue; - rtx_insn *def_insn; - machine_mode scalar_mode; - x86_cse_kind kind; - rtx val = ix86_broadcast_inner (src, mode, &scalar_mode, - &kind, &def_insn); - if (!val) - continue; + case TLS64_LEA: + /* Skip TLS64_LEA. */ + continue; - /* Remove redundant register loads if there are more than 2 - loads will be used. */ - unsigned int threshold = 2; + case TLS64_NONE: + if (!set) + continue; - /* Check if there is a matching redundant vector load. */ - bool matched = false; + /* Check for vector broadcast. */ + if (candidate_vector_p (set)) + break; + continue; + } + + /* Check if there is a matching redundant load. */ FOR_EACH_VEC_ELT (loads, i, load) if (load->val && load->kind == kind && load->mode == scalar_mode && (load->bb == bb - || kind < X86_CSE_VEC_DUP + || kind != X86_CSE_VEC_DUP /* Non all 0s/1s vector load must be in the same basic block if it is in a recursive call. */ || !recursive_call_p) && rtx_equal_p (load->val, val)) { - /* Record vector instruction. */ + /* Record instruction. 
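An assumed example of the pre-existing vector-broadcast case that now shares this machinery with the TLS kinds (the function and options are mine): both loops want a broadcast of the same scalar, and per the comment above the pass is meant to keep one broadcast at a common dominating block instead of one per loop.

/* Assumed compilation: -O2 -mavx2, with both loops auto-vectorized.  */
void
scale_both (int *a, int *b, int n, int x)
{
  for (int i = 0; i < n; i++)
    a[i] += x;          /* needs a broadcast of x */
  for (int i = 0; i < n; i++)
    b[i] -= x;          /* would otherwise broadcast x again */
}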
*/ bitmap_set_bit (load->insns, INSN_UID (insn)); /* Record the maximum vector size. */ - if (load->size < GET_MODE_SIZE (mode)) + if (kind <= X86_CSE_VEC_DUP + && load->size < GET_MODE_SIZE (mode)) load->size = GET_MODE_SIZE (mode); /* Record the basic block. */ bitmap_set_bit (load->bbs, bb->index); + + /* Increment the count. */ load->count++; + matched = true; break; } @@ -3772,10 +4486,17 @@ remove_redundant_vector_load (void) if (matched) continue; - /* We see this vector broadcast the first time. */ - load = new redundant_load; + /* We see this instruction the first time. Record the + redundant source value, its mode, the destination size, + instruction which defines the redundant source value, + instruction basic block and the instruction kind. */ + load = new redundant_pattern; load->val = copy_rtx (val); + if (tlsdesc_val) + load->tlsdesc_val = copy_rtx (tlsdesc_val); + else + load->tlsdesc_val = nullptr; load->mode = scalar_mode; load->size = GET_MODE_SIZE (mode); load->def_insn = def_insn; @@ -3792,49 +4513,64 @@ remove_redundant_vector_load (void) } bool replaced = false; - rtx reg, broadcast_source, broadcast_reg; FOR_EACH_VEC_ELT (loads, i, load) if (load->count >= load->threshold) { - machine_mode mode = ix86_get_vector_cse_mode (load->size, - load->mode); - broadcast_reg = gen_reg_rtx (mode); - if (load->def_insn) - { - /* Replace redundant vector loads with a single vector load - in the same basic block. */ - reg = load->val; - if (load->mode != GET_MODE (reg)) - reg = gen_rtx_SUBREG (load->mode, reg, 0); - broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); - replace_vector_const (mode, broadcast_reg, load->insns, - load->mode); - } - else + machine_mode mode; + rtx reg, broadcast_source, broadcast_reg; + replaced = true; + switch (load->kind) { - /* This is a constant integer/double vector. If the - inner scalar is 0 or -1, set vector to CONST0_RTX - or CONSTM1_RTX directly. */ - rtx reg; - switch (load->kind) + case X86_CSE_TLS_GD: + case X86_CSE_TLS_LD_BASE: + case X86_CSE_TLSDESC: + broadcast_reg = gen_reg_rtx (load->mode); + replace_tls_call (broadcast_reg, load->insns, + (load->kind == X86_CSE_TLSDESC + ? updated_gnu2_tls_insns + : updated_gnu_tls_insns)); + load->broadcast_reg = broadcast_reg; + break; + + case X86_CSE_CONST0_VECTOR: + case X86_CSE_CONSTM1_VECTOR: + case X86_CSE_VEC_DUP: + mode = ix86_get_vector_cse_mode (load->size, load->mode); + broadcast_reg = gen_reg_rtx (mode); + if (load->def_insn) { - case X86_CSE_CONST0_VECTOR: - broadcast_source = CONST0_RTX (mode); - break; - case X86_CSE_CONSTM1_VECTOR: - broadcast_source = CONSTM1_RTX (mode); - break; - default: - reg = gen_reg_rtx (load->mode); + /* Replace redundant vector loads with a single vector + load in the same basic block. */ + reg = load->val; + if (load->mode != GET_MODE (reg)) + reg = gen_rtx_SUBREG (load->mode, reg, 0); broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); - break; } + else + /* This is a constant integer/double vector. If the + inner scalar is 0 or -1, set vector to CONST0_RTX + or CONSTM1_RTX directly. 
*/ + switch (load->kind) + { + case X86_CSE_CONST0_VECTOR: + broadcast_source = CONST0_RTX (mode); + break; + case X86_CSE_CONSTM1_VECTOR: + broadcast_source = CONSTM1_RTX (mode); + break; + case X86_CSE_VEC_DUP: + reg = gen_reg_rtx (load->mode); + broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); + break; + default: + gcc_unreachable (); + } replace_vector_const (mode, broadcast_reg, load->insns, load->mode); + load->broadcast_source = broadcast_source; + load->broadcast_reg = broadcast_reg; + break; } - load->broadcast_source = broadcast_source; - load->broadcast_reg = broadcast_reg; - replaced = true; } if (replaced) @@ -3849,40 +4585,75 @@ remove_redundant_vector_load (void) FOR_EACH_VEC_ELT (loads, i, load) if (load->count >= load->threshold) { + rtx set; if (load->def_insn) - { - /* Insert a broadcast after the original scalar - definition. */ - rtx set = gen_rtx_SET (load->broadcast_reg, - load->broadcast_source); - insn = emit_insn_after (set, load->def_insn); - - if (cfun->can_throw_non_call_exceptions) - { - /* Handle REG_EH_REGION note in DEF_INSN. */ - rtx note = find_reg_note (load->def_insn, - REG_EH_REGION, nullptr); - if (note) - { - control_flow_insns.safe_push (load->def_insn); - add_reg_note (insn, REG_EH_REGION, - XEXP (note, 0)); - } - } + switch (load->kind) + { + case X86_CSE_TLSDESC: + ix86_place_single_tls_call (load->broadcast_reg, + load->tlsdesc_val, + load->kind, + load->bbs, + updated_gnu_tls_insns, + updated_gnu2_tls_insns, + PATTERN (load->def_insn)); + break; + case X86_CSE_VEC_DUP: + /* Insert a broadcast after the original scalar + definition. */ + set = gen_rtx_SET (load->broadcast_reg, + load->broadcast_source); + insn = emit_insn_after (set, load->def_insn); + + if (cfun->can_throw_non_call_exceptions) + { + /* Handle REG_EH_REGION note in DEF_INSN. */ + rtx note = find_reg_note (load->def_insn, + REG_EH_REGION, nullptr); + if (note) + { + control_flow_insns.safe_push (load->def_insn); + add_reg_note (insn, REG_EH_REGION, + XEXP (note, 0)); + } + } - if (dump_file) - { - fprintf (dump_file, "\nAdd:\n\n"); - print_rtl_single (dump_file, insn); - fprintf (dump_file, "\nafter:\n\n"); - print_rtl_single (dump_file, load->def_insn); - fprintf (dump_file, "\n"); - } - } + if (dump_file) + { + fprintf (dump_file, "\nAdd:\n\n"); + print_rtl_single (dump_file, insn); + fprintf (dump_file, "\nafter:\n\n"); + print_rtl_single (dump_file, load->def_insn); + fprintf (dump_file, "\n"); + } + break; + default: + gcc_unreachable (); + } else - ix86_place_single_vector_set (load->broadcast_reg, - load->broadcast_source, - load->bbs, load); + switch (load->kind) + { + case X86_CSE_TLS_GD: + case X86_CSE_TLS_LD_BASE: + case X86_CSE_TLSDESC: + ix86_place_single_tls_call (load->broadcast_reg, + (load->kind == X86_CSE_TLSDESC + ? 
load->tlsdesc_val + : load->val), + load->kind, + load->bbs, + updated_gnu_tls_insns, + updated_gnu2_tls_insns); + break; + case X86_CSE_CONST0_VECTOR: + case X86_CSE_CONSTM1_VECTOR: + case X86_CSE_VEC_DUP: + ix86_place_single_vector_set (load->broadcast_reg, + load->broadcast_source, + load->bbs, + load); + break; + } } loop_optimizer_finalize (); @@ -3912,48 +4683,12 @@ remove_redundant_vector_load (void) return 0; } -namespace { - -const pass_data pass_data_remove_redundant_vector_load = -{ - RTL_PASS, /* type */ - "rrvl", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - 0, /* todo_flags_finish */ -}; - -class pass_remove_redundant_vector_load : public rtl_opt_pass -{ -public: - pass_remove_redundant_vector_load (gcc::context *ctxt) - : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt) - {} - - /* opt_pass methods: */ - bool gate (function *fun) final override - { - return (TARGET_SSE2 - && optimize - && optimize_function_for_speed_p (fun)); - } - - unsigned int execute (function *) final override - { - return remove_redundant_vector_load (); - } -}; // class pass_remove_redundant_vector_load - } // anon namespace rtl_opt_pass * -make_pass_remove_redundant_vector_load (gcc::context *ctxt) +make_pass_x86_cse (gcc::context *ctxt) { - return new pass_remove_redundant_vector_load (ctxt); + return new pass_x86_cse (ctxt); } /* Convert legacy instructions that clobbers EFLAGS to APX_NF diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 09a35ef..abb5dd7 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -1172,6 +1172,10 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], OPT_mrecip, MASK_RECIP), + IX86_ATTR_YES ("80387", + OPT_m80387, + MASK_80387), + IX86_ATTR_IX86_YES ("general-regs-only", OPT_mgeneral_regs_only, OPTION_MASK_GENERAL_REGS_ONLY), @@ -1281,6 +1285,8 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], else if (type == ix86_opt_yes || type == ix86_opt_no) { + opts_set->x_target_flags |= mask; + if (type == ix86_opt_no) opt_set_p = !opt_set_p; @@ -3556,6 +3562,10 @@ ix86_set_current_function (tree fndecl) isa = "AVX"; else if (cfun->machine->func_type != TYPE_NORMAL) isa = "SSE"; + else if (TARGET_MMX) + isa = "MMX/3Dnow"; + else if (TARGET_80387) + isa = "80387"; else isa = NULL; } diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def index 06f0288..553b46d 100644 --- a/gcc/config/i386/i386-passes.def +++ b/gcc/config/i386/i386-passes.def @@ -35,6 +35,6 @@ along with GCC; see the file COPYING3. If not see PR116174. 
*/ INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); - INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_redundant_vector_load); + INSERT_PASS_AFTER (pass_late_combine, 1, pass_x86_cse); INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency); INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 69bc0ee..bdb8bb9 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -290,6 +290,7 @@ extern rtx ix86_tls_module_base (void); extern bool ix86_gpr_tls_address_pattern_p (rtx); extern bool ix86_tls_address_pattern_p (rtx); extern rtx ix86_rewrite_tls_address (rtx); +extern rtx ix86_tls_get_addr (void); extern void ix86_expand_vector_init (bool, rtx, rtx); extern void ix86_expand_vector_set (bool, rtx, rtx, int); @@ -430,8 +431,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area (gcc::context *); extern rtl_opt_pass *make_pass_remove_partial_avx_dependency (gcc::context *); -extern rtl_opt_pass *make_pass_remove_redundant_vector_load - (gcc::context *); +extern rtl_opt_pass *make_pass_x86_cse (gcc::context *); extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *); extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); @@ -448,3 +448,4 @@ extern void ix86_set_handled_components (sbitmap); /* In i386-expand.cc. */ bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*, HOST_WIDE_INT*); +rtx ix86_vgf2p8affine_shift_matrix (rtx, rtx, enum rtx_code); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 65e04d3..471be3e 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -11382,6 +11382,23 @@ ix86_address_cost (rtx x, machine_mode, addr_space_t, bool) return cost; } + +/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P. */ + +bool +ix86_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, + unsigned int align, + enum by_pieces_operation op, + bool speed_p) +{ + /* Return true when we are currently expanding memcpy/memset epilogue + with move_by_pieces or store_by_pieces. */ + if (cfun->machine->by_pieces_in_use) + return true; + + return default_use_by_pieces_infrastructure_p (size, align, op, + speed_p); +} /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as this is used for to form addresses to local data when -fPIC is in @@ -12439,7 +12456,7 @@ ix86_tls_index (void) static GTY(()) rtx ix86_tls_symbol; -static rtx +rtx ix86_tls_get_addr (void) { if (cfun->machine->call_saved_registers @@ -22102,6 +22119,15 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, } /* FALLTHRU */ case V32QImode: + if (TARGET_GFNI && constant_op1) + { + /* Use vgf2p8affine. One extra load for the mask, but in a loop + with enough registers it will be moved out. So for now don't + account the constant mask load. This is not quite right + for non loop vectorization. */ + extra = 0; + return ix86_vec_cost (mode, cost->sse_op) + extra; + } if (TARGET_AVX2) /* Use vpbroadcast. */ extra = cost->sse_op; @@ -22136,6 +22162,11 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, count = 9; return ix86_vec_cost (mode, cost->sse_op * count) + extra; + case V64QImode: + /* Ignore the mask load for GF2P8AFFINEQB. */ + extra = 0; + return ix86_vec_cost (mode, cost->sse_op) + extra; + case V2DImode: case V4DImode: /* V*DImode arithmetic right shift is emulated. 
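A small assumed example of the loop shape the new V32QI/V64QI shift costing has in mind (the options and the vectorization outcome are my assumption, not stated by the patch): the shift count is constant, so one GF2P8AFFINEQB matrix constant can serve every iteration and is expected to be hoisted out of the loop, which is why the mask load is not charged per statement above.

/* Assumed: -O2 -mgfni -mavx2 (or -mavx512bw for 64-byte vectors).  */
void
shr2_bytes (unsigned char *a, int n)
{
  for (int i = 0; i < n; i++)
    a[i] >>= 2;   /* per-byte constant shift, a candidate for vgf2p8affineqb */
}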
*/ @@ -25794,15 +25825,20 @@ private: unsigned m_num_sse_needed[3]; /* Number of 256-bit vector permutation. */ unsigned m_num_avx256_vec_perm[3]; + /* Number of reductions for FMA/DOT_PROD_EXPR/SAD_EXPR */ + unsigned m_num_reduc[X86_REDUC_LAST]; + /* Don't do unroll if m_prefer_unroll is false, default is true. */ + bool m_prefer_unroll; }; ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar) : vector_costs (vinfo, costing_for_scalar), m_num_gpr_needed (), m_num_sse_needed (), - m_num_avx256_vec_perm () -{ -} + m_num_avx256_vec_perm (), + m_num_reduc (), + m_prefer_unroll (true) +{} /* Implement targetm.vectorize.create_costs. */ @@ -26099,6 +26135,125 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, } } + /* Record number of load/store/gather/scatter in vectorized body. */ + if (where == vect_body && !m_costing_for_scalar) + { + switch (kind) + { + /* Emulated gather/scatter or any scalarization. */ + case scalar_load: + case scalar_stmt: + case scalar_store: + case vector_gather_load: + case vector_scatter_store: + m_prefer_unroll = false; + break; + + case vector_stmt: + case vec_to_scalar: + /* Count number of reduction FMA and "real" DOT_PROD_EXPR, + unroll in the vectorizer will enable partial sum. */ + if (stmt_info + && vect_is_reduction (stmt_info) + && stmt_info->stmt) + { + /* Handle __builtin_fma. */ + if (gimple_call_combined_fn (stmt_info->stmt) == CFN_FMA) + { + m_num_reduc[X86_REDUC_FMA] += count; + break; + } + + if (!is_gimple_assign (stmt_info->stmt)) + break; + + tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt); + machine_mode inner_mode = GET_MODE_INNER (mode); + tree rhs1, rhs2; + bool native_vnni_p = true; + gimple* def; + machine_mode mode_rhs; + switch (subcode) + { + case PLUS_EXPR: + case MINUS_EXPR: + if (!fp || !flag_associative_math + || flag_fp_contract_mode != FP_CONTRACT_FAST) + break; + + /* FMA condition for different modes. */ + if (((inner_mode == DFmode || inner_mode == SFmode) + && !TARGET_FMA && !TARGET_AVX512VL) + || (inner_mode == HFmode && !TARGET_AVX512FP16) + || (inner_mode == BFmode && !TARGET_AVX10_2)) + break; + + /* MULT_EXPR + PLUS_EXPR/MINUS_EXPR is transformed + to FMA/FNMA after vectorization. */ + rhs1 = gimple_assign_rhs1 (stmt_info->stmt); + rhs2 = gimple_assign_rhs2 (stmt_info->stmt); + if (subcode == PLUS_EXPR + && TREE_CODE (rhs1) == SSA_NAME + && (def = SSA_NAME_DEF_STMT (rhs1), true) + && is_gimple_assign (def) + && gimple_assign_rhs_code (def) == MULT_EXPR) + m_num_reduc[X86_REDUC_FMA] += count; + else if (TREE_CODE (rhs2) == SSA_NAME + && (def = SSA_NAME_DEF_STMT (rhs2), true) + && is_gimple_assign (def) + && gimple_assign_rhs_code (def) == MULT_EXPR) + m_num_reduc[X86_REDUC_FMA] += count; + break; + + /* Vectorizer lane_reducing_op_p supports DOT_PROX_EXPR, + WIDEN_SUM_EXPR and SAD_EXPR, x86 backend only supports + SAD_EXPR (usad{v16qi,v32qi,v64qi}) and DOT_PROD_EXPR. */ + case DOT_PROD_EXPR: + rhs1 = gimple_assign_rhs1 (stmt_info->stmt); + mode_rhs = TYPE_MODE (TREE_TYPE (rhs1)); + if (mode_rhs == QImode) + { + rhs2 = gimple_assign_rhs2 (stmt_info->stmt); + signop signop1_p = TYPE_SIGN (TREE_TYPE (rhs1)); + signop signop2_p = TYPE_SIGN (TREE_TYPE (rhs2)); + + /* vpdpbusd. */ + if (signop1_p != signop2_p) + native_vnni_p + = (GET_MODE_SIZE (mode) == 64 + ? TARGET_AVX512VNNI + : ((TARGET_AVX512VNNI && TARGET_AVX512VL) + || TARGET_AVXVNNI)); + else + /* vpdpbssd. */ + native_vnni_p + = (GET_MODE_SIZE (mode) == 64 + ? 
TARGET_AVX10_2 + : (TARGET_AVXVNNIINT8 || TARGET_AVX10_2)); + } + m_num_reduc[X86_REDUC_DOT_PROD] += count; + + /* Dislike to do unroll and partial sum for + emulated DOT_PROD_EXPR. */ + if (!native_vnni_p) + m_num_reduc[X86_REDUC_DOT_PROD] += 3 * count; + break; + + case SAD_EXPR: + m_num_reduc[X86_REDUC_SAD] += count; + break; + + default: + break; + } + } + + default: + break; + } + } + + combined_fn cfn; if ((kind == vector_stmt || kind == scalar_stmt) && stmt_info @@ -26161,8 +26316,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node)))) != INTEGER_CST)) - || (SLP_TREE_MEMORY_ACCESS_TYPE (node) - == VMAT_GATHER_SCATTER))))) + || mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node)))))) { stmt_cost = ix86_default_vector_cost (kind, mode); stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); @@ -26306,6 +26460,41 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ()) > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo)))) m_costs[vect_body] = INT_MAX; + + bool any_reduc_p = false; + for (int i = 0; i != X86_REDUC_LAST; i++) + if (m_num_reduc[i]) + { + any_reduc_p = true; + break; + } + + if (any_reduc_p + /* Not much gain for loop with gather and scatter. */ + && m_prefer_unroll + && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + { + unsigned unroll_factor + = OPTION_SET_P (ix86_vect_unroll_limit) + ? ix86_vect_unroll_limit + : ix86_cost->vect_unroll_limit; + + if (unroll_factor > 1) + { + for (int i = 0 ; i != X86_REDUC_LAST; i++) + { + if (m_num_reduc[i]) + { + unsigned tmp = CEIL (ix86_cost->reduc_lat_mult_thr[i], + m_num_reduc[i]); + unroll_factor = MIN (unroll_factor, tmp); + } + } + + m_suggested_unroll_factor = 1 << ceil_log2 (unroll_factor); + } + } + } ix86_vect_estimate_reg_pressure (); @@ -27189,9 +27378,9 @@ ix86_memtag_can_tag_addresses () return ix86_lam_type != lam_none && TARGET_LP64; } -/* Implement TARGET_MEMTAG_TAG_SIZE. */ +/* Implement TARGET_MEMTAG_TAG_BITSIZE. */ unsigned char -ix86_memtag_tag_size () +ix86_memtag_tag_bitsize () { return IX86_HWASAN_TAG_SIZE; } @@ -27762,6 +27951,10 @@ static const scoped_attribute_specs *const ix86_attribute_table[] = #undef TARGET_ADDRESS_COST #define TARGET_ADDRESS_COST ix86_address_cost +#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P +#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \ + ix86_use_by_pieces_infrastructure_p + #undef TARGET_OVERLAP_OP_BY_PIECES_P #define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true @@ -28165,8 +28358,8 @@ ix86_libgcc_floating_mode_supported_p #undef TARGET_MEMTAG_UNTAGGED_POINTER #define TARGET_MEMTAG_UNTAGGED_POINTER ix86_memtag_untagged_pointer -#undef TARGET_MEMTAG_TAG_SIZE -#define TARGET_MEMTAG_TAG_SIZE ix86_memtag_tag_size +#undef TARGET_MEMTAG_TAG_BITSIZE +#define TARGET_MEMTAG_TAG_BITSIZE ix86_memtag_tag_bitsize #undef TARGET_GEN_CCMP_FIRST #define TARGET_GEN_CCMP_FIRST ix86_gen_ccmp_first diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 49af963..ac0ce68 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -102,6 +102,15 @@ struct stringop_algs #define COSTS_N_BYTES(N) ((N) * 2) #endif + +enum ix86_reduc_unroll_factor{ + X86_REDUC_FMA, + X86_REDUC_DOT_PROD, + X86_REDUC_SAD, + + X86_REDUC_LAST +}; + /* Define the specific costs for a given cpu. NB: hard_register is used by TARGET_REGISTER_MOVE_COST and TARGET_MEMORY_MOVE_COST to compute hard register move costs by register allocator. 
Relative costs of @@ -225,6 +234,13 @@ struct processor_costs { to number of instructions executed in parallel. See also ix86_reassociation_width. */ + const unsigned reduc_lat_mult_thr[X86_REDUC_LAST]; + /* Latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + const unsigned vect_unroll_limit; /* Limit how much the autovectorizer + may unroll a loop. */ struct stringop_algs *memcpy, *memset; const int cond_taken_branch_cost; /* Cost of taken branch for vectorizer cost model. */ @@ -644,7 +660,8 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); {"cpu_64", "%{" OPT_ARCH64 ":%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}}" }, \ {"arch", "%{!march=*:-march=%(VALUE)}"}, \ {"arch_32", "%{" OPT_ARCH32 ":%{!march=*:-march=%(VALUE)}}"}, \ - {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"}, + {"arch_64", "%{" OPT_ARCH64 ":%{!march=*:-march=%(VALUE)}}"}, \ + {"tls", "%{!mtls-dialect=*:-mtls-dialect=%(VALUE)}"}, /* Specs for the compiler proper */ @@ -2865,6 +2882,9 @@ struct GTY(()) machine_function { approximation. */ BOOL_BITFIELD tls_descriptor_call_expanded_p : 1; + /* True if TLS descriptor is called more than once. */ + BOOL_BITFIELD tls_descriptor_call_multiple_p : 1; + /* If true, the current function has a STATIC_CHAIN is placed on the stack below the return address. */ BOOL_BITFIELD static_chain_on_stack : 1; @@ -2934,6 +2954,9 @@ struct GTY(()) machine_function { /* True if this is a recursive function. */ BOOL_BITFIELD recursive_function : 1; + /* True if by_pieces op is currently in use. */ + BOOL_BITFIELD by_pieces_in_use : 1; + /* The largest alignment, in bytes, of stack slot actually used. */ unsigned int max_used_stack_alignment; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 6686f10..cea6c15 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -901,6 +901,10 @@ (define_attr "avx_partial_xmm_update" "false,true" (const_string "false")) +;; Define attribute to indicate 64-bit TLS insns. +(define_attr "tls64" "gd,ld_base,call,combine,lea,none" + (const_string "none")) + ;; Define attribute to classify add/sub insns that consumes carry flag (CF) (define_attr "use_carry" "0,1" (const_string "0")) @@ -23153,6 +23157,7 @@ return "call\t{*%p2@GOTPCREL(%%rip)|[QWORD PTR %p2@GOTPCREL[rip]]}"; } [(set_attr "type" "multi") + (set_attr "tls64" "gd") (set (attr "length") (symbol_ref "TARGET_X32 ? 
15 : 16"))]) @@ -23191,7 +23196,11 @@ UNSPEC_TLS_GD) (clobber (match_operand:P 3 "register_operand"))])] "TARGET_64BIT" - "ix86_tls_descriptor_calls_expanded_in_cfun = true;") +{ + if (ix86_tls_descriptor_calls_expanded_in_cfun) + cfun->machine->tls_descriptor_call_multiple_p = true; + ix86_tls_descriptor_calls_expanded_in_cfun = true; +}) (define_insn "*tls_local_dynamic_base_32_gnu" [(set (match_operand:SI 0 "register_operand" "=a") @@ -23253,6 +23262,7 @@ return "call\t{*%p1@GOTPCREL(%%rip)|[QWORD PTR %p1@GOTPCREL[rip]]}"; } [(set_attr "type" "multi") + (set_attr "tls64" "ld_base") (set_attr "length" "12")]) (define_insn "*tls_local_dynamic_base_64_largepic" @@ -23286,7 +23296,11 @@ (unspec:P [(reg:P SP_REG)] UNSPEC_TLS_LD_BASE) (clobber (match_operand:P 2 "register_operand"))])] "TARGET_64BIT" - "ix86_tls_descriptor_calls_expanded_in_cfun = true;") +{ + if (ix86_tls_descriptor_calls_expanded_in_cfun) + cfun->machine->tls_descriptor_call_multiple_p = true; + ix86_tls_descriptor_calls_expanded_in_cfun = true; +}) ;; Local dynamic of a single variable is a lose. Show combine how ;; to convert that back to global dynamic. @@ -23480,6 +23494,8 @@ "TARGET_64BIT && TARGET_GNU2_TLS" { operands[2] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : operands[0]; + if (ix86_tls_descriptor_calls_expanded_in_cfun) + cfun->machine->tls_descriptor_call_multiple_p = true; ix86_tls_descriptor_calls_expanded_in_cfun = true; }) @@ -23491,6 +23507,7 @@ "lea%z0\t{%E1@TLSDESC(%%rip), %0|%0, %E1@TLSDESC[rip]}" [(set_attr "type" "lea") (set_attr "mode" "<MODE>") + (set_attr "tls64" "lea") (set_attr "length" "7") (set_attr "length_address" "4")]) @@ -23504,6 +23521,7 @@ "TARGET_64BIT && TARGET_GNU2_TLS" "call\t{*%a1@TLSCALL(%2)|[QWORD PTR [%2+%a1@TLSCALL]]}" [(set_attr "type" "call") + (set_attr "tls64" "call") (set_attr "length" "2") (set_attr "length_address" "0")]) @@ -23525,7 +23543,8 @@ { operands[4] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : operands[0]; emit_insn (gen_tls_dynamic_gnu2_64 (ptr_mode, operands[4], operands[1])); -}) +} + [(set_attr "tls64" "combine")]) (define_split [(match_operand 0 "tls_address_pattern")] diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index c93c0b1..6bda22f 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1246,6 +1246,10 @@ munroll-only-small-loops Target Var(ix86_unroll_only_small_loops) Init(0) Optimization Enable conservative small loop unrolling. +-param=ix86-vect-unroll-limit= +Target Joined UInteger Var(ix86_vect_unroll_limit) Init(4) Param +Limit how much the autovectorizer may unroll a loop. + mlam= Target RejectNegative Joined Enum(lam_type) Var(ix86_lam_type) Init(lam_none) -mlam=[none|u48|u57] Instrument meta data position in user data pointers. diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 175798c..5dbe444 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1319,6 +1319,9 @@ (ior (match_operand 0 "nonimmediate_operand") (match_test "const_vec_duplicate_p (op)"))) +(define_predicate "const_vec_dup_operand" + (match_test "const_vec_duplicate_p (op)")) + ;; Return true when OP is either register operand, or any ;; CONST_VECTOR. 
(define_predicate "reg_or_const_vector_operand" diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ec74f93..73906b8 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -326,6 +326,9 @@ (define_mode_iterator VI1_AVX512VL [V64QI (V16QI "TARGET_AVX512VL") (V32QI "TARGET_AVX512VL")]) +(define_mode_iterator VI1_AVX512_3264 + [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX")]) + ;; All vector modes (define_mode_iterator V [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI @@ -26559,9 +26562,9 @@ ;; XOP packed rotate instructions (define_expand "rotl<mode>3" - [(set (match_operand:VI_128 0 "register_operand") - (rotate:VI_128 - (match_operand:VI_128 1 "nonimmediate_operand") + [(set (match_operand:VI248_128 0 "register_operand") + (rotate:VI248_128 + (match_operand:VI248_128 1 "nonimmediate_operand") (match_operand:SI 2 "general_operand")))] "TARGET_XOP" { @@ -26590,9 +26593,9 @@ }) (define_expand "rotr<mode>3" - [(set (match_operand:VI_128 0 "register_operand") - (rotatert:VI_128 - (match_operand:VI_128 1 "nonimmediate_operand") + [(set (match_operand:VI248_128 0 "register_operand") + (rotatert:VI248_128 + (match_operand:VI248_128 1 "nonimmediate_operand") (match_operand:SI 2 "general_operand")))] "TARGET_XOP" { @@ -26964,31 +26967,122 @@ int i; if (<CODE> != ASHIFT) - { - if (CONST_INT_P (operands[2])) - operands[2] = GEN_INT (-INTVAL (operands[2])); - else - negate = true; - } + { + if (CONST_INT_P (operands[2])) + operands[2] = GEN_INT (-INTVAL (operands[2])); + else + negate = true; + } par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16)); tmp = lowpart_subreg (QImode, operands[2], SImode); for (i = 0; i < 16; i++) - XVECEXP (par, 0, i) = tmp; + XVECEXP (par, 0, i) = tmp; tmp = gen_reg_rtx (V16QImode); emit_insn (gen_vec_initv16qiqi (tmp, par)); if (negate) - emit_insn (gen_negv16qi2 (tmp, tmp)); + emit_insn (gen_negv16qi2 (tmp, tmp)); gen = (<CODE> == LSHIFTRT ? 
gen_xop_shlv16qi3 : gen_xop_shav16qi3); emit_insn (gen (operands[0], operands[1], tmp)); } + else if (TARGET_GFNI && CONST_INT_P (operands[2]) + && (<MODE_SIZE> == 64 + || !(INTVAL (operands[2]) == 7 && <CODE> == ASHIFTRT))) + { + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], + <CODE>); + emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix, + const0_rtx)); + } else ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]); DONE; }) +(define_expand "cond_<insn><mode>" + [(set (match_operand:VI1_AVX512VL 0 "register_operand") + (vec_merge:VI1_AVX512VL + (any_shift:VI1_AVX512VL + (match_operand:VI1_AVX512VL 2 "register_operand") + (match_operand:VI1_AVX512VL 3 "const_vec_dup_operand")) + (match_operand:VI1_AVX512VL 4 "nonimm_or_0_operand") + (match_operand:<avx512fmaskmode> 1 "register_operand")))] + "TARGET_GFNI && TARGET_AVX512F" +{ + rtx count = XVECEXP (operands[3], 0, 0); + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], count, <CODE>); + emit_insn (gen_vgf2p8affineqb_<mode>_mask (operands[0], operands[2], matrix, + const0_rtx, operands[4], + operands[1])); + DONE; +}) + +(define_expand "<insn><mode>3" + [(set (match_operand:VI1_AVX512_3264 0 "register_operand") + (any_rotate:VI1_AVX512_3264 + (match_operand:VI1_AVX512_3264 1 "register_operand") + (match_operand:SI 2 "const_int_operand")))] + "TARGET_GFNI" +{ + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>); + emit_insn (gen_vgf2p8affineqb_<mode> (operands[0], operands[1], matrix, + const0_rtx)); + DONE; +}) + +(define_expand "<insn>v16qi3" + [(set (match_operand:V16QI 0 "register_operand") + (any_rotate:V16QI + (match_operand:V16QI 1 "nonimmediate_operand") + (match_operand:SI 2 "general_operand")))] + "TARGET_GFNI || TARGET_XOP" +{ + /* Handle the V16QI XOP case to avoid a conflict with the other expand. */ + if (TARGET_XOP) + { + if (! const_0_to_7_operand (operands[2], SImode)) + { + rtvec vs = rtvec_alloc (16); + rtx par = gen_rtx_PARALLEL (V16QImode, vs); + rtx reg = gen_reg_rtx (V16QImode); + rtx op2 = operands[2]; + int i; + + if (GET_MODE (op2) != QImode) + { + op2 = gen_reg_rtx (QImode); + convert_move (op2, operands[2], false); + } + + for (i = 0; i < 16; i++) + RTVEC_ELT (vs, i) = op2; + + emit_insn (gen_vec_initv16qiqi (reg, par)); + if (<CODE> == ROTATERT) + { + rtx neg = gen_reg_rtx (V16QImode); + emit_insn (gen_negv16qi2 (neg, reg)); + emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], neg)); + reg = neg; + } + emit_insn (gen_xop_vrotlv16qi3 (operands[0], operands[1], reg)); + DONE; + } + } + else if (TARGET_GFNI && CONST_INT_P (operands[2])) + { + rtx matrix = ix86_vgf2p8affine_shift_matrix (operands[0], operands[2], <CODE>); + emit_insn (gen_vgf2p8affineqb_v16qi (operands[0], + force_reg (V16QImode, operands[1]), + matrix, const0_rtx)); + DONE; + } + else + FAIL; +}) + (define_expand "ashrv2di3" [(set (match_operand:V2DI 0 "register_operand") (ashiftrt:V2DI diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index c8603b9..1649ea2 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -141,6 +141,12 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ COSTS_N_BYTES (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. 
*/ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ ix86_size_memcpy, ix86_size_memset, COSTS_N_BYTES (1), /* cond_taken_branch_cost. */ @@ -261,6 +267,12 @@ struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ i386_memcpy, i386_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -382,6 +394,12 @@ struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (27), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (27), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ i486_memcpy, i486_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -501,6 +519,12 @@ struct processor_costs pentium_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ pentium_memcpy, pentium_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -613,6 +637,12 @@ struct processor_costs lakemont_cost = { COSTS_N_INSNS (5), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ pentium_memcpy, pentium_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -740,6 +770,12 @@ struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ pentiumpro_memcpy, pentiumpro_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -858,6 +894,12 @@ struct processor_costs geode_cost = { COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ geode_memcpy, geode_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -979,6 +1021,12 @@ struct processor_costs k6_cost = { COSTS_N_INSNS (2), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (2), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. 
*/ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ k6_memcpy, k6_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -1101,6 +1149,12 @@ struct processor_costs athlon_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ athlon_memcpy, athlon_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -1232,6 +1286,12 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (5), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ k8_memcpy, k8_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -1371,6 +1431,12 @@ struct processor_costs amdfam10_cost = { COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ amdfam10_memcpy, amdfam10_memset, COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ @@ -1503,6 +1569,12 @@ const struct processor_costs bdver_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ bdver_memcpy, bdver_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -1668,6 +1740,12 @@ struct processor_costs znver1_cost = { plus/minus operations per cycle but only one multiply. This is adjusted in ix86_reassociation_width. */ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {5, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver1_memcpy, znver1_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -1836,6 +1914,12 @@ struct processor_costs znver2_cost = { plus/minus operations per cycle but only one multiply. This is adjusted in ix86_reassociation_width. */ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {10, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -1979,6 +2063,12 @@ struct processor_costs znver3_cost = { plus/minus operations per cycle but only one multiply. This is adjusted in ix86_reassociation_width. */ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. 
*/ + {8, 1, 6}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -2125,6 +2215,12 @@ struct processor_costs znver4_cost = { plus/minus operations per cycle but only one multiply. This is adjusted in ix86_reassociation_width. */ 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 6}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -2287,6 +2383,12 @@ struct processor_costs znver5_cost = { We increase width to 6 for multiplications in ix86_reassociation_width. */ 6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 6}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -2422,6 +2524,12 @@ struct processor_costs skylake_cost = { COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ skylake_memcpy, skylake_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -2559,6 +2667,12 @@ struct processor_costs icelake_cost = { COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 10, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ icelake_memcpy, icelake_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -2690,6 +2804,12 @@ struct processor_costs alderlake_cost = { COSTS_N_INSNS (7), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (6), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ alderlake_memcpy, alderlake_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -2814,6 +2934,12 @@ const struct processor_costs btver1_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ btver1_memcpy, btver1_memset, COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ @@ -2935,6 +3061,12 @@ const struct processor_costs btver2_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. 
*/ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ btver2_memcpy, btver2_memset, COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ @@ -3055,6 +3187,12 @@ struct processor_costs pentium4_cost = { COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ pentium4_memcpy, pentium4_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -3178,6 +3316,12 @@ struct processor_costs nocona_cost = { COSTS_N_INSNS (12), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (8), /* cost of CVT(T)PS2PI instruction. */ 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {1, 1, 1}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ nocona_memcpy, nocona_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -3299,6 +3443,12 @@ struct processor_costs atom_cost = { COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 2, /* Limit how much the autovectorizer + may unroll a loop. */ atom_memcpy, atom_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -3420,6 +3570,12 @@ struct processor_costs slm_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ slm_memcpy, slm_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -3555,6 +3711,12 @@ struct processor_costs tremont_cost = { COSTS_N_INSNS (4), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (4), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ tremont_memcpy, tremont_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -3681,6 +3843,12 @@ struct processor_costs lujiazui_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ lujiazui_memcpy, lujiazui_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -3805,6 +3973,12 @@ struct processor_costs yongfeng_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. 
*/ 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ yongfeng_memcpy, yongfeng_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -3929,6 +4103,12 @@ struct processor_costs shijidadao_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ shijidadao_memcpy, shijidadao_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ @@ -4078,6 +4258,12 @@ struct processor_costs generic_cost = { COSTS_N_INSNS (3), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (3), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 8, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 4, /* Limit how much the autovectorizer + may unroll a loop. */ generic_memcpy, generic_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ @@ -4215,6 +4401,12 @@ struct processor_costs core_cost = { COSTS_N_INSNS (6), /* cost of CVTPI2PS instruction. */ COSTS_N_INSNS (7), /* cost of CVT(T)PS2PI instruction. */ 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ + {8, 1, 3}, /* latency times throughput of + FMA/DOT_PROD_EXPR/SAD_EXPR, + it's used to determine unroll + factor in the vectorizer. */ + 1, /* Limit how much the autovectorizer + may unroll a loop. */ core_memcpy, core_memset, COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ diff --git a/gcc/config/loongarch/genopts/isa-evolution.in b/gcc/config/loongarch/genopts/isa-evolution.in index 50f72d5..836d93a 100644 --- a/gcc/config/loongarch/genopts/isa-evolution.in +++ b/gcc/config/loongarch/genopts/isa-evolution.in @@ -2,4 +2,5 @@ 2 26 div32 1.1 Support div.w[u] and mod.w[u] instructions with inputs not sign-extended. 2 27 lam-bh 1.1 Support am{swap/add}[_db].{b/h} instructions. 2 28 lamcas 1.1 Support amcas[_db].{b/h/w/d} instructions. +2 30 scq 1.1 Support sc.q instruction. 3 23 ld-seq-sa 1.1 Do not need load-load barriers (dbar 0x700). 
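The new "scq" evolution bit above (CPUCFG word 2, bit 30) advertises the LoongArch sc.q instruction, which the sync.md changes later in this patch pair with ll.d to perform a 128-bit conditional store and expand 16-byte atomics inline. A minimal C sketch of the code this targets, assuming a loongarch64 compiler built with this patch and -mscq in effect; the function name is illustrative only:

#include <stdatomic.h>
#include <stdbool.h>

/* With ISA_HAS_SCQ this compare-exchange is expected to expand to the
   inline ll.d/ld.d/sc.q loop (atomic_compare_and_swapti_scq below)
   rather than a call into libatomic; without -mscq the library call
   remains.  */
bool
cas16 (_Atomic unsigned __int128 *p, unsigned __int128 *expected,
       unsigned __int128 desired)
{
  return atomic_compare_exchange_strong (p, expected, desired);
}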
diff --git a/gcc/config/loongarch/loongarch-def.cc b/gcc/config/loongarch/loongarch-def.cc index 04b277e..dcd8d90 100644 --- a/gcc/config/loongarch/loongarch-def.cc +++ b/gcc/config/loongarch/loongarch-def.cc @@ -72,7 +72,7 @@ array_arch<loongarch_isa> loongarch_cpu_default_isa = .simd_ (ISA_EXT_SIMD_LASX) .evolution_ (OPTION_MASK_ISA_DIV32 | OPTION_MASK_ISA_LD_SEQ_SA | OPTION_MASK_ISA_LAM_BH | OPTION_MASK_ISA_LAMCAS - | OPTION_MASK_ISA_FRECIPE)) + | OPTION_MASK_ISA_FRECIPE | OPTION_MASK_ISA_SCQ)) .set (ARCH_LA64V1_0, loongarch_isa () .base_ (ISA_BASE_LA64) @@ -86,7 +86,7 @@ array_arch<loongarch_isa> loongarch_cpu_default_isa = .simd_ (ISA_EXT_SIMD_LSX) .evolution_ (OPTION_MASK_ISA_DIV32 | OPTION_MASK_ISA_LD_SEQ_SA | OPTION_MASK_ISA_LAM_BH | OPTION_MASK_ISA_LAMCAS - | OPTION_MASK_ISA_FRECIPE)); + | OPTION_MASK_ISA_FRECIPE | OPTION_MASK_ISA_SCQ)); static inline loongarch_cache la464_cache () diff --git a/gcc/config/loongarch/loongarch-def.h b/gcc/config/loongarch/loongarch-def.h index 0bcd2a7..0a7d0c9 100644 --- a/gcc/config/loongarch/loongarch-def.h +++ b/gcc/config/loongarch/loongarch-def.h @@ -78,12 +78,10 @@ extern loongarch_def_array<const char *, N_ISA_EXT_TYPES> /* Base ABI */ -enum { - ABI_BASE_LP64D = 0, - ABI_BASE_LP64F = 1, - ABI_BASE_LP64S = 2, - N_ABI_BASE_TYPES = 3 -}; +#define ABI_BASE_LP64D 0 +#define ABI_BASE_LP64F 1 +#define ABI_BASE_LP64S 2 +#define N_ABI_BASE_TYPES 3 extern loongarch_def_array<const char *, N_ABI_BASE_TYPES> loongarch_abi_base_strings; diff --git a/gcc/config/loongarch/loongarch-evolution.cc b/gcc/config/loongarch/loongarch-evolution.cc index de68624..a92a645 100644 --- a/gcc/config/loongarch/loongarch-evolution.cc +++ b/gcc/config/loongarch/loongarch-evolution.cc @@ -32,6 +32,7 @@ int la_evo_feature_masks[] = { OPTION_MASK_ISA_DIV32, OPTION_MASK_ISA_LAM_BH, OPTION_MASK_ISA_LAMCAS, + OPTION_MASK_ISA_SCQ, OPTION_MASK_ISA_LD_SEQ_SA, }; @@ -40,6 +41,7 @@ const char* la_evo_macro_name[] = { "__loongarch_div32", "__loongarch_lam_bh", "__loongarch_lamcas", + "__loongarch_scq", "__loongarch_ld_seq_sa", }; @@ -48,6 +50,7 @@ int la_evo_version_major[] = { 1, /* DIV32 */ 1, /* LAM_BH */ 1, /* LAMCAS */ + 1, /* SCQ */ 1, /* LD_SEQ_SA */ }; @@ -56,5 +59,6 @@ int la_evo_version_minor[] = { 1, /* DIV32 */ 1, /* LAM_BH */ 1, /* LAMCAS */ + 1, /* SCQ */ 1, /* LD_SEQ_SA */ }; diff --git a/gcc/config/loongarch/loongarch-evolution.h b/gcc/config/loongarch/loongarch-evolution.h index 5f90839..7fb7b0d 100644 --- a/gcc/config/loongarch/loongarch-evolution.h +++ b/gcc/config/loongarch/loongarch-evolution.h @@ -36,6 +36,7 @@ static constexpr struct { { 2, 1u << 26, OPTION_MASK_ISA_DIV32 }, { 2, 1u << 27, OPTION_MASK_ISA_LAM_BH }, { 2, 1u << 28, OPTION_MASK_ISA_LAMCAS }, + { 2, 1u << 30, OPTION_MASK_ISA_SCQ }, { 3, 1u << 23, OPTION_MASK_ISA_LD_SEQ_SA }, }; @@ -58,8 +59,9 @@ enum { EVO_DIV32 = 1, EVO_LAM_BH = 2, EVO_LAMCAS = 3, - EVO_LD_SEQ_SA = 4, - N_EVO_FEATURES = 5 + EVO_SCQ = 4, + EVO_LD_SEQ_SA = 5, + N_EVO_FEATURES = 6 }; /* Condition macros */ @@ -71,6 +73,8 @@ enum { (la_target.isa.evolution & OPTION_MASK_ISA_LAM_BH) #define ISA_HAS_LAMCAS \ (la_target.isa.evolution & OPTION_MASK_ISA_LAMCAS) +#define ISA_HAS_SCQ \ + (la_target.isa.evolution & OPTION_MASK_ISA_SCQ) #define ISA_HAS_LD_SEQ_SA \ (la_target.isa.evolution & OPTION_MASK_ISA_LD_SEQ_SA) diff --git a/gcc/config/loongarch/loongarch-str.h b/gcc/config/loongarch/loongarch-str.h index 1546ea3..583cce8 100644 --- a/gcc/config/loongarch/loongarch-str.h +++ b/gcc/config/loongarch/loongarch-str.h @@ -70,6 +70,7 @@ along 
with GCC; see the file COPYING3. If not see #define OPTSTR_DIV32 "div32" #define OPTSTR_LAM_BH "lam-bh" #define OPTSTR_LAMCAS "lamcas" +#define OPTSTR_SCQ "scq" #define OPTSTR_LD_SEQ_SA "ld-seq-sa" #endif /* LOONGARCH_STR_H */ diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index 493f95e..0935d7b 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -4388,6 +4388,7 @@ loongarch_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, break; } else if (TARGET_RECIP_VEC_DIV + && vectype && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN) { machine_mode mode = TYPE_MODE (vectype); @@ -6221,9 +6222,8 @@ loongarch_print_operand_reloc (FILE *file, rtx op, bool hi64_part, 'Q' Print R_LARCH_RELAX for TLS IE. 'r' Print address 12-31bit relocation associated with OP. 'R' Print address 32-51bit relocation associated with OP. - 'T' Print 'f' for (eq:CC ...), 't' for (ne:CC ...), - 'z' for (eq:?I ...), 'n' for (ne:?I ...). - 't' Like 'T', but with the EQ/NE cases reversed + 'T' Print a comment marker if %G outputs nothing. + 't' Print the register containing the higher 64 bits of a TImode. 'u' Print a LASX register. 'v' Print the insn size suffix b, h, w or d for vector modes V16QI, V8HI, V4SI, V2SI, and w, d for vector modes V4SF, V2DF respectively. @@ -6306,6 +6306,13 @@ loongarch_print_operand (FILE *file, rtx op, int letter) fputs ("dbar\t0x700", file); break; + case 'T': + if (!loongarch_cas_failure_memorder_needs_acquire ( + memmodel_from_int (INTVAL (op))) + && ISA_HAS_LD_SEQ_SA) + fprintf (file, "%s", ASM_COMMENT_START); + break; + case 'h': if (code == HIGH) op = XEXP (op, 0); @@ -6384,14 +6391,6 @@ loongarch_print_operand (FILE *file, rtx op, int letter) false /* lo_reloc */); break; - case 't': - case 'T': - { - int truth = (code == NE) == (letter == 'T'); - fputc ("zfnt"[truth * 2 + FCC_REG_P (REGNO (XEXP (op, 0)))], file); - } - break; - case 'V': if (CONST_VECTOR_P (op)) { @@ -6495,6 +6494,16 @@ loongarch_print_operand (FILE *file, rtx op, int letter) } break; + case 't': + if (GET_MODE (op) != TImode + || (op != CONST0_RTX (TImode) && code != REG)) + { + output_operand_lossage ("invalid use of '%%%c'", letter); + break; + } + op = loongarch_subword (op, 1); + letter = 'z'; + /* fall through */ default: switch (code) { @@ -10786,9 +10795,9 @@ loongarch_expand_vec_cmp (rtx operands[]) to a fixed type. */ static machine_mode -loongarch_promote_function_mode (const_tree type ATTRIBUTE_UNUSED, +loongarch_promote_function_mode (const_tree type, machine_mode mode, - int *punsignedp ATTRIBUTE_UNUSED, + int *punsignedp, const_tree fntype ATTRIBUTE_UNUSED, int for_return ATTRIBUTE_UNUSED) { @@ -11154,6 +11163,46 @@ loongarch_c_mode_for_suffix (char suffix) return VOIDmode; } +/* Implement TARGET_C_BITINT_TYPE_INFO. + Return true if _BitInt(N) is supported and fill its details into *INFO. */ +bool +loongarch_bitint_type_info (int n, struct bitint_info *info) +{ + if (n <= 8) + info->limb_mode = QImode; + else if (n <= 16) + info->limb_mode = HImode; + else if (n <= 32) + info->limb_mode = SImode; + else if (n <= 64) + info->limb_mode = DImode; + else if (n <= 128) + info->limb_mode = TImode; + else + info->limb_mode = DImode; + + info->abi_limb_mode = info->limb_mode; + + if (n > 64) + info->abi_limb_mode = TImode; + + info->big_endian = false; + info->extended = true; + return true; +} + +/* Implement TARGET_COMPUTE_PRESSURE_CLASSES. 
*/ + +static int +loongarch_compute_pressure_classes (reg_class *classes) +{ + int i = 0; + classes[i++] = GENERAL_REGS; + classes[i++] = FP_REGS; + classes[i++] = FCC_REGS; + return i; +} + /* Initialize the GCC target structure. */ #undef TARGET_ASM_ALIGNED_HI_OP #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t" @@ -11428,6 +11477,12 @@ loongarch_c_mode_for_suffix (char suffix) #undef TARGET_C_MODE_FOR_SUFFIX #define TARGET_C_MODE_FOR_SUFFIX loongarch_c_mode_for_suffix +#undef TARGET_C_BITINT_TYPE_INFO +#define TARGET_C_BITINT_TYPE_INFO loongarch_bitint_type_info + +#undef TARGET_COMPUTE_PRESSURE_CLASSES +#define TARGET_COMPUTE_PRESSURE_CLASSES loongarch_compute_pressure_classes + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-loongarch.h" diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h index 5fc8665..e8819bf 100644 --- a/gcc/config/loongarch/loongarch.h +++ b/gcc/config/loongarch/loongarch.h @@ -270,7 +270,9 @@ along with GCC; see the file COPYING3. If not see if (GET_MODE_CLASS (MODE) == MODE_INT \ && GET_MODE_SIZE (MODE) < UNITS_PER_WORD) \ { \ - if ((MODE) == SImode) \ + if ((MODE) == SImode \ + && !(TYPE && TREE_CODE (TYPE) == BITINT_TYPE \ + && TYPE_PRECISION (TYPE) < 32)) \ (UNSIGNEDP) = 0; \ (MODE) = Pmode; \ } diff --git a/gcc/config/loongarch/loongarch.opt b/gcc/config/loongarch/loongarch.opt index 4d85cf5..fbe61c0 100644 --- a/gcc/config/loongarch/loongarch.opt +++ b/gcc/config/loongarch/loongarch.opt @@ -334,6 +334,10 @@ mlamcas Target Mask(ISA_LAMCAS) Var(la_isa_evolution) Support amcas[_db].{b/h/w/d} instructions. +mscq +Target Mask(ISA_SCQ) Var(la_isa_evolution) +Support sc.q instruction. + mld-seq-sa Target Mask(ISA_LD_SEQ_SA) Var(la_isa_evolution) Do not need load-load barriers (dbar 0x700). 
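The loongarch_bitint_type_info hook added in loongarch.cc above turns on the C23 _BitInt extension for LoongArch: widths up to 64 bits use a single GPR-sized limb, wider values are lowered to 64-bit limbs with a TImode-sized ABI unit, and limbs are kept extended. A hedged sketch of what becomes accepted, assuming a loongarch64 compiler with this patch in C23 mode; nothing below is taken from the patch itself:

/* Illustrative only.  200-bit values are lowered by the bitint lowering
   pass into DImode limbs, while anything wider than 64 bits is passed
   and returned in TImode-sized ABI units, per loongarch_bitint_type_info.  */
_BitInt(200)
bitint_add (_BitInt(200) a, _BitInt(200) b)
{
  return a + b;
}

/* Sub-32-bit _BitInt values are no longer forced to promote as signed,
   which appears to be what the PROMOTE_MODE change in loongarch.h
   addresses.  */
unsigned _BitInt(20)
bitint_small (unsigned _BitInt(20) a)
{
  return a + 1uwb;
}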
diff --git a/gcc/config/loongarch/loongarch.opt.urls b/gcc/config/loongarch/loongarch.opt.urls index 5f644f6..606a211 100644 --- a/gcc/config/loongarch/loongarch.opt.urls +++ b/gcc/config/loongarch/loongarch.opt.urls @@ -90,6 +90,9 @@ UrlSuffix(gcc/LoongArch-Options.html#index-mlam-bh) mlamcas UrlSuffix(gcc/LoongArch-Options.html#index-mlamcas) +mscq +UrlSuffix(gcc/LoongArch-Options.html#index-mscq) + mld-seq-sa UrlSuffix(gcc/LoongArch-Options.html#index-mld-seq-sa) diff --git a/gcc/config/loongarch/simd.md b/gcc/config/loongarch/simd.md index dd17cd1..4156b26 100644 --- a/gcc/config/loongarch/simd.md +++ b/gcc/config/loongarch/simd.md @@ -773,7 +773,7 @@ (vec_select:<VEC_HALF> (match_operand:IVEC 2 "register_operand" "f") (match_operand:IVEC 4 "vect_par_cnst_even_or_odd_half"))) - (any_extend:<WVEC> + (any_extend:<WVEC_HALF> (vec_select:<VEC_HALF> (match_operand:IVEC 3 "register_operand" "f") (match_dup 4)))) diff --git a/gcc/config/loongarch/sync.md b/gcc/config/loongarch/sync.md index fd8d732..2ee400e 100644 --- a/gcc/config/loongarch/sync.md +++ b/gcc/config/loongarch/sync.md @@ -21,25 +21,25 @@ (define_c_enum "unspec" [ UNSPEC_COMPARE_AND_SWAP + UNSPEC_COMPARE_AND_SWAP_AMCAS UNSPEC_COMPARE_AND_SWAP_ADD UNSPEC_COMPARE_AND_SWAP_SUB - UNSPEC_COMPARE_AND_SWAP_AND - UNSPEC_COMPARE_AND_SWAP_XOR - UNSPEC_COMPARE_AND_SWAP_OR UNSPEC_COMPARE_AND_SWAP_NAND UNSPEC_SYNC_OLD_OP UNSPEC_SYNC_EXCHANGE UNSPEC_ATOMIC_STORE UNSPEC_ATOMIC_LOAD UNSPEC_MEMORY_BARRIER + + UNSPEC_TI_FETCH_ADD + UNSPEC_TI_FETCH_SUB + UNSPEC_TI_FETCH_AND + UNSPEC_TI_FETCH_XOR + UNSPEC_TI_FETCH_OR + UNSPEC_TI_FETCH_NAND_MASK_INVERTED ]) (define_code_iterator any_atomic [plus ior xor and]) -(define_code_attr atomic_optab - [(plus "add") (ior "or") (xor "xor") (and "and")]) - -;; This attribute gives the format suffix for atomic memory operations. -(define_mode_attr amo [(QI "b") (HI "h") (SI "w") (DI "d")]) ;; <amop> expands to the name of the atomic operand that implements a ;; particular code. @@ -107,7 +107,7 @@ (define_insn "atomic_load<mode>" [(set (match_operand:QHWD 0 "register_operand" "=r") (unspec_volatile:QHWD - [(match_operand:QHWD 1 "memory_operand" "+m") + [(match_operand:QHWD 1 "memory_operand" "m") (match_operand:SI 2 "const_int_operand")] ;; model UNSPEC_ATOMIC_LOAD))] "" @@ -142,9 +142,50 @@ } [(set (attr "length") (const_int 12))]) +(define_insn "atomic_loadti_lsx" + [(set (match_operand:V2DI 0 "register_operand" "=f") + (unspec_volatile:V2DI + [(match_operand:TI 1 "memory_operand" "m") + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPEC_ATOMIC_LOAD))] + "ISA_HAS_LSX && TARGET_64BIT" +{ + enum memmodel model = memmodel_base (INTVAL (operands[2])); + + switch (model) + { + case MEMMODEL_SEQ_CST: + output_asm_insn ("dbar\t0x11", operands); + /* fall through */ + case MEMMODEL_ACQUIRE: + case MEMMODEL_RELAXED: + return "vld\t%w0,%1\\n\\t%G2"; + + default: + gcc_unreachable (); + } +} + [(set (attr "length") (const_int 12))]) + +(define_expand "atomic_loadti" + [(match_operand:TI 0 "register_operand" "=r") + (match_operand:TI 1 "memory_operand" "m") + (match_operand:SI 2 "const_int_operand")] + "ISA_HAS_LSX && TARGET_64BIT" +{ + rtx vr = gen_reg_rtx (V2DImode); + + emit_insn (gen_atomic_loadti_lsx (vr, operands[1], operands[2])); + for (int i = 0; i < 2; i++) + emit_insn ( + gen_lsx_vpickve2gr_d (loongarch_subword (operands[0], i), vr, + GEN_INT (i))); + DONE; +}) + ;; Implement atomic stores with amoswap. Fall back to fences for atomic loads. 
(define_insn "atomic_store<mode>" - [(set (match_operand:QHWD 0 "memory_operand" "+m") + [(set (match_operand:QHWD 0 "memory_operand" "=m") (unspec_volatile:QHWD [(match_operand:QHWD 1 "reg_or_0_operand" "rJ") (match_operand:SI 2 "const_int_operand")] ;; model @@ -175,7 +216,67 @@ } [(set (attr "length") (const_int 12))]) -(define_insn "atomic_<atomic_optab><mode>" +(define_insn "atomic_storeti_lsx" + [(set (match_operand:TI 0 "memory_operand" "=m") + (unspec_volatile:TI + [(match_operand:V2DI 1 "register_operand" "f") + (match_operand:SI 2 "const_int_operand")] ;; model + UNSPEC_ATOMIC_STORE))] + "ISA_HAS_LSX && TARGET_64BIT" +{ + enum memmodel model = memmodel_base (INTVAL (operands[2])); + + switch (model) + { + case MEMMODEL_SEQ_CST: + return "dbar\t0x12\\n\\t" + "vst\t%w1,%0\\n\\t" + "dbar\t0x18"; + case MEMMODEL_RELEASE: + return "dbar\t0x12\\n\\t" + "vst\t%w1,%0"; + case MEMMODEL_RELAXED: + return "vst\t%w1,%0"; + default: + gcc_unreachable (); + } +} + [(set (attr "length") (const_int 12))]) + +(define_insn "atomic_storeti_scq" + [(set (match_operand:TI 0 "memory_operand" "=m") + (unspec_volatile:TI + [(match_operand:TI 1 "register_operand" "r")] + UNSPEC_ATOMIC_STORE)) + (clobber (match_scratch:DI 2 "=&r"))] + "TARGET_64BIT && ISA_HAS_SCQ" + "1:\\n\\tll.d\t$r0,%0\n\tmove\t%2,%1\n\tsc.q\t%2,%t1,%0\n\tbeqz\t%2,1b" + [(set (attr "length") (const_int 16))]) + +(define_expand "atomic_storeti" + [(match_operand:TI 0 "memory_operand" "=m") + (match_operand:TI 1 "reg_or_0_operand" "rJ") + (match_operand:SI 2 "const_int_operand")] + "TARGET_64BIT && (ISA_HAS_LSX || ISA_HAS_SCQ)" +{ + if (!ISA_HAS_LSX) + { + emit_insn (gen_atomic_storeti_scq (operands[0], operands[1])); + DONE; + } + + rtx vr = gen_reg_rtx (V2DImode), op1 = operands[1]; + rtvec v = rtvec_alloc (2); + + for (int i = 0; i < 2; i++) + RTVEC_ELT (v, i) = loongarch_subword (op1, i); + + emit_insn (gen_vec_initv2didi (vr, gen_rtx_PARALLEL (V2DImode, v))); + emit_insn (gen_atomic_storeti_lsx (operands[0], vr, operands[2])); + DONE; +}) + +(define_insn "atomic_<amop><mode>" [(set (match_operand:GPR 0 "memory_operand" "+ZB") (unspec_volatile:GPR [(any_atomic:GPR (match_dup 0) @@ -183,7 +284,7 @@ (match_operand:SI 2 "const_int_operand")] ;; model UNSPEC_SYNC_OLD_OP))] "" - "am<amop>%A2.<amo>\t$zero,%z1,%0" + "am<amop>%A2.<size>\t$zero,%z1,%0" [(set (attr "length") (const_int 4))]) (define_insn "atomic_add<mode>" @@ -194,10 +295,10 @@ (match_operand:SI 2 "const_int_operand")] ;; model UNSPEC_SYNC_OLD_OP))] "ISA_HAS_LAM_BH" - "amadd%A2.<amo>\t$zero,%z1,%0" + "amadd%A2.<size>\t$zero,%z1,%0" [(set (attr "length") (const_int 4))]) -(define_insn "atomic_fetch_<atomic_optab><mode>" +(define_insn "atomic_fetch_<amop><mode>" [(set (match_operand:GPR 0 "register_operand" "=&r") (match_operand:GPR 1 "memory_operand" "+ZB")) (set (match_dup 1) @@ -207,9 +308,52 @@ (match_operand:SI 3 "const_int_operand")] ;; model UNSPEC_SYNC_OLD_OP))] "" - "am<amop>%A3.<amo>\t%0,%z2,%1" + "am<amop>%A3.<size>\t%0,%z2,%1" [(set (attr "length") (const_int 4))]) +(define_insn "atomic_fetch_nand_mask_inverted<mode>" + [(set (match_operand:GPR 0 "register_operand" "=&r") + (match_operand:GPR 1 "memory_operand" "+ZC")) + (set (match_dup 1) + (unspec_volatile:GPR + [(ior:GPR (not (match_dup 1)) + (match_operand:GPR 2 "register_operand" "r"))] + UNSPEC_SYNC_OLD_OP)) + (clobber (match_scratch:GPR 3 "=&r"))] + "" + { + return "1:\\n\\t" + "ll.<d>\\t%0,%1\\n\\t" + "orn\\t%3,%2,%0\\n\\t" + "sc.<d>\\t%3,%1\\n\\t" + "beqz\\t%3,1b"; + } + [(set (attr "length") (const_int 
16))]) + +(define_mode_iterator ALL_SC [GPR (TI "TARGET_64BIT && ISA_HAS_SCQ")]) +(define_mode_attr _scq [(SI "") (DI "") (TI "_scq")]) +(define_expand "atomic_fetch_nand<mode>" + [(match_operand:ALL_SC 0 "register_operand") + (match_operand:ALL_SC 1 "memory_operand") + (match_operand:ALL_SC 2 "reg_or_0_operand") + (match_operand:SI 3 "const_int_operand")] + "" + { + /* ~(atom & mask) = (~mask) | (~atom), so we can hoist + (~mask) out of the ll/sc loop and use the orn instruction in the + ll/sc loop. */ + rtx inverted_mask = gen_reg_rtx (<MODE>mode); + emit_move_insn (inverted_mask, + expand_simple_unop (<MODE>mode, NOT, operands[2], + NULL_RTX, false)); + + emit_insn ( + gen_atomic_fetch_nand_mask_inverted<mode><_scq> (operands[0], + operands[1], + inverted_mask)); + DONE; + }) + (define_insn "atomic_exchange<mode>" [(set (match_operand:GPR 0 "register_operand" "=&r") (unspec_volatile:GPR @@ -219,9 +363,44 @@ (set (match_dup 1) (match_operand:GPR 2 "register_operand" "r"))] "" - "amswap%A3.<amo>\t%0,%z2,%1" + "amswap%A3.<size>\t%0,%z2,%1" [(set (attr "length") (const_int 4))]) +(define_insn "atomic_exchangeti_scq" + [(set (match_operand:TI 0 "register_operand" "=&r") + (unspec_volatile:TI + [(match_operand:TI 1 "memory_operand" "+ZB")] + UNSPEC_SYNC_EXCHANGE)) + (set (match_dup 1) + (match_operand:TI 2 "register_operand" "rJ")) + (clobber (match_scratch:DI 3 "=&r"))] + "TARGET_64BIT && ISA_HAS_SCQ" +{ + output_asm_insn ("1:", operands); + output_asm_insn ("ll.d\t%0,%1", operands); + if (!ISA_HAS_LD_SEQ_SA) + output_asm_insn ("dbar\t0x700", operands); + output_asm_insn ("ld.d\t%t0,%b1,8", operands); + output_asm_insn ("move\t%3,%z2", operands); + output_asm_insn ("sc.q\t%3,%t2,%1", operands); + output_asm_insn ("beqz\t%3,1b", operands); + + return ""; +} + [(set (attr "length") (const_int 24))]) + +(define_expand "atomic_exchangeti" + [(match_operand:TI 0 "register_operand" "=&r") + (match_operand:TI 1 "memory_operand" "+ZB") + (match_operand:TI 2 "register_operand" "rJ") + (match_operand:SI 3 "const_int_operand")] ;; model + "TARGET_64BIT && ISA_HAS_SCQ" +{ + emit_insn (gen_atomic_exchangeti_scq (operands[0], operands[1], + operands[2])); + DONE; +}) + (define_insn "atomic_exchange<mode>_short" [(set (match_operand:SHORT 0 "register_operand" "=&r") (unspec_volatile:SHORT @@ -231,7 +410,7 @@ (set (match_dup 1) (match_operand:SHORT 2 "register_operand" "r"))] "ISA_HAS_LAM_BH" - "amswap%A3.<amo>\t%0,%z2,%1" + "amswap%A3.<size>\t%0,%z2,%1" [(set (attr "length") (const_int 4))]) (define_insn "atomic_cas_value_strong<mode>" @@ -240,13 +419,13 @@ (set (match_dup 1) (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") (match_operand:GPR 3 "reg_or_0_operand" "rJ") - (match_operand:SI 4 "const_int_operand")] ;; mod_s + (match_operand:SI 4 "const_int_operand")] ;; mod_f UNSPEC_COMPARE_AND_SWAP)) (clobber (match_scratch:GPR 5 "=&r"))] "" { output_asm_insn ("1:", operands); - output_asm_insn ("ll.<amo>\t%0,%1", operands); + output_asm_insn ("ll.<size>\t%0,%1", operands); /* Like the test case atomic-cas-int.C, in loongarch64, O1 and higher, the return value of the val_without_const_folding will not be truncated and @@ -266,9 +445,9 @@ output_asm_insn ("bne\t%0,%z2,2f", operands); output_asm_insn ("or%i3\t%5,$zero,%3", operands); - output_asm_insn ("sc.<amo>\t%5,%1", operands); + output_asm_insn ("sc.<size>\t%5,%1", operands); output_asm_insn ("beqz\t%5,1b", operands); - output_asm_insn ("b\t3f", operands); + output_asm_insn ("%T4b\t3f", operands); output_asm_insn ("2:", operands); 
output_asm_insn ("%G4", operands); output_asm_insn ("3:", operands); @@ -288,10 +467,10 @@ (set (match_dup 1) (unspec_volatile:QHWD [(match_operand:QHWD 2 "reg_or_0_operand" "rJ") (match_operand:QHWD 3 "reg_or_0_operand" "rJ") - (match_operand:SI 4 "const_int_operand")] ;; mod_s - UNSPEC_COMPARE_AND_SWAP))] + (match_operand:SI 4 "const_int_operand")] ;; mod + UNSPEC_COMPARE_AND_SWAP_AMCAS))] "ISA_HAS_LAMCAS" - "ori\t%0,%z2,0\n\tamcas%A4.<amo>\t%0,%z3,%1" + "ori\t%0,%z2,0\n\tamcas%A4.<size>\t%0,%z3,%1" [(set (attr "length") (const_int 8))]) (define_expand "atomic_compare_and_swap<mode>" @@ -318,16 +497,14 @@ && is_mm_release (memmodel_base (INTVAL (mod_s)))) mod_s = GEN_INT (MEMMODEL_ACQ_REL); - operands[6] = mod_s; - if (ISA_HAS_LAMCAS) emit_insn (gen_atomic_cas_value_strong<mode>_amcas (operands[1], operands[2], operands[3], operands[4], - operands[6])); + mod_s)); else emit_insn (gen_atomic_cas_value_strong<mode> (operands[1], operands[2], operands[3], operands[4], - operands[6])); + mod_f)); rtx compare = operands[1]; if (operands[3] != const0_rtx) @@ -349,49 +526,74 @@ DONE; }) -(define_expand "atomic_test_and_set" - [(match_operand:QI 0 "register_operand" "") ;; bool output - (match_operand:QI 1 "memory_operand" "+ZB") ;; memory - (match_operand:SI 2 "const_int_operand" "")] ;; model +(define_expand "atomic_fetch_<amop><mode>" + [(match_operand:SHORT 0 "register_operand" "") ;; output + (any_bitwise (match_operand:SHORT 1 "memory_operand" "+ZB") ;; memory + (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) ;; val + (match_operand:SI 3 "const_int_operand" "")] ;; model "" { - /* We have no QImode atomics, so use the address LSBs to form a mask, - then use an aligned SImode atomic. */ + /* We have no QI/HImode bitwise atomics, so use the address LSBs to form + a mask, then use an aligned SImode atomic. */ rtx result = operands[0]; rtx mem = operands[1]; - rtx model = operands[2]; + rtx model = operands[3]; rtx addr = force_reg (Pmode, XEXP (mem, 0)); - rtx tmp_reg = gen_reg_rtx (Pmode); - rtx zero_reg = gen_rtx_REG (Pmode, 0); - + rtx mask = gen_int_mode (-4, Pmode); rtx aligned_addr = gen_reg_rtx (Pmode); - emit_move_insn (tmp_reg, gen_rtx_PLUS (Pmode, zero_reg, GEN_INT (-4))); - emit_move_insn (aligned_addr, gen_rtx_AND (Pmode, addr, tmp_reg)); + + if (!and_operand (mask, Pmode)) + mask = force_reg (Pmode, mask); + + emit_move_insn (aligned_addr, gen_rtx_AND (Pmode, addr, mask)); rtx aligned_mem = change_address (mem, SImode, aligned_addr); set_mem_alias_set (aligned_mem, 0); - rtx offset = gen_reg_rtx (SImode); - emit_move_insn (offset, gen_rtx_AND (SImode, gen_lowpart (SImode, addr), - GEN_INT (3))); - rtx tmp = gen_reg_rtx (SImode); - emit_move_insn (tmp, GEN_INT (1)); + emit_move_insn (tmp, simplify_gen_unary (ZERO_EXTEND, SImode, + operands[2], <MODE>mode)); + /* Note that we have defined SHIFT_COUNT_TRUNCATED to 1, so we don't need + to mask addr with 0b11 here. 
*/ rtx shmt = gen_reg_rtx (SImode); - emit_move_insn (shmt, gen_rtx_ASHIFT (SImode, offset, GEN_INT (3))); + emit_move_insn (shmt, gen_rtx_ASHIFT (SImode, gen_lowpart (SImode, addr), + GEN_INT (3))); rtx word = gen_reg_rtx (SImode); emit_move_insn (word, gen_rtx_ASHIFT (SImode, tmp, shmt)); + if (<is_and>) + { + /* word = word | ~(mode_mask << shmt) */ + rtx tmp = force_reg (SImode, + gen_int_mode (GET_MODE_MASK (<MODE>mode), + SImode)); + emit_move_insn (tmp, gen_rtx_ASHIFT (SImode, tmp, shmt)); + emit_move_insn (word, gen_rtx_IOR (SImode, gen_rtx_NOT (SImode, tmp), + word)); + } + tmp = gen_reg_rtx (SImode); - emit_insn (gen_atomic_fetch_orsi (tmp, aligned_mem, word, model)); + emit_insn (gen_atomic_fetch_<amop>si (tmp, aligned_mem, word, model)); emit_move_insn (gen_lowpart (SImode, result), gen_rtx_LSHIFTRT (SImode, tmp, shmt)); DONE; }) +(define_expand "atomic_test_and_set" + [(match_operand:QI 0 "register_operand" "") ;; bool output + (match_operand:QI 1 "memory_operand" "+ZB") ;; memory + (match_operand:SI 2 "const_int_operand" "")] ;; model + "" +{ + rtx one = force_reg (QImode, gen_int_mode (1, QImode)); + emit_insn (gen_atomic_fetch_orqi (operands[0], operands[1], one, + operands[2])); + DONE; +}) + (define_insn "atomic_cas_value_cmp_and_7_<mode>" [(set (match_operand:GPR 0 "register_operand" "=&r") (match_operand:GPR 1 "memory_operand" "+ZC")) @@ -400,20 +602,20 @@ (match_operand:GPR 3 "reg_or_0_operand" "rJ") (match_operand:GPR 4 "reg_or_0_operand" "rJ") (match_operand:GPR 5 "reg_or_0_operand" "rJ") - (match_operand:SI 6 "const_int_operand")] ;; model + (match_operand:SI 6 "const_int_operand")] ;; mod_f UNSPEC_COMPARE_AND_SWAP)) (clobber (match_scratch:GPR 7 "=&r"))] "" { return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" + "ll.<size>\\t%0,%1\\n\\t" "and\\t%7,%0,%2\\n\\t" "bne\\t%7,%z4,2f\\n\\t" "and\\t%7,%0,%z3\\n\\t" "or%i5\\t%7,%7,%5\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" + "sc.<size>\\t%7,%1\\n\\t" "beq\\t$zero,%7,1b\\n\\t" - "b\\t3f\\n\\t" + "%T6b\\t3f\\n\\t" "2:\\n\\t" "%G6\\n\\t" "3:\\n\\t"; @@ -444,18 +646,16 @@ && is_mm_release (memmodel_base (INTVAL (mod_s)))) mod_s = GEN_INT (MEMMODEL_ACQ_REL); - operands[6] = mod_s; - if (ISA_HAS_LAMCAS) emit_insn (gen_atomic_cas_value_strong<mode>_amcas (operands[1], operands[2], operands[3], operands[4], - operands[6])); + mod_s)); else { union loongarch_gen_fn_ptrs generator; generator.fn_7 = gen_atomic_cas_value_cmp_and_7_si; loongarch_expand_atomic_qihi (generator, operands[1], operands[2], - operands[3], operands[4], operands[6]); + operands[3], operands[4], mod_f); } rtx compare = operands[1]; @@ -481,83 +681,96 @@ DONE; }) -(define_insn "atomic_cas_value_add_7_<mode>" - [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res - (match_operand:GPR 1 "memory_operand" "+ZC")) +(define_insn "atomic_compare_and_swapti_scq" + [(set (match_operand:TI 0 "register_operand" "=&r") + (match_operand:TI 1 "memory_operand" "+ZB")) (set (match_dup 1) - (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask - (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask - (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val - (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val - (match_operand:SI 6 "const_int_operand")] ;; model - UNSPEC_COMPARE_AND_SWAP_ADD)) - (clobber (match_scratch:GPR 7 "=&r")) - (clobber (match_scratch:GPR 8 "=&r"))] - "" + (unspec_volatile:TI [(match_operand:TI 2 "reg_or_0_operand" "rJ") + (match_operand:TI 3 "reg_or_0_operand" "rJ") + (match_operand:SI 4 "const_int_operand")] ;; mod_f + 
UNSPEC_COMPARE_AND_SWAP)) + (clobber (match_scratch:DI 5 "=&r"))] + "TARGET_64BIT && ISA_HAS_SCQ" { - return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" - "and\\t%7,%0,%3\\n\\t" - "add.w\\t%8,%0,%z5\\n\\t" - "and\\t%8,%8,%z2\\n\\t" - "or%i8\\t%7,%7,%8\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" - "beq\\t$zero,%7,1b"; -} + output_asm_insn ("1:", operands); + output_asm_insn ("ll.d\t%0,%1", operands); - [(set (attr "length") (const_int 28))]) + /* Compare the low word */ + output_asm_insn ("bne\t%0,%z2,2f", operands); -(define_insn "atomic_cas_value_sub_7_<mode>" - [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res - (match_operand:GPR 1 "memory_operand" "+ZC")) - (set (match_dup 1) - (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask - (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask - (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val - (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val - (match_operand:SI 6 "const_int_operand")] ;; model - UNSPEC_COMPARE_AND_SWAP_SUB)) - (clobber (match_scratch:GPR 7 "=&r")) - (clobber (match_scratch:GPR 8 "=&r"))] - "" -{ - return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" - "and\\t%7,%0,%3\\n\\t" - "sub.w\\t%8,%0,%z5\\n\\t" - "and\\t%8,%8,%z2\\n\\t" - "or%i8\\t%7,%7,%8\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" - "beq\\t$zero,%7,1b"; + /* Don't reorder the load of high word before ll.d. As the TImode + must be aligned in the memory, the high and low words must be in + the same cacheline, thus dbar 0x700 is enough. */ + if (!ISA_HAS_LD_SEQ_SA) + output_asm_insn ("dbar\t0x700", operands); + + /* Now load the high word. As the high and low words are in the same + cacheline, in case another core has clobbered the high word before the + sc.q instruction is executed, the LL bit for the low word will be + cleared. Thus a normal load is sufficient. */ + output_asm_insn ("ld.d\t%t0,%b1,8", operands); + + /* Compare the high word. */ + output_asm_insn ("bne\t%t0,%t2,2f", operands); + + /* Copy the low word of the new value as it'll be clobbered by sc.q. */ + output_asm_insn ("move\t%5,%z3", operands); + + /* Store both words if LL bit is still set. */ + output_asm_insn ("sc.q\t%5,%t3,%1", operands); + + /* Check if sc.q has done the store. */ + output_asm_insn ("beqz\t%5,1b", operands); + + /* Jump over the mod_f barrier if sc.q has succeeded. */ + output_asm_insn ("%T4b\t3f", operands); + + /* The barrier for mod_f. 
*/ + output_asm_insn ("2:", operands); + output_asm_insn ("%G4", operands); + + output_asm_insn ("3:", operands); + return ""; } - [(set (attr "length") (const_int 28))]) + [(set_attr "length" "40")]) -(define_insn "atomic_cas_value_and_7_<mode>" - [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res - (match_operand:GPR 1 "memory_operand" "+ZC")) - (set (match_dup 1) - (unspec_volatile:GPR [(match_operand:GPR 2 "reg_or_0_operand" "rJ") ;; mask - (match_operand:GPR 3 "reg_or_0_operand" "rJ") ;; inverted_mask - (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val - (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val - (match_operand:SI 6 "const_int_operand")] ;; model - UNSPEC_COMPARE_AND_SWAP_AND)) - (clobber (match_scratch:GPR 7 "=&r")) - (clobber (match_scratch:GPR 8 "=&r"))] - "" +(define_expand "atomic_compare_and_swapti" + [(match_operand:SI 0 "register_operand" "") ;; bool output + (match_operand:TI 1 "register_operand" "") ;; val output + (match_operand:TI 2 "memory_operand" "") ;; memory + (match_operand:TI 3 "reg_or_0_operand" "") ;; expected value + (match_operand:TI 4 "reg_or_0_operand" "") ;; desired value + (match_operand:SI 5 "const_int_operand" "") ;; is_weak + (match_operand:SI 6 "const_int_operand" "") ;; mod_s + (match_operand:SI 7 "const_int_operand" "")] ;; mod_f + "TARGET_64BIT && ISA_HAS_SCQ" { - return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" - "and\\t%7,%0,%3\\n\\t" - "and\\t%8,%0,%z5\\n\\t" - "and\\t%8,%8,%z2\\n\\t" - "or%i8\\t%7,%7,%8\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" - "beq\\t$zero,%7,1b"; -} - [(set (attr "length") (const_int 28))]) + emit_insn (gen_atomic_compare_and_swapti_scq (operands[1], operands[2], + operands[3], operands[4], + operands[7])); + + rtx t[2]; -(define_insn "atomic_cas_value_xor_7_<mode>" + for (int i = 0; i < 2; i++) + { + rtx compare = loongarch_subword (operands[1], i); + rtx expect = loongarch_subword (operands[3], i); + + t[i] = gen_reg_rtx (DImode); + + if (expect != const0_rtx) + emit_insn (gen_xordi3 (t[i], compare, expect)); + else + emit_move_insn (t[i], compare); + } + + emit_insn (gen_iordi3 (t[0], t[0], t[1])); + emit_insn (gen_rtx_SET (operands[0], + gen_rtx_EQ (SImode, t[0], const0_rtx))); + DONE; +}) + +(define_insn "atomic_cas_value_add_7_<mode>" [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res (match_operand:GPR 1 "memory_operand" "+ZC")) (set (match_dup 1) @@ -566,24 +779,24 @@ (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val (match_operand:SI 6 "const_int_operand")] ;; model - UNSPEC_COMPARE_AND_SWAP_XOR)) + UNSPEC_COMPARE_AND_SWAP_ADD)) (clobber (match_scratch:GPR 7 "=&r")) (clobber (match_scratch:GPR 8 "=&r"))] "" { return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" + "ll.<size>\\t%0,%1\\n\\t" "and\\t%7,%0,%3\\n\\t" - "xor\\t%8,%0,%z5\\n\\t" + "add.w\\t%8,%0,%z5\\n\\t" "and\\t%8,%8,%z2\\n\\t" "or%i8\\t%7,%7,%8\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" + "sc.<size>\\t%7,%1\\n\\t" "beq\\t$zero,%7,1b"; } [(set (attr "length") (const_int 28))]) -(define_insn "atomic_cas_value_or_7_<mode>" +(define_insn "atomic_cas_value_sub_7_<mode>" [(set (match_operand:GPR 0 "register_operand" "=&r") ;; res (match_operand:GPR 1 "memory_operand" "+ZC")) (set (match_dup 1) @@ -592,21 +805,20 @@ (match_operand:GPR 4 "reg_or_0_operand" "rJ") ;; old val (match_operand:GPR 5 "reg_or_0_operand" "rJ") ;; new val (match_operand:SI 6 "const_int_operand")] ;; model - UNSPEC_COMPARE_AND_SWAP_OR)) + UNSPEC_COMPARE_AND_SWAP_SUB)) (clobber (match_scratch:GPR 7 "=&r")) (clobber 
(match_scratch:GPR 8 "=&r"))] "" { return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" + "ll.<size>\\t%0,%1\\n\\t" "and\\t%7,%0,%3\\n\\t" - "or\\t%8,%0,%z5\\n\\t" + "sub.w\\t%8,%0,%z5\\n\\t" "and\\t%8,%8,%z2\\n\\t" "or%i8\\t%7,%7,%8\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" + "sc.<size>\\t%7,%1\\n\\t" "beq\\t$zero,%7,1b"; } - [(set (attr "length") (const_int 28))]) (define_insn "atomic_cas_value_nand_7_<mode>" @@ -624,12 +836,12 @@ "" { return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" + "ll.<size>\\t%0,%1\\n\\t" "and\\t%7,%0,%3\\n\\t" "and\\t%8,%0,%z5\\n\\t" "xor\\t%8,%8,%z2\\n\\t" "or%i8\\t%7,%7,%8\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" + "sc.<size>\\t%7,%1\\n\\t" "beq\\t$zero,%7,1b"; } [(set (attr "length") (const_int 28))]) @@ -648,10 +860,10 @@ "" { return "1:\\n\\t" - "ll.<amo>\\t%0,%1\\n\\t" + "ll.<size>\\t%0,%1\\n\\t" "and\\t%7,%0,%z3\\n\\t" "or%i5\\t%7,%7,%5\\n\\t" - "sc.<amo>\\t%7,%1\\n\\t" + "sc.<size>\\t%7,%1\\n\\t" "beqz\\t%7,1b\\n\\t"; } [(set (attr "length") (const_int 20))]) @@ -678,6 +890,101 @@ DONE; }) +(define_int_iterator UNSPEC_TI_FETCH_DIRECT + [UNSPEC_TI_FETCH_ADD + UNSPEC_TI_FETCH_SUB + UNSPEC_TI_FETCH_AND + UNSPEC_TI_FETCH_XOR + UNSPEC_TI_FETCH_OR]) +(define_int_iterator UNSPEC_TI_FETCH + [UNSPEC_TI_FETCH_DIRECT UNSPEC_TI_FETCH_NAND_MASK_INVERTED]) +(define_int_attr amop_ti_fetch + [(UNSPEC_TI_FETCH_ADD "add") + (UNSPEC_TI_FETCH_SUB "sub") + (UNSPEC_TI_FETCH_AND "and") + (UNSPEC_TI_FETCH_XOR "xor") + (UNSPEC_TI_FETCH_OR "or") + (UNSPEC_TI_FETCH_NAND_MASK_INVERTED "nand_mask_inverted")]) +(define_int_attr size_ti_fetch + [(UNSPEC_TI_FETCH_ADD "36") + (UNSPEC_TI_FETCH_SUB "36") + (UNSPEC_TI_FETCH_AND "28") + (UNSPEC_TI_FETCH_XOR "28") + (UNSPEC_TI_FETCH_OR "28") + (UNSPEC_TI_FETCH_NAND_MASK_INVERTED "28")]) + +(define_insn "atomic_fetch_<amop_ti_fetch>ti_scq" + [(set (match_operand:TI 0 "register_operand" "=&r") + (match_operand:TI 1 "memory_operand" "+ZB")) + (set (match_dup 1) + (unspec_volatile:TI + [(match_dup 0) + (match_operand:TI 2 "reg_or_0_operand" "rJ")] + UNSPEC_TI_FETCH)) + (clobber (match_scratch:DI 3 "=&r")) + (clobber (match_scratch:DI 4 "=&r"))] + "TARGET_64BIT && ISA_HAS_SCQ" +{ + output_asm_insn ("1:", operands); + output_asm_insn ("ll.d\t%0,%1", operands); + if (!ISA_HAS_LD_SEQ_SA) + output_asm_insn ("dbar\t0x700", operands); + output_asm_insn ("ld.d\t%t0,%b1,8", operands); + + switch (<UNSPEC_TI_FETCH>) + { + case UNSPEC_TI_FETCH_AND: + case UNSPEC_TI_FETCH_OR: + case UNSPEC_TI_FETCH_XOR: + output_asm_insn ("<amop_ti_fetch>\t%3,%0,%z2", operands); + output_asm_insn ("<amop_ti_fetch>\t%4,%t0,%t2", operands); + break; + case UNSPEC_TI_FETCH_NAND_MASK_INVERTED: + output_asm_insn ("orn\t%3,%z2,%0", operands); + output_asm_insn ("orn\t%4,%t2,%t0", operands); + break; + case UNSPEC_TI_FETCH_ADD: + case UNSPEC_TI_FETCH_SUB: + output_asm_insn ("<amop_ti_fetch>.d\t%3,%0,%z2", operands); + + /* Generate carry bit. */ + output_asm_insn ( + <UNSPEC_TI_FETCH> == UNSPEC_TI_FETCH_ADD ? 
"sltu\t%4,%3,%0" + : "sltu\t%4,%0,%3", + operands); + + output_asm_insn ("<amop_ti_fetch>.d\t%4,%t0,%4", operands); + output_asm_insn ("<amop_ti_fetch>.d\t%4,%4,%t2", operands); + break; + default: + gcc_unreachable (); + } + + output_asm_insn ("sc.q\t%3,%4,%1", operands); + output_asm_insn ("beqz\t%3,1b", operands); + + return ""; +} + [(set_attr "length" "<size_ti_fetch>")]) + +(define_expand "atomic_fetch_<amop_ti_fetch>ti" + [(set (match_operand:TI 0 "register_operand" "=&r") + (match_operand:TI 1 "memory_operand" "+ZB")) + (set (match_dup 1) + (unspec_volatile:TI + [(match_dup 0) + (match_operand:TI 2 "reg_or_0_operand" "rJ")] + UNSPEC_TI_FETCH_DIRECT)) + (match_operand:SI 3 "const_int_operand")] ;; model + "TARGET_64BIT && ISA_HAS_SCQ" +{ + /* Model is ignored as sc.q implies a full barrier. */ + emit_insn (gen_atomic_fetch_<amop_ti_fetch>ti_scq (operands[0], + operands[1], + operands[2])); + DONE; +}) + (define_insn "atomic_fetch_add<mode>_short" [(set (match_operand:SHORT 0 "register_operand" "=&r") (match_operand:SHORT 1 "memory_operand" "+ZB")) @@ -688,7 +995,7 @@ (match_operand:SI 3 "const_int_operand")] ;; model UNSPEC_SYNC_OLD_OP))] "ISA_HAS_LAM_BH" - "amadd%A3.<amo>\t%0,%z2,%1" + "amadd%A3.<size>\t%0,%z2,%1" [(set (attr "length") (const_int 4))]) (define_expand "atomic_fetch_add<mode>" @@ -724,7 +1031,7 @@ (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) (match_operand:SI 3 "const_int_operand")] ;; model UNSPEC_SYNC_OLD_OP))] - "" + "!ISA_HAS_LAM_BH" { union loongarch_gen_fn_ptrs generator; generator.fn_7 = gen_atomic_cas_value_sub_7_si; @@ -733,60 +1040,6 @@ DONE; }) -(define_expand "atomic_fetch_and<mode>" - [(set (match_operand:SHORT 0 "register_operand" "=&r") - (match_operand:SHORT 1 "memory_operand" "+ZB")) - (set (match_dup 1) - (unspec_volatile:SHORT - [(and:SHORT (match_dup 1) - (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) - (match_operand:SI 3 "const_int_operand")] ;; model - UNSPEC_SYNC_OLD_OP))] - "" -{ - union loongarch_gen_fn_ptrs generator; - generator.fn_7 = gen_atomic_cas_value_and_7_si; - loongarch_expand_atomic_qihi (generator, operands[0], operands[1], - operands[1], operands[2], operands[3]); - DONE; -}) - -(define_expand "atomic_fetch_xor<mode>" - [(set (match_operand:SHORT 0 "register_operand" "=&r") - (match_operand:SHORT 1 "memory_operand" "+ZB")) - (set (match_dup 1) - (unspec_volatile:SHORT - [(xor:SHORT (match_dup 1) - (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) - (match_operand:SI 3 "const_int_operand")] ;; model - UNSPEC_SYNC_OLD_OP))] - "" -{ - union loongarch_gen_fn_ptrs generator; - generator.fn_7 = gen_atomic_cas_value_xor_7_si; - loongarch_expand_atomic_qihi (generator, operands[0], operands[1], - operands[1], operands[2], operands[3]); - DONE; -}) - -(define_expand "atomic_fetch_or<mode>" - [(set (match_operand:SHORT 0 "register_operand" "=&r") - (match_operand:SHORT 1 "memory_operand" "+ZB")) - (set (match_dup 1) - (unspec_volatile:SHORT - [(ior:SHORT (match_dup 1) - (match_operand:SHORT 2 "reg_or_0_operand" "rJ")) - (match_operand:SI 3 "const_int_operand")] ;; model - UNSPEC_SYNC_OLD_OP))] - "" -{ - union loongarch_gen_fn_ptrs generator; - generator.fn_7 = gen_atomic_cas_value_or_7_si; - loongarch_expand_atomic_qihi (generator, operands[0], operands[1], - operands[1], operands[2], operands[3]); - DONE; -}) - (define_expand "atomic_fetch_nand<mode>" [(set (match_operand:SHORT 0 "register_operand" "=&r") (match_operand:SHORT 1 "memory_operand" "+ZB")) diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h index 
e224ade..494f14c 100644 --- a/gcc/config/mips/mips.h +++ b/gcc/config/mips/mips.h @@ -2363,8 +2363,14 @@ enum reg_class #define STACK_GROWS_DOWNWARD 1 -#define FRAME_GROWS_DOWNWARD (flag_stack_protect != 0 \ - || (flag_sanitize & SANITIZE_ADDRESS) != 0) +/* Growing the frame downwards allows us to put spills closest to + the stack pointer which is good as they are likely to be accessed + frequently. We can also arrange for normal stack usage to place + scalars last so that they too are close to the stack pointer. */ +#define FRAME_GROWS_DOWNWARD ((TARGET_MIPS16 \ + && TARGET_FRAME_GROWS_DOWNWARDS) \ + || (flag_stack_protect != 0 \ + || (flag_sanitize & SANITIZE_ADDRESS) != 0)) /* Size of the area allocated in the frame to save the GP. */ diff --git a/gcc/config/mips/mips.opt b/gcc/config/mips/mips.opt index e245654..f07db5a 100644 --- a/gcc/config/mips/mips.opt +++ b/gcc/config/mips/mips.opt @@ -473,6 +473,10 @@ mframe-header-opt Target Var(flag_frame_header_optimization) Optimization Optimize frame header. +mgrow-frame-downwards +Target Var(TARGET_FRAME_GROWS_DOWNWARDS) Init(1) Undocumented +Change the behaviour to grow the frame downwards. + noasmopt Driver diff --git a/gcc/config/pru/pru.cc b/gcc/config/pru/pru.cc index 322e319..3fdc56e 100644 --- a/gcc/config/pru/pru.cc +++ b/gcc/config/pru/pru.cc @@ -941,10 +941,19 @@ pru_init_libfuncs (void) /* Long long. */ set_optab_libfunc (ashr_optab, DImode, "__pruabi_asrll"); - set_optab_libfunc (smul_optab, DImode, "__pruabi_mpyll"); set_optab_libfunc (ashl_optab, DImode, "__pruabi_lslll"); set_optab_libfunc (lshr_optab, DImode, "__pruabi_lsrll"); + if (TARGET_OPT_MUL) + { + set_optab_libfunc (smul_optab, DImode, "__pruabi_mpyll"); + } + else + { + set_optab_libfunc (smul_optab, DImode, "__pruabi_softmpyll"); + set_optab_libfunc (smul_optab, SImode, "__pruabi_softmpyi"); + } + set_optab_libfunc (sdiv_optab, SImode, "__pruabi_divi"); set_optab_libfunc (udiv_optab, SImode, "__pruabi_divu"); set_optab_libfunc (smod_optab, SImode, "__pruabi_remi"); diff --git a/gcc/config/pru/pru.h b/gcc/config/pru/pru.h index 6c0719b..9d547ed 100644 --- a/gcc/config/pru/pru.h +++ b/gcc/config/pru/pru.h @@ -65,6 +65,9 @@ #undef ENDFILE_SPEC #define ENDFILE_SPEC "%{!mabi=ti:-lgloss} " +#undef MULTILIB_DEFAULTS +#define MULTILIB_DEFAULTS { "mloop", "mmul", "mfillzero" } + /* TI ABI mandates that ELF symbols do not start with any prefix. */ #undef USER_LABEL_PREFIX #define USER_LABEL_PREFIX "" diff --git a/gcc/config/pru/pru.md b/gcc/config/pru/pru.md index 3504e42..b8ef55b 100644 --- a/gcc/config/pru/pru.md +++ b/gcc/config/pru/pru.md @@ -215,7 +215,7 @@ mov\\t%0, %1 ldi\\t%0, %%pmem(%1) ldi\\t%0, %1 - fill\\t%0, 4 + * return TARGET_OPT_FILLZERO ? \"fill\\t%0, 4\" : \"ldi32\\t%0, 0xffffffff\"; ldi32\\t%0, %1" [(set_attr "type" "st,ld,alu,alu,alu,alu,alu") (set_attr "length" "4,4,4,4,4,4,8")]) @@ -259,9 +259,11 @@ case 1: return "lb%B1o\\t%b0, %1, %S1"; case 2: - return "zero\\t%F0, 8"; + return TARGET_OPT_FILLZERO ? "zero\\t%F0, 8" + : "ldi\\t%F0, 0\;ldi\\t%N0, 0"; case 3: - return "fill\\t%F0, 8"; + return TARGET_OPT_FILLZERO ? "fill\\t%F0, 8" + : "ldi32\\t%F0, 0xffffffff\;mov\\t%N0, %F0"; case 4: /* careful with overlapping source and destination regs. 
*/ gcc_assert (GP_REG_P (REGNO (operands[0]))); @@ -502,7 +504,7 @@ (define_insn "zero_extendqidi2" [(set (match_operand:DI 0 "register_operand" "=r,r") (zero_extend:DI (match_operand:QI 1 "register_operand" "0,r")))] - "" + "TARGET_OPT_FILLZERO" "@ zero\\t%F0.b1, 7 mov\\t%F0.b0, %1\;zero\\t%F0.b1, 7" @@ -512,7 +514,7 @@ (define_insn "zero_extendhidi2" [(set (match_operand:DI 0 "register_operand" "=r,r") (zero_extend:DI (match_operand:HI 1 "register_operand" "0,r")))] - "" + "TARGET_OPT_FILLZERO" "@ zero\\t%F0.b2, 6 mov\\t%F0.w0, %1\;zero\\t%F0.b2, 6" @@ -522,7 +524,7 @@ (define_insn "zero_extendsidi2" [(set (match_operand:DI 0 "register_operand" "=r,r") (zero_extend:DI (match_operand:SI 1 "register_operand" "0,r")))] - "" + "TARGET_OPT_FILLZERO" "@ zero\\t%N0, 4 mov\\t%F0, %1\;zero\\t%N0, 4" @@ -535,7 +537,7 @@ (define_expand "extend<EQS0:mode><EQDHIDI:mode>2" [(set (match_operand:EQDHIDI 0 "register_operand" "=r") (sign_extend:EQDHIDI (match_operand:EQS0 1 "register_operand" "r")))] - "" + "TARGET_OPT_FILLZERO" { rtx_code_label *skip_hiset_label; @@ -744,7 +746,7 @@ (ior:HIDI (match_operand:HIDI 1 "register_operand" "0") (match_operand:HIDI 2 "const_fillbytes_operand" "Uf")))] - "" + "TARGET_OPT_FILLZERO" { static char line[64]; pru_byterange r; @@ -767,7 +769,7 @@ (and:HIDI (match_operand:HIDI 1 "register_operand" "0") (match_operand:HIDI 2 "const_zerobytes_operand" "Uz")))] - "" + "TARGET_OPT_FILLZERO" { static char line[64]; pru_byterange r; @@ -1114,7 +1116,8 @@ /* Try with the more efficient zero/fill patterns first. */ if (<LOGICAL_BITOP:CODE> == IOR && CONST_INT_P (operands[2]) - && const_fillbytes_operand (operands[2], DImode)) + && const_fillbytes_operand (operands[2], DImode) + && TARGET_OPT_FILLZERO) { rtx insn = maybe_gen_pru_ior_fillbytes (DImode, operands[0], @@ -1130,7 +1133,8 @@ } if (<LOGICAL_BITOP:CODE> == AND && CONST_INT_P (operands[2]) - && const_zerobytes_operand (operands[2], DImode)) + && const_zerobytes_operand (operands[2], DImode) + && TARGET_OPT_FILLZERO) { rtx insn = maybe_gen_pru_and_zerobytes (DImode, operands[0], @@ -1212,7 +1216,7 @@ [(set (match_operand:SI 0 "pru_muldst_operand" "=Rmd0") (mult:SI (match_operand:SI 1 "pru_mulsrc0_operand" "%Rms0") (match_operand:SI 2 "pru_mulsrc1_operand" "Rms1")))] - "" + "TARGET_OPT_MUL" "nop\;xin\\t0, %0, 4" [(set_attr "type" "alu") (set_attr "length" "8")]) diff --git a/gcc/config/pru/pru.opt b/gcc/config/pru/pru.opt index 8385beb..5206b2a 100644 --- a/gcc/config/pru/pru.opt +++ b/gcc/config/pru/pru.opt @@ -39,6 +39,14 @@ mloop Target Mask(OPT_LOOP) Allow (or do not allow) gcc to use the LOOP instruction. +mmul +Target Mask(OPT_MUL) +Allow (or do not allow) gcc to use the PRU multiplier unit. + +mfillzero +Target Mask(OPT_FILLZERO) +Allow (or do not allow) gcc to use the FILL and ZERO instructions. + mabi= Target RejectNegative Joined Enum(pru_abi_t) Var(pru_current_abi) Init(PRU_ABI_GNU) Save Select target ABI variant. 
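As an illustrative aside (not part of the patch): the new -mmul and -mfillzero options gate the PRU multiplier unit and the FILL/ZERO instructions, and pru_init_libfuncs above now registers soft-multiply libcalls when multiplication is disabled. A minimal sketch of the expected effect on user code, assuming the __pruabi_softmpyll routine is supplied by the runtime libraries:

/* With -mno-mul the 64-bit multiply below is expected to expand to a call
   to the __pruabi_softmpyll libfunc registered above, instead of using the
   multiplier unit via xin; with -mmul it continues to use the hardware
   multiplier.  */
unsigned long long
mul64 (unsigned long long a, unsigned long long b)
{
  return a * b;
}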
diff --git a/gcc/config/pru/pru.opt.urls b/gcc/config/pru/pru.opt.urls index c87affb..5c57892 100644 --- a/gcc/config/pru/pru.opt.urls +++ b/gcc/config/pru/pru.opt.urls @@ -12,6 +12,12 @@ UrlSuffix(gcc/PRU-Options.html#index-mno-relax-1) mloop UrlSuffix(gcc/PRU-Options.html#index-mloop) +mmul +UrlSuffix(gcc/PRU-Options.html#index-mmul) + +mfillzero +UrlSuffix(gcc/PRU-Options.html#index-mfillzero) + mabi= UrlSuffix(gcc/PRU-Options.html#index-mabi-4) diff --git a/gcc/config/pru/t-multilib b/gcc/config/pru/t-multilib new file mode 100644 index 0000000..1e3c2b8 --- /dev/null +++ b/gcc/config/pru/t-multilib @@ -0,0 +1,29 @@ +# Copyright (C) 2025 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 3, or (at your option) any later +# version. +# +# GCC is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# <http://www.gnu.org/licenses/>. + +MULTILIB_OPTIONS = +MULTILIB_OPTIONS += mloop/mno-loop +MULTILIB_OPTIONS += mmul/mno-mul +MULTILIB_OPTIONS += mfillzero/mno-fillzero + +# Build two variants: +# - Newer PRU core versions, present in AM335x and later. +# - Older PRU core versions, present in AM18xx. +MULTILIB_REQUIRED = +MULTILIB_REQUIRED += mloop/mmul/mfillzero +MULTILIB_REQUIRED += mno-loop/mno-mul/mno-fillzero diff --git a/gcc/config/riscv/arch-canonicalize b/gcc/config/riscv/arch-canonicalize index 5d24f5ed..15a3985 100755 --- a/gcc/config/riscv/arch-canonicalize +++ b/gcc/config/riscv/arch-canonicalize @@ -163,7 +163,19 @@ def parse_dep_exts(dep_exts_str): ext_name = match.group(1) condition_code = match.group(2) deps.append({'ext': ext_name, 'type': 'conditional', 'condition': condition_code}) - conditional_matches.append((match.start(), match.end())) + # The conditional_pattern RE matches only the first code block enclosed + # in braces. + # + # Extend the match to the condition block's closing brace, encompassing + # all code blocks, by simply trying to match the numbers of opening + # and closing braces. While crude, this avoids writing a complicated + # parse here. 
+ closing_braces_left = condition_code.count('{') - condition_code.count('}') + condition_end = match.end() + while closing_braces_left > 0: + condition_end = dep_exts_str.find('}', condition_end) + closing_braces_left -= 1 + conditional_matches.append((match.start(), condition_end)) # Remove conditional dependency blocks from the string remaining_str = dep_exts_str @@ -534,6 +546,11 @@ def run_unit_tests(): assert extensions[0]['name'] == 'test' assert len(extensions[0]['dep_exts']) == 2 + def test_parse_long_condition_block(): + """Test condition block containing several code blocks.""" + result = arch_canonicalize("rv32ec", "20191213") + assert "rv32ec_zca" in result + # Collect test functions test_functions = [ test_basic_arch_parsing, @@ -542,7 +559,8 @@ def run_unit_tests(): test_conditional_dependencies, test_parse_dep_exts, test_evaluate_conditional_dependency, - test_parse_define_riscv_ext + test_parse_define_riscv_ext, + test_parse_long_condition_block ] # Run tests manually first, then optionally with pytest diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md index 6531996..9695fdc 100644 --- a/gcc/config/riscv/autovec-opt.md +++ b/gcc/config/riscv/autovec-opt.md @@ -1679,6 +1679,26 @@ ;; Combine vec_duplicate + op.vv to op.vx ;; Include ;; - vadd.vx +;; - vsub.vx +;; - vrsub.vx +;; - vand.vx +;; - vor.vx +;; - vmul.vx +;; - vdiv.vx +;; - vdivu.vx +;; - vrem.vx +;; - vremu.vx +;; - vmax.vx +;; - vmaxu.vx +;; - vmin.vx +;; - vminu.vx +;; - vsadd.vx +;; - vsaddu.vx +;; - vssub.vx +;; - vssubu.vx +;; - vaadd.vx +;; - vaaddu.vx +;; - vmerge.vxm ;; ============================================================================= (define_insn_and_split "*<optab>_vx_<mode>" [(set (match_operand:V_VLSI 0 "register_operand") @@ -1694,6 +1714,8 @@ riscv_vector::expand_vx_binary_vec_dup_vec (operands[0], operands[2], operands[1], <CODE>, <MODE>mode); + + DONE; } [(set_attr "type" "vialu")]) @@ -1711,6 +1733,8 @@ riscv_vector::expand_vx_binary_vec_vec_dup (operands[0], operands[1], operands[2], <CODE>, <MODE>mode); + + DONE; } [(set_attr "type" "vialu")]) @@ -1782,6 +1806,69 @@ } [(set_attr "type" "vaalu")]) +(define_insn_and_split "*merge_vx_<mode>" + [(set (match_operand:V_VLSI 0 "register_operand") + (if_then_else:V_VLSI + (match_operand:<VM> 3 "vector_mask_operand") + (vec_duplicate:V_VLSI + (match_operand:<VEL> 2 "reg_or_int_operand")) + (match_operand:V_VLSI 1 "register_operand")))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + insn_code icode = code_for_pred_merge_scalar (<MODE>mode); + riscv_vector::emit_vlmax_insn (icode, riscv_vector::MERGE_OP, operands); + DONE; + } + [(set_attr "type" "vimerge")]) + +(define_insn_and_split "*vmacc_vx_<mode>" + [(set (match_operand:V_VLSI 0 "register_operand") + (plus:V_VLSI + (mult:V_VLSI + (vec_duplicate:V_VLSI + (match_operand:<VEL> 1 "register_operand")) + (match_operand:V_VLSI 2 "register_operand")) + (match_operand:V_VLSI 3 "register_operand")))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + insn_code icode = code_for_pred_mul_plus_vx (<MODE>mode); + rtx ops[] = {operands[0], operands[1], operands[2], operands[3], + RVV_VUNDEF(<MODE>mode)}; + riscv_vector::emit_vlmax_insn (icode, riscv_vector::TERNARY_OP, ops); + + DONE; + } + [(set_attr "type" "vimuladd")]) + +(define_insn_and_split "*vnmsac_vx_<mode>" + [(set (match_operand:V_VLSI 0 "register_operand") + (minus:V_VLSI + (match_operand:V_VLSI 3 "register_operand") + (mult:V_VLSI + 
(vec_duplicate:V_VLSI + (match_operand:<VEL> 1 "register_operand")) + (match_operand:V_VLSI 2 "register_operand"))))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + insn_code icode = code_for_pred_vnmsac_vx (<MODE>mode); + rtx ops[] = {operands[0], operands[1], operands[2], operands[3], + RVV_VUNDEF(<MODE>mode)}; + riscv_vector::emit_vlmax_insn (icode, riscv_vector::TERNARY_OP, ops); + + DONE; + } + [(set_attr "type" "vimuladd")]) + + ;; ============================================================================= ;; Combine vec_duplicate + op.vv to op.vf ;; Include @@ -1962,3 +2049,98 @@ } [(set_attr "type" "vfwmuladd")] ) + +;; vfmul.vf +(define_insn_and_split "*vfmul_vf_<mode>" + [(set (match_operand:V_VLSF 0 "register_operand") + (mult:V_VLSF + (vec_duplicate:V_VLSF + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSF 1 "register_operand")))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + riscv_vector::emit_vlmax_insn (code_for_pred_scalar (MULT, <MODE>mode), + riscv_vector::BINARY_OP_FRM_DYN, operands); + DONE; + } + [(set_attr "type" "vfmul")] +) + +;; vfrdiv.vf +(define_insn_and_split "*vfrdiv_vf_<mode>" + [(set (match_operand:V_VLSF 0 "register_operand") + (div:V_VLSF + (vec_duplicate:V_VLSF + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSF 1 "register_operand")))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + riscv_vector::emit_vlmax_insn (code_for_pred_reverse_scalar (DIV, <MODE>mode), + riscv_vector::BINARY_OP_FRM_DYN, operands); + DONE; + } + [(set_attr "type" "vfdiv")] +) + +;; vfmin.vf +(define_insn_and_split "*vfmin_vf_<mode>" + [(set (match_operand:V_VLSF 0 "register_operand") + (smin:V_VLSF + (vec_duplicate:V_VLSF + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSF 1 "register_operand")))] + "TARGET_VECTOR && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + riscv_vector::emit_vlmax_insn (code_for_pred_scalar (SMIN, <MODE>mode), + riscv_vector::BINARY_OP, operands); + DONE; + } + [(set_attr "type" "vfminmax")] +) + +(define_insn_and_split "*vfmin_vf_ieee_<mode>" + [(set (match_operand:V_VLSF 0 "register_operand") + (unspec:V_VLSF [ + (vec_duplicate:V_VLSF + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSF 1 "register_operand") + ] UNSPEC_VFMIN))] + "TARGET_VECTOR && !HONOR_SNANS (<MODE>mode) && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + riscv_vector::emit_vlmax_insn (code_for_pred_scalar (UNSPEC_VFMIN, <MODE>mode), + riscv_vector::BINARY_OP, operands); + DONE; + } + [(set_attr "type" "vfminmax")] +) + +(define_insn_and_split "*vfmin_vf_ieee_<mode>" + [(set (match_operand:V_VLSF 0 "register_operand") + (unspec:V_VLSF [ + (match_operand:V_VLSF 1 "register_operand") + (vec_duplicate:V_VLSF + (match_operand:<VEL> 2 "register_operand")) + ] UNSPEC_VFMIN))] + "TARGET_VECTOR && !HONOR_SNANS (<MODE>mode) && can_create_pseudo_p ()" + "#" + "&& 1" + [(const_int 0)] + { + riscv_vector::emit_vlmax_insn (code_for_pred_scalar (UNSPEC_VFMIN, <MODE>mode), + riscv_vector::BINARY_OP, operands); + DONE; + } + [(set_attr "type" "vfminmax")] +) diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md index 5ecaa19..979e0df 100644 --- a/gcc/config/riscv/constraints.md +++ b/gcc/config/riscv/constraints.md @@ -330,3 +330,7 @@ (define_constraint "Q" "An address operand that is valid for a prefetch instruction" (match_operand 0 "prefetch_operand")) + 
+(define_address_constraint "ZD" + "An address operand that is valid for a mips prefetch instruction" + (match_test "riscv_prefetch_offset_address_p (op, mode)")) diff --git a/gcc/config/riscv/gen-riscv-ext-opt.cc b/gcc/config/riscv/gen-riscv-ext-opt.cc index 17b8f5b..1ca339c 100644 --- a/gcc/config/riscv/gen-riscv-ext-opt.cc +++ b/gcc/config/riscv/gen-riscv-ext-opt.cc @@ -4,50 +4,6 @@ #include <stdio.h> #include "riscv-opts.h" -struct version_t -{ - int major; - int minor; - version_t (int major, int minor, - enum riscv_isa_spec_class spec = ISA_SPEC_CLASS_NONE) - : major (major), minor (minor) - {} - bool operator<(const version_t &other) const - { - if (major != other.major) - return major < other.major; - return minor < other.minor; - } - - bool operator== (const version_t &other) const - { - return major == other.major && minor == other.minor; - } -}; - -static void -print_ext_doc_entry (const std::string &ext_name, const std::string &full_name, - const std::string &desc, - const std::vector<version_t> &supported_versions) -{ - // Implementation of the function to print the documentation entry - // for the extension. - std::set<version_t> unique_versions; - for (const auto &version : supported_versions) - unique_versions.insert (version); - printf ("@item %s\n", ext_name.c_str ()); - printf ("@tab"); - for (const auto &version : unique_versions) - { - printf (" %d.%d", version.major, version.minor); - } - printf ("\n"); - printf ("@tab %s", full_name.c_str ()); - if (desc.size ()) - printf (", %s", desc.c_str ()); - printf ("\n\n"); -} - int main () { diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md index 381f96c..bdb3d22 100644 --- a/gcc/config/riscv/predicates.md +++ b/gcc/config/riscv/predicates.md @@ -27,10 +27,14 @@ (ior (match_operand 0 "const_arith_operand") (match_operand 0 "register_operand"))) +(define_predicate "prefetch_const_operand" + (and (match_code "const_int") + (match_test "(IN_RANGE (INTVAL (op), 0, 511))"))) + ;; REG or REG+D where D fits in a simm12 and has the low 5 bits ;; off. The REG+D form can be reloaded into a temporary if needed ;; after FP elimination if that exposes an invalid offset. 
-(define_predicate "prefetch_operand" +(define_predicate "zicbop_prefetch_operand" (ior (match_operand 0 "register_operand") (and (match_test "const_arith_operand (op, VOIDmode)") (match_test "(INTVAL (op) & 0x1f) == 0")) @@ -39,6 +43,20 @@ (match_test "const_arith_operand (XEXP (op, 1), VOIDmode)") (match_test "(INTVAL (XEXP (op, 1)) & 0x1f) == 0")))) +;; REG or REG+D where D fits in a uimm9 +(define_predicate "mips_prefetch_operand" + (ior (match_operand 0 "register_operand") + (match_test "prefetch_const_operand (op, VOIDmode)") + (and (match_code "plus") + (match_test "register_operand (XEXP (op, 0), word_mode)") + (match_test "prefetch_const_operand (XEXP (op, 1), VOIDmode)")))) + +;; MIPS specific or Standard RISCV Extension +(define_predicate "prefetch_operand" + (if_then_else (match_test "TARGET_XMIPSCBOP") + (match_operand 0 "mips_prefetch_operand") + (match_operand 0 "zicbop_prefetch_operand"))) + (define_predicate "lui_operand" (and (match_code "const_int") (match_test "LUI_OPERAND (INTVAL (op))"))) diff --git a/gcc/config/riscv/riscv-avlprop.cc b/gcc/config/riscv/riscv-avlprop.cc index 3031c29..b8547a7 100644 --- a/gcc/config/riscv/riscv-avlprop.cc +++ b/gcc/config/riscv/riscv-avlprop.cc @@ -156,6 +156,7 @@ get_insn_vtype_mode (rtx_insn *rinsn) extract_insn_cached (rinsn); int mode_idx = get_attr_mode_idx (rinsn); gcc_assert (mode_idx != INVALID_ATTRIBUTE); + gcc_assert (mode_idx < recog_data.n_operands); return GET_MODE (recog_data.operand[mode_idx]); } @@ -205,6 +206,7 @@ simplify_replace_vlmax_avl (rtx_insn *rinsn, rtx new_avl) { int index = get_attr_avl_type_idx (rinsn); gcc_assert (index != INVALID_ATTRIBUTE); + gcc_assert (index < recog_data.n_operands); validate_change_or_fail (rinsn, recog_data.operand_loc[index], get_avl_type_rtx (avl_type::NONVLMAX), false); } @@ -361,6 +363,8 @@ pass_avlprop::get_vlmax_ta_preferred_avl (insn_info *insn) const is not depend on. */ extract_insn_cached (use_insn->rtl ()); int merge_op_idx = get_attr_merge_op_idx (use_insn->rtl ()); + gcc_assert (merge_op_idx == INVALID_ATTRIBUTE + || merge_op_idx < recog_data.n_operands); if (merge_op_idx != INVALID_ATTRIBUTE && !satisfies_constraint_vu (recog_data.operand[merge_op_idx]) && refers_to_regno_p (set->regno (), @@ -531,7 +535,14 @@ pass_avlprop::execute (function *fn) && !m_avl_propagations->get (candidate.second) && imm_avl_p (vtype_mode)) { - rtx new_avl = gen_int_mode (GET_MODE_NUNITS (vtype_mode), Pmode); + /* For segmented operations AVL refers to a single register and + not all NF registers. Therefore divide the mode size by NF + to obtain the proper AVL. 
*/ + int nf = 1; + if (riscv_v_ext_tuple_mode_p (vtype_mode)) + nf = get_nf (vtype_mode); + rtx new_avl = gen_int_mode + (GET_MODE_NUNITS (vtype_mode).to_constant () / nf, Pmode); simplify_replace_vlmax_avl (rinsn, new_avl); } } diff --git a/gcc/config/riscv/riscv-cores.def b/gcc/config/riscv/riscv-cores.def index 98f3470..8f0f630 100644 --- a/gcc/config/riscv/riscv-cores.def +++ b/gcc/config/riscv/riscv-cores.def @@ -113,7 +113,7 @@ RISCV_CORE("xt-c908v", "rv64imafdcv_zicbom_zicbop_zicboz_zicntr_zicsr_" "zvfh_sstc_svinval_svnapot_svpbmt__xtheadba_" "xtheadbb_xtheadbs_xtheadcmo_xtheadcondmov_" "xtheadfmemidx_xtheadmac_xtheadmemidx_" - "xtheadmempair_xtheadsync_xtheadvdot", + "xtheadmempair_xtheadsync", "xt-c908") RISCV_CORE("xt-c910", "rv64imafdc_zicntr_zicsr_zifencei_zihpm_zfh_" "xtheadba_xtheadbb_xtheadbs_xtheadcmo_" @@ -121,7 +121,7 @@ RISCV_CORE("xt-c910", "rv64imafdc_zicntr_zicsr_zifencei_zihpm_zfh_" "xtheadmemidx_xtheadmempair_xtheadsync", "xt-c910") RISCV_CORE("xt-c910v2", "rv64imafdc_zicbom_zicbop_zicboz_zicntr_zicond_" - "zicsr_zifencei _zihintntl_zihintpause_zihpm_" + "zicsr_zifencei_zihintntl_zihintpause_zihpm_" "zawrs_zfa_zfbfmin_zfh_zca_zcb_zcd_zba_zbb_zbc_" "zbs_sscofpmf_sstc_svinval_svnapot_svpbmt_" "xtheadba_xtheadbb_xtheadbs_xtheadcmo_" @@ -135,13 +135,13 @@ RISCV_CORE("xt-c920", "rv64imafdc_zicntr_zicsr_zifencei_zihpm_zfh_" "xtheadvector", "xt-c910") RISCV_CORE("xt-c920v2", "rv64imafdcv_zicbom_zicbop_zicboz_zicntr_zicond_" - "zicsr_zifencei _zihintntl_zihintpause_zihpm_" + "zicsr_zifencei_zihintntl_zihintpause_zihpm_" "zawrs_zfa_zfbfmin_zfh_zca_zcb_zcd_zba_zbb_zbc_" "zbs_zvfbfmin_zvfbfwma_zvfh_sscofpmf_sstc_" "svinval_svnapot_svpbmt_xtheadba_xtheadbb_" "xtheadbs_xtheadcmo_xtheadcondmov_xtheadfmemidx_" "xtheadmac_xtheadmemidx_xtheadmempair_" - "xtheadsync_xtheadvdot", + "xtheadsync", "xt-c920v2") RISCV_CORE("tt-ascalon-d8", "rv64imafdcv_zic64b_zicbom_zicbop_zicboz_" diff --git a/gcc/config/riscv/riscv-ext-mips.def b/gcc/config/riscv/riscv-ext-mips.def index 5d7836d..132f6c1 100644 --- a/gcc/config/riscv/riscv-ext-mips.def +++ b/gcc/config/riscv/riscv-ext-mips.def @@ -33,3 +33,16 @@ DEFINE_RISCV_EXT ( /* BITMASK_GROUP_ID. */ BITMASK_NOT_YET_ALLOCATED, /* BITMASK_BIT_POSITION. */ BITMASK_NOT_YET_ALLOCATED, /* EXTRA_EXTENSION_FLAGS. */ 0) + +DEFINE_RISCV_EXT ( + /* NAME. */ xmipscbop, + /* UPPERCASE_NAME. */ XMIPSCBOP, + /* FULL_NAME. */ "Mips Prefetch extension", + /* DESC. */ "", + /* URL. */ , + /* DEP_EXTS. */ ({}), + /* SUPPORTED_VERSIONS. */ ({{1, 0}}), + /* FLAG_GROUP. */ xmips, + /* BITMASK_GROUP_ID. */ BITMASK_NOT_YET_ALLOCATED, + /* BITMASK_BIT_POSITION. */ BITMASK_NOT_YET_ALLOCATED, + /* EXTRA_EXTENSION_FLAGS. 
*/ 0) diff --git a/gcc/config/riscv/riscv-ext.opt b/gcc/config/riscv/riscv-ext.opt index 26d6e68..ced05d2 100644 --- a/gcc/config/riscv/riscv-ext.opt +++ b/gcc/config/riscv/riscv-ext.opt @@ -449,3 +449,5 @@ Mask(XTHEADVECTOR) Var(riscv_xthead_subext) Mask(XVENTANACONDOPS) Var(riscv_xventana_subext) Mask(XMIPSCMOV) Var(riscv_xmips_subext) + +Mask(XMIPSCBOP) Var(riscv_xmips_subext) diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 539321f..46b256d 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -143,6 +143,8 @@ extern void riscv_expand_sstrunc (rtx, rtx); extern int riscv_register_move_cost (machine_mode, reg_class_t, reg_class_t); extern bool synthesize_ior_xor (rtx_code, rtx [3]); extern bool synthesize_and (rtx [3]); +extern bool synthesize_add (rtx [3]); +extern bool synthesize_add_extended (rtx [3]); #ifdef RTX_CODE extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0); @@ -830,16 +832,18 @@ extern bool th_print_operand_address (FILE *, machine_mode, rtx); extern bool strided_load_broadcast_p (void); extern bool riscv_use_divmod_expander (void); -void riscv_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); +void riscv_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx, tree, int); extern bool riscv_option_valid_attribute_p (tree, tree, tree, int); extern bool riscv_option_valid_version_attribute_p (tree, tree, tree, int); extern bool -riscv_process_target_version_attr (tree, location_t); +riscv_process_target_version_attr (tree, location_t *); extern void riscv_override_options_internal (struct gcc_options *); extern void riscv_option_override (void); +extern rtx riscv_prefetch_cookie (rtx, rtx); +extern bool riscv_prefetch_offset_address_p (rtx, machine_mode); struct riscv_tune_param; /* Information about one micro-arch we know about. */ diff --git a/gcc/config/riscv/riscv-subset.h b/gcc/config/riscv/riscv-subset.h index a35537d..4cd860f 100644 --- a/gcc/config/riscv/riscv-subset.h +++ b/gcc/config/riscv/riscv-subset.h @@ -52,8 +52,9 @@ private: /* Original arch string. */ const char *m_arch; - /* Location of arch string, used for report error. */ - location_t m_loc; + /* A pointer to the location that should be used for diagnostics, + or null if diagnostics should be suppressed. */ + location_t *m_loc; /* Head of subset info list. */ riscv_subset_t *m_head; @@ -70,7 +71,7 @@ private: /* Allow adding the same extension more than once. 
*/ bool m_allow_adding_dup; - riscv_subset_list (const char *, location_t); + riscv_subset_list (const char *, location_t *); const char *parsing_subset_version (const char *, const char *, unsigned *, unsigned *, bool, bool *); @@ -106,12 +107,12 @@ public: riscv_subset_list *clone () const; - static riscv_subset_list *parse (const char *, location_t); + static riscv_subset_list *parse (const char *, location_t *); const char *parse_single_ext (const char *, bool exact_single_p = true); int match_score (riscv_subset_list *) const; - void set_loc (location_t); + void set_loc (location_t *); void set_allow_adding_dup (bool v) { m_allow_adding_dup = v; } @@ -182,7 +183,7 @@ extern void riscv_set_arch_by_subset_list (riscv_subset_list *, struct gcc_options *); extern bool riscv_minimal_hwprobe_feature_bits (const char *, struct riscv_feature_bits *, - location_t); + location_t *); extern bool riscv_ext_is_subset (struct cl_target_option *, struct cl_target_option *); diff --git a/gcc/config/riscv/riscv-target-attr.cc b/gcc/config/riscv/riscv-target-attr.cc index 8ad3025..5e01c92 100644 --- a/gcc/config/riscv/riscv-target-attr.cc +++ b/gcc/config/riscv/riscv-target-attr.cc @@ -34,7 +34,7 @@ namespace { class riscv_target_attr_parser { public: - riscv_target_attr_parser (location_t loc) + riscv_target_attr_parser (location_t *loc) : m_found_arch_p (false) , m_found_tune_p (false) , m_found_cpu_p (false) @@ -62,7 +62,7 @@ private: bool m_found_cpu_p; bool m_found_priority_p; riscv_subset_list *m_subset_list; - location_t m_loc; + location_t *m_loc; const riscv_cpu_info *m_cpu_info; const char *m_tune; int m_priority; @@ -102,15 +102,17 @@ riscv_target_attr_parser::parse_arch (const char *str) { if (TARGET_64BIT && strncmp ("32", str + 2, strlen ("32")) == 0) { - error_at (m_loc, "unexpected arch for %<target()%> attribute: " - "must start with rv64 but found %qs", str); + if (m_loc) + error_at (*m_loc, "unexpected arch for %<target()%> attribute: " + "must start with rv64 but found %qs", str); goto fail; } if (!TARGET_64BIT && strncmp ("64", str + 2, strlen ("64")) == 0) { - error_at (m_loc, "unexpected arch for %<target()%> attribute: " - "must start with rv32 but found %qs", str); + if (m_loc) + error_at (*m_loc, "unexpected arch for %<target()%> attribute: " + "must start with rv32 but found %qs", str); goto fail; } @@ -140,10 +142,9 @@ riscv_target_attr_parser::parse_arch (const char *str) { if (token[0] != '+') { - error_at ( - m_loc, - "unexpected arch for %<target()%> attribute: must start " - "with + or rv"); + if (*m_loc) + error_at (*m_loc, "unexpected arch for %<target()%> " + "attribute: must start with + or rv"); goto fail; } @@ -151,10 +152,9 @@ riscv_target_attr_parser::parse_arch (const char *str) /* Check parse_single_ext has consume all string. 
*/ if (*result != '\0') { - error_at ( - m_loc, - "unexpected arch for %<target()%> attribute: bad " - "string found %qs", token); + if (m_loc) + error_at (*m_loc, "unexpected arch for %<target()%> " + "attribute: bad string found %qs", token); goto fail; } @@ -179,8 +179,8 @@ fail: bool riscv_target_attr_parser::handle_arch (const char *str) { - if (m_found_arch_p) - error_at (m_loc, "%<target()%> attribute: arch appears more than once"); + if (m_found_arch_p && m_loc) + error_at (*m_loc, "%<target()%> attribute: arch appears more than once"); m_found_arch_p = true; return parse_arch (str); } @@ -190,15 +190,16 @@ riscv_target_attr_parser::handle_arch (const char *str) bool riscv_target_attr_parser::handle_cpu (const char *str) { - if (m_found_cpu_p) - error_at (m_loc, "%<target()%> attribute: cpu appears more than once"); + if (m_found_cpu_p && m_loc) + error_at (*m_loc, "%<target()%> attribute: cpu appears more than once"); m_found_cpu_p = true; const riscv_cpu_info *cpu_info = riscv_find_cpu (str); if (!cpu_info) { - error_at (m_loc, "%<target()%> attribute: unknown CPU %qs", str); + if (m_loc) + error_at (*m_loc, "%<target()%> attribute: unknown CPU %qs", str); return false; } @@ -218,14 +219,15 @@ riscv_target_attr_parser::handle_cpu (const char *str) bool riscv_target_attr_parser::handle_tune (const char *str) { - if (m_found_tune_p) - error_at (m_loc, "%<target()%> attribute: tune appears more than once"); + if (m_found_tune_p && m_loc) + error_at (*m_loc, "%<target()%> attribute: tune appears more than once"); m_found_tune_p = true; const struct riscv_tune_info *tune = riscv_parse_tune (str, true); if (tune == nullptr) { - error_at (m_loc, "%<target()%> attribute: unknown TUNE %qs", str); + if (m_loc) + error_at (*m_loc, "%<target()%> attribute: unknown TUNE %qs", str); return false; } @@ -237,13 +239,15 @@ riscv_target_attr_parser::handle_tune (const char *str) bool riscv_target_attr_parser::handle_priority (const char *str) { - if (m_found_priority_p) - error_at (m_loc, "%<target()%> attribute: priority appears more than once"); + if (m_found_priority_p && m_loc) + error_at (*m_loc, "%<target()%> attribute: priority appears " + "more than once"); m_found_priority_p = true; if (sscanf (str, "%d", &m_priority) != 1) { - error_at (m_loc, "%<target()%> attribute: invalid priority %qs", str); + if (m_loc) + error_at (*m_loc, "%<target()%> attribute: invalid priority %qs", str); return false; } @@ -282,7 +286,7 @@ riscv_target_attr_parser::update_settings (struct gcc_options *opts) const static bool riscv_process_one_target_attr (char *arg_str, - location_t loc, + location_t *loc, riscv_target_attr_parser &attr_parser, const struct riscv_attribute_info *attrs) { @@ -290,7 +294,8 @@ riscv_process_one_target_attr (char *arg_str, if (len == 0) { - error_at (loc, "malformed %<target()%> attribute"); + if (loc) + error_at (*loc, "malformed %<target()%> attribute"); return false; } @@ -302,10 +307,9 @@ riscv_process_one_target_attr (char *arg_str, if (!arg) { - error_at ( - loc, - "attribute %<target(\"%s\")%> does not accept an argument", - str_to_check); + if (loc) + error_at (*loc, "attribute %<target(\"%s\")%> does not " + "accept an argument", str_to_check); return false; } @@ -324,7 +328,8 @@ riscv_process_one_target_attr (char *arg_str, return (&attr_parser->*attr->handler) (arg); } - error_at (loc, "Got unknown attribute %<target(\"%s\")%>", str_to_check); + if (loc) + error_at (*loc, "Got unknown attribute %<target(\"%s\")%>", str_to_check); return false; } @@ -347,11 +352,12 @@ 
num_occurrences_in_str (char c, char *str) } /* Parse the string in ARGS that contains the target attribute information - and update the global target options space. */ + and update the global target options space. If LOC is nonnull, report + diagnostics against location *LOC, otherwise remain silent. */ bool riscv_process_target_attr (const char *args, - location_t loc, + location_t *loc, const struct riscv_attribute_info *attrs) { size_t len = strlen (args); @@ -387,8 +393,8 @@ riscv_process_target_attr (const char *args, if (num_attrs != num_semicolons + 1) { - error_at (loc, "malformed %<target(\"%s\")%> attribute", - args); + if (loc) + error_at (*loc, "malformed %<target(\"%s\")%> attribute", args); return false; } @@ -399,11 +405,12 @@ riscv_process_target_attr (const char *args, } /* Parse the tree in ARGS that contains the target attribute information - and update the global target options space. */ + and update the global target options space. If LOC is nonnull, report + diagnostics against *LOC, otherwise remain silent. */ static bool riscv_process_target_attr (tree args, - location_t loc, + location_t *loc, const struct riscv_attribute_info *attrs) { if (TREE_CODE (args) == TREE_LIST) @@ -424,7 +431,8 @@ riscv_process_target_attr (tree args, if (TREE_CODE (args) != STRING_CST) { - error_at (loc, "attribute %<target%> argument not a string"); + if (loc) + error_at (*loc, "attribute %<target%> argument not a string"); return false; } @@ -466,7 +474,7 @@ riscv_option_valid_attribute_p (tree fndecl, tree, tree args, int) TREE_TARGET_OPTION (target_option_default_node)); /* Now we can parse the attributes and set &global_options accordingly. */ - ret = riscv_process_target_attr (args, loc, riscv_target_attrs); + ret = riscv_process_target_attr (args, &loc, riscv_target_attrs); if (ret) { riscv_override_options_internal (&global_options); @@ -481,16 +489,19 @@ riscv_option_valid_attribute_p (tree fndecl, tree, tree args, int) } /* Parse the tree in ARGS that contains the target_version attribute - information and update the global target options space. */ + information and update the global target options space. If LOC is nonnull, + report diagnostics against *LOC, otherwise remain silent. */ bool -riscv_process_target_version_attr (tree args, location_t loc) +riscv_process_target_version_attr (tree args, location_t *loc) { if (TREE_CODE (args) == TREE_LIST) { if (TREE_CHAIN (args)) { - error ("attribute %<target_version%> has multiple values"); + if (loc) + error_at (*loc, "attribute %<target_version%> " + "has multiple values"); return false; } args = TREE_VALUE (args); @@ -498,7 +509,8 @@ riscv_process_target_version_attr (tree args, location_t loc) if (!args || TREE_CODE (args) != STRING_CST) { - error ("attribute %<target_version%> argument not a string"); + if (loc) + error_at (*loc, "attribute %<target_version%> argument not a string"); return false; } @@ -541,7 +553,7 @@ riscv_option_valid_version_attribute_p (tree fndecl, tree, tree args, int) cl_target_option_restore (&global_options, &global_options_set, TREE_TARGET_OPTION (target_option_current_node)); - ret = riscv_process_target_version_attr (args, loc); + ret = riscv_process_target_version_attr (args, &loc); /* Set up any additional state. 
*/ if (ret) diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index c9c8328..b27a0be 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -63,20 +63,37 @@ imm_avl_p (machine_mode mode) { poly_uint64 nunits = GET_MODE_NUNITS (mode); + /* For segmented operations AVL refers to a single register and not all NF + registers. Therefore divide the mode size by NF before checking if it is + in range. */ + int nf = 1; + if (riscv_v_ext_tuple_mode_p (mode)) + nf = get_nf (mode); + return nunits.is_constant () /* The vsetivli can only hold register 0~31. */ - ? (IN_RANGE (nunits.to_constant (), 0, 31)) + ? (IN_RANGE (nunits.to_constant () / nf, 0, 31)) /* Only allowed in VLS-VLMAX mode. */ : false; } -/* Return true if LEN is equal to NUNITS that out of the range [0, 31]. */ +/* Return true if LEN equals the number of units in MODE if MODE is either a + VLA mode or MODE is a VLS mode its size equals the vector size. + In that case we can emit a VLMAX insn which can be optimized more easily + by the vsetvl pass. */ + static bool is_vlmax_len_p (machine_mode mode, rtx len) { poly_int64 value; + if (poly_int_rtx_p (len, &value) + && known_eq (value, GET_MODE_NUNITS (mode)) + && known_eq (GET_MODE_UNIT_SIZE (mode) * value, BYTES_PER_RISCV_VECTOR)) + return true; + return poly_int_rtx_p (len, &value) - && known_eq (value, GET_MODE_NUNITS (mode)); + && !GET_MODE_NUNITS (mode).is_constant () + && known_eq (value, GET_MODE_NUNITS (mode)); } /* Helper functions for insn_flags && insn_types */ @@ -954,6 +971,26 @@ emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask) emit_vlmax_insn (icode, BINARY_OP_TAMU, ops); } +/* Function to emit a vslide1up instruction of mode MODE with destination + DEST and slideup element ELT. */ + +rtx +expand_slide1up (machine_mode mode, rtx dest, rtx elt) +{ + unsigned int unspec + = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; + insn_code icode = code_for_pred_slide (unspec, mode); + /* RVV Spec 16.3.1 + The destination vector register group for vslideup cannot overlap the + source vector register group, otherwise the instruction encoding + is reserved. Thus, we need a new register. */ + rtx tmp = gen_reg_rtx (mode); + rtx ops[] = {tmp, dest, elt}; + emit_vlmax_insn (icode, BINARY_OP, ops); + return tmp; +} + + /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress): https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc @@ -1175,16 +1212,7 @@ expand_vector_init_trailing_same_elem (rtx target, { rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1)); for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--) - { - unsigned int unspec - = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; - insn_code icode = code_for_pred_slide (unspec, mode); - rtx tmp = gen_reg_rtx (mode); - rtx ops[] = {tmp, dup, builder.elt (i)}; - emit_vlmax_insn (icode, BINARY_OP, ops); - /* slide1up need source and dest to be different REG. */ - dup = tmp; - } + dup = expand_slide1up (mode, dup, builder.elt (i)); emit_move_insn (target, dup); return true; @@ -1717,6 +1745,77 @@ expand_const_vector_stepped (rtx target, rtx src, rvv_builder *builder) gcc_unreachable (); } +/* We don't actually allow this case in legitimate_constant_p but + the middle-end still expects us to handle it in an expander + (see PR121334). This is assumed to happen very rarely so the + implementation is not very efficient, particularly + for short vectors. 
+*/ + +static void +expand_const_vector_onestep (rtx target, rvv_builder &builder) +{ + machine_mode mode = GET_MODE (target); + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); + gcc_assert (builder.nelts_per_pattern () == 2); + + /* We have n encoded patterns + {csta_0, cstb_0}, + {csta_1, cstb_1}, + ... + {csta_{n-1}, cstb_{n-1}} + which should become one vector: + {csta_0, csta_1, ..., csta_{n-1}, + cstb_0, cstb_1, ..., cstb_{n-1}, + ... + cstb_0, cstb_1, ..., cstb_{n-1}}. + + In order to achieve this we create a permute/gather constant + sel = {0, 1, ..., n - 1, 0, 1, ..., n - 1, ...} + and two vectors + va = {csta_0, csta_1, ..., csta_{n-1}}, + vb = {cstb_0, cstb_1, ..., cstb_{n-1}}. + + Then we use a VLMAX gather to "broadcast" vb and afterwards + overwrite the first n elements with va. */ + + int n = builder.npatterns (); + /* { 0, 1, 2, ..., n - 1 }. */ + rtx vid = gen_reg_rtx (mode); + expand_vec_series (vid, const0_rtx, const1_rtx); + + /* { 0, 1, ..., n - 1, 0, 1, ..., n - 1, ... }. */ + rtx sel = gen_reg_rtx (mode); + rtx and_ops[] = {sel, vid, GEN_INT (n)}; + emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP, and_ops); + + /* va = { ELT (0), ELT (1), ... ELT (n - 1) }. */ + rtx tmp1 = gen_reg_rtx (mode); + rtx ops1[] = {tmp1, builder.elt (0)}; + expand_broadcast (mode, ops1); + for (int i = 1; i < n; i++) + tmp1 = expand_slide1up (mode, tmp1, builder.elt (i)); + + /* vb = { ELT (n), ELT (n + 1), ... ELT (2 * n - 1) }. */ + rtx tmp2 = gen_reg_rtx (mode); + rtx ops2[] = {tmp2, builder.elt (n)}; + expand_broadcast (mode, ops2); + for (int i = 1; i < n; i++) + tmp2 = expand_slide1up (mode, tmp2, builder.elt (n + i)); + + /* Duplicate vb. */ + rtx tmp3 = gen_reg_rtx (mode); + emit_vlmax_gather_insn (tmp3, tmp2, sel); + + /* Overwrite the first n - 1 elements with va. */ + rtx dest = gen_reg_rtx (mode); + insn_code icode = code_for_pred_mov (mode); + rtx ops3[] = {dest, tmp3, tmp1}; + emit_nonvlmax_insn (icode, __MASK_OP_TUMA | UNARY_OP_P, ops3, GEN_INT (n)); + + emit_move_insn (target, dest); +} + static void expand_const_vector (rtx target, rtx src) { @@ -1744,6 +1843,8 @@ expand_const_vector (rtx target, rtx src) if (CONST_VECTOR_DUPLICATE_P (src)) return expand_const_vector_duplicate (target, &builder); + else if (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2) + return expand_const_vector_onestep (target, builder); else if (CONST_VECTOR_STEPPED_P (src)) return expand_const_vector_stepped (target, src, &builder); @@ -2648,8 +2749,14 @@ expand_vector_init_merge_repeating_sequence (rtx target, = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode); uint64_t full_nelts = builder.full_nelts ().to_constant (); + gcc_assert (builder.nelts_per_pattern () == 1 + || builder.nelts_per_pattern () == 2); + + rtx first + = builder.nelts_per_pattern () == 1 ? builder.elt (0) : builder.elt (1); + /* Step 1: Broadcast the first pattern. */ - rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))}; + rtx ops[] = {target, force_reg (builder.inner_mode (), first)}; expand_broadcast (builder.mode (), ops); /* Step 2: Merge the rest iteration of pattern. */ for (unsigned int i = 1; i < builder.npatterns (); i++) @@ -2677,7 +2784,10 @@ expand_vector_init_merge_repeating_sequence (rtx target, emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup)); /* Step 2-2: Merge pattern according to the mask. 
*/ - rtx ops[] = {target, target, builder.elt (i), mask}; + unsigned int which = i; + if (builder.nelts_per_pattern () == 2) + which = 2 * which + 1; + rtx ops[] = {target, target, builder.elt (which), mask}; emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)), MERGE_OP, ops); } @@ -3220,15 +3330,17 @@ expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) mask_mode = get_mask_mode (data_mode); rtx mask = gen_reg_rtx (mask_mode); rtx max_sel = gen_const_vector_dup (sel_mode, nunits); + bool overlap = reg_overlap_mentioned_p (target, op1); + rtx tmp_target = overlap ? gen_reg_rtx (data_mode) : target; /* Step 1: generate a mask that should select everything >= nunits into the * mask. */ expand_vec_cmp (mask, GEU, sel_mod, max_sel); - /* Step2: gather every op0 values indexed by sel into target, + /* Step2: gather every op0 values indexed by sel into TMP_TARGET, we don't need to care about the result of the element whose index >= nunits. */ - emit_vlmax_gather_insn (target, op0, sel_mod); + emit_vlmax_gather_insn (tmp_target, op0, sel_mod); /* Step3: shift the range from (nunits, max_of_mode] to [0, max_of_mode - nunits]. */ @@ -3238,7 +3350,10 @@ expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) /* Step4: gather those into the previously masked-out elements of target. */ - emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask); + emit_vlmax_masked_gather_mu_insn (tmp_target, op1, tmp, mask); + + if (overlap) + emit_move_insn (tmp_target, target); } /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */ @@ -4078,11 +4193,7 @@ shuffle_off_by_one_patterns (struct expand_vec_perm_d *d) emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode)); /* Insert the scalar into element 0. */ - unsigned int unspec - = FLOAT_MODE_P (d->vmode) ? 
UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; - insn_code icode = code_for_pred_slide (unspec, d->vmode); - rtx ops[] = {d->target, d->op1, tmp}; - emit_vlmax_insn (icode, BINARY_OP, ops); + expand_slide1up (d->vmode, d->op1, tmp); } return true; @@ -4376,13 +4487,11 @@ expand_strided_load (machine_mode mode, rtx *ops) int idx = 4; get_else_operand (ops[idx++]); rtx len = ops[idx]; - poly_int64 len_val; insn_code icode = code_for_pred_strided_load (mode); rtx emit_ops[] = {v_reg, mask, gen_rtx_MEM (mode, base), stride}; - if (poly_int_rtx_p (len, &len_val) - && known_eq (len_val, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) emit_vlmax_insn (icode, BINARY_OP_TAMA, emit_ops); else { @@ -4400,11 +4509,9 @@ expand_strided_store (machine_mode mode, rtx *ops) rtx stride = ops[1]; rtx mask = ops[3]; rtx len = ops[4]; - poly_int64 len_val; rtx vl_type; - if (poly_int_rtx_p (len, &len_val) - && known_eq (len_val, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) { len = gen_reg_rtx (Pmode); emit_vlmax_vsetvl (mode, len); diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 44ef44a..5e6cb67 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -607,7 +607,7 @@ costs::need_additional_vector_vars_p (stmt_vec_info stmt_info, if (type == load_vec_info_type || type == store_vec_info_type) { if (STMT_VINFO_GATHER_SCATTER_P (stmt_info) - && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) + && mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))) return true; machine_mode mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index e0d8904..591122f 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -3685,7 +3685,8 @@ riscv_legitimize_move (machine_mode mode, rtx dest, rtx src) /* This test can fail if (for example) we want a HF and Z[v]fh is not enabled. In that case we just want to let the standard expansion path run. */ - if (riscv_vector::get_vector_mode (smode, nunits).exists (&vmode)) + if (riscv_vector::get_vector_mode (smode, nunits).exists (&vmode) + && gen_lowpart_common (vmode, SUBREG_REG (src))) { rtx v = gen_lowpart (vmode, SUBREG_REG (src)); rtx int_reg = dest; @@ -3958,41 +3959,6 @@ riscv_extend_cost (rtx op, bool unsigned_p) return COSTS_N_INSNS (2); } -/* Return the cost of the vector binary rtx like add, minus, mult. - The cost of scalar2vr_cost will be appended if there one of the - op comes from the VEC_DUPLICATE. */ - -static int -get_vector_binary_rtx_cost (rtx x, int scalar2vr_cost) -{ - gcc_assert (riscv_v_ext_mode_p (GET_MODE (x))); - - rtx neg; - rtx op_0; - rtx op_1; - - if (GET_CODE (x) == UNSPEC) - { - op_0 = XVECEXP (x, 0, 0); - op_1 = XVECEXP (x, 0, 1); - } - else - { - op_0 = XEXP (x, 0); - op_1 = XEXP (x, 1); - } - - if (GET_CODE (op_0) == VEC_DUPLICATE - || GET_CODE (op_1) == VEC_DUPLICATE) - return (scalar2vr_cost + 1) * COSTS_N_INSNS (1); - else if (GET_CODE (neg = op_0) == NEG - && (GET_CODE (op_1) == VEC_DUPLICATE - || GET_CODE (XEXP (neg, 0)) == VEC_DUPLICATE)) - return (scalar2vr_cost + 1) * COSTS_N_INSNS (1); - else - return COSTS_N_INSNS (1); -} - /* Implement TARGET_RTX_COSTS. 
*/ #define SINGLE_SHIFT_COST 1 @@ -4014,73 +3980,20 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN { case SET: { - switch (GET_CODE (x)) + if (GET_CODE (x) == VEC_DUPLICATE) + *total = (scalar2vr_cost + 1) * COSTS_N_INSNS (1); + else { - case VEC_DUPLICATE: - *total = gr2vr_cost * COSTS_N_INSNS (1); - break; - case IF_THEN_ELSE: - { - rtx op = XEXP (x, 1); + int vec_dup_count = 0; + subrtx_var_iterator::array_type array; - switch (GET_CODE (op)) - { - case DIV: - case UDIV: - case MOD: - case UMOD: - case US_PLUS: - case US_MINUS: - case SS_PLUS: - case SS_MINUS: - *total = get_vector_binary_rtx_cost (op, scalar2vr_cost); - break; - case UNSPEC: - { - switch (XINT (op, 1)) - { - case UNSPEC_VAADDU: - case UNSPEC_VAADD: - *total - = get_vector_binary_rtx_cost (op, scalar2vr_cost); - break; - default: - *total = COSTS_N_INSNS (1); - break; - } - } - break; - default: - *total = COSTS_N_INSNS (1); - break; - } - } - break; - case PLUS: - case MINUS: - case AND: - case IOR: - case XOR: - case MULT: - case SMAX: - case UMAX: - case SMIN: - case UMIN: - { - rtx op; - rtx op_0 = XEXP (x, 0); - rtx op_1 = XEXP (x, 1); + FOR_EACH_SUBRTX_VAR (iter, array, x, ALL) + if (GET_CODE (*iter) == VEC_DUPLICATE) + vec_dup_count++; - if (GET_CODE (op = op_0) == MULT - || GET_CODE (op = op_1) == MULT) - *total = get_vector_binary_rtx_cost (op, scalar2vr_cost); - else - *total = get_vector_binary_rtx_cost (x, scalar2vr_cost); - } - break; - default: - *total = COSTS_N_INSNS (1); - break; + int total_vec_dup_cost = vec_dup_count * scalar2vr_cost; + + *total = COSTS_N_INSNS (1) * (total_vec_dup_cost + 1); } } break; @@ -5532,9 +5445,9 @@ canonicalize_comparands (rtx_code code, rtx *op0, rtx *op1) /* We might have been handed back a SUBREG. Just to make things easy, force it into a REG. */ - if (!REG_P (*op0) && !CONST_INT_P (*op0)) + if (!REG_P (*op0) && !CONST_INT_P (*op0) && INTEGRAL_MODE_P (GET_MODE (*op0))) *op0 = force_reg (word_mode, *op0); - if (!REG_P (*op1) && !CONST_INT_P (*op1)) + if (!REG_P (*op1) && !CONST_INT_P (*op1) && INTEGRAL_MODE_P (GET_MODE (*op1))) *op1 = force_reg (word_mode, *op1); } @@ -6213,7 +6126,8 @@ riscv_pass_vls_aggregate_in_gpr (struct riscv_arg_info *info, machine_mode mode, For a library call, FNTYPE is 0. */ void -riscv_init_cumulative_args (CUMULATIVE_ARGS *cum, tree fntype, rtx, tree, int) +riscv_init_cumulative_args (CUMULATIVE_ARGS *cum, const_tree fntype, + rtx, tree, int) { memset (cum, 0, sizeof (*cum)); @@ -6494,30 +6408,44 @@ riscv_arg_partial_bytes (cumulative_args_t cum, return arg.stack_p ? arg.num_gprs * UNITS_PER_WORD : 0; } -/* Implement FUNCTION_VALUE and LIBCALL_VALUE. For normal calls, - VALTYPE is the return type and MODE is VOIDmode. For libcalls, - VALTYPE is null and MODE is the mode of the return value. */ +/* Implements hook TARGET_FUNCTION_VALUE. */ rtx -riscv_function_value (const_tree type, const_tree func, machine_mode mode) +riscv_function_value (const_tree ret_type, const_tree fn_decl_or_type, + bool) { struct riscv_arg_info info; CUMULATIVE_ARGS args; - if (type) + if (fn_decl_or_type) { - int unsigned_p = TYPE_UNSIGNED (type); + const_tree fntype = TREE_CODE (fn_decl_or_type) == FUNCTION_DECL ? 
+ TREE_TYPE (fn_decl_or_type) : fn_decl_or_type; + riscv_init_cumulative_args (&args, fntype, NULL_RTX, NULL_TREE, 0); + } + else + memset (&args, 0, sizeof args); - mode = TYPE_MODE (type); + int unsigned_p = TYPE_UNSIGNED (ret_type); - /* Since TARGET_PROMOTE_FUNCTION_MODE unconditionally promotes, - return values, promote the mode here too. */ - mode = promote_function_mode (type, mode, &unsigned_p, func, 1); - } + machine_mode mode = TYPE_MODE (ret_type); - memset (&args, 0, sizeof args); + /* Since TARGET_PROMOTE_FUNCTION_MODE unconditionally promotes, + return values, promote the mode here too. */ + mode = promote_function_mode (ret_type, mode, &unsigned_p, fn_decl_or_type, 1); - return riscv_get_arg_info (&info, &args, mode, type, true, true); + return riscv_get_arg_info (&info, &args, mode, ret_type, true, true); +} + +/* Implements hook TARGET_LIBCALL_VALUE. */ + +rtx +riscv_libcall_value (machine_mode mode, const_rtx fun ATTRIBUTE_UNUSED) +{ + struct riscv_arg_info info; + CUMULATIVE_ARGS args; + memset (&args, 0, sizeof args); + return riscv_get_arg_info (&info, &args, mode, NULL_TREE, true, true); } /* Implement TARGET_PASS_BY_REFERENCE. */ @@ -14037,10 +13965,13 @@ riscv_c_mode_for_floating_type (enum tree_index ti) return default_mode_for_floating_type (ti); } -/* This parses the attribute arguments to target_version in DECL and modifies - the feature mask and priority required to select those targets. */ +/* Parse the attribute arguments to target_version in DECL and modify + the feature mask and priority required to select those targets. + If LOC is nonnull, report diagnostics against *LOC, otherwise + remain silent. */ static void parse_features_for_version (tree decl, + location_t *loc, struct riscv_feature_bits &res, int &priority) { @@ -14071,14 +14002,12 @@ parse_features_for_version (tree decl, cl_target_option_restore (&global_options, &global_options_set, default_opts); - riscv_process_target_version_attr (TREE_VALUE (version_attr), - DECL_SOURCE_LOCATION (decl)); + riscv_process_target_version_attr (TREE_VALUE (version_attr), loc); priority = global_options.x_riscv_fmv_priority; const char *arch_string = global_options.x_riscv_arch_string; bool parse_res - = riscv_minimal_hwprobe_feature_bits (arch_string, &res, - DECL_SOURCE_LOCATION (decl)); + = riscv_minimal_hwprobe_feature_bits (arch_string, &res, loc); gcc_assert (parse_res); cl_target_option_restore (&global_options, &global_options_set, @@ -14135,8 +14064,8 @@ riscv_compare_version_priority (tree decl1, tree decl2) struct riscv_feature_bits mask1, mask2; int prio1, prio2; - parse_features_for_version (decl1, mask1, prio1); - parse_features_for_version (decl2, mask2, prio2); + parse_features_for_version (decl1, nullptr, mask1, prio1); + parse_features_for_version (decl2, nullptr, mask2, prio2); return compare_fmv_features (mask1, mask2, prio1, prio2); } @@ -14439,6 +14368,7 @@ dispatch_function_versions (tree dispatch_decl, version_info.version_decl = version_decl; // Get attribute string, parse it and find the right features. parse_features_for_version (version_decl, + &DECL_SOURCE_LOCATION (version_decl), version_info.features, version_info.prio); function_versions.push_back (version_info); @@ -15441,6 +15371,217 @@ synthesize_and (rtx operands[3]) return true; } +/* Synthesize OPERANDS[0] = OPERANDS[1] + OPERANDS[2]. + + OPERANDS[0] and OPERANDS[1] will be a REG and may be the same + REG. + + OPERANDS[2] is a CONST_INT. 
+ + Return TRUE if the operation was fully synthesized and the caller + need not generate additional code. Return FALSE if the operation + was not synthesized and the caller is responsible for emitting the + proper sequence. */ + +bool +synthesize_add (rtx operands[3]) +{ + /* Trivial cases that don't need synthesis. */ + if (SMALL_OPERAND (INTVAL (operands[2]))) + return false; + + int budget1 = riscv_const_insns (operands[2], true); + int budget2 = riscv_const_insns (GEN_INT (-INTVAL (operands[2])), true); + + HOST_WIDE_INT ival = INTVAL (operands[2]); + + /* If we can emit two addi insns then that's better than synthesizing + the constant into a temporary, then adding the temporary to the + other input. The exception is when the constant can be loaded + in a single instruction which can issue whenever it's convenient. */ + if (SUM_OF_TWO_S12 (ival) && budget1 >= 2) + { + HOST_WIDE_INT saturated = HOST_WIDE_INT_M1U << (IMM_BITS - 1); + + if (ival >= 0) + saturated = ~saturated; + + ival -= saturated; + + rtx x = gen_rtx_PLUS (word_mode, operands[1], GEN_INT (saturated)); + emit_insn (gen_rtx_SET (operands[0], x)); + rtx output = gen_rtx_PLUS (word_mode, operands[0], GEN_INT (ival)); + emit_insn (gen_rtx_SET (operands[0], output)); + return true; + } + + /* If we can shift the constant by 1, 2, or 3 bit positions + and the result is a cheaper constant, then do so. */ + ival = INTVAL (operands[2]); + if (TARGET_ZBA + && (((ival % 2) == 0 && budget1 + > riscv_const_insns (GEN_INT (ival >> 1), true)) + || ((ival % 4) == 0 && budget1 + > riscv_const_insns (GEN_INT (ival >> 2), true)) + || ((ival % 8) == 0 && budget1 + > riscv_const_insns (GEN_INT (ival >> 3), true)))) + { + // Load the shifted constant into a temporary + int shct = ctz_hwi (ival); + + /* We can handle shifting up to 3 bit positions via shNadd. */ + if (shct > 3) + shct = 3; + + /* The adjusted constant may still need synthesis, so do not copy + it directly into a register. Let the expander handle it. */ + rtx tmp = force_reg (word_mode, GEN_INT (ival >> shct)); + + /* Generate shift-add of temporary and operands[1] + into the final destination. */ + rtx x = gen_rtx_ASHIFT (word_mode, tmp, GEN_INT (shct)); + rtx output = gen_rtx_PLUS (word_mode, x, operands[1]); + emit_insn (gen_rtx_SET (operands[0], output)); + return true; + } + + /* If the negated constant is cheaper than the original, then negate + the constant and use sub. */ + if (budget2 < budget1) + { + // load -INTVAL (operands[2]) into a temporary + rtx tmp = force_reg (word_mode, GEN_INT (-INTVAL (operands[2]))); + + // subtract operands[2] from operands[1] + rtx output = gen_rtx_MINUS (word_mode, operands[1], tmp); + emit_insn (gen_rtx_SET (operands[0], output)); + return true; + } + + /* No add synthesis was found. Synthesize the constant into + a temporary and use that. */ + rtx x = force_reg (word_mode, operands[2]); + x = gen_rtx_PLUS (word_mode, operands[1], x); + emit_insn (gen_rtx_SET (operands[0], x)); + return true; +} +
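(Illustrative aside, not part of the patch: as a worked example of the SUM_OF_TWO_S12 path in synthesize_add above, take ival = 2500, which takes two instructions (lui plus addi) to materialize, so budget1 >= 2 holds. saturated becomes 2047, the largest simm12, and ival is reduced to 453, so the expander emits the equivalent of addi rd,rs,2047 followed by addi rd,rd,453 instead of loading 2500 into a temporary register first.)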
*/ + + +bool +synthesize_add_extended (rtx operands[3]) +{ + +/* If operands[2] is a 12-bit signed immediate, + no synthesis needs to be done. */ + + if (SMALL_OPERAND (INTVAL (operands[2]))) + return false; + + HOST_WIDE_INT ival = INTVAL (operands[2]); + int budget1 = riscv_const_insns (operands[2], true); + int budget2 = riscv_const_insns (GEN_INT (-INTVAL (operands[2])), true); + +/* If operands[2] can be split into two 12-bit signed immediates, + split add into two adds. */ + + if (SUM_OF_TWO_S12 (ival)) + { + HOST_WIDE_INT saturated = HOST_WIDE_INT_M1U << (IMM_BITS - 1); + + if (ival >= 0) + saturated = ~saturated; + + ival -= saturated; + + rtx temp = gen_reg_rtx (DImode); + emit_insn (gen_addsi3_extended (temp, operands[1], GEN_INT (saturated))); + temp = gen_lowpart (SImode, temp); + SUBREG_PROMOTED_VAR_P (temp) = 1; + SUBREG_PROMOTED_SET (temp, SRP_SIGNED); + emit_insn (gen_rtx_SET (operands[0], temp)); + rtx t = gen_reg_rtx (DImode); + emit_insn (gen_addsi3_extended (t, operands[0], GEN_INT (ival))); + t = gen_lowpart (SImode, t); + SUBREG_PROMOTED_VAR_P (t) = 1; + SUBREG_PROMOTED_SET (t, SRP_SIGNED); + emit_move_insn (operands[0], t); + return true; + } + + +/* If the negated value is cheaper to synthesize, subtract that from + operands[1]. */ + + if (budget2 < budget1) + { + rtx tmp = gen_reg_rtx (SImode); + emit_insn (gen_rtx_SET (tmp, GEN_INT (-INTVAL (operands[2])))); + + rtx t = gen_reg_rtx (DImode); + emit_insn (gen_subsi3_extended (t, operands[1], tmp)); + t = gen_lowpart (SImode, t); + SUBREG_PROMOTED_VAR_P (t) = 1; + SUBREG_PROMOTED_SET (t, SRP_SIGNED); + emit_move_insn (operands[0], t); + return true; + } + + rtx tsrc = force_reg (SImode, operands[2]); + rtx tdest = gen_reg_rtx (DImode); + emit_insn (gen_addsi3_extended (tdest, operands[1], tsrc)); + tdest = gen_lowpart (SImode, tdest); + SUBREG_PROMOTED_VAR_P (tdest) = 1; + SUBREG_PROMOTED_SET (tdest, SRP_SIGNED); + emit_move_insn (operands[0], tdest); + return true; + +} + + +/* + HINT : argument specify the target cache + + TODO : LOCALITY is unused. + + Return the first operand of the associated PREF or PREFX insn. */ +rtx +riscv_prefetch_cookie (rtx hint, rtx locality) +{ + return (GEN_INT (INTVAL (hint) + + CacheHint::DCACHE_HINT + INTVAL (locality) * 0)); +} + +/* Return true if X is a legitimate address with offset for prefetch. + MODE is the mode of the value being accessed. */ +bool +riscv_prefetch_offset_address_p (rtx x, machine_mode mode) +{ + struct riscv_address_info addr; + + if (riscv_classify_address (&addr, x, mode, false) + && addr.type == ADDRESS_REG) + { + if (TARGET_XMIPSCBOP) + return (CONST_INT_P (addr.offset) + && MIPS_RISCV_9BIT_OFFSET_P (INTVAL (addr.offset))); + } + + return true; +} /* Initialize the GCC target structure. 
*/ #undef TARGET_ASM_ALIGNED_HI_OP @@ -15804,6 +15945,12 @@ synthesize_and (rtx operands[3]) #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P riscv_vector_mode_supported_any_target_p +#undef TARGET_FUNCTION_VALUE +#define TARGET_FUNCTION_VALUE riscv_function_value + +#undef TARGET_LIBCALL_VALUE +#define TARGET_LIBCALL_VALUE riscv_libcall_value + #undef TARGET_FUNCTION_VALUE_REGNO_P #define TARGET_FUNCTION_VALUE_REGNO_P riscv_function_value_regno_p diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 29342d8..9146571 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -765,12 +765,6 @@ enum reg_class #define CALLEE_SAVED_FREG_NUMBER(REGNO) CALLEE_SAVED_REG_NUMBER (REGNO - 32) -#define LIBCALL_VALUE(MODE) \ - riscv_function_value (NULL_TREE, NULL_TREE, MODE) - -#define FUNCTION_VALUE(VALTYPE, FUNC) \ - riscv_function_value (VALTYPE, FUNC, VOIDmode) - /* 1 if N is a possible register number for function argument passing. We have no FP argument registers when soft-float. */ @@ -1325,4 +1319,15 @@ extern void riscv_remove_unneeded_save_restore_calls (void); #define TARGET_HAS_FMV_TARGET_ATTRIBUTE 0 +/* mips pref valid offset range. */ +#define MIPS_RISCV_9BIT_OFFSET_P(OFFSET) (IN_RANGE (OFFSET, 0, 511)) + +/* mips pref cache hint type. */ +typedef enum { + ICACHE_HINT = 0 << 3, + DCACHE_HINT = 1 << 3, + SCACHE_HINT = 2 << 3, + TCACHE_HINT = 3 << 3 +} CacheHint; + #endif /* ! GCC_RISCV_H */ diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 578dd43..d34405c 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -712,24 +712,45 @@ (set_attr "mode" "SI")]) (define_expand "addsi3" - [(set (match_operand:SI 0 "register_operand" "=r,r") - (plus:SI (match_operand:SI 1 "register_operand" " r,r") - (match_operand:SI 2 "arith_operand" " r,I")))] + [(set (match_operand:SI 0 "register_operand") + (plus:SI (match_operand:SI 1 "register_operand") + (match_operand:SI 2 "reg_or_const_int_operand")))] "" { + /* We may be able to find a faster sequence, if so, then we are + done. Otherwise let expansion continue normally. */ + if (CONST_INT_P (operands[2]) + && ((!TARGET_64BIT && synthesize_add (operands)) + || (TARGET_64BIT && synthesize_add_extended (operands)))) + DONE; + + /* Constants have already been handled already. */ if (TARGET_64BIT) { - rtx t = gen_reg_rtx (DImode); - emit_insn (gen_addsi3_extended (t, operands[1], operands[2])); - t = gen_lowpart (SImode, t); - SUBREG_PROMOTED_VAR_P (t) = 1; - SUBREG_PROMOTED_SET (t, SRP_SIGNED); - emit_move_insn (operands[0], t); + rtx tdest = gen_reg_rtx (DImode); + emit_insn (gen_addsi3_extended (tdest, operands[1], operands[2])); + tdest = gen_lowpart (SImode, tdest); + SUBREG_PROMOTED_VAR_P (tdest) = 1; + SUBREG_PROMOTED_SET (tdest, SRP_SIGNED); + emit_move_insn (operands[0], tdest); DONE; } + }) -(define_insn "adddi3" +(define_expand "adddi3" + [(set (match_operand:DI 0 "register_operand") + (plus:DI (match_operand:DI 1 "register_operand") + (match_operand:DI 2 "reg_or_const_int_operand")))] + "TARGET_64BIT" +{ + /* We may be able to find a faster sequence, if so, then we are + done. Otherwise let expansion continue normally. 
 */
+  if (CONST_INT_P (operands[2]) && synthesize_add (operands))
+    DONE;
+})
+
+(define_insn "*adddi3"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
	(plus:DI (match_operand:DI 1 "register_operand" " r,r")
		 (match_operand:DI 2 "arith_operand" " r,I")))]
@@ -2293,12 +2314,16 @@
   rtx abs_reg = gen_reg_rtx (<ANYF:MODE>mode);
   rtx coeff_reg = gen_reg_rtx (<ANYF:MODE>mode);
   rtx tmp_reg = gen_reg_rtx (<ANYF:MODE>mode);
+  rtx fflags = gen_reg_rtx (SImode);

   riscv_emit_move (tmp_reg, operands[1]);
   riscv_emit_move (coeff_reg,
		    riscv_vector::get_fp_rounding_coefficient (<ANYF:MODE>mode));
   emit_insn (gen_abs<ANYF:mode>2 (abs_reg, operands[1]));

+  /* The FP compare can set the invalid flag for NaN, so back up fflags.  */
+  if (flag_trapping_math)
+    emit_insn (gen_riscv_frflags (fflags));
   riscv_expand_conditional_branch (label, LT, abs_reg, coeff_reg);
   emit_jump_insn (gen_jump (end_label));
@@ -2324,6 +2349,14 @@
   emit_insn (gen_copysign<ANYF:mode>3 (tmp_reg, abs_reg, operands[1]));

   emit_label (end_label);
+
+  /* Restore fflags, but after the label.  This is slightly different
+     from the glibc implementation, which only needs to restore under
+     the label: it checks for NaN first, so the following FP compare
+     can't raise FP exceptions and thus can't clobber fflags.  */
+  if (flag_trapping_math)
+    emit_insn (gen_riscv_fsflags (fflags));
+
   riscv_emit_move (operands[0], tmp_reg);
 }
@@ -4402,11 +4435,21 @@
 )

 (define_insn "prefetch"
-  [(prefetch (match_operand 0 "prefetch_operand" "Qr")
-	     (match_operand 1 "imm5_operand" "i")
-	     (match_operand 2 "const_int_operand" "n"))]
-  "TARGET_ZICBOP"
+  [(prefetch (match_operand 0 "prefetch_operand" "Qr,ZD")
+	     (match_operand 1 "imm5_operand" "i,i")
+	     (match_operand 2 "const_int_operand" "n,n"))]
+  "TARGET_ZICBOP || TARGET_XMIPSCBOP"
 {
+  if (TARGET_XMIPSCBOP)
+    {
+      /* The MIPS prefetch-for-write is a nop on the p8700.  */
+      if (operands[1] != CONST0_RTX (GET_MODE (operands[1])))
+	return "nop";
+
+      operands[1] = riscv_prefetch_cookie (operands[1], operands[2]);
+      return "mips.pref\t%1,%a0";
+    }
+
   switch (INTVAL (operands[1]))
     {
     case 0:
diff --git a/gcc/config/riscv/sifive-p400.md b/gcc/config/riscv/sifive-p400.md
index ed8b8ec..0acdbda 100644
--- a/gcc/config/riscv/sifive-p400.md
+++ b/gcc/config/riscv/sifive-p400.md
@@ -153,10 +153,13 @@
     (eq_attr "type" "fmove,fcvt"))
  "p400_float_pipe,sifive_p400_fpu")

+;; We need something for HF so that we don't abort during scheduling
+;; if someone were to ask for p400 scheduling while also enabling the
+;; various HF mode extensions.
 (define_insn_reservation "sifive_p400_fdiv_s" 18
   (and (eq_attr "tune" "sifive_p400")
        (eq_attr "type" "fdiv,fsqrt")
-       (eq_attr "mode" "SF"))
+       (eq_attr "mode" "HF,SF"))
  "sifive_p400_FM, sifive_p400_fdiv*5")

 (define_insn_reservation "sifive_p400_fdiv_d" 31
@@ -178,3 +181,18 @@
 (define_bypass 1 "sifive_p400_f2i"
  "sifive_p400_branch,sifive_p400_sfb_alu,sifive_p400_mul,
   sifive_p400_div,sifive_p400_alu,sifive_p400_cpop")
+
+
+;; Someone familiar with the p400 uarch needs to put
+;; these into the right reservations.  This is just a placeholder
+;; for everything I found that had no mapping to a reservation.
+;;
+;; Note that even if the processor does not implement a particular
+;; instruction, it should still have suitable reservations, even if
+;; they are just dummies like this one.
+(define_insn_reservation "sifive_p400_unknown" 1 + (and (eq_attr "tune" "sifive_p400") + (eq_attr "type" "ghost,vfrecp,vclmul,vldm,vmffs,vclmulh,vlsegde,vfcvtitof,vsm4k,vfcvtftoi,vfdiv,vsm3c,vsm4r,viwmuladd,vfwredu,vcpop,vfwmuladd,vstux,vsshift,vfwcvtftof,vfncvtftof,vfwmaccbf16,vext,vssegte,rdvl,vaeskf1,vfslide1up,vmov,vimovvx,vaesef,vfsqrt,viminmax,vfwcvtftoi,vssegtox,vfclass,viwmul,vector,vgmul,vsm3me,vfcmp,vstm,vfredo,vfwmul,vaeskf2,vstox,vfncvtbf16,vislide1up,vgather,vldox,viwred,vctz,vghsh,vsts,vslidedown,vfmerge,vicmp,vsmul,vlsegdff,vfalu,vfmov,vislide1down,vfminmax,vcompress,vldr,vldff,vlsegdux,vimuladd,vsalu,vidiv,sf_vqmacc,vfslide1down,vaesem,vimerge,vfncvtftoi,vfwcvtitof,vicalu,vaesz,sf_vc_se,vsha2cl,vmsfs,vldux,vmidx,vslideup,vired,vlde,vfwredo,vfmovfv,vbrev,vfncvtitof,rdfrm,vsetvl,vssegts,vimul,vialu,vbrev8,vfwalu,rdvlenb,sf_vfnrclip,vclz,vnclip,sf_vc,vimov,vste,vfmuladd,vfmovvf,vwsll,vsetvl_pre,vlds,vlsegds,vmiota,vmalu,wrvxrm,wrfrm,viwalu,vaesdm,vssegtux,vaesdf,vimovxv,vror,vnshift,vstr,vaalu,vsha2ms,crypto,vfwcvtbf16,vlsegdox,vrol,vandn,vfsgnj,vmpop,vfredu,vsha2ch,vshift,vrev8,vfmul")) + "p400_int_pipe+sifive_p400_ialu") + + diff --git a/gcc/config/riscv/sifive-p600.md b/gcc/config/riscv/sifive-p600.md index 2401349..ccd006d 100644 --- a/gcc/config/riscv/sifive-p600.md +++ b/gcc/config/riscv/sifive-p600.md @@ -157,10 +157,13 @@ (eq_attr "type" "fmove,fcvt")) "float_pipe,sifive_p600_fpu") +;; We need something for HF so that we don't abort during +;; scheduling if someone was to ask for p600 scheduling, but +;; enable the various HF mode extensions. (define_insn_reservation "sifive_p600_fdiv_s" 11 (and (eq_attr "tune" "sifive_p600") (eq_attr "type" "fdiv,fsqrt") - (eq_attr "mode" "SF")) + (eq_attr "mode" "HF,SF")) "sifive_p600_FM, sifive_p600_fdiv*5") (define_insn_reservation "sifive_p600_fdiv_d" 19 @@ -182,3 +185,15 @@ (define_bypass 1 "sifive_p600_f2i" "sifive_p600_branch,sifive_p600_sfb_alu,sifive_p600_mul, sifive_p600_div,sifive_p600_alu,sifive_p600_cpop") + +;; Someone familiar with the p600 uarch needs to put +;; these into the right reservations. This is just a placeholder +;; for everything I found that had no mapping to a reservation. +;; +;; Note that even if the processor does not implementat a particular +;; instruction it should still have suitable reservations, even if +;; they are just dummies like this one. 
+(define_insn_reservation "sifive_p600_unknown" 1 + (and (eq_attr "tune" "sifive_p600") + (eq_attr "type" "vicmp,vssegte,vbrev8,vfwalu,vimov,vmpop,vaesdf,vislide1up,vror,vsha2cl,vrol,vslideup,vimuladd,vclmul,vaesef,vext,vlsegdff,vfmuladd,vfclass,vmsfs,vfcmp,vsmul,vsm3me,vmalu,vshift,viwmuladd,vfslide1up,vlsegde,vsm4k,wrvxrm,vislide1down,vsm3c,vfwmuladd,vaesdm,vclmulh,vfwcvtftof,vfwredu,vfredo,sf_vfnrclip,vaesz,vwsll,vmiota,vctz,vsetvl_pre,vstm,vidiv,vssegtux,vfwmul,vcompress,vste,vired,vlsegds,vaesem,vfminmax,ghost,vandn,crypto,vfmul,vialu,vfmovvf,rdfrm,vldff,vfmerge,vsshift,vnclip,sf_vqmacc,vnshift,vfdiv,vfslide1down,vfncvtitof,vfsqrt,vimovxv,vstr,vfwcvtbf16,vfwcvtitof,vbrev,vssegtox,vssegts,vcpop,vmffs,viwmul,vldr,vmidx,rdvlenb,vfalu,vslidedown,vlde,vfsgnj,vfmov,viwalu,vsha2ch,vfncvtbf16,vfcvtitof,rdvl,vsetvl,vsha2ms,vector,vstux,vimerge,vclz,sf_vc,vfcvtftoi,viminmax,vsm4r,sf_vc_se,wrfrm,vstox,vfmovfv,vfncvtftoi,vimul,vsalu,vmov,vgmul,vgather,vldux,vlsegdox,vfncvtftof,vimovvx,vghsh,vldm,vldox,vfwcvtftoi,vlds,vfrecp,vaeskf2,vsts,vfredu,vicalu,vaalu,vfwmaccbf16,vrev8,vfwredo,vlsegdux,viwred,vaeskf1")) + "int_pipe+sifive_p600_ialu") diff --git a/gcc/config/riscv/sync.md b/gcc/config/riscv/sync.md index 50ec8b3..ab6f430 100644 --- a/gcc/config/riscv/sync.md +++ b/gcc/config/riscv/sync.md @@ -376,7 +376,19 @@ (match_operand:SI 3 "const_int_operand")] ;; model "TARGET_ZAAMO || TARGET_ZALRSC" { - if (TARGET_ZAAMO) + if (TARGET_ZAAMO && TARGET_64BIT && <MODE>mode == SImode) + { + rtx t = gen_reg_rtx (DImode); + emit_insn (gen_amo_atomic_exchange_extended (t, + operands[1], + operands[2], + operands[3])); + t = gen_lowpart (SImode, t); + SUBREG_PROMOTED_VAR_P (t) = 1; + SUBREG_PROMOTED_SET (t, SRP_SIGNED); + emit_move_insn (operands[0], t); + } + else if (TARGET_ZAAMO) emit_insn (gen_amo_atomic_exchange<mode> (operands[0], operands[1], operands[2], operands[3])); else @@ -386,18 +398,31 @@ }) (define_insn "amo_atomic_exchange<mode>" - [(set (match_operand:GPR 0 "register_operand" "=&r") + [(set (match_operand:GPR 0 "register_operand" "=r") (unspec_volatile:GPR [(match_operand:GPR 1 "memory_operand" "+A") (match_operand:SI 3 "const_int_operand")] ;; model UNSPEC_SYNC_EXCHANGE)) (set (match_dup 1) - (match_operand:GPR 2 "register_operand" "0"))] + (match_operand:GPR 2 "reg_or_0_operand" "rJ"))] "TARGET_ZAAMO" "amoswap.<amo>%A3\t%0,%z2,%1" [(set_attr "type" "atomic") (set (attr "length") (const_int 4))]) +(define_insn "amo_atomic_exchange_extended" + [(set (match_operand:DI 0 "register_operand" "=r") + (sign_extend:DI (unspec_volatile:SI + [(match_operand:SI 1 "memory_operand" "+A") + (match_operand:SI 3 "const_int_operand")] ;; model + UNSPEC_SYNC_EXCHANGE))) + (set (match_dup 1) + (match_operand:SI 2 "reg_or_0_operand" "rJ"))] + "TARGET_64BIT && TARGET_ZAAMO" + "amoswap.w%A3\t%0,%z2,%1" + [(set_attr "type" "atomic") + (set (attr "length") (const_int 4))]) + (define_insn "lrsc_atomic_exchange<mode>" [(set (match_operand:GPR 0 "register_operand" "=&r") (unspec_volatile:GPR @@ -434,13 +459,13 @@ }) (define_insn "zabha_atomic_exchange<mode>" - [(set (match_operand:SHORT 0 "register_operand" "=&r") + [(set (match_operand:SHORT 0 "register_operand" "=r") (unspec_volatile:SHORT [(match_operand:SHORT 1 "memory_operand" "+A") (match_operand:SI 3 "const_int_operand")] ;; model UNSPEC_SYNC_EXCHANGE_ZABHA)) (set (match_dup 1) - (match_operand:SHORT 2 "register_operand" "0"))] + (match_operand:SHORT 2 "reg_or_0_operand" "rJ"))] "TARGET_ZABHA" "amoswap.<amobh>%A3\t%0,%z2,%1" [(set_attr "type" "atomic") diff 
--git a/gcc/config/riscv/t-rtems b/gcc/config/riscv/t-rtems index f596e76..a4d2d03 100644 --- a/gcc/config/riscv/t-rtems +++ b/gcc/config/riscv/t-rtems @@ -1,8 +1,8 @@ MULTILIB_OPTIONS = MULTILIB_DIRNAMES = -MULTILIB_OPTIONS += march=rv32i/march=rv32iac/march=rv32im/march=rv32imf/march=rv32ima/march=rv32imac/march=rv32imaf/march=rv32imafc/march=rv32imafd/march=rv32imafdc/march=rv64ima/march=rv64imac/march=rv64imafd/march=rv64imafdc -MULTILIB_DIRNAMES += rv32i rv32iac rv32im rv32imf rv32ima rv32imac rv32imaf rv32imafc rv32imafd rv32imafdc rv64ima rv64imac rv64imafd rv64imafdc +MULTILIB_OPTIONS += march=rv32i/march=rv32iac/march=rv32im/march=rv32imf/march=rv32ima/march=rv32imac/march=rv32imaf/march=rv32imafc/march=rv32imafd/march=rv32imafdc/march=rv64ima/march=rv64imac/march=rv64imafd/march=rv64imafdc/march=rv64imc +MULTILIB_DIRNAMES += rv32i rv32iac rv32im rv32imf rv32ima rv32imac rv32imaf rv32imafc rv32imafd rv32imafdc rv64ima rv64imac rv64imafd rv64imafdc rv64imc MULTILIB_OPTIONS += mabi=ilp32/mabi=ilp32f/mabi=ilp32d/mabi=lp64/mabi=lp64d MULTILIB_DIRNAMES += ilp32 ilp32f ilp32d lp64 lp64d @@ -10,6 +10,9 @@ MULTILIB_DIRNAMES += ilp32 ilp32f ilp32d lp64 lp64d MULTILIB_OPTIONS += mcmodel=medany MULTILIB_DIRNAMES += medany +MULTILIB_OPTIONS += mstrict-align +MULTILIB_DIRNAMES += strict-align + MULTILIB_REQUIRED = MULTILIB_REQUIRED += march=rv32i/mabi=ilp32 MULTILIB_REQUIRED += march=rv32iac/mabi=ilp32 @@ -25,3 +28,5 @@ MULTILIB_REQUIRED += march=rv64ima/mabi=lp64/mcmodel=medany MULTILIB_REQUIRED += march=rv64imac/mabi=lp64/mcmodel=medany MULTILIB_REQUIRED += march=rv64imafd/mabi=lp64d/mcmodel=medany MULTILIB_REQUIRED += march=rv64imafdc/mabi=lp64d/mcmodel=medany +MULTILIB_REQUIRED += march=rv64imafdc/mabi=lp64d/mcmodel=medany/mstrict-align +MULTILIB_REQUIRED += march=rv64imc/mabi=lp64/mcmodel=medany/mstrict-align diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index 66b7670..2b35d66 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -1398,6 +1398,7 @@ } [(set_attr "type" "vmov,vlde,vste") (set_attr "mode" "<VT:MODE>") + (set (attr "merge_op_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "avl_type_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "mode_idx") (const_int INVALID_ATTRIBUTE))]) @@ -1435,6 +1436,7 @@ } [(set_attr "type" "vlde,vste,vmov") (set_attr "mode" "<MODE>") + (set (attr "merge_op_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "avl_type_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "mode_idx") (const_int INVALID_ATTRIBUTE))] ) @@ -1485,6 +1487,7 @@ } [(set_attr "type" "vlde,vste,vmov") (set_attr "mode" "<VLS_AVL_REG:MODE>") + (set (attr "merge_op_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "avl_type_idx") (const_int INVALID_ATTRIBUTE)) (set (attr "mode_idx") (const_int INVALID_ATTRIBUTE))] ) @@ -5490,6 +5493,98 @@ "TARGET_VECTOR" {}) +(define_expand "@pred_mul_plus_vx_<mode>" + [(set (match_operand:V_VLSI_QHS 0 "register_operand") + (if_then_else:V_VLSI_QHS + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand") + (match_operand 6 "vector_length_operand") + (match_operand 7 "const_int_operand") + (match_operand 8 "const_int_operand") + (match_operand 9 "const_int_operand") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (plus:V_VLSI_QHS + (mult:V_VLSI_QHS + (vec_duplicate:V_VLSI_QHS + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSI_QHS 3 "register_operand")) + (match_operand:V_VLSI_QHS 4 "register_operand")) + (match_operand:V_VLSI_QHS 5 "vector_merge_operand")))] + 
"TARGET_VECTOR" +{ + riscv_vector::prepare_ternary_operands (operands); +}) + +(define_expand "@pred_mul_plus_vx_<mode>" + [(set (match_operand:V_VLSI_D 0 "register_operand") + (if_then_else:V_VLSI_D + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand") + (match_operand 6 "vector_length_operand") + (match_operand 7 "const_int_operand") + (match_operand 8 "const_int_operand") + (match_operand 9 "const_int_operand") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (plus:V_VLSI_D + (mult:V_VLSI_D + (vec_duplicate:V_VLSI_D + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSI_D 3 "register_operand")) + (match_operand:V_VLSI_D 4 "register_operand")) + (match_operand:V_VLSI_D 5 "vector_merge_operand")))] + "TARGET_VECTOR && TARGET_64BIT" +{ + riscv_vector::prepare_ternary_operands (operands); +}) + +(define_expand "@pred_vnmsac_vx_<mode>" + [(set (match_operand:V_VLSI_QHS 0 "register_operand") + (if_then_else:V_VLSI_QHS + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand") + (match_operand 6 "vector_length_operand") + (match_operand 7 "const_int_operand") + (match_operand 8 "const_int_operand") + (match_operand 9 "const_int_operand") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (minus:V_VLSI_QHS + (match_operand:V_VLSI_QHS 4 "register_operand") + (mult:V_VLSI_QHS + (vec_duplicate:V_VLSI_QHS + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSI_QHS 3 "register_operand"))) + (match_operand:V_VLSI_QHS 5 "vector_merge_operand")))] + "TARGET_VECTOR" +{ + riscv_vector::prepare_ternary_operands (operands); +}) + +(define_expand "@pred_vnmsac_vx_<mode>" + [(set (match_operand:V_VLSI_D 0 "register_operand") + (if_then_else:V_VLSI_D + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand") + (match_operand 6 "vector_length_operand") + (match_operand 7 "const_int_operand") + (match_operand 8 "const_int_operand") + (match_operand 9 "const_int_operand") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (minus:V_VLSI_D + (match_operand:V_VLSI_D 4 "register_operand") + (mult:V_VLSI_D + (vec_duplicate:V_VLSI_D + (match_operand:<VEL> 2 "register_operand")) + (match_operand:V_VLSI_D 3 "register_operand"))) + (match_operand:V_VLSI_D 5 "vector_merge_operand")))] + "TARGET_VECTOR && TARGET_64BIT" +{ + riscv_vector::prepare_ternary_operands (operands); +}) + (define_insn "*pred_madd<mode>_scalar" [(set (match_operand:V_VLSI 0 "register_operand" "=vd, vr") (if_then_else:V_VLSI @@ -6324,8 +6419,8 @@ (set_attr "mode" "<MODE>")]) (define_insn "@pred_<optab><mode>_scalar" - [(set (match_operand:VF 0 "register_operand" "=vd, vd, vr, vr") - (if_then_else:VF + [(set (match_operand:V_VLSF 0 "register_operand" "=vd, vd, vr, vr") + (if_then_else:V_VLSF (unspec:<VM> [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") @@ -6336,11 +6431,11 @@ (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM) (reg:SI FRM_REGNUM)] UNSPEC_VPREDICATE) - (commutative_float_binop:VF - (vec_duplicate:VF + (commutative_float_binop:V_VLSF + (vec_duplicate:V_VLSF (match_operand:<VEL> 4 "register_operand" " f, f, f, f")) - (match_operand:VF 3 "register_operand" " vr, vr, vr, vr")) - (match_operand:VF 2 "vector_merge_operand" " vu, 0, vu, 0")))] + (match_operand:V_VLSF 3 "register_operand" " vr, vr, vr, vr")) + (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "vf<insn>.vf\t%0,%3,%4%p1" [(set_attr "type" "<float_insn_type>") @@ -6349,43 +6444,43 @@ 
(symbol_ref "riscv_vector::get_frm_mode (operands[9])"))]) (define_insn "@pred_<optab><mode>_scalar" - [(set (match_operand:VF 0 "register_operand" "=vd, vd, vr, vr") - (if_then_else:VF + [(set (match_operand:V_VLSF 0 "register_operand" "=vd, vd, vr, vr") + (if_then_else:V_VLSF (unspec:<VM> - [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") - (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") - (match_operand 6 "const_int_operand" " i, i, i, i") - (match_operand 7 "const_int_operand" " i, i, i, i") - (match_operand 8 "const_int_operand" " i, i, i, i") + [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") + (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") + (match_operand 6 "const_int_operand" " i, i, i, i") + (match_operand 7 "const_int_operand" " i, i, i, i") + (match_operand 8 "const_int_operand" " i, i, i, i") (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (commutative_float_binop_nofrm:VF - (vec_duplicate:VF - (match_operand:<VEL> 4 "register_operand" " f, f, f, f")) - (match_operand:VF 3 "register_operand" " vr, vr, vr, vr")) - (match_operand:VF 2 "vector_merge_operand" " vu, 0, vu, 0")))] + (commutative_float_binop_nofrm:V_VLSF + (vec_duplicate:V_VLSF + (match_operand:<VEL> 4 "register_operand" " f, f, f, f")) + (match_operand:V_VLSF 3 "register_operand" " vr, vr, vr, vr")) + (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "vf<insn>.vf\t%0,%3,%4%p1" [(set_attr "type" "<float_insn_type>") (set_attr "mode" "<MODE>")]) (define_insn "@pred_<ieee_fmaxmin_op><mode>_scalar" - [(set (match_operand:VF 0 "register_operand" "=vd, vd, vr, vr") - (if_then_else:VF + [(set (match_operand:V_VLSF 0 "register_operand" "=vd, vd, vr, vr") + (if_then_else:V_VLSF (unspec:<VM> - [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") - (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") - (match_operand 6 "const_int_operand" " i, i, i, i") - (match_operand 7 "const_int_operand" " i, i, i, i") - (match_operand 8 "const_int_operand" " i, i, i, i") + [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") + (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") + (match_operand 6 "const_int_operand" " i, i, i, i") + (match_operand 7 "const_int_operand" " i, i, i, i") + (match_operand 8 "const_int_operand" " i, i, i, i") (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) - (unspec:VF - [(match_operand:VF 3 "register_operand" " vr, vr, vr, vr") - (vec_duplicate:VF + (unspec:V_VLSF + [(match_operand:V_VLSF 3 "register_operand" " vr, vr, vr, vr") + (vec_duplicate:V_VLSF (match_operand:<VEL> 4 "register_operand" " f, f, f, f"))] UNSPEC_VFMAXMIN) - (match_operand:VF 2 "vector_merge_operand" " vu, 0, vu, 0")))] + (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "v<ieee_fmaxmin_op>.vf\t%0,%3,%4%p1" [(set_attr "type" "vfminmax") @@ -6417,8 +6512,8 @@ (symbol_ref "riscv_vector::get_frm_mode (operands[9])"))]) (define_insn "@pred_<optab><mode>_reverse_scalar" - [(set (match_operand:VF 0 "register_operand" "=vd, vd, vr, vr") - (if_then_else:VF + [(set (match_operand:V_VLSF 0 "register_operand" "=vd, vd, vr, vr") + (if_then_else:V_VLSF (unspec:<VM> [(match_operand:<VM> 1 "vector_mask_operand" " vm, vm,Wc1,Wc1") (match_operand 5 "vector_length_operand" "rvl,rvl,rvl,rvl") @@ -6429,11 +6524,11 @@ (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM) (reg:SI FRM_REGNUM)] UNSPEC_VPREDICATE) - (non_commutative_float_binop:VF - (vec_duplicate:VF + 
(non_commutative_float_binop:V_VLSF + (vec_duplicate:V_VLSF (match_operand:<VEL> 4 "register_operand" " f, f, f, f")) - (match_operand:VF 3 "register_operand" " vr, vr, vr, vr")) - (match_operand:VF 2 "vector_merge_operand" " vu, 0, vu, 0")))] + (match_operand:V_VLSF 3 "register_operand" " vr, vr, vr, vr")) + (match_operand:V_VLSF 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" "vfr<insn>.vf\t%0,%3,%4%p1" [(set_attr "type" "<float_insn_type>") @@ -8839,6 +8934,106 @@ [(set_attr "type" "vssegt<order>x") (set_attr "mode" "<V32T:MODE>")]) +(define_insn "*pred_macc_<mode>_scalar_undef" + [(set (match_operand:V_VLSI_QHS 0 "register_operand" "=vd, vr") + (if_then_else:V_VLSI_QHS + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand" " vm, Wc1") + (match_operand 6 "vector_length_operand" "rvl, rvl") + (match_operand 7 "const_int_operand" " i, i") + (match_operand 8 "const_int_operand" " i, i") + (match_operand 9 "const_int_operand" " i, i") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (plus:V_VLSI_QHS + (mult:V_VLSI_QHS + (vec_duplicate:V_VLSI_QHS + (match_operand:<VEL> 3 "reg_or_0_operand" " rJ, rJ")) + (match_operand:V_VLSI_QHS 4 "register_operand" " vr, vr")) + (match_operand:V_VLSI_QHS 5 "register_operand" " 0, 0")) + (match_operand:V_VLSI_QHS 2 "vector_undef_operand")))] + "TARGET_VECTOR" + "@ + vmacc.vx\t%0,%z3,%4%p1 + vmacc.vx\t%0,%z3,%4%p1" + [(set_attr "type" "vimuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*pred_macc_<mode>_scalar_undef" + [(set (match_operand:V_VLSI_D 0 "register_operand" "=vd, vr") + (if_then_else:V_VLSI_D + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand" " vm, Wc1") + (match_operand 6 "vector_length_operand" "rvl, rvl") + (match_operand 7 "const_int_operand" " i, i") + (match_operand 8 "const_int_operand" " i, i") + (match_operand 9 "const_int_operand" " i, i") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (plus:V_VLSI_D + (mult:V_VLSI_D + (vec_duplicate:V_VLSI_D + (match_operand:<VEL> 3 "reg_or_0_operand" " rJ, rJ")) + (match_operand:V_VLSI_D 4 "register_operand" " vr, vr")) + (match_operand:V_VLSI_D 5 "register_operand" " 0, 0")) + (match_operand:V_VLSI_D 2 "vector_undef_operand")))] + "TARGET_VECTOR && TARGET_64BIT" + "@ + vmacc.vx\t%0,%z3,%4%p1 + vmacc.vx\t%0,%z3,%4%p1" + [(set_attr "type" "vimuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*pred_nmsac_<mode>_scalar_undef" + [(set (match_operand:V_VLSI_QHS 0 "register_operand" "=vd, vr") + (if_then_else:V_VLSI_QHS + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand" " vm, Wc1") + (match_operand 6 "vector_length_operand" "rvl, rvl") + (match_operand 7 "const_int_operand" " i, i") + (match_operand 8 "const_int_operand" " i, i") + (match_operand 9 "const_int_operand" " i, i") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (minus:V_VLSI_QHS + (match_operand:V_VLSI_QHS 5 "register_operand" " 0, 0") + (mult:V_VLSI_QHS + (vec_duplicate:V_VLSI_QHS + (match_operand:<VEL> 3 "reg_or_0_operand" " rJ, rJ")) + (match_operand:V_VLSI_QHS 4 "register_operand" " vr, vr"))) + (match_operand:V_VLSI_QHS 2 "vector_undef_operand")))] + "TARGET_VECTOR" + "@ + vnmsac.vx\t%0,%z3,%4%p1 + vnmsac.vx\t%0,%z3,%4%p1" + [(set_attr "type" "vimuladd") + (set_attr "mode" "<MODE>")]) + +(define_insn "*pred_nmsac_<mode>_scalar_undef" + [(set (match_operand:V_VLSI_D 0 "register_operand" "=vd, vr") + (if_then_else:V_VLSI_D + (unspec:<VM> + [(match_operand:<VM> 1 "vector_mask_operand" " vm, Wc1") + (match_operand 6 
"vector_length_operand" "rvl, rvl") + (match_operand 7 "const_int_operand" " i, i") + (match_operand 8 "const_int_operand" " i, i") + (match_operand 9 "const_int_operand" " i, i") + (reg:SI VL_REGNUM) + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) + (minus:V_VLSI_D + (match_operand:V_VLSI_D 5 "register_operand" " 0, 0") + (mult:V_VLSI_D + (vec_duplicate:V_VLSI_D + (match_operand:<VEL> 3 "reg_or_0_operand" " rJ, rJ")) + (match_operand:V_VLSI_D 4 "register_operand" " vr, vr"))) + (match_operand:V_VLSI_D 2 "vector_undef_operand")))] + "TARGET_VECTOR && TARGET_64BIT" + "@ + vnmsac.vx\t%0,%z3,%4%p1 + vnmsac.vx\t%0,%z3,%4%p1" + [(set_attr "type" "vimuladd") + (set_attr "mode" "<MODE>")]) + (include "autovec.md") (include "autovec-opt.md") (include "sifive-vector.md") diff --git a/gcc/config/riscv/xiangshan.md b/gcc/config/riscv/xiangshan.md index 34b4a8f..6179140 100644 --- a/gcc/config/riscv/xiangshan.md +++ b/gcc/config/riscv/xiangshan.md @@ -144,13 +144,13 @@ (define_insn_reservation "xiangshan_sfdiv" 11 (and (eq_attr "tune" "xiangshan") (eq_attr "type" "fdiv") - (eq_attr "mode" "SF")) + (eq_attr "mode" "HF,SF")) "xs_fmisc_rs") (define_insn_reservation "xiangshan_sfsqrt" 17 (and (eq_attr "tune" "xiangshan") (eq_attr "type" "fsqrt") - (eq_attr "mode" "SF")) + (eq_attr "mode" "HF,SF")) "xs_fmisc_rs") (define_insn_reservation "xiangshan_dfdiv" 21 diff --git a/gcc/config/rl78/rl78.opt.urls b/gcc/config/rl78/rl78.opt.urls index 96eff5f..66e874b 100644 --- a/gcc/config/rl78/rl78.opt.urls +++ b/gcc/config/rl78/rl78.opt.urls @@ -4,7 +4,7 @@ msim UrlSuffix(gcc/RL78-Options.html#index-msim-6) mmul= -UrlSuffix(gcc/RL78-Options.html#index-mmul) +UrlSuffix(gcc/RL78-Options.html#index-mmul-1) mallregs UrlSuffix(gcc/RL78-Options.html#index-mallregs) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 764b499..8dd23f8 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -10322,7 +10322,7 @@ can_be_rotated_to_negative_lis (HOST_WIDE_INT c, int *rot) rotated over the highest bit. 
*/ unsigned HOST_WIDE_INT uc = c; int pos_one = clz_hwi ((HOST_WIDE_INT) (uc << 16) >> 16); - if (pos_one != 0) + if (pos_one > 0 && pos_one < HOST_BITS_PER_WIDE_INT) { middle_zeros = ctz_hwi (c >> (HOST_BITS_PER_WIDE_INT - pos_one)); int middle_ones = clz_hwi (~(uc << pos_one)); @@ -10585,7 +10585,7 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c, int *num_insns) { /* li/lis; rldicX */ unsigned HOST_WIDE_INT imm = (c | ~mask); - if (shift != 0) + if (shift > 0 && shift < HOST_BITS_PER_WIDE_INT) imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift)); count_or_emit_insn (temp, GEN_INT (imm)); diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index e31ee40..04a6c0f 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -15665,10 +15665,10 @@ (if_then_else:SI (lt (match_dup 3) (const_int 0)) (const_int -1) - (if_then_else (gt (match_dup 3) - (const_int 0)) - (const_int 1) - (const_int 0))))] + (if_then_else:SI (gt (match_dup 3) + (const_int 0)) + (const_int 1) + (const_int 0))))] "TARGET_P9_MISC" { operands[3] = gen_reg_rtx (CCmode); @@ -15703,10 +15703,10 @@ (if_then_else:SI (lt (match_operand:CC 1 "cc_reg_operand" "y") (const_int 0)) (const_int -1) - (if_then_else (gt (match_dup 1) - (const_int 0)) - (const_int 1) - (const_int 0))))] + (if_then_else:SI (gt (match_dup 1) + (const_int 0)) + (const_int 1) + (const_int 0))))] "TARGET_P9_MISC" "setb %0,%1" [(set_attr "type" "logical")]) @@ -15716,10 +15716,10 @@ (if_then_else:SI (ltu (match_operand:CCUNS 1 "cc_reg_operand" "y") (const_int 0)) (const_int -1) - (if_then_else (gtu (match_dup 1) - (const_int 0)) - (const_int 1) - (const_int 0))))] + (if_then_else:SI (gtu (match_dup 1) + (const_int 0)) + (const_int 1) + (const_int 0))))] "TARGET_P9_MISC" "setb %0,%1" [(set_attr "type" "logical")]) @@ -15751,10 +15751,10 @@ (if_then_else:SI (lt (match_dup 3) (const_int 0)) (const_int -1) - (if_then_else (gt (match_dup 3) - (const_int 0)) - (const_int 1) - (const_int 0))))] + (if_then_else:SI (gt (match_dup 3) + (const_int 0)) + (const_int 1) + (const_int 0))))] "TARGET_P9_MISC" { operands[3] = gen_reg_rtx (CCmode); @@ -15807,10 +15807,10 @@ (if_then_else:SI (lt (match_dup 3) (const_int 0)) (const_int -1) - (if_then_else (gt (match_dup 3) - (const_int 0)) - (const_int 1) - (const_int 0))))] + (if_then_else:SI (gt (match_dup 3) + (const_int 0)) + (const_int 1) + (const_int 0))))] "TARGET_P9_MISC && TARGET_64BIT" { operands[3] = gen_reg_rtx (CCmode); diff --git a/gcc/config/rx/rx.cc b/gcc/config/rx/rx.cc index dd730dc..c563881 100644 --- a/gcc/config/rx/rx.cc +++ b/gcc/config/rx/rx.cc @@ -1648,16 +1648,20 @@ mark_frame_related (rtx insn) static void add_pop_cfi_notes (rtx_insn *insn, unsigned int high, unsigned int low) { - rtx t = plus_constant (Pmode, stack_pointer_rtx, - (high - low + 1) * UNITS_PER_WORD); + rtx src = stack_pointer_rtx; + rtx t; + for (unsigned int i = low; i <= high; i++) + { + add_reg_note (insn, REG_CFA_RESTORE, gen_rtx_REG (word_mode, i)); + if (i == FRAME_POINTER_REGNUM && frame_pointer_needed) + src = frame_pointer_rtx; + } + t = plus_constant (Pmode, src, (high - low + 1) * UNITS_PER_WORD); t = gen_rtx_SET (stack_pointer_rtx, t); add_reg_note (insn, REG_CFA_ADJUST_CFA, t); RTX_FRAME_RELATED_P (insn) = 1; - for (unsigned int i = low; i <= high; i++) - add_reg_note (insn, REG_CFA_RESTORE, gen_rtx_REG (word_mode, i)); } - static bool ok_for_max_constant (HOST_WIDE_INT val) { @@ -1816,36 +1820,17 @@ rx_expand_prologue (void) } } - /* If needed, set up the frame pointer. 
*/ - if (frame_pointer_needed) - gen_safe_add (frame_pointer_rtx, stack_pointer_rtx, - GEN_INT (- (HOST_WIDE_INT) frame_size), true); - - /* Allocate space for the outgoing args. - If the stack frame has not already been set up then handle this as well. */ - if (stack_size) + if (stack_size || frame_size) { - if (frame_size) - { - if (frame_pointer_needed) - gen_safe_add (stack_pointer_rtx, frame_pointer_rtx, - GEN_INT (- (HOST_WIDE_INT) stack_size), true); - else - gen_safe_add (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (- (HOST_WIDE_INT) (frame_size + stack_size)), - true); - } - else - gen_safe_add (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (- (HOST_WIDE_INT) stack_size), true); + gen_safe_add (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (- (HOST_WIDE_INT) (stack_size + frame_size)), + true); } - else if (frame_size) + if (frame_pointer_needed) { - if (! frame_pointer_needed) - gen_safe_add (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (- (HOST_WIDE_INT) frame_size), true); - else - gen_safe_add (stack_pointer_rtx, frame_pointer_rtx, NULL_RTX, true); + gen_safe_add (frame_pointer_rtx, stack_pointer_rtx, + GEN_INT ((HOST_WIDE_INT) stack_size), + true); } } diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index d044f9a..1a47f47 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -8318,7 +8318,7 @@ s390_expand_int_spaceship (rtx op0, rtx op1, rtx op2, rtx op3) } /* Expand floating-point op0 = op1 <=> op2, i.e., - op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : op1 > op2 ? 1 : 2. + op0 = op1 == op2 ? 0 : op1 < op2 ? -1 : op1 > op2 ? 1 : -128. If op3 equals const0_rtx, then we are interested in the compare only (see test spaceship-fp-4.c). Otherwise, op3 is a CONST_INT different than @@ -8368,7 +8368,7 @@ s390_expand_fp_spaceship (rtx op0, rtx op1, rtx op2, rtx op3) { emit_jump (l_end); emit_label (l_unordered); - rtx unord_val = op3 == const0_rtx ? const2_rtx : op3; + rtx unord_val = op3 == const0_rtx ? GEN_INT (-128) : op3; emit_move_insn (op0, unord_val); } emit_label (l_end); diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 8cc48b0..858387c 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -5248,18 +5248,19 @@ }) (define_insn "*zero_extendsidi2" - [(set (match_operand:DI 0 "register_operand" "=d,d,d") - (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "d,T,b")))] + [(set (match_operand:DI 0 "register_operand" "=d,d,d,d") + (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand" "d,T,b,v")))] "TARGET_ZARCH" "@ llgfr\t%0,%1 llgf\t%0,%1 - llgfrl\t%0,%1" - [(set_attr "op_type" "RRE,RXY,RIL") - (set_attr "type" "*,*,larl") - (set_attr "cpu_facility" "*,*,z10") - (set_attr "z10prop" "z10_fwd_E1,z10_fwd_A3,z10_fwd_A3") - (set_attr "relative_long" "*,*,yes")]) + llgfrl\t%0,%1 + vlgvf\t%0,%v1,0" + [(set_attr "op_type" "RRE,RXY,RIL,VRS") + (set_attr "type" "*,*,larl,*") + (set_attr "cpu_facility" "*,*,z10,vx") + (set_attr "z10prop" "z10_fwd_E1,z10_fwd_A3,z10_fwd_A3,*") + (set_attr "relative_long" "*,*,yes,*")]) ; ; LLGT-type instructions (zero-extend from 31 bit to 64 bit). 
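For reference, the s390.cc hunk above only changes the value produced for unordered operands in the floating-point spaceship expansion: the default moves from 2 to -128, and a caller that passes a nonzero op3 still gets its own value back. A minimal standalone sketch of the mapping described by the updated comment (the -128 is taken from that comment; nothing below is GCC code):

#include <cassert>
#include <cmath>

// Value mapping from the updated s390_expand_fp_spaceship comment:
// equal -> 0, less -> -1, greater -> 1, unordered -> -128.
static int fp_spaceship (double op1, double op2)
{
  if (op1 == op2)
    return 0;      // equal
  if (op1 < op2)
    return -1;     // less
  if (op1 > op2)
    return 1;      // greater
  return -128;     // unordered: at least one operand is a NaN
}

int main ()
{
  assert (fp_spaceship (1.0, 2.0) == -1);
  assert (fp_spaceship (2.0, 1.0) == 1);
  assert (fp_spaceship (1.0, 1.0) == 0);
  assert (fp_spaceship (NAN, 1.0) == -128);
  return 0;
}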
@@ -5362,29 +5363,32 @@ ; llhrl, llghrl (define_insn "*zero_extendhi<mode>2_z10" - [(set (match_operand:GPR 0 "register_operand" "=d,d,d") - (zero_extend:GPR (match_operand:HI 1 "nonimmediate_operand" "d,T,b")))] + [(set (match_operand:GPR 0 "register_operand" "=d,d,d,d") + (zero_extend:GPR (match_operand:HI 1 "nonimmediate_operand" "d,T,b,v")))] "TARGET_Z10" "@ ll<g>hr\t%0,%1 ll<g>h\t%0,%1 - ll<g>hrl\t%0,%1" - [(set_attr "op_type" "RXY,RRE,RIL") - (set_attr "type" "*,*,larl") - (set_attr "cpu_facility" "*,*,z10") - (set_attr "z10prop" "z10_super_E1,z10_fwd_A3,z10_fwd_A3") - (set_attr "relative_long" "*,*,yes")]) + ll<g>hrl\t%0,%1 + vlgvh\t%0,%v1,0" + [(set_attr "op_type" "RXY,RRE,RIL,VRS") + (set_attr "type" "*,*,larl,*") + (set_attr "cpu_facility" "*,*,z10,vx") + (set_attr "z10prop" "z10_super_E1,z10_fwd_A3,z10_fwd_A3,*") + (set_attr "relative_long" "*,*,yes,*")]) ; llhr, llcr, llghr, llgcr, llh, llc, llgh, llgc (define_insn "*zero_extend<HQI:mode><GPR:mode>2_extimm" - [(set (match_operand:GPR 0 "register_operand" "=d,d") - (zero_extend:GPR (match_operand:HQI 1 "nonimmediate_operand" "d,T")))] + [(set (match_operand:GPR 0 "register_operand" "=d,d,d") + (zero_extend:GPR (match_operand:HQI 1 "nonimmediate_operand" "d,T,v")))] "TARGET_EXTIMM" "@ ll<g><hc>r\t%0,%1 - ll<g><hc>\t%0,%1" - [(set_attr "op_type" "RRE,RXY") - (set_attr "z10prop" "z10_super_E1,z10_fwd_A3")]) + ll<g><hc>\t%0,%1 + vlgv<HQI:bhfgq>\t%0,%v1,0" + [(set_attr "op_type" "RRE,RXY,VRS") + (set_attr "cpu_facility" "*,*,vx") + (set_attr "z10prop" "z10_super_E1,z10_fwd_A3,*")]) ; llgh, llgc (define_insn "*zero_extend<HQI:mode><GPR:mode>2" diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 12bbeb6..745634e 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -501,54 +501,6 @@ SIL,SIL,RI,RI,RRE,RRE,RIL,RR,RXY,RXY,RIL")]) -; Instructions vlgvb, vlgvh, vlgvf zero all remaining bits of a GPR, i.e., -; an implicit zero extend is done. 
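That implicit zero extension is what lets the integer zero-extend patterns above grow a "v" (vlgv) alternative and lets the dedicated vector.md patterns below be folded into combined vec_extract-plus-zero_extend patterns. A rough illustration of the source shape involved, using GCC's generic vector extension; whether a single vlgvh is actually selected depends on the architecture level and options, so treat this only as a sketch:

#include <cstdint>

// Eight 16-bit elements in a 128-bit vector register.
typedef uint16_t v8hi __attribute__ ((vector_size (16)));

// Element extraction plus zero extension into a 64-bit GPR; VLGV does
// both at once because it clears the remaining bits of the GPR.  The
// index is masked the same way the patterns mask their constant index.
uint64_t extract_and_zero_extend (v8hi v, unsigned idx)
{
  return v[idx & 7];
}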
- -(define_insn "*movdi<mode>_zero_extend_A" - [(set (match_operand:DI 0 "register_operand" "=d") - (zero_extend:DI (match_operand:SINT 1 "register_operand" "v")))] - "TARGET_VX" - "vlgv<bhfgq>\t%0,%v1,0" - [(set_attr "op_type" "VRS")]) - -(define_insn "*movsi<mode>_zero_extend_A" - [(set (match_operand:SI 0 "register_operand" "=d") - (zero_extend:SI (match_operand:HQI 1 "register_operand" "v")))] - "TARGET_VX" - "vlgv<bhfgq>\t%0,%v1,0" - [(set_attr "op_type" "VRS")]) - -(define_mode_iterator VLGV_DI [V1QI V2QI V4QI V8QI V16QI - V1HI V2HI V4HI V8HI - V1SI V2SI V4SI]) -(define_insn "*movdi<mode>_zero_extend_B" - [(set (match_operand:DI 0 "register_operand" "=d") - (zero_extend:DI (vec_select:<non_vec> - (match_operand:VLGV_DI 1 "register_operand" "v") - (parallel [(match_operand:SI 2 "const_int_operand" "n")]))))] - "TARGET_VX" -{ - operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (<MODE>mode) - 1)); - return "vlgv<bhfgq>\t%0,%v1,%Y2"; -} - [(set_attr "op_type" "VRS") - (set_attr "mnemonic" "vlgv<bhfgq>")]) - -(define_mode_iterator VLGV_SI [V1QI V2QI V4QI V8QI V16QI - V1HI V2HI V4HI V8HI]) -(define_insn "*movsi<mode>_zero_extend_B" - [(set (match_operand:SI 0 "register_operand" "=d") - (zero_extend:SI (vec_select:<non_vec> - (match_operand:VLGV_SI 1 "register_operand" "v") - (parallel [(match_operand:SI 2 "const_int_operand" "n")]))))] - "TARGET_VX" -{ - operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (<MODE>mode) - 1)); - return "vlgv<bhfgq>\t%0,%v1,%Y2"; -} - [(set_attr "op_type" "VRS") - (set_attr "mnemonic" "vlgv<bhfgq>")]) - ; vec_load_lanes? ; vec_store_lanes? @@ -763,6 +715,42 @@ DONE; }) +; Instructions vlgvb, vlgvh, vlgvf zero all remaining bits of a GPR, i.e., +; an implicit zero extend is done. + +(define_mode_iterator VLGV_DI [V1QI V2QI V4QI V8QI V16QI + V1HI V2HI V4HI V8HI + V1SI V2SI V4SI]) +(define_insn "*vec_extract<mode>_zero_extend" + [(set (match_operand:DI 0 "register_operand" "=d") + (zero_extend:DI (vec_select:<non_vec> + (match_operand:VLGV_DI 1 "register_operand" "v") + (parallel [(match_operand:SI 2 "nonmemory_operand" "an")]))))] + "TARGET_VX" +{ + if (CONST_INT_P (operands[2])) + operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (<MODE>mode) - 1)); + return "vlgv<bhfgq>\t%0,%v1,%Y2"; +} + [(set_attr "op_type" "VRS") + (set_attr "mnemonic" "vlgv<bhfgq>")]) + +(define_mode_iterator VLGV_SI [V1QI V2QI V4QI V8QI V16QI + V1HI V2HI V4HI V8HI]) +(define_insn "*vec_extract<mode>_zero_extend" + [(set (match_operand:SI 0 "register_operand" "=d") + (zero_extend:SI (vec_select:<non_vec> + (match_operand:VLGV_SI 1 "register_operand" "v") + (parallel [(match_operand:SI 2 "nonmemory_operand" "an")]))))] + "TARGET_VX" +{ + if (CONST_INT_P (operands[2])) + operands[2] = GEN_INT (UINTVAL (operands[2]) & (GET_MODE_NUNITS (<MODE>mode) - 1)); + return "vlgv<bhfgq>\t%0,%v1,%Y2"; +} + [(set_attr "op_type" "VRS") + (set_attr "mnemonic" "vlgv<bhfgq>")]) + (define_insn "*vec_vllezlf<mode>" [(set (match_operand:V_HW_4 0 "register_operand" "=v") (vec_concat:V_HW_4 diff --git a/gcc/config/xtensa/constraints.md b/gcc/config/xtensa/constraints.md index 77c9571..727ec1e 100644 --- a/gcc/config/xtensa/constraints.md +++ b/gcc/config/xtensa/constraints.md @@ -130,7 +130,7 @@ (and (match_code "mem") (match_test "smalloffset_mem_p (op)"))) -(define_memory_constraint "T" +(define_special_memory_constraint "T" "Memory in a literal pool (addressable with an L32R instruction)." 
(and (match_code "mem") (match_test "!TARGET_CONST16 && constantpool_mem_p (op)"))) diff --git a/gcc/config/xtensa/predicates.md b/gcc/config/xtensa/predicates.md index 9aeaba6..20160a4 100644 --- a/gcc/config/xtensa/predicates.md +++ b/gcc/config/xtensa/predicates.md @@ -189,6 +189,9 @@ (define_predicate "ubranch_operator" (match_code "ltu,geu")) +(define_predicate "alt_ubranch_operator" + (match_code "gtu,leu")) + (define_predicate "boolean_operator" (match_code "eq,ne")) diff --git a/gcc/config/xtensa/xtensa-protos.h b/gcc/config/xtensa/xtensa-protos.h index 1f5dcf5..98e75c6 100644 --- a/gcc/config/xtensa/xtensa-protos.h +++ b/gcc/config/xtensa/xtensa-protos.h @@ -60,6 +60,7 @@ extern bool xtensa_tls_referenced_p (rtx); extern enum rtx_code xtensa_shlrd_which_direction (rtx, rtx); extern bool xtensa_split1_finished_p (void); extern void xtensa_split_DI_reg_imm (rtx *); +extern char *xtensa_bswapsi2_output (rtx_insn *, const char *); #ifdef TREE_CODE extern void init_cumulative_args (CUMULATIVE_ARGS *, int); diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc index d75cba4..f3b89de 100644 --- a/gcc/config/xtensa/xtensa.cc +++ b/gcc/config/xtensa/xtensa.cc @@ -2645,6 +2645,94 @@ xtensa_split_DI_reg_imm (rtx *operands) } +/* Return the asm output string of bswapsi2_internal insn pattern. + It does this by scanning backwards for the BB from the specified insn, + and if an another bswapsi2_internal is found, it omits the instruction + to set SAR to 8. If not found, or if a CALL, JUMP, ASM, or other insn + that clobbers SAR is found first, prepend an instruction to set SAR to + 8 as usual. */ + +static int +xtensa_bswapsi2_output_1 (rtx_insn *insn) +{ + int icode; + rtx pat; + const char *iname; + + /* CALL insn do not preserve SAR. + JUMP insn only appear at the end of BB, so they do not need to be + considered when scanning backwards. */ + if (CALL_P (insn)) + return -1; + + switch (icode = INSN_CODE (insn)) + { + /* rotate insns clobber SAR. */ + case CODE_FOR_rotlsi3: + case CODE_FOR_rotrsi3: + return -1; + /* simple shift insns clobber SAR if non-immediate shift amounts. */ + case CODE_FOR_ashlsi3_internal: + case CODE_FOR_ashrsi3: + case CODE_FOR_lshrsi3: + if (! CONST_INT_P (XEXP (SET_SRC (PATTERN (insn)), 1))) + return -1; + break; + /* this insn always set SAR to 8. */ + case CODE_FOR_bswapsi2_internal: + return 1; + default: + break; + } + + /* "*shift_per_byte" and "*shlrd_*" complex shift insns clobber SAR. */ + if (icode >= CODE_FOR_nothing + && (! strcmp (iname = insn_data[icode].name, "*shift_per_byte") + || ! strncmp (iname, "*shlrd_", 7))) + return -1; + + /* asm statements may also clobber SAR, so they are anything goes. */ + if (NONJUMP_INSN_P (insn)) + switch (GET_CODE (pat = PATTERN (insn))) + { + case SET: + return GET_CODE (SET_SRC (pat)) == ASM_OPERANDS ? -1 : 0; + case PARALLEL: + return (GET_CODE (pat = XVECEXP (pat, 0, 0)) == SET + && GET_CODE (SET_SRC (pat)) == ASM_OPERANDS) + || GET_CODE (pat) == ASM_OPERANDS + || GET_CODE (pat) == ASM_INPUT ? -1 : 0; + case ASM_OPERANDS: + return -1; + default: + break; + } + + /* All other insns are not interested in SAR. 
*/ + return 0; +} + +char * +xtensa_bswapsi2_output (rtx_insn *insn, const char *output) +{ + static char result[128]; + int i; + + strcpy (result, "ssai\t8\n\t"); + while ((insn = prev_nonnote_nondebug_insn_bb (insn))) + if ((i = xtensa_bswapsi2_output_1 (insn)) < 0) + break; + else if (i > 0) + { + result[0] = '\0'; + break; + } + strcat (result, output); + + return result; +} + + /* Try to split an integer value into what are suitable for two consecutive immediate addition instructions, ADDI or ADDMI. */ diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md index 629dfdd..52ffb16 100644 --- a/gcc/config/xtensa/xtensa.md +++ b/gcc/config/xtensa/xtensa.md @@ -88,6 +88,7 @@ ;; This mode iterator allows the HI and QI patterns to be defined from ;; the same template. (define_mode_iterator HQI [HI QI]) +(define_mode_attr mode_bits [(HI "16") (QI "8")]) ;; This mode iterator allows the SI and HI patterns to be defined from ;; the same template. @@ -176,19 +177,18 @@ ;; Addition. (define_insn "addsi3" - [(set (match_operand:SI 0 "register_operand" "=D,D,a,a,a") - (plus:SI (match_operand:SI 1 "register_operand" "%d,d,r,r,r") - (match_operand:SI 2 "add_operand" "d,O,r,J,N")))] - "" - "@ - add.n\t%0, %1, %2 - addi.n\t%0, %1, %d2 - add\t%0, %1, %2 - addi\t%0, %1, %d2 - addmi\t%0, %1, %x2" - [(set_attr "type" "arith,arith,arith,arith,arith") - (set_attr "mode" "SI") - (set_attr "length" "2,2,3,3,3")]) + [(set (match_operand:SI 0 "register_operand") + (plus:SI (match_operand:SI 1 "register_operand") + (match_operand:SI 2 "add_operand")))] + "" + {@ [cons: =0, %1, 2; attrs: type, length] + [D, d, d; arith, 2] add.n\t%0, %1, %2 + [D, d, O; arith, 2] addi.n\t%0, %1, %d2 + [a, r, r; arith, 3] add\t%0, %1, %2 + [a, r, J; arith, 3] addi\t%0, %1, %d2 + [a, r, N; arith, 3] addmi\t%0, %1, %x2 + } + [(set_attr "mode" "SI")]) (define_insn "*addsubx" [(set (match_operand:SI 0 "register_operand" "=a") @@ -392,18 +392,15 @@ (set_attr "length" "3")]) (define_insn "<u>mulhisi3" - [(set (match_operand:SI 0 "register_operand" "=C,A") - (mult:SI (any_extend:SI - (match_operand:HI 1 "register_operand" "%r,r")) - (any_extend:SI - (match_operand:HI 2 "register_operand" "r,r"))))] + [(set (match_operand:SI 0 "register_operand") + (mult:SI (any_extend:SI (match_operand:HI 1 "register_operand")) + (any_extend:SI (match_operand:HI 2 "register_operand"))))] "TARGET_MUL16 || TARGET_MAC16" - "@ - mul16<su>\t%0, %1, %2 - <u>mul.aa.ll\t%1, %2" - [(set_attr "type" "mul16,mac16") - (set_attr "mode" "SI") - (set_attr "length" "3,3")]) + {@ [cons: =0, %1, 2; attrs: type, length] + [C, r, r; mul16, 3] mul16<su>\t%0, %1, %2 + [A, r, r; mac16, 3] <u>mul.aa.ll\t%1, %2 + } + [(set_attr "mode" "SI")]) (define_insn "muladdhisi" [(set (match_operand:SI 0 "register_operand" "=A") @@ -652,36 +649,15 @@ }) (define_insn "bswapsi2_internal" - [(set (match_operand:SI 0 "register_operand" "=a,&a") - (bswap:SI (match_operand:SI 1 "register_operand" "0,r"))) - (clobber (match_scratch:SI 2 "=&a,X"))] + [(set (match_operand:SI 0 "register_operand") + (bswap:SI (match_operand:SI 1 "register_operand"))) + (clobber (match_scratch:SI 2))] "!optimize_debug && optimize > 1 && !optimize_size" -{ - rtx_insn *prev_insn = prev_nonnote_nondebug_insn (insn); - const char *init = "ssai\t8\;"; - static char result[128]; - if (prev_insn && NONJUMP_INSN_P (prev_insn)) - { - rtx x = PATTERN (prev_insn); - if (GET_CODE (x) == PARALLEL && XVECLEN (x, 0) == 2 - && GET_CODE (XVECEXP (x, 0, 0)) == SET - && GET_CODE (XVECEXP (x, 0, 1)) == CLOBBER) - { - x = 
XEXP (XVECEXP (x, 0, 0), 1); - if (GET_CODE (x) == BSWAP && GET_MODE (x) == SImode) - init = ""; - } - } - sprintf (result, - (which_alternative == 0) - ? "%s" "srli\t%%2, %%1, 16\;src\t%%2, %%2, %%1\;src\t%%2, %%2, %%2\;src\t%%0, %%1, %%2" - : "%s" "srli\t%%0, %%1, 16\;src\t%%0, %%0, %%1\;src\t%%0, %%0, %%0\;src\t%%0, %%1, %%0", - init); - return result; -} - [(set_attr "type" "arith,arith") - (set_attr "mode" "SI") - (set_attr "length" "15,15")]) + {@ [cons: =0, 1, =2; attrs: type, length] + [ a, 0, &a; arith, 15] << xtensa_bswapsi2_output (insn, "srli\t%2, %1, 16\;src\t%2, %2, %1\;src\t%2, %2, %2\;src\t%0, %1, %2"); + [&a, r, X; arith, 15] << xtensa_bswapsi2_output (insn, "srli\t%0, %1, 16\;src\t%0, %0, %1\;src\t%0, %0, %0\;src\t%0, %1, %0"); + } + [(set_attr "mode" "SI")]) (define_expand "bswapdi2" [(set (match_operand:DI 0 "register_operand" "") @@ -742,16 +718,15 @@ ;; Logical instructions. (define_insn "andsi3" - [(set (match_operand:SI 0 "register_operand" "=a,a") - (and:SI (match_operand:SI 1 "register_operand" "%r,r") - (match_operand:SI 2 "mask_operand" "P,r")))] + [(set (match_operand:SI 0 "register_operand") + (and:SI (match_operand:SI 1 "register_operand") + (match_operand:SI 2 "mask_operand")))] "" - "@ - extui\t%0, %1, 0, %K2 - and\t%0, %1, %2" - [(set_attr "type" "arith,arith") - (set_attr "mode" "SI") - (set_attr "length" "3,3")]) + {@ [cons: =0, %1, 2; attrs: type, length] + [a, r, P; arith, 3] extui\t%0, %1, 0, %K2 + [a, r, r; arith, 3] and\t%0, %1, %2 + } + [(set_attr "mode" "SI")]) (define_insn_and_split "*andsi3_bitcmpl" [(set (match_operand:SI 0 "register_operand" "=a") @@ -944,27 +919,15 @@ ;; Zero-extend instructions. -(define_insn "zero_extendhisi2" - [(set (match_operand:SI 0 "register_operand" "=a,a") - (zero_extend:SI (match_operand:HI 1 "nonimmed_operand" "r,U")))] - "" - "@ - extui\t%0, %1, 0, 16 - %v1l16ui\t%0, %1" - [(set_attr "type" "arith,load") - (set_attr "mode" "SI") - (set_attr "length" "3,3")]) - -(define_insn "zero_extendqisi2" - [(set (match_operand:SI 0 "register_operand" "=a,a") - (zero_extend:SI (match_operand:QI 1 "nonimmed_operand" "r,U")))] +(define_insn "zero_extend<mode>si2" + [(set (match_operand:SI 0 "register_operand") + (zero_extend:SI (match_operand:HQI 1 "nonimmed_operand")))] "" - "@ - extui\t%0, %1, 0, 8 - %v1l8ui\t%0, %1" - [(set_attr "type" "arith,load") - (set_attr "mode" "SI") - (set_attr "length" "3,3")]) + {@ [cons: =0, 1; attrs: type, length] + [a, r; arith, 3] extui\t%0, %1, 0, <mode_bits> + [a, U; load , 3] %v1l<mode_bits>ui\t%0, %1 + } + [(set_attr "mode" "SI")]) ;; Sign-extend instructions. 
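Returning to bswapsi2_internal above: the following standalone check (not GCC code) models what the four-instruction sequence computes once SAR holds 8, treating SRC as a funnel shift that takes the 64-bit concatenation of its two source registers, shifts it right by SAR, and keeps the low 32 bits. It also shows why xtensa_bswapsi2_output may drop the leading "ssai 8" when a preceding byte swap already left SAR at 8 and nothing in between clobbers it.

#include <cassert>
#include <cstdint>

// SRC with SAR == 8: low 32 bits of (hi:lo) >> 8.
static uint32_t src8 (uint32_t hi, uint32_t lo)
{
  return (hi << 24) | (lo >> 8);
}

// The sequence emitted by bswapsi2_internal (first alternative).
static uint32_t bswap_via_src (uint32_t x)
{
  uint32_t t = x >> 16;      // srli t, x, 16
  t = src8 (t, x);           // src  t, t, x
  t = src8 (t, t);           // src  t, t, t
  return src8 (x, t);        // src  d, x, t
}

int main ()
{
  assert (bswap_via_src (0x11223344u) == 0x44332211u);
  assert (bswap_via_src (0xdeadbeefu) == __builtin_bswap32 (0xdeadbeefu));
  return 0;
}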
@@ -982,15 +945,14 @@
 })
 
 (define_insn "extendhisi2_internal"
-  [(set (match_operand:SI 0 "register_operand" "=B,a")
-        (sign_extend:SI (match_operand:HI 1 "sext_operand" "r,U")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (sign_extend:SI (match_operand:HI 1 "sext_operand")))]
   ""
-  "@
-   sext\t%0, %1, 15
-   %v1l16si\t%0, %1"
-  [(set_attr "type" "arith,load")
-   (set_attr "mode" "SI")
-   (set_attr "length" "3,3")])
+  {@ [cons: =0, 1; attrs: type, length]
+     [B, r; arith, 3] sext\t%0, %1, 15
+     [a, U; load , 3] %v1l16si\t%0, %1
+  }
+  [(set_attr "mode" "SI")])
 
 (define_expand "extendqisi2"
   [(set (match_operand:SI 0 "register_operand" "")
@@ -1327,29 +1289,28 @@
 })
 
 (define_insn "movsi_internal"
-  [(set (match_operand:SI 0 "nonimmed_operand" "=D,D,D,D,R,R,a,q,a,a,W,a,a,U,*a,*A")
-        (match_operand:SI 1 "move_operand" "M,D,d,R,D,d,r,r,I,Y,i,T,U,r,*A,*r"))]
+  [(set (match_operand:SI 0 "nonimmed_operand")
+        (match_operand:SI 1 "move_operand"))]
   "xtensa_valid_move (SImode, operands)"
-  "@
-   movi.n\t%0, %x1
-   mov.n\t%0, %1
-   mov.n\t%0, %1
-   %v1l32i.n\t%0, %1
-   %v0s32i.n\t%1, %0
-   %v0s32i.n\t%1, %0
-   mov\t%0, %1
-   movsp\t%0, %1
-   movi\t%0, %x1
-   movi\t%0, %1
-   const16\t%0, %t1\;const16\t%0, %b1
-   %v1l32r\t%0, %1
-   %v1l32i\t%0, %1
-   %v0s32i\t%1, %0
-   rsr\t%0, ACCLO
-   wsr\t%1, ACCLO"
-  [(set_attr "type" "move,move,move,load,store,store,move,move,move,load,move,load,load,store,rsr,wsr")
-   (set_attr "mode" "SI")
-   (set_attr "length" "2,2,2,2,2,2,3,3,3,3,6,3,3,3,3,3")])
+  {@ [cons: =0, 1; attrs: type, length]
+     [ D, M; move , 2] movi.n\t%0, %x1
+     [ D, D; move , 2] mov.n\t%0, %1
+     [ D, d; move , 2] ^
+     [ D, R; load , 2] %v1l32i.n\t%0, %1
+     [ R, D; store, 2] %v0s32i.n\t%1, %0
+     [ R, d; store, 2] ^
+     [ a, r; move , 3] mov\t%0, %1
+     [ q, r; move , 3] movsp\t%0, %1
+     [ a, I; move , 3] movi\t%0, %x1
+     [ a, Y; load , 3] movi\t%0, %1
+     [ W, i; move , 6] const16\t%0, %t1\;const16\t%0, %b1
+     [ a, T; load , 3] %v1l32r\t%0, %1
+     [ a, U; load , 3] %v1l32i\t%0, %1
+     [ U, r; store, 3] %v0s32i\t%1, %0
+     [*a, *A; rsr , 3] rsr\t%0, ACCLO
+     [*A, *r; wsr , 3] wsr\t%1, ACCLO
+  }
+  [(set_attr "mode" "SI")])
 
 (define_split
   [(set (match_operand:SHI 0 "register_operand")
@@ -1399,23 +1360,22 @@
 })
 
 (define_insn "movhi_internal"
-  [(set (match_operand:HI 0 "nonimmed_operand" "=D,D,a,a,a,a,a,U,*a,*A")
-        (match_operand:HI 1 "move_operand" "M,d,r,I,Y,T,U,r,*A,*r"))]
+  [(set (match_operand:HI 0 "nonimmed_operand")
+        (match_operand:HI 1 "move_operand"))]
   "xtensa_valid_move (HImode, operands)"
-  "@
-   movi.n\t%0, %x1
-   mov.n\t%0, %1
-   mov\t%0, %1
-   movi\t%0, %x1
-   movi\t%0, %1
-   %v1l32r\t%0, %1
-   %v1l16ui\t%0, %1
-   %v0s16i\t%1, %0
-   rsr\t%0, ACCLO
-   wsr\t%1, ACCLO"
-  [(set_attr "type" "move,move,move,move,load,load,load,store,rsr,wsr")
-   (set_attr "mode" "HI")
-   (set_attr "length" "2,2,3,3,3,3,3,3,3,3")])
+  {@ [cons: =0, 1; attrs: type, length]
+     [ D, M; move , 2] movi.n\t%0, %x1
+     [ D, d; move , 2] mov.n\t%0, %1
+     [ a, r; move , 3] mov\t%0, %1
+     [ a, I; move , 3] movi\t%0, %x1
+     [ a, Y; load , 3] movi\t%0, %1
+     [ a, T; load , 3] %v1l32r\t%0, %1
+     [ a, U; load , 3] %v1l16ui\t%0, %1
+     [ U, r; store, 3] %v0s16i\t%1, %0
+     [*a, *A; rsr , 3] rsr\t%0, ACCLO
+     [*A, *r; wsr , 3] wsr\t%1, ACCLO
+  }
+  [(set_attr "mode" "HI")])
 
 ;; 8-bit Integer moves
 
@@ -1429,21 +1389,20 @@
 })
 
 (define_insn "movqi_internal"
-  [(set (match_operand:QI 0 "nonimmed_operand" "=D,D,a,a,a,U,*a,*A")
-        (match_operand:QI 1 "move_operand" "M,d,r,I,U,r,*A,*r"))]
+  [(set (match_operand:QI 0 "nonimmed_operand")
+        (match_operand:QI 1 "move_operand"))]
   "xtensa_valid_move (QImode, operands)"
-  "@
-   movi.n\t%0, %x1
-   mov.n\t%0, %1
-   mov\t%0, %1
-   movi\t%0, %x1
-   %v1l8ui\t%0, %1
-   %v0s8i\t%1, %0
-   rsr\t%0, ACCLO
-   wsr\t%1, ACCLO"
-  [(set_attr "type" "move,move,move,move,load,store,rsr,wsr")
-   (set_attr "mode" "QI")
-   (set_attr "length" "2,2,3,3,3,3,3,3")])
+  {@ [cons: =0, 1; attrs: type, length]
+     [ D, M; move , 2] movi.n\t%0, %x1
+     [ D, d; move , 2] mov.n\t%0, %1
+     [ a, r; move , 3] mov\t%0, %1
+     [ a, I; move , 3] movi\t%0, %x1
+     [ a, U; load , 3] %v1l8ui\t%0, %1
+     [ U, r; store, 3] %v0s8i\t%1, %0
+     [*a, *A; rsr , 3] rsr\t%0, ACCLO
+     [*A, *r; wsr , 3] wsr\t%1, ACCLO
+  }
+  [(set_attr "mode" "QI")])
 
 ;; Sub-word reloads from the constant pool.
 
@@ -1501,30 +1460,29 @@
 })
 
 (define_insn "movsf_internal"
-  [(set (match_operand:SF 0 "nonimmed_operand" "=f,f,U,D,a,D,R,a,f,a,a,W,a,U")
-        (match_operand:SF 1 "move_operand" "f,^U,f,d,T,R,d,r,r,f,Y,iF,U,r"))]
+  [(set (match_operand:SF 0 "nonimmed_operand")
+        (match_operand:SF 1 "move_operand"))]
   "((register_operand (operands[0], SFmode)
      || register_operand (operands[1], SFmode))
    && !(FP_REG_P (xt_true_regnum (operands[0]))
        && (constantpool_mem_p (operands[1]) || CONSTANT_P (operands[1]))))"
-  "@
-   mov.s\t%0, %1
-   %v1lsi\t%0, %1
-   %v0ssi\t%1, %0
-   mov.n\t%0, %1
-   %v1l32r\t%0, %1
-   %v1l32i.n\t%0, %1
-   %v0s32i.n\t%1, %0
-   mov\t%0, %1
-   wfr\t%0, %1
-   rfr\t%0, %1
-   movi\t%0, %y1
-   const16\t%0, %t1\;const16\t%0, %b1
-   %v1l32i\t%0, %1
-   %v0s32i\t%1, %0"
-  [(set_attr "type" "farith,fload,fstore,move,load,load,store,move,farith,farith,load,move,load,store")
-   (set_attr "mode" "SF")
-   (set_attr "length" "3,3,3,2,3,2,2,3,3,3,3,6,3,3")])
+  {@ [cons: =0, 1; attrs: type, length]
+     [f, f; farith, 3] mov.s\t%0, %1
+     [f, ^U; fload , 3] %v1lsi\t%0, %1
+     [U, f; fstore, 3] %v0ssi\t%1, %0
+     [D, d; move , 2] mov.n\t%0, %1
+     [a, T; load , 3] %v1l32r\t%0, %1
+     [D, R; load , 2] %v1l32i.n\t%0, %1
+     [R, d; store , 2] %v0s32i.n\t%1, %0
+     [a, r; move , 3] mov\t%0, %1
+     [f, r; farith, 3] wfr\t%0, %1
+     [a, f; farith, 3] rfr\t%0, %1
+     [a, Y; load , 3] movi\t%0, %y1
+     [W, iF; move , 6] const16\t%0, %t1\;const16\t%0, %b1
+     [a, U; load , 3] %v1l32i\t%0, %1
+     [U, r; store , 3] %v0s32i\t%1, %0
+  }
+  [(set_attr "mode" "SF")])
 
 (define_insn "*lsiu"
   [(set (match_operand:SF 0 "register_operand" "=f")
@@ -1692,16 +1650,15 @@
 })
 
 (define_insn "ashlsi3_internal"
-  [(set (match_operand:SI 0 "register_operand" "=a,a")
-        (ashift:SI (match_operand:SI 1 "register_operand" "r,r")
-                   (match_operand:SI 2 "arith_operand" "J,r")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (ashift:SI (match_operand:SI 1 "register_operand")
+                   (match_operand:SI 2 "arith_operand")))]
   ""
-  "@
-   slli\t%0, %1, %R2
-   ssl\t%2\;sll\t%0, %1"
-  [(set_attr "type" "arith,arith")
-   (set_attr "mode" "SI")
-   (set_attr "length" "3,6")])
+  {@ [cons: =0, 1, 2; attrs: type, length]
+     [a, r, J; arith, 3] slli\t%0, %1, %R2
+     [a, r, r; arith, 6] ssl\t%2\;sll\t%0, %1
+  }
+  [(set_attr "mode" "SI")])
 
 (define_split
   [(set (match_operand:SI 0 "register_operand")
@@ -1713,35 +1670,26 @@
                    (match_dup 1)))])
 
 (define_insn "ashrsi3"
-  [(set (match_operand:SI 0 "register_operand" "=a,a")
-        (ashiftrt:SI (match_operand:SI 1 "register_operand" "r,r")
-                     (match_operand:SI 2 "arith_operand" "J,r")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (ashiftrt:SI (match_operand:SI 1 "register_operand")
+                     (match_operand:SI 2 "arith_operand")))]
   ""
-  "@
-   srai\t%0, %1, %R2
-   ssr\t%2\;sra\t%0, %1"
-  [(set_attr "type" "arith,arith")
-   (set_attr "mode" "SI")
-   (set_attr "length" "3,6")])
+  {@ [cons: =0, 1, 2; attrs: type, length]
+     [a, r, J; arith, 3] srai\t%0, %1, %R2
+     [a, r, r; arith, 6] ssr\t%2\;sra\t%0, %1
+  }
+  [(set_attr "mode" "SI")])
 
 (define_insn "lshrsi3"
-  [(set (match_operand:SI 0 "register_operand" "=a,a")
-        (lshiftrt:SI (match_operand:SI 1 "register_operand" "r,r")
-                     (match_operand:SI 2 "arith_operand" "J,r")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (lshiftrt:SI (match_operand:SI 1 "register_operand")
+                     (match_operand:SI 2 "arith_operand")))]
   ""
-{
-  if (which_alternative == 0)
-    {
-      if ((INTVAL (operands[2]) & 0x1f) < 16)
-        return "srli\t%0, %1, %R2";
-      else
-        return "extui\t%0, %1, %R2, %L2";
-    }
-  return "ssr\t%2\;srl\t%0, %1";
-}
-  [(set_attr "type" "arith,arith")
-   (set_attr "mode" "SI")
-   (set_attr "length" "3,6")])
+  {@ [cons: =0, 1, 2; attrs: type, length]
+     [a, r, J; arith, 3] << (INTVAL (operands[2]) & 0x1f) < 16 ? \"srli\t%0, %1, %R2\" : \"extui\t%0, %1, %R2, %L2\";
+     [a, r, r; arith, 6] ssr\t%2\;srl\t%0, %1
+  }
+  [(set_attr "mode" "SI")])
 
 (define_insn "*shift_per_byte"
   [(set (match_operand:SI 0 "register_operand" "=a")
@@ -1944,28 +1892,26 @@
    (set_attr "length" "6")])
 
 (define_insn "rotlsi3"
-  [(set (match_operand:SI 0 "register_operand" "=a,a")
-        (rotate:SI (match_operand:SI 1 "register_operand" "r,r")
-                   (match_operand:SI 2 "arith_operand" "J,r")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (rotate:SI (match_operand:SI 1 "register_operand")
+                   (match_operand:SI 2 "arith_operand")))]
   ""
-  "@
-   ssai\t%L2\;src\t%0, %1, %1
-   ssl\t%2\;src\t%0, %1, %1"
-  [(set_attr "type" "multi,multi")
-   (set_attr "mode" "SI")
-   (set_attr "length" "6,6")])
+  {@ [cons: =0, 1, 2; attrs: type, length]
+     [a, r, J; multi, 6] ssai\t%L2\;src\t%0, %1, %1
+     [a, r, r; multi, 6] ssl\t%2\;src\t%0, %1, %1
+  }
+  [(set_attr "mode" "SI")])
 
 (define_insn "rotrsi3"
-  [(set (match_operand:SI 0 "register_operand" "=a,a")
-        (rotatert:SI (match_operand:SI 1 "register_operand" "r,r")
-                     (match_operand:SI 2 "arith_operand" "J,r")))]
+  [(set (match_operand:SI 0 "register_operand")
+        (rotatert:SI (match_operand:SI 1 "register_operand")
+                     (match_operand:SI 2 "arith_operand")))]
  ""
-  "@
-   ssai\t%R2\;src\t%0, %1, %1
-   ssr\t%2\;src\t%0, %1, %1"
-  [(set_attr "type" "multi,multi")
-   (set_attr "mode" "SI")
-   (set_attr "length" "6,6")])
+  {@ [cons: =0, 1, 2; attrs: type, length]
+     [a, r, J; multi, 6] ssai\t%R2\;src\t%0, %1, %1
+     [a, r, r; multi, 6] ssr\t%2\;src\t%0, %1, %1
+  }
+  [(set_attr "mode" "SI")])
 
 ;; Comparisons.
 
@@ -2024,26 +1970,23 @@
                         [(match_operand:SI 0 "register_operand" "r")
                          (const_int -2147483648)])
                       (label_ref (match_operand 1 ""))
-                      (pc)))]
+                      (pc)))
+   (clobber (match_scratch:SI 3 "=a"))]
   "TARGET_ABS"
   "#"
-  "&& can_create_pseudo_p ()"
+  "&& 1"
   [(set (match_dup 3)
        (abs:SI (match_dup 0)))
    (set (pc)
        (if_then_else (match_op_dup 2
-                       [(zero_extract:SI (match_dup 3)
-                                         (const_int 1)
-                                         (match_dup 4))
+                       [(match_dup 3)
                         (const_int 0)])
                      (label_ref (match_dup 1))
                      (pc)))]
 {
-  operands[3] = gen_reg_rtx (SImode);
-  operands[4] = GEN_INT (BITS_BIG_ENDIAN ? 0 : 31);
-  operands[2] = gen_rtx_fmt_ee (reverse_condition (GET_CODE (operands[2])),
-                                VOIDmode, XEXP (operands[2], 0),
-                                const0_rtx);
+  if (GET_CODE (operands[3]) == SCRATCH)
+    operands[3] = gen_reg_rtx (SImode);
+  PUT_CODE (operands[2], GET_CODE (operands[2]) == EQ ? LT : GE);
 }
  [(set_attr "type" "jump")
   (set_attr "mode" "none")
@@ -2190,7 +2133,7 @@
                      (label_ref (match_dup 1))
                      (pc)))]
 {
-  operands[3] = GEN_INT ((1 << GET_MODE_BITSIZE (GET_MODE (operands[3]))) - 1);
+  operands[3] = GEN_INT (GET_MODE_MASK (GET_MODE (operands[3])));
 })
 
 (define_insn_and_split "*masktrue_const_pow2_minus_one"
@@ -3370,6 +3313,42 @@
                           (const_int 8)
                           (const_int 9))))])
 
+(define_insn_and_split "*eqne_in_range"
+  [(set (pc)
+        (if_then_else (match_operator 4 "alt_ubranch_operator"
+                        [(plus:SI (match_operand:SI 0 "register_operand" "r")
+                                  (match_operand:SI 1 "const_int_operand" "i"))
+                         (match_operand:SI 2 "const_int_operand" "i")])
+                      (label_ref (match_operand 3 ""))
+                      (pc)))
+   (clobber (match_scratch:SI 5 "=&a"))]
+  "TARGET_MINMAX && TARGET_CLAMPS
+   && INTVAL (operands[1]) * 2 - INTVAL (operands[2]) == 1
+   && IN_RANGE (exact_log2 (INTVAL (operands[1])), 7, 22)"
+  "#"
+  "&& 1"
+  [(set (match_dup 5)
+        (smin:SI (smax:SI (match_dup 0)
+                          (match_dup 1))
+                 (match_dup 2)))
+   (set (pc)
+        (if_then_else (match_op_dup 4
+                        [(match_dup 0)
+                         (match_dup 5)])
+                      (label_ref (match_dup 3))
+                      (pc)))]
+{
+  HOST_WIDE_INT v = INTVAL (operands[1]);
+  operands[1] = GEN_INT (-v);
+  operands[2] = GEN_INT (v - 1);
+  PUT_CODE (operands[4], GET_CODE (operands[4]) == GTU ? NE : EQ);
+  if (GET_CODE (operands[5]) == SCRATCH)
+    operands[5] = gen_reg_rtx (SImode);
+}
+ [(set_attr "type" "jump")
+  (set_attr "mode" "none")
+  (set_attr "length" "6")])
+
 (define_split
   [(clobber (match_operand 0 "register_operand"))]
  "HARD_REGISTER_P (operands[0]) |