Diffstat (limited to 'gcc')
29 files changed, 677 insertions, 107 deletions
diff --git a/gcc/config/i386/cygming.h b/gcc/config/i386/cygming.h index d587d25..743cc38 100644 --- a/gcc/config/i386/cygming.h +++ b/gcc/config/i386/cygming.h @@ -28,16 +28,15 @@ along with GCC; see the file COPYING3. If not see #undef TARGET_SEH #define TARGET_SEH (TARGET_64BIT_MS_ABI && flag_unwind_tables) +#undef PREFERRED_STACK_BOUNDARY_DEFAULT +#define PREFERRED_STACK_BOUNDARY_DEFAULT \ + (TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY) + /* Win64 with SEH cannot represent DRAP stack frames. Disable its use. Force the use of different mechanisms to allocate aligned local data. */ #undef MAX_STACK_ALIGNMENT #define MAX_STACK_ALIGNMENT (TARGET_SEH ? 128 : MAX_OFILE_ALIGNMENT) -/* 32-bit Windows aligns the stack on a 4-byte boundary but SSE instructions - may require 16-byte alignment. */ -#undef STACK_REALIGN_DEFAULT -#define STACK_REALIGN_DEFAULT (TARGET_64BIT ? 0 : 1) - /* Support hooks for SEH. */ #undef TARGET_ASM_UNWIND_EMIT #define TARGET_ASM_UNWIND_EMIT i386_pe_seh_unwind_emit diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 1ba5ac4..54b3f6d 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -518,15 +518,17 @@ scalar_chain::build (bitmap candidates, unsigned insn_uid, bitmap disallowed) instead of using a scalar one. */ int -general_scalar_chain::vector_const_cost (rtx exp) +general_scalar_chain::vector_const_cost (rtx exp, basic_block bb) { gcc_assert (CONST_INT_P (exp)); if (standard_sse_constant_p (exp, vmode)) return ix86_cost->sse_op; + if (optimize_bb_for_size_p (bb)) + return COSTS_N_BYTES (8); /* We have separate costs for SImode and DImode, use SImode costs for smaller modes. */ - return ix86_cost->sse_load[smode == DImode ? 1 : 0]; + return COSTS_N_INSNS (ix86_cost->sse_load[smode == DImode ? 1 : 0]) / 2; } /* Compute a gain for chain conversion. */ @@ -547,7 +549,7 @@ general_scalar_chain::compute_convert_gain () smaller modes than SImode the int load/store costs need to be adjusted as well. */ unsigned sse_cost_idx = smode == DImode ? 1 : 0; - unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1; + int m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1; EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi) { @@ -555,26 +557,55 @@ general_scalar_chain::compute_convert_gain () rtx def_set = single_set (insn); rtx src = SET_SRC (def_set); rtx dst = SET_DEST (def_set); + basic_block bb = BLOCK_FOR_INSN (insn); int igain = 0; if (REG_P (src) && REG_P (dst)) - igain += 2 * m - ix86_cost->xmm_move; + { + if (optimize_bb_for_size_p (bb)) + /* reg-reg move is 2 bytes, while SSE 3. */ + igain += COSTS_N_BYTES (2 * m - 3); + else + /* Move costs are normalized to reg-reg move having cost 2. */ + igain += COSTS_N_INSNS (2 * m - ix86_cost->xmm_move) / 2; + } else if (REG_P (src) && MEM_P (dst)) - igain - += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx]; + { + if (optimize_bb_for_size_p (bb)) + /* Integer load/store is 3+ bytes and SSE 4+. */ + igain += COSTS_N_BYTES (3 * m - 4); + else + igain + += COSTS_N_INSNS (m * ix86_cost->int_store[2] + - ix86_cost->sse_store[sse_cost_idx]) / 2; + } else if (MEM_P (src) && REG_P (dst)) - igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx]; + { + if (optimize_bb_for_size_p (bb)) + igain += COSTS_N_BYTES (3 * m - 4); + else + igain += COSTS_N_INSNS (m * ix86_cost->int_load[2] + - ix86_cost->sse_load[sse_cost_idx]) / 2; + } else { /* For operations on memory operands, include the overhead of explicit load and store instructions. 
*/ if (MEM_P (dst)) - igain += optimize_insn_for_size_p () - ? -COSTS_N_BYTES (8) - : (m * (ix86_cost->int_load[2] - + ix86_cost->int_store[2]) - - (ix86_cost->sse_load[sse_cost_idx] + - ix86_cost->sse_store[sse_cost_idx])); + { + if (optimize_bb_for_size_p (bb)) + /* ??? This probably should account size difference + of SSE and integer load rather than full SSE load. */ + igain -= COSTS_N_BYTES (8); + else + { + int cost = (m * (ix86_cost->int_load[2] + + ix86_cost->int_store[2]) + - (ix86_cost->sse_load[sse_cost_idx] + + ix86_cost->sse_store[sse_cost_idx])); + igain += COSTS_N_INSNS (cost) / 2; + } + } switch (GET_CODE (src)) { @@ -595,7 +626,7 @@ general_scalar_chain::compute_convert_gain () igain += ix86_cost->shift_const - ix86_cost->sse_op; if (CONST_INT_P (XEXP (src, 0))) - igain -= vector_const_cost (XEXP (src, 0)); + igain -= vector_const_cost (XEXP (src, 0), bb); break; case ROTATE: @@ -631,16 +662,17 @@ general_scalar_chain::compute_convert_gain () igain += m * ix86_cost->add; if (CONST_INT_P (XEXP (src, 0))) - igain -= vector_const_cost (XEXP (src, 0)); + igain -= vector_const_cost (XEXP (src, 0), bb); if (CONST_INT_P (XEXP (src, 1))) - igain -= vector_const_cost (XEXP (src, 1)); + igain -= vector_const_cost (XEXP (src, 1), bb); if (MEM_P (XEXP (src, 1))) { - if (optimize_insn_for_size_p ()) + if (optimize_bb_for_size_p (bb)) igain -= COSTS_N_BYTES (m == 2 ? 3 : 5); else - igain += m * ix86_cost->int_load[2] - - ix86_cost->sse_load[sse_cost_idx]; + igain += COSTS_N_INSNS + (m * ix86_cost->int_load[2] + - ix86_cost->sse_load[sse_cost_idx]) / 2; } break; @@ -698,7 +730,7 @@ general_scalar_chain::compute_convert_gain () case CONST_INT: if (REG_P (dst)) { - if (optimize_insn_for_size_p ()) + if (optimize_bb_for_size_p (bb)) { /* xor (2 bytes) vs. xorps (3 bytes). */ if (src == const0_rtx) @@ -722,14 +754,14 @@ general_scalar_chain::compute_convert_gain () /* DImode can be immediate for TARGET_64BIT and SImode always. */ igain += m * COSTS_N_INSNS (1); - igain -= vector_const_cost (src); + igain -= vector_const_cost (src, bb); } } else if (MEM_P (dst)) { igain += (m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx]); - igain -= vector_const_cost (src); + igain -= vector_const_cost (src, bb); } break; @@ -737,13 +769,14 @@ general_scalar_chain::compute_convert_gain () if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx) { // movd (4 bytes) replaced with movdqa (4 bytes). - if (!optimize_insn_for_size_p ()) - igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move; + if (!optimize_bb_for_size_p (bb)) + igain += COSTS_N_INSNS (ix86_cost->sse_to_integer + - ix86_cost->xmm_move) / 2; } else { // pshufd; movd replaced with pshufd. - if (optimize_insn_for_size_p ()) + if (optimize_bb_for_size_p (bb)) igain += COSTS_N_BYTES (4); else igain += ix86_cost->sse_to_integer; @@ -769,11 +802,11 @@ general_scalar_chain::compute_convert_gain () /* Cost the integer to sse and sse to integer moves. */ if (!optimize_function_for_size_p (cfun)) { - cost += n_sse_to_integer * ix86_cost->sse_to_integer; + cost += n_sse_to_integer * COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2; /* ??? integer_to_sse but we only have that in the RA cost table. Assume sse_to_integer/integer_to_sse are the same which they are at the moment. 
*/ - cost += n_integer_to_sse * ix86_cost->sse_to_integer; + cost += n_integer_to_sse * COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2; } else if (TARGET_64BIT || smode == SImode) { @@ -1508,13 +1541,13 @@ general_scalar_chain::convert_insn (rtx_insn *insn) with numerous special cases. */ static int -timode_immed_const_gain (rtx cst) +timode_immed_const_gain (rtx cst, basic_block bb) { /* movabsq vs. movabsq+vmovq+vunpacklqdq. */ if (CONST_WIDE_INT_P (cst) && CONST_WIDE_INT_NUNITS (cst) == 2 && CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1)) - return optimize_insn_for_size_p () ? -COSTS_N_BYTES (9) + return optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (9) : -COSTS_N_INSNS (2); /* 2x movabsq ~ vmovdqa. */ return 0; @@ -1546,33 +1579,34 @@ timode_scalar_chain::compute_convert_gain () rtx src = SET_SRC (def_set); rtx dst = SET_DEST (def_set); HOST_WIDE_INT op1val; + basic_block bb = BLOCK_FOR_INSN (insn); int scost, vcost; int igain = 0; switch (GET_CODE (src)) { case REG: - if (optimize_insn_for_size_p ()) + if (optimize_bb_for_size_p (bb)) igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3); else igain = COSTS_N_INSNS (1); break; case MEM: - igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (7) + igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (7) : COSTS_N_INSNS (1); break; case CONST_INT: if (MEM_P (dst) && standard_sse_constant_p (src, V1TImode)) - igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (11) : 1; + igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (11) : 1; break; case CONST_WIDE_INT: /* 2 x mov vs. vmovdqa. */ if (MEM_P (dst)) - igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (3) + igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (3) : COSTS_N_INSNS (1); break; @@ -1587,14 +1621,14 @@ timode_scalar_chain::compute_convert_gain () if (!MEM_P (dst)) igain = COSTS_N_INSNS (1); if (CONST_SCALAR_INT_P (XEXP (src, 1))) - igain += timode_immed_const_gain (XEXP (src, 1)); + igain += timode_immed_const_gain (XEXP (src, 1), bb); break; case ASHIFT: case LSHIFTRT: /* See ix86_expand_v1ti_shift. */ op1val = INTVAL (XEXP (src, 1)); - if (optimize_insn_for_size_p ()) + if (optimize_bb_for_size_p (bb)) { if (op1val == 64 || op1val == 65) scost = COSTS_N_BYTES (5); @@ -1628,7 +1662,7 @@ timode_scalar_chain::compute_convert_gain () case ASHIFTRT: /* See ix86_expand_v1ti_ashiftrt. */ op1val = INTVAL (XEXP (src, 1)); - if (optimize_insn_for_size_p ()) + if (optimize_bb_for_size_p (bb)) { if (op1val == 64 || op1val == 127) scost = COSTS_N_BYTES (7); @@ -1706,7 +1740,7 @@ timode_scalar_chain::compute_convert_gain () case ROTATERT: /* See ix86_expand_v1ti_rotate. */ op1val = INTVAL (XEXP (src, 1)); - if (optimize_insn_for_size_p ()) + if (optimize_bb_for_size_p (bb)) { scost = COSTS_N_BYTES (13); if ((op1val & 31) == 0) @@ -1738,16 +1772,16 @@ timode_scalar_chain::compute_convert_gain () { if (GET_CODE (XEXP (src, 0)) == AND) /* and;and;or (9 bytes) vs. ptest (5 bytes). */ - igain = optimize_insn_for_size_p() ? COSTS_N_BYTES (4) - : COSTS_N_INSNS (2); + igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (4) + : COSTS_N_INSNS (2); /* or (3 bytes) vs. ptest (5 bytes). */ - else if (optimize_insn_for_size_p ()) + else if (optimize_bb_for_size_p (bb)) igain = -COSTS_N_BYTES (2); } else if (XEXP (src, 1) == const1_rtx) /* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). */ - igain = optimize_insn_for_size_p() ? -COSTS_N_BYTES (6) - : -COSTS_N_INSNS (1); + igain = optimize_bb_for_size_p (bb) ? 
-COSTS_N_BYTES (6) + : -COSTS_N_INSNS (1); break; default: diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h index 24b0c4e..7f7c0f7 100644 --- a/gcc/config/i386/i386-features.h +++ b/gcc/config/i386/i386-features.h @@ -188,7 +188,7 @@ class general_scalar_chain : public scalar_chain private: void convert_insn (rtx_insn *insn) final override; - int vector_const_cost (rtx exp); + int vector_const_cost (rtx exp, basic_block bb); rtx convert_rotate (enum rtx_code, rtx op0, rtx op1, rtx_insn *insn); }; diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index fd36ea8..9c24a92 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -7942,6 +7942,15 @@ ix86_update_stack_boundary (void) if (ix86_tls_descriptor_calls_expanded_in_cfun && crtl->preferred_stack_boundary < 128) crtl->preferred_stack_boundary = 128; + + /* For 32-bit MS ABI, both the incoming and preferred stack boundaries + are 32 bits, but if force_align_arg_pointer is specified, it should + prefer 128 bits for a backward-compatibility reason, which is also + what the doc suggests. */ + if (lookup_attribute ("force_align_arg_pointer", + TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))) + && crtl->preferred_stack_boundary < 128) + crtl->preferred_stack_boundary = 128; } /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 6a38de3..18fa97a 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -179,6 +179,7 @@ struct processor_costs { const int xmm_move, ymm_move, /* cost of moving XMM and YMM register. */ zmm_move; const int sse_to_integer; /* cost of moving SSE register to integer. */ + const int integer_to_sse; /* cost of moving integer register to SSE. */ const int gather_static, gather_per_elt; /* Cost of gather load is computed as static + per_item * nelts. */ const int scatter_static, scatter_per_elt; /* Cost of gather store is diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 6cce70a..e509129 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -107,6 +107,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ in 128bit, 256bit and 512bit */ 4, 4, 6, /* cost of moving XMM,YMM,ZMM register */ 4, /* cost of moving SSE register to integer. */ + 4, /* cost of moving integer register to SSE. */ COSTS_N_BYTES (5), 0, /* Gather load static, per_elt. */ COSTS_N_BYTES (5), 0, /* Gather store static, per_elt. */ 0, /* size of l1 cache */ @@ -227,6 +228,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + 3, /* cost of moving integer register to SSE. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 0, /* size of l1 cache */ @@ -345,6 +347,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + 3, /* cost of moving integer register to SSE. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 4, /* size of l1 cache. 486 has 8kB cache @@ -465,6 +468,7 @@ struct processor_costs pentium_cost = { {4, 8, 16, 32, 64}, /* cost of unaligned stores. 
*/ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + 3, /* cost of moving integer register to SSE. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -576,6 +580,7 @@ struct processor_costs lakemont_cost = { {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + 3, /* cost of moving integer register to SSE. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -702,6 +707,7 @@ struct processor_costs pentiumpro_cost = { {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + 3, /* cost of moving integer register to SSE. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -819,6 +825,7 @@ struct processor_costs geode_cost = { {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ 2, 2, /* Gather load static, per_elt. */ 2, 2, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -936,6 +943,7 @@ struct processor_costs k6_cost = { {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ 2, 2, /* Gather load static, per_elt. */ 2, 2, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -1059,6 +1067,7 @@ struct processor_costs athlon_cost = { {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 5, /* cost of moving SSE register to integer. */ + 5, /* cost of moving integer register to SSE. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1184,6 +1193,7 @@ struct processor_costs k8_cost = { {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 5, /* cost of moving SSE register to integer. */ + 5, /* cost of moving integer register to SSE. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1322,6 +1332,7 @@ struct processor_costs amdfam10_cost = { {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 3, /* cost of moving SSE register to integer. */ + 3, /* cost of moving integer register to SSE. */ 4, 4, /* Gather load static, per_elt. */ 4, 4, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -1452,6 +1463,7 @@ const struct processor_costs bdver_cost = { {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 16, /* cost of moving SSE register to integer. */ + 16, /* cost of moving integer register to SSE. */ 12, 12, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 16, /* size of l1 cache. */ @@ -1603,6 +1615,7 @@ struct processor_costs znver1_cost = { {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. 
*/ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, throughput 12. Approx 9 uops do not depend on vector size and every load is 7 uops. */ @@ -1770,6 +1783,7 @@ struct processor_costs znver2_cost = { 2, 2, 3, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, throughput 12. Approx 9 uops do not depend on vector size and every load is 7 uops. */ @@ -1912,6 +1926,7 @@ struct processor_costs znver3_cost = { 2, 2, 3, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, throughput 9. Approx 7 uops do not depend on vector size and every load is 4 uops. */ @@ -2056,6 +2071,7 @@ struct processor_costs znver4_cost = { 2, 2, 2, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops, throughput 5. Approx 7 uops do not depend on vector size and every load is 5 uops. */ @@ -2204,6 +2220,7 @@ struct processor_costs znver5_cost = { 2, 2, 2, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ /* TODO: gather and scatter instructions are currently disabled in x86-tune.def. In some cases they are however a win, see PR116582 @@ -2372,6 +2389,7 @@ struct processor_costs skylake_cost = { {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ 20, 8, /* Gather load static, per_elt. */ 22, 10, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -2508,6 +2526,7 @@ struct processor_costs icelake_cost = { {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ 20, 8, /* Gather load static, per_elt. */ 22, 10, /* Gather store static, per_elt. */ 64, /* size of l1 cache. */ @@ -2638,6 +2657,7 @@ struct processor_costs alderlake_cost = { {8, 8, 8, 10, 15}, /* cost of unaligned storess. */ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ 18, 6, /* Gather load static, per_elt. */ 18, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2761,6 +2781,7 @@ const struct processor_costs btver1_cost = { {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 14, /* cost of moving SSE register to integer. */ + 14, /* cost of moving integer register to SSE. */ 10, 10, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -2881,6 +2902,7 @@ const struct processor_costs btver2_cost = { {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 14, /* cost of moving SSE register to integer. */ + 14, /* cost of moving integer register to SSE. */ 10, 10, /* Gather load static, per_elt. */ 10, 10, /* Gather store static, per_elt. */ 32, /* size of l1 cache. 
*/ @@ -3000,6 +3022,7 @@ struct processor_costs pentium4_cost = { {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ 20, /* cost of moving SSE register to integer. */ + 20, /* cost of moving integer register to SSE. */ 16, 16, /* Gather load static, per_elt. */ 16, 16, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -3122,6 +3145,7 @@ struct processor_costs nocona_cost = { {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ 20, /* cost of moving SSE register to integer. */ + 20, /* cost of moving integer register to SSE. */ 12, 12, /* Gather load static, per_elt. */ 12, 12, /* Gather store static, per_elt. */ 8, /* size of l1 cache. */ @@ -3242,6 +3266,7 @@ struct processor_costs atom_cost = { {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 8, /* cost of moving SSE register to integer. */ + 8, /* cost of moving integer register to SSE. */ 8, 8, /* Gather load static, per_elt. */ 8, 8, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -3362,6 +3387,7 @@ struct processor_costs slm_cost = { {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ 8, /* cost of moving SSE register to integer. */ + 8, /* cost of moving integer register to SSE. */ 8, 8, /* Gather load static, per_elt. */ 8, 8, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -3494,6 +3520,7 @@ struct processor_costs tremont_cost = { {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ 18, 6, /* Gather load static, per_elt. */ 18, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -3616,6 +3643,7 @@ struct processor_costs intel_cost = { {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ 4, /* cost of moving SSE register to integer. */ + 4, /* cost of moving integer register to SSE. */ 6, 6, /* Gather load static, per_elt. */ 6, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -3731,15 +3759,16 @@ struct processor_costs lujiazui_cost = { {6, 6, 6}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ - {6, 6, 6}, /* cost of storing integer registers. */ + {6, 6, 6}, /* cost of storing integer registers. */ {6, 6, 6, 10, 15}, /* cost of loading SSE register - in 32bit, 64bit, 128bit, 256bit and 512bit. */ + in 32bit, 64bit, 128bit, 256bit and 512bit. */ {6, 6, 6, 10, 15}, /* cost of storing SSE register - in 32bit, 64bit, 128bit, 256bit and 512bit. */ + in 32bit, 64bit, 128bit, 256bit and 512bit. */ {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ - 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ - 6, /* cost of moving SSE register to integer. */ + 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ + 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ 18, 6, /* Gather load static, per_elt. */ 18, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -3864,6 +3893,7 @@ struct processor_costs yongfeng_cost = { {8, 8, 8, 12, 15}, /* cost of unaligned storess. */ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ 8, /* cost of moving SSE register to integer. 
*/ + 8, /* cost of moving integer register to SSE. */ 18, 6, /* Gather load static, per_elt. */ 18, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -3987,6 +4017,7 @@ struct processor_costs shijidadao_cost = { {8, 8, 8, 12, 15}, /* cost of unaligned storess. */ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */ 8, /* cost of moving SSE register to integer. */ + 8, /* cost of moving integer register to SSE. */ 18, 6, /* Gather load static, per_elt. */ 18, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -4116,6 +4147,7 @@ struct processor_costs generic_cost = { {6, 6, 6, 10, 15}, /* cost of unaligned storess. */ 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ 6, /* cost of moving SSE register to integer. */ + 6, /* cost of moving integer register to SSE. */ 18, 6, /* Gather load static, per_elt. */ 18, 6, /* Gather store static, per_elt. */ 32, /* size of l1 cache. */ @@ -4249,6 +4281,7 @@ struct processor_costs core_cost = { {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ 2, /* cost of moving SSE register to integer. */ + 2, /* cost of moving integer register to SSE. */ /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, rec. throughput 6. So 5 uops statically and one uops per load. */ diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md index 95df533..c226c39 100644 --- a/gcc/config/riscv/bitmanip.md +++ b/gcc/config/riscv/bitmanip.md @@ -1222,7 +1222,7 @@ we can't keep it in 64 bit variable.) then use clmul instruction to implement the CRC, otherwise (TARGET_ZBKB) generate table based using brev. */ - if ((TARGET_ZBKC || TARGET_ZBC) && <ANYI:MODE>mode < word_mode) + if ((TARGET_ZBKC || TARGET_ZBC || TARGET_ZVBC) && <ANYI:MODE>mode < word_mode) expand_reversed_crc_using_clmul (<ANYI:MODE>mode, <ANYI1:MODE>mode, operands); else if (TARGET_ZBKB) @@ -1254,7 +1254,8 @@ (match_operand:SUBX 3)] UNSPEC_CRC))] /* We don't support the case when data's size is bigger than CRC's size. */ - "(TARGET_ZBKC || TARGET_ZBC) && <SUBX:MODE>mode >= <SUBX1:MODE>mode" + "(TARGET_ZBKC || TARGET_ZBC || TARGET_ZVBC) + && <SUBX:MODE>mode >= <SUBX1:MODE>mode" { /* If we have the ZBC or ZBKC extension (ie, clmul) and it is possible to store the quotient within a single variable diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md index 214c20b..584b345 100644 --- a/gcc/config/riscv/iterators.md +++ b/gcc/config/riscv/iterators.md @@ -262,6 +262,9 @@ (define_code_attr fix_uns [(fix "fix") (unsigned_fix "fixuns")]) +(define_code_attr OPTAB [(ior "IOR") + (xor "XOR")]) + ;; ------------------------------------------------------------------- ;; Code Attributes diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md index c9a638c..23690792 100644 --- a/gcc/config/riscv/predicates.md +++ b/gcc/config/riscv/predicates.md @@ -380,14 +380,6 @@ (and (match_code "const_int") (match_test "SINGLE_BIT_MASK_OPERAND (UINTVAL (op))"))) -;; Register, small constant or single bit constant for use in -;; bseti/binvi. 
-(define_predicate "arith_or_zbs_operand" - (ior (match_operand 0 "const_arith_operand") - (match_operand 0 "register_operand") - (and (match_test "TARGET_ZBS") - (match_operand 0 "single_bit_mask_operand")))) - (define_predicate "not_single_bit_mask_operand" (and (match_code "const_int") (match_test "SINGLE_BIT_MASK_OPERAND (~UINTVAL (op))"))) @@ -689,3 +681,7 @@ (define_predicate "bitpos_mask_operand" (and (match_code "const_int") (match_test "TARGET_64BIT ? INTVAL (op) == 63 : INTVAL (op) == 31"))) + +(define_predicate "reg_or_const_int_operand" + (ior (match_operand 0 "const_int_operand") + (match_operand 0 "register_operand"))) diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index b0d5bbb..271a9a3 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -140,6 +140,7 @@ extern void riscv_expand_sssub (rtx, rtx, rtx); extern void riscv_expand_ustrunc (rtx, rtx); extern void riscv_expand_sstrunc (rtx, rtx); extern int riscv_register_move_cost (machine_mode, reg_class_t, reg_class_t); +extern bool synthesize_ior_xor (rtx_code, rtx [3]); #ifdef RTX_CODE extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0); diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 3ee88db..8b77a35 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -14035,17 +14035,53 @@ expand_crc_using_clmul (scalar_mode crc_mode, scalar_mode data_mode, rtx data = gen_rtx_ZERO_EXTEND (word_mode, operands[2]); riscv_expand_op (XOR, word_mode, a0, crc, data); - if (TARGET_64BIT) - emit_insn (gen_riscv_clmul_di (a0, a0, t0)); - else - emit_insn (gen_riscv_clmul_si (a0, a0, t0)); + if (TARGET_ZBKC || TARGET_ZBC) + { + if (TARGET_64BIT) + emit_insn (gen_riscv_clmul_di (a0, a0, t0)); + else + emit_insn (gen_riscv_clmul_si (a0, a0, t0)); - riscv_expand_op (LSHIFTRT, word_mode, a0, a0, - gen_int_mode (crc_size, word_mode)); - if (TARGET_64BIT) - emit_insn (gen_riscv_clmul_di (a0, a0, t1)); + riscv_expand_op (LSHIFTRT, word_mode, a0, a0, + gen_int_mode (crc_size, word_mode)); + if (TARGET_64BIT) + emit_insn (gen_riscv_clmul_di (a0, a0, t1)); + else + emit_insn (gen_riscv_clmul_si (a0, a0, t1)); + } else - emit_insn (gen_riscv_clmul_si (a0, a0, t1)); + { + machine_mode vmode; + if (!riscv_vector::get_vector_mode (DImode, 1).exists (&vmode)) + gcc_unreachable (); + + rtx vec = gen_reg_rtx (vmode); + + insn_code icode1 = code_for_pred_broadcast (vmode); + rtx ops1[] = {vec, a0}; + emit_nonvlmax_insn (icode1, UNARY_OP, ops1, CONST1_RTX (Pmode)); + + rtx rvv1di_reg = gen_rtx_SUBREG (RVVM1DImode, vec, 0); + insn_code icode2 = code_for_pred_vclmul_scalar (UNSPEC_VCLMUL, + E_RVVM1DImode); + rtx ops2[] = {rvv1di_reg, rvv1di_reg, t0}; + emit_nonvlmax_insn (icode2, riscv_vector::BINARY_OP, ops2, CONST1_RTX + (Pmode)); + + rtx shift_amount = gen_int_mode (data_size, Pmode); + insn_code icode3 = code_for_pred_scalar (LSHIFTRT, vmode); + rtx ops3[] = {vec, vec, shift_amount}; + emit_nonvlmax_insn (icode3, BINARY_OP, ops3, CONST1_RTX (Pmode)); + + insn_code icode4 = code_for_pred_vclmul_scalar (UNSPEC_VCLMULH, + E_RVVM1DImode); + rtx ops4[] = {rvv1di_reg, rvv1di_reg, t1}; + emit_nonvlmax_insn (icode4, riscv_vector::BINARY_OP, ops4, CONST1_RTX + (Pmode)); + + rtx vec_low_lane = gen_lowpart (DImode, vec); + riscv_emit_move (a0, vec_low_lane); + } if (crc_size > data_size) { @@ -14094,19 +14130,53 @@ expand_reversed_crc_using_clmul (scalar_mode crc_mode, scalar_mode data_mode, rtx a0 = gen_reg_rtx (word_mode); 
riscv_expand_op (XOR, word_mode, a0, crc, data); - if (TARGET_64BIT) - emit_insn (gen_riscv_clmul_di (a0, a0, t0)); - else - emit_insn (gen_riscv_clmul_si (a0, a0, t0)); + if (TARGET_ZBKC || TARGET_ZBC) + { + if (TARGET_64BIT) + emit_insn (gen_riscv_clmul_di (a0, a0, t0)); + else + emit_insn (gen_riscv_clmul_si (a0, a0, t0)); - rtx num_shift = gen_int_mode (GET_MODE_BITSIZE (word_mode) - data_size, - word_mode); - riscv_expand_op (ASHIFT, word_mode, a0, a0, num_shift); + rtx num_shift = gen_int_mode (BITS_PER_WORD - data_size, word_mode); + riscv_expand_op (ASHIFT, word_mode, a0, a0, num_shift); - if (TARGET_64BIT) - emit_insn (gen_riscv_clmulh_di (a0, a0, t1)); + if (TARGET_64BIT) + emit_insn (gen_riscv_clmulh_di (a0, a0, t1)); + else + emit_insn (gen_riscv_clmulh_si (a0, a0, t1)); + } else - emit_insn (gen_riscv_clmulh_si (a0, a0, t1)); + { + machine_mode vmode; + if (!riscv_vector::get_vector_mode (DImode, 1).exists (&vmode)) + gcc_unreachable (); + + rtx vec = gen_reg_rtx (vmode); + insn_code icode1 = code_for_pred_broadcast (vmode); + rtx ops1[] = {vec, a0}; + emit_nonvlmax_insn (icode1, UNARY_OP, ops1, CONST1_RTX (Pmode)); + + rtx rvv1di_reg = gen_rtx_SUBREG (RVVM1DImode, vec, 0); + insn_code icode2 = code_for_pred_vclmul_scalar (UNSPEC_VCLMUL, + E_RVVM1DImode); + rtx ops2[] = {rvv1di_reg, rvv1di_reg, t0}; + emit_nonvlmax_insn (icode2, riscv_vector::BINARY_OP, ops2, CONST1_RTX + (Pmode)); + + rtx shift_amount = gen_int_mode (BITS_PER_WORD - data_size, Pmode); + insn_code icode3 = code_for_pred_scalar (ASHIFT, vmode); + rtx ops3[] = {vec, vec, shift_amount}; + emit_nonvlmax_insn (icode3, BINARY_OP, ops3, CONST1_RTX (Pmode)); + + insn_code icode4 = code_for_pred_vclmul_scalar (UNSPEC_VCLMULH, + E_RVVM1DImode); + rtx ops4[] = {rvv1di_reg, rvv1di_reg, t1}; + emit_nonvlmax_insn (icode4, riscv_vector::BINARY_OP, ops4, CONST1_RTX + (Pmode)); + + rtx vec_low_lane = gen_lowpart (DImode, vec); + riscv_emit_move (a0, vec_low_lane); + } if (crc_size > data_size) { @@ -14140,6 +14210,205 @@ bool need_shadow_stack_push_pop_p () return is_zicfiss_p () && riscv_save_return_addr_reg_p (); } +/* Synthesize OPERANDS[0] = OPERANDS[1] CODE OPERANDS[2]. + + OPERANDS[0] and OPERANDS[1] will be a REG and may be the same + REG. + + OPERANDS[2] is a CONST_INT. + + CODE is IOR or XOR. + + Return TRUE if the operation was fully synthesized and the caller + need not generate additional code. Return FALSE if the operation + was not synthesized and the caller is responsible for emitting the + proper sequence. */ + +bool +synthesize_ior_xor (rtx_code code, rtx operands[3]) +{ + /* Trivial cases that don't need synthesis. */ + if (SMALL_OPERAND (INTVAL (operands[2])) + || ((TARGET_ZBS || TARGET_XTHEADBS || TARGET_ZBKB) + && single_bit_mask_operand (operands[2], word_mode))) + return false; + + /* The number of instructions to synthesize the constant is a good + estimate of the budget. That does not account for out-of-order + execution and fusion in the constant synthesis, which would naturally + decrease the budget. It also does not account for the IOR/XOR at + the end of the sequence which would increase the budget. */ + int budget = (TARGET_ZBS ? riscv_const_insns (operands[2], true) : -1); + int original_budget = budget; + + /* Bits we need to set in operands[0]. As we synthesize the operation, + we clear bits in IVAL. Once IVAL is zero, then synthesis of the + operation is complete. */ + unsigned HOST_WIDE_INT ival = INTVAL (operands[2]); + + /* Check if we want to use [x]ori.
Then get the remaining bits + and decrease the budget by one. */ + if ((ival & HOST_WIDE_INT_UC (0x7ff)) != 0) + { + ival &= ~HOST_WIDE_INT_UC (0x7ff); + budget--; + } + + /* Check for bseti cases. For each remaining bit in ival, + decrease the budget by one. */ + while (ival) + { + HOST_WIDE_INT tmpval = HOST_WIDE_INT_UC (1) << ctz_hwi (ival); + ival &= ~tmpval; + budget--; + } + + /* If we're flipping all but a small number of bits we can pre-flip + the outliers, then flip all the bits, which would restore those + bits that were pre-flipped. */ + if ((TARGET_ZBS || TARGET_XTHEADBS || TARGET_ZBKB) + && budget < 0 + && code == XOR + && popcount_hwi (~INTVAL (operands[2])) < original_budget) + { + /* Pre-flipping bits we want to preserve. */ + rtx input = operands[1]; + ival = ~INTVAL (operands[2]); + while (ival) + { + HOST_WIDE_INT tmpval = HOST_WIDE_INT_UC (1) << ctz_hwi (ival); + rtx x = GEN_INT (tmpval); + x = gen_rtx_XOR (word_mode, input, x); + emit_insn (gen_rtx_SET (operands[0], x)); + input = operands[0]; + ival &= ~tmpval; + } + + /* Now flip all the bits, which restores the bits we were + preserving. */ + rtx x = gen_rtx_NOT (word_mode, input); + emit_insn (gen_rtx_SET (operands[0], x)); + return true; + } + + /* One more approach we can try. If our budget is 3+ instructions, + then we can try to rotate the source so that the bits we want to + set are in the low 11 bits. We then use [x]ori to set those low + bits, then rotate things back into their proper place. */ + if ((TARGET_ZBB || TARGET_XTHEADBB || TARGET_ZBKB) + && budget < 0 + && popcount_hwi (INTVAL (operands[2])) <= 11 + && riscv_const_insns (operands[2], true) >= 3) + { + ival = INTVAL (operands[2]); + /* First see if the constant trivially fits into 11 bits in the LSB. */ + int lsb = ctz_hwi (ival); + int msb = BITS_PER_WORD - 1 - clz_hwi (ival); + if (msb - lsb + 1 <= 11) + { + /* Rotate the source right by LSB bits. */ + rtx x = GEN_INT (lsb); + x = gen_rtx_ROTATERT (word_mode, operands[1], x); + emit_insn (gen_rtx_SET (operands[0], x)); + + /* Shift the constant right by LSB bits. */ + x = GEN_INT (ival >> lsb); + + /* Perform the IOR/XOR operation. */ + x = gen_rtx_fmt_ee (code, word_mode, operands[0], x); + emit_insn (gen_rtx_SET (operands[0], x)); + + /* And rotate left to put everything back in place, we don't + have rotate left by a constant, so use rotate right by + an adjusted constant. */ + x = GEN_INT (BITS_PER_WORD - lsb); + x = gen_rtx_ROTATERT (word_mode, operands[1], x); + emit_insn (gen_rtx_SET (operands[0], x)); + return true; + } + + /* Maybe the bits are split between the high and low parts + of the constant. A bit more complex, but still manageable. + + Conceptually we want to rotate left the constant by the number + of leading zeros after masking off all but the low 11 bits. */ + int rotcount = clz_hwi (ival & 0x7ff) - (BITS_PER_WORD - 11); + + /* Rotate the constant left by MSB bits. */ + ival = (ival << rotcount) | (ival >> (BITS_PER_WORD - rotcount)); + + /* Now we can do the same tests as before. */ + lsb = ctz_hwi (ival); + msb = BITS_PER_WORD - clz_hwi (ival); + if ((INTVAL (operands[2]) & HOST_WIDE_INT_UC (0x7ff)) != 0 + && msb - lsb + 1 <= 11) + { + /* Rotate the source left by ROTCOUNT bits, we don't have + rotate left by a constant, so use rotate right by an + adjusted constant. */ + rtx x = GEN_INT (BITS_PER_WORD - rotcount); + x = gen_rtx_ROTATERT (word_mode, operands[1], x); + emit_insn (gen_rtx_SET (operands[0], x)); + + /* We've already rotated the constant. 
So perform the IOR/XOR + operation. */ + x = GEN_INT (ival); + x = gen_rtx_fmt_ee (code, word_mode, operands[0], x); + emit_insn (gen_rtx_SET (operands[0], x)); + + /* And rotate right to put everything into its proper place. */ + x = GEN_INT (rotcount); + x = gen_rtx_ROTATERT (word_mode, operands[0], x); + emit_insn (gen_rtx_SET (operands[0], x)); + return true; + } + } + + /* If after accounting for bseti the remaining budget has + gone to less than zero, it forces the value into a + register and performs the IOR operation. It returns + TRUE to the caller so the caller knows code generation + is complete. */ + if (budget < 0) + { + rtx x = force_reg (word_mode, operands[2]); + x = gen_rtx_fmt_ee (code, word_mode, operands[1], x); + emit_insn (gen_rtx_SET (operands[0], x)); + return true; + } + + /* Synthesis is better than loading the constant. */ + ival = INTVAL (operands[2]); + rtx input = operands[1]; + + /* Emit the [x]ori insn that sets the low 11 bits into + the proper state. */ + if ((ival & HOST_WIDE_INT_UC (0x7ff)) != 0) + { + rtx x = GEN_INT (ival & HOST_WIDE_INT_UC (0x7ff)); + x = gen_rtx_fmt_ee (code, word_mode, input, x); + emit_insn (gen_rtx_SET (operands[0], x)); + input = operands[0]; + ival &= ~HOST_WIDE_INT_UC (0x7ff); + } + + /* We figure out a single bit as a constant and + generate a CONST_INT node for that. Then we + construct the IOR node, then the SET node and + emit it. An IOR with a suitable constant that is + a single bit will be implemented with a bseti. */ + while (ival) + { + HOST_WIDE_INT tmpval = HOST_WIDE_INT_UC (1) << ctz_hwi (ival); + rtx x = GEN_INT (tmpval); + x = gen_rtx_fmt_ee (code, word_mode, input, x); + emit_insn (gen_rtx_SET (operands[0], x)); + input = operands[0]; + ival &= ~tmpval; + } + return true; +} + /* Initialize the GCC target structure. */ #undef TARGET_ASM_ALIGNED_HI_OP #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t" diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index 259997f..154b49d 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -1767,8 +1767,15 @@ (define_expand "<optab><mode>3" [(set (match_operand:X 0 "register_operand") (any_or:X (match_operand:X 1 "register_operand" "") - (match_operand:X 2 "arith_or_zbs_operand" "")))] - "") + (match_operand:X 2 "reg_or_const_int_operand" "")))] + "" + +{ + /* If synthesis of the logical op is successful, then no further code + generation is necessary. Else just generate code normally. */ + if (CONST_INT_P (operands[2]) && synthesize_ior_xor (<OPTAB>, operands)) + DONE; +}) (define_insn "*<optab><mode>3" [(set (match_operand:X 0 "register_operand" "=r,r") diff --git a/gcc/fortran/dependency.cc b/gcc/fortran/dependency.cc index 57c0c49..aa8a57a 100644 --- a/gcc/fortran/dependency.cc +++ b/gcc/fortran/dependency.cc @@ -944,8 +944,12 @@ gfc_ref_needs_temporary_p (gfc_ref *ref) types), not in characters. */ return subarray_p; - case REF_COMPONENT: case REF_INQUIRY: + /* Within an array reference, inquiry references of complex + variables generally need a temporary. */ + return subarray_p; + + case REF_COMPONENT: break; } diff --git a/gcc/fortran/trans-types.cc b/gcc/fortran/trans-types.cc index 3374778..f898075 100644 --- a/gcc/fortran/trans-types.cc +++ b/gcc/fortran/trans-types.cc @@ -1140,11 +1140,6 @@ gfc_init_types (void) } gfc_character1_type_node = gfc_character_types[0]; - /* The middle end only recognizes a single unsigned type. For - compatibility of existing test cases, let's just use the - character type. 
The reader of tree dumps is expected to be able - to deal with this. */ - if (flag_unsigned) { for (index = 0; gfc_unsigned_kinds[index].kind != 0;++index) @@ -1159,18 +1154,26 @@ gfc_init_types (void) break; } } - if (index_char > 0) + if (index_char > -1) { - gfc_unsigned_types[index] = gfc_character_types[index_char]; + type = gfc_character_types[index_char]; + if (TYPE_STRING_FLAG (type)) + { + type = build_distinct_type_copy (type); + TYPE_CANONICAL (type) + = TYPE_CANONICAL (gfc_character_types[index_char]); + } + else + type = build_variant_type_copy (type); + TYPE_NAME (type) = NULL_TREE; + TYPE_STRING_FLAG (type) = 0; } else - { - type = gfc_build_unsigned_type (&gfc_unsigned_kinds[index]); - gfc_unsigned_types[index] = type; - snprintf (name_buf, sizeof(name_buf), "unsigned(kind=%d)", - gfc_integer_kinds[index].kind); - PUSH_TYPE (name_buf, type); - } + type = gfc_build_unsigned_type (&gfc_unsigned_kinds[index]); + gfc_unsigned_types[index] = type; + snprintf (name_buf, sizeof(name_buf), "unsigned(kind=%d)", + gfc_integer_kinds[index].kind); + PUSH_TYPE (name_buf, type); } } diff --git a/gcc/testsuite/gcc.dg/pr87600.h b/gcc/testsuite/gcc.dg/pr87600.h index af91f63..c89071eb 100644 --- a/gcc/testsuite/gcc.dg/pr87600.h +++ b/gcc/testsuite/gcc.dg/pr87600.h @@ -7,7 +7,7 @@ #elif defined (__i386__) # define REG1 "%eax" # define REG2 "%edx" -#elif defined (__powerpc__) || defined (__POWERPC__) +#elif defined (__powerpc__) || defined (__POWERPC__) || defined (__PPC__) # define REG1 "r3" # define REG2 "r4" #elif defined (__s390__) diff --git a/gcc/testsuite/gcc.dg/pr89313.c b/gcc/testsuite/gcc.dg/pr89313.c index 76cb091..7de64da 100644 --- a/gcc/testsuite/gcc.dg/pr89313.c +++ b/gcc/testsuite/gcc.dg/pr89313.c @@ -8,7 +8,7 @@ # define REG "r0" #elif defined (__i386__) # define REG "%eax" -#elif defined (__powerpc__) || defined (__POWERPC__) +#elif defined (__powerpc__) || defined (__POWERPC__) || defined (__PPC__) # define REG "r3" #elif defined (__s390__) # define REG "0" diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr120080.c b/gcc/testsuite/gcc.dg/tree-ssa/pr120080.c new file mode 100644 index 0000000..d71ef5e --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr120080.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-fgimple -O2" } */ + +void __GIMPLE (ssa,startwith("switchlower1")) +foo (int b) +{ + __BB(2): + switch (b) {default: L9; case 0: L5; case 5: L5; case 101: L5; } + + __BB(3): +L9: + switch (b) {default: L7; case 5: L6; case 101: L6; } + + __BB(4): +L6: + __builtin_unreachable (); + + __BB(5): +L7: + __builtin_trap (); + + __BB(6): +L5: + return; + +} diff --git a/gcc/testsuite/gcc.target/aarch64/pr99988.c b/gcc/testsuite/gcc.target/aarch64/pr99988.c index 7cca496..c09ce67 100644 --- a/gcc/testsuite/gcc.target/aarch64/pr99988.c +++ b/gcc/testsuite/gcc.target/aarch64/pr99988.c @@ -1,5 +1,5 @@ /* { dg-do compile { target lp64 } } */ -/* { dg-options "-O2 -mbranch-protection=standard" } */ +/* { dg-options "-O2 -mbranch-protection=standard -fno-bit-tests" } */ /* { dg-final { scan-assembler-times {bti j} 13 } } */ int a; int c(); diff --git a/gcc/testsuite/gcc.target/i386/minmax-6.c b/gcc/testsuite/gcc.target/i386/minmax-6.c index 615f919..23f61c5 100644 --- a/gcc/testsuite/gcc.target/i386/minmax-6.c +++ b/gcc/testsuite/gcc.target/i386/minmax-6.c @@ -15,4 +15,4 @@ UMVLine16Y_11 (short unsigned int * Pic, int y, int width) /* We do not want the RA to spill %esi for it's dual-use but using pmaxsd is OK. */ /* { dg-final { scan-assembler-not "rsp" { target { ! 
{ ia32 } } } } } */ -/* { dg-final { scan-assembler "pmaxsd" } } */ +/* { dg-final { scan-assembler "pmaxsd" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/i386/minmax-7.c b/gcc/testsuite/gcc.target/i386/minmax-7.c index 619a939..b2cb1c2 100644 --- a/gcc/testsuite/gcc.target/i386/minmax-7.c +++ b/gcc/testsuite/gcc.target/i386/minmax-7.c @@ -17,4 +17,4 @@ void bar (int aleft, int axcenter) /* We do not want the RA to spill %esi for it's dual-use but using pminsd is OK. */ /* { dg-final { scan-assembler-not "rsp" { target { ! { ia32 } } } } } */ -/* { dg-final { scan-assembler "pminsd" } } */ +/* { dg-final { scan-assembler "pminsd" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/riscv/ior-synthesis-1.c b/gcc/testsuite/gcc.target/riscv/ior-synthesis-1.c new file mode 100644 index 0000000..04644cd --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/ior-synthesis-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile { target { rv64 } } } */ +/* { dg-options "-march=rv64gb -mabi=lp64d" } */ + +unsigned long foo(unsigned long src) { return src | 0x8c00000000000001; } + +/* { dg-final { scan-assembler-times "\\srori\t" 2 } } */ +/* { dg-final { scan-assembler-times "\\sori\t" 1 } } */ + diff --git a/gcc/testsuite/gcc.target/riscv/ior-synthesis-2.c b/gcc/testsuite/gcc.target/riscv/ior-synthesis-2.c new file mode 100644 index 0000000..f28fe5e --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/ior-synthesis-2.c @@ -0,0 +1,8 @@ +/* { dg-do compile { target { rv64 } } } */ +/* { dg-options "-march=rv64gb -mabi=lp64d" } */ + +unsigned long foo(unsigned long src) { return src | 0x8800000000000007; } + +/* { dg-final { scan-assembler-times "\\sbseti\t" 2 } } */ +/* { dg-final { scan-assembler-times "\\sori\t" 1 } } */ + diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/crc-builtin-zvbc.c b/gcc/testsuite/gcc.target/riscv/rvv/base/crc-builtin-zvbc.c new file mode 100644 index 0000000..2d5fa88 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/crc-builtin-zvbc.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv_zvbc -mabi=lp64d" } */ + +#include <stdint-gcc.h> + +int8_t crc8_data8 () +{ + return __builtin_crc8_data8 (0x34, 'a', 0x12); +} + +int16_t crc16_data8 () +{ + return __builtin_crc16_data8 (0x1234, 'a', 0x1021); +} + +int16_t crc16_data16 () +{ + return __builtin_crc16_data16 (0x1234, 0x3214, 0x1021); +} + +int32_t crc32_data8 () +{ + return __builtin_crc32_data8 (0xffffffff, 0x32, 0x4002123); +} + +int32_t crc32_data16 () +{ + return __builtin_crc32_data16 (0xffffffff, 0x3232, 0x4002123); +} + +int32_t crc32_data32 () +{ + return __builtin_crc32_data32 (0xffffffff, 0x123546ff, 0x4002123); +} + +int8_t rev_crc8_data8 () +{ + return __builtin_rev_crc8_data8 (0x34, 'a', 0x12); +} + +int16_t rev_crc16_data8 () +{ + return __builtin_rev_crc16_data8 (0x1234, 'a', 0x1021); +} + +int16_t rev_crc16_data16 () +{ + return __builtin_rev_crc16_data16 (0x1234, 0x3214, 0x1021); +} + +int32_t rev_crc32_data8 () +{ + return __builtin_rev_crc32_data8 (0xffffffff, 0x32, 0x4002123); +} + +int32_t rev_crc32_data16 () +{ + return __builtin_rev_crc32_data16 (0xffffffff, 0x3232, 0x4002123); +} + +int32_t rev_crc32_data32 () +{ + return __builtin_rev_crc32_data32 (0xffffffff, 0x123546ff, 0x4002123); +} +/* { dg-final { scan-assembler-times "vclmul.vx" 12 } } */ +/* { dg-final { scan-assembler-times "vclmulh.vx" 12 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/xor-synthesis-1.c b/gcc/testsuite/gcc.target/riscv/xor-synthesis-1.c new file mode 100644 index 0000000..c630a79 
--- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/xor-synthesis-1.c @@ -0,0 +1,8 @@ +/* { dg-do compile { target { rv64 } } } */ +/* { dg-options "-march=rv64gb -mabi=lp64d" } */ + +unsigned long foo(unsigned long src) { return src ^ 0xffffffffefffffffUL; } + +/* { dg-final { scan-assembler-times "\\sbinvi\t" 1 } } */ +/* { dg-final { scan-assembler-times "\\snot\t" 1 } } */ + diff --git a/gcc/testsuite/gcc.target/riscv/xor-synthesis-2.c b/gcc/testsuite/gcc.target/riscv/xor-synthesis-2.c new file mode 100644 index 0000000..25457d2 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/xor-synthesis-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile { target { rv64 } } } */ +/* { dg-options "-march=rv64gb -mabi=lp64d" } */ +/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-Og" } } */ + +unsigned long foo(unsigned long src) { return src ^ 0x8800000000000007; } + +/* xfailed until we remove mvconst_internal. */ +/* { dg-final { scan-assembler-times "\\sbinvi\t" 2 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times "\\sxori\t" 1 { xfail *-*-* } } } */ + diff --git a/gcc/testsuite/gcc.target/riscv/xor-synthesis-3.c b/gcc/testsuite/gcc.target/riscv/xor-synthesis-3.c new file mode 100644 index 0000000..765904b --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/xor-synthesis-3.c @@ -0,0 +1,8 @@ +/* { dg-do compile { target { rv64 } } } */ +/* { dg-options "-march=rv64gb -mabi=lp64d" } */ + +unsigned long foo(unsigned long src) { return src ^ 0x8c00000000000001; } + +/* { dg-final { scan-assembler-times "\\srori\t" 2 } } */ +/* { dg-final { scan-assembler-times "\\sxori\t" 1 } } */ + diff --git a/gcc/testsuite/gfortran.dg/guality/pr120193.f90 b/gcc/testsuite/gfortran.dg/guality/pr120193.f90 new file mode 100644 index 0000000..e65febf --- /dev/null +++ b/gcc/testsuite/gfortran.dg/guality/pr120193.f90 @@ -0,0 +1,26 @@ +! PR fortran/120193 +! { dg-do run } +! { dg-options "-g -funsigned" } +! { dg-skip-if "" { *-*-* } { "*" } { "-O0" } } + +program foo + unsigned(kind=1) :: a(2), e + unsigned(kind=2) :: b(2), f + unsigned(kind=4) :: c(2), g + unsigned(kind=8) :: d(2), h + character(kind=1, len=1) :: i(2), j + character(kind=4, len=1) :: k(2), l + a = 97u_1 ! { dg-final { gdb-test 24 "a" "d" } } + b = 97u_2 ! { dg-final { gdb-test 24 "b" "c" } } + c = 97u_4 ! { dg-final { gdb-test 24 "c" "b" } } + d = 97u_8 ! { dg-final { gdb-test 24 "d" "a" } } + e = 97u_1 ! { dg-final { gdb-test 24 "e" "97" } } + f = 97u_2 ! { dg-final { gdb-test 24 "f" "97" } } + g = 97u_4 ! { dg-final { gdb-test 24 "g" "97" } } + h = 97u_8 ! { dg-final { gdb-test 24 "h" "97" } } + i = 'a' ! { dg-final { gdb-test 24 "i" "('a', 'a')" } } + j = 'b' ! { dg-final { gdb-test 24 "j" "'b'" } } + k = 'c' + l = 'd' + print *, a +end program diff --git a/gcc/testsuite/gfortran.dg/transfer_array_subref.f90 b/gcc/testsuite/gfortran.dg/transfer_array_subref.f90 new file mode 100644 index 0000000..b480dff --- /dev/null +++ b/gcc/testsuite/gfortran.dg/transfer_array_subref.f90 @@ -0,0 +1,48 @@ +! { dg-do run } +! { dg-additional-options "-O2 -fdump-tree-optimized" } +! +! PR fortran/102891 - passing of inquiry ref of complex array to TRANSFER + +program main + implicit none + integer, parameter :: dp = 8 + + type complex_wrap1 + complex(dp) :: z(2) + end type complex_wrap1 + + type complex_wrap2 + complex(dp), dimension(:), allocatable :: z + end type complex_wrap2 + + type(complex_wrap1) :: x = complex_wrap1([ (1, 2), (3, 4) ]) + type(complex_wrap2) :: w + + w%z = x%z + + ! The following statements should get optimized away... 
+ if (size (transfer ( x%z%re ,[1.0_dp])) /= 2) error stop 1 + if (size (transfer ((x%z%re),[1.0_dp])) /= 2) error stop 2 + if (size (transfer ([x%z%re],[1.0_dp])) /= 2) error stop 3 + if (size (transfer ( x%z%im ,[1.0_dp])) /= 2) error stop 4 + if (size (transfer ((x%z%im),[1.0_dp])) /= 2) error stop 5 + if (size (transfer ([x%z%im],[1.0_dp])) /= 2) error stop 6 + + ! ... while the following may not: + if (any (transfer ( x%z%re ,[1.0_dp]) /= x%z%re)) stop 7 + if (any (transfer ( x%z%im ,[1.0_dp]) /= x%z%im)) stop 8 + + if (size (transfer ( w%z%re ,[1.0_dp])) /= 2) stop 11 + if (size (transfer ((w%z%re),[1.0_dp])) /= 2) stop 12 + if (size (transfer ([w%z%re],[1.0_dp])) /= 2) stop 13 + if (size (transfer ( w%z%im ,[1.0_dp])) /= 2) stop 14 + if (size (transfer ((w%z%im),[1.0_dp])) /= 2) stop 15 + if (size (transfer ([w%z%im],[1.0_dp])) /= 2) stop 16 + + if (any (transfer ( w%z%re ,[1.0_dp]) /= x%z%re)) stop 17 + if (any (transfer ( w%z%im ,[1.0_dp]) /= x%z%im)) stop 18 + + deallocate (w%z) +end program main + +! { dg-final { scan-tree-dump-not "_gfortran_error_stop_numeric" "optimized" } } diff --git a/gcc/tree-switch-conversion.cc b/gcc/tree-switch-conversion.cc index dea217a..bd4de96 100644 --- a/gcc/tree-switch-conversion.cc +++ b/gcc/tree-switch-conversion.cc @@ -1793,12 +1793,14 @@ bit_test_cluster::find_bit_tests (vec<cluster *> &clusters, int max_c) end up with as few clusters as possible. */ unsigned l = clusters.length (); - auto_vec<min_cluster_item> min; - min.reserve (l + 1); - gcc_checking_assert (l > 0); + if (l == 0) + return clusters.copy (); gcc_checking_assert (l <= INT_MAX); + auto_vec<min_cluster_item> min; + min.reserve (l + 1); + int bits_in_word = GET_MODE_BITSIZE (word_mode); /* First phase: Compute the minimum number of clusters for each prefix of the |
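
The i386-features.cc hunks change how the scalar-to-vector (STV) pass accounts costs: per-insn decisions now use optimize_bb_for_size_p on the insn's own basic block instead of the global optimize_insn_for_size_p, size gains are expressed in bytes via COSTS_N_BYTES, and latency gains taken from the processor cost tables (normalized, as the new comment says, so that a reg-reg move costs 2) are converted to COSTS_N_INSNS units by halving. A minimal standalone sketch of that unit conversion, with the macros redefined locally (COSTS_N_INSNS as in gcc/rtl.h, COSTS_N_BYTES mirroring the i386 backend's local macro) and made-up table values:

    /* Minimal sketch of the cost units used by the STV gain computation.
       The table values below are illustrative, not taken from any real
       processor_costs entry.  */
    #include <stdio.h>

    #define COSTS_N_INSNS(N) ((N) * 4)	/* One instruction = 4 units.  */
    #define COSTS_N_BYTES(N) ((N) * 2)	/* One byte = 2 units.  */

    struct fake_costs
    {
      int int_load;	/* Latency cost, normalized so reg-reg move == 2.  */
      int sse_load;
    };

    /* Gain from replacing M integer loads with one SSE load.  Positive
       means the vector form wins.  Latency costs are relative to a
       reg-reg move of cost 2, hence the division by 2 to express the
       difference in COSTS_N_INSNS units.  */
    static int
    load_gain (const struct fake_costs *c, int m, int for_size)
    {
      if (for_size)
        /* Integer load is 3+ bytes, SSE load 4+ bytes.  */
        return COSTS_N_BYTES (3 * m - 4);
      return COSTS_N_INSNS (m * c->int_load - c->sse_load) / 2;
    }

    int
    main (void)
    {
      struct fake_costs c = { 4, 6 };
      printf ("speed gain (DImode, -m32, m == 2): %d\n", load_gain (&c, 2, 0));
      printf ("size gain  (DImode, -m32, m == 2): %d\n", load_gain (&c, 2, 1));
      return 0;
    }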
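
The cygming.h and i386.cc hunks move 32-bit Windows away from unconditional stack realignment (STACK_REALIGN_DEFAULT removed, PREFERRED_STACK_BOUNDARY_DEFAULT dropped to MIN_STACK_BOUNDARY), while the force_align_arg_pointer attribute now raises the preferred boundary back to 128 bits. A usage sketch of the opt-in this implies; the callback and vector type are made up, but the attribute is GCC's real force_align_arg_pointer:

    /* A function the OS may call on a 4-byte-aligned stack opts in to
       realignment explicitly so its SSE locals get 16-byte slots.  */
    typedef float v4sf __attribute__ ((vector_size (16)));

    __attribute__ ((force_align_arg_pointer)) void
    win32_callback (const float *in, float *out)
    {
      v4sf tmp;				/* Needs a 16-byte-aligned slot.  */
      __builtin_memcpy (&tmp, in, sizeof tmp);
      tmp = tmp + tmp;
      __builtin_memcpy (out, &tmp, sizeof tmp);
    }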
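
The riscv.cc CRC hunks add a TARGET_ZVBC path that broadcasts the scalar into a vector register and uses vclmul.vx/vclmulh.vx, mirroring the scalar clmul/clmulh sequence. Below is a bit-serial C model of the emitted sequence for the reversed-CRC case; it assumes T0 and T1 are the constants the expander precomputes from the CRC polynomial (the values in main are placeholders, not real CRC constants), and it elides the final masking done when crc_size > data_size:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    clmul (uint64_t a, uint64_t b)	/* Low 64 bits of a clmul b.  */
    {
      uint64_t r = 0;
      for (int i = 0; i < 64; i++)
        if ((b >> i) & 1)
          r ^= a << i;
      return r;
    }

    static uint64_t
    clmulh (uint64_t a, uint64_t b)	/* High 64 bits of a clmul b.  */
    {
      uint64_t r = 0;
      for (int i = 1; i < 64; i++)
        if ((b >> i) & 1)
          r ^= a >> (64 - i);
      return r;
    }

    /* Mirrors expand_reversed_crc_using_clmul: xor in the data, clmul by
       T0, shift left by BITS_PER_WORD - data_size, clmulh by T1.  */
    static uint64_t
    crc_step (uint64_t crc, uint64_t data, int data_size,
              uint64_t t0, uint64_t t1)
    {
      uint64_t a0 = crc ^ data;
      a0 = clmul (a0, t0);
      a0 <<= 64 - data_size;
      return clmulh (a0, t1);
    }

    int
    main (void)
    {
      assert (clmul (3, 3) == 5);	/* (x+1)*(x+1) == x^2 + 1.  */
      assert (clmulh (UINT64_C (1) << 63, 2) == 1);
      (void) crc_step (0xffffffff, 0x32, 8, 0x1234, 0x5678); /* Placeholders. */
      return 0;
    }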
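
synthesize_ior_xor backs the riscv.md expander change from arith_or_zbs_operand to reg_or_const_int_operand: a constant that is neither a small immediate nor a single bit is split into one [x]ori covering the low 11 bits plus one bseti (binvi for XOR) per remaining set bit, when that beats the budget for materializing the constant. A standalone model of that decomposition; the helper name "decompose" is made up for illustration:

    #include <stdint.h>
    #include <stdio.h>

    static int
    decompose (uint64_t ival)
    {
      int insns = 0;
      if (ival & 0x7ff)			/* Low 11 bits: one [x]ori.  */
        {
          printf ("[x]ori mask 0x%llx\n", (unsigned long long) (ival & 0x7ff));
          ival &= ~UINT64_C (0x7ff);
          insns++;
        }
      while (ival)			/* One bseti/binvi per bit.  */
        {
          uint64_t bit = ival & -ival;	/* Lowest set bit.  */
          printf ("bseti/binvi bit 0x%llx\n", (unsigned long long) bit);
          ival &= ~bit;
          insns++;
        }
      return insns;
    }

    int
    main (void)
    {
      /* 0x8800000000000007 = bits 0-2 plus bits 59 and 63: one ori and
         two bseti, as gcc.target/riscv/ior-synthesis-2.c expects.  */
      return decompose (UINT64_C (0x8800000000000007)) == 3 ? 0 : 1;
    }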
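
For XOR, the pre-flip path handles constants with nearly all bits set: flip the few bits that must survive, then invert everything, which restores the pre-flipped bits. This rests on the identity x ^ ~m == ~(x ^ m), checked below with bit 28 as the single zero bit of 0xffffffffefffffff, matching gcc.target/riscv/xor-synthesis-1.c (one binvi plus one not):

    #include <assert.h>
    #include <stdint.h>

    int
    main (void)
    {
      uint64_t m = UINT64_C (1) << 28;
      for (uint64_t x = 0; x < 100000; x += 37)
        assert ((x ^ ~m) == ~(x ^ m));	/* binvi + not == xor with ~bit.  */
      return 0;
    }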
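
The rotate path handles constants whose set bits fit an 11-bit window only after rotation: rotate the source, apply a single [x]ori with the rotated constant, rotate back, relying on rotl (x, r) | rotl (c, r) == rotl (x | c, r). A checkable sketch with the constant from gcc.target/riscv/ior-synthesis-1.c (two rori plus one ori); the rotation count 10 is worked out from that constant's bit layout:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    rotl (uint64_t v, int n)
    {
      return (v << n) | (v >> (64 - n));
    }

    static uint64_t
    rotr (uint64_t v, int n)
    {
      return (v >> n) | (v << (64 - n));
    }

    int
    main (void)
    {
      const uint64_t c = UINT64_C (0x8c00000000000001); /* Bits 63,59,58,0. */
      const int r = 10;
      assert (rotl (c, r) == 0x630);	/* Now fits in the low 11 bits.  */
      for (uint64_t x = 1; x < 100000; x = x * 3 + 1)
        assert (rotr (rotl (x, r) | rotl (c, r), r) == (x | c));
      return 0;
    }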