From 22aede7a1228617661105048a91fddd8797e141b Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Sat, 10 Apr 2021 12:49:01 +0200 Subject: expand: Fix up LTO ICE with COMPOUND_LITERAL_EXPR [PR99849] The gimplifier optimizes away COMPOUND_LITERAL_EXPRs, but they can remain in the form of ADDR_EXPR of COMPOUND_LITERAL_EXPRs in static initializers. By the TREE_STATIC check I meant to check that the underlying decl of the compound literal is a global rather than automatic variable which obviously can't be referenced in static initializers, but unfortunately with LTO it might end up in another partition and thus be DECL_EXTERNAL instead. 2021-04-10 Jakub Jelinek PR lto/99849 * expr.c (expand_expr_addr_expr_1): Test is_global_var rather than just TREE_STATIC on COMPOUND_LITERAL_EXPR_DECLs. * gcc.dg/lto/pr99849_0.c: New test. --- gcc/expr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 92035c7..a0e1946 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -8204,7 +8204,7 @@ expand_expr_addr_expr_1 (tree exp, rtx target, scalar_int_mode tmode, array with address of COMPOUND_LITERAL_EXPR in DECL_INITIAL; the initializers aren't gimplified. */ if (COMPOUND_LITERAL_EXPR_DECL (exp) - && TREE_STATIC (COMPOUND_LITERAL_EXPR_DECL (exp))) + && is_global_var (COMPOUND_LITERAL_EXPR_DECL (exp))) return expand_expr_addr_expr_1 (COMPOUND_LITERAL_EXPR_DECL (exp), target, tmode, modifier, as); /* FALLTHRU */ -- cgit v1.1 From b972e036f40c12b106f9070c3e8adea0eb8a45fa Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Thu, 15 Apr 2021 13:03:21 +0200 Subject: Move gimplify_buildN API local to only remaining user This moves the legacy gimplify_buildN API to tree-vect-generic.c, its only user, and elides the gimplification step, making it a wrapper around gimple_build, adjusting tree_vec_extract for this. I've noticed that vector CTOR expansion doesn't deal with unfolded {} and thus this makes it more resilient. I've also adjusted the match.pd vector CTOR extraction code to make sure it doesn't produce a CTOR when folding would make it a vector constant. 2021-04-15 Richard Biener * tree-cfg.h (gimplify_build1): Remove. (gimplify_build2): Likewise. (gimplify_build3): Likewise. * tree-cfg.c (gimplify_build1): Move to tree-vect-generic.c. (gimplify_build2): Likewise. (gimplify_build3): Likewise. * tree-vect-generic.c (gimplify_build1): Move from tree-cfg.c. Modernize. (gimplify_build2): Likewise. (gimplify_build3): Likewise. (tree_vec_extract): Use resimplify with following SSA edges. (expand_vector_parallel): Avoid passing NULL size/bitpos to tree_vec_extract. * expr.c (store_constructor): Deal with zero-element CTORs. * match.pd (bit_field_ref ): Make sure to produce vector constants when possible. --- gcc/expr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index a0e1946..5ed716c 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -7019,7 +7019,9 @@ store_constructor (tree exp, rtx target, int cleared, poly_int64 size, /* Compute the size of the elements in the CTOR. It differs from the size of the vector type elements only when the CTOR elements are vectors themselves. */ - tree val_type = TREE_TYPE (CONSTRUCTOR_ELT (exp, 0)->value); + tree val_type = (CONSTRUCTOR_NELTS (exp) != 0 + ?
TREE_TYPE (CONSTRUCTOR_ELT (exp, 0)->value) + : elttype); if (VECTOR_TYPE_P (val_type)) bitsize = tree_to_uhwi (TYPE_SIZE (val_type)); else -- cgit v1.1 From 3dcd1334b4f522352b80814513fdca902fc2a207 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 27 Apr 2021 14:45:45 +0200 Subject: expand: Expand x / y * y as x - x % y if the latter is cheaper [PR96696] The following patch tests both x / y * y and x - x % y expansion for the former GIMPLE code and chooses the cheaper of those sequences. 2021-04-27 Jakub Jelinek PR tree-optimization/96696 * expr.c (expand_expr_divmod): New function. (expand_expr_real_2) : Use it for truncations and divisions. Formatting fixes. : Optimize x / y * y as x - x % y if the latter is cheaper. * gcc.target/i386/pr96696.c: New test. --- gcc/expr.c | 190 ++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 132 insertions(+), 58 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 5ed716c..5a1fda7 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -8664,6 +8664,56 @@ expand_misaligned_mem_ref (rtx temp, machine_mode mode, int unsignedp, return temp; } +/* Helper function of expand_expr_2, expand a division or modulo. + op0 and op1 should be already expanded treeop0 and treeop1, using + expand_operands. */ + +static rtx +expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0, + tree treeop1, rtx op0, rtx op1, rtx target, int unsignedp) +{ + bool mod_p = (code == TRUNC_MOD_EXPR || code == FLOOR_MOD_EXPR + || code == CEIL_MOD_EXPR || code == ROUND_MOD_EXPR); + if (SCALAR_INT_MODE_P (mode) + && optimize >= 2 + && get_range_pos_neg (treeop0) == 1 + && get_range_pos_neg (treeop1) == 1) + { + /* If both arguments are known to be positive when interpreted + as signed, we can expand it as both signed and unsigned + division or modulo. Choose the cheaper sequence in that case. */ + bool speed_p = optimize_insn_for_speed_p (); + do_pending_stack_adjust (); + start_sequence (); + rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1); + rtx_insn *uns_insns = get_insns (); + end_sequence (); + start_sequence (); + rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0); + rtx_insn *sgn_insns = get_insns (); + end_sequence (); + unsigned uns_cost = seq_cost (uns_insns, speed_p); + unsigned sgn_cost = seq_cost (sgn_insns, speed_p); + + /* If costs are the same then use as tie breaker the other other + factor. 
*/ + if (uns_cost == sgn_cost) + { + uns_cost = seq_cost (uns_insns, !speed_p); + sgn_cost = seq_cost (sgn_insns, !speed_p); + } + + if (uns_cost < sgn_cost || (uns_cost == sgn_cost && unsignedp)) + { + emit_insn (uns_insns); + return uns_ret; + } + emit_insn (sgn_insns); + return sgn_ret; + } + return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp); +} + rtx expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, enum expand_modifier modifier) @@ -9201,14 +9251,78 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, if (!REG_P (op0)) op0 = copy_to_mode_reg (mode, op0); - return REDUCE_BIT_FIELD (gen_rtx_MULT (mode, op0, - gen_int_mode (tree_to_shwi (exp1), - TYPE_MODE (TREE_TYPE (exp1))))); + op1 = gen_int_mode (tree_to_shwi (exp1), + TYPE_MODE (TREE_TYPE (exp1))); + return REDUCE_BIT_FIELD (gen_rtx_MULT (mode, op0, op1)); } if (modifier == EXPAND_STACK_PARM) target = 0; + if (SCALAR_INT_MODE_P (mode) && optimize >= 2) + { + gimple *def_stmt0 = get_def_for_expr (treeop0, TRUNC_DIV_EXPR); + gimple *def_stmt1 = get_def_for_expr (treeop1, TRUNC_DIV_EXPR); + if (def_stmt0 + && !operand_equal_p (treeop1, gimple_assign_rhs2 (def_stmt0), 0)) + def_stmt0 = NULL; + if (def_stmt1 + && !operand_equal_p (treeop0, gimple_assign_rhs2 (def_stmt1), 0)) + def_stmt1 = NULL; + + if (def_stmt0 || def_stmt1) + { + /* X / Y * Y can be expanded as X - X % Y too. + Choose the cheaper sequence of those two. */ + if (def_stmt0) + treeop0 = gimple_assign_rhs1 (def_stmt0); + else + { + treeop1 = treeop0; + treeop0 = gimple_assign_rhs1 (def_stmt1); + } + expand_operands (treeop0, treeop1, subtarget, &op0, &op1, + EXPAND_NORMAL); + bool speed_p = optimize_insn_for_speed_p (); + do_pending_stack_adjust (); + start_sequence (); + rtx divmul_ret + = expand_expr_divmod (TRUNC_DIV_EXPR, mode, treeop0, treeop1, + op0, op1, NULL_RTX, unsignedp); + divmul_ret = expand_mult (mode, divmul_ret, op1, target, + unsignedp); + rtx_insn *divmul_insns = get_insns (); + end_sequence (); + start_sequence (); + rtx modsub_ret + = expand_expr_divmod (TRUNC_MOD_EXPR, mode, treeop0, treeop1, + op0, op1, NULL_RTX, unsignedp); + this_optab = optab_for_tree_code (MINUS_EXPR, type, + optab_default); + modsub_ret = expand_binop (mode, this_optab, op0, modsub_ret, + target, unsignedp, OPTAB_LIB_WIDEN); + rtx_insn *modsub_insns = get_insns (); + end_sequence (); + unsigned divmul_cost = seq_cost (divmul_insns, speed_p); + unsigned modsub_cost = seq_cost (modsub_insns, speed_p); + /* If costs are the same then use as tie breaker the other other + factor. */ + if (divmul_cost == modsub_cost) + { + divmul_cost = seq_cost (divmul_insns, !speed_p); + modsub_cost = seq_cost (modsub_insns, !speed_p); + } + + if (divmul_cost <= modsub_cost) + { + emit_insn (divmul_insns); + return REDUCE_BIT_FIELD (divmul_ret); + } + emit_insn (modsub_insns); + return REDUCE_BIT_FIELD (modsub_ret); + } + } + expand_operands (treeop0, treeop1, subtarget, &op0, &op1, EXPAND_NORMAL); return REDUCE_BIT_FIELD (expand_mult (mode, op0, op1, target, unsignedp)); @@ -9222,61 +9336,21 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, case CEIL_DIV_EXPR: case ROUND_DIV_EXPR: case EXACT_DIV_EXPR: - { - /* If this is a fixed-point operation, then we cannot use the code - below because "expand_divmod" doesn't support sat/no-sat fixed-point - divisions. 
*/ - if (ALL_FIXED_POINT_MODE_P (mode)) - goto binop; - - if (modifier == EXPAND_STACK_PARM) - target = 0; - /* Possible optimization: compute the dividend with EXPAND_SUM - then if the divisor is constant can optimize the case - where some terms of the dividend have coeffs divisible by it. */ - expand_operands (treeop0, treeop1, - subtarget, &op0, &op1, EXPAND_NORMAL); - bool mod_p = code == TRUNC_MOD_EXPR || code == FLOOR_MOD_EXPR - || code == CEIL_MOD_EXPR || code == ROUND_MOD_EXPR; - if (SCALAR_INT_MODE_P (mode) - && optimize >= 2 - && get_range_pos_neg (treeop0) == 1 - && get_range_pos_neg (treeop1) == 1) - { - /* If both arguments are known to be positive when interpreted - as signed, we can expand it as both signed and unsigned - division or modulo. Choose the cheaper sequence in that case. */ - bool speed_p = optimize_insn_for_speed_p (); - do_pending_stack_adjust (); - start_sequence (); - rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1); - rtx_insn *uns_insns = get_insns (); - end_sequence (); - start_sequence (); - rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0); - rtx_insn *sgn_insns = get_insns (); - end_sequence (); - unsigned uns_cost = seq_cost (uns_insns, speed_p); - unsigned sgn_cost = seq_cost (sgn_insns, speed_p); - - /* If costs are the same then use as tie breaker the other - other factor. */ - if (uns_cost == sgn_cost) - { - uns_cost = seq_cost (uns_insns, !speed_p); - sgn_cost = seq_cost (sgn_insns, !speed_p); - } - - if (uns_cost < sgn_cost || (uns_cost == sgn_cost && unsignedp)) - { - emit_insn (uns_insns); - return uns_ret; - } - emit_insn (sgn_insns); - return sgn_ret; - } - return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp); - } + /* If this is a fixed-point operation, then we cannot use the code + below because "expand_divmod" doesn't support sat/no-sat fixed-point + divisions. */ + if (ALL_FIXED_POINT_MODE_P (mode)) + goto binop; + + if (modifier == EXPAND_STACK_PARM) + target = 0; + /* Possible optimization: compute the dividend with EXPAND_SUM + then if the divisor is constant can optimize the case + where some terms of the dividend have coeffs divisible by it. */ + expand_operands (treeop0, treeop1, subtarget, &op0, &op1, EXPAND_NORMAL); + return expand_expr_divmod (code, mode, treeop0, treeop1, op0, op1, + target, unsignedp); + case RDIV_EXPR: goto binop; -- cgit v1.1 From 3bb41228d76b3a3cbd9923d57388f0903f7683de Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 26 Apr 2021 15:36:18 -0700 Subject: op_by_pieces_d::run: Change a while loop to a do-while loop Change a while loop in op_by_pieces_d::run to a do-while loop to prepare for offset adjusted operation for the remaining bytes on the last piece operation of a memory region. PR middle-end/90773 * expr.c (op_by_pieces_d::get_usable_mode): New member function. (op_by_pieces_d::run): Change a while loop to a do-while loop.
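Schematically, the restructured loop has the following shape (a standalone toy model, not the GCC code itself: run_pieces, usable_size and the generate callback are hypothetical stand-ins for op_by_pieces_d::run, get_usable_mode and the per-piece code generation, which operate on machine modes rather than raw byte counts):

#include <stddef.h>

/* Toy stand-in for get_usable_mode: the largest power-of-two piece
   size no bigger than MAX that still fits in LEN.  */
static size_t
usable_size (size_t max, size_t len)
{
  size_t size = max;
  while (size > len)
    size >>= 1;
  return size;
}

/* Toy stand-in for op_by_pieces_d::run: the size selection is hoisted
   in front of the loop, and the bottom of the do-while recomputes the
   next narrower usable size, giving the tail a single place to hook
   into (which the follow-up overlap patch exploits).  */
void
run_pieces (size_t len, size_t max_size,
            void (*generate) (size_t offset, size_t size))
{
  if (len == 0)
    return;

  size_t offset = 0;
  size_t size = usable_size (max_size, len);
  do
    {
      while (len >= size)
        {
          generate (offset, size);  /* emit one piece of SIZE bytes */
          offset += size;
          len -= size;
        }
      if (len == 0)
        return;
      size = usable_size (size, len);  /* handle the remaining tail */
    }
  while (1);
}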
--- gcc/expr.c | 76 +++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 23 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 5a1fda7..a4a004d 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -1041,6 +1041,9 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size) class op_by_pieces_d { + private: + scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int); + protected: pieces_addr m_to, m_from; unsigned HOST_WIDE_INT m_len; @@ -1108,6 +1111,25 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, m_align = align; } +/* This function returns the largest usable integer mode for LEN bytes + whose size is no bigger than size of MODE. */ + +scalar_int_mode +op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len) +{ + unsigned int size; + do + { + size = GET_MODE_SIZE (mode); + if (len >= size && prepare_mode (mode, m_align)) + break; + /* NB: widest_int_mode_for_size checks SIZE > 1. */ + mode = widest_int_mode_for_size (size); + } + while (1); + return mode; +} + /* This function contains the main loop used for expanding a block operation. First move what we can in the largest integer mode, then go to successively smaller modes. For every access, call @@ -1116,42 +1138,50 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, void op_by_pieces_d::run () { - while (m_max_size > 1 && m_len > 0) + if (m_len == 0) + return; + + /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1. */ + scalar_int_mode mode = widest_int_mode_for_size (m_max_size); + mode = get_usable_mode (mode, m_len); + + do { - scalar_int_mode mode = widest_int_mode_for_size (m_max_size); + unsigned int size = GET_MODE_SIZE (mode); + rtx to1 = NULL_RTX, from1; - if (prepare_mode (mode, m_align)) + while (m_len >= size) { - unsigned int size = GET_MODE_SIZE (mode); - rtx to1 = NULL_RTX, from1; + if (m_reverse) + m_offset -= size; - while (m_len >= size) - { - if (m_reverse) - m_offset -= size; + to1 = m_to.adjust (mode, m_offset); + from1 = m_from.adjust (mode, m_offset); - to1 = m_to.adjust (mode, m_offset); - from1 = m_from.adjust (mode, m_offset); + m_to.maybe_predec (-(HOST_WIDE_INT)size); + m_from.maybe_predec (-(HOST_WIDE_INT)size); - m_to.maybe_predec (-(HOST_WIDE_INT)size); - m_from.maybe_predec (-(HOST_WIDE_INT)size); + generate (to1, from1, mode); - generate (to1, from1, mode); + m_to.maybe_postinc (size); + m_from.maybe_postinc (size); - m_to.maybe_postinc (size); - m_from.maybe_postinc (size); + if (!m_reverse) + m_offset += size; - if (!m_reverse) - m_offset += size; + m_len -= size; + } - m_len -= size; - } + finish_mode (mode); - finish_mode (mode); - } + if (m_len == 0) + return; - m_max_size = GET_MODE_SIZE (mode); + /* NB: widest_int_mode_for_size checks SIZE > 1. */ + mode = widest_int_mode_for_size (size); + mode = get_usable_mode (mode, m_len); } + while (1); /* The code above should have handled everything. */ gcc_assert (!m_len); -- cgit v1.1 From 985b3a6837dee7001e6b618f073ed74f0edf5787 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 10 Jun 2019 09:57:15 -0700 Subject: Generate offset adjusted operation for op_by_pieces operations Add an overlap_op_by_pieces_p target hook for op_by_pieces operations between two areas of memory to generate one offset adjusted operation in the smallest integer mode for the remaining bytes on the last piece operation of a memory region to avoid doing more than one smaller operation.
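As a concrete illustration (not part of the original commit message): on a 64-bit target whose hook allows overlap, a 13-byte copy can be expanded as two 8-byte moves whose ranges overlap, instead of an 8-byte, a 4-byte and a 1-byte move:

#include <string.h>

/* Illustrative only: the overlapping expansion this hook enables.  */
void
copy13 (char *dst, const char *src)
{
  memcpy (dst, src, 8);          /* bytes 0..7 */
  memcpy (dst + 5, src + 5, 8);  /* bytes 5..12, overlapping bytes 5..7 */
}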
Pass the RTL information from the previous iteration to m_constfn in op_by_pieces operation so that builtin_memset_[read|gen]_str can generate the new RTL from the previous RTL. Tested on Linux/x86-64. gcc/ PR middle-end/90773 * builtins.c (builtin_memcpy_read_str): Add a dummy argument. (builtin_strncpy_read_str): Likewise. (builtin_memset_read_str): Add an argument for the previous RTL information and generate the new RTL from the previous RTL info. (builtin_memset_gen_str): Likewise. * builtins.h (builtin_strncpy_read_str): Update the prototype. (builtin_memset_read_str): Likewise. * expr.c (by_pieces_ninsns): If targetm.overlap_op_by_pieces_p() returns true, round up size and alignment to the widest integer mode for maximum size. (pieces_addr::adjust): Add a pointer to by_pieces_prev argument and pass it to m_constfn. (op_by_pieces_d): Add m_push and m_overlap_op_by_pieces. (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize m_push. Initialize m_overlap_op_by_pieces with targetm.overlap_op_by_pieces_p (). (op_by_pieces_d::run): Pass the previous RTL information to pieces_addr::adjust and generate overlapping operations if m_overlap_op_by_pieces is true. (PUSHG_P): New. (move_by_pieces_d::move_by_pieces_d): Updated for op_by_pieces_d change. (store_by_pieces_d::store_by_pieces_d): Updated for op_by_pieces_d change. (can_store_by_pieces): Use by_pieces_constfn on constfun. (store_by_pieces): Use by_pieces_constfn on constfun. Updated for op_by_pieces_d change. (clear_by_pieces_1): Add a dummy argument. (clear_by_pieces): Updated for op_by_pieces_d change. (compare_by_pieces_d::compare_by_pieces_d): Likewise. (string_cst_read_str): Add a dummy argument. * expr.h (by_pieces_constfn): Add a dummy argument. (by_pieces_prev): New. * target.def (overlap_op_by_pieces_p): New target hook. * config/i386/i386.c (TARGET_OVERLAP_OP_BY_PIECES_P): New. * doc/tm.texi.in: Add TARGET_OVERLAP_OP_BY_PIECES_P. * doc/tm.texi: Regenerated. gcc/testsuite/ PR middle-end/90773 * g++.dg/pr90773-1.h: New test. * g++.dg/pr90773-1a.C: Likewise. * g++.dg/pr90773-1b.C: Likewise. * g++.dg/pr90773-1c.C: Likewise. * g++.dg/pr90773-1d.C: Likewise. * gcc.target/i386/pr90773-1.c: Likewise. * gcc.target/i386/pr90773-2.c: Likewise. * gcc.target/i386/pr90773-3.c: Likewise. * gcc.target/i386/pr90773-4.c: Likewise. * gcc.target/i386/pr90773-5.c: Likewise. * gcc.target/i386/pr90773-6.c: Likewise. * gcc.target/i386/pr90773-7.c: Likewise. * gcc.target/i386/pr90773-8.c: Likewise. * gcc.target/i386/pr90773-9.c: Likewise. * gcc.target/i386/pr90773-10.c: Likewise. * gcc.target/i386/pr90773-11.c: Likewise. * gcc.target/i386/pr90773-12.c: Likewise. * gcc.target/i386/pr90773-13.c: Likewise. * gcc.target/i386/pr90773-14.c: Likewise. --- gcc/expr.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 85 insertions(+), 20 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index a4a004d..e0167b7 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -815,12 +815,27 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, unsigned int max_size, by_pieces_operation op) { unsigned HOST_WIDE_INT n_insns = 0; + scalar_int_mode mode; + + if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES) + { + /* NB: Round up L and ALIGN to the widest integer mode for + MAX_SIZE. 
*/ + mode = widest_int_mode_for_size (max_size); + if (optab_handler (mov_optab, mode) != CODE_FOR_nothing) + { + unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode)); + if (up > l) + l = up; + align = GET_MODE_ALIGNMENT (mode); + } + } align = alignment_for_piecewise_move (MOVE_MAX_PIECES, align); while (max_size > 1 && l > 0) { - scalar_int_mode mode = widest_int_mode_for_size (max_size); + mode = widest_int_mode_for_size (max_size); enum insn_code icode; unsigned int modesize = GET_MODE_SIZE (mode); @@ -888,7 +903,8 @@ class pieces_addr void *m_cfndata; public: pieces_addr (rtx, bool, by_pieces_constfn, void *); - rtx adjust (scalar_int_mode, HOST_WIDE_INT); + rtx adjust (scalar_int_mode, HOST_WIDE_INT, + by_pieces_prev * = nullptr); void increment_address (HOST_WIDE_INT); void maybe_predec (HOST_WIDE_INT); void maybe_postinc (HOST_WIDE_INT); @@ -990,10 +1006,12 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse, but we still modify the MEM's properties. */ rtx -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset) +pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset, + by_pieces_prev *prev) { if (m_constfn) - return m_constfn (m_cfndata, offset, mode); + /* Pass the previous data to m_constfn. */ + return m_constfn (m_cfndata, prev, offset, mode); if (m_obj == NULL_RTX) return NULL_RTX; if (m_auto) @@ -1051,6 +1069,10 @@ class op_by_pieces_d unsigned int m_align; unsigned int m_max_size; bool m_reverse; + /* True if this is a stack push. */ + bool m_push; + /* True if targetm.overlap_op_by_pieces_p () returns true. */ + bool m_overlap_op_by_pieces; /* Virtual functions, overriden by derived classes for the specific operation. */ @@ -1062,7 +1084,7 @@ class op_by_pieces_d public: op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *, - unsigned HOST_WIDE_INT, unsigned int); + unsigned HOST_WIDE_INT, unsigned int, bool); void run (); }; @@ -1077,10 +1099,11 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, by_pieces_constfn from_cfn, void *from_cfn_data, unsigned HOST_WIDE_INT len, - unsigned int align) + unsigned int align, bool push) : m_to (to, to_load, NULL, NULL), m_from (from, from_load, from_cfn, from_cfn_data), - m_len (len), m_max_size (MOVE_MAX_PIECES + 1) + m_len (len), m_max_size (MOVE_MAX_PIECES + 1), + m_push (push) { int toi = m_to.get_addr_inc (); int fromi = m_from.get_addr_inc (); @@ -1109,6 +1132,8 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, align = alignment_for_piecewise_move (MOVE_MAX_PIECES, align); m_align = align; + + m_overlap_op_by_pieces = targetm.overlap_op_by_pieces_p (); } /* This function returns the largest usable integer mode for LEN bytes @@ -1145,6 +1170,9 @@ op_by_pieces_d::run () scalar_int_mode mode = widest_int_mode_for_size (m_max_size); mode = get_usable_mode (mode, m_len); + by_pieces_prev to_prev = { nullptr, mode }; + by_pieces_prev from_prev = { nullptr, mode }; + do { unsigned int size = GET_MODE_SIZE (mode); @@ -1155,8 +1183,12 @@ op_by_pieces_d::run () if (m_reverse) m_offset -= size; - to1 = m_to.adjust (mode, m_offset); - from1 = m_from.adjust (mode, m_offset); + to1 = m_to.adjust (mode, m_offset, &to_prev); + to_prev.data = to1; + to_prev.mode = mode; + from1 = m_from.adjust (mode, m_offset, &from_prev); + from_prev.data = from1; + from_prev.mode = mode; m_to.maybe_predec (-(HOST_WIDE_INT)size); m_from.maybe_predec (-(HOST_WIDE_INT)size); @@ -1177,9 +1209,32 @@ op_by_pieces_d::run () if (m_len == 0) return; - /* NB: widest_int_mode_for_size checks 
SIZE > 1. */ - mode = widest_int_mode_for_size (size); - mode = get_usable_mode (mode, m_len); + if (!m_push && m_overlap_op_by_pieces) + { + /* NB: Generate overlapping operations if it is not a stack + push since stack push must not overlap. Get the smallest + integer mode for M_LEN bytes. */ + mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT); + mode = get_usable_mode (mode, GET_MODE_SIZE (mode)); + int gap = GET_MODE_SIZE (mode) - m_len; + if (gap > 0) + { + /* If size of MODE > M_LEN, generate the last operation + in MODE for the remaining bytes with ovelapping memory + from the previois operation. */ + if (m_reverse) + m_offset += gap; + else + m_offset -= gap; + m_len += gap; + } + } + else + { + /* NB: widest_int_mode_for_size checks SIZE > 1. */ + mode = widest_int_mode_for_size (size); + mode = get_usable_mode (mode, m_len); + } } while (1); @@ -1190,6 +1245,12 @@ op_by_pieces_d::run () /* Derived class from op_by_pieces_d, providing support for block move operations. */ +#ifdef PUSH_ROUNDING +#define PUSHG_P(to) ((to) == nullptr) +#else +#define PUSHG_P(to) false +#endif + class move_by_pieces_d : public op_by_pieces_d { insn_gen_fn m_gen_fun; @@ -1199,7 +1260,8 @@ class move_by_pieces_d : public op_by_pieces_d public: move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len, unsigned int align) - : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align) + : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align, + PUSHG_P (to)) { } rtx finish_retmode (memop_ret); @@ -1294,7 +1356,8 @@ class store_by_pieces_d : public op_by_pieces_d public: store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data, unsigned HOST_WIDE_INT len, unsigned int align) - : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, align) + : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, + align, false) { } rtx finish_retmode (memop_ret); @@ -1349,7 +1412,7 @@ store_by_pieces_d::finish_retmode (memop_ret retmode) int can_store_by_pieces (unsigned HOST_WIDE_INT len, - rtx (*constfun) (void *, HOST_WIDE_INT, scalar_int_mode), + by_pieces_constfn constfun, void *constfundata, unsigned int align, bool memsetp) { unsigned HOST_WIDE_INT l; @@ -1396,7 +1459,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len, if (reverse) offset -= size; - cst = (*constfun) (constfundata, offset, mode); + cst = (*constfun) (constfundata, nullptr, offset, mode); if (!targetm.legitimate_constant_p (mode, cst)) return 0; @@ -1426,7 +1489,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len, rtx store_by_pieces (rtx to, unsigned HOST_WIDE_INT len, - rtx (*constfun) (void *, HOST_WIDE_INT, scalar_int_mode), + by_pieces_constfn constfun, void *constfundata, unsigned int align, bool memsetp, memop_ret retmode) { @@ -1454,7 +1517,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len, Return const0_rtx unconditionally. 
*/ static rtx -clear_by_pieces_1 (void *, HOST_WIDE_INT, scalar_int_mode) +clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode) { return const0_rtx; } @@ -1490,7 +1553,8 @@ class compare_by_pieces_d : public op_by_pieces_d compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn, void *op1_cfn_data, HOST_WIDE_INT len, int align, rtx_code_label *fail_label) - : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len, align) + : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len, + align, false) { m_fail_label = fail_label; } @@ -5676,7 +5740,8 @@ emit_storent_insn (rtx to, rtx from) /* Helper function for store_expr storing of STRING_CST. */ static rtx -string_cst_read_str (void *data, HOST_WIDE_INT offset, scalar_int_mode mode) +string_cst_read_str (void *data, void *, HOST_WIDE_INT offset, + scalar_int_mode mode) { tree str = (tree) data; -- cgit v1.1 From 18d713fbd345c5c54ab6091ac5f114df4551d1bb Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 30 Apr 2021 04:36:36 -0700 Subject: Update alignment_for_piecewise_move alignment_for_piecewise_move is called only with MOVE_MAX_PIECES or STORE_MAX_PIECES, which are the number of bytes at a time that we can move or store efficiently. We should call mode_for_size without limit to MAX_FIXED_MODE_SIZE, which is an integer expression for the size in bits of the largest integer machine mode that should actually be used, and may be smaller than MOVE_MAX_PIECES or STORE_MAX_PIECES, which may use vector modes. * expr.c (alignment_for_piecewise_move): Call mode_for_size without limit to MAX_FIXED_MODE_SIZE. --- gcc/expr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index e0167b7..b4c110f 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -746,7 +746,7 @@ static unsigned int alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align) { scalar_int_mode tmode - = int_mode_for_size (max_pieces * BITS_PER_UNIT, 1).require (); + = int_mode_for_size (max_pieces * BITS_PER_UNIT, 0).require (); if (align >= GET_MODE_ALIGNMENT (tmode)) align = GET_MODE_ALIGNMENT (tmode); -- cgit v1.1 From da9e6e63d1ae22e530ec7baf59f6ed028bf05776 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Mon, 3 May 2021 22:48:47 -0300 Subject: introduce try store by multiple pieces The ldist pass turns even very short loops into memset calls. E.g., the TFmode emulation calls end with a loop of up to 3 iterations, to zero out trailing words, and the loop distribution pass turns them into calls of the memset builtin. Though short constant-length clearing memsets are usually dealt with efficiently, for non-constant-length ones, the options are setmemM or a function call. RISC-V doesn't have any setmemM pattern, so the loops above end up "optimized" into memset calls, incurring not only the overhead of an explicit call, but also discarding the information the compiler has about the alignment of the destination, and that the length is a multiple of the word alignment. This patch handles variable lengths with multiple conditional power-of-2-constant-sized stores-by-pieces, so as to reduce the overhead of length compares. It also changes the last copy-prop pass into ccp, so that pointer alignment and length's nonzero bits are detected and made available for the expander, even for ldist-introduced SSA_NAMEs. for gcc/ChangeLog * builtins.c (try_store_by_multiple_pieces): New. (expand_builtin_memset_args): Use it. If target_char_cast fails, proceed as for non-constant val.
Pass len's ctz to... * expr.c (clear_storage_hints): ... this. Try store by multiple pieces after setmem. (clear_storage): Adjust. * expr.h (clear_storage_hints): Likewise. (try_store_by_multiple_pieces): Declare. * passes.def: Replace the last copy_prop with ccp. --- gcc/expr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index b4c110f..1b65f6b 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -3150,7 +3150,8 @@ clear_storage_hints (rtx object, rtx size, enum block_op_methods method, unsigned int expected_align, HOST_WIDE_INT expected_size, unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size, - unsigned HOST_WIDE_INT probable_max_size) + unsigned HOST_WIDE_INT probable_max_size, + unsigned ctz_size) { machine_mode mode = GET_MODE (object); unsigned int align; @@ -3197,6 +3198,10 @@ clear_storage_hints (rtx object, rtx size, enum block_op_methods method, expected_align, expected_size, min_size, max_size, probable_max_size)) ; + else if (try_store_by_multiple_pieces (object, size, ctz_size, + min_size, max_size, + NULL_RTX, 0, align)) + ; else if (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (object))) return set_storage_via_libcall (object, size, const0_rtx, method == BLOCK_OP_TAILCALL); @@ -3214,7 +3219,7 @@ clear_storage (rtx object, rtx size, enum block_op_methods method) min = max = UINTVAL (size); else max = GET_MODE_MASK (GET_MODE (size)); - return clear_storage_hints (object, size, method, 0, -1, min, max, max); + return clear_storage_hints (object, size, method, 0, -1, min, max, max, 0); } -- cgit v1.1 From de09c0ddb81e2dc0e35c2e362532e93ca417200c Mon Sep 17 00:00:00 2001 From: Martin Sebor Date: Tue, 11 May 2021 13:58:48 -0600 Subject: Replace unreachable code with an assert. Resolves: PR middle-end/21433 - The COMPONENT_REF case of expand_expr_real_1 is probably wrong gcc/ChangeLog: PR middle-end/21433 * expr.c (expand_expr_real_1): Replace unreachable code with an assert. --- gcc/expr.c | 57 ++------------------------------------------------------- 1 file changed, 2 insertions(+), 55 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 1b65f6b..d09ee42 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -10949,61 +10949,8 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, goto normal_inner_ref; case COMPONENT_REF: - /* If the operand is a CONSTRUCTOR, we can just extract the - appropriate field if it is present. */ - if (TREE_CODE (treeop0) == CONSTRUCTOR) - { - unsigned HOST_WIDE_INT idx; - tree field, value; - scalar_int_mode field_mode; - - FOR_EACH_CONSTRUCTOR_ELT (CONSTRUCTOR_ELTS (treeop0), - idx, field, value) - if (field == treeop1 - /* We can normally use the value of the field in the - CONSTRUCTOR. However, if this is a bitfield in - an integral mode that we can fit in a HOST_WIDE_INT, - we must mask only the number of bits in the bitfield, - since this is done implicitly by the constructor. If - the bitfield does not meet either of those conditions, - we can't do this optimization. */ - && (! 
DECL_BIT_FIELD (field) - || (is_int_mode (DECL_MODE (field), &field_mode) - && (GET_MODE_PRECISION (field_mode) - <= HOST_BITS_PER_WIDE_INT)))) - { - if (DECL_BIT_FIELD (field) - && modifier == EXPAND_STACK_PARM) - target = 0; - op0 = expand_expr (value, target, tmode, modifier); - if (DECL_BIT_FIELD (field)) - { - HOST_WIDE_INT bitsize = TREE_INT_CST_LOW (DECL_SIZE (field)); - scalar_int_mode imode - = SCALAR_INT_TYPE_MODE (TREE_TYPE (field)); - - if (TYPE_UNSIGNED (TREE_TYPE (field))) - { - op1 = gen_int_mode ((HOST_WIDE_INT_1 << bitsize) - 1, - imode); - op0 = expand_and (imode, op0, op1, target); - } - else - { - int count = GET_MODE_PRECISION (imode) - bitsize; - - op0 = expand_shift (LSHIFT_EXPR, imode, op0, count, - target, 0); - op0 = expand_shift (RSHIFT_EXPR, imode, op0, count, - target, 0); - } - } - - return op0; - } - } - goto normal_inner_ref; - + gcc_assert (TREE_CODE (treeop0) != CONSTRUCTOR); + /* Fall through. */ case BIT_FIELD_REF: case ARRAY_RANGE_REF: normal_inner_ref: -- cgit v1.1 From 53fb833d635da04f5b44af16bcea1082e7b59e75 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 21 May 2021 05:16:20 -0700 Subject: Elide expand_constructor if move by pieces is preferred Elide expand_constructor when the constructor is in static storage, is not mostly zeros, and can be moved by pieces, since that's usually more efficient than performing a series of stores from immediates. 2021-05-21 Richard Biener H.J. Lu gcc/ PR middle-end/90773 * expr.c (expand_constructor): Elide expand_constructor if move by pieces is preferred. gcc/testsuite/ * gcc.target/i386/pr90773-24.c: New test. * gcc.target/i386/pr90773-25.c: Likewise. --- gcc/expr.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index d09ee42..ba61eb9 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -8523,6 +8523,19 @@ expand_constructor (tree exp, rtx target, enum expand_modifier modifier, return constructor; } + /* If the CTOR is available in static storage and not mostly + zeros and we can move it by pieces prefer to do so since + that's usually more efficient than performing a series of + stores from immediates. */ + if (avoid_temp_mem + && TREE_STATIC (exp) + && TREE_CONSTANT (exp) + && tree_fits_uhwi_p (TYPE_SIZE_UNIT (type)) + && can_move_by_pieces (tree_to_uhwi (TYPE_SIZE_UNIT (type)), + TYPE_ALIGN (type)) + && ! mostly_zeros_p (exp)) + return NULL_RTX; + /* Handle calls that pass values in multiple non-contiguous locations. The Irix 6 ABI has examples of this. */ if (target == 0 || ! safe_from_p (target, exp, 1) -- cgit v1.1 From 29a2f51806c5b30e17a8d0e9ba7915a3c53c34ff Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Fri, 26 Feb 2021 04:34:49 -0800 Subject: openacc: Add support for gang local storage allocation in shared memory [PR90115] This patch implements a method to track the "private-ness" of OpenACC variables declared in offload regions in gang-partitioned, worker-partitioned or vector-partitioned modes. Variables declared implicitly in scoped blocks and those declared "private" on enclosing directives (e.g. "acc parallel") are both handled. Variables that are e.g. gang-private can then be adjusted so they reside in GPU shared memory. The reason for doing this is twofold: correct implementation of OpenACC semantics, and optimisation, since shared memory might be faster than the main memory on a GPU.
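As an illustration of the kind of code this affects (a hypothetical example, not from the patch): the gang-private variable w below is written in worker-single mode and then read by every partitioned worker of the same gang, so it must live somewhere all workers of a gang can see, such as GPU shared memory:

extern int f (int);

void
kernel (int *out, int n, int m)
{
  int w;
#pragma acc parallel loop gang private(w) copyout(out[0:n * m])
  for (int i = 0; i < n; i++)
    {
      w = f (i);                 /* set in worker-single mode */
#pragma acc loop worker
      for (int j = 0; j < m; j++)
        out[i * m + j] = w + j;  /* used in worker-partitioned mode */
    }
}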
Handling of private variables is intimately tied to the execution model for gangs/workers/vectors implemented by a particular target: for current targets, we use (or on mainline, will soon use) a broadcasting/neutering scheme. That is sufficient for code that e.g. sets a variable in worker-single mode and expects to use the value in worker-partitioned mode. The difficulty (semantics-wise) comes when the user wants to do something like an atomic operation in worker-partitioned mode and expects a worker-single (gang private) variable to be shared across each partitioned worker. Forcing use of shared memory for such variables makes that work properly. In terms of implementation, the parallelism level of a given loop is not fixed until the oaccdevlow pass in the offload compiler, so the patch delays fixing the parallelism level of variables declared on or within such loops until the same point. This is done by adding a new internal UNIQUE function (OACC_PRIVATE) that lists (the address of) each private variable as an argument, and other arguments set so as to be able to determine the correct parallelism level to use for the listed variables. This new internal function fits into the existing scheme for demarcating OpenACC loops, as described in comments in the patch. Two new target hooks are introduced: TARGET_GOACC_ADJUST_PRIVATE_DECL and TARGET_GOACC_EXPAND_VAR_DECL. The first can tweak a variable declaration at oaccdevlow time, and the second at expand time. The first or both of these target hooks can be used by a given offload target, depending on its strategy for implementing private variables. This patch updates the TARGET_GOACC_ADJUST_PRIVATE_DECL target hook in the AMD GCN backend to the current name and prototype. (An earlier version of the hook was already present, but dormant.) gcc/ PR middle-end/90115 * doc/tm.texi.in (TARGET_GOACC_EXPAND_VAR_DECL) (TARGET_GOACC_ADJUST_PRIVATE_DECL): Add documentation hooks. * doc/tm.texi: Regenerate. * expr.c (expand_expr_real_1): Expand decls using the expand_var_decl OpenACC hook if defined. * internal-fn.c (expand_UNIQUE): Handle IFN_UNIQUE_OACC_PRIVATE. * internal-fn.h (IFN_UNIQUE_CODES): Add OACC_PRIVATE. * omp-low.c (omp_context): Add oacc_privatization_candidates field. (lower_oacc_reductions): Add PRIVATE_MARKER parameter. Insert before fork. (lower_oacc_head_tail): Add PRIVATE_MARKER parameter. Modify private marker's gimple call arguments, and pass it to lower_oacc_reductions. (oacc_privatization_scan_clause_chain) (oacc_privatization_scan_decl_chain, lower_oacc_private_marker): New functions. (lower_omp_for, lower_omp_target, lower_omp_1): Use these. * omp-offload.c (convert.h): Include. (oacc_loop_xform_head_tail): Treat private-variable markers like fork/join when transforming head/tail sequences. (struct var_decl_rewrite_info): Add struct. (oacc_rewrite_var_decl, is_sync_builtin_call): New functions. (execute_oacc_device_lower): Support rewriting gang-private variables using target hook, and fix up addr_expr and var_decl nodes afterwards. * target.def (adjust_private_decl, expand_var_decl): New hooks. * config/gcn/gcn-protos.h (gcn_goacc_adjust_gangprivate_decl): Rename to... (gcn_goacc_adjust_private_decl): ...this. * config/gcn/gcn-tree.c (gcn_goacc_adjust_gangprivate_decl): Rename to... (gcn_goacc_adjust_private_decl): ...this. Add LEVEL parameter. * config/gcn/gcn.c (TARGET_GOACC_ADJUST_GANGPRIVATE_DECL): Rename definition using gcn_goacc_adjust_gangprivate_decl... 
(TARGET_GOACC_ADJUST_PRIVATE_DECL): ...to this, using gcn_goacc_adjust_private_decl. * config/nvptx/nvptx.c (tree-pretty-print.h): Include. (gang_private_shared_size): New global variable. (gang_private_shared_align): Likewise. (gang_private_shared_sym): Likewise. (gang_private_shared_hmap): Likewise. (nvptx_option_override): Initialize these. (nvptx_file_end): Output gang_private_shared_sym. (nvptx_goacc_adjust_private_decl, nvptx_goacc_expand_var_decl): New functions. (nvptx_set_current_function): Clear gang_private_shared_hmap. (TARGET_GOACC_ADJUST_PRIVATE_DECL): Define hook. (TARGET_GOACC_EXPAND_VAR_DECL): Likewise. libgomp/ PR middle-end/90115 * testsuite/libgomp.oacc-c-c++-common/private-atomic-1-gang.c: New test. * testsuite/libgomp.oacc-fortran/private-atomic-1-gang.f90: Likewise. * testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90: Likewise. Co-Authored-By: Chung-Lin Tang Co-Authored-By: Thomas Schwinge --- gcc/expr.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index ba61eb9..e4660f0 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -10419,8 +10419,19 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, exp = SSA_NAME_VAR (ssa_name); goto expand_decl_rtl; - case PARM_DECL: case VAR_DECL: + /* Allow accel compiler to handle variables that require special + treatment, e.g. if they have been modified in some way earlier in + compilation by the adjust_private_decl OpenACC hook. */ + if (flag_openacc && targetm.goacc.expand_var_decl) + { + temp = targetm.goacc.expand_var_decl (exp); + if (temp) + return temp; + } + /* ... fall through ... */ + + case PARM_DECL: /* If a static var's type was incomplete when the decl was written, but the type is complete now, lay out the decl now. */ if (DECL_SIZE (exp) == 0 -- cgit v1.1 From 008153c8435ca3bf587e11654c31f05c0f99b43a Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 15 Jun 2021 11:36:47 +0200 Subject: expr: Fix up VEC_PACK_TRUNC_EXPR expansion [PR101046] The following testcase ICEs, because we have a mode mismatch. VEC_PACK_TRUNC_EXPR's operands have different modes from the result (same vector mode size but twice as large element), but we were passing non-NULL subtarget with the mode of the result to the expansion of its arguments, so the VEC_PERM_EXPR in one of the operands which had V8SImode operands and result had V16HImode target. Fixed by clearing the subtarget if we are changing mode. 2021-06-15 Jakub Jelinek PR target/101046 * expr.c (expand_expr_real_2) : Clear subtarget when changing mode. * gcc.target/i386/pr101046.c: New test. --- gcc/expr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index e4660f0..231487f 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -9998,6 +9998,7 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, case VEC_PACK_SAT_EXPR: case VEC_PACK_FIX_TRUNC_EXPR: mode = TYPE_MODE (TREE_TYPE (treeop0)); + subtarget = NULL_RTX; goto binop; case VEC_PACK_TRUNC_EXPR: @@ -10021,6 +10022,7 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, return eops[0].value; } mode = TYPE_MODE (TREE_TYPE (treeop0)); + subtarget = NULL_RTX; goto binop; case VEC_PACK_FLOAT_EXPR: -- cgit v1.1 From 967b46530234b4e6ad3983057705aea6c20a03c4 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 21 May 2021 11:56:55 -0700 Subject: Add a target calls hook: TARGET_PUSH_ARGUMENT 1. 
Replace PUSH_ARGS with a target calls hook, TARGET_PUSH_ARGUMENT, which takes an integer argument. When it returns true, push instructions will be used to pass outgoing arguments. If the argument is nonzero, it is the number of bytes to push and indicates the PUSH instruction usage is optional so that the backend can decide if PUSH instructions should be generated. Otherwise, the argument is zero. 2. Implement x86 target hook which returns false when the number of bytes to push is no less than 16 (8 for 32-bit targets) if vector load and store can be used. 3. Remove target PUSH_ARGS definitions which return 0 as it is the same as the default. 4. Define TARGET_PUSH_ARGUMENT of cr16 and m32c to always return true. gcc/ PR target/100704 * calls.c (expand_call): Replace PUSH_ARGS with targetm.calls.push_argument (0). (emit_library_call_value_1): Likewise. * defaults.h (PUSH_ARGS): Removed. (PUSH_ARGS_REVERSED): Replace PUSH_ARGS with targetm.calls.push_argument (0). * expr.c (block_move_libcall_safe_for_call_parm): Likewise. (emit_push_insn): Pass the number bytes to push to targetm.calls.push_argument and pass 0 if ARGS_ADDR is 0. * hooks.c (hook_bool_uint_true): New. * hooks.h (hook_bool_uint_true): Likewise. * rtlanal.c (nonzero_bits1): Replace PUSH_ARGS with targetm.calls.push_argument (0). * target.def (push_argument): Add a targetm.calls hook. * targhooks.c (default_push_argument): New. * targhooks.h (default_push_argument): Likewise. * config/bpf/bpf.h (PUSH_ARGS): Removed. * config/cr16/cr16.c (TARGET_PUSH_ARGUMENT): New. * config/cr16/cr16.h (PUSH_ARGS): Removed. * config/i386/i386.c (ix86_push_argument): New. (TARGET_PUSH_ARGUMENT): Likewise. * config/i386/i386.h (PUSH_ARGS): Removed. * config/m32c/m32c.c (TARGET_PUSH_ARGUMENT): New. * config/m32c/m32c.h (PUSH_ARGS): Removed. * config/nios2/nios2.h (PUSH_ARGS): Likewise. * config/pru/pru.h (PUSH_ARGS): Likewise. * doc/tm.texi.in: Remove PUSH_ARGS documentation. Add TARGET_PUSH_ARGUMENT hook. * doc/tm.texi: Regenerated. gcc/testsuite/ PR target/100704 * gcc.target/i386/pr100704-1.c: New test. * gcc.target/i386/pr100704-2.c: Likewise. * gcc.target/i386/pr100704-3.c: Likewise. --- gcc/expr.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 231487f..025033c 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -1823,7 +1823,7 @@ block_move_libcall_safe_for_call_parm (void) tree fn; /* If arguments are pushed on the stack, then they're safe. */ - if (PUSH_ARGS) + if (targetm.calls.push_argument (0)) return true; /* If registers go on the stack anyway, any argument is sure to clobber @@ -4639,11 +4639,19 @@ emit_push_insn (rtx x, machine_mode mode, tree type, rtx size, skip = (reg_parm_stack_space == 0) ? 0 : used; #ifdef PUSH_ROUNDING + /* NB: Let the backend known the number of bytes to push and + decide if push insns should be generated. */ + unsigned int push_size; + if (CONST_INT_P (size)) + push_size = INTVAL (size); + else + push_size = 0; + /* Do it with several push insns if that doesn't take lots of insns and if there is no difficulty with push insns that skip bytes on the stack for alignment purposes. 
*/ if (args_addr == 0 - && PUSH_ARGS + && targetm.calls.push_argument (push_size) && CONST_INT_P (size) && skip == 0 && MEM_ALIGN (xinner) >= align @@ -4848,7 +4856,7 @@ emit_push_insn (rtx x, machine_mode mode, tree type, rtx size, anti_adjust_stack (gen_int_mode (extra, Pmode)); #ifdef PUSH_ROUNDING - if (args_addr == 0 && PUSH_ARGS) + if (args_addr == 0 && targetm.calls.push_argument (0)) emit_single_push_insn (mode, x, type); else #endif -- cgit v1.1 From 52c3fdf3e4780f75297515d3c2a3dae9b36586ba Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 2 Jul 2021 10:03:48 -0700 Subject: Don't use vec_duplicate on vector in CTOR expansion Since vec_duplicate only works on scalar, don't use it on vector in store constructor expansion. gcc/ PR middle-end/101294 * expr.c (store_constructor): Don't use vec_duplicate on vector. gcc/testsuite/ PR middle-end/101294 * gcc.dg/pr101294.c: New test. --- gcc/expr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 025033c..bd85bbf 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -7078,7 +7078,8 @@ store_constructor (tree exp, rtx target, int cleared, poly_int64 size, && eltmode == GET_MODE_INNER (mode) && ((icode = optab_handler (vec_duplicate_optab, mode)) != CODE_FOR_nothing) - && (elt = uniform_vector_p (exp))) + && (elt = uniform_vector_p (exp)) + && !VECTOR_TYPE_P (TREE_TYPE (elt))) { class expand_operand ops[2]; create_output_operand (&ops[0], target, mode); -- cgit v1.1 From 6d3bab5d5adb3e28ddb16c97b0831efdea23cf7d Mon Sep 17 00:00:00 2001 From: Martin Sebor Date: Tue, 6 Jul 2021 13:41:02 -0600 Subject: Improve warning suppression for inlined functions. Resolves: PR middle-end/98871 - Cannot silence -Wmaybe-uninitialized at declaration site PR middle-end/98512 - #pragma GCC diagnostic ignored ineffective in conjunction with alias attribute gcc/ChangeLog: * builtins.c (warn_string_no_nul): Remove %G. (maybe_warn_for_bound): Same. (warn_for_access): Same. (check_access): Same. (check_strncat_sizes): Same. (expand_builtin_strncat): Same. (expand_builtin_strncmp): Same. (expand_builtin): Same. (expand_builtin_object_size): Same. (warn_dealloc_offset): Same. (maybe_emit_free_warning): Same. * calls.c (maybe_warn_alloc_args_overflow): Same. (maybe_warn_nonstring_arg): Same. (maybe_warn_rdwr_sizes): Same. * expr.c (expand_expr_real_1): Remove %K. * gimple-fold.c (gimple_fold_builtin_strncpy): Remove %G. (gimple_fold_builtin_strncat): Same. * gimple-ssa-sprintf.c (format_directive): Same. (handle_printf_call): Same. * gimple-ssa-warn-alloca.c (pass_walloca::execute): Same. * gimple-ssa-warn-restrict.c (maybe_diag_overlap): Same. (maybe_diag_access_bounds): Same. Call gimple_location. (check_bounds_or_overlap): Same. * trans-mem.c (ipa_tm_scan_irr_block): Remove %K. Simplify. * tree-ssa-ccp.c (pass_post_ipa_warn::execute): Remove %G. * tree-ssa-strlen.c (maybe_warn_overflow): Same. (maybe_diag_stxncpy_trunc): Same. (handle_builtin_stxncpy_strncat): Same. (maybe_warn_pointless_strcmp): Same. * tree-ssa-uninit.c (maybe_warn_operand): Same. gcc/testsuite/ChangeLog: * gcc.dg/Wobjsize-1.c: Prune expected output. * gcc.dg/Warray-bounds-71.c: New test. * gcc.dg/Warray-bounds-71.h: New test header. * gcc.dg/Warray-bounds-72.c: New test. * gcc.dg/Warray-bounds-73.c: New test. * gcc.dg/Warray-bounds-74.c: New test. * gcc.dg/Warray-bounds-75.c: New test. * gcc.dg/Wfree-nonheap-object-4.c: Adjust expected output. * gcc.dg/Wfree-nonheap-object-5.c: New test. 
* gcc.dg/Wfree-nonheap-object-6.c: New test. * gcc.dg/pragma-diag-10.c: New test. * gcc.dg/pragma-diag-9.c: New test. * gcc.dg/uninit-suppress_3.c: New test. * gcc.dg/pr79214.c: Xfail tests. * gcc.dg/tree-ssa/builtin-sprintf-warn-27.c: New test. * gcc.dg/format/c90-printf-1.c: Adjust expected output. --- gcc/expr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index bd85bbf..6a43681 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -11402,7 +11402,7 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, /* All valid uses of __builtin_va_arg_pack () are removed during inlining. */ if (CALL_EXPR_VA_ARG_PACK (exp)) - error ("%Kinvalid use of %<__builtin_va_arg_pack ()%>", exp); + error ("invalid use of %<__builtin_va_arg_pack ()%>"); { tree fndecl = get_callee_fndecl (exp), attr; @@ -11414,7 +11414,7 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, DECL_ATTRIBUTES (fndecl))) != NULL) { const char *ident = lang_hooks.decl_printable_name (fndecl, 1); - error ("%Kcall to %qs declared with attribute error: %s", exp, + error ("call to %qs declared with attribute error: %s", identifier_to_locale (ident), TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr)))); } @@ -11426,10 +11426,10 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, DECL_ATTRIBUTES (fndecl))) != NULL) { const char *ident = lang_hooks.decl_printable_name (fndecl, 1); - warning_at (tree_nonartificial_location (exp), + warning_at (EXPR_LOCATION (exp), OPT_Wattribute_warning, - "%Kcall to %qs declared with attribute warning: %s", - exp, identifier_to_locale (ident), + "call to %qs declared with attribute warning: %s", + identifier_to_locale (ident), TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr)))); } -- cgit v1.1 From e5e164effa30fd2b5c5bc3e6883d63889e96d8da Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sun, 6 Mar 2016 06:38:21 -0800 Subject: Add QI vector mode support to by-pieces for memset 1. Replace scalar_int_mode with fixed_size_mode in the by-pieces infrastructure to allow non-integer mode. 2. Rename widest_int_mode_for_size to widest_fixed_size_mode_for_size to return QI vector mode for memset. 3. Add op_by_pieces_d::smallest_fixed_size_mode_for_size to return the smallest integer or QI vector mode. 4. Remove clear_by_pieces_1 and use builtin_memset_read_str in clear_by_pieces to support vector mode broadcast. 5. Add lowpart_subreg_regno, a wrapper around simplify_subreg_regno that uses subreg_lowpart_offset (mode, prev_mode) as the offset. 6. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard scratch register to avoid stack realignment when expanding memset. gcc/ PR middle-end/90773 * builtins.c (builtin_memcpy_read_str): Change the mode argument from scalar_int_mode to fixed_size_mode. (builtin_strncpy_read_str): Likewise. (gen_memset_value_from_prev): New function. (builtin_memset_read_str): Change the mode argument from scalar_int_mode to fixed_size_mode. Use gen_memset_value_from_prev and support CONST_VECTOR. (builtin_memset_gen_str): Likewise. (try_store_by_multiple_pieces): Use by_pieces_constfn to declare constfun. * builtins.h (builtin_strncpy_read_str): Replace scalar_int_mode with fixed_size_mode. (builtin_memset_read_str): Likewise. * expr.c (widest_int_mode_for_size): Renamed to ... (widest_fixed_size_mode_for_size): Add a bool argument to indicate if QI vector mode can be used. (by_pieces_ninsns): Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. 
(pieces_addr::adjust): Change the mode argument from scalar_int_mode to fixed_size_mode. (op_by_pieces_d): Make m_len read-only. Add a bool member, m_qi_vector_mode, to indicate that QI vector mode can be used. (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize m_qi_vector_mode. Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. (op_by_pieces_d::get_usable_mode): Change the mode argument from scalar_int_mode to fixed_size_mode. Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. (op_by_pieces_d::smallest_fixed_size_mode_for_size): New member function to return the smallest integer or QI vector mode. (op_by_pieces_d::run): Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. Call smallest_fixed_size_mode_for_size instead of smallest_int_mode_for_size. (store_by_pieces_d::store_by_pieces_d): Add a bool argument to indicate that QI vector mode can be used and pass it to op_by_pieces_d::op_by_pieces_d. (can_store_by_pieces): Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. Pass memsetp to widest_fixed_size_mode_for_size to support QI vector mode. Allow all CONST_VECTORs for memset if vec_duplicate is supported. (store_by_pieces): Pass memsetp to store_by_pieces_d::store_by_pieces_d. (clear_by_pieces_1): Removed. (clear_by_pieces): Replace clear_by_pieces_1 with builtin_memset_read_str and pass true to store_by_pieces_d to support vector mode broadcast. (string_cst_read_str): Change the mode argument from scalar_int_mode to fixed_size_mode. * expr.h (by_pieces_constfn): Change scalar_int_mode to fixed_size_mode. (by_pieces_prev): Likewise. * rtl.h (lowpart_subreg_regno): New. * rtlanal.c (lowpart_subreg_regno): New. A wrapper around simplify_subreg_regno. * target.def (gen_memset_scratch_rtx): New hook. * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. * doc/tm.texi: Regenerated. gcc/testsuite/ * gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of vmovdqu. * gcc.target/i386/pr100865-4b.c: Likewise. --- gcc/expr.c | 172 ++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 120 insertions(+), 52 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 6a43681..b65cfcf 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align) return align; } -/* Return the widest integer mode that is narrower than SIZE bytes. */ +/* Return the widest QI vector, if QI_MODE is true, or integer mode + that is narrower than SIZE bytes. */ -static scalar_int_mode -widest_int_mode_for_size (unsigned int size) +static fixed_size_mode +widest_fixed_size_mode_for_size (unsigned int size, bool qi_vector) { - scalar_int_mode result = NARROWEST_INT_MODE; + fixed_size_mode result = NARROWEST_INT_MODE; gcc_checking_assert (size > 1); + /* Use QI vector only if size is wider than a WORD. 
*/ + if (qi_vector && size > UNITS_PER_WORD) + { + machine_mode mode; + fixed_size_mode candidate; + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) + if (is_a (mode, &candidate) + && GET_MODE_INNER (candidate) == QImode) + { + if (GET_MODE_SIZE (candidate) >= size) + break; + if (optab_handler (vec_duplicate_optab, candidate) + != CODE_FOR_nothing) + result = candidate; + } + + if (result != NARROWEST_INT_MODE) + return result; + } + opt_scalar_int_mode tmode; FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT) if (GET_MODE_SIZE (tmode.require ()) < size) @@ -815,13 +836,14 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, unsigned int max_size, by_pieces_operation op) { unsigned HOST_WIDE_INT n_insns = 0; - scalar_int_mode mode; + fixed_size_mode mode; if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES) { /* NB: Round up L and ALIGN to the widest integer mode for MAX_SIZE. */ - mode = widest_int_mode_for_size (max_size); + mode = widest_fixed_size_mode_for_size (max_size, + op == SET_BY_PIECES); if (optab_handler (mov_optab, mode) != CODE_FOR_nothing) { unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode)); @@ -835,7 +857,8 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, while (max_size > 1 && l > 0) { - mode = widest_int_mode_for_size (max_size); + mode = widest_fixed_size_mode_for_size (max_size, + op == SET_BY_PIECES); enum insn_code icode; unsigned int modesize = GET_MODE_SIZE (mode); @@ -903,8 +926,7 @@ class pieces_addr void *m_cfndata; public: pieces_addr (rtx, bool, by_pieces_constfn, void *); - rtx adjust (scalar_int_mode, HOST_WIDE_INT, - by_pieces_prev * = nullptr); + rtx adjust (fixed_size_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr); void increment_address (HOST_WIDE_INT); void maybe_predec (HOST_WIDE_INT); void maybe_postinc (HOST_WIDE_INT); @@ -1006,7 +1028,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse, but we still modify the MEM's properties. */ rtx -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset, +pieces_addr::adjust (fixed_size_mode mode, HOST_WIDE_INT offset, by_pieces_prev *prev) { if (m_constfn) @@ -1060,11 +1082,14 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size) class op_by_pieces_d { private: - scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int); + fixed_size_mode get_usable_mode (fixed_size_mode, unsigned int); + fixed_size_mode smallest_fixed_size_mode_for_size (unsigned int); protected: pieces_addr m_to, m_from; - unsigned HOST_WIDE_INT m_len; + /* Make m_len read-only so that smallest_fixed_size_mode_for_size can + use it to check the valid mode size. */ + const unsigned HOST_WIDE_INT m_len; HOST_WIDE_INT m_offset; unsigned int m_align; unsigned int m_max_size; @@ -1073,6 +1098,8 @@ class op_by_pieces_d bool m_push; /* True if targetm.overlap_op_by_pieces_p () returns true. */ bool m_overlap_op_by_pieces; + /* True if QI vector mode can be used. */ + bool m_qi_vector_mode; /* Virtual functions, overriden by derived classes for the specific operation. 
@@ -1084,7 +1111,8 @@ class op_by_pieces_d
 
 public:
   op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
-		  unsigned HOST_WIDE_INT, unsigned int, bool);
+		  unsigned HOST_WIDE_INT, unsigned int, bool,
+		  bool = false);
   void run ();
 };
 
@@ -1099,11 +1127,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
				by_pieces_constfn from_cfn,
				void *from_cfn_data,
				unsigned HOST_WIDE_INT len,
-				unsigned int align, bool push)
+				unsigned int align, bool push,
+				bool qi_vector_mode)
   : m_to (to, to_load, NULL, NULL),
     m_from (from, from_load, from_cfn, from_cfn_data),
     m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
-    m_push (push)
+    m_push (push), m_qi_vector_mode (qi_vector_mode)
 {
   int toi = m_to.get_addr_inc ();
   int fromi = m_from.get_addr_inc ();
@@ -1124,7 +1153,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
       if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2)
	{
	  /* Find the mode of the largest comparison.  */
-	  scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
+	  fixed_size_mode mode
+	    = widest_fixed_size_mode_for_size (m_max_size,
+					       m_qi_vector_mode);
 
	  m_from.decide_autoinc (mode, m_reverse, len);
	  m_to.decide_autoinc (mode, m_reverse, len);
@@ -1139,8 +1170,8 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
 /* This function returns the largest usable integer mode for LEN
    bytes whose size is no bigger than size of MODE.  */
 
-scalar_int_mode
-op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
+fixed_size_mode
+op_by_pieces_d::get_usable_mode (fixed_size_mode mode, unsigned int len)
 {
   unsigned int size;
   do
@@ -1148,13 +1179,42 @@ op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
       size = GET_MODE_SIZE (mode);
       if (len >= size && prepare_mode (mode, m_align))
	break;
-      /* NB: widest_int_mode_for_size checks SIZE > 1.  */
-      mode = widest_int_mode_for_size (size);
+      /* widest_fixed_size_mode_for_size checks SIZE > 1.  */
+      mode = widest_fixed_size_mode_for_size (size, m_qi_vector_mode);
     }
   while (1);
   return mode;
 }
 
+/* Return the smallest integer or QI vector mode that is not narrower
+   than SIZE bytes.  */
+
+fixed_size_mode
+op_by_pieces_d::smallest_fixed_size_mode_for_size (unsigned int size)
+{
+  /* Use QI vector only for > size of WORD.  */
+  if (m_qi_vector_mode && size > UNITS_PER_WORD)
+    {
+      machine_mode mode;
+      fixed_size_mode candidate;
+      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
+	if (is_a <fixed_size_mode> (mode, &candidate)
+	    && GET_MODE_INNER (candidate) == QImode)
+	  {
+	    /* Don't return a mode wider than M_LEN.  */
+	    if (GET_MODE_SIZE (candidate) > m_len)
+	      break;
+
+	    if (GET_MODE_SIZE (candidate) >= size
+		&& (optab_handler (vec_duplicate_optab, candidate)
+		    != CODE_FOR_nothing))
+	      return candidate;
+	  }
+    }
+
+  return smallest_int_mode_for_size (size * BITS_PER_UNIT);
+}
+
 /* This function contains the main loop used for expanding a block
    operation.  First move what we can in the largest integer mode,
    then go to successively smaller modes.  For every access, call
@@ -1166,9 +1226,12 @@ op_by_pieces_d::run ()
   if (m_len == 0)
     return;
 
-  /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1.  */
-  scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
-  mode = get_usable_mode (mode, m_len);
+  unsigned HOST_WIDE_INT length = m_len;
+
+  /* widest_fixed_size_mode_for_size checks M_MAX_SIZE > 1.  */
+  fixed_size_mode mode
+    = widest_fixed_size_mode_for_size (m_max_size, m_qi_vector_mode);
+  mode = get_usable_mode (mode, length);
 
   by_pieces_prev to_prev = { nullptr, mode };
   by_pieces_prev from_prev = { nullptr, mode };
@@ -1178,7 +1241,7 @@ op_by_pieces_d::run ()
       unsigned int size = GET_MODE_SIZE (mode);
       rtx to1 = NULL_RTX, from1;
 
-      while (m_len >= size)
+      while (length >= size)
	{
	  if (m_reverse)
	    m_offset -= size;
@@ -1201,22 +1264,22 @@ op_by_pieces_d::run ()
	  if (!m_reverse)
	    m_offset += size;
 
-	  m_len -= size;
+	  length -= size;
	}
 
       finish_mode (mode);
 
-      if (m_len == 0)
+      if (length == 0)
	return;
 
       if (!m_push && m_overlap_op_by_pieces)
	{
	  /* NB: Generate overlapping operations if it is not a stack
	     push since stack push must not overlap.  Get the smallest
-	     integer mode for M_LEN bytes.  */
-	  mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);
+	     fixed size mode for M_LEN bytes.  */
+	  mode = smallest_fixed_size_mode_for_size (length);
	  mode = get_usable_mode (mode, GET_MODE_SIZE (mode));
-	  int gap = GET_MODE_SIZE (mode) - m_len;
+	  int gap = GET_MODE_SIZE (mode) - length;
	  if (gap > 0)
	    {
	      /* If size of MODE > M_LEN, generate the last operation
@@ -1226,20 +1289,21 @@ op_by_pieces_d::run ()
		m_offset += gap;
	      else
		m_offset -= gap;
-	      m_len += gap;
+	      length += gap;
	    }
	}
       else
	{
-	  /* NB: widest_int_mode_for_size checks SIZE > 1.  */
-	  mode = widest_int_mode_for_size (size);
-	  mode = get_usable_mode (mode, m_len);
+	  /* widest_fixed_size_mode_for_size checks SIZE > 1.  */
+	  mode = widest_fixed_size_mode_for_size (size,
+						  m_qi_vector_mode);
+	  mode = get_usable_mode (mode, length);
	}
     }
   while (1);
 
   /* The code above should have handled everything.  */
-  gcc_assert (!m_len);
+  gcc_assert (!length);
 }
 
 /* Derived class from op_by_pieces_d, providing support for block move
@@ -1355,9 +1419,10 @@ class store_by_pieces_d : public op_by_pieces_d
 
 public:
   store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
-		     unsigned HOST_WIDE_INT len, unsigned int align)
+		     unsigned HOST_WIDE_INT len, unsigned int align,
+		     bool qi_vector_mode)
     : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
-		      align, false)
+		      align, false, qi_vector_mode)
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1446,7 +1511,8 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
       max_size = STORE_MAX_PIECES + 1;
       while (max_size > 1 && l > 0)
	{
-	  scalar_int_mode mode = widest_int_mode_for_size (max_size);
+	  fixed_size_mode mode
+	    = widest_fixed_size_mode_for_size (max_size, memsetp);
 
	  icode = optab_handler (mov_optab, mode);
	  if (icode != CODE_FOR_nothing
@@ -1460,7 +1526,11 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
		  offset -= size;
 
		  cst = (*constfun) (constfundata, nullptr, offset, mode);
-		  if (!targetm.legitimate_constant_p (mode, cst))
+		  /* All CONST_VECTORs can be loaded for memset since
+		     vec_duplicate_optab is a precondition to pick a
+		     vector mode for the memset expander.  */
+		  if (!((memsetp && VECTOR_MODE_P (mode))
+			|| targetm.legitimate_constant_p (mode, cst)))
		    return 0;
 
		  if (!reverse)
@@ -1504,7 +1574,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
			    memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
			    optimize_insn_for_speed_p ()));
 
-  store_by_pieces_d data (to, constfun, constfundata, len, align);
+  store_by_pieces_d data (to, constfun, constfundata, len, align,
+			  memsetp);
   data.run ();
 
   if (retmode != RETURN_BEGIN)
@@ -1513,15 +1584,6 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
   return to;
 }
 
-/* Callback routine for clear_by_pieces.
-   Return const0_rtx unconditionally.  */
-
-static rtx
-clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode)
-{
-  return const0_rtx;
-}
-
 /* Generate several move instructions to clear LEN bytes of block TO.  (A MEM
    rtx with BLKmode).  ALIGN is maximum alignment we can assume.  */
 
@@ -1531,7 +1593,10 @@ clear_by_pieces (rtx to, unsigned HOST_WIDE_INT len, unsigned int align)
   if (len == 0)
     return;
 
-  store_by_pieces_d data (to, clear_by_pieces_1, NULL, len, align);
+  /* Use builtin_memset_read_str to support vector mode broadcast.  */
+  char c = 0;
+  store_by_pieces_d data (to, builtin_memset_read_str, &c, len, align,
+			  true);
   data.run ();
 }
 
@@ -5754,7 +5819,7 @@ emit_storent_insn (rtx to, rtx from)
 
 static rtx
 string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
-		     scalar_int_mode mode)
+		     fixed_size_mode mode)
 {
   tree str = (tree) data;
 
@@ -5769,10 +5834,13 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
       size_t l = TREE_STRING_LENGTH (str) - offset;
       memcpy (p, TREE_STRING_POINTER (str) + offset, l);
       memset (p + l, '\0', GET_MODE_SIZE (mode) - l);
-      return c_readstr (p, mode, false);
+      return c_readstr (p, as_a <scalar_int_mode> (mode), false);
     }
 
-  return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false);
+  /* The by-pieces infrastructure does not try to pick a vector mode
+     for storing STRING_CST.  */
+  return c_readstr (TREE_STRING_POINTER (str) + offset,
+		    as_a <scalar_int_mode> (mode), false);
 }
 
 /* Generate code for computing expression EXP,
-- 
cgit v1.1
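Illustration (not from the commits above): a hedged sketch of the
user-visible effect of the QI vector support added by the previous
commit.  With a QImode vector mode available (for example V32QI with
-mavx2, or V64QI with -mavx512bw, on x86_64), a fixed-size memset like
the one below can now be expanded by pieces as a single broadcast
vector store (e.g. vmovdqu8, per the testsuite change above) instead
of a series of scalar integer stores.  The function name, fill value
and size are illustrative assumptions, not taken from the commit.

	/* Compiled with something like -O2 -mavx512bw on x86_64, the
	   32-byte memset below is a SET_BY_PIECES candidate that can
	   now use a QI vector broadcast store.  */
	void
	set_flags (char *p)
	{
	  __builtin_memset (p, 0x55, 32);
	}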
From f2e5d2717d9e249edc5e0d45e49e4f9ef81fc694 Mon Sep 17 00:00:00 2001
From: "H.J. Lu"
Date: Tue, 3 Aug 2021 06:17:22 -0700
Subject: by_pieces: Pass MAX_PIECES to op_by_pieces_d

Pass MAX_PIECES to op_by_pieces_d::op_by_pieces_d for move, store and
compare.

	PR target/101742
	* expr.c (op_by_pieces_d::op_by_pieces_d): Add a max_pieces
	argument to set m_max_size.
	(move_by_pieces_d): Pass MOVE_MAX_PIECES to op_by_pieces_d.
	(store_by_pieces_d): Pass STORE_MAX_PIECES to op_by_pieces_d.
	(compare_by_pieces_d): Pass COMPARE_MAX_PIECES to
	op_by_pieces_d.
---
 gcc/expr.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'gcc/expr.c')

diff --git a/gcc/expr.c b/gcc/expr.c
index b65cfcf..096c031 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1110,8 +1110,8 @@ class op_by_pieces_d
   }
 
 public:
-  op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
-		  unsigned HOST_WIDE_INT, unsigned int, bool,
+  op_by_pieces_d (unsigned int, rtx, bool, rtx, bool, by_pieces_constfn,
+		  void *, unsigned HOST_WIDE_INT, unsigned int, bool,
		  bool = false);
   void run ();
 };
@@ -1120,10 +1120,12 @@ class op_by_pieces_d
    objects named TO and FROM, which are identified as loads or stores
    by TO_LOAD and FROM_LOAD.  If FROM is a load, the optional FROM_CFN
    and its associated FROM_CFN_DATA can be used to replace loads with
-   constant values.  LEN describes the length of the operation.  */
+   constant values.  MAX_PIECES describes the maximum number of bytes
+   at a time which can be moved efficiently.  LEN describes the length
+   of the operation.  */
 
-op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
-				rtx from, bool from_load,
+op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to,
+				bool to_load, rtx from, bool from_load,
				by_pieces_constfn from_cfn,
				void *from_cfn_data,
				unsigned HOST_WIDE_INT len,
@@ -1131,7 +1133,7 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
				bool qi_vector_mode)
   : m_to (to, to_load, NULL, NULL),
     m_from (from, from_load, from_cfn, from_cfn_data),
-    m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
+    m_len (len), m_max_size (max_pieces + 1),
     m_push (push), m_qi_vector_mode (qi_vector_mode)
 {
   int toi = m_to.get_addr_inc ();
@@ -1324,8 +1326,8 @@ class move_by_pieces_d : public op_by_pieces_d
 public:
   move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len,
		    unsigned int align)
-    : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align,
-		      PUSHG_P (to))
+    : op_by_pieces_d (MOVE_MAX_PIECES, to, false, from, true, NULL,
+		      NULL, len, align, PUSHG_P (to))
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1421,8 +1423,8 @@ class store_by_pieces_d : public op_by_pieces_d
   store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
		     unsigned HOST_WIDE_INT len, unsigned int align,
		     bool qi_vector_mode)
-    : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
-		      align, false, qi_vector_mode)
+    : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true, cfn,
+		      cfn_data, len, align, false, qi_vector_mode)
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1618,8 +1620,8 @@ class compare_by_pieces_d : public op_by_pieces_d
   compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn,
		       void *op1_cfn_data, HOST_WIDE_INT len, int align,
		       rtx_code_label *fail_label)
-    : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len,
-		      align, false)
+    : op_by_pieces_d (COMPARE_MAX_PIECES, op0, true, op1, true, op1_cfn,
+		      op1_cfn_data, len, align, false)
   {
     m_fail_label = fail_label;
   }
-- 
cgit v1.1
From cad36f38576a6a781e3c62ab061c68f5b8dab13a Mon Sep 17 00:00:00 2001
From: Roger Sayle
Date: Tue, 31 Aug 2021 11:45:07 +0100
Subject: Preserve SUBREG_PROMOTED_VAR_P on (extend:HI (subreg/s:QI (reg:SI))).

SUBREG_PROMOTED_VAR_P is a mechanism for tracking that a partial subreg
is correctly zero-extended or sign-extended in the parent register.  For
example, the RTL (subreg/s/v:QI (reg/v:SI 23 [ x ]) 0) indicates that the
byte x is zero extended in reg:SI 23, which is useful for optimization.
An example is that zero extending the above QImode value to HImode can
simply use a wider subreg, i.e. (subreg:HI (reg/v:SI 23 [ x ]) 0).

This patch addresses the oversight/missed optimization opportunity that
the new HImode subreg above should retain its SUBREG_PROMOTED_VAR_P
annotation, as its value is guaranteed to be correctly extended in the
SImode parent.  The code below to preserve SUBREG_PROMOTED_VAR_P is
already present in the middle-end (e.g. simplify-rtx.c:7232-7242) but
missing from one or two (precisely three) places that (accidentally)
strip it.

Whilst there I also added another optimization.  If we need to extend
the above QImode value beyond the SImode register holding it, say to
DImode, we can eliminate the SUBREG and simply extend from the SImode
register to DImode.

2021-08-31  Roger Sayle

gcc/ChangeLog
	* expr.c (convert_modes): Preserve SUBREG_PROMOTED_VAR_P when
	creating a (wider) partial subreg from a SUBREG_PROMOTED_VAR_P
	subreg.
	* simplify-rtx.c (simplify_unary_operation_1) [SIGN_EXTEND]:
	Likewise, preserve SUBREG_PROMOTED_VAR_P when creating a (wider)
	partial subreg from a SUBREG_PROMOTED_VAR_P subreg.  Generate
	SIGN_EXTEND of the SUBREG_REG when a subreg would be paradoxical.
	[ZERO_EXTEND]: Likewise, preserve SUBREG_PROMOTED_VAR_P when
	creating a (wider) partial subreg from a SUBREG_PROMOTED_VAR_P
	subreg.  Generate ZERO_EXTEND of the SUBREG_REG when a subreg
	would be paradoxical.
---
 gcc/expr.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'gcc/expr.c')

diff --git a/gcc/expr.c b/gcc/expr.c
index 096c031..5dd98a9 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -688,7 +688,24 @@ convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
       && (GET_MODE_PRECISION (subreg_promoted_mode (x))
	  >= GET_MODE_PRECISION (int_mode))
       && SUBREG_CHECK_PROMOTED_SIGN (x, unsignedp))
-    x = gen_lowpart (int_mode, SUBREG_REG (x));
+    {
+      scalar_int_mode int_orig_mode;
+      machine_mode orig_mode = GET_MODE (x);
+      x = gen_lowpart (int_mode, SUBREG_REG (x));
+
+      /* Preserve SUBREG_PROMOTED_VAR_P if the new mode is wider than
+	 the original mode, but narrower than the inner mode.  */
+      if (GET_CODE (x) == SUBREG
+	  && GET_MODE_PRECISION (subreg_promoted_mode (x))
+	     > GET_MODE_PRECISION (int_mode)
+	  && is_a <scalar_int_mode> (orig_mode, &int_orig_mode)
+	  && GET_MODE_PRECISION (int_mode)
+	     > GET_MODE_PRECISION (int_orig_mode))
+	{
+	  SUBREG_PROMOTED_VAR_P (x) = 1;
+	  SUBREG_PROMOTED_SET (x, unsignedp);
+	}
+    }
 
   if (GET_MODE (x) != VOIDmode)
     oldmode = GET_MODE (x);
-- 
cgit v1.1
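Illustration (not from the commit above, and target-dependent): a
minimal function of the shape the commit message describes.  On a
target whose PROMOTE_MODE widens QImode arguments to SImode, the
incoming byte is known zero-extended in its SImode register, so its
QImode subreg carries SUBREG_PROMOTED_VAR_P; with this change the
wider HImode subreg created for the return-value extension keeps that
annotation, letting later extensions be elided as well.  The function
name is an illustrative assumption.

	/* A QImode value promoted in an SImode register: the QI -> HI
	   zero extension can then be a plain wider subreg.  */
	unsigned short
	widen (unsigned char c)
	{
	  return c;
	}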
From 863d6524f320abe14d6993fec188e787470e69b6 Mon Sep 17 00:00:00 2001
From: Roger Sayle
Date: Tue, 31 Aug 2021 17:25:58 +0100
Subject: [Committed] Fix subreg_promoted_mode breakage on various platforms.

My apologies for the inconvenience.  My recent patch to preserve
SUBREG_PROMOTED_VAR_P on (extend:HI (subreg/s:QI (reg:SI))), and in
other places in the middle-end, has broken the build on several
targets.  The change to convert_modes inadvertently used the same
subreg_promoted_mode idiom for retrieving the mode of a SUBREG_REG as
the existing code just a few lines earlier.  Alas, in the meantime the
original SUBREG gets replaced by one without SUBREG_PROMOTED_VAR_P (the
whole raison d'etre for my patch), and I had not realized/noticed that
subreg_promoted_mode asserts for this.  Evidently, neither the
bootstrap and regression test on x86_64-pc-linux-gnu nor my testing on
nvptx-none hit this particular case.

The logic of this transformation is sound, it's the implementation
that's bitten me.  This patch has been committed, after another "make
bootstrap" on x86_64-pc-linux-gnu (just in case), and
confirmation/pre-approval from Jeff Law that this indeed fixes the
build failures seen on several platforms.

My humble apologies again.

2021-08-31  Roger Sayle

gcc/ChangeLog
	* expr.c (convert_modes): Don't use subreg_promoted_mode on a
	SUBREG if it can't be guaranteed to have SUBREG_PROMOTED_VAR_P
	set.  Instead use the standard (safer) is_a <scalar_int_mode>
	idiom.
---
 gcc/expr.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'gcc/expr.c')

diff --git a/gcc/expr.c b/gcc/expr.c
index 5dd98a9..17f2c2f 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -690,17 +690,20 @@ convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
       && SUBREG_CHECK_PROMOTED_SIGN (x, unsignedp))
     {
       scalar_int_mode int_orig_mode;
+      scalar_int_mode int_inner_mode;
       machine_mode orig_mode = GET_MODE (x);
       x = gen_lowpart (int_mode, SUBREG_REG (x));
 
       /* Preserve SUBREG_PROMOTED_VAR_P if the new mode is wider than
	 the original mode, but narrower than the inner mode.  */
       if (GET_CODE (x) == SUBREG
-	  && GET_MODE_PRECISION (subreg_promoted_mode (x))
-	     > GET_MODE_PRECISION (int_mode)
	  && is_a <scalar_int_mode> (orig_mode, &int_orig_mode)
	  && GET_MODE_PRECISION (int_mode)
-	     > GET_MODE_PRECISION (int_orig_mode))
+	     > GET_MODE_PRECISION (int_orig_mode)
+	  && is_a <scalar_int_mode> (GET_MODE (SUBREG_REG (x)),
+				     &int_inner_mode)
+	  && GET_MODE_PRECISION (int_inner_mode)
+	     > GET_MODE_PRECISION (int_mode))
	{
	  SUBREG_PROMOTED_VAR_P (x) = 1;
	  SUBREG_PROMOTED_SET (x, unsignedp);
-- 
cgit v1.1

From b195fae7c11b35c8ee551cd302e3daf8e08c78d0 Mon Sep 17 00:00:00 2001
From: Roger Sayle
Date: Sun, 12 Sep 2021 15:18:57 +0100
Subject: Also preserve SUBREG_PROMOTED_VAR_P in expr.c's convert_move.

This patch catches another place in the middle-end where it's possible
to preserve the SUBREG_PROMOTED_VAR_P annotation on a subreg to the
benefit of later RTL optimizations.  This adds the same logic to
expr.c's convert_move as recently added to convert_modes.

On nvptx-none, the simple test program:

short foo (char c) { return c; }

currently generates three instructions:

	mov.u32 %r23, %ar0;
	cvt.u16.u32 %r24, %r23;
	cvt.s32.s16 %value, %r24;

with this patch, we now generate just one:

	mov.u32 %value, %ar0;

This patch should look familiar: it's almost identical to the recent
patch https://gcc.gnu.org/pipermail/gcc-patches/2021-August/578331.html
but with the fix
https://gcc.gnu.org/pipermail/gcc-patches/2021-August/578519.html

2021-09-12  Roger Sayle

gcc/ChangeLog
	* expr.c (convert_move): Preserve SUBREG_PROMOTED_VAR_P when
	creating a (wider) partial subreg from a SUBREG_PROMOTED_VAR_P
	subreg.
---
 gcc/expr.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'gcc/expr.c')

diff --git a/gcc/expr.c b/gcc/expr.c
index 17f2c2f..e0bcbcc 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -236,8 +236,27 @@ convert_move (rtx to, rtx from, int unsignedp)
	  >= GET_MODE_PRECISION (to_int_mode))
       && SUBREG_CHECK_PROMOTED_SIGN (from, unsignedp))
     {
+      scalar_int_mode int_orig_mode;
+      scalar_int_mode int_inner_mode;
+      machine_mode orig_mode = GET_MODE (from);
+
       from = gen_lowpart (to_int_mode, SUBREG_REG (from));
       from_mode = to_int_mode;
+
+      /* Preserve SUBREG_PROMOTED_VAR_P if the new mode is wider than
+	 the original mode, but narrower than the inner mode.  */
+      if (GET_CODE (from) == SUBREG
+	  && is_a <scalar_int_mode> (orig_mode, &int_orig_mode)
+	  && GET_MODE_PRECISION (to_int_mode)
+	     > GET_MODE_PRECISION (int_orig_mode)
+	  && is_a <scalar_int_mode> (GET_MODE (SUBREG_REG (from)),
+				     &int_inner_mode)
+	  && GET_MODE_PRECISION (int_inner_mode)
+	     > GET_MODE_PRECISION (to_int_mode))
+	{
+	  SUBREG_PROMOTED_VAR_P (from) = 1;
+	  SUBREG_PROMOTED_SET (from, unsignedp);
+	}
     }
 
   gcc_assert (GET_CODE (to) != SUBREG || !SUBREG_PROMOTED_VAR_P (to));
-- 
cgit v1.1
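Reading the last two commits together, the same guard now appears in
both convert_modes and convert_move.  As a condensed sketch (the
helper below is hypothetical, not a function in expr.c; the types and
macros are GCC's internal API), the narrowed lowpart keeps
SUBREG_PROMOTED_VAR_P only when the new outer mode is strictly wider
than the original outer mode yet strictly narrower than the promoted
inner register's mode:

	/* Hypothetical summary of the guard used in convert_move and
	   convert_modes: NEW_MODE must lie strictly between the
	   original outer mode and the promoted inner mode.  */
	static bool
	keep_promoted_p (scalar_int_mode new_mode,
			 scalar_int_mode orig_mode,
			 scalar_int_mode inner_mode)
	{
	  return (GET_MODE_PRECISION (new_mode)
		  > GET_MODE_PRECISION (orig_mode)
		  && GET_MODE_PRECISION (inner_mode)
		  > GET_MODE_PRECISION (new_mode));
	}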