From 22aede7a1228617661105048a91fddd8797e141b Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Sat, 10 Apr 2021 12:49:01 +0200 Subject: expand: Fix up LTO ICE with COMPOUND_LITERAL_EXPR [PR99849] The gimplifier optimizes away COMPOUND_LITERAL_EXPRs, but they can remain in the form of ADDR_EXPR of COMPOUND_LITERAL_EXPRs in static initializers. By the TREE_STATIC check I meant to check that the underlying decl of the compound literal is a global rather than automatic variable which obviously can't be referenced in static initializers, but unfortunately with LTO it might end up in another partition and thus be DECL_EXTERNAL instead. 2021-04-10 Jakub Jelinek PR lto/99849 * expr.c (expand_expr_addr_expr_1): Test is_global_var rather than just TREE_STATIC on COMPOUND_LITERAL_EXPR_DECLs. * gcc.dg/lto/pr99849_0.c: New test. --- gcc/expr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 92035c7..a0e1946 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -8204,7 +8204,7 @@ expand_expr_addr_expr_1 (tree exp, rtx target, scalar_int_mode tmode, array with address of COMPOUND_LITERAL_EXPR in DECL_INITIAL; the initializers aren't gimplified. */ if (COMPOUND_LITERAL_EXPR_DECL (exp) - && TREE_STATIC (COMPOUND_LITERAL_EXPR_DECL (exp))) + && is_global_var (COMPOUND_LITERAL_EXPR_DECL (exp))) return expand_expr_addr_expr_1 (COMPOUND_LITERAL_EXPR_DECL (exp), target, tmode, modifier, as); /* FALLTHRU */ -- cgit v1.1 From b972e036f40c12b106f9070c3e8adea0eb8a45fa Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Thu, 15 Apr 2021 13:03:21 +0200 Subject: Move gimplify_buildN API local to only remaining user This moves the legacy gimplify_buildN API to tree-vect-generic.c, its only user, and elides the gimplification step, making it a wrapper around gimple_build, adjusting tree_vec_extract for this. I've noticed that vector CTOR expansion doesn't deal with unfolded {} and thus this makes it more resilient. I've also adjusted the match.pd vector CTOR extraction code to make sure it doesn't produce a CTOR when folding would make it a vector constant. 2021-04-15 Richard Biener * tree-cfg.h (gimplify_build1): Remove. (gimplify_build2): Likewise. (gimplify_build3): Likewise. * tree-cfg.c (gimplify_build1): Move to tree-vect-generic.c. (gimplify_build2): Likewise. (gimplify_build3): Likewise. * tree-vect-generic.c (gimplify_build1): Move from tree-cfg.c. Modernize. (gimplify_build2): Likewise. (gimplify_build3): Likewise. (tree_vec_extract): Use resimplify with following SSA edges. (expand_vector_parallel): Avoid passing NULL size/bitpos to tree_vec_extract. * expr.c (store_constructor): Deal with zero-element CTORs. * match.pd (bit_field_ref ): Make sure to produce vector constants when possible. --- gcc/expr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index a0e1946..5ed716c 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -7019,7 +7019,9 @@ store_constructor (tree exp, rtx target, int cleared, poly_int64 size, /* Compute the size of the elements in the CTOR. It differs from the size of the vector type elements only when the CTOR elements are vectors themselves. */ - tree val_type = TREE_TYPE (CONSTRUCTOR_ELT (exp, 0)->value); + tree val_type = (CONSTRUCTOR_NELTS (exp) != 0 + ?
TREE_TYPE (CONSTRUCTOR_ELT (exp, 0)->value) + : elttype); if (VECTOR_TYPE_P (val_type)) bitsize = tree_to_uhwi (TYPE_SIZE (val_type)); else -- cgit v1.1 From 3dcd1334b4f522352b80814513fdca902fc2a207 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 27 Apr 2021 14:45:45 +0200 Subject: expand: Expand x / y * y as x - x % y if the latter is cheaper [PR96696] The following patch tests both x / y * y and x - x % y expansion for the former GIMPLE code and chooses the cheaper of those sequences. 2021-04-27 Jakub Jelinek PR tree-optimization/96696 * expr.c (expand_expr_divmod): New function. (expand_expr_real_2) : Use it for truncations and divisions. Formatting fixes. : Optimize x / y * y as x - x % y if the latter is cheaper. * gcc.target/i386/pr96696.c: New test. --- gcc/expr.c | 190 ++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 132 insertions(+), 58 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 5ed716c..5a1fda7 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -8664,6 +8664,56 @@ expand_misaligned_mem_ref (rtx temp, machine_mode mode, int unsignedp, return temp; } +/* Helper function of expand_expr_2, expand a division or modulo. + op0 and op1 should be already expanded treeop0 and treeop1, using + expand_operands. */ + +static rtx +expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0, + tree treeop1, rtx op0, rtx op1, rtx target, int unsignedp) +{ + bool mod_p = (code == TRUNC_MOD_EXPR || code == FLOOR_MOD_EXPR + || code == CEIL_MOD_EXPR || code == ROUND_MOD_EXPR); + if (SCALAR_INT_MODE_P (mode) + && optimize >= 2 + && get_range_pos_neg (treeop0) == 1 + && get_range_pos_neg (treeop1) == 1) + { + /* If both arguments are known to be positive when interpreted + as signed, we can expand it as both signed and unsigned + division or modulo. Choose the cheaper sequence in that case. */ + bool speed_p = optimize_insn_for_speed_p (); + do_pending_stack_adjust (); + start_sequence (); + rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1); + rtx_insn *uns_insns = get_insns (); + end_sequence (); + start_sequence (); + rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0); + rtx_insn *sgn_insns = get_insns (); + end_sequence (); + unsigned uns_cost = seq_cost (uns_insns, speed_p); + unsigned sgn_cost = seq_cost (sgn_insns, speed_p); + + /* If costs are the same then use as tie breaker the other other + factor. 
*/ + if (uns_cost == sgn_cost) + { + uns_cost = seq_cost (uns_insns, !speed_p); + sgn_cost = seq_cost (sgn_insns, !speed_p); + } + + if (uns_cost < sgn_cost || (uns_cost == sgn_cost && unsignedp)) + { + emit_insn (uns_insns); + return uns_ret; + } + emit_insn (sgn_insns); + return sgn_ret; + } + return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp); +} + rtx expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, enum expand_modifier modifier) @@ -9201,14 +9251,78 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, if (!REG_P (op0)) op0 = copy_to_mode_reg (mode, op0); - return REDUCE_BIT_FIELD (gen_rtx_MULT (mode, op0, - gen_int_mode (tree_to_shwi (exp1), - TYPE_MODE (TREE_TYPE (exp1))))); + op1 = gen_int_mode (tree_to_shwi (exp1), + TYPE_MODE (TREE_TYPE (exp1))); + return REDUCE_BIT_FIELD (gen_rtx_MULT (mode, op0, op1)); } if (modifier == EXPAND_STACK_PARM) target = 0; + if (SCALAR_INT_MODE_P (mode) && optimize >= 2) + { + gimple *def_stmt0 = get_def_for_expr (treeop0, TRUNC_DIV_EXPR); + gimple *def_stmt1 = get_def_for_expr (treeop1, TRUNC_DIV_EXPR); + if (def_stmt0 + && !operand_equal_p (treeop1, gimple_assign_rhs2 (def_stmt0), 0)) + def_stmt0 = NULL; + if (def_stmt1 + && !operand_equal_p (treeop0, gimple_assign_rhs2 (def_stmt1), 0)) + def_stmt1 = NULL; + + if (def_stmt0 || def_stmt1) + { + /* X / Y * Y can be expanded as X - X % Y too. + Choose the cheaper sequence of those two. */ + if (def_stmt0) + treeop0 = gimple_assign_rhs1 (def_stmt0); + else + { + treeop1 = treeop0; + treeop0 = gimple_assign_rhs1 (def_stmt1); + } + expand_operands (treeop0, treeop1, subtarget, &op0, &op1, + EXPAND_NORMAL); + bool speed_p = optimize_insn_for_speed_p (); + do_pending_stack_adjust (); + start_sequence (); + rtx divmul_ret + = expand_expr_divmod (TRUNC_DIV_EXPR, mode, treeop0, treeop1, + op0, op1, NULL_RTX, unsignedp); + divmul_ret = expand_mult (mode, divmul_ret, op1, target, + unsignedp); + rtx_insn *divmul_insns = get_insns (); + end_sequence (); + start_sequence (); + rtx modsub_ret + = expand_expr_divmod (TRUNC_MOD_EXPR, mode, treeop0, treeop1, + op0, op1, NULL_RTX, unsignedp); + this_optab = optab_for_tree_code (MINUS_EXPR, type, + optab_default); + modsub_ret = expand_binop (mode, this_optab, op0, modsub_ret, + target, unsignedp, OPTAB_LIB_WIDEN); + rtx_insn *modsub_insns = get_insns (); + end_sequence (); + unsigned divmul_cost = seq_cost (divmul_insns, speed_p); + unsigned modsub_cost = seq_cost (modsub_insns, speed_p); + /* If costs are the same then use as tie breaker the other other + factor. */ + if (divmul_cost == modsub_cost) + { + divmul_cost = seq_cost (divmul_insns, !speed_p); + modsub_cost = seq_cost (modsub_insns, !speed_p); + } + + if (divmul_cost <= modsub_cost) + { + emit_insn (divmul_insns); + return REDUCE_BIT_FIELD (divmul_ret); + } + emit_insn (modsub_insns); + return REDUCE_BIT_FIELD (modsub_ret); + } + } + expand_operands (treeop0, treeop1, subtarget, &op0, &op1, EXPAND_NORMAL); return REDUCE_BIT_FIELD (expand_mult (mode, op0, op1, target, unsignedp)); @@ -9222,61 +9336,21 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, case CEIL_DIV_EXPR: case ROUND_DIV_EXPR: case EXACT_DIV_EXPR: - { - /* If this is a fixed-point operation, then we cannot use the code - below because "expand_divmod" doesn't support sat/no-sat fixed-point - divisions. 
*/ - if (ALL_FIXED_POINT_MODE_P (mode)) - goto binop; - - if (modifier == EXPAND_STACK_PARM) - target = 0; - /* Possible optimization: compute the dividend with EXPAND_SUM - then if the divisor is constant can optimize the case - where some terms of the dividend have coeffs divisible by it. */ - expand_operands (treeop0, treeop1, - subtarget, &op0, &op1, EXPAND_NORMAL); - bool mod_p = code == TRUNC_MOD_EXPR || code == FLOOR_MOD_EXPR - || code == CEIL_MOD_EXPR || code == ROUND_MOD_EXPR; - if (SCALAR_INT_MODE_P (mode) - && optimize >= 2 - && get_range_pos_neg (treeop0) == 1 - && get_range_pos_neg (treeop1) == 1) - { - /* If both arguments are known to be positive when interpreted - as signed, we can expand it as both signed and unsigned - division or modulo. Choose the cheaper sequence in that case. */ - bool speed_p = optimize_insn_for_speed_p (); - do_pending_stack_adjust (); - start_sequence (); - rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1); - rtx_insn *uns_insns = get_insns (); - end_sequence (); - start_sequence (); - rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0); - rtx_insn *sgn_insns = get_insns (); - end_sequence (); - unsigned uns_cost = seq_cost (uns_insns, speed_p); - unsigned sgn_cost = seq_cost (sgn_insns, speed_p); - - /* If costs are the same then use as tie breaker the other - other factor. */ - if (uns_cost == sgn_cost) - { - uns_cost = seq_cost (uns_insns, !speed_p); - sgn_cost = seq_cost (sgn_insns, !speed_p); - } - - if (uns_cost < sgn_cost || (uns_cost == sgn_cost && unsignedp)) - { - emit_insn (uns_insns); - return uns_ret; - } - emit_insn (sgn_insns); - return sgn_ret; - } - return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp); - } + /* If this is a fixed-point operation, then we cannot use the code + below because "expand_divmod" doesn't support sat/no-sat fixed-point + divisions. */ + if (ALL_FIXED_POINT_MODE_P (mode)) + goto binop; + + if (modifier == EXPAND_STACK_PARM) + target = 0; + /* Possible optimization: compute the dividend with EXPAND_SUM + then if the divisor is constant can optimize the case + where some terms of the dividend have coeffs divisible by it. */ + expand_operands (treeop0, treeop1, subtarget, &op0, &op1, EXPAND_NORMAL); + return expand_expr_divmod (code, mode, treeop0, treeop1, op0, op1, + target, unsignedp); + case RDIV_EXPR: goto binop; -- cgit v1.1 From 3bb41228d76b3a3cbd9923d57388f0903f7683de Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 26 Apr 2021 15:36:18 -0700 Subject: op_by_pieces_d::run: Change a while loop to a do-while loop Change a while loop in op_by_pieces_d::run to a do-while loop to prepare for offset adjusted operation for the remaining bytes on the last piece operation of a memory region. PR middle-end/90773 * expr.c (op_by_pieces_d::get_usable_mode): New member function. (op_by_pieces_d::run): Change a while loop to a do-while loop.
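Schematically, the restructured loop has the following shape (a standalone toy model, not the GCC code itself: run_pieces, usable_size and the generate callback are hypothetical stand-ins for op_by_pieces_d::run, get_usable_mode and the per-piece code generation, which operate on machine modes rather than raw byte counts):

#include <stddef.h>

/* Toy stand-in for get_usable_mode: the largest power-of-two piece
   size no bigger than MAX that still fits in LEN.  */
static size_t
usable_size (size_t max, size_t len)
{
  size_t size = max;
  while (size > len)
    size >>= 1;
  return size;
}

/* Toy stand-in for op_by_pieces_d::run: the size selection is hoisted
   in front of the loop, and the bottom of the do-while recomputes the
   next narrower usable size, giving the tail a single place to hook
   into (which the follow-up overlap patch exploits).  */
void
run_pieces (size_t len, size_t max_size,
            void (*generate) (size_t offset, size_t size))
{
  if (len == 0)
    return;

  size_t offset = 0;
  size_t size = usable_size (max_size, len);
  do
    {
      while (len >= size)
        {
          generate (offset, size);  /* emit one piece of SIZE bytes */
          offset += size;
          len -= size;
        }
      if (len == 0)
        return;
      size = usable_size (size, len);  /* handle the remaining tail */
    }
  while (1);
}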
--- gcc/expr.c | 76 +++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 23 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 5a1fda7..a4a004d 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -1041,6 +1041,9 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size) class op_by_pieces_d { + private: + scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int); + protected: pieces_addr m_to, m_from; unsigned HOST_WIDE_INT m_len; @@ -1108,6 +1111,25 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, m_align = align; } +/* This function returns the largest usable integer mode for LEN bytes + whose size is no bigger than size of MODE. */ + +scalar_int_mode +op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len) +{ + unsigned int size; + do + { + size = GET_MODE_SIZE (mode); + if (len >= size && prepare_mode (mode, m_align)) + break; + /* NB: widest_int_mode_for_size checks SIZE > 1. */ + mode = widest_int_mode_for_size (size); + } + while (1); + return mode; +} + /* This function contains the main loop used for expanding a block operation. First move what we can in the largest integer mode, then go to successively smaller modes. For every access, call @@ -1116,42 +1138,50 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, void op_by_pieces_d::run () { - while (m_max_size > 1 && m_len > 0) + if (m_len == 0) + return; + + /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1. */ + scalar_int_mode mode = widest_int_mode_for_size (m_max_size); + mode = get_usable_mode (mode, m_len); + + do { - scalar_int_mode mode = widest_int_mode_for_size (m_max_size); + unsigned int size = GET_MODE_SIZE (mode); + rtx to1 = NULL_RTX, from1; - if (prepare_mode (mode, m_align)) + while (m_len >= size) { - unsigned int size = GET_MODE_SIZE (mode); - rtx to1 = NULL_RTX, from1; + if (m_reverse) + m_offset -= size; - while (m_len >= size) - { - if (m_reverse) - m_offset -= size; + to1 = m_to.adjust (mode, m_offset); + from1 = m_from.adjust (mode, m_offset); - to1 = m_to.adjust (mode, m_offset); - from1 = m_from.adjust (mode, m_offset); + m_to.maybe_predec (-(HOST_WIDE_INT)size); + m_from.maybe_predec (-(HOST_WIDE_INT)size); - m_to.maybe_predec (-(HOST_WIDE_INT)size); - m_from.maybe_predec (-(HOST_WIDE_INT)size); + generate (to1, from1, mode); - generate (to1, from1, mode); + m_to.maybe_postinc (size); + m_from.maybe_postinc (size); - m_to.maybe_postinc (size); - m_from.maybe_postinc (size); + if (!m_reverse) + m_offset += size; - if (!m_reverse) - m_offset += size; + m_len -= size; + } - m_len -= size; - } + finish_mode (mode); - finish_mode (mode); - } + if (m_len == 0) + return; - m_max_size = GET_MODE_SIZE (mode); + /* NB: widest_int_mode_for_size checks SIZE > 1. */ + mode = widest_int_mode_for_size (size); + mode = get_usable_mode (mode, m_len); } + while (1); /* The code above should have handled everything. */ gcc_assert (!m_len); -- cgit v1.1 From 985b3a6837dee7001e6b618f073ed74f0edf5787 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 10 Jun 2019 09:57:15 -0700 Subject: Generate offset adjusted operation for op_by_pieces operations Add an overlap_op_by_pieces_p target hook for op_by_pieces operations between two areas of memory to generate one offset adjusted operation in the smallest integer mode for the remaining bytes on the last piece operation of a memory region to avoid doing more than one smaller operation.
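As a concrete illustration (not part of the original commit message): on a 64-bit target whose hook allows overlap, a 13-byte copy can be expanded as two 8-byte moves whose ranges overlap, instead of an 8-byte, a 4-byte and a 1-byte move:

#include <string.h>

/* Illustrative only: the overlapping expansion this hook enables.  */
void
copy13 (char *dst, const char *src)
{
  memcpy (dst, src, 8);          /* bytes 0..7 */
  memcpy (dst + 5, src + 5, 8);  /* bytes 5..12, overlapping bytes 5..7 */
}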
Pass the RTL information from the previous iteration to m_constfn in op_by_pieces operation so that builtin_memset_[read|gen]_str can generate the new RTL from the previous RTL. Tested on Linux/x86-64. gcc/ PR middle-end/90773 * builtins.c (builtin_memcpy_read_str): Add a dummy argument. (builtin_strncpy_read_str): Likewise. (builtin_memset_read_str): Add an argument for the previous RTL information and generate the new RTL from the previous RTL info. (builtin_memset_gen_str): Likewise. * builtins.h (builtin_strncpy_read_str): Update the prototype. (builtin_memset_read_str): Likewise. * expr.c (by_pieces_ninsns): If targetm.overlap_op_by_pieces_p() returns true, round up size and alignment to the widest integer mode for maximum size. (pieces_addr::adjust): Add a pointer to by_pieces_prev argument and pass it to m_constfn. (op_by_pieces_d): Add m_push and m_overlap_op_by_pieces. (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize m_push. Initialize m_overlap_op_by_pieces with targetm.overlap_op_by_pieces_p (). (op_by_pieces_d::run): Pass the previous RTL information to pieces_addr::adjust and generate overlapping operations if m_overlap_op_by_pieces is true. (PUSHG_P): New. (move_by_pieces_d::move_by_pieces_d): Updated for op_by_pieces_d change. (store_by_pieces_d::store_by_pieces_d): Updated for op_by_pieces_d change. (can_store_by_pieces): Use by_pieces_constfn on constfun. (store_by_pieces): Use by_pieces_constfn on constfun. Updated for op_by_pieces_d change. (clear_by_pieces_1): Add a dummy argument. (clear_by_pieces): Updated for op_by_pieces_d change. (compare_by_pieces_d::compare_by_pieces_d): Likewise. (string_cst_read_str): Add a dummy argument. * expr.h (by_pieces_constfn): Add a dummy argument. (by_pieces_prev): New. * target.def (overlap_op_by_pieces_p): New target hook. * config/i386/i386.c (TARGET_OVERLAP_OP_BY_PIECES_P): New. * doc/tm.texi.in: Add TARGET_OVERLAP_OP_BY_PIECES_P. * doc/tm.texi: Regenerated. gcc/testsuite/ PR middle-end/90773 * g++.dg/pr90773-1.h: New test. * g++.dg/pr90773-1a.C: Likewise. * g++.dg/pr90773-1b.C: Likewise. * g++.dg/pr90773-1c.C: Likewise. * g++.dg/pr90773-1d.C: Likewise. * gcc.target/i386/pr90773-1.c: Likewise. * gcc.target/i386/pr90773-2.c: Likewise. * gcc.target/i386/pr90773-3.c: Likewise. * gcc.target/i386/pr90773-4.c: Likewise. * gcc.target/i386/pr90773-5.c: Likewise. * gcc.target/i386/pr90773-6.c: Likewise. * gcc.target/i386/pr90773-7.c: Likewise. * gcc.target/i386/pr90773-8.c: Likewise. * gcc.target/i386/pr90773-9.c: Likewise. * gcc.target/i386/pr90773-10.c: Likewise. * gcc.target/i386/pr90773-11.c: Likewise. * gcc.target/i386/pr90773-12.c: Likewise. * gcc.target/i386/pr90773-13.c: Likewise. * gcc.target/i386/pr90773-14.c: Likewise. --- gcc/expr.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 85 insertions(+), 20 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index a4a004d..e0167b7 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -815,12 +815,27 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, unsigned int max_size, by_pieces_operation op) { unsigned HOST_WIDE_INT n_insns = 0; + scalar_int_mode mode; + + if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES) + { + /* NB: Round up L and ALIGN to the widest integer mode for + MAX_SIZE. 
*/ + mode = widest_int_mode_for_size (max_size); + if (optab_handler (mov_optab, mode) != CODE_FOR_nothing) + { + unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode)); + if (up > l) + l = up; + align = GET_MODE_ALIGNMENT (mode); + } + } align = alignment_for_piecewise_move (MOVE_MAX_PIECES, align); while (max_size > 1 && l > 0) { - scalar_int_mode mode = widest_int_mode_for_size (max_size); + mode = widest_int_mode_for_size (max_size); enum insn_code icode; unsigned int modesize = GET_MODE_SIZE (mode); @@ -888,7 +903,8 @@ class pieces_addr void *m_cfndata; public: pieces_addr (rtx, bool, by_pieces_constfn, void *); - rtx adjust (scalar_int_mode, HOST_WIDE_INT); + rtx adjust (scalar_int_mode, HOST_WIDE_INT, + by_pieces_prev * = nullptr); void increment_address (HOST_WIDE_INT); void maybe_predec (HOST_WIDE_INT); void maybe_postinc (HOST_WIDE_INT); @@ -990,10 +1006,12 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse, but we still modify the MEM's properties. */ rtx -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset) +pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset, + by_pieces_prev *prev) { if (m_constfn) - return m_constfn (m_cfndata, offset, mode); + /* Pass the previous data to m_constfn. */ + return m_constfn (m_cfndata, prev, offset, mode); if (m_obj == NULL_RTX) return NULL_RTX; if (m_auto) @@ -1051,6 +1069,10 @@ class op_by_pieces_d unsigned int m_align; unsigned int m_max_size; bool m_reverse; + /* True if this is a stack push. */ + bool m_push; + /* True if targetm.overlap_op_by_pieces_p () returns true. */ + bool m_overlap_op_by_pieces; /* Virtual functions, overriden by derived classes for the specific operation. */ @@ -1062,7 +1084,7 @@ class op_by_pieces_d public: op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *, - unsigned HOST_WIDE_INT, unsigned int); + unsigned HOST_WIDE_INT, unsigned int, bool); void run (); }; @@ -1077,10 +1099,11 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, by_pieces_constfn from_cfn, void *from_cfn_data, unsigned HOST_WIDE_INT len, - unsigned int align) + unsigned int align, bool push) : m_to (to, to_load, NULL, NULL), m_from (from, from_load, from_cfn, from_cfn_data), - m_len (len), m_max_size (MOVE_MAX_PIECES + 1) + m_len (len), m_max_size (MOVE_MAX_PIECES + 1), + m_push (push) { int toi = m_to.get_addr_inc (); int fromi = m_from.get_addr_inc (); @@ -1109,6 +1132,8 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, align = alignment_for_piecewise_move (MOVE_MAX_PIECES, align); m_align = align; + + m_overlap_op_by_pieces = targetm.overlap_op_by_pieces_p (); } /* This function returns the largest usable integer mode for LEN bytes @@ -1145,6 +1170,9 @@ op_by_pieces_d::run () scalar_int_mode mode = widest_int_mode_for_size (m_max_size); mode = get_usable_mode (mode, m_len); + by_pieces_prev to_prev = { nullptr, mode }; + by_pieces_prev from_prev = { nullptr, mode }; + do { unsigned int size = GET_MODE_SIZE (mode); @@ -1155,8 +1183,12 @@ op_by_pieces_d::run () if (m_reverse) m_offset -= size; - to1 = m_to.adjust (mode, m_offset); - from1 = m_from.adjust (mode, m_offset); + to1 = m_to.adjust (mode, m_offset, &to_prev); + to_prev.data = to1; + to_prev.mode = mode; + from1 = m_from.adjust (mode, m_offset, &from_prev); + from_prev.data = from1; + from_prev.mode = mode; m_to.maybe_predec (-(HOST_WIDE_INT)size); m_from.maybe_predec (-(HOST_WIDE_INT)size); @@ -1177,9 +1209,32 @@ op_by_pieces_d::run () if (m_len == 0) return; - /* NB: widest_int_mode_for_size checks 
SIZE > 1. */ - mode = widest_int_mode_for_size (size); - mode = get_usable_mode (mode, m_len); + if (!m_push && m_overlap_op_by_pieces) + { + /* NB: Generate overlapping operations if it is not a stack + push since stack push must not overlap. Get the smallest + integer mode for M_LEN bytes. */ + mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT); + mode = get_usable_mode (mode, GET_MODE_SIZE (mode)); + int gap = GET_MODE_SIZE (mode) - m_len; + if (gap > 0) + { + /* If size of MODE > M_LEN, generate the last operation + in MODE for the remaining bytes with ovelapping memory + from the previois operation. */ + if (m_reverse) + m_offset += gap; + else + m_offset -= gap; + m_len += gap; + } + } + else + { + /* NB: widest_int_mode_for_size checks SIZE > 1. */ + mode = widest_int_mode_for_size (size); + mode = get_usable_mode (mode, m_len); + } } while (1); @@ -1190,6 +1245,12 @@ op_by_pieces_d::run () /* Derived class from op_by_pieces_d, providing support for block move operations. */ +#ifdef PUSH_ROUNDING +#define PUSHG_P(to) ((to) == nullptr) +#else +#define PUSHG_P(to) false +#endif + class move_by_pieces_d : public op_by_pieces_d { insn_gen_fn m_gen_fun; @@ -1199,7 +1260,8 @@ class move_by_pieces_d : public op_by_pieces_d public: move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len, unsigned int align) - : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align) + : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align, + PUSHG_P (to)) { } rtx finish_retmode (memop_ret); @@ -1294,7 +1356,8 @@ class store_by_pieces_d : public op_by_pieces_d public: store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data, unsigned HOST_WIDE_INT len, unsigned int align) - : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, align) + : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, + align, false) { } rtx finish_retmode (memop_ret); @@ -1349,7 +1412,7 @@ store_by_pieces_d::finish_retmode (memop_ret retmode) int can_store_by_pieces (unsigned HOST_WIDE_INT len, - rtx (*constfun) (void *, HOST_WIDE_INT, scalar_int_mode), + by_pieces_constfn constfun, void *constfundata, unsigned int align, bool memsetp) { unsigned HOST_WIDE_INT l; @@ -1396,7 +1459,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len, if (reverse) offset -= size; - cst = (*constfun) (constfundata, offset, mode); + cst = (*constfun) (constfundata, nullptr, offset, mode); if (!targetm.legitimate_constant_p (mode, cst)) return 0; @@ -1426,7 +1489,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len, rtx store_by_pieces (rtx to, unsigned HOST_WIDE_INT len, - rtx (*constfun) (void *, HOST_WIDE_INT, scalar_int_mode), + by_pieces_constfn constfun, void *constfundata, unsigned int align, bool memsetp, memop_ret retmode) { @@ -1454,7 +1517,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len, Return const0_rtx unconditionally. 
*/ static rtx -clear_by_pieces_1 (void *, HOST_WIDE_INT, scalar_int_mode) +clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode) { return const0_rtx; } @@ -1490,7 +1553,8 @@ class compare_by_pieces_d : public op_by_pieces_d compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn, void *op1_cfn_data, HOST_WIDE_INT len, int align, rtx_code_label *fail_label) - : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len, align) + : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len, + align, false) { m_fail_label = fail_label; } @@ -5676,7 +5740,8 @@ emit_storent_insn (rtx to, rtx from) /* Helper function for store_expr storing of STRING_CST. */ static rtx -string_cst_read_str (void *data, HOST_WIDE_INT offset, scalar_int_mode mode) +string_cst_read_str (void *data, void *, HOST_WIDE_INT offset, + scalar_int_mode mode) { tree str = (tree) data; -- cgit v1.1 From 18d713fbd345c5c54ab6091ac5f114df4551d1bb Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 30 Apr 2021 04:36:36 -0700 Subject: Update alignment_for_piecewise_move alignment_for_piecewise_move is called only with MOVE_MAX_PIECES or STORE_MAX_PIECES, which are the number of bytes at a time that we can move or store efficiently. We should call mode_for_size without limit to MAX_FIXED_MODE_SIZE, which is an integer expression for the size in bits of the largest integer machine mode that should actually be used, and may be smaller than MOVE_MAX_PIECES or STORE_MAX_PIECES, which may use vector modes. * expr.c (alignment_for_piecewise_move): Call mode_for_size without limit to MAX_FIXED_MODE_SIZE. --- gcc/expr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index e0167b7..b4c110f 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -746,7 +746,7 @@ static unsigned int alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align) { scalar_int_mode tmode - = int_mode_for_size (max_pieces * BITS_PER_UNIT, 1).require (); + = int_mode_for_size (max_pieces * BITS_PER_UNIT, 0).require (); if (align >= GET_MODE_ALIGNMENT (tmode)) align = GET_MODE_ALIGNMENT (tmode); -- cgit v1.1 From da9e6e63d1ae22e530ec7baf59f6ed028bf05776 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Mon, 3 May 2021 22:48:47 -0300 Subject: introduce try store by multiple pieces The ldist pass turns even very short loops into memset calls. E.g., the TFmode emulation calls end with a loop of up to 3 iterations, to zero out trailing words, and the loop distribution pass turns them into calls of the memset builtin. Though short constant-length clearing memsets are usually dealt with efficiently, for non-constant-length ones, the options are setmemM or a function call. RISC-V doesn't have any setmemM pattern, so the loops above end up "optimized" into memset calls, incurring not only the overhead of an explicit call, but also discarding the information the compiler has about the alignment of the destination, and that the length is a multiple of the word alignment. This patch handles variable lengths with multiple conditional power-of-2-constant-sized stores-by-pieces, so as to reduce the overhead of length compares. It also changes the last copy-prop pass into ccp, so that pointer alignment and length's nonzero bits are detected and made available for the expander, even for ldist-introduced SSA_NAMEs. for gcc/ChangeLog * builtins.c (try_store_by_multiple_pieces): New. (expand_builtin_memset_args): Use it. If target_char_cast fails, proceed as for non-constant val.
Pass len's ctz to... * expr.c (clear_storage_hints): ... this. Try store by multiple pieces after setmem. (clear_storage): Adjust. * expr.h (clear_storage_hints): Likewise. (try_store_by_multiple_pieces): Declare. * passes.def: Replace the last copy_prop with ccp. --- gcc/expr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index b4c110f..1b65f6b 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -3150,7 +3150,8 @@ clear_storage_hints (rtx object, rtx size, enum block_op_methods method, unsigned int expected_align, HOST_WIDE_INT expected_size, unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size, - unsigned HOST_WIDE_INT probable_max_size) + unsigned HOST_WIDE_INT probable_max_size, + unsigned ctz_size) { machine_mode mode = GET_MODE (object); unsigned int align; @@ -3197,6 +3198,10 @@ clear_storage_hints (rtx object, rtx size, enum block_op_methods method, expected_align, expected_size, min_size, max_size, probable_max_size)) ; + else if (try_store_by_multiple_pieces (object, size, ctz_size, + min_size, max_size, + NULL_RTX, 0, align)) + ; else if (ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (object))) return set_storage_via_libcall (object, size, const0_rtx, method == BLOCK_OP_TAILCALL); @@ -3214,7 +3219,7 @@ clear_storage (rtx object, rtx size, enum block_op_methods method) min = max = UINTVAL (size); else max = GET_MODE_MASK (GET_MODE (size)); - return clear_storage_hints (object, size, method, 0, -1, min, max, max); + return clear_storage_hints (object, size, method, 0, -1, min, max, max, 0); } -- cgit v1.1 From de09c0ddb81e2dc0e35c2e362532e93ca417200c Mon Sep 17 00:00:00 2001 From: Martin Sebor Date: Tue, 11 May 2021 13:58:48 -0600 Subject: Replace unreachable code with an assert. Resolves: PR middle-end/21433 - The COMPONENT_REF case of expand_expr_real_1 is probably wrong gcc/ChangeLog: PR middle-end/21433 * expr.c (expand_expr_real_1): Replace unreachable code with an assert. --- gcc/expr.c | 57 ++------------------------------------------------------- 1 file changed, 2 insertions(+), 55 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 1b65f6b..d09ee42 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -10949,61 +10949,8 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, goto normal_inner_ref; case COMPONENT_REF: - /* If the operand is a CONSTRUCTOR, we can just extract the - appropriate field if it is present. */ - if (TREE_CODE (treeop0) == CONSTRUCTOR) - { - unsigned HOST_WIDE_INT idx; - tree field, value; - scalar_int_mode field_mode; - - FOR_EACH_CONSTRUCTOR_ELT (CONSTRUCTOR_ELTS (treeop0), - idx, field, value) - if (field == treeop1 - /* We can normally use the value of the field in the - CONSTRUCTOR. However, if this is a bitfield in - an integral mode that we can fit in a HOST_WIDE_INT, - we must mask only the number of bits in the bitfield, - since this is done implicitly by the constructor. If - the bitfield does not meet either of those conditions, - we can't do this optimization. */ - && (! 
DECL_BIT_FIELD (field) - || (is_int_mode (DECL_MODE (field), &field_mode) - && (GET_MODE_PRECISION (field_mode) - <= HOST_BITS_PER_WIDE_INT)))) - { - if (DECL_BIT_FIELD (field) - && modifier == EXPAND_STACK_PARM) - target = 0; - op0 = expand_expr (value, target, tmode, modifier); - if (DECL_BIT_FIELD (field)) - { - HOST_WIDE_INT bitsize = TREE_INT_CST_LOW (DECL_SIZE (field)); - scalar_int_mode imode - = SCALAR_INT_TYPE_MODE (TREE_TYPE (field)); - - if (TYPE_UNSIGNED (TREE_TYPE (field))) - { - op1 = gen_int_mode ((HOST_WIDE_INT_1 << bitsize) - 1, - imode); - op0 = expand_and (imode, op0, op1, target); - } - else - { - int count = GET_MODE_PRECISION (imode) - bitsize; - - op0 = expand_shift (LSHIFT_EXPR, imode, op0, count, - target, 0); - op0 = expand_shift (RSHIFT_EXPR, imode, op0, count, - target, 0); - } - } - - return op0; - } - } - goto normal_inner_ref; - + gcc_assert (TREE_CODE (treeop0) != CONSTRUCTOR); + /* Fall through. */ case BIT_FIELD_REF: case ARRAY_RANGE_REF: normal_inner_ref: -- cgit v1.1 From 53fb833d635da04f5b44af16bcea1082e7b59e75 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 21 May 2021 05:16:20 -0700 Subject: Elide expand_constructor if move by pieces is preferred Elide expand_constructor when the constructor is in static storage, is not mostly zeros, and can be moved by pieces, since that's usually more efficient than performing a series of stores from immediates. 2021-05-21 Richard Biener H.J. Lu gcc/ PR middle-end/90773 * expr.c (expand_constructor): Elide expand_constructor if move by pieces is preferred. gcc/testsuite/ * gcc.target/i386/pr90773-24.c: New test. * gcc.target/i386/pr90773-25.c: Likewise. --- gcc/expr.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index d09ee42..ba61eb9 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -8523,6 +8523,19 @@ expand_constructor (tree exp, rtx target, enum expand_modifier modifier, return constructor; } + /* If the CTOR is available in static storage and not mostly + zeros and we can move it by pieces prefer to do so since + that's usually more efficient than performing a series of + stores from immediates. */ + if (avoid_temp_mem + && TREE_STATIC (exp) + && TREE_CONSTANT (exp) + && tree_fits_uhwi_p (TYPE_SIZE_UNIT (type)) + && can_move_by_pieces (tree_to_uhwi (TYPE_SIZE_UNIT (type)), + TYPE_ALIGN (type)) + && ! mostly_zeros_p (exp)) + return NULL_RTX; + /* Handle calls that pass values in multiple non-contiguous locations. The Irix 6 ABI has examples of this. */ if (target == 0 || ! safe_from_p (target, exp, 1) -- cgit v1.1 From 29a2f51806c5b30e17a8d0e9ba7915a3c53c34ff Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Fri, 26 Feb 2021 04:34:49 -0800 Subject: openacc: Add support for gang local storage allocation in shared memory [PR90115] This patch implements a method to track the "private-ness" of OpenACC variables declared in offload regions in gang-partitioned, worker-partitioned or vector-partitioned modes. Variables declared implicitly in scoped blocks and those declared "private" on enclosing directives (e.g. "acc parallel") are both handled. Variables that are e.g. gang-private can then be adjusted so they reside in GPU shared memory. The reason for doing this is twofold: correct implementation of OpenACC semantics, and optimisation, since shared memory might be faster than the main memory on a GPU.
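As an illustration of the kind of code this affects (a hypothetical example, not from the patch): the gang-private variable w below is written in worker-single mode and then read by every partitioned worker of the same gang, so it must live somewhere all workers of a gang can see, such as GPU shared memory:

extern int f (int);

void
kernel (int *out, int n, int m)
{
  int w;
#pragma acc parallel loop gang private(w) copyout(out[0:n * m])
  for (int i = 0; i < n; i++)
    {
      w = f (i);                 /* set in worker-single mode */
#pragma acc loop worker
      for (int j = 0; j < m; j++)
        out[i * m + j] = w + j;  /* used in worker-partitioned mode */
    }
}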
Handling of private variables is intimately tied to the execution model for gangs/workers/vectors implemented by a particular target: for current targets, we use (or on mainline, will soon use) a broadcasting/neutering scheme. That is sufficient for code that e.g. sets a variable in worker-single mode and expects to use the value in worker-partitioned mode. The difficulty (semantics-wise) comes when the user wants to do something like an atomic operation in worker-partitioned mode and expects a worker-single (gang private) variable to be shared across each partitioned worker. Forcing use of shared memory for such variables makes that work properly. In terms of implementation, the parallelism level of a given loop is not fixed until the oaccdevlow pass in the offload compiler, so the patch delays fixing the parallelism level of variables declared on or within such loops until the same point. This is done by adding a new internal UNIQUE function (OACC_PRIVATE) that lists (the address of) each private variable as an argument, and other arguments set so as to be able to determine the correct parallelism level to use for the listed variables. This new internal function fits into the existing scheme for demarcating OpenACC loops, as described in comments in the patch. Two new target hooks are introduced: TARGET_GOACC_ADJUST_PRIVATE_DECL and TARGET_GOACC_EXPAND_VAR_DECL. The first can tweak a variable declaration at oaccdevlow time, and the second at expand time. The first or both of these target hooks can be used by a given offload target, depending on its strategy for implementing private variables. This patch updates the TARGET_GOACC_ADJUST_PRIVATE_DECL target hook in the AMD GCN backend to the current name and prototype. (An earlier version of the hook was already present, but dormant.) gcc/ PR middle-end/90115 * doc/tm.texi.in (TARGET_GOACC_EXPAND_VAR_DECL) (TARGET_GOACC_ADJUST_PRIVATE_DECL): Add documentation hooks. * doc/tm.texi: Regenerate. * expr.c (expand_expr_real_1): Expand decls using the expand_var_decl OpenACC hook if defined. * internal-fn.c (expand_UNIQUE): Handle IFN_UNIQUE_OACC_PRIVATE. * internal-fn.h (IFN_UNIQUE_CODES): Add OACC_PRIVATE. * omp-low.c (omp_context): Add oacc_privatization_candidates field. (lower_oacc_reductions): Add PRIVATE_MARKER parameter. Insert before fork. (lower_oacc_head_tail): Add PRIVATE_MARKER parameter. Modify private marker's gimple call arguments, and pass it to lower_oacc_reductions. (oacc_privatization_scan_clause_chain) (oacc_privatization_scan_decl_chain, lower_oacc_private_marker): New functions. (lower_omp_for, lower_omp_target, lower_omp_1): Use these. * omp-offload.c (convert.h): Include. (oacc_loop_xform_head_tail): Treat private-variable markers like fork/join when transforming head/tail sequences. (struct var_decl_rewrite_info): Add struct. (oacc_rewrite_var_decl, is_sync_builtin_call): New functions. (execute_oacc_device_lower): Support rewriting gang-private variables using target hook, and fix up addr_expr and var_decl nodes afterwards. * target.def (adjust_private_decl, expand_var_decl): New hooks. * config/gcn/gcn-protos.h (gcn_goacc_adjust_gangprivate_decl): Rename to... (gcn_goacc_adjust_private_decl): ...this. * config/gcn/gcn-tree.c (gcn_goacc_adjust_gangprivate_decl): Rename to... (gcn_goacc_adjust_private_decl): ...this. Add LEVEL parameter. * config/gcn/gcn.c (TARGET_GOACC_ADJUST_GANGPRIVATE_DECL): Rename definition using gcn_goacc_adjust_gangprivate_decl... 
(TARGET_GOACC_ADJUST_PRIVATE_DECL): ...to this, using gcn_goacc_adjust_private_decl. * config/nvptx/nvptx.c (tree-pretty-print.h): Include. (gang_private_shared_size): New global variable. (gang_private_shared_align): Likewise. (gang_private_shared_sym): Likewise. (gang_private_shared_hmap): Likewise. (nvptx_option_override): Initialize these. (nvptx_file_end): Output gang_private_shared_sym. (nvptx_goacc_adjust_private_decl, nvptx_goacc_expand_var_decl): New functions. (nvptx_set_current_function): Clear gang_private_shared_hmap. (TARGET_GOACC_ADJUST_PRIVATE_DECL): Define hook. (TARGET_GOACC_EXPAND_VAR_DECL): Likewise. libgomp/ PR middle-end/90115 * testsuite/libgomp.oacc-c-c++-common/private-atomic-1-gang.c: New test. * testsuite/libgomp.oacc-fortran/private-atomic-1-gang.f90: Likewise. * testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90: Likewise. Co-Authored-By: Chung-Lin Tang Co-Authored-By: Thomas Schwinge --- gcc/expr.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index ba61eb9..e4660f0 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -10419,8 +10419,19 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, exp = SSA_NAME_VAR (ssa_name); goto expand_decl_rtl; - case PARM_DECL: case VAR_DECL: + /* Allow accel compiler to handle variables that require special + treatment, e.g. if they have been modified in some way earlier in + compilation by the adjust_private_decl OpenACC hook. */ + if (flag_openacc && targetm.goacc.expand_var_decl) + { + temp = targetm.goacc.expand_var_decl (exp); + if (temp) + return temp; + } + /* ... fall through ... */ + + case PARM_DECL: /* If a static var's type was incomplete when the decl was written, but the type is complete now, lay out the decl now. */ if (DECL_SIZE (exp) == 0 -- cgit v1.1 From 008153c8435ca3bf587e11654c31f05c0f99b43a Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 15 Jun 2021 11:36:47 +0200 Subject: expr: Fix up VEC_PACK_TRUNC_EXPR expansion [PR101046] The following testcase ICEs, because we have a mode mismatch. VEC_PACK_TRUNC_EXPR's operands have different modes from the result (same vector mode size but twice as large element), but we were passing non-NULL subtarget with the mode of the result to the expansion of its arguments, so the VEC_PERM_EXPR in one of the operands which had V8SImode operands and result had V16HImode target. Fixed by clearing the subtarget if we are changing mode. 2021-06-15 Jakub Jelinek PR target/101046 * expr.c (expand_expr_real_2) : Clear subtarget when changing mode. * gcc.target/i386/pr101046.c: New test. --- gcc/expr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index e4660f0..231487f 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -9998,6 +9998,7 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, case VEC_PACK_SAT_EXPR: case VEC_PACK_FIX_TRUNC_EXPR: mode = TYPE_MODE (TREE_TYPE (treeop0)); + subtarget = NULL_RTX; goto binop; case VEC_PACK_TRUNC_EXPR: @@ -10021,6 +10022,7 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, return eops[0].value; } mode = TYPE_MODE (TREE_TYPE (treeop0)); + subtarget = NULL_RTX; goto binop; case VEC_PACK_FLOAT_EXPR: -- cgit v1.1 From 967b46530234b4e6ad3983057705aea6c20a03c4 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 21 May 2021 11:56:55 -0700 Subject: Add a target calls hook: TARGET_PUSH_ARGUMENT 1. 
Replace PUSH_ARGS with a target calls hook, TARGET_PUSH_ARGUMENT, which takes an integer argument. When it returns true, push instructions will be used to pass outgoing arguments. If the argument is nonzero, it is the number of bytes to push and indicates the PUSH instruction usage is optional so that the backend can decide if PUSH instructions should be generated. Otherwise, the argument is zero. 2. Implement x86 target hook which returns false when the number of bytes to push is no less than 16 (8 for 32-bit targets) if vector load and store can be used. 3. Remove target PUSH_ARGS definitions which return 0 as it is the same as the default. 4. Define TARGET_PUSH_ARGUMENT of cr16 and m32c to always return true. gcc/ PR target/100704 * calls.c (expand_call): Replace PUSH_ARGS with targetm.calls.push_argument (0). (emit_library_call_value_1): Likewise. * defaults.h (PUSH_ARGS): Removed. (PUSH_ARGS_REVERSED): Replace PUSH_ARGS with targetm.calls.push_argument (0). * expr.c (block_move_libcall_safe_for_call_parm): Likewise. (emit_push_insn): Pass the number bytes to push to targetm.calls.push_argument and pass 0 if ARGS_ADDR is 0. * hooks.c (hook_bool_uint_true): New. * hooks.h (hook_bool_uint_true): Likewise. * rtlanal.c (nonzero_bits1): Replace PUSH_ARGS with targetm.calls.push_argument (0). * target.def (push_argument): Add a targetm.calls hook. * targhooks.c (default_push_argument): New. * targhooks.h (default_push_argument): Likewise. * config/bpf/bpf.h (PUSH_ARGS): Removed. * config/cr16/cr16.c (TARGET_PUSH_ARGUMENT): New. * config/cr16/cr16.h (PUSH_ARGS): Removed. * config/i386/i386.c (ix86_push_argument): New. (TARGET_PUSH_ARGUMENT): Likewise. * config/i386/i386.h (PUSH_ARGS): Removed. * config/m32c/m32c.c (TARGET_PUSH_ARGUMENT): New. * config/m32c/m32c.h (PUSH_ARGS): Removed. * config/nios2/nios2.h (PUSH_ARGS): Likewise. * config/pru/pru.h (PUSH_ARGS): Likewise. * doc/tm.texi.in: Remove PUSH_ARGS documentation. Add TARGET_PUSH_ARGUMENT hook. * doc/tm.texi: Regenerated. gcc/testsuite/ PR target/100704 * gcc.target/i386/pr100704-1.c: New test. * gcc.target/i386/pr100704-2.c: Likewise. * gcc.target/i386/pr100704-3.c: Likewise. --- gcc/expr.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 231487f..025033c 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -1823,7 +1823,7 @@ block_move_libcall_safe_for_call_parm (void) tree fn; /* If arguments are pushed on the stack, then they're safe. */ - if (PUSH_ARGS) + if (targetm.calls.push_argument (0)) return true; /* If registers go on the stack anyway, any argument is sure to clobber @@ -4639,11 +4639,19 @@ emit_push_insn (rtx x, machine_mode mode, tree type, rtx size, skip = (reg_parm_stack_space == 0) ? 0 : used; #ifdef PUSH_ROUNDING + /* NB: Let the backend known the number of bytes to push and + decide if push insns should be generated. */ + unsigned int push_size; + if (CONST_INT_P (size)) + push_size = INTVAL (size); + else + push_size = 0; + /* Do it with several push insns if that doesn't take lots of insns and if there is no difficulty with push insns that skip bytes on the stack for alignment purposes. 
*/ if (args_addr == 0 - && PUSH_ARGS + && targetm.calls.push_argument (push_size) && CONST_INT_P (size) && skip == 0 && MEM_ALIGN (xinner) >= align @@ -4848,7 +4856,7 @@ emit_push_insn (rtx x, machine_mode mode, tree type, rtx size, anti_adjust_stack (gen_int_mode (extra, Pmode)); #ifdef PUSH_ROUNDING - if (args_addr == 0 && PUSH_ARGS) + if (args_addr == 0 && targetm.calls.push_argument (0)) emit_single_push_insn (mode, x, type); else #endif -- cgit v1.1 From 52c3fdf3e4780f75297515d3c2a3dae9b36586ba Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 2 Jul 2021 10:03:48 -0700 Subject: Don't use vec_duplicate on vector in CTOR expansion Since vec_duplicate only works on scalar, don't use it on vector in store constructor expansion. gcc/ PR middle-end/101294 * expr.c (store_constructor): Don't use vec_duplicate on vector. gcc/testsuite/ PR middle-end/101294 * gcc.dg/pr101294.c: New test. --- gcc/expr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 025033c..bd85bbf 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -7078,7 +7078,8 @@ store_constructor (tree exp, rtx target, int cleared, poly_int64 size, && eltmode == GET_MODE_INNER (mode) && ((icode = optab_handler (vec_duplicate_optab, mode)) != CODE_FOR_nothing) - && (elt = uniform_vector_p (exp))) + && (elt = uniform_vector_p (exp)) + && !VECTOR_TYPE_P (TREE_TYPE (elt))) { class expand_operand ops[2]; create_output_operand (&ops[0], target, mode); -- cgit v1.1 From 6d3bab5d5adb3e28ddb16c97b0831efdea23cf7d Mon Sep 17 00:00:00 2001 From: Martin Sebor Date: Tue, 6 Jul 2021 13:41:02 -0600 Subject: Improve warning suppression for inlined functions. Resolves: PR middle-end/98871 - Cannot silence -Wmaybe-uninitialized at declaration site PR middle-end/98512 - #pragma GCC diagnostic ignored ineffective in conjunction with alias attribute gcc/ChangeLog: * builtins.c (warn_string_no_nul): Remove %G. (maybe_warn_for_bound): Same. (warn_for_access): Same. (check_access): Same. (check_strncat_sizes): Same. (expand_builtin_strncat): Same. (expand_builtin_strncmp): Same. (expand_builtin): Same. (expand_builtin_object_size): Same. (warn_dealloc_offset): Same. (maybe_emit_free_warning): Same. * calls.c (maybe_warn_alloc_args_overflow): Same. (maybe_warn_nonstring_arg): Same. (maybe_warn_rdwr_sizes): Same. * expr.c (expand_expr_real_1): Remove %K. * gimple-fold.c (gimple_fold_builtin_strncpy): Remove %G. (gimple_fold_builtin_strncat): Same. * gimple-ssa-sprintf.c (format_directive): Same. (handle_printf_call): Same. * gimple-ssa-warn-alloca.c (pass_walloca::execute): Same. * gimple-ssa-warn-restrict.c (maybe_diag_overlap): Same. (maybe_diag_access_bounds): Same. Call gimple_location. (check_bounds_or_overlap): Same. * trans-mem.c (ipa_tm_scan_irr_block): Remove %K. Simplify. * tree-ssa-ccp.c (pass_post_ipa_warn::execute): Remove %G. * tree-ssa-strlen.c (maybe_warn_overflow): Same. (maybe_diag_stxncpy_trunc): Same. (handle_builtin_stxncpy_strncat): Same. (maybe_warn_pointless_strcmp): Same. * tree-ssa-uninit.c (maybe_warn_operand): Same. gcc/testsuite/ChangeLog: * gcc.dg/Wobjsize-1.c: Prune expected output. * gcc.dg/Warray-bounds-71.c: New test. * gcc.dg/Warray-bounds-71.h: New test header. * gcc.dg/Warray-bounds-72.c: New test. * gcc.dg/Warray-bounds-73.c: New test. * gcc.dg/Warray-bounds-74.c: New test. * gcc.dg/Warray-bounds-75.c: New test. * gcc.dg/Wfree-nonheap-object-4.c: Adjust expected output. * gcc.dg/Wfree-nonheap-object-5.c: New test. 
* gcc.dg/Wfree-nonheap-object-6.c: New test. * gcc.dg/pragma-diag-10.c: New test. * gcc.dg/pragma-diag-9.c: New test. * gcc.dg/uninit-suppress_3.c: New test. * gcc.dg/pr79214.c: Xfail tests. * gcc.dg/tree-ssa/builtin-sprintf-warn-27.c: New test. * gcc.dg/format/c90-printf-1.c: Adjust expected output. --- gcc/expr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index bd85bbf..6a43681 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -11402,7 +11402,7 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, /* All valid uses of __builtin_va_arg_pack () are removed during inlining. */ if (CALL_EXPR_VA_ARG_PACK (exp)) - error ("%Kinvalid use of %<__builtin_va_arg_pack ()%>", exp); + error ("invalid use of %<__builtin_va_arg_pack ()%>"); { tree fndecl = get_callee_fndecl (exp), attr; @@ -11414,7 +11414,7 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, DECL_ATTRIBUTES (fndecl))) != NULL) { const char *ident = lang_hooks.decl_printable_name (fndecl, 1); - error ("%Kcall to %qs declared with attribute error: %s", exp, + error ("call to %qs declared with attribute error: %s", identifier_to_locale (ident), TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr)))); } @@ -11426,10 +11426,10 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, DECL_ATTRIBUTES (fndecl))) != NULL) { const char *ident = lang_hooks.decl_printable_name (fndecl, 1); - warning_at (tree_nonartificial_location (exp), + warning_at (EXPR_LOCATION (exp), OPT_Wattribute_warning, - "%Kcall to %qs declared with attribute warning: %s", - exp, identifier_to_locale (ident), + "call to %qs declared with attribute warning: %s", + identifier_to_locale (ident), TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr)))); } -- cgit v1.1 From e5e164effa30fd2b5c5bc3e6883d63889e96d8da Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sun, 6 Mar 2016 06:38:21 -0800 Subject: Add QI vector mode support to by-pieces for memset 1. Replace scalar_int_mode with fixed_size_mode in the by-pieces infrastructure to allow non-integer mode. 2. Rename widest_int_mode_for_size to widest_fixed_size_mode_for_size to return QI vector mode for memset. 3. Add op_by_pieces_d::smallest_fixed_size_mode_for_size to return the smallest integer or QI vector mode. 4. Remove clear_by_pieces_1 and use builtin_memset_read_str in clear_by_pieces to support vector mode broadcast. 5. Add lowpart_subreg_regno, a wrapper around simplify_subreg_regno that uses subreg_lowpart_offset (mode, prev_mode) as the offset. 6. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard scratch register to avoid stack realignment when expanding memset. gcc/ PR middle-end/90773 * builtins.c (builtin_memcpy_read_str): Change the mode argument from scalar_int_mode to fixed_size_mode. (builtin_strncpy_read_str): Likewise. (gen_memset_value_from_prev): New function. (builtin_memset_read_str): Change the mode argument from scalar_int_mode to fixed_size_mode. Use gen_memset_value_from_prev and support CONST_VECTOR. (builtin_memset_gen_str): Likewise. (try_store_by_multiple_pieces): Use by_pieces_constfn to declare constfun. * builtins.h (builtin_strncpy_read_str): Replace scalar_int_mode with fixed_size_mode. (builtin_memset_read_str): Likewise. * expr.c (widest_int_mode_for_size): Renamed to ... (widest_fixed_size_mode_for_size): Add a bool argument to indicate if QI vector mode can be used. (by_pieces_ninsns): Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. 
(pieces_addr::adjust): Change the mode argument from scalar_int_mode to fixed_size_mode. (op_by_pieces_d): Make m_len read-only. Add a bool member, m_qi_vector_mode, to indicate that QI vector mode can be used. (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize m_qi_vector_mode. Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. (op_by_pieces_d::get_usable_mode): Change the mode argument from scalar_int_mode to fixed_size_mode. Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. (op_by_pieces_d::smallest_fixed_size_mode_for_size): New member function to return the smallest integer or QI vector mode. (op_by_pieces_d::run): Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. Call smallest_fixed_size_mode_for_size instead of smallest_int_mode_for_size. (store_by_pieces_d::store_by_pieces_d): Add a bool argument to indicate that QI vector mode can be used and pass it to op_by_pieces_d::op_by_pieces_d. (can_store_by_pieces): Call widest_fixed_size_mode_for_size instead of widest_int_mode_for_size. Pass memsetp to widest_fixed_size_mode_for_size to support QI vector mode. Allow all CONST_VECTORs for memset if vec_duplicate is supported. (store_by_pieces): Pass memsetp to store_by_pieces_d::store_by_pieces_d. (clear_by_pieces_1): Removed. (clear_by_pieces): Replace clear_by_pieces_1 with builtin_memset_read_str and pass true to store_by_pieces_d to support vector mode broadcast. (string_cst_read_str): Change the mode argument from scalar_int_mode to fixed_size_mode. * expr.h (by_pieces_constfn): Change scalar_int_mode to fixed_size_mode. (by_pieces_prev): Likewise. * rtl.h (lowpart_subreg_regno): New. * rtlanal.c (lowpart_subreg_regno): New. A wrapper around simplify_subreg_regno. * target.def (gen_memset_scratch_rtx): New hook. * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. * doc/tm.texi: Regenerated. gcc/testsuite/ * gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of vmovdqu. * gcc.target/i386/pr100865-4b.c: Likewise. --- gcc/expr.c | 172 ++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 120 insertions(+), 52 deletions(-) (limited to 'gcc/expr.c') diff --git a/gcc/expr.c b/gcc/expr.c index 6a43681..b65cfcf 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align) return align; } -/* Return the widest integer mode that is narrower than SIZE bytes. */ +/* Return the widest QI vector, if QI_MODE is true, or integer mode + that is narrower than SIZE bytes. */ -static scalar_int_mode -widest_int_mode_for_size (unsigned int size) +static fixed_size_mode +widest_fixed_size_mode_for_size (unsigned int size, bool qi_vector) { - scalar_int_mode result = NARROWEST_INT_MODE; + fixed_size_mode result = NARROWEST_INT_MODE; gcc_checking_assert (size > 1); + /* Use QI vector only if size is wider than a WORD. 
*/ + if (qi_vector && size > UNITS_PER_WORD) + { + machine_mode mode; + fixed_size_mode candidate; + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) + if (is_a (mode, &candidate) + && GET_MODE_INNER (candidate) == QImode) + { + if (GET_MODE_SIZE (candidate) >= size) + break; + if (optab_handler (vec_duplicate_optab, candidate) + != CODE_FOR_nothing) + result = candidate; + } + + if (result != NARROWEST_INT_MODE) + return result; + } + opt_scalar_int_mode tmode; FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT) if (GET_MODE_SIZE (tmode.require ()) < size) @@ -815,13 +836,14 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, unsigned int max_size, by_pieces_operation op) { unsigned HOST_WIDE_INT n_insns = 0; - scalar_int_mode mode; + fixed_size_mode mode; if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES) { /* NB: Round up L and ALIGN to the widest integer mode for MAX_SIZE. */ - mode = widest_int_mode_for_size (max_size); + mode = widest_fixed_size_mode_for_size (max_size, + op == SET_BY_PIECES); if (optab_handler (mov_optab, mode) != CODE_FOR_nothing) { unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode)); @@ -835,7 +857,8 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, while (max_size > 1 && l > 0) { - mode = widest_int_mode_for_size (max_size); + mode = widest_fixed_size_mode_for_size (max_size, + op == SET_BY_PIECES); enum insn_code icode; unsigned int modesize = GET_MODE_SIZE (mode); @@ -903,8 +926,7 @@ class pieces_addr void *m_cfndata; public: pieces_addr (rtx, bool, by_pieces_constfn, void *); - rtx adjust (scalar_int_mode, HOST_WIDE_INT, - by_pieces_prev * = nullptr); + rtx adjust (fixed_size_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr); void increment_address (HOST_WIDE_INT); void maybe_predec (HOST_WIDE_INT); void maybe_postinc (HOST_WIDE_INT); @@ -1006,7 +1028,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse, but we still modify the MEM's properties. */ rtx -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset, +pieces_addr::adjust (fixed_size_mode mode, HOST_WIDE_INT offset, by_pieces_prev *prev) { if (m_constfn) @@ -1060,11 +1082,14 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size) class op_by_pieces_d { private: - scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int); + fixed_size_mode get_usable_mode (fixed_size_mode, unsigned int); + fixed_size_mode smallest_fixed_size_mode_for_size (unsigned int); protected: pieces_addr m_to, m_from; - unsigned HOST_WIDE_INT m_len; + /* Make m_len read-only so that smallest_fixed_size_mode_for_size can + use it to check the valid mode size. */ + const unsigned HOST_WIDE_INT m_len; HOST_WIDE_INT m_offset; unsigned int m_align; unsigned int m_max_size; @@ -1073,6 +1098,8 @@ class op_by_pieces_d bool m_push; /* True if targetm.overlap_op_by_pieces_p () returns true. */ bool m_overlap_op_by_pieces; + /* True if QI vector mode can be used. */ + bool m_qi_vector_mode; /* Virtual functions, overriden by derived classes for the specific operation. 
@@ -1084,7 +1111,8 @@ class op_by_pieces_d
 
 public:
   op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
-		  unsigned HOST_WIDE_INT, unsigned int, bool);
+		  unsigned HOST_WIDE_INT, unsigned int, bool,
+		  bool = false);
   void run ();
 };
 
@@ -1099,11 +1127,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
				by_pieces_constfn from_cfn,
				void *from_cfn_data,
				unsigned HOST_WIDE_INT len,
-				unsigned int align, bool push)
+				unsigned int align, bool push,
+				bool qi_vector_mode)
   : m_to (to, to_load, NULL, NULL),
     m_from (from, from_load, from_cfn, from_cfn_data),
     m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
-    m_push (push)
+    m_push (push), m_qi_vector_mode (qi_vector_mode)
 {
   int toi = m_to.get_addr_inc ();
   int fromi = m_from.get_addr_inc ();
@@ -1124,7 +1153,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
       if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2)
	{
	  /* Find the mode of the largest comparison.  */
-	  scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
+	  fixed_size_mode mode
+	    = widest_fixed_size_mode_for_size (m_max_size,
+					       m_qi_vector_mode);
 
	  m_from.decide_autoinc (mode, m_reverse, len);
	  m_to.decide_autoinc (mode, m_reverse, len);
@@ -1139,8 +1170,8 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
 /* This function returns the largest usable integer mode for LEN
    bytes whose size is no bigger than size of MODE.  */
 
-scalar_int_mode
-op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
+fixed_size_mode
+op_by_pieces_d::get_usable_mode (fixed_size_mode mode, unsigned int len)
 {
   unsigned int size;
   do
@@ -1148,13 +1179,42 @@ op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
       size = GET_MODE_SIZE (mode);
       if (len >= size && prepare_mode (mode, m_align))
	break;
-      /* NB: widest_int_mode_for_size checks SIZE > 1.  */
-      mode = widest_int_mode_for_size (size);
+      /* widest_fixed_size_mode_for_size checks SIZE > 1.  */
+      mode = widest_fixed_size_mode_for_size (size, m_qi_vector_mode);
     }
   while (1);
   return mode;
 }
 
+/* Return the smallest integer or QI vector mode that is not narrower
+   than SIZE bytes.  */
+
+fixed_size_mode
+op_by_pieces_d::smallest_fixed_size_mode_for_size (unsigned int size)
+{
+  /* Use QI vector only for > size of WORD.  */
+  if (m_qi_vector_mode && size > UNITS_PER_WORD)
+    {
+      machine_mode mode;
+      fixed_size_mode candidate;
+      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
+	if (is_a <fixed_size_mode> (mode, &candidate)
+	    && GET_MODE_INNER (candidate) == QImode)
+	  {
+	    /* Don't return a mode wider than M_LEN.  */
+	    if (GET_MODE_SIZE (candidate) > m_len)
+	      break;
+
+	    if (GET_MODE_SIZE (candidate) >= size
+		&& (optab_handler (vec_duplicate_optab, candidate)
+		    != CODE_FOR_nothing))
+	      return candidate;
+	  }
+    }
+
+  return smallest_int_mode_for_size (size * BITS_PER_UNIT);
+}
+
 /* This function contains the main loop used for expanding a block
    operation.  First move what we can in the largest integer mode,
    then go to successively smaller modes.  For every access, call
@@ -1166,9 +1226,12 @@ op_by_pieces_d::run ()
   if (m_len == 0)
     return;
 
-  /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1.  */
-  scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
-  mode = get_usable_mode (mode, m_len);
+  unsigned HOST_WIDE_INT length = m_len;
+
+  /* widest_fixed_size_mode_for_size checks M_MAX_SIZE > 1.  */
+  fixed_size_mode mode
+    = widest_fixed_size_mode_for_size (m_max_size, m_qi_vector_mode);
+  mode = get_usable_mode (mode, length);
 
   by_pieces_prev to_prev = { nullptr, mode };
   by_pieces_prev from_prev = { nullptr, mode };
@@ -1178,7 +1241,7 @@ op_by_pieces_d::run ()
       unsigned int size = GET_MODE_SIZE (mode);
       rtx to1 = NULL_RTX, from1;
 
-      while (m_len >= size)
+      while (length >= size)
	{
	  if (m_reverse)
	    m_offset -= size;
@@ -1201,22 +1264,22 @@ op_by_pieces_d::run ()
	  if (!m_reverse)
	    m_offset += size;
 
-	  m_len -= size;
+	  length -= size;
	}
 
       finish_mode (mode);
 
-      if (m_len == 0)
+      if (length == 0)
	return;
 
       if (!m_push && m_overlap_op_by_pieces)
	{
	  /* NB: Generate overlapping operations if it is not a stack
	     push since stack push must not overlap.  Get the smallest
-	     integer mode for M_LEN bytes.  */
-	  mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);
+	     fixed size mode for M_LEN bytes.  */
+	  mode = smallest_fixed_size_mode_for_size (length);
	  mode = get_usable_mode (mode, GET_MODE_SIZE (mode));
-	  int gap = GET_MODE_SIZE (mode) - m_len;
+	  int gap = GET_MODE_SIZE (mode) - length;
	  if (gap > 0)
	    {
	      /* If size of MODE > M_LEN, generate the last operation
@@ -1226,20 +1289,21 @@ op_by_pieces_d::run ()
		m_offset += gap;
	      else
		m_offset -= gap;
-	      m_len += gap;
+	      length += gap;
	    }
	}
       else
	{
-	  /* NB: widest_int_mode_for_size checks SIZE > 1.  */
-	  mode = widest_int_mode_for_size (size);
-	  mode = get_usable_mode (mode, m_len);
+	  /* widest_fixed_size_mode_for_size checks SIZE > 1.  */
+	  mode = widest_fixed_size_mode_for_size (size,
+						  m_qi_vector_mode);
+	  mode = get_usable_mode (mode, length);
	}
     }
   while (1);
 
   /* The code above should have handled everything.  */
-  gcc_assert (!m_len);
+  gcc_assert (!length);
 }
 
 /* Derived class from op_by_pieces_d, providing support for block move
@@ -1355,9 +1419,10 @@ class store_by_pieces_d : public op_by_pieces_d
 
 public:
   store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
-		     unsigned HOST_WIDE_INT len, unsigned int align)
+		     unsigned HOST_WIDE_INT len, unsigned int align,
+		     bool qi_vector_mode)
     : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
-		      align, false)
+		      align, false, qi_vector_mode)
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1446,7 +1511,8 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
       max_size = STORE_MAX_PIECES + 1;
       while (max_size > 1 && l > 0)
	{
-	  scalar_int_mode mode = widest_int_mode_for_size (max_size);
+	  fixed_size_mode mode
+	    = widest_fixed_size_mode_for_size (max_size, memsetp);
 
	  icode = optab_handler (mov_optab, mode);
	  if (icode != CODE_FOR_nothing
@@ -1460,7 +1526,11 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
		  offset -= size;
 
		  cst = (*constfun) (constfundata, nullptr, offset, mode);
-		  if (!targetm.legitimate_constant_p (mode, cst))
+		  /* All CONST_VECTORs can be loaded for memset since
+		     vec_duplicate_optab is a precondition to pick a
+		     vector mode for the memset expander.  */
+		  if (!((memsetp && VECTOR_MODE_P (mode))
+			|| targetm.legitimate_constant_p (mode, cst)))
		    return 0;
 
		  if (!reverse)
@@ -1504,7 +1574,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
			    memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
			    optimize_insn_for_speed_p ()));
 
-  store_by_pieces_d data (to, constfun, constfundata, len, align);
+  store_by_pieces_d data (to, constfun, constfundata, len, align,
+			  memsetp);
   data.run ();
 
   if (retmode != RETURN_BEGIN)
@@ -1513,15 +1584,6 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
   return to;
 }
 
-/* Callback routine for clear_by_pieces.
-   Return const0_rtx unconditionally.  */
-
-static rtx
-clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode)
-{
-  return const0_rtx;
-}
-
 /* Generate several move instructions to clear LEN bytes of block TO.  (A MEM
    rtx with BLKmode).  ALIGN is maximum alignment we can assume.  */
 
@@ -1531,7 +1593,10 @@ clear_by_pieces (rtx to, unsigned HOST_WIDE_INT len, unsigned int align)
   if (len == 0)
     return;
 
-  store_by_pieces_d data (to, clear_by_pieces_1, NULL, len, align);
+  /* Use builtin_memset_read_str to support vector mode broadcast.  */
+  char c = 0;
+  store_by_pieces_d data (to, builtin_memset_read_str, &c, len, align,
+			  true);
   data.run ();
 }
 
@@ -5754,7 +5819,7 @@ emit_storent_insn (rtx to, rtx from)
 
 static rtx
 string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
-		     scalar_int_mode mode)
+		     fixed_size_mode mode)
 {
   tree str = (tree) data;
 
@@ -5769,10 +5834,13 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
       size_t l = TREE_STRING_LENGTH (str) - offset;
       memcpy (p, TREE_STRING_POINTER (str) + offset, l);
       memset (p + l, '\0', GET_MODE_SIZE (mode) - l);
-      return c_readstr (p, mode, false);
+      return c_readstr (p, as_a <scalar_int_mode> (mode), false);
     }
 
-  return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false);
+  /* The by-pieces infrastructure does not try to pick a vector mode
+     for storing STRING_CST.  */
+  return c_readstr (TREE_STRING_POINTER (str) + offset,
+		    as_a <scalar_int_mode> (mode), false);
 }
 
 /* Generate code for computing expression EXP,
-- 
cgit v1.1
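Illustration (not from the commits above): a hedged sketch of the
user-visible effect of the QI vector support added by the previous
commit.  With a QImode vector mode available (for example V32QI with
-mavx2, or V64QI with -mavx512bw, on x86_64), a fixed-size memset like
the one below can now be expanded by pieces as a single broadcast
vector store (e.g. vmovdqu8, per the testsuite change above) instead
of a series of scalar integer stores.  The function name, fill value
and size are illustrative assumptions, not taken from the commit.

	/* Compiled with something like -O2 -mavx512bw on x86_64, the
	   32-byte memset below is a SET_BY_PIECES candidate that can
	   now use a QI vector broadcast store.  */
	void
	set_flags (char *p)
	{
	  __builtin_memset (p, 0x55, 32);
	}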
From f2e5d2717d9e249edc5e0d45e49e4f9ef81fc694 Mon Sep 17 00:00:00 2001
From: "H.J. Lu"
Date: Tue, 3 Aug 2021 06:17:22 -0700
Subject: by_pieces: Pass MAX_PIECES to op_by_pieces_d

Pass MAX_PIECES to op_by_pieces_d::op_by_pieces_d for move, store and
compare.

	PR target/101742
	* expr.c (op_by_pieces_d::op_by_pieces_d): Add a max_pieces
	argument to set m_max_size.
	(move_by_pieces_d): Pass MOVE_MAX_PIECES to op_by_pieces_d.
	(store_by_pieces_d): Pass STORE_MAX_PIECES to op_by_pieces_d.
	(compare_by_pieces_d): Pass COMPARE_MAX_PIECES to
	op_by_pieces_d.
---
 gcc/expr.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'gcc/expr.c')

diff --git a/gcc/expr.c b/gcc/expr.c
index b65cfcf..096c031 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1110,8 +1110,8 @@ class op_by_pieces_d
   }
 
 public:
-  op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
-		  unsigned HOST_WIDE_INT, unsigned int, bool,
+  op_by_pieces_d (unsigned int, rtx, bool, rtx, bool, by_pieces_constfn,
+		  void *, unsigned HOST_WIDE_INT, unsigned int, bool,
		  bool = false);
   void run ();
 };
@@ -1120,10 +1120,12 @@ class op_by_pieces_d
    objects named TO and FROM, which are identified as loads or stores
    by TO_LOAD and FROM_LOAD.  If FROM is a load, the optional FROM_CFN
    and its associated FROM_CFN_DATA can be used to replace loads with
-   constant values.  LEN describes the length of the operation.  */
+   constant values.  MAX_PIECES describes the maximum number of bytes
+   at a time which can be moved efficiently.  LEN describes the length
+   of the operation.  */
 
-op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
-				rtx from, bool from_load,
+op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to,
+				bool to_load, rtx from, bool from_load,
				by_pieces_constfn from_cfn,
				void *from_cfn_data,
				unsigned HOST_WIDE_INT len,
@@ -1131,7 +1133,7 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
				bool qi_vector_mode)
   : m_to (to, to_load, NULL, NULL),
     m_from (from, from_load, from_cfn, from_cfn_data),
-    m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
+    m_len (len), m_max_size (max_pieces + 1),
     m_push (push), m_qi_vector_mode (qi_vector_mode)
 {
   int toi = m_to.get_addr_inc ();
@@ -1324,8 +1326,8 @@ class move_by_pieces_d : public op_by_pieces_d
 public:
   move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len,
		    unsigned int align)
-    : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align,
-		      PUSHG_P (to))
+    : op_by_pieces_d (MOVE_MAX_PIECES, to, false, from, true, NULL,
+		      NULL, len, align, PUSHG_P (to))
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1421,8 +1423,8 @@ class store_by_pieces_d : public op_by_pieces_d
   store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
		     unsigned HOST_WIDE_INT len, unsigned int align,
		     bool qi_vector_mode)
-    : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
-		      align, false, qi_vector_mode)
+    : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true, cfn,
+		      cfn_data, len, align, false, qi_vector_mode)
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1618,8 +1620,8 @@ class compare_by_pieces_d : public op_by_pieces_d
   compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn,
		       void *op1_cfn_data, HOST_WIDE_INT len, int align,
		       rtx_code_label *fail_label)
-    : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len,
-		      align, false)
+    : op_by_pieces_d (COMPARE_MAX_PIECES, op0, true, op1, true, op1_cfn,
+		      op1_cfn_data, len, align, false)
   {
     m_fail_label = fail_label;
   }
-- 
cgit v1.1
From cad36f38576a6a781e3c62ab061c68f5b8dab13a Mon Sep 17 00:00:00 2001
From: Roger Sayle
Date: Tue, 31 Aug 2021 11:45:07 +0100
Subject: Preserve SUBREG_PROMOTED_VAR_P on (extend:HI (subreg/s:QI (reg:SI))).

SUBREG_PROMOTED_VAR_P is a mechanism for tracking that a partial subreg
is correctly zero-extended or sign-extended in the parent register.  For
example, the RTL (subreg/s/v:QI (reg/v:SI 23 [ x ]) 0) indicates that the
byte x is zero extended in reg:SI 23, which is useful for optimization.
An example is that zero extending the above QImode value to HImode can
simply use a wider subreg, i.e. (subreg:HI (reg/v:SI 23 [ x ]) 0).

This patch addresses the oversight/missed optimization opportunity that
the new HImode subreg above should retain its SUBREG_PROMOTED_VAR_P
annotation, as its value is guaranteed to be correctly extended in the
SImode parent.  The code below to preserve SUBREG_PROMOTED_VAR_P is
already present in the middle-end (e.g. simplify-rtx.c:7232-7242) but
missing from one or two (precisely three) places that (accidentally)
strip it.

Whilst there I also added another optimization.  If we need to extend
the above QImode value beyond the SImode register holding it, say to
DImode, we can eliminate the SUBREG and simply extend from the SImode
register to DImode.

2021-08-31  Roger Sayle

gcc/ChangeLog
	* expr.c (convert_modes): Preserve SUBREG_PROMOTED_VAR_P when
	creating a (wider) partial subreg from a SUBREG_PROMOTED_VAR_P
	subreg.
	* simplify-rtx.c (simplify_unary_operation_1) [SIGN_EXTEND]:
	Likewise, preserve SUBREG_PROMOTED_VAR_P when creating a (wider)
	partial subreg from a SUBREG_PROMOTED_VAR_P subreg.  Generate
	SIGN_EXTEND of the SUBREG_REG when a subreg would be paradoxical.
	[ZERO_EXTEND]: Likewise, preserve SUBREG_PROMOTED_VAR_P when
	creating a (wider) partial subreg from a SUBREG_PROMOTED_VAR_P
	subreg.  Generate ZERO_EXTEND of the SUBREG_REG when a subreg
	would be paradoxical.
---
 gcc/expr.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'gcc/expr.c')

diff --git a/gcc/expr.c b/gcc/expr.c
index 096c031..5dd98a9 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -688,7 +688,24 @@ convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
       && (GET_MODE_PRECISION (subreg_promoted_mode (x))
	  >= GET_MODE_PRECISION (int_mode))
       && SUBREG_CHECK_PROMOTED_SIGN (x, unsignedp))
-    x = gen_lowpart (int_mode, SUBREG_REG (x));
+    {
+      scalar_int_mode int_orig_mode;
+      machine_mode orig_mode = GET_MODE (x);
+      x = gen_lowpart (int_mode, SUBREG_REG (x));
+
+      /* Preserve SUBREG_PROMOTED_VAR_P if the new mode is wider than
+	 the original mode, but narrower than the inner mode.  */
+      if (GET_CODE (x) == SUBREG
+	  && GET_MODE_PRECISION (subreg_promoted_mode (x))
+	     > GET_MODE_PRECISION (int_mode)
+	  && is_a <scalar_int_mode> (orig_mode, &int_orig_mode)
+	  && GET_MODE_PRECISION (int_mode)
+	     > GET_MODE_PRECISION (int_orig_mode))
+	{
+	  SUBREG_PROMOTED_VAR_P (x) = 1;
+	  SUBREG_PROMOTED_SET (x, unsignedp);
+	}
+    }
 
   if (GET_MODE (x) != VOIDmode)
     oldmode = GET_MODE (x);
-- 
cgit v1.1
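Illustration (not from the commit above, and target-dependent): a
minimal function of the shape the commit message describes.  On a
target whose PROMOTE_MODE widens QImode arguments to SImode, the
incoming byte is known zero-extended in its SImode register, so its
QImode subreg carries SUBREG_PROMOTED_VAR_P; with this change the
wider HImode subreg created for the return-value extension keeps that
annotation, letting later extensions be elided as well.  The function
name is an illustrative assumption.

	/* A QImode value promoted in an SImode register: the QI -> HI
	   zero extension can then be a plain wider subreg.  */
	unsigned short
	widen (unsigned char c)
	{
	  return c;
	}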
From 863d6524f320abe14d6993fec188e787470e69b6 Mon Sep 17 00:00:00 2001
From: Roger Sayle
Date: Tue, 31 Aug 2021 17:25:58 +0100
Subject: [Committed] Fix subreg_promoted_mode breakage on various platforms.

My apologies for the inconvenience.  My recent patch to preserve
SUBREG_PROMOTED_VAR_P on (extend:HI (subreg/s:QI (reg:SI))), and in
other places in the middle-end, has broken the build on several
targets.  The change to convert_modes inadvertently used the same
subreg_promoted_mode idiom for retrieving the mode of a SUBREG_REG as
the existing code just a few lines earlier.  Alas, in the meantime the
original SUBREG gets replaced by one without SUBREG_PROMOTED_VAR_P (the
whole raison d'etre for my patch), and I had not realized/noticed that
subreg_promoted_mode asserts for this.  Evidently, neither the
bootstrap and regression test on x86_64-pc-linux-gnu nor my testing on
nvptx-none hit this particular case.

The logic of this transformation is sound, it's the implementation
that's bitten me.  This patch has been committed, after another "make
bootstrap" on x86_64-pc-linux-gnu (just in case), and
confirmation/pre-approval from Jeff Law that this indeed fixes the
build failures seen on several platforms.

My humble apologies again.

2021-08-31  Roger Sayle

gcc/ChangeLog
	* expr.c (convert_modes): Don't use subreg_promoted_mode on a
	SUBREG if it can't be guaranteed to have SUBREG_PROMOTED_VAR_P
	set.  Instead use the standard (safer) is_a <scalar_int_mode>
	idiom.
---
 gcc/expr.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'gcc/expr.c')

diff --git a/gcc/expr.c b/gcc/expr.c
index 5dd98a9..17f2c2f 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -690,17 +690,20 @@ convert_modes (machine_mode mode, machine_mode oldmode, rtx x, int unsignedp)
       && SUBREG_CHECK_PROMOTED_SIGN (x, unsignedp))
     {
       scalar_int_mode int_orig_mode;
+      scalar_int_mode int_inner_mode;
       machine_mode orig_mode = GET_MODE (x);
       x = gen_lowpart (int_mode, SUBREG_REG (x));
 
       /* Preserve SUBREG_PROMOTED_VAR_P if the new mode is wider than
	 the original mode, but narrower than the inner mode.  */
       if (GET_CODE (x) == SUBREG
-	  && GET_MODE_PRECISION (subreg_promoted_mode (x))
-	     > GET_MODE_PRECISION (int_mode)
	  && is_a <scalar_int_mode> (orig_mode, &int_orig_mode)
	  && GET_MODE_PRECISION (int_mode)
-	     > GET_MODE_PRECISION (int_orig_mode))
+	     > GET_MODE_PRECISION (int_orig_mode)
+	  && is_a <scalar_int_mode> (GET_MODE (SUBREG_REG (x)),
+				     &int_inner_mode)
+	  && GET_MODE_PRECISION (int_inner_mode)
+	     > GET_MODE_PRECISION (int_mode))
	{
	  SUBREG_PROMOTED_VAR_P (x) = 1;
	  SUBREG_PROMOTED_SET (x, unsignedp);
-- 
cgit v1.1

From b195fae7c11b35c8ee551cd302e3daf8e08c78d0 Mon Sep 17 00:00:00 2001
From: Roger Sayle
Date: Sun, 12 Sep 2021 15:18:57 +0100
Subject: Also preserve SUBREG_PROMOTED_VAR_P in expr.c's convert_move.

This patch catches another place in the middle-end where it's possible
to preserve the SUBREG_PROMOTED_VAR_P annotation on a subreg to the
benefit of later RTL optimizations.  This adds the same logic to
expr.c's convert_move as recently added to convert_modes.

On nvptx-none, the simple test program:

short foo (char c) { return c; }

currently generates three instructions:

	mov.u32 %r23, %ar0;
	cvt.u16.u32 %r24, %r23;
	cvt.s32.s16 %value, %r24;

with this patch, we now generate just one:

	mov.u32 %value, %ar0;

This patch should look familiar: it's almost identical to the recent
patch https://gcc.gnu.org/pipermail/gcc-patches/2021-August/578331.html
but with the fix
https://gcc.gnu.org/pipermail/gcc-patches/2021-August/578519.html

2021-09-12  Roger Sayle

gcc/ChangeLog
	* expr.c (convert_move): Preserve SUBREG_PROMOTED_VAR_P when
	creating a (wider) partial subreg from a SUBREG_PROMOTED_VAR_P
	subreg.
---
 gcc/expr.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'gcc/expr.c')

diff --git a/gcc/expr.c b/gcc/expr.c
index 17f2c2f..e0bcbcc 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -236,8 +236,27 @@ convert_move (rtx to, rtx from, int unsignedp)
	  >= GET_MODE_PRECISION (to_int_mode))
       && SUBREG_CHECK_PROMOTED_SIGN (from, unsignedp))
     {
+      scalar_int_mode int_orig_mode;
+      scalar_int_mode int_inner_mode;
+      machine_mode orig_mode = GET_MODE (from);
+
       from = gen_lowpart (to_int_mode, SUBREG_REG (from));
       from_mode = to_int_mode;
+
+      /* Preserve SUBREG_PROMOTED_VAR_P if the new mode is wider than
+	 the original mode, but narrower than the inner mode.  */
+      if (GET_CODE (from) == SUBREG
+	  && is_a <scalar_int_mode> (orig_mode, &int_orig_mode)
+	  && GET_MODE_PRECISION (to_int_mode)
+	     > GET_MODE_PRECISION (int_orig_mode)
+	  && is_a <scalar_int_mode> (GET_MODE (SUBREG_REG (from)),
+				     &int_inner_mode)
+	  && GET_MODE_PRECISION (int_inner_mode)
+	     > GET_MODE_PRECISION (to_int_mode))
+	{
+	  SUBREG_PROMOTED_VAR_P (from) = 1;
+	  SUBREG_PROMOTED_SET (from, unsignedp);
+	}
     }
 
   gcc_assert (GET_CODE (to) != SUBREG || !SUBREG_PROMOTED_VAR_P (to));
-- 
cgit v1.1
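Reading the last two commits together, the same guard now appears in
both convert_modes and convert_move.  As a condensed sketch (the
helper below is hypothetical, not a function in expr.c; the types and
macros are GCC's internal API), the narrowed lowpart keeps
SUBREG_PROMOTED_VAR_P only when the new outer mode is strictly wider
than the original outer mode yet strictly narrower than the promoted
inner register's mode:

	/* Hypothetical summary of the guard used in convert_move and
	   convert_modes: NEW_MODE must lie strictly between the
	   original outer mode and the promoted inner mode.  */
	static bool
	keep_promoted_p (scalar_int_mode new_mode,
			 scalar_int_mode orig_mode,
			 scalar_int_mode inner_mode)
	{
	  return (GET_MODE_PRECISION (new_mode)
		  > GET_MODE_PRECISION (orig_mode)
		  && GET_MODE_PRECISION (inner_mode)
		  > GET_MODE_PRECISION (new_mode));
	}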