aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorXionghu Luo <luoxhu@linux.ibm.com>2020-08-03 22:09:15 -0500
committerXionghu Luo <luoxhu@linux.ibm.com>2020-08-03 22:09:15 -0500
commit265d817b1eb4644c7a9613ad6920315d98e2e0a4 (patch)
tree10b0950642c2dc9f5e0468fcfd882a895396e817 /gcc
parent6a1ad710ad20ef05296013679dd42724865a0396 (diff)
downloadgcc-265d817b1eb4644c7a9613ad6920315d98e2e0a4.zip
gcc-265d817b1eb4644c7a9613ad6920315d98e2e0a4.tar.gz
gcc-265d817b1eb4644c7a9613ad6920315d98e2e0a4.tar.bz2
dse: Remove partial load after full store for high part access [PR71309]
v5 update as comments:
1. Move const_rhs out of loop;
2. Iterate from int size for read_mode.

This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below, without partial load after full store (stxv+ld):

ld 10,16(3)
mr 9,3
ld 3,24(3)
std 10,0(9)
std 3,8(9)
blr

It could achieve ~25% performance improvement for typical cases on Power9.
Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-08-04  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (find_shift_sequence): Use subreg of shifted from high part
	register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-08-04  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/dse.c78
-rw-r--r--gcc/testsuite/gcc.target/powerpc/pr71309.c34
2 files changed, 83 insertions, 29 deletions
diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e..d65266b 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1720,6 +1720,35 @@ find_shift_sequence (poly_int64 access_size,
scalar_int_mode new_mode;
rtx read_reg = NULL;
+ /* If a constant was stored into memory, try to simplify it here,
+ otherwise the cost of the shift might preclude this optimization
+ e.g. at -Os, even when no actual shift will be needed. */
+ if (store_info->const_rhs)
+ {
+ auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
+ auto byte = subreg_lowpart_offset (new_mode, store_mode);
+ rtx ret
+ = simplify_subreg (new_mode, store_info->const_rhs, store_mode, byte);
+ if (ret && CONSTANT_P (ret))
+ {
+ rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
+ ret = simplify_const_binary_operation (LSHIFTRT, new_mode, ret,
+ shift_rtx);
+ if (ret && CONSTANT_P (ret))
+ {
+ byte = subreg_lowpart_offset (read_mode, new_mode);
+ ret = simplify_subreg (read_mode, ret, new_mode, byte);
+ if (ret && CONSTANT_P (ret)
+ && (set_src_cost (ret, read_mode, speed)
+ <= COSTS_N_INSNS (1)))
+ return ret;
+ }
+ }
+ }
+
+ if (require_cst)
+ return NULL_RTX;
+
/* Some machines like the x86 have shift insns for each size of
operand. Other machines like the ppc or the ia-64 may only have
shift insns that shift values within 32 or 64 bit registers.
@@ -1729,7 +1758,7 @@ find_shift_sequence (poly_int64 access_size,
opt_scalar_int_mode new_mode_iter;
FOR_EACH_MODE_FROM (new_mode_iter,
- smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+ smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode)))
{
rtx target, new_reg, new_lhs;
rtx_insn *shift_seq, *insn;
@@ -1739,34 +1768,6 @@ find_shift_sequence (poly_int64 access_size,
if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
break;
- /* If a constant was stored into memory, try to simplify it here,
- otherwise the cost of the shift might preclude this optimization
- e.g. at -Os, even when no actual shift will be needed. */
- if (store_info->const_rhs)
- {
- poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
- rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
- store_mode, byte);
- if (ret && CONSTANT_P (ret))
- {
- rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
- ret = simplify_const_binary_operation (LSHIFTRT, new_mode,
- ret, shift_rtx);
- if (ret && CONSTANT_P (ret))
- {
- byte = subreg_lowpart_offset (read_mode, new_mode);
- ret = simplify_subreg (read_mode, ret, new_mode, byte);
- if (ret && CONSTANT_P (ret)
- && (set_src_cost (ret, read_mode, speed)
- <= COSTS_N_INSNS (1)))
- return ret;
- }
- }
- }
-
- if (require_cst)
- return NULL_RTX;
-
/* Try a wider mode if truncating the store mode to NEW_MODE
requires a real instruction. */
if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
@@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
&& !targetm.modes_tieable_p (new_mode, store_mode))
continue;
+ if (multiple_p (shift, GET_MODE_BITSIZE (new_mode))
+ && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+ {
+ /* Try to implement the shift using a subreg. */
+ poly_int64 offset
+ = subreg_offset_from_lsb (new_mode, store_mode, shift);
+ rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
+ store_mode, offset);
+ if (rhs_subreg)
+ {
+ read_reg
+ = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+ break;
+ }
+ }
+
+ if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+ continue;
+
new_reg = gen_reg_rtx (new_mode);
start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 0000000..e1cbcea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+ TYPE2 mnt;
+ TYPE dentry;
+};
+
+struct nameidata {
+ struct path path;
+ struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+ TYPE d;
+ TYPE2 d2;
+
+ nd->path = nd->root;
+ d = nd->path.dentry;
+ d2 = nd->path.mnt;
+ return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */