aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorXionghu Luo <luoxhu@linux.ibm.com>2020-08-03 22:09:15 -0500
committerXionghu Luo <luoxhu@linux.ibm.com>2020-08-03 22:09:15 -0500
commit265d817b1eb4644c7a9613ad6920315d98e2e0a4 (patch)
tree10b0950642c2dc9f5e0468fcfd882a895396e817 /gcc
parent6a1ad710ad20ef05296013679dd42724865a0396 (diff)
downloadgcc-265d817b1eb4644c7a9613ad6920315d98e2e0a4.zip
gcc-265d817b1eb4644c7a9613ad6920315d98e2e0a4.tar.gz
gcc-265d817b1eb4644c7a9613ad6920315d98e2e0a4.tar.bz2
dse: Remove partial load after full store for high part access [PR71309]
v5 update as comments:
1. Move const_rhs out of loop;
2. Iterate from int size for read_mode.

This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below, without partial load after full store (stxv+ld):

ld 10,16(3)
mr 9,3
ld 3,24(3)
std 10,0(9)
std 3,8(9)
blr

It could achieve ~25% performance improvement for typical cases on Power9.
Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]

=>

mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]

gcc/ChangeLog:

2020-08-04  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (find_shift_sequence): Use subreg of shifted from high part
	register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-08-04  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/dse.c78
-rw-r--r--gcc/testsuite/gcc.target/powerpc/pr71309.c34
2 files changed, 83 insertions, 29 deletions
diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e..d65266b 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1720,6 +1720,35 @@ find_shift_sequence (poly_int64 access_size,
scalar_int_mode new_mode;
rtx read_reg = NULL;
+ /* If a constant was stored into memory, try to simplify it here,
+ otherwise the cost of the shift might preclude this optimization
+ e.g. at -Os, even when no actual shift will be needed. */
+ if (store_info->const_rhs)
+ {
+ auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
+ auto byte = subreg_lowpart_offset (new_mode, store_mode);
+ rtx ret
+ = simplify_subreg (new_mode, store_info->const_rhs, store_mode, byte);
+ if (ret && CONSTANT_P (ret))
+ {
+ rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
+ ret = simplify_const_binary_operation (LSHIFTRT, new_mode, ret,
+ shift_rtx);
+ if (ret && CONSTANT_P (ret))
+ {
+ byte = subreg_lowpart_offset (read_mode, new_mode);
+ ret = simplify_subreg (read_mode, ret, new_mode, byte);
+ if (ret && CONSTANT_P (ret)
+ && (set_src_cost (ret, read_mode, speed)
+ <= COSTS_N_INSNS (1)))
+ return ret;
+ }
+ }
+ }
+
+ if (require_cst)
+ return NULL_RTX;
+
/* Some machines like the x86 have shift insns for each size of
operand. Other machines like the ppc or the ia-64 may only have
shift insns that shift values within 32 or 64 bit registers.
@@ -1729,7 +1758,7 @@ find_shift_sequence (poly_int64 access_size,
opt_scalar_int_mode new_mode_iter;
FOR_EACH_MODE_FROM (new_mode_iter,
- smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+ smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode)))
{
rtx target, new_reg, new_lhs;
rtx_insn *shift_seq, *insn;
@@ -1739,34 +1768,6 @@ find_shift_sequence (poly_int64 access_size,
if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
break;
- /* If a constant was stored into memory, try to simplify it here,
- otherwise the cost of the shift might preclude this optimization
- e.g. at -Os, even when no actual shift will be needed. */
- if (store_info->const_rhs)
- {
- poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
- rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
- store_mode, byte);
- if (ret && CONSTANT_P (ret))
- {
- rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
- ret = simplify_const_binary_operation (LSHIFTRT, new_mode,
- ret, shift_rtx);
- if (ret && CONSTANT_P (ret))
- {
- byte = subreg_lowpart_offset (read_mode, new_mode);
- ret = simplify_subreg (read_mode, ret, new_mode, byte);
- if (ret && CONSTANT_P (ret)
- && (set_src_cost (ret, read_mode, speed)
- <= COSTS_N_INSNS (1)))
- return ret;
- }
- }
- }
-
- if (require_cst)
- return NULL_RTX;
-
/* Try a wider mode if truncating the store mode to NEW_MODE
requires a real instruction. */
if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
@@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
&& !targetm.modes_tieable_p (new_mode, store_mode))
continue;
+ if (multiple_p (shift, GET_MODE_BITSIZE (new_mode))
+ && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+ {
+ /* Try to implement the shift using a subreg. */
+ poly_int64 offset
+ = subreg_offset_from_lsb (new_mode, store_mode, shift);
+ rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
+ store_mode, offset);
+ if (rhs_subreg)
+ {
+ read_reg
+ = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+ break;
+ }
+ }
+
+ if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+ continue;
+
new_reg = gen_reg_rtx (new_mode);
start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 0000000..e1cbcea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+ TYPE2 mnt;
+ TYPE dentry;
+};
+
+struct nameidata {
+ struct path path;
+ struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+ TYPE d;
+ TYPE2 d2;
+
+ nd->path = nd->root;
+ d = nd->path.dentry;
+ d2 = nd->path.mnt;
+ return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */