diff options
author | Xionghu Luo <luoxhu@linux.ibm.com> | 2020-08-03 22:09:15 -0500 |
---|---|---|
committer | Xionghu Luo <luoxhu@linux.ibm.com> | 2020-08-03 22:09:15 -0500 |
commit | 265d817b1eb4644c7a9613ad6920315d98e2e0a4 (patch) | |
tree | 10b0950642c2dc9f5e0468fcfd882a895396e817 /gcc/dse.c | |
parent | 6a1ad710ad20ef05296013679dd42724865a0396 (diff) | |
download | gcc-265d817b1eb4644c7a9613ad6920315d98e2e0a4.zip gcc-265d817b1eb4644c7a9613ad6920315d98e2e0a4.tar.gz gcc-265d817b1eb4644c7a9613ad6920315d98e2e0a4.tar.bz2 |
dse: Remove partial load after full store for high part access[PR71309]
v5, updated per review comments:
1. Move const_rhs out of loop;
2. Iterate from int size for read_mode.
This patch could optimize (works for char/short/int/void*):
6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]
=>
6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8
Final ASM will be as below, without a partial load after a full store (stxv+ld):
ld 10,16(3)
mr 9,3
ld 3,24(3)
std 10,0(9)
std 3,8(9)
blr
It could achieve ~25% performance improvement for typical cases on
Power9. Bootstrap and regression tested on Power9-LE.
For AArch64, one ldr is replaced by mov with this patch:
ldp x2, x3, [x0, 16]
stp x2, x3, [x0]
ldr x0, [x0, 8]
=>
mov x1, x0
ldp x2, x0, [x0, 16]
stp x2, x0, [x1]
gcc/ChangeLog:
2020-08-04 Xionghu Luo <luoxhu@linux.ibm.com>
PR rtl-optimization/71309
* dse.c (find_shift_sequence): Use subreg of shifted from high part
register to avoid loading from address.
gcc/testsuite/ChangeLog:
2020-08-04 Xionghu Luo <luoxhu@linux.ibm.com>
PR rtl-optimization/71309
* gcc.target/powerpc/pr71309.c: New test.
Diffstat (limited to 'gcc/dse.c')
-rw-r--r-- | gcc/dse.c | 78 |
1 files changed, 49 insertions, 29 deletions
@@ -1720,6 +1720,35 @@ find_shift_sequence (poly_int64 access_size, scalar_int_mode new_mode; rtx read_reg = NULL; + /* If a constant was stored into memory, try to simplify it here, + otherwise the cost of the shift might preclude this optimization + e.g. at -Os, even when no actual shift will be needed. */ + if (store_info->const_rhs) + { + auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT); + auto byte = subreg_lowpart_offset (new_mode, store_mode); + rtx ret + = simplify_subreg (new_mode, store_info->const_rhs, store_mode, byte); + if (ret && CONSTANT_P (ret)) + { + rtx shift_rtx = gen_int_shift_amount (new_mode, shift); + ret = simplify_const_binary_operation (LSHIFTRT, new_mode, ret, + shift_rtx); + if (ret && CONSTANT_P (ret)) + { + byte = subreg_lowpart_offset (read_mode, new_mode); + ret = simplify_subreg (read_mode, ret, new_mode, byte); + if (ret && CONSTANT_P (ret) + && (set_src_cost (ret, read_mode, speed) + <= COSTS_N_INSNS (1))) + return ret; + } + } + } + + if (require_cst) + return NULL_RTX; + /* Some machines like the x86 have shift insns for each size of operand. Other machines like the ppc or the ia-64 may only have shift insns that shift values within 32 or 64 bit registers. @@ -1729,7 +1758,7 @@ find_shift_sequence (poly_int64 access_size, opt_scalar_int_mode new_mode_iter; FOR_EACH_MODE_FROM (new_mode_iter, - smallest_int_mode_for_size (access_size * BITS_PER_UNIT)) + smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode))) { rtx target, new_reg, new_lhs; rtx_insn *shift_seq, *insn; @@ -1739,34 +1768,6 @@ find_shift_sequence (poly_int64 access_size, if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD) break; - /* If a constant was stored into memory, try to simplify it here, - otherwise the cost of the shift might preclude this optimization - e.g. at -Os, even when no actual shift will be needed. 
*/ - if (store_info->const_rhs) - { - poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode); - rtx ret = simplify_subreg (new_mode, store_info->const_rhs, - store_mode, byte); - if (ret && CONSTANT_P (ret)) - { - rtx shift_rtx = gen_int_shift_amount (new_mode, shift); - ret = simplify_const_binary_operation (LSHIFTRT, new_mode, - ret, shift_rtx); - if (ret && CONSTANT_P (ret)) - { - byte = subreg_lowpart_offset (read_mode, new_mode); - ret = simplify_subreg (read_mode, ret, new_mode, byte); - if (ret && CONSTANT_P (ret) - && (set_src_cost (ret, read_mode, speed) - <= COSTS_N_INSNS (1))) - return ret; - } - } - } - - if (require_cst) - return NULL_RTX; - /* Try a wider mode if truncating the store mode to NEW_MODE requires a real instruction. */ if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)) @@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size, && !targetm.modes_tieable_p (new_mode, store_mode)) continue; + if (multiple_p (shift, GET_MODE_BITSIZE (new_mode)) + && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))) + { + /* Try to implement the shift using a subreg. */ + poly_int64 offset + = subreg_offset_from_lsb (new_mode, store_mode, shift); + rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs, + store_mode, offset); + if (rhs_subreg) + { + read_reg + = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg)); + break; + } + } + + if (maybe_lt (GET_MODE_SIZE (new_mode), access_size)) + continue; + new_reg = gen_reg_rtx (new_mode); start_sequence (); |