diff options
author | Aaron Sawdey <acsawdey@linux.vnet.ibm.com> | 2016-10-10 04:42:08 +0000 |
---|---|---|
committer | Aaron Sawdey <acsawdey@gcc.gnu.org> | 2016-10-09 23:42:08 -0500 |
commit | 87b44b83c0aab29fac51f510288032690a6e2886 (patch) | |
tree | 2507e346ef67a5a25f1c0562f4206555a723efe1 /gcc | |
parent | 4815e7d4056e321691cd2da79d20f3f9e5d13914 (diff) | |
download | gcc-87b44b83c0aab29fac51f510288032690a6e2886.zip gcc-87b44b83c0aab29fac51f510288032690a6e2886.tar.gz gcc-87b44b83c0aab29fac51f510288032690a6e2886.tar.bz2 |
rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): Add macro to say we can efficiently handle overlapping unaligned loads.
2016-10-09 Aaron Sawdey <acsawdey@linux.vnet.ibm.com>
* config/rs6000/rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED):
Add macro to say we can efficiently handle overlapping unaligned
loads.
* config/rs6000/rs6000.c (expand_block_compare): Avoid generating
poor code for processors older than p8.
From-SVN: r240908
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog | 8 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000.c | 26 | ||||
-rw-r--r-- | gcc/config/rs6000/rs6000.h | 3 |
3 files changed, 33 insertions, 4 deletions
```diff
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a6c2fcd..7842655 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2016-10-09 Aaron Sawdey <acsawdey@linux.vnet.ibm.com>
+
+        * config/rs6000/rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED):
+        Add macro to say we can efficiently handle overlapping unaligned
+        loads.
+        * config/rs6000/rs6000.c (expand_block_compare): Avoid generating
+        poor code for processors older than p8.
+
 2016-10-09 Eric Botcazou <ebotcazou@adacore.com>
 
         * gen-pass-instances.awk: Remove GNUism.
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 49da4b6..8c7ab18 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -18771,6 +18771,14 @@ expand_block_compare (rtx operands[])
   if (bytes <= 0)
     return true;
 
+  /* The code generated for p7 and older is not faster than glibc
+     memcmp if alignment is small and length is not short, so bail
+     out to avoid those conditions. */
+  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
+      && ((base_align == 1 && bytes > 16)
+          || (base_align == 2 && bytes > 32)))
+    return false;
+
   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
 
@@ -18820,13 +18828,18 @@ expand_block_compare (rtx operands[])
   while (bytes > 0)
     {
       int align = compute_current_alignment (base_align, offset);
-      load_mode = select_block_compare_mode(offset, bytes, align, word_mode_ok);
+      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+        load_mode = select_block_compare_mode (offset, bytes, align,
+                                               word_mode_ok);
+      else
+        load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
       load_mode_size = GET_MODE_SIZE (load_mode);
       if (bytes >= load_mode_size)
         cmp_bytes = load_mode_size;
-      else
+      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
         {
-          /* Move this load back so it doesn't go past the end. */
+          /* Move this load back so it doesn't go past the end.
+             P8/P9 can do this efficiently. */
           int extra_bytes = load_mode_size - bytes;
           cmp_bytes = bytes;
           if (extra_bytes < offset)
@@ -18836,7 +18849,12 @@ expand_block_compare (rtx operands[])
               bytes = cmp_bytes;
             }
         }
-
+      else
+        /* P7 and earlier can't do the overlapping load trick fast,
+           so this forces a non-overlapping load and a shift to get
+           rid of the extra bytes. */
+        cmp_bytes = bytes;
+
       src1 = adjust_address (orig_src1, load_mode, offset);
       src2 = adjust_address (orig_src2, load_mode, offset);
 
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index ad58193..f53da15 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -607,6 +607,9 @@ extern int rs6000_vector_align[];
                                  && TARGET_POWERPC64)
 #define TARGET_VEXTRACTUB (TARGET_P9_VECTOR && TARGET_DIRECT_MOVE \
                                  && TARGET_UPPER_REGS_DI && TARGET_POWERPC64)
+/* This wants to be set for p8 and newer. On p7, overlapping unaligned
+   loads are slow. */
+#define TARGET_EFFICIENT_OVERLAPPING_UNALIGNED TARGET_EFFICIENT_UNALIGNED_VSX
 
 /* Byte/char syncs were added as phased in for ISA 2.06B, but are not present
    in power7, so conditionalize them on p8 features. TImode syncs need quad
```