aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Aaron Sawdey <acsawdey@linux.vnet.ibm.com> 2016-10-10 04:42:08 +0000
committer Aaron Sawdey <acsawdey@gcc.gnu.org> 2016-10-09 23:42:08 -0500
commit 87b44b83c0aab29fac51f510288032690a6e2886 (patch)
tree 2507e346ef67a5a25f1c0562f4206555a723efe1
parent 4815e7d4056e321691cd2da79d20f3f9e5d13914 (diff)
download gcc-87b44b83c0aab29fac51f510288032690a6e2886.zip
gcc-87b44b83c0aab29fac51f510288032690a6e2886.tar.gz
gcc-87b44b83c0aab29fac51f510288032690a6e2886.tar.bz2
rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): Add macro to say we can efficiently handle overlapping unaligned loads.
2016-10-09  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>

	* config/rs6000/rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED):
	Add macro to say we can efficiently handle overlapping unaligned
	loads.
	* config/rs6000/rs6000.c (expand_block_compare): Avoid generating
	poor code for processors older than p8.

From-SVN: r240908
-rw-r--r-- gcc/ChangeLog 8
-rw-r--r-- gcc/config/rs6000/rs6000.c 26
-rw-r--r-- gcc/config/rs6000/rs6000.h 3
3 files changed, 33 insertions, 4 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a6c2fcd..7842655 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2016-10-09 Aaron Sawdey <acsawdey@linux.vnet.ibm.com>
+
+ * config/rs6000/rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED):
+ Add macro to say we can efficiently handle overlapping unaligned
+ loads.
+ * config/rs6000/rs6000.c (expand_block_compare): Avoid generating
+ poor code for processors older than p8.
+
2016-10-09 Eric Botcazou <ebotcazou@adacore.com>
* gen-pass-instances.awk: Remove GNUism.
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 49da4b6..8c7ab18 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -18771,6 +18771,14 @@ expand_block_compare (rtx operands[])
if (bytes <= 0)
return true;
+ /* The code generated for p7 and older is not faster than glibc
+ memcmp if alignment is small and length is not short, so bail
+ out to avoid those conditions. */
+ if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
+ && ((base_align == 1 && bytes > 16)
+ || (base_align == 2 && bytes > 32)))
+ return false;
+
rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
@@ -18820,13 +18828,18 @@ expand_block_compare (rtx operands[])
while (bytes > 0)
{
int align = compute_current_alignment (base_align, offset);
- load_mode = select_block_compare_mode(offset, bytes, align, word_mode_ok);
+ if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+ load_mode = select_block_compare_mode (offset, bytes, align,
+ word_mode_ok);
+ else
+ load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
load_mode_size = GET_MODE_SIZE (load_mode);
if (bytes >= load_mode_size)
cmp_bytes = load_mode_size;
- else
+ else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
{
- /* Move this load back so it doesn't go past the end. */
+ /* Move this load back so it doesn't go past the end.
+ P8/P9 can do this efficiently. */
int extra_bytes = load_mode_size - bytes;
cmp_bytes = bytes;
if (extra_bytes < offset)
@@ -18836,7 +18849,12 @@ expand_block_compare (rtx operands[])
bytes = cmp_bytes;
}
}
-
+ else
+ /* P7 and earlier can't do the overlapping load trick fast,
+ so this forces a non-overlapping load and a shift to get
+ rid of the extra bytes. */
+ cmp_bytes = bytes;
+
src1 = adjust_address (orig_src1, load_mode, offset);
src2 = adjust_address (orig_src2, load_mode, offset);
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index ad58193..f53da15 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -607,6 +607,9 @@ extern int rs6000_vector_align[];
&& TARGET_POWERPC64)
#define TARGET_VEXTRACTUB (TARGET_P9_VECTOR && TARGET_DIRECT_MOVE \
&& TARGET_UPPER_REGS_DI && TARGET_POWERPC64)
+/* This wants to be set for p8 and newer. On p7, overlapping unaligned
+ loads are slow. */
+#define TARGET_EFFICIENT_OVERLAPPING_UNALIGNED TARGET_EFFICIENT_UNALIGNED_VSX
/* Byte/char syncs were added as phased in for ISA 2.06B, but are not present
in power7, so conditionalize them on p8 features. TImode syncs need quad