aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorRoger Sayle <roger@nextmovesoftware.com>2022-03-11 17:46:50 +0000
committerRoger Sayle <roger@nextmovesoftware.com>2022-03-11 17:51:18 +0000
commitc5288df751f9ecd11898dec5f2a7b6b03267f79e (patch)
tree6ba408ab136ef26fb762ca5138c9958f341f1529 /gcc
parent098c538ae8c0c5e281d9191a6b54ffe38b624ef3 (diff)
downloadgcc-c5288df751f9ecd11898dec5f2a7b6b03267f79e.zip
gcc-c5288df751f9ecd11898dec5f2a7b6b03267f79e.tar.gz
gcc-c5288df751f9ecd11898dec5f2a7b6b03267f79e.tar.bz2
PR tree-optimization/98335: Improvements to DSE's compute_trims.
This patch is the main middle-end piece of a fix for PR tree-opt/98335, which is a code-quality regression affecting mainline. The issue occurs in DSE's (dead store elimination's) compute_trims function that determines where a store to memory can be trimmed. In the testcase given in the PR, this function notices that the first byte of a DImode store is dead, and replaces the 8-byte store at (aligned) offset zero, with a 7-byte store at (unaligned) offset one. Most architectures can store a power-of-two bytes (up to a maximum) in a single instruction, so writing 7 bytes requires more instructions than writing 8 bytes. This patch follows Jakub Jelinek's suggestion in comment 5, that compute_trims needs improved heuristics. On x86_64-pc-linux-gnu with -O2 the new test case in the PR goes from: movl $0, -24(%rsp) movabsq $72057594037927935, %rdx movl $0, -21(%rsp) andq -24(%rsp), %rdx movq %rdx, %rax salq $8, %rax movb c(%rip), %al ret to xorl %eax, %eax movb c(%rip), %al ret 2022-03-11 Roger Sayle <roger@nextmovesoftware.com> Richard Biener <rguenther@suse.de> gcc/ChangeLog PR tree-optimization/98335 * builtins.cc (get_object_alignment_2): Export. * builtins.h (get_object_alignment_2): Likewise. * tree-ssa-alias.cc (ao_ref_alignment): New. * tree-ssa-alias.h (ao_ref_alignment): Declare. * tree-ssa-dse.cc (compute_trims): Improve logic deciding whether to align head/tail, writing more bytes but using fewer store insns. gcc/testsuite/ChangeLog PR tree-optimization/98335 * g++.dg/pr98335.C: New test case. * gcc.dg/pr86010.c: New test case. * gcc.dg/pr86010-2.c: New test case.
Diffstat (limited to 'gcc')
-rw-r--r--gcc/builtins.cc2
-rw-r--r--gcc/builtins.h2
-rw-r--r--gcc/testsuite/g++.dg/pr98335.C15
-rw-r--r--gcc/testsuite/gcc.dg/pr86010-2.c22
-rw-r--r--gcc/testsuite/gcc.dg/pr86010.c24
-rw-r--r--gcc/tree-ssa-alias.cc23
-rw-r--r--gcc/tree-ssa-alias.h2
-rw-r--r--gcc/tree-ssa-dse.cc54
8 files changed, 139 insertions, 5 deletions
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index d784a57..4c6c293 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -233,7 +233,7 @@ called_as_built_in (tree node)
If ADDR_P is true we are taking the address of the memory reference EXP
and thus cannot rely on the access taking place. */
-static bool
+bool
get_object_alignment_2 (tree exp, unsigned int *alignp,
unsigned HOST_WIDE_INT *bitposp, bool addr_p)
{
diff --git a/gcc/builtins.h b/gcc/builtins.h
index ea10b4b..5ad830c 100644
--- a/gcc/builtins.h
+++ b/gcc/builtins.h
@@ -52,6 +52,8 @@ extern bool force_folding_builtin_constant_p;
extern bool called_as_built_in (tree);
extern bool get_object_alignment_1 (tree, unsigned int *,
unsigned HOST_WIDE_INT *);
+extern bool get_object_alignment_2 (tree, unsigned int *,
+ unsigned HOST_WIDE_INT *, bool);
extern unsigned int get_object_alignment (tree);
extern bool get_pointer_alignment_1 (tree, unsigned int *,
unsigned HOST_WIDE_INT *);
diff --git a/gcc/testsuite/g++.dg/pr98335.C b/gcc/testsuite/g++.dg/pr98335.C
new file mode 100644
index 0000000..c54f4d9
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr98335.C
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+struct Data {
+ char a;
+ int b;
+};
+
+char c;
+
+Data val(int idx) {
+ return { c }; // { dg-warning "extended initializer" "c++ 98" { target { c++98_only } } }
+}
+
+/* { dg-final { scan-tree-dump-not " + 1B] = {}" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/pr86010-2.c b/gcc/testsuite/gcc.dg/pr86010-2.c
new file mode 100644
index 0000000..4c82e65
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr86010-2.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+void f (void*);
+
+void g (char *a)
+{
+ __builtin_memset (a, 0, 8);
+ __builtin_memset (a, 0, 8);
+
+ f (a);
+}
+
+void h (char *a)
+{
+ __builtin_memset (a, 0, 8);
+ __builtin_memset (a, 0, 7);
+
+ f (a);
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_memset" 2 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/pr86010.c b/gcc/testsuite/gcc.dg/pr86010.c
new file mode 100644
index 0000000..ac27989
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr86010.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+void f (void*);
+
+void g (void)
+{
+ char a[8];
+ __builtin_memset (a, 0, 8);
+ __builtin_memset (a, 0, 8);
+
+ f (a);
+}
+
+void h (void)
+{
+ char a[8];
+ __builtin_memset (a, 0, 8);
+ __builtin_memset (a, 0, 7);
+
+ f (a);
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_memset" 2 "optimized" } } */
diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc
index 3e8d245..50bd47b 100644
--- a/gcc/tree-ssa-alias.cc
+++ b/gcc/tree-ssa-alias.cc
@@ -790,6 +790,29 @@ ao_ref_alias_ptr_type (ao_ref *ref)
return ret;
}
+/* Return the alignment of the access *REF and store it in the *ALIGN
+ and *BITPOS pair. Returns false if no alignment could be determined.
+ See get_object_alignment_2 for details. */
+
+bool
+ao_ref_alignment (ao_ref *ref, unsigned int *align,
+ unsigned HOST_WIDE_INT *bitpos)
+{
+ if (ref->ref)
+ return get_object_alignment_1 (ref->ref, align, bitpos);
+
+ /* When we just have ref->base we cannot use get_object_alignment since
+ that will eventually use the type of the apparent access while for
+ example ao_ref_init_from_ptr_and_range is not careful to adjust that. */
+ *align = BITS_PER_UNIT;
+ HOST_WIDE_INT offset;
+ if (!ref->offset.is_constant (&offset)
+ || !get_object_alignment_2 (ref->base, align, bitpos, true))
+ return false;
+ *bitpos += (unsigned HOST_WIDE_INT)offset * BITS_PER_UNIT;
+ *bitpos = *bitpos & (*align - 1);
+ return true;
+}
/* Init an alias-oracle reference representation from a gimple pointer
PTR a range specified by OFFSET, SIZE and MAX_SIZE under the assumption
diff --git a/gcc/tree-ssa-alias.h b/gcc/tree-ssa-alias.h
index 6ae1a65..dfb2127 100644
--- a/gcc/tree-ssa-alias.h
+++ b/gcc/tree-ssa-alias.h
@@ -119,6 +119,8 @@ extern alias_set_type ao_ref_alias_set (ao_ref *);
extern alias_set_type ao_ref_base_alias_set (ao_ref *);
extern tree ao_ref_alias_ptr_type (ao_ref *);
extern tree ao_ref_base_alias_ptr_type (ao_ref *);
+extern bool ao_ref_alignment (ao_ref *, unsigned int *,
+ unsigned HOST_WIDE_INT *);
extern bool ptr_deref_may_alias_global_p (tree);
extern bool ptr_derefs_may_alias_p (tree, tree);
extern bool ptrs_compare_unequal (tree, tree);
diff --git a/gcc/tree-ssa-dse.cc b/gcc/tree-ssa-dse.cc
index 2b22a61..3beaed3 100644
--- a/gcc/tree-ssa-dse.cc
+++ b/gcc/tree-ssa-dse.cc
@@ -405,10 +405,56 @@ compute_trims (ao_ref *ref, sbitmap live, int *trim_head, int *trim_tail,
int first_live = bitmap_first_set_bit (live);
*trim_head = first_live - first_orig;
- /* If more than a word remains, then make sure to keep the
- starting point at least word aligned. */
- if (last_live - first_live > UNITS_PER_WORD)
- *trim_head &= ~(UNITS_PER_WORD - 1);
+ /* If REF is aligned, try to maintain this alignment if it reduces
+ the number of (power-of-two sized aligned) writes to memory. */
+ unsigned int align_bits;
+ unsigned HOST_WIDE_INT bitpos;
+ if ((*trim_head || *trim_tail)
+ && last_live - first_live >= 2
+ && ao_ref_alignment (ref, &align_bits, &bitpos)
+ && align_bits >= 32
+ && bitpos == 0
+ && align_bits % BITS_PER_UNIT == 0)
+ {
+ unsigned int align_units = align_bits / BITS_PER_UNIT;
+ if (align_units > 16)
+ align_units = 16;
+ while ((first_live | (align_units - 1)) > (unsigned int)last_live)
+ align_units >>= 1;
+
+ if (*trim_head)
+ {
+ unsigned int pos = first_live & (align_units - 1);
+ for (unsigned int i = 1; i <= align_units; i <<= 1)
+ {
+ unsigned int mask = ~(i - 1);
+ unsigned int bytes = align_units - (pos & mask);
+ if (wi::popcount (bytes) <= 1)
+ {
+ *trim_head &= mask;
+ break;
+ }
+ }
+ }
+
+ if (*trim_tail)
+ {
+ unsigned int pos = last_live & (align_units - 1);
+ for (unsigned int i = 1; i <= align_units; i <<= 1)
+ {
+ int mask = i - 1;
+ unsigned int bytes = (pos | mask) + 1;
+ if ((last_live | mask) > (last_live + *trim_tail))
+ break;
+ if (wi::popcount (bytes) <= 1)
+ {
+ unsigned int extra = (last_live | mask) - last_live;
+ *trim_tail -= extra;
+ break;
+ }
+ }
+ }
+ }
if ((*trim_head || *trim_tail)
&& dump_file && (dump_flags & TDF_DETAILS))