diff options
author | Roger Sayle <roger@nextmovesoftware.com> | 2022-03-11 17:46:50 +0000 |
---|---|---|
committer | Roger Sayle <roger@nextmovesoftware.com> | 2022-03-11 17:51:18 +0000 |
commit | c5288df751f9ecd11898dec5f2a7b6b03267f79e (patch) | |
tree | 6ba408ab136ef26fb762ca5138c9958f341f1529 /gcc | |
parent | 098c538ae8c0c5e281d9191a6b54ffe38b624ef3 (diff) | |
download | gcc-c5288df751f9ecd11898dec5f2a7b6b03267f79e.zip gcc-c5288df751f9ecd11898dec5f2a7b6b03267f79e.tar.gz gcc-c5288df751f9ecd11898dec5f2a7b6b03267f79e.tar.bz2 |
PR tree-optimization/98335: Improvements to DSE's compute_trims.
This patch is the main middle-end piece of a fix for PR tree-opt/98335,
which is a code-quality regression affecting mainline. The issue occurs
in DSE's (dead store elimination's) compute_trims function that determines
where a store to memory can be trimmed. In the testcase given in the
PR, this function notices that the first byte of a DImode store is dead,
and replaces the 8-byte store at (aligned) offset zero, with a 7-byte store
at (unaligned) offset one. Most architectures can store a power-of-two
bytes (up to a maximum) in single instruction, so writing 7 bytes requires
more instructions than writing 8 bytes. This patch follows Jakub Jelinek's
suggestion in comment 5, that compute_trims needs improved heuristics.
On x86_64-pc-linux-gnu with -O2 the new test case in the PR goes from:
movl $0, -24(%rsp)
movabsq $72057594037927935, %rdx
movl $0, -21(%rsp)
andq -24(%rsp), %rdx
movq %rdx, %rax
salq $8, %rax
movb c(%rip), %al
ret
to
xorl %eax, %eax
movb c(%rip), %al
ret
2022-03-11 Roger Sayle <roger@nextmovesoftware.com>
Richard Biener <rguenther@suse.de>
gcc/ChangeLog
PR tree-optimization/98335
* builtins.cc (get_object_alignment_2): Export.
* builtins.h (get_object_alignment_2): Likewise.
* tree-ssa-alias.cc (ao_ref_alignment): New.
* tree-ssa-alias.h (ao_ref_alignment): Declare.
* tree-ssa-dse.cc (compute_trims): Improve logic deciding whether
to align head/tail, writing more bytes but using fewer store insns.
gcc/testsuite/ChangeLog
PR tree-optimization/98335
* g++.dg/pr98335.C: New test case.
* gcc.dg/pr86010.c: New test case.
* gcc.dg/pr86010-2.c: New test case.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/builtins.cc | 2 | ||||
-rw-r--r-- | gcc/builtins.h | 2 | ||||
-rw-r--r-- | gcc/testsuite/g++.dg/pr98335.C | 15 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/pr86010-2.c | 22 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/pr86010.c | 24 | ||||
-rw-r--r-- | gcc/tree-ssa-alias.cc | 23 | ||||
-rw-r--r-- | gcc/tree-ssa-alias.h | 2 | ||||
-rw-r--r-- | gcc/tree-ssa-dse.cc | 54 |
8 files changed, 139 insertions, 5 deletions
diff --git a/gcc/builtins.cc b/gcc/builtins.cc index d784a57..4c6c293 100644 --- a/gcc/builtins.cc +++ b/gcc/builtins.cc @@ -233,7 +233,7 @@ called_as_built_in (tree node) If ADDR_P is true we are taking the address of the memory reference EXP and thus cannot rely on the access taking place. */ -static bool +bool get_object_alignment_2 (tree exp, unsigned int *alignp, unsigned HOST_WIDE_INT *bitposp, bool addr_p) { diff --git a/gcc/builtins.h b/gcc/builtins.h index ea10b4b..5ad830c 100644 --- a/gcc/builtins.h +++ b/gcc/builtins.h @@ -52,6 +52,8 @@ extern bool force_folding_builtin_constant_p; extern bool called_as_built_in (tree); extern bool get_object_alignment_1 (tree, unsigned int *, unsigned HOST_WIDE_INT *); +extern bool get_object_alignment_2 (tree, unsigned int *, + unsigned HOST_WIDE_INT *, bool); extern unsigned int get_object_alignment (tree); extern bool get_pointer_alignment_1 (tree, unsigned int *, unsigned HOST_WIDE_INT *); diff --git a/gcc/testsuite/g++.dg/pr98335.C b/gcc/testsuite/g++.dg/pr98335.C new file mode 100644 index 0000000..c54f4d9 --- /dev/null +++ b/gcc/testsuite/g++.dg/pr98335.C @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +struct Data { + char a; + int b; +}; + +char c; + +Data val(int idx) { + return { c }; // { dg-warning "extended initializer" "c++ 98" { target { c++98_only } } } +} + +/* { dg-final { scan-tree-dump-not " + 1B] = {}" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/pr86010-2.c b/gcc/testsuite/gcc.dg/pr86010-2.c new file mode 100644 index 0000000..4c82e65 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr86010-2.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +void f (void*); + +void g (char *a) +{ + __builtin_memset (a, 0, 8); + __builtin_memset (a, 0, 8); + + f (a); +} + +void h (char *a) +{ + __builtin_memset (a, 0, 8); + __builtin_memset (a, 0, 7); + + f (a); +} + +/* { dg-final { scan-tree-dump-times "__builtin_memset" 2 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/pr86010.c b/gcc/testsuite/gcc.dg/pr86010.c new file mode 100644 index 0000000..ac27989 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr86010.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +void f (void*); + +void g (void) +{ + char a[8]; + __builtin_memset (a, 0, 8); + __builtin_memset (a, 0, 8); + + f (a); +} + +void h (void) +{ + char a[8]; + __builtin_memset (a, 0, 8); + __builtin_memset (a, 0, 7); + + f (a); +} + +/* { dg-final { scan-tree-dump-times "__builtin_memset" 2 "optimized" } } */ diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc index 3e8d245..50bd47b 100644 --- a/gcc/tree-ssa-alias.cc +++ b/gcc/tree-ssa-alias.cc @@ -790,6 +790,29 @@ ao_ref_alias_ptr_type (ao_ref *ref) return ret; } +/* Return the alignment of the access *REF and store it in the *ALIGN + and *BITPOS pairs. Returns false if no alignment could be determined. + See get_object_alignment_2 for details. */ + +bool +ao_ref_alignment (ao_ref *ref, unsigned int *align, + unsigned HOST_WIDE_INT *bitpos) +{ + if (ref->ref) + return get_object_alignment_1 (ref->ref, align, bitpos); + + /* When we just have ref->base we cannot use get_object_alignment since + that will eventually use the type of the appearant access while for + example ao_ref_init_from_ptr_and_range is not careful to adjust that. */ + *align = BITS_PER_UNIT; + HOST_WIDE_INT offset; + if (!ref->offset.is_constant (&offset) + || !get_object_alignment_2 (ref->base, align, bitpos, true)) + return false; + *bitpos += (unsigned HOST_WIDE_INT)offset * BITS_PER_UNIT; + *bitpos = *bitpos & (*align - 1); + return true; +} /* Init an alias-oracle reference representation from a gimple pointer PTR a range specified by OFFSET, SIZE and MAX_SIZE under the assumption diff --git a/gcc/tree-ssa-alias.h b/gcc/tree-ssa-alias.h index 6ae1a65..dfb2127 100644 --- a/gcc/tree-ssa-alias.h +++ b/gcc/tree-ssa-alias.h @@ -119,6 +119,8 @@ extern alias_set_type ao_ref_alias_set (ao_ref *); extern alias_set_type ao_ref_base_alias_set (ao_ref *); extern tree ao_ref_alias_ptr_type (ao_ref *); extern tree ao_ref_base_alias_ptr_type (ao_ref *); +extern bool ao_ref_alignment (ao_ref *, unsigned int *, + unsigned HOST_WIDE_INT *); extern bool ptr_deref_may_alias_global_p (tree); extern bool ptr_derefs_may_alias_p (tree, tree); extern bool ptrs_compare_unequal (tree, tree); diff --git a/gcc/tree-ssa-dse.cc b/gcc/tree-ssa-dse.cc index 2b22a61..3beaed3 100644 --- a/gcc/tree-ssa-dse.cc +++ b/gcc/tree-ssa-dse.cc @@ -405,10 +405,56 @@ compute_trims (ao_ref *ref, sbitmap live, int *trim_head, int *trim_tail, int first_live = bitmap_first_set_bit (live); *trim_head = first_live - first_orig; - /* If more than a word remains, then make sure to keep the - starting point at least word aligned. */ - if (last_live - first_live > UNITS_PER_WORD) - *trim_head &= ~(UNITS_PER_WORD - 1); + /* If REF is aligned, try to maintain this alignment if it reduces + the number of (power-of-two sized aligned) writes to memory. */ + unsigned int align_bits; + unsigned HOST_WIDE_INT bitpos; + if ((*trim_head || *trim_tail) + && last_live - first_live >= 2 + && ao_ref_alignment (ref, &align_bits, &bitpos) + && align_bits >= 32 + && bitpos == 0 + && align_bits % BITS_PER_UNIT == 0) + { + unsigned int align_units = align_bits / BITS_PER_UNIT; + if (align_units > 16) + align_units = 16; + while ((first_live | (align_units - 1)) > (unsigned int)last_live) + align_units >>= 1; + + if (*trim_head) + { + unsigned int pos = first_live & (align_units - 1); + for (unsigned int i = 1; i <= align_units; i <<= 1) + { + unsigned int mask = ~(i - 1); + unsigned int bytes = align_units - (pos & mask); + if (wi::popcount (bytes) <= 1) + { + *trim_head &= mask; + break; + } + } + } + + if (*trim_tail) + { + unsigned int pos = last_live & (align_units - 1); + for (unsigned int i = 1; i <= align_units; i <<= 1) + { + int mask = i - 1; + unsigned int bytes = (pos | mask) + 1; + if ((last_live | mask) > (last_live + *trim_tail)) + break; + if (wi::popcount (bytes) <= 1) + { + unsigned int extra = (last_live | mask) - last_live; + *trim_tail -= extra; + break; + } + } + } + } if ((*trim_head || *trim_tail) && dump_file && (dump_flags & TDF_DETAILS)) |