diff options
author | Roger Sayle <roger@nextmovesoftware.com> | 2021-07-08 11:46:14 +0100 |
---|---|---|
committer | Roger Sayle <roger@nextmovesoftware.com> | 2021-07-08 11:47:55 +0100 |
commit | 4c619132b3f14dc5e672a7f2f0e09cb784193559 (patch) | |
tree | 63d472f471b0c12d4140ad40b0fc3f53630ae5ec /gcc/match.pd | |
parent | 663a014e77709bfbd4145c605b178169eaf334fc (diff) | |
download | gcc-4c619132b3f14dc5e672a7f2f0e09cb784193559.zip gcc-4c619132b3f14dc5e672a7f2f0e09cb784193559.tar.gz gcc-4c619132b3f14dc5e672a7f2f0e09cb784193559.tar.bz2 |
PR tree-optimization/40210: Fold (bswap(X)>>C1)&C2 to (X>>C3)&C2 in match.pd
All of the optimizations/transformations mentioned in bugzilla for
PR tree-optimization/40210 are already implemented in mainline GCC,
with one exception. In comment #5, there's a suggestion that
(bswap64(x)>>56)&0xff can be implemented without the bswap as
(unsigned char)x, or equivalently x&0xff.
This patch implements the above optimization, and closely related
variants. For any single bit, (bswap(X)>>C1)&1 can be simplified
to (X>>C2)&1, where bit position C2 is the appropriate permutation
of C1. Similarly, the bswap can be eliminated if the desired set of
bits all lie within the same byte, hence (bswap(x)>>8)&255 can
always be optimized, as can (bswap(x)>>8)&123.
Previously,
int foo(long long x) {
return (__builtin_bswap64(x) >> 56) & 0xff;
}
compiled with -O2 to
foo: movq %rdi, %rax
bswap %rax
shrq $56, %rax
ret
with this patch, it now compiles to
foo: movzbl %dil, %eax
ret
2021-07-08 Roger Sayle <roger@nextmovesoftware.com>
Richard Biener <rguenther@suse.de>
gcc/ChangeLog
PR tree-optimization/40210
* match.pd (bswap optimizations): Simplify (bswap(x)>>C1)&C2 as
(x>>C3)&C2 when possible. Simplify bswap(x)>>C1 as ((T)x)>>C2
when possible. Simplify bswap(x)&C1 as (x>>C2)&C1 when 0<=C1<=255.
gcc/testsuite/ChangeLog
PR tree-optimization/40210
* gcc.dg/builtin-bswap-13.c: New test.
* gcc.dg/builtin-bswap-14.c: New test.
Diffstat (limited to 'gcc/match.pd')
-rw-r--r-- | gcc/match.pd | 68 |
1 files changed, 66 insertions, 2 deletions
diff --git a/gcc/match.pd b/gcc/match.pd index 72860fb..334e8cc 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -3610,7 +3610,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (complex (convert:itype @0) (negate (convert:itype @1))))) /* BSWAP simplifications, transforms checked by gcc.dg/builtin-bswap-8.c. */ -(for bswap (BUILT_IN_BSWAP16 BUILT_IN_BSWAP32 BUILT_IN_BSWAP64) +(for bswap (BUILT_IN_BSWAP16 BUILT_IN_BSWAP32 + BUILT_IN_BSWAP64 BUILT_IN_BSWAP128) (simplify (bswap (bswap @0)) @0) @@ -3620,7 +3621,70 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (for bitop (bit_xor bit_ior bit_and) (simplify (bswap (bitop:c (bswap @0) @1)) - (bitop @0 (bswap @1))))) + (bitop @0 (bswap @1)))) + /* (bswap(x) >> C1) & C2 can sometimes be simplified to (x >> C3) & C2. */ + (simplify + (bit_and (convert1? (rshift@0 (convert2? (bswap@4 @1)) INTEGER_CST@2)) + INTEGER_CST@3) + (if (BITS_PER_UNIT == 8 + && tree_fits_uhwi_p (@2) + && tree_fits_uhwi_p (@3)) + (with + { + unsigned HOST_WIDE_INT prec = TYPE_PRECISION (TREE_TYPE (@4)); + unsigned HOST_WIDE_INT bits = tree_to_uhwi (@2); + unsigned HOST_WIDE_INT mask = tree_to_uhwi (@3); + unsigned HOST_WIDE_INT lo = bits & 7; + unsigned HOST_WIDE_INT hi = bits - lo; + } + (if (bits < prec + && mask < (256u>>lo) + && bits < TYPE_PRECISION (TREE_TYPE(@0))) + (with { unsigned HOST_WIDE_INT ns = (prec - (hi + 8)) + lo; } + (if (ns == 0) + (bit_and (convert @1) @3) + (with + { + tree utype = unsigned_type_for (TREE_TYPE (@1)); + tree nst = build_int_cst (integer_type_node, ns); + } + (bit_and (convert (rshift:utype (convert:utype @1) {nst;})) @3)))))))) + /* bswap(x) >> C1 can sometimes be simplified to (T)x >> C2. */ + (simplify + (rshift (convert? 
(bswap@2 @0)) INTEGER_CST@1) + (if (BITS_PER_UNIT == 8 + && CHAR_TYPE_SIZE == 8 + && tree_fits_uhwi_p (@1)) + (with + { + unsigned HOST_WIDE_INT prec = TYPE_PRECISION (TREE_TYPE (@2)); + unsigned HOST_WIDE_INT bits = tree_to_uhwi (@1); + } + (if (bits + 8 == prec) + (if (TYPE_UNSIGNED (type)) + (convert (convert:unsigned_char_type_node @0)) + (convert (convert:signed_char_type_node @0))) + (if (bits < prec && bits + 8 > prec) + (with + { + tree nst = build_int_cst (integer_type_node, bits & 7); + tree bt = TYPE_UNSIGNED (type) ? unsigned_char_type_node + : signed_char_type_node; + } + (convert (rshift:bt (convert:bt @0) {nst;})))))))) + /* bswap(x) & C1 can sometimes be simplified to (x >> C2) & C1. */ + (simplify + (bit_and (convert? (bswap@2 @0)) INTEGER_CST@1) + (if (BITS_PER_UNIT == 8 + && tree_fits_uhwi_p (@1) + && tree_to_uhwi (@1) < 256) + (with + { + unsigned HOST_WIDE_INT prec = TYPE_PRECISION (TREE_TYPE (@2)); + tree utype = unsigned_type_for (TREE_TYPE (@0)); + tree nst = build_int_cst (integer_type_node, prec - 8); + } + (bit_and (convert (rshift:utype (convert:utype @0) {nst;})) @1))))) /* Combine COND_EXPRs and VEC_COND_EXPRs. */ |