diff options
author | Andrew Pinski <quic_apinski@quicinc.com> | 2024-08-28 15:03:53 -0700 |
---|---|---|
committer | Andrew Pinski <quic_apinski@quicinc.com> | 2024-08-29 08:24:45 -0700 |
commit | 215c7e3084ea9ec75dcc803f73c94b36e2751e54 (patch) | |
tree | 988851fcedab9e44231ac03976f81d8ae55826d1 /gcc | |
parent | cdd5dd2125ca850aa8599f76bed02509590541ef (diff) | |
download | gcc-215c7e3084ea9ec75dcc803f73c94b36e2751e54.zip gcc-215c7e3084ea9ec75dcc803f73c94b36e2751e54.tar.gz gcc-215c7e3084ea9ec75dcc803f73c94b36e2751e54.tar.bz2 |
expand: Allow widening optab when expanding popcount==1 [PR116508]
After adding popcount{qi,hi}2 to the aarch64 backend, I noticed that
the expansion for popcount==1 was no longer trying to do the trick
of handling popcount==1 as `(arg ^ (arg - 1)) > arg - 1`. The problem
is that the expansion was using OPTAB_DIRECT, whereas using OPTAB_WIDEN
also allows modes which are smaller than SImode (in the aarch64 case).
Note that QImode's cost still needs some improvement, so part of popcnt-eq-1.c
is xfailed, though there is now a check to make sure the costs are compared.
Built and tested on aarch64-linux-gnu.
PR middle-end/116508
gcc/ChangeLog:
* internal-fn.cc (expand_POPCOUNT): Use OPTAB_WIDEN for PLUS and
XOR/AND expansion.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/popcnt-eq-1.c: New test.
Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/internal-fn.cc | 4 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c | 45 |
2 files changed, 47 insertions, 2 deletions
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index 78997ef..4e33db3 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -5332,11 +5332,11 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt) start_sequence (); rtx op0 = expand_normal (arg); rtx argm1 = expand_simple_binop (mode, PLUS, op0, constm1_rtx, NULL_RTX, - 1, OPTAB_DIRECT); + 1, OPTAB_WIDEN); if (argm1 == NULL_RTX) goto fail; rtx argxorargm1 = expand_simple_binop (mode, nonzero_arg ? AND : XOR, op0, - argm1, NULL_RTX, 1, OPTAB_DIRECT); + argm1, NULL_RTX, 1, OPTAB_WIDEN); if (argxorargm1 == NULL_RTX) goto fail; rtx cmp; diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c b/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c new file mode 100644 index 0000000..bb9e2bf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c @@ -0,0 +1,45 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-rtl-expand-details" } */ +/* { dg-final { check-function-bodies "**" "" } } */ +/* PR middle-end/116508 */ + +#pragma GCC target "+nocssc" + +/* +** h16: +** sub w([0-9]+), w0, #1 +** eor w([0-9]+), w0, w\1 +** and w([0-9]+), w\1, 65535 +** cmp w\3, w\2, uxth +** cset w0, cc +** ret +*/ + +/* when expanding popcount == 1, should use + `(arg ^ (arg - 1)) > arg - 1` as that has a lower latency + than doing the popcount then comparing against 1. + The popcount/addv can be costly. */ +unsigned h16 (const unsigned short a) { + return __builtin_popcountg (a) == 1; +} + +/* unsigned char should also do the same trick */ +/* Currently xfailed since the cost does not take into account the + moving between gprs and vector regs correctly. 
*/ +/* +** h8: { xfail *-*-* } +** sub w([0-9]+), w0, #1 +** eor w([0-9]+), w0, w\1 +** and w([0-9]+), w\1, 255 +** cmp w\3, w\2, uxtb +** cset w0, cc +** ret +*/ + + +unsigned h8 (const unsigned char a) { + return __builtin_popcountg (a) == 1; +} + +/* There should be printing out the costs for h8 and h16's popcount == 1 */ +/* { dg-final { scan-rtl-dump-times "popcount == 1:" 2 "expand"} } */ |