aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorAndrew Pinski <quic_apinski@quicinc.com>2024-08-28 15:03:53 -0700
committerAndrew Pinski <quic_apinski@quicinc.com>2024-08-29 08:24:45 -0700
commit215c7e3084ea9ec75dcc803f73c94b36e2751e54 (patch)
tree988851fcedab9e44231ac03976f81d8ae55826d1 /gcc
parentcdd5dd2125ca850aa8599f76bed02509590541ef (diff)
downloadgcc-215c7e3084ea9ec75dcc803f73c94b36e2751e54.zip
gcc-215c7e3084ea9ec75dcc803f73c94b36e2751e54.tar.gz
gcc-215c7e3084ea9ec75dcc803f73c94b36e2751e54.tar.bz2
expand: Allow widening optab when expanding popcount==1 [PR116508]
After adding popcount{qi,hi}2 to the aarch64 backend, I noticed that the expansion for popcount==1 was no longer trying to do the trick of handling popcount==1 as `(arg ^ (arg - 1)) > arg - 1`. The problem is the expansion was using OPTAB_DIRECT, when using OPTAB_WIDEN will allow modes which are smaller than SImode (in the aarch64 case). Note QImode's cost still needs some improvements so part of popcnt-eq-1.c is xfailed. Though there is a check to make sure the costs are compared now. Built and tested on aarch64-linux-gnu. PR middle-end/116508 gcc/ChangeLog: * internal-fn.cc (expand_POPCOUNT): Use OPTAB_WIDEN for PLUS and XOR/AND expansion. gcc/testsuite/ChangeLog: * gcc.target/aarch64/popcnt-eq-1.c: New test. Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
Diffstat (limited to 'gcc')
-rw-r--r--gcc/internal-fn.cc4
-rw-r--r--gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c45
2 files changed, 47 insertions, 2 deletions
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 78997ef..4e33db3 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -5332,11 +5332,11 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
start_sequence ();
rtx op0 = expand_normal (arg);
rtx argm1 = expand_simple_binop (mode, PLUS, op0, constm1_rtx, NULL_RTX,
- 1, OPTAB_DIRECT);
+ 1, OPTAB_WIDEN);
if (argm1 == NULL_RTX)
goto fail;
rtx argxorargm1 = expand_simple_binop (mode, nonzero_arg ? AND : XOR, op0,
- argm1, NULL_RTX, 1, OPTAB_DIRECT);
+ argm1, NULL_RTX, 1, OPTAB_WIDEN);
if (argxorargm1 == NULL_RTX)
goto fail;
rtx cmp;
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c b/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c
new file mode 100644
index 0000000..bb9e2bf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-eq-1.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand-details" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR middle-end/116508 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** h16:
+** sub w([0-9]+), w0, #1
+** eor w([0-9]+), w0, w\1
+** and w([0-9]+), w\1, 65535
+** cmp w\3, w\2, uxth
+** cset w0, cc
+** ret
+*/
+
+/* When expanding popcount == 1, GCC should use
+ `(arg ^ (arg - 1)) > arg - 1`, as that has a lower latency
+ than doing the popcount and then comparing against 1.
+ The popcount/addv can be costly.  */
+unsigned h16 (const unsigned short a) {
+ return __builtin_popcountg (a) == 1; /* 1 iff exactly one bit of A is set.  */
+}
+
+/* unsigned char should also use the same trick.  */
+/* Currently xfailed since the cost model does not correctly take into
+ account the moves between GPRs and vector registers.  */
+/*
+** h8: { xfail *-*-* }
+** sub w([0-9]+), w0, #1
+** eor w([0-9]+), w0, w\1
+** and w([0-9]+), w\1, 255
+** cmp w\3, w\2, uxtb
+** cset w0, cc
+** ret
+*/
+
+
+unsigned h8 (const unsigned char a) {
+ return __builtin_popcountg (a) == 1; /* 1 iff exactly one bit of A is set.  */
+}
+
+/* The popcount == 1 costs should be printed for both h8 and h16.  */
+/* { dg-final { scan-rtl-dump-times "popcount == 1:" 2 "expand"} } */