aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Tamar Christina <tamar.christina@arm.com> 2022-04-07 08:27:53 +0100
committer Tamar Christina <tamar.christina@arm.com> 2022-04-07 08:27:53 +0100
commit 024edf08959e9c1d5022901e6c4e5cbaa5b6c8d5 (patch)
tree 17513f01d1c11672741ff1243864b6ceacf259f6
parent fdd81afcf18d1a926d81d63cc4525fc9442aa9a5 (diff)
download gcc-024edf08959e9c1d5022901e6c4e5cbaa5b6c8d5.zip
gcc-024edf08959e9c1d5022901e6c4e5cbaa5b6c8d5.tar.gz
gcc-024edf08959e9c1d5022901e6c4e5cbaa5b6c8d5.tar.bz2
AArch64: Fix left fold sum reduction RTL patterns [PR104049]
As the discussion in the PR pointed out, the RTL we have for the REDUC_PLUS patterns is wrong. The UNSPECs are modelled as returning a vector, and then in an expand pattern we emit a vec_select of the 0th element to get the scalar.

This is incorrect, as the instruction itself already returns only a single scalar, and declaring that it returns a vector allows combine to push a subreg into the pattern, which causes reload to make duplicate moves.

This patch corrects this by removing the weird indirection and making the RTL pattern model the correct semantics of the instruction immediately.

gcc/ChangeLog:

PR target/104049
* config/aarch64/aarch64-simd.md (aarch64_reduc_plus_internal<mode>): Fix RTL and rename to...
(reduc_plus_scal_<mode>): ... This.
(reduc_plus_scal_v4sf): Moved.
(aarch64_reduc_plus_internalv2si): Fix RTL and rename to...
(reduc_plus_scal_v2si): ... This.

gcc/testsuite/ChangeLog:

PR target/104049
* gcc.target/aarch64/vadd_reduc-1.c: New test.
* gcc.target/aarch64/vadd_reduc-2.c: New test.
-rw-r--r-- gcc/config/aarch64/aarch64-simd.md 70
-rw-r--r-- gcc/testsuite/gcc.target/aarch64/vadd_reduc-1.c 22
-rw-r--r-- gcc/testsuite/gcc.target/aarch64/vadd_reduc-2.c 34
3 files changed, 84 insertions, 42 deletions
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 1873342..a00e1c6 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3385,20 +3385,6 @@
;; 'across lanes' add.
-(define_expand "reduc_plus_scal_<mode>"
- [(match_operand:<VEL> 0 "register_operand")
- (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand")]
- UNSPEC_ADDV)]
- "TARGET_SIMD"
- {
- rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0);
- rtx scratch = gen_reg_rtx (<MODE>mode);
- emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1]));
- emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
- DONE;
- }
-)
-
(define_insn "aarch64_faddp<mode>"
[(set (match_operand:VHSDF 0 "register_operand" "=w")
(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
@@ -3409,31 +3395,22 @@
[(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
)
-(define_insn "aarch64_reduc_plus_internal<mode>"
- [(set (match_operand:VDQV 0 "register_operand" "=w")
- (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
+(define_insn "reduc_plus_scal_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand" "=w")
+ (unspec:<VEL> [(match_operand:VDQV 1 "register_operand" "w")]
UNSPEC_ADDV))]
"TARGET_SIMD"
"add<VDQV:vp>\\t%<Vetype>0, %1.<Vtype>"
[(set_attr "type" "neon_reduc_add<q>")]
)
-(define_insn "aarch64_<su>addlv<mode>"
- [(set (match_operand:<VWIDE_S> 0 "register_operand" "=w")
- (unspec:<VWIDE_S> [(match_operand:VDQV_L 1 "register_operand" "w")]
- USADDLV))]
- "TARGET_SIMD"
- "<su>addl<vp>\\t%<Vwstype>0<Vwsuf>, %1.<Vtype>"
- [(set_attr "type" "neon_reduc_add<q>")]
-)
-
-(define_insn "aarch64_<su>addlp<mode>"
- [(set (match_operand:<VDBLW> 0 "register_operand" "=w")
- (unspec:<VDBLW> [(match_operand:VDQV_L 1 "register_operand" "w")]
- USADDLP))]
+(define_insn "reduc_plus_scal_v2si"
+ [(set (match_operand:SI 0 "register_operand" "=w")
+ (unspec:SI [(match_operand:V2SI 1 "register_operand" "w")]
+ UNSPEC_ADDV))]
"TARGET_SIMD"
- "<su>addlp\\t%0.<Vwhalf>, %1.<Vtype>"
- [(set_attr "type" "neon_reduc_add<q>")]
+ "addp\\t%0.2s, %1.2s, %1.2s"
+ [(set_attr "type" "neon_reduc_add")]
)
;; ADDV with result zero-extended to SI/DImode (for popcount).
@@ -3447,15 +3424,6 @@
[(set_attr "type" "neon_reduc_add<VDQV_E:q>")]
)
-(define_insn "aarch64_reduc_plus_internalv2si"
- [(set (match_operand:V2SI 0 "register_operand" "=w")
- (unspec:V2SI [(match_operand:V2SI 1 "register_operand" "w")]
- UNSPEC_ADDV))]
- "TARGET_SIMD"
- "addp\\t%0.2s, %1.2s, %1.2s"
- [(set_attr "type" "neon_reduc_add")]
-)
-
(define_insn "reduc_plus_scal_<mode>"
[(set (match_operand:<VEL> 0 "register_operand" "=w")
(unspec:<VEL> [(match_operand:V2F 1 "register_operand" "w")]
@@ -3467,7 +3435,7 @@
(define_expand "reduc_plus_scal_v4sf"
[(set (match_operand:SF 0 "register_operand")
- (unspec:V4SF [(match_operand:V4SF 1 "register_operand")]
+ (unspec:SF [(match_operand:V4SF 1 "register_operand")]
UNSPEC_FADDV))]
"TARGET_SIMD"
{
@@ -3479,6 +3447,24 @@
DONE;
})
+(define_insn "aarch64_<su>addlv<mode>"
+ [(set (match_operand:<VWIDE_S> 0 "register_operand" "=w")
+ (unspec:<VWIDE_S> [(match_operand:VDQV_L 1 "register_operand" "w")]
+ USADDLV))]
+ "TARGET_SIMD"
+ "<su>addl<vp>\\t%<Vwstype>0<Vwsuf>, %1.<Vtype>"
+ [(set_attr "type" "neon_reduc_add<q>")]
+)
+
+(define_insn "aarch64_<su>addlp<mode>"
+ [(set (match_operand:<VDBLW> 0 "register_operand" "=w")
+ (unspec:<VDBLW> [(match_operand:VDQV_L 1 "register_operand" "w")]
+ USADDLP))]
+ "TARGET_SIMD"
+ "<su>addlp\\t%0.<Vwhalf>, %1.<Vtype>"
+ [(set_attr "type" "neon_reduc_add<q>")]
+)
+
(define_insn "clrsb<mode>2"
[(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
(clrsb:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/vadd_reduc-1.c b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-1.c
new file mode 100644
index 0000000..271a1c3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+**bar:
+** ...
+** addv s0, v0.4s
+** fmov w0, s0
+** lsr w1, w0, 16
+** add w0, w1, w0, uxth
+** ret
+*/
+int bar (v4si x)
+{
+ unsigned int sum = vaddvq_s32 (x);
+ return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vadd_reduc-2.c b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-2.c
new file mode 100644
index 0000000..0ad9695
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-2.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+/*
+**test:
+** ...
+** addv s0, v0.4s
+** fmov w0, s0
+** and w1, w0, 65535
+** add w0, w1, w0, lsr 16
+** lsr w0, w0, 1
+** ret
+*/
+int test (uint8_t *p, uint32_t t[1][1], int n) {
+
+ int sum = 0;
+ uint32_t a0;
+ for (int i = 0; i < 4; i++, p++)
+ t[i][0] = p[0];
+
+ for (int i = 0; i < 4; i++) {
+ {
+ int t0 = t[0][i] + t[0][i];
+ a0 = t0;
+ };
+ sum += a0;
+ }
+ return (((uint16_t)sum) + ((uint32_t)sum >> 16)) >> 1;
+}