aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
authorPrzemyslaw Wirkus <przemyslaw.wirkus@arm.com>2019-06-12 08:27:59 +0000
committerKyrylo Tkachov <ktkachov@gcc.gnu.org>2019-06-12 08:27:59 +0000
commit84ae721396176b7f6f4b42f8c5caeb1b10012e52 (patch)
tree7ad458ce9ae258f8f1613edb2f9ced98ef1f1644 /gcc
parentd134323ba975f2f62178030c2eeaa2b83bd61117 (diff)
downloadgcc-84ae721396176b7f6f4b42f8c5caeb1b10012e52.zip
gcc-84ae721396176b7f6f4b42f8c5caeb1b10012e52.tar.gz
gcc-84ae721396176b7f6f4b42f8c5caeb1b10012e52.tar.bz2
[arm] Implement usadv16qi and ssadv16qi standard names
This patch implements the usadv16qi and ssadv16qi standard names for arm. The V16QImode variant is important as it is the most commonly used pattern: reducing vectors of bytes into an int. The midend expects the optab to compute the absolute differences of operands 1 and 2 and reduce them while widening along the way up to SImode. So the inputs are V16QImode and the output is V4SImode. I've based my solution on Aarch64 usadv16qi and ssadv16qi standard names current implementation (r260437). This solution emits below sequence of instructions: VABDL.u8 tmp, op1, op2 # op1, op2 lowpart VABAL.u8 tmp, op1, op2 # op1, op2 highpart VPADAL.u16 op3, tmp So, for the code: $ arm-none-linux-gnueabihf-gcc -S -O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard usadv16qi.c -dp #define N 1024 unsigned char pix1[N]; unsigned char pix2[N]; int foo (void) { int i_sum = 0; int i; for (i = 0; i < N; i++) i_sum += __builtin_abs (pix1[i] - pix2[i]); return i_sum; } we now generate on arm: foo: movw r3, #:lower16:pix2 @ 57 [c=4 l=4] *arm_movsi_vfp/3 movt r3, #:upper16:pix2 @ 58 [c=4 l=4] *arm_movt/0 vmov.i32 q9, #0 @ v4si @ 3 [c=4 l=4] *neon_movv4si/2 movw r2, #:lower16:pix1 @ 59 [c=4 l=4] *arm_movsi_vfp/3 movt r2, #:upper16:pix1 @ 60 [c=4 l=4] *arm_movt/0 add r1, r3, #1024 @ 8 [c=4 l=4] *arm_addsi3/4 .L2: vld1.8 {q11}, [r3]! @ 11 [c=8 l=4] *movmisalignv16qi_neon_load vld1.8 {q10}, [r2]! @ 10 [c=8 l=4] *movmisalignv16qi_neon_load cmp r1, r3 @ 21 [c=4 l=4] *arm_cmpsi_insn/2 vabdl.u8 q8, d20, d22 @ 12 [c=8 l=4] neon_vabdluv8qi vabal.u8 q8, d21, d23 @ 15 [c=88 l=4] neon_vabaluv8qi vpadal.u16 q9, q8 @ 16 [c=8 l=4] neon_vpadaluv8hi bne .L2 @ 22 [c=16 l=4] arm_cond_branch vadd.i32 d18, d18, d19 @ 24 [c=120 l=4] quad_halves_plusv4si vpadd.i32 d18, d18, d18 @ 25 [c=8 l=4] neon_vpadd_internalv2si vmov.32 r0, d18[0] @ 30 [c=12 l=4] vec_extractv2sisi/1 instead of: foo: @ args = 0, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 @ link register save eliminated. movw r3, #:lower16:pix1 movt r3, #:upper16:pix1 vmov.i32 q9, #0 @ v4si movw r2, #:lower16:pix2 movt r2, #:upper16:pix2 add r1, r3, #1024 .L2: vld1.8 {q8}, [r3]! vld1.8 {q11}, [r2]! vmovl.u8 q10, d16 cmp r1, r3 vmovl.u8 q8, d17 vmovl.u8 q12, d22 vmovl.u8 q11, d23 vsub.i16 q10, q10, q12 vsub.i16 q8, q8, q11 vabs.s16 q10, q10 vabs.s16 q8, q8 vaddw.s16 q9, q9, d20 vaddw.s16 q9, q9, d21 vaddw.s16 q9, q9, d16 vaddw.s16 q9, q9, d17 bne .L2 vadd.i32 d18, d18, d19 vpadd.i32 d18, d18, d18 vmov.32 r0, d18[0] 2019-06-12 Przemyslaw Wirkus <przemyslaw.wirkus@arm.com> * config/arm/iterators.md (VABAL): New int iterator. * config/arm/neon.md (<sup>sadv16qi): New define_expand. * config/arm/unspecs.md ("unspec"): Define UNSPEC_VABAL_S, UNSPEC_VABAL_U values. * gcc.target/arm/ssadv16qi.c: New test. * gcc.target/arm/usadv16qi.c: Likewise. From-SVN: r272180
Diffstat (limited to 'gcc')
-rw-r--r--gcc/ChangeLog7
-rw-r--r--gcc/config/arm/iterators.md3
-rw-r--r--gcc/config/arm/neon.md26
-rw-r--r--gcc/config/arm/unspecs.md2
-rw-r--r--gcc/testsuite/ChangeLog5
-rw-r--r--gcc/testsuite/gcc.target/arm/ssadv16qi.c29
-rw-r--r--gcc/testsuite/gcc.target/arm/usadv16qi.c29
7 files changed, 101 insertions, 0 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c1d58a1..eb29748 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2019-06-12 Przemyslaw Wirkus <przemyslaw.wirkus@arm.com>
+
+ * config/arm/iterators.md (VABAL): New int iterator.
+ * config/arm/neon.md (<sup>sadv16qi): New define_expand.
+ * config/arm/unspecs.md ("unspec"): Define UNSPEC_VABAL_S, UNSPEC_VABAL_U
+ values.
+
2019-06-12 Martin Liska <mliska@suse.cz>
* value-prof.c (stream_out_histogram_value): Only first value
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index eb07c5b..2462b8c 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -341,6 +341,8 @@
(define_int_iterator VSUBHN [UNSPEC_VSUBHN UNSPEC_VRSUBHN])
+(define_int_iterator VABAL [UNSPEC_VABAL_S UNSPEC_VABAL_U])
+
(define_int_iterator VABD [UNSPEC_VABD_S UNSPEC_VABD_U])
(define_int_iterator VABDL [UNSPEC_VABDL_S UNSPEC_VABDL_U])
@@ -834,6 +836,7 @@
(UNSPEC_VSUBW_S "s") (UNSPEC_VSUBW_U "u")
(UNSPEC_VHSUB_S "s") (UNSPEC_VHSUB_U "u")
(UNSPEC_VQSUB_S "s") (UNSPEC_VQSUB_U "u")
+ (UNSPEC_VABAL_S "s") (UNSPEC_VABAL_U "u")
(UNSPEC_VABD_S "s") (UNSPEC_VABD_U "u")
(UNSPEC_VABDL_S "s") (UNSPEC_VABDL_U "u")
(UNSPEC_VMAX "s") (UNSPEC_VMAX_U "u")
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 726b728..bcf838f 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3256,6 +3256,32 @@
[(set_attr "type" "neon_arith_acc<q>")]
)
+(define_expand "<sup>sadv16qi"
+ [(use (match_operand:V4SI 0 "register_operand"))
+ (unspec:V16QI [(use (match_operand:V16QI 1 "register_operand"))
+ (use (match_operand:V16QI 2 "register_operand"))] VABAL)
+ (use (match_operand:V4SI 3 "register_operand"))]
+ "TARGET_NEON"
+ {
+ rtx reduc = gen_reg_rtx (V8HImode);
+ rtx op1_highpart = gen_reg_rtx (V8QImode);
+ rtx op2_highpart = gen_reg_rtx (V8QImode);
+
+ emit_insn (gen_neon_vabdl<sup>v8qi (reduc,
+ gen_lowpart (V8QImode, operands[1]),
+ gen_lowpart (V8QImode, operands[2])));
+
+ emit_insn (gen_neon_vget_highv16qi (op1_highpart, operands[1]));
+ emit_insn (gen_neon_vget_highv16qi (op2_highpart, operands[2]));
+ emit_insn (gen_neon_vabal<sup>v8qi (reduc, reduc,
+ op1_highpart, op2_highpart));
+ emit_insn (gen_neon_vpadal<sup>v8hi (operands[3], operands[3], reduc));
+
+ emit_move_insn (operands[0], operands[3]);
+ DONE;
+ }
+)
+
(define_insn "neon_v<maxmin><sup><mode>"
[(set (match_operand:VDQIW 0 "s_register_operand" "=w")
(unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 174bcc5..41068ba 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -200,6 +200,8 @@
UNSPEC_SHA256SU1
UNSPEC_VMULLP64
UNSPEC_LOAD_COUNT
+ UNSPEC_VABAL_S
+ UNSPEC_VABAL_U
UNSPEC_VABD_F
UNSPEC_VABD_S
UNSPEC_VABD_U
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 2753571..b6cba15 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2019-06-12 Przemyslaw Wirkus <przemyslaw.wirkus@arm.com>
+
+ * gcc.target/arm/ssadv16qi.c: New test.
+ * gcc.target/arm/usadv16qi.c: Likewise.
+
2019-06-12 Jakub Jelinek <jakub@redhat.com>
PR c/90760
diff --git a/gcc/testsuite/gcc.target/arm/ssadv16qi.c b/gcc/testsuite/gcc.target/arm/ssadv16qi.c
new file mode 100644
index 0000000..dba5ef4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/ssadv16qi.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-require-effective-target arm_fp_ok } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+
+#define N 1024
+
+signed char pix1[N], pix2[N];
+
+int
+foo (void)
+{
+ int i_sum = 0;
+ int i;
+
+ for (i = 0; i < N; i++)
+ i_sum += __builtin_abs (pix1[i] - pix2[i]);
+
+ return i_sum;
+}
+
+/* { dg-final { scan-assembler {\tvabdl\.s8\t} } } */
+/* { dg-final { scan-assembler {\tvabal\.s8\t} } } */
+/* { dg-final { scan-assembler {\tvpadal\.s16\t} } } */
+
+/* { dg-final { scan-assembler-not {\tvmovl} } } */
+/* { dg-final { scan-assembler-not {\tvsub} } } */
+/* { dg-final { scan-assembler-not {\tvabs} } } */
diff --git a/gcc/testsuite/gcc.target/arm/usadv16qi.c b/gcc/testsuite/gcc.target/arm/usadv16qi.c
new file mode 100644
index 0000000..d744bcb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/usadv16qi.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-require-effective-target arm_fp_ok } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-add-options arm_neon } */
+
+#define N 1024
+
+unsigned char pix1[N], pix2[N];
+
+int
+foo (void)
+{
+ int i_sum = 0;
+ int i;
+
+ for (i = 0; i < N; i++)
+ i_sum += __builtin_abs (pix1[i] - pix2[i]);
+
+ return i_sum;
+}
+
+/* { dg-final { scan-assembler {\tvabdl\.u8\t} } } */
+/* { dg-final { scan-assembler {\tvabal\.u8\t} } } */
+/* { dg-final { scan-assembler {\tvpadal\.u16\t} } } */
+
+/* { dg-final { scan-assembler-not {\tvmovl} } } */
+/* { dg-final { scan-assembler-not {\tvsub} } } */
+/* { dg-final { scan-assembler-not {\tvabs} } } */