aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author Alejandro Martinez <alejandro.martinezvicente@arm.com> 2019-05-07 16:34:20 +0000
committer Alejandro Martinez <alejandro@gcc.gnu.org> 2019-05-07 16:34:20 +0000
commit a9fad8fe6c84de272f2a56d462e67d53c9f4a73d (patch)
tree 9f0ff9561477c22ee099d09330ce742f5830d9a3 /gcc
parent 0a59215131c02dee4c8829f93d1ee678647614da (diff)
download gcc-a9fad8fe6c84de272f2a56d462e67d53c9f4a73d.zip
gcc-a9fad8fe6c84de272f2a56d462e67d53c9f4a73d.tar.gz
gcc-a9fad8fe6c84de272f2a56d462e67d53c9f4a73d.tar.bz2
This patch adds support to vectorize sum of absolute differences (SAD_EXPR)
using SVE.

Given this input code:

int
sum_abs (uint8_t *restrict x, uint8_t *restrict y, int n)
{
  int sum = 0;

  for (int i = 0; i < n; i++)
    {
      sum += __builtin_abs (x[i] - y[i]);
    }

  return sum;
}

The resulting SVE code is:

0000000000000000 <sum_abs>:
   0:   7100005f    cmp     w2, #0x0
   4:   5400026d    b.le    50 <sum_abs+0x50>
   8:   d2800003    mov     x3, #0x0    // #0
   c:   93407c42    sxtw    x2, w2
  10:   2538c002    mov     z2.b, #0
  14:   25221fe0    whilelo p0.b, xzr, x2
  18:   2538c023    mov     z3.b, #1
  1c:   2518e3e1    ptrue   p1.b
  20:   a4034000    ld1b    {z0.b}, p0/z, [x0, x3]
  24:   a4034021    ld1b    {z1.b}, p0/z, [x1, x3]
  28:   0430e3e3    incb    x3
  2c:   0520c021    sel     z1.b, p0, z1.b, z0.b
  30:   25221c60    whilelo p0.b, x3, x2
  34:   040d0420    uabd    z0.b, p1/m, z0.b, z1.b
  38:   44830402    udot    z2.s, z0.b, z3.b
  3c:   54ffff21    b.ne    20 <sum_abs+0x20>  // b.any
  40:   2598e3e0    ptrue   p0.s
  44:   04812042    uaddv   d2, p0, z2.s
  48:   1e260040    fmov    w0, s2
  4c:   d65f03c0    ret
  50:   1e2703e2    fmov    s2, wzr
  54:   1e260040    fmov    w0, s2
  58:   d65f03c0    ret

Notice how udot is used inside a fully masked loop.

gcc/ChangeLog:

2019-05-07 Alejandro Martinez <alejandro.martinezvicente@arm.com>

    * config/aarch64/aarch64-sve.md (<su>abd<mode>_3): New define_expand.
    (aarch64_<su>abd<mode>_3): Likewise.
    (*aarch64_<su>abd<mode>_3): New define_insn.
    (<sur>sad<vsi2qi>): New define_expand.
    * config/aarch64/iterators.md: Added MAX_OPP attribute.
    * tree-vect-loop.c (use_mask_by_cond_expr_p): Add SAD_EXPR.
    (build_vect_cond_expr): Likewise.

gcc/testsuite/ChangeLog:

2019-05-07 Alejandro Martinez <alejandro.martinezvicente@arm.com>

    * gcc.target/aarch64/sve/sad_1.c: New test for sum of absolute
    differences.

From-SVN: r270975
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/ChangeLog 10
-rw-r--r-- gcc/config/aarch64/aarch64-sve.md 61
-rw-r--r-- gcc/config/aarch64/iterators.md 3
-rw-r--r-- gcc/testsuite/ChangeLog 5
-rw-r--r-- gcc/testsuite/gcc.target/aarch64/sve/sad_1.c 28
-rw-r--r-- gcc/tree-vect-loop.c 12
6 files changed, 119 insertions, 0 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index cf2ea44..d55adb2 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,13 @@
+2019-05-07 Alejandro Martinez <alejandro.martinezvicente@arm.com>
+
+ * config/aarch64/aarch64-sve.md (<su>abd<mode>_3): New define_expand.
+ (aarch64_<su>abd<mode>_3): Likewise.
+ (*aarch64_<su>abd<mode>_3): New define_insn.
+ (<sur>sad<vsi2qi>): New define_expand.
+ * config/aarch64/iterators.md: Added MAX_OPP attribute.
+ * tree-vect-loop.c (use_mask_by_cond_expr_p): Add SAD_EXPR.
+ (build_vect_cond_expr): Likewise.
+
2019-05-07 Uroš Bizjak <ubizjak@gmail.com>
* cfgexpand.c (asm_clobber_reg_is_valid): Reject
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 02d33b7..e94801d 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3148,3 +3148,64 @@
movprfx\t%0, %3\;<sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth>"
[(set_attr "movprfx" "*,yes")]
)
+
+;; Helper expander for aarch64_<su>abd<mode>_3 to save the callers
+;; the hassle of constructing the other arm of the MINUS.
+(define_expand "<su>abd<mode>_3"
+ [(use (match_operand:SVE_I 0 "register_operand"))
+ (USMAX:SVE_I (match_operand:SVE_I 1 "register_operand")
+ (match_operand:SVE_I 2 "register_operand"))]
+ "TARGET_SVE"
+ {
+ rtx pred = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+ rtx other_arm = gen_rtx_<MAX_OPP> (<MODE>mode, operands[1], operands[2]);
+ emit_insn (gen_aarch64_<su>abd<mode>_3 (operands[0], pred, operands[1],
+ operands[2], other_arm));
+ DONE;
+ }
+)
+
+;; Predicated integer absolute difference.
+(define_insn "aarch64_<su>abd<mode>_3"
+ [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w")
+ (unspec:SVE_I
+ [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+ (minus:SVE_I
+ (USMAX:SVE_I
+ (match_operand:SVE_I 2 "register_operand" "0, w")
+ (match_operand:SVE_I 3 "register_operand" "w, w"))
+ (match_operator 4 "aarch64_<max_opp>"
+ [(match_dup 2)
+ (match_dup 3)]))]
+ UNSPEC_MERGE_PTRUE))]
+ "TARGET_SVE"
+ "@
+ <su>abd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ movprfx\t%0, %2\;<su>abd\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
+ [(set_attr "movprfx" "*,yes")]
+)
+
+;; Emit a sequence to produce a sum-of-absolute-differences of the inputs in
+;; operands 1 and 2. The sequence also has to perform a widening reduction of
+;; the difference into a vector and accumulate that into operand 3 before
+;; copying that into the result operand 0.
+;; Perform that with a sequence of:
+;; MOV ones.b, #1
+;; [SU]ABD diff.b, p0/m, op1.b, op2.b
+;; MOVPRFX op0, op3 // If necessary
+;; UDOT op0.s, diff.b, ones.b
+
+(define_expand "<sur>sad<vsi2qi>"
+ [(use (match_operand:SVE_SDI 0 "register_operand"))
+ (unspec:<VSI2QI> [(use (match_operand:<VSI2QI> 1 "register_operand"))
+ (use (match_operand:<VSI2QI> 2 "register_operand"))] ABAL)
+ (use (match_operand:SVE_SDI 3 "register_operand"))]
+ "TARGET_SVE"
+ {
+ rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+ rtx diff = gen_reg_rtx (<VSI2QI>mode);
+ emit_insn (gen_<sur>abd<vsi2qi>_3 (diff, operands[1], operands[2]));
+ emit_insn (gen_udot_prod<vsi2qi> (operands[0], diff, ones, operands[3]));
+ DONE;
+ }
+)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index b3b2d6e..20aa0e9 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1060,6 +1060,9 @@
;; Map smax to smin and umax to umin.
(define_code_attr max_opp [(smax "smin") (umax "umin")])
+;; Same as above, but louder.
+(define_code_attr MAX_OPP [(smax "SMIN") (umax "UMIN")])
+
;; The number of subvectors in an SVE_STRUCT.
(define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2")
(VNx8SI "2") (VNx4DI "2")
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index ae46a72..ae3e09a 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2019-05-07 Alejandro Martinez <alejandro.martinezvicente@arm.com>
+
+ * gcc.target/aarch64/sve/sad_1.c: New test for sum of absolute
+ differences.
+
2019-05-07 Uroš Bizjak <ubizjak@gmail.com>
* gcc.target/i386/asm-7.c: New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c
new file mode 100644
index 0000000..e7bf64a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/sad_1.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_SAD(TYPE1, TYPE2) \
+TYPE1 __attribute__ ((noinline, noclone)) \
+sum_abs_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n) \
+{ \
+ TYPE1 sum = 0; \
+ for (int i = 0; i < n; i++) \
+ { \
+ sum += __builtin_abs (x[i] - y[i]); \
+ } \
+ return sum; \
+}
+
+DEF_SAD(int32_t, uint8_t)
+DEF_SAD(int32_t, int8_t)
+DEF_SAD(int64_t, uint16_t)
+DEF_SAD(int64_t, int16_t)
+
+/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.b, p[0-7]/m, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tuabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsabd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 493c1ab..057a874 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5973,6 +5973,7 @@ use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
switch (code)
{
case DOT_PROD_EXPR:
+ case SAD_EXPR:
return true;
default:
@@ -6002,6 +6003,17 @@ build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
break;
}
+ case SAD_EXPR:
+ {
+ tree vectype = TREE_TYPE (vop[1]);
+ tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
+ gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
+ mask, vop[1], vop[0]);
+ gsi_insert_before (gsi, select, GSI_SAME_STMT);
+ vop[1] = masked_op1;
+ break;
+ }
+
default:
gcc_unreachable ();
}