author		Alejandro Martinez <alejandro.martinezvicente@arm.com>	2019-05-02 09:58:00 +0000
committer	Alejandro Martinez <alejandro@gcc.gnu.org>		2019-05-02 09:58:00 +0000
commit		9feeafd7f95ea9f7211908c137c60074b3a52da2 (patch)
tree		a7afe6148066fd48b611f16c45c07b16a7ca6eee
parent		cc2a672a60ff7476b3e4751ba41cb77c7fc85b09 (diff)
[AArch64][SVE] Dot product support
This patch does two things.  For the general vectoriser, it adds support
for performing fully masked reductions over expressions that don't support
masking.  This is achieved by using VEC_COND_EXPR where possible.  At the
moment this is implemented for DOT_PROD_EXPR only, but the framework is
there to extend it to other expressions.

Related to that, this patch adds support for vectorizing dot product using
SVE.  It also uses the new functionality to ensure that the resulting loop
is fully masked.  Given this input code:

uint32_t
dotprod (uint8_t *restrict x, uint8_t *restrict y, int n)
{
  uint32_t sum = 0;

  for (int i = 0; i < n; i++)
    {
      sum += x[i] * y[i];
    }

  return sum;
}

the resulting SVE code is:

0000000000000000 <dotprod>:
   0:	7100005f 	cmp	w2, #0x0
   4:	5400024d 	b.le	4c <dotprod+0x4c>
   8:	d2800003 	mov	x3, #0x0			// #0
   c:	93407c42 	sxtw	x2, w2
  10:	2538c001 	mov	z1.b, #0
  14:	25221fe0 	whilelo	p0.b, xzr, x2
  18:	2538c003 	mov	z3.b, #0
  1c:	d503201f 	nop
  20:	a4034002 	ld1b	{z2.b}, p0/z, [x0, x3]
  24:	a4034020 	ld1b	{z0.b}, p0/z, [x1, x3]
  28:	0430e3e3 	incb	x3
  2c:	0523c000 	sel	z0.b, p0, z0.b, z3.b
  30:	25221c60 	whilelo	p0.b, x3, x2
  34:	44820401 	udot	z1.s, z0.b, z2.b
  38:	54ffff41 	b.ne	20 <dotprod+0x20>	// b.any
  3c:	2598e3e0 	ptrue	p0.s
  40:	04812021 	uaddv	d1, p0, z1.s
  44:	1e260020 	fmov	w0, s1
  48:	d65f03c0 	ret
  4c:	1e2703e1 	fmov	s1, wzr
  50:	1e260020 	fmov	w0, s1
  54:	d65f03c0 	ret

Notice how udot is used inside a fully masked loop.

I tested this patch on an aarch64 machine by bootstrapping the compiler
and running the testsuite.

gcc/ChangeLog:

2019-05-02  Alejandro Martinez  <alejandro.martinezvicente@arm.com>

	* config/aarch64/aarch64-sve.md (<sur>dot_prod<vsi2qi>): Taken from
	SVE ACLE branch.
	* config/aarch64/iterators.md: Copied Vetype_fourth, VSI2QI and
	vsi2qi from SVE ACLE branch.
	* tree-vect-loop.c (use_mask_by_cond_expr_p): New function to check
	whether a VEC_COND_EXPR can be inserted to emulate a conditional
	internal function.
	(build_vect_cond_expr): Emit the VEC_COND_EXPR.
	(vectorizable_reduction): Use the functions above to vectorize code
	without a conditional internal function in a fully masked loop.

gcc/testsuite/ChangeLog:

2019-05-02  Alejandro Martinez  <alejandro.martinezvicente@arm.com>

	* gcc.target/aarch64/sve/dot_1.c: New test for dot product.

From-SVN: r270790
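To make the VEC_COND_EXPR trick concrete, here is a minimal per-lane sketch in scalar C (not part of the patch; the function name is hypothetical).  Where the loop mask is false, the selected operand becomes zero, so that lane's product is zero and the accumulator is left unchanged, which is why the unconditional DOT_PROD_EXPR remains safe inside a fully masked loop.

#include <stdint.h>

/* Per-lane model of masking DOT_PROD_EXPR via VEC_COND_EXPR (illustrative).  */
static uint32_t
masked_dot_step (uint32_t sum, uint8_t x, uint8_t y, _Bool lane_active)
{
  uint8_t y_masked = lane_active ? y : 0;  /* the VEC_COND_EXPR select */
  return sum + (uint32_t) x * y_masked;    /* the unconditional dot step */
}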
-rw-r--r--	gcc/ChangeLog					13
-rw-r--r--	gcc/config/aarch64/aarch64-sve.md		16
-rw-r--r--	gcc/config/aarch64/iterators.md			 9
-rw-r--r--	gcc/testsuite/ChangeLog				 4
-rw-r--r--	gcc/testsuite/gcc.target/aarch64/sve/dot_1.c	39
-rw-r--r--	gcc/tree-vect-loop.c				61
6 files changed, 139 insertions, 3 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index e99c6a1..720627b 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,16 @@
+2019-05-02 Alejandro Martinez <alejandro.martinezvicente@arm.com>
+
+ * config/aarch64/aarch64-sve.md (<sur>dot_prod<vsi2qi>): Taken from SVE
+ ACLE branch.
+ * config/aarch64/iterators.md: Copied Vetype_fourth, VSI2QI and vsi2qi from
+ SVE ACLE branch.
+ * tree-vect-loop.c (use_mask_by_cond_expr_p): New function to check
+ whether a VEC_COND_EXPR can be inserted to emulate a conditional
+ internal function.
+ (build_vect_cond_expr): Emit the VEC_COND_EXPR.
+ (vectorizable_reduction): Use the functions above to vectorize code
+ without a conditional internal function in a fully masked loop.
+
2019-05-02  Martin Liska  <mliska@suse.cz>

	* cgraphclones.c: Call valid_attribute_p with 1 for
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 3f39c4c..02d33b7 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3132,3 +3132,19 @@
DONE;
}
)
+
+;; Unpredicated DOT product.
+(define_insn "<sur>dot_prod<vsi2qi>"
+ [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w")
+ (plus:SVE_SDI
+ (unspec:SVE_SDI
+ [(match_operand:<VSI2QI> 1 "register_operand" "w, w")
+ (match_operand:<VSI2QI> 2 "register_operand" "w, w")]
+ DOTPROD)
+ (match_operand:SVE_SDI 3 "register_operand" "0, w")))]
+ "TARGET_SVE"
+ "@
+ <sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth>
+ movprfx\t%0, %3\;<sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth>"
+ [(set_attr "movprfx" "*,yes")]
+)
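For comparison, the instruction this pattern selects is also exposed through the SVE ACLE; the example loop from the commit message could be written by hand roughly as follows.  This is a sketch assuming the arm_sve.h intrinsics, not code from this patch.

#include <arm_sve.h>
#include <stdint.h>

/* Hand-written analogue of the vectorised loop shown in the commit message.  */
uint32_t
dotprod_acle (uint8_t *restrict x, uint8_t *restrict y, int64_t n)
{
  svuint32_t sum = svdup_n_u32 (0);
  svuint8_t zero = svdup_n_u8 (0);
  for (int64_t i = 0; i < n; i += svcntb ())
    {
      svbool_t pg = svwhilelt_b8_s64 (i, n);  /* whilelo */
      svuint8_t a = svld1_u8 (pg, x + i);     /* masked loads */
      svuint8_t b = svld1_u8 (pg, y + i);
      b = svsel_u8 (pg, b, zero);             /* sel: zero the inactive lanes */
      sum = svdot_u32 (sum, a, b);            /* unpredicated udot */
    }
  return svaddv_u32 (svptrue_b32 (), sum);    /* uaddv */
}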
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 6caeeac..b3b2d6e 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -663,6 +663,9 @@
(QI "b") (HI "h")
(SI "s") (DI "d")])
+;; Like Vetype, but map to types that are a quarter of the element size.
+(define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")])
+
;; Equivalent of "size" for a vector element.
(define_mode_attr Vesize [(VNx16QI "b")
(VNx8HI "h") (VNx8HF "h")
@@ -1029,8 +1032,10 @@
(V2SF "p") (V4SF "v")
(V4HF "v") (V8HF "v")])
-(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
-(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")
+ (VNx4SI "vnx16qi") (VNx2DI "vnx8hi")])
+(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")
+ (VNx4SI "VNx16QI") (VNx2DI "VNx8HI")])
;; Register suffix for DOTPROD input types from the return type.
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 8519934..37edbea 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2019-05-02 Alejandro Martinez <alejandro.martinezvicente@arm.com>
+
+ * gcc.target/aarch64/sve/dot_1.c: New test for dot product.
+
2019-05-02  Martin Liska  <mliska@suse.cz>

	* gcc.target/i386/funcspec-4.c: Update scanned pattern.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c
new file mode 100644
index 0000000..8ff6671
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_DOT(TYPE1, TYPE2) \
+TYPE1 __attribute__ ((noinline, noclone)) \
+dot_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n) \
+{ \
+ TYPE1 sum = 0; \
+ for (int i = 0; i < n; i++) \
+ { \
+ sum += x[i] * y[i]; \
+ } \
+ return sum; \
+}
+
+DEF_DOT(uint32_t, uint8_t)
+DEF_DOT(int32_t, int8_t)
+DEF_DOT(int64_t, int16_t)
+
+/* The uint16_t->uint64_t dot product requires a cast to satisfy the C
+ language rules. */
+uint64_t __attribute__ ((noinline, noclone))
+dot_uint64_t_uint16_t (uint16_t *restrict x, uint16_t *restrict y, int n)
+{
+ uint64_t sum = 0;
+ for (int i = 0; i < n; i++)
+ {
+ sum += (unsigned int)x[i] * y[i];
+ }
+ return sum;
+}
+
+/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\t} 8 } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 0edcdc7..493c1ab 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5958,6 +5958,55 @@ is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
<= TYPE_PRECISION (lhs_type));
}
+/* Check if masking can be supported by inserting a conditional expression.
+ CODE is the code for the operation. COND_FN is the conditional internal
+ function, if it exists. VECTYPE_IN is the type of the vector input. */
+static bool
+use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
+ tree vectype_in)
+{
+ if (cond_fn != IFN_LAST
+ && direct_internal_fn_supported_p (cond_fn, vectype_in,
+ OPTIMIZE_FOR_SPEED))
+ return false;
+
+ switch (code)
+ {
+ case DOT_PROD_EXPR:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+/* Insert a conditional expression to enable masked vectorization. CODE is the
+ code for the operation. VOP is the array of operands. MASK is the loop
+ mask. GSI is a statement iterator used to place the new conditional
+ expression. */
+static void
+build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
+ gimple_stmt_iterator *gsi)
+{
+ switch (code)
+ {
+ case DOT_PROD_EXPR:
+ {
+ tree vectype = TREE_TYPE (vop[1]);
+ tree zero = build_zero_cst (vectype);
+ tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
+ gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
+ mask, vop[1], zero);
+ gsi_insert_before (gsi, select, GSI_SAME_STMT);
+ vop[1] = masked_op1;
+ break;
+ }
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
/* Function vectorizable_reduction.
Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -6931,6 +6980,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
internal_fn cond_fn = get_conditional_internal_fn (code);
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+ bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
if (!vec_stmt) /* transformation not required. */
{
@@ -6938,6 +6988,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
{
if (reduction_type != FOLD_LEFT_REDUCTION
+ && !mask_by_cond_expr
&& (cond_fn == IFN_LAST
|| !direct_internal_fn_supported_p (cond_fn, vectype_in,
OPTIMIZE_FOR_SPEED)))
@@ -7101,7 +7152,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
{
tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
- if (masked_loop_p)
+ if (masked_loop_p && !mask_by_cond_expr)
{
/* Make sure that the reduction accumulator is vop[0]. */
if (reduc_index == 1)
@@ -7125,6 +7176,14 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
if (op_type == ternary_op)
vop[2] = vec_oprnds2[i];
+ if (masked_loop_p && mask_by_cond_expr)
+ {
+ tree mask = vect_get_loop_mask (gsi, masks,
+ vec_num * ncopies,
+ vectype_in, i * ncopies + j);
+ build_vect_cond_expr (code, vop, mask, gsi);
+ }
+
gassign *new_stmt = gimple_build_assign (vec_dest, code,
vop[0], vop[1], vop[2]);
new_temp = make_ssa_name (vec_dest, new_stmt);
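Taken together, these hunks fetch the loop mask with vect_get_loop_mask and let build_vect_cond_expr zero the inactive lanes before the unconditional statement is emitted.  A scalar C model of the resulting fully masked loop is sketched below; it is illustrative only, since SVE's lane count is determined at run time and the fixed LANES constant here is a hypothetical stand-in.

#include <stdint.h>

#define LANES 16  /* hypothetical fixed lane count for the model */

uint32_t
masked_dot_model (const uint8_t *x, const uint8_t *y, int n)
{
  uint32_t sum = 0;  /* stands in for the vector accumulator */
  for (int base = 0; base < n; base += LANES)
    for (int lane = 0; lane < LANES; lane++)
      {
        int active = base + lane < n;              /* the loop mask */
        uint8_t xl = active ? x[base + lane] : 0;  /* masked load */
        uint8_t yl = active ? y[base + lane] : 0;  /* VEC_COND_EXPR select */
        sum += (uint32_t) xl * yl;                 /* DOT_PROD_EXPR step */
      }
  return sum;
}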