diff options
author | Feng Xue <fxue@os.amperecomputing.com> | 2024-05-29 17:22:36 +0800 |
---|---|---|
committer | Feng Xue <fxue@os.amperecomputing.com> | 2024-07-17 21:54:05 +0800 |
commit | 178cc419512f7e358f88dfe2336625aa99cd7438 (patch) | |
tree | 518eacbefd2e5457b33c4d3a0cc87c3943ff6306 /gcc/tree-vect-loop.cc | |
parent | 8b59fa9d8ca25bdf0792390a8bdeae151532a530 (diff) | |
download | gcc-178cc419512f7e358f88dfe2336625aa99cd7438.zip gcc-178cc419512f7e358f88dfe2336625aa99cd7438.tar.gz gcc-178cc419512f7e358f88dfe2336625aa99cd7438.tar.bz2 |
vect: Support multiple lane-reducing operations for loop reduction [PR114440]
For a lane-reducing operation (dot-prod/widen-sum/sad) in a loop reduction, the current
vectorizer could only handle the pattern if the reduction chain does not
contain any other operation, no matter whether that operation is normal or lane-reducing.
This patch removes some constraints in reduction analysis to allow multiple
arbitrary lane-reducing operations with mixed input vectypes in a loop
reduction chain. For example:
int sum = 1;
for (i)
{
sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
sum += w[i]; // widen-sum <vector(16) char>
sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
}
The vector size is 128-bit, and the vectorization factor is 16. Reduction statements
would be transformed as:
vector<4> int sum_v0 = { 0, 0, 0, 1 };
vector<4> int sum_v1 = { 0, 0, 0, 0 };
vector<4> int sum_v2 = { 0, 0, 0, 0 };
vector<4> int sum_v3 = { 0, 0, 0, 0 };
for (i / 16)
{
sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
sum_v1 = sum_v1; // copy
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
sum_v1 = sum_v1; // copy
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
sum_v2 = sum_v2; // copy
sum_v3 = sum_v3; // copy
}
sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1
2024-03-22 Feng Xue <fxue@os.amperecomputing.com>
gcc/
PR tree-optimization/114440
* tree-vectorizer.h (vectorizable_lane_reducing): New function
declaration.
* tree-vect-stmts.cc (vect_analyze_stmt): Call new function
vectorizable_lane_reducing to analyze lane-reducing operation.
* tree-vect-loop.cc (vect_model_reduction_cost): Remove cost computation
code related to emulated_mixed_dot_prod.
(vectorizable_lane_reducing): New function.
(vectorizable_reduction): Allow multiple lane-reducing operations in
loop reduction. Move some original lane-reducing related code to
vectorizable_lane_reducing.
(vect_transform_reduction): Adjust comments with updated example.
gcc/testsuite/
PR tree-optimization/114440
* gcc.dg/vect/vect-reduc-chain-1.c
* gcc.dg/vect/vect-reduc-chain-2.c
* gcc.dg/vect/vect-reduc-chain-3.c
* gcc.dg/vect/vect-reduc-chain-dot-slp-1.c
* gcc.dg/vect/vect-reduc-chain-dot-slp-2.c
* gcc.dg/vect/vect-reduc-chain-dot-slp-3.c
* gcc.dg/vect/vect-reduc-chain-dot-slp-4.c
* gcc.dg/vect/vect-reduc-dot-slp-1.c
Diffstat (limited to 'gcc/tree-vect-loop.cc')
-rw-r--r-- | gcc/tree-vect-loop.cc | 241 |
1 files changed, 171 insertions, 70 deletions
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 9c5c305..1c3dbf4 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -5328,8 +5328,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, if (!gimple_extract_op (orig_stmt_info->stmt, &op)) gcc_unreachable (); - bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info); - if (reduction_type == EXTRACT_LAST_REDUCTION) /* No extra instructions are needed in the prologue. The loop body operations are costed in vectorizable_condition. */ @@ -5364,12 +5362,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo, initial result of the data reduction, initial value of the index reduction. */ prologue_stmts = 4; - else if (emulated_mixed_dot_prod) - /* We need the initial reduction value and two invariants: - one that contains the minimum signed value and one that - contains half of its negative. */ - prologue_stmts = 3; else + /* We need the initial reduction value. */ prologue_stmts = 1; prologue_cost += record_stmt_cost (cost_vec, prologue_stmts, scalar_to_vec, stmt_info, 0, @@ -7482,6 +7476,142 @@ vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo, } } +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in + the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC, + and the analysis is for slp if SLP_NODE is not NULL. + + For a lane-reducing operation, the loop reduction path that it lies in, + may contain normal operation, or other lane-reducing operation of different + input type size, an example as: + + int sum = 0; + for (i) + { + ... + sum += d0[i] * d1[i]; // dot-prod <vector(16) char> + sum += w[i]; // widen-sum <vector(16) char> + sum += abs(s0[i] - s1[i]); // sad <vector(8) short> + sum += n[i]; // normal <vector(4) int> + ... 
+ } + + Vectorization factor is essentially determined by operation whose input + vectype has the most lanes ("vector(16) char" in the example), while we + need to choose input vectype with the least lanes ("vector(4) int" in the + example) to determine effective number of vector reduction PHIs. */ + +bool +vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, + slp_tree slp_node, stmt_vector_for_cost *cost_vec) +{ + gimple *stmt = stmt_info->stmt; + + if (!lane_reducing_stmt_p (stmt)) + return false; + + tree type = TREE_TYPE (gimple_assign_lhs (stmt)); + + if (!INTEGRAL_TYPE_P (type)) + return false; + + /* Do not try to vectorize bit-precision reductions. */ + if (!type_has_mode_precision_p (type)) + return false; + + stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)); + + /* TODO: Support lane-reducing operation that does not directly participate + in loop reduction. */ + if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0) + return false; + + /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not + recoginized. 
*/ + gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def); + gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION); + + for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++) + { + stmt_vec_info def_stmt_info; + slp_tree slp_op; + tree op; + tree vectype; + enum vect_def_type dt; + + if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op, + &slp_op, &dt, &vectype, &def_stmt_info)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "use not simple.\n"); + return false; + } + + if (!vectype) + { + vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op), + slp_op); + if (!vectype) + return false; + } + + if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "incompatible vector types for invariants\n"); + return false; + } + + if (i == STMT_VINFO_REDUC_IDX (stmt_info)) + continue; + + /* There should be at most one cycle def in the stmt. */ + if (VECTORIZABLE_CYCLE_DEF (dt)) + return false; + } + + tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info); + + gcc_assert (vectype_in); + + /* Compute number of effective vector statements for costing. */ + unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node, + vectype_in); + gcc_assert (ncopies_for_cost >= 1); + + if (vect_is_emulated_mixed_dot_prod (stmt_info)) + { + /* We need extra two invariants: one that contains the minimum signed + value and one that contains half of its negative. */ + int prologue_stmts = 2; + unsigned cost = record_stmt_cost (cost_vec, prologue_stmts, + scalar_to_vec, stmt_info, 0, + vect_prologue); + if (dump_enabled_p ()) + dump_printf (MSG_NOTE, "vectorizable_lane_reducing: " + "extra prologue_cost = %d .\n", cost); + + /* Three dot-products and a subtraction. 
*/ + ncopies_for_cost *= 4; + } + + record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info, + 0, vect_body); + + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) + { + enum tree_code code = gimple_assign_rhs_code (stmt); + vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info, + slp_node, code, type, + vectype_in); + } + + /* Transform via vect_transform_reduction. */ + STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; + return true; +} + /* Function vectorizable_reduction. Check if STMT_INFO performs a reduction operation that can be vectorized. @@ -7721,7 +7851,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo, } /* For lane-reducing operation vectorizable analysis needs the - reduction PHI information */ + reduction PHI information. */ STMT_VINFO_REDUC_DEF (def) = phi_info; /* Each lane-reducing operation has its own input vectype, while @@ -7815,18 +7945,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, if (!type_has_mode_precision_p (op.type)) return false; - /* For lane-reducing ops we're reducing the number of reduction PHIs - which means the only use of that may be in the lane-reducing operation. */ - if (lane_reducing - && reduc_chain_length != 1 - && !only_slp_reduc_chain) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "lane-reducing reduction with extra stmts.\n"); - return false; - } - /* Lane-reducing ops also never can be used in a SLP reduction group since we'll mix lanes belonging to different reductions. But it's OK to use them in a reduction chain or when the reduction group @@ -8366,14 +8484,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo, && loop_vinfo->suggested_unroll_factor == 1) single_defuse_cycle = true; - if (single_defuse_cycle || lane_reducing) + if (single_defuse_cycle && !lane_reducing) { gcc_assert (op.code != COND_EXPR); - /* 4. Supportable by target? */ - bool ok = true; - - /* 4.1. check support for the operation in the loop + /* 4. 
check support for the operation in the loop This isn't necessary for the lane reduction codes, since they can only be produced by pattern matching, and it's up to the @@ -8382,14 +8497,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo, mixed-sign dot-products can be implemented using signed dot-products. */ machine_mode vec_mode = TYPE_MODE (vectype_in); - if (!lane_reducing - && !directly_supported_p (op.code, vectype_in, optab_vector)) + if (!directly_supported_p (op.code, vectype_in, optab_vector)) { if (dump_enabled_p ()) dump_printf (MSG_NOTE, "op not supported by target.\n"); if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) || !vect_can_vectorize_without_simd_p (op.code)) - ok = false; + single_defuse_cycle = false; else if (dump_enabled_p ()) dump_printf (MSG_NOTE, "proceeding using word mode.\n"); @@ -8402,16 +8516,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo, dump_printf (MSG_NOTE, "using word mode not possible.\n"); return false; } - - /* lane-reducing operations have to go through vect_transform_reduction. - For the other cases try without the single cycle optimization. */ - if (!ok) - { - if (lane_reducing) - return false; - else - single_defuse_cycle = false; - } } if (dump_enabled_p () && single_defuse_cycle) dump_printf_loc (MSG_NOTE, vect_location, @@ -8419,22 +8523,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo, "multiple vectors to one in the loop body\n"); STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; - /* If the reduction stmt is one of the patterns that have lane - reduction embedded we cannot handle the case of ! single_defuse_cycle. */ - if ((ncopies > 1 && ! 
single_defuse_cycle) - && lane_reducing) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "multi def-use cycle not possible for lane-reducing " - "reduction operation\n"); - return false; - } + /* For lane-reducing operation, the below processing related to single + defuse-cycle will be done in its own vectorizable function. One more + thing to note is that the operation must not be involved in fold-left + reduction. */ + single_defuse_cycle &= !lane_reducing; if (slp_node - && !(!single_defuse_cycle - && !lane_reducing - && reduction_type != FOLD_LEFT_REDUCTION)) + && (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)) for (i = 0; i < (int) op.num_ops; i++) if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i])) { @@ -8447,28 +8543,20 @@ vectorizable_reduction (loop_vec_info loop_vinfo, vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn, reduction_type, ncopies, cost_vec); /* Cost the reduction op inside the loop if transformed via - vect_transform_reduction. Otherwise this is costed by the - separate vectorizable_* routines. */ - if (single_defuse_cycle || lane_reducing) - { - int factor = 1; - if (vect_is_emulated_mixed_dot_prod (stmt_info)) - /* Three dot-products and a subtraction. */ - factor = 4; - record_stmt_cost (cost_vec, ncopies * factor, vector_stmt, - stmt_info, 0, vect_body); - } + vect_transform_reduction for non-lane-reducing operation. Otherwise + this is costed by the separate vectorizable_* routines. */ + if (single_defuse_cycle) + record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body); if (dump_enabled_p () && reduction_type == FOLD_LEFT_REDUCTION) dump_printf_loc (MSG_NOTE, vect_location, "using an in-order (fold-left) reduction.\n"); STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; - /* All but single defuse-cycle optimized, lane-reducing and fold-left - reductions go through their own vectorizable_* routines. 
*/ - if (!single_defuse_cycle - && !lane_reducing - && reduction_type != FOLD_LEFT_REDUCTION) + + /* All but single defuse-cycle optimized and fold-left reductions go + through their own vectorizable_* routines. */ + if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION) { stmt_vec_info tem = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); @@ -8746,13 +8834,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo, And vector reduction PHIs are always generated to the full extent, no matter lane-reducing op exists or not. If some copies or PHIs are actually superfluous, they would be cleaned up by passes after - vectorization. An example for single-lane slp is given as below. + vectorization. An example for single-lane slp, lane-reducing ops + with mixed input vectypes in a reduction chain, is given as below. Similarly, this handling is applicable for multiple-lane slp as well. int sum = 1; for (i) { sum += d0[i] * d1[i]; // dot-prod <vector(16) char> + sum += w[i]; // widen-sum <vector(16) char> + sum += abs(s0[i] - s1[i]); // sad <vector(8) short> } The vector size is 128-bit,vectorization factor is 16. Reduction @@ -8769,9 +8860,19 @@ vect_transform_reduction (loop_vec_info loop_vinfo, sum_v1 = sum_v1; // copy sum_v2 = sum_v2; // copy sum_v3 = sum_v3; // copy + + sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0); + sum_v1 = sum_v1; // copy + sum_v2 = sum_v2; // copy + sum_v3 = sum_v3; // copy + + sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0); + sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1); + sum_v2 = sum_v2; // copy + sum_v3 = sum_v3; // copy } - sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3; // = sum_v0 + sum_v1 */ unsigned effec_ncopies = vec_oprnds[0].length (); unsigned total_ncopies = vec_oprnds[reduc_index].length (); |