author     Richard Biener <rguenther@suse.de>   2024-05-27 16:04:35 +0200
committer  Richard Biener <rguenther@suse.de>   2024-05-29 13:05:24 +0200
commit     f46eaad445e680034df51bd0dec4e6c7b1f372a4 (patch)
tree       dd1c04eef158c554d4cf5cb9af6856b8573ca008
parent     1065a7db6f2a69770a85b4d53b9123b090dd1771 (diff)
tree-optimization/115252 - enhance peeling for gaps avoidance
Code generation for contiguous load vectorization can already deal with
generalized avoidance of loading from a gap.  The following extends the
detection of the peeling-for-gaps requirement with that, gets rid of the
old special casing of a half load and makes sure that when we do access
the gap we have peeling for gaps enabled.

        PR tree-optimization/115252
        * tree-vect-stmts.cc (get_group_load_store_type): Enhance
        detecting the number of cases where we can avoid accessing a gap
        during code generation.
        (vectorizable_load): Remove old half-vector peeling for gap
        avoidance which is now redundant.  Add gap-aligned case where
        it's OK to access the gap.  Add assert that we have peeling for
        gaps enabled when we access a gap.

        * gcc.dg/vect/slp-gap-1.c: New testcase.
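For intuition, the new condition in get_group_load_store_type roughly amounts to the scalar arithmetic sketched below.  This is an illustrative standalone C program, not GCC code: it ignores poly_int arithmetic, the alignment and masking guards, and the vector_vector_composition_type availability check, and the numbers in main are assumed values chosen only for illustration (loosely modeled on the pix1 access in the new testcase).

    /* Illustrative sketch only, not GCC code: a scalar analogue of the
       new overrun check.  The real code works on poly_int values via
       can_div_trunc_p/constant_multiple_p and additionally requires a
       suitable vector_vector_composition_type.  */
    #include <stdio.h>

    static int
    gap_overrun_avoidable (unsigned group_size, unsigned vf,
                           unsigned gap, unsigned nunits)
    {
      /* Elements of the last vector that are actually needed.  */
      unsigned remain = (group_size * vf - gap) % nunits;
      /* remain == 0: the vectors we load cover exactly the needed
         elements, so no gap element is touched.  Otherwise a partial
         load of 'remain' elements can be composed into a full vector
         iff remain divides nunits.  */
      return remain == 0 || nunits % remain == 0;
    }

    int
    main (void)
    {
      /* Assumed numbers: 4 of 16 elements used per group (gap 12),
         8-element vectors, unroll factor 1.  The last piece has 4
         elements, 4 divides 8, so no peeling for gaps is required.  */
      printf ("%d\n", gap_overrun_avoidable (16, 1, 12, 8));
      return 0;
    }

When remain is non-zero and divides nunits, vectorizable_load can load just the remaining elements and zero-pad them to a full vector, which is the pattern the new testcase scans for.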
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-gap-1.c | 18
-rw-r--r--  gcc/tree-vect-stmts.cc                | 58
2 files changed, 46 insertions(+), 30 deletions(-)
diff --git a/gcc/testsuite/gcc.dg/vect/slp-gap-1.c b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c
new file mode 100644
index 0000000..36463ca
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-gap-1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+
+typedef unsigned char uint8_t;
+typedef short int16_t;
+void pixel_sub_wxh(int16_t * __restrict diff, uint8_t *pix1, uint8_t *pix2) {
+ for (int y = 0; y < 4; y++) {
+ for (int x = 0; x < 4; x++)
+ diff[x + y * 4] = pix1[x] - pix2[x];
+ pix1 += 16;
+ pix2 += 32;
+ }
+}
+
+/* We can vectorize this without peeling for gaps and thus without epilogue,
+ but the only thing we can reliably scan is the zero-padding trick for the
+ partial loads. */
+/* { dg-final { scan-tree-dump-times "\{_\[0-9\]\+, 0" 6 "vect" { target vect64 } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 4219ad8..935d80f 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2072,16 +2072,22 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
dr_alignment_support alss;
int misalign = dr_misalignment (first_dr_info, vectype);
tree half_vtype;
+ poly_uint64 remain;
+ unsigned HOST_WIDE_INT tem, num;
if (overrun_p
&& !masked_p
&& (((alss = vect_supportable_dr_alignment (vinfo, first_dr_info,
vectype, misalign)))
== dr_aligned
|| alss == dr_unaligned_supported)
- && known_eq (nunits, (group_size - gap) * 2)
- && known_eq (nunits, group_size)
- && (vector_vector_composition_type (vectype, 2, &half_vtype)
- != NULL_TREE))
+ && can_div_trunc_p (group_size
+ * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap,
+ nunits, &tem, &remain)
+ && (known_eq (remain, 0u)
+ || (constant_multiple_p (nunits, remain, &num)
+ && (vector_vector_composition_type (vectype, num,
+ &half_vtype)
+ != NULL_TREE))))
overrun_p = false;

if (overrun_p && !can_overrun_p)
@@ -11513,33 +11519,14 @@ vectorizable_load (vec_info *vinfo,
unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
unsigned int vect_align
= vect_known_alignment_in_bytes (first_dr_info, vectype);
- unsigned int scalar_dr_size
- = vect_get_scalar_dr_size (first_dr_info);
- /* If there's no peeling for gaps but we have a gap
- with slp loads then load the lower half of the
- vector only. See get_group_load_store_type for
- when we apply this optimization. */
- if (slp
- && loop_vinfo
- && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0
- && known_eq (nunits, (group_size - gap) * 2)
- && known_eq (nunits, group_size)
- && gap >= (vect_align / scalar_dr_size))
- {
- tree half_vtype;
- new_vtype
- = vector_vector_composition_type (vectype, 2,
- &half_vtype);
- if (new_vtype != NULL_TREE)
- ltype = half_vtype;
- }
/* Try to use a single smaller load when we are about
to load excess elements compared to the unrolled
- scalar loop.
- ??? This should cover the above case as well. */
- else if (known_gt ((vec_num * j + i + 1) * nunits,
+ scalar loop. */
+ if (known_gt ((vec_num * j + i + 1) * nunits,
(group_size * vf - gap)))
{
+ poly_uint64 remain = ((group_size * vf - gap)
+ - (vec_num * j + i) * nunits);
if (known_ge ((vec_num * j + i + 1) * nunits
- (group_size * vf - gap), nunits))
/* DR will be unused. */
@@ -11551,11 +11538,15 @@ vectorizable_load (vec_info *vinfo,
at least one element is accessed in the
scalar loop. */
;
+ else if (known_gt (vect_align,
+ ((nunits - remain)
+ * vect_get_scalar_dr_size
+ (first_dr_info))))
+ /* Aligned access to the gap area when there's
+ at least one element in it is OK. */
+ ;
else
{
- auto remain
- = ((group_size * vf - gap)
- - (vec_num * j + i) * nunits);
/* remain should now be > 0 and < nunits. */
unsigned num;
if (constant_multiple_p (nunits, remain, &num))
@@ -11569,6 +11560,13 @@ vectorizable_load (vec_info *vinfo,
ltype = ptype;
}
/* Else use multiple loads or a masked load? */
+ /* For loop vectorization we now should have
+ an alternate type or LOOP_VINFO_PEELING_FOR_GAPS
+ set. */
+ if (loop_vinfo)
+ gcc_assert (new_vtype
+ || LOOP_VINFO_PEELING_FOR_GAPS
+ (loop_vinfo));
}
}
tree offset