tree-optimization/112736 - avoid overread with non-grouped SLP load

The following avoids over/under-read of storage when vectorizing a non-grouped load with SLP. Instead of forcing peeling for gaps, use a smaller load for the last vector which might access excess elements. This builds upon the existing optimization avoiding peeling for gaps, generalizing it to all gap widths leaving a power-of-two remaining number of elements (but it doesn't replace or improve that particular case at this point).

I wonder if the poly relational compares I set up are good enough to guarantee /* remain should now be > 0 and < nunits. */.

There is existing test coverage that runs into /* DR will be unused. */ always when the gap is wider than nunits. Compared to the existing gap == nunits/2 case this only adjusts the load that will cause the overrun at the end, not every load. Apart from the poly relational compares it should reliably cover these cases but I'll leave it for stage1 to remove.

PR tree-optimization/112736
* tree-vect-stmts.cc (vectorizable_load): Extend optimization to avoid peeling for gaps to handle single-element non-groups we now allow with SLP.
* gcc.dg/torture/pr112736.c: New testcase.
author: Richard Biener <rguenther@suse.de> 2023-12-11 14:39:48 +0100
committer: Richard Biener <rguenther@suse.de> 2023-12-12 15:25:25 +0100
commit: 6d0b0806eb638447c3184c59d996c2f178553d45 (patch)
tree: 0817faa5aa47b3ce75d4e8ca91a39778c93282b8
parent: eee13a3730bd1d7aa7b40687b1ee49c17d95159f (diff)
download: gcc-6d0b0806eb638447c3184c59d996c2f178553d45.zip
gcc-6d0b0806eb638447c3184c59d996c2f178553d45.tar.gz
gcc-6d0b0806eb638447c3184c59d996c2f178553d45.tar.bz2
2 files changed, 100 insertions, 19 deletions
diff --git a/gcc/testsuite/gcc.dg/torture/pr112736.c b/gcc/testsuite/gcc.dg/torture/pr112736.c
new file mode 100644
index 0000000..6abb56e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr112736.c
@@ -0,0 +1,27 @@
+/* { dg-do run { target *-*-linux* *-*-gnu* *-*-uclinux* } } */
+
+#include <sys/mman.h>
+#include <unistd.h>
+
+int a, c[3][5];
+
+void __attribute__((noipa))
+fn1 (int * __restrict b)
+{
+  int e;
+  for (a = 2; a >= 0; a--)
+    for (e = 0; e < 4; e++)
+      c[a][e] = b[a];
+}
+
+int main()
+{
+  long pgsz = sysconf (_SC_PAGESIZE);
+  void *p = mmap (NULL, pgsz * 2, PROT_READ|PROT_WRITE,
+                  MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
+  if (p == MAP_FAILED)
+    return 0;
+  mprotect (p, pgsz, PROT_NONE);
+  fn1 (p + pgsz);
+  return 0;
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 390c847..fc6923c 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -11465,26 +11465,70 @@ vectorizable_load (vec_info *vinfo,
 			if (new_vtype != NULL_TREE)
 			  ltype = half_vtype;
 		      }
+		    /* Try to use a single smaller load when we are about
+		       to load excess elements compared to the unrolled
+		       scalar loop.
+		       ???  This should cover the above case as well.  */
+		    else if (known_gt ((vec_num * j + i + 1) * nunits,
+				       (group_size * vf - gap)))
+		      {
+			if (known_ge ((vec_num * j + i + 1) * nunits
+				      - (group_size * vf - gap), nunits))
+			  /* DR will be unused.  */
+			  ltype = NULL_TREE;
+			else if (alignment_support_scheme == dr_aligned)
+			  /* Aligned access to excess elements is OK if
+			     at least one element is accessed in the
+			     scalar loop.  */
+			  ;
+			else
+			  {
+			    auto remain
+			      = ((group_size * vf - gap)
+				 - (vec_num * j + i) * nunits);
+			    /* remain should now be > 0 and < nunits.  */
+			    unsigned num;
+			    if (constant_multiple_p (nunits, remain, &num))
+			      {
+				tree ptype;
+				new_vtype
+				  = vector_vector_composition_type (vectype,
+								    num,
+								    &ptype);
+				if (new_vtype)
+				  ltype = ptype;
+			      }
+			    /* Else use multiple loads or a masked load?  */
+			  }
+		      }
 		    tree offset
 		      = (dataref_offset ? dataref_offset
 					: build_int_cst (ref_type, 0));
-		    if (ltype != vectype
-			&& memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+		    if (!ltype)
+		      ;
+		    else if (ltype != vectype
+			     && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
 		      {
-			unsigned HOST_WIDE_INT gap_offset
-			  = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
-			tree gapcst = build_int_cst (ref_type, gap_offset);
+			poly_uint64 gap_offset
+			  = (tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype))
+			     - tree_to_poly_uint64 (TYPE_SIZE_UNIT (ltype)));
+			tree gapcst = build_int_cstu (ref_type, gap_offset);
 			offset = size_binop (PLUS_EXPR, offset, gapcst);
 		      }
-		    data_ref
-		      = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
-		    if (alignment_support_scheme == dr_aligned)
-		      ;
-		    else
-		      TREE_TYPE (data_ref)
-			= build_aligned_type (TREE_TYPE (data_ref),
-					      align * BITS_PER_UNIT);
-		    if (ltype != vectype)
+		    if (ltype)
+		      {
+			data_ref
+			  = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
+			if (alignment_support_scheme == dr_aligned)
+			  ;
+			else
+			  TREE_TYPE (data_ref)
+			    = build_aligned_type (TREE_TYPE (data_ref),
+						  align * BITS_PER_UNIT);
+		      }
+		    if (!ltype)
+		      data_ref = build_constructor (vectype, NULL);
+		    else if (ltype != vectype)
 		      {
 			vect_copy_ref_info (data_ref,
 					    DR_REF (first_dr_info->dr));
@@ -11494,18 +11538,28 @@ vectorizable_load (vec_info *vinfo,
 						     gsi);
 			data_ref = NULL;
 			vec<constructor_elt, va_gc> *v;
-			vec_alloc (v, 2);
+			/* We've computed 'num' above to statically two
+			   or via constant_multiple_p.  */
+			unsigned num
+			  = (exact_div (tree_to_poly_uint64
+					  (TYPE_SIZE_UNIT (vectype)),
+					tree_to_poly_uint64
+					  (TYPE_SIZE_UNIT (ltype)))
+			     .to_constant ());
+			vec_alloc (v, num);
 			if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
 			  {
-			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
-						    build_zero_cst (ltype));
+			    while (--num)
+			      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+						      build_zero_cst (ltype));
 			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
 			  }
 			else
 			  {
 			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
-			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
-						    build_zero_cst (ltype));
+			    while (--num)
+			      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+						      build_zero_cst (ltype));
 			  }
 			gcc_assert (new_vtype != NULL_TREE);
 			if (new_vtype == vectype)
author	Richard Biener <rguenther@suse.de>	2023-12-11 14:39:48 +0100
committer	Richard Biener <rguenther@suse.de>	2023-12-12 15:25:25 +0100
commit	6d0b0806eb638447c3184c59d996c2f178553d45 (patch)
tree	0817faa5aa47b3ce75d4e8ca91a39778c93282b8
parent	eee13a3730bd1d7aa7b40687b1ee49c17d95159f (diff)
download	gcc-6d0b0806eb638447c3184c59d996c2f178553d45.zip gcc-6d0b0806eb638447c3184c59d996c2f178553d45.tar.gz gcc-6d0b0806eb638447c3184c59d996c2f178553d45.tar.bz2