author		Richard Biener <rguenther@suse.de>	2023-01-18 10:59:52 +0100
committer	Richard Biener <rguenther@suse.de>	2023-04-28 14:40:48 +0200
commit		6d4b59a9356ac4df7f4ae370495dc3366d0daa55 (patch)
tree		c4704cd661b920b5ce1b21e02287fd661b5eea36 /gcc
parent		24905a4bd1375ccd99c02510b9f9529015a48315 (diff)
Add emulated scatter capability to the vectorizer
This adds a scatter vectorization capability to the vectorizer without
target support by decomposing the offset and data vectors and then
performing scalar stores in the order of vector lanes.  This is aimed
at cases where vectorizing the rest of the loop offsets the cost of
vectorizing the scatter.

The offset load is still vectorized and costed as such, but like with
emulated gather those will be turned back to scalar loads by forwprop.

	* tree-vect-data-refs.cc (vect_analyze_data_refs): Always
	consider scatters.
	* tree-vect-stmts.cc (vect_model_store_cost): Pass in the
	gather-scatter info and cost emulated scatters accordingly.
	(get_load_store_type): Support emulated scatters.
	(vectorizable_store): Likewise.  Emulate them by extracting
	scalar offsets and data, doing scalar stores.

	* gcc.dg/vect/pr25413a.c: Un-XFAIL everywhere.
	* gcc.dg/vect/vect-71.c: Likewise.
	* gcc.dg/vect/tsvc/vect-tsvc-s4113.c: Likewise.
	* gcc.dg/vect/tsvc/vect-tsvc-s491.c: Likewise.
	* gcc.dg/vect/tsvc/vect-tsvc-vas.c: Likewise.
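For illustration, a loop of the shape this change targets looks like the
following (illustrative code only; the function and parameter names are
not taken from the testcases above).  The store goes through a vector of
indices, so without hardware scatter support it previously blocked
vectorization of the whole loop:

/* Hypothetical scatter-store loop: dst[idx[i]] stores at a vector of
   offsets.  The vectorizer now decomposes the offset and data vectors
   and emits scalar stores in lane order, so the loads and the rest of
   the loop can still be vectorized.  */
void
scatter_store (int *restrict dst, const int *restrict idx,
	       const int *restrict src, int n)
{
  for (int i = 0; i < n; ++i)
    dst[idx[i]] = src[i];
}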
Diffstat (limited to 'gcc')
-rw-r--r--  gcc/testsuite/gcc.dg/vect/pr25413a.c             |   3
-rw-r--r--  gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c |   2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c  |   2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c   |   2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/vect-71.c              |   2
-rw-r--r--  gcc/tree-vect-data-refs.cc                       |   4
-rw-r--r--  gcc/tree-vect-stmts.cc                           | 117
7 files changed, 97 insertions, 35 deletions
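As a rough C-level sketch of what the new vectorizable_store path emits
when no scatter instruction is available (the lane count, types and names
below are illustrative assumptions, not code from the patch): each offset
lane is extracted, scaled and added to the base pointer, and the matching
data lane is stored with a plain scalar store.

/* Illustrative sketch only: per data vector, the emulated scatter does
   a lane extract of the offset (BIT_FIELD_REF in the patch), scales it,
   forms the address and stores the corresponding data lane.  A 4-lane
   int vector is assumed here.  */
static void
emulated_scatter (char *base, const int offset_vec[4],
		  const int data_vec[4], long scale)
{
  for (unsigned k = 0; k < 4; ++k)
    {
      long off = (long) offset_vec[k] * scale;	/* offset lane extract + scale  */
      *(int *) (base + off) = data_vec[k];	/* scalar store of the data lane  */
    }
}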
diff --git a/gcc/testsuite/gcc.dg/vect/pr25413a.c b/gcc/testsuite/gcc.dg/vect/pr25413a.c
index e444b2c..ffb517c 100644
--- a/gcc/testsuite/gcc.dg/vect/pr25413a.c
+++ b/gcc/testsuite/gcc.dg/vect/pr25413a.c
@@ -123,7 +123,6 @@ int main (void)
return 0;
}
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! vect_scatter_store } } } } */
-/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_scatter_store } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "vector alignment may not be reachable" 1 "vect" { target { ! vector_alignment_reachable } } } } */
/* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 1 "vect" { target { ! vector_alignment_reachable } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c
index b64682a..ddb7e9d 100644
--- a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c
+++ b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s4113.c
@@ -39,4 +39,4 @@ int main (int argc, char **argv)
return 0;
}
-/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail { ! aarch64_sve } } } } */
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c
index 8465e13..29e90ff 100644
--- a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c
+++ b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s491.c
@@ -39,4 +39,4 @@ int main (int argc, char **argv)
return 0;
}
-/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail { ! aarch64_sve } } } } */
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c
index 5ff3885..b72ee21 100644
--- a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c
+++ b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-vas.c
@@ -39,4 +39,4 @@ int main (int argc, char **argv)
return 0;
}
-/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail { ! aarch64_sve } } } } */
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-71.c b/gcc/testsuite/gcc.dg/vect/vect-71.c
index f155211..581473f 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-71.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-71.c
@@ -36,4 +36,4 @@ int main (void)
return main1 ();
}
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { ! vect_scatter_store } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index c03ffb3..6721ab6 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -4464,9 +4464,7 @@ vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
&& !TREE_THIS_VOLATILE (DR_REF (dr));
bool maybe_scatter
= DR_IS_WRITE (dr)
- && !TREE_THIS_VOLATILE (DR_REF (dr))
- && (targetm.vectorize.builtin_scatter != NULL
- || supports_vec_scatter_store_p ());
+ && !TREE_THIS_VOLATILE (DR_REF (dr));
/* If target supports vector gather loads or scatter stores,
see if they can't be used. */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index dc2dc2c..c71e287 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -942,6 +942,7 @@ cfun_returns (tree decl)
static void
vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
vect_memory_access_type memory_access_type,
+ gather_scatter_info *gs_info,
dr_alignment_support alignment_support_scheme,
int misalignment,
vec_load_store_type vls_type, slp_tree slp_node,
@@ -997,8 +998,16 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
if (memory_access_type == VMAT_ELEMENTWISE
|| memory_access_type == VMAT_GATHER_SCATTER)
{
- /* N scalar stores plus extracting the elements. */
unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
+ if (memory_access_type == VMAT_GATHER_SCATTER
+ && gs_info->ifn == IFN_LAST && !gs_info->decl)
+ /* For emulated scatter N offset vector element extracts
+ (we assume the scalar scaling and ptr + offset add is consumed by
+ the load). */
+ inside_cost += record_stmt_cost (cost_vec, ncopies * assumed_nunits,
+ vec_to_scalar, stmt_info, 0,
+ vect_body);
+ /* N scalar stores plus extracting the elements. */
inside_cost += record_stmt_cost (cost_vec,
ncopies * assumed_nunits,
scalar_store, stmt_info, 0, vect_body);
@@ -1008,7 +1017,9 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
misalignment, &inside_cost, cost_vec);
if (memory_access_type == VMAT_ELEMENTWISE
- || memory_access_type == VMAT_STRIDED_SLP)
+ || memory_access_type == VMAT_STRIDED_SLP
+ || (memory_access_type == VMAT_GATHER_SCATTER
+ && gs_info->ifn == IFN_LAST && !gs_info->decl))
{
/* N scalar stores plus extracting the elements. */
unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
@@ -2503,19 +2514,11 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
}
else if (gs_info->ifn == IFN_LAST && !gs_info->decl)
{
- if (vls_type != VLS_LOAD)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "unsupported emulated scatter.\n");
- return false;
- }
- else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()
- || !TYPE_VECTOR_SUBPARTS
- (gs_info->offset_vectype).is_constant ()
- || !constant_multiple_p (TYPE_VECTOR_SUBPARTS
- (gs_info->offset_vectype),
- TYPE_VECTOR_SUBPARTS (vectype)))
+ if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()
+ || !TYPE_VECTOR_SUBPARTS (gs_info->offset_vectype).is_constant ()
+ || !constant_multiple_p (TYPE_VECTOR_SUBPARTS
+ (gs_info->offset_vectype),
+ TYPE_VECTOR_SUBPARTS (vectype)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7824,6 +7827,15 @@ vectorizable_store (vec_info *vinfo,
"unsupported access type for masked store.\n");
return false;
}
+ else if (memory_access_type == VMAT_GATHER_SCATTER
+ && gs_info.ifn == IFN_LAST
+ && !gs_info.decl)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "unsupported masked emulated scatter.\n");
+ return false;
+ }
}
else
{
@@ -7887,7 +7899,8 @@ vectorizable_store (vec_info *vinfo,
STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
vect_model_store_cost (vinfo, stmt_info, ncopies,
- memory_access_type, alignment_support_scheme,
+ memory_access_type, &gs_info,
+ alignment_support_scheme,
misalignment, vls_type, slp_node, cost_vec);
return true;
}
@@ -8527,12 +8540,9 @@ vectorizable_store (vec_info *vinfo,
dataref_offset = build_int_cst (ref_type, 0);
}
else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
- {
- vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
- slp_node, &gs_info, &dataref_ptr,
- &vec_offsets);
- vec_offset = vec_offsets[0];
- }
+ vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
+ slp_node, &gs_info, &dataref_ptr,
+ &vec_offsets);
else
dataref_ptr
= vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
@@ -8558,9 +8568,7 @@ vectorizable_store (vec_info *vinfo,
if (dataref_offset)
dataref_offset
= int_const_binop (PLUS_EXPR, dataref_offset, bump);
- else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
- vec_offset = vec_offsets[j];
- else
+ else if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
stmt_info, bump);
}
@@ -8648,8 +8656,11 @@ vectorizable_store (vec_info *vinfo,
final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
final_mask, vec_mask, gsi);
- if (memory_access_type == VMAT_GATHER_SCATTER)
+ if (memory_access_type == VMAT_GATHER_SCATTER
+ && gs_info.ifn != IFN_LAST)
{
+ if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+ vec_offset = vec_offsets[vec_num * j + i];
tree scale = size_int (gs_info.scale);
gcall *call;
if (final_mask)
@@ -8665,6 +8676,60 @@ vectorizable_store (vec_info *vinfo,
new_stmt = call;
break;
}
+ else if (memory_access_type == VMAT_GATHER_SCATTER)
+ {
+ /* Emulated scatter. */
+ gcc_assert (!final_mask);
+ unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
+ unsigned HOST_WIDE_INT const_offset_nunits
+ = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
+ .to_constant ();
+ vec<constructor_elt, va_gc> *ctor_elts;
+ vec_alloc (ctor_elts, const_nunits);
+ gimple_seq stmts = NULL;
+ tree elt_type = TREE_TYPE (vectype);
+ unsigned HOST_WIDE_INT elt_size
+ = tree_to_uhwi (TYPE_SIZE (elt_type));
+ /* We support offset vectors with more elements
+ than the data vector for now. */
+ unsigned HOST_WIDE_INT factor
+ = const_offset_nunits / const_nunits;
+ vec_offset = vec_offsets[j / factor];
+ unsigned elt_offset = (j % factor) * const_nunits;
+ tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
+ tree scale = size_int (gs_info.scale);
+ align = get_object_alignment (DR_REF (first_dr_info->dr));
+ tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
+ for (unsigned k = 0; k < const_nunits; ++k)
+ {
+ /* Compute the offsetted pointer. */
+ tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
+ bitsize_int (k + elt_offset));
+ tree idx = gimple_build (&stmts, BIT_FIELD_REF,
+ idx_type, vec_offset,
+ TYPE_SIZE (idx_type), boff);
+ idx = gimple_convert (&stmts, sizetype, idx);
+ idx = gimple_build (&stmts, MULT_EXPR,
+ sizetype, idx, scale);
+ tree ptr = gimple_build (&stmts, PLUS_EXPR,
+ TREE_TYPE (dataref_ptr),
+ dataref_ptr, idx);
+ ptr = gimple_convert (&stmts, ptr_type_node, ptr);
+ /* Extract the element to be stored. */
+ tree elt = gimple_build (&stmts, BIT_FIELD_REF,
+ TREE_TYPE (vectype), vec_oprnd,
+ TYPE_SIZE (elt_type),
+ bitsize_int (k * elt_size));
+ gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+ stmts = NULL;
+ tree ref = build2 (MEM_REF, ltype, ptr,
+ build_int_cst (ref_type, 0));
+ new_stmt = gimple_build_assign (ref, elt);
+ vect_finish_stmt_generation (vinfo, stmt_info,
+ new_stmt, gsi);
+ }
+ break;
+ }
if (i > 0)
/* Bump the vector pointer. */