aboutsummaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author: Richard Biener <rguenther@suse.de> 2018-01-12 11:43:13 +0000
committer: Richard Biener <rguenth@gcc.gnu.org> 2018-01-12 11:43:13 +0000
commit: c803b2a92822c57abf5464deaf5be5c31d8a4692 (patch)
tree: 4a8bf4ce632240e45cdadcf69318f73f17a0f232 /gcc
parent: 46336a0eab790e4f94dd7e7ecf9339a884c44746 (diff)
download: gcc-c803b2a92822c57abf5464deaf5be5c31d8a4692.zip
gcc-c803b2a92822c57abf5464deaf5be5c31d8a4692.tar.gz
gcc-c803b2a92822c57abf5464deaf5be5c31d8a4692.tar.bz2
re PR target/80846 (auto-vectorized AVX2 horizontal sum should narrow to 128b right away, to be more efficient for Ryzen and Intel)
2018-01-12 Richard Biener <rguenther@suse.de>

PR tree-optimization/80846
* target.def (split_reduction): New target hook.
* targhooks.c (default_split_reduction): New function.
* targhooks.h (default_split_reduction): Declare.
* tree-vect-loop.c (vect_create_epilog_for_reduction): If the target requests it, first reduce vectors by combining low and high parts.
* tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
(get_vectype_for_scalar_type_and_size): Export.
* tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
* doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
* doc/tm.texi: Regenerate.

i386/
* config/i386/i386.c (ix86_split_reduction): Implement TARGET_VECTORIZE_SPLIT_REDUCTION.
* gcc.target/i386/pr80846-1.c: New testcase.
* gcc.target/i386/pr80846-2.c: Likewise.

From-SVN: r256576
Diffstat (limited to 'gcc')
-rw-r--r-- gcc/ChangeLog | 17
-rw-r--r-- gcc/config/i386/i386.c | 36
-rw-r--r-- gcc/doc/tm.texi | 7
-rw-r--r-- gcc/doc/tm.texi.in | 2
-rw-r--r-- gcc/target.def | 11
-rw-r--r-- gcc/targhooks.c | 8
-rw-r--r-- gcc/targhooks.h | 1
-rw-r--r-- gcc/testsuite/ChangeLog | 6
-rw-r--r-- gcc/testsuite/gcc.target/i386/pr80846-1.c | 12
-rw-r--r-- gcc/testsuite/gcc.target/i386/pr80846-2.c | 12
-rw-r--r-- gcc/tree-vect-loop.c | 143
-rw-r--r-- gcc/tree-vect-stmts.c | 2
-rw-r--r-- gcc/tree-vectorizer.h | 1
13 files changed, 231 insertions, 27 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index e13743d..1f45587 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,20 @@
+2018-01-12 Richard Biener <rguenther@suse.de>
+
+ PR tree-optimization/80846
+ * target.def (split_reduction): New target hook.
+ * targhooks.c (default_split_reduction): New function.
+ * targhooks.h (default_split_reduction): Declare.
+ * tree-vect-loop.c (vect_create_epilog_for_reduction): If the
+ target requests it, first reduce vectors by combining low and
+ high parts.
+ * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust.
+ (get_vectype_for_scalar_type_and_size): Export.
+ * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare.
+ * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document.
+ * doc/tm.texi: Regenerate.
+ * config/i386/i386.c (ix86_split_reduction): Implement
+ TARGET_VECTORIZE_SPLIT_REDUCTION.
+
2018-01-12 Eric Botcazou <ebotcazou@adacore.com>
PR target/83368
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index d625670..5ee3be3 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -49008,6 +49008,39 @@ ix86_preferred_simd_mode (scalar_mode mode)
}
}
+/* Implement TARGET_VECTORIZE_SPLIT_REDUCTION. All CPUs prefer to avoid
+ cross-lane operations, so split reductions of wider modes to SSE reg size. */
+
+static machine_mode
+ix86_split_reduction (machine_mode mode)
+{
+ /* Map each 512-bit and 256-bit vector mode to the 128-bit vector mode with
+ the same element mode; any other MODE is returned unchanged (no split). */
+ switch (mode)
+ {
+ case E_V8DImode:
+ case E_V4DImode:
+ return V2DImode;
+ case E_V16SImode:
+ case E_V8SImode:
+ return V4SImode;
+ case E_V32HImode:
+ case E_V16HImode:
+ return V8HImode;
+ case E_V64QImode:
+ case E_V32QImode:
+ return V16QImode;
+ case E_V16SFmode:
+ case E_V8SFmode:
+ return V4SFmode;
+ case E_V8DFmode:
+ case E_V4DFmode:
+ return V2DFmode;
+ default:
+ return mode;
+ }
+}
+
/* If AVX is enabled then try vectorizing with both 256bit and 128bit
vectors. If AVX512F is enabled then try vectorizing with 512bit,
256bit and 128bit vectors. */
@@ -50640,6 +50673,9 @@ ix86_run_selftests (void)
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
ix86_preferred_simd_mode
+#undef TARGET_VECTORIZE_SPLIT_REDUCTION
+#define TARGET_VECTORIZE_SPLIT_REDUCTION \
+ ix86_split_reduction
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
ix86_autovectorize_vector_sizes
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 0836cf1..11b560b 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5873,6 +5873,13 @@ equal to @code{word_mode}, because the vectorizer can do some
transformations even in absence of specialized @acronym{SIMD} hardware.
@end deftypefn
+@deftypefn {Target Hook} machine_mode TARGET_VECTORIZE_SPLIT_REDUCTION (machine_mode)
+This hook should return the preferred mode to split the final reduction
+step on @var{mode} to. The reduction is then carried out reducing upper
+against lower halves of vectors recursively until the specified mode is
+reached. The default is @var{mode} which means no splitting.
+@end deftypefn
+
@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (vector_sizes *@var{sizes})
If the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is not
the only one that is worth considering, this hook should add all suitable
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 06523ef..0cd694a 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4093,6 +4093,8 @@ address; but often a machine-dependent strategy can generate better code.
@hook TARGET_VECTORIZE_PREFERRED_SIMD_MODE
+@hook TARGET_VECTORIZE_SPLIT_REDUCTION
+
@hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
@hook TARGET_VECTORIZE_GET_MASK_MODE
diff --git a/gcc/target.def b/gcc/target.def
index 02250c3..0a4f5fe 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1890,6 +1890,17 @@ transformations even in absence of specialized @acronym{SIMD} hardware.",
(scalar_mode mode),
default_preferred_simd_mode)
+/* Returns the mode the final step of a SIMD reduction should be split to. */
+DEFHOOK
+(split_reduction,
+ "This hook should return the preferred mode to split the final reduction\n\
+step on @var{mode} to. The reduction is then carried out reducing upper\n\
+against lower halves of vectors recursively until the specified mode is\n\
+reached. The default is @var{mode} which means no splitting.",
+ machine_mode,
+ (machine_mode),
+ default_split_reduction)
+
/* Returns a mask of vector sizes to iterate over when auto-vectorizing
after processing the preferred one derived from preferred_simd_mode. */
DEFHOOK
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index e064dd8..5b60944 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -1283,6 +1283,14 @@ default_preferred_simd_mode (scalar_mode)
return word_mode;
}
+/* Default TARGET_VECTORIZE_SPLIT_REDUCTION: return MODE, i.e. do not split. */
+
+machine_mode
+default_split_reduction (machine_mode mode)
+{
+ return mode;
+}
+
/* By default only the size derived from the preferred vector mode
is tried. */
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index b4a0cd0..f55fde7 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -108,6 +108,7 @@ default_builtin_support_vector_misalignment (machine_mode mode,
const_tree,
int, bool);
extern machine_mode default_preferred_simd_mode (scalar_mode mode);
+extern machine_mode default_split_reduction (machine_mode);
extern void default_autovectorize_vector_sizes (vector_sizes *);
extern opt_machine_mode default_get_mask_mode (poly_uint64, poly_uint64);
extern void *default_init_cost (struct loop *);
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 6b6b8e0..a91660c 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2018-01-12 Richard Biener <rguenther@suse.de>
+
+ PR tree-optimization/80846
+ * gcc.target/i386/pr80846-1.c: New testcase.
+ * gcc.target/i386/pr80846-2.c: Likewise.
+
2018-01-12 Eric Botcazou <ebotcazou@adacore.com>
* gcc.c-torture/execute/20180112-1.c: New test.
diff --git a/gcc/testsuite/gcc.target/i386/pr80846-1.c b/gcc/testsuite/gcc.target/i386/pr80846-1.c
new file mode 100644
index 0000000..295bb7d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr80846-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512f" } */
+
+int sumint(const int arr[]) {
+ arr = __builtin_assume_aligned(arr, 64);
+ int sum=0;
+ for (int i=0 ; i<1024 ; i++)
+ sum+=arr[i];
+ return sum;
+}
+/* Expect one extract per halving step, presumably 512 -> 256 -> 128 bits. */
+/* { dg-final { scan-assembler-times "vextracti" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr80846-2.c b/gcc/testsuite/gcc.target/i386/pr80846-2.c
new file mode 100644
index 0000000..df3535f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr80846-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2" } */
+
+int sumint(const int arr[]) {
+ arr = __builtin_assume_aligned(arr, 64);
+ int sum=0;
+ for (int i=0 ; i<1024 ; i++)
+ sum+=arr[i];
+ return sum;
+}
+/* Expect one extract for the single halving step, presumably 256 -> 128. */
+/* { dg-final { scan-assembler-times "vextracti" 1 } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index c2501a8..c6fa519 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5062,12 +5062,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
}
else
{
- bool reduce_with_shift = have_whole_vector_shift (mode);
- int element_bitsize = tree_to_uhwi (bitsize);
- /* Enforced by vectorizable_reduction, which disallows SLP reductions
- for variable-length vectors and also requires direct target support
- for loop reductions. */
- int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+ bool reduce_with_shift;
tree vec_temp;
/* COND reductions all do the final reduction with MAX_EXPR
@@ -5081,30 +5076,125 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
code = MAX_EXPR;
}
- /* Regardless of whether we have a whole vector shift, if we're
- emulating the operation via tree-vect-generic, we don't want
- to use it. Only the first round of the reduction is likely
- to still be profitable via emulation. */
- /* ??? It might be better to emit a reduction tree code here, so that
- tree-vect-generic can expand the first round via bit tricks. */
- if (!VECTOR_MODE_P (mode))
- reduce_with_shift = false;
+ /* See if the target wants to do the final (shift) reduction
+ in a vector mode of smaller size and first reduce upper/lower
+ halves against each other. */
+ enum machine_mode mode1 = mode;
+ tree vectype1 = vectype;
+ unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+ unsigned sz1 = sz;
+ if (!slp_reduc
+ && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
+ sz1 = GET_MODE_SIZE (mode1).to_constant ();
+
+ vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
+ reduce_with_shift = have_whole_vector_shift (mode1);
+ if (!VECTOR_MODE_P (mode1))
+ reduce_with_shift = false;
else
- {
- optab optab = optab_for_tree_code (code, vectype, optab_default);
- if (optab_handler (optab, mode) == CODE_FOR_nothing)
- reduce_with_shift = false;
- }
+ {
+ optab optab = optab_for_tree_code (code, vectype1, optab_default);
+ if (optab_handler (optab, mode1) == CODE_FOR_nothing)
+ reduce_with_shift = false;
+ }
+
+ /* First reduce the vector to the desired vector size we should
+ do shift reduction on by combining upper and lower halves. */
+ new_temp = new_phi_result;
+ while (sz > sz1)
+ {
+ gcc_assert (!slp_reduc);
+ sz /= 2;
+ vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
+
+ /* The target has to make sure we support lowpart/highpart
+ extraction, either via direct vector extraction or through
+ integer mode punning. */
+ tree dst1, dst2;
+ if (convert_optab_handler (vec_extract_optab,
+ TYPE_MODE (TREE_TYPE (new_temp)),
+ TYPE_MODE (vectype1))
+ != CODE_FOR_nothing)
+ {
+ /* Extract the low and high sub-vectors directly; the target has a
+ vec_extract pattern from the full to the half vector mode. */
+ dst1 = make_ssa_name (vectype1);
+ epilog_stmt
+ = gimple_build_assign (dst1, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, vectype1,
+ new_temp, TYPE_SIZE (vectype1),
+ bitsize_int (0)));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ dst2 = make_ssa_name (vectype1);
+ epilog_stmt
+ = gimple_build_assign (dst2, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, vectype1,
+ new_temp, TYPE_SIZE (vectype1),
+ bitsize_int (sz * BITS_PER_UNIT)));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ }
+ else
+ {
+ /* Extract via punning to appropriately sized integer mode
+ vector. */
+ tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
+ 1);
+ tree etype = build_vector_type (eltype, 2);
+ gcc_assert (convert_optab_handler (vec_extract_optab,
+ TYPE_MODE (etype),
+ TYPE_MODE (eltype))
+ != CODE_FOR_nothing);
+ tree tem = make_ssa_name (etype);
+ epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ etype, new_temp));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ new_temp = tem;
+ tem = make_ssa_name (eltype);
+ epilog_stmt
+ = gimple_build_assign (tem, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, eltype,
+ new_temp, TYPE_SIZE (eltype),
+ bitsize_int (0)));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ dst1 = make_ssa_name (vectype1);
+ epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ vectype1, tem));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ tem = make_ssa_name (eltype);
+ epilog_stmt
+ = gimple_build_assign (tem, BIT_FIELD_REF,
+ build3 (BIT_FIELD_REF, eltype,
+ new_temp, TYPE_SIZE (eltype),
+ bitsize_int (sz * BITS_PER_UNIT)));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ dst2 = make_ssa_name (vectype1);
+ epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ vectype1, tem));
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ }
+
+ new_temp = make_ssa_name (vectype1);
+ epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
+ gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+ }
if (reduce_with_shift && !slp_reduc)
- {
- int nelements = vec_size_in_bits / element_bitsize;
+ {
+ int element_bitsize = tree_to_uhwi (bitsize);
+ /* Enforced by vectorizable_reduction, which disallows SLP reductions
+ for variable-length vectors and also requires direct target support
+ for loop reductions. */
+ int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
+ int nelements = vec_size_in_bits / element_bitsize;
vec_perm_builder sel;
vec_perm_indices indices;
int elt_offset;
- tree zero_vec = build_zero_cst (vectype);
+ tree zero_vec = build_zero_cst (vectype1);
/* Case 2: Create:
for (offset = nelements/2; offset >= 1; offset/=2)
{
@@ -5118,15 +5208,15 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
dump_printf_loc (MSG_NOTE, vect_location,
"Reduce using vector shifts\n");
- vec_dest = vect_create_destination_var (scalar_dest, vectype);
- new_temp = new_phi_result;
+ mode1 = TYPE_MODE (vectype1);
+ vec_dest = vect_create_destination_var (scalar_dest, vectype1);
for (elt_offset = nelements / 2;
elt_offset >= 1;
elt_offset /= 2)
{
calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
indices.new_vector (sel, 2, nelements);
- tree mask = vect_gen_perm_mask_any (vectype, indices);
+ tree mask = vect_gen_perm_mask_any (vectype1, indices);
epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
new_temp, zero_vec, mask);
new_name = make_ssa_name (vec_dest, epilog_stmt);
@@ -5171,7 +5261,8 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
dump_printf_loc (MSG_NOTE, vect_location,
"Reduce using scalar code.\n");
- vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+ int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
+ int element_bitsize = tree_to_uhwi (bitsize);
FOR_EACH_VEC_ELT (new_phis, i, new_phi)
{
int bit_offset;
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 819a981..50b35fc 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -9068,7 +9068,7 @@ free_stmt_vec_info (gimple *stmt)
Returns the vector type corresponding to SCALAR_TYPE and SIZE as supported
by the target. */
-static tree
+tree
get_vectype_for_scalar_type_and_size (tree scalar_type, poly_uint64 size)
{
tree orig_scalar_type = scalar_type;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 150b268..129cde0 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1209,6 +1209,7 @@ extern bool vect_can_advance_ivs_p (loop_vec_info);
/* In tree-vect-stmts.c. */
extern poly_uint64 current_vector_size;
extern tree get_vectype_for_scalar_type (tree);
+extern tree get_vectype_for_scalar_type_and_size (tree, poly_uint64);
extern tree get_mask_type_for_scalar_type (tree);
extern tree get_same_sized_vectype (tree, tree);
extern bool vect_is_simple_use (tree, vec_info *, gimple **,