-rw-r--r--  gcc/ChangeLog                                      41
-rw-r--r--  gcc/config/aarch64/aarch64-sve.md                  13
-rw-r--r--  gcc/config/aarch64/aarch64.md                       1
-rw-r--r--  gcc/doc/md.texi                                     8
-rw-r--r--  gcc/internal-fn.def                                 3
-rw-r--r--  gcc/optabs.def                                      1
-rw-r--r--  gcc/testsuite/ChangeLog                            17
-rw-r--r--  gcc/testsuite/gcc.dg/vect/pr37027.c                 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/pr67790.c                 2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-1.c             2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-2.c             2
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-3.c             5
-rw-r--r--  gcc/testsuite/gcc.dg/vect/slp-reduc-5.c             2
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/slp_5.c       58
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/slp_5_run.c   35
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/slp_6.c       47
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/slp_6_run.c   37
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/slp_7.c       66
-rw-r--r--  gcc/testsuite/gcc.target/aarch64/sve/slp_7_run.c   39
-rw-r--r--  gcc/tree-vect-loop.c                              322
-rw-r--r--  gcc/tree-vect-slp.c                                10
-rw-r--r--  gcc/tree-vectorizer.h                               5
22 files changed, 637 insertions, 81 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 8bb12d3..77eae2d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -2,6 +2,47 @@
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
+ * doc/md.texi (vec_shl_insert_@var{m}): New optab.
+ * internal-fn.def (VEC_SHL_INSERT): New internal function.
+ * optabs.def (vec_shl_insert_optab): New optab.
+ * tree-vectorizer.h (can_duplicate_and_interleave_p): Declare.
+ (duplicate_and_interleave): Likewise.
+ * tree-vect-loop.c: Include internal-fn.h.
+ (neutral_op_for_slp_reduction): New function, split out from
+ get_initial_defs_for_reduction.
+ (get_initial_def_for_reduction): Handle option 2 for variable-length
+ vectors by loading the neutral value into a vector and then shifting
+ the initial value into element 0.
+ (get_initial_defs_for_reduction): Replace the code argument with
+ the neutral value calculated by neutral_op_for_slp_reduction.
+ Use gimple_build_vector for constant-length vectors.
+ Use IFN_VEC_SHL_INSERT for variable-length vectors if all
+ but the first group_size elements have a neutral value.
+ Use duplicate_and_interleave otherwise.
+ (vect_create_epilog_for_reduction): Take a neutral_op parameter.
+ Update call to get_initial_defs_for_reduction. Handle SLP
+ reductions for variable-length vectors by creating one vector
+ result for each scalar result, with the elements associated
+ with other scalar results stubbed out with the neutral value.
+ (vectorizable_reduction): Call neutral_op_for_slp_reduction.
+ Require IFN_VEC_SHL_INSERT for double reductions on
+ variable-length vectors, or SLP reductions that have
+ a neutral value. Require can_duplicate_and_interleave_p
+ support for variable-length unchained SLP reductions if there
+ is no neutral value, such as for MIN/MAX reductions. Also require
+ the number of vector elements to be a multiple of the number of
+ SLP statements when doing variable-length unchained SLP reductions.
+ Update call to vect_create_epilog_for_reduction.
+ * tree-vect-slp.c (can_duplicate_and_interleave_p): Make public
+ and remove initial values.
+ (duplicate_and_interleave): Make public.
+ * config/aarch64/aarch64.md (UNSPEC_INSR): New unspec.
+ * config/aarch64/aarch64-sve.md (vec_shl_insert_<mode>): New insn.
+
+2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
+ Alan Hayward <alan.hayward@arm.com>
+ David Sherwood <david.sherwood@arm.com>
+
* tree-vect-slp.c: Include gimple-fold.h and internal-fn.h
(can_duplicate_and_interleave_p): New function.
(vect_get_and_check_slp_defs): Take the vector of statements
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index e83d4f6..fc35902 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2073,3 +2073,16 @@
operands[5] = gen_reg_rtx (VNx4SImode);
}
)
+
+;; Shift an SVE vector left and insert a scalar into element 0.
+(define_insn "vec_shl_insert_<mode>"
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w")
+ (unspec:SVE_ALL
+ [(match_operand:SVE_ALL 1 "register_operand" "0, 0")
+ (match_operand:<VEL> 2 "register_operand" "rZ, w")]
+ UNSPEC_INSR))]
+ "TARGET_SVE"
+ "@
+ insr\t%0.<Vetype>, %<vwcore>2
+ insr\t%0.<Vetype>, %<Vetype>2"
+)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 33eff58..f1915020 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -163,6 +163,7 @@
UNSPEC_WHILE_LO
UNSPEC_LDN
UNSPEC_STN
+ UNSPEC_INSR
])
(define_c_enum "unspecv" [
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 213ee99..4f635b0 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5224,6 +5224,14 @@ operand 1. Add operand 1 to operand 2 and place the widened result in
operand 0. (This is used express accumulation of elements into an accumulator
of a wider mode.)
+@cindex @code{vec_shl_insert_@var{m}} instruction pattern
+@item @samp{vec_shl_insert_@var{m}}
+Shift the elements in vector input operand 1 left one element (i.e.
+away from element 0) and fill the vacated element 0 with the scalar
+in operand 2. Store the result in vector output operand 0. Operands
+0 and 1 have mode @var{m} and operand 2 has the mode appropriate for
+one element of @var{m}.
+
@cindex @code{vec_shr_@var{m}} instruction pattern
@item @samp{vec_shr_@var{m}}
Whole vector right shift in bits, i.e. towards element 0.
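
To make the new documentation concrete: the following is a small standalone C
model of what vec_shl_insert_m computes, using a fixed four-lane int vector.
The function name and lane count are illustrative only, not part of the patch.

    /* Model of vec_shl_insert_m: every element of SRC moves one lane
       away from element 0 and SCALAR fills the vacated lane.  */
    void
    vec_shl_insert_model (int dest[4], const int src[4], int scalar)
    {
      for (int i = 3; i > 0; --i)
        dest[i] = src[i - 1];   /* shift one lane away from element 0 */
      dest[0] = scalar;         /* insert the scalar into element 0 */
    }
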
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 4dc07c9..925a230 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -116,6 +116,9 @@ DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0,
vec_mask_store_lanes, mask_store_lanes)
+DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW,
+ vec_shl_insert, binary)
+
DEF_INTERNAL_OPTAB_FN (RSQRT, ECF_CONST, rsqrt, unary)
DEF_INTERNAL_OPTAB_FN (REDUC_PLUS, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index c22708b..ec5f5f5 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -368,3 +368,4 @@ OPTAB_D (set_thread_pointer_optab, "set_thread_pointer$I$a")
OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE)
OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
+OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 3f6b5d7..d3ec83c 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -2,6 +2,23 @@
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
+ * gcc.dg/vect/pr37027.c: Remove XFAIL for variable-length vectors.
+ * gcc.dg/vect/pr67790.c: Likewise.
+ * gcc.dg/vect/slp-reduc-1.c: Likewise.
+ * gcc.dg/vect/slp-reduc-2.c: Likewise.
+ * gcc.dg/vect/slp-reduc-3.c: Likewise.
+ * gcc.dg/vect/slp-reduc-5.c: Likewise.
+ * gcc.target/aarch64/sve/slp_5.c: New test.
+ * gcc.target/aarch64/sve/slp_5_run.c: Likewise.
+ * gcc.target/aarch64/sve/slp_6.c: Likewise.
+ * gcc.target/aarch64/sve/slp_6_run.c: Likewise.
+ * gcc.target/aarch64/sve/slp_7.c: Likewise.
+ * gcc.target/aarch64/sve/slp_7_run.c: Likewise.
+
+2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
+ Alan Hayward <alan.hayward@arm.com>
+ David Sherwood <david.sherwood@arm.com>
+
* gcc.dg/vect/no-scevccp-slp-30.c: Don't XFAIL for vect_variable_length
&& vect_load_lanes.
* gcc.dg/vect/slp-1.c: Likewise.
diff --git a/gcc/testsuite/gcc.dg/vect/pr37027.c b/gcc/testsuite/gcc.dg/vect/pr37027.c
index 0eef5cb..ef6760e 100644
--- a/gcc/testsuite/gcc.dg/vect/pr37027.c
+++ b/gcc/testsuite/gcc.dg/vect/pr37027.c
@@ -32,5 +32,5 @@ foo (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_no_int_add || vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr67790.c b/gcc/testsuite/gcc.dg/vect/pr67790.c
index f923b54..5e2d506 100644
--- a/gcc/testsuite/gcc.dg/vect/pr67790.c
+++ b/gcc/testsuite/gcc.dg/vect/pr67790.c
@@ -37,4 +37,4 @@ int main()
return 0;
}
-/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c
index 29783ea..b353dd7 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-1.c
@@ -43,5 +43,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_no_int_add || vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c
index ab39ccc..07c96c0 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-2.c
@@ -38,5 +38,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_no_int_add || vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c
index 34c8da7..9c8124c 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-3.c
@@ -58,7 +58,4 @@ int main (void)
/* The initialization loop in main also gets vectorized. */
/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target { vect_short_mult && { vect_widen_sum_hi_to_si && vect_unpack } } } } } */
-/* We can't yet create the necessary SLP constant vector for variable-length
- SVE and so fall back to Advanced SIMD. This means that we repeat each
- analysis note. */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_widen_sum_hi_to_si_pattern || { { ! vect_unpack } || { aarch64_sve && vect_variable_length } } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_widen_sum_hi_to_si_pattern || { ! vect_unpack } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c
index 1ed8e5f..fc689e4 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c
@@ -43,5 +43,5 @@ int main (void)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_min_max } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_no_int_min_max || vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_min_max } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_5.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_5.c
new file mode 100644
index 0000000..4e26419
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_5.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -ffast-math" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE) \
+void __attribute__ ((noinline, noclone)) \
+vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \
+{ \
+ TYPE x0 = b[0]; \
+ TYPE x1 = b[1]; \
+ for (int i = 0; i < n; ++i) \
+ { \
+ x0 += a[i * 2]; \
+ x1 += a[i * 2 + 1]; \
+ } \
+ b[0] = x0; \
+ b[1] = x1; \
+}
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (uint8_t) \
+ T (int16_t) \
+ T (uint16_t) \
+ T (int32_t) \
+ T (uint32_t) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (_Float16) \
+ T (float) \
+ T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* ??? We don't think it's worth using SLP for the 64-bit loops and fall
+ back to the less efficient non-SLP implementation instead. */
+/* ??? At present we don't treat the int8_t and int16_t loops as
+ reductions. */
+/* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not {\tld2b\t} } } */
+/* { dg-final { scan-assembler-not {\tld2h\t} } } */
+/* { dg-final { scan-assembler-not {\tld2w\t} } } */
+/* { dg-final { scan-assembler-not {\tld2d\t} { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 2 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 2 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 2 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_5_run.c
new file mode 100644
index 0000000..043c9b8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_5_run.c
@@ -0,0 +1,35 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#include "slp_5.c"
+
+#define N (141 * 2)
+
+#define HARNESS(TYPE) \
+ { \
+ TYPE a[N], b[2] = { 40, 22 }; \
+ for (unsigned int i = 0; i < N; ++i) \
+ { \
+ a[i] = i * 2 + i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ vec_slp_##TYPE (a, b, N / 2); \
+ TYPE x0 = 40; \
+ TYPE x1 = 22; \
+ for (unsigned int i = 0; i < N; i += 2) \
+ { \
+ x0 += a[i]; \
+ x1 += a[i + 1]; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ /* _Float16 isn't precise enough for this. */ \
+ if ((TYPE) 0x1000 + 1 != (TYPE) 0x1000 \
+ && (x0 != b[0] || x1 != b[1])) \
+ __builtin_abort (); \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST_ALL (HARNESS)
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_6.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_6.c
new file mode 100644
index 0000000..d551fa6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_6.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -ffast-math" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE) \
+void __attribute__ ((noinline, noclone)) \
+vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \
+{ \
+ TYPE x0 = b[0]; \
+ TYPE x1 = b[1]; \
+ TYPE x2 = b[2]; \
+ for (int i = 0; i < n; ++i) \
+ { \
+ x0 += a[i * 3]; \
+ x1 += a[i * 3 + 1]; \
+ x2 += a[i * 3 + 2]; \
+ } \
+ b[0] = x0; \
+ b[1] = x1; \
+ b[2] = x2; \
+}
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (uint8_t) \
+ T (int16_t) \
+ T (uint16_t) \
+ T (int32_t) \
+ T (uint32_t) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (_Float16) \
+ T (float) \
+ T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* These loops can't use SLP. */
+/* { dg-final { scan-assembler-not {\tld1b\t} } } */
+/* { dg-final { scan-assembler-not {\tld1h\t} } } */
+/* { dg-final { scan-assembler-not {\tld1w\t} } } */
+/* { dg-final { scan-assembler-not {\tld1d\t} } } */
+/* { dg-final { scan-assembler {\tld3b\t} } } */
+/* { dg-final { scan-assembler {\tld3h\t} } } */
+/* { dg-final { scan-assembler {\tld3w\t} } } */
+/* { dg-final { scan-assembler {\tld3d\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_6_run.c
new file mode 100644
index 0000000..f1aeb5c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_6_run.c
@@ -0,0 +1,37 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#include "slp_6.c"
+
+#define N (77 * 3)
+
+#define HARNESS(TYPE) \
+ { \
+ TYPE a[N], b[3] = { 40, 22, 75 }; \
+ for (unsigned int i = 0; i < N; ++i) \
+ { \
+ a[i] = i * 2 + i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ vec_slp_##TYPE (a, b, N / 3); \
+ TYPE x0 = 40; \
+ TYPE x1 = 22; \
+ TYPE x2 = 75; \
+ for (unsigned int i = 0; i < N; i += 3) \
+ { \
+ x0 += a[i]; \
+ x1 += a[i + 1]; \
+ x2 += a[i + 2]; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ /* _Float16 isn't precise enough for this. */ \
+ if ((TYPE) 0x1000 + 1 != (TYPE) 0x1000 \
+ && (x0 != b[0] || x1 != b[1] || x2 != b[2])) \
+ __builtin_abort (); \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST_ALL (HARNESS)
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_7.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_7.c
new file mode 100644
index 0000000..76abbdc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_7.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -ffast-math" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE) \
+void __attribute__ ((noinline, noclone)) \
+vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n) \
+{ \
+ TYPE x0 = b[0]; \
+ TYPE x1 = b[1]; \
+ TYPE x2 = b[2]; \
+ TYPE x3 = b[3]; \
+ for (int i = 0; i < n; ++i) \
+ { \
+ x0 += a[i * 4]; \
+ x1 += a[i * 4 + 1]; \
+ x2 += a[i * 4 + 2]; \
+ x3 += a[i * 4 + 3]; \
+ } \
+ b[0] = x0; \
+ b[1] = x1; \
+ b[2] = x2; \
+ b[3] = x3; \
+}
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (uint8_t) \
+ T (int16_t) \
+ T (uint16_t) \
+ T (int32_t) \
+ T (uint32_t) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (_Float16) \
+ T (float) \
+ T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* We can't use SLP for the 64-bit loops, since the number of reduction
+ results might be greater than the number of elements in the vector.
+ Otherwise we have two loads per loop, one for the initial vector
+ and one for the loop body. */
+/* ??? At present we don't treat the int8_t and int16_t loops as
+ reductions. */
+/* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */
+/* { dg-final { scan-assembler-times {\tld4d\t} 3 } } */
+/* { dg-final { scan-assembler-not {\tld4b\t} } } */
+/* { dg-final { scan-assembler-not {\tld4h\t} } } */
+/* { dg-final { scan-assembler-not {\tld4w\t} } } */
+/* { dg-final { scan-assembler-not {\tld1d\t} } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 8 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 8 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 8 } } */
+/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 8 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */
+/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_7_run.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_7_run.c
new file mode 100644
index 0000000..3cc090d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_7_run.c
@@ -0,0 +1,39 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+
+#include "slp_7.c"
+
+#define N (54 * 4)
+
+#define HARNESS(TYPE) \
+ { \
+ TYPE a[N], b[4] = { 40, 22, 75, 19 }; \
+ for (unsigned int i = 0; i < N; ++i) \
+ { \
+ a[i] = i * 2 + i % 5; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ vec_slp_##TYPE (a, b, N / 4); \
+ TYPE x0 = 40; \
+ TYPE x1 = 22; \
+ TYPE x2 = 75; \
+ TYPE x3 = 19; \
+ for (unsigned int i = 0; i < N; i += 4) \
+ { \
+ x0 += a[i]; \
+ x1 += a[i + 1]; \
+ x2 += a[i + 2]; \
+ x3 += a[i + 3]; \
+ asm volatile ("" ::: "memory"); \
+ } \
+ /* _Float16 isn't precise enough for this. */ \
+ if ((TYPE) 0x1000 + 1 != (TYPE) 0x1000 \
+ && (x0 != b[0] || x1 != b[1] || x2 != b[2] || x3 != b[3])) \
+ __builtin_abort (); \
+ }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+ TEST_ALL (HARNESS)
+}
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 5d6f1ab..9219a0d 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2451,6 +2451,54 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
}
}
+/* If there is a neutral value X such that SLP reduction NODE would not
+ be affected by the introduction of additional X elements, return that X,
+ otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
+ is true if the SLP statements perform a single reduction, false if each
+ statement performs an independent reduction. */
+
+static tree
+neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
+ bool reduc_chain)
+{
+ vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+ gimple *stmt = stmts[0];
+ stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
+ tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
+ tree scalar_type = TREE_TYPE (vector_type);
+ struct loop *loop = gimple_bb (stmt)->loop_father;
+ gcc_assert (loop);
+
+ switch (code)
+ {
+ case WIDEN_SUM_EXPR:
+ case DOT_PROD_EXPR:
+ case SAD_EXPR:
+ case PLUS_EXPR:
+ case MINUS_EXPR:
+ case BIT_IOR_EXPR:
+ case BIT_XOR_EXPR:
+ return build_zero_cst (scalar_type);
+
+ case MULT_EXPR:
+ return build_one_cst (scalar_type);
+
+ case BIT_AND_EXPR:
+ return build_all_ones_cst (scalar_type);
+
+ case MAX_EXPR:
+ case MIN_EXPR:
+ /* For MIN/MAX the initial values are neutral. A reduction chain
+ has only a single initial value, so that value is neutral for
+ all statements. */
+ if (reduc_chain)
+ return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
+ return NULL_TREE;
+
+ default:
+ return NULL_TREE;
+ }
+}
/* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
STMT is printed with a message MSG. */
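
A side note on the MIN/MAX case in neutral_op_for_slp_reduction above: a
reduction chain's single initial value is safe to treat as neutral because
the running minimum (or maximum) can never move back past it. A minimal C
sketch of that reasoning, with hypothetical names:

    /* Precondition: M is the running minimum of a chain that started
       from INIT, so M <= INIT.  Folding in another copy of INIT then
       always returns M unchanged, which is what "neutral" means.  */
    static int
    fold_extra_init_into_min (int m, int init)
    {
      return init < m ? init : m;   /* always m, given m <= init  */
    }
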
@@ -4095,6 +4143,16 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val,
/* Option1: the first element is '0' or '1' as well. */
init_def = gimple_build_vector_from_val (&stmts, vectype,
def_for_init);
+ else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
+ {
+ /* Option2 (variable length): the first element is INIT_VAL. */
+ init_def = build_vector_from_val (vectype, def_for_init);
+ gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
+ 2, init_def, init_val);
+ init_def = make_ssa_name (vectype);
+ gimple_call_set_lhs (call, init_def);
+ gimple_seq_add_stmt (&stmts, call);
+ }
else
{
/* Option2: the first element is INIT_VAL. */
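
The variable-length branch just added corresponds to the following standalone
C sketch, with a fixed lane count N standing in for the runtime vector length
(N and the names are illustrative, not part of the patch):

    /* Model of the variable-length "option 2" sequence: splat the
       neutral value, then one IFN_VEC_SHL_INSERT puts INIT_VAL in
       element 0.  */
    #define N 8
    static void
    build_initial_def_model (int dest[N], int init_val, int neutral)
    {
      for (int i = 0; i < N; ++i)        /* build_vector_from_val  */
        dest[i] = neutral;
      for (int i = N - 1; i > 0; --i)    /* IFN_VEC_SHL_INSERT ...  */
        dest[i] = dest[i - 1];
      dest[0] = init_val;                /* ... fills element 0  */
      /* Result: { init_val, neutral, ..., neutral }.  */
    }
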
@@ -4134,34 +4192,32 @@ get_initial_def_for_reduction (gimple *stmt, tree init_val,
}
/* Get at the initial defs for the reduction PHIs in SLP_NODE.
- NUMBER_OF_VECTORS is the number of vector defs to create. */
+ NUMBER_OF_VECTORS is the number of vector defs to create.
+ If NEUTRAL_OP is nonnull, introducing extra elements of that
+ value will not change the result. */
static void
get_initial_defs_for_reduction (slp_tree slp_node,
vec<tree> *vec_oprnds,
unsigned int number_of_vectors,
- enum tree_code code, bool reduc_chain)
+ bool reduc_chain, tree neutral_op)
{
vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
gimple *stmt = stmts[0];
stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
- unsigned nunits;
+ unsigned HOST_WIDE_INT nunits;
unsigned j, number_of_places_left_in_vector;
- tree vector_type, scalar_type;
+ tree vector_type;
tree vop;
int group_size = stmts.length ();
unsigned int vec_num, i;
unsigned number_of_copies = 1;
vec<tree> voprnds;
voprnds.create (number_of_vectors);
- tree neutral_op = NULL;
struct loop *loop;
+ auto_vec<tree, 16> permute_results;
vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
- scalar_type = TREE_TYPE (vector_type);
- /* vectorizable_reduction has already rejected SLP reductions on
- variable-length vectors. */
- nunits = TYPE_VECTOR_SUBPARTS (vector_type).to_constant ();
gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
@@ -4169,45 +4225,7 @@ get_initial_defs_for_reduction (slp_tree slp_node,
gcc_assert (loop);
edge pe = loop_preheader_edge (loop);
- /* op is the reduction operand of the first stmt already. */
- /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
- we need either neutral operands or the original operands. See
- get_initial_def_for_reduction() for details. */
- switch (code)
- {
- case WIDEN_SUM_EXPR:
- case DOT_PROD_EXPR:
- case SAD_EXPR:
- case PLUS_EXPR:
- case MINUS_EXPR:
- case BIT_IOR_EXPR:
- case BIT_XOR_EXPR:
- neutral_op = build_zero_cst (scalar_type);
- break;
-
- case MULT_EXPR:
- neutral_op = build_one_cst (scalar_type);
- break;
-
- case BIT_AND_EXPR:
- neutral_op = build_all_ones_cst (scalar_type);
- break;
-
- /* For MIN/MAX we don't have an easy neutral operand but
- the initial values can be used fine here. Only for
- a reduction chain we have to force a neutral element. */
- case MAX_EXPR:
- case MIN_EXPR:
- if (! reduc_chain)
- neutral_op = NULL;
- else
- neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
- break;
-
- default:
- gcc_assert (! reduc_chain);
- neutral_op = NULL;
- }
+ gcc_assert (!reduc_chain || neutral_op);
/* NUMBER_OF_COPIES is the number of times we need to use the same values in
created vectors. It is greater than 1 if unrolling is performed.
@@ -4225,9 +4243,13 @@ get_initial_defs_for_reduction (slp_tree slp_node,
(s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
{s5, s6, s7, s8}. */
+ if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
+ nunits = group_size;
+
number_of_copies = nunits * number_of_vectors / group_size;
number_of_places_left_in_vector = nunits;
+ bool constant_p = true;
tree_vector_builder elts (vector_type, nunits, 1);
elts.quick_grow (nunits);
for (j = 0; j < number_of_copies; j++)
@@ -4247,11 +4269,48 @@ get_initial_defs_for_reduction (slp_tree slp_node,
/* Create 'vect_ = {op0,op1,...,opn}'. */
number_of_places_left_in_vector--;
elts[number_of_places_left_in_vector] = op;
+ if (!CONSTANT_CLASS_P (op))
+ constant_p = false;
if (number_of_places_left_in_vector == 0)
{
gimple_seq ctor_seq = NULL;
- tree init = gimple_build_vector (&ctor_seq, &elts);
+ tree init;
+ if (constant_p && !neutral_op
+ ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
+ : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
+ /* Build the vector directly from ELTS. */
+ init = gimple_build_vector (&ctor_seq, &elts);
+ else if (neutral_op)
+ {
+ /* Build a vector of the neutral value and shift the
+ other elements into place. */
+ init = gimple_build_vector_from_val (&ctor_seq, vector_type,
+ neutral_op);
+ int k = nunits;
+ while (k > 0 && elts[k - 1] == neutral_op)
+ k -= 1;
+ while (k > 0)
+ {
+ k -= 1;
+ gcall *call = gimple_build_call_internal
+ (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
+ init = make_ssa_name (vector_type);
+ gimple_call_set_lhs (call, init);
+ gimple_seq_add_stmt (&ctor_seq, call);
+ }
+ }
+ else
+ {
+ /* First time round, duplicate ELTS to fill the
+ required number of vectors, then cherry pick the
+ appropriate result for each iteration. */
+ if (vec_oprnds->is_empty ())
+ duplicate_and_interleave (&ctor_seq, vector_type, elts,
+ number_of_vectors,
+ permute_results);
+ init = permute_results[number_of_vectors - j - 1];
+ }
if (ctor_seq != NULL)
gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
voprnds.quick_push (init);
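
The shift-and-insert loop above generalizes the same idea to a whole SLP
group: inserting the elements from the highest non-neutral index down to
element 0 leaves elts[j] in lane j. A hedged C model, again with a fixed
lane count VL in place of the runtime length (names are illustrative):

    #define VL 8
    static void
    build_group_initial_def (int dest[VL], const int *elts, int nelts,
                             int neutral)
    {
      for (int i = 0; i < VL; ++i)
        dest[i] = neutral;
      for (int k = nelts; k-- > 0; )
        {
          for (int i = VL - 1; i > 0; --i)  /* one IFN_VEC_SHL_INSERT  */
            dest[i] = dest[i - 1];
          dest[0] = elts[k];
        }
      /* Now dest[j] == elts[j] for j < nelts, neutral elsewhere.  */
    }
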
@@ -4259,6 +4318,7 @@ get_initial_defs_for_reduction (slp_tree slp_node,
number_of_places_left_in_vector = nunits;
elts.new_vector (vector_type, nunits, 1);
elts.quick_grow (nunits);
+ constant_p = true;
}
}
}
@@ -4328,6 +4388,8 @@ get_initial_defs_for_reduction (slp_tree slp_node,
be smaller than any value of the IV in the loop, for MIN_EXPR larger than
any value of the IV in the loop.
INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
+ NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
+ null if this is not an SLP reduction.
This function:
1. Creates the reduction def-use cycles: sets the arguments for
@@ -4376,7 +4438,8 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
bool double_reduc,
slp_tree slp_node,
slp_instance slp_node_instance,
- tree induc_val, enum tree_code induc_code)
+ tree induc_val, enum tree_code induc_code,
+ tree neutral_op)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
stmt_vec_info prev_phi_info;
@@ -4412,6 +4475,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
auto_vec<tree> vec_initial_defs;
auto_vec<gimple *> phis;
bool slp_reduc = false;
+ bool direct_slp_reduc;
tree new_phi_result;
gimple *inner_phi = NULL;
tree induction_index = NULL_TREE;
@@ -4455,8 +4519,9 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
vec_initial_defs.reserve (vec_num);
get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
- &vec_initial_defs, vec_num, code,
- GROUP_FIRST_ELEMENT (stmt_info));
+ &vec_initial_defs, vec_num,
+ GROUP_FIRST_ELEMENT (stmt_info),
+ neutral_op);
}
else
{
@@ -4763,6 +4828,12 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
b2 = operation (b1) */
slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
+ /* True if we should implement SLP_REDUC using native reduction operations
+ instead of scalar operations. */
+ direct_slp_reduc = (reduc_fn != IFN_LAST
+ && slp_reduc
+ && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
+
/* In case of reduction chain, e.g.,
# a1 = phi <a3, a0>
a2 = operation (a1)
@@ -4770,7 +4841,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
we may end up with more than one vector result. Here we reduce them to
one vector. */
- if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
+ if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
{
tree first_vect = PHI_RESULT (new_phis[0]);
gassign *new_vec_stmt = NULL;
@@ -5061,6 +5132,83 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
scalar_results.safe_push (new_temp);
}
+ else if (direct_slp_reduc)
+ {
+ /* Here we create one vector for each of the GROUP_SIZE results,
+ with the elements for other SLP statements replaced with the
+ neutral value. We can then do a normal reduction on each vector. */
+
+ /* Enforced by vectorizable_reduction. */
+ gcc_assert (new_phis.length () == 1);
+ gcc_assert (pow2p_hwi (group_size));
+
+ slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
+ vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
+ gimple_seq seq = NULL;
+
+ /* Build a vector {0, 1, 2, ...}, with the same number of elements
+ and the same element size as VECTYPE. */
+ tree index = build_index_vector (vectype, 0, 1);
+ tree index_type = TREE_TYPE (index);
+ tree index_elt_type = TREE_TYPE (index_type);
+ tree mask_type = build_same_sized_truth_vector_type (index_type);
+
+ /* Create a vector that, for each element, identifies which of
+ the GROUP_SIZE results should use it. */
+ tree index_mask = build_int_cst (index_elt_type, group_size - 1);
+ index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
+ build_vector_from_val (index_type, index_mask));
+
+ /* Get a neutral vector value. This is simply a splat of the neutral
+ scalar value if we have one, otherwise the initial scalar value
+ is itself a neutral value. */
+ tree vector_identity = NULL_TREE;
+ if (neutral_op)
+ vector_identity = gimple_build_vector_from_val (&seq, vectype,
+ neutral_op);
+ for (unsigned int i = 0; i < group_size; ++i)
+ {
+ /* If there's no universal neutral value, we can use the
+ initial scalar value from the original PHI. This is used
+ for MIN and MAX reduction, for example. */
+ if (!neutral_op)
+ {
+ tree scalar_value
+ = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
+ loop_preheader_edge (loop));
+ vector_identity = gimple_build_vector_from_val (&seq, vectype,
+ scalar_value);
+ }
+
+ /* Calculate the equivalent of:
+
+ sel[j] = (index[j] == i);
+
+ which selects the elements of NEW_PHI_RESULT that should
+ be included in the result. */
+ tree compare_val = build_int_cst (index_elt_type, i);
+ compare_val = build_vector_from_val (index_type, compare_val);
+ tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
+ index, compare_val);
+
+ /* Calculate the equivalent of:
+
+ vec = sel ? new_phi_result : vector_identity;
+
+ VEC is now suitable for a full vector reduction. */
+ tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
+ sel, new_phi_result, vector_identity);
+
+ /* Do the reduction and convert it to the appropriate type. */
+ gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
+ tree scalar = make_ssa_name (TREE_TYPE (vectype));
+ gimple_call_set_lhs (call, scalar);
+ gimple_seq_add_stmt (&seq, call);
+ scalar = gimple_convert (&seq, scalar_type, scalar);
+ scalar_results.safe_push (scalar);
+ }
+ gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
+ }
else
{
bool reduce_with_shift;
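
The direct_slp_reduc epilogue above can be summarized with a small C model,
shown here for PLUS_EXPR (whose neutral value is 0) and a power-of-two group
size; the lane count VLEN and the names are illustrative only:

    /* Lane j of the single accumulator belongs to SLP statement
       j & (group_size - 1), so each scalar result is recovered by
       masking out the other statements' lanes and doing one full
       vector reduction.  */
    #define VLEN 8
    static void
    slp_epilogue_model (const int acc[VLEN], int group_size, int *results)
    {
      for (int i = 0; i < group_size; ++i)
        {
          int sum = 0;                       /* IFN_REDUC_PLUS  */
          for (int j = 0; j < VLEN; ++j)
            sum += (j & (group_size - 1)) == i ? acc[j] : 0;
          results[i] = sum;
        }
    }
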
@@ -6412,25 +6560,64 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
return false;
}
- if (double_reduc && !nunits_out.is_constant ())
+ /* For SLP reductions, see if there is a neutral value we can use. */
+ tree neutral_op = NULL_TREE;
+ if (slp_node)
+ neutral_op
+ = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
+ GROUP_FIRST_ELEMENT (stmt_info) != NULL);
+
+ /* For double reductions, and for SLP reductions with a neutral value,
+ we construct a variable-length initial vector by loading a vector
+ full of the neutral value and then shift-and-inserting the start
+ values into the low-numbered elements. */
+ if ((double_reduc || neutral_op)
+ && !nunits_out.is_constant ()
+ && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
+ vectype_out, OPTIMIZE_FOR_SPEED))
{
- /* The current double-reduction code creates the initial value
- element-by-element. */
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "double reduction not supported for variable-length"
- " vectors.\n");
+ "reduction on variable-length vectors requires"
+ " target support for a vector-shift-and-insert"
+ " operation.\n");
return false;
}
- if (slp_node && !nunits_out.is_constant ())
- {
- /* The current SLP code creates the initial value element-by-element. */
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "SLP reduction not supported for variable-length"
- " vectors.\n");
- return false;
+ /* Check extra constraints for variable-length unchained SLP reductions. */
+ if (STMT_SLP_TYPE (stmt_info)
+ && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
+ && !nunits_out.is_constant ())
+ {
+ /* We checked above that we could build the initial vector when
+ there's a neutral element value. Check here for the case in
+ which each SLP statement has its own initial value and in which
+ that value needs to be repeated for every instance of the
+ statement within the initial vector. */
+ unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+ scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
+ if (!neutral_op
+ && !can_duplicate_and_interleave_p (group_size, elt_mode))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "unsupported form of SLP reduction for"
+ " variable-length vectors: cannot build"
+ " initial vector.\n");
+ return false;
+ }
+ /* The epilogue code relies on the number of elements being a multiple
+ of the group size. The duplicate-and-interleave approach to setting
+ up the initial vector does too. */
+ if (!multiple_p (nunits_out, group_size))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "unsupported form of SLP reduction for"
+ " variable-length vectors: the vector size"
+ " is not a multiple of the number of results.\n");
+ return false;
+ }
}
/* In case of widening multiplication by a constant, we update the type
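
The multiple_p requirement above is easy to check numerically. A small C
sketch (the sampling loop and names are illustrative) shows why 32-bit lanes
support group sizes 2 and 4 but not 3, and why 64-bit lanes fail for group
size 4:

    /* An SVE vector of ELT_BITS-bit lanes has (128 + n * 128) / ELT_BITS
       lanes for some runtime n >= 0; the check must hold for every n.
       Sampling small n is enough to exhibit the failures.  */
    static int
    nunits_multiple_of_group_p (int elt_bits, int group_size)
    {
      for (int n = 0; n < 16; ++n)
        if (((128 + n * 128) / elt_bits) % group_size != 0)
          return 0;
      return 1;
    }
    /* 32-bit lanes pass for group sizes 2 and 4 but fail for 3, matching
       the LD3 fallback in slp_6.c; 64-bit lanes fail for group size 4,
       matching the slp_7.c comment about the 64-bit loops.  */
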
@@ -6698,7 +6885,8 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
epilog_copies, reduc_fn, phis,
double_reduc, slp_node, slp_node_instance,
- cond_reduc_val, cond_reduc_op_code);
+ cond_reduc_val, cond_reduc_op_code,
+ neutral_op);
return true;
}
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 5f6a33a..7fae17b 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -216,11 +216,11 @@ vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
(if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
(if nonnull). */
-static bool
+bool
can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
- unsigned int *nvectors_out = NULL,
- tree *vector_type_out = NULL,
- tree *permutes = NULL)
+ unsigned int *nvectors_out,
+ tree *vector_type_out,
+ tree *permutes)
{
poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
poly_int64 nelts;
@@ -3309,7 +3309,7 @@ vect_mask_constant_operand_p (gimple *stmt, int opnum)
We try to find the largest IM for which this sequence works, in order
to cut down on the number of interleaves. */
-static void
+void
duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
unsigned int nresults, vec<tree> &results)
{
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 000688d..1effcad 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1352,6 +1352,11 @@ extern void vect_get_slp_defs (vec<tree> , slp_tree, vec<vec<tree> > *);
extern bool vect_slp_bb (basic_block);
extern gimple *vect_find_last_scalar_stmt_in_slp (slp_tree);
extern bool is_simple_and_all_uses_invariant (gimple *, loop_vec_info);
+extern bool can_duplicate_and_interleave_p (unsigned int, machine_mode,
+ unsigned int * = NULL,
+ tree * = NULL, tree * = NULL);
+extern void duplicate_and_interleave (gimple_seq *, tree, vec<tree>,
+ unsigned int, vec<tree> &);
/* In tree-vect-patterns.c. */
/* Pattern recognition functions.