aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIra Rosen <irar@il.ibm.com>2008-08-28 11:11:14 +0000
committerIra Rosen <irar@gcc.gnu.org>2008-08-28 11:11:14 +0000
commit0fca40f598654f83453b17e44f902183859b16e4 (patch)
treee0ca36b1b58d3d02e70b7343f52d0af07126a5b3
parentb8c41c8ed2c55f001b14b0a432f4daa73155d278 (diff)
downloadgcc-0fca40f598654f83453b17e44f902183859b16e4.zip
gcc-0fca40f598654f83453b17e44f902183859b16e4.tar.gz
gcc-0fca40f598654f83453b17e44f902183859b16e4.tar.bz2
target.h (struct vectorize): Add new target builtin.
* target.h (struct vectorize): Add new target builtin. * tree-vectorizer.c (destroy_loop_vec_info): Call vect_free_slp_instance instead of vect_free_slp_node. * tree-vectorizer.h (enum slp_load_perm_type): New. (struct _slp_instance): Add new fields. (SLP_INSTANCE_LOAD_PERMUTATION): New. (SLP_INSTANCE_LOADS): New. (vect_free_slp_tree): Remove. (vect_free_slp_instance): Declare. (SLP_TREE_LOADS_PERM_TYPE, TARG_VEC_PERMUTE_COST): New. (vectorizable_load): Add argument. (vect_transform_slp_perm_load): New. * tree-vect-analyze.c (vect_analyze_operations): Add an argument to vectorizable_load. (vect_get_place_in_interleaving_chain): New function. (vect_free_slp_tree): Make static. (vect_free_slp_instance): New function. (vect_build_slp_tree): Add new arguments. Allow load permutations and collect the load location in the interleaving chain. (vect_supported_slp_permutation_p): New function. (vect_supported_load_permutation_p): Likewise. (vect_analyze_slp_instance): In case of loads permutation, call vect_supported_load_permutation_p to check that the permutation is supported. * target-def.h (TARGET_VECTORIZE_BUILTIN_VEC_PERM): New. * tree-vect-transform.c (vect_transform_stmt): Add new argument. (vect_create_mask_and_perm): New function. (vect_get_mask_element, vect_transform_slp_perm_load): Likewise. (vectorizable_load): Add an argument. Don't keep the created vectors statements in the node if permutation is required. Call vect_transform_slp_perm_load to generate the permutation. (vect_transform_stmt): Add new argument. Call vectorizable_load with additional argument. (vect_schedule_slp_instance): In case of loads permutation, allocate vectorized statements structure for all the related SLP nodes. Call vect_transform_stmt with addditional argument. (vect_transform_loop): Call vect_transform_stmt with correct arguments. * config/spu/spu.c (spu_builtin_vec_perm): New. (TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine. * config/spu/spu.h (TARG_VEC_PERMUTE_COS): Define. * config/rs6000/rs6000.c (rs6000_builtin_vec_perm): New. (TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine. From-SVN: r139706
-rw-r--r--gcc/ChangeLog45
-rw-r--r--gcc/config/rs6000/rs6000.c37
-rw-r--r--gcc/config/spu/spu.c58
-rw-r--r--gcc/config/spu/spu.h5
-rw-r--r--gcc/target-def.h4
-rw-r--r--gcc/target.h5
-rw-r--r--gcc/testsuite/ChangeLog13
-rw-r--r--gcc/testsuite/gcc.dg/vect/slp-perm-1.c60
-rw-r--r--gcc/testsuite/gcc.dg/vect/slp-perm-2.c55
-rw-r--r--gcc/testsuite/gcc.dg/vect/slp-perm-3.c70
-rw-r--r--gcc/testsuite/gcc.dg/vect/slp-perm-4.c85
-rw-r--r--gcc/testsuite/gcc.dg/vect/slp-perm-5.c77
-rw-r--r--gcc/testsuite/gcc.dg/vect/slp-perm-6.c77
-rw-r--r--gcc/testsuite/gcc.dg/vect/slp-perm-7.c76
-rw-r--r--gcc/testsuite/gcc.dg/vect/slp-perm-8.c57
-rw-r--r--gcc/testsuite/gcc.dg/vect/slp-perm-9.c58
-rw-r--r--gcc/testsuite/lib/target-supports.exp22
-rw-r--r--gcc/tree-vect-analyze.c339
-rw-r--r--gcc/tree-vect-transform.c405
-rw-r--r--gcc/tree-vectorizer.c3
-rw-r--r--gcc/tree-vectorizer.h23
21 files changed, 1461 insertions, 113 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 6430e4f..1f7396f 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,48 @@
+2008-08-28 Ira Rosen <irar@il.ibm.com>
+
+ * target.h (struct vectorize): Add new target builtin.
+ * tree-vectorizer.c (destroy_loop_vec_info): Call
+ vect_free_slp_instance instead of vect_free_slp_node.
+ * tree-vectorizer.h (enum slp_load_perm_type): New.
+ (struct _slp_instance): Add new fields.
+ (SLP_INSTANCE_LOAD_PERMUTATION): New.
+ (SLP_INSTANCE_LOADS): New.
+ (vect_free_slp_tree): Remove.
+ (vect_free_slp_instance): Declare.
+ (SLP_TREE_LOADS_PERM_TYPE, TARG_VEC_PERMUTE_COST): New.
+ (vectorizable_load): Add argument.
+ (vect_transform_slp_perm_load): New.
+ * tree-vect-analyze.c (vect_analyze_operations): Add an argument to
+ vectorizable_load.
+ (vect_get_place_in_interleaving_chain): New function.
+ (vect_free_slp_tree): Make static.
+ (vect_free_slp_instance): New function.
+ (vect_build_slp_tree): Add new arguments. Allow load permutations and
+ collect the load location in the interleaving chain.
+ (vect_supported_slp_permutation_p): New function.
+ (vect_supported_load_permutation_p): Likewise.
+ (vect_analyze_slp_instance): In case of loads permutation, call
+ vect_supported_load_permutation_p to check that the permutation is
+ supported.
+ * target-def.h (TARGET_VECTORIZE_BUILTIN_VEC_PERM): New.
+ * tree-vect-transform.c (vect_transform_stmt): Add new argument.
+ (vect_create_mask_and_perm): New function.
+ (vect_get_mask_element, vect_transform_slp_perm_load): Likewise.
+ (vectorizable_load): Add an argument. Don't keep the created vectors
+ statements in the node if permutation is required. Call
+ vect_transform_slp_perm_load to generate the permutation.
+ (vect_transform_stmt): Add new argument. Call vectorizable_load with
+ additional argument.
+ (vect_schedule_slp_instance): In case of loads permutation, allocate
+ vectorized statements structure for all the related SLP nodes. Call
+ vect_transform_stmt with addditional argument.
+ (vect_transform_loop): Call vect_transform_stmt with correct arguments.
+ * config/spu/spu.c (spu_builtin_vec_perm): New.
+ (TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
+ * config/spu/spu.h (TARG_VEC_PERMUTE_COS): Define.
+ * config/rs6000/rs6000.c (rs6000_builtin_vec_perm): New.
+ (TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
+
2008-08-28 Chris Fairles <chris.fairles@gmail.com>
* gthr-posix.h (__gthread_create, __gthread_join, __gthread_detach,
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 83997ff..2124ea3 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -862,6 +862,7 @@ static tree rs6000_builtin_mask_for_load (void);
static tree rs6000_builtin_mul_widen_even (tree);
static tree rs6000_builtin_mul_widen_odd (tree);
static tree rs6000_builtin_conversion (enum tree_code, tree);
+static tree rs6000_builtin_vec_perm (tree, tree *);
static void def_builtin (int, const char *, tree, int);
static bool rs6000_vector_alignment_reachable (const_tree, bool);
@@ -1138,6 +1139,8 @@ static const char alt_reg_names[][8] =
#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD rs6000_builtin_mul_widen_odd
#undef TARGET_VECTORIZE_BUILTIN_CONVERSION
#define TARGET_VECTORIZE_BUILTIN_CONVERSION rs6000_builtin_conversion
+#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
+#define TARGET_VECTORIZE_BUILTIN_VEC_PERM rs6000_builtin_vec_perm
#undef TARGET_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTOR_ALIGNMENT_REACHABLE rs6000_vector_alignment_reachable
@@ -2080,6 +2083,40 @@ rs6000_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_pac
}
}
+/* Implement targetm.vectorize.builtin_vec_perm. */
+tree
+rs6000_builtin_vec_perm (tree type, tree *mask_element_type)
+{
+ tree d;
+
+ *mask_element_type = unsigned_char_type_node;
+
+ switch (TYPE_MODE (type))
+ {
+ case V16QImode:
+ d = rs6000_builtin_decls[ALTIVEC_BUILTIN_VPERM_16QI];
+ break;
+
+ case V8HImode:
+ d = rs6000_builtin_decls[ALTIVEC_BUILTIN_VPERM_8HI];
+ break;
+
+ case V4SImode:
+ d = rs6000_builtin_decls[ALTIVEC_BUILTIN_VPERM_4SI];
+ break;
+
+ case V4SFmode:
+ d = rs6000_builtin_decls[ALTIVEC_BUILTIN_VPERM_4SF];
+ break;
+
+ default:
+ return NULL_TREE;
+ }
+
+ gcc_assert (d);
+ return d;
+}
+
/* Handle generic options of the form -mfoo=yes/no.
NAME is the option name.
VALUE is the option value.
diff --git a/gcc/config/spu/spu.c b/gcc/config/spu/spu.c
index 1021a91..da99d3f 100644
--- a/gcc/config/spu/spu.c
+++ b/gcc/config/spu/spu.c
@@ -137,6 +137,7 @@ static tree spu_builtin_mul_widen_odd (tree);
static tree spu_builtin_mask_for_load (void);
static int spu_builtin_vectorization_cost (bool);
static bool spu_vector_alignment_reachable (const_tree, bool);
+static tree spu_builtin_vec_perm (tree, tree *);
static int spu_sms_res_mii (struct ddg *g);
extern const char *reg_names[];
@@ -288,6 +289,9 @@ const struct attribute_spec spu_attribute_table[];
#undef TARGET_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTOR_ALIGNMENT_REACHABLE spu_vector_alignment_reachable
+#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
+#define TARGET_VECTORIZE_BUILTIN_VEC_PERM spu_builtin_vec_perm
+
#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE spu_libgcc_cmp_return_mode
@@ -5543,6 +5547,60 @@ spu_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_packed
return true;
}
+/* Implement targetm.vectorize.builtin_vec_perm. */
+tree
+spu_builtin_vec_perm (tree type, tree *mask_element_type)
+{
+ struct spu_builtin_description *d;
+
+ *mask_element_type = unsigned_char_type_node;
+
+ switch (TYPE_MODE (type))
+ {
+ case V16QImode:
+ if (TYPE_UNSIGNED (type))
+ d = &spu_builtins[SPU_SHUFFLE_0];
+ else
+ d = &spu_builtins[SPU_SHUFFLE_1];
+ break;
+
+ case V8HImode:
+ if (TYPE_UNSIGNED (type))
+ d = &spu_builtins[SPU_SHUFFLE_2];
+ else
+ d = &spu_builtins[SPU_SHUFFLE_3];
+ break;
+
+ case V4SImode:
+ if (TYPE_UNSIGNED (type))
+ d = &spu_builtins[SPU_SHUFFLE_4];
+ else
+ d = &spu_builtins[SPU_SHUFFLE_5];
+ break;
+
+ case V2DImode:
+ if (TYPE_UNSIGNED (type))
+ d = &spu_builtins[SPU_SHUFFLE_6];
+ else
+ d = &spu_builtins[SPU_SHUFFLE_7];
+ break;
+
+ case V4SFmode:
+ d = &spu_builtins[SPU_SHUFFLE_8];
+ break;
+
+ case V2DFmode:
+ d = &spu_builtins[SPU_SHUFFLE_9];
+ break;
+
+ default:
+ return NULL_TREE;
+ }
+
+ gcc_assert (d);
+ return d->fndecl;
+}
+
/* Count the total number of instructions in each pipe and return the
maximum, which is used as the Minimum Iteration Interval (MII)
in the modulo scheduler. get_pipe() will return -2, -1, 0, or 1.
diff --git a/gcc/config/spu/spu.h b/gcc/config/spu/spu.h
index 86042aa..b27e9b7 100644
--- a/gcc/config/spu/spu.h
+++ b/gcc/config/spu/spu.h
@@ -572,6 +572,11 @@ targetm.resolve_overloaded_builtin = spu_resolve_overloaded_builtin; \
#undef TARG_VEC_STORE_COST
#define TARG_VEC_STORE_COST 1
+/* Cost of vector permutation. */
+#ifndef TARG_VEC_PERMUTE_COST
+#define TARG_VEC_PERMUTE_COST 1
+#endif
+
/* Misc */
diff --git a/gcc/target-def.h b/gcc/target-def.h
index cd64b38..18b0eb5 100644
--- a/gcc/target-def.h
+++ b/gcc/target-def.h
@@ -364,6 +364,7 @@
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST 0
#define TARGET_VECTOR_ALIGNMENT_REACHABLE \
default_builtin_vector_alignment_reachable
+#define TARGET_VECTORIZE_BUILTIN_VEC_PERM 0
#define TARGET_VECTORIZE \
{ \
@@ -373,7 +374,8 @@
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN, \
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD, \
TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST, \
- TARGET_VECTOR_ALIGNMENT_REACHABLE \
+ TARGET_VECTOR_ALIGNMENT_REACHABLE, \
+ TARGET_VECTORIZE_BUILTIN_VEC_PERM \
}
#define TARGET_DEFAULT_TARGET_FLAGS 0
diff --git a/gcc/target.h b/gcc/target.h
index 3a104c5..610d765 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -438,7 +438,10 @@ struct gcc_target
/* Return true if vector alignment is reachable (by peeling N
iterations) for the given type. */
bool (* vector_alignment_reachable) (const_tree, bool);
- } vectorize;
+
+ /* Target builtin that implements vector permute. */
+ tree (* builtin_vec_perm) (tree, tree*);
+} vectorize;
/* The initial value of target_flags. */
int default_target_flags;
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 707ccb7..d5b6c83 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,16 @@
+2008-08-28 Ira Rosen <irar@il.ibm.com>
+
+ * lib/target-supports.exp (check_effective_target_vect_perm): New.
+ * gcc.dg/vect/slp-perm-1.c: New testcase.
+ * gcc.dg/vect/slp-perm-2.c: New testcase.
+ * gcc.dg/vect/slp-perm-3.c: New testcase.
+ * gcc.dg/vect/slp-perm-4.c: New testcase.
+ * gcc.dg/vect/slp-perm-5.c: New testcase.
+ * gcc.dg/vect/slp-perm-6.c: New testcase.
+ * gcc.dg/vect/slp-perm-7.c: New testcase.
+ * gcc.dg/vect/slp-perm-8.c: New testcase.
+ * gcc.dg/vect/slp-perm-9.c: New testcase.
+
2008-08-27 Manuel Lopez-Ibanez <manu@gcc.gnu.org>
PR 37217
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-1.c b/gcc/testsuite/gcc.dg/vect/slp-perm-1.c
new file mode 100644
index 0000000..410758c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-1.c
@@ -0,0 +1,60 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define M00 100
+#define M10 216
+#define M20 23
+#define M01 1322
+#define M11 13
+#define M21 27271
+#define M02 74
+#define M12 191
+#define M22 500
+
+#define N 16
+
+void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
+{
+ unsigned int i, a, b, c;
+
+ for (i = 0; i < N / 3; i++)
+ {
+ a = *pInput++;
+ b = *pInput++;
+ c = *pInput++;
+
+ *pOutput++ = M00 * a + M01 * b + M02 * c;
+ *pOutput++ = M10 * a + M11 * b + M12 * c;
+ *pOutput++ = M20 * a + M21 * b + M22 * c;
+ }
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned int input[N], output[N], i;
+ unsigned int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
+
+ for (i = 0; i < N; i++)
+ {
+ input[i] = i%256;
+ if (input[i] > 200)
+ abort();
+ output[i] = 0;
+ }
+
+ foo (input, output);
+
+ for (i = 0; i < N; i++)
+ if (output[i] != check_results[i])
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-2.c b/gcc/testsuite/gcc.dg/vect/slp-perm-2.c
new file mode 100644
index 0000000..da38a8d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-2.c
@@ -0,0 +1,55 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define M00 100
+#define M10 216
+#define M01 1322
+#define M11 13
+#define M02 74
+#define M12 191
+
+#define N 16
+
+void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
+{
+ unsigned int i, a, b;
+
+ for (i = 0; i < N / 2; i++)
+ {
+ a = *pInput++;
+ b = *pInput++;
+
+ *pOutput++ = M00 * a + M01 * b;
+ *pOutput++ = M10 * a + M11 * b;
+ }
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned int input[N], output[N], i;
+ unsigned int check_results[N] = {1322, 13, 4166, 471, 7010, 929, 9854, 1387, 12698, 1845, 15542, 2303, 18386, 2761, 21230, 3219};
+
+ for (i = 0; i < N; i++)
+ {
+ input[i] = i%256;
+ if (input[i] > 200)
+ abort();
+ output[i] = 0;
+ }
+
+ foo (input, output);
+
+ for (i = 0; i < N; i++)
+ if (output[i] != check_results[i])
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-3.c b/gcc/testsuite/gcc.dg/vect/slp-perm-3.c
new file mode 100644
index 0000000..312db31
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-3.c
@@ -0,0 +1,70 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define M00 100
+#define M10 216
+#define M20 23
+#define M30 237
+#define M01 1322
+#define M11 13
+#define M21 27271
+#define M31 2280
+#define M02 74
+#define M12 191
+#define M22 500
+#define M32 111
+#define M03 134
+#define M13 117
+#define M23 11
+#define M33 771
+
+#define N 16
+
+void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
+{
+ unsigned int i, a, b, c, d;
+
+ for (i = 0; i < N / 4; i++)
+ {
+ a = *pInput++;
+ b = *pInput++;
+ c = *pInput++;
+ d = *pInput++;
+
+ *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;
+ *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;
+ *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;
+ *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;
+ }
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned int input[N], output[N], i;
+ unsigned int check_results[N] = {1872, 746, 28304, 4815, 8392, 2894, 139524, 18411, 14912, 5042, 250744, 32007, 21432, 7190, 361964, 45603};
+
+ for (i = 0; i < N; i++)
+ {
+ input[i] = i%256;
+ if (input[i] > 200)
+ abort();
+ output[i] = 0;
+ }
+
+ foo (input, output);
+
+ for (i = 0; i < N - N; i++)
+ if (output[i] != check_results[i])
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-4.c b/gcc/testsuite/gcc.dg/vect/slp-perm-4.c
new file mode 100644
index 0000000..f4db75d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-4.c
@@ -0,0 +1,85 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define M00 100
+#define M10 216
+#define M20 23
+#define M30 237
+#define M40 437
+
+#define M01 1322
+#define M11 13
+#define M21 27271
+#define M31 2280
+#define M41 284
+
+#define M02 74
+#define M12 191
+#define M22 500
+#define M32 111
+#define M42 1114
+
+#define M03 134
+#define M13 117
+#define M23 11
+#define M33 771
+#define M43 71
+
+#define M04 334
+#define M14 147
+#define M24 115
+#define M34 7716
+#define M44 16
+
+#define N 16
+
+void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
+{
+ unsigned int i, a, b, c, d, e;
+
+ for (i = 0; i < N / 5; i++)
+ {
+ a = *pInput++;
+ b = *pInput++;
+ c = *pInput++;
+ d = *pInput++;
+ e = *pInput++;
+
+ *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
+ *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
+ *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
+ *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
+ *pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
+ }
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned int input[N], output[N], i;
+ unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0};
+
+ for (i = 0; i < N; i++)
+ {
+ input[i] = i%256;
+ if (input[i] > 200)
+ abort();
+ output[i] = 0;
+ }
+
+ foo (input, output);
+
+ for (i = 0; i < N - N; i++)
+ if (output[i] != check_results[i])
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */
+/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-5.c b/gcc/testsuite/gcc.dg/vect/slp-perm-5.c
new file mode 100644
index 0000000..ca59a5e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-5.c
@@ -0,0 +1,77 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define M00 100
+#define M10 216
+#define M20 23
+#define M01 1322
+#define M11 13
+#define M21 27271
+#define M02 74
+#define M12 191
+#define M22 500
+
+#define K00 405
+#define K10 112
+#define K01 4322
+#define K11 135
+
+#define N 16
+
+void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
+ int *__restrict__ pInput2, int *__restrict__ pOutput2)
+{
+ int i, a, b, c, d, e;
+
+ for (i = 0; i < N / 3; i++)
+ {
+ a = *pInput++;
+ b = *pInput++;
+ c = *pInput++;
+
+ d = *pInput2++;
+ e = *pInput2++;
+
+ *pOutput++ = M00 * a + M01 * b + M02 * c;
+ *pOutput++ = M10 * a + M11 * b + M12 * c;
+ *pOutput++ = M20 * a + M21 * b + M22 * c;
+
+ *pOutput2++ = K00 * d + K01 * e;
+ *pOutput2++ = K10 * d + K11 * e;
+ }
+}
+
+int main (int argc, const char* argv[])
+{
+ int input[N], output[N], i;
+ int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
+ int input2[N], output2[N];
+ int check_results2[N] = {4322, 135, 13776, 629, 23230, 1123, 32684, 1617, 42138, 2111, 0, 0, 0, 0, 0, 0};
+
+ for (i = 0; i < N; i++)
+ {
+ input[i] = i%256;
+ input2[i] = i%256;
+ output[i] = 0;
+ output2[i] = 0;
+ if (input[i] > 256)
+ abort ();
+ }
+
+ foo (input, output, input2, output2);
+
+ for (i = 0; i < N; i++)
+ if (output[i] != check_results[i] || output2[i] != check_results2[i])
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
new file mode 100644
index 0000000..ff9be8a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c
@@ -0,0 +1,77 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define M00 100
+#define M10 216
+#define M20 23
+#define M01 1322
+#define M11 13
+#define M21 27271
+#define M02 74
+#define M12 191
+#define M22 500
+
+#define K00 405
+#define K10 112
+#define K01 4322
+#define K11 135
+
+#define N 16
+
+void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
+ int *__restrict__ pInput2, int *__restrict__ pOutput2)
+{
+ int i, a, b, c, d, e;
+
+ for (i = 0; i < N / 3; i++)
+ {
+ a = *pInput++;
+ b = *pInput++;
+ c = *pInput++;
+
+ d = *pInput2++;
+ e = *pInput2++;
+
+ *pOutput++ = M00 * a + M01 * b + M02 * c;
+ *pOutput++ = M10 * a + M11 * b + M12 * c;
+ *pOutput++ = M20 * a + M21 * b + M22 * c;
+
+ /* Regular SLP - no permutation required. */
+ *pOutput2++ = K00 * d;
+ *pOutput2++ = K10 * e;
+ }
+}
+
+int main (int argc, const char* argv[])
+{
+ int input[N], output[N], i;
+ int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
+ int input2[N], output2[N];
+ int check_results2[N] = {0, 112, 810, 336, 1620, 560, 2430, 784, 3240, 1008, 0, 0, 0, 0, 0, 0};
+
+ for (i = 0; i < N; i++)
+ {
+ input[i] = i%256;
+ input2[i] = i%256;
+ output[i] = 0;
+ output2[i] = 0;
+ if (input[i] > 256)
+ abort ();
+ }
+
+ foo (input, output, input2, output2);
+
+ for (i = 0; i < N; i++)
+ if (output[i] != check_results[i] || output2[i] != check_results2[i])
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-7.c b/gcc/testsuite/gcc.dg/vect/slp-perm-7.c
new file mode 100644
index 0000000..0065407
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-7.c
@@ -0,0 +1,76 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define M00 100
+#define M10 216
+#define M20 23
+#define M01 1322
+#define M11 13
+#define M21 27271
+#define M02 74
+#define M12 191
+#define M22 500
+
+#define K00 405
+#define K10 112
+#define K01 4322
+#define K11 135
+
+#define N 16
+
+/* SLP with load permutation and loop-based vectorization. */
+void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
+ int *__restrict__ pInput2, int *__restrict__ pOutput2)
+{
+ int i, a, b, c, d;
+
+ for (i = 0; i < N / 3; i++)
+ {
+ a = *pInput++;
+ b = *pInput++;
+ c = *pInput++;
+ d = *pInput2++;
+
+ *pOutput++ = M00 * a + M01 * b + M02 * c;
+ *pOutput++ = M10 * a + M11 * b + M12 * c;
+ *pOutput++ = M20 * a + M21 * b + M22 * c;
+
+ /* Loop-based vectorization. */
+ *pOutput2++ = K00 * d;
+ }
+}
+
+int main (int argc, const char* argv[])
+{
+ int input[N], output[N], i;
+ int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
+ int input2[N], output2[N];
+ int check_results2[N] = {0, 405, 810, 1215, 1620, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ for (i = 0; i < N; i++)
+ {
+ input[i] = i%256;
+ input2[i] = i%256;
+ output[i] = 0;
+ output2[i] = 0;
+ if (input[i] > 200)
+ abort ();
+ }
+
+ foo (input, output, input2, output2);
+
+ for (i = 0; i < N; i++)
+ if (output[i] != check_results[i] || output2[i] != check_results2[i])
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-8.c b/gcc/testsuite/gcc.dg/vect/slp-perm-8.c
new file mode 100644
index 0000000..8c60d44
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-8.c
@@ -0,0 +1,57 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 200
+
+void foo (unsigned char *__restrict__ pInput, unsigned char *__restrict__ pOutput)
+{
+ unsigned char i, a, b, c;
+
+ for (i = 0; i < N / 3; i++)
+ {
+ a = *pInput++;
+ b = *pInput++;
+ c = *pInput++;
+
+ *pOutput++ = a + b + c + 3;
+ *pOutput++ = a + b + c + 12;
+ *pOutput++ = a + b + c + 1;
+ }
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned char input[N], output[N], i;
+ unsigned char check_results[N];
+
+ for (i = 0; i < N; i++)
+ {
+ input[i] = i;
+ output[i] = 0;
+ if (input[i] > 256)
+ abort ();
+ }
+
+ for (i = 0; i < N / 3; i++)
+ {
+ check_results[3*i] = 9 * i + 6;
+ check_results[3*i+1] = 9 * i + 15;
+ check_results[3*i+2] = 9 * i + 4;
+ }
+
+ foo (input, output);
+
+ for (i = 0; i < N - (N % 3); i++)
+ if (output[i] != check_results[i])
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c
new file mode 100644
index 0000000..964e691
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 200
+
+void foo (unsigned short *__restrict__ pInput, unsigned short *__restrict__ pOutput)
+{
+ unsigned short i, a, b, c;
+
+ for (i = 0; i < N / 3; i++)
+ {
+ a = *pInput++;
+ b = *pInput++;
+ c = *pInput++;
+
+ *pOutput++ = a + b + c + 3;
+ *pOutput++ = a + b + c + 12;
+ *pOutput++ = a + b + c + 1;
+ }
+}
+
+int main (int argc, const char* argv[])
+{
+ unsigned short input[N], output[N], i;
+ unsigned short check_results[N];
+
+ for (i = 0; i < N; i++)
+ {
+ input[i] = i;
+ output[i] = 0;
+ if (input[i] > 256)
+ abort ();
+ }
+
+ for (i = 0; i < N / 3; i++)
+ {
+ check_results[3*i] = 9 * i + 6;
+ check_results[3*i+1] = 9 * i + 15;
+ check_results[3*i+2] = 9 * i + 4;
+ }
+
+ foo (input, output);
+
+ for (i = 0; i < N - (N % 3); i++)
+ if (output[i] != check_results[i])
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 69c8ea4..e508038 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1608,6 +1608,28 @@ proc check_effective_target_vect_no_bitwise { } {
return $et_vect_no_bitwise_saved
}
+# Return 1 if the target plus current options supports vector permutation,
+# 0 otherwise.
+#
+# This won't change for different subtargets so cache the result.
+
+proc check_effective_target_vect_perm { } {
+ global et_vect_perm
+
+ if [info exists et_vect_perm_saved] {
+ verbose "check_effective_target_vect_perm: using cached result" 2
+ } else {
+ set et_vect_perm_saved 0
+ if { [istarget powerpc*-*-*]
+ || [istarget spu-*-*] } {
+ set et_vect_perm_saved 1
+ }
+ }
+ verbose "check_effective_target_vect_perm: returning $et_vect_perm_saved" 2
+ return $et_vect_perm_saved
+}
+
+
# Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise.
# A target can also support this widening summation if it can support
diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c
index 305ba4c..c672d7a 100644
--- a/gcc/tree-vect-analyze.c
+++ b/gcc/tree-vect-analyze.c
@@ -486,7 +486,7 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
|| vectorizable_conversion (stmt, NULL, NULL, NULL)
|| vectorizable_operation (stmt, NULL, NULL, NULL)
|| vectorizable_assignment (stmt, NULL, NULL, NULL)
- || vectorizable_load (stmt, NULL, NULL, NULL)
+ || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
|| vectorizable_call (stmt, NULL, NULL)
|| vectorizable_store (stmt, NULL, NULL, NULL)
|| vectorizable_condition (stmt, NULL, NULL)
@@ -846,6 +846,31 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
}
+/* Find the place of the data-ref in STMT in the interleaving chain that starts
+ from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
+
+static int
+vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
+{
+ gimple next_stmt = first_stmt;
+ int result = 0;
+
+ if (first_stmt != DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)))
+ return -1;
+
+ while (next_stmt && next_stmt != stmt)
+ {
+ result++;
+ next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
+ }
+
+ if (next_stmt)
+ return result;
+ else
+ return -1;
+}
+
+
/* Function vect_insert_into_interleaving_chain.
Insert DRA into the interleaving chain of DRB according to DRA's INIT. */
@@ -2482,7 +2507,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
/* Recursively free the memory allocated for the SLP tree rooted at NODE. */
-void
+static void
vect_free_slp_tree (slp_tree node)
{
if (!node)
@@ -2503,6 +2528,17 @@ vect_free_slp_tree (slp_tree node)
}
+/* Free the memory allocated for the SLP instance. */
+
+void
+vect_free_slp_instance (slp_instance instance)
+{
+ vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
+ VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (instance));
+ VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance));
+}
+
+
/* Get the defs for the rhs of STMT (collect them in DEF_STMTS0/1), check that
they are of a legal type and that they match the defs of the first stmt of
the SLP group (stored in FIRST_STMT_...). */
@@ -2705,7 +2741,9 @@ static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
unsigned int group_size,
int *inside_cost, int *outside_cost,
- int ncopies_for_cost, unsigned int *max_nunits)
+ int ncopies_for_cost, unsigned int *max_nunits,
+ VEC (int, heap) **load_permutation,
+ VEC (slp_tree, heap) **loads)
{
VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size);
VEC (gimple, heap) *def_stmts1 = VEC_alloc (gimple, heap, group_size);
@@ -2716,7 +2754,6 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
enum tree_code first_stmt_code = 0, rhs_code;
tree first_stmt_def1_type = NULL_TREE, first_stmt_def0_type = NULL_TREE;
tree lhs;
- gimple prev_stmt = NULL;
bool stop_recursion = false, need_same_oprnds = false;
tree vectype, scalar_type, first_op1 = NULL_TREE;
unsigned int vectorization_factor = 0, ncopies;
@@ -2728,6 +2765,9 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
struct data_reference *first_dr;
bool pattern0 = false, pattern1 = false;
HOST_WIDE_INT dummy;
+ bool permutation = false;
+ unsigned int load_place;
+ gimple first_load;
/* For every stmt in NODE find its def stmt/s. */
for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++)
@@ -2813,8 +2853,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
if (icode == CODE_FOR_nothing)
{
if (vect_print_dump_info (REPORT_SLP))
- fprintf (vect_dump,
- "Build SLP failed: op not supported by target.");
+ fprintf (vect_dump, "Build SLP failed: "
+ "op not supported by target.");
return false;
}
optab_op2_mode = insn_data[icode].operand[2].mode;
@@ -2878,70 +2918,60 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
else
{
/* Load. */
- if (i == 0)
- {
- /* First stmt of the SLP group should be the first load of
- the interleaving loop if data permutation is not allowed.
- Check that there is no gap between the loads. */
- if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
- || DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
- {
- /* FORNOW: data permutations and gaps in loads are not
- supported. */
- if (vect_print_dump_info (REPORT_SLP))
- {
- fprintf (vect_dump, "Build SLP failed: strided "
- " loads need permutation or have gaps ");
- print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
- }
-
- return false;
- }
-
- first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
- if (vect_supportable_dr_alignment (first_dr)
- == dr_unaligned_unsupported)
- {
- if (vect_print_dump_info (REPORT_SLP))
- {
- fprintf (vect_dump, "Build SLP failed: unsupported "
- " unaligned load ");
- print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
- }
-
- return false;
- }
-
- /* Analyze costs (for the first stmt in the group). */
- vect_model_load_cost (vinfo_for_stmt (stmt),
- ncopies_for_cost, *node);
- }
- else
- {
- /* Check that we have consecutive loads from interleaving
- chain and that there is no gap between the loads. */
- if (DR_GROUP_NEXT_DR (vinfo_for_stmt (prev_stmt)) != stmt
- || DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1)
- {
- /* FORNOW: data permutations and gaps in loads are not
- supported. */
- if (vect_print_dump_info (REPORT_SLP))
- {
- fprintf (vect_dump, "Build SLP failed: strided "
- " loads need permutation or have gaps ");
- print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
- }
- return false;
- }
- }
-
- prev_stmt = stmt;
-
- /* We stop the tree when we reach a group of loads. */
- stop_recursion = true;
- continue;
- }
- } /* Strided access. */
+ /* FORNOW: Check that there is no gap between the loads. */
+ if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
+ && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
+ || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
+ && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: strided "
+ "loads have gaps ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
+ return false;
+ }
+
+ first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+
+ if (first_load == stmt)
+ {
+ first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
+ if (vect_supportable_dr_alignment (first_dr)
+ == dr_unaligned_unsupported)
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: unsupported "
+ "unaligned load ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
+ return false;
+ }
+
+ /* Analyze costs (for the first stmt in the group). */
+ vect_model_load_cost (vinfo_for_stmt (stmt),
+ ncopies_for_cost, *node);
+ }
+
+ /* Store the place of this load in the interleaving chain. In
+ case that permutation is needed we later decide if a specific
+ permutation is supported. */
+ load_place = vect_get_place_in_interleaving_chain (stmt,
+ first_load);
+ if (load_place != i)
+ permutation = true;
+
+ VEC_safe_push (int, heap, *load_permutation, load_place);
+
+ /* We stop the tree when we reach a group of loads. */
+ stop_recursion = true;
+ continue;
+ }
+ } /* Strided access. */
else
{
if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
@@ -2990,7 +3020,15 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
/* Strided loads were reached - stop the recursion. */
if (stop_recursion)
- return true;
+ {
+ if (permutation)
+ {
+ VEC_safe_push (slp_tree, heap, *loads, *node);
+ *inside_cost += TARG_VEC_PERMUTE_COST * group_size;
+ }
+
+ return true;
+ }
/* Create SLP_TREE nodes for the definition node/s. */
if (first_stmt_dt0 == vect_loop_def)
@@ -3003,8 +3041,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
SLP_TREE_OUTSIDE_OF_LOOP_COST (left_node) = 0;
SLP_TREE_INSIDE_OF_LOOP_COST (left_node) = 0;
if (!vect_build_slp_tree (loop_vinfo, &left_node, group_size,
- inside_cost, outside_cost,
- ncopies_for_cost, max_nunits))
+ inside_cost, outside_cost, ncopies_for_cost,
+ max_nunits, load_permutation, loads))
return false;
SLP_TREE_LEFT (*node) = left_node;
@@ -3020,8 +3058,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
SLP_TREE_OUTSIDE_OF_LOOP_COST (right_node) = 0;
SLP_TREE_INSIDE_OF_LOOP_COST (right_node) = 0;
if (!vect_build_slp_tree (loop_vinfo, &right_node, group_size,
- inside_cost, outside_cost,
- ncopies_for_cost, max_nunits))
+ inside_cost, outside_cost, ncopies_for_cost,
+ max_nunits, load_permutation, loads))
return false;
SLP_TREE_RIGHT (*node) = right_node;
@@ -3076,6 +3114,116 @@ vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
}
+/* Check if the permutation required by the SLP INSTANCE is supported.
+ Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed. */
+
+static bool
+vect_supported_slp_permutation_p (slp_instance instance)
+{
+ slp_tree node = VEC_index (slp_tree, SLP_INSTANCE_LOADS (instance), 0);
+ gimple stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
+ gimple first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
+ VEC (slp_tree, heap) *sorted_loads = NULL;
+ int index;
+ slp_tree *tmp_loads = NULL;
+ int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
+ slp_tree load;
+
+ /* FORNOW: The only supported loads permutation is loads from the same
+ location in all the loads in the node, when the data-refs in
+ nodes of LOADS constitute an interleaving chain.
+ Sort the nodes according to the order of accesses in the chain. */
+ tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
+ for (i = 0, j = 0;
+ VEC_iterate (int, SLP_INSTANCE_LOAD_PERMUTATION (instance), i, index)
+ && VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), j, load);
+ i += group_size, j++)
+ {
+ gimple scalar_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (load), 0);
+ /* Check that the loads are all in the same interleaving chain. */
+ if (DR_GROUP_FIRST_DR (vinfo_for_stmt (scalar_stmt)) != first_load)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "Build SLP failed: unsupported data "
+ "permutation ");
+ print_gimple_stmt (vect_dump, scalar_stmt, 0, TDF_SLIM);
+ }
+
+ free (tmp_loads);
+ return false;
+ }
+
+ tmp_loads[index] = load;
+ }
+
+ sorted_loads = VEC_alloc (slp_tree, heap, group_size);
+ for (i = 0; i < group_size; i++)
+ VEC_safe_push (slp_tree, heap, sorted_loads, tmp_loads[i]);
+
+ VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance));
+ SLP_INSTANCE_LOADS (instance) = sorted_loads;
+ free (tmp_loads);
+
+ if (!vect_transform_slp_perm_load (stmt, NULL, NULL,
+ SLP_INSTANCE_UNROLLING_FACTOR (instance),
+ instance, true))
+ return false;
+
+ return true;
+}
+
+
+/* Check if the required load permutation is supported.
+ LOAD_PERMUTATION contains a list of indices of the loads.
+ In SLP this permutation is relative to the order of strided stores that are
+ the base of the SLP instance. */
+
+static bool
+vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
+ VEC (int, heap) *load_permutation)
+{
+ int i = 0, j, prev = -1, next, k;
+ bool supported;
+
+ /* FORNOW: permutations are only supported for loop-aware SLP. */
+ if (!slp_instn)
+ return false;
+
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Load permutation ");
+ for (i = 0; VEC_iterate (int, load_permutation, i, next); i++)
+ fprintf (vect_dump, "%d ", next);
+ }
+
+ /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
+ GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
+ well. */
+ supported = true;
+ for (j = 0; j < group_size; j++)
+ {
+ for (i = j * group_size, k = 0;
+ VEC_iterate (int, load_permutation, i, next) && k < group_size;
+ i++, k++)
+ {
+ if (i != j * group_size && next != prev)
+ {
+ supported = false;
+ break;
+ }
+
+ prev = next;
+ }
+ }
+
+ if (supported && i == group_size * group_size
+ && vect_supported_slp_permutation_p (slp_instn))
+ return true;
+
+ return false;
+}
+
/* Analyze an SLP instance starting from a group of strided stores. Call
vect_build_slp_tree to build a tree of packed stmts if possible.
Return FALSE if it's impossible to SLP any stmt in the loop. */
@@ -3093,8 +3241,11 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
bool slp_impossible = false;
int inside_cost = 0, outside_cost = 0, ncopies_for_cost;
unsigned int max_nunits = 0;
-
- scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))));
+ VEC (int, heap) *load_permutation;
+ VEC (slp_tree, heap) *loads;
+
+ scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (
+ vinfo_for_stmt (stmt))));
vectype = get_vectype_for_scalar_type (scalar_type);
if (!vectype)
{
@@ -3134,16 +3285,21 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
GROUP_SIZE / NUNITS otherwise. */
ncopies_for_cost = unrolling_factor * group_size / nunits;
+
+ load_permutation = VEC_alloc (int, heap, group_size * group_size);
+ loads = VEC_alloc (slp_tree, heap, group_size);
/* Build the tree for the SLP instance. */
if (vect_build_slp_tree (loop_vinfo, &node, group_size, &inside_cost,
- &outside_cost, ncopies_for_cost, &max_nunits))
+ &outside_cost, ncopies_for_cost, &max_nunits,
+ &load_permutation, &loads))
{
/* Create a new SLP instance. */
new_instance = XNEW (struct _slp_instance);
SLP_INSTANCE_TREE (new_instance) = node;
SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
- /* Calculate the unrolling factor based on the smallest type. */
+ /* Calculate the unrolling factor based on the smallest type in the
+ loop. */
if (max_nunits > nunits)
unrolling_factor = least_common_multiple (max_nunits, group_size)
/ group_size;
@@ -3151,6 +3307,27 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (new_instance) = outside_cost;
SLP_INSTANCE_INSIDE_OF_LOOP_COST (new_instance) = inside_cost;
+ SLP_INSTANCE_LOADS (new_instance) = loads;
+ SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
+ if (VEC_length (slp_tree, loads))
+ {
+ if (!vect_supported_load_permutation_p (new_instance, group_size,
+ load_permutation))
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ {
+ fprintf (vect_dump, "Build SLP failed: unsupported load "
+ "permutation ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
+ vect_free_slp_instance (new_instance);
+ return false;
+ }
+ }
+ else
+ VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (new_instance));
+
VEC_safe_push (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
new_instance);
if (vect_print_dump_info (REPORT_SLP))
@@ -3162,7 +3339,9 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
/* Failed to SLP. */
/* Free the allocated memory. */
vect_free_slp_tree (node);
-
+ VEC_free (int, heap, load_permutation);
+ VEC_free (slp_tree, heap, loads);
+
if (slp_impossible)
return false;
diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c
index 0744527..3a77c5b 100644
--- a/gcc/tree-vect-transform.c
+++ b/gcc/tree-vect-transform.c
@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3. If not see
/* Utility functions for the code transformation. */
static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
- slp_tree);
+ slp_tree, slp_instance);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
(gimple, struct loop*, tree, tree *, gimple *, bool, bool *);
@@ -936,7 +936,7 @@ vect_create_addr_base_for_vector_ref (gimple stmt,
base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
gimple_seq_add_seq (new_stmt_list, seq);
}
-
+
/* base + base_offset */
addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
data_ref_base, base_offset);
@@ -5962,6 +5962,313 @@ vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
}
+/* Create NCOPIES permutation statements using the mask MASK_BYTES (by
+ building a vector of type MASK_TYPE from it) and two input vectors placed in
+ DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
+ shifting by STRIDE elements of DR_CHAIN for every copy.
+ (STRIDE is the number of vectorized stmts for NODE divided by the number of
+ copies).
+ VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
+ the created stmts must be inserted. */
+
+static inline void
+vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
+ int *mask_array, int mask_nunits,
+ tree mask_element_type, tree mask_type,
+ int first_vec_indx, int second_vec_indx,
+ gimple_stmt_iterator *gsi, slp_tree node,
+ tree builtin_decl, tree vectype,
+ VEC(tree,heap) *dr_chain,
+ int ncopies, int vect_stmts_counter)
+{
+ tree t = NULL_TREE, mask_vec, mask, perm_dest;
+ gimple perm_stmt = NULL;
+ stmt_vec_info next_stmt_info;
+ int i, group_size, stride, dr_chain_size;
+ tree first_vec, second_vec, data_ref;
+ tree sym;
+ ssa_op_iter iter;
+ VEC (tree, heap) *params = NULL;
+
+ /* Create a vector mask. */
+ for (i = mask_nunits - 1; i >= 0; --i)
+ t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]),
+ t);
+
+ mask_vec = build_vector (mask_type, t);
+ mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);
+
+ group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node));
+ stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
+ dr_chain_size = VEC_length (tree, dr_chain);
+
+ /* Initialize the vect stmts of NODE to properly insert the generated
+ stmts later. */
+ for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
+ i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
+ VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
+
+ perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
+ for (i = 0; i < ncopies; i++)
+ {
+ first_vec = VEC_index (tree, dr_chain, first_vec_indx);
+ second_vec = VEC_index (tree, dr_chain, second_vec_indx);
+
+ /* Build argument list for the vectorized call. */
+ VEC_free (tree, heap, params);
+ params = VEC_alloc (tree, heap, 3);
+ VEC_quick_push (tree, params, first_vec);
+ VEC_quick_push (tree, params, second_vec);
+ VEC_quick_push (tree, params, mask);
+
+ /* Generate the permute statement. */
+ perm_stmt = gimple_build_call_vec (builtin_decl, params);
+ data_ref = make_ssa_name (perm_dest, perm_stmt);
+ gimple_call_set_lhs (perm_stmt, data_ref);
+ vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+ FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
+ {
+ if (TREE_CODE (sym) == SSA_NAME)
+ sym = SSA_NAME_VAR (sym);
+ mark_sym_for_renaming (sym);
+ }
+
+ /* Store the vector statement in NODE. */
+ VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
+ stride * i + vect_stmts_counter, perm_stmt);
+
+ first_vec_indx += stride;
+ second_vec_indx += stride;
+ }
+
+ /* Mark the scalar stmt as vectorized. */
+ next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
+ STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
+}
+
+
+/* Given FIRST_MASK_ELEMENT - the mask element in element representation,
+ return in CURRENT_MASK_ELEMENT its equivalent in target specific
+ representation. Check that the mask is valid and return FALSE if not.
+ Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
+ the next vector, i.e., the current first vector is not needed. */
+
+static bool
+vect_get_mask_element (gimple stmt, int first_mask_element, int m,
+ int mask_nunits, bool only_one_vec, int index,
+ int *mask, int *current_mask_element,
+ bool *need_next_vector)
+{
+ int i;
+ static int number_of_mask_fixes = 1;
+ static bool mask_fixed = false;
+ static bool needs_first_vector = false;
+
+ /* Convert to target specific representation. */
+ *current_mask_element = first_mask_element + m;
+ /* Adjust the value in case it's a mask for second and third vectors. */
+ *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
+
+ if (*current_mask_element < mask_nunits)
+ needs_first_vector = true;
+
+ /* We have only one input vector to permute but the mask accesses values in
+ the next vector as well. */
+ if (only_one_vec && *current_mask_element >= mask_nunits)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "permutation requires at least two vectors ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
+ return false;
+ }
+
+ /* The mask requires the next vector. */
+ if (*current_mask_element >= mask_nunits * 2)
+ {
+ if (needs_first_vector || mask_fixed)
+ {
+ /* We either need the first vector too or have already moved to the
+ next vector. In both cases, this permutation needs three
+ vectors. */
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "permutation requires at "
+ "least three vectors ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
+ return false;
+ }
+
+ /* We move to the next vector, dropping the first one and working with
+ the second and the third - we need to adjust the values of the mask
+ accordingly. */
+ *current_mask_element -= mask_nunits * number_of_mask_fixes;
+
+ for (i = 0; i < index; i++)
+ mask[i] -= mask_nunits * number_of_mask_fixes;
+
+ (number_of_mask_fixes)++;
+ mask_fixed = true;
+ }
+
+ *need_next_vector = mask_fixed;
+
+ /* This was the last element of this mask. Start a new one. */
+ if (index == mask_nunits - 1)
+ {
+ number_of_mask_fixes = 1;
+ mask_fixed = false;
+ needs_first_vector = false;
+ }
+
+ return true;
+}
+
+
+/* Generate vector permute statements from a list of loads in DR_CHAIN.
+ If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
+ permute statements for SLP_NODE_INSTANCE. */
+bool
+vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
+ gimple_stmt_iterator *gsi, int vf,
+ slp_instance slp_node_instance, bool analyze_only)
+{
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ tree mask_element_type = NULL_TREE, mask_type;
+ int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
+ slp_tree node;
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
+ gimple next_scalar_stmt;
+ int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
+ int first_mask_element;
+ int index, unroll_factor, *mask, current_mask_element, ncopies;
+ bool only_one_vec = false, need_next_vector = false;
+ int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
+
+ if (!targetm.vectorize.builtin_vec_perm)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "no builtin for vect permute for ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
+ return false;
+ }
+
+ builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
+ &mask_element_type);
+ if (!builtin_decl || !mask_element_type)
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ {
+ fprintf (vect_dump, "no builtin for vect permute for ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+
+ return false;
+ }
+
+ mask_type = get_vectype_for_scalar_type (mask_element_type);
+ mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
+ mask = (int *) xmalloc (sizeof (int) * mask_nunits);
+ nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ scale = mask_nunits / nunits;
+ unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
+
+ /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
+ unrolling factor. */
+ orig_vec_stmts_num = group_size *
+ SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
+ if (orig_vec_stmts_num == 1)
+ only_one_vec = true;
+
+ /* Number of copies is determined by the final vectorization factor
+ relatively to SLP_NODE_INSTANCE unrolling factor. */
+ ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
+
+ /* Generate permutation masks for every NODE. Number of masks for each NODE
+ is equal to GROUP_SIZE.
+ E.g., we have a group of three nodes with three loads from the same
+ location in each node, and the vector size is 4. I.e., we have a
+ a0b0c0a1b1c1... sequence and we need to create the following vectors:
+ for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
+ for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
+ ...
+
+ The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
+ scpecific type, e.g., in bytes for Altivec.
+ The last mask is illegal since we assume two operands for permute
+ operation, and the mask element values can't be outside that range. Hence,
+ the last mask must be converted into {2,5,5,5}.
+ For the first two permutations we need the first and the second input
+ vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
+ we need the second and the third vectors: {b1,c1,a2,b2} and
+ {c2,a3,b3,c3}. */
+
+ for (i = 0;
+ VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance),
+ i, node);
+ i++)
+ {
+ scalar_index = 0;
+ index = 0;
+ vect_stmts_counter = 0;
+ vec_index = 0;
+ first_vec_index = vec_index++;
+ if (only_one_vec)
+ second_vec_index = first_vec_index;
+ else
+ second_vec_index = vec_index++;
+
+ for (j = 0; j < unroll_factor; j++)
+ {
+ for (k = 0; k < group_size; k++)
+ {
+ first_mask_element = (i + j * group_size) * scale;
+ for (m = 0; m < scale; m++)
+ {
+ if (!vect_get_mask_element (stmt, first_mask_element, m,
+ mask_nunits, only_one_vec, index, mask,
+ &current_mask_element, &need_next_vector))
+ return false;
+
+ mask[index++] = current_mask_element;
+ }
+
+ if (index == mask_nunits)
+ {
+ index = 0;
+ if (!analyze_only)
+ {
+ if (need_next_vector)
+ {
+ first_vec_index = second_vec_index;
+ second_vec_index = vec_index;
+ }
+
+ next_scalar_stmt = VEC_index (gimple,
+ SLP_TREE_SCALAR_STMTS (node), scalar_index++);
+
+ vect_create_mask_and_perm (stmt, next_scalar_stmt,
+ mask, mask_nunits, mask_element_type, mask_type,
+ first_vec_index, second_vec_index, gsi, node,
+ builtin_decl, vectype, dr_chain, ncopies,
+ vect_stmts_counter++);
+ }
+ }
+ }
+ }
+ }
+
+ free (mask);
+ return true;
+}
+
/* vectorizable_load.
Check if STMT reads a non scalar data-ref (array/pointer/structure) that
@@ -5972,7 +6279,7 @@ vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
bool
vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
- slp_tree slp_node)
+ slp_tree slp_node, slp_instance slp_node_instance)
{
tree scalar_dest;
tree vec_dest = NULL;
@@ -6008,6 +6315,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
struct loop *at_loop;
int vec_num;
bool slp = (slp_node != NULL);
+ bool slp_perm = false;
enum tree_code code;
/* Multiple types in SLP are handled by creating the appropriate number of
@@ -6028,6 +6336,9 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
return false;
}
+ if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
+ slp_perm = true;
+
if (!STMT_VINFO_RELEVANT_P (stmt_info))
return false;
@@ -6397,33 +6708,47 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
/* Collect vector loads and later create their permutation in
vect_transform_strided_load (). */
- if (strided_load)
+ if (strided_load || slp_perm)
VEC_quick_push (tree, dr_chain, new_temp);
/* Store vector loads in the corresponding SLP_NODE. */
- if (slp)
+ if (slp && !slp_perm)
VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
}
- if (slp)
+ if (slp && !slp_perm)
continue;
- if (strided_load)
- {
- if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
- return false;
- *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
- VEC_free (tree, heap, dr_chain);
- dr_chain = VEC_alloc (tree, heap, group_size);
- }
+ if (slp_perm)
+ {
+ if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+ slp_node_instance, false))
+ {
+ VEC_free (tree, heap, dr_chain);
+ return false;
+ }
+ }
else
- {
- if (j == 0)
- STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
- else
- STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
- prev_stmt_info = vinfo_for_stmt (new_stmt);
- }
+ {
+ if (strided_load)
+ {
+ if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
+ return false;
+
+ *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
+ VEC_free (tree, heap, dr_chain);
+ dr_chain = VEC_alloc (tree, heap, group_size);
+ }
+ else
+ {
+ if (j == 0)
+ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+ else
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+ }
+ }
}
if (dr_chain)
@@ -6690,7 +7015,8 @@ vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
static bool
vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
- bool *strided_store, slp_tree slp_node)
+ bool *strided_store, slp_tree slp_node,
+ slp_instance slp_node_instance)
{
bool is_store = false;
gimple vec_stmt = NULL;
@@ -6732,7 +7058,8 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
break;
case load_vec_info_type:
- done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node);
+ done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
+ slp_node_instance);
gcc_assert (done);
break;
@@ -7807,6 +8134,8 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
stmt_vec_info stmt_info;
unsigned int vec_stmts_size, nunits, group_size;
tree vectype;
+ int i;
+ slp_tree loads_node;
if (!node)
return false;
@@ -7830,8 +8159,28 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
size. */
vec_stmts_size = (vectorization_factor * group_size) / nunits;
- SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
- SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
+ /* In case of load permutation we have to allocate vectorized statements for
+ all the nodes that participate in that permutation. */
+ if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
+ {
+ for (i = 0;
+ VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node);
+ i++)
+ {
+ if (!SLP_TREE_VEC_STMTS (loads_node))
+ {
+ SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
+ vec_stmts_size);
+ SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
+ }
+ }
+ }
+
+ if (!SLP_TREE_VEC_STMTS (node))
+ {
+ SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
+ SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
+ }
if (vect_print_dump_info (REPORT_DETAILS))
{
@@ -7840,7 +8189,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
}
si = gsi_for_stmt (stmt);
- is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
+ is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
if (is_store)
{
if (DR_GROUP_FIRST_DR (stmt_info))
@@ -7980,7 +8329,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform phi.");
- vect_transform_stmt (phi, NULL, NULL, NULL);
+ vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
}
}
@@ -8059,7 +8408,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
fprintf (vect_dump, "transform statement.");
strided_store = false;
- is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
+ is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
if (is_store)
{
if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 437b145..cdab0b5 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1802,7 +1802,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
- vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
+ vect_free_slp_instance (instance);
+
VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 10e7aa3..678dc59 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -105,6 +105,8 @@ typedef struct _slp_tree {
} cost;
} *slp_tree;
+DEF_VEC_P(slp_tree);
+DEF_VEC_ALLOC_P(slp_tree, heap);
/* SLP instance is a sequence of stmts in a loop that can be packed into
SIMD stmts. */
@@ -124,6 +126,13 @@ typedef struct _slp_instance {
int outside_of_loop; /* Statements generated outside loop. */
int inside_of_loop; /* Statements generated inside loop. */
} cost;
+
+ /* Loads permutation relatively to the stores, NULL if there is no
+ permutation. */
+ VEC (int, heap) *load_permutation;
+
+ /* The group of nodes that contain loads of this SLP instance. */
+ VEC (slp_tree, heap) *loads;
} *slp_instance;
DEF_VEC_P(slp_instance);
@@ -135,6 +144,8 @@ DEF_VEC_ALLOC_P(slp_instance, heap);
#define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor
#define SLP_INSTANCE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop
#define SLP_INSTANCE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop
+#define SLP_INSTANCE_LOAD_PERMUTATION(S) (S)->load_permutation
+#define SLP_INSTANCE_LOADS(S) (S)->loads
#define SLP_TREE_LEFT(S) (S)->left
#define SLP_TREE_RIGHT(S) (S)->right
@@ -522,6 +533,11 @@ typedef struct _stmt_vec_info {
#define TARG_VEC_STORE_COST 1
#endif
+/* Cost of vector permutation. */
+#ifndef TARG_VEC_PERMUTE_COST
+#define TARG_VEC_PERMUTE_COST 1
+#endif
+
/* The maximum number of intermediate steps required in multi-step type
conversion. */
#define MAX_INTERM_CVT_STEPS 3
@@ -700,7 +716,7 @@ extern void free_stmt_vec_info (gimple stmt);
/** In tree-vect-analyze.c **/
/* Driver for analysis stage. */
extern loop_vec_info vect_analyze_loop (struct loop *);
-extern void vect_free_slp_tree (slp_tree);
+extern void vect_free_slp_instance (slp_instance);
extern loop_vec_info vect_analyze_loop_form (struct loop *);
extern tree vect_get_smallest_scalar_type (gimple, HOST_WIDE_INT *,
HOST_WIDE_INT *);
@@ -716,7 +732,7 @@ void vect_pattern_recog (loop_vec_info);
/** In tree-vect-transform.c **/
extern bool vectorizable_load (gimple, gimple_stmt_iterator *, gimple *,
- slp_tree);
+ slp_tree, slp_instance);
extern bool vectorizable_store (gimple, gimple_stmt_iterator *, gimple *,
slp_tree);
extern bool vectorizable_operation (gimple, gimple_stmt_iterator *, gimple *,
@@ -742,6 +758,9 @@ extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *,
extern void vect_model_store_cost (stmt_vec_info, int, enum vect_def_type,
slp_tree);
extern void vect_model_load_cost (stmt_vec_info, int, slp_tree);
+extern bool vect_transform_slp_perm_load (gimple, VEC (tree, heap) *,
+ gimple_stmt_iterator *, int, slp_instance, bool);
+
/* Driver for transformation stage. */
extern void vect_transform_loop (loop_vec_info);