author     Jakub Jelinek <jakub@redhat.com>   2011-11-07 16:59:07 +0100
committer  Jakub Jelinek <jakub@gcc.gnu.org>  2011-11-07 16:59:07 +0100
commit     aec7ae7deaef9d52541da07c387066ad6ceb3d87 (patch)
tree       f56ca4ced37752007fdc3ce936daa06795357ad2
parent     571b34b25eec4909165272e2d5a7084866ddaf96 (diff)
download   gcc-aec7ae7deaef9d52541da07c387066ad6ceb3d87.zip
           gcc-aec7ae7deaef9d52541da07c387066ad6ceb3d87.tar.gz
           gcc-aec7ae7deaef9d52541da07c387066ad6ceb3d87.tar.bz2
re PR tree-optimization/50789 (Gather vectorization)
PR tree-optimization/50789
* tree-vect-stmts.c (process_use): Add force argument, avoid
exist_non_indexing_operands_for_use_p check if true.
(vect_mark_stmts_to_be_vectorized): Adjust callers. Handle
STMT_VINFO_GATHER_P.
(gen_perm_mask): New function.
(perm_mask_for_reverse): Use it.
(reverse_vec_element): Rename to...
(permute_vec_elements): ... this. Add Y and MASK_VEC arguments,
generalize for any permutations.
(vectorizable_load): Adjust caller. Handle STMT_VINFO_GATHER_P.
* target.def (TARGET_VECTORIZE_BUILTIN_GATHER): New hook.
* doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_GATHER): Document it.
* doc/tm.texi: Regenerate.
* tree-data-ref.c (initialize_data_dependence_relation,
compute_self_dependence): No longer static.
* tree-data-ref.h (initialize_data_dependence_relation,
compute_self_dependence): New prototypes.
* tree-vect-data-refs.c (vect_check_gather): New function.
(vect_analyze_data_refs): Detect possible gather load data refs.
* tree-vectorizer.h (struct _stmt_vec_info): Add gather_p field.
(STMT_VINFO_GATHER_P): Define.
(vect_check_gather): New prototype.
* config/i386/i386-builtin-types.def: Add types for alternate
gather builtins.
* config/i386/sse.md (AVXMODE48P_DI): Remove.
(VEC_GATHER_MODE): Rename mode_attr to...
(VEC_GATHER_IDXSI): ... this.
(VEC_GATHER_IDXDI, VEC_GATHER_SRCDI): New mode_attrs.
(avx2_gathersi<mode>, *avx2_gathersi<mode>): Use <VEC_GATHER_IDXSI>
instead of <VEC_GATHER_MODE>.
(avx2_gatherdi<mode>): Use <VEC_GATHER_IDXDI> instead of
<AVXMODE48P_DI> and <VEC_GATHER_SRCDI> instead of VEC_GATHER_MODE
on src and mask operands.
(*avx2_gatherdi<mode>): Likewise. Use VEC_GATHER_MODE iterator
instead of AVXMODE48P_DI.
(avx2_gatherdi<mode>256, *avx2_gatherdi<mode>256): Removed.
* config/i386/i386.c (enum ix86_builtins): Add
IX86_BUILTIN_GATHERALTSIV4DF, IX86_BUILTIN_GATHERALTDIV8SF,
IX86_BUILTIN_GATHERALTSIV4DI and IX86_BUILTIN_GATHERALTDIV8SI.
(ix86_init_mmx_sse_builtins): Create those builtins.
(ix86_expand_builtin): Handle those builtins and adjust expansions
of other gather builtins.
(ix86_vectorize_builtin_gather): New function.
(TARGET_VECTORIZE_BUILTIN_GATHER): Define.

* gcc.target/i386/avx2-gather-1.c: New test.
* gcc.target/i386/avx2-gather-2.c: New test.
* gcc.target/i386/avx2-gather-3.c: New test.
* gcc.target/i386/avx2-gather-4.c: New test.

From-SVN: r181089
-rw-r--r--  gcc/ChangeLog                                    49
-rw-r--r--  gcc/config/i386/i386-builtin-types.def            4
-rw-r--r--  gcc/config/i386/i386.c                          163
-rw-r--r--  gcc/config/i386/sse.md                           89
-rw-r--r--  gcc/doc/tm.texi                                   8
-rw-r--r--  gcc/doc/tm.texi.in                                8
-rw-r--r--  gcc/target.def                                    8
-rw-r--r--  gcc/testsuite/ChangeLog                           8
-rw-r--r--  gcc/testsuite/gcc.target/i386/avx2-gather-1.c   215
-rw-r--r--  gcc/testsuite/gcc.target/i386/avx2-gather-2.c     7
-rw-r--r--  gcc/testsuite/gcc.target/i386/avx2-gather-3.c   167
-rw-r--r--  gcc/testsuite/gcc.target/i386/avx2-gather-4.c    38
-rw-r--r--  gcc/tree-data-ref.c                               6
-rw-r--r--  gcc/tree-data-ref.h                               5
-rw-r--r--  gcc/tree-vect-data-refs.c                       346
-rw-r--r--  gcc/tree-vect-stmts.c                           275
-rw-r--r--  gcc/tree-vectorizer.h                             6
17 files changed, 1279 insertions, 123 deletions
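
For orientation before the per-file diffs: the loads this patch teaches the loop vectorizer to handle are non-affine, data-dependent reads of the form array[index[i]], which previously made data-ref analysis fail and left the loop scalar. A minimal standalone sketch of that shape (array names are illustrative; with -O3 -mavx2 this is what now maps onto AVX2 vgather instructions, mirroring f1 in the new avx2-gather-1.c test):

#define N 1024
float src[N], dst[N];
int idx[N];

void
gather_copy (void)
{
  int i;
  for (i = 0; i < N; i++)
    dst[i] = src[idx[i]];   /* non-affine read: the address depends on idx[i] */
}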
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 8aada19..0d0db8a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,52 @@
+2011-11-07 Jakub Jelinek <jakub@redhat.com>
+
+ PR tree-optimization/50789
+ * tree-vect-stmts.c (process_use): Add force argument, avoid
+ exist_non_indexing_operands_for_use_p check if true.
+ (vect_mark_stmts_to_be_vectorized): Adjust callers. Handle
+ STMT_VINFO_GATHER_P.
+ (gen_perm_mask): New function.
+ (perm_mask_for_reverse): Use it.
+ (reverse_vec_element): Rename to...
+ (permute_vec_elements): ... this. Add Y and MASK_VEC arguments,
+ generalize for any permutations.
+ (vectorizable_load): Adjust caller. Handle STMT_VINFO_GATHER_P.
+ * target.def (TARGET_VECTORIZE_BUILTIN_GATHER): New hook.
+ * doc/tm.texi.in (TARGET_VECTORIZE_BUILTIN_GATHER): Document it.
+ * doc/tm.texi: Regenerate.
+ * tree-data-ref.c (initialize_data_dependence_relation,
+ compute_self_dependence): No longer static.
+ * tree-data-ref.h (initialize_data_dependence_relation,
+ compute_self_dependence): New prototypes.
+ * tree-vect-data-refs.c (vect_check_gather): New function.
+ (vect_analyze_data_refs): Detect possible gather load data
+ refs.
+ * tree-vectorizer.h (struct _stmt_vec_info): Add gather_p field.
+ (STMT_VINFO_GATHER_P): Define.
+ (vect_check_gather): New prototype.
+ * config/i386/i386-builtin-types.def: Add types for alternate
+ gather builtins.
+ * config/i386/sse.md (AVXMODE48P_DI): Remove.
+ (VEC_GATHER_MODE): Rename mode_attr to...
+ (VEC_GATHER_IDXSI): ... this.
+ (VEC_GATHER_IDXDI, VEC_GATHER_SRCDI): New mode_attrs.
+ (avx2_gathersi<mode>, *avx2_gathersi<mode>): Use <VEC_GATHER_IDXSI>
+ instead of <VEC_GATHER_MODE>.
+ (avx2_gatherdi<mode>): Use <VEC_GATHER_IDXDI> instead of
+ <AVXMODE48P_DI> and <VEC_GATHER_SRCDI> instead of VEC_GATHER_MODE
+ on src and mask operands.
+ (*avx2_gatherdi<mode>): Likewise. Use VEC_GATHER_MODE iterator
+ instead of AVXMODE48P_DI.
+ (avx2_gatherdi<mode>256, *avx2_gatherdi<mode>256): Removed.
+ * config/i386/i386.c (enum ix86_builtins): Add
+ IX86_BUILTIN_GATHERALTSIV4DF, IX86_BUILTIN_GATHERALTDIV8SF,
+ IX86_BUILTIN_GATHERALTSIV4DI and IX86_BUILTIN_GATHERALTDIV8SI.
+ (ix86_init_mmx_sse_builtins): Create those builtins.
+ (ix86_expand_builtin): Handle those builtins and adjust expansions
+ of other gather builtins.
+ (ix86_vectorize_builtin_gather): New function.
+ (TARGET_VECTORIZE_BUILTIN_GATHER): Define.
+
2011-11-07 Uros Bizjak <ubizjak@gmail.com>
* config/i386/f16cintrin: Remove extra _X86INTRIN_H_INCLUDED check.
diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def
index c4070e4..8466748 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -432,20 +432,24 @@ DEF_FUNCTION_TYPE (V8QI, QI, QI, QI, QI, QI, QI, QI, QI)
DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V4SI, V2DF, INT)
DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4SI, V4DF, INT)
+DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V8SI, V4DF, INT)
DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V2DI, V2DF, INT)
DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4DI, V4DF, INT)
DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4SI, V4SF, INT)
DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V8SI, V8SF, INT)
DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V2DI, V4SF, INT)
DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4DI, V4SF, INT)
+DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V4DI, V8SF, INT)
DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V4SI, V2DI, INT)
DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4SI, V4DI, INT)
+DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V8SI, V4DI, INT)
DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V2DI, V2DI, INT)
DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4DI, V4DI, INT)
DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4SI, V4SI, INT)
DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V8SI, V8SI, INT)
DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V2DI, V4SI, INT)
DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4DI, V4SI, INT)
+DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V4DI, V8SI, INT)
DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, ROUND)
DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index b735ab5..4d7d2cf 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -25190,6 +25190,13 @@ enum ix86_builtins
IX86_BUILTIN_GATHERDIV4SI,
IX86_BUILTIN_GATHERDIV8SI,
+ /* Alternate 4-element gather for the vectorizer where
+ all operands are 32 bytes wide. */
+ IX86_BUILTIN_GATHERALTSIV4DF,
+ IX86_BUILTIN_GATHERALTDIV8SF,
+ IX86_BUILTIN_GATHERALTSIV4DI,
+ IX86_BUILTIN_GATHERALTDIV8SI,
+
/* TFmode support builtins. */
IX86_BUILTIN_INFQ,
IX86_BUILTIN_HUGE_VALQ,
@@ -26968,6 +26975,22 @@ ix86_init_mmx_sse_builtins (void)
V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
IX86_BUILTIN_GATHERDIV8SI);
+ def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
+ V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
+ IX86_BUILTIN_GATHERALTSIV4DF);
+
+ def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
+ V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
+ IX86_BUILTIN_GATHERALTDIV8SF);
+
+ def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
+ V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
+ IX86_BUILTIN_GATHERALTSIV4DI);
+
+ def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
+ V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
+ IX86_BUILTIN_GATHERALTDIV8SI);
+
/* MMX access to the vec_init patterns. */
def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
@@ -28954,7 +28977,7 @@ rdrand_step:
icode = CODE_FOR_avx2_gatherdiv4sf;
goto gather_gen;
case IX86_BUILTIN_GATHERDIV8SF:
- icode = CODE_FOR_avx2_gatherdiv4sf256;
+ icode = CODE_FOR_avx2_gatherdiv8sf;
goto gather_gen;
case IX86_BUILTIN_GATHERSIV2DI:
icode = CODE_FOR_avx2_gathersiv2di;
@@ -28978,7 +29001,20 @@ rdrand_step:
icode = CODE_FOR_avx2_gatherdiv4si;
goto gather_gen;
case IX86_BUILTIN_GATHERDIV8SI:
- icode = CODE_FOR_avx2_gatherdiv4si256;
+ icode = CODE_FOR_avx2_gatherdiv8si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERALTSIV4DF:
+ icode = CODE_FOR_avx2_gathersiv4df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERALTDIV8SF:
+ icode = CODE_FOR_avx2_gatherdiv8sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERALTSIV4DI:
+ icode = CODE_FOR_avx2_gathersiv4di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERALTDIV8SI:
+ icode = CODE_FOR_avx2_gatherdiv8si;
+ goto gather_gen;
gather_gen:
arg0 = CALL_EXPR_ARG (exp, 0);
@@ -28997,8 +29033,39 @@ rdrand_step:
mode3 = insn_data[icode].operand[4].mode;
mode4 = insn_data[icode].operand[5].mode;
- if (target == NULL_RTX)
- target = gen_reg_rtx (insn_data[icode].operand[0].mode);
+ if (target == NULL_RTX
+ || GET_MODE (target) != insn_data[icode].operand[0].mode)
+ subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
+ else
+ subtarget = target;
+
+ if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
+ || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
+ {
+ rtx half = gen_reg_rtx (V4SImode);
+ if (!nonimmediate_operand (op2, V8SImode))
+ op2 = copy_to_mode_reg (V8SImode, op2);
+ emit_insn (gen_vec_extract_lo_v8si (half, op2));
+ op2 = half;
+ }
+ else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
+ || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
+ {
+ rtx (*gen) (rtx, rtx);
+ rtx half = gen_reg_rtx (mode0);
+ if (mode0 == V4SFmode)
+ gen = gen_vec_extract_lo_v8sf;
+ else
+ gen = gen_vec_extract_lo_v8si;
+ if (!nonimmediate_operand (op0, GET_MODE (op0)))
+ op0 = copy_to_mode_reg (GET_MODE (op0), op0);
+ emit_insn (gen (half, op0));
+ op0 = half;
+ if (!nonimmediate_operand (op3, GET_MODE (op3)))
+ op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+ emit_insn (gen (half, op3));
+ op3 = half;
+ }
/* Force memory operand only with base register here. But we
don't want to do it on memory operand for other builtin
@@ -29020,10 +29087,26 @@ rdrand_step:
error ("last argument must be scale 1, 2, 4, 8");
return const0_rtx;
}
- pat = GEN_FCN (icode) (target, op0, op1, op2, op3, op4);
+ pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
if (! pat)
return const0_rtx;
emit_insn (pat);
+
+ if (fcode == IX86_BUILTIN_GATHERDIV8SF
+ || fcode == IX86_BUILTIN_GATHERDIV8SI)
+ {
+ enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
+ ? V4SFmode : V4SImode;
+ if (target == NULL_RTX)
+ target = gen_reg_rtx (tmode);
+ if (tmode == V4SFmode)
+ emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
+ else
+ emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
+ }
+ else
+ target = subtarget;
+
return target;
default:
@@ -29528,6 +29611,73 @@ ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
return new_fndecl;
}
+/* Returns a decl of a function that implements gather load with
+ memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
+ Return NULL_TREE if it is not available. */
+
+static tree
+ix86_vectorize_builtin_gather (const_tree mem_vectype,
+ const_tree index_type, int scale)
+{
+ bool si;
+ enum ix86_builtins code;
+
+ if (! TARGET_AVX2)
+ return NULL_TREE;
+
+ if ((TREE_CODE (index_type) != INTEGER_TYPE
+ && !POINTER_TYPE_P (index_type))
+ || (TYPE_MODE (index_type) != SImode
+ && TYPE_MODE (index_type) != DImode))
+ return NULL_TREE;
+
+ if (TYPE_PRECISION (index_type) > POINTER_SIZE)
+ return NULL_TREE;
+
+ /* v*gather* insn sign extends index to pointer mode. */
+ if (TYPE_PRECISION (index_type) < POINTER_SIZE
+ && TYPE_UNSIGNED (index_type))
+ return NULL_TREE;
+
+ if (scale <= 0
+ || scale > 8
+ || (scale & (scale - 1)) != 0)
+ return NULL_TREE;
+
+ si = TYPE_MODE (index_type) == SImode;
+ switch (TYPE_MODE (mem_vectype))
+ {
+ case V2DFmode:
+ code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
+ break;
+ case V4DFmode:
+ code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
+ break;
+ case V2DImode:
+ code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
+ break;
+ case V4DImode:
+ code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
+ break;
+ case V4SFmode:
+ code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
+ break;
+ case V8SFmode:
+ code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
+ break;
+ case V4SImode:
+ code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
+ break;
+ case V8SImode:
+ code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
+ break;
+ default:
+ return NULL_TREE;
+ }
+
+ return ix86_builtins[code];
+}
+
/* Returns a code for a target-specific builtin that implements
reciprocal of the function, or NULL_TREE if not available. */
@@ -37727,6 +37877,9 @@ ix86_autovectorize_vector_sizes (void)
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
ix86_builtin_vectorized_function
+#undef TARGET_VECTORIZE_BUILTIN_GATHER
+#define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
+
#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
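
A note on the IX86_BUILTIN_GATHERALT* builtins added above: they cover the mixed-width cases where the data vector and the index vector have different element counts, e.g. a V4DF gather indexed by 32-bit ints. The vectorizer hands the expander a V8SI index vector of which only the low four lanes matter, so ix86_expand_builtin extracts the low half (gen_vec_extract_lo_v8si) before emitting the same gather pattern used by the ordinary builtins. A sketch of a loop that should take this path, essentially f5 from the new avx2-gather-1.c test (array names are illustrative):

#define N 1024
double vd1[N], vd2[N];
int k[N];

void
double_gather_int_index (void)
{
  int i;
  /* 4 doubles per 256-bit vector, 8 ints per index vector: the extra
     index lanes are discarded, which is what the "alt" builtins encode
     (here IX86_BUILTIN_GATHERALTSIV4DF).  */
  for (i = 0; i < N; i++)
    vd2[i] = vd1[k[i]];
}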
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a2826ce..e3de9ec 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -316,14 +316,6 @@
;; Mix-n-match
(define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF])
-(define_mode_iterator AVXMODE48P_DI
- [V2DI V2DF V4DI V4DF V4SF V4SI])
-(define_mode_attr AVXMODE48P_DI
- [(V2DI "V2DI") (V2DF "V2DI")
- (V4DI "V4DI") (V4DF "V4DI")
- (V4SI "V2DI") (V4SF "V2DI")
- (V8SI "V4DI") (V8SF "V4DI")])
-
(define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF])
;; Mapping of immediate bits for blend instructions
@@ -12518,11 +12510,21 @@
;; For gather* insn patterns
(define_mode_iterator VEC_GATHER_MODE
[V2DI V2DF V4DI V4DF V4SI V4SF V8SI V8SF])
-(define_mode_attr VEC_GATHER_MODE
+(define_mode_attr VEC_GATHER_IDXSI
[(V2DI "V4SI") (V2DF "V4SI")
(V4DI "V4SI") (V4DF "V4SI")
(V4SI "V4SI") (V4SF "V4SI")
(V8SI "V8SI") (V8SF "V8SI")])
+(define_mode_attr VEC_GATHER_IDXDI
+ [(V2DI "V2DI") (V2DF "V2DI")
+ (V4DI "V4DI") (V4DF "V4DI")
+ (V4SI "V2DI") (V4SF "V2DI")
+ (V8SI "V4DI") (V8SF "V4DI")])
+(define_mode_attr VEC_GATHER_SRCDI
+ [(V2DI "V2DI") (V2DF "V2DF")
+ (V4DI "V4DI") (V4DF "V4DF")
+ (V4SI "V4SI") (V4SF "V4SF")
+ (V8SI "V4SI") (V8SF "V4SF")])
(define_expand "avx2_gathersi<mode>"
[(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "")
@@ -12531,7 +12533,8 @@
(mem:<ssescalarmode>
(match_par_dup 7
[(match_operand 2 "vsib_address_operand" "")
- (match_operand:<VEC_GATHER_MODE> 3 "register_operand" "")
+ (match_operand:<VEC_GATHER_IDXSI>
+ 3 "register_operand" "")
(match_operand:SI 5 "const1248_operand " "")]))
(mem:BLK (scratch))
(match_operand:VEC_GATHER_MODE 4 "register_operand" "")]
@@ -12551,7 +12554,7 @@
(match_operator:<ssescalarmode> 7 "vsib_mem_operator"
[(unspec:P
[(match_operand:P 3 "vsib_address_operand" "p")
- (match_operand:<VEC_GATHER_MODE> 4 "register_operand" "x")
+ (match_operand:<VEC_GATHER_IDXSI> 4 "register_operand" "x")
(match_operand:SI 6 "const1248_operand" "n")]
UNSPEC_VSIBADDR)])
(mem:BLK (scratch))
@@ -12567,14 +12570,16 @@
(define_expand "avx2_gatherdi<mode>"
[(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "")
(unspec:VEC_GATHER_MODE
- [(match_operand:VEC_GATHER_MODE 1 "register_operand" "")
+ [(match_operand:<VEC_GATHER_SRCDI> 1 "register_operand" "")
(mem:<ssescalarmode>
(match_par_dup 7
[(match_operand 2 "vsib_address_operand" "")
- (match_operand:<AVXMODE48P_DI> 3 "register_operand" "")
+ (match_operand:<VEC_GATHER_IDXDI>
+ 3 "register_operand" "")
(match_operand:SI 5 "const1248_operand " "")]))
(mem:BLK (scratch))
- (match_operand:VEC_GATHER_MODE 4 "register_operand" "")]
+ (match_operand:<VEC_GATHER_SRCDI>
+ 4 "register_operand" "")]
UNSPEC_GATHER))
(clobber (match_scratch:VEC_GATHER_MODE 6 ""))])]
"TARGET_AVX2"
@@ -12585,63 +12590,21 @@
})
(define_insn "*avx2_gatherdi<mode>"
- [(set (match_operand:AVXMODE48P_DI 0 "register_operand" "=&x")
- (unspec:AVXMODE48P_DI
- [(match_operand:AVXMODE48P_DI 2 "register_operand" "0")
+ [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x")
+ (unspec:VEC_GATHER_MODE
+ [(match_operand:<VEC_GATHER_SRCDI> 2 "register_operand" "0")
(match_operator:<ssescalarmode> 7 "vsib_mem_operator"
[(unspec:P
[(match_operand:P 3 "vsib_address_operand" "p")
- (match_operand:<AVXMODE48P_DI> 4 "register_operand" "x")
+ (match_operand:<VEC_GATHER_IDXDI> 4 "register_operand" "x")
(match_operand:SI 6 "const1248_operand" "n")]
UNSPEC_VSIBADDR)])
(mem:BLK (scratch))
- (match_operand:AVXMODE48P_DI 5 "register_operand" "1")]
+ (match_operand:<VEC_GATHER_SRCDI> 5 "register_operand" "1")]
UNSPEC_GATHER))
- (clobber (match_scratch:AVXMODE48P_DI 1 "=&x"))]
- "TARGET_AVX2"
- "v<sseintprefix>gatherq<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}"
- [(set_attr "type" "ssemov")
- (set_attr "prefix" "vex")
- (set_attr "mode" "<sseinsnmode>")])
-
-;; Special handling for VEX.256 with float arguments
-;; since there're still xmms as operands
-(define_expand "avx2_gatherdi<mode>256"
- [(parallel [(set (match_operand:VI4F_128 0 "register_operand" "")
- (unspec:VI4F_128
- [(match_operand:VI4F_128 1 "register_operand" "")
- (mem:<ssescalarmode>
- (match_par_dup 7
- [(match_operand 2 "vsib_address_operand" "")
- (match_operand:V4DI 3 "register_operand" "")
- (match_operand:SI 5 "const1248_operand " "")]))
- (mem:BLK (scratch))
- (match_operand:VI4F_128 4 "register_operand" "")]
- UNSPEC_GATHER))
- (clobber (match_scratch:VI4F_128 6 ""))])]
- "TARGET_AVX2"
-{
- operands[7]
- = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[3],
- operands[5]), UNSPEC_VSIBADDR);
-})
-
-(define_insn "*avx2_gatherdi<mode>256"
- [(set (match_operand:VI4F_128 0 "register_operand" "=x")
- (unspec:VI4F_128
- [(match_operand:VI4F_128 2 "register_operand" "0")
- (match_operator:<ssescalarmode> 7 "vsib_mem_operator"
- [(unspec:P
- [(match_operand:P 3 "vsib_address_operand" "p")
- (match_operand:V4DI 4 "register_operand" "x")
- (match_operand:SI 6 "const1248_operand" "n")]
- UNSPEC_VSIBADDR)])
- (mem:BLK (scratch))
- (match_operand:VI4F_128 5 "register_operand" "1")]
- UNSPEC_GATHER))
- (clobber (match_scratch:VI4F_128 1 "=&x"))]
+ (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))]
"TARGET_AVX2"
- "v<sseintprefix>gatherq<ssemodesuffix>\t{%1, %7, %0|%0, %7, %1}"
+ "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %2|%2, %7, %5}"
[(set_attr "type" "ssemov")
(set_attr "prefix" "vex")
(set_attr "mode" "<sseinsnmode>")])
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 422f74c..fed7702 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5758,6 +5758,14 @@ mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}.
The default is zero which means to not iterate over other vector sizes.
@end deftypefn
+@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_GATHER (const_tree @var{mem_vectype}, const_tree @var{index_type}, int @var{scale})
+Target builtin that implements the vector gather operation.  @var{mem_vectype}
+is the vector type of the load and @var{index_type} is the scalar type of
+the index, scaled by @var{scale}.
+The default is @code{NULL_TREE}, which means gather loads are not
+vectorized.
+@end deftypefn
+
@node Anchored Addresses
@section Anchored Addresses
@cindex anchored addresses
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index cede91e..f0c6ce0 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -5696,6 +5696,14 @@ mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}.
The default is zero which means to not iterate over other vector sizes.
@end deftypefn
+@hook TARGET_VECTORIZE_BUILTIN_GATHER
+Target builtin that implements the vector gather operation.  @var{mem_vectype}
+is the vector type of the load and @var{index_type} is the scalar type of
+the index, scaled by @var{scale}.
+The default is @code{NULL_TREE}, which means gather loads are not
+vectorized.
+@end deftypefn
+
@node Anchored Addresses
@section Anchored Addresses
@cindex anchored addresses
diff --git a/gcc/target.def b/gcc/target.def
index f89bb51..a83088d 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1021,6 +1021,14 @@ DEFHOOK
(void),
default_autovectorize_vector_sizes)
+/* Target builtin that implements vector gather operation. */
+DEFHOOK
+(builtin_gather,
+ "",
+ tree,
+ (const_tree mem_vectype, const_tree index_type, int scale),
+ NULL)
+
HOOK_VECTOR_END (vectorize)
#undef HOOK_PREFIX
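
The hook carries no default implementation (its documentation comes from doc/tm.texi.in via the @hook mechanism), so a port opts in by returning a suitable builtin decl. A minimal, hypothetical sketch of what a backend definition looks like; everything named mytarget_* is invented for illustration, and the real example in this patch is ix86_vectorize_builtin_gather above:

/* Hypothetical target code: return a builtin that gathers MEM_VECTYPE
   elements addressed by INDEX_TYPE values scaled by SCALE, or NULL_TREE
   if the target cannot do it.  */
static tree
mytarget_vectorize_builtin_gather (const_tree mem_vectype,
                                   const_tree index_type, int scale)
{
  if (TYPE_MODE (mem_vectype) != V4SFmode
      || TYPE_MODE (index_type) != SImode
      || (scale != 1 && scale != 2 && scale != 4 && scale != 8))
    return NULL_TREE;
  return mytarget_builtin_decls[MYTARGET_BUILTIN_GATHERV4SF];
}

#undef TARGET_VECTORIZE_BUILTIN_GATHER
#define TARGET_VECTORIZE_BUILTIN_GATHER mytarget_vectorize_builtin_gather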
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 0c6cefa..46f702a 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,11 @@
+2011-11-07 Jakub Jelinek <jakub@redhat.com>
+
+ PR tree-optimization/50789
+ * gcc.target/i386/avx2-gather-1.c: New test.
+ * gcc.target/i386/avx2-gather-2.c: New test.
+ * gcc.target/i386/avx2-gather-3.c: New test.
+ * gcc.target/i386/avx2-gather-4.c: New test.
+
2011-11-07 Uros Bizjak <ubizjak@gmail.com>
* gcc.target/i386/pr49781-1.c (dg-options): Add -mtune=generic.
diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-1.c b/gcc/testsuite/gcc.target/i386/avx2-gather-1.c
new file mode 100644
index 0000000..7ed567d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-gather-1.c
@@ -0,0 +1,215 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-O3 -mavx2" } */
+
+#include "avx2-check.h"
+
+#define N 1024
+float vf1[N+16], vf2[N];
+double vd1[N+16], vd2[N];
+int k[N];
+long l[N];
+short n[N];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vf2[i] = vf1[k[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vf1[k[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f3 (int x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vf2[i] = vf1[k[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f4 (int x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vf1[k[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vd2[i] = vd1[k[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vd1[k[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f7 (int x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vd2[i] = vd1[k[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vd1[k[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f9 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vf2[i] = vf1[l[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f10 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vf1[l[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f11 (long x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vf2[i] = vf1[l[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f12 (long x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vf1[l[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f13 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vd2[i] = vd1[l[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f14 (void)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vd1[l[i]];
+}
+
+__attribute__((noinline, noclone)) void
+f15 (long x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ vd2[i] = vd1[l[i] + x];
+}
+
+__attribute__((noinline, noclone)) void
+f16 (long x)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ n[i] = (int) vd1[l[i] + x];
+}
+
+static void
+avx2_test (void)
+{
+ int i;
+
+ for (i = 0; i < N + 16; i++)
+ {
+ asm ("");
+ vf1[i] = 17.0f + i;
+ vd1[i] = 19.0 + i;
+ }
+ for (i = 0; i < N; i++)
+ {
+ asm ("");
+ k[i] = (i * 731) & (N - 1);
+ l[i] = (i * 657) & (N - 1);
+ }
+
+ f1 ();
+ f2 ();
+ for (i = 0; i < N; i++)
+ if (vf2[i] != ((i * 731) & (N - 1)) + 17
+ || n[i] != ((i * 731) & (N - 1)) + 17)
+ abort ();
+
+ f3 (12);
+ f4 (14);
+ for (i = 0; i < N; i++)
+ if (vf2[i] != ((i * 731) & (N - 1)) + 17 + 12
+ || n[i] != ((i * 731) & (N - 1)) + 17 + 14)
+ abort ();
+
+ f5 ();
+ f6 ();
+ for (i = 0; i < N; i++)
+ if (vd2[i] != ((i * 731) & (N - 1)) + 19
+ || n[i] != ((i * 731) & (N - 1)) + 19)
+ abort ();
+
+ f7 (7);
+ f8 (9);
+ for (i = 0; i < N; i++)
+ if (vd2[i] != ((i * 731) & (N - 1)) + 19 + 7
+ || n[i] != ((i * 731) & (N - 1)) + 19 + 9)
+ abort ();
+
+ f9 ();
+ f10 ();
+ for (i = 0; i < N; i++)
+ if (vf2[i] != ((i * 657) & (N - 1)) + 17
+ || n[i] != ((i * 657) & (N - 1)) + 17)
+ abort ();
+
+ f11 (2);
+ f12 (4);
+ for (i = 0; i < N; i++)
+ if (vf2[i] != ((i * 657) & (N - 1)) + 17 + 2
+ || n[i] != ((i * 657) & (N - 1)) + 17 + 4)
+ abort ();
+
+ f13 ();
+ f14 ();
+ for (i = 0; i < N; i++)
+ if (vd2[i] != ((i * 657) & (N - 1)) + 19
+ || n[i] != ((i * 657) & (N - 1)) + 19)
+ abort ();
+
+ f15 (13);
+ f16 (15);
+ for (i = 0; i < N; i++)
+ if (vd2[i] != ((i * 657) & (N - 1)) + 19 + 13
+ || n[i] != ((i * 657) & (N - 1)) + 19 + 15)
+ abort ();
+}
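
All index arrays in this test are signed (int k[] and long l[]). That matters for the new i386 hook: ix86_vectorize_builtin_gather rejects indexes that are unsigned and narrower than a pointer, because the v*gather* instructions sign-extend the index to pointer mode. A sketch of a loop that, by that check, should not be turned into a gather on a 64-bit target (an inference from the hook's code, not something the patch states; names are illustrative):

#define N 1024
float vf1[N + 16], vf2[N];
unsigned int uk[N];   /* 32-bit unsigned indexes */

void
no_gather_expected (void)
{
  int i;
  /* Zero-extension of the index cannot be expressed by the AVX2 gather
     insns, so the hook returns NULL_TREE for this index type.  */
  for (i = 0; i < N; i++)
    vf2[i] = vf1[uk[i]];
}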
diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-2.c b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
new file mode 100644
index 0000000..8a7fe95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-gather-2.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */
+
+#include "avx2-gather-1.c"
+
+/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops in function" 16 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-3.c b/gcc/testsuite/gcc.target/i386/avx2-gather-3.c
new file mode 100644
index 0000000..fb6289c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-gather-3.c
@@ -0,0 +1,167 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-O3 -mavx2 -ffast-math" } */
+
+#include "avx2-check.h"
+
+#define N 1024
+float f[N];
+double d[N];
+int k[N];
+float *l[N];
+double *n[N];
+int **m[N];
+long **o[N];
+long q[N];
+long *r[N];
+int *s[N];
+
+__attribute__((noinline, noclone)) float
+f1 (void)
+{
+ int i;
+ float g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += f[k[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) float
+f2 (float *p)
+{
+ int i;
+ float g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += p[k[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) float
+f3 (void)
+{
+ int i;
+ float g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += *l[i];
+ return g;
+}
+
+__attribute__((noinline, noclone)) int
+f4 (void)
+{
+ int i;
+ int g = 0;
+ for (i = 0; i < N / 2; i++)
+ g += **m[i];
+ return g;
+}
+
+__attribute__((noinline, noclone)) double
+f5 (void)
+{
+ int i;
+ double g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += d[k[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) double
+f6 (double *p)
+{
+ int i;
+ double g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += p[k[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) double
+f7 (void)
+{
+ int i;
+ double g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += *n[i];
+ return g;
+}
+
+__attribute__((noinline, noclone)) int
+f8 (void)
+{
+ int i;
+ int g = 0;
+ for (i = 0; i < N / 2; i++)
+ g += **o[i];
+ return g;
+}
+
+__attribute__((noinline, noclone)) float
+f9 (void)
+{
+ int i;
+ float g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += f[q[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) float
+f10 (float *p)
+{
+ int i;
+ float g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += p[q[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) double
+f11 (void)
+{
+ int i;
+ double g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += d[q[i]];
+ return g;
+}
+
+__attribute__((noinline, noclone)) double
+f12 (double *p)
+{
+ int i;
+ double g = 0.0;
+ for (i = 0; i < N / 2; i++)
+ g += p[q[i]];
+ return g;
+}
+
+static void
+avx2_test (void)
+{
+ int i;
+
+ for (i = 0; i < N; i++)
+ {
+ asm ("");
+ f[i] = -256.0f + i;
+ d[i] = -258.0 + i;
+ k[i] = (i * 731) & (N - 1);
+ q[i] = (i * 657) & (N - 1);
+ l[i] = &f[(i * 239) & (N - 1)];
+ n[i] = &d[(i * 271) & (N - 1)];
+ r[i] = &q[(i * 323) & (N - 1)];
+ s[i] = &k[(i * 565) & (N - 1)];
+ m[i] = &s[(i * 13) & (N - 1)];
+ o[i] = &r[(i * 19) & (N - 1)];
+ }
+
+ if (f1 () != 136448.0f || f2 (f) != 136448.0f || f3 () != 130304.0)
+ abort ();
+ if (f4 () != 261376 || f5 () != 135424.0 || f6 (d) != 135424.0)
+ abort ();
+ if (f7 () != 129280.0 || f8 () != 259840L || f9 () != 130816.0f)
+ abort ();
+ if (f10 (f) != 130816.0f || f11 () != 129792.0 || f12 (d) != 129792.0)
+ abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx2-gather-4.c b/gcc/testsuite/gcc.target/i386/avx2-gather-4.c
new file mode 100644
index 0000000..440a9c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-gather-4.c
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-O3 -mavx2" } */
+
+#include "avx2-check.h"
+
+#define N 1024
+int a[N], b[N], c[N], d[N];
+
+__attribute__((noinline, noclone)) void
+foo (float *__restrict p, float *__restrict q, float *__restrict r,
+ long s1, long s2, long s3)
+{
+ int i;
+ for (i = 0; i < N; i++)
+ p[i] = q[a[i] * s1 + b[i] * s2 + s3] * r[c[i] * s1 + d[i] * s2 + s3];
+}
+
+static void
+avx2_test (void)
+{
+ int i;
+ float e[N], f[N], g[N];
+ for (i = 0; i < N; i++)
+ {
+ a[i] = (i * 7) & (N / 8 - 1);
+ b[i] = (i * 13) & (N / 8 - 1);
+ c[i] = (i * 23) & (N / 8 - 1);
+ d[i] = (i * 5) & (N / 8 - 1);
+ e[i] = 16.5 + i;
+ f[i] = 127.5 - i;
+ }
+ foo (g, e, f, 3, 2, 4);
+ for (i = 0; i < N; i++)
+ if (g[i] != (float) ((20.5 + a[i] * 3 + b[i] * 2)
+ * (123.5 - c[i] * 3 - d[i] * 2)))
+ abort ();
+}
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index 4e0de05..89d123d 100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -1351,13 +1351,11 @@ dr_may_alias_p (const struct data_reference *a, const struct data_reference *b,
return refs_may_alias_p (addr_a, addr_b);
}
-static void compute_self_dependence (struct data_dependence_relation *);
-
/* Initialize a data dependence relation between data accesses A and
B. NB_LOOPS is the number of loops surrounding the references: the
size of the classic distance/direction vectors. */
-static struct data_dependence_relation *
+struct data_dependence_relation *
initialize_data_dependence_relation (struct data_reference *a,
struct data_reference *b,
VEC (loop_p, heap) *loop_nest)
@@ -4121,7 +4119,7 @@ compute_affine_dependence (struct data_dependence_relation *ddr,
/* This computes the dependence relation for the same data
reference into DDR. */
-static void
+void
compute_self_dependence (struct data_dependence_relation *ddr)
{
unsigned int i;
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index c55bd48a..0f12962 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -1,5 +1,5 @@
/* Data references and dependences detectors.
- Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
+ Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
Free Software Foundation, Inc.
Contributed by Sebastian Pop <pop@cri.ensmp.fr>
@@ -423,6 +423,9 @@ extern bool graphite_find_data_references_in_stmt (loop_p, loop_p, gimple,
VEC (data_reference_p, heap) **);
struct data_reference *create_data_ref (loop_p, loop_p, tree, gimple, bool);
extern bool find_loop_nest (struct loop *, VEC (loop_p, heap) **);
+extern struct data_dependence_relation *initialize_data_dependence_relation
+ (struct data_reference *, struct data_reference *, VEC (loop_p, heap) *);
+extern void compute_self_dependence (struct data_dependence_relation *);
extern void compute_all_dependences (VEC (data_reference_p, heap) *,
VEC (ddr_p, heap) **, VEC (loop_p, heap) *,
bool);
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index a239216..0ff8ee8 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -2497,6 +2497,199 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
return true;
}
+/* Check whether a non-affine read in STMT is suitable for a gather load
+ and if so, return a builtin decl for that operation. */
+
+tree
+vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
+ tree *offp, int *scalep)
+{
+ HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+ struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+ tree offtype = NULL_TREE;
+ tree decl, base, off;
+ enum machine_mode pmode;
+ int punsignedp, pvolatilep;
+
+ /* The gather builtins need address of the form
+ loop_invariant + vector * {1, 2, 4, 8}
+ or
+ loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
+ Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
+ of loop invariants/SSA_NAMEs defined in the loop, with casts,
+ multiplications and additions in it. To get a vector, we need
+ a single SSA_NAME that will be defined in the loop and will
+ contain everything that is not loop invariant and that can be
+ vectorized. The following code attempts to find such a preexisting
+ SSA_NAME OFF and put the loop invariants into a tree BASE
+ that can be gimplified before the loop. */
+ base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
+ &pmode, &punsignedp, &pvolatilep, false);
+ gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
+
+ if (TREE_CODE (base) == MEM_REF)
+ {
+ if (!integer_zerop (TREE_OPERAND (base, 1)))
+ {
+ if (off == NULL_TREE)
+ {
+ double_int moff = mem_ref_offset (base);
+ off = double_int_to_tree (sizetype, moff);
+ }
+ else
+ off = size_binop (PLUS_EXPR, off,
+ fold_convert (sizetype, TREE_OPERAND (base, 1)));
+ }
+ base = TREE_OPERAND (base, 0);
+ }
+ else
+ base = build_fold_addr_expr (base);
+
+ if (off == NULL_TREE)
+ off = size_zero_node;
+
+ /* If base is not loop invariant, then if off is 0 we start with just
+ the constant offset in the loop invariant BASE and continue with base
+ as OFF; otherwise give up.
+ We could handle that case by gimplifying the addition of base + off
+ into some SSA_NAME and using that as off, but for now punt. */
+ if (!expr_invariant_in_loop_p (loop, base))
+ {
+ if (!integer_zerop (off))
+ return NULL_TREE;
+ off = base;
+ base = size_int (pbitpos / BITS_PER_UNIT);
+ }
+ /* Otherwise put base + constant offset into the loop invariant BASE
+ and continue with OFF. */
+ else
+ {
+ base = fold_convert (sizetype, base);
+ base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
+ }
+
+ /* OFF at this point may be either a SSA_NAME or some tree expression
+ from get_inner_reference. Try to peel off loop invariants from it
+ into BASE as long as possible. */
+ STRIP_NOPS (off);
+ while (offtype == NULL_TREE)
+ {
+ enum tree_code code;
+ tree op0, op1, add = NULL_TREE;
+
+ if (TREE_CODE (off) == SSA_NAME)
+ {
+ gimple def_stmt = SSA_NAME_DEF_STMT (off);
+
+ if (expr_invariant_in_loop_p (loop, off))
+ return NULL_TREE;
+
+ if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
+ break;
+
+ op0 = gimple_assign_rhs1 (def_stmt);
+ code = gimple_assign_rhs_code (def_stmt);
+ op1 = gimple_assign_rhs2 (def_stmt);
+ }
+ else
+ {
+ if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
+ return NULL_TREE;
+ code = TREE_CODE (off);
+ extract_ops_from_tree (off, &code, &op0, &op1);
+ }
+ switch (code)
+ {
+ case POINTER_PLUS_EXPR:
+ case PLUS_EXPR:
+ if (expr_invariant_in_loop_p (loop, op0))
+ {
+ add = op0;
+ off = op1;
+ do_add:
+ add = fold_convert (sizetype, add);
+ if (scale != 1)
+ add = size_binop (MULT_EXPR, add, size_int (scale));
+ base = size_binop (PLUS_EXPR, base, add);
+ continue;
+ }
+ if (expr_invariant_in_loop_p (loop, op1))
+ {
+ add = op1;
+ off = op0;
+ goto do_add;
+ }
+ break;
+ case MINUS_EXPR:
+ if (expr_invariant_in_loop_p (loop, op1))
+ {
+ add = fold_convert (sizetype, op1);
+ add = size_binop (MINUS_EXPR, size_zero_node, add);
+ off = op0;
+ goto do_add;
+ }
+ break;
+ case MULT_EXPR:
+ if (scale == 1 && host_integerp (op1, 0))
+ {
+ scale = tree_low_cst (op1, 0);
+ off = op0;
+ continue;
+ }
+ break;
+ case SSA_NAME:
+ off = op0;
+ continue;
+ CASE_CONVERT:
+ if (!POINTER_TYPE_P (TREE_TYPE (op0))
+ && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
+ break;
+ if (TYPE_PRECISION (TREE_TYPE (op0))
+ == TYPE_PRECISION (TREE_TYPE (off)))
+ {
+ off = op0;
+ continue;
+ }
+ if (TYPE_PRECISION (TREE_TYPE (op0))
+ < TYPE_PRECISION (TREE_TYPE (off)))
+ {
+ off = op0;
+ offtype = TREE_TYPE (off);
+ STRIP_NOPS (off);
+ continue;
+ }
+ break;
+ default:
+ break;
+ }
+ break;
+ }
+
+ /* If at the end OFF still isn't a SSA_NAME or isn't
+ defined in the loop, punt. */
+ if (TREE_CODE (off) != SSA_NAME
+ || expr_invariant_in_loop_p (loop, off))
+ return NULL_TREE;
+
+ if (offtype == NULL_TREE)
+ offtype = TREE_TYPE (off);
+
+ decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
+ offtype, scale);
+ if (decl == NULL_TREE)
+ return NULL_TREE;
+
+ if (basep)
+ *basep = base;
+ if (offp)
+ *offp = off;
+ if (scalep)
+ *scalep = scale;
+ return decl;
+}
+
/* Function vect_analyze_data_refs.
@@ -2573,6 +2766,7 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
gimple stmt;
stmt_vec_info stmt_info;
tree base, offset, init;
+ bool gather = false;
int vf;
if (!dr || !DR_REF (dr))
@@ -2594,22 +2788,51 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
/* Check that analysis of the data-ref succeeded. */
if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
- || !DR_STEP (dr))
+ || !DR_STEP (dr))
{
- if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
- {
- fprintf (vect_dump, "not vectorized: data ref analysis failed ");
- print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
- }
+ /* If the target supports vector gather loads, see if they can
+ be used. */
+ if (loop_vinfo
+ && DR_IS_READ (dr)
+ && !TREE_THIS_VOLATILE (DR_REF (dr))
+ && targetm.vectorize.builtin_gather != NULL
+ && !nested_in_vect_loop_p (loop, stmt))
+ {
+ struct data_reference *newdr
+ = create_data_ref (NULL, loop_containing_stmt (stmt),
+ DR_REF (dr), stmt, true);
+ gcc_assert (newdr != NULL && DR_REF (newdr));
+ if (DR_BASE_ADDRESS (newdr)
+ && DR_OFFSET (newdr)
+ && DR_INIT (newdr)
+ && DR_STEP (newdr)
+ && integer_zerop (DR_STEP (newdr)))
+ {
+ dr = newdr;
+ gather = true;
+ }
+ else
+ free_data_ref (newdr);
+ }
- if (bb_vinfo)
- {
- STMT_VINFO_VECTORIZABLE (stmt_info) = false;
- stop_bb_analysis = true;
- continue;
- }
+ if (!gather)
+ {
+ if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+ {
+ fprintf (vect_dump, "not vectorized: data ref analysis "
+ "failed ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
- return false;
+ if (bb_vinfo)
+ {
+ STMT_VINFO_VECTORIZABLE (stmt_info) = false;
+ stop_bb_analysis = true;
+ continue;
+ }
+
+ return false;
+ }
}
if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
@@ -2625,7 +2848,9 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
continue;
}
- return false;
+ if (gather)
+ free_data_ref (dr);
+ return false;
}
if (TREE_THIS_VOLATILE (DR_REF (dr)))
@@ -2666,6 +2891,8 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
continue;
}
+ if (gather)
+ free_data_ref (dr);
return false;
}
@@ -2791,6 +3018,8 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
continue;
}
+ if (gather)
+ free_data_ref (dr);
return false;
}
@@ -2818,8 +3047,13 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
stop_bb_analysis = true;
continue;
}
- else
- return false;
+
+ if (gather)
+ {
+ STMT_VINFO_DATA_REF (stmt_info) = NULL;
+ free_data_ref (dr);
+ }
+ return false;
}
/* Adjust the minimal vectorization factor according to the
@@ -2827,6 +3061,86 @@ vect_analyze_data_refs (loop_vec_info loop_vinfo,
vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
if (vf > *min_vf)
*min_vf = vf;
+
+ if (gather)
+ {
+ unsigned int j, k, n;
+ struct data_reference *olddr
+ = VEC_index (data_reference_p, datarefs, i);
+ VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo);
+ struct data_dependence_relation *ddr, *newddr;
+ bool bad = false;
+ tree off;
+ VEC (loop_p, heap) *nest = LOOP_VINFO_LOOP_NEST (loop_vinfo);
+
+ if (!vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL)
+ || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
+ {
+ if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+ {
+ fprintf (vect_dump,
+ "not vectorized: not suitable for gather ");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+ return false;
+ }
+
+ n = VEC_length (data_reference_p, datarefs) - 1;
+ for (j = 0, k = i - 1; j < i; j++)
+ {
+ ddr = VEC_index (ddr_p, ddrs, k);
+ gcc_assert (DDR_B (ddr) == olddr);
+ newddr = initialize_data_dependence_relation (DDR_A (ddr), dr,
+ nest);
+ VEC_replace (ddr_p, ddrs, k, newddr);
+ free_dependence_relation (ddr);
+ if (!bad
+ && DR_IS_WRITE (DDR_A (newddr))
+ && DDR_ARE_DEPENDENT (newddr) != chrec_known)
+ bad = true;
+ k += --n;
+ }
+
+ k++;
+ n = k + VEC_length (data_reference_p, datarefs) - i - 1;
+ for (; k < n; k++)
+ {
+ ddr = VEC_index (ddr_p, ddrs, k);
+ gcc_assert (DDR_A (ddr) == olddr);
+ newddr = initialize_data_dependence_relation (dr, DDR_B (ddr),
+ nest);
+ VEC_replace (ddr_p, ddrs, k, newddr);
+ free_dependence_relation (ddr);
+ if (!bad
+ && DR_IS_WRITE (DDR_B (newddr))
+ && DDR_ARE_DEPENDENT (newddr) != chrec_known)
+ bad = true;
+ }
+
+ k = VEC_length (ddr_p, ddrs)
+ - VEC_length (data_reference_p, datarefs) + i;
+ ddr = VEC_index (ddr_p, ddrs, k);
+ gcc_assert (DDR_A (ddr) == olddr && DDR_B (ddr) == olddr);
+ newddr = initialize_data_dependence_relation (dr, dr, nest);
+ compute_self_dependence (newddr);
+ VEC_replace (ddr_p, ddrs, k, newddr);
+ free_dependence_relation (ddr);
+ VEC_replace (data_reference_p, datarefs, i, dr);
+
+ if (bad)
+ {
+ if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+ {
+ fprintf (vect_dump,
+ "not vectorized: data dependence conflict"
+ " prevents gather");
+ print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+ }
+ return false;
+ }
+
+ STMT_VINFO_GATHER_P (stmt_info) = true;
+ }
}
return true;
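
vect_check_gather above splits the address of the non-affine read into a loop-invariant BASE, a per-iteration OFF (the part that gets vectorized) and a constant SCALE, matching the loop_invariant + vector * {1, 2, 4, 8} form the gather builtins want. For the f3-style access vf1[k[i] + x] from avx2-gather-1.c the intended split is base = &vf1[x], off = k[i], scale = sizeof (float). A small standalone check of that address arithmetic (illustrative only, not part of the patch):

#include <assert.h>
#include <stddef.h>

float vf1[1024 + 16];
int k[1024];

int
main (void)
{
  int x = 12, i = 5;
  k[i] = 42;
  char *base = (char *) &vf1[x];        /* loop-invariant part */
  ptrdiff_t off = k[i];                 /* vectorized part     */
  size_t scale = sizeof (float);        /* one of 1, 2, 4, 8   */
  assert ((char *) &vf1[k[i] + x] == base + off * (ptrdiff_t) scale);
  return 0;
}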
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index dd65636..8b9a2cf 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -332,6 +332,8 @@ exist_non_indexing_operands_for_use_p (tree use, gimple stmt)
- LIVE_P, RELEVANT - enum values to be set in the STMT_VINFO of the stmt
that defined USE. This is done by calling mark_relevant and passing it
the WORKLIST (to add DEF_STMT to the WORKLIST in case it is relevant).
+ - FORCE is true if the exist_non_indexing_operands_for_use_p check
+ shouldn't be performed.
Outputs:
Generally, LIVE_P and RELEVANT are used to define the liveness and
@@ -351,7 +353,8 @@ exist_non_indexing_operands_for_use_p (tree use, gimple stmt)
static bool
process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
- enum vect_relevant relevant, VEC(gimple,heap) **worklist)
+ enum vect_relevant relevant, VEC(gimple,heap) **worklist,
+ bool force)
{
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
@@ -363,7 +366,7 @@ process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
/* case 1: we are only interested in uses that need to be vectorized. Uses
that are used for address computation are not considered relevant. */
- if (!exist_non_indexing_operands_for_use_p (use, stmt))
+ if (!force && !exist_non_indexing_operands_for_use_p (use, stmt))
return true;
if (!vect_is_simple_use (use, loop_vinfo, NULL, &def_stmt, &def, &dt))
@@ -646,7 +649,7 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
break;
}
- if (is_pattern_stmt_p (vinfo_for_stmt (stmt)))
+ if (is_pattern_stmt_p (stmt_vinfo))
{
/* Pattern statements are not inserted into the code, so
FOR_EACH_PHI_OR_STMT_USE optimizes their operands out, and we
@@ -660,9 +663,9 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
if (rhs_code == COND_EXPR && COMPARISON_CLASS_P (op))
{
if (!process_use (stmt, TREE_OPERAND (op, 0), loop_vinfo,
- live_p, relevant, &worklist)
+ live_p, relevant, &worklist, false)
|| !process_use (stmt, TREE_OPERAND (op, 1), loop_vinfo,
- live_p, relevant, &worklist))
+ live_p, relevant, &worklist, false))
{
VEC_free (gimple, heap, worklist);
return false;
@@ -673,7 +676,7 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
{
op = gimple_op (stmt, i);
if (!process_use (stmt, op, loop_vinfo, live_p, relevant,
- &worklist))
+ &worklist, false))
{
VEC_free (gimple, heap, worklist);
return false;
@@ -686,7 +689,7 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
{
tree arg = gimple_call_arg (stmt, i);
if (!process_use (stmt, arg, loop_vinfo, live_p, relevant,
- &worklist))
+ &worklist, false))
{
VEC_free (gimple, heap, worklist);
return false;
@@ -699,12 +702,25 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
{
tree op = USE_FROM_PTR (use_p);
if (!process_use (stmt, op, loop_vinfo, live_p, relevant,
- &worklist))
+ &worklist, false))
{
VEC_free (gimple, heap, worklist);
return false;
}
}
+
+ if (STMT_VINFO_GATHER_P (stmt_vinfo))
+ {
+ tree off;
+ tree decl = vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
+ gcc_assert (decl);
+ if (!process_use (stmt, off, loop_vinfo, live_p, relevant,
+ &worklist, true))
+ {
+ VEC_free (gimple, heap, worklist);
+ return false;
+ }
+ }
} /* while worklist */
VEC_free (gimple, heap, worklist);
@@ -3914,23 +3930,17 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
return true;
}
-/* Given a vector type VECTYPE returns a builtin DECL to be used
- for vector permutation and returns the mask that implements
- reversal of the vector elements. If that is impossible to do,
- returns NULL. */
+/* Given a vector type VECTYPE and permutation SEL returns
+ the VECTOR_CST mask that implements the permutation of the
+ vector elements. If that is impossible to do, returns NULL. */
static tree
-perm_mask_for_reverse (tree vectype)
+gen_perm_mask (tree vectype, unsigned char *sel)
{
tree mask_elt_type, mask_type, mask_vec;
int i, nunits;
- unsigned char *sel;
nunits = TYPE_VECTOR_SUBPARTS (vectype);
- sel = XALLOCAVEC (unsigned char, nunits);
-
- for (i = 0; i < nunits; ++i)
- sel[i] = nunits - 1 - i;
if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
return NULL;
@@ -3941,33 +3951,52 @@ perm_mask_for_reverse (tree vectype)
mask_type = get_vectype_for_scalar_type (mask_elt_type);
mask_vec = NULL;
- for (i = 0; i < nunits; i++)
- mask_vec = tree_cons (NULL, build_int_cst (mask_elt_type, i), mask_vec);
+ for (i = nunits - 1; i >= 0; i--)
+ mask_vec = tree_cons (NULL, build_int_cst (mask_elt_type, sel[i]),
+ mask_vec);
mask_vec = build_vector (mask_type, mask_vec);
return mask_vec;
}
-/* Given a vector variable X, that was generated for the scalar LHS of
- STMT, generate instructions to reverse the vector elements of X,
- insert them a *GSI and return the permuted vector variable. */
+/* Given a vector type VECTYPE returns the VECTOR_CST mask that implements
+ reversal of the vector elements. If that is impossible to do,
+ returns NULL. */
static tree
-reverse_vec_elements (tree x, gimple stmt, gimple_stmt_iterator *gsi)
+perm_mask_for_reverse (tree vectype)
+{
+ int i, nunits;
+ unsigned char *sel;
+
+ nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ sel = XALLOCAVEC (unsigned char, nunits);
+
+ for (i = 0; i < nunits; ++i)
+ sel[i] = nunits - 1 - i;
+
+ return gen_perm_mask (vectype, sel);
+}
+
+/* Given vector variables X and Y that were generated for the scalar
+ STMT, generate instructions to permute the vector elements of X and Y
+ using the permutation mask MASK_VEC, insert them at *GSI and return the
+ permuted vector variable. */
+
+static tree
+permute_vec_elements (tree x, tree y, tree mask_vec, gimple stmt,
+ gimple_stmt_iterator *gsi)
{
tree vectype = TREE_TYPE (x);
- tree mask_vec, perm_dest, data_ref;
+ tree perm_dest, data_ref;
gimple perm_stmt;
- mask_vec = perm_mask_for_reverse (vectype);
-
perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
+ data_ref = make_ssa_name (perm_dest, NULL);
/* Generate the permute statement. */
- perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, perm_dest,
- x, x, mask_vec);
- data_ref = make_ssa_name (perm_dest, perm_stmt);
- gimple_set_lhs (perm_stmt, data_ref);
+ perm_stmt = gimple_build_assign_with_ops3 (VEC_PERM_EXPR, data_ref,
+ x, y, mask_vec);
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
return data_ref;
@@ -4026,6 +4055,10 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
int vf;
tree aggr_type;
+ tree gather_base = NULL_TREE, gather_off = NULL_TREE;
+ tree gather_off_vectype = NULL_TREE, gather_decl = NULL_TREE;
+ int gather_scale = 1;
+ enum vect_def_type gather_dt = vect_unknown_def_type;
if (loop_vinfo)
{
@@ -4106,7 +4139,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
{
strided_load = true;
/* FORNOW */
- gcc_assert (! nested_in_vect_loop);
+ gcc_assert (! nested_in_vect_loop && !STMT_VINFO_GATHER_P (stmt_info));
first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
if (!slp && !PURE_SLP_STMT (stmt_info))
@@ -4121,7 +4154,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
if (negative)
{
- gcc_assert (!strided_load);
+ gcc_assert (!strided_load && !STMT_VINFO_GATHER_P (stmt_info));
alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
if (alignment_support_scheme != dr_aligned
&& alignment_support_scheme != dr_unaligned_supported)
@@ -4138,6 +4171,23 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
}
}
+ if (STMT_VINFO_GATHER_P (stmt_info))
+ {
+ gimple def_stmt;
+ tree def;
+ gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base,
+ &gather_off, &gather_scale);
+ gcc_assert (gather_decl);
+ if (!vect_is_simple_use_1 (gather_off, loop_vinfo, bb_vinfo,
+ &def_stmt, &def, &gather_dt,
+ &gather_off_vectype))
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "gather index use not simple.");
+ return false;
+ }
+ }
+
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
@@ -4150,6 +4200,161 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
/** Transform. **/
+ if (STMT_VINFO_GATHER_P (stmt_info))
+ {
+ tree vec_oprnd0 = NULL_TREE, op;
+ tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl));
+ tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
+ tree ptr, mask, var, scale, perm_mask = NULL_TREE, prev_res = NULL_TREE;
+ edge pe = loop_preheader_edge (loop);
+ gimple_seq seq;
+ basic_block new_bb;
+ enum { NARROW, NONE, WIDEN } modifier;
+ int gather_off_nunits = TYPE_VECTOR_SUBPARTS (gather_off_vectype);
+
+ if (nunits == gather_off_nunits)
+ modifier = NONE;
+ else if (nunits == gather_off_nunits / 2)
+ {
+ unsigned char *sel = XALLOCAVEC (unsigned char, gather_off_nunits);
+ modifier = WIDEN;
+
+ for (i = 0; i < gather_off_nunits; ++i)
+ sel[i] = i | nunits;
+
+ perm_mask = gen_perm_mask (gather_off_vectype, sel);
+ gcc_assert (perm_mask != NULL_TREE);
+ }
+ else if (nunits == gather_off_nunits * 2)
+ {
+ unsigned char *sel = XALLOCAVEC (unsigned char, nunits);
+ modifier = NARROW;
+
+ for (i = 0; i < nunits; ++i)
+ sel[i] = i < gather_off_nunits
+ ? i : i + nunits - gather_off_nunits;
+
+ perm_mask = gen_perm_mask (vectype, sel);
+ gcc_assert (perm_mask != NULL_TREE);
+ ncopies *= 2;
+ }
+ else
+ gcc_unreachable ();
+
+ rettype = TREE_TYPE (TREE_TYPE (gather_decl));
+ srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+ ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+ idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+ masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+ scaletype = TREE_VALUE (arglist);
+ gcc_checking_assert (types_compatible_p (srctype, rettype)
+ && types_compatible_p (srctype, masktype));
+
+ vec_dest = vect_create_destination_var (scalar_dest, vectype);
+
+ ptr = fold_convert (ptrtype, gather_base);
+ if (!is_gimple_min_invariant (ptr))
+ {
+ ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE);
+ new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
+ gcc_assert (!new_bb);
+ }
+
+ /* Currently we support only unconditional gather loads,
+ so mask should be all ones. */
+ if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
+ mask = build_int_cst (TREE_TYPE (masktype), -1);
+ else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype)))
+ {
+ REAL_VALUE_TYPE r;
+ long tmp[6];
+ for (j = 0; j < 6; ++j)
+ tmp[j] = -1;
+ real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (masktype)));
+ mask = build_real (TREE_TYPE (masktype), r);
+ }
+ else
+ gcc_unreachable ();
+ mask = build_vector_from_val (masktype, mask);
+ mask = vect_init_vector (stmt, mask, masktype, NULL);
+
+ scale = build_int_cst (scaletype, gather_scale);
+
+ prev_stmt_info = NULL;
+ for (j = 0; j < ncopies; ++j)
+ {
+ if (modifier == WIDEN && (j & 1))
+ op = permute_vec_elements (vec_oprnd0, vec_oprnd0,
+ perm_mask, stmt, gsi);
+ else if (j == 0)
+ op = vec_oprnd0
+ = vect_get_vec_def_for_operand (gather_off, stmt, NULL);
+ else
+ op = vec_oprnd0
+ = vect_get_vec_def_for_stmt_copy (gather_dt, vec_oprnd0);
+
+ if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
+ {
+ gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op))
+ == TYPE_VECTOR_SUBPARTS (idxtype));
+ var = vect_get_new_vect_var (idxtype, vect_simple_var, NULL);
+ add_referenced_var (var);
+ var = make_ssa_name (var, NULL);
+ op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
+ new_stmt
+ = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var,
+ op, NULL_TREE);
+ vect_finish_stmt_generation (stmt, new_stmt, gsi);
+ op = var;
+ }
+
+ new_stmt
+ = gimple_build_call (gather_decl, 5, mask, ptr, op, mask, scale);
+
+ if (!useless_type_conversion_p (vectype, rettype))
+ {
+ gcc_assert (TYPE_VECTOR_SUBPARTS (vectype)
+ == TYPE_VECTOR_SUBPARTS (rettype));
+ var = vect_get_new_vect_var (rettype, vect_simple_var, NULL);
+ add_referenced_var (var);
+ op = make_ssa_name (var, new_stmt);
+ gimple_call_set_lhs (new_stmt, op);
+ vect_finish_stmt_generation (stmt, new_stmt, gsi);
+ var = make_ssa_name (vec_dest, NULL);
+ op = build1 (VIEW_CONVERT_EXPR, vectype, op);
+ new_stmt
+ = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var, op,
+ NULL_TREE);
+ }
+ else
+ {
+ var = make_ssa_name (vec_dest, new_stmt);
+ gimple_call_set_lhs (new_stmt, var);
+ }
+
+ vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+ if (modifier == NARROW)
+ {
+ if ((j & 1) == 0)
+ {
+ prev_res = var;
+ continue;
+ }
+ var = permute_vec_elements (prev_res, var,
+ perm_mask, stmt, gsi);
+ new_stmt = SSA_NAME_DEF_STMT (var);
+ }
+
+ if (prev_stmt_info == NULL)
+ STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+ else
+ STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+ prev_stmt_info = vinfo_for_stmt (new_stmt);
+ }
+ return true;
+ }
+
if (strided_load)
{
first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
@@ -4541,7 +4746,9 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
if (negative)
{
- new_temp = reverse_vec_elements (new_temp, stmt, gsi);
+ tree perm_mask = perm_mask_for_reverse (vectype);
+ new_temp = permute_vec_elements (new_temp, new_temp,
+ perm_mask, stmt, gsi);
new_stmt = SSA_NAME_DEF_STMT (new_temp);
}
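
In the gather arm of vectorizable_load above, the interesting cases are when the data vector and the offset vector disagree on lane count. With WIDEN (nunits == gather_off_nunits / 2, e.g. V4DF data with V8SI offsets) every second copy permutes the high half of the offset vector into place; with NARROW (nunits == gather_off_nunits * 2, e.g. V8SF data with V4DI offsets) ncopies is doubled and each pair of gather results, each of which carries only half the data lanes, is combined with a VEC_PERM_EXPR. A standalone sketch reproducing the selector construction used above (illustrative; it prints the masks gen_perm_mask would be handed):

#include <stdio.h>

int
main (void)
{
  int i, nunits, gather_off_nunits;

  /* WIDEN: e.g. 4 doubles gathered with an 8-lane SImode offset vector.  */
  nunits = 4;
  gather_off_nunits = 8;
  printf ("WIDEN sel:  ");
  for (i = 0; i < gather_off_nunits; ++i)
    printf ("%d ", i | nunits);         /* 4 5 6 7 4 5 6 7 */
  printf ("\n");

  /* NARROW: e.g. 8 floats gathered with a 4-lane DImode offset vector.  */
  nunits = 8;
  gather_off_nunits = 4;
  printf ("NARROW sel: ");
  for (i = 0; i < nunits; ++i)
    printf ("%d ", i < gather_off_nunits ? i : i + nunits - gather_off_nunits);
  printf ("\n");                        /* 0 1 2 3 8 9 10 11 */
  return 0;
}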
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1d61d9d..927c0bd 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -535,6 +535,9 @@ typedef struct _stmt_vec_info {
/* Is this statement vectorizable or should it be skipped in (partial)
vectorization. */
bool vectorizable;
+
+ /* For loads only, true if this is a gather load. */
+ bool gather_p;
} *stmt_vec_info;
/* Access Functions. */
@@ -548,6 +551,7 @@ typedef struct _stmt_vec_info {
#define STMT_VINFO_VEC_STMT(S) (S)->vectorized_stmt
#define STMT_VINFO_VECTORIZABLE(S) (S)->vectorizable
#define STMT_VINFO_DATA_REF(S) (S)->data_ref_info
+#define STMT_VINFO_GATHER_P(S) (S)->gather_p
#define STMT_VINFO_DR_BASE_ADDRESS(S) (S)->dr_base_address
#define STMT_VINFO_DR_INIT(S) (S)->dr_init
@@ -858,6 +862,8 @@ extern bool vect_analyze_data_refs_alignment (loop_vec_info, bb_vec_info);
extern bool vect_verify_datarefs_alignment (loop_vec_info, bb_vec_info);
extern bool vect_analyze_data_ref_accesses (loop_vec_info, bb_vec_info);
extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
+extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *,
+ int *);
extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *);
extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree,
tree *, gimple_stmt_iterator *,