37 files changed, 842 insertions, 49 deletions
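Before the diff itself, a minimal sketch of the kind of loop this patch targets, modelled on the new scatter_store_1.c test below (the function name is illustrative, not part of the patch). With -O2 -ftree-vectorize on an SVE target, the indexed store is expected to become a predicated scatter such as st1w z0.s, p0, [x0, z1.s, sxtw 2]:

#include <stdint.h>

/* Each iteration writes to dest[indices[i]], so the destination
   addresses are non-contiguous.  On targets without a built-in
   scatter hook such a store previously blocked vectorization; with
   this patch it is recognized as IFN_SCATTER_STORE and expanded
   through the new scatter_store optab.  */
void
scatter_example (int32_t *restrict dest, int32_t *restrict src,
                 int32_t *restrict indices, int n)
{
  for (int i = 0; i < n; ++i)
    dest[indices[i]] = src[i] + 1;
}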
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c660ff1..2c6f9eb 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -2,6 +2,55 @@ Alan Hayward <alan.hayward@arm.com> David Sherwood <david.sherwood@arm.com> + * doc/sourcebuild.texi (vect_scatter_store): Document. + * optabs.def (scatter_store_optab, mask_scatter_store_optab): New + optabs. + * doc/md.texi (scatter_store@var{m}, mask_scatter_store@var{m}): + Document. + * genopinit.c (main): Add supports_vec_scatter_store and + supports_vec_scatter_store_cached to target_optabs. + * gimple.h (gimple_expr_type): Handle IFN_SCATTER_STORE and + IFN_MASK_SCATTER_STORE. + * internal-fn.def (SCATTER_STORE, MASK_SCATTER_STORE): New internal + functions. + * internal-fn.h (internal_store_fn_p): Declare. + (internal_fn_stored_value_index): Likewise. + * internal-fn.c (scatter_store_direct): New macro. + (expand_scatter_store_optab_fn): New function. + (direct_scatter_store_optab_supported_p): New macro. + (internal_store_fn_p): New function. + (internal_gather_scatter_fn_p): Handle IFN_SCATTER_STORE and + IFN_MASK_SCATTER_STORE. + (internal_fn_mask_index): Likewise. + (internal_fn_stored_value_index): New function. + (internal_gather_scatter_fn_supported_p): Adjust operand numbers + for scatter stores. + * optabs-query.h (supports_vec_scatter_store_p): Declare. + * optabs-query.c (supports_vec_scatter_store_p): New function. + * tree-vectorizer.h (vect_get_store_rhs): Declare. + * tree-vect-data-refs.c (vect_analyze_data_ref_access): Return + true for scatter stores. + (vect_gather_scatter_fn_p): Handle scatter stores too. + (vect_check_gather_scatter): Consider using scatter stores if + supports_vec_scatter_store_p. + * tree-vect-patterns.c (vect_try_gather_scatter_pattern): Handle + scatter stores too. + * tree-vect-stmts.c (exist_non_indexing_operands_for_use_p): Use + internal_fn_stored_value_index. + (check_load_store_masking): Handle scatter stores too. + (vect_get_store_rhs): Make public. + (vectorizable_call): Use internal_store_fn_p. + (vectorizable_store): Handle scatter store internal functions. + (vect_transform_stmt): Compare GROUP_STORE_COUNT with GROUP_SIZE + when deciding whether the end of the group has been reached. + * config/aarch64/aarch64.md (UNSPEC_ST1_SCATTER): New unspec. + * config/aarch64/aarch64-sve.md (scatter_store<mode>): New expander. + (mask_scatter_store<mode>): New insns. + +2018-01-13 Richard Sandiford <richard.sandiford@linaro.org> + Alan Hayward <alan.hayward@arm.com> + David Sherwood <david.sherwood@arm.com> + * tree-vectorizer.h (vect_gather_scatter_fn_p): Declare. * tree-vect-data-refs.c (vect_gather_scatter_fn_p): Make public. * tree-vect-stmts.c (vect_truncate_gather_scatter_offset): New diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 04ea25c..8da10c1 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -246,6 +246,63 @@ ld1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]" ) +;; Unpredicated scatter store. +(define_expand "scatter_store<mode>" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_dup 5) + (match_operand:DI 0 "aarch64_reg_or_zero") + (match_operand:<V_INT_EQUIV> 1 "register_operand") + (match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "aarch64_gather_scale_operand_<Vesize>") + (match_operand:SVE_SD 4 "register_operand")] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + { + operands[5] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode)); + } +) + +;; Predicated scatter stores for 32-bit elements. 
Operand 2 is true for +;; unsigned extension and false for signed extension. +(define_insn "mask_scatter_store<mode>" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand:<VPRED> 5 "register_operand" "Upl, Upl, Upl, Upl, Upl") + (match_operand:DI 0 "aarch64_reg_or_zero" "Z, rk, rk, rk, rk") + (match_operand:<V_INT_EQUIV> 1 "register_operand" "w, w, w, w, w") + (match_operand:DI 2 "const_int_operand" "i, Z, Ui1, Z, Ui1") + (match_operand:DI 3 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, i, i") + (match_operand:SVE_S 4 "register_operand" "w, w, w, w, w")] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + "@ + st1w\t%4.s, %5, [%1.s] + st1w\t%4.s, %5, [%0, %1.s, sxtw] + st1w\t%4.s, %5, [%0, %1.s, uxtw] + st1w\t%4.s, %5, [%0, %1.s, sxtw %p3] + st1w\t%4.s, %5, [%0, %1.s, uxtw %p3]" +) + +;; Predicated scatter stores for 64-bit elements. The value of operand 2 +;; doesn't matter in this case. +(define_insn "mask_scatter_store<mode>" + [(set (mem:BLK (scratch)) + (unspec:BLK + [(match_operand:<VPRED> 5 "register_operand" "Upl, Upl, Upl") + (match_operand:DI 0 "aarch64_reg_or_zero" "Z, rk, rk") + (match_operand:<V_INT_EQUIV> 1 "register_operand" "w, w, w") + (match_operand:DI 2 "const_int_operand") + (match_operand:DI 3 "aarch64_gather_scale_operand_d" "Ui1, Ui1, i") + (match_operand:SVE_D 4 "register_operand" "w, w, w")] + UNSPEC_ST1_SCATTER))] + "TARGET_SVE" + "@ + st1d\t%4.d, %5, [%1.d] + st1d\t%4.d, %5, [%0, %1.d] + st1d\t%4.d, %5, [%0, %1.d, lsl %p3]" +) + ;; SVE structure moves. (define_expand "mov<mode>" [(set (match_operand:SVE_STRUCT 0 "nonimmediate_operand") diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 455e04d..edb6a75 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -153,6 +153,7 @@ UNSPEC_ST1_SVE UNSPEC_LD1RQ UNSPEC_LD1_GATHER + UNSPEC_ST1_SCATTER UNSPEC_MERGE_PTRUE UNSPEC_PTEST_PTRUE UNSPEC_UNPACKSHI diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index 245fa90..f5167a1 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -4937,6 +4937,35 @@ operand 5. Bit @var{i} of the mask is set if element @var{i} of the result should be loaded from memory and clear if element @var{i} of the result should be set to zero. +@cindex @code{scatter_store@var{m}} instruction pattern +@item @samp{scatter_store@var{m}} +Store a vector of mode @var{m} into several distinct memory locations. +Operand 0 is a scalar base address and operand 1 is a vector of offsets +from that base. Operand 4 is the vector of values that should be stored, +which has the same number of elements as the offset. For each element +index @var{i}: + +@itemize @bullet +@item +extend the offset element @var{i} to address width, using zero +extension if operand 2 is 1 and sign extension if operand 2 is zero; +@item +multiply the extended offset by operand 3; +@item +add the result to the base; and +@item +store element @var{i} of operand 4 to that address. +@end itemize + +The value of operand 2 does not matter if the offsets are already +address width. + +@cindex @code{mask_scatter_store@var{m}} instruction pattern +@item @samp{mask_scatter_store@var{m}} +Like @samp{scatter_store@var{m}}, but takes an extra mask operand as +operand 5. Bit @var{i} of the mask is set if element @var{i} +of the result should be stored to memory. + @cindex @code{vec_set@var{m}} instruction pattern @item @samp{vec_set@var{m}} Set given field in the vector value. 
Operand 0 is the vector to modify, diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi index e02d4da..f0233c9 100644 --- a/gcc/doc/sourcebuild.texi +++ b/gcc/doc/sourcebuild.texi @@ -1421,6 +1421,9 @@ so that vector loops can handle partial as well as full vectors. @item vect_masked_store Target supports vector masked stores. +@item vect_scatter_store +Target supports vector scatter stores. + @item vect_aligned_arrays Target aligns arrays to vector alignment boundary. diff --git a/gcc/genopinit.c b/gcc/genopinit.c index dd4e3aa..65a38d2 100644 --- a/gcc/genopinit.c +++ b/gcc/genopinit.c @@ -239,6 +239,8 @@ main (int argc, const char **argv) " mode. */\n" " bool supports_vec_gather_load;\n" " bool supports_vec_gather_load_cached;\n" + " bool supports_vec_scatter_store;\n" + " bool supports_vec_scatter_store_cached;\n" "};\n" "extern void init_all_optabs (struct target_optabs *);\n" "\n" diff --git a/gcc/gimple.h b/gcc/gimple.h index 281015a..7460586 100644 --- a/gcc/gimple.h +++ b/gcc/gimple.h @@ -6355,11 +6355,18 @@ gimple_expr_type (const gimple *stmt) if (code == GIMPLE_CALL) { const gcall *call_stmt = as_a <const gcall *> (stmt); - if (gimple_call_internal_p (call_stmt) - && gimple_call_internal_fn (call_stmt) == IFN_MASK_STORE) - return TREE_TYPE (gimple_call_arg (call_stmt, 3)); - else - return gimple_call_return_type (call_stmt); + if (gimple_call_internal_p (call_stmt)) + switch (gimple_call_internal_fn (call_stmt)) + { + case IFN_MASK_STORE: + case IFN_SCATTER_STORE: + return TREE_TYPE (gimple_call_arg (call_stmt, 3)); + case IFN_MASK_SCATTER_STORE: + return TREE_TYPE (gimple_call_arg (call_stmt, 4)); + default: + break; + } + return gimple_call_return_type (call_stmt); } else if (code == GIMPLE_ASSIGN) { diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 8cf5b79..88adaea 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -87,6 +87,7 @@ init_internal_fns () #define mask_store_direct { 3, 2, false } #define store_lanes_direct { 0, 0, false } #define mask_store_lanes_direct { 0, 0, false } +#define scatter_store_direct { 3, 3, false } #define unary_direct { 0, 0, true } #define binary_direct { 0, 0, true } #define cond_unary_direct { 1, 1, true } @@ -2730,6 +2731,42 @@ expand_LAUNDER (internal_fn, gcall *call) expand_assignment (lhs, gimple_call_arg (call, 0), false); } +/* Expand {MASK_,}SCATTER_STORE{S,U} call CALL using optab OPTAB. 
*/ + +static void +expand_scatter_store_optab_fn (internal_fn, gcall *stmt, direct_optab optab) +{ + internal_fn ifn = gimple_call_internal_fn (stmt); + int rhs_index = internal_fn_stored_value_index (ifn); + int mask_index = internal_fn_mask_index (ifn); + tree base = gimple_call_arg (stmt, 0); + tree offset = gimple_call_arg (stmt, 1); + tree scale = gimple_call_arg (stmt, 2); + tree rhs = gimple_call_arg (stmt, rhs_index); + + rtx base_rtx = expand_normal (base); + rtx offset_rtx = expand_normal (offset); + HOST_WIDE_INT scale_int = tree_to_shwi (scale); + rtx rhs_rtx = expand_normal (rhs); + + struct expand_operand ops[6]; + int i = 0; + create_address_operand (&ops[i++], base_rtx); + create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE (offset))); + create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset))); + create_integer_operand (&ops[i++], scale_int); + create_input_operand (&ops[i++], rhs_rtx, TYPE_MODE (TREE_TYPE (rhs))); + if (mask_index >= 0) + { + tree mask = gimple_call_arg (stmt, mask_index); + rtx mask_rtx = expand_normal (mask); + create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE (mask))); + } + + insn_code icode = direct_optab_handler (optab, TYPE_MODE (TREE_TYPE (rhs))); + expand_insn (icode, i, ops); +} + /* Expand {MASK_,}GATHER_LOAD call CALL using optab OPTAB. */ static void @@ -3016,6 +3053,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_mask_store_optab_supported_p direct_optab_supported_p #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p +#define direct_scatter_store_optab_supported_p direct_optab_supported_p #define direct_while_optab_supported_p convert_optab_supported_p #define direct_fold_extract_optab_supported_p direct_optab_supported_p #define direct_fold_left_optab_supported_p direct_optab_supported_p @@ -3202,6 +3240,25 @@ internal_load_fn_p (internal_fn fn) } } +/* Return true if IFN is some form of store to memory. */ + +bool +internal_store_fn_p (internal_fn fn) +{ + switch (fn) + { + case IFN_MASK_STORE: + case IFN_STORE_LANES: + case IFN_MASK_STORE_LANES: + case IFN_SCATTER_STORE: + case IFN_MASK_SCATTER_STORE: + return true; + + default: + return false; + } +} + /* Return true if IFN is some form of gather load or scatter store. */ bool @@ -3211,6 +3268,8 @@ internal_gather_scatter_fn_p (internal_fn fn) { case IFN_GATHER_LOAD: case IFN_MASK_GATHER_LOAD: + case IFN_SCATTER_STORE: + case IFN_MASK_SCATTER_STORE: return true; default: @@ -3235,6 +3294,27 @@ internal_fn_mask_index (internal_fn fn) case IFN_MASK_GATHER_LOAD: return 3; + case IFN_MASK_SCATTER_STORE: + return 4; + + default: + return -1; + } +} + +/* If FN takes a value that should be stored to memory, return the index + of that argument, otherwise return -1. */ + +int +internal_fn_stored_value_index (internal_fn fn) +{ + switch (fn) + { + case IFN_MASK_STORE: + case IFN_SCATTER_STORE: + case IFN_MASK_SCATTER_STORE: + return 3; + default: return -1; } @@ -3259,9 +3339,12 @@ internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type, return false; optab optab = direct_internal_fn_optab (ifn); insn_code icode = direct_optab_handler (optab, TYPE_MODE (vector_type)); + int output_ops = internal_load_fn_p (ifn) ? 
1 : 0; return (icode != CODE_FOR_nothing - && insn_operand_matches (icode, 3, GEN_INT (offset_sign == UNSIGNED)) - && insn_operand_matches (icode, 4, GEN_INT (scale))); + && insn_operand_matches (icode, 2 + output_ops, + GEN_INT (offset_sign == UNSIGNED)) + && insn_operand_matches (icode, 3 + output_ops, + GEN_INT (scale))); } /* Expand STMT as though it were a call to internal function FN. */ diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index db81b83..5970d0e 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -53,6 +53,7 @@ along with GCC; see the file COPYING3. If not see - mask_store: currently just maskstore - store_lanes: currently just vec_store_lanes - mask_store_lanes: currently just vec_mask_store_lanes + - scatter_store: used for {mask_,}scatter_store - unary: a normal unary optab, such as vec_reverse_<mode> - binary: a normal binary optab, such as vec_interleave_lo_<mode> @@ -123,6 +124,10 @@ DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load) DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE, mask_gather_load, gather_load) +DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store) +DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0, + mask_scatter_store, scatter_store) + DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store) DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes) DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES, 0, diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h index c536d1f..67102fd 100644 --- a/gcc/internal-fn.h +++ b/gcc/internal-fn.h @@ -193,8 +193,10 @@ extern bool set_edom_supported_p (void); extern internal_fn get_conditional_internal_fn (tree_code); extern bool internal_load_fn_p (internal_fn); +extern bool internal_store_fn_p (internal_fn); extern bool internal_gather_scatter_fn_p (internal_fn); extern int internal_fn_mask_index (internal_fn); +extern int internal_fn_stored_value_index (internal_fn); extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree, tree, signop, int); diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c index b5c7a76..a8e10e6 100644 --- a/gcc/optabs-query.c +++ b/gcc/optabs-query.c @@ -711,3 +711,21 @@ supports_vec_gather_load_p () return this_fn_optabs->supports_vec_gather_load; } + +/* Return true if vec_scatter_store is available for at least one vector + mode. */ + +bool +supports_vec_scatter_store_p () +{ + if (this_fn_optabs->supports_vec_scatter_store_cached) + return this_fn_optabs->supports_vec_scatter_store; + + this_fn_optabs->supports_vec_scatter_store_cached = true; + + this_fn_optabs->supports_vec_scatter_store + = supports_at_least_one_mode_p (scatter_store_optab); + + return this_fn_optabs->supports_vec_scatter_store; +} + diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h index 2b02d07..a2574bb 100644 --- a/gcc/optabs-query.h +++ b/gcc/optabs-query.h @@ -192,6 +192,7 @@ bool can_atomic_exchange_p (machine_mode, bool); bool can_atomic_load_p (machine_mode); bool lshift_cheap_p (bool); bool supports_vec_gather_load_p (); +bool supports_vec_scatter_store_p (); /* Version of find_widening_optab_handler_and_mode that operates on specific mode types. 
*/ diff --git a/gcc/optabs.def b/gcc/optabs.def index 532cf97..2c30f0e 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -386,6 +386,8 @@ OPTAB_D (set_thread_pointer_optab, "set_thread_pointer$I$a") OPTAB_D (gather_load_optab, "gather_load$a") OPTAB_D (mask_gather_load_optab, "mask_gather_load$a") +OPTAB_D (scatter_store_optab, "scatter_store$a") +OPTAB_D (mask_scatter_store_optab, "mask_scatter_store$a") OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE) OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 20d84c2..9955037 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -2,6 +2,33 @@ Alan Hayward <alan.hayward@arm.com> David Sherwood <david.sherwood@arm.com> + * lib/target-supports.exp (check_effective_target_vect_scatter_store): + New proc. + * gcc.dg/vect/pr25413a.c: Expect both loops to be optimized on + targets with scatter stores. + * gcc.dg/vect/vect-71.c: Restrict XFAIL to targets without scatter + stores. + * gcc.target/aarch64/sve/mask_scatter_store_1.c: New test. + * gcc.target/aarch64/sve/mask_scatter_store_2.c: Likewise. + * gcc.target/aarch64/sve/scatter_store_1.c: Likewise. + * gcc.target/aarch64/sve/scatter_store_2.c: Likewise. + * gcc.target/aarch64/sve/scatter_store_3.c: Likewise. + * gcc.target/aarch64/sve/scatter_store_4.c: Likewise. + * gcc.target/aarch64/sve/scatter_store_5.c: Likewise. + * gcc.target/aarch64/sve/scatter_store_6.c: Likewise. + * gcc.target/aarch64/sve/scatter_store_7.c: Likewise. + * gcc.target/aarch64/sve/strided_store_1.c: Likewise. + * gcc.target/aarch64/sve/strided_store_2.c: Likewise. + * gcc.target/aarch64/sve/strided_store_3.c: Likewise. + * gcc.target/aarch64/sve/strided_store_4.c: Likewise. + * gcc.target/aarch64/sve/strided_store_5.c: Likewise. + * gcc.target/aarch64/sve/strided_store_6.c: Likewise. + * gcc.target/aarch64/sve/strided_store_7.c: Likewise. + +2018-01-13 Richard Sandiford <richard.sandiford@linaro.org> + Alan Hayward <alan.hayward@arm.com> + David Sherwood <david.sherwood@arm.com> + * gcc.target/aarch64/sve/reduc_strict_3.c: Expect FADDA to be used for double_reduc1. * gcc.target/aarch64/sve/strided_load_4.c: New test. diff --git a/gcc/testsuite/gcc.dg/vect/pr25413a.c b/gcc/testsuite/gcc.dg/vect/pr25413a.c index a80ca86..e444b2c 100644 --- a/gcc/testsuite/gcc.dg/vect/pr25413a.c +++ b/gcc/testsuite/gcc.dg/vect/pr25413a.c @@ -123,6 +123,7 @@ int main (void) return 0; } -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! vect_scatter_store } } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target vect_scatter_store } } } */ /* { dg-final { scan-tree-dump-times "vector alignment may not be reachable" 1 "vect" { target { ! vector_alignment_reachable } } } } */ /* { dg-final { scan-tree-dump-times "Alignment of access forced using versioning" 1 "vect" { target { ! vector_alignment_reachable } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-71.c b/gcc/testsuite/gcc.dg/vect/vect-71.c index 2d1a3ff..f155211 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-71.c +++ b/gcc/testsuite/gcc.dg/vect/vect-71.c @@ -36,4 +36,4 @@ int main (void) return main1 (); } -/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { ! 
vect_scatter_store } } } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_scatter_store_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_scatter_store_1.c new file mode 100644 index 0000000..c799943 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_scatter_store_1.c @@ -0,0 +1,51 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math --save-temps" } */ + +#include <stdint.h> + +#ifndef INDEX32 +#define INDEX32 int32_t +#define INDEX64 int64_t +#endif + +#define TEST_LOOP(DATA_TYPE, CMP_TYPE, BITS) \ + void \ + f_##DATA_TYPE##_##CMP_TYPE \ + (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \ + CMP_TYPE *restrict cmp1, CMP_TYPE *restrict cmp2, \ + INDEX##BITS *restrict indices, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + if (cmp1[i] == cmp2[i]) \ + dest[indices[i]] = src[i] + 1; \ + } + +#define TEST32(T, DATA_TYPE) \ + T (DATA_TYPE, int32_t, 32) \ + T (DATA_TYPE, uint32_t, 32) \ + T (DATA_TYPE, float, 32) + +#define TEST64(T, DATA_TYPE) \ + T (DATA_TYPE, int64_t, 64) \ + T (DATA_TYPE, uint64_t, 64) \ + T (DATA_TYPE, double, 64) + +#define TEST_ALL(T) \ + TEST32 (T, int32_t) \ + TEST32 (T, uint32_t) \ + TEST32 (T, float) \ + TEST64 (T, int64_t) \ + TEST64 (T, uint64_t) \ + TEST64 (T, double) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */ +/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+\.s, sxtw 2\]\n} 9 } } */ + +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */ +/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 9 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_scatter_store_2.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_scatter_store_2.c new file mode 100644 index 0000000..ba8e671 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_scatter_store_2.c @@ -0,0 +1,17 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -ffast-math --save-temps" } */ + +#define INDEX32 uint32_t +#define INDEX64 uint64_t + +#include "mask_scatter_store_1.c" + +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 36 } } */ +/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+\.s, uxtw 2\]\n} 9 } } */ + +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 36 } } */ +/* { dg-final { scan-assembler-times {\tcmpeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfcmeq\tp[0-7]\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+\.d, lsl 3\]\n} 9 } } */ diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_1.c b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_1.c new file mode 100644 index 0000000..65be5e6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_1.c @@ -0,0 +1,31 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#include <stdint.h> + +#ifndef INDEX32 +#define INDEX32 int32_t +#define INDEX64 int64_t +#endif + +#define TEST_LOOP(DATA_TYPE, BITS) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \ + INDEX##BITS *indices, int n) \ + { \ + for (int i = 9; i < n; ++i) \ + dest[indices[i]] = src[i] + 1; \ + } + +#define TEST_ALL(T) \ + T (int32_t, 32) \ + T (uint32_t, 32) \ + T (float, 32) \ + T (int64_t, 64) \ + T (uint64_t, 64) \ + T (double, 64) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_2.c b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_2.c new file mode 100644 index 0000000..5cb507c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_2.c @@ -0,0 +1,10 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#define INDEX32 uint32_t +#define INDEX64 uint64_t + +#include "scatter_store_1.c" + +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_3.c b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_3.c new file mode 100644 index 0000000..faa85df --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_3.c @@ -0,0 +1,32 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#include <stdint.h> + +#ifndef INDEX32 +#define INDEX32 int32_t +#define INDEX64 int64_t +#endif + +/* Invoked 18 times for each data size. 
*/ +#define TEST_LOOP(DATA_TYPE, BITS) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \ + INDEX##BITS *indices, int n) \ + { \ + for (int i = 9; i < n; ++i) \ + *(DATA_TYPE *) ((char *) dest + indices[i]) = src[i] + 1; \ + } + +#define TEST_ALL(T) \ + T (int32_t, 32) \ + T (uint32_t, 32) \ + T (float, 32) \ + T (int64_t, 64) \ + T (uint64_t, 64) \ + T (double, 64) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_4.c b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_4.c new file mode 100644 index 0000000..8dff57c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_4.c @@ -0,0 +1,10 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#define INDEX32 uint32_t +#define INDEX64 uint64_t + +#include "scatter_store_3.c" + +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_5.c b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_5.c new file mode 100644 index 0000000..0962a72 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_5.c @@ -0,0 +1,23 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#include <stdint.h> + +/* Invoked 18 times for each data size. */ +#define TEST_LOOP(DATA_TYPE) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE (DATA_TYPE *restrict *dest, DATA_TYPE *restrict src, \ + int n) \ + { \ + for (int i = 9; i < n; ++i) \ + *dest[i] = src[i] + 1; \ + } + +#define TEST_ALL(T) \ + T (int64_t) \ + T (uint64_t) \ + T (double) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[z[0-9]+.d\]\n} 3 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_6.c b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_6.c new file mode 100644 index 0000000..ee31562 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_6.c @@ -0,0 +1,36 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -fwrapv --save-temps" } */ + +#include <stdint.h> + +#ifndef INDEX32 +#define INDEX16 int16_t +#define INDEX32 int32_t +#endif + +/* Invoked 18 times for each data size. 
*/ +#define TEST_LOOP(DATA_TYPE, BITS) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \ + INDEX##BITS *indices, INDEX##BITS mask, int n) \ + { \ + for (int i = 9; i < n; ++i) \ + dest[(INDEX##BITS) (indices[i] | mask)] = src[i] + 1; \ + } + +#define TEST_ALL(T) \ + T (int32_t, 16) \ + T (uint32_t, 16) \ + T (float, 16) \ + T (int64_t, 32) \ + T (uint64_t, 32) \ + T (double, 32) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tsunpkhi\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsunpklo\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tsunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_7.c b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_7.c new file mode 100644 index 0000000..784921e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/scatter_store_7.c @@ -0,0 +1,15 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#define INDEX16 uint16_t +#define INDEX32 uint32_t + +#include "scatter_store_6.c" + +/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.s, z[0-9]+\.h\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 3 } } */ +/* Either extension type is OK here. 
*/ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, [us]xtw 2\]\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_1.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_1.c new file mode 100644 index 0000000..4cd55ce --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_1.c @@ -0,0 +1,40 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#include <stdint.h> + +#ifndef INDEX8 +#define INDEX8 int8_t +#define INDEX16 int16_t +#define INDEX32 int32_t +#define INDEX64 int64_t +#endif + +#define TEST_LOOP(DATA_TYPE, BITS) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, \ + DATA_TYPE *restrict src, \ + INDEX##BITS stride, INDEX##BITS n) \ + { \ + for (INDEX##BITS i = 0; i < n; ++i) \ + dest[i * stride] = src[i] + 1; \ + } + +#define TEST_TYPE(T, DATA_TYPE) \ + T (DATA_TYPE, 8) \ + T (DATA_TYPE, 16) \ + T (DATA_TYPE, 32) \ + T (DATA_TYPE, 64) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, double) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 9 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 12 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c new file mode 100644 index 0000000..f0ea58e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c @@ -0,0 +1,18 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#define INDEX8 uint8_t +#define INDEX16 uint16_t +#define INDEX32 uint32_t +#define INDEX64 uint64_t + +#include "strided_store_1.c" + +/* 8 and 16 bits are signed because the multiplication promotes to int. + Using uxtw for all 9 would be OK. */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */ +/* The 32-bit loop needs to honor the defined overflow in uint32_t, + so we vectorize the offset calculation. This means that the + 64-bit version needs two copies. 
*/ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_3.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_3.c new file mode 100644 index 0000000..68835af --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_3.c @@ -0,0 +1,33 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#include <stdint.h> + +#define TEST_LOOP(DATA_TYPE, OTHER_TYPE) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, \ + DATA_TYPE *restrict src, \ + OTHER_TYPE *restrict other, \ + OTHER_TYPE mask, \ + int stride, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + dest[i * stride] = src[i] + (OTHER_TYPE) (other[i] | mask); \ + } + +#define TEST_ALL(T) \ + T (int32_t, int16_t) \ + T (uint32_t, int16_t) \ + T (float, int16_t) \ + T (int64_t, int32_t) \ + T (uint64_t, int32_t) \ + T (double, int32_t) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 1\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 9 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 6 } } */ + +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 3\]\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_4.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_4.c new file mode 100644 index 0000000..48d83a3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_4.c @@ -0,0 +1,33 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#include <stdint.h> + +#define TEST_LOOP(DATA_TYPE, NAME, SCALE) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \ + DATA_TYPE *restrict src, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + dest[i * SCALE] = src[i] + 1; \ + } + +#define TEST_TYPE(T, DATA_TYPE) \ + T (DATA_TYPE, 5, 5) \ + T (DATA_TYPE, 7, 7) \ + T (DATA_TYPE, 11, 11) \ + T (DATA_TYPE, 200, 200) \ + T (DATA_TYPE, m100, -100) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, double) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 15 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_5.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_5.c new file mode 100644 index 0000000..ea7756e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_5.c @@ -0,0 +1,34 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */ + +#include <stdint.h> + +#define TEST_LOOP(DATA_TYPE, NAME, SCALE) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \ + DATA_TYPE 
*restrict src, long n) \ + { \ + for (long i = 0; i < n; ++i) \ + dest[i * SCALE] = src[i] + 1; \ + } + +#define TEST_TYPE(T, DATA_TYPE) \ + T (DATA_TYPE, 5, 5) \ + T (DATA_TYPE, 7, 7) \ + T (DATA_TYPE, 11, 11) \ + T (DATA_TYPE, 200, 200) \ + T (DATA_TYPE, m100, -100) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, double) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_6.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_6.c new file mode 100644 index 0000000..111d525 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_6.c @@ -0,0 +1,7 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable --save-temps" } */ + +#include "strided_store_5.c" + +/* { dg-final { scan-assembler-not {\[x[0-9]+, z[0-9]+\.s} } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_7.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_7.c new file mode 100644 index 0000000..9f7ce83 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_7.c @@ -0,0 +1,34 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#include <stdint.h> + +#define TEST_LOOP(DATA_TYPE, NAME, SCALE) \ + void __attribute__ ((noinline, noclone)) \ + f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \ + DATA_TYPE *restrict src) \ + { \ + for (long i = 0; i < 1000; ++i) \ + dest[i * SCALE] = src[i] + 1; \ + } + +#define TEST_TYPE(T, DATA_TYPE) \ + T (DATA_TYPE, 5, 5) \ + T (DATA_TYPE, 7, 7) \ + T (DATA_TYPE, 11, 11) \ + T (DATA_TYPE, 200, 200) \ + T (DATA_TYPE, m100, -100) + +#define TEST_ALL(T) \ + TEST_TYPE (T, int32_t) \ + TEST_TYPE (T, uint32_t) \ + TEST_TYPE (T, float) \ + TEST_TYPE (T, int64_t) \ + TEST_TYPE (T, uint64_t) \ + TEST_TYPE (T, double) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */ diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 850260f..98a5eb7 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -6600,6 +6600,12 @@ proc check_effective_target_vect_masked_store { } { return [check_effective_target_aarch64_sve] } +# Return 1 if the target supports vector scatter stores. + +proc check_effective_target_vect_scatter_store { } { + return [check_effective_target_aarch64_sve] +} + # Return 1 if the target supports vector conditional operations, 0 otherwise. 
proc check_effective_target_vect_condition { } { diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index daa8b0c..c6bfe45 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -2648,6 +2648,9 @@ vect_analyze_data_ref_access (struct data_reference *dr) loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); struct loop *loop = NULL; + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + return true; + if (loop_vinfo) loop = LOOP_VINFO_LOOP (loop_vinfo); @@ -3336,7 +3339,7 @@ vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype, if (read_p) ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD; else - return false; + ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE; /* Test whether the target supports this combination. */ if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type, @@ -3408,7 +3411,8 @@ vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, /* True if we should aim to use internal functions rather than built-in functions. */ bool use_ifn_p = (DR_IS_READ (dr) - && supports_vec_gather_load_p ()); + ? supports_vec_gather_load_p () + : supports_vec_scatter_store_p ()); base = DR_REF (dr); /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF, @@ -3727,7 +3731,8 @@ again: bool maybe_scatter = DR_IS_WRITE (dr) && !TREE_THIS_VOLATILE (DR_REF (dr)) - && targetm.vectorize.builtin_scatter != NULL; + && (targetm.vectorize.builtin_scatter != NULL + || supports_vec_scatter_store_p ()); bool maybe_simd_lane_access = is_a <loop_vec_info> (vinfo) && loop->simduid; diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c index f4b1b3e..0831b7e 100644 --- a/gcc/tree-vect-patterns.c +++ b/gcc/tree-vect-patterns.c @@ -4235,10 +4235,6 @@ vect_try_gather_scatter_pattern (gimple *stmt, stmt_vec_info last_stmt_info, if (!dr || !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) return NULL; - /* Reject stores for now. */ - if (!DR_IS_READ (dr)) - return NULL; - /* Get the boolean that controls whether the load or store happens. This is null if the operation is unconditional. */ tree mask = vect_get_load_store_mask (stmt); @@ -4278,8 +4274,16 @@ vect_try_gather_scatter_pattern (gimple *stmt, stmt_vec_info last_stmt_info, gimple_call_set_lhs (pattern_stmt, load_lhs); } else - /* Not yet supported. 
*/ - gcc_unreachable (); + { + tree rhs = vect_get_store_rhs (stmt); + if (mask != NULL) + pattern_stmt = gimple_build_call_internal (IFN_MASK_SCATTER_STORE, 5, + base, offset, scale, rhs, + mask); + else + pattern_stmt = gimple_build_call_internal (IFN_SCATTER_STORE, 4, + base, offset, scale, rhs); + } gimple_call_set_nothrow (pattern_stmt, true); /* Copy across relevant vectorization info and associate DR with the diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index df58834..0f74772 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -397,12 +397,13 @@ exist_non_indexing_operands_for_use_p (tree use, gimple *stmt) if (mask_index >= 0 && use == gimple_call_arg (stmt, mask_index)) return true; + int stored_value_index = internal_fn_stored_value_index (ifn); + if (stored_value_index >= 0 + && use == gimple_call_arg (stmt, stored_value_index)) + return true; if (internal_gather_scatter_fn_p (ifn) && use == gimple_call_arg (stmt, 1)) return true; - if (ifn == IFN_MASK_STORE - && use == gimple_call_arg (stmt, 3)) - return true; } return false; } @@ -1765,10 +1766,11 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, if (memory_access_type == VMAT_GATHER_SCATTER) { - gcc_assert (is_load); + internal_fn ifn = (is_load + ? IFN_MASK_GATHER_LOAD + : IFN_MASK_SCATTER_STORE); tree offset_type = TREE_TYPE (gs_info->offset); - if (!internal_gather_scatter_fn_supported_p (IFN_MASK_GATHER_LOAD, - vectype, + if (!internal_gather_scatter_fn_supported_p (ifn, vectype, gs_info->memory_type, TYPE_SIGN (offset_type), gs_info->scale)) @@ -1777,7 +1779,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype, dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "can't use a fully-masked loop because the" " target doesn't have an appropriate masked" - " gather load instruction.\n"); + " gather load or scatter store instruction.\n"); LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; return; } @@ -2021,7 +2023,7 @@ perm_mask_for_reverse (tree vectype) /* STMT is either a masked or unconditional store. Return the value being stored. */ -static tree +tree vect_get_store_rhs (gimple *stmt) { if (gassign *assign = dyn_cast <gassign *> (stmt)) @@ -2032,8 +2034,9 @@ vect_get_store_rhs (gimple *stmt) if (gcall *call = dyn_cast <gcall *> (stmt)) { internal_fn ifn = gimple_call_internal_fn (call); - gcc_assert (ifn == IFN_MASK_STORE); - return gimple_call_arg (stmt, 3); + int index = internal_fn_stored_value_index (ifn); + gcc_assert (index >= 0); + return gimple_call_arg (stmt, index); } gcc_unreachable (); } @@ -3023,7 +3026,7 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt, if (gimple_call_internal_p (stmt) && (internal_load_fn_p (gimple_call_internal_fn (stmt)) - || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)) + || internal_store_fn_p (gimple_call_internal_fn (stmt)))) /* Handled by vectorizable_load and vectorizable_store. 
*/ return false; @@ -6109,7 +6112,11 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, else { gcall *call = dyn_cast <gcall *> (stmt); - if (!call || !gimple_call_internal_p (call, IFN_MASK_STORE)) + if (!call || !gimple_call_internal_p (call)) + return false; + + internal_fn ifn = gimple_call_internal_fn (call); + if (!internal_store_fn_p (ifn)) return false; if (slp_node != NULL) @@ -6120,10 +6127,13 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, return false; } - ref_type = TREE_TYPE (gimple_call_arg (call, 1)); - mask = gimple_call_arg (call, 2); - if (!vect_check_load_store_mask (stmt, mask, &mask_vectype)) - return false; + int mask_index = internal_fn_mask_index (ifn); + if (mask_index >= 0) + { + mask = gimple_call_arg (call, mask_index); + if (!vect_check_load_store_mask (stmt, mask, &mask_vectype)) + return false; + } } op = vect_get_store_rhs (stmt); @@ -6185,7 +6195,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, TYPE_MODE (mask_vectype), false)) return false; } - else if (memory_access_type != VMAT_LOAD_STORE_LANES) + else if (memory_access_type != VMAT_LOAD_STORE_LANES + && (memory_access_type != VMAT_GATHER_SCATTER || gs_info.decl)) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -6201,7 +6212,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, return false; } - grouped_store = STMT_VINFO_GROUPED_ACCESS (stmt_info); + grouped_store = (STMT_VINFO_GROUPED_ACCESS (stmt_info) + && memory_access_type != VMAT_GATHER_SCATTER); if (grouped_store) { first_stmt = GROUP_FIRST_ELEMENT (stmt_info); @@ -6237,7 +6249,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, ensure_base_align (dr); - if (memory_access_type == VMAT_GATHER_SCATTER) + if (memory_access_type == VMAT_GATHER_SCATTER && gs_info.decl) { tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, src; tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); @@ -6387,10 +6399,14 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, return true; } - if (grouped_store) + if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) { - GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++; + gimple *group_stmt = GROUP_FIRST_ELEMENT (stmt_info); + GROUP_STORE_COUNT (vinfo_for_stmt (group_stmt))++; + } + if (grouped_store) + { /* FORNOW */ gcc_assert (!loop || !nested_in_vect_loop_p (loop, stmt)); @@ -6690,10 +6706,27 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, || memory_access_type == VMAT_CONTIGUOUS_REVERSE) offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1); - if (memory_access_type == VMAT_LOAD_STORE_LANES) - aggr_type = build_array_type_nelts (elem_type, vec_num * nunits); + tree bump; + tree vec_offset = NULL_TREE; + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + { + aggr_type = NULL_TREE; + bump = NULL_TREE; + } + else if (memory_access_type == VMAT_GATHER_SCATTER) + { + aggr_type = elem_type; + vect_get_strided_load_store_ops (stmt, loop_vinfo, &gs_info, + &bump, &vec_offset); + } else - aggr_type = vectype; + { + if (memory_access_type == VMAT_LOAD_STORE_LANES) + aggr_type = build_array_type_nelts (elem_type, vec_num * nunits); + else + aggr_type = vectype; + bump = vect_get_data_ptr_increment (dr, aggr_type, memory_access_type); + } if (mask) LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true; @@ -6798,12 +6831,19 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator 
*gsi, gimple **vec_stmt, dataref_offset = build_int_cst (ref_type, 0); inv_p = false; } + else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + { + vect_get_gather_scatter_ops (loop, stmt, &gs_info, + &dataref_ptr, &vec_offset); + inv_p = false; + } else dataref_ptr = vect_create_data_ref_ptr (first_stmt, aggr_type, simd_lane_access_p ? loop : NULL, offset, &dummy, gsi, &ptr_incr, - simd_lane_access_p, &inv_p); + simd_lane_access_p, &inv_p, + NULL_TREE, bump); gcc_assert (bb_vinfo || !inv_p); } else @@ -6830,11 +6870,17 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, } if (dataref_offset) dataref_offset - = int_const_binop (PLUS_EXPR, dataref_offset, - TYPE_SIZE_UNIT (aggr_type)); + = int_const_binop (PLUS_EXPR, dataref_offset, bump); + else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + { + gimple *def_stmt; + vect_def_type dt; + vect_is_simple_use (vec_offset, loop_vinfo, &def_stmt, &dt); + vec_offset = vect_get_vec_def_for_stmt_copy (dt, vec_offset); + } else dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt, - TYPE_SIZE_UNIT (aggr_type)); + bump); } if (memory_access_type == VMAT_LOAD_STORE_LANES) @@ -6906,10 +6952,28 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, final_mask = prepare_load_store_mask (mask_vectype, final_mask, vec_mask, gsi); + if (memory_access_type == VMAT_GATHER_SCATTER) + { + tree scale = size_int (gs_info.scale); + gcall *call; + if (masked_loop_p) + call = gimple_build_call_internal + (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset, + scale, vec_oprnd, final_mask); + else + call = gimple_build_call_internal + (IFN_SCATTER_STORE, 4, dataref_ptr, vec_offset, + scale, vec_oprnd); + gimple_call_set_nothrow (call, true); + new_stmt = call; + vect_finish_stmt_generation (stmt, new_stmt, gsi); + break; + } + if (i > 0) /* Bump the vector pointer. */ dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, - stmt, NULL_TREE); + stmt, bump); if (slp) vec_oprnd = vec_oprnds[i]; @@ -9407,9 +9471,11 @@ vect_transform_stmt (gimple *stmt, gimple_stmt_iterator *gsi, one are skipped, and there vec_stmt_info shouldn't be freed meanwhile. */ *grouped_store = true; - if (STMT_VINFO_VEC_STMT (stmt_info)) + stmt_vec_info group_info + = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)); + if (GROUP_STORE_COUNT (group_info) == GROUP_SIZE (group_info)) is_store = true; - } + } else is_store = true; break; diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 903e56e..d3dda52 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1413,6 +1413,7 @@ extern void vect_finish_replace_stmt (gimple *, gimple *); extern void vect_finish_stmt_generation (gimple *, gimple *, gimple_stmt_iterator *); extern bool vect_mark_stmts_to_be_vectorized (loop_vec_info); +extern tree vect_get_store_rhs (gimple *); extern tree vect_get_vec_def_for_operand_1 (gimple *, enum vect_def_type); extern tree vect_get_vec_def_for_operand (tree, gimple *, tree = NULL); extern void vect_get_vec_defs (tree, tree, gimple *, vec<tree> *, |
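To round off, the interface the patch establishes, shown on a masked variant (a sketch modelled on the new mask_scatter_store_1.c test; names are illustrative). The pattern recognizer turns the guarded store into IFN_MASK_SCATTER_STORE (base, offsets, scale, values, mask), and expand_scatter_store_optab_fn maps those arguments, plus the signedness of the offset type, onto operands 0-5 of the mask_scatter_store<mode> pattern documented in md.texi above:

#include <stdint.h>

/* Lanes with cond[i] == 0 must not be written, so the vectorizer
   needs the masked form: the comparison becomes an SVE predicate
   register that guards a scatter such as
   st1w z0.s, p0, [x0, z1.s, sxtw 2].  */
void
masked_scatter_example (int32_t *restrict dest, int32_t *restrict src,
                        int32_t *restrict cond,
                        int32_t *restrict indices, int n)
{
  for (int i = 0; i < n; ++i)
    if (cond[i] != 0)
      dest[indices[i]] = src[i] + 1;
}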