Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/ChangeLog                                          |  17
-rw-r--r-- | gcc/testsuite/ChangeLog                                |  11
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c  |   9
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/strided_load_4.c  |  33
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/strided_load_5.c  |  34
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/strided_load_6.c  |   7
-rw-r--r-- | gcc/testsuite/gcc.target/aarch64/sve/strided_load_7.c  |  34
-rw-r--r-- | gcc/tree-vect-data-refs.c                              |   2
-rw-r--r-- | gcc/tree-vect-stmts.c                                  | 127
-rw-r--r-- | gcc/tree-vectorizer.h                                  |   2
10 files changed, 263 insertions, 13 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index fb7a205..c660ff1 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -2,6 +2,23 @@
         Alan Hayward  <alan.hayward@arm.com>
         David Sherwood  <david.sherwood@arm.com>
 
+        * tree-vectorizer.h (vect_gather_scatter_fn_p): Declare.
+        * tree-vect-data-refs.c (vect_gather_scatter_fn_p): Make public.
+        * tree-vect-stmts.c (vect_truncate_gather_scatter_offset): New
+        function.
+        (vect_use_strided_gather_scatters_p): Take a masked_p argument.
+        Use vect_truncate_gather_scatter_offset if we can't treat the
+        operation as a normal gather load or scatter store.
+        (get_group_load_store_type): Take the gather_scatter_info
+        as argument.  Try using a gather load or scatter store for
+        single-element groups.
+        (get_load_store_type): Update calls to get_group_load_store_type
+        and vect_use_strided_gather_scatters_p.
+
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+            Alan Hayward  <alan.hayward@arm.com>
+            David Sherwood  <david.sherwood@arm.com>
+
         * tree-vectorizer.h (vect_create_data_ref_ptr): Take an extra
         optional tree argument.
         * tree-vect-data-refs.c (vect_check_gather_scatter): Check for
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 30bcb7a..20d84c2 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -2,6 +2,17 @@
         Alan Hayward  <alan.hayward@arm.com>
         David Sherwood  <david.sherwood@arm.com>
 
+        * gcc.target/aarch64/sve/reduc_strict_3.c: Expect FADDA to be used
+        for double_reduc1.
+        * gcc.target/aarch64/sve/strided_load_4.c: New test.
+        * gcc.target/aarch64/sve/strided_load_5.c: Likewise.
+        * gcc.target/aarch64/sve/strided_load_6.c: Likewise.
+        * gcc.target/aarch64/sve/strided_load_7.c: Likewise.
+
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+            Alan Hayward  <alan.hayward@arm.com>
+            David Sherwood  <david.sherwood@arm.com>
+
         * gcc.target/aarch64/sve/strided_load_1.c: New test.
         * gcc.target/aarch64/sve/strided_load_2.c: Likewise.
         * gcc.target/aarch64/sve/strided_load_3.c: Likewise.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c
index a28145f..a718e9d 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c
@@ -118,14 +118,11 @@ double_reduc3 (float *restrict i, float *restrict j)
   return l * k;
 }
 
-/* We can't yet handle double_reduc1.  */
-/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 3 } } */
+/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 4 } } */
 /* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d} 9 } } */
 /* 1 reduction each for double_reduc{1,2} and 2 for double_reduc3.  Each one
    is reported three times, once for SVE, once for 128-bit AdvSIMD and once
    for 64-bit AdvSIMD.  */
 /* { dg-final { scan-tree-dump-times "Detected double reduction" 12 "vect" } } */
-/* double_reduc2 has 2 reductions and slp_non_chained_reduc has 3.
-   double_reduc1 is reported 3 times (SVE, 128-bit AdvSIMD, 64-bit AdvSIMD)
-   before failing.  */
-/* { dg-final { scan-tree-dump-times "Detected reduction" 12 "vect" } } */
+/* double_reduc2 has 2 reductions and slp_non_chained_reduc has 3.  */
+/* { dg-final { scan-tree-dump-times "Detected reduction" 10 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_4.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_4.c
new file mode 100644
index 0000000..0eff384
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_4.c
@@ -0,0 +1,33 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE)                \
+  void __attribute__ ((noinline, noclone))               \
+  f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest,      \
+                          DATA_TYPE *restrict src, int n) \
+  {                                                      \
+    for (int i = 0; i < n; ++i)                          \
+      dest[i] += src[i * SCALE];                         \
+  }
+
+#define TEST_TYPE(T, DATA_TYPE)  \
+  T (DATA_TYPE, 5, 5)            \
+  T (DATA_TYPE, 7, 7)            \
+  T (DATA_TYPE, 11, 11)          \
+  T (DATA_TYPE, 200, 200)        \
+  T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T)              \
+  TEST_TYPE (T, int32_t)         \
+  TEST_TYPE (T, uint32_t)        \
+  TEST_TYPE (T, float)           \
+  TEST_TYPE (T, int64_t)         \
+  TEST_TYPE (T, uint64_t)        \
+  TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_5.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_5.c
new file mode 100644
index 0000000..415b466
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_5.c
@@ -0,0 +1,34 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE)                 \
+  void __attribute__ ((noinline, noclone))                \
+  f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest,       \
+                          DATA_TYPE *restrict src, long n) \
+  {                                                       \
+    for (long i = 0; i < n; ++i)                          \
+      dest[i] += src[i * SCALE];                          \
+  }
+
+#define TEST_TYPE(T, DATA_TYPE)  \
+  T (DATA_TYPE, 5, 5)            \
+  T (DATA_TYPE, 7, 7)            \
+  T (DATA_TYPE, 11, 11)          \
+  T (DATA_TYPE, 200, 200)        \
+  T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T)              \
+  TEST_TYPE (T, int32_t)         \
+  TEST_TYPE (T, uint32_t)        \
+  TEST_TYPE (T, float)           \
+  TEST_TYPE (T, int64_t)         \
+  TEST_TYPE (T, uint64_t)        \
+  TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_6.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_6.c
new file mode 100644
index 0000000..9e00015
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_6.c
@@ -0,0 +1,7 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable --save-temps" } */
+
+#include "strided_load_5.c"
+
+/* { dg-final { scan-assembler-not {\[x[0-9]+, z[0-9]+\.s} } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_7.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_7.c
new file mode 100644
index 0000000..3a36367
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_7.c
@@ -0,0 +1,34 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE)           \
+  void __attribute__ ((noinline, noclone))          \
+  f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest, \
+                          DATA_TYPE *restrict src)  \
+  {                                                 \
+    for (long i = 0; i < 1000; ++i)                 \
+      dest[i] += src[i * SCALE];                    \
+  }
+
+#define TEST_TYPE(T, DATA_TYPE)  \
+  T (DATA_TYPE, 5, 5)            \
+  T (DATA_TYPE, 7, 7)            \
+  T (DATA_TYPE, 11, 11)          \
+  T (DATA_TYPE, 200, 200)        \
+  T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T)              \
+  TEST_TYPE (T, int32_t)         \
+  TEST_TYPE (T, uint32_t)        \
+  TEST_TYPE (T, float)           \
+  TEST_TYPE (T, int64_t)         \
+  TEST_TYPE (T, uint64_t)        \
+  TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 69721a9..daa8b0c 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -3312,7 +3312,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
    Return true if the function is supported, storing the function id
    in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
 
-static bool
+bool
 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
                           tree memory_type, unsigned int offset_bits,
                           signop offset_sign, int scale,
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 079cbdd..df58834 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1849,18 +1849,117 @@ prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
   return and_res;
 }
 
+/* Determine whether we can use a gather load or scatter store to vectorize
+   strided load or store STMT by truncating the current offset to a smaller
+   width.  We need to be able to construct an offset vector:
+
+     { 0, X, X*2, X*3, ... }
+
+   without loss of precision, where X is STMT's DR_STEP.
+
+   Return true if this is possible, describing the gather load or scatter
+   store in GS_INFO.  MASKED_P is true if the load or store is conditional.  */
+
+static bool
+vect_truncate_gather_scatter_offset (gimple *stmt, loop_vec_info loop_vinfo,
+                                     bool masked_p,
+                                     gather_scatter_info *gs_info)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+  tree step = DR_STEP (dr);
+  if (TREE_CODE (step) != INTEGER_CST)
+    {
+      /* ??? Perhaps we could use range information here?  */
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location,
+                         "cannot truncate variable step.\n");
+      return false;
+    }
+
+  /* Get the number of bits in an element.  */
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  scalar_mode element_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
+  unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
+
+  /* Set COUNT to the upper limit on the number of elements - 1.
+     Start with the maximum vectorization factor.  */
+  unsigned HOST_WIDE_INT count = vect_max_vf (loop_vinfo) - 1;
+
+  /* Try lowering COUNT to the number of scalar latch iterations.  */
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  widest_int max_iters;
+  if (max_loop_iterations (loop, &max_iters)
+      && max_iters < count)
+    count = max_iters.to_shwi ();
+
+  /* Try scales of 1 and the element size.  */
+  int scales[] = { 1, vect_get_scalar_dr_size (dr) };
+  bool overflow_p = false;
+  for (int i = 0; i < 2; ++i)
+    {
+      int scale = scales[i];
+      widest_int factor;
+      if (!wi::multiple_of_p (wi::to_widest (step), scale, SIGNED, &factor))
+        continue;
+
+      /* See whether we can calculate (COUNT - 1) * STEP / SCALE
+         in OFFSET_BITS bits.  */
+      widest_int range = wi::mul (count, factor, SIGNED, &overflow_p);
+      if (overflow_p)
+        continue;
+      signop sign = range >= 0 ? UNSIGNED : SIGNED;
+      if (wi::min_precision (range, sign) > element_bits)
+        {
+          overflow_p = true;
+          continue;
+        }
+
+      /* See whether the target supports the operation.  */
+      tree memory_type = TREE_TYPE (DR_REF (dr));
+      if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
+                                     memory_type, element_bits, sign, scale,
+                                     &gs_info->ifn, &gs_info->element_type))
+        continue;
+
+      tree offset_type = build_nonstandard_integer_type (element_bits,
+                                                         sign == UNSIGNED);
+
+      gs_info->decl = NULL_TREE;
+      /* Logically the sum of DR_BASE_ADDRESS, DR_INIT and DR_OFFSET,
+         but we don't need to store that here.  */
+      gs_info->base = NULL_TREE;
+      gs_info->offset = fold_convert (offset_type, step);
+      gs_info->offset_dt = vect_unknown_def_type;
+      gs_info->offset_vectype = NULL_TREE;
+      gs_info->scale = scale;
+      gs_info->memory_type = memory_type;
+      return true;
+    }
+
+  if (overflow_p && dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                     "truncating gather/scatter offset to %d bits"
+                     " might change its value.\n", element_bits);
+
+  return false;
+}
+
 /* Return true if we can use gather/scatter internal functions to
    vectorize STMT, which is a grouped or strided load or store.
-   When returning true, fill in GS_INFO with the information required
-   to perform the operation.  */
+   MASKED_P is true if load or store is conditional.  When returning
+   true, fill in GS_INFO with the information required to perform the
+   operation.  */
 
 static bool
 vect_use_strided_gather_scatters_p (gimple *stmt, loop_vec_info loop_vinfo,
+                                    bool masked_p,
                                     gather_scatter_info *gs_info)
 {
   if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info)
       || gs_info->decl)
-    return false;
+    return vect_truncate_gather_scatter_offset (stmt, loop_vinfo,
+                                                masked_p, gs_info);
 
   scalar_mode element_mode = SCALAR_TYPE_MODE (gs_info->element_type);
   unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
@@ -1951,7 +2050,8 @@ vect_get_store_rhs (gimple *stmt)
 static bool
 get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
                            bool masked_p, vec_load_store_type vls_type,
-                           vect_memory_access_type *memory_access_type)
+                           vect_memory_access_type *memory_access_type,
+                           gather_scatter_info *gs_info)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   vec_info *vinfo = stmt_info->vinfo;
@@ -2073,6 +2173,20 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
           overrun_p = would_overrun_p;
         }
     }
+
+  /* As a last resort, trying using a gather load or scatter store.
+
+     ??? Although the code can handle all group sizes correctly,
+     it probably isn't a win to use separate strided accesses based
+     on nearby locations.  Or, even if it's a win over scalar code,
+     it might not be a win over vectorizing at a lower VF, if that
+     allows us to use contiguous accesses.  */
+  if (*memory_access_type == VMAT_ELEMENTWISE
+      && single_element_p
+      && loop_vinfo
+      && vect_use_strided_gather_scatters_p (stmt, loop_vinfo,
+                                             masked_p, gs_info))
+    *memory_access_type = VMAT_GATHER_SCATTER;
     }
 
   if (vls_type != VLS_LOAD && first_stmt == stmt)
@@ -2200,14 +2314,15 @@ get_load_store_type (gimple *stmt, tree vectype, bool slp, bool masked_p,
   else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
       if (!get_group_load_store_type (stmt, vectype, slp, masked_p, vls_type,
-                                      memory_access_type))
+                                      memory_access_type, gs_info))
         return false;
     }
   else if (STMT_VINFO_STRIDED_P (stmt_info))
     {
      gcc_assert (!slp);
      if (loop_vinfo
-          && vect_use_strided_gather_scatters_p (stmt, loop_vinfo, gs_info))
+          && vect_use_strided_gather_scatters_p (stmt, loop_vinfo,
+                                                 masked_p, gs_info))
        *memory_access_type = VMAT_GATHER_SCATTER;
      else
        *memory_access_type = VMAT_ELEMENTWISE;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index c661578..903e56e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1455,6 +1455,8 @@ extern bool vect_verify_datarefs_alignment (loop_vec_info);
 extern bool vect_slp_analyze_and_verify_instance_alignment (slp_instance);
 extern bool vect_analyze_data_ref_accesses (vec_info *);
 extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
+extern bool vect_gather_scatter_fn_p (bool, bool, tree, tree, unsigned int,
+                                      signop, int, internal_fn *, tree *);
 extern bool vect_check_gather_scatter (gimple *, loop_vec_info,
                                        gather_scatter_info *);
 extern bool vect_analyze_data_refs (vec_info *, poly_uint64 *);
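For reference, the loop shape that the new VMAT_GATHER_SCATTER path targets is the one the strided_load_* tests above instantiate through their TEST_LOOP macro: a single-element access with a constant stride.  A minimal standalone sketch follows; the function name and the flags in the comment are illustrative and not part of the patch.

/* Illustrative example, not part of the patch.  With SVE enabled
   (for instance -O2 -ftree-vectorize -march=armv8-a+sve), the strided
   access src[i * 5] can now be vectorized as a gather load whose offset
   vector { 0, 5, 10, ... } is held in 32-bit lanes, provided the check
   added in vect_truncate_gather_scatter_offset can show that the scaled
   offsets fit in the element width.  */
#include <stdint.h>

void
strided_add (int32_t *restrict dest, int32_t *restrict src, int n)
{
  for (int i = 0; i < n; ++i)
    dest[i] += src[i * 5];
}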