diff options
author | Evgeny Stupachenko <evstupac@gmail.com> | 2014-05-07 12:10:22 +0000 |
---|---|---|
committer | Kirill Yukhin <kyukhin@gcc.gnu.org> | 2014-05-07 12:10:22 +0000 |
commit | 2c23db6dd33ea21edc8970449e420eea4eaa3616 (patch) | |
tree | 2209bca524450055906393fe7143a936507176ab | |
parent | 586199f3092d6ff1b0bd18cc9eb0a1f3654d47cb (diff) | |
download | gcc-2c23db6dd33ea21edc8970449e420eea4eaa3616.zip gcc-2c23db6dd33ea21edc8970449e420eea4eaa3616.tar.gz gcc-2c23db6dd33ea21edc8970449e420eea4eaa3616.tar.bz2 |
re PR tree-optimization/52252 (An opportunity for x86 gcc vectorizer (gain up to 3 times))
gcc/
* tree-vect-data-refs.c (vect_grouped_load_supported): New
check for loads group of length 3.
(vect_permute_load_chain): New permutations for loads group of
length 3.
* tree-vect-stmts.c (vect_model_load_cost): Change cost
of vec_perm_shuffle for the new permutations.
gcc/testsuite/
PR tree-optimization/52252
* gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3.
From-SVN: r210155
-rw-r--r-- | gcc/ChangeLog | 9 | ||||
-rw-r--r-- | gcc/testsuite/ChangeLog | 5 | ||||
-rw-r--r-- | gcc/testsuite/gcc.dg/vect/pr52252-ld.c | 30 | ||||
-rw-r--r-- | gcc/tree-vect-data-refs.c | 175 | ||||
-rw-r--r-- | gcc/tree-vect-stmts.c | 9 |
5 files changed, 184 insertions, 44 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 76339ee..000a5fb 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2014-05-07 Evgeny Stupachenko <evstupac@gmail.com> + + * tree-vect-data-refs.c (vect_grouped_load_supported): New + check for loads group of length 3. + (vect_permute_load_chain): New permutations for loads group of + length 3. + * tree-vect-stmts.c (vect_model_load_cost): Change cost + of vec_perm_shuffle for the new permutations. + 2014-05-07 Alan Lawrence <alan.lawrence@arm.com> * config/aarch64/arm_neon.h (vtrn1_f32, vtrn1_p8, vtrn1_p16, vtrn1_s8, diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 14e0e02..ea89f06 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2014-05-07 Evgeny Stupachenko <evstupac@gmail.com> + + PR tree-optimization/52252 + * gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3. + 2014-05-07 Alan Lawrence <alan.lawrence@arm.com> * gcc.target/aarch64/simd/vrev16p8_1.c: New file. diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-ld.c b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c new file mode 100644 index 0000000..6e3cb52 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -g -ftree-vectorize -mssse3 -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */ + +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for (i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index d48e3cd..c486405 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -4810,36 +4810,76 @@ vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count) { enum machine_mode mode = TYPE_MODE (vectype); - /* vect_permute_load_chain requires the group size to be a power of two. */ - if (exact_log2 (count) == -1) + /* vect_permute_load_chain requires the group size to be equal to 3 or + be a power of two. */ + if (count != 3 && exact_log2 (count) == -1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "the size of the group of accesses" - " is not a power of 2\n"); + "the size of the group of accesses" + " is not a power of 2 or not equal to 3\n"); return false; } /* Check that the permutation is supported. */ if (VECTOR_MODE_P (mode)) { - unsigned int i, nelt = GET_MODE_NUNITS (mode); + unsigned int i, j, nelt = GET_MODE_NUNITS (mode); unsigned char *sel = XALLOCAVEC (unsigned char, nelt); - for (i = 0; i < nelt; i++) - sel[i] = i * 2; - if (can_vec_perm_p (mode, false, sel)) + if (count == 3) { + unsigned int k; + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by" + " target\n"); + return false; + } + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by" + " target\n"); + return false; + } + } + return true; + } + else + { + /* If length is not equal to 3 then only power of 2 is supported. */ + gcc_assert (exact_log2 (count) != -1); for (i = 0; i < nelt; i++) - sel[i] = i * 2 + 1; + sel[i] = i * 2; if (can_vec_perm_p (mode, false, sel)) - return true; - } + { + for (i = 0; i < nelt; i++) + sel[i] = i * 2 + 1; + if (can_vec_perm_p (mode, false, sel)) + return true; + } + } } if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "extract even/odd not supported by target\n"); + "extract even/odd not supported by target\n"); return false; } @@ -4857,8 +4897,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) /* Function vect_permute_load_chain. Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be - a power of 2, generate extract_even/odd stmts to reorder the input data - correctly. Return the final references for loads in RESULT_CHAIN. + a power of 2 or equal to 3, generate extract_even/odd stmts to reorder + the input data correctly. Return the final references for loads in + RESULT_CHAIN. E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. The input is 4 vectors each containing 8 elements. We assign a number to each @@ -4939,6 +4980,7 @@ vect_permute_load_chain (vec<tree> dr_chain, { tree data_ref, first_vect, second_vect; tree perm_mask_even, perm_mask_odd; + tree perm3_mask_low, perm3_mask_high; gimple perm_stmt; tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); unsigned int i, j, log_length = exact_log2 (length); @@ -4949,44 +4991,97 @@ vect_permute_load_chain (vec<tree> dr_chain, memcpy (result_chain->address (), dr_chain.address (), length * sizeof (tree)); - for (i = 0; i < nelt; ++i) - sel[i] = i * 2; - perm_mask_even = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_even != NULL); - - for (i = 0; i < nelt; ++i) - sel[i] = i * 2 + 1; - perm_mask_odd = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_odd != NULL); - - for (i = 0; i < log_length; i++) + if (length == 3) { - for (j = 0; j < length; j += 2) - { - first_vect = dr_chain[j]; - second_vect = dr_chain[j+1]; + unsigned int k; - /* data_ref = permute_even (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + perm3_mask_low = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask_low != NULL); + + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + + perm3_mask_high = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask_high != NULL); + + first_vect = dr_chain[0]; + second_vect = dr_chain[1]; + + /* Create interleaving stmt (low part of): + low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, + ...}> */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low"); perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, first_vect, second_vect, - perm_mask_even); + perm3_mask_low); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2] = data_ref; - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + /* Create interleaving stmt (high part of): + high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, + ...}> */ + first_vect = data_ref; + second_vect = dr_chain[2]; + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high"); perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, first_vect, second_vect, - perm_mask_odd); + perm3_mask_high); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2+length/2] = data_ref; + (*result_chain)[k] = data_ref; } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); } -} + else + { + /* If length is not equal to 3 then only power of 2 is supported. */ + gcc_assert (exact_log2 (length) != -1); + + for (i = 0; i < nelt; ++i) + sel[i] = i * 2; + perm_mask_even = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm_mask_even != NULL); + + for (i = 0; i < nelt; ++i) + sel[i] = i * 2 + 1; + perm_mask_odd = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm_mask_odd != NULL); + for (i = 0; i < log_length; i++) + { + for (j = 0; j < length; j += 2) + { + first_vect = dr_chain[j]; + second_vect = dr_chain[j+1]; + + /* data_ref = permute_even (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, second_vect, + perm_mask_even); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2] = data_ref; + + /* data_ref = permute_odd (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, second_vect, + perm_mask_odd); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2+length/2] = data_ref; + } + memcpy (dr_chain.address (), result_chain->address (), + length * sizeof (tree)); + } + } +} /* Function vect_transform_grouped_load. diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index b8547cb..ec9cc68 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -1091,10 +1091,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, include the cost of the permutes. */ if (!load_lanes_p && group_size > 1) { - /* Uses an even and odd extract operations for each needed permute. */ - int nstmts = ncopies * exact_log2 (group_size) * group_size; - inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm, - stmt_info, 0, vect_body); + /* Uses an even and odd extract operations or shuffle operations + for each needed permute. */ + int nstmts = ncopies * ceil_log2 (group_size) * group_size; + inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm, + stmt_info, 0, vect_body); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, |