diff options
-rw-r--r-- | gcc/testsuite/g++.dg/vect/slp-pr98855.cc | 84 | ||||
-rw-r--r-- | gcc/tree-vect-slp.c | 173 | ||||
-rw-r--r-- | gcc/tree-vectorizer.h | 9 |
3 files changed, 232 insertions, 34 deletions
diff --git a/gcc/testsuite/g++.dg/vect/slp-pr98855.cc b/gcc/testsuite/g++.dg/vect/slp-pr98855.cc new file mode 100644 index 0000000..0b4e479 --- /dev/null +++ b/gcc/testsuite/g++.dg/vect/slp-pr98855.cc @@ -0,0 +1,84 @@ +// { dg-do compile } +// { dg-additional-options "-fvect-cost-model=cheap" } +// { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } + +#include <stdint.h> +#include <stdlib.h> + +inline uint32_t make_uint32(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3) +{ + return ((static_cast<uint32_t>(i0) << 24) | + (static_cast<uint32_t>(i1) << 16) | + (static_cast<uint32_t>(i2) << 8) | + (static_cast<uint32_t>(i3))); +} + +inline uint32_t load_be(const uint8_t in[], size_t off) +{ + in += off * sizeof(uint32_t); + return make_uint32(in[0], in[1], in[2], in[3]); +} + +template<typename T> +inline void load_be(const uint8_t in[], + T& x0, T& x1, T& x2, T& x3, + T& x4, T& x5, T& x6, T& x7) +{ + x0 = load_be(in, 0); + x1 = load_be(in, 1); + x2 = load_be(in, 2); + x3 = load_be(in, 3); + x4 = load_be(in, 4); + x5 = load_be(in, 5); + x6 = load_be(in, 6); + x7 = load_be(in, 7); +} + +inline void store_be(uint32_t in, uint8_t out[4]) +{ + uint32_t o = __builtin_bswap32 (in); + __builtin_memcpy (out, &o, sizeof (uint32_t)); +} + +template<typename T> +inline void store_be(uint8_t out[], T x0, T x1, T x2, T x3, + T x4, T x5, T x6, T x7) +{ + store_be(x0, out + (0 * sizeof(T))); + store_be(x1, out + (1 * sizeof(T))); + store_be(x2, out + (2 * sizeof(T))); + store_be(x3, out + (3 * sizeof(T))); + store_be(x4, out + (4 * sizeof(T))); + store_be(x5, out + (5 * sizeof(T))); + store_be(x6, out + (6 * sizeof(T))); + store_be(x7, out + (7 * sizeof(T))); +} + +#define BLOCK_SIZE 8 +void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, uint32_t *EK) +{ + const size_t blocks4 = blocks / 4; + + for (size_t i = 0; i < blocks4; i++) + { + uint32_t L0, R0, L1, R1, L2, R2, L3, R3; + load_be(in + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3); + + for(size_t r = 0; r != 32; ++r) + { + L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*r]; + L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*r]; + L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*r]; + L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*r]; + + R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*r+1]; + R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*r+1]; + R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*r+1]; + R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*r+1]; + } + + store_be(out + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3); + } +} + +// { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 2 "slp1" { target x86_64-*-* i?86-*-* } } } diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 2305bbd..b9f12c3 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -4340,6 +4340,20 @@ vect_bb_slp_scalar_cost (vec_info *vinfo, } } +/* Comparator for the loop-index sorted cost vectors. */ + +static int +li_cost_vec_cmp (const void *a_, const void *b_) +{ + auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_; + auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_; + if (a->first < b->first) + return -1; + else if (a->first == b->first) + return 0; + return 1; +} + /* Check if vectorization of the basic block is profitable for the subgraph denoted by SLP_INSTANCES. */ @@ -4352,61 +4366,152 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0; unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0; - void *vect_target_cost_data = init_cost (NULL); - /* Calculate scalar cost and sum the cost for the vector stmts previously collected. */ - stmt_vector_for_cost scalar_costs; - scalar_costs.create (0); + stmt_vector_for_cost scalar_costs = vNULL; + stmt_vector_for_cost vector_costs = vNULL; hash_set<slp_tree> visited; FOR_EACH_VEC_ELT (slp_instances, i, instance) { auto_vec<bool, 20> life; life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)), true); + if (SLP_INSTANCE_ROOT_STMT (instance)) + record_stmt_cost (&scalar_costs, 1, scalar_stmt, + SLP_INSTANCE_ROOT_STMT (instance), 0, vect_body); vect_bb_slp_scalar_cost (bb_vinfo, SLP_INSTANCE_TREE (instance), &life, &scalar_costs, visited); - add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec); + vector_costs.safe_splice (instance->cost_vec); instance->cost_vec.release (); } /* Unset visited flag. */ - stmt_info_for_cost *si; - FOR_EACH_VEC_ELT (scalar_costs, i, si) - gimple_set_visited (si->stmt_info->stmt, false); + stmt_info_for_cost *cost; + FOR_EACH_VEC_ELT (scalar_costs, i, cost) + gimple_set_visited (cost->stmt_info->stmt, false); - void *scalar_target_cost_data = init_cost (NULL); - add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs); - scalar_costs.release (); - unsigned dummy; - finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy); - destroy_cost_data (scalar_target_cost_data); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); + + /* When costing non-loop vectorization we need to consider each covered + loop independently and make sure vectorization is profitable. For + now we assume a loop may be not entered or executed an arbitrary + number of iterations (??? static information can provide more + precise info here) which means we can simply cost each containing + loops stmts separately. */ + + /* First produce cost vectors sorted by loop index. */ + auto_vec<std::pair<unsigned, stmt_info_for_cost *> > + li_scalar_costs (scalar_costs.length ()); + auto_vec<std::pair<unsigned, stmt_info_for_cost *> > + li_vector_costs (vector_costs.length ()); + FOR_EACH_VEC_ELT (scalar_costs, i, cost) + { + unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num; + li_scalar_costs.quick_push (std::make_pair (l, cost)); + } + /* Use a random used loop as fallback in case the first vector_costs + entry does not have a stmt_info associated with it. */ + unsigned l = li_scalar_costs[0].first; + FOR_EACH_VEC_ELT (vector_costs, i, cost) + { + /* We inherit from the previous COST, invariants, externals and + extracts immediately follow the cost for the related stmt. */ + if (cost->stmt_info) + l = gimple_bb (cost->stmt_info->stmt)->loop_father->num; + li_vector_costs.quick_push (std::make_pair (l, cost)); + } + li_scalar_costs.qsort (li_cost_vec_cmp); + li_vector_costs.qsort (li_cost_vec_cmp); + + /* Now cost the portions individually. */ + unsigned vi = 0; + unsigned si = 0; + do + { + unsigned sl = li_scalar_costs[si].first; + unsigned vl = li_vector_costs[vi].first; + if (sl != vl) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Scalar %d and vector %d loop part do not " + "match up, skipping scalar part\n", sl, vl); + /* Skip the scalar part, assuming zero cost on the vector side. */ + do + { + si++; + } + while (si < li_scalar_costs.length () + && li_scalar_costs[si].first == sl); + continue; + } - /* Complete the target-specific vector cost calculation. */ - finish_cost (vect_target_cost_data, &vec_prologue_cost, - &vec_inside_cost, &vec_epilogue_cost); - destroy_cost_data (vect_target_cost_data); + void *scalar_target_cost_data = init_cost (NULL); + do + { + add_stmt_cost (bb_vinfo, scalar_target_cost_data, + li_scalar_costs[si].second); + si++; + } + while (si < li_scalar_costs.length () + && li_scalar_costs[si].first == sl); + unsigned dummy; + finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy); + destroy_cost_data (scalar_target_cost_data); + + /* Complete the target-specific vector cost calculation. */ + void *vect_target_cost_data = init_cost (NULL); + do + { + add_stmt_cost (bb_vinfo, vect_target_cost_data, + li_vector_costs[vi].second); + vi++; + } + while (vi < li_vector_costs.length () + && li_vector_costs[vi].first == vl); + finish_cost (vect_target_cost_data, &vec_prologue_cost, + &vec_inside_cost, &vec_epilogue_cost); + destroy_cost_data (vect_target_cost_data); - vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; + vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; - if (dump_enabled_p ()) + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Cost model analysis for part in loop %d:\n", sl); + dump_printf (MSG_NOTE, " Vector cost: %d\n", + vec_inside_cost + vec_outside_cost); + dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost); + } + + /* Vectorization is profitable if its cost is more than the cost of scalar + version. Note that we err on the vector side for equal cost because + the cost estimate is otherwise quite pessimistic (constant uses are + free on the scalar side but cost a load on the vector side for + example). */ + if (vec_outside_cost + vec_inside_cost > scalar_cost) + { + scalar_costs.release (); + vector_costs.release (); + return false; + } + } + while (si < li_scalar_costs.length () + && vi < li_vector_costs.length ()); + if (vi < li_vector_costs.length ()) { - dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); - dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n", - vec_inside_cost); - dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost); - dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost); - dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Excess vector cost for part in loop %d:\n", + li_vector_costs[vi].first); + scalar_costs.release (); + vector_costs.release (); + return false; } - /* Vectorization is profitable if its cost is more than the cost of scalar - version. Note that we err on the vector side for equal cost because - the cost estimate is otherwise quite pessimistic (constant uses are - free on the scalar side but cost a load on the vector side for - example). */ - if (vec_outside_cost + vec_inside_cost > scalar_cost) - return false; - + scalar_costs.release (); + vector_costs.release (); return true; } diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index e564fcf..b861c97 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1481,6 +1481,15 @@ add_stmt_cost (vec_info *vinfo, void *data, int count, return cost; } +/* Alias targetm.vectorize.add_stmt_cost. */ + +static inline unsigned +add_stmt_cost (vec_info *vinfo, void *data, stmt_info_for_cost *i) +{ + return add_stmt_cost (vinfo, data, i->count, i->kind, i->stmt_info, + i->vectype, i->misalign, i->where); +} + /* Alias targetm.vectorize.finish_cost. */ static inline void |