Diffstat (limited to 'gcc')
-rw-r--r--  gcc/config/aarch64/aarch64.c         | 179
-rw-r--r--  gcc/config/aarch64/fractional-cost.h | 236
2 files changed, 377 insertions, 38 deletions
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1a8cd13..17fcb34 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -20,8 +20,9 @@
 
 #define IN_TARGET_CODE 1
 
-#include "config.h"
 #define INCLUDE_STRING
+#define INCLUDE_ALGORITHM
+#include "config.h"
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
@@ -76,6 +77,7 @@
 #include "function-abi.h"
 #include "gimple-pretty-print.h"
 #include "tree-ssa-loop-niter.h"
+#include "fractional-cost.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -14912,10 +14914,10 @@ aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
    for STMT_INFO, which has cost kind KIND.  If this is a scalar operation,
    try to subdivide the target-independent categorization provided by KIND
    to get a more accurate cost.  */
-static unsigned int
+static fractional_cost
 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
                                     stmt_vec_info stmt_info,
-                                    unsigned int stmt_cost)
+                                    fractional_cost stmt_cost)
 {
   /* Detect an extension of a loaded value.  In general, we'll be able to fuse
      the extension with the load.  */
@@ -14931,11 +14933,11 @@ aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
    the target-independent categorization provided by KIND to get a more
    accurate cost.  WHERE specifies where the cost associated with KIND
    occurs.  */
-static unsigned int
+static fractional_cost
 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
                                     stmt_vec_info stmt_info, tree vectype,
                                     enum vect_cost_model_location where,
-                                    unsigned int stmt_cost)
+                                    fractional_cost stmt_cost)
 {
   const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
   const sve_vec_cost *sve_costs = nullptr;
@@ -15016,10 +15018,10 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
    for STMT_INFO, which has cost kind KIND and which when vectorized would
    operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
    targets.  */
-static unsigned int
+static fractional_cost
 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
                               stmt_vec_info stmt_info, tree vectype,
-                              unsigned int stmt_cost)
+                              fractional_cost stmt_cost)
 {
   /* Unlike vec_promote_demote, vector_stmt conversions do not change the
      vector register size or number of units.  Integer promotions of this
@@ -15083,9 +15085,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
    and which when vectorized would operate on vector type VECTYPE.  Add the
    cost of any embedded operations.  */
-static unsigned int
+static fractional_cost
 aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
-                          tree vectype, unsigned int stmt_cost)
+                          tree vectype, fractional_cost stmt_cost)
 {
   if (vectype)
     {
@@ -15339,7 +15341,7 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
 
   if (flag_vect_cost_model)
     {
-      int stmt_cost
+      fractional_cost stmt_cost
        = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
 
       /* Do one-time initialization based on the vinfo.  */
@@ -15440,7 +15442,7 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
            count *= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); /* FIXME */
        }
 
-      retval = (unsigned) (count * stmt_cost);
+      retval = (count * stmt_cost).ceil ();
       costs->region[where] += retval;
     }
 
@@ -15472,17 +15474,17 @@ aarch64_sve_op_count::dump () const
 
 /* Use ISSUE_INFO to estimate the minimum number of cycles needed to issue
    the operations described by OPS.  This is a very simplistic model!  */
-static unsigned int
+static fractional_cost
 aarch64_estimate_min_cycles_per_iter (const aarch64_vec_op_count *ops,
                                       const aarch64_base_vec_issue_info *issue_info)
 {
-  unsigned int cycles = MAX (ops->reduction_latency, 1);
-  cycles = MAX (cycles, CEIL (ops->stores, issue_info->stores_per_cycle));
-  cycles = MAX (cycles, CEIL (ops->loads + ops->stores,
-                              issue_info->loads_stores_per_cycle));
-  cycles = MAX (cycles, CEIL (ops->general_ops,
-                              issue_info->general_ops_per_cycle));
+  fractional_cost cycles = MAX (ops->reduction_latency, 1);
+  cycles = std::max (cycles, { ops->stores, issue_info->stores_per_cycle });
+  cycles = std::max (cycles, { ops->loads + ops->stores,
+                               issue_info->loads_stores_per_cycle });
+  cycles = std::max (cycles, { ops->general_ops,
+                               issue_info->general_ops_per_cycle });
   return cycles;
 }
 
@@ -15536,12 +15538,14 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
   if (!issue_info)
     return body_cost;
 
-  unsigned int scalar_cycles_per_iter
+  fractional_cost scalar_cycles_per_iter
     = aarch64_estimate_min_cycles_per_iter (&costs->scalar_ops,
                                             issue_info->scalar);
-  unsigned int advsimd_cycles_per_iter
+
+  fractional_cost advsimd_cycles_per_iter
     = aarch64_estimate_min_cycles_per_iter (&costs->advsimd_ops,
                                             issue_info->advsimd);
+
   bool could_use_advsimd
     = ((costs->vec_flags & VEC_ADVSIMD)
        || (aarch64_autovec_preference != 2
@@ -15558,36 +15562,37 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
       dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
       costs->scalar_ops.dump ();
       dump_printf_loc (MSG_NOTE, vect_location,
-                       " estimated cycles per iteration = %d\n",
-                       scalar_cycles_per_iter);
+                       " estimated cycles per iteration = %f\n",
+                       scalar_cycles_per_iter.as_double ());
       if (could_use_advsimd)
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Advanced SIMD issue estimate:\n");
          costs->advsimd_ops.dump ();
          dump_printf_loc (MSG_NOTE, vect_location,
-                          " estimated cycles per iteration = %d\n",
-                          advsimd_cycles_per_iter);
+                          " estimated cycles per iteration = %f\n",
+                          advsimd_cycles_per_iter.as_double ());
        }
       else
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Loop could not use Advanced SIMD\n");
     }
 
-  uint64_t vector_cycles_per_iter = advsimd_cycles_per_iter;
+  fractional_cost vector_cycles_per_iter = advsimd_cycles_per_iter;
   unsigned int vector_reduction_latency = costs->advsimd_ops.reduction_latency;
+
   if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve)
     {
       /* Estimate the minimum number of cycles per iteration needed to issue
         non-predicate operations.  */
-      unsigned int sve_cycles_per_iter
+      fractional_cost sve_cycles_per_iter
        = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops,
                                                issue_info->sve);
 
       /* Separately estimate the minimum number of cycles per iteration needed
         to issue the predicate operations.  */
-      unsigned int pred_cycles_per_iter
-       = CEIL (costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle);
+      fractional_cost pred_cycles_per_iter
+       = { costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
 
       if (dump_enabled_p ())
        {
@@ -15595,14 +15600,16 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
          costs->sve_ops.dump ();
          dump_printf_loc (MSG_NOTE, vect_location,
                           " estimated cycles per iteration for non-predicate"
-                          " operations = %d\n", sve_cycles_per_iter);
+                          " operations = %f\n",
+                          sve_cycles_per_iter.as_double ());
          if (costs->sve_ops.pred_ops)
            dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per"
                             " iteration for predicate operations = %d\n",
-                            pred_cycles_per_iter);
+                            pred_cycles_per_iter.as_double ());
        }
 
-      vector_cycles_per_iter = MAX (sve_cycles_per_iter, pred_cycles_per_iter);
+      vector_cycles_per_iter = std::max (sve_cycles_per_iter,
+                                         pred_cycles_per_iter);
       vector_reduction_latency = costs->sve_ops.reduction_latency;
 
       /* If the scalar version of the loop could issue at least as
@@ -15616,7 +15623,7 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
         too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
         code later in the function handles that case in a more
         conservative way.  */
-      uint64_t sve_estimate = pred_cycles_per_iter + 1;
+      fractional_cost sve_estimate = pred_cycles_per_iter + 1;
       if (scalar_cycles_per_iter < sve_estimate)
        {
          unsigned int min_cost
@@ -15656,8 +15663,10 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
       if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate)
        {
          /* This ensures that min_cost > orig_body_cost * 2.  */
-         unsigned int min_cost
-           = orig_body_cost * CEIL (sve_estimate, advsimd_cycles_per_iter) + 1;
+         unsigned int factor
+           = fractional_cost::scale (1, sve_estimate,
+                                     advsimd_cycles_per_iter);
+         unsigned int min_cost = orig_body_cost * factor + 1;
          if (body_cost < min_cost)
            {
              if (dump_enabled_p ())
@@ -15690,8 +15699,8 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
      so minor differences should only result in minor changes.  */
   else if (scalar_cycles_per_iter < vector_cycles_per_iter)
     {
-      body_cost = CEIL (body_cost * vector_cycles_per_iter,
-                        scalar_cycles_per_iter);
+      body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
+                                          scalar_cycles_per_iter);
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Increasing body cost to %d because scalar code"
@@ -15716,8 +15725,8 @@ aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
           && scalar_cycles_per_iter > vector_cycles_per_iter
           && !should_disparage)
     {
-      body_cost = CEIL (body_cost * vector_cycles_per_iter,
-                        scalar_cycles_per_iter);
+      body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
+                                          scalar_cycles_per_iter);
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Decreasing body cost to %d account for smaller"
@@ -25589,12 +25598,106 @@ aarch64_test_loading_full_dump ()
   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
 }
 
+/* Test the fractional_cost class.  */
+
+static void
+aarch64_test_fractional_cost ()
+{
+  using cf = fractional_cost;
+
+  ASSERT_EQ (cf (0, 20), 0);
+
+  ASSERT_EQ (cf (4, 2), 2);
+  ASSERT_EQ (3, cf (9, 3));
+
+  ASSERT_NE (cf (5, 2), 2);
+  ASSERT_NE (3, cf (8, 3));
+
+  ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
+  ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
+  ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
+
+  ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
+  ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
+  ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
+  ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
+  ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
+  ASSERT_EQ (3 - cf (10, 3), 0);
+
+  ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
+  ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
+
+  ASSERT_TRUE (cf (4, 15) < cf (5, 15));
+  ASSERT_FALSE (cf (5, 15) < cf (5, 15));
+  ASSERT_FALSE (cf (6, 15) < cf (5, 15));
+  ASSERT_TRUE (cf (1, 3) < cf (2, 5));
+  ASSERT_TRUE (cf (1, 12) < cf (1, 6));
+  ASSERT_FALSE (cf (5, 3) < cf (5, 3));
+  ASSERT_TRUE (cf (239, 240) < 1);
+  ASSERT_FALSE (cf (240, 240) < 1);
+  ASSERT_FALSE (cf (241, 240) < 1);
+  ASSERT_FALSE (2 < cf (207, 104));
+  ASSERT_FALSE (2 < cf (208, 104));
+  ASSERT_TRUE (2 < cf (209, 104));
+
+  ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
+  ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
+  ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
+  ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
+  ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
+  ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
+  ASSERT_TRUE (cf (239, 240) <= 1);
+  ASSERT_TRUE (cf (240, 240) <= 1);
+  ASSERT_FALSE (cf (241, 240) <= 1);
+  ASSERT_FALSE (2 <= cf (207, 104));
+  ASSERT_TRUE (2 <= cf (208, 104));
+  ASSERT_TRUE (2 <= cf (209, 104));
+
+  ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
+  ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
+  ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
+  ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
+  ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
+  ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
+  ASSERT_FALSE (cf (239, 240) >= 1);
+  ASSERT_TRUE (cf (240, 240) >= 1);
+  ASSERT_TRUE (cf (241, 240) >= 1);
+  ASSERT_TRUE (2 >= cf (207, 104));
+  ASSERT_TRUE (2 >= cf (208, 104));
+  ASSERT_FALSE (2 >= cf (209, 104));
+
+  ASSERT_FALSE (cf (4, 15) > cf (5, 15));
+  ASSERT_FALSE (cf (5, 15) > cf (5, 15));
+  ASSERT_TRUE (cf (6, 15) > cf (5, 15));
+  ASSERT_FALSE (cf (1, 3) > cf (2, 5));
+  ASSERT_FALSE (cf (1, 12) > cf (1, 6));
+  ASSERT_FALSE (cf (5, 3) > cf (5, 3));
+  ASSERT_FALSE (cf (239, 240) > 1);
+  ASSERT_FALSE (cf (240, 240) > 1);
+  ASSERT_TRUE (cf (241, 240) > 1);
+  ASSERT_TRUE (2 > cf (207, 104));
+  ASSERT_FALSE (2 > cf (208, 104));
+  ASSERT_FALSE (2 > cf (209, 104));
+
+  ASSERT_EQ (cf (1, 2).ceil (), 1);
+  ASSERT_EQ (cf (11, 7).ceil (), 2);
+  ASSERT_EQ (cf (20, 1).ceil (), 20);
+  ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
+  ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
+  ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
+  ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
+  ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
+
+  ASSERT_EQ (cf (1, 2).as_double (), 0.5);
+}
+
 /* Run all target-specific selftests.  */
 static void
 aarch64_run_selftests (void)
 {
   aarch64_test_loading_full_dump ();
+  aarch64_test_fractional_cost ();
 }
 
 } // namespace selftest
diff --git a/gcc/config/aarch64/fractional-cost.h b/gcc/config/aarch64/fractional-cost.h
new file mode 100644
index 0000000..6a01634
--- /dev/null
+++ b/gcc/config/aarch64/fractional-cost.h
@@ -0,0 +1,236 @@
+// Simple fixed-point representation of fractional costs
+// Copyright (C) 2021 Free Software Foundation, Inc.
+//
+// This file is part of GCC.
+//
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+//
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// A simple saturating fixed-point type for representing fractional
+// intermediate results in cost calculations.  The input and result
+// costs are assumed to be uint32_ts.  Unlike sreal, the class can
+// represent most values that we care about exactly (without rounding).
+// See the comment above the SCALE field for the current set of
+// exactly-representable reciprocals.
+class fractional_cost
+{
+public:
+  // Construct an object equal to INT_VALUE.
+  constexpr fractional_cost (uint32_t int_value = 0)
+    : m_value (uint64_t (int_value) * SCALE) {}
+
+  fractional_cost (uint32_t a, uint32_t b);
+
+  fractional_cost operator+ (const fractional_cost &) const;
+  fractional_cost operator- (const fractional_cost &) const;
+  fractional_cost operator* (uint32_t) const;
+
+  fractional_cost &operator+= (const fractional_cost &);
+  fractional_cost &operator-= (const fractional_cost &);
+  fractional_cost &operator*= (uint32_t);
+
+  bool operator== (const fractional_cost &) const;
+  bool operator!= (const fractional_cost &) const;
+  bool operator< (const fractional_cost &) const;
+  bool operator<= (const fractional_cost &) const;
+  bool operator>= (const fractional_cost &) const;
+  bool operator> (const fractional_cost &) const;
+
+  uint32_t ceil () const;
+
+  static uint32_t scale (uint32_t, fractional_cost, fractional_cost);
+
+  explicit operator bool () const { return m_value != 0; }
+
+  // Convert the value to a double.
+  double as_double () const { return double (m_value) / SCALE; }
+
+private:
+  enum raw { RAW };
+  constexpr fractional_cost (uint64_t value, raw) : m_value (value) {}
+
+  // A multiple of [1, 16] * 16.  This ensures that 1/N is representable
+  // for every possible SVE element count N, or for any "X per cycle"
+  // value N in the range [1, 16].
+  static const uint32_t SCALE = 11531520;
+
+  // The value multiplied by SCALE.
+  uint64_t m_value;
+};
+
+// Construct a representation of A / B, rounding up if (contrary to
+// expectations) we can't represent the value exactly.  For now we
+// treat inexact values as a bug, since all values of B should come
+// from a small set of values that are known at compile time.
+inline fractional_cost::fractional_cost (uint32_t a, uint32_t b)
+  : m_value (CEIL (uint64_t (a) * SCALE, uint64_t (b)))
+{
+  gcc_checking_assert (SCALE % b == 0);
+}
+
+inline fractional_cost
+fractional_cost::operator+ (const fractional_cost &other) const
+{
+  uint64_t sum = m_value + other.m_value;
+  return { sum >= m_value ? sum : ~uint64_t (0), RAW };
+}
+
+inline fractional_cost &
+fractional_cost::operator+= (const fractional_cost &other)
+{
+  *this = *this + other;
+  return *this;
+}
+
+inline fractional_cost
+fractional_cost::operator- (const fractional_cost &other) const
+{
+  uint64_t diff = m_value - other.m_value;
+  return { diff <= m_value ? diff : 0, RAW };
+}
+
+inline fractional_cost &
+fractional_cost::operator-= (const fractional_cost &other)
+{
+  *this = *this - other;
+  return *this;
+}
+
+inline fractional_cost
+fractional_cost::operator* (uint32_t other) const
+{
+  if (other == 0)
+    return 0;
+
+  uint64_t max = ~uint64_t (0);
+  return { m_value <= max / other ? m_value * other : max, RAW };
+}
+
+inline fractional_cost &
+fractional_cost::operator*= (uint32_t other)
+{
+  *this = *this * other;
+  return *this;
+}
+
+inline bool
+fractional_cost::operator== (const fractional_cost &other) const
+{
+  return m_value == other.m_value;
+}
+
+inline bool
+fractional_cost::operator!= (const fractional_cost &other) const
+{
+  return m_value != other.m_value;
+}
+
+inline bool
+fractional_cost::operator< (const fractional_cost &other) const
+{
+  return m_value < other.m_value;
+}
+
+inline bool
+fractional_cost::operator<= (const fractional_cost &other) const
+{
+  return m_value <= other.m_value;
+}
+
+inline bool
+fractional_cost::operator>= (const fractional_cost &other) const
+{
+  return m_value >= other.m_value;
+}
+
+inline bool
+fractional_cost::operator> (const fractional_cost &other) const
+{
+  return m_value > other.m_value;
+}
+
+// Round the value up to the nearest integer and saturate to a uint32_t.
+inline uint32_t
+fractional_cost::ceil () const
+{
+  uint32_t max = ~uint32_t (0);
+  if (m_value <= uint64_t (max - 1) * SCALE)
+    return (m_value + SCALE - 1) / SCALE;
+  return max;
+}
+
+// Round (COST * A) / B up to the nearest integer and saturate to a uint32_t.
+inline uint32_t
+fractional_cost::scale (uint32_t cost, fractional_cost a, fractional_cost b)
+{
+  widest_int result = wi::div_ceil (widest_int (cost) * a.m_value,
+                                    b.m_value, SIGNED);
+  if (result < ~uint32_t (0))
+    return result.to_shwi ();
+  return ~uint32_t (0);
+}
+
+inline fractional_cost
+operator+ (uint32_t a, const fractional_cost &b)
+{
+  return b.operator+ (a);
+}
+
+inline fractional_cost
+operator- (uint32_t a, const fractional_cost &b)
+{
+  return fractional_cost (a).operator- (b);
+}
+
+inline fractional_cost
+operator* (uint32_t a, const fractional_cost &b)
+{
+  return b.operator* (a);
+}
+
+inline bool
+operator== (uint32_t a, const fractional_cost &b)
+{
+  return b.operator== (a);
+}
+
+inline bool
+operator!= (uint32_t a, const fractional_cost &b)
+{
+  return b.operator!= (a);
+}
+
+inline bool
+operator< (uint32_t a, const fractional_cost &b)
+{
+  return b.operator> (a);
+}
+
+inline bool
+operator<= (uint32_t a, const fractional_cost &b)
+{
+  return b.operator>= (a);
+}
+
+inline bool
+operator>= (uint32_t a, const fractional_cost &b)
+{
+  return b.operator<= (a);
+}
+
+inline bool
+operator> (uint32_t a, const fractional_cost &b)
+{
+  return b.operator< (a);
+}
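
The core of the change is that per-iteration issue estimates are now kept as exact fractions (scaled integers) and only rounded up once at the end, instead of applying CEIL to every term before taking the maximum. The following standalone sketch, which is not part of the patch (the operation counts, issue widths, and helper names are invented for illustration), contrasts the two ways of doing that arithmetic:

```cpp
// Sketch only: compare CEIL-per-term rounding with an exact fractional
// estimate kept as an integer scaled by a fixed constant.
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Fixed scale divisible by 1..16, so 1/N is exact for any plausible
// "ops per cycle" value N (same idea as SCALE in the header above).
static const uint64_t SCALE = 11531520;

// Represent A/B cycles as a scaled integer; B is assumed to divide SCALE.
static uint64_t frac (uint64_t a, uint64_t b) { return a * SCALE / b; }

// Round a scaled value up to whole cycles.
static uint64_t to_cycles_ceil (uint64_t v) { return (v + SCALE - 1) / SCALE; }

int main ()
{
  // Hypothetical per-iteration operation counts and issue widths.
  uint64_t loads = 3, stores = 1, general_ops = 5, reduction_latency = 1;
  uint64_t stores_per_cycle = 2, loads_stores_per_cycle = 3,
           general_ops_per_cycle = 4;

  // Old-style estimate: each term is rounded up before taking the maximum,
  // so e.g. 4/3 load/store cycles becomes 2 whole cycles.
  auto ceil_div = [] (uint64_t a, uint64_t b) { return (a + b - 1) / b; };
  uint64_t old_cycles = std::max ({ reduction_latency,
                                    ceil_div (stores, stores_per_cycle),
                                    ceil_div (loads + stores,
                                              loads_stores_per_cycle),
                                    ceil_div (general_ops,
                                              general_ops_per_cycle) });

  // Fractional estimate: keep 4/3 and 5/4 exact, take the maximum, and only
  // round up when a whole-cycle figure is finally needed.
  uint64_t new_est = std::max ({ frac (reduction_latency, 1),
                                 frac (stores, stores_per_cycle),
                                 frac (loads + stores, loads_stores_per_cycle),
                                 frac (general_ops, general_ops_per_cycle) });

  printf ("old estimate: %llu cycles\n", (unsigned long long) old_cycles);
  printf ("new estimate: %.3f cycles (%llu when rounded up)\n",
          (double) new_est / SCALE,
          (unsigned long long) to_cycles_ceil (new_est));
}
```

With these numbers the per-term rounding reports a flat 2 cycles, while the fractional estimate keeps the exact 4/3 cycles; preserving that difference is what makes the scalar-versus-vector cycle comparisons in aarch64_adjust_body_cost meaningful when the true estimates differ by less than a whole cycle.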