 gcc/doc/tm.texi                                |  6 +
 gcc/doc/tm.texi.in                             |  1 +
 gcc/target.def                                 | 12 +
 gcc/targhooks.cc                               |  9 +
 gcc/targhooks.h                                |  2 +
 gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c | 25 +
 gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c | 58 +
 gcc/tree-vect-patterns.cc                      | 77 +
 8 files changed, 190 insertions(+), 0 deletions(-)
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 144862e..c4a92a5 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6146,6 +6146,12 @@
 instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (const_tree @var{type})
+Sometimes it is possible to implement a vector division using a sequence
+of two addition-shift pairs, giving four instructions in total.
+Return true if taking this approach for @var{vectype} is likely
+to be better than using a sequence involving highpart multiplication.
+Default is false if @code{can_mult_highpart_p}, otherwise true.
+@end deftypefn
 
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 146f461..4075e71 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4175,6 +4175,7 @@
 address; but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
 
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
diff --git a/gcc/target.def b/gcc/target.def
index 049dab6..f401fe1 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1868,6 +1868,18 @@
 correct for most targets.",
  poly_uint64, (const_tree type),
  default_preferred_vector_alignment)
 
+/* Returns whether the target has a preference for decomposing divisions using
+   shifts rather than multiplies.  */
+DEFHOOK
+(preferred_div_as_shifts_over_mult,
+ "Sometimes it is possible to implement a vector division using a sequence\n\
+of two addition-shift pairs, giving four instructions in total.\n\
+Return true if taking this approach for @var{vectype} is likely\n\
+to be better than using a sequence involving highpart multiplication.\n\
+Default is false if @code{can_mult_highpart_p}, otherwise true.",
+ bool, (const_tree type),
+ default_preferred_div_as_shifts_over_mult)
+
 /* Return true if vector alignment is reachable (by peeling N iterations)
    for the given scalar type.  */
 DEFHOOK
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index 8b7452c..51bf3fb 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1488,6 +1488,15 @@ default_preferred_vector_alignment (const_tree type)
   return TYPE_ALIGN (type);
 }
 
+/* The default implementation of
+   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
+
+bool
+default_preferred_div_as_shifts_over_mult (const_tree type)
+{
+  return !can_mult_highpart_p (TYPE_MODE (type), TYPE_UNSIGNED (type));
+}
+
 /* By default assume vectors of element TYPE require a multiple of the natural
    alignment of TYPE.  TYPE is naturally aligned if IS_PACKED is false.
    */
 
 bool
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 555160d..cf3d310 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -53,6 +53,8 @@ extern scalar_int_mode default_unwind_word_mode (void);
 extern unsigned HOST_WIDE_INT default_shift_truncation_mask (machine_mode);
 extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
+extern bool default_preferred_div_as_shifts_over_mult
+  (const_tree);
 extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
 extern tree default_stack_protect_guard (void);
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
new file mode 100644
index 0000000..c81f894
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+typedef unsigned __attribute__((__vector_size__ (16))) V;
+
+static __attribute__((__noinline__)) __attribute__((__noclone__)) V
+foo (V v, unsigned short i)
+{
+  v /= i;
+  return v;
+}
+
+int
+main (void)
+{
+  V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
+  for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
+    if (v[i] != 0x00010001)
+      __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
new file mode 100644
index 0000000..b4eb1a4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 298fd29..887f02b 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3934,6 +3934,83 @@ vect_recog_divmod_pattern (vec_info *vinfo,
       return pattern_stmt;
     }
 
+  if ((cst = uniform_integer_cst_p (oprnd1))
+      && TYPE_UNSIGNED (itype)
+      && rhs_code == TRUNC_DIV_EXPR
+      && vectype
+      && targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))
+    {
+      /* We can use the relationship:
+
+           x // N == ((x+N+2) // (N+1) + x) // (N+1)  for 0 <= x < N(N+3)
+
+         to optimize cases where N+1 is a power of 2, and where // (N+1)
+         is therefore a shift right.
+         When operating in modes that are
+         multiples of a byte in size, there are two cases:
+
+           (1) N(N+3) is not representable, in which case the question
+               becomes whether the replacement expression overflows.
+               It is enough to test that x+N+2 does not overflow,
+               i.e. that x < MAX-(N+1).
+
+           (2) N(N+3) is representable, in which case it is the (only)
+               bound that we need to check.
+
+         ??? For now we just handle the case where // (N+1) is a shift
+         right by half the precision, since some architectures can
+         optimize the associated addition and shift combinations
+         into single instructions.  */
+
+      auto wcst = wi::to_wide (cst);
+      int pow = wi::exact_log2 (wcst + 1);
+      if (pow == prec / 2)
+        {
+          gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
+
+          gimple_ranger ranger;
+          int_range_max r;
+
+          /* Check that no overflow will occur.  If we don't have range
+             information we can't perform the optimization.  */
+
+          if (ranger.range_of_expr (r, oprnd0, stmt))
+            {
+              wide_int max = r.upper_bound ();
+              wide_int one = wi::shwi (1, prec);
+              wide_int adder = wi::add (one, wi::lshift (one, pow));
+              wi::overflow_type ovf;
+              wi::add (max, adder, UNSIGNED, &ovf);
+              if (ovf == wi::OVF_NONE)
+                {
+                  *type_out = vectype;
+                  tree tadder = wide_int_to_tree (itype, adder);
+                  tree rshift = wide_int_to_tree (itype, pow);
+
+                  tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
+                  gassign *patt1
+                    = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
+                  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+                  tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
+                  patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
+                                               rshift);
+                  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+                  tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
+                  patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
+                                               oprnd0);
+                  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+                  tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
+                  pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
+                                                      new_lhs3, rshift);
+
+                  return pattern_stmt;
+                }
+            }
+        }
+    }
+
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
     return NULL;
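
Aside, not part of the patch: the sketch below is a minimal, self-contained C program that checks the identity the new pattern relies on, for the only case it currently handles (a right shift by half the precision; here 16-bit lanes divided by N = 0xff, so N + 1 = 0x100 = 1 << 8). The loop covers exactly the values for which x + N + 2 does not overflow 16 bits, i.e. the condition the ranger is used to prove above. The names N, POW and t1..t4 are local to the example; t1..t4 mirror the four pattern statements new_lhs1..new_lhs4.

/* Illustrative sketch only: verify that
     x / N == ((x + N + 2) >> POW + x) >> POW
   holds in 16-bit arithmetic for N = 0xff, POW = 8, whenever x + N + 2
   fits in 16 bits.  Uses __builtin_abort in the style of the testcases
   above, so build with GCC (or Clang).  */

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  const uint32_t N = 0xff;   /* divisor; N + 1 is a power of 2 */
  const unsigned POW = 8;    /* shift by half the 16-bit precision */

  for (uint32_t x = 0; x + N + 2 <= UINT16_MAX; x++)
    {
      uint16_t t1 = (uint16_t) (x + N + 2);   /* add   */
      uint16_t t2 = (uint16_t) (t1 >> POW);   /* shift */
      uint16_t t3 = (uint16_t) (t2 + x);      /* add   */
      uint16_t t4 = (uint16_t) (t3 >> POW);   /* shift */
      if (t4 != x / N)
        __builtin_abort ();
    }

  printf ("x / 0xff matches the add/shift/add/shift sequence "
          "for all x with x + 0x101 <= 0xffff\n");
  return 0;
}

The loop bound is the same no-overflow condition the pattern computes with wi::add on the upper bound of oprnd0; when that check fails, the pattern is not applied and the existing multiply-highpart handling below remains in effect.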