 gcc/testsuite/gcc.target/i386/pr103144-mul-1.c   |  51
 gcc/testsuite/gcc.target/i386/pr103144-mul-2.c   |  51
 gcc/testsuite/gcc.target/i386/pr103144-neg-1.c   |  51
 gcc/testsuite/gcc.target/i386/pr103144-neg-2.c   |  44
 gcc/testsuite/gcc.target/i386/pr103144-shift-1.c |  70
 gcc/testsuite/gcc.target/i386/pr103144-shift-2.c |  79
 gcc/tree-vect-loop-manip.cc                      |  37
 gcc/tree-vect-loop.cc                            | 678
 gcc/tree-vectorizer.h                            |  15
 9 files changed, 1062 insertions(+), 14 deletions(-)
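The patch teaches the loop vectorizer to handle non-linear induction variables (b = -b, b *= C, b <<= C, b >>= C): a vector holding the first VF values of the IV is built in the loop preheader, and one combined step (for example pow (step, VF) for the multiplicative case) is applied per vector iteration. A minimal hand-written sketch of that transformation for the multiplicative loop in pr103144-mul-1.c, using the same GCC vector extensions as the run-time tests, is shown below; foo_mul_by_hand and the VF = 8 choice (-mavx2 -mprefer-vector-width=256) are illustrative assumptions, not part of the patch.

/* Hand-vectorized sketch of foo_mul for VF = 8.  The vectorizer builds
   vec_init = { b, b*3, ..., b*3^7 } in the preheader and multiplies by
   vec_step = pow (3, VF) = 6561 once per vector iteration.  The IV is kept
   in an unsigned vector type, mirroring the patch's use of an unsigned
   multiply for vect_step_op_mul to avoid signed-overflow UB.  */
typedef unsigned int v8usi __attribute__ ((vector_size (32)));

void
foo_mul_by_hand (int *a, int b)
{
  v8usi vec_iv = (v8usi) { (unsigned) b, b * 3u, b * 9u, b * 27u,
                           b * 81u, b * 243u, b * 729u, b * 2187u };
  for (int i = 0; i != 10000; i += 8)
    {
      __builtin_memcpy (a + i, &vec_iv, 32);   /* store 8 lanes of the IV */
      vec_iv *= 6561u;                         /* vec_step = 3^VF */
    }
}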
diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c new file mode 100644 index 0000000..640c34f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */ + +#define N 10000 + +void +__attribute__((noipa)) +foo_mul (int* a, int b) +{ + for (int i = 0; i != N; i++) + { + a[i] = b; + b *= 3; + } +} + +void +__attribute__((noipa)) +foo_mul_const (int* a) +{ + int b = 1; + for (int i = 0; i != N; i++) + { + a[i] = b; + b *= 3; + } +} + +void +__attribute__((noipa)) +foo_mul_peel (int* a, int b) +{ + for (int i = 0; i != 39; i++) + { + a[i] = b; + b *= 3; + } +} + +void +__attribute__((noipa)) +foo_mul_peel_const (int* a) +{ + int b = 1; + for (int i = 0; i != 39; i++) + { + a[i] = b; + b *= 3; + } +} diff --git a/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c new file mode 100644 index 0000000..39fdea3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103144-mul-2.c @@ -0,0 +1,51 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx2 } */ + +#include "avx2-check.h" +#include <string.h> +#include "pr103144-mul-1.c" + +typedef int v8si __attribute__((vector_size(32))); + +void +avx2_test (void) +{ + int* epi32_exp = (int*) malloc (N * sizeof (int)); + int* epi32_dst = (int*) malloc (N * sizeof (int)); + + __builtin_memset (epi32_exp, 0, N * sizeof (int)); + int b = 8; + v8si init = __extension__(v8si) { b, b * 3, b * 9, b * 27, b * 81, b * 243, b * 729, b * 2187 }; + + for (int i = 0; i != N / 8; i++) + { + memcpy (epi32_exp + i * 8, &init, 32); + init *= 6561; + } + + foo_mul (epi32_dst, b); + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + foo_mul_peel (epi32_dst, b); + if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0) + __builtin_abort (); + + init = __extension__(v8si) { 1, 3, 9, 27, 81, 243, 729, 2187 }; + for (int i = 0; i != N / 8; i++) + { + memcpy (epi32_exp + i * 8, &init, 32); + init *= 6561; + } + + foo_mul_const (epi32_dst); + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + foo_mul_peel_const (epi32_dst); + if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * 4) != 0) + __builtin_abort (); + + return; +} diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c new file mode 100644 index 0000000..f87b1d6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */ + +#define N 10000 + +void +__attribute__((noipa)) +foo_neg (int* a, int b) +{ + for (int i = 0; i != N; i++) + { + a[i] = b; + b = -b; + } +} + +void +__attribute__((noipa)) +foo_neg_const (int* a) +{ + int b = 1; + for (int i = 0; i != N; i++) + { + a[i] = b; + b = -b; + } +} + +void +__attribute__((noipa)) +foo_neg_peel (int* a, int b, int n) +{ + for (int i = 0; i != n; i++) + { + a[i] = b; + b = -b; + } +} + +void +__attribute__((noipa)) 
+foo_neg_const_peel (int* a, int n) +{ + int b = 1; + for (int i = 0; i != n; i++) + { + a[i] = b; + b = -b; + } +} diff --git a/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c new file mode 100644 index 0000000..bb8c22b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103144-neg-2.c @@ -0,0 +1,44 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx2 } */ + +#include "avx2-check.h" +#include <string.h> +#include "pr103144-neg-1.c" + +void +avx2_test (void) +{ + int* epi32_exp = (int*) malloc (N * sizeof (int)); + int* epi32_dst = (int*) malloc (N * sizeof (int)); + long long* epi64_exp = (long long*) malloc (N * sizeof (int)); + + __builtin_memset (epi32_exp, 0, N * sizeof (int)); + int b = 100; + + for (int i = 0; i != N / 2; i++) + epi64_exp[i] = ((long long) b) | (((long long) -b) << 32); + + memcpy (epi32_exp, epi64_exp, N * sizeof (int)); + foo_neg (epi32_dst, b); + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + foo_neg_peel (epi32_dst, b, 39); + if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0) + __builtin_abort (); + + for (int i = 0; i != N / 2; i++) + epi64_exp[i] = ((long long) 1) | (((long long) -1) << 32); + + memcpy (epi32_exp, epi64_exp, N * sizeof (int)); + foo_neg_const (epi32_dst); + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + foo_neg_const_peel (epi32_dst, 39); + if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0) + __builtin_abort (); + + return; +} diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c new file mode 100644 index 0000000..2a69203 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-1.c @@ -0,0 +1,70 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */ + +#define N 10000 +void +__attribute__((noipa)) +foo_shl (int* a, int b) +{ + for (int i = 0; i != N; i++) + { + a[i] = b; + b <<= 1; + } +} + +void +__attribute__((noipa)) +foo_ashr (int* a, int b) +{ + for (int i = 0; i != N; i++) + { + a[i] = b; + b >>= 1; + } +} + +void +__attribute__((noipa)) +foo_lshr (unsigned int* a, unsigned int b) +{ + for (int i = 0; i != N; i++) + { + a[i] = b; + b >>= 1U; + } +} + +void +__attribute__((noipa)) +foo_shl_peel (int* a, int b) +{ + for (int i = 0; i != 39; i++) + { + a[i] = b; + b <<= 1; + } +} + +void +__attribute__((noipa)) +foo_ashr_peel (int* a, int b) +{ + for (int i = 0; i != 39; i++) + { + a[i] = b; + b >>= 1; + } +} + +void +__attribute__((noipa)) +foo_lshr_peel (unsigned int* a, unsigned int b) +{ + for (int i = 0; i != 39; i++) + { + a[i] = b; + b >>= 1U; + } +} diff --git a/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c new file mode 100644 index 0000000..6f47719 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103144-shift-2.c @@ -0,0 +1,79 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx2 } */ + +#include "avx2-check.h" +#include <string.h> +#include "pr103144-shift-1.c" + +typedef int v8si __attribute__((vector_size(32))); +typedef 
unsigned int v8usi __attribute__((vector_size(32))); + +void +avx2_test (void) +{ + int* epi32_exp = (int*) malloc (N * sizeof (int)); + int* epi32_dst = (int*) malloc (N * sizeof (int)); + unsigned int* epu32_exp = (unsigned int*) malloc (N * sizeof (int)); + unsigned int* epu32_dst = (unsigned int*) malloc (N * sizeof (int)); + + __builtin_memset (epi32_exp, 0, N * sizeof (int)); + int b = 8; + v8si init = __extension__(v8si) { b, b << 1, b << 2, b << 3, b << 4, b << 5, b << 6, b << 7 }; + + for (int i = 0; i != N / 8; i++) + { + memcpy (epi32_exp + i * 8, &init, 32); + init <<= 8; + } + + foo_shl (epi32_dst, b); + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + foo_shl_peel (epi32_dst, b); + if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0) + __builtin_abort (); + + b = -11111; + init = __extension__(v8si) { b, b >> 1, b >> 2, b >> 3, b >> 4, b >> 5, b >> 6, b >> 7 }; + for (int i = 0; i != N / 8; i++) + { + memcpy (epi32_exp + i * 8, &init, 32); + init >>= 8; + } + + foo_ashr (epi32_dst, b); + if (__builtin_memcmp (epi32_dst, epi32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + foo_ashr_peel (epi32_dst, b); + if (__builtin_memcmp (epi32_dst, epi32_exp, 39 * sizeof (int)) != 0) + { + for (int i = 0; i != 39; i++) + { + printf ("epi32_dst[%d] is %d ----", i, epi32_dst[i]); + printf ("epi32_exp[%d] is %d\n", i, epi32_exp[i]); + } + __builtin_abort (); + } + + __builtin_memset (epu32_exp, 0, N * sizeof (int)); + unsigned int c = 11111111; + v8usi initu = __extension__(v8usi) { c, c >> 1U, c >> 2U, c >> 3U, c >> 4U, c >> 5U, c >> 6U, c >> 7U }; + for (int i = 0; i != N / 8; i++) + { + memcpy (epu32_exp + i * 8, &initu, 32); + initu >>= 8U; + } + + foo_lshr (epu32_dst, c); + if (__builtin_memcmp (epu32_dst, epu32_exp, N * sizeof (int)) != 0) + __builtin_abort (); + + foo_lshr_peel (epu32_dst, c); + if (__builtin_memcmp (epu32_dst, epu32_exp, 39 * sizeof (int)) != 0) + __builtin_abort (); + + return; +} diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index b68e6cd..74b221a 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -1560,15 +1560,28 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, gcc_assert (!tree_is_chrec (step_expr)); init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop)); + gimple_seq stmts = NULL; + enum vect_induction_op_type induction_type + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info); - off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), - fold_convert (TREE_TYPE (step_expr), niters), - step_expr); - if (POINTER_TYPE_P (type)) - ni = fold_build_pointer_plus (init_expr, off); + if (induction_type == vect_step_op_add) + { + off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), + fold_convert (TREE_TYPE (step_expr), niters), + step_expr); + if (POINTER_TYPE_P (type)) + ni = fold_build_pointer_plus (init_expr, off); + else + ni = fold_build2 (PLUS_EXPR, type, + init_expr, fold_convert (type, off)); + } + /* Don't bother call vect_peel_nonlinear_iv_init. */ + else if (induction_type == vect_step_op_neg) + ni = init_expr; else - ni = fold_build2 (PLUS_EXPR, type, - init_expr, fold_convert (type, off)); + ni = vect_peel_nonlinear_iv_init (&stmts, init_expr, + niters, step_expr, + induction_type); var = create_tmp_var (type, "tmp"); @@ -1577,9 +1590,15 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, ni_name = force_gimple_operand (ni, &new_stmts, false, var); /* Exit_bb shouldn't be empty. 
*/ if (!gsi_end_p (last_gsi)) - gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT); + { + gsi_insert_seq_after (&last_gsi, stmts, GSI_SAME_STMT); + gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT); + } else - gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT); + { + gsi_insert_seq_before (&last_gsi, stmts, GSI_SAME_STMT); + gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT); + } /* Fix phi expressions in the successor bb. */ adjust_phi_and_debug_stmts (phi1, update_e, ni_name); diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 24556b5..8f88f17 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -425,6 +425,77 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init, return true; } +/* Function vect_is_nonlinear_iv_evolution + + Only support nonlinear induction for integer type + 1. neg + 2. mul by constant + 3. lshift/rshift by constant. + + For neg induction, return a fake step as integer -1. */ +static bool +vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info, + gphi* loop_phi_node, tree *init, tree *step) +{ + tree init_expr, ev_expr, result, op1, op2; + gimple* def; + + if (gimple_phi_num_args (loop_phi_node) != 2) + return false; + + init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop)); + ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop)); + + /* Support nonlinear induction only for integer type. */ + if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr))) + return false; + + *init = init_expr; + result = PHI_RESULT (loop_phi_node); + + if (TREE_CODE (ev_expr) != SSA_NAME + || ((def = SSA_NAME_DEF_STMT (ev_expr)), false) + || !is_gimple_assign (def)) + return false; + + enum tree_code t_code = gimple_assign_rhs_code (def); + switch (t_code) + { + case NEGATE_EXPR: + if (gimple_assign_rhs1 (def) != result) + return false; + *step = build_int_cst (TREE_TYPE (init_expr), -1); + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg; + break; + + case RSHIFT_EXPR: + case LSHIFT_EXPR: + case MULT_EXPR: + op1 = gimple_assign_rhs1 (def); + op2 = gimple_assign_rhs2 (def); + if (TREE_CODE (op2) != INTEGER_CST + || op1 != result) + return false; + *step = op2; + if (t_code == LSHIFT_EXPR) + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl; + else if (t_code == RSHIFT_EXPR) + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr; + /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */ + else + STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul; + break; + + default: + return false; + } + + STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init; + STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step; + + return true; +} + /* Return true if PHI, described by STMT_INFO, is the inner PHI in what we are assuming is a double reduction. 
For example, given a structure like this: @@ -513,11 +584,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop, = evolution_part_in_loop_num (access_fn, loop->num); } - if (!access_fn - || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi) - || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step) - || (LOOP_VINFO_LOOP (loop_vinfo) != loop - && TREE_CODE (step) != INTEGER_CST)) + if ((!access_fn + || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi) + || !vect_is_simple_iv_evolution (loop->num, access_fn, + &init, &step) + || (LOOP_VINFO_LOOP (loop_vinfo) != loop + && TREE_CODE (step) != INTEGER_CST)) + /* Only handle nonlinear iv for same loop. */ + && (LOOP_VINFO_LOOP (loop_vinfo) != loop + || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, + phi, &init, &step))) { worklist.safe_push (stmt_vinfo); continue; @@ -8233,6 +8309,591 @@ vect_can_vectorize_without_simd_p (code_helper code) && vect_can_vectorize_without_simd_p (tree_code (code))); } +/* Create vector init for vectorized iv. */ +static tree +vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr, + tree step_expr, poly_uint64 nunits, + tree vectype, + enum vect_induction_op_type induction_type) +{ + unsigned HOST_WIDE_INT const_nunits; + tree vec_shift, vec_init, new_name; + unsigned i; + tree itype = TREE_TYPE (vectype); + + /* iv_loop is the loop to be vectorized. Create: + vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */ + new_name = gimple_convert (stmts, itype, init_expr); + switch (induction_type) + { + case vect_step_op_shr: + case vect_step_op_shl: + /* Build the Initial value from shift_expr. */ + vec_init = gimple_build_vector_from_val (stmts, + vectype, + new_name); + vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype, + build_zero_cst (itype), step_expr); + vec_init = gimple_build (stmts, + (induction_type == vect_step_op_shr + ? RSHIFT_EXPR : LSHIFT_EXPR), + vectype, vec_init, vec_shift); + break; + + case vect_step_op_neg: + { + vec_init = gimple_build_vector_from_val (stmts, + vectype, + new_name); + tree vec_neg = gimple_build (stmts, NEGATE_EXPR, + vectype, vec_init); + /* The encoding has 2 interleaved stepped patterns. */ + vec_perm_builder sel (nunits, 2, 3); + sel.quick_grow (6); + for (i = 0; i < 3; i++) + { + sel[2 * i] = i; + sel[2 * i + 1] = i + nunits; + } + vec_perm_indices indices (sel, 2, nunits); + tree perm_mask_even + = vect_gen_perm_mask_checked (vectype, indices); + vec_init = gimple_build (stmts, VEC_PERM_EXPR, + vectype, + vec_init, vec_neg, + perm_mask_even); + } + break; + + case vect_step_op_mul: + { + /* Use unsigned mult to avoid UD integer overflow. */ + gcc_assert (nunits.is_constant (&const_nunits)); + tree utype = unsigned_type_for (itype); + tree uvectype = build_vector_type (utype, + TYPE_VECTOR_SUBPARTS (vectype)); + new_name = gimple_convert (stmts, utype, new_name); + vec_init = gimple_build_vector_from_val (stmts, + uvectype, + new_name); + tree_vector_builder elts (uvectype, const_nunits, 1); + tree elt_step = build_one_cst (utype); + + elts.quick_push (elt_step); + for (i = 1; i < const_nunits; i++) + { + /* Create: new_name_i = new_name + step_expr. */ + elt_step = gimple_build (stmts, MULT_EXPR, + utype, elt_step, step_expr); + elts.quick_push (elt_step); + } + /* Create a vector from [new_name_0, new_name_1, ..., + new_name_nunits-1]. 
*/ + tree vec_mul = gimple_build_vector (stmts, &elts); + vec_init = gimple_build (stmts, MULT_EXPR, uvectype, + vec_init, vec_mul); + vec_init = gimple_convert (stmts, vectype, vec_init); + } + break; + + default: + gcc_unreachable (); + } + + return vec_init; +} + +/* Peel init_expr by skip_niter for induction_type. */ +tree +vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr, + tree skip_niters, tree step_expr, + enum vect_induction_op_type induction_type) +{ + gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST); + tree type = TREE_TYPE (init_expr); + unsigned prec = TYPE_PRECISION (type); + switch (induction_type) + { + case vect_step_op_neg: + if (TREE_INT_CST_LOW (skip_niters) % 2) + init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr); + /* else no change. */ + break; + + case vect_step_op_shr: + case vect_step_op_shl: + skip_niters = gimple_convert (stmts, type, skip_niters); + step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters); + /* When shift mount >= precision, need to avoid UD. + In the original loop, there's no UD, and according to semantic, + init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */ + if (!tree_fits_uhwi_p (step_expr) + || tree_to_uhwi (step_expr) >= prec) + { + if (induction_type == vect_step_op_shl + || TYPE_UNSIGNED (type)) + init_expr = build_zero_cst (type); + else + init_expr = gimple_build (stmts, RSHIFT_EXPR, type, + init_expr, + wide_int_to_tree (type, prec - 1)); + } + else + init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr + ? RSHIFT_EXPR : LSHIFT_EXPR), + type, init_expr, step_expr); + break; + + case vect_step_op_mul: + { + tree utype = unsigned_type_for (type); + init_expr = gimple_convert (stmts, utype, init_expr); + unsigned skipn = TREE_INT_CST_LOW (skip_niters); + wide_int begin = wi::to_wide (step_expr); + for (unsigned i = 0; i != skipn - 1; i++) + begin = wi::mul (begin, wi::to_wide (step_expr)); + tree mult_expr = wide_int_to_tree (utype, begin); + init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr); + init_expr = gimple_convert (stmts, type, init_expr); + } + break; + + default: + gcc_unreachable (); + } + + return init_expr; +} + +/* Create vector step for vectorized iv. */ +static tree +vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr, + poly_uint64 vf, + enum vect_induction_op_type induction_type) +{ + tree expr = build_int_cst (TREE_TYPE (step_expr), vf); + tree new_name = NULL; + /* Step should be pow (step, vf) for mult induction. */ + if (induction_type == vect_step_op_mul) + { + gcc_assert (vf.is_constant ()); + wide_int begin = wi::to_wide (step_expr); + + for (unsigned i = 0; i != vf.to_constant () - 1; i++) + begin = wi::mul (begin, wi::to_wide (step_expr)); + + new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin); + } + else if (induction_type == vect_step_op_neg) + /* Do nothing. */ + ; + else + new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr), + expr, step_expr); + return new_name; +} + +static tree +vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo, + stmt_vec_info stmt_info, + tree new_name, tree vectype, + enum vect_induction_op_type induction_type) +{ + /* No step is needed for neg induction. 
*/ + if (induction_type == vect_step_op_neg) + return NULL; + + tree t = unshare_expr (new_name); + gcc_assert (CONSTANT_CLASS_P (new_name) + || TREE_CODE (new_name) == SSA_NAME); + tree new_vec = build_vector_from_val (vectype, t); + tree vec_step = vect_init_vector (loop_vinfo, stmt_info, + new_vec, vectype, NULL); + return vec_step; +} + +/* Update vectorized iv with vect_step, induc_def is init. */ +static tree +vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype, + tree induc_def, tree vec_step, + enum vect_induction_op_type induction_type) +{ + tree vec_def = induc_def; + switch (induction_type) + { + case vect_step_op_mul: + { + /* Use unsigned mult to avoid UD integer overflow. */ + tree uvectype + = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)), + TYPE_VECTOR_SUBPARTS (vectype)); + vec_def = gimple_convert (stmts, uvectype, vec_def); + vec_step = gimple_convert (stmts, uvectype, vec_step); + vec_def = gimple_build (stmts, MULT_EXPR, uvectype, + vec_def, vec_step); + vec_def = gimple_convert (stmts, vectype, vec_def); + } + break; + + case vect_step_op_shr: + vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype, + vec_def, vec_step); + break; + + case vect_step_op_shl: + vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype, + vec_def, vec_step); + break; + case vect_step_op_neg: + vec_def = induc_def; + /* Do nothing. */ + break; + default: + gcc_unreachable (); + } + + return vec_def; + +} +/* Function vectorizable_induction + + Check if STMT_INFO performs an nonlinear induction computation that can be + vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create + a vectorized phi to replace it, put it in VEC_STMT, and add it to the same + basic block. + Return true if STMT_INFO is vectorizable in this way. */ + +static bool +vectorizable_nonlinear_induction (loop_vec_info loop_vinfo, + stmt_vec_info stmt_info, + gimple **vec_stmt, slp_tree slp_node, + stmt_vector_for_cost *cost_vec) +{ + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + unsigned ncopies; + bool nested_in_vect_loop = false; + class loop *iv_loop; + tree vec_def; + edge pe = loop_preheader_edge (loop); + basic_block new_bb; + tree vec_init, vec_step; + tree new_name; + gimple *new_stmt; + gphi *induction_phi; + tree induc_def, vec_dest; + tree init_expr, step_expr; + tree niters_skip; + poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); + unsigned i; + gimple_stmt_iterator si; + + gphi *phi = dyn_cast <gphi *> (stmt_info->stmt); + + tree vectype = STMT_VINFO_VECTYPE (stmt_info); + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); + enum vect_induction_op_type induction_type + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); + + gcc_assert (induction_type > vect_step_op_add); + + if (slp_node) + ncopies = 1; + else + ncopies = vect_get_num_copies (loop_vinfo, vectype); + gcc_assert (ncopies >= 1); + + /* FORNOW. Only handle nonlinear induction in the same loop. */ + if (nested_in_vect_loop_p (loop, stmt_info)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "nonlinear induction in nested loop.\n"); + return false; + } + + iv_loop = loop; + gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); + + /* TODO: Support slp for nonlinear iv. There should be separate vector iv + update for each iv and a permutation to generate wanted vector iv. 
*/ + if (slp_node) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "SLP induction not supported for nonlinear" + " induction.\n"); + return false; + } + + /* Init_expr will be update by vect_update_ivs_after_vectorizer, + if niters is unkown: + For shift, when shift mount >= precision, there would be UD. + For mult, don't known how to generate + init_expr * pow (step, niters) for variable niters. + For neg, it should be ok, since niters of vectorized main loop + will always be multiple of 2. */ + if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && induction_type != vect_step_op_neg) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Peeling for epilogue is not supported" + " for nonlinear induction except neg" + " when iteration count is unknown.\n"); + return false; + } + + /* Also doens't support peel for neg when niter is variable. + ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */ + niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); + if (niters_skip != NULL_TREE + && TREE_CODE (niters_skip) != INTEGER_CST) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Peeling for alignement is not supported" + " for nonlinear induction when niters_skip" + " is not constant.\n"); + return false; + } + + if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && induction_type == vect_step_op_mul) + if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "floating point nonlinear induction vectorization" + " not supported.\n"); + return false; + } + + step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); + init_expr = vect_phi_initial_value (phi); + gcc_assert (step_expr != NULL_TREE && init_expr != NULL + && TREE_CODE (step_expr) == INTEGER_CST); + /* step_expr should be aligned with init_expr, + .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */ + step_expr = fold_convert (TREE_TYPE (vectype), step_expr); + + if (TREE_CODE (init_expr) == INTEGER_CST) + init_expr = fold_convert (TREE_TYPE (vectype), init_expr); + else + gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype), + TREE_TYPE (init_expr))); + + switch (induction_type) + { + case vect_step_op_neg: + if (TREE_CODE (init_expr) != INTEGER_CST + && TREE_CODE (init_expr) != REAL_CST) + { + /* Check for backend support of NEGATE_EXPR and vec_perm. */ + if (!directly_supported_p (NEGATE_EXPR, vectype)) + return false; + + /* The encoding has 2 interleaved stepped patterns. */ + vec_perm_builder sel (nunits, 2, 3); + machine_mode mode = TYPE_MODE (vectype); + sel.quick_grow (6); + for (i = 0; i < 3; i++) + { + sel[i * 2] = i; + sel[i * 2 + 1] = i + nunits; + } + vec_perm_indices indices (sel, 2, nunits); + if (!can_vec_perm_const_p (mode, mode, indices)) + return false; + } + break; + + case vect_step_op_mul: + { + /* Check for backend support of MULT_EXPR. */ + if (!directly_supported_p (MULT_EXPR, vectype)) + return false; + + /* ?? How to construct vector step for variable number vector. + [ 1, step, pow (step, 2), pow (step, 4), .. ]. */ + if (!vf.is_constant ()) + return false; + } + break; + + case vect_step_op_shr: + /* Check for backend support of RSHIFT_EXPR. */ + if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector)) + return false; + + /* Don't shift more than type precision to avoid UD. 
*/ + if (!tree_fits_uhwi_p (step_expr) + || maybe_ge (nunits * tree_to_uhwi (step_expr), + TYPE_PRECISION (TREE_TYPE (init_expr)))) + return false; + break; + + case vect_step_op_shl: + /* Check for backend support of RSHIFT_EXPR. */ + if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector)) + return false; + + /* Don't shift more than type precision to avoid UD. */ + if (!tree_fits_uhwi_p (step_expr) + || maybe_ge (nunits * tree_to_uhwi (step_expr), + TYPE_PRECISION (TREE_TYPE (init_expr)))) + return false; + + break; + + default: + gcc_unreachable (); + } + + if (!vec_stmt) /* transformation not required. */ + { + unsigned inside_cost = 0, prologue_cost = 0; + /* loop cost for vec_loop. Neg induction doesn't have any + inside_cost. */ + inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt, + stmt_info, 0, vect_body); + + /* loop cost for vec_loop. Neg induction doesn't have any + inside_cost. */ + if (induction_type == vect_step_op_neg) + inside_cost = 0; + + /* prologue cost for vec_init and vec_step. */ + prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec, + stmt_info, 0, vect_prologue); + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "vect_model_induction_cost: inside_cost = %d, " + "prologue_cost = %d. \n", inside_cost, + prologue_cost); + + STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; + DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction"); + return true; + } + + /* Transform. */ + + /* Compute a vector variable, initialized with the first VF values of + the induction variable. E.g., for an iv with IV_PHI='X' and + evolution S, for a vector of 4 units, we want to compute: + [X, X + S, X + 2*S, X + 3*S]. */ + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n"); + + pe = loop_preheader_edge (iv_loop); + /* Find the first insertion point in the BB. */ + basic_block bb = gimple_bb (phi); + si = gsi_after_labels (bb); + + gimple_seq stmts = NULL; + + /* If we are using the loop mask to "peel" for alignment then we need + to adjust the start value here. */ + if (niters_skip != NULL_TREE) + init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip, + step_expr, induction_type); + + vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr, + step_expr, nunits, vectype, + induction_type); + if (stmts) + { + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); + gcc_assert (!new_bb); + } + + stmts = NULL; + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr, + vf, induction_type); + if (stmts) + { + new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); + gcc_assert (!new_bb); + } + + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info, + new_name, vectype, + induction_type); + /* Create the following def-use cycle: + loop prolog: + vec_init = ... + vec_step = ... + loop: + vec_iv = PHI <vec_init, vec_loop> + ... + STMT + ... + vec_loop = vec_iv + vec_step; */ + + /* Create the induction-phi that defines the induction-operand. */ + vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); + induction_phi = create_phi_node (vec_dest, iv_loop->header); + induc_def = PHI_RESULT (induction_phi); + + /* Create the iv update inside the loop. 
*/ + stmts = NULL; + vec_def = vect_update_nonlinear_iv (&stmts, vectype, + induc_def, vec_step, + induction_type); + + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); + new_stmt = SSA_NAME_DEF_STMT (vec_def); + + /* Set the arguments of the phi node: */ + add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); + add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), + UNKNOWN_LOCATION); + + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi); + *vec_stmt = induction_phi; + + /* In case that vectorization factor (VF) is bigger than the number + of elements that we can fit in a vectype (nunits), we have to generate + more than one vector stmt - i.e - we need to "unroll" the + vector stmt by a factor VF/nunits. For more details see documentation + in vectorizable_operation. */ + + if (ncopies > 1) + { + stmts = NULL; + /* FORNOW. This restriction should be relaxed. */ + gcc_assert (!nested_in_vect_loop); + + new_name = vect_create_nonlinear_iv_step (&stmts, step_expr, + nunits, induction_type); + + vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info, + new_name, vectype, + induction_type); + vec_def = induc_def; + for (i = 1; i < ncopies; i++) + { + /* vec_i = vec_prev + vec_step. */ + stmts = NULL; + vec_def = vect_update_nonlinear_iv (&stmts, vectype, + vec_def, vec_step, + induction_type); + gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); + new_stmt = SSA_NAME_DEF_STMT (vec_def); + STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt); + } + } + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "transform induction: created def-use cycle: %G%G", + (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def)); + + return true; +} + /* Function vectorizable_induction Check if STMT_INFO performs an induction computation that can be vectorized. @@ -8263,6 +8924,8 @@ vectorizable_induction (loop_vec_info loop_vinfo, unsigned i; tree expr; gimple_stmt_iterator si; + enum vect_induction_op_type induction_type + = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info); gphi *phi = dyn_cast <gphi *> (stmt_info->stmt); if (!phi) @@ -8275,6 +8938,11 @@ vectorizable_induction (loop_vec_info loop_vinfo, if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) return false; + /* Handle nonlinear induction in a separate place. */ + if (induction_type != vect_step_op_add) + return vectorizable_nonlinear_induction (loop_vinfo, stmt_info, + vec_stmt, slp_node, cost_vec); + tree vectype = STMT_VINFO_VECTYPE (stmt_info); poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index a2b0afb..5e75ed1 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -68,6 +68,15 @@ enum vect_def_type { vect_unknown_def_type }; +/* Define operation type of linear/non-linear induction variable. */ +enum vect_induction_op_type { + vect_step_op_add = 0, + vect_step_op_neg, + vect_step_op_mul, + vect_step_op_shl, + vect_step_op_shr +}; + /* Define type of reduction. */ enum vect_reduction_type { TREE_CODE_REDUCTION, @@ -1190,6 +1199,7 @@ public: the version here. */ tree loop_phi_evolution_base_unchanged; tree loop_phi_evolution_part; + enum vect_induction_op_type loop_phi_evolution_type; /* Used for various bookkeeping purposes, generally holding a pointer to some other stmt S that is in some way "related" to this stmt. 
@@ -1423,6 +1433,7 @@ struct gather_scatter_info { ((S)->dr_aux.dr && DR_GROUP_FIRST_ELEMENT(S)) #define STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED(S) (S)->loop_phi_evolution_base_unchanged #define STMT_VINFO_LOOP_PHI_EVOLUTION_PART(S) (S)->loop_phi_evolution_part +#define STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE(S) (S)->loop_phi_evolution_type #define STMT_VINFO_MIN_NEG_DIST(S) (S)->min_neg_dist #define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type #define STMT_VINFO_REDUC_CODE(S) (S)->reduc_code @@ -2329,6 +2340,10 @@ extern int vect_get_known_peeling_cost (loop_vec_info, int, int *, stmt_vector_for_cost *); extern tree cse_and_gimplify_to_preheader (loop_vec_info, tree); +/* Nonlinear induction. */ +extern tree vect_peel_nonlinear_iv_init (gimple_seq*, tree, tree, + tree, enum vect_induction_op_type); + /* In tree-vect-slp.cc. */ extern void vect_slp_init (void); extern void vect_slp_fini (void); |
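For reference, vect_peel_nonlinear_iv_init (declared above and used by vect_update_ivs_after_vectorizer) computes the value the scalar IV would have after skipping a known, constant number of iterations. The following is a hedged scalar model of that computation, assuming a 32-bit signed int IV; the enum and function name are illustrative only and do not appear in the patch.

enum iv_kind { IV_NEG, IV_MUL, IV_SHL, IV_SHR };

/* Value of the induction variable INIT with constant step STEP after SKIP
   scalar iterations, for a 32-bit signed int IV.  */
static int
peeled_iv_value (int init, int step, unsigned skip, enum iv_kind kind)
{
  switch (kind)
    {
    case IV_NEG:
      /* Negation flips the sign each iteration: negate only if SKIP is odd.  */
      return (skip & 1) ? -init : init;
    case IV_MUL:
      {
        /* init * step^skip, done in unsigned arithmetic (as in the patch)
           so the repeated multiply cannot introduce signed-overflow UB.  */
        unsigned v = (unsigned) init;
        for (unsigned i = 0; i < skip; i++)
          v *= (unsigned) step;
        return (int) v;
      }
    case IV_SHL:
      /* Total shift of precision or more would be undefined; the original
         scalar loop would have produced 0 by then.  */
      return ((unsigned) step * skip >= 32u) ? 0
                                             : init << ((unsigned) step * skip);
    case IV_SHR:
      /* Arithmetic right shift saturates at a shift of precision - 1.  */
      return ((unsigned) step * skip >= 32u) ? (init >> 31)
                                             : init >> ((unsigned) step * skip);
    }
  return init;
}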