diff options
| author | Richard Biener <rguenther@suse.de> | 2023-02-10 13:09:10 +0100 |
|---|---|---|
| committer | Richard Biener <rguenther@suse.de> | 2023-04-28 11:05:53 +0200 |
| commit | 821ef93976e750c118d42a2ad33b96dbd1b9f3a5 (patch) | |
| tree | 7b6c9222dd6d0f2177388c680e3a36998c076055 | |
| parent | 9a41d2cdbcd2af77a3a91a840a3a13f0eb39971b (diff) | |
| download | gcc-821ef93976e750c118d42a2ad33b96dbd1b9f3a5.zip gcc-821ef93976e750c118d42a2ad33b96dbd1b9f3a5.tar.gz gcc-821ef93976e750c118d42a2ad33b96dbd1b9f3a5.tar.bz2 | |
tree-optimization/108752 - vectorize emulated vectors in lowered form
The following makes sure to emit operations lowered to bit operations
when vectorizing using emulated vectors. This avoids relying on
the vector lowering pass adhering to the exact same cost considerations
as the vectorizer.
PR tree-optimization/108752
* tree-vect-generic.cc (build_replicated_const): Rename
to build_replicated_int_cst and move to tree.{h,cc}.
(do_plus_minus): Adjust.
(do_negate): Likewise.
* tree-vect-stmts.cc (vectorizable_operation): Emit emulated
arithmetic vector operations in lowered form.
* tree.h (build_replicated_int_cst): Declare.
* tree.cc (build_replicated_int_cst): Moved from
tree-vect-generic.cc build_replicated_const.
| -rw-r--r-- | gcc/tree-vect-generic.cc | 37 |
| -rw-r--r-- | gcc/tree-vect-stmts.cc | 106 |
| -rw-r--r-- | gcc/tree.cc | 30 |
| -rw-r--r-- | gcc/tree.h | 1 |

4 files changed, 125 insertions, 49 deletions
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index 445da53..59115b2 100644 --- a/gcc/tree-vect-generic.cc +++ b/gcc/tree-vect-generic.cc @@ -103,35 +103,6 @@ subparts_gt (tree type1, tree type2) return known_gt (n1, n2); } -/* Build a constant of type TYPE, made of VALUE's bits replicated - every WIDTH bits to fit TYPE's precision. */ -static tree -build_replicated_const (tree type, unsigned int width, HOST_WIDE_INT value) -{ - int n = (TYPE_PRECISION (type) + HOST_BITS_PER_WIDE_INT - 1) - / HOST_BITS_PER_WIDE_INT; - unsigned HOST_WIDE_INT low, mask; - HOST_WIDE_INT a[WIDE_INT_MAX_ELTS]; - int i; - - gcc_assert (n && n <= WIDE_INT_MAX_ELTS); - - if (width == HOST_BITS_PER_WIDE_INT) - low = value; - else - { - mask = ((HOST_WIDE_INT)1 << width) - 1; - low = (unsigned HOST_WIDE_INT) ~0 / mask * (value & mask); - } - - for (i = 0; i < n; i++) - a[i] = low; - - gcc_assert (TYPE_PRECISION (type) <= MAX_BITSIZE_MODE_ANY_INT); - return wide_int_to_tree - (type, wide_int::from_array (a, n, TYPE_PRECISION (type))); -} - static GTY(()) tree vector_inner_type; static GTY(()) tree vector_last_type; static GTY(()) int vector_last_nunits; @@ -255,8 +226,8 @@ do_plus_minus (gimple_stmt_iterator *gsi, tree word_type, tree a, tree b, tree low_bits, high_bits, a_low, b_low, result_low, signs; max = GET_MODE_MASK (TYPE_MODE (inner_type)); - low_bits = build_replicated_const (word_type, width, max >> 1); - high_bits = build_replicated_const (word_type, width, max & ~(max >> 1)); + low_bits = build_replicated_int_cst (word_type, width, max >> 1); + high_bits = build_replicated_int_cst (word_type, width, max & ~(max >> 1)); a = tree_vec_extract (gsi, word_type, a, bitsize, bitpos); b = tree_vec_extract (gsi, word_type, b, bitsize, bitpos); @@ -289,8 +260,8 @@ do_negate (gimple_stmt_iterator *gsi, tree word_type, tree b, tree low_bits, high_bits, b_low, result_low, signs; max = GET_MODE_MASK (TYPE_MODE (inner_type)); - low_bits = build_replicated_const (word_type, 
width, max >> 1); - high_bits = build_replicated_const (word_type, width, max & ~(max >> 1)); + low_bits = build_replicated_int_cst (word_type, width, max >> 1); + high_bits = build_replicated_int_cst (word_type, width, max & ~(max >> 1)); b = tree_vec_extract (gsi, word_type, b, bitsize, bitpos); diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 272839a..dc2dc2c 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -6134,7 +6134,6 @@ vectorizable_shift (vec_info *vinfo, return true; } - /* Function vectorizable_operation. Check if STMT_INFO performs a binary, unary or ternary operation that can @@ -6405,20 +6404,6 @@ vectorizable_operation (vec_info *vinfo, return false; } - /* ??? We should instead expand the operations here, instead of - relying on vector lowering which has this hard cap on the number - of vector elements below it performs elementwise operations. */ - if (using_emulated_vectors_p - && (code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR) - && ((BITS_PER_WORD / vector_element_bits (vectype)) < 4 - || maybe_lt (nunits_out, 4U))) - { - if (dump_enabled_p ()) - dump_printf (MSG_NOTE, "not using word mode for +- and less than " - "four vector elements\n"); - return false; - } - int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info); vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL); internal_fn cond_fn = get_conditional_internal_fn (code); @@ -6581,7 +6566,96 @@ vectorizable_operation (vec_info *vinfo, vop1 = ((op_type == binary_op || op_type == ternary_op) ? vec_oprnds1[i] : NULL_TREE); vop2 = ((op_type == ternary_op) ? vec_oprnds2[i] : NULL_TREE); - if (masked_loop_p && mask_out_inactive) + if (using_emulated_vectors_p + && (code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)) + { + /* Lower the operation. This follows vector lowering. 
*/ + unsigned int width = vector_element_bits (vectype); + tree inner_type = TREE_TYPE (vectype); + tree word_type + = build_nonstandard_integer_type (GET_MODE_BITSIZE (word_mode), 1); + HOST_WIDE_INT max = GET_MODE_MASK (TYPE_MODE (inner_type)); + tree low_bits = build_replicated_int_cst (word_type, width, max >> 1); + tree high_bits + = build_replicated_int_cst (word_type, width, max & ~(max >> 1)); + tree wvop0 = make_ssa_name (word_type); + new_stmt = gimple_build_assign (wvop0, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + word_type, vop0)); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + tree result_low, signs; + if (code == PLUS_EXPR || code == MINUS_EXPR) + { + tree wvop1 = make_ssa_name (word_type); + new_stmt = gimple_build_assign (wvop1, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + word_type, vop1)); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + signs = make_ssa_name (word_type); + new_stmt = gimple_build_assign (signs, + BIT_XOR_EXPR, wvop0, wvop1); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + tree b_low = make_ssa_name (word_type); + new_stmt = gimple_build_assign (b_low, + BIT_AND_EXPR, wvop1, low_bits); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + tree a_low = make_ssa_name (word_type); + if (code == PLUS_EXPR) + new_stmt = gimple_build_assign (a_low, + BIT_AND_EXPR, wvop0, low_bits); + else + new_stmt = gimple_build_assign (a_low, + BIT_IOR_EXPR, wvop0, high_bits); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + if (code == MINUS_EXPR) + { + new_stmt = gimple_build_assign (NULL_TREE, + BIT_NOT_EXPR, signs); + signs = make_ssa_name (word_type); + gimple_assign_set_lhs (new_stmt, signs); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + } + new_stmt = gimple_build_assign (NULL_TREE, + BIT_AND_EXPR, signs, high_bits); + signs = make_ssa_name (word_type); + gimple_assign_set_lhs (new_stmt, signs); + vect_finish_stmt_generation 
(vinfo, stmt_info, new_stmt, gsi); + result_low = make_ssa_name (word_type); + new_stmt = gimple_build_assign (result_low, code, a_low, b_low); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + } + else + { + tree a_low = make_ssa_name (word_type); + new_stmt = gimple_build_assign (a_low, + BIT_AND_EXPR, wvop0, low_bits); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + signs = make_ssa_name (word_type); + new_stmt = gimple_build_assign (signs, BIT_NOT_EXPR, wvop0); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + new_stmt = gimple_build_assign (NULL_TREE, + BIT_AND_EXPR, signs, high_bits); + signs = make_ssa_name (word_type); + gimple_assign_set_lhs (new_stmt, signs); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + result_low = make_ssa_name (word_type); + new_stmt = gimple_build_assign (result_low, + MINUS_EXPR, high_bits, a_low); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + } + new_stmt = gimple_build_assign (NULL_TREE, BIT_XOR_EXPR, result_low, + signs); + result_low = make_ssa_name (word_type); + gimple_assign_set_lhs (new_stmt, result_low); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + new_stmt = gimple_build_assign (NULL_TREE, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + vectype, result_low)); + result_low = make_ssa_name (vectype); + gimple_assign_set_lhs (new_stmt, result_low); + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); + } + else if (masked_loop_p && mask_out_inactive) { tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, vectype, i); diff --git a/gcc/tree.cc b/gcc/tree.cc index ead4248..7e6de28 100644 --- a/gcc/tree.cc +++ b/gcc/tree.cc @@ -2667,6 +2667,36 @@ build_zero_cst (tree type) } } +/* Build a constant of integer type TYPE, made of VALUE's bits replicated + every WIDTH bits to fit TYPE's precision. 
*/ + +tree +build_replicated_int_cst (tree type, unsigned int width, HOST_WIDE_INT value) +{ + int n = (TYPE_PRECISION (type) + HOST_BITS_PER_WIDE_INT - 1) + / HOST_BITS_PER_WIDE_INT; + unsigned HOST_WIDE_INT low, mask; + HOST_WIDE_INT a[WIDE_INT_MAX_ELTS]; + int i; + + gcc_assert (n && n <= WIDE_INT_MAX_ELTS); + + if (width == HOST_BITS_PER_WIDE_INT) + low = value; + else + { + mask = ((HOST_WIDE_INT)1 << width) - 1; + low = (unsigned HOST_WIDE_INT) ~0 / mask * (value & mask); + } + + for (i = 0; i < n; i++) + a[i] = low; + + gcc_assert (TYPE_PRECISION (type) <= MAX_BITSIZE_MODE_ANY_INT); + return wide_int_to_tree + (type, wide_int::from_array (a, n, TYPE_PRECISION (type))); +} + /* If floating-point type TYPE has an IEEE-style sign bit, return an unsigned constant in which only the sign bit is set. Return null otherwise. */ @@ -4685,6 +4685,7 @@ extern tree build_one_cst (tree); extern tree build_minus_one_cst (tree); extern tree build_all_ones_cst (tree); extern tree build_zero_cst (tree); +extern tree build_replicated_int_cst (tree, unsigned, HOST_WIDE_INT); extern tree sign_mask_for (tree); extern tree build_string (unsigned, const char * = NULL); extern tree build_poly_int_cst (tree, const poly_wide_int_ref &); |