author    Richard Biener <rguenther@suse.de>  2023-07-12 15:01:47 +0200
committer Richard Biener <rguenther@suse.de>  2023-08-22 11:32:50 +0200
commit    27de9aa152141e7f3ee66372647d0f2cd94c4b90 (patch)
tree      22a249ae4c755579a1866f56f3549bcf06c39879 /gcc/match.pd
parent    d3b5a1bccc219680dc19281b6fd6cc798bb679eb (diff)
tree-optimization/94864 - vector insert of vector extract simplification
The PRs ask for optimizing

  _1 = BIT_FIELD_REF <b_3(D), 64, 64>;
  result_4 = BIT_INSERT_EXPR <a_2(D), _1, 64>;

to a vector permutation.  The following implements this as a match.pd
pattern, improving code generation on x86_64.

On the RTL level we face the issue that backend patterns inconsistently
use vec_merge and vec_select of vec_concat to represent permutes.

I think using a (supported) permute is almost always better than an
extract plus insert, maybe excluding the case where we extract element
zero and that's aliased to a register that can be used directly for
insertion (not sure how to query that).

The patch FAILs one case in gcc.target/i386/avx512fp16-vmovsh-1a.c
where we now expand from

  __A_28 = VEC_PERM_EXPR <x2.8_9, x1.9_10, { 0, 9, 10, 11, 12, 13, 14, 15 }>;

instead of

  _28 = BIT_FIELD_REF <x2.8_9, 16, 0>;
  __A_29 = BIT_INSERT_EXPR <x1.9_10, _28, 0>;

producing a vpblendw instruction instead of the expected vmovsh.  That's
either a missed vec_perm_const expansion optimization or, even better, an
improvement - Zen4 for example has 4 ports to execute vpblendw but only
3 for executing vmovsh, and both instructions have the same size.  The
patch XFAILs the sub-testcase.

	PR tree-optimization/94864
	PR tree-optimization/94865
	PR tree-optimization/93080
	* match.pd (bit_insert @0 (BIT_FIELD_REF @1 ..) ..): New pattern
	for vector insertion from vector extraction.
	* gcc.target/i386/pr94864.c: New testcase.
	* gcc.target/i386/pr94865.c: Likewise.
	* gcc.target/i386/avx512fp16-vmovsh-1a.c: XFAIL.
	* gcc.dg/tree-ssa/forwprop-40.c: Likewise.
	* gcc.dg/tree-ssa/forwprop-41.c: Likewise.
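For illustration (not the committed testcase; the function and type
names are made up), here is a small C function in the spirit of the
PR 94864 case where an insert of a vector extract can now become a
single permute:

  /* Illustrative only: insert of a vector extract that the new
     match.pd pattern can turn into a VEC_PERM_EXPR.
     Compile with e.g. gcc -O2 on x86_64.  */
  typedef double v2df __attribute__((vector_size (16)));

  v2df
  insert_high (v2df a, v2df b)
  {
    /* Gimplifies to BIT_FIELD_REF <b, 64, 64> followed by
       BIT_INSERT_EXPR <a, _1, 0>.  */
    a[0] = b[1];
    return a;
  }

With the pattern applied, the insert should instead appear as
VEC_PERM_EXPR <a, b, { 3, 1 }>: lane 0 takes element 1 of the second
operand (index nunits + relt = 2 + 1), lane 1 keeps the first operand.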
Diffstat (limited to 'gcc/match.pd')
-rw-r--r--  gcc/match.pd | 25
1 file changed, 25 insertions(+), 0 deletions(-)
diff --git a/gcc/match.pd b/gcc/match.pd
index 86fdc60..6e08302 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8006,6 +8006,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
wi::to_wide (@ipos) + isize))
(BIT_FIELD_REF @0 @rsize @rpos)))))
+/* Simplify vector inserts of other vector extracts to a permute. */
+(simplify
+ (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
+ (if (VECTOR_TYPE_P (type)
+ && types_match (@0, @1)
+ && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
+ && TYPE_VECTOR_SUBPARTS (type).is_constant ())
+ (with
+ {
+ unsigned HOST_WIDE_INT elsz
+ = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
+ poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
+ poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
+ unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
+ vec_perm_builder builder;
+ builder.new_vector (nunits, nunits, 1);
+ for (unsigned i = 0; i < nunits; ++i)
+ builder.quick_push (known_eq (ielt, i) ? nunits + relt : i);
+ vec_perm_indices sel (builder, 2, nunits);
+ }
+ (if (!VECTOR_MODE_P (TYPE_MODE (type))
+ || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
+ (vec_perm @0 @1 { vec_perm_indices_to_tree
+ (build_vector_type (ssizetype, nunits), sel); })))))
+
(if (canonicalize_math_after_vectorization_p ())
(for fmas (FMA)
(simplify
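
As a quick check of the selector construction in the new pattern, here
is a standalone sketch (not GCC internals; vec_perm_builder is replaced
by a plain array): lane ielt of the result takes lane relt of the second
permute operand, which is addressed by indices nunits..2*nunits-1, and
every other lane keeps the first operand.

  #include <stdio.h>

  /* Standalone sketch of the permute selector the pattern builds.  */
  static void
  build_selector (unsigned nunits, unsigned relt, unsigned ielt,
		  unsigned *sel)
  {
    for (unsigned i = 0; i < nunits; ++i)
      sel[i] = (i == ielt) ? nunits + relt : i;
  }

  int
  main (void)
  {
    unsigned sel[2];
    build_selector (2, 1, 0, sel);  /* v2df: a[0] = b[1] -> { 3, 1 }  */
    printf ("{ %u, %u }\n", sel[0], sel[1]);
    return 0;
  }

The pattern only emits the vec_perm if can_vec_perm_const_p confirms the
target supports that constant permute (or the type has no vector mode),
which is what keeps the transform from pessimizing targets without a
suitable permute instruction.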