aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTamar Christina <tamar.christina@arm.com>2021-11-01 13:48:58 +0000
committerTamar Christina <tamar.christina@arm.com>2021-11-01 13:48:58 +0000
commit68b48f3f4c49132cc6bfb16e65f8b6fd939689c7 (patch)
tree2aa3d70a8e03bc4ab5dc8082306b6d6e3572869d
parent8a260d652c421e212818efc3c0f487cf9cdbcc2c (diff)
downloadgcc-68b48f3f4c49132cc6bfb16e65f8b6fd939689c7.zip
gcc-68b48f3f4c49132cc6bfb16e65f8b6fd939689c7.tar.gz
gcc-68b48f3f4c49132cc6bfb16e65f8b6fd939689c7.tar.bz2
middle-end: Teach CSE to be able to do vector extracts.
This patch gets CSE to re-use constants already inside a vector rather than re-materializing the constant again. Basically consider the following case: #include <stdint.h> #include <arm_neon.h> uint64_t test (uint64_t a, uint64x2_t b, uint64x2_t* rt) { uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL}; uint64_t res = a | arr[0]; uint64x2_t val = vld1q_u64 (arr); *rt = vaddq_u64 (val, b); return res; } The actual behavior is inconsequential however notice that the same constants are used in the vector (arr and later val) and in the calculation of res. The code we generate for this however is quite sub-optimal: test: adrp x2, .LC0 sub sp, sp, #16 ldr q1, [x2, #:lo12:.LC0] mov x2, 16502 movk x2, 0x1023, lsl 16 movk x2, 0x4308, lsl 32 add v1.2d, v1.2d, v0.2d movk x2, 0x942, lsl 48 orr x0, x0, x2 str q1, [x1] add sp, sp, 16 ret .LC0: .xword 667169396713799798 .xword 667169396713799798 Essentially we materialize the same constant twice. The reason for this is because the front-end lowers the constant extracted from arr[0] quite early on. If you look into the result of fre you'll find <bb 2> : arr[0] = 667169396713799798; arr[1] = 667169396713799798; res_7 = a_6(D) | 667169396713799798; _16 = __builtin_aarch64_ld1v2di (&arr); _17 = VIEW_CONVERT_EXPR<uint64x2_t>(_16); _11 = b_10(D) + _17; *rt_12(D) = _11; arr ={v} {CLOBBER}; return res_7; Which makes sense for further optimization. However come expand time if the constant isn't representable in the target arch it will be assigned to a register again. (insn 8 5 9 2 (set (reg:V2DI 99) (const_vector:V2DI [ (const_int 667169396713799798 [0x942430810234076]) repeated x2 ])) "cse.c":7:12 -1 (nil)) ... 
(insn 14 13 15 2 (set (reg:DI 103) (const_int 667169396713799798 [0x942430810234076])) "cse.c":8:12 -1 (nil)) (insn 15 14 16 2 (set (reg:DI 102 [ res ]) (ior:DI (reg/v:DI 96 [ a ]) (reg:DI 103))) "cse.c":8:12 -1 (nil)) And since it's out of the immediate range of the scalar instruction used, combine won't be able to do anything here. This will then trigger the re-materialization of the constant twice. To fix this, this patch extends CSE to be able to generate an extract for a constant from another vector, or to make a vector for a constant by duplicating another constant. Whether this transformation is done or not depends entirely on the costing for the target for the different constants and operations. I initially also investigated doing this in PRE, but PRE requires at least 2 BB to work and does not currently have any way to remove redundancies within a single BB and it did not look easy to support. gcc/ChangeLog: * cse.c (add_to_set): New. (find_sets_in_insn): Register constants in sets. (canonicalize_insn): Use auto_vec instead. (cse_insn): Try materializing using vec_dup. * rtl.h (simplify_context::simplify_gen_vec_select, simplify_gen_vec_select): New. * simplify-rtx.c (simplify_context::simplify_gen_vec_select): New.
-rw-r--r--gcc/cse.c80
-rw-r--r--gcc/rtl.h7
-rw-r--r--gcc/simplify-rtx.c22
3 files changed, 91 insertions, 18 deletions
diff --git a/gcc/cse.c b/gcc/cse.c
index 4c3988e..c1c7d0c 100644
--- a/gcc/cse.c
+++ b/gcc/cse.c
@@ -44,6 +44,7 @@ along with GCC; see the file COPYING3. If not see
#include "regs.h"
#include "function-abi.h"
#include "rtlanal.h"
+#include "expr.h"
/* The basic idea of common subexpression elimination is to go
through the code, keeping a record of expressions that would
@@ -4239,14 +4240,21 @@ try_back_substitute_reg (rtx set, rtx_insn *insn)
}
}
}
-
+
+/* Add an entry containing RTL X into SETS. */
+static inline void
+add_to_set (vec<struct set> *sets, rtx x)
+{
+ struct set entry = {};
+ entry.rtl = x;
+ sets->safe_push (entry);
+}
+
/* Record all the SETs in this instruction into SETS_PTR,
and return the number of recorded sets. */
static int
-find_sets_in_insn (rtx_insn *insn, struct set **psets)
+find_sets_in_insn (rtx_insn *insn, vec<struct set> *psets)
{
- struct set *sets = *psets;
- int n_sets = 0;
rtx x = PATTERN (insn);
if (GET_CODE (x) == SET)
@@ -4266,8 +4274,25 @@ find_sets_in_insn (rtx_insn *insn, struct set **psets)
someplace else, so it isn't worth cse'ing. */
else if (GET_CODE (SET_SRC (x)) == CALL)
;
+ else if (GET_CODE (SET_SRC (x)) == CONST_VECTOR
+ && GET_MODE_CLASS (GET_MODE (SET_SRC (x))) != MODE_VECTOR_BOOL)
+ {
+ /* First register the vector itself. */
+ add_to_set (psets, x);
+ rtx src = SET_SRC (x);
+ /* Go over the constants of the CONST_VECTOR in forward order, to
+ put them in the same order in the SETS array. */
+ for (unsigned i = 0; i < const_vector_encoded_nelts (src) ; i++)
+ {
+ /* These are templates and don't actually get emitted but are
+ used to tell CSE how to get to a particular constant. */
+ rtx y = simplify_gen_vec_select (SET_DEST (x), i);
+ gcc_assert (y);
+ add_to_set (psets, gen_rtx_SET (y, CONST_VECTOR_ELT (src, i)));
+ }
+ }
else
- sets[n_sets++].rtl = x;
+ add_to_set (psets, x);
}
else if (GET_CODE (x) == PARALLEL)
{
@@ -4288,12 +4313,12 @@ find_sets_in_insn (rtx_insn *insn, struct set **psets)
else if (GET_CODE (SET_SRC (y)) == CALL)
;
else
- sets[n_sets++].rtl = y;
+ add_to_set (psets, y);
}
}
}
- return n_sets;
+ return psets->length ();
}
/* Subroutine of canonicalize_insn. X is an ASM_OPERANDS in INSN. */
@@ -4341,9 +4366,10 @@ canon_asm_operands (rtx x, rtx_insn *insn)
see canon_reg. */
static void
-canonicalize_insn (rtx_insn *insn, struct set **psets, int n_sets)
+canonicalize_insn (rtx_insn *insn, vec<struct set> *psets)
{
- struct set *sets = *psets;
+ vec<struct set> sets = *psets;
+ int n_sets = sets.length ();
rtx tem;
rtx x = PATTERN (insn);
int i;
@@ -4502,13 +4528,6 @@ cse_insn (rtx_insn *insn)
int src_eqv_in_memory = 0;
unsigned src_eqv_hash = 0;
- struct set *sets = (struct set *) 0;
-
- if (GET_CODE (x) == SET)
- sets = XALLOCA (struct set);
- else if (GET_CODE (x) == PARALLEL)
- sets = XALLOCAVEC (struct set, XVECLEN (x, 0));
-
this_insn = insn;
/* Find all regs explicitly clobbered in this insn,
@@ -4517,10 +4536,11 @@ cse_insn (rtx_insn *insn)
invalidate_from_sets_and_clobbers (insn);
/* Record all the SETs in this instruction. */
- n_sets = find_sets_in_insn (insn, &sets);
+ auto_vec<struct set, 8> sets;
+ n_sets = find_sets_in_insn (insn, (vec<struct set>*)&sets);
/* Substitute the canonical register where possible. */
- canonicalize_insn (insn, &sets, n_sets);
+ canonicalize_insn (insn, (vec<struct set>*)&sets);
/* If this insn has a REG_EQUAL note, store the equivalent value in SRC_EQV,
if different, or if the DEST is a STRICT_LOW_PART/ZERO_EXTRACT. The
@@ -4986,6 +5006,30 @@ cse_insn (rtx_insn *insn)
src_related_is_const_anchor = src_related != NULL_RTX;
}
+ /* Try to re-materialize a vec_dup with an existing constant. */
+ rtx src_elt;
+ if ((!src_eqv_here || CONSTANT_P (src_eqv_here))
+ && const_vec_duplicate_p (src, &src_elt))
+ {
+ machine_mode const_mode = GET_MODE_INNER (GET_MODE (src));
+ struct table_elt *related_elt
+ = lookup (src_elt, HASH (src_elt, const_mode), const_mode);
+ if (related_elt)
+ {
+ for (related_elt = related_elt->first_same_value;
+ related_elt; related_elt = related_elt->next_same_value)
+ if (REG_P (related_elt->exp))
+ {
+ /* We don't need to compare costs with an existing (constant)
+ src_eqv_here, since any such src_eqv_here should already be
+ available in src_const. */
+ src_eqv_here
+ = gen_rtx_VEC_DUPLICATE (GET_MODE (src),
+ related_elt->exp);
+ break;
+ }
+ }
+ }
if (src == src_folded)
src_folded = 0;
diff --git a/gcc/rtl.h b/gcc/rtl.h
index 5473cc9..6a6de1c 100644
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -3425,6 +3425,7 @@ public:
rtx, rtx, rtx);
rtx simplify_gen_relational (rtx_code, machine_mode, machine_mode, rtx, rtx);
rtx simplify_gen_subreg (machine_mode, rtx, machine_mode, poly_uint64);
+ rtx simplify_gen_vec_select (rtx, unsigned int);
/* Tracks the level of MEM nesting for the value being simplified:
0 means the value is not in a MEM, >0 means it is. This is needed
@@ -3527,6 +3528,12 @@ simplify_gen_subreg (machine_mode outermode, rtx op, machine_mode innermode,
}
inline rtx
+simplify_gen_vec_select (rtx op, unsigned int index)
+{
+ return simplify_context ().simplify_gen_vec_select (op, index);
+}
+
+inline rtx
lowpart_subreg (machine_mode outermode, rtx op, machine_mode innermode)
{
return simplify_context ().lowpart_subreg (outermode, op, innermode);
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index a060f1b..aac5693 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -7620,6 +7620,28 @@ simplify_context::lowpart_subreg (machine_mode outer_mode, rtx expr,
subreg_lowpart_offset (outer_mode, inner_mode));
}
+/* Generate RTX to select element at INDEX out of vector OP. */
+
+rtx simplify_context::simplify_gen_vec_select (rtx op, unsigned int index)
+{
+
+ if (!VECTOR_MODE_P (GET_MODE (op)))
+ return NULL_RTX;
+
+ machine_mode imode = GET_MODE_INNER (GET_MODE (op));
+
+ if (index == 0)
+ {
+ rtx res = lowpart_subreg (imode, op, GET_MODE (op));
+ if (res)
+ return res;
+ }
+
+ rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (index)));
+ return gen_rtx_VEC_SELECT (imode, op, tmp);
+}
+
+
/* Simplify X, an rtx expression.
Return the simplified expression or NULL if no simplifications