Diffstat (limited to 'gcc/config/riscv/riscv-v.cc')
-rw-r--r--  gcc/config/riscv/riscv-v.cc  | 301
1 file changed, 268 insertions(+), 33 deletions(-)
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 22d1949..c9c8328 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -351,9 +351,12 @@ public:
add_rounding_mode_operand (FRM_RNE);
else if (m_insn_flags & VXRM_RNU_P)
add_rounding_mode_operand (VXRM_RNU);
+ else if (m_insn_flags & VXRM_RNE_P)
+ add_rounding_mode_operand (VXRM_RNE);
else if (m_insn_flags & VXRM_RDN_P)
add_rounding_mode_operand (VXRM_RDN);
-
+ else if (m_insn_flags & VXRM_ROD_P)
+ add_rounding_mode_operand (VXRM_ROD);
if (insn_data[(int) icode].n_operands != m_opno)
internal_error ("invalid number of operands for insn %s, "
@@ -437,6 +440,26 @@ emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
e.emit_insn ((enum insn_code) icode, ops);
}
+/* Emit either a VLMAX insn or a non-VLMAX insn depending on TYPE. For a
+ non-VLMAX insn, the length must be specified in VL. */
+
+void
+emit_avltype_insn (unsigned icode, unsigned insn_flags, rtx *ops,
+ avl_type type, rtx vl)
+{
+ if (type != avl_type::VLMAX && vl != NULL_RTX)
+ {
+ insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
+ e.set_vl (vl);
+ e.emit_insn ((enum insn_code) icode, ops);
+ }
+ else
+ {
+ insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
+ e.emit_insn ((enum insn_code) icode, ops);
+ }
+}
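(Illustrative aside, not part of the patch: a minimal sketch of the AVL distinction this helper encodes, written with the RVV C intrinsics; riscv_vector.h is assumed to be available and the function names are hypothetical.)

  #include <riscv_vector.h>

  /* A VLMAX operation uses the maximum vl for the element type/LMUL;
     a non-VLMAX operation uses an explicitly supplied vl.  */
  vint32m1_t
  splat_vlmax (int32_t x)
  {
    size_t vlmax = __riscv_vsetvlmax_e32m1 ();
    return __riscv_vmv_v_x_i32m1 (x, vlmax);
  }

  vint32m1_t
  splat_with_vl (int32_t x, size_t vl)
  {
    return __riscv_vmv_v_x_i32m1 (x, vl);
  }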
+
/* Return true if the vector duplicated by a super element which is the fusion
of consecutive elements.
@@ -1170,6 +1193,59 @@ expand_vector_init_trailing_same_elem (rtx target,
return false;
}
+/* Helper function to emit a vmv.v.x/vmv.v.i or one of their float
+   variants.  If VL is not given, a VLMAX insn is emitted; otherwise
+   a non-VLMAX insn with length VL.
+   If the value to be broadcast is not suitable for vmv.v.x we fall
+   back to a vlse with zero stride.  That fallback itself has a
+   further fallback if the uarch prefers not to use a strided load
+   for broadcasts.  */
+
+void
+expand_broadcast (machine_mode mode, rtx *ops, rtx vl)
+{
+ rtx elt = ops[1];
+ avl_type type = vl ? NONVLMAX : VLMAX;
+ if (can_be_broadcast_p (elt))
+ emit_avltype_insn (code_for_pred_broadcast (mode), UNARY_OP, ops,
+ type, vl);
+ else
+ emit_avltype_insn (code_for_pred_strided_broadcast (mode),
+ UNARY_OP, ops, type, vl);
+}
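(Illustrative aside, not part of the patch: the two broadcast strategies the helper chooses between, sketched with the RVV C intrinsics; riscv_vector.h is assumed and the function names are hypothetical.)

  #include <riscv_vector.h>

  /* vmv.v.x: broadcast a scalar that fits a GPR of element width.  */
  vint32m1_t
  broadcast_reg (int32_t x, size_t vl)
  {
    return __riscv_vmv_v_x_i32m1 (x, vl);
  }

  /* vlse64.v with zero stride: the strided-broadcast fallback, e.g.
     for 64-bit elements on rv32 where no single GPR can hold them.  */
  vint64m1_t
  broadcast_strided (const int64_t *p, size_t vl)
  {
    return __riscv_vlse64_v_i64m1 (p, 0, vl);
  }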
+
+/* Similar to expand_broadcast but emits a vmv.s.x/vfmv.s.f instead. */
+
+void
+expand_set_first (machine_mode mode, rtx *ops, rtx vl)
+{
+ rtx elt = ops[1];
+ avl_type type = vl ? NONVLMAX : VLMAX;
+ if (can_be_broadcast_p (elt))
+ emit_avltype_insn (code_for_pred_broadcast (mode),
+ SCALAR_MOVE_OP, ops, type, vl);
+ else
+ emit_avltype_insn (code_for_pred_strided_broadcast (mode),
+ SCALAR_MOVE_OP, ops, type, vl);
+}
+
+/* Similar to expand_set_first but keeps the tail elements
+   unchanged (TU).  */
+
+void
+expand_set_first_tu (machine_mode mode, rtx *ops, rtx vl)
+{
+ rtx elt = ops[2];
+ if (!vl)
+ vl = const1_rtx;
+ if (can_be_broadcast_p (elt))
+ emit_nonvlmax_insn (code_for_pred_broadcast (mode),
+ SCALAR_MOVE_MERGED_OP_TU, ops, vl);
+ else
+ emit_nonvlmax_insn (code_for_pred_strided_broadcast (mode),
+ SCALAR_MOVE_MERGED_OP_TU, ops, vl);
+}
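(Illustrative aside, not part of the patch: a sketch of the tail-undisturbed single-element set, assuming the policy-suffixed RVV intrinsic __riscv_vmv_s_x_i32m1_tu is available; the wrapper name is hypothetical.)

  #include <riscv_vector.h>

  /* vmv.s.x writes element 0 only; with the _tu intrinsic the
     remaining elements of DEST are left unchanged, which is the
     behavior expand_set_first_tu arranges for.  */
  vint32m1_t
  set_first_tu (vint32m1_t dest, int32_t x)
  {
    return __riscv_vmv_s_x_i32m1_tu (dest, x, 1);
  }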
+
static void
expand_const_vec_duplicate (rtx target, rtx src, rtx elt)
{
@@ -1206,7 +1282,7 @@ expand_const_vec_duplicate (rtx target, rtx src, rtx elt)
if (lra_in_progress)
{
rtx ops[] = {result, elt};
- emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
+ expand_broadcast (mode, ops);
}
else
{
@@ -1258,8 +1334,7 @@ expand_const_vector_duplicate_repeating (rtx target, rvv_builder *builder)
{
dup = gen_reg_rtx (builder->new_mode ());
rtx ops[] = {dup, ele};
- emit_vlmax_insn (code_for_pred_broadcast (builder->new_mode ()),
- UNARY_OP, ops);
+ expand_broadcast (builder->new_mode (), ops);
}
else
dup = expand_vector_broadcast (builder->new_mode (), ele);
@@ -1302,8 +1377,7 @@ expand_const_vector_duplicate_default (rtx target, rvv_builder *builder)
rtx tmp1 = gen_reg_rtx (builder->mode ());
rtx dup_ops[] = {tmp1, builder->elt (0)};
- emit_vlmax_insn (code_for_pred_broadcast (builder->mode ()), UNARY_OP,
- dup_ops);
+ expand_broadcast (builder->mode (), dup_ops);
for (unsigned int i = 1; i < builder->npatterns (); i++)
{
@@ -2116,18 +2190,32 @@ has_vi_variant_p (rtx_code code, rtx x)
}
}
+/* This is a helper for binary ops with DImode scalar operands that are
+   broadcast (like vadd.vx v1, v2, a1).
+   Instead of duplicating similar code in all the expanders this function
+   unifies the handling.  For 64-bit targets all we do is choose
+   between the vi variant (if available) and the register variant.
+   For 32-bit targets we either create the sign-extending variant
+   of vop.vx (when the immediate fits 32 bits) or emit a vector
+   broadcast of the 64-bit register/immediate and switch to a
+   vop.vv (replacing the scalar op with the broadcast vector).  */
+
bool
sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
machine_mode vector_mode, bool has_vi_variant_p,
void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
{
machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
+
+  /* If the scalar op fits the immediate range, use the vop.vi
+     variant if there is one.  */
if (has_vi_variant_p)
{
*scalar_op = force_reg (scalar_mode, *scalar_op);
return false;
}
+ /* On a 64-bit target we can always use the vop.vx variant. */
if (TARGET_64BIT)
{
if (!rtx_equal_p (*scalar_op, const0_rtx))
@@ -2135,6 +2223,8 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
return false;
}
+  /* On a 32-bit target, when there is no vop.vi variant but the value
+     is a 32-bit immediate, we need to use the sign-extending
+     (SI -> DI) vop.vx variant.  */
if (immediate_operand (*scalar_op, Pmode))
{
if (!rtx_equal_p (*scalar_op, const0_rtx))
@@ -2144,6 +2234,17 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
return false;
}
+  /* Now we're left with a 64-bit immediate or a register.
+     We cannot use a vop.vx variant but must broadcast the value first
+     and switch to a vop.vv variant.
+     The broadcast can either be done via vlse64.v vd, (reg), zero
+     or by loading one 64-bit element (vle64.v) and using a
+     broadcasting vrgather.vi.  This is decided when splitting
+     the strided broadcast insn.  */
+ gcc_assert (!TARGET_64BIT
+ && (CONST_INT_P (*scalar_op)
+ || register_operand (*scalar_op, scalar_mode)));
+
if (CONST_INT_P (*scalar_op))
{
if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
@@ -2154,11 +2255,8 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
rtx tmp = gen_reg_rtx (vector_mode);
rtx ops[] = {tmp, *scalar_op};
- if (type == VLMAX)
- emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
- else
- emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
- vl);
+ emit_avltype_insn (code_for_pred_strided_broadcast (vector_mode),
+ UNARY_OP, ops, type, vl);
emit_vector_func (operands, tmp);
return true;
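(Illustrative aside, not part of the patch: a sketch of the rv32 fallback described above, broadcasting the 64-bit scalar with a zero-stride load and switching from the .vx to the .vv form; riscv_vector.h is assumed and the function name is hypothetical.)

  #include <riscv_vector.h>

  /* Broadcast the 64-bit scalar via a zero-stride load, then use the
     .vv form of the operation instead of the unavailable .vx form.  */
  vint64m1_t
  add_di_scalar_rv32 (vint64m1_t v, const int64_t *scalar, size_t vl)
  {
    vint64m1_t dup = __riscv_vlse64_v_i64m1 (scalar, 0, vl);
    return __riscv_vadd_vv_i64m1 (v, dup, vl);
  }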
@@ -2552,8 +2650,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
/* Step 1: Broadcast the first pattern. */
rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
- emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
- UNARY_OP, ops);
+ expand_broadcast (builder.mode (), ops);
/* Step 2: Merge the rest iteration of pattern. */
for (unsigned int i = 1; i < builder.npatterns (); i++)
{
@@ -2566,8 +2663,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */
{
rtx ops[] = {dup, merge_mask};
- emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
- SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
+ expand_set_first (GET_MODE (dup), ops);
}
else /* vmv.v.x. */
{
@@ -2575,8 +2671,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
Pmode);
- emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
- ops, vl);
+ expand_broadcast (mask_int_mode, ops, vl);
}
emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
@@ -4667,20 +4762,20 @@ expand_reduction (unsigned unspec, unsigned unspec_for_vl0_safe,
rtx m1_tmp = gen_reg_rtx (m1_mode);
rtx scalar_move_ops[] = {m1_tmp, init};
- insn_code icode = code_for_pred_broadcast (m1_mode);
if (need_mask_operand_p (insn_flags))
{
if (need_vl0_safe)
- emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, const1_rtx);
+ expand_set_first (m1_mode, scalar_move_ops, const1_rtx);
else
- emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, vl_op);
+ expand_set_first (m1_mode, scalar_move_ops, vl_op);
}
else
- emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops);
+ expand_set_first (m1_mode, scalar_move_ops);
rtx m1_tmp2 = gen_reg_rtx (m1_mode);
rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
+ insn_code icode;
if (need_vl0_safe)
icode = code_for_pred (unspec_for_vl0_safe, vmode);
else
@@ -5558,6 +5653,82 @@ expand_vx_binary_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2,
emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops);
}
+static enum insn_type
+get_insn_type_by_vxrm_val (int vxrm_val)
+{
+ enum insn_type itype;
+
+ switch (vxrm_val)
+ {
+ case VXRM_RNU:
+ itype = BINARY_OP_VXRM_RNU;
+ break;
+ case VXRM_RNE:
+ itype = BINARY_OP_VXRM_RNE;
+ break;
+ case VXRM_RDN:
+ itype = BINARY_OP_VXRM_RDN;
+ break;
+ case VXRM_ROD:
+ itype = BINARY_OP_VXRM_ROD;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ return itype;
+}
+
+/* Expand the binary vx combine with the format v2 = vop(v1, vec_dup(x))
+   and its vxrm value, i.e. the second op comes from the vec_duplicate
+   and the first op is the vector reg.  */
+
+void
+expand_vx_binary_vxrm_vec_vec_dup (rtx op_0, rtx op_1, rtx op_2, int unspec,
+ int vxrm_val, machine_mode mode)
+{
+ enum insn_code icode;
+ enum insn_type itype = get_insn_type_by_vxrm_val (vxrm_val);
+ rtx ops[] = {op_0, op_1, op_2};
+
+ switch (unspec)
+ {
+ case UNSPEC_VAADD:
+ case UNSPEC_VAADDU:
+ icode = code_for_pred_scalar (unspec, mode);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ emit_vlmax_insn (icode, itype, ops);
+}
+
+/* Expand the binary vx combine with the format v2 = vop(vec_dup(x), v1)
+   and its vxrm value, i.e. the first op comes from the vec_duplicate
+   and the second op is the vector reg.  */
+
+void
+expand_vx_binary_vxrm_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2, int unspec,
+ int vxrm_val, machine_mode mode)
+{
+ enum insn_code icode;
+ enum insn_type itype = get_insn_type_by_vxrm_val (vxrm_val);
+ rtx ops[] = {op_0, op_1, op_2};
+
+ switch (unspec)
+ {
+ case UNSPEC_VAADD:
+ case UNSPEC_VAADDU:
+ icode = code_for_pred_scalar (unspec, mode);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ emit_vlmax_insn (icode, itype, ops);
+}
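(Illustrative aside, not part of the patch: a sketch of how the VXRM rounding modes surface at the intrinsics level for the averaging add handled here; riscv_vector.h and the __RISCV_VXRM_* enumerators are assumed, and the function names are hypothetical.)

  #include <riscv_vector.h>

  /* vaadd.vx computes (element + x) >> 1; the vxrm argument selects
     the rounding behavior, mirroring the BINARY_OP_VXRM_* insn types
     chosen above.  */
  vint32m1_t
  avg_round_up (vint32m1_t v, int32_t x, size_t vl)
  {
    return __riscv_vaadd_vx_i32m1 (v, x, __RISCV_VXRM_RNU, vl);
  }

  vint32m1_t
  avg_round_down (vint32m1_t v, int32_t x, size_t vl)
  {
    return __riscv_vaadd_vx_i32m1 (v, x, __RISCV_VXRM_RDN, vl);
  }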
+
/* Expand the binary vx combine with the format like v2 = vop(v1, vec_dup(x)).
Aka the second op comes from the vec_duplicate, and the first op is
the vector reg. */
@@ -5586,6 +5757,7 @@ expand_vx_binary_vec_vec_dup (rtx op_0, rtx op_1, rtx op_2,
case US_PLUS:
case US_MINUS:
case SS_PLUS:
+ case SS_MINUS:
icode = code_for_pred_scalar (code, mode);
break;
default:
@@ -5768,24 +5940,84 @@ count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
return count;
}
-/* Return true if the OP can be directly broadcasted. */
+/* Return true if the OP can be broadcast with a
+ v[f]mv.v.[xif] instruction. */
+
+bool
+can_be_broadcast_p (rtx op)
+{
+ machine_mode mode = GET_MODE (op);
+
+ /* Zero always works and we can always put an immediate into a
+ register.
+     What's tricky is that for an immediate we don't know the mode of
+     the register it will end up in, i.e. what element size we want to
+     broadcast.  So even if the immediate is small it might still end
+     up in a DImode register that we cannot broadcast.
+     vmv.s.x, i.e. a single-element set, can handle this, though,
+     because it implicitly sign-extends to SEW.  */
+ if (rtx_equal_p (op, CONST0_RTX (mode))
+ || const_int_operand (op, Xmode))
+ return true;
+
+ /* Do not accept DImode broadcasts on !TARGET_64BIT. Those
+ are handled by strided broadcast. */
+ if (INTEGRAL_MODE_P (mode)
+ && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD))
+ return false;
+
+  /* We can handle non-register operands that can be forced into a
+     register; these don't need to use strided broadcast.  */
+ if (INTEGRAL_MODE_P (mode)
+ && (memory_operand (op, mode) || CONST_POLY_INT_P (op))
+ && can_create_pseudo_p ())
+ return true;
+
+  /* Likewise, do not accept HFmode broadcasts if vfmv.v.f is not
+     available for 16-bit elements (i.e. without Zvfh).  */
+ if (mode == HFmode && !TARGET_ZVFH)
+ return false;
+
+ /* Same for float, just that we can always handle 64-bit doubles
+ even on !TARGET_64BIT. We have ruled out 16-bit HF already
+ above. */
+ if (FLOAT_MODE_P (mode)
+ && (memory_operand (op, mode) || CONSTANT_P (op))
+ && can_create_pseudo_p ())
+ return true;
+
+  /* After excluding all the cases we cannot handle, the register
+     operands that remain can always be broadcast.  */
+ if (register_operand (op, mode))
+ return true;
+
+ return false;
+}
+
+/* Return true for all operands that cannot use vmv.v.x, vfmv.v.f,
+   vmv.s.x, or vfmv.s.f but rather need to go via memory.  */
+
bool
-can_be_broadcasted_p (rtx op)
+strided_broadcast_p (rtx op)
{
machine_mode mode = GET_MODE (op);
- /* We don't allow RA (register allocation) reload generate
- (vec_duplicate:DI reg) in RV32 system wheras we allow
- (vec_duplicate:DI mem) in RV32 system. */
- if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
- && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
- && !satisfies_constraint_Wdm (op))
+ if (!memory_operand (op, mode)
+ && !register_operand (op, mode)
+ && !rtx_equal_p (op, CONST0_RTX (mode))
+ && !const_int_operand (op, mode))
return false;
- if (satisfies_constraint_K (op) || register_operand (op, mode)
- || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
+  /* !TARGET_64BIT does not have a vmv.v.x/vmv.s.x for 64-bit
+     DImode elements.  */
+ if (INTEGRAL_MODE_P (mode)
+ && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD))
+ return true;
+
+  /* Zvfhmin does not have a vfmv.v.f/vfmv.s.f for 16-bit elements.  */
+ if (!TARGET_ZVFH && mode == HFmode)
return true;
- return can_create_pseudo_p () && nonmemory_operand (op, mode);
+ return false;
}
void
@@ -5900,7 +6132,10 @@ whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
return false;
}
-/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. */
+/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.
+ That's the case if we're dealing with a scalar broadcast that
+ has VL = 1. */
+
bool
splat_to_scalar_move_p (rtx *ops)
{