Diffstat (limited to 'gcc/config/riscv/riscv-v.cc')
-rw-r--r--  gcc/config/riscv/riscv-v.cc | 301
1 file changed, 268 insertions, 33 deletions
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 22d1949..c9c8328 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -351,9 +351,12 @@ public:
 	add_rounding_mode_operand (FRM_RNE);
       else if (m_insn_flags & VXRM_RNU_P)
 	add_rounding_mode_operand (VXRM_RNU);
+      else if (m_insn_flags & VXRM_RNE_P)
+	add_rounding_mode_operand (VXRM_RNE);
       else if (m_insn_flags & VXRM_RDN_P)
 	add_rounding_mode_operand (VXRM_RDN);
-
+      else if (m_insn_flags & VXRM_ROD_P)
+	add_rounding_mode_operand (VXRM_ROD);
 
       if (insn_data[(int) icode].n_operands != m_opno)
 	internal_error ("invalid number of operands for insn %s, "
@@ -437,6 +440,26 @@ emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
   e.emit_insn ((enum insn_code) icode, ops);
 }
 
+/* Emit either a VLMAX insn or a non-VLMAX insn depending on TYPE.  For a
+   non-VLMAX insn, the length must be specified in VL.  */
+
+void
+emit_avltype_insn (unsigned icode, unsigned insn_flags, rtx *ops,
+		   avl_type type, rtx vl)
+{
+  if (type != avl_type::VLMAX && vl != NULL_RTX)
+    {
+      insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
+      e.set_vl (vl);
+      e.emit_insn ((enum insn_code) icode, ops);
+    }
+  else
+    {
+      insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
+      e.emit_insn ((enum insn_code) icode, ops);
+    }
+}
+
 /* Return true if the vector duplicated by a super element which is the
    fusion of consecutive elements.
@@ -1170,6 +1193,59 @@ expand_vector_init_trailing_same_elem (rtx target,
   return false;
 }
 
+/* Helper function to emit a vmv.v.x/vmv.v.i or one of its floating-point
+   variants.  If VL is not given a VLMAX insn will be emitted, otherwise
+   a non-VLMAX insn with length VL.  If the value to be broadcast is not
+   suitable for vmv.vx, fall back to a vlse with zero stride.  This itself
+   has a fallback if the uarch prefers not to use a strided load for
+   broadcast.  */
+
+void
+expand_broadcast (machine_mode mode, rtx *ops, rtx vl)
+{
+  rtx elt = ops[1];
+  avl_type type = vl ? NONVLMAX : VLMAX;
+  if (can_be_broadcast_p (elt))
+    emit_avltype_insn (code_for_pred_broadcast (mode), UNARY_OP, ops,
+		       type, vl);
+  else
+    emit_avltype_insn (code_for_pred_strided_broadcast (mode),
+		       UNARY_OP, ops, type, vl);
+}
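The new entry point folds the emit_vlmax_insn/emit_nonvlmax_insn choice into a single call: passing a VL selects a non-VLMAX insn of that length, passing no VL (NULL_RTX) selects the VLMAX form. A minimal caller sketch, not part of the patch (the wrapper name and the {dest, element} operand layout are illustrative, following the helpers above):

   /* Illustrative only: broadcast ELT into DEST in MODE.  With
      vl == NULL_RTX expand_broadcast emits the VLMAX form; with a
      real VL it emits a non-VLMAX insn of that length.  */
   static void
   sketch_broadcast (machine_mode mode, rtx dest, rtx elt, rtx vl)
   {
     rtx ops[] = {dest, elt};
     expand_broadcast (mode, ops, vl);
   }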
+/* Similar to expand_broadcast but emits a vmv.s.x/vfmv.s.f instead.  */
+
+void
+expand_set_first (machine_mode mode, rtx *ops, rtx vl)
+{
+  rtx elt = ops[1];
+  avl_type type = vl ? NONVLMAX : VLMAX;
+  if (can_be_broadcast_p (elt))
+    emit_avltype_insn (code_for_pred_broadcast (mode),
+		       SCALAR_MOVE_OP, ops, type, vl);
+  else
+    emit_avltype_insn (code_for_pred_strided_broadcast (mode),
+		       SCALAR_MOVE_OP, ops, type, vl);
+}
+
+/* Similar to expand_set_first but keeps the tail elements
+   unchanged (TU).  */
+
+void
+expand_set_first_tu (machine_mode mode, rtx *ops, rtx vl)
+{
+  rtx elt = ops[2];
+  if (!vl)
+    vl = const1_rtx;
+  if (can_be_broadcast_p (elt))
+    emit_nonvlmax_insn (code_for_pred_broadcast (mode),
+			SCALAR_MOVE_MERGED_OP_TU, ops, vl);
+  else
+    emit_nonvlmax_insn (code_for_pred_strided_broadcast (mode),
+			SCALAR_MOVE_MERGED_OP_TU, ops, vl);
+}
+
 static void
 expand_const_vec_duplicate (rtx target, rtx src, rtx elt)
 {
@@ -1206,7 +1282,7 @@ expand_const_vec_duplicate (rtx target, rtx src, rtx elt)
   if (lra_in_progress)
     {
       rtx ops[] = {result, elt};
-      emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
+      expand_broadcast (mode, ops);
     }
   else
     {
@@ -1258,8 +1334,7 @@ expand_const_vector_duplicate_repeating (rtx target, rvv_builder *builder)
 	{
 	  dup = gen_reg_rtx (builder->new_mode ());
 	  rtx ops[] = {dup, ele};
-	  emit_vlmax_insn (code_for_pred_broadcast (builder->new_mode ()),
-			   UNARY_OP, ops);
+	  expand_broadcast (builder->new_mode (), ops);
 	}
       else
 	dup = expand_vector_broadcast (builder->new_mode (), ele);
@@ -1302,8 +1377,7 @@ expand_const_vector_duplicate_default (rtx target, rvv_builder *builder)
   rtx tmp1 = gen_reg_rtx (builder->mode ());
   rtx dup_ops[] = {tmp1, builder->elt (0)};
-  emit_vlmax_insn (code_for_pred_broadcast (builder->mode ()), UNARY_OP,
-		   dup_ops);
+  expand_broadcast (builder->mode (), dup_ops);
 
   for (unsigned int i = 1; i < builder->npatterns (); i++)
     {
@@ -2116,18 +2190,32 @@ has_vi_variant_p (rtx_code code, rtx x)
     }
 }
 
+/* This is a helper for binary ops with DImode scalar operands that are
+   broadcast (like vadd.vx v1, a1).  Instead of having similar code in
+   all the expanders this function unifies the handling.  For 64-bit
+   targets all we do is choose between the vi variant (if available) and
+   the register variant.  For 32-bit targets we either create the
+   sign-extending variant of vop.vx (when the immediate fits 32 bits) or
+   emit a vector broadcast of the 64-bit register/immediate and switch
+   to a vop.vv (replacing the scalar op with the broadcast vector).  */
+
 bool
 sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
 		     machine_mode vector_mode, bool has_vi_variant_p,
 		     void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
 {
   machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
+
+  /* If the scalar broadcast op fits an immediate, use the
+     vop.vi variant if there is one.  */
   if (has_vi_variant_p)
     {
       *scalar_op = force_reg (scalar_mode, *scalar_op);
       return false;
     }
 
+  /* On a 64-bit target we can always use the vop.vx variant.  */
   if (TARGET_64BIT)
     {
       if (!rtx_equal_p (*scalar_op, const0_rtx))
@@ -2135,6 +2223,8 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
       return false;
     }
 
+  /* On a 32-bit target, if there is no vop.vi variant for a 32-bit
+     immediate we need to use the sign-extending (SI -> DI) vop.vx variants.  */
   if (immediate_operand (*scalar_op, Pmode))
     {
       if (!rtx_equal_p (*scalar_op, const0_rtx))
@@ -2144,6 +2234,17 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
       return false;
     }
 
+  /* Now we're left with a 64-bit immediate or a register.  We cannot
+     use a vop.vx variant but must broadcast the value first and
+     switch to a vop.vv variant.  The broadcast can either be done via
+     vlse64.v v1, reg, zero or by loading one 64-bit element (vle64.v)
+     and using a broadcast vrgather.vi.  This is decided when splitting
+     the strided broadcast insn.  */
+  gcc_assert (!TARGET_64BIT
+	      && (CONST_INT_P (*scalar_op)
+		  || register_operand (*scalar_op, scalar_mode)));
+
   if (CONST_INT_P (*scalar_op))
     {
       if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
@@ -2154,11 +2255,8 @@ sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
 
   rtx tmp = gen_reg_rtx (vector_mode);
   rtx ops[] = {tmp, *scalar_op};
-  if (type == VLMAX)
-    emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
-  else
-    emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
-			vl);
+  emit_avltype_insn (code_for_pred_strided_broadcast (vector_mode),
+		     UNARY_OP, ops, type, vl);
 
   emit_vector_func (operands, tmp);
   return true;
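The intended calling pattern for sew64_scalar_helper, sketched under assumptions and not part of the patch (the PLUS code, the operand indices, and the callback body are illustrative; in the real expanders the callback emits the .vv insn):

   /* Illustrative only.  The callback stands in for the expander code
      that emits the vop.vv form once the scalar has been broadcast.  */
   static void
   sketch_emit_vadd_vv (rtx *operands, rtx broadcast_vec)
   {
     /* Hypothetical: emit vadd.vv with BROADCAST_VEC in place of the
        scalar operand.  */
   }

   static void
   sketch_expand_vadd_vx (rtx *operands, rtx vl, machine_mode mode)
   {
     rtx scalar = operands[2];
     if (sew64_scalar_helper (operands, &scalar, vl, mode,
                              has_vi_variant_p (PLUS, scalar),
                              sketch_emit_vadd_vv, NONVLMAX))
       return;               /* The helper already emitted the .vv form.  */
     operands[2] = scalar;   /* Possibly forced into a register.  */
     /* Emit the vadd.vx (or its sign-extending SImode variant) here.  */
   }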
@@ -2552,8 +2650,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
   /* Step 1: Broadcast the first pattern.  */
   rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
-  emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
-		   UNARY_OP, ops);
+  expand_broadcast (builder.mode (), ops);
 
   /* Step 2: Merge the rest iteration of pattern.  */
   for (unsigned int i = 1; i < builder.npatterns (); i++)
     {
@@ -2566,8 +2663,7 @@ expand_vector_init_merge_repeating_sequence (rtx target,
       if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x.  */
 	{
 	  rtx ops[] = {dup, merge_mask};
-	  emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
-			      SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
+	  expand_set_first (GET_MODE (dup), ops);
 	}
       else /* vmv.v.x.  */
 	{
 	  rtx ops[] = {dup,
		       force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
 	  rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
				 Pmode);
-	  emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
-			      ops, vl);
+	  expand_broadcast (mask_int_mode, ops, vl);
 	}
 
       emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
@@ -4667,20 +4762,20 @@ expand_reduction (unsigned unspec, unsigned unspec_for_vl0_safe,
 
   rtx m1_tmp = gen_reg_rtx (m1_mode);
   rtx scalar_move_ops[] = {m1_tmp, init};
-  insn_code icode = code_for_pred_broadcast (m1_mode);
   if (need_mask_operand_p (insn_flags))
     {
       if (need_vl0_safe)
-	emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, const1_rtx);
+	expand_set_first (m1_mode, scalar_move_ops, const1_rtx);
       else
-	emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, vl_op);
+	expand_set_first (m1_mode, scalar_move_ops, vl_op);
     }
   else
-    emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops);
+    expand_set_first (m1_mode, scalar_move_ops);
 
   rtx m1_tmp2 = gen_reg_rtx (m1_mode);
   rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
+  insn_code icode;
   if (need_vl0_safe)
     icode = code_for_pred (unspec_for_vl0_safe, vmode);
   else
@@ -5558,6 +5653,82 @@ expand_vx_binary_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2,
   emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops);
 }
 
+static enum insn_type
+get_insn_type_by_vxrm_val (int vxrm_val)
+{
+  enum insn_type itype;
+
+  switch (vxrm_val)
+    {
+    case VXRM_RNU:
+      itype = BINARY_OP_VXRM_RNU;
+      break;
+    case VXRM_RNE:
+      itype = BINARY_OP_VXRM_RNE;
+      break;
+    case VXRM_RDN:
+      itype = BINARY_OP_VXRM_RDN;
+      break;
+    case VXRM_ROD:
+      itype = BINARY_OP_VXRM_ROD;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  return itype;
+}
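As a usage sketch (illustrative, not part of the patch): the vaadd/vaaddu expanders below thread one of the four fixed-point rounding modes (RNU, RNE, RDN, ROD) through this mapping, e.g. for round-to-nearest-up:

   /* Illustrative only: expand v2 = vaadd (v1, vec_dup (x)) with the
      fixed-point rounding mode vxrm = RNU.  */
   static void
   sketch_expand_vaadd_vx_rnu (rtx dest, rtx vec, rtx scalar,
                               machine_mode mode)
   {
     expand_vx_binary_vxrm_vec_vec_dup (dest, vec, scalar, UNSPEC_VAADD,
                                        VXRM_RNU, mode);
   }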
+
+/* Expand the binary vx combine of the form v2 = vop (v1, vec_dup (x))
+   and its vxrm value.  Aka the second op comes from the vec_duplicate,
+   and the first op is the vector reg.  */
+
+void
+expand_vx_binary_vxrm_vec_vec_dup (rtx op_0, rtx op_1, rtx op_2, int unspec,
+				   int vxrm_val, machine_mode mode)
+{
+  enum insn_code icode;
+  enum insn_type itype = get_insn_type_by_vxrm_val (vxrm_val);
+  rtx ops[] = {op_0, op_1, op_2};
+
+  switch (unspec)
+    {
+    case UNSPEC_VAADD:
+    case UNSPEC_VAADDU:
+      icode = code_for_pred_scalar (unspec, mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_vlmax_insn (icode, itype, ops);
+}
+
+/* Expand the binary vx combine of the form v2 = vop (vec_dup (x), v1)
+   and its vxrm value.  Aka the first op comes from the vec_duplicate,
+   and the second op is the vector reg.  */
+
+void
+expand_vx_binary_vxrm_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2, int unspec,
+				   int vxrm_val, machine_mode mode)
+{
+  enum insn_code icode;
+  enum insn_type itype = get_insn_type_by_vxrm_val (vxrm_val);
+  rtx ops[] = {op_0, op_1, op_2};
+
+  switch (unspec)
+    {
+    case UNSPEC_VAADD:
+    case UNSPEC_VAADDU:
+      icode = code_for_pred_scalar (unspec, mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_vlmax_insn (icode, itype, ops);
+}
+
 /* Expand the binary vx combine with the format like v2 = vop(v1, vec_dup(x)).
    Aka the second op comes from the vec_duplicate, and the first op is the
    vector reg.  */
@@ -5586,6 +5757,7 @@ expand_vx_binary_vec_vec_dup (rtx op_0, rtx op_1, rtx op_2,
     case US_PLUS:
     case US_MINUS:
     case SS_PLUS:
+    case SS_MINUS:
       icode = code_for_pred_scalar (code, mode);
       break;
     default:
@@ -5768,24 +5940,84 @@ count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
   return count;
 }
 
-/* Return true if the OP can be directly broadcasted.  */
+/* Return true if the OP can be broadcast with a
+   v[f]mv.v.[xif] instruction.  */
+
+bool
+can_be_broadcast_p (rtx op)
+{
+  machine_mode mode = GET_MODE (op);
+
+  /* Zero always works and we can always put an immediate into a
+     register.  What's tricky is that for an immediate we don't know
+     the mode of the register it will end up in, i.e. what element size
+     we want to broadcast.  So even if the immediate is small it might
+     still end up in a DImode register that we cannot broadcast.
+     vmv.s.x, i.e. a single-element set, can handle this, though,
+     because it implicitly sign-extends to SEW.  */
+  if (rtx_equal_p (op, CONST0_RTX (mode))
+      || const_int_operand (op, Xmode))
+    return true;
+
+  /* Do not accept DImode broadcasts on !TARGET_64BIT.  Those are
+     handled by strided broadcast.  */
+  if (INTEGRAL_MODE_P (mode)
+      && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD))
+    return false;
+
+  /* Non-register operands that can be forced into a register we can
+     handle.  These don't need to use strided broadcast.  */
+  if (INTEGRAL_MODE_P (mode)
+      && (memory_operand (op, mode) || CONST_POLY_INT_P (op))
+      && can_create_pseudo_p ())
+    return true;
+
+  /* Likewise, do not accept HFmode broadcasts if we don't have
+     vfmv.v.f for 16-bit registers available.  */
+  if (mode == HFmode && !TARGET_ZVFH)
+    return false;
+
+  /* Same for float, just that we can always handle 64-bit doubles even
+     on !TARGET_64BIT.  We have ruled out 16-bit HF already above.  */
+  if (FLOAT_MODE_P (mode)
+      && (memory_operand (op, mode) || CONSTANT_P (op))
+      && can_create_pseudo_p ())
+    return true;
+
+  /* After excluding all the cases we cannot handle, the register
+     operands that remain can always be broadcast.  */
+  if (register_operand (op, mode))
+    return true;
+
+  return false;
+}
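Restated in isolation (a sketch, not part of the patch), this predicate is what steers expand_broadcast and the related helpers between the two broadcast patterns:

   /* Illustrative only: pick the icode the broadcast expanders above
      would use for ELT in MODE.  */
   static insn_code
   sketch_classify_broadcast (machine_mode mode, rtx elt)
   {
     if (can_be_broadcast_p (elt))
       return code_for_pred_broadcast (mode);         /* v[f]mv.v.[xif]  */
     return code_for_pred_strided_broadcast (mode);   /* vlse, stride 0  */
   }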
+
+/* Return true for all operands that cannot use vmv.vx, vfmv.vf,
+   vmv.s.x, or vfmv.s.f but rather need to go via memory.  */
+
 bool
-can_be_broadcasted_p (rtx op)
+strided_broadcast_p (rtx op)
 {
   machine_mode mode = GET_MODE (op);
-  /* We don't allow RA (register allocation) reload generate
-     (vec_duplicate:DI reg) in RV32 system wheras we allow
-     (vec_duplicate:DI mem) in RV32 system.  */
-  if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
-      && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
-      && !satisfies_constraint_Wdm (op))
+  if (!memory_operand (op, mode)
+      && !register_operand (op, mode)
+      && !rtx_equal_p (op, CONST0_RTX (mode))
+      && !const_int_operand (op, mode))
     return false;
 
-  if (satisfies_constraint_K (op) || register_operand (op, mode)
-      || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
+  /* !TARGET_64BIT does not have a vmv.v.x/vmv.s.x for 64-bit
+     DImode elements.  */
+  if (INTEGRAL_MODE_P (mode)
+      && maybe_gt (GET_MODE_SIZE (mode), UNITS_PER_WORD))
+    return true;
+
+  /* Zvfhmin does not have a vfmv.v.f/vfmv.s.f for 16-bit elements.  */
+  if (!TARGET_ZVFH && mode == HFmode)
     return true;
 
-  return can_create_pseudo_p () && nonmemory_operand (op, mode);
+  return false;
 }
 
 void
@@ -5900,7 +6132,10 @@ whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
   return false;
 }
 
-/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.  */
+/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.
+   That's the case if we're dealing with a scalar broadcast that
+   has VL = 1.  */
+
 bool
 splat_to_scalar_move_p (rtx *ops)
 {
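The two predicates are intended to partition the operands the broadcast expanders accept: whatever v[f]mv.v.[xif] cannot handle takes the strided path. A consistency check along these lines (a sketch under that assumption, not part of the patch) documents the invariant:

   /* Illustrative only: for any OP a broadcast expander accepts,
      exactly one of the two classifications should hold.  */
   static void
   sketch_check_broadcast_classification (rtx op)
   {
     gcc_checking_assert (can_be_broadcast_p (op)
                          != strided_broadcast_p (op));
   }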