/* Subroutines used for code generation for RISC-V 'V' Extension for
   GNU compiler.
   Copyright (C) 2022-2023 Free Software Foundation, Inc.
   Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "backend.h"
#include "rtl.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "recog.h"
#include "alias.h"
#include "tree.h"
#include "stringpool.h"
#include "attribs.h"
#include "explow.h"
#include "memmodel.h"
#include "emit-rtl.h"
#include "tm_p.h"
#include "target.h"
#include "expr.h"
#include "optabs.h"
#include "tm-constrs.h"
#include "rtx-vector-builder.h"

using namespace riscv_vector;

namespace riscv_vector {

/* Helper for building the operand list of an RVV insn before calling
   expand_insn.  The template parameter bounds the number of operands.  */
template <int MAX_OPERANDS> class insn_expander
{
public:
  insn_expander () : m_opno (0) {}

  void add_output_operand (rtx x, machine_mode mode)
  {
    create_output_operand (&m_ops[m_opno++], x, mode);
    gcc_assert (m_opno <= MAX_OPERANDS);
  }
  void add_input_operand (rtx x, machine_mode mode)
  {
    create_input_operand (&m_ops[m_opno++], x, mode);
    gcc_assert (m_opno <= MAX_OPERANDS);
  }
  void add_all_one_mask_operand (machine_mode mode)
  {
    add_input_operand (CONSTM1_RTX (mode), mode);
  }
  void add_vundef_operand (machine_mode mode)
  {
    add_input_operand (RVV_VUNDEF (mode), mode);
  }
  void add_policy_operand (enum tail_policy vta, enum mask_policy vma)
  {
    rtx tail_policy_rtx = gen_int_mode (vta, Pmode);
    rtx mask_policy_rtx = gen_int_mode (vma, Pmode);
    add_input_operand (tail_policy_rtx, Pmode);
    add_input_operand (mask_policy_rtx, Pmode);
  }
  void add_avl_type_operand (avl_type type)
  {
    add_input_operand (gen_int_mode (type, Pmode), Pmode);
  }

  void expand (enum insn_code icode, bool temporary_volatile_p = false)
  {
    if (temporary_volatile_p)
      {
        temporary_volatile_ok v (true);
        expand_insn (icode, m_opno, m_ops);
      }
    else
      expand_insn (icode, m_opno, m_ops);
  }

private:
  int m_opno;
  expand_operand m_ops[MAX_OPERANDS];
};

/* Return the SEW of MODE.  Mask modes are handled as SEW = 8.  */
static unsigned
get_sew (machine_mode mode)
{
  unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
                       ? 8
                       : GET_MODE_BITSIZE (GET_MODE_INNER (mode));
  return sew;
}

/* Return true if X is a const_vector with all duplicate elements, which is in
   the range between MINVAL and MAXVAL.  */
bool
const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
                               HOST_WIDE_INT maxval)
{
  rtx elt;
  return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt)
          && IN_RANGE (INTVAL (elt), minval, maxval));
}
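/* Illustrative sketch (not a function used anywhere): the emit_* helpers
   below use insn_expander to build the canonical RVV predicated operand
   order, roughly

     insn_expander<8> e;
     e.add_output_operand (dest, mode);          // vd
     e.add_all_one_mask_operand (mask_mode);     // vm, all-ones = unmasked
     e.add_vundef_operand (mode);                // merge operand left undefined
     e.add_input_operand (src, GET_MODE (src));
     e.add_input_operand (vl, Pmode);            // AVL
     e.add_policy_operand (ta, ma);              // tail/mask policy
     e.add_avl_type_operand (avl_type::VLMAX);
     e.expand (icode);

   See emit_pred_op below for the exact sequence actually emitted.  */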
/* Emit a vlmax vsetvl instruction.  This should only be used when
   optimization is disabled or after vsetvl insertion pass.  */
void
emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl)
{
  unsigned int sew = get_sew (vmode);
  emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode),
                         gen_int_mode (get_vlmul (vmode), Pmode),
                         const0_rtx, const0_rtx));
}

void
emit_vlmax_vsetvl (machine_mode vmode, rtx vl)
{
  unsigned int sew = get_sew (vmode);
  enum vlmul_type vlmul = get_vlmul (vmode);
  unsigned int ratio = calculate_ratio (sew, vlmul);

  if (!optimize)
    emit_hard_vlmax_vsetvl (vmode, vl);
  else
    emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode)));
}

/* Calculate SEW/LMUL ratio.  */
unsigned int
calculate_ratio (unsigned int sew, enum vlmul_type vlmul)
{
  unsigned int ratio;
  switch (vlmul)
    {
    case LMUL_1:
      ratio = sew;
      break;
    case LMUL_2:
      ratio = sew / 2;
      break;
    case LMUL_4:
      ratio = sew / 4;
      break;
    case LMUL_8:
      ratio = sew / 8;
      break;
    case LMUL_F8:
      ratio = sew * 8;
      break;
    case LMUL_F4:
      ratio = sew * 4;
      break;
    case LMUL_F2:
      ratio = sew * 2;
      break;
    default:
      gcc_unreachable ();
    }
  return ratio;
}

/* Emit an RVV unmasked && vl mov from SRC to DEST.  */
static void
emit_pred_op (unsigned icode, rtx mask, rtx dest, rtx src, rtx len,
              machine_mode mask_mode, bool vlmax_p)
{
  insn_expander<8> e;
  machine_mode mode = GET_MODE (dest);

  e.add_output_operand (dest, mode);

  if (mask)
    e.add_input_operand (mask, GET_MODE (mask));
  else
    e.add_all_one_mask_operand (mask_mode);

  e.add_vundef_operand (mode);

  e.add_input_operand (src, GET_MODE (src));

  if (len)
    e.add_input_operand (len, Pmode);
  else
    {
      rtx vlmax = gen_reg_rtx (Pmode);
      emit_vlmax_vsetvl (mode, vlmax);
      e.add_input_operand (vlmax, Pmode);
    }

  if (GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
    e.add_policy_operand (get_prefer_tail_policy (), get_prefer_mask_policy ());

  if (vlmax_p)
    e.add_avl_type_operand (avl_type::VLMAX);
  else
    e.add_avl_type_operand (avl_type::NONVLMAX);

  e.expand ((enum insn_code) icode, MEM_P (dest) || MEM_P (src));
}

void
emit_vlmax_op (unsigned icode, rtx dest, rtx src, machine_mode mask_mode)
{
  emit_pred_op (icode, NULL_RTX, dest, src, NULL_RTX, mask_mode, true);
}

void
emit_vlmax_op (unsigned icode, rtx dest, rtx src, rtx len,
               machine_mode mask_mode)
{
  emit_pred_op (icode, NULL_RTX, dest, src, len, mask_mode, true);
}

void
emit_nonvlmax_op (unsigned icode, rtx dest, rtx src, rtx len,
                  machine_mode mask_mode)
{
  emit_pred_op (icode, NULL_RTX, dest, src, len, mask_mode, false);
}

/* Expand a constant vector SRC into TARGET.  */
static void
expand_const_vector (rtx target, rtx src, machine_mode mask_mode)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode elt_mode = GET_MODE_INNER (mode);
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
    {
      rtx elt;
      gcc_assert (
        const_vec_duplicate_p (src, &elt)
        && (rtx_equal_p (elt, const0_rtx) || rtx_equal_p (elt, const1_rtx)));
      emit_vlmax_op (code_for_pred_mov (mode), target, src, mask_mode);
      return;
    }

  rtx elt;
  if (const_vec_duplicate_p (src, &elt))
    {
      rtx tmp = register_operand (target, mode) ? target : gen_reg_rtx (mode);
      /* Element in range -16 ~ 15 integer or 0.0 floating-point,
         we use vmv.v.i instruction.  */
      if (satisfies_constraint_vi (src) || satisfies_constraint_Wc0 (src))
        emit_vlmax_op (code_for_pred_mov (mode), tmp, src, mask_mode);
      else
        emit_vlmax_op (code_for_pred_broadcast (mode), tmp,
                       force_reg (elt_mode, elt), mask_mode);

      if (tmp != target)
        emit_move_insn (target, tmp);
      return;
    }

  /* TODO: We only support const duplicate vector for now.  More cases
     will be supported when we support auto-vectorization:

       1. series vector.
       2. multiple elts duplicate vector.
       3. multiple patterns with multiple elts.  */
}
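/* Illustrative examples for expand_const_vector above (assuming an integer
   vector mode such as VNx4SI):

     { 5, 5, 5, ... }    -> the duplicated element satisfies the vi
                            constraint (-16 ~ 15), so a single vmv.v.i is
                            emitted via code_for_pred_mov.
     { 99, 99, 99, ... } -> 99 is outside the vi range, so it is forced into
                            a scalar register and broadcast with
                            code_for_pred_broadcast (vmv.v.x).  */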
/* Expand a pre-RA RVV data move from SRC to DEST.
   It expands move for RVV fractional vector modes.  */
bool
legitimize_move (rtx dest, rtx src, machine_mode mask_mode)
{
  machine_mode mode = GET_MODE (dest);
  if (CONST_VECTOR_P (src))
    {
      expand_const_vector (dest, src, mask_mode);
      return true;
    }

  /* In order to decrease the memory traffic, we don't use whole register
   * load/store for the LMUL less than 1 and mask mode, so those cases will
   * require one extra general purpose register, but it's not allowed during
   * the LRA process, so we have a special move pattern used for LRA, which
   * will defer the expansion until after LRA.  */
  if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
       || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
      && lra_in_progress)
    {
      emit_insn (gen_mov_lra (mode, Pmode, dest, src));
      return true;
    }

  if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
      && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
    {
      /* Need to force register if mem <- !reg.  */
      if (MEM_P (dest) && !REG_P (src))
        src = force_reg (mode, src);

      return false;
    }

  if (register_operand (src, mode) && register_operand (dest, mode))
    {
      emit_insn (gen_rtx_SET (dest, src));
      return true;
    }

  if (!register_operand (src, mode) && !register_operand (dest, mode))
    {
      rtx tmp = gen_reg_rtx (mode);
      if (MEM_P (src))
        emit_vlmax_op (code_for_pred_mov (mode), tmp, src, mask_mode);
      else
        emit_move_insn (tmp, src);
      src = tmp;
    }

  if (satisfies_constraint_vu (src))
    return false;

  emit_vlmax_op (code_for_pred_mov (mode), dest, src, mask_mode);
  return true;
}

/* VTYPE information for machine_mode.  */
struct mode_vtype_group
{
  enum vlmul_type vlmul_for_min_vlen32[NUM_MACHINE_MODES];
  uint8_t ratio_for_min_vlen32[NUM_MACHINE_MODES];
  enum vlmul_type vlmul_for_min_vlen64[NUM_MACHINE_MODES];
  uint8_t ratio_for_min_vlen64[NUM_MACHINE_MODES];
  mode_vtype_group ()
  {
#define ENTRY(MODE, REQUIREMENT, VLMUL_FOR_MIN_VLEN32, RATIO_FOR_MIN_VLEN32,  \
              VLMUL_FOR_MIN_VLEN64, RATIO_FOR_MIN_VLEN64)                     \
  vlmul_for_min_vlen32[MODE##mode] = VLMUL_FOR_MIN_VLEN32;                    \
  ratio_for_min_vlen32[MODE##mode] = RATIO_FOR_MIN_VLEN32;                    \
  vlmul_for_min_vlen64[MODE##mode] = VLMUL_FOR_MIN_VLEN64;                    \
  ratio_for_min_vlen64[MODE##mode] = RATIO_FOR_MIN_VLEN64;
#include "riscv-vector-switch.def"
  }
};

static mode_vtype_group mode_vtype_infos;

/* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR.  */
enum vlmul_type
get_vlmul (machine_mode mode)
{
  if (TARGET_MIN_VLEN == 32)
    return mode_vtype_infos.vlmul_for_min_vlen32[mode];
  else
    return mode_vtype_infos.vlmul_for_min_vlen64[mode];
}

/* Get ratio according to machine mode.  */
unsigned int
get_ratio (machine_mode mode)
{
  if (TARGET_MIN_VLEN == 32)
    return mode_vtype_infos.ratio_for_min_vlen32[mode];
  else
    return mode_vtype_infos.ratio_for_min_vlen64[mode];
}

/* Get ta according to operand[tail_op_idx].  */
int
get_ta (rtx ta)
{
  if (INTVAL (ta) == TAIL_ANY)
    return INVALID_ATTRIBUTE;
  return INTVAL (ta);
}

/* Get ma according to operand[mask_op_idx].  */
int
get_ma (rtx ma)
{
  if (INTVAL (ma) == MASK_ANY)
    return INVALID_ATTRIBUTE;
  return INTVAL (ma);
}
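/* Note on get_ta/get_ma above: TAIL_ANY and MASK_ANY mean that either
   agnostic or undisturbed is acceptable (see get_prefer_tail_policy and
   get_prefer_mask_policy below), so they are mapped to INVALID_ATTRIBUTE
   instead of a fixed policy value; any concrete policy is returned
   unchanged.  */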
/* Get prefer tail policy.  */
enum tail_policy
get_prefer_tail_policy ()
{
  /* TODO: By default, we choose to use TAIL_ANY which allows the compiler
     to pick up either agnostic or undisturbed.  Maybe we will have a
     compile option like -mprefer=agnostic to set this value.  */
  return TAIL_ANY;
}

/* Get prefer mask policy.  */
enum mask_policy
get_prefer_mask_policy ()
{
  /* TODO: By default, we choose to use MASK_ANY which allows the compiler
     to pick up either agnostic or undisturbed.  Maybe we will have a
     compile option like -mprefer=agnostic to set this value.  */
  return MASK_ANY;
}

/* Get avl_type rtx.  */
rtx
get_avl_type_rtx (enum avl_type type)
{
  return gen_int_mode (type, Pmode);
}

/* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
   This function is not only used by builtins, but also will be used by
   auto-vectorization in the future.  */
opt_machine_mode
get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
{
  enum mode_class mclass;
  if (inner_mode == E_BImode)
    mclass = MODE_VECTOR_BOOL;
  else if (FLOAT_MODE_P (inner_mode))
    mclass = MODE_VECTOR_FLOAT;
  else
    mclass = MODE_VECTOR_INT;
  machine_mode mode;
  FOR_EACH_MODE_IN_CLASS (mode, mclass)
    if (inner_mode == GET_MODE_INNER (mode)
        && known_eq (nunits, GET_MODE_NUNITS (mode))
        && riscv_v_ext_vector_mode_p (mode))
      return mode;
  return opt_machine_mode ();
}

/* Return true if X is a CONST_INT that fits in a signed 5-bit immediate.  */
bool
simm5_p (rtx x)
{
  if (!CONST_INT_P (x))
    return false;
  return IN_RANGE (INTVAL (x), -16, 15);
}

/* Return true if the negation of X fits in a signed 5-bit immediate.  */
bool
neg_simm5_p (rtx x)
{
  if (!CONST_INT_P (x))
    return false;
  return IN_RANGE (INTVAL (x), -15, 16);
}

bool
has_vi_variant_p (rtx_code code, rtx x)
{
  switch (code)
    {
    case PLUS:
    case AND:
    case IOR:
    case XOR:
    case SS_PLUS:
    case US_PLUS:
    case EQ:
    case NE:
    case LE:
    case LEU:
    case GT:
    case GTU:
      return simm5_p (x);

    case LT:
    case LTU:
    case GE:
    case GEU:
    case MINUS:
    case SS_MINUS:
      return neg_simm5_p (x);

    default:
      return false;
    }
}

bool
sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
                     machine_mode vector_mode, machine_mode mask_mode,
                     bool has_vi_variant_p,
                     void (*emit_vector_func) (rtx *, rtx))
{
  machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
  if (has_vi_variant_p)
    {
      *scalar_op = force_reg (scalar_mode, *scalar_op);
      return false;
    }

  if (TARGET_64BIT)
    {
      if (!rtx_equal_p (*scalar_op, const0_rtx))
        *scalar_op = force_reg (scalar_mode, *scalar_op);
      return false;
    }

  if (immediate_operand (*scalar_op, Pmode))
    {
      if (!rtx_equal_p (*scalar_op, const0_rtx))
        *scalar_op = force_reg (Pmode, *scalar_op);

      *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op);
      return false;
    }

  if (CONST_INT_P (*scalar_op))
    *scalar_op = force_reg (scalar_mode, *scalar_op);

  rtx tmp = gen_reg_rtx (vector_mode);
  riscv_vector::emit_nonvlmax_op (code_for_pred_broadcast (vector_mode), tmp,
                                  *scalar_op, vl, mask_mode);
  emit_vector_func (operands, tmp);
  return true;
}

/* Get { ... ,0, 0, 0, ..., 0, 0, 0, 1 } mask.  */
rtx
gen_scalar_move_mask (machine_mode mode)
{
  rtx_vector_builder builder (mode, 1, 2);
  builder.quick_push (const1_rtx);
  builder.quick_push (const0_rtx);
  return builder.build ();
}
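/* Worked example for gen_scalar_move_mask above: the rtx_vector_builder is
   given one pattern of two elements, {1, 0}, and the trailing element
   repeats, so the built constant is a mask with exactly one active element
   (the element a scalar move writes); every other element is 0.  */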
static unsigned
compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size)
{
  // Original equation:
  //   VLMAX = (VectorBits / EltSize) * LMUL
  //   where LMUL = MinSize / TARGET_MIN_VLEN
  // The following equations have been reordered to prevent loss of precision
  // when calculating fractional LMUL.
  return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN;
}

static unsigned
get_unknown_min_value (machine_mode mode)
{
  enum vlmul_type vlmul = get_vlmul (mode);
  switch (vlmul)
    {
    case LMUL_1:
      return TARGET_MIN_VLEN;
    case LMUL_2:
      return TARGET_MIN_VLEN * 2;
    case LMUL_4:
      return TARGET_MIN_VLEN * 4;
    case LMUL_8:
      return TARGET_MIN_VLEN * 8;
    default:
      gcc_unreachable ();
    }
}

static rtx
force_vector_length_operand (rtx vl)
{
  if (CONST_INT_P (vl) && !satisfies_constraint_K (vl))
    return force_reg (Pmode, vl);
  return vl;
}

static rtx
gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
{
  unsigned int sew = get_sew (vmode);
  return gen_vsetvl_no_side_effects (Pmode, vl, avl,
                                     gen_int_mode (sew, Pmode),
                                     gen_int_mode (get_vlmul (vmode), Pmode),
                                     const0_rtx, const0_rtx);
}

/* GET VL * 2 rtx.  */
static rtx
get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
{
  rtx i32vl = NULL_RTX;
  if (CONST_INT_P (avl))
    {
      unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
      unsigned min_size = get_unknown_min_value (mode);
      unsigned vlen_max = RVV_65536;
      unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
      unsigned vlen_min = TARGET_MIN_VLEN;
      unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);

      unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
      if (avl_int <= vlmax_min)
        i32vl = gen_int_mode (2 * avl_int, Pmode);
      else if (avl_int >= 2 * vlmax_max)
        {
          // Just set i32vl to VLMAX in this situation.
          i32vl = gen_reg_rtx (Pmode);
          emit_insn (
            gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
        }
      else
        {
          // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
          // is related to the hardware implementation.
          // So let the following code handle it.
        }
    }
  if (!i32vl)
    {
      // Use the vsetvli instruction to get the actually used length, which
      // is related to the hardware implementation.
      rtx i64vl = gen_reg_rtx (Pmode);
      emit_insn (
        gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
      // Scale by 2 to get the 32-bit length.
      i32vl = gen_reg_rtx (Pmode);
      emit_insn (gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
    }

  return force_vector_length_operand (i32vl);
}
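/* The helper below expands a 64-bit-scalar vslide1up/vslide1down on
   !TARGET_64BIT targets: the scalar is split into its two 32-bit halves
   with gen_highpart/gen_lowpart, the vector operands are viewed in the
   demoted SEW32 mode with the doubled VL from get_vl_x2_rtx above, and two
   32-bit slide1 instructions insert the halves in the order required by the
   slide direction.  */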
bool
slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
                     machine_mode demote_mask_mode, rtx *ops)
{
  rtx scalar_op = ops[4];
  rtx avl = ops[5];
  machine_mode scalar_mode = GET_MODE_INNER (mode);
  if (rtx_equal_p (scalar_op, const0_rtx))
    {
      ops[5] = force_vector_length_operand (ops[5]);
      return false;
    }

  if (TARGET_64BIT)
    {
      ops[4] = force_reg (scalar_mode, scalar_op);
      ops[5] = force_vector_length_operand (ops[5]);
      return false;
    }

  if (immediate_operand (scalar_op, Pmode))
    {
      ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
      ops[5] = force_vector_length_operand (ops[5]);
      return false;
    }

  if (CONST_INT_P (scalar_op))
    scalar_op = force_reg (scalar_mode, scalar_op);

  rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);

  rtx demote_scalar_op1, demote_scalar_op2;
  if (unspec == UNSPEC_VSLIDE1UP)
    {
      demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
      demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
    }
  else
    {
      demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
      demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
    }

  rtx temp = gen_reg_rtx (demote_mode);
  rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
  rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
  rtx merge = RVV_VUNDEF (demote_mode);
  /* Handle vslide1_tu.  */
  if (register_operand (ops[2], mode)
      && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
    {
      merge = gen_lowpart (demote_mode, ops[2]);
      ta = ops[6];
      ma = ops[7];
    }

  emit_insn (gen_pred_slide (unspec, demote_mode, temp,
                             CONSTM1_RTX (demote_mask_mode), merge,
                             gen_lowpart (demote_mode, ops[3]),
                             demote_scalar_op1, vl_x2, ta, ma, ops[8]));
  emit_insn (gen_pred_slide (unspec, demote_mode,
                             gen_lowpart (demote_mode, ops[0]),
                             CONSTM1_RTX (demote_mask_mode), merge, temp,
                             demote_scalar_op2, vl_x2, ta, ma, ops[8]));

  if (rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
    return true;
  else
    emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
                               force_vector_length_operand (ops[5]), ops[6],
                               ops[8]));
  return true;
}

rtx
gen_avl_for_scalar_move (rtx avl)
{
  /* AVL for scalar move has different behavior between 0 and larger than 0.  */
  if (CONST_INT_P (avl))
    {
      /* So we could just set AVL to 1 for any constant other than 0.  */
      if (rtx_equal_p (avl, const0_rtx))
        return const0_rtx;
      else
        return const1_rtx;
    }
  else
    {
      /* For a non-constant value, we set any non-zero value to 1 by
         `sgtu new_avl,input_avl,zero` + `vsetvli`.  */
      rtx tmp = gen_reg_rtx (Pmode);
      emit_insn (
        gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
      return tmp;
    }
}

} // namespace riscv_vector