Diffstat (limited to 'gcc/config/arm')
-rw-r--r--   gcc/config/arm/arm-protos.h |    4
-rw-r--r--   gcc/config/arm/arm.cc       | 1249
-rw-r--r--   gcc/config/arm/arm.opt      |    3
-rw-r--r--   gcc/config/arm/iterators.md |   15
-rw-r--r--   gcc/config/arm/mve.md       |   50
-rw-r--r--   gcc/config/arm/thumb2.md    |  138
-rw-r--r--   gcc/config/arm/types.md     |    6
-rw-r--r--   gcc/config/arm/unspecs.md   |   14
8 files changed, 1424 insertions, 55 deletions
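For orientation before the patch body: this change turns MVE intrinsics loops that compute a vctp predicate from a live element count into hardware tail-predicated low-overhead loops (dlstp/letp). A minimal sketch of the kind of source loop that qualifies; this example is ours, not part of the patch, and assumes something like -O2 -march=armv8.1-m.main+mve:

    #include <arm_mve.h>

    /* DLSTP_TYPE_A shape: the loop counter *is* the element count and is
       decremented by the number of lanes (4 for 32-bit elements).  */
    void
    vadd_loop (const int32_t *a, const int32_t *b, int32_t *c, int n)
    {
      while (n > 0)
        {
          mve_pred16_t p = vctp32q (n);        /* predicate from remaining n */
          int32x4_t va = vldrwq_z_s32 (a, p);  /* zero-predicated loads */
          int32x4_t vb = vldrwq_z_s32 (b, p);
          vstrwq_p_s32 (c, vaddq_x_s32 (va, vb, p), p);
          a += 4; b += 4; c += 4;
          n -= 4;
        }
    }

With this patch the vctp and the per-instruction VPT predication can be dropped in favour of a dlstp.32/letp loop; a schematic of the expected before/after assembly is given at the end of this page.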
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2cd560c..34d6be7 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void);
 extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
-extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern bool arm_target_bb_ok_for_lob (basic_block);
+extern int arm_attempt_dlstp_transform (rtx);
 #ifdef RTX_CODE
 enum reg_class arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index b8c32db..7d67d2c 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -668,6 +668,12 @@ static const scoped_attribute_specs *const arm_attribute_table[] =
 #undef TARGET_HAVE_CONDITIONAL_EXECUTION
 #define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust
+
+#undef TARGET_PREDICT_DOLOOP_P
+#define TARGET_PREDICT_DOLOOP_P arm_predict_doloop_p
+
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p
 
@@ -34659,19 +34665,1236 @@ arm_invalid_within_doloop (const rtx_insn *insn)
 }
 
 bool
-arm_target_insn_ok_for_lob (rtx insn)
-{
-  basic_block bb = BLOCK_FOR_INSN (insn);
-  /* Make sure the basic block of the target insn is a simple latch
-     having as single predecessor and successor the body of the loop
-     itself.  Only simple loops with a single basic block as body are
-     supported for 'low over head loop' making sure that LE target is
-     above LE itself in the generated code.  */
-
-  return single_succ_p (bb)
-    && single_pred_p (bb)
-    && single_succ_edge (bb)->dest == single_pred_edge (bb)->src
-    && contains_no_active_insn_p (bb);
+arm_target_bb_ok_for_lob (basic_block bb)
+{
+  /* Make sure the basic block is a simple latch having as the single
+     predecessor and successor the body of the loop itself.
+     Only simple loops with a single basic block as body are supported for
+     low-overhead loops, making sure that the LE target is above the LE
+     instruction in the generated code.  */
+  return (single_succ_p (bb)
+	  && single_pred_p (bb)
+	  && single_succ_edge (bb)->dest == single_pred_edge (bb)->src);
+}
+
+/* Utility function: Given a VCTP or a VCTP_M insn, return the number of MVE
+   lanes based on the machine mode being used.  */
+
+static int
+arm_mve_get_vctp_lanes (rtx_insn *insn)
+{
+  rtx insn_set = single_set (insn);
+  if (insn_set
+      && GET_CODE (SET_SRC (insn_set)) == UNSPEC
+      && (XINT (SET_SRC (insn_set), 1) == VCTP
+	  || XINT (SET_SRC (insn_set), 1) == VCTP_M))
+    {
+      machine_mode mode = GET_MODE (SET_SRC (insn_set));
+      return ((VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode))
+	      ? GET_MODE_NUNITS (mode) : 0);
+    }
+  return 0;
+}
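The lane counts returned here come from the predicate mode (16, 8, 4 or 2 lanes for a 128-bit vector). As a mental model of what a vctp computes, a sketch in plain C; this is ours, not part of the patch:

    #include <stdint.h>

    /* MVE predicates (mve_pred16_t) carry one bit per byte of the 128-bit
       vector, so a 32-bit-element vctp controls 4 lanes of 4 bits each.  */
    static uint16_t
    model_vctp32 (uint32_t n)
    {
      uint16_t p = 0;
      for (unsigned i = 0; i < 4; i++)
        if (i < n)
          p |= (uint16_t) 0xf << (4 * i);  /* enable all 4 bits of lane i */
      return p;
    }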
+
+enum arm_dl_usage_type { DL_USAGE_ANY = 0,
+			 DL_USAGE_READ = 1,
+			 DL_USAGE_WRITE = 2 };
+
+/* Check if INSN requires the use of the VPR reg, and if it does, return the
+   sub-rtx of the VPR reg.  The TYPE argument controls whether
+   this function should:
+   * For TYPE == DL_USAGE_ANY, check all operands, including the OUT operands,
+     and return the first occurrence of the VPR reg.
+   * For TYPE == DL_USAGE_READ, only check the input operands.
+   * For TYPE == DL_USAGE_WRITE, only check the output operands.
+   (INOUT operands are considered both as input and output operands.)  */
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn,
+			  arm_dl_usage_type type = DL_USAGE_ANY)
+{
+  gcc_assert (type < 3);
+  if (!NONJUMP_INSN_P (insn))
+    return NULL_RTX;
+
+  bool requires_vpr;
+  extract_constrain_insn (insn);
+  int n_operands = recog_data.n_operands;
+  if (recog_data.n_alternatives == 0)
+    return NULL_RTX;
+
+  /* Fill in recog_op_alt with information about the constraints of
+     this insn.  */
+  preprocess_constraints (insn);
+
+  for (int op = 0; op < n_operands; op++)
+    {
+      requires_vpr = true;
+      if (type == DL_USAGE_READ
+	  && recog_data.operand_type[op] == OP_OUT)
+	continue;
+      else if (type == DL_USAGE_WRITE
+	       && recog_data.operand_type[op] == OP_IN)
+	continue;
+
+      /* Iterate through alternatives of operand "op" in recog_op_alt and
+	 identify if the operand is required to be the VPR.  */
+      for (int alt = 0; alt < recog_data.n_alternatives; alt++)
+	{
+	  const operand_alternative *op_alt
+	    = &recog_op_alt[alt * n_operands];
+	  /* Fetch the reg_class for each entry and check it against the
+	     VPR_REG reg_class.  */
+	  if (alternative_class (op_alt, op) != VPR_REG)
+	    requires_vpr = false;
+	}
+      /* If all alternatives of the insn require the VPR reg for this operand,
+	 it means that either this is a VPR-generating instruction, like a
+	 vctp, vcmp, etc., or it is a VPT-predicated instruction.  Return the
+	 subrtx of the VPR reg operand.  */
+      if (requires_vpr)
+	return recog_data.operand[op];
+    }
+  return NULL_RTX;
+}
+
+/* Wrapper function of arm_get_required_vpr_reg with TYPE == DL_USAGE_READ,
+   so return the VPR only if it is an input operand to the insn.  */
+
+static rtx
+arm_get_required_vpr_reg_param (rtx_insn *insn)
+{
+  return arm_get_required_vpr_reg (insn, DL_USAGE_READ);
+}
+
+/* Wrapper function of arm_get_required_vpr_reg with TYPE == DL_USAGE_WRITE,
+   so return the VPR only if it is the return value, an output of, or is
+   clobbered by the insn.  */
+
+static rtx
+arm_get_required_vpr_reg_ret_val (rtx_insn *insn)
+{
+  return arm_get_required_vpr_reg (insn, DL_USAGE_WRITE);
+}
+
+/* Return the first VCTP instruction in BB, if it exists, or NULL
+   otherwise.  */
+
+static rtx_insn *
+arm_mve_get_loop_vctp (basic_block bb)
+{
+  rtx_insn *insn = BB_HEAD (bb);
+
+  /* Now scan through all the instruction patterns and pick out the VCTP
+     instruction.  We require arm_get_required_vpr_reg_param to be false
+     to make sure we pick up a VCTP, rather than a VCTP_M.  */
+  FOR_BB_INSNS (bb, insn)
+    if (NONDEBUG_INSN_P (insn))
+      if (arm_get_required_vpr_reg_ret_val (insn)
+	  && (arm_mve_get_vctp_lanes (insn) != 0)
+	  && !arm_get_required_vpr_reg_param (insn))
+	return insn;
+  return NULL;
+}
+
+/* Return true if INSN is an MVE instruction that is VPT-predicable and is
+   predicated on VPR_REG.  */
+
+static bool
+arm_mve_insn_predicated_by (rtx_insn *insn, rtx vpr_reg)
+{
+  rtx insn_vpr_reg_operand = (MVE_VPT_PREDICATED_INSN_P (insn)
+			      ? arm_get_required_vpr_reg_param (insn)
+			      : NULL_RTX);
+  return (insn_vpr_reg_operand
+	  && rtx_equal_p (vpr_reg, insn_vpr_reg_operand));
+}
+
+/* Utility function to identify if INSN is an MVE instruction that performs
+   some across-lane operation (and as a result does not align with normal
+   lane predication rules).  All such instructions give only one scalar
+   output, except for vshlcq, which gives a PARALLEL of a vector and a scalar
+   (one vector result and one carry output).  */
+
+static bool
+arm_mve_across_lane_insn_p (rtx_insn* insn)
+{
+  df_ref insn_defs = NULL;
+  if (!MVE_VPT_PREDICABLE_INSN_P (insn))
+    return false;
+
+  FOR_EACH_INSN_DEF (insn_defs, insn)
+    if (!VALID_MVE_MODE (GET_MODE (DF_REF_REG (insn_defs)))
+	&& !arm_get_required_vpr_reg_ret_val (insn))
+      return true;
+
+  return false;
+}
+
+/* Utility function to identify if INSN is an MVE load or store instruction.
+   * For TYPE == DL_USAGE_ANY, check all operands.  If the function returns
+     true, INSN is a load or a store insn.
+   * For TYPE == DL_USAGE_READ, only check the input operands.  If the
+     function returns true, INSN is a load insn.
+   * For TYPE == DL_USAGE_WRITE, only check the output operands.  If the
+     function returns true, INSN is a store insn.  */
+
+static bool
+arm_mve_load_store_insn_p (rtx_insn* insn,
+			   arm_dl_usage_type type = DL_USAGE_ANY)
+{
+  gcc_assert (type < 3);
+  extract_insn (insn);
+  int n_operands = recog_data.n_operands;
+
+  for (int op = 0; op < n_operands; op++)
+    {
+      if (type == DL_USAGE_READ && recog_data.operand_type[op] == OP_OUT)
+	continue;
+      else if (type == DL_USAGE_WRITE && recog_data.operand_type[op] == OP_IN)
+	continue;
+      if (mve_memory_operand (recog_data.operand[op],
+			      GET_MODE (recog_data.operand[op])))
+	return true;
+    }
+  return false;
+}
+
+/* Return TRUE if INSN is validated for implicit predication by how its
+   outputs are used.
+
+   If INSN is an MVE operation across lanes that is not predicated by
+   VCTP_VPR_GENERATED, it cannot be validated by the use of its outputs.
+
+   Any other INSN is safe to implicitly predicate if we don't use its outputs
+   outside the loop.  The instructions that use this INSN's outputs will be
+   validated as we go through the analysis.  */
+
+static bool
+arm_mve_impl_pred_on_outputs_p (rtx_insn *insn, rtx vctp_vpr_generated)
+{
+  /* Reject any unpredicated across-lane operation.  */
+  if (!arm_mve_insn_predicated_by (insn, vctp_vpr_generated)
+      && arm_mve_across_lane_insn_p (insn))
+    return false;
+
+  /* Next, scan forward to the various USEs of the DEFs in this insn.  */
+  df_ref insn_def = NULL;
+  basic_block insn_bb = BLOCK_FOR_INSN (insn);
+  FOR_EACH_INSN_DEF (insn_def, insn)
+    {
+      for (df_ref use = DF_REG_USE_CHAIN (DF_REF_REGNO (insn_def));
+	   use;
+	   use = DF_REF_NEXT_REG (use))
+	{
+	  rtx_insn *next_use_insn = DF_REF_INSN (use);
+	  if (!INSN_P (next_use_insn) || DEBUG_INSN_P (next_use_insn))
+	    continue;
+
+	  if (insn_bb != BLOCK_FOR_INSN (next_use_insn))
+	    return false;
+	}
+    }
+  return true;
+}
+
+
+/* Return the prevailing definition of OP before CUR_INSN in the same
+   basic block as CUR_INSN, if one exists, or NULL otherwise.  */
+
+static rtx_insn*
+arm_last_vect_def_insn (rtx op, rtx_insn *cur_insn)
+{
+  if (!REG_P (op)
+      || !BLOCK_FOR_INSN (cur_insn))
+    return NULL;
+
+  df_ref def_insns;
+  rtx_insn *last_def = NULL;
+  for (def_insns = DF_REG_DEF_CHAIN (REGNO (op));
+       def_insns;
+       def_insns = DF_REF_NEXT_REG (def_insns))
+    {
+      rtx_insn *def_insn = DF_REF_INSN (def_insns);
+      /* Definition not in the loop body or after the current insn.  */
+      if (DF_REF_BB (def_insns) != BLOCK_FOR_INSN (cur_insn)
+	  || INSN_UID (def_insn) >= INSN_UID (cur_insn))
+	continue;
+
+      if (!last_def || INSN_UID (def_insn) > INSN_UID (last_def))
+	last_def = def_insn;
+    }
+  return last_def;
+}
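The reason across-lane operations get this special treatment: implicit predication leaves the inactive tail lanes out of the computation, which is only equivalent to the unpredicated form if those lanes could not have contributed anyway. A hedged intrinsics-level illustration (ours, not from the patch):

    #include <arm_mve.h>

    /* vaddvq sums *all* lanes.  Inside a tail-predicated loop this is only
       safe unpredicated when the tail lanes are known to be zero, e.g. when
       the input comes from a zero-predicated load (vldrwq_z): adding zeros
       does not change the sum.  */
    int32_t
    tail_sum (const int32_t *a, uint32_t n)  /* assume n < 4: final iteration */
    {
      mve_pred16_t p = vctp32q (n);
      int32x4_t va = vldrwq_z_s32 (a, p);    /* inactive lanes zeroed */
      return vaddvq_s32 (va);                /* safe: zeros are neutral */
    }

This is exactly the property that PROPS_ZERO_SET tracks below.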
+
+/* This function returns TRUE if we can validate the implicit predication of
+   INSN_IN with VCTP_VPR_GENERATED based on the definition of the
+   instruction's input operands.
+
+   If INSN_IN is an MVE operation across lanes, then all of its MVE vector
+   operands must have their tail-predicated lanes be zero.  We keep track of
+   any instructions that define vector operands for which this is true in
+   PROPS_ZERO_SET.
+
+   For any other INSN_IN, the definition of all its operands must be defined
+   inside the loop body by an instruction that comes before INSN_IN and must
+   not be an MVE load predicated by a different VPR.  These instructions have
+   all been validated for explicit or implicit predication.  */
+
+static bool
+arm_mve_impl_pred_on_inputs_p (vec <rtx_insn *> *props_zero_set,
+			       rtx_insn *insn_in, rtx vctp_vpr_generated)
+{
+  /* If all inputs come from instructions that are explicitly or
+     implicitly predicated by the same predicate then it is safe to
+     implicitly predicate this instruction.  */
+  df_ref insn_uses = NULL;
+  bool across_lane = arm_mve_across_lane_insn_p (insn_in);
+  FOR_EACH_INSN_USE (insn_uses, insn_in)
+    {
+      rtx op = DF_REF_REG (insn_uses);
+      rtx_insn *def_insn = arm_last_vect_def_insn (op, insn_in);
+      if (across_lane)
+	{
+	  if (!VALID_MVE_MODE (GET_MODE (op)))
+	    continue;
+	  if (!def_insn || !props_zero_set->contains (def_insn))
+	    return false;
+
+	  continue;
+	}
+
+      if (!def_insn
+	  || (!arm_mve_insn_predicated_by (def_insn, vctp_vpr_generated)
+	      && arm_mve_load_store_insn_p (def_insn, DL_USAGE_READ)))
+	return false;
+    }
+
+  return true;
+}
+
+
+/* Determine whether INSN_IN is safe to implicitly predicate based on the
+   type of instruction and, where needed, the definition of its inputs and
+   the uses of its outputs.
+   Return TRUE if it is safe to implicitly predicate and FALSE otherwise.
+
+   * If INSN_IN is a store, then it is always unsafe to implicitly predicate
+     it.
+   * If INSN_IN is a load, only reject implicit predication if its uses
+     directly invalidate it.
+   * If INSN_IN operates across vector lanes and does not have the
+     "mve_safe_imp_xlane_pred" attribute, then it is always unsafe to
+     implicitly predicate.
+   * If INSN_IN operates on Floating Point elements and we are not compiling
+     with -Ofast, then it is unsafe to implicitly predicate it as we may be
+     changing exception and cumulative bits behaviour.
+   * If INSN_IN is a VCTP instruction, then it is safe to implicitly
+     predicate, but instructions that use this predicate will need to be
+     checked just like any other UNPREDICATED MVE instruction.
+   * Otherwise, check if the implicit predication of INSN_IN can be validated
+     based on its inputs, and if not, check whether it can be validated based
+     on how its outputs are used:
+     - it is safe if all inputs come from instructions that are explicitly or
+       implicitly predicated by the same predicate;
+     - if INSN_IN is an operation across lanes with the
+       "mve_safe_imp_xlane_pred" attribute, all its operands must have zeroed
+       falsely-predicated tail lanes.  */
+
+static bool
+arm_mve_impl_predicated_p (vec <rtx_insn *> *props_zero_set,
+			   rtx_insn *insn_in, rtx vctp_vpr_generated)
+{
+
+  /* If INSN_IN is a store, then it is always unsafe to implicitly
+     predicate it.  */
+  if (arm_mve_load_store_insn_p (insn_in, DL_USAGE_WRITE))
+    return false;
+
+  /* If INSN_IN is a load, only reject implicit predication if its uses
+     directly invalidate it.  */
+  if (arm_mve_load_store_insn_p (insn_in, DL_USAGE_READ))
+    {
+      if (!arm_mve_impl_pred_on_outputs_p (insn_in, vctp_vpr_generated))
+	return false;
+      return true;
+    }
+
+  /* If INSN_IN operates across vector lanes and does not have the
+     "mve_safe_imp_xlane_pred" attribute, then it is always unsafe to
+     implicitly predicate.  */
+  if (arm_mve_across_lane_insn_p (insn_in)
+      && (get_attr_mve_safe_imp_xlane_pred (insn_in)
+	  != MVE_SAFE_IMP_XLANE_PRED_YES))
+    return false;
+
+  /* If INSN_IN operates on Floating Point elements and we are not compiling
+     with -Ofast, then it is unsafe to implicitly predicate it as we may be
+     changing exception and cumulative bits behaviour.  */
+  if (!flag_unsafe_math_optimizations
+      && flag_trapping_math
+      && MVE_VPT_UNPREDICATED_INSN_P (insn_in))
+    {
+      df_ref def;
+      FOR_EACH_INSN_DEF (def, insn_in)
+	if (DF_REF_TYPE (def) == DF_REF_REG_DEF
+	    && FLOAT_MODE_P (GET_MODE (DF_REF_REG (def))))
+	  return false;
+      FOR_EACH_INSN_USE (def, insn_in)
+	if (DF_REF_TYPE (def) == DF_REF_REG_DEF
+	    && FLOAT_MODE_P (GET_MODE (DF_REF_REG (def))))
+	  return false;
+    }
+
+  /* If INSN_IN is a VCTP instruction, then it is safe to implicitly
+     predicate, but instructions that use this predicate will need to be
+     checked just like any other UNPREDICATED MVE instruction.  */
+  if (arm_get_required_vpr_reg_ret_val (insn_in)
+      && (arm_mve_get_vctp_lanes (insn_in) != 0))
+    return true;
+
+  /* Otherwise, check if the implicit predication of INSN_IN can be validated
+     based on its inputs, and if not check whether it can be validated based
+     on how its outputs are used.  */
+  return (arm_mve_impl_pred_on_inputs_p (props_zero_set, insn_in,
+					 vctp_vpr_generated)
+	  || arm_mve_impl_pred_on_outputs_p (insn_in, vctp_vpr_generated));
+}
+
+/* Helper function to `arm_mve_dlstp_check_inc_counter` and to
+   `arm_mve_dlstp_check_dec_counter`.  In the situations where the loop
+   counter is incrementing by 1 or decrementing by 1 in each iteration,
+   ensure that the number of iterations, the value of REG, going into the
+   loop, was calculated as:
+	REG = (N + [1, VCTP_STEP - 1]) / VCTP_STEP
+   where N is equivalent to the VCTP_REG.  */
+
+static bool
+arm_mve_check_reg_origin_is_num_elems (loop *loop, rtx reg, rtx vctp_step,
+				       rtx vctp_reg)
+{
+  df_ref counter_max_last_def = NULL;
+
+  /* More than one reaching definition.  */
+  if (DF_REG_DEF_COUNT (REGNO (reg)) > 2)
+    return false;
+
+  /* Look for a single definition of REG going into the loop.  The DEF_CHAIN
+     will have at least two values, as this is a loop induction variable that
+     is defined outside the loop.  */
+  for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
+       def;
+       def = DF_REF_NEXT_REG (def))
+    {
+      /* Skip the update inside the loop, this has already been checked by
+	 the iv_analyze call earlier.  */
+      if (DF_REF_BB (def) == loop->header)
+	continue;
+
+      counter_max_last_def = def;
+      break;
+    }
+
+  if (!counter_max_last_def)
+    return false;
+
+  rtx counter_max_last_set = single_set (DF_REF_INSN (counter_max_last_def));
+
+  if (!counter_max_last_set)
+    return false;
+
+  /* If we encounter a simple SET from a REG, follow it through.  */
+  if (REG_P (SET_SRC (counter_max_last_set)))
+    {
+      if (DF_REG_DEF_COUNT (REGNO (SET_SRC (counter_max_last_set))) != 1)
+	return false;
+
+      counter_max_last_def
+	= DF_REG_DEF_CHAIN (REGNO (SET_SRC (counter_max_last_set)));
+      counter_max_last_set
+	= single_set (DF_REF_INSN (counter_max_last_def));
+
+      if (!counter_max_last_set)
+	return false;
+    }
+
+  /* We are looking for:
+     COUNTER_MAX_LAST_SET = (N + VCTP_STEP - 1) / VCTP_STEP.
+     We currently only support the unsigned VCTP_OP case.  */
+  rtx division = SET_SRC (counter_max_last_set);
+  if (GET_CODE (division) != LSHIFTRT)
+    return false;
+
+  /* Now check that we are dividing by VCTP_STEP, i.e. the number of
+     lanes.  */
+  rtx divisor = XEXP (division, 1);
+  unsigned vctp_step_cst = abs_hwi (INTVAL (vctp_step));
+  if (!CONST_INT_P (divisor)
+      || (1U << INTVAL (divisor) != vctp_step_cst))
+    return false;
+
+  rtx dividend = XEXP (division, 0);
+  if (!REG_P (dividend))
+    /* Subreg?  */
+    return false;
+
+  /* For now only support the simple case; this only works for unsigned N,
+     any signed N will have further computations to deal with overflow.  */
+  if (DF_REG_DEF_COUNT (REGNO (dividend)) != 1)
+    return false;
+
+  rtx_insn *dividend_insn = DF_REF_INSN (DF_REG_DEF_CHAIN (REGNO (dividend)));
+  rtx dividend_op = single_set (dividend_insn);
+  if (!dividend_op
+      || GET_CODE (SET_SRC (dividend_op)) != PLUS)
+    return false;
+
+  /* Check if PLUS_OP is (VCTP_OP + VAL), where VAL = [1, VCTP_STEP - 1].  */
+  rtx plus_op = SET_SRC (dividend_op);
+  if (!REG_P (XEXP (plus_op, 0))
+      || !CONST_INT_P (XEXP (plus_op, 1))
+      || !IN_RANGE (INTVAL (XEXP (plus_op, 1)), 1, vctp_step_cst - 1))
+    return false;
+
+  /* VCTP_REG may have been copied before entering the loop, let's see if we
+     can trace such a copy back.  If we have more than one reaching
+     definition then bail out as analysis will be too difficult.  */
+  if (DF_REG_DEF_COUNT (REGNO (vctp_reg)) > 2)
+    return false;
+
+  /* Look for the definition of N.  */
+  for (df_ref def = DF_REG_DEF_CHAIN (REGNO (vctp_reg));
+       def;
+       def = DF_REF_NEXT_REG (def))
+    {
+      if (DF_REF_BB (def) == loop->header)
+	continue;
+      rtx set = single_set (DF_REF_INSN (def));
+      if (set
+	  && REG_P (SET_SRC (set))
+	  && !HARD_REGISTER_P (SET_SRC (set)))
+	vctp_reg = SET_SRC (set);
+    }
+
+  return rtx_equal_p (vctp_reg, XEXP (plus_op, 0));
+}
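Concretely, for a 4-lane (32-bit element) loop the function accepts the round-up iteration count that GCC emits as a shift. A sketch (ours) of the source-level computation being matched:

    #include <stdint.h>

    /* Matched shape: LSHIFTRT by log2(lanes) of (N + k), k in [1, lanes - 1];
       the usual round-up division uses k == lanes - 1.  */
    static uint32_t
    niters_vctp32 (uint32_t n)
    {
      return (n + 3) >> 2;   /* (n + lanes - 1) / lanes with lanes == 4 */
    }

Note the original guard read `!dividend_op && GET_CODE (SET_SRC (dividend_op)) != PLUS`, which would dereference a null set and never reject a non-PLUS source; it is corrected to `||` above, which is clearly the intended check.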
+
+/* If we have identified the loop to have an incrementing counter, we need
+   to make sure that it increments by 1 and that the loop is structured
+   correctly:
+   * The counter starts from 0.
+   * The counter terminates at (num_of_elem + num_of_lanes - 1)
+     / num_of_lanes.
+   * The vctp insn uses a reg that decrements appropriately in each
+     iteration.  */
+
+static rtx_insn*
+arm_mve_dlstp_check_inc_counter (loop *loop, rtx_insn* vctp_insn,
+				 rtx condconst, rtx condcount)
+{
+  rtx vctp_reg = XVECEXP (XEXP (PATTERN (vctp_insn), 1), 0, 0);
+  /* The loop latch has to be empty.  When compiling all the known MVE LoLs
+     in user applications, none of those with incrementing counters had any
+     real insns in the loop latch.  As such, this function has only been
+     tested with an empty latch and may misbehave or ICE if we somehow get
+     here with an increment in the latch, so, for correctness, error out
+     early.  */
+  if (!empty_block_p (loop->latch))
+    return NULL;
+
+  class rtx_iv vctp_reg_iv;
+  /* For loops of DLSTP_TYPE_B, the loop counter is independent of the
+     decrement of the reg used in the vctp_insn.  So run iv analysis on that
+     reg.  This has to succeed for such loops to be supported.  */
+  if (!iv_analyze (vctp_insn, as_a<scalar_int_mode> (GET_MODE (vctp_reg)),
+		   vctp_reg, &vctp_reg_iv))
+    return NULL;
+
+  /* Extract the decrementnum of the vctp reg from the iv.  This decrementnum
+     is the number of lanes/elements it decrements from the remaining number
+     of lanes/elements to process in the loop; for this reason it is always a
+     negative number, but to simplify later checks we use its absolute
+     value.  */
+  HOST_WIDE_INT decrementnum = INTVAL (vctp_reg_iv.step);
+  if (decrementnum >= 0)
+    return NULL;
+  decrementnum = abs_hwi (decrementnum);
+
+  /* Find where both of those are modified in the loop header bb.  */
+  df_ref condcount_reg_set_df = df_bb_regno_only_def_find (loop->header,
+							   REGNO (condcount));
+  df_ref vctp_reg_set_df = df_bb_regno_only_def_find (loop->header,
+						      REGNO (vctp_reg));
+  if (!condcount_reg_set_df || !vctp_reg_set_df)
+    return NULL;
+  rtx condcount_reg_set = single_set (DF_REF_INSN (condcount_reg_set_df));
+  rtx vctp_reg_set = single_set (DF_REF_INSN (vctp_reg_set_df));
+  if (!condcount_reg_set || !vctp_reg_set)
+    return NULL;
+
+  /* Ensure the modification of the vctp reg from df is consistent with
+     the iv and the number of lanes on the vctp insn.  */
+  if (GET_CODE (SET_SRC (vctp_reg_set)) != PLUS
+      || !REG_P (SET_DEST (vctp_reg_set))
+      || !REG_P (XEXP (SET_SRC (vctp_reg_set), 0))
+      || REGNO (SET_DEST (vctp_reg_set))
+	 != REGNO (XEXP (SET_SRC (vctp_reg_set), 0))
+      || !CONST_INT_P (XEXP (SET_SRC (vctp_reg_set), 1))
+      || INTVAL (XEXP (SET_SRC (vctp_reg_set), 1)) >= 0
+      || decrementnum != abs_hwi (INTVAL (XEXP (SET_SRC (vctp_reg_set), 1)))
+      || decrementnum != arm_mve_get_vctp_lanes (vctp_insn))
+    return NULL;
+
+  if (REG_P (condcount) && REG_P (condconst))
+    {
+      /* First we need to prove that the loop is going 0..condconst with an
+	 inc of 1 in each iteration.  */
+      if (GET_CODE (SET_SRC (condcount_reg_set)) == PLUS
+	  && CONST_INT_P (XEXP (SET_SRC (condcount_reg_set), 1))
+	  && INTVAL (XEXP (SET_SRC (condcount_reg_set), 1)) == 1)
+	{
+	  rtx counter_reg = SET_DEST (condcount_reg_set);
+	  /* Check that the counter did indeed start from zero.  */
+	  df_ref this_set = DF_REG_DEF_CHAIN (REGNO (counter_reg));
+	  if (!this_set)
+	    return NULL;
+	  df_ref last_set_def = DF_REF_NEXT_REG (this_set);
+	  if (!last_set_def)
+	    return NULL;
+	  rtx_insn* last_set_insn = DF_REF_INSN (last_set_def);
+	  rtx last_set = single_set (last_set_insn);
+	  if (!last_set)
+	    return NULL;
+	  rtx counter_orig_set;
+	  counter_orig_set = SET_SRC (last_set);
+	  if (!CONST_INT_P (counter_orig_set)
+	      || (INTVAL (counter_orig_set) != 0))
+	    return NULL;
+	  /* And finally check that the target value of the counter,
+	     condconst, is of the correct shape.  */
+	  if (!arm_mve_check_reg_origin_is_num_elems (loop, condconst,
+						      vctp_reg_iv.step,
+						      vctp_reg))
+	    return NULL;
+	}
+      else
+	return NULL;
+    }
+  else
+    return NULL;
+
+  /* Everything looks valid.  */
+  return vctp_insn;
+}
+
+/* Helper function to `arm_mve_loop_valid_for_dlstp`.  In the case of a
+   counter that is decrementing, ensure that it is decrementing by the
+   right amount in each iteration and that the target condition is what
+   we expect.  */
+
+static rtx_insn*
+arm_mve_dlstp_check_dec_counter (loop *loop, rtx_insn* vctp_insn,
+				 rtx condconst, rtx condcount)
+{
+  rtx vctp_reg = XVECEXP (XEXP (PATTERN (vctp_insn), 1), 0, 0);
+  class rtx_iv vctp_reg_iv;
+  HOST_WIDE_INT decrementnum;
+  /* For decrementing loops of DLSTP_TYPE_A, the counter is usually present
+     in the loop latch.  Here we simply need to verify that this counter is
+     the same reg that is also used in the vctp_insn and that it is not
+     otherwise modified.  */
+  rtx_insn *dec_insn = BB_END (loop->latch);
+  /* If not in the loop latch, try to find the decrement in the loop
+     header.  */
+  if (!NONDEBUG_INSN_P (dec_insn))
+    {
+      df_ref temp = df_bb_regno_only_def_find (loop->header,
+					       REGNO (condcount));
+      /* If we haven't been able to find the decrement, bail out.  */
+      if (!temp)
+	return NULL;
+      dec_insn = DF_REF_INSN (temp);
+    }
+
+  rtx dec_set = single_set (dec_insn);
+
+  /* Next, ensure that it is a PLUS of the form:
+     (set (reg a) (plus (reg a) (const_int)))
+     where (reg a) is the same as condcount.  */
+  if (!dec_set
+      || !REG_P (SET_DEST (dec_set))
+      || !REG_P (XEXP (SET_SRC (dec_set), 0))
+      || !CONST_INT_P (XEXP (SET_SRC (dec_set), 1))
+      || REGNO (SET_DEST (dec_set))
+	 != REGNO (XEXP (SET_SRC (dec_set), 0))
+      || REGNO (SET_DEST (dec_set)) != REGNO (condcount))
+    return NULL;
+
+  decrementnum = INTVAL (XEXP (SET_SRC (dec_set), 1));
+
+  /* This decrementnum is the number of lanes/elements it decrements from the
+     remaining number of lanes/elements to process in the loop; for this
+     reason it is always a negative number, but to simplify later checks we
+     use its absolute value.  */
+  if (decrementnum >= 0)
+    return NULL;
+  decrementnum = -decrementnum;
+
+  /* If the decrementnum is a 1, then we need to look at the loop vctp_reg
+     and verify that it also decrements correctly.
+     Then, we need to establish that the starting value of the loop decrement
+     originates from the starting value of the vctp decrement.  */
+  if (decrementnum == 1)
+    {
+      class rtx_iv vctp_reg_iv, condcount_reg_iv;
+      /* The loop counter is found to be independent of the decrement
+	 of the reg used in the vctp_insn, again.  Ensure that IV analysis
+	 succeeds and check the step.  */
+      if (!iv_analyze (vctp_insn, as_a<scalar_int_mode> (GET_MODE (vctp_reg)),
+		       vctp_reg, &vctp_reg_iv))
+	return NULL;
+      /* Ensure it matches the number of lanes of the vctp instruction.  */
+      if (abs (INTVAL (vctp_reg_iv.step))
+	  != arm_mve_get_vctp_lanes (vctp_insn))
+	return NULL;
+
+      if (!arm_mve_check_reg_origin_is_num_elems (loop, condcount,
+						  vctp_reg_iv.step,
+						  vctp_reg))
+	return NULL;
+    }
+  /* If the decrements are the same, then the situation is simple: either
+     they are also the same reg, which is safe, or they are different
+     registers, in which case make sure that there is only a simple SET from
+     one to the other inside the loop.  */
+  else if (decrementnum == arm_mve_get_vctp_lanes (vctp_insn))
+    {
+      if (REGNO (condcount) != REGNO (vctp_reg))
+	{
+	  /* It wasn't the same reg, but it could be behind a
+	     (set (vctp_reg) (condcount)), so instead find where
+	     the VCTP insn is DEF'd inside the loop.  */
+	  rtx_insn *vctp_reg_insn
+	    = DF_REF_INSN (df_bb_regno_only_def_find (loop->header,
+						      REGNO (vctp_reg)));
+	  rtx vctp_reg_set = single_set (vctp_reg_insn);
+	  /* This must just be a simple SET from the condcount.  */
+	  if (!vctp_reg_set
+	      || !REG_P (SET_DEST (vctp_reg_set))
+	      || !REG_P (SET_SRC (vctp_reg_set))
+	      || REGNO (SET_SRC (vctp_reg_set)) != REGNO (condcount))
+	    return NULL;
+	}
+    }
+  else
+    return NULL;
+
+  /* We now only need to find out that the loop terminates with a LE
+     zero condition.  If condconst is a const_int, then this is easy.
+     If it's a REG, look at the last condition+jump in a bb before
+     the loop, because that usually will have a branch jumping over
+     the loop header.  */
+  rtx_insn *jump_insn = BB_END (loop->header);
+  if (CONST_INT_P (condconst)
+      && !(INTVAL (condconst) == 0 && JUMP_P (jump_insn)
+	   && GET_CODE (XEXP (PATTERN (jump_insn), 1)) == IF_THEN_ELSE
+	   && (GET_CODE (XEXP (XEXP (PATTERN (jump_insn), 1), 0)) == NE
+	       || GET_CODE (XEXP (XEXP (PATTERN (jump_insn), 1), 0)) == GT)))
+    return NULL;
+  else if (REG_P (condconst))
+    {
+      basic_block pre_loop_bb = single_pred (loop_preheader_edge (loop)->src);
+      if (!pre_loop_bb)
+	return NULL;
+
+      rtx initial_compare = NULL_RTX;
+      if (!(prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb))
+	    && INSN_P (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb)))))
+	return NULL;
+      else
+	initial_compare
+	  = single_set (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb)));
+      if (!(initial_compare
+	    && cc_register (SET_DEST (initial_compare), VOIDmode)
+	    && GET_CODE (SET_SRC (initial_compare)) == COMPARE
+	    && CONST_INT_P (XEXP (SET_SRC (initial_compare), 1))
+	    && INTVAL (XEXP (SET_SRC (initial_compare), 1)) == 0))
+	return NULL;
+
+      /* Usually this is a LE condition, but it can also just be a GT or an
+	 EQ condition (if the value is unsigned or the compiler knows it's
+	 not negative).  */
+      rtx_insn *loop_jumpover = BB_END (pre_loop_bb);
+      if (!(JUMP_P (loop_jumpover)
+	    && GET_CODE (XEXP (PATTERN (loop_jumpover), 1)) == IF_THEN_ELSE
+	    && (GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == LE
+		|| GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == GT
+		|| GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == EQ)))
+	return NULL;
+    }
+
+  /* Everything looks valid.  */
+  return vctp_insn;
+}
+
+/* Function to check a loop's structure to see if it is a valid candidate
+   for an MVE Tail-Predicated Low-Overhead Loop.  Return the loop's VCTP_INSN
+   if it is valid, or NULL if it isn't.  */
+
+static rtx_insn*
+arm_mve_loop_valid_for_dlstp (loop *loop)
+{
+  /* Doloop can only be done "elementwise" with predicated dlstp/letp if it
+     contains a VCTP on the number of elements processed by the loop.
+     Find the VCTP predicate generation inside the loop body BB.  */
+  rtx_insn *vctp_insn = arm_mve_get_loop_vctp (loop->header);
+  if (!vctp_insn)
+    return NULL;
+
+  /* We only support two loop forms for tail predication:
+     DLSTP_TYPE_A) Loops of the form:
+	  int num_of_lanes = 128 / elem_size;
+	  while (num_of_elem > 0)
+	    {
+	      p = vctp<size> (num_of_elem);
+	      num_of_elem -= num_of_lanes;
+	    }
+     DLSTP_TYPE_B) Loops of the form:
+	  int num_of_lanes = 128 / elem_size;
+	  int num_of_iters = (num_of_elem + num_of_lanes - 1) / num_of_lanes;
+	  for (i = 0; i < num_of_iters; i++)
+	    {
+	      p = vctp<size> (num_of_elem);
+	      num_of_elem -= num_of_lanes;
+	    }
+
+     Then, depending on the type of loop above, we will need to do different
+     sets of checks.  */
+  iv_analysis_loop_init (loop);
+
+  /* In order to find out if the loop is of DLSTP_TYPE_A or DLSTP_TYPE_B
+     above, look for the loop counter: it will either be incrementing by one
+     per iteration or it will be decrementing by num_of_lanes.  We can find
+     the loop counter in the condition at the end of the loop.  */
+  rtx_insn *loop_cond = prev_nonnote_nondebug_insn_bb (BB_END (loop->header));
+  if (!(cc_register (XEXP (PATTERN (loop_cond), 0), VOIDmode)
+	&& GET_CODE (XEXP (PATTERN (loop_cond), 1)) == COMPARE))
+    return NULL;
+
+  /* The operands in the condition: try to identify which one is the
+     constant and which is the counter, and run IV analysis on the
+     latter.  */
+  rtx cond_arg_1 = XEXP (XEXP (PATTERN (loop_cond), 1), 0);
+  rtx cond_arg_2 = XEXP (XEXP (PATTERN (loop_cond), 1), 1);
+
+  rtx loop_cond_constant;
+  rtx loop_counter;
+  class rtx_iv cond_counter_iv, cond_temp_iv;
+
+  if (CONST_INT_P (cond_arg_1))
+    {
+      /* cond_arg_1 is the constant and cond_arg_2 is the counter.  */
+      loop_cond_constant = cond_arg_1;
+      loop_counter = cond_arg_2;
+      iv_analyze (loop_cond, as_a<scalar_int_mode> (GET_MODE (cond_arg_2)),
+		  cond_arg_2, &cond_counter_iv);
+    }
+  else if (CONST_INT_P (cond_arg_2))
+    {
+      /* cond_arg_2 is the constant and cond_arg_1 is the counter.  */
+      loop_cond_constant = cond_arg_2;
+      loop_counter = cond_arg_1;
+      iv_analyze (loop_cond, as_a<scalar_int_mode> (GET_MODE (cond_arg_1)),
+		  cond_arg_1, &cond_counter_iv);
+    }
+  else if (REG_P (cond_arg_1) && REG_P (cond_arg_2))
+    {
+      /* If both operands to the compare are REGs, we can safely
+	 run IV analysis on both and then determine which is the
+	 constant by looking at the step.
+	 First assume cond_arg_1 is the counter.  */
+      loop_counter = cond_arg_1;
+      loop_cond_constant = cond_arg_2;
+      iv_analyze (loop_cond, as_a<scalar_int_mode> (GET_MODE (cond_arg_1)),
+		  cond_arg_1, &cond_counter_iv);
+      iv_analyze (loop_cond, as_a<scalar_int_mode> (GET_MODE (cond_arg_2)),
+		  cond_arg_2, &cond_temp_iv);
+
+      /* Look at the steps and swap around the rtx's if needed.  Error out
+	 if one of them cannot be identified as constant.  */
+      if (!CONST_INT_P (cond_counter_iv.step)
+	  || !CONST_INT_P (cond_temp_iv.step))
+	return NULL;
+      if (INTVAL (cond_counter_iv.step) != 0
+	  && INTVAL (cond_temp_iv.step) != 0)
+	return NULL;
+      if (INTVAL (cond_counter_iv.step) == 0
+	  && INTVAL (cond_temp_iv.step) != 0)
+	{
+	  loop_counter = cond_arg_2;
+	  loop_cond_constant = cond_arg_1;
+	  cond_counter_iv = cond_temp_iv;
+	}
+    }
+  else
+    return NULL;
+
+  if (!REG_P (loop_counter))
+    return NULL;
+  if (!(REG_P (loop_cond_constant) || CONST_INT_P (loop_cond_constant)))
+    return NULL;
+
+  /* Now we have extracted the IV step of the loop counter, call the
+     appropriate checking function.  */
+  if (INTVAL (cond_counter_iv.step) > 0)
+    return arm_mve_dlstp_check_inc_counter (loop, vctp_insn,
+					    loop_cond_constant, loop_counter);
+  else if (INTVAL (cond_counter_iv.step) < 0)
+    return arm_mve_dlstp_check_dec_counter (loop, vctp_insn,
+					    loop_cond_constant, loop_counter);
+  else
+    return NULL;
+}
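A source-level counterpart of DLSTP_TYPE_B, where the compare is against a precomputed iteration count and the vctp register decrements independently (illustrative, ours; same assumptions as the earlier sketch):

    #include <arm_mve.h>

    void
    vadd_loop_b (const int32_t *a, const int32_t *b, int32_t *c, int n)
    {
      int iters = (n + 3) / 4;            /* checked by the inc-counter path */
      for (int i = 0; i < iters; i++)     /* counter increments by 1 */
        {
          mve_pred16_t p = vctp32q (n);   /* n still decrements by 4 */
          vstrwq_p_s32 (c, vaddq_x_s32 (vldrwq_z_s32 (a, p),
                                        vldrwq_z_s32 (b, p), p), p);
          a += 4; b += 4; c += 4;
          n -= 4;
        }
    }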
+
+/* Predict whether the given loop in gimple will be transformed in the RTL
+   doloop_optimize pass.  It could be argued that turning large enough loops
+   into low-overhead loops would not show a significant performance boost.
+   However, in the case of tail predication we would still avoid using
+   VPT/VPST instructions inside the loop, and in either case using
+   low-overhead loops would not be detrimental, so we decided not to consider
+   size, avoiding the need of a heuristic to determine what an appropriate
+   size boundary is.  */
+
+static bool
+arm_predict_doloop_p (struct loop *loop)
+{
+  gcc_assert (loop);
+  /* On arm, targetm.can_use_doloop_p is actually
+     can_use_doloop_if_innermost.  Ensure the loop is innermost, that it is
+     valid as per arm_target_bb_ok_for_lob, and that the correct architecture
+     flags are enabled.  */
+  if (!(TARGET_HAVE_LOB && optimize > 0))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Predict doloop failure due to"
+			    " target architecture or optimisation flags.\n");
+      return false;
+    }
+  else if (loop->inner != NULL)
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Predict doloop failure due to"
+			    " loop nesting.\n");
+      return false;
+    }
+  else if (!arm_target_bb_ok_for_lob (loop->header->next_bb))
+    {
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	fprintf (dump_file, "Predict doloop failure due to"
+			    " loop bb complexity.\n");
+      return false;
+    }
+
+  return true;
+}
+
+/* Implement targetm.loop_unroll_adjust.  Use this to block unrolling of
+   loops that may later be turned into MVE Tail-Predicated Low-Overhead
+   Loops.  The performance benefit of an MVE LoL is likely to be much higher
+   than that of the unrolling.  */
+
+unsigned
+arm_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+  if (TARGET_HAVE_MVE
+      && arm_target_bb_ok_for_lob (loop->latch)
+      && arm_mve_loop_valid_for_dlstp (loop))
+    return 0;
+  else
+    return nunroll;
+}
+
+/* Function to handle emitting a VPT-unpredicated version of a VPT-predicated
+   insn to a sequence.  */
+
+static bool
+arm_emit_mve_unpredicated_insn_to_seq (rtx_insn* insn)
+{
+  rtx insn_vpr_reg_operand = arm_get_required_vpr_reg_param (insn);
+  int new_icode = get_attr_mve_unpredicated_insn (insn);
+  if (!in_sequence_p ()
+      || !MVE_VPT_PREDICATED_INSN_P (insn)
+      || (!insn_vpr_reg_operand)
+      || (!new_icode))
+    return false;
+
+  extract_insn (insn);
+  rtx arr[8];
+  int j = 0;
+
+  /* When transforming a VPT-predicated instruction into its unpredicated
+     equivalent we need to drop the VPR operand and we may need to also drop
+     a merge "vuninit" input operand, depending on the instruction pattern.
+     Here ensure that we have at most a two-operand difference between the
+     two instructions.  */
+  int n_operands_diff
+    = recog_data.n_operands - insn_data[new_icode].n_operands;
+  if (!(n_operands_diff > 0 && n_operands_diff <= 2))
+    return false;
+
+  rtx move = NULL_RTX;
+  /* Then, loop through the operands of the predicated
+     instruction, and retain the ones that map to the
+     unpredicated instruction.  */
+  for (int i = 0; i < recog_data.n_operands; i++)
+    {
+      /* Ignore the VPR and, if needed, the vuninit
+	 operand.  */
+      if (insn_vpr_reg_operand == recog_data.operand[i])
+	continue;
+      if (n_operands_diff == 2
+	  && !strcmp (recog_data.constraints[i], "0"))
+	{
+	  move = gen_rtx_SET (arr[0], recog_data.operand[i]);
+	  arr[0] = recog_data.operand[i];
+	}
+      else
+	arr[j++] = recog_data.operand[i];
+    }
+
+  /* Finally, emit the unpredicated instruction.  */
+  rtx_insn *new_insn;
+  switch (j)
+    {
+    case 1:
+      new_insn = emit_insn (GEN_FCN (new_icode) (arr[0]));
+      break;
+    case 2:
+      new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1]));
+      break;
+    case 3:
+      new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2]));
+      break;
+    case 4:
+      new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+						 arr[3]));
+      break;
+    case 5:
+      new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+						 arr[3], arr[4]));
+      break;
+    case 6:
+      new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+						 arr[3], arr[4], arr[5]));
+      break;
+    case 7:
+      new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+						 arr[3], arr[4], arr[5],
+						 arr[6]));
+      break;
+    default:
+      gcc_unreachable ();
+    }
+  INSN_LOCATION (new_insn) = INSN_LOCATION (insn);
+  if (move)
+    {
+      new_insn = emit_insn (move);
+      INSN_LOCATION (new_insn) = INSN_LOCATION (insn);
+    }
+  return true;
+}
+
+/* Return TRUE if INSN defines an MVE vector operand that has zeroed
+   tail-predicated lanes.  This is true if either:
+   * INSN is predicated by VCTP_VPR_GENERATED and the 'invalid lanes' operand
+     is in the PROPS_ZERO_SET;
+   * all MVE vector operands are in the PROPS_ZERO_SET.  */
+
+static bool
+arm_mve_propagate_zero_pred_p (vec <rtx_insn *> *props_zero_set,
+			       rtx_insn *insn, rtx vctp_vpr_generated)
+{
+  if (arm_mve_load_store_insn_p (insn, DL_USAGE_READ))
+    return true;
+  if (arm_mve_load_store_insn_p (insn, DL_USAGE_WRITE))
+    return false;
+
+  int inactive_idx = -1;
+
+  extract_insn (insn);
+  /* If INSN is predicated by VCTP_VPR_GENERATED, then all tail-predicated
+     lanes will keep the value that is in the 'invalid lanes' register, which
+     we identify by the "0" constraint, to ensure it is the same as the
+     'result' register of this instruction.  */
+  if (arm_mve_insn_predicated_by (insn, vctp_vpr_generated))
+    {
+      for (int i = 0; i < recog_data.n_operands; i++)
+	{
+	  if (strcmp (recog_data.constraints[i], "0") == 0
+	      && VALID_MVE_MODE (GET_MODE (recog_data.operand[i])))
+	    {
+	      inactive_idx = i;
+	      break;
+	    }
+	}
+    }
+
+  if (inactive_idx > 0)
+    {
+      rtx op = recog_data.operand[inactive_idx];
+      rtx_insn *def_insn = arm_last_vect_def_insn (op, insn);
+      return def_insn != NULL_RTX && props_zero_set->contains (def_insn);
+    }
+
+  /* If this instruction is not predicated by VCTP_VPR_GENERATED, then we
+     must check that all vector operands have zeroed tail-predicated lanes,
+     and that it has at least one vector operand.  */
+  bool at_least_one_vector = false;
+  df_ref insn_uses;
+  FOR_EACH_INSN_USE (insn_uses, insn)
+    {
+      rtx reg = DF_REF_REG (insn_uses);
+      if (!VALID_MVE_MODE (GET_MODE (reg)))
+	continue;
+
+      rtx_insn *def_insn = arm_last_vect_def_insn (reg, insn);
+      if (def_insn && props_zero_set->contains (def_insn))
+	at_least_one_vector |= true;
+      else
+	return false;
+    }
+  return at_least_one_vector;
+}
+
+
+/* Attempt to transform the loop contents of the loop basic block from
+   VPT-predicated insns into unpredicated insns for a dlstp/letp loop.
+   Return the number to decrement from the total number of elements each
+   iteration.  Return 1 if tail predication cannot be performed, in which
+   case we fall back to scalar low-overhead loops.  */
+
+int
+arm_attempt_dlstp_transform (rtx label)
+{
+  if (!dlstp_enabled)
+    return 1;
+
+  basic_block body = single_succ (BLOCK_FOR_INSN (label));
+
+  /* Ensure that the bb is within a loop that has all required metadata.  */
+  if (!body->loop_father || !body->loop_father->header
+      || !body->loop_father->simple_loop_desc)
+    return 1;
+
+  loop *loop = body->loop_father;
+  /* Instruction that sets the predicate mask depending on how many elements
+     are left to process.  */
+  rtx_insn *vctp_insn = arm_mve_loop_valid_for_dlstp (loop);
+  if (!vctp_insn)
+    return 1;
+
+  gcc_assert (single_set (vctp_insn));
+
+  rtx vctp_vpr_generated = single_set (vctp_insn);
+  if (!vctp_vpr_generated)
+    return 1;
+
+  vctp_vpr_generated = SET_DEST (vctp_vpr_generated);
+
+  if (!vctp_vpr_generated || !REG_P (vctp_vpr_generated)
+      || !VALID_MVE_PRED_MODE (GET_MODE (vctp_vpr_generated)))
+    return 1;
+
+  /* decrementnum is already known to be valid at this point.  */
+  int decrementnum = arm_mve_get_vctp_lanes (vctp_insn);
+
+  rtx_insn *insn = 0;
+  rtx_insn *cur_insn = 0;
+  rtx_insn *seq;
+  auto_vec <rtx_insn *> props_zero_set;
+
+  /* Scan through the insns in the loop bb and emit the transformed bb
+     insns to a sequence.  */
+  start_sequence ();
+  FOR_BB_INSNS (body, insn)
+    {
+      if (GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))
+	continue;
+      else if (NOTE_P (insn))
+	emit_note ((enum insn_note) NOTE_KIND (insn));
+      else if (DEBUG_INSN_P (insn))
+	emit_debug_insn (PATTERN (insn));
+      else if (!INSN_P (insn))
+	{
+	  end_sequence ();
+	  return 1;
+	}
+      /* If the transformation is successful we no longer need the vctp
+	 instruction.  */
+      else if (insn == vctp_insn)
+	continue;
+      /* If the insn pattern requires the use of the VPR value from the
+	 vctp as an input parameter for predication.  */
+      else if (arm_mve_insn_predicated_by (insn, vctp_vpr_generated))
+	{
+	  /* Check whether this INSN propagates the zeroed tail-predication
+	     lanes.  */
+	  if (arm_mve_propagate_zero_pred_p (&props_zero_set, insn,
+					     vctp_vpr_generated))
+	    props_zero_set.safe_push (insn);
+	  bool success = arm_emit_mve_unpredicated_insn_to_seq (insn);
+	  if (!success)
+	    {
+	      end_sequence ();
+	      return 1;
+	    }
+	}
+      /* If the insn isn't VPT predicated on vctp_vpr_generated, we need to
+	 make sure that it is still valid within the dlstp/letp loop.  */
+      else
+	{
+	  /* If this instruction USE-s the vctp_vpr_generated other than for
+	     predication, this blocks the transformation as we are not
+	     allowed to optimise the VPR value away.  */
+	  df_ref insn_uses = NULL;
+	  FOR_EACH_INSN_USE (insn_uses, insn)
+	    {
+	      if (rtx_equal_p (vctp_vpr_generated, DF_REF_REG (insn_uses)))
+		{
+		  end_sequence ();
+		  return 1;
+		}
+	    }
+	  /* If within the loop we have an MVE vector instruction that is
+	     unpredicated, the dlstp/letp looping will add implicit
+	     predication to it.  This will result in a change in behaviour
+	     of the instruction, so we need to find out if any instructions
+	     that feed into the current instruction were implicitly
+	     predicated.  */
+	  if (MVE_VPT_PREDICABLE_INSN_P (insn)
+	      && !arm_mve_impl_predicated_p (&props_zero_set, insn,
+					     vctp_vpr_generated))
+	    {
+	      end_sequence ();
+	      return 1;
+	    }
+	  emit_insn (PATTERN (insn));
+	}
+    }
+  seq = get_insns ();
+  end_sequence ();
+
+  /* Re-write the entire BB contents with the transformed
+     sequence.  */
+  FOR_BB_INSNS_SAFE (body, insn, cur_insn)
+    if (!(GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn)))
+      delete_insn (insn);
+
+  emit_insn_after (seq, BB_END (body));
+
+  /* The transformation has succeeded, so now modify the "count"
+     (a.k.a. niter_expr) for the middle-end.  Also set noloop_assumptions
+     to NULL to stop the middle-end from making assumptions about the
+     number of iterations.  */
+  simple_loop_desc (body->loop_father)->niter_expr
+    = XVECEXP (SET_SRC (PATTERN (vctp_insn)), 0, 0);
+  simple_loop_desc (body->loop_father)->noloop_assumptions = NULL_RTX;
+  return decrementnum;
 }
 
 #if CHECKING_P
diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt
index 0cd3fc2..d88c7a5 100644
--- a/gcc/config/arm/arm.opt
+++ b/gcc/config/arm/arm.opt
@@ -363,5 +363,8 @@ Target Joined RejectNegative String Var(arm_stack_protector_guard_offset_str)
 Use an immediate to offset from the TLS register.  This option is for use
 with fstack-protector-guard=tls and not for use in user-land code.
 
+mdlstp
+Target Var(dlstp_enabled) Init(1) Undocumented
+
 TargetVariable
 long arm_stack_protector_guard_offset = 0
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 8d066fc..987602d 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -2686,6 +2686,17 @@
 (define_int_attr mrrc [(VUNSPEC_MRRC "mrrc") (VUNSPEC_MRRC2 "mrrc2")])
 (define_int_attr MRRC [(VUNSPEC_MRRC "MRRC") (VUNSPEC_MRRC2 "MRRC2")])
 
+(define_int_attr dlstp_elemsize [(DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32")
+				 (DLSTP64 "64")])
+
+(define_int_attr letp_num_lanes [(LETP8 "16") (LETP16 "8") (LETP32 "4")
+				 (LETP64 "2")])
+(define_int_attr letp_num_lanes_neg [(LETP8 "-16") (LETP16 "-8") (LETP32 "-4")
+				     (LETP64 "-2")])
+
+(define_int_attr letp_num_lanes_minus_1 [(LETP8 "15") (LETP16 "7") (LETP32 "3")
+					 (LETP64 "1")])
+
 (define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
			   (UNSPEC_DOT_U "u8")
			   (UNSPEC_DOT_US "s8")
@@ -2926,6 +2937,10 @@
 (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
 (define_int_iterator VQSHLUQ_M_N [VQSHLUQ_M_N_S])
 (define_int_iterator VQSHLUQ_N [VQSHLUQ_N_S])
+(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32
+			    DLSTP64])
+(define_int_iterator LETP [LETP8 LETP16 LETP32
+			   LETP64])
 
 ;; Define iterators for VCMLA operations
 (define_int_iterator VCMLA_OP [UNSPEC_VCMLA
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 9fe5129..4b4d629 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -6930,3 +6930,53 @@
 }
 }
 )
+
+;; Originally expanded by 'predicated_doloop_end'.
+;; In the rare situation where the branch is too far, we do also need to
+;; revert FPSCR.LTPSIZE back to 0x100 after the last iteration.
+(define_insn "predicated_doloop_end_internal<letp_num_lanes>"
+  [(set (pc)
+	(if_then_else
+	   (gtu (plus:SI (reg:SI LR_REGNUM)
+			 (const_int <letp_num_lanes_neg>))
+		(const_int <letp_num_lanes_minus_1>))
+	   (match_operand 0 "" "")
+	   (pc)))
+   (set (reg:SI LR_REGNUM)
+	(plus:SI (reg:SI LR_REGNUM) (const_int <letp_num_lanes_neg>)))
+   ;; We use UNSPEC here to guarantee this pattern cannot be
+   ;; generated by an RTL optimization and be matched by other
+   ;; patterns, since this pattern is also responsible for turning off
+   ;; the tail predication machinery if we were to exit the loop.
+   ;; This is done by either the LETP or the LCTP instructions that
+   ;; this pattern generates.
+   (use (unspec:SI [(const_int 0)] LETP))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_HAVE_MVE"
+  {
+    if (get_attr_length (insn) == 4)
+      return "letp\t%|lr, %l0";
+    else
+      return "subs\t%|lr, #<letp_num_lanes>\n\tbhi\t%l0\n\tlctp";
+  }
+  [(set (attr "length")
+	(if_then_else
+	   (ltu (minus (pc) (match_dup 0)) (const_int 1024))
+	   (const_int 4)
+	   (const_int 12)))
+   (set_attr "type" "branch")
+   (set_attr "conds" "unconditional")])
+
+(define_insn "dlstp<dlstp_elemsize>_insn"
+  [(set (reg:SI LR_REGNUM)
+	;; Similar to the previous pattern, we use UNSPEC here to make sure
+	;; this rtx construct is not matched by other patterns, as this
+	;; pattern is also responsible for setting the element size of the
+	;; tail predication machinery using the dlstp.<size> instruction.
+	(unspec_volatile:SI [(match_operand:SI 0 "s_register_operand" "r")]
+			    DLSTP))]
+  "TARGET_HAVE_MVE"
+  "dlstp.<dlstp_elemsize>\t%|lr, %0"
+  [(set_attr "type" "mve_misc")])
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index 84c9c3d..66b3ae6 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1613,7 +1613,7 @@
    (use (match_operand 1 "" ""))]	; label
   "TARGET_32BIT"
   "
- {
+{
   /* Currently SMS relies on the do-loop pattern to recognize loops
     where (1) the control part consists of all insns defining and/or
    using a certain 'count' register and (2) the loop count can be
@@ -1623,41 +1623,75 @@
     Also used to implement the low over head loops feature, which is part of
     the Armv8.1-M Mainline Low Overhead Branch (LOB) extension.  */
-  if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
-    {
-      rtx s0;
-      rtx bcomp;
-      rtx loc_ref;
-      rtx cc_reg;
-      rtx insn;
-      rtx cmp;
-
-      if (GET_MODE (operands[0]) != SImode)
-	FAIL;
-
-      s0 = operands [0];
-
-      /* Low over head loop instructions require the first operand to be LR.  */
-      if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1]))
-	s0 = gen_rtx_REG (SImode, LR_REGNUM);
-
-      if (TARGET_THUMB2)
-	insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
-      else
-	insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
-
-      cmp = XVECEXP (PATTERN (insn), 0, 0);
-      cc_reg = SET_DEST (cmp);
-      bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
-      loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [1]);
-      emit_jump_insn (gen_rtx_SET (pc_rtx,
-				   gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
-							 loc_ref, pc_rtx)));
-      DONE;
-    }
-  else
-    FAIL;
-  }")
+  if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
+    {
+      rtx s0;
+      rtx bcomp;
+      rtx loc_ref;
+      rtx cc_reg;
+      rtx insn;
+      rtx cmp;
+      int decrement_num;
+
+      if (GET_MODE (operands[0]) != SImode)
+	FAIL;
+
+      s0 = operands[0];
+
+      if (TARGET_HAVE_LOB
+	  && arm_target_bb_ok_for_lob (BLOCK_FOR_INSN (operands[1])))
+	{
+	  /* If we have a compatible MVE target, try and analyse the loop
+	     contents to determine if we can use predicated dlstp/letp
+	     looping.  These patterns implicitly use LR as the loop
+	     counter.  */
+	  if (TARGET_HAVE_MVE
+	      && ((decrement_num = arm_attempt_dlstp_transform (operands[1]))
+		  != 1))
+	    {
+	      loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+	      switch (decrement_num)
+		{
+		case 2:
+		  insn = gen_predicated_doloop_end_internal2 (loc_ref);
+		  break;
+		case 4:
+		  insn = gen_predicated_doloop_end_internal4 (loc_ref);
+		  break;
+		case 8:
+		  insn = gen_predicated_doloop_end_internal8 (loc_ref);
+		  break;
+		case 16:
+		  insn = gen_predicated_doloop_end_internal16 (loc_ref);
+		  break;
+		default:
+		  gcc_unreachable ();
+		}
+	      emit_jump_insn (insn);
+	      DONE;
+	    }
+	  /* Remaining LOB cases need to explicitly use LR.  */
+	  s0 = gen_rtx_REG (SImode, LR_REGNUM);
+	}
+
+      /* Otherwise, try standard decrement-by-one dls/le looping.  */
+      if (TARGET_THUMB2)
+	insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0,
+						      GEN_INT (-1)));
+      else
+	insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
+
+      cmp = XVECEXP (PATTERN (insn), 0, 0);
+      cc_reg = SET_DEST (cmp);
+      bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
+      loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+      emit_jump_insn (gen_rtx_SET (pc_rtx,
+				   gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+							 loc_ref, pc_rtx)));
+      DONE;
+    }
+  else
+    FAIL;
+}")
 
 (define_insn "*clear_apsr"
   [(unspec_volatile:SI [(const_int 0)] VUNSPEC_CLRM_APSR)
@@ -1755,7 +1789,37 @@
 {
   if (REGNO (operands[0]) == LR_REGNUM)
     {
-      emit_insn (gen_dls_insn (operands[0]));
+      /* Pick out the number by which we are decrementing the loop counter
+	 in every iteration.  If it's > 1, then use dlstp.  */
+      int const_int_dec_num
+	= abs (INTVAL (XEXP (XEXP (XVECEXP (PATTERN (operands[1]), 0, 1),
+				   1),
+			     1)));
+      switch (const_int_dec_num)
+	{
+	case 16:
+	  emit_insn (gen_dlstp8_insn (operands[0]));
+	  break;
+
+	case 8:
+	  emit_insn (gen_dlstp16_insn (operands[0]));
+	  break;
+
+	case 4:
+	  emit_insn (gen_dlstp32_insn (operands[0]));
+	  break;
+
+	case 2:
+	  emit_insn (gen_dlstp64_insn (operands[0]));
+	  break;
+
+	case 1:
+	  emit_insn (gen_dls_insn (operands[0]));
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
       DONE;
     }
  else
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index e2b70da..9527bdb 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -574,6 +574,7 @@
 ; mve_move
 ; mve_store
 ; mve_load
+; mve_misc
 
 (define_attr "type"
  "adc_imm,\
@@ -1126,7 +1127,8 @@
   ls64,\
   mve_move,\
   mve_store,\
-  mve_load"
+  mve_load,\
+  mve_misc"
 (cond [(eq_attr "autodetect_type" "alu_shift_lsr_op2,alu_shift_asr_op2")
	  (const_string "alu_shift_imm_other")
	(eq_attr "autodetect_type" "alu_shift_lsl_op2")
@@ -1292,7 +1294,7 @@
 ;; No otherwise.
 (define_attr "is_mve_type" "yes,no"
  (if_then_else (eq_attr "type"
-		 "mve_move, mve_load, mve_store, mrs")
+		 "mve_move, mve_load, mve_store, mrs, mve_misc")
	(const_string "yes")
	(const_string "no")))
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 46ac8b3..f5f4d15 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -591,6 +591,10 @@
   VADDLVQ_U
   VCTP
   VCTP_M
+  LETP8
+  LETP16
+  LETP32
+  LETP64
   VPNOT
   VCREATEQ_F
   VCVTQ_N_TO_F_S
@@ -1259,6 +1263,14 @@
   UQRSHLL_48
   SQRSHRL_64
   SQRSHRL_48
-  VSHLCQ_M_
   REINTERPRET
 ])
+
+; DLSTP unspecs must be volatile to guarantee the scheduler does not
+; reschedule these instructions within the loop preheader.
+(define_c_enum "unspecv" [
+  DLSTP8
+  DLSTP16
+  DLSTP32
+  DLSTP64
+])
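To close, a schematic of the end-to-end effect on the DLSTP_TYPE_A loop sketched at the top of this page. The assembly is approximate and ours, not taken from the patch or its tests; exact registers, addressing and scheduling will vary:

    @ Before: scalar low-overhead loop with explicit VPT predication.
            dls     lr, lr            @ lr = (n + 3) / 4 iterations
    .L3:    vctp.32 r3                @ predicate from remaining elements
            vpstttt                   @ predicate the next four insns
            vldrwt.u32 q0, [r0], #16
            vldrwt.u32 q1, [r1], #16
            vaddt.i32  q0, q0, q1
            vstrwt.32  q0, [r2], #16
            subs    r3, #4
            le      lr, .L3

    @ After: tail-predicated low-overhead loop; predication is implicit,
    @ so the vctp, vpst and per-insn T-suffixes all disappear.
            dlstp.32 lr, r3           @ lr = number of elements
    .L3:    vldrw.u32 q0, [r0], #16
            vldrw.u32 q1, [r1], #16
            vadd.i32  q0, q0, q1
            vstrw.32  q0, [r2], #16
            letp    lr, .L3

This also shows why the niter_expr rewrite above is needed: after the transform, LR counts elements rather than iterations, and the letp pattern consumes them a vector's worth at a time.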