Diffstat (limited to 'gcc/config/aarch64/aarch64.cc')
-rw-r--r-- | gcc/config/aarch64/aarch64.cc | 149
1 file changed, 126 insertions, 23 deletions
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4d9d83d..d30c9c7 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -430,6 +430,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 #include "tuning_models/neoversev2.h"
 #include "tuning_models/neoversev3.h"
 #include "tuning_models/neoversev3ae.h"
+#include "tuning_models/olympus.h"
 #include "tuning_models/a64fx.h"
 #include "tuning_models/fujitsu_monaka.h"
 
@@ -3932,18 +3933,53 @@ aarch64_sve_fp_pred (machine_mode data_mode, rtx *strictness)
   return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode));
 }
 
+/* PRED is a predicate that governs an operation on DATA_MODE.  If DATA_MODE
+   is a partial vector mode, and if exceptions must be suppressed for its
+   undefined elements, convert PRED from a container-level predicate to
+   an element-level predicate and ensure that the undefined elements
+   are inactive.  Make no changes otherwise.
+
+   Return the resultant predicate.  */
+rtx
+aarch64_sve_emit_masked_fp_pred (machine_mode data_mode, rtx pred)
+{
+  unsigned int vec_flags = aarch64_classify_vector_mode (data_mode);
+  if (flag_trapping_math && (vec_flags & VEC_PARTIAL))
+    {
+      /* Generate an element-level mask.  */
+      rtx mask = aarch64_sve_packed_pred (data_mode);
+      machine_mode pmode = GET_MODE (mask);
+
+      /* Apply the existing predicate.  */
+      rtx dst = gen_reg_rtx (pmode);
+      emit_insn (gen_and3 (pmode, dst, mask,
+                           gen_lowpart (pmode, pred)));
+      return dst;
+    }
+
+  return pred;
+}
+
 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
    DATA_MODE, and return the result in a predicate of mode PRED_MODE.
-   Use TARGET as the target register if nonnull and convenient.  */
+   Use TARGET as the target register if nonnull and convenient.
+
+   PRED_MODE can be either VNx16BI or the natural predicate mode for
+   DATA_MODE.  */
 
 static rtx
 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
                           machine_mode data_mode, rtx op1, rtx op2)
 {
-  insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
+  auto src_pred_mode = aarch64_sve_pred_mode (data_mode);
+  insn_code icode;
+  if (known_eq (GET_MODE_NUNITS (pred_mode), GET_MODE_NUNITS (data_mode)))
+    icode = code_for_aarch64_pred_cmp (cmp, data_mode);
+  else
+    icode = code_for_aarch64_pred_cmp_acle (cmp, data_mode);
   expand_operand ops[5];
   create_output_operand (&ops[0], target, pred_mode);
-  create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
+  create_input_operand (&ops[1], CONSTM1_RTX (src_pred_mode), src_pred_mode);
   create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
   create_input_operand (&ops[3], op1, data_mode);
   create_input_operand (&ops[4], op2, data_mode);
@@ -3951,15 +3987,14 @@ aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
   return ops[0].value;
 }
 
-/* Use a comparison to convert integer vector SRC into MODE, which is
-   the corresponding SVE predicate mode.  Use TARGET for the result
-   if it's nonnull and convenient.  */
+/* Use a comparison to convert integer vector SRC into VNx16BI.
+   Use TARGET for the result if it's nonnull and convenient.  */
 
 rtx
-aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
+aarch64_convert_sve_data_to_pred (rtx target, rtx src)
 {
   machine_mode src_mode = GET_MODE (src);
-  return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
+  return aarch64_sve_emit_int_cmp (target, VNx16BImode, NE, src_mode,
                                    src, CONST0_RTX (src_mode));
 }
 
@@ -6041,9 +6076,9 @@ aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
                                  unsigned int vl)
 {
   rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
-  target = aarch64_target_reg (target, mode);
-  emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
-                        target, const0_rtx, limit));
+  target = aarch64_target_reg (target, VNx16BImode);
+  emit_insn (gen_aarch64_sve_while_acle (UNSPEC_WHILELO, DImode, mode,
+                                         target, const0_rtx, limit));
   return target;
 }
 
@@ -6189,8 +6224,7 @@ aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
      operands but permutes them as though they had mode MODE.  */
   machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
   target = aarch64_target_reg (target, GET_MODE (a));
-  rtx type_reg = CONST0_RTX (mode);
-  emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
+  emit_insn (gen_aarch64_sve_acle (UNSPEC_TRN1, mode, target, a, b));
   return target;
 }
 
@@ -6272,8 +6306,7 @@ aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
   for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
     int_builder.quick_push (INTVAL (builder.elt (i))
                             ? constm1_rtx : const0_rtx);
-  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
-                                           int_builder.build ());
+  return aarch64_convert_sve_data_to_pred (target, int_builder.build ());
 }
 
 /* Set DEST to immediate IMM.  */
@@ -6725,6 +6758,27 @@ aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
                        dest, ptrue, src));
 }
 
+/* Set predicate register DEST such that every element has the scalar
+   boolean value in SRC, with any nonzero source counting as "true".
+   MODE is a MODE_VECTOR_BOOL that determines the element size;
+   DEST can have this mode or VNx16BImode.  In the latter case,
+   the upper bits of each element are defined to be zero, as for
+   the .H, .S, and .D forms of PTRUE.  */
+
+void
+aarch64_emit_sve_pred_vec_duplicate (machine_mode mode, rtx dest, rtx src)
+{
+  rtx tmp = gen_reg_rtx (DImode);
+  emit_insn (gen_ashldi3 (tmp, gen_lowpart (DImode, src),
+                          gen_int_mode (63, DImode)));
+  if (GET_MODE (dest) == VNx16BImode)
+    emit_insn (gen_aarch64_sve_while_acle (UNSPEC_WHILELO, DImode, mode,
+                                           dest, const0_rtx, tmp));
+  else
+    emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
+                          dest, const0_rtx, tmp));
+}
+
 static bool
 aarch64_function_ok_for_sibcall (tree, tree exp)
 {
@@ -17165,8 +17219,8 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
       && STMT_VINFO_DATA_REF (stmt_info))
     {
       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
-      if (stmt_info
-          && vect_mem_access_type (stmt_info, node) == VMAT_LOAD_STORE_LANES)
+      if (node
+          && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES)
         return DR_GROUP_SIZE (stmt_info);
     }
   return 0;
@@ -17437,8 +17491,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
      for each element.  We therefore need to divide the full-instruction
      cost by the number of elements in the vector.  */
   if (kind == scalar_load
+      && node
       && sve_costs
-      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+      && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
     {
       unsigned int nunits = vect_nunits_for_cost (vectype);
       /* Test for VNx2 modes, which have 64-bit containers.  */
@@ -17450,8 +17505,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
   /* Detect cases in which a scalar_store is really storing one element
      in a scatter operation.  */
   if (kind == scalar_store
+      && node
      && sve_costs
-      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+      && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
     return sve_costs->scatter_store_elt_cost;
 
   /* Detect cases in which vec_to_scalar represents an in-loop reduction.  */
@@ -17707,7 +17763,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
   if (stmt_info
       && kind == vec_to_scalar
       && (m_vec_flags & VEC_ADVSIMD)
-      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+      && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
     {
       auto dr = STMT_VINFO_DATA_REF (stmt_info);
       tree dr_ref = DR_REF (dr);
@@ -17720,7 +17776,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
                 {
                   if (gimple_vuse (SSA_NAME_DEF_STMT (offset)))
                     {
-                      if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type)
+                      if (SLP_TREE_TYPE (node) == load_vec_info_type)
                         ops->loads += count - 1;
                       else
                         /* Stores want to count both the index to array and data to
@@ -17822,7 +17878,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
   if (stmt_info
       && sve_issue
      && (kind == scalar_load || kind == scalar_store)
-      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+      && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
     {
       unsigned int pairs = CEIL (count, 2);
       ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
@@ -17977,9 +18033,10 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 
       /* Check if we've seen an SVE gather/scatter operation and which size.  */
       if (kind == scalar_load
+          && node
          && vectype
          && aarch64_sve_mode_p (TYPE_MODE (vectype))
-         && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER)
+         && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER)
        {
          const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve;
          if (sve_costs)
@@ -20481,6 +20538,8 @@ aarch64_compare_version_priority (tree decl1, tree decl2)
     unsigned long _size; // Size of the struct, so it can grow.
     unsigned long _hwcap;
     unsigned long _hwcap2;
+    unsigned long _hwcap3;
+    unsigned long _hwcap4;
   }
 */
 
@@ -20497,14 +20556,24 @@ build_ifunc_arg_type ()
   tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
                             get_identifier ("_hwcap2"),
                             long_unsigned_type_node);
+  tree field4 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+                            get_identifier ("_hwcap3"),
+                            long_unsigned_type_node);
+  tree field5 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+                            get_identifier ("_hwcap4"),
+                            long_unsigned_type_node);
 
   DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
   DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
   DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;
+  DECL_FIELD_CONTEXT (field4) = ifunc_arg_type;
+  DECL_FIELD_CONTEXT (field5) = ifunc_arg_type;
 
   TYPE_FIELDS (ifunc_arg_type) = field1;
   DECL_CHAIN (field1) = field2;
   DECL_CHAIN (field2) = field3;
+  DECL_CHAIN (field3) = field4;
+  DECL_CHAIN (field4) = field5;
 
   layout_type (ifunc_arg_type);
 
@@ -31963,9 +32032,43 @@ aarch64_test_sysreg_encoding_clashes (void)
 static void
 aarch64_test_sve_folding ()
 {
+  aarch64_target_switcher switcher (AARCH64_FL_SVE);
+
   tree res = fold_unary (BIT_NOT_EXPR, ssizetype,
                          ssize_int (poly_int64 (1, 1)));
   ASSERT_TRUE (operand_equal_p (res, ssize_int (poly_int64 (-2, -1))));
+
+  auto build_v16bi = [](bool a, bool b)
+    {
+      rtx_vector_builder builder (VNx16BImode, 2, 1);
+      builder.quick_push (a ? const1_rtx : const0_rtx);
+      builder.quick_push (b ? const1_rtx : const0_rtx);
+      return builder.build ();
+    };
+  rtx v16bi_10 = build_v16bi (1, 0);
+  rtx v16bi_01 = build_v16bi (0, 1);
+
+  for (auto mode : { VNx8BImode, VNx4BImode, VNx2BImode })
+    {
+      rtx reg = gen_rtx_REG (mode, LAST_VIRTUAL_REGISTER + 1);
+      rtx subreg = lowpart_subreg (VNx16BImode, reg, mode);
+      rtx and1 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_10);
+      ASSERT_EQ (lowpart_subreg (mode, and1, VNx16BImode), reg);
+      rtx and0 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_01);
+      ASSERT_EQ (lowpart_subreg (mode, and0, VNx16BImode), CONST0_RTX (mode));
+
+      rtx ior1 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_10);
+      ASSERT_EQ (lowpart_subreg (mode, ior1, VNx16BImode), CONSTM1_RTX (mode));
+      rtx ior0 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_01);
+      ASSERT_EQ (lowpart_subreg (mode, ior0, VNx16BImode), reg);
+
+      rtx xor1 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_10);
+      ASSERT_RTX_EQ (lowpart_subreg (mode, xor1, VNx16BImode),
+                     lowpart_subreg (mode, gen_rtx_NOT (VNx16BImode, subreg),
+                                     VNx16BImode));
+      rtx xor0 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_01);
+      ASSERT_EQ (lowpart_subreg (mode, xor0, VNx16BImode), reg);
+    }
 }
 
 /* Run all target-specific selftests.  */