Diffstat (limited to 'gcc/config/aarch64/aarch64.cc')
-rw-r--r-- | gcc/config/aarch64/aarch64.cc | 1061
1 file changed, 838 insertions, 223 deletions
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 4e80114..2dbaf4a 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -83,6 +83,7 @@ #include "rtlanal.h" #include "tree-dfa.h" #include "asan.h" +#include "aarch64-elf-metadata.h" #include "aarch64-feature-deps.h" #include "config/arm/aarch-common.h" #include "config/arm/aarch-common-protos.h" @@ -108,6 +109,10 @@ and 1 MOVI/DUP (same size as a call). */ #define MAX_SET_SIZE(speed) (speed ? 256 : 96) +#ifndef HAVE_AS_AEABI_BUILD_ATTRIBUTES +#define HAVE_AS_AEABI_BUILD_ATTRIBUTES 0 +#endif + /* Flags that describe how a function shares certain architectural state with its callers. @@ -351,7 +356,8 @@ static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool); static bool aarch64_builtin_support_vector_misalignment (machine_mode mode, const_tree type, int misalignment, - bool is_packed); + bool is_packed, + bool is_gather_scatter); static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64); static bool aarch64_print_address_internal (FILE*, machine_mode, rtx, aarch64_addr_query_type); @@ -424,6 +430,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] = #include "tuning_models/neoversev2.h" #include "tuning_models/neoversev3.h" #include "tuning_models/neoversev3ae.h" +#include "tuning_models/olympus.h" #include "tuning_models/a64fx.h" #include "tuning_models/fujitsu_monaka.h" @@ -954,6 +961,39 @@ svpattern_token (enum aarch64_svpattern pattern) gcc_unreachable (); } +/* Return true if RHS is an operand suitable for a CB<cc> (immediate) + instruction. OP_CODE determines the type of the comparison. */ +bool +aarch64_cb_rhs (rtx_code op_code, rtx rhs) +{ + if (!CONST_INT_P (rhs)) + return REG_P (rhs); + + HOST_WIDE_INT rhs_val = INTVAL (rhs); + + switch (op_code) + { + case EQ: + case NE: + case GT: + case GTU: + case LT: + case LTU: + return IN_RANGE (rhs_val, 0, 63); + + case GE: /* CBGE: signed greater than or equal */ + case GEU: /* CBHS: unsigned greater than or equal */ + return IN_RANGE (rhs_val, 1, 64); + + case LE: /* CBLE: signed less than or equal */ + case LEU: /* CBLS: unsigned less than or equal */ + return IN_RANGE (rhs_val, -1, 62); + + default: + return false; + } +} + /* Return the location of a piece that is known to be passed or returned in registers. FIRST_ZR is the first unused vector argument register and FIRST_PR is the first unused predicate argument register. 
*/ @@ -2879,10 +2919,10 @@ aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum, emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask)); rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM); rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx); - return gen_condjump (x, cc_reg, label); + return gen_aarch64_bcond (x, cc_reg, label); } - return gen_aarch64_tb (code, mode, mode, - x, gen_int_mode (bitnum, mode), label); + return gen_aarch64_tbz (code, mode, mode, + x, gen_int_mode (bitnum, mode), label); } /* Consider the operation: @@ -3201,8 +3241,7 @@ aarch64_load_symref_appropriately (rtx dest, rtx imm, aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm)); else aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm)); - insns = get_insns (); - end_sequence (); + insns = end_sequence (); RTL_CONST_CALL_P (insns) = 1; emit_libcall_block (insns, tmp_reg, result, imm); @@ -3667,6 +3706,14 @@ aarch64_partial_ptrue_length (rtx_vector_builder &builder, if (builder.nelts_per_pattern () == 3) return 0; + /* It is conservatively correct to drop the element size to a lower value, + and we must do so if the predicate consists of a leading "foreground" + sequence that is smaller than the element size. Without this, + we would test only one bit and so treat everything as either an + all-true or an all-false predicate. */ + if (builder.nelts_per_pattern () == 2) + elt_size = MIN (elt_size, builder.npatterns ()); + /* Skip over leading set bits. */ unsigned int nelts = builder.encoded_nelts (); unsigned int i = 0; @@ -3698,6 +3745,24 @@ aarch64_partial_ptrue_length (rtx_vector_builder &builder, return vl; } +/* Return: + + * -1 if all bits of PRED are set + * N if PRED has N leading set bits followed by all clear bits + * 0 if PRED does not have any of these forms. */ + +int +aarch64_partial_ptrue_length (rtx pred) +{ + rtx_vector_builder builder; + if (!aarch64_get_sve_pred_bits (builder, pred)) + return 0; + + auto elt_size = vector_element_size (GET_MODE_BITSIZE (GET_MODE (pred)), + GET_MODE_NUNITS (GET_MODE (pred))); + return aarch64_partial_ptrue_length (builder, elt_size); +} + /* See if there is an svpattern that encodes an SVE predicate of mode PRED_MODE in which the first VL bits are set and the rest are clear. Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS. @@ -3830,18 +3895,91 @@ aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2) return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]); } + +/* Generate a predicate to control partial SVE mode DATA_MODE as if it + were fully packed, enabling the defined elements only. */ +rtx +aarch64_sve_packed_pred (machine_mode data_mode) +{ + unsigned int container_bytes + = aarch64_sve_container_bits (data_mode) / BITS_PER_UNIT; + /* Enable the significand of each container only. */ + rtx ptrue = force_reg (VNx16BImode, aarch64_ptrue_all (container_bytes)); + /* Predicate at the element size. */ + machine_mode pmode + = aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (data_mode)).require (); + return gen_lowpart (pmode, ptrue); +} + +/* Generate a predicate and strictness value to govern a floating-point + operation with SVE mode DATA_MODE. + + If DATA_MODE is a partial vector mode, this pair prevents the operation + from interpreting undefined elements - unless we don't need to suppress + their trapping behavior. 
*/ +rtx +aarch64_sve_fp_pred (machine_mode data_mode, rtx *strictness) +{ + unsigned int vec_flags = aarch64_classify_vector_mode (data_mode); + if (flag_trapping_math && (vec_flags & VEC_PARTIAL)) + { + if (strictness) + *strictness = gen_int_mode (SVE_STRICT_GP, SImode); + return aarch64_sve_packed_pred (data_mode); + } + if (strictness) + *strictness = gen_int_mode (SVE_RELAXED_GP, SImode); + /* Use the VPRED mode. */ + return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode)); +} + +/* PRED is a predicate that governs an operation on DATA_MODE. If DATA_MODE + is a partial vector mode, and if exceptions must be suppressed for its + undefined elements, convert PRED from a container-level predicate to + an element-level predicate and ensure that the undefined elements + are inactive. Make no changes otherwise. + + Return the resultant predicate. */ +rtx +aarch64_sve_emit_masked_fp_pred (machine_mode data_mode, rtx pred) +{ + unsigned int vec_flags = aarch64_classify_vector_mode (data_mode); + if (flag_trapping_math && (vec_flags & VEC_PARTIAL)) + { + /* Generate an element-level mask. */ + rtx mask = aarch64_sve_packed_pred (data_mode); + machine_mode pmode = GET_MODE (mask); + + /* Apply the existing predicate. */ + rtx dst = gen_reg_rtx (pmode); + emit_insn (gen_and3 (pmode, dst, mask, + gen_lowpart (pmode, pred))); + return dst; + } + + return pred; +} + /* Emit a comparison CMP between OP0 and OP1, both of which have mode DATA_MODE, and return the result in a predicate of mode PRED_MODE. - Use TARGET as the target register if nonnull and convenient. */ + Use TARGET as the target register if nonnull and convenient. + + PRED_MODE can be either VNx16BI or the natural predicate mode for + DATA_MODE. */ static rtx aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp, machine_mode data_mode, rtx op1, rtx op2) { - insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode); + auto src_pred_mode = aarch64_sve_pred_mode (data_mode); + insn_code icode; + if (known_eq (GET_MODE_NUNITS (pred_mode), GET_MODE_NUNITS (data_mode))) + icode = code_for_aarch64_pred_cmp (cmp, data_mode); + else + icode = code_for_aarch64_pred_cmp_acle (cmp, data_mode); expand_operand ops[5]; create_output_operand (&ops[0], target, pred_mode); - create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode); + create_input_operand (&ops[1], CONSTM1_RTX (src_pred_mode), src_pred_mode); create_integer_operand (&ops[2], SVE_KNOWN_PTRUE); create_input_operand (&ops[3], op1, data_mode); create_input_operand (&ops[4], op2, data_mode); @@ -3849,15 +3987,14 @@ aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp, return ops[0].value; } -/* Use a comparison to convert integer vector SRC into MODE, which is - the corresponding SVE predicate mode. Use TARGET for the result - if it's nonnull and convenient. */ +/* Use a comparison to convert integer vector SRC into VNx16BI. + Use TARGET for the result if it's nonnull and convenient. 
*/ rtx -aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src) +aarch64_convert_sve_data_to_pred (rtx target, rtx src) { machine_mode src_mode = GET_MODE (src); - return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode, + return aarch64_sve_emit_int_cmp (target, VNx16BImode, NE, src_mode, src, CONST0_RTX (src_mode)); } @@ -5939,9 +6076,9 @@ aarch64_sve_move_pred_via_while (rtx target, machine_mode mode, unsigned int vl) { rtx limit = force_reg (DImode, gen_int_mode (vl, DImode)); - target = aarch64_target_reg (target, mode); - emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode, - target, const0_rtx, limit)); + target = aarch64_target_reg (target, VNx16BImode); + emit_insn (gen_aarch64_sve_while_acle (UNSPEC_WHILELO, DImode, mode, + target, const0_rtx, limit)); return target; } @@ -6087,8 +6224,7 @@ aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder, operands but permutes them as though they had mode MODE. */ machine_mode mode = aarch64_sve_pred_mode (permute_size).require (); target = aarch64_target_reg (target, GET_MODE (a)); - rtx type_reg = CONST0_RTX (mode); - emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg)); + emit_insn (gen_aarch64_sve_acle (UNSPEC_TRN1, mode, target, a, b)); return target; } @@ -6170,8 +6306,7 @@ aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder) for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) int_builder.quick_push (INTVAL (builder.elt (i)) ? constm1_rtx : const0_rtx); - return aarch64_convert_sve_data_to_pred (target, VNx16BImode, - int_builder.build ()); + return aarch64_convert_sve_data_to_pred (target, int_builder.build ()); } /* Set DEST to immediate IMM. */ @@ -6410,19 +6545,51 @@ aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl, return gen_rtx_MEM (mode, force_reg (Pmode, addr)); } -/* Emit an SVE predicated move from SRC to DEST. PRED is a predicate - that is known to contain PTRUE. */ +/* Emit a load/store from a subreg of SRC to a subreg of DEST. + The subregs have mode NEW_MODE. Use only for reg<->mem moves. */ +void +aarch64_emit_load_store_through_mode (rtx dest, rtx src, machine_mode new_mode) +{ + gcc_assert ((MEM_P (dest) && register_operand (src, VOIDmode)) + || (MEM_P (src) && register_operand (dest, VOIDmode))); + auto mode = GET_MODE (dest); + auto int_mode = aarch64_sve_int_mode (mode); + if (MEM_P (src)) + { + rtx tmp = force_reg (new_mode, adjust_address (src, new_mode, 0)); + tmp = force_lowpart_subreg (int_mode, tmp, new_mode); + emit_move_insn (dest, force_lowpart_subreg (mode, tmp, int_mode)); + } + else + { + src = force_lowpart_subreg (int_mode, src, mode); + emit_move_insn (adjust_address (dest, new_mode, 0), + force_lowpart_subreg (new_mode, src, int_mode)); + } +} + +/* PRED is a predicate that is known to contain PTRUE. + For 128-bit VLS loads/stores, emit LDR/STR. + Else, emit an SVE predicated move from SRC to DEST. 
*/ void aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src) { - expand_operand ops[3]; machine_mode mode = GET_MODE (dest); - create_output_operand (&ops[0], dest, mode); - create_input_operand (&ops[1], pred, GET_MODE(pred)); - create_input_operand (&ops[2], src, mode); - temporary_volatile_ok v (true); - expand_insn (code_for_aarch64_pred_mov (mode), 3, ops); + if ((MEM_P (dest) || MEM_P (src)) + && known_eq (GET_MODE_SIZE (mode), 16) + && aarch64_classify_vector_mode (mode) == VEC_SVE_DATA + && !BYTES_BIG_ENDIAN) + aarch64_emit_load_store_through_mode (dest, src, V16QImode); + else + { + expand_operand ops[3]; + create_output_operand (&ops[0], dest, mode); + create_input_operand (&ops[1], pred, GET_MODE(pred)); + create_input_operand (&ops[2], src, mode); + temporary_volatile_ok v (true); + expand_insn (code_for_aarch64_pred_mov (mode), 3, ops); + } } /* Expand a pre-RA SVE data move from SRC to DEST in which at least one @@ -6591,6 +6758,27 @@ aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src) dest, ptrue, src)); } +/* Set predicate register DEST such that every element has the scalar + boolean value in SRC, with any nonzero source counting as "true". + MODE is a MODE_VECTOR_BOOL that determines the element size; + DEST can have this mode or VNx16BImode. In the latter case, + the upper bits of each element are defined to be zero, as for + the .H, .S, and .D forms of PTRUE. */ + +void +aarch64_emit_sve_pred_vec_duplicate (machine_mode mode, rtx dest, rtx src) +{ + rtx tmp = gen_reg_rtx (DImode); + emit_insn (gen_ashldi3 (tmp, gen_lowpart (DImode, src), + gen_int_mode (63, DImode))); + if (GET_MODE (dest) == VNx16BImode) + emit_insn (gen_aarch64_sve_while_acle (UNSPEC_WHILELO, DImode, mode, + dest, const0_rtx, tmp)); + else + emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode, + dest, const0_rtx, tmp)); +} + static bool aarch64_function_ok_for_sibcall (tree, tree exp) { @@ -8699,6 +8887,13 @@ aarch_bti_j_insn_p (rtx_insn *insn) return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J; } +/* Return TRUE if Pointer Authentication for the return address is enabled. */ +bool +aarch64_pacret_enabled (void) +{ + return (aarch_ra_sign_scope != AARCH_FUNCTION_NONE); +} + /* Return TRUE if Guarded Control Stack is enabled. */ bool aarch64_gcs_enabled (void) @@ -9417,13 +9612,16 @@ aarch64_emit_stack_tie (rtx reg) } /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch - registers. If POLY_SIZE is not large enough to require a probe this function - will only adjust the stack. When allocating the stack space - FRAME_RELATED_P is then used to indicate if the allocation is frame related. - FINAL_ADJUSTMENT_P indicates whether we are allocating the area below - the saved registers. If we are then we ensure that any allocation - larger than the ABI defined buffer needs a probe so that the - invariant of having a 1KB buffer is maintained. + registers, given that the stack pointer is currently BYTES_BELOW_SP bytes + above the bottom of the static frame. + + If POLY_SIZE is not large enough to require a probe this function will only + adjust the stack. When allocating the stack space FRAME_RELATED_P is then + used to indicate if the allocation is frame related. FINAL_ADJUSTMENT_P + indicates whether we are allocating the area below the saved registers. + If we are then we ensure that any allocation larger than the ABI defined + buffer needs a probe so that the invariant of having a 1KB buffer is + maintained. 
We emit barriers after each stack adjustment to prevent optimizations from breaking the invariant that we never drop the stack more than a page. This @@ -9440,6 +9638,7 @@ aarch64_emit_stack_tie (rtx reg) static void aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, poly_int64 poly_size, + poly_int64 bytes_below_sp, aarch64_isa_mode force_isa_mode, bool frame_related_p, bool final_adjustment_p) @@ -9503,8 +9702,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, poly_size, temp1, temp2, force_isa_mode, false, true); - rtx_insn *insn = get_last_insn (); - + auto initial_cfa_offset = frame.frame_size - bytes_below_sp; + auto final_cfa_offset = initial_cfa_offset + poly_size; if (frame_related_p) { /* This is done to provide unwinding information for the stack @@ -9514,28 +9713,31 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, The tie will expand to nothing but the optimizers will not touch the instruction. */ rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM); - emit_move_insn (stack_ptr_copy, stack_pointer_rtx); + auto *insn = emit_move_insn (stack_ptr_copy, stack_pointer_rtx); aarch64_emit_stack_tie (stack_ptr_copy); /* We want the CFA independent of the stack pointer for the duration of the loop. */ - add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy); + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, stack_ptr_copy, + initial_cfa_offset)); RTX_FRAME_RELATED_P (insn) = 1; } rtx probe_const = gen_int_mode (min_probe_threshold, Pmode); rtx guard_const = gen_int_mode (guard_size, Pmode); - insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx, - stack_pointer_rtx, temp1, - probe_const, guard_const)); + auto *insn + = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx, + stack_pointer_rtx, temp1, + probe_const, guard_const)); /* Now reset the CFA register if needed. */ if (frame_related_p) { add_reg_note (insn, REG_CFA_DEF_CFA, - gen_rtx_PLUS (Pmode, stack_pointer_rtx, - gen_int_mode (poly_size, Pmode))); + plus_constant (Pmode, stack_pointer_rtx, + final_cfa_offset)); RTX_FRAME_RELATED_P (insn) = 1; } @@ -9581,12 +9783,13 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, We can determine which allocation we are doing by looking at the value of FRAME_RELATED_P since the final allocations are not frame related. */ + auto cfa_offset = frame.frame_size - (bytes_below_sp - rounded_size); if (frame_related_p) { /* We want the CFA independent of the stack pointer for the duration of the loop. */ add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, temp1, rounded_size)); + plus_constant (Pmode, temp1, cfa_offset)); RTX_FRAME_RELATED_P (insn) = 1; } @@ -9608,7 +9811,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, if (frame_related_p) { add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, stack_pointer_rtx, rounded_size)); + plus_constant (Pmode, stack_pointer_rtx, cfa_offset)); RTX_FRAME_RELATED_P (insn) = 1; } @@ -9916,17 +10119,22 @@ aarch64_expand_prologue (void) code below does not handle it for -fstack-clash-protection. */ gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0); + /* The offset of the current SP from the bottom of the static frame. */ + poly_int64 bytes_below_sp = frame_size; + /* Will only probe if the initial adjustment is larger than the guard less the amount of the guard reserved for use by the caller's outgoing args. 
*/ aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust, - force_isa_mode, true, false); + bytes_below_sp, force_isa_mode, + true, false); + bytes_below_sp -= initial_adjust; if (callee_adjust != 0) - aarch64_push_regs (reg1, reg2, callee_adjust); - - /* The offset of the current SP from the bottom of the static frame. */ - poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust; + { + aarch64_push_regs (reg1, reg2, callee_adjust); + bytes_below_sp -= callee_adjust; + } if (emit_frame_chain) { @@ -9994,7 +10202,7 @@ aarch64_expand_prologue (void) || known_eq (frame.reg_offset[VG_REGNUM], bytes_below_sp)); aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, sve_callee_adjust, - force_isa_mode, + bytes_below_sp, force_isa_mode, !frame_pointer_needed, false); bytes_below_sp -= sve_callee_adjust; } @@ -10005,10 +10213,11 @@ aarch64_expand_prologue (void) /* We may need to probe the final adjustment if it is larger than the guard that is assumed by the called. */ - gcc_assert (known_eq (bytes_below_sp, final_adjust)); aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust, - force_isa_mode, + bytes_below_sp, force_isa_mode, !frame_pointer_needed, true); + bytes_below_sp -= final_adjust; + gcc_assert (known_eq (bytes_below_sp, 0)); if (emit_frame_chain && maybe_ne (final_adjust, 0)) aarch64_emit_stack_tie (hard_frame_pointer_rtx); @@ -14507,6 +14716,13 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, we don't need to consider that here. */ if (x == const0_rtx) *cost = 0; + /* If the outer is a COMPARE which is used by the middle-end + and the constant fits how the cmp instruction allows, say the cost + is the same as 1 insn. */ + else if (outer == COMPARE + && (aarch64_uimm12_shift (INTVAL (x)) + || aarch64_uimm12_shift (-UINTVAL (x)))) + *cost = COSTS_N_INSNS (1); else { /* To an approximation, building any other constant is @@ -15693,11 +15909,14 @@ cost_plus: break; case CONST_VECTOR: { - /* Load using MOVI/MVNI. */ - if (aarch64_simd_valid_mov_imm (x)) - *cost = extra_cost->vect.movi; - else /* Load using constant pool. */ - *cost = extra_cost->ldst.load; + if (speed) + { + /* Load using MOVI/MVNI. */ + if (aarch64_simd_valid_mov_imm (x)) + *cost += extra_cost->vect.movi; + else /* Load using constant pool. */ + *cost += extra_cost->ldst.load; + } break; } case VEC_CONCAT: @@ -15706,7 +15925,8 @@ cost_plus: break; case VEC_DUPLICATE: /* Load using a DUP. 
*/ - *cost = extra_cost->vect.dup; + if (speed) + *cost += extra_cost->vect.dup; return false; case VEC_SELECT: { @@ -15714,13 +15934,16 @@ cost_plus: *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed); /* cost subreg of 0 as free, otherwise as DUP */ - rtx op1 = XEXP (x, 1); - if (vec_series_lowpart_p (mode, GET_MODE (op1), op1)) - ; - else if (vec_series_highpart_p (mode, GET_MODE (op1), op1)) - *cost = extra_cost->vect.dup; - else - *cost = extra_cost->vect.extract; + if (speed) + { + rtx op1 = XEXP (x, 1); + if (vec_series_lowpart_p (mode, GET_MODE (op1), op1)) + ; + else if (vec_series_highpart_p (mode, GET_MODE (op1), op1)) + *cost += extra_cost->vect.dup; + else + *cost += extra_cost->vect.extract; + } return true; } default: @@ -16996,8 +17219,8 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info, && STMT_VINFO_DATA_REF (stmt_info)) { stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); - if (stmt_info - && vect_mem_access_type (stmt_info, node) == VMAT_LOAD_STORE_LANES) + if (node + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_LOAD_STORE_LANES) return DR_GROUP_SIZE (stmt_info); } return 0; @@ -17268,8 +17491,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, for each element. We therefore need to divide the full-instruction cost by the number of elements in the vector. */ if (kind == scalar_load + && node && sve_costs - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) { unsigned int nunits = vect_nunits_for_cost (vectype); /* Test for VNx2 modes, which have 64-bit containers. */ @@ -17281,8 +17505,9 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, /* Detect cases in which a scalar_store is really storing one element in a scatter operation. */ if (kind == scalar_store + && node && sve_costs - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) return sve_costs->scatter_store_elt_cost; /* Detect cases in which vec_to_scalar represents an in-loop reduction. */ @@ -17538,7 +17763,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, if (stmt_info && kind == vec_to_scalar && (m_vec_flags & VEC_ADVSIMD) - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) { auto dr = STMT_VINFO_DATA_REF (stmt_info); tree dr_ref = DR_REF (dr); @@ -17551,7 +17776,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, { if (gimple_vuse (SSA_NAME_DEF_STMT (offset))) { - if (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type) + if (SLP_TREE_TYPE (node) == load_vec_info_type) ops->loads += count - 1; else /* Stores want to count both the index to array and data to @@ -17653,7 +17878,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, if (stmt_info && sve_issue && (kind == scalar_load || kind == scalar_store) - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) { unsigned int pairs = CEIL (count, 2); ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs; @@ -17771,7 +17996,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, /* Do one-time initialization based on the vinfo. 
*/ loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo); - if (!m_analyzed_vinfo) + if (!m_analyzed_vinfo && !m_costing_for_scalar) { if (loop_vinfo) analyze_loop_vinfo (loop_vinfo); @@ -17808,8 +18033,10 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, /* Check if we've seen an SVE gather/scatter operation and which size. */ if (kind == scalar_load + && node + && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)) - && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) + && SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_GATHER_SCATTER) { const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve; if (sve_costs) @@ -18632,6 +18859,8 @@ aarch64_adjust_generic_arch_tuning (struct tune_params ¤t_tune) if (TARGET_SVE2) current_tune.extra_tuning_flags &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS; + if (!AARCH64_HAVE_ISA(V8_8A)) + aarch64_tune_params.extra_tuning_flags |= AARCH64_EXTRA_TUNE_AVOID_LDAPUR; } static void @@ -18696,7 +18925,10 @@ aarch64_override_options_internal (struct gcc_options *opts) /* Make a copy of the tuning parameters attached to the core, which we may later overwrite. */ aarch64_tune_params = *(tune->tune); - if (tune->tune == &generic_tunings) + + if (tune->tune == &generic_tunings + || tune->tune == &generic_armv8_a_tunings + || tune->tune == &generic_armv9_a_tunings) aarch64_adjust_generic_arch_tuning (aarch64_tune_params); if (opts->x_aarch64_override_tune_string) @@ -18748,9 +18980,16 @@ aarch64_override_options_internal (struct gcc_options *opts) aarch64_stack_protector_guard_offset = offs; } - if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK) - && !fixed_regs[R18_REGNUM]) - error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>"); + if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)) + { + if (!fixed_regs[R18_REGNUM]) + error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>"); +#ifdef TARGET_OS_USES_R18 + else + sorry ("%<-fsanitize=shadow-call-stack%> conflicts with the use of" + " register x18 by the target operating system"); +#endif + } aarch64_feature_flags isa_flags = aarch64_get_isa_flags (opts); if ((isa_flags & (AARCH64_FL_SM_ON | AARCH64_FL_ZA_ON)) @@ -18901,6 +19140,20 @@ aarch64_override_options_internal (struct gcc_options *opts) if (TARGET_SME && !TARGET_SVE2) sorry ("no support for %qs without %qs", "sme", "sve2"); + /* Set scalar costing to a high value such that we always pick + vectorization. Increase scalar costing by 10000%. */ + if (opts->x_flag_aarch64_max_vectorization) + SET_OPTION_IF_UNSET (opts, &global_options_set, + param_vect_scalar_cost_multiplier, 10000); + + /* Synchronize the -mautovec-preference and aarch64_autovec_preference using + whichever one is not default. If both are set then prefer the param flag + over the parameters. 
*/ + if (opts->x_autovec_preference != AARCH64_AUTOVEC_DEFAULT) + SET_OPTION_IF_UNSET (opts, &global_options_set, + aarch64_autovec_preference, + opts->x_autovec_preference); + aarch64_override_options_after_change_1 (opts); } @@ -19651,6 +19904,8 @@ static const struct aarch64_attribute_info aarch64_attributes[] = OPT_msign_return_address_ }, { "outline-atomics", aarch64_attr_bool, true, NULL, OPT_moutline_atomics}, + { "max-vectorization", aarch64_attr_bool, false, NULL, + OPT_mmax_vectorization}, { NULL, aarch64_attr_custom, false, NULL, OPT____ } }; @@ -19769,8 +20024,9 @@ aarch64_process_one_target_attr (char *arg_str) if (valid) { set_option (&global_options, NULL, p_attr->opt_num, value, - NULL, DK_UNSPECIFIED, input_location, - global_dc); + NULL, + static_cast<int> (diagnostics::kind::unspecified), + input_location, global_dc); } else { @@ -20282,6 +20538,8 @@ aarch64_compare_version_priority (tree decl1, tree decl2) unsigned long _size; // Size of the struct, so it can grow. unsigned long _hwcap; unsigned long _hwcap2; + unsigned long _hwcap3; + unsigned long _hwcap4; } */ @@ -20298,14 +20556,24 @@ build_ifunc_arg_type () tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL, get_identifier ("_hwcap2"), long_unsigned_type_node); + tree field4 = build_decl (UNKNOWN_LOCATION, FIELD_DECL, + get_identifier ("_hwcap3"), + long_unsigned_type_node); + tree field5 = build_decl (UNKNOWN_LOCATION, FIELD_DECL, + get_identifier ("_hwcap4"), + long_unsigned_type_node); DECL_FIELD_CONTEXT (field1) = ifunc_arg_type; DECL_FIELD_CONTEXT (field2) = ifunc_arg_type; DECL_FIELD_CONTEXT (field3) = ifunc_arg_type; + DECL_FIELD_CONTEXT (field4) = ifunc_arg_type; + DECL_FIELD_CONTEXT (field5) = ifunc_arg_type; TYPE_FIELDS (ifunc_arg_type) = field1; DECL_CHAIN (field1) = field2; DECL_CHAIN (field2) = field3; + DECL_CHAIN (field3) = field4; + DECL_CHAIN (field4) = field5; layout_type (ifunc_arg_type); @@ -20777,7 +21045,6 @@ aarch64_get_function_versions_dispatcher (void *decl) struct cgraph_node *node = NULL; struct cgraph_node *default_node = NULL; struct cgraph_function_version_info *node_v = NULL; - struct cgraph_function_version_info *first_v = NULL; tree dispatch_decl = NULL; @@ -20794,37 +21061,16 @@ aarch64_get_function_versions_dispatcher (void *decl) if (node_v->dispatcher_resolver != NULL) return node_v->dispatcher_resolver; - /* Find the default version and make it the first node. */ - first_v = node_v; - /* Go to the beginning of the chain. */ - while (first_v->prev != NULL) - first_v = first_v->prev; - default_version_info = first_v; - while (default_version_info != NULL) - { - if (get_feature_mask_for_version - (default_version_info->this_node->decl) == 0ULL) - break; - default_version_info = default_version_info->next; - } + /* The default node is always the beginning of the chain. */ + default_version_info = node_v; + while (default_version_info->prev) + default_version_info = default_version_info->prev; + default_node = default_version_info->this_node; /* If there is no default node, just return NULL. */ - if (default_version_info == NULL) + if (!is_function_default_version (default_node->decl)) return NULL; - /* Make default info the first node. 
*/ - if (first_v != default_version_info) - { - default_version_info->prev->next = default_version_info->next; - if (default_version_info->next) - default_version_info->next->prev = default_version_info->prev; - first_v->prev = default_version_info; - default_version_info->next = first_v; - default_version_info->prev = NULL; - } - - default_node = default_version_info->this_node; - if (targetm.has_ifunc_p ()) { struct cgraph_function_version_info *it_v = NULL; @@ -21968,6 +22214,14 @@ aarch64_conditional_register_usage (void) fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1; call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1; } + +#ifdef TARGET_OS_USES_R18 + /* R18 is the STATIC_CHAIN_REGNUM on most aarch64 ports, but VxWorks + uses it as the TCB, so aarch64-vxworks.h overrides + STATIC_CHAIN_REGNUM, and here we mark R18 as fixed. */ + fixed_regs[R18_REGNUM] = 1; + call_used_regs[R18_REGNUM] = 1; +#endif } /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */ @@ -22904,6 +23158,58 @@ aarch64_sve_index_immediate_p (rtx base_or_step) && IN_RANGE (INTVAL (base_or_step), -16, 15)); } +/* Return true if SERIES is a constant vector that can be loaded using + an immediate SVE INDEX, considering both SVE and Advanced SIMD modes. + When returning true, store the base in *BASE_OUT and the step + in *STEP_OUT. */ + +static bool +aarch64_sve_index_series_p (rtx series, rtx *base_out, rtx *step_out) +{ + rtx base, step; + if (!const_vec_series_p (series, &base, &step) + || !CONST_INT_P (base) + || !CONST_INT_P (step)) + return false; + + auto mode = GET_MODE (series); + auto elt_mode = as_a<scalar_int_mode> (GET_MODE_INNER (mode)); + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + if (BYTES_BIG_ENDIAN && (vec_flags & VEC_ADVSIMD)) + { + /* On big-endian targets, architectural lane 0 holds the last element + for Advanced SIMD and the first element for SVE; see the comment at + the head of aarch64-sve.md for details. This means that, from an SVE + point of view, an Advanced SIMD series goes from the last element to + the first. */ + auto i = GET_MODE_NUNITS (mode).to_constant () - 1; + base = gen_int_mode (UINTVAL (base) + i * UINTVAL (step), elt_mode); + step = gen_int_mode (-UINTVAL (step), elt_mode); + } + + if (!aarch64_sve_index_immediate_p (base) + || !aarch64_sve_index_immediate_p (step)) + return false; + + /* If the mode spans multiple registers, check that each subseries is + in range. */ + unsigned int nvectors = aarch64_ldn_stn_vectors (mode); + if (nvectors != 1) + { + unsigned int nunits; + if (!GET_MODE_NUNITS (mode).is_constant (&nunits)) + return false; + nunits /= nvectors; + for (unsigned int i = 1; i < nvectors; ++i) + if (!IN_RANGE (INTVAL (base) + i * nunits * INTVAL (step), -16, 15)) + return false; + } + + *base_out = base; + *step_out = step; + return true; +} + /* Return true if X is a valid immediate for the SVE ADD and SUB instructions when applied to mode MODE. Negate X first if NEGATE_P is true. */ @@ -23352,13 +23658,8 @@ aarch64_simd_valid_imm (rtx op, simd_immediate_info *info, n_elts = CONST_VECTOR_NPATTERNS (op); else if (which == AARCH64_CHECK_MOV && TARGET_SVE - && const_vec_series_p (op, &base, &step)) + && aarch64_sve_index_series_p (op, &base, &step)) { - gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); - if (!aarch64_sve_index_immediate_p (base) - || !aarch64_sve_index_immediate_p (step)) - return false; - if (info) { /* Get the corresponding container mode. E.g. 
an INDEX on V2SI @@ -23470,6 +23771,8 @@ aarch64_simd_valid_imm (rtx op, simd_immediate_info *info, long int as_long_ints[2]; as_long_ints[0] = ival & 0xFFFFFFFF; as_long_ints[1] = (ival >> 32) & 0xFFFFFFFF; + if (imode == DImode && FLOAT_WORDS_BIG_ENDIAN) + std::swap (as_long_ints[0], as_long_ints[1]); REAL_VALUE_TYPE r; real_from_target (&r, as_long_ints, fmode); @@ -23495,6 +23798,39 @@ aarch64_simd_valid_imm (rtx op, simd_immediate_info *info, return false; } +/* Try to optimize the expansion of a maskload or maskstore with + the operands in OPERANDS, given that the vector being loaded or + stored has mode MODE. Return true on success or false if the normal + expansion should be used. */ + +bool +aarch64_expand_maskloadstore (rtx *operands, machine_mode mode) +{ + /* If the predicate in operands[2] is a patterned SVE PTRUE predicate + with patterns VL1, VL2, VL4, VL8, or VL16 and at most the bottom + 128 bits are loaded/stored, emit an ASIMD load/store. */ + int vl = aarch64_partial_ptrue_length (operands[2]); + int width = vl * GET_MODE_UNIT_BITSIZE (mode); + if (width <= 128 + && pow2p_hwi (vl) + && (vl == 1 + || (!BYTES_BIG_ENDIAN + && aarch64_classify_vector_mode (mode) == VEC_SVE_DATA))) + { + machine_mode new_mode; + if (known_eq (width, 128)) + new_mode = V16QImode; + else if (known_eq (width, 64)) + new_mode = V8QImode; + else + new_mode = int_mode_for_size (width, 0).require (); + aarch64_emit_load_store_through_mode (operands[0], operands[1], + new_mode); + return true; + } + return false; +} + /* Return true if OP is a valid SIMD move immediate for SVE or AdvSIMD. */ bool aarch64_simd_valid_mov_imm (rtx op) @@ -23516,6 +23852,36 @@ aarch64_simd_valid_and_imm (rtx op) return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_AND); } +/* Return true if OP is a valid SIMD and immediate which allows the and to be + optimized as fmov. If ELT_BITSIZE is nonnull, use it to return the number of + bits to move. */ +bool +aarch64_simd_valid_and_imm_fmov (rtx op, unsigned int *elt_bitsize) +{ + machine_mode mode = GET_MODE (op); + gcc_assert (!aarch64_sve_mode_p (mode)); + + auto_vec<target_unit, 16> buffer; + unsigned int n_bytes = GET_MODE_SIZE (mode).to_constant (); + buffer.reserve (n_bytes); + + bool ok = native_encode_rtx (mode, op, buffer, 0, n_bytes); + gcc_assert (ok); + + auto mask = native_decode_int (buffer, 0, n_bytes, n_bytes * BITS_PER_UNIT); + int set_bit = wi::exact_log2 (mask + 1); + if ((set_bit == 16 && TARGET_SIMD_F16INST) + || set_bit == 32 + || set_bit == 64) + { + if (elt_bitsize) + *elt_bitsize = set_bit; + return true; + } + + return false; +} + /* Return true if OP is a valid SIMD xor immediate for SVE. */ bool aarch64_simd_valid_xor_imm (rtx op) @@ -23551,6 +23917,19 @@ aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left) return IN_RANGE (INTVAL (x), 1, bit_width); } + +/* Check whether X can control SVE mode MODE. */ +bool +aarch64_sve_valid_pred_p (rtx x, machine_mode mode) +{ + machine_mode pred_mode = GET_MODE (x); + if (!aarch64_sve_pred_mode_p (pred_mode)) + return false; + + return known_ge (GET_MODE_NUNITS (pred_mode), + GET_MODE_NUNITS (mode)); +} + /* Return the bitmask CONST_INT to select the bits required by a zero extract operation of width WIDTH at bit position POS. */ @@ -23809,6 +24188,16 @@ aarch64_strided_registers_p (rtx *operands, unsigned int num_operands, return true; } +/* Return the base 2 logarithm of the bit inverse of OP masked by the lowest + NELTS bits, if OP is a power of 2. Otherwise, returns -1. 
*/ + +int +aarch64_exact_log2_inverse (unsigned int nelts, rtx op) +{ + return exact_log2 ((~INTVAL (op)) + & ((HOST_WIDE_INT_1U << nelts) - 1)); +} + /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and HIGH (exclusive). */ void @@ -24096,10 +24485,14 @@ aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed) static bool aarch64_builtin_support_vector_misalignment (machine_mode mode, const_tree type, int misalignment, - bool is_packed) + bool is_packed, + bool is_gather_scatter) { if (TARGET_SIMD && STRICT_ALIGNMENT) { + if (is_gather_scatter) + return true; + /* Return if movmisalign pattern is not supported for this mode. */ if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing) return false; @@ -24109,7 +24502,8 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode, return false; } return default_builtin_support_vector_misalignment (mode, type, misalignment, - is_packed); + is_packed, + is_gather_scatter); } /* If VALS is a vector constant that can be loaded into a register @@ -24514,6 +24908,28 @@ seq_cost_ignoring_scalar_moves (const rtx_insn *seq, bool speed) return cost; } +/* *VECTOR is an Advanced SIMD structure mode and *INDEX is a constant index + into it. Narrow *VECTOR and *INDEX so that they reference a single vector + of mode SUBVEC_MODE. IS_DEST is true if *VECTOR is a destination operand, + false if it is a source operand. */ + +void +aarch64_decompose_vec_struct_index (machine_mode subvec_mode, + rtx *vector, rtx *index, bool is_dest) +{ + auto elts_per_vector = GET_MODE_NUNITS (subvec_mode).to_constant (); + auto subvec = UINTVAL (*index) / elts_per_vector; + auto subelt = UINTVAL (*index) % elts_per_vector; + auto subvec_byte = subvec * GET_MODE_SIZE (subvec_mode); + if (is_dest) + *vector = simplify_gen_subreg (subvec_mode, *vector, GET_MODE (*vector), + subvec_byte); + else + *vector = force_subreg (subvec_mode, *vector, GET_MODE (*vector), + subvec_byte); + *index = gen_int_mode (subelt, SImode); +} + /* Expand a vector initialization sequence, such that TARGET is initialized to contain VALS. */ @@ -24547,12 +24963,18 @@ aarch64_expand_vector_init (rtx target, rtx vals) rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals)); aarch64_expand_vector_init (tmp_reg, new_vals); halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0); - rtx_insn *rec_seq = get_insns (); - end_sequence (); + rtx_insn *rec_seq = end_sequence (); costs[i] = seq_cost_ignoring_scalar_moves (rec_seq, !optimize_size); emit_insn (rec_seq); } + /* The two halves should (by induction) be individually endian-correct. + However, in the memory layout provided by VALS, the nth element of + HALVES[0] comes immediately before the nth element HALVES[1]. + This means that, on big-endian targets, the nth element of HALVES[0] + is more significant than the nth element HALVES[1]. */ + if (BYTES_BIG_ENDIAN) + std::swap (halves[0], halves[1]); rtvec v = gen_rtvec (2, halves[0], halves[1]); rtx_insn *zip1_insn = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); @@ -24560,8 +24982,7 @@ aarch64_expand_vector_init (rtx target, rtx vals) = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1]; seq_total_cost += insn_cost (zip1_insn, !optimize_size); - rtx_insn *seq = get_insns (); - end_sequence (); + rtx_insn *seq = end_sequence (); start_sequence (); aarch64_expand_vector_init_fallback (target, vals); @@ -25014,20 +25435,41 @@ aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global) return (global ? 
DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type; } +/* Return true if function declaration FNDECL needs to be marked as + having a variant PCS. */ + +static bool +aarch64_is_variant_pcs (tree fndecl) +{ + /* Check for ABIs that preserve more registers than usual. */ + arm_pcs pcs = (arm_pcs) fndecl_abi (fndecl).id (); + if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE) + return true; + + /* Check for ABIs that allow PSTATE.SM to be 1 on entry. */ + tree fntype = TREE_TYPE (fndecl); + if (aarch64_fntype_pstate_sm (fntype) != AARCH64_ISA_MODE_SM_OFF) + return true; + + /* Check for ABIs that require PSTATE.ZA to be 1 on entry, either because + of ZA or ZT0. */ + if (aarch64_fntype_pstate_za (fntype) != 0) + return true; + + return false; +} + /* Output .variant_pcs for aarch64_vector_pcs function symbols. */ static void aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name) { - if (TREE_CODE (decl) == FUNCTION_DECL) + if (TREE_CODE (decl) == FUNCTION_DECL + && aarch64_is_variant_pcs (decl)) { - arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id (); - if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE) - { - fprintf (stream, "\t.variant_pcs\t"); - assemble_name (stream, name); - fprintf (stream, "\n"); - } + fprintf (stream, "\t.variant_pcs\t"); + assemble_name (stream, name); + fprintf (stream, "\n"); } } @@ -25191,7 +25633,6 @@ aarch64_start_file (void) } /* Emit load exclusive. */ - static void aarch64_emit_load_exclusive (machine_mode mode, rtx rval, rtx mem, rtx model_rtx) @@ -25642,6 +26083,26 @@ aarch64_float_const_representable_p (rtx x) return aarch64_real_float_const_representable_p (r); } +/* Returns the string with the fmov instruction which is equivalent to an and + instruction with the SIMD immediate CONST_VECTOR. */ +char* +aarch64_output_fmov (rtx const_vector) +{ + bool is_valid; + static char templ[40]; + char element_char; + unsigned int elt_bitsize; + + is_valid = aarch64_simd_valid_and_imm_fmov (const_vector, &elt_bitsize); + gcc_assert (is_valid); + + element_char = sizetochar (elt_bitsize); + snprintf (templ, sizeof (templ), "fmov\t%%%c0, %%%c1", element_char, + element_char); + + return templ; +} + /* Returns the string with the instruction for the SIMD immediate * CONST_VECTOR of MODE and WIDTH. WHICH selects a move, and(bic) or orr. */ char* @@ -26191,6 +26652,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d) newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL; newd.testing_p = d->testing_p; newd.one_vector_p = d->one_vector_p; + newd.zero_op0_p = d->zero_op0_p; + newd.zero_op1_p = d->zero_op1_p; newd.perm.new_vector (newpermindices.encoding (), newd.one_vector_p ? 1 : 2, newpermindices.nelts_per_input ()); @@ -26448,7 +26911,6 @@ aarch64_evpc_hvla (struct expand_vec_perm_d *d) machine_mode vmode = d->vmode; if (!TARGET_SVE2p1 || !TARGET_NON_STREAMING - || BYTES_BIG_ENDIAN || d->vec_flags != VEC_SVE_DATA || GET_MODE_UNIT_BITSIZE (vmode) > 64) return false; @@ -26608,12 +27070,23 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d) static bool aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) { - unsigned HOST_WIDE_INT nelt; + if (!d->one_vector_p) + { + /* aarch64_expand_sve_vec_perm does not yet handle variable-length + vectors. */ + if (!d->perm.length ().is_constant ()) + return false; - /* Permuting two variable-length vectors could overflow the - index range. 
*/ - if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt)) - return false; + /* This permutation reduces to the vec_perm optab if the elements are + large enough to hold all selector indices. Do not handle that case + here, since the general TBL+SUB+TBL+ORR sequence is too expensive to + be considered a "native" constant permutation. + + Not doing this would undermine code that queries can_vec_perm_const_p + with allow_variable_p set to false. See PR121027. */ + if (selector_fits_mode_p (d->vmode, d->perm)) + return false; + } if (d->testing_p) return true; @@ -26774,6 +27247,40 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d) return true; } +/* Recognize patterns suitable for the AND instructions. */ +static bool +aarch64_evpc_and (struct expand_vec_perm_d *d) +{ + /* Either d->op0 or d->op1 should be a vector of all zeros. */ + if (d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p)) + return false; + + machine_mode mode = d->vmode; + machine_mode sel_mode; + if (!related_int_vector_mode (mode).exists (&sel_mode)) + return false; + + insn_code and_code = optab_handler (and_optab, sel_mode); + rtx and_mask = vec_perm_and_mask (sel_mode, d->perm, d->zero_op0_p); + if (and_code == CODE_FOR_nothing || !and_mask) + return false; + + if (d->testing_p) + return true; + + class expand_operand ops[3]; + rtx in = d->zero_op0_p ? d->op1 : d->op0; + create_output_operand (&ops[0], gen_lowpart (sel_mode, d->target), sel_mode); + create_input_operand (&ops[1], gen_lowpart (sel_mode, in), sel_mode); + create_input_operand (&ops[2], and_mask, sel_mode); + expand_insn (and_code, 3, ops); + rtx result = gen_lowpart (mode, ops[0].value); + if (!rtx_equal_p (d->target, result)) + emit_move_insn (d->target, result); + + return true; +} + static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) { @@ -26809,6 +27316,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) return true; else if (aarch64_evpc_uzp (d)) return true; + else if (aarch64_evpc_and (d)) + return true; else if (aarch64_evpc_trn (d)) return true; else if (aarch64_evpc_sel (d)) @@ -26869,11 +27378,17 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, d.op_mode = op_mode; d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode); d.target = target; - d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX; + d.op0 = op0; + if (d.op0 && !register_operand (d.op0, op_mode)) + d.op0 = force_reg (op_mode, d.op0); if (op0 && d.one_vector_p) d.op1 = copy_rtx (d.op0); else - d.op1 = op1 ? 
force_reg (op_mode, op1) : NULL_RTX; + { + d.op1 = op1; + if (d.op1 && !register_operand (d.op1, op_mode)) + d.op1 = force_reg (op_mode, d.op1); + } d.testing_p = !target; if (!d.testing_p) @@ -26961,7 +27476,7 @@ aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred, bool known_ptrue_p, rtx op0, rtx op1) { rtx flag = gen_int_mode (known_ptrue_p, SImode); - rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred), + rtx unspec = gen_rtx_UNSPEC (GET_MODE (target), gen_rtvec (4, pred, flag, op0, op1), aarch64_unspec_cond_code (code)); emit_set_insn (target, unspec); @@ -26980,10 +27495,10 @@ static void aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2, rtx pred, bool known_ptrue_p, rtx op0, rtx op1) { - machine_mode pred_mode = GET_MODE (pred); - rtx tmp1 = gen_reg_rtx (pred_mode); + machine_mode target_mode = GET_MODE (target); + rtx tmp1 = gen_reg_rtx (target_mode); aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1); - rtx tmp2 = gen_reg_rtx (pred_mode); + rtx tmp2 = gen_reg_rtx (target_mode); aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1); aarch64_emit_binop (target, ior_optab, tmp1, tmp2); } @@ -27000,8 +27515,7 @@ static void aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred, bool known_ptrue_p, rtx op0, rtx op1) { - machine_mode pred_mode = GET_MODE (pred); - rtx tmp = gen_reg_rtx (pred_mode); + rtx tmp = gen_reg_rtx (GET_MODE (target)); aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1); aarch64_emit_unop (target, one_cmpl_optab, tmp); } @@ -27013,10 +27527,25 @@ aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred, void aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1) { - machine_mode pred_mode = GET_MODE (target); machine_mode data_mode = GET_MODE (op0); + rtx pred = aarch64_sve_fp_pred (data_mode, nullptr); - rtx ptrue = aarch64_ptrue_reg (pred_mode); + /* The governing and destination modes. */ + machine_mode pred_mode = GET_MODE (pred); + machine_mode target_mode = GET_MODE (target); + + /* For partial vector modes, the choice of predicate mode depends + on whether we need to suppress exceptions for inactive elements. + If we do need to suppress exceptions, the predicate mode matches + the element size rather than the container size and the predicate + marks the upper bits in each container as inactive. The predicate + is then a ptrue wrt TARGET_MODE but not wrt PRED_MODE. It is the + latter which matters here. + + If we don't need to suppress exceptions, the predicate mode matches + the container size, PRED_MODE == TARGET_MODE, and the predicate is + thus a ptrue wrt both TARGET_MODE and PRED_MODE. */ + bool known_ptrue_p = pred_mode == target_mode; switch (code) { case UNORDERED: @@ -27030,12 +27559,13 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1) case EQ: case NE: /* There is native support for the comparison. */ - aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1); + aarch64_emit_sve_fp_cond (target, code, pred, known_ptrue_p, op0, op1); return; case LTGT: /* This is a trapping operation (LT or GT). */ - aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1); + aarch64_emit_sve_or_fp_conds (target, LT, GT, + pred, known_ptrue_p, op0, op1); return; case UNEQ: @@ -27044,7 +27574,7 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1) /* This would trap for signaling NaNs. 
*/ op1 = force_reg (data_mode, op1); aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ, - ptrue, true, op0, op1); + pred, known_ptrue_p, op0, op1); return; } /* fall through */ @@ -27054,11 +27584,19 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1) case UNGE: if (flag_trapping_math) { - /* Work out which elements are ordered. */ - rtx ordered = gen_reg_rtx (pred_mode); op1 = force_reg (data_mode, op1); - aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED, - ptrue, true, op0, op1); + + /* Work out which elements are unordered. */ + rtx uo_tmp = gen_reg_rtx (target_mode); + aarch64_emit_sve_fp_cond (uo_tmp, UNORDERED, + pred, known_ptrue_p, op0, op1); + + /* Invert the result. Governered by PRED so that we only + flip the active bits. */ + rtx ordered = gen_reg_rtx (pred_mode); + uo_tmp = gen_lowpart (pred_mode, uo_tmp); + emit_insn (gen_aarch64_pred_one_cmpl_z (pred_mode, ordered, + pred, uo_tmp)); /* Test the opposite condition for the ordered elements, then invert the result. */ @@ -27083,7 +27621,8 @@ aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1) /* There is native support for the inverse comparison. */ code = reverse_condition_maybe_unordered (code); - aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1); + aarch64_emit_sve_invert_fp_cond (target, code, + pred, known_ptrue_p, op0, op1); } /* Return true if: @@ -27688,8 +28227,7 @@ aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq, end_sequence (); return NULL_RTX; } - *prep_seq = get_insns (); - end_sequence (); + *prep_seq = end_sequence (); create_fixed_operand (&ops[0], op0); create_fixed_operand (&ops[1], op1); @@ -27700,8 +28238,7 @@ aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq, end_sequence (); return NULL_RTX; } - *gen_seq = get_insns (); - end_sequence (); + *gen_seq = end_sequence (); return gen_rtx_fmt_ee (code, cc_mode, gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx); @@ -27765,8 +28302,7 @@ aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, end_sequence (); return NULL_RTX; } - *prep_seq = get_insns (); - end_sequence (); + *prep_seq = end_sequence (); target = gen_rtx_REG (cc_mode, CC_REGNUM); aarch64_cond = aarch64_get_condition_code_1 (cc_mode, cmp_code); @@ -27805,8 +28341,7 @@ aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, return NULL_RTX; } - *gen_seq = get_insns (); - end_sequence (); + *gen_seq = end_sequence (); return gen_rtx_fmt_ee (cmp_code, VOIDmode, target, const0_rtx); } @@ -29762,60 +30297,43 @@ aarch64_can_tag_addresses () /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE section at the end if needed. */ -#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 -#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) -#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1) -#define GNU_PROPERTY_AARCH64_FEATURE_1_GCS (1U << 2) void aarch64_file_end_indicate_exec_stack () { file_end_indicate_exec_stack (); - unsigned feature_1_and = 0; - if (aarch_bti_enabled ()) - feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI; - - if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE) - feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC; + /* Check whether the current assembler supports AEABI build attributes, if + not fallback to .note.gnu.property section. 
*/ + if (HAVE_AS_AEABI_BUILD_ATTRIBUTES) + { + using namespace aarch64; + aeabi_subsection<BA_TagFeature_t, bool, 3> + aeabi_subsec ("aeabi_feature_and_bits", true); - if (aarch64_gcs_enabled ()) - feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_GCS; + aeabi_subsec.append ( + make_aeabi_attribute (Tag_Feature_BTI, aarch_bti_enabled ())); + aeabi_subsec.append ( + make_aeabi_attribute (Tag_Feature_PAC, aarch64_pacret_enabled ())); + aeabi_subsec.append ( + make_aeabi_attribute (Tag_Feature_GCS, aarch64_gcs_enabled ())); - if (feature_1_and) + if (!aeabi_subsec.empty ()) + aeabi_subsec.write (asm_out_file); + } + else { - /* Generate .note.gnu.property section. */ - switch_to_section (get_section (".note.gnu.property", - SECTION_NOTYPE, NULL)); + aarch64::section_note_gnu_property gnu_properties; - /* PT_NOTE header: namesz, descsz, type. - namesz = 4 ("GNU\0") - descsz = 16 (Size of the program property array) - [(12 + padding) * Number of array elements] - type = 5 (NT_GNU_PROPERTY_TYPE_0). */ - assemble_align (POINTER_SIZE); - assemble_integer (GEN_INT (4), 4, 32, 1); - assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1); - assemble_integer (GEN_INT (5), 4, 32, 1); - - /* PT_NOTE name. */ - assemble_string ("GNU", 4); - - /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0: - type = GNU_PROPERTY_AARCH64_FEATURE_1_AND - datasz = 4 - data = feature_1_and. */ - assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1); - assemble_integer (GEN_INT (4), 4, 32, 1); - assemble_integer (GEN_INT (feature_1_and), 4, 32, 1); - - /* Pad the size of the note to the required alignment. */ - assemble_align (POINTER_SIZE); + if (aarch_bti_enabled ()) + gnu_properties.bti_enabled (); + if (aarch64_pacret_enabled ()) + gnu_properties.pac_enabled (); + if (aarch64_gcs_enabled ()) + gnu_properties.gcs_enabled (); + + gnu_properties.write (); } } -#undef GNU_PROPERTY_AARCH64_FEATURE_1_GCS -#undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC -#undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI -#undef GNU_PROPERTY_AARCH64_FEATURE_1_AND /* Helper function for straight line speculation. Return what barrier should be emitted for straight line speculation @@ -30391,8 +30909,7 @@ aarch64_mode_emit (int entity, int mode, int prev_mode, HARD_REG_SET live) aarch64_local_sme_state (prev_mode)); break; } - rtx_insn *seq = get_insns (); - end_sequence (); + rtx_insn *seq = end_sequence (); /* Get the set of clobbered registers that are currently live. */ HARD_REG_SET clobbers = {}; @@ -30802,8 +31319,7 @@ aarch64_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs, emit_insn (REGNO (x) == ZA_REGNUM ? gen_aarch64_asm_update_za (id_rtx) : gen_aarch64_asm_update_zt0 (id_rtx)); - seq = get_insns (); - end_sequence (); + seq = end_sequence (); auto mode = REGNO (x) == ZA_REGNUM ? 
VNx16QImode : V8DImode; uses.safe_push (gen_rtx_REG (mode, REGNO (x))); @@ -30838,8 +31354,7 @@ aarch64_switch_pstate_sm_for_landing_pad (basic_block bb) args_switch.emit_epilogue (); if (guard_label) emit_label (guard_label); - auto seq = get_insns (); - end_sequence (); + auto seq = end_sequence (); emit_insn_after (seq, bb_note (bb)); return true; @@ -30862,8 +31377,7 @@ aarch64_switch_pstate_sm_for_jump (rtx_insn *jump) aarch64_switch_pstate_sm (AARCH64_ISA_MODE_SM_ON, AARCH64_ISA_MODE_SM_OFF); if (guard_label) emit_label (guard_label); - auto seq = get_insns (); - end_sequence (); + auto seq = end_sequence (); emit_insn_before (seq, jump); return true; @@ -30897,8 +31411,7 @@ aarch64_switch_pstate_sm_for_call (rtx_call_insn *call) args_switch.emit_epilogue (); if (args_guard_label) emit_label (args_guard_label); - auto args_seq = get_insns (); - end_sequence (); + auto args_seq = end_sequence (); emit_insn_before (args_seq, call); if (find_reg_note (call, REG_NORETURN, NULL_RTX)) @@ -30918,8 +31431,7 @@ aarch64_switch_pstate_sm_for_call (rtx_call_insn *call) return_switch.emit_epilogue (); if (return_guard_label) emit_label (return_guard_label); - auto result_seq = get_insns (); - end_sequence (); + auto result_seq = end_sequence (); emit_insn_after (result_seq, call); return true; } @@ -31073,8 +31585,6 @@ aarch64_valid_sysreg_name_p (const char *regname) const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname); if (sysreg == NULL) return aarch64_is_implem_def_reg (regname); - if (sysreg->arch_reqs) - return bool (aarch64_isa_flags & sysreg->arch_reqs); return true; } @@ -31098,8 +31608,6 @@ aarch64_retrieve_sysreg (const char *regname, bool write_p, bool is128op) if ((write_p && (sysreg->properties & F_REG_READ)) || (!write_p && (sysreg->properties & F_REG_WRITE))) return NULL; - if ((~aarch64_isa_flags & sysreg->arch_reqs) != 0) - return NULL; return sysreg->encoding; } @@ -31298,6 +31806,79 @@ aarch64_expand_reversed_crc_using_pmull (scalar_mode crc_mode, } } +/* Expand the spaceship optab for floating-point operands. + + If the result is compared against (-1, 0, 1 , 2), expand into + fcmpe + conditional branch insns. + + Otherwise (the result is just stored as an integer), expand into + fcmpe + a sequence of conditional select/increment/invert insns. */ +void +aarch64_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx hint) +{ + rtx cc_reg = gen_rtx_REG (CCFPEmode, CC_REGNUM); + emit_set_insn (cc_reg, gen_rtx_COMPARE (CCFPEmode, op0, op1)); + + rtx cc_gt = gen_rtx_GT (VOIDmode, cc_reg, const0_rtx); + rtx cc_lt = gen_rtx_LT (VOIDmode, cc_reg, const0_rtx); + rtx cc_un = gen_rtx_UNORDERED (VOIDmode, cc_reg, const0_rtx); + + if (hint == const0_rtx) + { + rtx un_label = gen_label_rtx (); + rtx lt_label = gen_label_rtx (); + rtx gt_label = gen_label_rtx (); + rtx end_label = gen_label_rtx (); + + rtx temp = gen_rtx_IF_THEN_ELSE (VOIDmode, cc_un, + gen_rtx_LABEL_REF (Pmode, un_label), pc_rtx); + aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, temp)); + + temp = gen_rtx_IF_THEN_ELSE (VOIDmode, cc_lt, + gen_rtx_LABEL_REF (Pmode, lt_label), pc_rtx); + emit_jump_insn (gen_rtx_SET (pc_rtx, temp)); + + temp = gen_rtx_IF_THEN_ELSE (VOIDmode, cc_gt, + gen_rtx_LABEL_REF (Pmode, gt_label), pc_rtx); + emit_jump_insn (gen_rtx_SET (pc_rtx, temp)); + + /* Equality. 
*/ + emit_move_insn (dest, const0_rtx); + emit_jump (end_label); + + emit_label (un_label); + emit_move_insn (dest, const2_rtx); + emit_jump (end_label); + + emit_label (gt_label); + emit_move_insn (dest, const1_rtx); + emit_jump (end_label); + + emit_label (lt_label); + emit_move_insn (dest, constm1_rtx); + + emit_label (end_label); + } + else + { + rtx temp0 = gen_reg_rtx (SImode); + rtx temp1 = gen_reg_rtx (SImode); + rtx cc_ungt = gen_rtx_UNGT (VOIDmode, cc_reg, const0_rtx); + + /* The value of hint is stored if the operands are unordered. */ + rtx temp_un = gen_int_mode (UINTVAL (hint) - 1, SImode); + if (!aarch64_reg_zero_or_m1_or_1 (temp_un, SImode)) + temp_un = force_reg (SImode, temp_un); + + emit_set_insn (temp0, gen_rtx_IF_THEN_ELSE (SImode, cc_lt, + constm1_rtx, const0_rtx)); + emit_set_insn (temp1, gen_rtx_IF_THEN_ELSE (SImode, cc_un, + temp_un, const0_rtx)); + emit_set_insn (dest, gen_rtx_IF_THEN_ELSE (SImode, cc_ungt, + gen_rtx_PLUS (SImode, temp1, const1_rtx), temp0)); + } +} + /* Target-specific selftests. */ #if CHECKING_P @@ -31472,9 +32053,43 @@ aarch64_test_sysreg_encoding_clashes (void) static void aarch64_test_sve_folding () { + aarch64_target_switcher switcher (AARCH64_FL_SVE); + tree res = fold_unary (BIT_NOT_EXPR, ssizetype, ssize_int (poly_int64 (1, 1))); ASSERT_TRUE (operand_equal_p (res, ssize_int (poly_int64 (-2, -1)))); + + auto build_v16bi = [](bool a, bool b) + { + rtx_vector_builder builder (VNx16BImode, 2, 1); + builder.quick_push (a ? const1_rtx : const0_rtx); + builder.quick_push (b ? const1_rtx : const0_rtx); + return builder.build (); + }; + rtx v16bi_10 = build_v16bi (1, 0); + rtx v16bi_01 = build_v16bi (0, 1); + + for (auto mode : { VNx8BImode, VNx4BImode, VNx2BImode }) + { + rtx reg = gen_rtx_REG (mode, LAST_VIRTUAL_REGISTER + 1); + rtx subreg = lowpart_subreg (VNx16BImode, reg, mode); + rtx and1 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_10); + ASSERT_EQ (lowpart_subreg (mode, and1, VNx16BImode), reg); + rtx and0 = simplify_gen_binary (AND, VNx16BImode, subreg, v16bi_01); + ASSERT_EQ (lowpart_subreg (mode, and0, VNx16BImode), CONST0_RTX (mode)); + + rtx ior1 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_10); + ASSERT_EQ (lowpart_subreg (mode, ior1, VNx16BImode), CONSTM1_RTX (mode)); + rtx ior0 = simplify_gen_binary (IOR, VNx16BImode, subreg, v16bi_01); + ASSERT_EQ (lowpart_subreg (mode, ior0, VNx16BImode), reg); + + rtx xor1 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_10); + ASSERT_RTX_EQ (lowpart_subreg (mode, xor1, VNx16BImode), + lowpart_subreg (mode, gen_rtx_NOT (VNx16BImode, subreg), + VNx16BImode)); + rtx xor0 = simplify_gen_binary (XOR, VNx16BImode, subreg, v16bi_01); + ASSERT_EQ (lowpart_subreg (mode, xor0, VNx16BImode), reg); + } } /* Run all target-specific selftests. */ |