Diffstat (limited to 'gcc/config')
380 files changed, 17126 insertions, 14376 deletions
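Editor's note: the broadest cross-cutting change in the diff below is that aarch64_check_required_extensions no longer takes a single aarch64_feature_flags mask but an aarch64_required_extensions pair (added in aarch64-protos.h), which records separate requirements for SME streaming and non-streaming mode. What follows is a minimal, self-contained C++ sketch of that scheme, assuming simplified flag values and a boolean result in place of the error_at diagnostics; it models the idea only and is not the GCC implementation.

  #include <cstdint>
  #include <iostream>

  typedef std::uint64_t feature_flags;

  /* Hypothetical stand-ins for the AARCH64_FL_* bits.  */
  constexpr feature_flags FL_SM_OFF   = 1ull << 0; /* non-streaming mode itself */
  constexpr feature_flags FL_SM_ON    = 1ull << 1; /* streaming mode itself */
  constexpr feature_flags FL_FAMINMAX = 1ull << 2;

  /* Same shape as aarch64_required_extensions: one mask per SM state,
     with 0 meaning "not allowed in that state".  */
  struct required_extensions
  {
    static constexpr required_extensions
    nonstreaming_only (feature_flags flags)
    { return { FL_SM_OFF | flags, 0 }; }

    static constexpr required_extensions
    streaming_only (feature_flags flags)
    { return { 0, FL_SM_ON | flags }; }

    static constexpr required_extensions
    streaming_compatible (feature_flags flags)
    { return { FL_SM_OFF | flags, FL_SM_ON | flags }; }

    feature_flags sm_off; /* requirements outside streaming mode */
    feature_flags sm_on;  /* requirements inside streaming mode */
  };

  /* Simplified check: pick the mask for the current SM state, reject if it
     is zero, then require every remaining feature bit to be enabled.  */
  static bool
  check_required (required_extensions req, feature_flags enabled,
		  bool streaming_mode)
  {
    feature_flags needed = streaming_mode ? req.sm_on : req.sm_off;
    if (needed == 0)
      return false;
    return ((needed & ~(FL_SM_OFF | FL_SM_ON)) & ~enabled) == 0;
  }

  int
  main ()
  {
    /* A vamaxq_f32-style pragma builtin: non-streaming only, needs FAMINMAX.  */
    auto req = required_extensions::nonstreaming_only (FL_FAMINMAX);
    std::cout << check_required (req, FL_FAMINMAX, false) << "\n"; /* 1 */
    std::cout << check_required (req, FL_FAMINMAX, true) << "\n";  /* 0 */
    return 0;
  }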
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index eb878b9..97bde7c 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -133,6 +133,7 @@ #define MODE_d_f16 E_V4HFmode #define MODE_d_f32 E_V2SFmode #define MODE_d_f64 E_V1DFmode +#define MODE_d_mf8 E_V8QImode #define MODE_d_s8 E_V8QImode #define MODE_d_s16 E_V4HImode #define MODE_d_s32 E_V2SImode @@ -148,6 +149,7 @@ #define MODE_q_f16 E_V8HFmode #define MODE_q_f32 E_V4SFmode #define MODE_q_f64 E_V2DFmode +#define MODE_q_mf8 E_V16QImode #define MODE_q_s8 E_V16QImode #define MODE_q_s16 E_V8HImode #define MODE_q_s32 E_V4SImode @@ -177,6 +179,7 @@ #define QUAL_p16 qualifier_poly #define QUAL_p64 qualifier_poly #define QUAL_p128 qualifier_poly +#define QUAL_mf8 qualifier_modal_float #define LENGTH_d "" #define LENGTH_q "q" @@ -458,6 +461,19 @@ aarch64_types_storestruct_lane_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] qualifier_poly, qualifier_struct_load_store_lane_index }; #define TYPES_STORESTRUCT_LANE_P (aarch64_types_storestruct_lane_p_qualifiers) +constexpr insn_code CODE_FOR_aarch64_sdot_prodv8qi + = CODE_FOR_sdot_prodv2siv8qi; +constexpr insn_code CODE_FOR_aarch64_udot_prodv8qi + = CODE_FOR_udot_prodv2siv8qi; +constexpr insn_code CODE_FOR_aarch64_usdot_prodv8qi + = CODE_FOR_usdot_prodv2siv8qi; +constexpr insn_code CODE_FOR_aarch64_sdot_prodv16qi + = CODE_FOR_sdot_prodv4siv16qi; +constexpr insn_code CODE_FOR_aarch64_udot_prodv16qi + = CODE_FOR_udot_prodv4siv16qi; +constexpr insn_code CODE_FOR_aarch64_usdot_prodv16qi + = CODE_FOR_usdot_prodv4siv16qi; + #define CF0(N, X) CODE_FOR_aarch64_##N##X #define CF1(N, X) CODE_FOR_##N##X##1 #define CF2(N, X) CODE_FOR_##N##X##2 @@ -585,6 +601,7 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { /* vreinterpret intrinsics are defined for any pair of element types. { _bf16 } { _bf16 } { _f16 _f32 _f64 } { _f16 _f32 _f64 } + { _mf8 } { _mf8 } { _s8 _s16 _s32 _s64 } x { _s8 _s16 _s32 _s64 } { _u8 _u16 _u32 _u64 } { _u8 _u16 _u32 _u64 } { _p8 _p16 _p64 } { _p8 _p16 _p64 }. */ @@ -596,6 +613,7 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { VREINTERPRET_BUILTIN2 (A, f16) \ VREINTERPRET_BUILTIN2 (A, f32) \ VREINTERPRET_BUILTIN2 (A, f64) \ + VREINTERPRET_BUILTIN2 (A, mf8) \ VREINTERPRET_BUILTIN2 (A, s8) \ VREINTERPRET_BUILTIN2 (A, s16) \ VREINTERPRET_BUILTIN2 (A, s32) \ @@ -613,6 +631,7 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { VREINTERPRET_BUILTINS1 (f16) \ VREINTERPRET_BUILTINS1 (f32) \ VREINTERPRET_BUILTINS1 (f64) \ + VREINTERPRET_BUILTINS1 (mf8) \ VREINTERPRET_BUILTINS1 (s8) \ VREINTERPRET_BUILTINS1 (s16) \ VREINTERPRET_BUILTINS1 (s32) \ @@ -628,6 +647,7 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { /* vreinterpretq intrinsics are additionally defined for p128. { _bf16 } { _bf16 } { _f16 _f32 _f64 } { _f16 _f32 _f64 } + { _mf8 } { _mf8 } { _s8 _s16 _s32 _s64 } x { _s8 _s16 _s32 _s64 } { _u8 _u16 _u32 _u64 } { _u8 _u16 _u32 _u64 } { _p8 _p16 _p64 _p128 } { _p8 _p16 _p64 _p128 }. 
*/ @@ -639,6 +659,7 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { VREINTERPRETQ_BUILTIN2 (A, f16) \ VREINTERPRETQ_BUILTIN2 (A, f32) \ VREINTERPRETQ_BUILTIN2 (A, f64) \ + VREINTERPRETQ_BUILTIN2 (A, mf8) \ VREINTERPRETQ_BUILTIN2 (A, s8) \ VREINTERPRETQ_BUILTIN2 (A, s16) \ VREINTERPRETQ_BUILTIN2 (A, s32) \ @@ -657,6 +678,7 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { VREINTERPRETQ_BUILTINS1 (f16) \ VREINTERPRETQ_BUILTINS1 (f32) \ VREINTERPRETQ_BUILTINS1 (f64) \ + VREINTERPRETQ_BUILTINS1 (mf8) \ VREINTERPRETQ_BUILTINS1 (s8) \ VREINTERPRETQ_BUILTINS1 (s16) \ VREINTERPRETQ_BUILTINS1 (s32) \ @@ -757,6 +779,10 @@ typedef struct #define VAR1(T, N, MAP, FLAG, A) \ AARCH64_SIMD_BUILTIN_##T##_##N##A, +#undef ENTRY +#define ENTRY(N, S, M, U) \ + AARCH64_##N, + enum aarch64_builtins { AARCH64_BUILTIN_MIN, @@ -829,6 +855,10 @@ enum aarch64_builtins AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL, + /* Pragma builtins. */ + AARCH64_PRAGMA_BUILTIN_START, +#include "aarch64-simd-pragma-builtins.def" + AARCH64_PRAGMA_BUILTIN_END, /* System register builtins. */ AARCH64_RSR, AARCH64_RSRP, @@ -947,6 +977,7 @@ const char *aarch64_scalar_builtin_types[] = { extern GTY(()) aarch64_simd_type_info aarch64_simd_types[]; +#undef ENTRY #define ENTRY(E, M, Q, G) \ {E, "__" #E, #G "__" #E, NULL_TREE, NULL_TREE, E_##M##mode, qualifier_##Q}, struct aarch64_simd_type_info aarch64_simd_types [] = { @@ -961,6 +992,11 @@ static GTY(()) tree aarch64_simd_intOI_type_node = NULL_TREE; static GTY(()) tree aarch64_simd_intCI_type_node = NULL_TREE; static GTY(()) tree aarch64_simd_intXI_type_node = NULL_TREE; +/* The user-visible __mfp8 type, and a pointer to that type. Used + across the back-end. */ +tree aarch64_mfp8_type_node = NULL_TREE; +tree aarch64_mfp8_ptr_type_node = NULL_TREE; + /* The user-visible __fp16 type, and a pointer to that type. Used across the back-end. */ tree aarch64_fp16_type_node = NULL_TREE; @@ -1082,7 +1118,8 @@ aarch64_lookup_simd_type_in_table (machine_mode mode, { int i; int nelts = ARRAY_SIZE (aarch64_simd_types); - int q = qualifiers & (qualifier_poly | qualifier_unsigned); + int q = qualifiers + & (qualifier_poly | qualifier_unsigned | qualifier_modal_float); for (i = 0; i < nelts; i++) { @@ -1126,7 +1163,7 @@ aarch64_simd_builtin_type (machine_mode mode, return type; } - + static void aarch64_init_simd_builtin_types (void) { @@ -1185,6 +1222,10 @@ aarch64_init_simd_builtin_types (void) aarch64_simd_types[Bfloat16x4_t].eltype = bfloat16_type_node; aarch64_simd_types[Bfloat16x8_t].eltype = bfloat16_type_node; + /* Init FP8 element types. */ + aarch64_simd_types[Mfloat8x8_t].eltype = aarch64_mfp8_type_node; + aarch64_simd_types[Mfloat8x16_t].eltype = aarch64_mfp8_type_node; + for (i = 0; i < nelts; i++) { tree eltype = aarch64_simd_types[i].eltype; @@ -1547,6 +1588,71 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) } } +enum class aarch64_builtin_signatures +{ + binary, +}; + +#undef ENTRY +#define ENTRY(N, S, M, U) \ + {#N, aarch64_builtin_signatures::S, E_##M##mode, U, \ + aarch64_required_extensions::REQUIRED_EXTENSIONS}, + +/* Initialize pragma builtins. 
*/ + +struct aarch64_pragma_builtins_data +{ + const char *name; + aarch64_builtin_signatures signature; + machine_mode mode; + int unspec; + aarch64_required_extensions required_extensions; +}; + +static aarch64_pragma_builtins_data aarch64_pragma_builtins[] = { +#include "aarch64-simd-pragma-builtins.def" +}; + +static tree +aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data) +{ + auto type = aarch64_simd_builtin_type (builtin_data.mode, qualifier_none); + switch (builtin_data.signature) + { + case aarch64_builtin_signatures::binary: + return build_function_type_list (type, type, type, NULL_TREE); + default: + gcc_unreachable (); + } +} + +static void +aarch64_init_pragma_builtins () +{ + for (size_t i = 0; i < ARRAY_SIZE (aarch64_pragma_builtins); ++i) + { + auto data = aarch64_pragma_builtins[i]; + auto fntype = aarch64_fntype (data); + auto code = AARCH64_PRAGMA_BUILTIN_START + i + 1; + aarch64_builtin_decls[code] + = aarch64_general_simulate_builtin (data.name, fntype, code); + } +} + +/* If the builtin function with code CODE has an entry in + aarch64_pragma_builtins, return its entry, otherwise return null. */ + +static const aarch64_pragma_builtins_data* +aarch64_get_pragma_builtin (int code) +{ + if (!(code > AARCH64_PRAGMA_BUILTIN_START + && code < AARCH64_PRAGMA_BUILTIN_END)) + return NULL; + + auto idx = code - (AARCH64_PRAGMA_BUILTIN_START + 1); + return &aarch64_pragma_builtins[idx]; +} + /* Register the tuple type that contains NUM_VECTORS of the AdvSIMD type indexed by TYPE_INDEX. */ static void @@ -1640,6 +1746,7 @@ handle_arm_neon_h (void) aarch64_init_simd_builtin_functions (true); aarch64_init_simd_intrinsics (); + aarch64_init_pragma_builtins (); } static void @@ -1721,6 +1828,19 @@ aarch64_init_builtin_rsqrt (void) } } +/* Initialize the backend type that supports the user-visible __mfp8 + type and its relative pointer type. */ + +static void +aarch64_init_fp8_types (void) +{ + aarch64_mfp8_type_node = make_unsigned_type (8); + SET_TYPE_MODE (aarch64_mfp8_type_node, QImode); + + lang_hooks.types.register_builtin_type (aarch64_mfp8_type_node, "__mfp8"); + aarch64_mfp8_ptr_type_node = build_pointer_type (aarch64_mfp8_type_node); +} + /* Initialize the backend types that support the user-visible __fp16 type, also initialize a pointer to that type, to be used when forming HFAs. */ @@ -2125,6 +2245,8 @@ aarch64_general_init_builtins (void) { aarch64_init_fpsr_fpcr_builtins (); + aarch64_init_fp8_types (); + aarch64_init_fp16_types (); aarch64_init_bf16_types (); @@ -2212,18 +2334,40 @@ aarch64_report_missing_registers (location_t location, tree fndecl) reported_missing_registers_p = true; } -/* Check whether all the AARCH64_FL_* values in REQUIRED_EXTENSIONS are - enabled, given that those extensions are required for function FNDECL. - Report an error against LOCATION if not. */ +/* Check whether the requirements in REQUIRED_EXTENSIONS are met, given that + those requirements come from calling function FNDECL. Report an error + against LOCATION if not. 
*/ bool aarch64_check_required_extensions (location_t location, tree fndecl, - aarch64_feature_flags required_extensions) + aarch64_required_extensions + required_extensions) { - if ((required_extensions & ~aarch64_isa_flags) == 0) - return true; + aarch64_feature_flags sm_state_extensions = 0; + if (!TARGET_STREAMING) + { + if (required_extensions.sm_off == 0) + { + error_at (location, "ACLE function %qD can only be called when" + " SME streaming mode is enabled", fndecl); + return false; + } + sm_state_extensions |= required_extensions.sm_off & ~AARCH64_FL_SM_OFF; + } + if (!TARGET_NON_STREAMING) + { + if (required_extensions.sm_on == 0) + { + error_at (location, "ACLE function %qD cannot be called when" + " SME streaming mode is enabled", fndecl); + return false; + } + sm_state_extensions |= required_extensions.sm_on & ~AARCH64_FL_SM_ON; + } - auto missing_extensions = required_extensions & ~aarch64_asm_isa_flags; + if ((sm_state_extensions & ~aarch64_isa_flags) == 0) + return true; + auto missing_extensions = sm_state_extensions & ~aarch64_asm_isa_flags; if (missing_extensions == 0) { /* All required extensions are enabled in aarch64_asm_isa_flags, so the @@ -2232,20 +2376,6 @@ aarch64_check_required_extensions (location_t location, tree fndecl, return false; } - if (missing_extensions & AARCH64_FL_SM_OFF) - { - error_at (location, "ACLE function %qD cannot be called when" - " SME streaming mode is enabled", fndecl); - return false; - } - - if (missing_extensions & AARCH64_FL_SM_ON) - { - error_at (location, "ACLE function %qD can only be called when" - " SME streaming mode is enabled", fndecl); - return false; - } - if (missing_extensions & AARCH64_FL_ZA_ON) { error_at (location, "ACLE function %qD can only be called from" @@ -2271,12 +2401,47 @@ aarch64_check_required_extensions (location_t location, tree fndecl, gcc_unreachable (); } +/* Return the ISA extensions required by function CODE. 
*/ +static aarch64_required_extensions +aarch64_general_required_extensions (unsigned int code) +{ + using ext = aarch64_required_extensions; + switch (code) + { + case AARCH64_TME_BUILTIN_TSTART: + case AARCH64_TME_BUILTIN_TCOMMIT: + case AARCH64_TME_BUILTIN_TTEST: + case AARCH64_TME_BUILTIN_TCANCEL: + return ext::streaming_compatible (AARCH64_FL_TME); + + case AARCH64_LS64_BUILTIN_LD64B: + case AARCH64_LS64_BUILTIN_ST64B: + case AARCH64_LS64_BUILTIN_ST64BV: + case AARCH64_LS64_BUILTIN_ST64BV0: + return ext::streaming_compatible (AARCH64_FL_LS64); + + default: + if (code >= AARCH64_MEMTAG_BUILTIN_START + && code <= AARCH64_MEMTAG_BUILTIN_END) + return ext::streaming_compatible (AARCH64_FL_MEMTAG); + + if (auto builtin_data = aarch64_get_pragma_builtin (code)) + return builtin_data->required_extensions; + } + return ext::streaming_compatible (0); +} + bool aarch64_general_check_builtin_call (location_t location, vec<location_t>, - unsigned int code, tree fndecl, - unsigned int nargs ATTRIBUTE_UNUSED, tree *args) + unsigned int code, tree fndecl, + unsigned int nargs ATTRIBUTE_UNUSED, + tree *args) { tree decl = aarch64_builtin_decls[code]; + auto required_extensions = aarch64_general_required_extensions (code); + if (!aarch64_check_required_extensions (location, decl, required_extensions)) + return false; + switch (code) { case AARCH64_RSR: @@ -2302,30 +2467,8 @@ aarch64_general_check_builtin_call (location_t location, vec<location_t>, } break; } - - case AARCH64_TME_BUILTIN_TSTART: - case AARCH64_TME_BUILTIN_TCOMMIT: - case AARCH64_TME_BUILTIN_TTEST: - case AARCH64_TME_BUILTIN_TCANCEL: - return aarch64_check_required_extensions (location, decl, - AARCH64_FL_TME); - - case AARCH64_LS64_BUILTIN_LD64B: - case AARCH64_LS64_BUILTIN_ST64B: - case AARCH64_LS64_BUILTIN_ST64BV: - case AARCH64_LS64_BUILTIN_ST64BV0: - return aarch64_check_required_extensions (location, decl, - AARCH64_FL_LS64); - - default: - break; } - if (code >= AARCH64_MEMTAG_BUILTIN_START - && code <= AARCH64_MEMTAG_BUILTIN_END) - return aarch64_check_required_extensions (location, decl, - AARCH64_FL_MEMTAG); - return true; } @@ -3189,6 +3332,25 @@ aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target) return ops[0].value; } +static rtx +aarch64_expand_pragma_builtin (tree exp, rtx target, + const aarch64_pragma_builtins_data *builtin_data) +{ + expand_operand ops[3]; + auto mode = builtin_data->mode; + auto op1 = expand_normal (CALL_EXPR_ARG (exp, 0)); + auto op2 = expand_normal (CALL_EXPR_ARG (exp, 1)); + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], op1, mode); + create_input_operand (&ops[2], op2, mode); + + auto unspec = builtin_data->unspec; + auto icode = code_for_aarch64 (unspec, mode); + expand_insn (icode, 3, ops); + + return target; +} + /* Expand an expression EXP as fpsr or fpcr setter (depending on UNSPEC) using MODE. 
*/ static void @@ -3369,6 +3531,9 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target, && fcode <= AARCH64_RBITLL) return aarch64_expand_builtin_data_intrinsic (fcode, exp, target); + if (auto builtin_data = aarch64_get_pragma_builtin (fcode)) + return aarch64_expand_pragma_builtin (exp, target, builtin_data); + gcc_unreachable (); } @@ -4021,6 +4186,7 @@ aarch64_resolve_overloaded_builtin_general (location_t loc, tree function, #undef CF3 #undef CF4 #undef CF10 +#undef ENTRY_VHSDF #undef VAR1 #undef VAR2 #undef VAR3 diff --git a/gcc/config/aarch64/aarch64-builtins.h b/gcc/config/aarch64/aarch64-builtins.h index e326fe6..00db7a7 100644 --- a/gcc/config/aarch64/aarch64-builtins.h +++ b/gcc/config/aarch64/aarch64-builtins.h @@ -54,6 +54,8 @@ enum aarch64_type_qualifiers /* Lane indices selected in quadtuplets. - must be in range, and flipped for bigendian. */ qualifier_lane_quadtup_index = 0x1000, + /* Modal FP types. */ + qualifier_modal_float = 0x2000, }; #define ENTRY(E, M, Q, G) E, diff --git a/gcc/config/aarch64/aarch64-cc-fusion.cc b/gcc/config/aarch64/aarch64-cc-fusion.cc index 3af8c00..3cae4c4 100644 --- a/gcc/config/aarch64/aarch64-cc-fusion.cc +++ b/gcc/config/aarch64/aarch64-cc-fusion.cc @@ -64,6 +64,7 @@ #define INCLUDE_ALGORITHM #define INCLUDE_FUNCTIONAL #define INCLUDE_ARRAY +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index cc22600..9d4abf2 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -132,6 +132,7 @@ AARCH64_CORE("octeontx2f95mm", octeontx2f95mm, cortexa57, V8_2A, (CRYPTO, PROFI /* Fujitsu ('F') cores. */ AARCH64_CORE("a64fx", a64fx, a64fx, V8_2A, (F16, SVE), a64fx, 0x46, 0x001, -1) +AARCH64_CORE("fujitsu-monaka", fujitsu_monaka, cortexa57, V9_3A, (F16, FP8, LS64, RNG, CRYPTO, SVE2_AES, SVE2_BITPERM, SVE2_SHA3, SVE2_SM4), fujitsu_monaka, 0x46, 0x003, -1) /* HiSilicon ('H') cores. */ AARCH64_CORE("tsv110", tsv110, tsv110, V8_2A, (CRYPTO, F16), tsv110, 0x48, 0xd01, -1) diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h index 7c79491..5e0c1f7 100644 --- a/gcc/config/aarch64/aarch64-cost-tables.h +++ b/gcc/config/aarch64/aarch64-cost-tables.h @@ -346,7 +346,7 @@ const struct cpu_cost_table thunderx2t99_extra_costs = } }; -const struct cpu_cost_table thunderx3t110_extra_costs = +const struct cpu_cost_table thunderx3t110_extra_costs = { /* ALU */ { diff --git a/gcc/config/aarch64/aarch64-early-ra.cc b/gcc/config/aarch64/aarch64-early-ra.cc index 5f269d0..bbd8468 100644 --- a/gcc/config/aarch64/aarch64-early-ra.cc +++ b/gcc/config/aarch64/aarch64-early-ra.cc @@ -40,6 +40,7 @@ #define INCLUDE_ALGORITHM #define INCLUDE_FUNCTIONAL +#define INCLUDE_MEMORY #define INCLUDE_ARRAY #include "config.h" #include "system.h" @@ -3389,6 +3390,12 @@ early_ra::is_dead_insn (rtx_insn *insn) if (side_effects_p (set)) return false; + /* If we can't delete dead exceptions and the insn throws, + then the instruction is not dead. 
*/ + if (!cfun->can_delete_dead_exceptions + && !insn_nothrow_p (insn)) + return false; + return true; } diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def index 6998627..8279f5a 100644 --- a/gcc/config/aarch64/aarch64-option-extensions.def +++ b/gcc/config/aarch64/aarch64-option-extensions.def @@ -234,6 +234,8 @@ AARCH64_OPT_EXTENSION("gcs", GCS, (), (), (), "gcs") AARCH64_OPT_EXTENSION("fp8", FP8, (SIMD), (), (), "fp8") +AARCH64_OPT_EXTENSION("faminmax", FAMINMAX, (SIMD), (), (), "faminmax") + #undef AARCH64_OPT_FMV_EXTENSION #undef AARCH64_OPT_EXTENSION #undef AARCH64_FMV_FEATURE diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index d03c1fe..e8588e1 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -627,7 +627,7 @@ struct aarch64_address_info { }; #define AARCH64_FUSION_PAIR(x, name) \ - AARCH64_FUSE_##name##_index, + AARCH64_FUSE_##name##_index, /* Supported fusion operations. */ enum aarch64_fusion_pairs_index { @@ -665,16 +665,6 @@ enum aarch64_extra_tuning_flags AARCH64_EXTRA_TUNE_ALL = (1u << AARCH64_EXTRA_TUNE_index_END) - 1 }; -/* Enum to distinguish which type of check is to be done in - aarch64_simd_valid_immediate. This is used as a bitmask where - AARCH64_CHECK_MOV has both bits set. Thus AARCH64_CHECK_MOV will - perform all checks. Adding new types would require changes accordingly. */ -enum simd_immediate_check { - AARCH64_CHECK_ORR = 1 << 0, - AARCH64_CHECK_BIC = 1 << 1, - AARCH64_CHECK_MOV = AARCH64_CHECK_ORR | AARCH64_CHECK_BIC -}; - extern struct tune_params aarch64_tune_params; /* The available SVE predicate patterns, known in the ACLE as "svpattern". */ @@ -754,6 +744,91 @@ private: bool m_old_general_regs_only; }; +/* Represents the ISA requirements of an intrinsic function, or of some + other similar operation. It stores separate feature flags for + non-streaming mode and for streaming-mode; both requirements must + be met in streaming-compatible mode. */ +struct aarch64_required_extensions +{ + /* Return a requirement that includes FLAGS on top of any existing + requirements. */ + inline CONSTEXPR aarch64_required_extensions + and_also (aarch64_feature_flags flags) + { + return { sm_off ? sm_off | flags : 0, + sm_on ? sm_on | flags : 0 }; + } + + /* Require non-streaming mode and the features in FLAGS. */ + static inline CONSTEXPR aarch64_required_extensions + nonstreaming_only (aarch64_feature_flags flags) + { + return { AARCH64_FL_SM_OFF | flags, 0 }; + } + + /* Likewise, and also require SVE. */ + static inline CONSTEXPR aarch64_required_extensions + nonstreaming_sve (aarch64_feature_flags flags) + { + return nonstreaming_only (AARCH64_FL_SVE | flags); + } + + /* Allow both streaming and non-streaming mode, requiring the features + in FLAGS for both cases. */ + static inline CONSTEXPR aarch64_required_extensions + streaming_compatible (aarch64_feature_flags flags) + { + return { AARCH64_FL_SM_OFF | flags, AARCH64_FL_SM_ON | flags }; + } + + /* Likewise, and also require SVE for non-streaming mode. */ + static inline CONSTEXPR aarch64_required_extensions + ssve (aarch64_feature_flags flags) + { + return streaming_compatible (AARCH64_FL_SVE | flags, flags); + } + + /* Allow both streaming and non-streaming mode, requiring the features + in SM_OFF for non-streaming mode and the features in SM_ON for + streaming mode. 
*/ + static inline CONSTEXPR aarch64_required_extensions + streaming_compatible (aarch64_feature_flags sm_off, + aarch64_feature_flags sm_on) + { + return { AARCH64_FL_SM_OFF | sm_off, AARCH64_FL_SM_ON | sm_on }; + } + + /* Likewise, and also require SVE for non-streaming mode. */ + static inline CONSTEXPR aarch64_required_extensions + sve_and_sme (aarch64_feature_flags sm_off, aarch64_feature_flags sm_on) + { + return streaming_compatible (AARCH64_FL_SVE | sm_off, sm_on); + } + + /* Require streaming mode and the features in FLAGS. */ + static inline CONSTEXPR aarch64_required_extensions + streaming_only (aarch64_feature_flags flags) + { + return { 0, AARCH64_FL_SM_ON | flags }; + } + + /* The ISA requirements in non-streaming mode, or 0 if the operation + is only allowed in streaming mode. When this field is nonzero, + it always includes AARCH64_FL_SM_OFF. */ + aarch64_feature_flags sm_off; + + /* The ISA requirements in streaming mode, or 0 if the operation is only + allowed in non-streaming mode. When this field is nonzero, + it always includes AARCH64_FL_SM_ON. + + This field should not normally include AARCH64_FL_SME, since we + would not be in streaming mode if SME wasn't supported. Excluding + AARCH64_FL_SME makes it easier to handle streaming-compatible rules + since (for example) svadd_x should be available in streaming-compatible + functions even without +sme. */ + aarch64_feature_flags sm_on; +}; + void aarch64_post_cfi_startproc (void); poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned); int aarch64_get_condition_code (rtx); @@ -776,6 +851,7 @@ bool aarch64_rnd_imm_p (rtx); bool aarch64_constant_address_p (rtx); bool aarch64_emit_approx_div (rtx, rtx, rtx); bool aarch64_emit_approx_sqrt (rtx, rtx, bool); +bool aarch64_emit_opt_vec_rotate (rtx, rtx, rtx); tree aarch64_vector_load_decl (tree); rtx aarch64_gen_callee_cookie (aarch64_isa_mode, arm_pcs); void aarch64_expand_call (rtx, rtx, rtx, bool); @@ -834,8 +910,11 @@ char *aarch64_output_sve_rdvl (rtx); char *aarch64_output_sve_addvl_addpl (rtx); char *aarch64_output_sve_vector_inc_dec (const char *, rtx); char *aarch64_output_scalar_simd_mov_immediate (rtx, scalar_int_mode); -char *aarch64_output_simd_mov_immediate (rtx, unsigned, - enum simd_immediate_check w = AARCH64_CHECK_MOV); +char *aarch64_output_simd_mov_imm (rtx, unsigned); +char *aarch64_output_simd_orr_imm (rtx, unsigned); +char *aarch64_output_simd_and_imm (rtx, unsigned); +char *aarch64_output_simd_xor_imm (rtx, unsigned); + char *aarch64_output_sve_mov_immediate (rtx); char *aarch64_output_sve_ptrues (rtx); bool aarch64_pad_reg_upward (machine_mode, const_tree, bool); @@ -849,8 +928,10 @@ bool aarch64_pars_overlap_p (rtx, rtx); bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode); bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool); bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *); -bool aarch64_simd_valid_immediate (rtx, struct simd_immediate_info *, - enum simd_immediate_check w = AARCH64_CHECK_MOV); +bool aarch64_simd_valid_and_imm (rtx); +bool aarch64_simd_valid_mov_imm (rtx); +bool aarch64_simd_valid_orr_imm (rtx); +bool aarch64_simd_valid_xor_imm (rtx); bool aarch64_valid_sysreg_name_p (const char *); const char *aarch64_retrieve_sysreg (const char *, bool, bool); rtx aarch64_check_zero_based_sve_index_immediate (rtx); @@ -922,6 +1003,7 @@ rtx aarch64_expand_sve_dupq (rtx, machine_mode, rtx); void aarch64_expand_mov_immediate (rtx, rtx); rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, 
aarch64_salt_type); rtx aarch64_ptrue_reg (machine_mode); +rtx aarch64_ptrue_reg (machine_mode, unsigned int); rtx aarch64_pfalse_reg (machine_mode); bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *); void aarch64_emit_sve_pred_move (rtx, rtx, rtx); @@ -1019,7 +1101,7 @@ void handle_arm_acle_h (void); void handle_arm_neon_h (void); bool aarch64_check_required_extensions (location_t, tree, - aarch64_feature_flags); + aarch64_required_extensions); bool aarch64_general_check_builtin_call (location_t, vec<location_t>, unsigned int, tree, unsigned int, tree *); diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def index 6111cd0..83b2da2 100644 --- a/gcc/config/aarch64/aarch64-simd-builtin-types.def +++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def @@ -52,3 +52,5 @@ ENTRY (Float64x2_t, V2DF, none, 13) ENTRY (Bfloat16x4_t, V4BF, none, 14) ENTRY (Bfloat16x8_t, V8BF, none, 14) + ENTRY (Mfloat8x8_t, V8QI, modal_float, 13) + ENTRY (Mfloat8x16_t, V16QI, modal_float, 14) diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index e65f73d..0814f8b 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -418,9 +418,9 @@ BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE) /* Implemented by <sur><dotprod>_prod<dot_mode>. */ - BUILTIN_VB (TERNOP, sdot_prod, 10, NONE) - BUILTIN_VB (TERNOPU, udot_prod, 10, NONE) - BUILTIN_VB (TERNOP_SUSS, usdot_prod, 10, NONE) + BUILTIN_VB (TERNOP, sdot_prod, 0, NONE) + BUILTIN_VB (TERNOPU, udot_prod, 0, NONE) + BUILTIN_VB (TERNOP_SUSS, usdot_prod, 0, NONE) /* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>. */ BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE) BUILTIN_VB (QUADOPU_LANE, udot_lane, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def new file mode 100644 index 0000000..d66642e --- /dev/null +++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def @@ -0,0 +1,33 @@ +/* AArch64 SIMD pragma builtins + Copyright (C) 2024 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. 
*/ + +#undef ENTRY_VHSDF +#define ENTRY_VHSDF(NAME, SIGNATURE, UNSPEC) \ + ENTRY (NAME##_f16, SIGNATURE, V4HF, UNSPEC) \ + ENTRY (NAME##q_f16, SIGNATURE, V8HF, UNSPEC) \ + ENTRY (NAME##_f32, SIGNATURE, V2SF, UNSPEC) \ + ENTRY (NAME##q_f32, SIGNATURE, V4SF, UNSPEC) \ + ENTRY (NAME##q_f64, SIGNATURE, V2DF, UNSPEC) + +// faminmax +#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX) +ENTRY_VHSDF (vamax, binary, UNSPEC_FAMAX) +ENTRY_VHSDF (vamin, binary, UNSPEC_FAMIN) +#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 23c03a9..a91222b 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -160,7 +160,7 @@ [?r, w ; neon_to_gp<q> , * , *] fmov\t%x0, %d1 [?w, r ; f_mcr , * , *] fmov\t%d0, %1 [?r, r ; mov_reg , * , *] mov\t%0, %1 - [w , Dn; neon_move<q> , simd , *] << aarch64_output_simd_mov_immediate (operands[1], 64); + [w , Dn; neon_move<q> , simd , *] << aarch64_output_simd_mov_imm (operands[1], 64); [w , Dz; f_mcr , * , *] fmov\t%d0, xzr [w , Dx; neon_move , simd , 8] # } @@ -189,7 +189,7 @@ [?r , w ; multiple , * , 8] # [?w , r ; multiple , * , 8] # [?r , r ; multiple , * , 8] # - [w , Dn; neon_move<q> , simd, 4] << aarch64_output_simd_mov_immediate (operands[1], 128); + [w , Dn; neon_move<q> , simd, 4] << aarch64_output_simd_mov_imm (operands[1], 128); [w , Dz; fmov , * , 4] fmov\t%d0, xzr [w , Dx; neon_move , simd, 8] # } @@ -208,7 +208,6 @@ else { if (FP_REGNUM_P (REGNO (operands[0])) - && <MODE>mode == V2DImode && aarch64_maybe_generate_simd_constant (operands[0], operands[1], <MODE>mode)) ; @@ -568,7 +567,7 @@ ;; ... ;; ;; and so the vectorizer provides r, in which the result has to be accumulated. -(define_insn "<sur>dot_prod<vsi2qi><vczle><vczbe>" +(define_insn "<sur>dot_prod<mode><vsi2qi><vczle><vczbe>" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w") @@ -582,7 +581,7 @@ ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot ;; (vector) Dot Product operation and the vectorized optab. 
-(define_insn "usdot_prod<vsi2qi><vczle><vczbe>" +(define_insn "usdot_prod<mode><vsi2qi><vczle><vczbe>" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w") @@ -1075,7 +1074,8 @@ rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode)); rtx abd = gen_reg_rtx (V16QImode); emit_insn (gen_aarch64_<su>abdv16qi (abd, operands[1], operands[2])); - emit_insn (gen_udot_prodv16qi (operands[0], abd, ones, operands[3])); + emit_insn (gen_udot_prodv4siv16qi (operands[0], abd, ones, + operands[3])); DONE; } rtx reduc = gen_reg_rtx (V8HImode); @@ -1121,11 +1121,11 @@ (define_insn "and<mode>3<vczle><vczbe>" [(set (match_operand:VDQ_I 0 "register_operand") (and:VDQ_I (match_operand:VDQ_I 1 "register_operand") - (match_operand:VDQ_I 2 "aarch64_reg_or_bic_imm")))] + (match_operand:VDQ_I 2 "aarch64_reg_or_and_imm")))] "TARGET_SIMD" {@ [ cons: =0 , 1 , 2 ] [ w , w , w ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype> - [ w , 0 , Db ] << aarch64_output_simd_mov_immediate (operands[2], <bitsize>, AARCH64_CHECK_BIC); + [ w , 0 , Db ] << aarch64_output_simd_and_imm (operands[2], <bitsize>); } [(set_attr "type" "neon_logic<q>")] ) @@ -1134,24 +1134,25 @@ (define_insn "ior<mode>3<vczle><vczbe>" [(set (match_operand:VDQ_I 0 "register_operand") (ior:VDQ_I (match_operand:VDQ_I 1 "register_operand") - (match_operand:VDQ_I 2 "aarch64_orr_imm_sve_advsimd")))] + (match_operand:VDQ_I 2 "aarch64_reg_or_orr_imm")))] "TARGET_SIMD" - {@ [ cons: =0 , 1 , 2; attrs: arch ] - [ w , w , w ; simd ] orr\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype> - [ w , 0 , vsl; sve ] orr\t%Z0.<Vetype>, %Z0.<Vetype>, #%2 - [ w , 0 , Do ; simd ] \ - << aarch64_output_simd_mov_immediate (operands[2], <bitsize>, \ - AARCH64_CHECK_ORR); + {@ [ cons: =0 , 1 , 2 ] + [ w , w , w ] orr\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype> + [ w , 0 , Do ] << aarch64_output_simd_orr_imm (operands[2], <bitsize>); } [(set_attr "type" "neon_logic<q>")] ) +;; For EOR (vector, register) and SVE EOR (vector, immediate) (define_insn "xor<mode>3<vczle><vczbe>" - [(set (match_operand:VDQ_I 0 "register_operand" "=w") - (xor:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w") - (match_operand:VDQ_I 2 "register_operand" "w")))] + [(set (match_operand:VDQ_I 0 "register_operand") + (xor:VDQ_I (match_operand:VDQ_I 1 "register_operand") + (match_operand:VDQ_I 2 "aarch64_reg_or_xor_imm")))] "TARGET_SIMD" - "eor\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>" + {@ [ cons: =0 , 1 , 2 ] + [ w , w , w ] eor\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype> + [ w , 0 , De ] << aarch64_output_simd_xor_imm (operands[2], <bitsize>); + } [(set_attr "type" "neon_logic<q>")] ) @@ -1293,6 +1294,38 @@ [(set_attr "type" "neon_shift_acc<q>")] ) +;; After all the combinations and propagations of ROTATE have been +;; attempted split any remaining vector rotates into SHL + USRA sequences. +(define_insn_and_split "*aarch64_simd_rotate_imm<mode>" + [(set (match_operand:VDQ_I 0 "register_operand" "=&w") + (rotate:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w") + (match_operand:VDQ_I 2 "aarch64_simd_lshift_imm")))] + "TARGET_SIMD" + "#" + "&& 1" + [(set (match_dup 3) + (ashift:VDQ_I (match_dup 1) + (match_dup 2))) + (set (match_dup 0) + (plus:VDQ_I + (lshiftrt:VDQ_I + (match_dup 1) + (match_dup 4)) + (match_dup 3)))] + { + if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2])) + DONE; + + operands[3] = reload_completed ? 
operands[0] : gen_reg_rtx (<MODE>mode); + rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]); + int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT; + operands[4] + = aarch64_simd_gen_const_vector_dup (<MODE>mode, + bitwidth - INTVAL (shft_amnt)); + } + [(set_attr "length" "8")] +) + (define_insn "aarch64_<sra_op>rsra_n<mode>_insn" [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w") (plus:VSDQ_I_DI @@ -3515,21 +3548,31 @@ ) (define_expand "popcount<mode>2" - [(set (match_operand:VDQHSD 0 "register_operand") - (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))] + [(set (match_operand:VDQHSD_V1DI 0 "register_operand") + (popcount:VDQHSD_V1DI + (match_operand:VDQHSD_V1DI 1 "register_operand")))] "TARGET_SIMD" { if (TARGET_SVE) { - rtx p = aarch64_ptrue_reg (<VPRED>mode); + rtx p = aarch64_ptrue_reg (<VPRED>mode, <bitsize> == 64 ? 8 : 16); emit_insn (gen_aarch64_pred_popcount<mode> (operands[0], p, operands[1])); DONE; } + if (<MODE>mode == V1DImode) + { + rtx out = gen_reg_rtx (DImode); + emit_insn (gen_popcountdi2 (out, gen_lowpart (DImode, operands[1]))); + emit_move_insn (operands[0], gen_lowpart (<MODE>mode, out)); + DONE; + } + /* Generate a byte popcount. */ machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode; + machine_mode mode2 = <bitsize> == 64 ? V2SImode : V4SImode; rtx tmp = gen_reg_rtx (mode); auto icode = optab_handler (popcount_optab, mode); emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1]))); @@ -3540,7 +3583,7 @@ /* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and a 1 multiplicand. For V2DI, another UAADDLP is needed. */ rtx ones = force_reg (mode, CONST1_RTX (mode)); - auto icode = optab_handler (udot_prod_optab, mode); + auto icode = convert_optab_handler (udot_prod_optab, mode2, mode); mode = <bitsize> == 64 ? V2SImode : V4SImode; rtx dest = mode == <MODE>mode ? operands[0] : gen_reg_rtx (mode); rtx zeros = force_reg (mode, CONST0_RTX (mode)); @@ -9044,18 +9087,43 @@ [(set_attr "type" "crypto_sha3")] ) -(define_insn "aarch64_xarqv2di" +(define_insn "*aarch64_xarqv2di_insn" [(set (match_operand:V2DI 0 "register_operand" "=w") - (rotatert:V2DI + (rotate:V2DI (xor:V2DI (match_operand:V2DI 1 "register_operand" "%w") (match_operand:V2DI 2 "register_operand" "w")) - (match_operand:SI 3 "aarch64_simd_shift_imm_di" "Usd")))] + (match_operand:V2DI 3 "aarch64_simd_lshift_imm" "Dl")))] "TARGET_SHA3" - "xar\\t%0.2d, %1.2d, %2.2d, %3" + { + operands[3] + = GEN_INT (64 - INTVAL (unwrap_const_vec_duplicate (operands[3]))); + return "xar\\t%0.2d, %1.2d, %2.2d, %3"; + } [(set_attr "type" "crypto_sha3")] ) +;; The semantics of the vxarq_u64 intrinsics treat the immediate argument as a +;; right-rotate amount but the recommended representation of rotates by a +;; constant in RTL is with the left ROTATE code. Translate between the +;; intrinsic-provided amount and the RTL operands in the expander here. +;; The define_insn for XAR will translate back to instruction semantics in its +;; output logic. 
+(define_expand "aarch64_xarqv2di" + [(set (match_operand:V2DI 0 "register_operand") + (rotate:V2DI + (xor:V2DI + (match_operand:V2DI 1 "register_operand") + (match_operand:V2DI 2 "register_operand")) + (match_operand:SI 3 "aarch64_simd_shift_imm_di")))] + "TARGET_SHA3" + { + operands[3] + = aarch64_simd_gen_const_vector_dup (V2DImode, + 64 - INTVAL (operands[3])); + } +) + (define_insn "bcaxq<mode>4" [(set (match_operand:VQ_I 0 "register_operand" "=w") (xor:VQ_I @@ -9910,3 +9978,22 @@ "shl\\t%d0, %d1, #16" [(set_attr "type" "neon_shift_imm")] ) + +;; faminmax +(define_insn "@aarch64_<faminmax_uns_op><mode>" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w")] + FAMINMAX_UNS))] + "TARGET_FAMINMAX" + "<faminmax_uns_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" +) + +(define_insn "*aarch64_faminmax_fused" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FMAXMIN:VHSDF + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) + (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"))))] + "TARGET_FAMINMAX" + "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" +) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc index d55bee0..1c9f515 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc @@ -755,28 +755,62 @@ public: gimple * fold (gimple_folder &f) const override { - tree divisor = gimple_call_arg (f.call, 2); - tree divisor_cst = uniform_integer_cst_p (divisor); + if (auto *res = f.fold_const_binary (TRUNC_DIV_EXPR)) + return res; - if (!divisor_cst || !integer_pow2p (divisor_cst)) + /* If the divisor is all ones, fold to dividend. */ + tree op1 = gimple_call_arg (f.call, 1); + tree op2 = gimple_call_arg (f.call, 2); + if (integer_onep (op2)) + return f.fold_active_lanes_to (op1); + + /* If one of the operands is all zeros, fold to zero vector. */ + if (integer_zerop (op1) || integer_zerop (op2)) + return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs))); + + /* If the divisor is all integer -1, fold to svneg. */ + tree pg = gimple_call_arg (f.call, 0); + if (!f.type_suffix (0).unsigned_p && integer_minus_onep (op2)) + { + function_instance instance ("svneg", functions::svneg, + shapes::unary, MODE_none, + f.type_suffix_ids, GROUP_none, f.pred); + gcall *call = f.redirect_call (instance); + unsigned offset_index = 0; + if (f.pred == PRED_m) + { + offset_index = 1; + gimple_call_set_arg (call, 0, op1); + } + else + gimple_set_num_ops (call, 5); + gimple_call_set_arg (call, offset_index, pg); + gimple_call_set_arg (call, offset_index + 1, op1); + return call; + } + + /* If the divisor is a uniform power of 2, fold to a shift + instruction. */ + tree op2_cst = uniform_integer_cst_p (op2); + if (!op2_cst || !integer_pow2p (op2_cst)) return NULL; tree new_divisor; gcall *call; - if (f.type_suffix (0).unsigned_p && tree_to_uhwi (divisor_cst) != 1) + if (f.type_suffix (0).unsigned_p && tree_to_uhwi (op2_cst) != 1) { function_instance instance ("svlsr", functions::svlsr, shapes::binary_uint_opt_n, MODE_n, f.type_suffix_ids, GROUP_none, f.pred); call = f.redirect_call (instance); - tree d = INTEGRAL_TYPE_P (TREE_TYPE (divisor)) ? divisor : divisor_cst; + tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? 
op2 : op2_cst; new_divisor = wide_int_to_tree (TREE_TYPE (d), tree_log2 (d)); } else { - if (tree_int_cst_sign_bit (divisor_cst) - || tree_to_shwi (divisor_cst) == 1) + if (tree_int_cst_sign_bit (op2_cst) + || tree_to_shwi (op2_cst) == 1) return NULL; function_instance instance ("svasrd", functions::svasrd, @@ -784,7 +818,7 @@ public: f.type_suffix_ids, GROUP_none, f.pred); call = f.redirect_call (instance); new_divisor = wide_int_to_tree (scalar_types[VECTOR_TYPE_svuint64_t], - tree_log2 (divisor_cst)); + tree_log2 (op2_cst)); } gimple_call_set_arg (call, 2, new_divisor); @@ -804,15 +838,16 @@ public: e.rotate_inputs_left (0, 3); insn_code icode; if (e.type_suffix_ids[1] == NUM_TYPE_SUFFIXES) - icode = e.direct_optab_handler_for_sign (sdot_prod_optab, - udot_prod_optab, - 0, GET_MODE (e.args[0])); + icode = e.convert_optab_handler_for_sign (sdot_prod_optab, + udot_prod_optab, + 0, e.result_mode (), + GET_MODE (e.args[0])); else icode = (e.type_suffix (0).float_p ? CODE_FOR_aarch64_sve_fdotvnx4sfvnx8hf : e.type_suffix (0).unsigned_p - ? CODE_FOR_aarch64_sve_udotvnx4sivnx8hi - : CODE_FOR_aarch64_sve_sdotvnx4sivnx8hi); + ? CODE_FOR_udot_prodvnx4sivnx8hi + : CODE_FOR_sdot_prodvnx4sivnx8hi); return e.use_unpred_insn (icode); } }; @@ -1288,6 +1323,20 @@ public: class svindex_impl : public function_base { public: + gimple * + fold (gimple_folder &f) const override + { + /* Apply constant folding if base and step are integer constants. */ + tree vec_type = TREE_TYPE (f.lhs); + tree base = gimple_call_arg (f.call, 0); + tree step = gimple_call_arg (f.call, 1); + if (TREE_CODE (base) != INTEGER_CST || TREE_CODE (step) != INTEGER_CST) + return NULL; + return gimple_build_assign (f.lhs, + build_vec_series (vec_type, base, step)); + } + +public: rtx expand (function_expander &e) const override { @@ -1877,6 +1926,19 @@ public: } }; +class svlsl_impl : public rtx_code_function +{ +public: + CONSTEXPR svlsl_impl () + : rtx_code_function (ASHIFT, ASHIFT) {} + + gimple * + fold (gimple_folder &f) const override + { + return f.fold_const_binary (LSHIFT_EXPR); + } +}; + class svmad_impl : public function_base { public: @@ -1995,6 +2057,93 @@ public: } }; +class svmul_impl : public rtx_code_function +{ +public: + CONSTEXPR svmul_impl () + : rtx_code_function (MULT, MULT, UNSPEC_COND_FMUL) {} + + gimple * + fold (gimple_folder &f) const override + { + if (auto *res = f.fold_const_binary (MULT_EXPR)) + return res; + + /* If one of the operands is all ones, fold to other operand. */ + tree op1 = gimple_call_arg (f.call, 1); + tree op2 = gimple_call_arg (f.call, 2); + if (integer_onep (op1)) + return f.fold_active_lanes_to (op2); + if (integer_onep (op2)) + return f.fold_active_lanes_to (op1); + + /* If one of the operands is all zeros, fold to zero vector. */ + if (integer_zerop (op1) || integer_zerop (op2)) + return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs))); + + /* If one of the operands is all integer -1, fold to svneg. 
*/ + tree pg = gimple_call_arg (f.call, 0); + tree negated_op = NULL; + if (integer_minus_onep (op2)) + negated_op = op1; + else if (integer_minus_onep (op1)) + negated_op = op2; + if (!f.type_suffix (0).unsigned_p && negated_op) + { + function_instance instance ("svneg", functions::svneg, + shapes::unary, MODE_none, + f.type_suffix_ids, GROUP_none, f.pred); + gcall *call = f.redirect_call (instance); + unsigned offset_index = 0; + if (f.pred == PRED_m) + { + offset_index = 1; + gimple_call_set_arg (call, 0, op1); + } + else + gimple_set_num_ops (call, 5); + gimple_call_set_arg (call, offset_index, pg); + gimple_call_set_arg (call, offset_index + 1, negated_op); + return call; + } + + /* If one of the operands is a uniform power of 2, fold to a left shift + by immediate. */ + tree op1_cst = uniform_integer_cst_p (op1); + tree op2_cst = uniform_integer_cst_p (op2); + tree shift_op1, shift_op2 = NULL; + if (op1_cst && integer_pow2p (op1_cst) + && (f.pred != PRED_m + || is_ptrue (pg, f.type_suffix (0).element_bytes))) + { + shift_op1 = op2; + shift_op2 = op1_cst; + } + else if (op2_cst && integer_pow2p (op2_cst)) + { + shift_op1 = op1; + shift_op2 = op2_cst; + } + else + return NULL; + + if (shift_op2) + { + shift_op2 = wide_int_to_tree (unsigned_type_for (TREE_TYPE (shift_op2)), + tree_log2 (shift_op2)); + function_instance instance ("svlsl", functions::svlsl, + shapes::binary_uint_opt_n, MODE_n, + f.type_suffix_ids, GROUP_none, f.pred); + gcall *call = f.redirect_call (instance); + gimple_call_set_arg (call, 1, shift_op1); + gimple_call_set_arg (call, 2, shift_op2); + return call; + } + + return NULL; + } +}; + class svnand_impl : public function_base { public: @@ -2861,7 +3010,7 @@ public: Hence we do the same rotation on arguments as svdot_impl does. */ e.rotate_inputs_left (0, 3); machine_mode mode = e.vector_mode (0); - insn_code icode = code_for_dot_prod (UNSPEC_USDOT, mode); + insn_code icode = code_for_dot_prod (UNSPEC_USDOT, e.result_mode (), mode); return e.use_exact_insn (icode); } @@ -2900,7 +3049,9 @@ public: : while_comparison (unspec_for_sint, unspec_for_uint), m_eq_p (eq_p) {} - /* Try to fold a call by treating its arguments as constants of type T. */ + /* Try to fold a call by treating its arguments as constants of type T. + We have already filtered out the degenerate cases of X .LT. MIN + and X .LE. MAX. */ template<typename T> gimple * fold_type (gimple_folder &f) const @@ -2956,6 +3107,13 @@ public: if (f.vectors_per_tuple () > 1) return nullptr; + /* Filter out cases where the condition is always true or always false. 
*/ + tree arg1 = gimple_call_arg (f.call, 1); + if (!m_eq_p && operand_equal_p (arg1, TYPE_MIN_VALUE (TREE_TYPE (arg1)))) + return f.fold_to_pfalse (); + if (m_eq_p && operand_equal_p (arg1, TYPE_MAX_VALUE (TREE_TYPE (arg1)))) + return f.fold_to_ptrue (); + if (f.type_suffix (1).unsigned_p) return fold_type<poly_uint64> (f); else @@ -3155,7 +3313,7 @@ FUNCTION (svldnf1uh, svldxf1_extend_impl, (TYPE_SUFFIX_u16, UNSPEC_LDNF1)) FUNCTION (svldnf1uw, svldxf1_extend_impl, (TYPE_SUFFIX_u32, UNSPEC_LDNF1)) FUNCTION (svldnt1, svldnt1_impl,) FUNCTION (svlen, svlen_impl,) -FUNCTION (svlsl, rtx_code_function, (ASHIFT, ASHIFT)) +FUNCTION (svlsl, svlsl_impl,) FUNCTION (svlsl_wide, shift_wide, (ASHIFT, UNSPEC_ASHIFT_WIDE)) FUNCTION (svlsr, rtx_code_function, (LSHIFTRT, LSHIFTRT)) FUNCTION (svlsr_wide, shift_wide, (LSHIFTRT, UNSPEC_LSHIFTRT_WIDE)) @@ -3179,7 +3337,7 @@ FUNCTION (svmls_lane, svmls_lane_impl,) FUNCTION (svmmla, svmmla_impl,) FUNCTION (svmov, svmov_impl,) FUNCTION (svmsb, svmsb_impl,) -FUNCTION (svmul, rtx_code_function, (MULT, MULT, UNSPEC_COND_FMUL)) +FUNCTION (svmul, svmul_impl,) FUNCTION (svmul_lane, CODE_FOR_MODE0 (aarch64_mul_lane),) FUNCTION (svmulh, unspec_based_function, (UNSPEC_SMUL_HIGHPART, UNSPEC_UMUL_HIGHPART, -1)) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.def b/gcc/config/aarch64/aarch64-sve-builtins-base.def index 65fcba9..da2a0e4 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.def +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.def @@ -17,7 +17,7 @@ along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ -#define REQUIRED_EXTENSIONS AARCH64_FL_SVE +#define REQUIRED_EXTENSIONS ssve (0) DEF_SVE_FUNCTION (svabd, binary_opt_n, all_arith, mxz) DEF_SVE_FUNCTION (svabs, unary, all_float_and_signed, mxz) DEF_SVE_FUNCTION (svacge, compare_opt_n, all_float, implicit) @@ -261,7 +261,7 @@ DEF_SVE_FUNCTION (svzip2, binary, all_data, none) DEF_SVE_FUNCTION (svzip2, binary_pred, all_pred, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS AARCH64_FL_SVE | AARCH64_FL_SM_OFF +#define REQUIRED_EXTENSIONS nonstreaming_sve (0) DEF_SVE_FUNCTION (svadda, fold_left, all_float, implicit) DEF_SVE_FUNCTION (svadrb, adr_offset, none, none) DEF_SVE_FUNCTION (svadrd, adr_index, none, none) @@ -327,7 +327,7 @@ DEF_SVE_FUNCTION (svtssel, binary_uint, all_float, none) DEF_SVE_FUNCTION (svwrffr, setffr, none, implicit) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS AARCH64_FL_SVE | AARCH64_FL_BF16 +#define REQUIRED_EXTENSIONS ssve (AARCH64_FL_BF16) DEF_SVE_FUNCTION (svbfdot, ternary_bfloat_opt_n, s_float, none) DEF_SVE_FUNCTION (svbfdot_lane, ternary_bfloat_lanex2, s_float, none) DEF_SVE_FUNCTION (svbfmlalb, ternary_bfloat_opt_n, s_float, none) @@ -338,33 +338,29 @@ DEF_SVE_FUNCTION (svcvt, unary_convertxn, cvt_bfloat, mxz) DEF_SVE_FUNCTION (svcvtnt, unary_convert_narrowt, cvt_bfloat, mx) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SVE \ - | AARCH64_FL_BF16 \ - | AARCH64_FL_SM_OFF) +#define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_BF16) DEF_SVE_FUNCTION (svbfmmla, ternary_bfloat, s_float, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS AARCH64_FL_SVE | AARCH64_FL_I8MM +#define REQUIRED_EXTENSIONS ssve (AARCH64_FL_I8MM) DEF_SVE_FUNCTION (svsudot, ternary_intq_uintq_opt_n, s_signed, none) DEF_SVE_FUNCTION (svsudot_lane, ternary_intq_uintq_lane, s_signed, none) DEF_SVE_FUNCTION (svusdot, ternary_uintq_intq_opt_n, s_signed, none) DEF_SVE_FUNCTION (svusdot_lane, ternary_uintq_intq_lane, 
s_signed, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SVE \ - | AARCH64_FL_I8MM \ - | AARCH64_FL_SM_OFF) +#define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_I8MM) DEF_SVE_FUNCTION (svmmla, mmla, s_integer, none) DEF_SVE_FUNCTION (svusmmla, ternary_uintq_intq, s_signed, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SVE \ - | AARCH64_FL_F32MM \ - | AARCH64_FL_SM_OFF) +#define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_F32MM) DEF_SVE_FUNCTION (svmmla, mmla, s_float, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS AARCH64_FL_SVE | AARCH64_FL_F64MM +#define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_F64MM) +DEF_SVE_FUNCTION (svld1ro, load_replicate, all_data, implicit) +DEF_SVE_FUNCTION (svmmla, mmla, d_float, none) DEF_SVE_FUNCTION (svtrn1q, binary, all_data, none) DEF_SVE_FUNCTION (svtrn2q, binary, all_data, none) DEF_SVE_FUNCTION (svuzp1q, binary, all_data, none) @@ -372,10 +368,3 @@ DEF_SVE_FUNCTION (svuzp2q, binary, all_data, none) DEF_SVE_FUNCTION (svzip1q, binary, all_data, none) DEF_SVE_FUNCTION (svzip2q, binary, all_data, none) #undef REQUIRED_EXTENSIONS - -#define REQUIRED_EXTENSIONS (AARCH64_FL_SVE \ - | AARCH64_FL_F64MM \ - | AARCH64_FL_SM_OFF) -DEF_SVE_FUNCTION (svld1ro, load_replicate, all_data, implicit) -DEF_SVE_FUNCTION (svmmla, mmla, d_float, none) -#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.h b/gcc/config/aarch64/aarch64-sve-builtins-base.h index 5bbf356..978cf70 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-base.h +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.h @@ -37,6 +37,8 @@ namespace aarch64_sve extern const function_base *const svadrd; extern const function_base *const svadrh; extern const function_base *const svadrw; + extern const function_base *const svamax; + extern const function_base *const svamin; extern const function_base *const svand; extern const function_base *const svandv; extern const function_base *const svasr; diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sme.def b/gcc/config/aarch64/aarch64-sve-builtins-sme.def index 416df0b..bc2c332 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-sme.def +++ b/gcc/config/aarch64/aarch64-sve-builtins-sme.def @@ -32,12 +32,12 @@ DEF_SME_ZA_FUNCTION_GS (NAME, SHAPE, TYPES, none, PREDS) #endif -#define REQUIRED_EXTENSIONS 0 +#define REQUIRED_EXTENSIONS streaming_compatible (0) DEF_SME_FUNCTION (arm_has_sme, bool_inherent, none, none) DEF_SME_FUNCTION (arm_in_streaming_mode, bool_inherent, none, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS AARCH64_FL_SME +#define REQUIRED_EXTENSIONS streaming_compatible (AARCH64_FL_SME) DEF_SME_FUNCTION (svcntsb, count_inherent, none, none) DEF_SME_FUNCTION (svcntsd, count_inherent, none, none) DEF_SME_FUNCTION (svcntsh, count_inherent, none, none) @@ -49,7 +49,7 @@ DEF_SME_ZA_FUNCTION (svzero, inherent_za, za, none) DEF_SME_ZA_FUNCTION (svzero_mask, inherent_mask_za, za, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS AARCH64_FL_SME | AARCH64_FL_SM_ON +#define REQUIRED_EXTENSIONS streaming_only (0) DEF_SME_ZA_FUNCTION (svaddha, unary_za_m, za_s_integer, za_m) DEF_SME_ZA_FUNCTION (svaddva, unary_za_m, za_s_integer, za_m) DEF_SME_ZA_FUNCTION (svld1_hor, load_za, all_za, none) @@ -70,9 +70,7 @@ DEF_SME_ZA_FUNCTION (svwrite_hor, write_za_m, za_all_data, za_m) DEF_SME_ZA_FUNCTION (svwrite_ver, write_za_m, za_all_data, za_m) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SME \ - | 
AARCH64_FL_SME_I16I64 \ - | AARCH64_FL_SM_ON) +#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME_I16I64) DEF_SME_ZA_FUNCTION (svaddha, unary_za_m, za_d_integer, za_m) DEF_SME_ZA_FUNCTION (svaddva, unary_za_m, za_d_integer, za_m) DEF_SME_ZA_FUNCTION (svmopa, binary_za_m, mop_i16i64, za_m) @@ -83,14 +81,12 @@ DEF_SME_ZA_FUNCTION (svusmopa, binary_za_int_m, mop_i16i64_unsigned, za_m) DEF_SME_ZA_FUNCTION (svusmops, binary_za_int_m, mop_i16i64_unsigned, za_m) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SME \ - | AARCH64_FL_SME_F64F64 \ - | AARCH64_FL_SM_ON) +#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME_F64F64) DEF_SME_ZA_FUNCTION (svmopa, binary_za_m, za_d_float, za_m) DEF_SME_ZA_FUNCTION (svmops, binary_za_m, za_d_float, za_m) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS AARCH64_FL_SME | AARCH64_FL_SME2 +#define REQUIRED_EXTENSIONS streaming_compatible (AARCH64_FL_SME2) DEF_SME_FUNCTION (svldr_zt, ldr_zt, none, none) DEF_SME_FUNCTION (svstr_zt, str_zt, none, none) DEF_SME_FUNCTION (svzero_zt, inherent_zt, none, none) @@ -100,7 +96,7 @@ DEF_SME_FUNCTION (svzero_zt, inherent_zt, none, none) which will then be resolved to either an integer function or a floating-point function. They are needed because the integer and floating-point functions have different architecture requirements. */ -#define REQUIRED_EXTENSIONS AARCH64_FL_SME | AARCH64_FL_SME2 | AARCH64_FL_SM_ON +#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2) DEF_SME_ZA_FUNCTION_GS (svadd, unary_za_slice, za_s_data, vg1x24, none) DEF_SME_ZA_FUNCTION_GS (svadd, unary_za_slice, d_za, vg1x24, none) DEF_SME_ZA_FUNCTION_GS (svadd_write, binary_za_slice_opt_single, za_s_integer, @@ -172,10 +168,8 @@ DEF_SME_ZA_FUNCTION_GS (svwrite_hor, write_za, za_bhsd_data, vg24, none) DEF_SME_ZA_FUNCTION_GS (svwrite_ver, write_za, za_bhsd_data, vg24, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SME \ - | AARCH64_FL_SME2 \ - | AARCH64_FL_SME_I16I64 \ - | AARCH64_FL_SM_ON) +#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2 \ + | AARCH64_FL_SME_I16I64) DEF_SME_ZA_FUNCTION_GS (svadd, unary_za_slice, za_d_integer, vg1x24, none) DEF_SME_ZA_FUNCTION_GS (svadd_write, binary_za_slice_opt_single, za_d_integer, vg1x24, none) @@ -198,10 +192,8 @@ DEF_SME_ZA_FUNCTION_GS (svvdot_lane, dot_za_slice_lane, za_d_h_integer, vg1x4, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SME \ - | AARCH64_FL_SME2 \ - | AARCH64_FL_SME_F64F64 \ - | AARCH64_FL_SM_ON) +#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2 \ + | AARCH64_FL_SME_F64F64) DEF_SME_ZA_FUNCTION_GS (svadd, unary_za_slice, za_d_float, vg1x24, none) DEF_SME_ZA_FUNCTION_GS (svmla, binary_za_slice_opt_single, za_d_float, vg1x24, none) diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc index 146a545..f0ab740 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc @@ -81,6 +81,24 @@ unspec_sqrdcmlah (int rot) class svaba_impl : public function_base { public: + gimple * + fold (gimple_folder &f) const override + { + /* Fold to svabd if op1 is all zeros. 
*/ + tree op1 = gimple_call_arg (f.call, 0); + if (!integer_zerop (op1)) + return NULL; + function_instance instance ("svabd", functions::svabd, + shapes::binary_opt_n, f.mode_suffix_id, + f.type_suffix_ids, GROUP_none, PRED_x); + gcall *call = f.redirect_call (instance); + /* Add a ptrue as predicate, because unlike svaba, svabd is + predicated. */ + gimple_call_set_arg (call, 0, build_all_ones_cst (f.gp_type ())); + return call; + } + +public: rtx expand (function_expander &e) const override { @@ -90,6 +108,22 @@ public: } }; +class svxar_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const override + { + /* aarch64_sve2_xar represents this operation with a left-rotate RTX. + Convert the right-rotate amount from the intrinsic to fit this. */ + machine_mode mode = e.vector_mode (0); + HOST_WIDE_INT rot = GET_MODE_UNIT_BITSIZE (mode) + - INTVAL (e.args[2]); + e.args[2] = aarch64_simd_gen_const_vector_dup (mode, rot); + return e.use_exact_insn (code_for_aarch64_sve2_xar (mode)); + } +}; + class svcdot_impl : public function_base { public: @@ -234,7 +268,7 @@ public: } }; -class svpsel_impl : public function_base +class svpsel_lane_impl : public function_base { public: rtx @@ -418,6 +452,34 @@ public: class svsra_impl : public function_base { public: + gimple * + fold (gimple_folder &f) const override + { + /* Fold to svlsr/svasr if op1 is all zeros. */ + tree op1 = gimple_call_arg (f.call, 0); + if (!integer_zerop (op1)) + return NULL; + function_instance instance ("svlsr", functions::svlsr, + shapes::binary_uint_opt_n, MODE_n, + f.type_suffix_ids, GROUP_none, PRED_x); + if (!f.type_suffix (0).unsigned_p) + { + instance.base_name = "svasr"; + instance.base = functions::svasr; + } + gcall *call = f.redirect_call (instance); + /* Add a ptrue as predicate, because unlike svsra, svlsr/svasr are + predicated intrinsics. */ + gimple_call_set_arg (call, 0, build_all_ones_cst (f.gp_type ())); + /* For svsra, the shift amount (imm3) is uint64_t for all function types, + but for svlsr/svasr, imm3 has the same width as the function type. 
*/ + tree imm3 = gimple_call_arg (f.call, 2); + tree imm3_prec = wide_int_to_tree (f.scalar_type (0), + wi::to_widest (imm3)); + gimple_call_set_arg (call, 2, imm3_prec); + return call; + } +public: rtx expand (function_expander &e) const override { @@ -545,6 +607,10 @@ FUNCTION (svaesd, fixed_insn_function, (CODE_FOR_aarch64_sve2_aesd)) FUNCTION (svaese, fixed_insn_function, (CODE_FOR_aarch64_sve2_aese)) FUNCTION (svaesimc, fixed_insn_function, (CODE_FOR_aarch64_sve2_aesimc)) FUNCTION (svaesmc, fixed_insn_function, (CODE_FOR_aarch64_sve2_aesmc)) +FUNCTION (svamax, cond_or_uncond_unspec_function, + (UNSPEC_COND_FAMAX, UNSPEC_FAMAX)) +FUNCTION (svamin, cond_or_uncond_unspec_function, + (UNSPEC_COND_FAMIN, UNSPEC_FAMIN)) FUNCTION (svbcax, CODE_FOR_MODE0 (aarch64_sve2_bcax),) FUNCTION (svbdep, unspec_based_function, (UNSPEC_BDEP, UNSPEC_BDEP, -1)) FUNCTION (svbext, unspec_based_function, (UNSPEC_BEXT, UNSPEC_BEXT, -1)) @@ -625,7 +691,7 @@ FUNCTION (svpmullb, unspec_based_function, (-1, UNSPEC_PMULLB, -1)) FUNCTION (svpmullb_pair, unspec_based_function, (-1, UNSPEC_PMULLB_PAIR, -1)) FUNCTION (svpmullt, unspec_based_function, (-1, UNSPEC_PMULLT, -1)) FUNCTION (svpmullt_pair, unspec_based_function, (-1, UNSPEC_PMULLT_PAIR, -1)) -FUNCTION (svpsel, svpsel_impl,) +FUNCTION (svpsel_lane, svpsel_lane_impl,) FUNCTION (svqabs, rtx_code_function, (SS_ABS, UNKNOWN, UNKNOWN)) FUNCTION (svqcadd, svqcadd_impl,) FUNCTION (svqcvt, integer_conversion, (UNSPEC_SQCVT, UNSPEC_SQCVTU, @@ -745,6 +811,6 @@ FUNCTION (svwhilege, while_comparison, (UNSPEC_WHILEGE, UNSPEC_WHILEHS)) FUNCTION (svwhilegt, while_comparison, (UNSPEC_WHILEGT, UNSPEC_WHILEHI)) FUNCTION (svwhilerw, svwhilerw_svwhilewr_impl, (UNSPEC_WHILERW)) FUNCTION (svwhilewr, svwhilerw_svwhilewr_impl, (UNSPEC_WHILEWR)) -FUNCTION (svxar, CODE_FOR_MODE0 (aarch64_sve2_xar),) +FUNCTION (svxar, svxar_impl,) } /* end namespace aarch64_sve */ diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def index 4543402..e402155 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def @@ -17,7 +17,7 @@ along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. 
*/ -#define REQUIRED_EXTENSIONS AARCH64_FL_SVE | AARCH64_FL_SVE2 +#define REQUIRED_EXTENSIONS sve_and_sme (AARCH64_FL_SVE2, 0) DEF_SVE_FUNCTION (svaba, ternary_opt_n, all_integer, none) DEF_SVE_FUNCTION (svabalb, ternary_long_opt_n, hsd_integer, none) DEF_SVE_FUNCTION (svabalt, ternary_long_opt_n, hsd_integer, none) @@ -166,9 +166,7 @@ DEF_SVE_FUNCTION (svwhilewr, compare_ptr, all_data, none) DEF_SVE_FUNCTION (svxar, ternary_shift_right_imm, all_integer, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SVE \ - | AARCH64_FL_SVE2 \ - | AARCH64_FL_SM_OFF) +#define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2) DEF_SVE_FUNCTION (svhistcnt, binary_to_uint, sd_integer, z) DEF_SVE_FUNCTION (svhistseg, binary_to_uint, b_integer, none) DEF_SVE_FUNCTION (svldnt1_gather, load_gather_sv_restricted, sd_data, implicit) @@ -194,10 +192,8 @@ DEF_SVE_FUNCTION (svstnt1w_scatter, store_scatter_index_restricted, d_integer, i DEF_SVE_FUNCTION (svstnt1w_scatter, store_scatter_offset_restricted, d_integer, implicit) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SVE \ - | AARCH64_FL_SVE2 \ - | AARCH64_FL_SVE2_AES \ - | AARCH64_FL_SM_OFF) +#define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2 \ + | AARCH64_FL_SVE2_AES) DEF_SVE_FUNCTION (svaesd, binary, b_unsigned, none) DEF_SVE_FUNCTION (svaese, binary, b_unsigned, none) DEF_SVE_FUNCTION (svaesmc, unary, b_unsigned, none) @@ -206,44 +202,31 @@ DEF_SVE_FUNCTION (svpmullb_pair, binary_opt_n, d_unsigned, none) DEF_SVE_FUNCTION (svpmullt_pair, binary_opt_n, d_unsigned, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SVE \ - | AARCH64_FL_SVE2 \ - | AARCH64_FL_SVE2_BITPERM \ - | AARCH64_FL_SM_OFF) +#define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2 \ + | AARCH64_FL_SVE2_BITPERM) DEF_SVE_FUNCTION (svbdep, binary_opt_n, all_unsigned, none) DEF_SVE_FUNCTION (svbext, binary_opt_n, all_unsigned, none) DEF_SVE_FUNCTION (svbgrp, binary_opt_n, all_unsigned, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SVE \ - | AARCH64_FL_SVE2 \ - | AARCH64_FL_SVE2_SHA3 \ - | AARCH64_FL_SM_OFF) +#define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2 \ + | AARCH64_FL_SVE2_SHA3) DEF_SVE_FUNCTION (svrax1, binary, d_integer, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SVE \ - | AARCH64_FL_SVE2 \ - | AARCH64_FL_SVE2_SM4 \ - | AARCH64_FL_SM_OFF) +#define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE2 \ + | AARCH64_FL_SVE2_SM4) DEF_SVE_FUNCTION (svsm4e, binary, s_unsigned, none) DEF_SVE_FUNCTION (svsm4ekey, binary, s_unsigned, none) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SVE \ - | AARCH64_FL_SVE2 \ - | AARCH64_FL_SME \ - | AARCH64_FL_SM_ON) +#define REQUIRED_EXTENSIONS streaming_only (0) DEF_SVE_FUNCTION (svclamp, clamp, all_integer, none) -DEF_SVE_FUNCTION (svpsel, select_pred, all_pred_count, none) +DEF_SVE_FUNCTION (svpsel_lane, select_pred, all_pred_count, none) DEF_SVE_FUNCTION (svrevd, unary, all_data, mxz) #undef REQUIRED_EXTENSIONS -#define REQUIRED_EXTENSIONS (AARCH64_FL_SVE \ - | AARCH64_FL_SVE2 \ - | AARCH64_FL_SME \ - | AARCH64_FL_SME2 \ - | AARCH64_FL_SM_ON) +#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME2) DEF_SVE_FUNCTION_GS (svadd, binary_single, all_integer, x24, none) DEF_SVE_FUNCTION (svbfmlslb, ternary_bfloat_opt_n, s_float, none) DEF_SVE_FUNCTION (svbfmlslb_lane, ternary_bfloat_lane, s_float, none) @@ -300,3 +283,10 @@ DEF_SVE_FUNCTION (svwhilelt, compare_scalar_count, 
while_x_c, none) DEF_SVE_FUNCTION_GS (svzip, unaryxn, all_data, x24, none) DEF_SVE_FUNCTION_GS (svzipq, unaryxn, all_data, x24, none) #undef REQUIRED_EXTENSIONS + +#define REQUIRED_EXTENSIONS \ + sve_and_sme (AARCH64_FL_SVE2 | AARCH64_FL_FAMINMAX, \ + AARCH64_FL_SME2 | AARCH64_FL_FAMINMAX) +DEF_SVE_FUNCTION (svamax, binary_opt_single_n, all_float, mxz) +DEF_SVE_FUNCTION (svamin, binary_opt_single_n, all_float, mxz) +#undef REQUIRED_EXTENSIONS diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.h b/gcc/config/aarch64/aarch64-sve-builtins-sve2.h index 2ac6ede..013a9df 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.h +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.h @@ -106,7 +106,7 @@ namespace aarch64_sve extern const function_base *const svpmullb_pair; extern const function_base *const svpmullt; extern const function_base *const svpmullt_pair; - extern const function_base *const svpsel; + extern const function_base *const svpsel_lane; extern const function_base *const svqabs; extern const function_base *const svqcadd; extern const function_base *const svqcvt; diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc index 5ca9ec3..44b7f6e 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins.cc +++ b/gcc/config/aarch64/aarch64-sve-builtins.cc @@ -19,6 +19,7 @@ #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -82,9 +83,8 @@ public: /* The decl itself. */ tree decl; - /* The architecture extensions that the function requires, as a set of - AARCH64_FL_* flags. */ - aarch64_feature_flags required_extensions; + /* The architecture extensions that the function requires. */ + aarch64_required_extensions required_extensions; /* True if the decl represents an overloaded function that needs to be resolved by function_resolver. */ @@ -882,11 +882,15 @@ static const predication_index preds_z[] = { PRED_z, NUM_PREDS }; /* Used by SME instructions that always merge into ZA. */ static const predication_index preds_za_m[] = { PRED_za_m, NUM_PREDS }; +#define NONSTREAMING_SVE(X) nonstreaming_only (AARCH64_FL_SVE | (X)) +#define SVE_AND_SME(X, Y) streaming_compatible (AARCH64_FL_SVE | (X), (Y)) +#define SSVE(X) SVE_AND_SME (X, X) + /* A list of all arm_sve.h functions. 
*/ static CONSTEXPR const function_group_info function_groups[] = { #define DEF_SVE_FUNCTION_GS(NAME, SHAPE, TYPES, GROUPS, PREDS) \ { #NAME, &functions::NAME, &shapes::SHAPE, types_##TYPES, groups_##GROUPS, \ - preds_##PREDS, REQUIRED_EXTENSIONS }, + preds_##PREDS, aarch64_required_extensions::REQUIRED_EXTENSIONS }, #include "aarch64-sve-builtins.def" }; @@ -894,7 +898,7 @@ static CONSTEXPR const function_group_info function_groups[] = { static CONSTEXPR const function_group_info neon_sve_function_groups[] = { #define DEF_NEON_SVE_FUNCTION(NAME, SHAPE, TYPES, GROUPS, PREDS) \ { #NAME, &neon_sve_bridge_functions::NAME, &shapes::SHAPE, types_##TYPES, \ - groups_##GROUPS, preds_##PREDS, 0 }, + groups_##GROUPS, preds_##PREDS, aarch64_required_extensions::ssve (0) }, #include "aarch64-neon-sve-bridge-builtins.def" }; @@ -902,10 +906,12 @@ static CONSTEXPR const function_group_info neon_sve_function_groups[] = { static CONSTEXPR const function_group_info sme_function_groups[] = { #define DEF_SME_FUNCTION_GS(NAME, SHAPE, TYPES, GROUPS, PREDS) \ { #NAME, &functions::NAME, &shapes::SHAPE, types_##TYPES, groups_##GROUPS, \ - preds_##PREDS, REQUIRED_EXTENSIONS }, + preds_##PREDS, aarch64_required_extensions::REQUIRED_EXTENSIONS }, #define DEF_SME_ZA_FUNCTION_GS(NAME, SHAPE, TYPES, GROUPS, PREDS) \ { #NAME, &functions::NAME##_za, &shapes::SHAPE, types_##TYPES, \ - groups_##GROUPS, preds_##PREDS, (REQUIRED_EXTENSIONS | AARCH64_FL_ZA_ON) }, + groups_##GROUPS, preds_##PREDS, \ + aarch64_required_extensions::REQUIRED_EXTENSIONS \ + .and_also (AARCH64_FL_ZA_ON) }, #include "aarch64-sve-builtins-sme.def" }; @@ -1132,6 +1138,33 @@ report_not_enum (location_t location, tree fndecl, unsigned int argno, " a valid %qT value", actual, argno + 1, fndecl, enumtype); } +/* Try to fold constant arguments ARG1 and ARG2 using the given tree_code. + Operations are not treated as overflowing. */ +static tree +aarch64_const_binop (enum tree_code code, tree arg1, tree arg2) +{ + if (poly_int_tree_p (arg1) && poly_int_tree_p (arg2)) + { + poly_wide_int poly_res; + tree type = TREE_TYPE (arg1); + signop sign = TYPE_SIGN (type); + wi::overflow_type overflow = wi::OVF_NONE; + + /* Return 0 for division by 0, like SDIV and UDIV do. */ + if (code == TRUNC_DIV_EXPR && integer_zerop (arg2)) + return arg2; + /* Return 0 if shift amount is out of range. */ + if (code == LSHIFT_EXPR + && wi::geu_p (wi::to_wide (arg2), TYPE_PRECISION (type))) + return build_int_cst (type, 0); + if (!poly_int_binop (poly_res, code, arg1, arg2, sign, &overflow)) + return NULL_TREE; + return force_fit_type (type, poly_res, false, + TREE_OVERFLOW (arg1) | TREE_OVERFLOW (arg2)); + } + return NULL_TREE; +} + /* Return a hash code for a function_instance. */ hashval_t function_instance::hash () const @@ -1259,7 +1292,7 @@ function_builder::function_builder (handle_pragma_index pragma_index, bool function_nulls) { m_overload_type = build_function_type (void_type_node, void_list_node); - m_direct_overloads = lang_GNU_CXX (); + m_direct_overloads = lang_GNU_CXX () || in_lto_p; if (initial_indexes[pragma_index] == 0) { @@ -1389,16 +1422,17 @@ add_shared_state_attribute (const char *name, bool is_in, bool is_out, } /* Return the appropriate function attributes for INSTANCE, which requires - the feature flags in REQUIRED_EXTENSIONS. */ + the architecture extensions in REQUIRED_EXTENSIONS. 
*/ tree function_builder::get_attributes (const function_instance &instance, - aarch64_feature_flags required_extensions) + aarch64_required_extensions + required_extensions) { tree attrs = NULL_TREE; - if (required_extensions & AARCH64_FL_SM_ON) + if (required_extensions.sm_off == 0) attrs = add_attribute ("arm", "streaming", NULL_TREE, attrs); - else if (!(required_extensions & AARCH64_FL_SM_OFF)) + else if (required_extensions.sm_on != 0) attrs = add_attribute ("arm", "streaming_compatible", NULL_TREE, attrs); attrs = add_shared_state_attribute ("in", true, false, @@ -1424,12 +1458,13 @@ function_builder::get_attributes (const function_instance &instance, /* Add a function called NAME with type FNTYPE and attributes ATTRS. INSTANCE describes what the function does and OVERLOADED_P indicates - whether it is overloaded. REQUIRED_EXTENSIONS are the set of - architecture extensions that the function requires. */ + whether it is overloaded. REQUIRED_EXTENSIONS describes the architecture + extensions that the function requires. */ registered_function & function_builder::add_function (const function_instance &instance, const char *name, tree fntype, tree attrs, - aarch64_feature_flags required_extensions, + aarch64_required_extensions + required_extensions, bool overloaded_p, bool placeholder_p) { @@ -1469,7 +1504,7 @@ function_builder::add_function (const function_instance &instance, /* Add a built-in function for INSTANCE, with the argument types given by ARGUMENT_TYPES and the return type given by RETURN_TYPE. - REQUIRED_EXTENSIONS are the set of architecture extensions that the + REQUIRED_EXTENSIONS describes the architecture extensions that the function requires. FORCE_DIRECT_OVERLOADS is true if there is a one-to-one mapping between "short" and "full" names, and if standard overload resolution therefore isn't necessary. */ @@ -1478,7 +1513,7 @@ function_builder:: add_unique_function (const function_instance &instance, tree return_type, vec<tree> &argument_types, - aarch64_feature_flags required_extensions, + aarch64_required_extensions required_extensions, bool force_direct_overloads) { /* Add the function under its full (unique) name. */ @@ -1516,7 +1551,7 @@ add_unique_function (const function_instance &instance, } /* Add one function decl for INSTANCE, to be used with manual overload - resolution. REQUIRED_EXTENSIONS are the set of architecture extensions + resolution. REQUIRED_EXTENSIONS describes the architecture extensions that the function requires. 
For simplicity, deal with duplicate attempts to add the same function, @@ -1527,7 +1562,7 @@ add_unique_function (const function_instance &instance, void function_builder:: add_overloaded_function (const function_instance &instance, - aarch64_feature_flags required_extensions) + aarch64_required_extensions required_extensions) { auto &name_map = overload_names[m_function_nulls]; if (!name_map) @@ -1537,8 +1572,12 @@ add_overloaded_function (const function_instance &instance, tree id = get_identifier (name); if (registered_function **map_value = name_map->get (id)) gcc_assert ((*map_value)->instance == instance - && ((*map_value)->required_extensions - & ~required_extensions) == 0); + && (required_extensions.sm_off == 0 + || ((*map_value)->required_extensions.sm_off + & ~required_extensions.sm_off) == 0) + && (required_extensions.sm_on == 0 + || ((*map_value)->required_extensions.sm_on + & ~required_extensions.sm_on) == 0)); else { registered_function &rfn @@ -3593,6 +3632,52 @@ gimple_folder::fold_to_vl_pred (unsigned int vl) return gimple_build_assign (lhs, builder.build ()); } +/* Try to fold the call to a constant, given that, for integers, the call + is roughly equivalent to binary operation CODE. aarch64_const_binop + handles any differences between CODE and the intrinsic. */ +gimple * +gimple_folder::fold_const_binary (enum tree_code code) +{ + gcc_assert (gimple_call_num_args (call) == 3); + tree pg = gimple_call_arg (call, 0); + tree op1 = gimple_call_arg (call, 1); + tree op2 = gimple_call_arg (call, 2); + + if (type_suffix (0).integer_p + && (pred == PRED_x || is_ptrue (pg, type_suffix (0).element_bytes))) + if (tree res = vector_const_binop (code, op1, op2, aarch64_const_binop)) + return gimple_build_assign (lhs, res); + + return NULL; +} + +/* Fold the active lanes to X and set the inactive lanes according to the + predication. Return the new statement. */ +gimple * +gimple_folder::fold_active_lanes_to (tree x) +{ + /* If predication is _x or the predicate is ptrue, fold to X. */ + if (pred == PRED_x + || is_ptrue (gimple_call_arg (call, 0), type_suffix (0).element_bytes)) + return gimple_build_assign (lhs, x); + + /* If the predication is _z or _m, calculate a vector that supplies the + values of inactive lanes (the first vector argument for m and a zero + vector from z). */ + tree vec_inactive; + if (pred == PRED_z) + vec_inactive = build_zero_cst (TREE_TYPE (lhs)); + else + vec_inactive = gimple_call_arg (call, 1); + if (operand_equal_p (x, vec_inactive, 0)) + return gimple_build_assign (lhs, x); + + gimple_seq stmts = NULL; + tree pred = convert_pred (stmts, vector_type (0), 0); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + return gimple_build_assign (lhs, VEC_COND_EXPR, pred, x, vec_inactive); +} + /* Try to fold the call. Return the new statement on success and null on failure. */ gimple * @@ -3647,6 +3732,21 @@ function_expander::direct_optab_handler_for_sign (optab signed_op, return ::direct_optab_handler (op, mode); } +/* Choose between signed and unsigned convert optabs SIGNED_OP and + UNSIGNED_OP based on the signedness of type suffix SUFFIX_I, then + pick the appropriate optab handler for "converting" from FROM_MODE + to TO_MODE. */ +insn_code +function_expander::convert_optab_handler_for_sign (optab signed_op, + optab unsigned_op, + unsigned int suffix_i, + machine_mode to_mode, + machine_mode from_mode) +{ + optab op = type_suffix (suffix_i).unsigned_p ? 
unsigned_op : signed_op; + return ::convert_optab_handler (op, to_mode, from_mode); +} + /* Return true if X overlaps any input. */ bool function_expander::overlaps_input_p (rtx x) diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h index 9ab6f20..d5cc6e0 100644 --- a/gcc/config/aarch64/aarch64-sve-builtins.h +++ b/gcc/config/aarch64/aarch64-sve-builtins.h @@ -363,9 +363,8 @@ struct function_group_info const group_suffix_index *groups; const predication_index *preds; - /* The architecture extensions that the functions require, as a set of - AARCH64_FL_* flags. */ - aarch64_feature_flags required_extensions; + /* The architecture extensions that the functions require. */ + aarch64_required_extensions required_extensions; }; /* Describes a single fully-resolved function (i.e. one that has a @@ -432,9 +431,9 @@ public: ~function_builder (); void add_unique_function (const function_instance &, tree, - vec<tree> &, aarch64_feature_flags, bool); + vec<tree> &, aarch64_required_extensions, bool); void add_overloaded_function (const function_instance &, - aarch64_feature_flags); + aarch64_required_extensions); void add_overloaded_functions (const function_group_info &, mode_suffix_index); @@ -446,11 +445,11 @@ private: char *get_name (const function_instance &, bool); - tree get_attributes (const function_instance &, aarch64_feature_flags); + tree get_attributes (const function_instance &, aarch64_required_extensions); registered_function &add_function (const function_instance &, const char *, tree, tree, - aarch64_feature_flags, bool, bool); + aarch64_required_extensions, bool, bool); /* The function type to use for functions that are resolved by function_resolver. */ @@ -636,6 +635,8 @@ public: gimple *fold_to_pfalse (); gimple *fold_to_ptrue (); gimple *fold_to_vl_pred (unsigned int); + gimple *fold_const_binary (enum tree_code); + gimple *fold_active_lanes_to (tree); gimple *fold (); @@ -659,6 +660,8 @@ public: insn_code direct_optab_handler (optab, unsigned int = 0); insn_code direct_optab_handler_for_sign (optab, optab, unsigned int = 0, machine_mode = E_VOIDmode); + insn_code convert_optab_handler_for_sign (optab, optab, unsigned int, + machine_mode, machine_mode); machine_mode result_mode () const; diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index a5cd42b..06bd3e4 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -3088,6 +3088,23 @@ ;; - NOT ;; ------------------------------------------------------------------------- +(define_expand "ctz<mode>2" + [(set (match_operand:SVE_I 0 "register_operand") + (unspec:SVE_I + [(match_dup 2) + (ctz:SVE_I + (match_operand:SVE_I 1 "register_operand"))] + UNSPEC_PRED_X))] + "TARGET_SVE" + { + rtx pred = aarch64_ptrue_reg (<VPRED>mode); + rtx temp = gen_reg_rtx (<MODE>mode); + emit_insn (gen_aarch64_pred_rbit<mode> (temp, pred, operands[1])); + emit_insn (gen_aarch64_pred_clz<mode> (operands[0], pred, temp)); + DONE; + } +) + ;; Unpredicated integer unary arithmetic. (define_expand "<optab><mode>2" [(set (match_operand:SVE_I 0 "register_operand") @@ -4816,11 +4833,23 @@ ;; Unpredicated shift operations by a constant (post-RA only). ;; These are generated by splitting a predicated instruction whose ;; predicate is unused. 
-(define_insn "*post_ra_v<optab><mode>3" +(define_insn "*post_ra_v_ashl<mode>3" + [(set (match_operand:SVE_I 0 "register_operand") + (ashift:SVE_I + (match_operand:SVE_I 1 "register_operand") + (match_operand:SVE_I 2 "aarch64_simd_lshift_imm")))] + "TARGET_SVE && reload_completed" + {@ [ cons: =0 , 1 , 2 ] + [ w , w , vs1 ] add\t%0.<Vetype>, %1.<Vetype>, %1.<Vetype> + [ w , w , Dl ] lsl\t%0.<Vetype>, %1.<Vetype>, #%2 + } +) + +(define_insn "*post_ra_v_<optab><mode>3" [(set (match_operand:SVE_I 0 "register_operand" "=w") - (ASHIFT:SVE_I + (SHIFTRT:SVE_I (match_operand:SVE_I 1 "register_operand" "w") - (match_operand:SVE_I 2 "aarch64_simd_<lr>shift_imm")))] + (match_operand:SVE_I 2 "aarch64_simd_rshift_imm")))] "TARGET_SVE && reload_completed" "<shift>\t%0.<Vetype>, %1.<Vetype>, #%2" ) @@ -6443,10 +6472,10 @@ ;; by providing this, but we need to use UNSPECs since rtx logical ops ;; aren't defined for floating-point modes. (define_insn "*<optab><mode>3" - [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w") - (unspec:SVE_FULL_F - [(match_operand:SVE_FULL_F 1 "register_operand" "w") - (match_operand:SVE_FULL_F 2 "register_operand" "w")] + [(set (match_operand:SVE_F 0 "register_operand" "=w") + (unspec:SVE_F + [(match_operand:SVE_F 1 "register_operand" "w") + (match_operand:SVE_F 2 "register_operand" "w")] LOGICALF))] "TARGET_SVE" "<logicalf_op>\t%0.d, %1.d, %2.d" @@ -6588,39 +6617,6 @@ ;; - FMINNM ;; ------------------------------------------------------------------------- -;; Unpredicated fmax/fmin (the libm functions). The optabs for the -;; smax/smin rtx codes are handled in the generic section above. -(define_expand "<fmaxmin><mode>3" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_dup 3) - (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 1 "register_operand") - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_maxmin_operand")] - SVE_COND_FP_MAXMIN_PUBLIC))] - "TARGET_SVE" - { - operands[3] = aarch64_ptrue_reg (<VPRED>mode); - } -) - -;; Predicated fmax/fmin (the libm functions). The optabs for the -;; smax/smin rtx codes are handled in the generic section above. -(define_expand "cond_<fmaxmin><mode>" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F - [(match_operand:<VPRED> 1 "register_operand") - (unspec:SVE_FULL_F - [(match_dup 1) - (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_maxmin_operand")] - SVE_COND_FP_MAXMIN_PUBLIC) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] - UNSPEC_SEL))] - "TARGET_SVE" -) - ;; Predicated floating-point maximum/minimum. (define_insn "@aarch64_pred_<optab><mode>" [(set (match_operand:SVE_FULL_F 0 "register_operand") @@ -7197,7 +7193,7 @@ ;; ------------------------------------------------------------------------- ;; Four-element integer dot-product with accumulation. 
-(define_insn "<sur>dot_prod<vsi2qi>" +(define_insn "<sur>dot_prod<mode><vsi2qi>" [(set (match_operand:SVE_FULL_SDI 0 "register_operand") (plus:SVE_FULL_SDI (unspec:SVE_FULL_SDI @@ -7235,7 +7231,7 @@ } ) -(define_insn "@<sur>dot_prod<vsi2qi>" +(define_insn "@<sur>dot_prod<mode><vsi2qi>" [(set (match_operand:VNx4SI_ONLY 0 "register_operand") (plus:VNx4SI_ONLY (unspec:VNx4SI_ONLY @@ -7293,7 +7289,8 @@ rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode)); rtx diff = gen_reg_rtx (<VSI2QI>mode); emit_insn (gen_<su>abd<vsi2qi>3 (diff, operands[1], operands[2])); - emit_insn (gen_udot_prod<vsi2qi> (operands[0], diff, ones, operands[3])); + emit_insn (gen_udot_prod<mode><vsi2qi> (operands[0], diff, ones, + operands[3])); DONE; } ) diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 972b03a..8047f40 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -1266,18 +1266,28 @@ ;; - XAR ;; ------------------------------------------------------------------------- +;; Also allow the Advanced SIMD modes as the SVE2 XAR instruction +;; can handle more element sizes than the TARGET_SHA3 one from Advanced SIMD. +;; Don't allow the V2DImode use here unless !TARGET_SHA3 as the Advanced SIMD +;; version should be preferred when available as it is non-destructive on its +;; input. (define_insn "@aarch64_sve2_xar<mode>" - [(set (match_operand:SVE_FULL_I 0 "register_operand") - (rotatert:SVE_FULL_I - (xor:SVE_FULL_I - (match_operand:SVE_FULL_I 1 "register_operand") - (match_operand:SVE_FULL_I 2 "register_operand")) - (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm")))] - "TARGET_SVE2" - {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ] - [ w , %0 , w ; * ] xar\t%0.<Vetype>, %0.<Vetype>, %2.<Vetype>, #%3 - [ ?&w , w , w ; yes ] movprfx\t%0, %1\;xar\t%0.<Vetype>, %0.<Vetype>, %2.<Vetype>, #%3 + [(set (match_operand:SVE_ASIMD_FULL_I 0 "register_operand" "=w,?&w") + (rotate:SVE_ASIMD_FULL_I + (xor:SVE_ASIMD_FULL_I + (match_operand:SVE_ASIMD_FULL_I 1 "register_operand" "%0,w") + (match_operand:SVE_ASIMD_FULL_I 2 "register_operand" "w,w")) + (match_operand:SVE_ASIMD_FULL_I 3 "aarch64_simd_lshift_imm")))] + "TARGET_SVE2 && !(<MODE>mode == V2DImode && TARGET_SHA3)" + { + operands[3] + = GEN_INT (GET_MODE_UNIT_BITSIZE (<MODE>mode) + - INTVAL (unwrap_const_vec_duplicate (operands[3]))); + if (which_alternative == 0) + return "xar\t%Z0.<Vetype>, %Z0.<Vetype>, %Z2.<Vetype>, #%3"; + return "movprfx\t%Z0, %Z1\;xar\t%Z0.<Vetype>, %Z0.<Vetype>, %Z2.<Vetype>, #%3"; } + [(set_attr "movprfx" "*,yes")] ) ;; ------------------------------------------------------------------------- @@ -2021,7 +2031,7 @@ ) ;; Two-way dot-product. -(define_insn "@aarch64_sve_<sur>dotvnx4sivnx8hi" +(define_insn "<sur>dot_prodvnx4sivnx8hi" [(set (match_operand:VNx4SI 0 "register_operand") (plus:VNx4SI (unspec:VNx4SI @@ -2467,6 +2477,43 @@ [(set_attr "movprfx" "yes")] ) +;; ------------------------------------------------------------------------- +;; -- [FP] Absolute maximum and minimum +;; ------------------------------------------------------------------------- +;; Includes: +;; - FAMAX +;; - FAMIN +;; ------------------------------------------------------------------------- +;; Predicated floating-point absolute maximum and minimum.
+(define_insn_and_rewrite "*aarch64_pred_faminmax_fused" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand:<VPRED> 1 "register_operand") + (match_operand:SI 4 "aarch64_sve_gp_strictness") + (unspec:SVE_FULL_F + [(match_operand 5) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 2 "register_operand")] + UNSPEC_COND_FABS) + (unspec:SVE_FULL_F + [(match_operand 6) + (const_int SVE_RELAXED_GP) + (match_operand:SVE_FULL_F 3 "register_operand")] + UNSPEC_COND_FABS)] + SVE_COND_SMAXMIN))] + "TARGET_SVE_FAMINMAX" + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] + [ w , Upl , %0 , w ; * ] <faminmax_cond_uns_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %2\;<faminmax_cond_uns_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> + } + "&& (!rtx_equal_p (operands[1], operands[5]) + || !rtx_equal_p (operands[1], operands[6]))" + { + operands[5] = copy_rtx (operands[1]); + operands[6] = copy_rtx (operands[1]); + } +) + ;; ========================================================================= ;; == Complex arithmetic ;; ========================================================================= diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index 4fce0c5..4423a99f 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88,thunderxt88p1,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,ampere1b,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,oryon1,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexa725,cortexx2,cortexx3,cortexx4,cortexx925,neoversen2,cobalt100,neoversen3,neoversev2,grace,neoversev3,neoversev3ae,demeter,generic,generic_armv8_a,generic_armv9_a" + "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88,thunderxt88p1,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,ampere1b,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,neoversen1,ares,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,fujitsu_monaka,tsv110,thunderx3t110,neoversev1,zeus,neoverse512tvb,saphira,oryon1,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa520,cortexa710,cortexa715,cortexa720,cortexa725,cortexx2,cortexx3,cortexx4,cortexx925,neoversen2,cobalt100,neoversen3,neoversev2,grace,neoversev3,neoversev3ae,demeter,generic,generic_armv8_a,generic_armv9_a" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 40dacfc..9347e06 100644 --- 
a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -22,6 +22,7 @@ #define INCLUDE_STRING #define INCLUDE_ALGORITHM +#define INCLUDE_MEMORY #define INCLUDE_VECTOR #include "config.h" #include "system.h" @@ -59,6 +60,7 @@ #include "opts.h" #include "gimplify.h" #include "dwarf2.h" +#include "dwarf2out.h" #include "gimple-iterator.h" #include "tree-vectorizer.h" #include "aarch64-cost-tables.h" @@ -127,10 +129,19 @@ constexpr auto AARCH64_STATE_SHARED = 1U << 0; constexpr auto AARCH64_STATE_IN = 1U << 1; constexpr auto AARCH64_STATE_OUT = 1U << 2; +/* Enum to distinguish which type of check is to be done in + aarch64_simd_valid_imm. */ +enum simd_immediate_check { + AARCH64_CHECK_MOV, + AARCH64_CHECK_ORR, + AARCH64_CHECK_AND, + AARCH64_CHECK_XOR +}; + /* Information about a legitimate vector immediate operand. */ struct simd_immediate_info { - enum insn_type { MOV, MVN, INDEX, PTRUE }; + enum insn_type { MOV, MVN, INDEX, PTRUE, SVE_MOV }; enum modifier_type { LSL, MSL }; simd_immediate_info () {} @@ -415,6 +426,7 @@ static const struct aarch64_flag_desc aarch64_tuning_flags[] = #include "tuning_models/neoversev3.h" #include "tuning_models/neoversev3ae.h" #include "tuning_models/a64fx.h" +#include "tuning_models/fujitsu_monaka.h" /* Support for fine-grained override of the tuning structures. */ struct aarch64_tuning_override_function @@ -593,14 +605,10 @@ aarch64_lookup_shared_state_flags (tree attrs, const char *state_name) { for (tree attr = attrs; attr; attr = TREE_CHAIN (attr)) { - if (!cxx11_attribute_p (attr)) - continue; - - auto ns = IDENTIFIER_POINTER (TREE_PURPOSE (TREE_PURPOSE (attr))); - if (strcmp (ns, "arm") != 0) + if (!is_attribute_namespace_p ("arm", attr)) continue; - auto attr_name = IDENTIFIER_POINTER (TREE_VALUE (TREE_PURPOSE (attr))); + auto attr_name = IDENTIFIER_POINTER (get_attribute_name (attr)); auto flags = aarch64_attribute_shared_state_flags (attr_name); if (!flags) continue; @@ -1083,7 +1091,7 @@ pure_scalable_type_info::analyze_array (const_tree type) /* An array of unknown, flexible or variable length will be passed and returned by reference whatever we do. */ - tree nelts_minus_one = array_type_nelts (type); + tree nelts_minus_one = array_type_nelts_minus_one (type); if (!tree_fits_uhwi_p (nelts_minus_one)) return DOESNT_MATTER; @@ -1453,6 +1461,32 @@ aarch64_dwarf_frame_reg_mode (int regno) return default_dwarf_frame_reg_mode (regno); } +/* Implement TARGET_OUTPUT_CFI_DIRECTIVE. */ +static bool +aarch64_output_cfi_directive (FILE *f, dw_cfi_ref cfi) +{ + bool found = false; + if (cfi->dw_cfi_opc == DW_CFA_AARCH64_negate_ra_state) + { + fprintf (f, "\t.cfi_negate_ra_state\n"); + found = true; + } + return found; +} + +/* Implement TARGET_DW_CFI_OPRND1_DESC. */ +static bool +aarch64_dw_cfi_oprnd1_desc (dwarf_call_frame_info cfi_opc, + dw_cfi_oprnd_type &oprnd_type) +{ + if (cfi_opc == DW_CFA_AARCH64_negate_ra_state) + { + oprnd_type = dw_cfi_oprnd_unused; + return true; + } + return false; +} + /* If X is a CONST_DOUBLE, return its bit representation as a constant integer, otherwise return X unmodified. */ static rtx @@ -1909,6 +1943,46 @@ aarch64_sve_int_mode (machine_mode mode) return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require (); } +/* Look for a vector mode with the same classification as VEC_MODE, + but with each group of FACTOR elements coalesced into a single element. + In other words, look for a mode in which the elements are FACTOR times + larger and in which the number of elements is FACTOR times smaller. 
+ + Return the mode found, if one exists. */ + +static opt_machine_mode +aarch64_coalesce_units (machine_mode vec_mode, unsigned int factor) +{ + auto elt_bits = vector_element_size (GET_MODE_BITSIZE (vec_mode), + GET_MODE_NUNITS (vec_mode)); + auto vec_flags = aarch64_classify_vector_mode (vec_mode); + if (vec_flags & VEC_SVE_PRED) + { + if (known_eq (GET_MODE_SIZE (vec_mode), BYTES_PER_SVE_PRED)) + return aarch64_sve_pred_mode (elt_bits * factor); + return {}; + } + + scalar_mode new_elt_mode; + if (!int_mode_for_size (elt_bits * factor, false).exists (&new_elt_mode)) + return {}; + + if (vec_flags == VEC_ADVSIMD) + { + auto mode = aarch64_simd_container_mode (new_elt_mode, + GET_MODE_BITSIZE (vec_mode)); + if (mode != word_mode) + return mode; + } + else if (vec_flags & VEC_SVE_DATA) + { + poly_uint64 new_nunits; + if (multiple_p (GET_MODE_NUNITS (vec_mode), factor, &new_nunits)) + return aarch64_sve_data_mode (new_elt_mode, new_nunits); + } + return {}; +} + /* Implement TARGET_VECTORIZE_RELATED_MODE. */ static opt_machine_mode @@ -3557,6 +3631,27 @@ aarch64_ptrue_reg (machine_mode mode) return gen_lowpart (mode, reg); } +/* Return an all-true (restricted to the leading VL bits) predicate register of + mode MODE. */ + +rtx +aarch64_ptrue_reg (machine_mode mode, unsigned int vl) +{ + gcc_assert (aarch64_sve_pred_mode_p (mode)); + + rtx_vector_builder builder (VNx16BImode, vl, 2); + + for (unsigned i = 0; i < vl; i++) + builder.quick_push (CONST1_RTX (BImode)); + + for (unsigned i = 0; i < vl; i++) + builder.quick_push (CONST0_RTX (BImode)); + + rtx const_vec = builder.build (); + rtx reg = force_reg (VNx16BImode, const_vec); + return gen_lowpart (mode, reg); +} + /* Return an all-false predicate register of mode MODE. */ rtx @@ -5593,7 +5688,7 @@ aarch64_expand_sve_const_vector (rtx target, rtx src) builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci)); } rtx vq_src = builder.build (); - if (aarch64_simd_valid_immediate (vq_src, NULL)) + if (aarch64_simd_valid_mov_imm (vq_src)) { vq_src = force_reg (vq_mode, vq_src); return aarch64_expand_sve_dupq (target, mode, vq_src); @@ -6105,8 +6200,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm) } } - if (GET_CODE (imm) == HIGH - || aarch64_simd_valid_immediate (imm, NULL)) + if (GET_CODE (imm) == HIGH || aarch64_simd_valid_mov_imm (imm)) { emit_insn (gen_rtx_SET (dest, imm)); return; @@ -9012,7 +9106,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) { bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno); machine_mode mode = aarch64_reg_save_mode (regno); - + rtx reg = gen_rtx_REG (mode, regno); poly_int64 offset = frame.reg_offset[regno]; if (frame_pointer_needed) @@ -9615,7 +9709,7 @@ aarch64_expand_prologue (void) default: gcc_unreachable (); } - add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx); + add_reg_note (insn, REG_CFA_NEGATE_RA_STATE, const0_rtx); RTX_FRAME_RELATED_P (insn) = 1; } @@ -10036,7 +10130,7 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall) default: gcc_unreachable (); } - add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx); + add_reg_note (insn, REG_CFA_NEGATE_RA_STATE, const0_rtx); RTX_FRAME_RELATED_P (insn) = 1; } @@ -10197,7 +10291,7 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x) /* Implement TARGET_CASE_VALUES_THRESHOLD. The expansion for a table switch is quite expensive due to the number of instructions, the table lookup and hard to predict indirect jump. 
- When optimizing for speed, and -O3 enabled, use the per-core tuning if + When optimizing for speed, and -O3 enabled, use the per-core tuning if set, otherwise use tables for >= 11 cases as a tradeoff between size and performance. When optimizing for size, use 8 for smallest codesize. */ @@ -11048,7 +11142,7 @@ aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode) vmode = aarch64_simd_container_mode (imode, width); rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival); - return aarch64_simd_valid_immediate (v_op, NULL); + return aarch64_simd_valid_mov_imm (v_op); } @@ -12829,7 +12923,7 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x, unsigned int vec_flags = aarch64_classify_vector_mode (mode); if (reg_class_subset_p (rclass, FP_REGS) && !((REG_P (x) && HARD_REGISTER_P (x)) - || aarch64_simd_valid_immediate (x, NULL)) + || aarch64_simd_valid_mov_imm (x)) && mode != VNx16QImode && (vec_flags & VEC_SVE_DATA) && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN)) @@ -14193,7 +14287,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, /* BFM. */ if (speed) *cost += extra_cost->alu.bfi; - *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed); + *cost += rtx_cost (op1, VOIDmode, code, 1, speed); } return true; @@ -14573,8 +14667,7 @@ cost_minus: *cost += extra_cost->alu.extend_arith; op1 = aarch64_strip_extend (op1, true); - *cost += rtx_cost (op1, VOIDmode, - (enum rtx_code) GET_CODE (op1), 0, speed); + *cost += rtx_cost (op1, VOIDmode, GET_CODE (op1), 0, speed); return true; } @@ -14585,9 +14678,7 @@ cost_minus: || aarch64_shift_p (GET_CODE (new_op1))) && code != COMPARE) { - *cost += aarch64_rtx_mult_cost (new_op1, MULT, - (enum rtx_code) code, - speed); + *cost += aarch64_rtx_mult_cost (new_op1, MULT, code, speed); return true; } @@ -14688,8 +14779,7 @@ cost_plus: *cost += extra_cost->alu.extend_arith; op0 = aarch64_strip_extend (op0, true); - *cost += rtx_cost (op0, VOIDmode, - (enum rtx_code) GET_CODE (op0), 0, speed); + *cost += rtx_cost (op0, VOIDmode, GET_CODE (op0), 0, speed); return true; } @@ -14803,8 +14893,7 @@ cost_plus: && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1, XEXP (op0, 1))) { - *cost += rtx_cost (XEXP (op0, 0), int_mode, - (enum rtx_code) code, 0, speed); + *cost += rtx_cost (XEXP (op0, 0), int_mode, code, 0, speed); if (speed) *cost += extra_cost->alu.bfx; @@ -14814,8 +14903,7 @@ cost_plus: { /* We possibly get the immediate for free, this is not modelled. */ - *cost += rtx_cost (op0, int_mode, - (enum rtx_code) code, 0, speed); + *cost += rtx_cost (op0, int_mode, code, 0, speed); if (speed) *cost += extra_cost->alu.logical; @@ -14850,10 +14938,8 @@ cost_plus: } /* In both cases we want to cost both operands. */ - *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code, - 0, speed); - *cost += rtx_cost (op1, int_mode, (enum rtx_code) code, - 1, speed); + *cost += rtx_cost (new_op0, int_mode, code, 0, speed); + *cost += rtx_cost (op1, int_mode, code, 1, speed); return true; } @@ -14874,7 +14960,7 @@ cost_plus: /* MVN-shifted-reg. 
*/ if (op0 != x) { - *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed); + *cost += rtx_cost (op0, mode, code, 0, speed); if (speed) *cost += extra_cost->alu.log_shift; @@ -14890,7 +14976,7 @@ cost_plus: rtx newop1 = XEXP (op0, 1); rtx op0_stripped = aarch64_strip_shift (newop0); - *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed); + *cost += rtx_cost (newop1, mode, code, 1, speed); *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed); if (speed) @@ -15056,7 +15142,7 @@ cost_plus: && known_eq (INTVAL (XEXP (op1, 1)), GET_MODE_BITSIZE (mode) - 1)) { - *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed); + *cost += rtx_cost (op0, mode, code, 0, speed); /* We already demanded XEXP (op1, 0) to be REG_P, so don't recurse into it. */ return true; @@ -15119,7 +15205,7 @@ cost_plus: /* We can trust that the immediates used will be correct (there are no by-register forms), so we need only cost op0. */ - *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed); + *cost += rtx_cost (XEXP (x, 0), VOIDmode, code, 0, speed); return true; case MULT: @@ -15309,12 +15395,11 @@ cost_plus: && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0) || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0)) { - *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code, - 0, speed); + *cost += rtx_cost (XEXP (x, 0), VOIDmode, code, 0, speed); return true; } - *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed); + *cost += rtx_cost (x, VOIDmode, code, 0, speed); return true; case ABS: @@ -15415,7 +15500,7 @@ cost_plus: case CONST_VECTOR: { /* Load using MOVI/MVNI. */ - if (aarch64_simd_valid_immediate (x, NULL)) + if (aarch64_simd_valid_mov_imm (x)) *cost = extra_cost->vect.movi; else /* Load using constant pool. */ *cost = extra_cost->ldst.load; @@ -15501,6 +15586,12 @@ aarch64_register_move_cost (machine_mode mode, reg_class_contents[FFR_REGS])) return 80; + /* Moves to/from sysregs are expensive, and must go via GPR. */ + if (from == MOVEABLE_SYSREGS) + return 80 + aarch64_register_move_cost (mode, GENERAL_REGS, to); + if (to == MOVEABLE_SYSREGS) + return 80 + aarch64_register_move_cost (mode, from, GENERAL_REGS); + /* Moving between GPR and stack cost is the same as GP2GP. */ if ((from == GENERAL_REGS && to == STACK_REG) || (to == GENERAL_REGS && from == STACK_REG)) @@ -15928,6 +16019,44 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den) return true; } +/* Emit an optimized sequence to perform a vector rotate + of REG by the vector constant amount AMNT_VEC and place the result + in DST. Return true iff successful. */ + +bool +aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt_vec) +{ + rtx amnt = unwrap_const_vec_duplicate (amnt_vec); + gcc_assert (CONST_INT_P (amnt)); + HOST_WIDE_INT rotamnt = UINTVAL (amnt); + machine_mode mode = GET_MODE (reg); + /* Rotates by half the element width map down to REV* instructions and should + always be preferred when possible. */ + if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2 + && expand_rotate_as_vec_perm (mode, dst, reg, amnt)) + return true; + /* 64 and 128-bit vector modes can use the XAR instruction + when available. 
*/ + else if (can_create_pseudo_p () + && ((TARGET_SHA3 && mode == V2DImode) + || (TARGET_SVE2 + && (known_eq (GET_MODE_SIZE (mode), 8) + || known_eq (GET_MODE_SIZE (mode), 16))))) + { + rtx zeroes = aarch64_gen_shareable_zero (mode); + rtx xar_op + = gen_rtx_ROTATE (mode, gen_rtx_XOR (mode, reg, zeroes), + amnt_vec); + emit_set_insn (dst, xar_op); + return true; + } + /* If none of the above, try to expand rotates by any byte amount as + permutes. */ + else if (expand_rotate_as_vec_perm (mode, dst, reg, amnt)) + return true; + return false; +} + /* Return the number of instructions that can be issued per cycle. */ static int aarch64_sched_issue_rate (void) @@ -16214,7 +16343,7 @@ public: private: void record_potential_advsimd_unrolling (loop_vec_info); void analyze_loop_vinfo (loop_vec_info); - void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info, + void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info, slp_tree, aarch64_vec_op_count *); fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *, fractional_cost, unsigned int, @@ -16531,11 +16660,13 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, } } -/* Return true if an access of kind KIND for STMT_INFO represents one - vector of an LD[234] or ST[234] operation. Return the total number of - vectors (2, 3 or 4) if so, otherwise return a value outside that range. */ +/* Return true if an access of kind KIND for STMT_INFO (or NODE if SLP) + represents one vector of an LD[234] or ST[234] operation. Return the total + number of vectors (2, 3 or 4) if so, otherwise return a value outside that + range. */ static int -aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info) +aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info, + slp_tree node) { if ((kind == vector_load || kind == unaligned_load @@ -16545,7 +16676,7 @@ aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info) { stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); if (stmt_info - && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES) + && vect_mem_access_type (stmt_info, node) == VMAT_LOAD_STORE_LANES) return DR_GROUP_SIZE (stmt_info); } return 0; @@ -16783,14 +16914,15 @@ aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, } /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost - for the vectorized form of STMT_INFO, which has cost kind KIND and which - when vectorized would operate on vector type VECTYPE. Try to subdivide - the target-independent categorization provided by KIND to get a more - accurate cost. WHERE specifies where the cost associated with KIND - occurs. */ + for the vectorized form of STMT_INFO possibly using SLP node NODE, which has + cost kind KIND and which when vectorized would operate on vector type + VECTYPE. Try to subdivide the target-independent categorization provided by + KIND to get a more accurate cost. WHERE specifies where the cost associated + with KIND occurs. */ static fractional_cost aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, + stmt_vec_info stmt_info, slp_tree node, + tree vectype, enum vect_cost_model_location where, fractional_cost stmt_cost) { @@ -16816,10 +16948,11 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, cost by the number of elements in the vector. 
*/ if (kind == scalar_load && sve_costs - && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER) + && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) { unsigned int nunits = vect_nunits_for_cost (vectype); - if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64) + /* Test for VNx2 modes, which have 64-bit containers. */ + if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype)), aarch64_sve_vg)) return { sve_costs->gather_load_x64_cost, nunits }; return { sve_costs->gather_load_x32_cost, nunits }; } @@ -16828,7 +16961,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind, in a scatter operation. */ if (kind == scalar_store && sve_costs - && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER) + && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) return sve_costs->scatter_store_elt_cost; /* Detect cases in which vec_to_scalar represents an in-loop reduction. */ @@ -16952,7 +17085,7 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind, cost of any embedded operations. */ static fractional_cost aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, tree vectype, + stmt_vec_info stmt_info, slp_tree node, tree vectype, unsigned vec_flags, fractional_cost stmt_cost) { if (vectype) @@ -16961,7 +17094,7 @@ aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind, /* Detect cases in which a vector load or store represents an LD[234] or ST[234] instruction. */ - switch (aarch64_ld234_st234_vectors (kind, stmt_info)) + switch (aarch64_ld234_st234_vectors (kind, stmt_info, node)) { case 2: stmt_cost += simd_costs->ld2_st2_permute_cost; @@ -17033,7 +17166,7 @@ aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info) information relating to the vector operation in OPS. */ void aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, + stmt_vec_info stmt_info, slp_tree node, aarch64_vec_op_count *ops) { const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info (); @@ -17131,7 +17264,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, /* Add any extra overhead associated with LD[234] and ST[234] operations. */ if (simd_issue) - switch (aarch64_ld234_st234_vectors (kind, stmt_info)) + switch (aarch64_ld234_st234_vectors (kind, stmt_info, node)) { case 2: ops->general_ops += simd_issue->ld2_st2_general_ops * count; @@ -17149,7 +17282,7 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind, /* Add any overhead associated with gather loads and scatter stores. 
*/ if (sve_issue && (kind == scalar_load || kind == scalar_store) - && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER) + && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) { unsigned int pairs = CEIL (count, 2); ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs; @@ -17254,7 +17387,7 @@ aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind, unsigned aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, - stmt_vec_info stmt_info, slp_tree, + stmt_vec_info stmt_info, slp_tree node, tree vectype, int misalign, vect_cost_model_location where) { @@ -17298,18 +17431,21 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, if (vectype && m_vec_flags) stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind, - stmt_info, vectype, - where, stmt_cost); + stmt_info, node, + vectype, where, + stmt_cost); /* Check if we've seen an SVE gather/scatter operation and which size. */ if (kind == scalar_load && aarch64_sve_mode_p (TYPE_MODE (vectype)) - && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER) + && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER) { const sve_vec_cost *sve_costs = aarch64_tune_params.vec_costs->sve; if (sve_costs) { - if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64) + /* Test for VNx2 modes, which have 64-bit containers. */ + if (known_eq (GET_MODE_NUNITS (TYPE_MODE (vectype)), + aarch64_sve_vg)) m_sve_gather_scatter_init_cost += sve_costs->gather_load_x64_init_cost; else @@ -17351,7 +17487,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, { /* Account for any extra "embedded" costs that apply additively to the base cost calculated above. */ - stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info, + stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info, node, vectype, m_vec_flags, stmt_cost); /* If we're recording a nonzero vector loop body cost for the @@ -17362,7 +17498,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p) && stmt_cost != 0) for (auto &ops : m_ops) - count_ops (count, kind, stmt_info, &ops); + count_ops (count, kind, stmt_info, node, &ops); /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic, estimate the number of statements in the unrolled Advanced SIMD @@ -17565,6 +17701,19 @@ adjust_body_cost (loop_vec_info loop_vinfo, dump_printf_loc (MSG_NOTE, vect_location, "Original vector body cost = %d\n", body_cost); + /* If we know we have a single partial vector iteration, cap the VF + to the number of scalar iterations for costing purposes. */ + if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) + { + auto niters = LOOP_VINFO_INT_NITERS (loop_vinfo); + if (niters < estimated_vf && dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Scalar loop iterates at most %wd times. 
Capping VF " + " from %d to %wd\n", niters, estimated_vf, niters); + + estimated_vf = MIN (estimated_vf, niters); + } + fractional_cost scalar_cycles_per_iter = scalar_ops.min_cycles_per_iter () * estimated_vf; @@ -19000,9 +19149,9 @@ static char * aarch64_offload_options (void) { if (TARGET_ILP32) - return xstrdup ("-foffload-abi=ilp32"); + return xstrdup ("-foffload-abi=ilp32 -foffload-abi-host-opts=-mabi=ilp32"); else - return xstrdup ("-foffload-abi=lp64"); + return xstrdup ("-foffload-abi=lp64 -foffload-abi-host-opts=-mabi=lp64"); } static struct machine_function * @@ -20353,6 +20502,10 @@ dispatch_function_versions (tree dispatch_decl, tree init_fn_id = get_identifier ("__init_cpu_features_resolver"); tree init_fn_decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, init_fn_id, init_fn_type); + DECL_EXTERNAL (init_fn_decl) = 1; + TREE_PUBLIC (init_fn_decl) = 1; + DECL_VISIBILITY (init_fn_decl) = VISIBILITY_HIDDEN; + DECL_VISIBILITY_SPECIFIED (init_fn_decl) = 1; tree arg1 = DECL_ARGUMENTS (dispatch_decl); tree arg2 = TREE_CHAIN (arg1); ifunc_cpu_init_stmt = gimple_build_call (init_fn_decl, 2, arg1, arg2); @@ -20372,6 +20525,9 @@ dispatch_function_versions (tree dispatch_decl, get_identifier ("__aarch64_cpu_features"), global_type); DECL_EXTERNAL (global_var) = 1; + TREE_PUBLIC (global_var) = 1; + DECL_VISIBILITY (global_var) = VISIBILITY_HIDDEN; + DECL_VISIBILITY_SPECIFIED (global_var) = 1; tree mask_var = create_tmp_var (long_long_unsigned_type_node); tree component_expr = build3 (COMPONENT_REF, long_long_unsigned_type_node, @@ -21075,7 +21231,7 @@ aarch64_legitimate_constant_p (machine_mode mode, rtx x) ??? It would be possible (but complex) to handle rematerialization of other constants via secondary reloads. */ if (!GET_MODE_SIZE (mode).is_constant ()) - return aarch64_simd_valid_immediate (x, NULL); + return aarch64_simd_valid_mov_imm (x); /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at least be forced to memory and loaded from there. */ @@ -22467,6 +22623,10 @@ aarch64_mangle_type (const_tree type) return "Dh"; } + /* Modal 8 bit floating point types. */ + if (TYPE_MAIN_VARIANT (type) == aarch64_mfp8_type_node) + return "u6__mfp8"; + /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for builtin types. */ if (TYPE_NAME (type) != NULL) @@ -22481,6 +22641,29 @@ aarch64_mangle_type (const_tree type) return NULL; } +/* Implement TARGET_INVALID_CONVERSION. */ + +static const char * +aarch64_invalid_conversion (const_tree fromtype, const_tree totype) +{ + /* Do not allow conversions to/from FP8. But do allow conversions between + volatile and const variants of __mfp8. */ + bool fromtype_is_fp8 + = (TYPE_MAIN_VARIANT (fromtype) == aarch64_mfp8_type_node); + bool totype_is_fp8 = (TYPE_MAIN_VARIANT (totype) == aarch64_mfp8_type_node); + + if (fromtype_is_fp8 && totype_is_fp8) + return NULL; + + if (fromtype_is_fp8) + return N_ ("invalid conversion from type %<mfloat8_t%>"); + if (totype_is_fp8) + return N_ ("invalid conversion to type %<mfloat8_t%>"); + + /* Conversion allowed. */ + return NULL; +} + /* Implement TARGET_VERIFY_TYPE_CONTEXT. */ static bool @@ -22788,34 +22971,32 @@ aarch64_advsimd_valid_immediate_hs (unsigned int val32, return false; } -/* Return true if replicating VAL64 is a valid immediate for the +/* Return true if replicating VAL64 with mode MODE is a valid immediate for the Advanced SIMD operation described by WHICH. If INFO is nonnull, use it to describe valid immediates. 
*/ static bool aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64, + scalar_int_mode mode, simd_immediate_info *info, enum simd_immediate_check which) { unsigned int val32 = val64 & 0xffffffff; - unsigned int val16 = val64 & 0xffff; unsigned int val8 = val64 & 0xff; - if (val32 == (val64 >> 32)) + if (mode != DImode) { - if ((which & AARCH64_CHECK_ORR) != 0 + if ((which == AARCH64_CHECK_MOV || which == AARCH64_CHECK_ORR) && aarch64_advsimd_valid_immediate_hs (val32, info, which, simd_immediate_info::MOV)) return true; - if ((which & AARCH64_CHECK_BIC) != 0 + if ((which == AARCH64_CHECK_MOV || which == AARCH64_CHECK_AND) && aarch64_advsimd_valid_immediate_hs (~val32, info, which, simd_immediate_info::MVN)) return true; /* Try using a replicated byte. */ - if (which == AARCH64_CHECK_MOV - && val16 == (val32 >> 16) - && val8 == (val16 >> 8)) + if (which == AARCH64_CHECK_MOV && mode == QImode) { if (info) *info = simd_immediate_info (QImode, val8); @@ -22843,47 +23024,41 @@ aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64, return false; } -/* Return true if replicating VAL64 gives a valid immediate for an SVE MOV - instruction. If INFO is nonnull, use it to describe valid immediates. */ +/* Return true if replicating IVAL with MODE gives a valid immediate for an SVE + MOV instruction. If INFO is nonnull, use it to describe valid + immediates. */ static bool -aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64, - simd_immediate_info *info) +aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT ival, scalar_int_mode mode, + simd_immediate_info *info, + enum simd_immediate_check which) { - scalar_int_mode mode = DImode; - unsigned int val32 = val64 & 0xffffffff; - if (val32 == (val64 >> 32)) + HOST_WIDE_INT val = trunc_int_for_mode (ival, mode); + + if (which == AARCH64_CHECK_MOV) { - mode = SImode; - unsigned int val16 = val32 & 0xffff; - if (val16 == (val32 >> 16)) + if (IN_RANGE (val, -0x80, 0x7f)) { - mode = HImode; - unsigned int val8 = val16 & 0xff; - if (val8 == (val16 >> 8)) - mode = QImode; + /* DUP with no shift. */ + if (info) + *info = simd_immediate_info (mode, val, + simd_immediate_info::SVE_MOV); + return true; + } + if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00)) + { + /* DUP with LSL #8. */ + if (info) + *info = simd_immediate_info (mode, val, + simd_immediate_info::SVE_MOV); + return true; } } - HOST_WIDE_INT val = trunc_int_for_mode (val64, mode); - if (IN_RANGE (val, -0x80, 0x7f)) - { - /* DUP with no shift. */ - if (info) - *info = simd_immediate_info (mode, val); - return true; - } - if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00)) - { - /* DUP with LSL #8. */ - if (info) - *info = simd_immediate_info (mode, val); - return true; - } - if (aarch64_bitmask_imm (val64, mode)) + if (aarch64_bitmask_imm (ival, mode)) { /* DUPM. */ if (info) - *info = simd_immediate_info (mode, val); + *info = simd_immediate_info (mode, val, simd_immediate_info::SVE_MOV); return true; } return false; @@ -22960,12 +23135,97 @@ aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info) return false; } +/* We can only represent floating point constants which will fit in + "quarter-precision" values. These values are characterised by + a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given + by: + + (-1)^s * (n/16) * 2^r + + Where: + 's' is the sign bit. + 'n' is an integer in the range 16 <= n <= 31. + 'r' is an integer in the range -3 <= r <= 4. 
+ + Return true iff R represents a value encodable into an AArch64 floating point + move instruction as an immediate. Otherwise false. */ + +static bool +aarch64_real_float_const_representable_p (REAL_VALUE_TYPE r) +{ + /* This represents our current view of how many bits + make up the mantissa. */ + int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1; + int exponent; + unsigned HOST_WIDE_INT mantissa, mask; + REAL_VALUE_TYPE m; + bool fail = false; + + /* We cannot represent infinities, NaNs or +/-zero. We won't + know if we have +zero until we analyse the mantissa, but we + can reject the other invalid values. */ + if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r) + || REAL_VALUE_MINUS_ZERO (r)) + return false; + + /* Extract exponent. */ + r = real_value_abs (&r); + exponent = REAL_EXP (&r); + + /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the + highest (sign) bit, with a fixed binary point at bit point_pos. + m1 holds the low part of the mantissa, m2 the high part. + WARNING: If we ever have a representation using more than 2 * H_W_I - 1 + bits for the mantissa, this can fail (low bits will be lost). */ + real_ldexp (&m, &r, point_pos - exponent); + wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2); + + /* If the low part of the mantissa has bits set we cannot represent + the value. */ + if (fail || w.ulow () != 0) + return false; + + /* We have rejected the lower HOST_WIDE_INT, so update our + understanding of how many bits lie in the mantissa and + look only at the high HOST_WIDE_INT. */ + mantissa = w.elt (1); + point_pos -= HOST_BITS_PER_WIDE_INT; + + /* We can only represent values with a mantissa of the form 1.xxxx. */ + mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1; + if ((mantissa & mask) != 0) + return false; + + /* Having filtered unrepresentable values, we may now remove all + but the highest 5 bits. */ + mantissa >>= point_pos - 5; + + /* We cannot represent the value 0.0, so reject it. This is handled + elsewhere. */ + if (mantissa == 0) + return false; + + /* Then, as bit 4 is always set, we can mask it off, leaving + the mantissa in the range [0, 15]. */ + mantissa &= ~(1 << 4); + gcc_assert (mantissa <= 15); + + /* GCC internally does not use IEEE754-like encoding (where normalized + significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc). + Our mantissa values are shifted 4 places to the left relative to + normalized IEEE754 so we must modify the exponent returned by REAL_EXP + by 5 places to correct for GCC's representation. */ + exponent = 5 - exponent; + + return (exponent >= 0 && exponent <= 7); +} + /* Return true if OP is a valid SIMD immediate for the operation described by WHICH. If INFO is nonnull, use it to describe valid immediates.
*/ -bool -aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, - enum simd_immediate_check which) +static bool +aarch64_simd_valid_imm (rtx op, simd_immediate_info *info, + enum simd_immediate_check which) { machine_mode mode = GET_MODE (op); unsigned int vec_flags = aarch64_classify_vector_mode (mode); @@ -22987,7 +23247,8 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, if (CONST_VECTOR_P (op) && CONST_VECTOR_DUPLICATE_P (op)) n_elts = CONST_VECTOR_NPATTERNS (op); - else if ((vec_flags & VEC_SVE_DATA) + else if (which == AARCH64_CHECK_MOV + && TARGET_SVE && const_vec_series_p (op, &base, &step)) { gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); @@ -23012,20 +23273,6 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, else return false; - scalar_float_mode elt_float_mode; - if (n_elts == 1 - && is_a <scalar_float_mode> (elt_mode, &elt_float_mode)) - { - rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0); - if (aarch64_float_const_zero_rtx_p (elt) - || aarch64_float_const_representable_p (elt)) - { - if (info) - *info = simd_immediate_info (elt_float_mode, elt); - return true; - } - } - /* If all elements in an SVE vector have the same value, we have a free choice between using the element mode and using the container mode. Using the element mode means that unused parts of the vector are @@ -23087,10 +23334,90 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes] << (i * BITS_PER_UNIT)); + /* Try encoding the integer immediate as a floating point value if it's an + exact value. */ + scalar_float_mode fmode = DFmode; + scalar_int_mode imode = DImode; + unsigned HOST_WIDE_INT ival = val64; + unsigned int val32 = val64 & 0xffffffff; + if (val32 == (val64 >> 32)) + { + fmode = SFmode; + imode = SImode; + ival = val32; + unsigned int val16 = val32 & 0xffff; + if (val16 == (val32 >> 16)) + { + fmode = HFmode; + imode = HImode; + ival = val16; + unsigned int val8 = val16 & 0xff; + if (val8 == (val16 >> 8)) + { + imode = QImode; + ival = val8; + } + } + } + + if (which == AARCH64_CHECK_MOV + && imode != QImode + && (imode != HImode || TARGET_FP_F16INST)) + { + long int as_long_ints[2]; + as_long_ints[0] = ival & 0xFFFFFFFF; + as_long_ints[1] = (ival >> 32) & 0xFFFFFFFF; + + REAL_VALUE_TYPE r; + real_from_target (&r, as_long_ints, fmode); + if (aarch64_real_float_const_representable_p (r)) + { + if (info) + { + rtx float_val = const_double_from_real_value (r, fmode); + *info = simd_immediate_info (fmode, float_val); + } + return true; + } + } + if (vec_flags & VEC_SVE_DATA) - return aarch64_sve_valid_immediate (val64, info); - else - return aarch64_advsimd_valid_immediate (val64, info, which); + return aarch64_sve_valid_immediate (ival, imode, info, which); + + if (aarch64_advsimd_valid_immediate (val64, imode, info, which)) + return true; + + if (TARGET_SVE) + return aarch64_sve_valid_immediate (ival, imode, info, which); + return false; +} + +/* Return true if OP is a valid SIMD move immediate for SVE or AdvSIMD. */ +bool +aarch64_simd_valid_mov_imm (rtx op) +{ + return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_MOV); +} + +/* Return true if OP is a valid SIMD orr immediate for SVE or AdvSIMD. */ +bool +aarch64_simd_valid_orr_imm (rtx op) +{ + return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_ORR); +} + +/* Return true if OP is a valid SIMD and immediate for SVE or AdvSIMD. 
*/ +bool +aarch64_simd_valid_and_imm (rtx op) +{ + return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_AND); +} + +/* Return true if OP is a valid SIMD xor immediate for SVE. */ +bool +aarch64_simd_valid_xor_imm (rtx op) +{ + return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_XOR); } /* Check whether X is a VEC_SERIES-like constant that starts at 0 and @@ -23156,7 +23483,7 @@ aarch64_mov_operand_p (rtx x, machine_mode mode) && GET_MODE (x) != VNx16BImode) return false; - return aarch64_simd_valid_immediate (x, NULL); + return aarch64_simd_valid_mov_imm (x); } /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */ @@ -23257,7 +23584,7 @@ aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode) vmode = aarch64_simd_container_mode (mode, 64); rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op)); - return aarch64_simd_valid_immediate (op_v, NULL); + return aarch64_simd_valid_mov_imm (op_v); } /* Construct and return a PARALLEL RTX vector with elements numbering the @@ -23737,7 +24064,7 @@ aarch64_simd_make_constant (rtx vals) gcc_unreachable (); if (const_vec != NULL_RTX - && aarch64_simd_valid_immediate (const_vec, NULL)) + && aarch64_simd_valid_mov_imm (const_vec)) /* Load using MOVI/MVNI. */ return const_vec; else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX) @@ -23942,7 +24269,7 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals) /* Load constant part of vector. We really don't care what goes into the parts we will overwrite, but we're more likely to be able to load the constant efficiently if it has fewer, larger, repeating parts - (see aarch64_simd_valid_immediate). */ + (see aarch64_simd_valid_imm). */ for (int i = 0; i < n_elts; i++) { rtx x = XVECEXP (vals, 0, i); @@ -25093,109 +25420,31 @@ aarch64_c_mode_for_suffix (char suffix) return VOIDmode; } -/* We can only represent floating point constants which will fit in - "quarter-precision" values. These values are characterised by - a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given - by: - - (-1)^s * (n/16) * 2^r - - Where: - 's' is the sign bit. - 'n' is an integer in the range 16 <= n <= 31. - 'r' is an integer in the range -3 <= r <= 4. */ - -/* Return true iff X can be represented by a quarter-precision +/* Return true iff X with mode MODE can be represented by a quarter-precision floating point immediate operand X. Note, we cannot represent 0.0. */ + bool aarch64_float_const_representable_p (rtx x) { - /* This represents our current view of how many bits - make up the mantissa. */ - int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1; - int exponent; - unsigned HOST_WIDE_INT mantissa, mask; - REAL_VALUE_TYPE r, m; - bool fail; - x = unwrap_const_vec_duplicate (x); + machine_mode mode = GET_MODE (x); if (!CONST_DOUBLE_P (x)) return false; - if (GET_MODE (x) == VOIDmode - || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST)) - return false; - - r = *CONST_DOUBLE_REAL_VALUE (x); - - /* We cannot represent infinities, NaNs or +/-zero. We won't - know if we have +zero until we analyse the mantissa, but we - can reject the other invalid values. */ - if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r) - || REAL_VALUE_MINUS_ZERO (r)) - return false; - - /* For BFmode, only handle 0.0. */ - if (GET_MODE (x) == BFmode) - return real_iszero (&r, false); - - /* Extract exponent. 
*/ - r = real_value_abs (&r); - exponent = REAL_EXP (&r); - - /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the - highest (sign) bit, with a fixed binary point at bit point_pos. - m1 holds the low part of the mantissa, m2 the high part. - WARNING: If we ever have a representation using more than 2 * H_W_I - 1 - bits for the mantissa, this can fail (low bits will be lost). */ - real_ldexp (&m, &r, point_pos - exponent); - wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2); - - /* If the low part of the mantissa has bits set we cannot represent - the value. */ - if (w.ulow () != 0) - return false; - /* We have rejected the lower HOST_WIDE_INT, so update our - understanding of how many bits lie in the mantissa and - look only at the high HOST_WIDE_INT. */ - mantissa = w.elt (1); - point_pos -= HOST_BITS_PER_WIDE_INT; - - /* We can only represent values with a mantissa of the form 1.xxxx. */ - mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1; - if ((mantissa & mask) != 0) + if ((mode == HFmode && !TARGET_FP_F16INST) + || mode == BFmode) return false; - /* Having filtered unrepresentable values, we may now remove all - but the highest 5 bits. */ - mantissa >>= point_pos - 5; + REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (x); - /* We cannot represent the value 0.0, so reject it. This is handled - elsewhere. */ - if (mantissa == 0) - return false; - - /* Then, as bit 4 is always set, we can mask it off, leaving - the mantissa in the range [0, 15]. */ - mantissa &= ~(1 << 4); - gcc_assert (mantissa <= 15); - - /* GCC internally does not use IEEE754-like encoding (where normalized - significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc). - Our mantissa values are shifted 4 places to the left relative to - normalized IEEE754 so we must modify the exponent returned by REAL_EXP - by 5 places to correct for GCC's representation. */ - exponent = 5 - exponent; - - return (exponent >= 0 && exponent <= 7); + return aarch64_real_float_const_representable_p (r); } -/* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC - immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to - output MOVI/MVNI, ORR or BIC immediate. */ +/* Returns the string with the instruction for the SIMD immediate + * CONST_VECTOR of MODE and WIDTH. WHICH selects a move, and(bic) or orr. */ char* -aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width, - enum simd_immediate_check which) +aarch64_output_simd_imm (rtx const_vector, unsigned width, + enum simd_immediate_check which) { bool is_valid; static char templ[40]; @@ -25206,11 +25455,7 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width, struct simd_immediate_info info; - /* This will return true to show const_vector is legal for use as either - a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate. - It will also update INFO to show how the immediate should be generated. - WHICH selects whether to check for MOVI/MVNI, ORR or BIC. 
*/ - is_valid = aarch64_simd_valid_immediate (const_vector, &info, which); + is_valid = aarch64_simd_valid_imm (const_vector, &info, which); gcc_assert (is_valid); element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); @@ -25245,6 +25490,24 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width, if (which == AARCH64_CHECK_MOV) { + if (info.insn == simd_immediate_info::INDEX) + { + gcc_assert (TARGET_SVE); + snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #" + HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC, + element_char, INTVAL (info.u.index.base), + INTVAL (info.u.index.step)); + return templ; + } + + if (info.insn == simd_immediate_info::SVE_MOV) + { + gcc_assert (TARGET_SVE); + snprintf (templ, sizeof (templ), "mov\t%%Z0.%c, #" HOST_WIDE_INT_PRINT_DEC, + element_char, INTVAL (info.u.mov.value)); + return templ; + } + mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi"; shift_op = (info.u.mov.modifier == simd_immediate_info::MSL ? "msl" : "lsl"); @@ -25263,9 +25526,21 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width, } else { - /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */ - mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr"; - if (info.u.mov.shift) + /* AARCH64_CHECK_ORR, AARCH64_CHECK_AND or AARCH64_CHECK_XOR. */ + mnemonic = "orr"; + if (which == AARCH64_CHECK_AND) + mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "and"; + else if (which == AARCH64_CHECK_XOR) + mnemonic = "eor"; + + if (info.insn == simd_immediate_info::SVE_MOV) + { + gcc_assert (TARGET_SVE); + snprintf (templ, sizeof (templ), "%s\t%%Z0.%c, %%Z0.%c, " + HOST_WIDE_INT_PRINT_DEC, mnemonic, element_char, + element_char, INTVAL (info.u.mov.value)); + } + else if (info.u.mov.shift) snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #" HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count, element_char, UINTVAL (info.u.mov.value), "lsl", @@ -25278,6 +25553,38 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width, return templ; } +/* Returns the string with the ORR instruction for the SIMD immediate + CONST_VECTOR of WIDTH bits. */ +char* +aarch64_output_simd_orr_imm (rtx const_vector, unsigned width) +{ + return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_ORR); +} + +/* Returns the string with the AND/BIC instruction for the SIMD immediate + CONST_VECTOR of WIDTH bits. */ +char* +aarch64_output_simd_and_imm (rtx const_vector, unsigned width) +{ + return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_AND); +} + +/* Returns the string with the EOR instruction for the SIMD immediate + CONST_VECTOR of WIDTH bits. */ +char* +aarch64_output_simd_xor_imm (rtx const_vector, unsigned width) +{ + return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_XOR); +} + +/* Returns the string with the MOV instruction for the SIMD immediate + CONST_VECTOR of WIDTH bits. 
*/ +char* +aarch64_output_simd_mov_imm (rtx const_vector, unsigned width) +{ + return aarch64_output_simd_imm (const_vector, width, AARCH64_CHECK_MOV); +} + char* aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode) { @@ -25299,7 +25606,7 @@ aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode) vmode = aarch64_simd_container_mode (mode, width); rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate)); - return aarch64_output_simd_mov_immediate (v_op, width); + return aarch64_output_simd_mov_imm (v_op, width); } /* Return the output string to use for moving immediate CONST_VECTOR @@ -25311,8 +25618,9 @@ aarch64_output_sve_mov_immediate (rtx const_vector) static char templ[40]; struct simd_immediate_info info; char element_char; + bool is_valid; - bool is_valid = aarch64_simd_valid_immediate (const_vector, &info); + is_valid = aarch64_simd_valid_imm (const_vector, &info, AARCH64_CHECK_MOV); gcc_assert (is_valid); element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); @@ -25368,8 +25676,11 @@ aarch64_output_sve_mov_immediate (rtx const_vector) } } - snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC, - element_char, INTVAL (info.u.mov.value)); + if (info.u.mov.value == const0_rtx && TARGET_NON_STREAMING) + snprintf (templ, sizeof (templ), "movi\t%%d0, #0"); + else + snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC, + element_char, INTVAL (info.u.mov.value)); return templ; } @@ -25381,9 +25692,10 @@ char * aarch64_output_sve_ptrues (rtx const_unspec) { static char templ[40]; - struct simd_immediate_info info; - bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info); + bool is_valid; + + is_valid = aarch64_simd_valid_imm (const_unspec, &info, AARCH64_CHECK_MOV); gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE); char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); @@ -25653,26 +25965,23 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d) { expand_vec_perm_d newd; - if (d->vec_flags != VEC_ADVSIMD) + /* The subregs that we'd create are not supported for big-endian SVE; + see aarch64_modes_compatible_p for details. */ + if (BYTES_BIG_ENDIAN && (d->vec_flags & VEC_ANY_SVE)) return false; /* Get the new mode. Always twice the size of the inner and half the elements. */ - poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode); - unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2; - auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require (); - machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits); - - if (new_mode == word_mode) + machine_mode new_mode; + if (!aarch64_coalesce_units (d->vmode, 2).exists (&new_mode)) return false; vec_perm_indices newpermindices; - if (!newpermindices.new_shrunk_vector (d->perm, 2)) return false; newd.vmode = new_mode; - newd.vec_flags = VEC_ADVSIMD; + newd.vec_flags = d->vec_flags; newd.op_mode = newd.vmode; newd.op_vec_flags = newd.vec_flags; newd.target = d->target ? 
gen_lowpart (new_mode, d->target) : NULL; @@ -27081,6 +27390,9 @@ aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq, if (op_mode == VOIDmode) op_mode = GET_MODE (op1); + if (CONST_SCALAR_INT_P (op1)) + canonicalize_comparison (op_mode, &code, &op1); + switch (op_mode) { case E_QImode: @@ -27097,13 +27409,13 @@ aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq, case E_SFmode: cmp_mode = SFmode; - cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1); + cc_mode = aarch64_select_cc_mode (code, op0, op1); icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf; break; case E_DFmode: cmp_mode = DFmode; - cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1); + cc_mode = aarch64_select_cc_mode (code, op0, op1); icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf; break; @@ -27134,7 +27446,7 @@ aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq, *gen_seq = get_insns (); end_sequence (); - return gen_rtx_fmt_ee ((rtx_code) code, cc_mode, + return gen_rtx_fmt_ee (code, cc_mode, gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx); } @@ -27157,6 +27469,9 @@ aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, if (op_mode == VOIDmode) op_mode = GET_MODE (op1); + if (CONST_SCALAR_INT_P (op1)) + canonicalize_comparison (op_mode, &cmp_code, &op1); + switch (op_mode) { case E_QImode: @@ -27171,12 +27486,12 @@ aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, case E_SFmode: cmp_mode = SFmode; - cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1); + cc_mode = aarch64_select_cc_mode (cmp_code, op0, op1); break; case E_DFmode: cmp_mode = DFmode; - cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1); + cc_mode = aarch64_select_cc_mode (cmp_code, op0, op1); break; default: @@ -27197,7 +27512,7 @@ aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, end_sequence (); target = gen_rtx_REG (cc_mode, CC_REGNUM); - aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code); + aarch64_cond = aarch64_get_condition_code_1 (cc_mode, cmp_code); if (bit_code != AND) { @@ -27236,7 +27551,7 @@ aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, *gen_seq = get_insns (); end_sequence (); - return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx); + return gen_rtx_fmt_ee (cmp_code, VOIDmode, target, const0_rtx); } #undef TARGET_GEN_CCMP_FIRST @@ -29020,8 +29335,20 @@ aarch64_stack_protect_guard (void) return NULL_TREE; } -/* Return the diagnostic message string if the binary operation OP is - not permitted on TYPE1 and TYPE2, NULL otherwise. */ +/* Implement TARGET_INVALID_UNARY_OP. */ + +static const char * +aarch64_invalid_unary_op (int op, const_tree type) +{ + /* Reject all single-operand operations on __mfp8 except for &. */ + if (TYPE_MAIN_VARIANT (type) == aarch64_mfp8_type_node && op != ADDR_EXPR) + return N_ ("operation not permitted on type %<mfloat8_t%>"); + + /* Operation allowed. */ + return NULL; +} + +/* Implement TARGET_INVALID_BINARY_OP. */ static const char * aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1, @@ -29035,6 +29362,11 @@ aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1, != aarch64_sve::builtin_type_p (type2))) return N_("cannot combine GNU and SVE vectors in a binary operation"); + /* Reject all 2-operand operations on __mfp8. 
*/ + if (TYPE_MAIN_VARIANT (type1) == aarch64_mfp8_type_node + || TYPE_MAIN_VARIANT (type2) == aarch64_mfp8_type_node) + return N_ ("operation not permitted on type %<mfloat8_t%>"); + /* Operation allowed. */ return NULL; } @@ -30752,6 +31084,12 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_MANGLE_TYPE #define TARGET_MANGLE_TYPE aarch64_mangle_type +#undef TARGET_INVALID_CONVERSION +#define TARGET_INVALID_CONVERSION aarch64_invalid_conversion + +#undef TARGET_INVALID_UNARY_OP +#define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op + #undef TARGET_INVALID_BINARY_OP #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op @@ -30811,6 +31149,12 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_DWARF_FRAME_REG_MODE #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode +#undef TARGET_OUTPUT_CFI_DIRECTIVE +#define TARGET_OUTPUT_CFI_DIRECTIVE aarch64_output_cfi_directive + +#undef TARGET_DW_CFI_OPRND1_DESC +#define TARGET_DW_CFI_OPRND1_DESC aarch64_dw_cfi_oprnd1_desc + #undef TARGET_PROMOTED_TYPE #define TARGET_PROMOTED_TYPE aarch64_promoted_type diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 2dfb999..593319f 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -96,6 +96,8 @@ #define LONG_LONG_TYPE_SIZE 64 +#define WIDEST_HARDWARE_FP_SIZE 64 + /* This value is the amount of bytes a caller is allowed to drop the stack before probing has to be done for stack clash protection. */ #define STACK_CLASH_CALLER_GUARD 1024 @@ -156,6 +158,16 @@ #define PCC_BITFIELD_TYPE_MATTERS 1 +/* Use the same RTL truth representation for vector elements as we do + for scalars. This maintains the property that a comparison like + eq:V4SI is a composition of 4 individual eq:SIs, just like plus:V4SI + is a composition of 4 individual plus:SIs. + + This means that Advanced SIMD comparisons are represented in RTL as + (neg (op ...)). */ + +#define VECTOR_STORE_FLAG_VALUE(MODE) CONST1_RTX (GET_MODE_INNER (MODE)) + #ifndef USED_FOR_TARGET /* Define an enum of all features (ISA modes, architectures and extensions). @@ -457,6 +469,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED enabled through +gcs. */ #define TARGET_GCS AARCH64_HAVE_ISA (GCS) +/* Floating Point Absolute Maximum/Minimum extension instructions are + enabled through +faminmax. */ +#define TARGET_FAMINMAX AARCH64_HAVE_ISA (FAMINMAX) +#define TARGET_SVE_FAMINMAX (TARGET_SVE && TARGET_FAMINMAX) + /* Prefer different predicate registers for the output of a predicated operation over re-using an existing input predicate. */ #define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \ @@ -1447,6 +1464,11 @@ extern const char *aarch64_rewrite_mcpu (int argc, const char **argv); #define ASM_OUTPUT_POOL_EPILOGUE aarch64_asm_output_pool_epilogue +/* This type is the user-visible __mfp8, and a pointer to that type. We + need it in many places in the backend. Defined in aarch64-builtins.cc. */ +extern GTY(()) tree aarch64_mfp8_type_node; +extern GTY(()) tree aarch64_mfp8_ptr_type_node; + /* This type is the user-visible __fp16, and a pointer to that type. We need it in many places in the backend. Defined in aarch64-builtins.cc. */ extern GTY(()) tree aarch64_fp16_type_node; diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index c54b29c..20956fc 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -5345,6 +5345,15 @@ (popcount:ALLI (match_operand:ALLI 1 "register_operand")))] "TARGET_CSSC ? 
GET_MODE_BITSIZE (<MODE>mode) >= 32 : TARGET_SIMD" { + if (!TARGET_CSSC && TARGET_SVE && <MODE>mode != QImode) + { + rtx tmp = gen_reg_rtx (<VEC_POP_MODE>mode); + rtx op1 = gen_lowpart (<VEC_POP_MODE>mode, operands[1]); + emit_insn (gen_popcount<vec_pop_mode>2 (tmp, op1)); + emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp)); + DONE; + } + if (!TARGET_CSSC) { rtx v = gen_reg_rtx (V8QImode); @@ -7218,13 +7227,12 @@ } ) -;; For copysign (x, y), we want to generate: +;; For copysignf (x, y), we want to generate: ;; -;; LDR d2, #(1 << 63) -;; BSL v2.8b, [y], [x] +;; movi v31.4s, 0x80, lsl 24 +;; bit v0.16b, v1.16b, v31.16b ;; -;; or another, equivalent, sequence using one of BSL/BIT/BIF. Because -;; we expect these operations to nearly always operate on +;; Because we expect these operations to nearly always operate on ;; floating-point values, we do not want the operation to be ;; simplified into a bit-field insert operation that operates on the ;; integer side, since typically that would involve three inter-bank @@ -7239,32 +7247,25 @@ (match_operand:GPF 2 "nonmemory_operand")] "TARGET_SIMD" { - rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U - << (GET_MODE_BITSIZE (<MODE>mode) - 1)); - /* copysign (x, -1) should instead be expanded as orr with the sign - bit. */ + rtx sign = GEN_INT (HOST_WIDE_INT_M1U << (GET_MODE_BITSIZE (<MODE>mode) - 1)); + rtx v_bitmask = gen_const_vec_duplicate (<VQ_INT_EQUIV>mode, sign); + v_bitmask = force_reg (<VQ_INT_EQUIV>mode, v_bitmask); + + /* copysign (x, -1) should instead be expanded as orr with the signbit. */ rtx op2_elt = unwrap_const_vec_duplicate (operands[2]); + if (GET_CODE (op2_elt) == CONST_DOUBLE && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt))) { - rtx v_bitmask - = force_reg (V2<V_INT_EQUIV>mode, - gen_const_vec_duplicate (V2<V_INT_EQUIV>mode, - signbit_const)); - - emit_insn (gen_iorv2<v_int_equiv>3 ( - lowpart_subreg (V2<V_INT_EQUIV>mode, operands[0], <MODE>mode), - lowpart_subreg (V2<V_INT_EQUIV>mode, operands[1], <MODE>mode), + emit_insn (gen_ior<vq_int_equiv>3 ( + lowpart_subreg (<VQ_INT_EQUIV>mode, operands[0], <MODE>mode), + lowpart_subreg (<VQ_INT_EQUIV>mode, operands[1], <MODE>mode), v_bitmask)); DONE; } - - machine_mode int_mode = <V_INT_EQUIV>mode; - rtx bitmask = gen_reg_rtx (int_mode); - emit_move_insn (bitmask, signbit_const); operands[2] = force_reg (<MODE>mode, operands[2]); emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2], - bitmask)); + v_bitmask)); DONE; } ) @@ -7273,23 +7274,21 @@ [(set (match_operand:GPF 0 "register_operand") (unspec:GPF [(match_operand:GPF 1 "register_operand") (match_operand:GPF 2 "register_operand") - (match_operand:<V_INT_EQUIV> 3 "register_operand")] + (match_operand:<VQ_INT_EQUIV> 3 "register_operand")] UNSPEC_COPYSIGN))] "TARGET_SIMD" {@ [ cons: =0 , 1 , 2 , 3 ; attrs: type ] [ w , w , w , 0 ; neon_bsl<q> ] bsl\t%0.<Vbtype>, %2.<Vbtype>, %1.<Vbtype> [ w , 0 , w , w ; neon_bsl<q> ] bit\t%0.<Vbtype>, %2.<Vbtype>, %3.<Vbtype> [ w , w , 0 , w ; neon_bsl<q> ] bif\t%0.<Vbtype>, %1.<Vbtype>, %3.<Vbtype> - [ r , r , 0 , X ; bfm ] bfxil\t%<w1>0, %<w1>1, #0, <sizem1> } ) - -;; For xorsign (x, y), we want to generate: +;; For xorsignf (x, y), we want to generate: ;; -;; LDR d2, #1<<63 -;; AND v3.8B, v1.8B, v2.8B -;; EOR v0.8B, v0.8B, v3.8B +;; movi v31.4s, 0x80, lsl 24 +;; and v31.16b, v31.16b, v1.16b +;; eor v0.16b, v31.16b, v0.16b ;; (define_expand "@xorsign<mode>3" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index e376685..d3533f3 100644 --- 
a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -72,6 +72,9 @@ typedef __Poly16_t poly16_t; typedef __Poly64_t poly64_t; typedef __Poly128_t poly128_t; +typedef __Mfloat8x8_t mfloat8x8_t; +typedef __Mfloat8x16_t mfloat8x16_t; + typedef __fp16 float16_t; typedef float float32_t; typedef double float64_t; @@ -26949,9 +26952,9 @@ vrax1q_u64 (uint64x2_t __a, uint64x2_t __b) __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vxarq_u64 (uint64x2_t __a, uint64x2_t __b, const int imm6) +vxarq_u64 (uint64x2_t __a, uint64x2_t __b, const int __imm6) { - return __builtin_aarch64_xarqv2di_uuus (__a, __b,imm6); + return __builtin_aarch64_xarqv2di_uuus (__a, __b, __imm6); } __extension__ extern __inline uint8x16_t diff --git a/gcc/config/aarch64/arm_private_fp8.h b/gcc/config/aarch64/arm_private_fp8.h index 5668cc2..f787022 100644 --- a/gcc/config/aarch64/arm_private_fp8.h +++ b/gcc/config/aarch64/arm_private_fp8.h @@ -40,6 +40,8 @@ extern "C" { #endif + typedef __mfp8 mfloat8_t; + typedef uint64_t fpm_t; enum __ARM_FPM_FORMAT diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index f491e4b..647941c3 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -464,21 +464,25 @@ "@internal A constraint that matches vector of immediates for orr." (and (match_code "const_vector") - (match_test "aarch64_simd_valid_immediate (op, NULL, - AARCH64_CHECK_ORR)"))) + (match_test "aarch64_simd_valid_orr_imm (op)"))) (define_constraint "Db" "@internal - A constraint that matches vector of immediates for bic." + A constraint that matches vector of immediates for and/bic." (and (match_code "const_vector") - (match_test "aarch64_simd_valid_immediate (op, NULL, - AARCH64_CHECK_BIC)"))) + (match_test "aarch64_simd_valid_and_imm (op)"))) + +(define_constraint "De" + "@internal + A constraint that matches vector of immediates for xor." + (and (match_code "const_vector") + (match_test "aarch64_simd_valid_xor_imm (op)"))) (define_constraint "Dn" "@internal A constraint that matches vector of immediates." 
(and (match_code "const,const_vector") - (match_test "aarch64_simd_valid_immediate (op, NULL)"))) + (match_test "aarch64_simd_valid_mov_imm (op)"))) (define_constraint "Dh" "@internal diff --git a/gcc/config/aarch64/driver-aarch64.cc b/gcc/config/aarch64/driver-aarch64.cc index b620351..abe6e7d 100644 --- a/gcc/config/aarch64/driver-aarch64.cc +++ b/gcc/config/aarch64/driver-aarch64.cc @@ -256,9 +256,9 @@ host_detect_local_cpu (int argc, const char **argv) bool cpu = false; unsigned int i = 0; unsigned char imp = INVALID_IMP; - unsigned int cores[2] = { INVALID_CORE, INVALID_CORE }; + unsigned int cores[3] = { INVALID_CORE, INVALID_CORE, INVALID_CORE }; unsigned int n_cores = 0; - unsigned int variants[2] = { ALL_VARIANTS, ALL_VARIANTS }; + unsigned int variants[3] = { ALL_VARIANTS, ALL_VARIANTS, ALL_VARIANTS }; unsigned int n_variants = 0; bool processed_exts = false; aarch64_feature_flags extension_flags = 0; @@ -314,7 +314,7 @@ host_detect_local_cpu (int argc, const char **argv) unsigned cvariant = parse_field (buf); if (!contains_core_p (variants, cvariant)) { - if (n_variants == 2) + if (n_variants == 3) goto not_found; variants[n_variants++] = cvariant; @@ -326,7 +326,7 @@ host_detect_local_cpu (int argc, const char **argv) unsigned ccore = parse_field (buf); if (!contains_core_p (cores, ccore)) { - if (n_cores == 2) + if (n_cores == 3) goto not_found; cores[n_cores++] = ccore; @@ -383,11 +383,15 @@ host_detect_local_cpu (int argc, const char **argv) /* Weird cpuinfo format that we don't know how to handle. */ if (n_cores == 0 || n_cores > 2 - || (n_cores == 1 && n_variants != 1) || imp == INVALID_IMP || !processed_exts) goto not_found; + /* If we have one core type but multiple variants, consider + that as one variant with ALL_VARIANTS instead. */ + if (n_cores == 1 && n_variants != 1) + variants[0] = ALL_VARIANTS; + /* Simple case, one core type or just looking for the arch. */ if (n_cores == 1 || arch) { diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 20a318e..8269b0c 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -290,6 +290,8 @@ ;; Advanced SIMD modes for H, S and D types. (define_mode_iterator VDQHSD [V4HI V8HI V2SI V4SI V2DI]) +(define_mode_iterator VDQHSD_V1DI [VDQHSD V1DI]) + ;; Advanced SIMD and scalar integer modes for H and S. (define_mode_iterator VSDQ_HSI [V4HI V8HI V2SI V4SI HI SI]) @@ -444,6 +446,9 @@ ;; All fully-packed SVE integer vector modes. (define_mode_iterator SVE_FULL_I [VNx16QI VNx8HI VNx4SI VNx2DI]) +;; All fully-packed SVE integer and Advanced SIMD integer modes. +(define_mode_iterator SVE_ASIMD_FULL_I [SVE_FULL_I VDQ_I]) + ;; All fully-packed SVE floating-point vector modes. (define_mode_iterator SVE_FULL_F [VNx8HF VNx4SF VNx2DF]) @@ -519,15 +524,20 @@ VNx4HI VNx2HI VNx2SI]) +;; All SVE integer vector modes. +(define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI + VNx8HI VNx4HI VNx2HI + VNx4SI VNx2SI + VNx2DI]) + +;; All SVE floating-point vector modes. +(define_mode_iterator SVE_F [VNx8HF VNx4HF VNx2HF + VNx8BF VNx4BF VNx2BF + VNx4SF VNx2SF + VNx2DF]) + ;; All SVE vector modes. -(define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI - VNx8HI VNx4HI VNx2HI - VNx8HF VNx4HF VNx2HF - VNx8BF VNx4BF VNx2BF - VNx4SI VNx2SI - VNx4SF VNx2SF - VNx2DI - VNx2DF]) +(define_mode_iterator SVE_ALL [SVE_I SVE_F]) ;; All SVE 2-vector modes. (define_mode_iterator SVE_FULLx2 [VNx32QI VNx16HI VNx8SI VNx4DI @@ -549,18 +559,12 @@ ;; All SVE vector and structure modes. 
(define_mode_iterator SVE_ALL_STRUCT [SVE_ALL SVE_STRUCT]) -;; All SVE integer vector modes. -(define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI - VNx8HI VNx4HI VNx2HI - VNx4SI VNx2SI - VNx2DI]) - ;; All SVE integer vector modes and Advanced SIMD 64-bit vector ;; element modes (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI]) ;; All SVE and Advanced SIMD integer vector modes. -(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I]) +(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I V1DI]) ;; SVE integer vector modes whose elements are 16 bits or wider. (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI @@ -841,6 +845,8 @@ UNSPEC_COND_CMPNE_WIDE ; Used in aarch64-sve.md. UNSPEC_COND_FABS ; Used in aarch64-sve.md. UNSPEC_COND_FADD ; Used in aarch64-sve.md. + UNSPEC_COND_FAMAX ; Used in aarch64-sve.md. + UNSPEC_COND_FAMIN ; Used in aarch64-sve.md. UNSPEC_COND_FCADD90 ; Used in aarch64-sve.md. UNSPEC_COND_FCADD270 ; Used in aarch64-sve.md. UNSPEC_COND_FCMEQ ; Used in aarch64-sve.md. @@ -881,6 +887,8 @@ UNSPEC_COND_FSQRT ; Used in aarch64-sve.md. UNSPEC_COND_FSUB ; Used in aarch64-sve.md. UNSPEC_COND_SCVTF ; Used in aarch64-sve.md. + UNSPEC_COND_SMAX ; Used in aarch64-sve.md. + UNSPEC_COND_SMIN ; Used in aarch64-sve.md. UNSPEC_COND_UCVTF ; Used in aarch64-sve.md. UNSPEC_LASTA ; Used in aarch64-sve.md. UNSPEC_LASTB ; Used in aarch64-sve.md. @@ -1057,6 +1065,8 @@ UNSPEC_BFCVTN2 ; Used in aarch64-simd.md. UNSPEC_BFCVT ; Used in aarch64-simd.md. UNSPEC_FCVTXN ; Used in aarch64-simd.md. + UNSPEC_FAMAX ; Used in aarch64-simd.md. + UNSPEC_FAMIN ; Used in aarch64-simd.md. ;; All used in aarch64-sve2.md UNSPEC_FCVTN @@ -1230,7 +1240,7 @@ (define_mode_attr bitsize [(V8QI "64") (V16QI "128") (V4HI "64") (V8HI "128") (V2SI "64") (V4SI "128") - (V2DI "128")]) + (V1DI "64") (V2DI "128")]) ;; Map a floating point or integer mode to the appropriate register name prefix (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")]) @@ -1886,6 +1896,14 @@ (VNx8SF "vnx8si") (VNx16SF "vnx16si") ]) +;; Mode with floating-point values replaced by 128-bit vector integers. +(define_mode_attr VQ_INT_EQUIV [(DF "V2DI") (SF "V4SI") +]) + +;; Lower case mode with floating-point values replaced by 128-bit vector integers. +(define_mode_attr vq_int_equiv [(DF "v2di") (SF "v4si") +]) + ;; Floating-point equivalent of selected modes. (define_mode_attr V_FP_EQUIV [(VNx8HI "VNx8HF") (VNx8HF "VNx8HF") (VNx8BF "VNx8HF") @@ -2284,7 +2302,7 @@ (VNx8DI "VNx2BI") (VNx8DF "VNx2BI") (V8QI "VNx8BI") (V16QI "VNx16BI") (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI") - (V4SI "VNx4BI") (V2DI "VNx2BI")]) + (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")]) ;; ...and again in lower case. (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi") @@ -2318,6 +2336,14 @@ (VNx4SI "VNx8SI") (VNx4SF "VNx8SF") (VNx2DI "VNx4DI") (VNx2DF "VNx4DF")]) +;; The Advanced SIMD modes of popcount corresponding to scalar modes. +(define_mode_attr VEC_POP_MODE [(QI "V8QI") (HI "V4HI") + (SI "V2SI") (DI "V1DI")]) + +;; ...and again in lower case. +(define_mode_attr vec_pop_mode [(QI "v8qi") (HI "v4hi") + (SI "v2si") (DI "v1di")]) + ;; On AArch64 the By element instruction doesn't have a 2S variant. ;; However because the instruction always selects a pair of values ;; The normal 3SAME instruction can be used here instead. 
@@ -3079,15 +3105,20 @@ (define_int_iterator SVE_COND_FCVTI [UNSPEC_COND_FCVTZS UNSPEC_COND_FCVTZU]) (define_int_iterator SVE_COND_ICVTF [UNSPEC_COND_SCVTF UNSPEC_COND_UCVTF]) -(define_int_iterator SVE_COND_FP_BINARY [UNSPEC_COND_FADD - UNSPEC_COND_FDIV - UNSPEC_COND_FMAX - UNSPEC_COND_FMAXNM - UNSPEC_COND_FMIN - UNSPEC_COND_FMINNM - UNSPEC_COND_FMUL - UNSPEC_COND_FMULX - UNSPEC_COND_FSUB]) +(define_int_iterator SVE_COND_FP_BINARY + [UNSPEC_COND_FADD + (UNSPEC_COND_FAMAX "TARGET_SVE_FAMINMAX") + (UNSPEC_COND_FAMIN "TARGET_SVE_FAMINMAX") + UNSPEC_COND_FDIV + UNSPEC_COND_FMAX + UNSPEC_COND_FMAXNM + UNSPEC_COND_FMIN + UNSPEC_COND_FMINNM + UNSPEC_COND_FMUL + UNSPEC_COND_FMULX + UNSPEC_COND_FSUB + UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) ;; Same as SVE_COND_FP_BINARY, but without codes that have a dedicated ;; <optab><mode>3 expander. @@ -3098,7 +3129,9 @@ UNSPEC_COND_FMINNM UNSPEC_COND_FMUL UNSPEC_COND_FMULX - UNSPEC_COND_FSUB]) + UNSPEC_COND_FSUB + UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) (define_int_iterator SVE_COND_FP_BINARY_INT [UNSPEC_COND_FSCALE]) @@ -3110,10 +3143,15 @@ UNSPEC_COND_FMAXNM UNSPEC_COND_FMIN UNSPEC_COND_FMINNM - UNSPEC_COND_FMUL]) + UNSPEC_COND_FMUL + UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) -(define_int_iterator SVE_COND_FP_BINARY_REG [UNSPEC_COND_FDIV - UNSPEC_COND_FMULX]) +(define_int_iterator SVE_COND_FP_BINARY_REG + [(UNSPEC_COND_FAMAX "TARGET_SVE_FAMINMAX") + (UNSPEC_COND_FAMIN "TARGET_SVE_FAMINMAX") + UNSPEC_COND_FDIV + UNSPEC_COND_FMULX]) (define_int_iterator SVE_COND_FCADD [UNSPEC_COND_FCADD90 UNSPEC_COND_FCADD270]) @@ -3121,12 +3159,12 @@ (define_int_iterator SVE_COND_FP_MAXMIN [UNSPEC_COND_FMAX UNSPEC_COND_FMAXNM UNSPEC_COND_FMIN - UNSPEC_COND_FMINNM]) + UNSPEC_COND_FMINNM + UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) -;; Floating-point max/min operations that correspond to optabs, -;; as opposed to those that are internal to the port. 
-(define_int_iterator SVE_COND_FP_MAXMIN_PUBLIC [UNSPEC_COND_FMAXNM - UNSPEC_COND_FMINNM]) +(define_int_iterator SVE_COND_SMAXMIN [UNSPEC_COND_SMAX + UNSPEC_COND_SMIN]) (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA UNSPEC_COND_FMLS @@ -3692,6 +3730,8 @@ (UNSPEC_ZIP2Q "zip2q") (UNSPEC_COND_FABS "abs") (UNSPEC_COND_FADD "add") + (UNSPEC_COND_FAMAX "famax") + (UNSPEC_COND_FAMIN "famin") (UNSPEC_COND_FCADD90 "cadd90") (UNSPEC_COND_FCADD270 "cadd270") (UNSPEC_COND_FCMLA "fcmla") @@ -3703,9 +3743,9 @@ (UNSPEC_COND_FCVTZU "fixuns_trunc") (UNSPEC_COND_FDIV "div") (UNSPEC_COND_FMAX "fmax_nan") - (UNSPEC_COND_FMAXNM "smax") + (UNSPEC_COND_FMAXNM "fmax") (UNSPEC_COND_FMIN "fmin_nan") - (UNSPEC_COND_FMINNM "smin") + (UNSPEC_COND_FMINNM "fmin") (UNSPEC_COND_FMLA "fma") (UNSPEC_COND_FMLS "fnma") (UNSPEC_COND_FMUL "mul") @@ -3725,6 +3765,8 @@ (UNSPEC_COND_FSQRT "sqrt") (UNSPEC_COND_FSUB "sub") (UNSPEC_COND_SCVTF "float") + (UNSPEC_COND_SMAX "smax") + (UNSPEC_COND_SMIN "smin") (UNSPEC_COND_UCVTF "floatuns")]) (define_int_attr fmaxmin [(UNSPEC_FMAX "fmax_nan") @@ -3732,9 +3774,7 @@ (UNSPEC_FMAXNMV "fmax") (UNSPEC_FMIN "fmin_nan") (UNSPEC_FMINNM "fmin") - (UNSPEC_FMINNMV "fmin") - (UNSPEC_COND_FMAXNM "fmax") - (UNSPEC_COND_FMINNM "fmin")]) + (UNSPEC_FMINNMV "fmin")]) (define_int_attr maxmin_uns_op [(UNSPEC_UMAXV "umax") (UNSPEC_UMINV "umin") @@ -4228,6 +4268,8 @@ (UNSPEC_FTSSEL "ftssel") (UNSPEC_COND_FABS "fabs") (UNSPEC_COND_FADD "fadd") + (UNSPEC_COND_FAMAX "famax") + (UNSPEC_COND_FAMIN "famin") (UNSPEC_COND_FCVTLT "fcvtlt") (UNSPEC_COND_FCVTX "fcvtx") (UNSPEC_COND_FDIV "fdiv") @@ -4249,9 +4291,13 @@ (UNSPEC_COND_FRINTZ "frintz") (UNSPEC_COND_FSCALE "fscale") (UNSPEC_COND_FSQRT "fsqrt") - (UNSPEC_COND_FSUB "fsub")]) + (UNSPEC_COND_FSUB "fsub") + (UNSPEC_COND_SMAX "fmaxnm") + (UNSPEC_COND_SMIN "fminnm")]) (define_int_attr sve_fp_op_rev [(UNSPEC_COND_FADD "fadd") + (UNSPEC_COND_FAMAX "famax") + (UNSPEC_COND_FAMIN "famin") (UNSPEC_COND_FDIV "fdivr") (UNSPEC_COND_FMAX "fmax") (UNSPEC_COND_FMAXNM "fmaxnm") @@ -4259,7 +4305,9 @@ (UNSPEC_COND_FMINNM "fminnm") (UNSPEC_COND_FMUL "fmul") (UNSPEC_COND_FMULX "fmulx") - (UNSPEC_COND_FSUB "fsubr")]) + (UNSPEC_COND_FSUB "fsubr") + (UNSPEC_COND_SMAX "fmaxnm") + (UNSPEC_COND_SMIN "fminnm")]) (define_int_attr sme_int_op [(UNSPEC_SME_ADD_WRITE "add") (UNSPEC_SME_SUB_WRITE "sub")]) @@ -4388,6 +4436,8 @@ ;; <optab><mode>3 pattern. (define_int_attr sve_pred_fp_rhs1_operand [(UNSPEC_COND_FADD "register_operand") + (UNSPEC_COND_FAMAX "register_operand") + (UNSPEC_COND_FAMIN "register_operand") (UNSPEC_COND_FDIV "register_operand") (UNSPEC_COND_FMAX "register_operand") (UNSPEC_COND_FMAXNM "register_operand") @@ -4395,12 +4445,16 @@ (UNSPEC_COND_FMINNM "register_operand") (UNSPEC_COND_FMUL "register_operand") (UNSPEC_COND_FMULX "register_operand") - (UNSPEC_COND_FSUB "aarch64_sve_float_arith_operand")]) + (UNSPEC_COND_FSUB "aarch64_sve_float_arith_operand") + (UNSPEC_COND_SMAX "register_operand") + (UNSPEC_COND_SMIN "register_operand")]) ;; The predicate to use for the second input operand in a floating-point ;; <optab><mode>3 pattern. 
(define_int_attr sve_pred_fp_rhs2_operand [(UNSPEC_COND_FADD "aarch64_sve_float_arith_with_sub_operand") + (UNSPEC_COND_FAMAX "register_operand") + (UNSPEC_COND_FAMIN "register_operand") (UNSPEC_COND_FDIV "register_operand") (UNSPEC_COND_FMAX "aarch64_sve_float_maxmin_operand") (UNSPEC_COND_FMAXNM "aarch64_sve_float_maxmin_operand") @@ -4408,7 +4462,9 @@ (UNSPEC_COND_FMINNM "aarch64_sve_float_maxmin_operand") (UNSPEC_COND_FMUL "aarch64_sve_float_mul_operand") (UNSPEC_COND_FMULX "register_operand") - (UNSPEC_COND_FSUB "register_operand")]) + (UNSPEC_COND_FSUB "register_operand") + (UNSPEC_COND_SMAX "aarch64_sve_float_maxmin_operand") + (UNSPEC_COND_SMIN "aarch64_sve_float_maxmin_operand")]) ;; Likewise for immediates only. (define_int_attr sve_pred_fp_rhs2_immediate @@ -4416,7 +4472,9 @@ (UNSPEC_COND_FMAXNM "aarch64_sve_float_maxmin_immediate") (UNSPEC_COND_FMIN "aarch64_sve_float_maxmin_immediate") (UNSPEC_COND_FMINNM "aarch64_sve_float_maxmin_immediate") - (UNSPEC_COND_FMUL "aarch64_sve_float_mul_immediate")]) + (UNSPEC_COND_FMUL "aarch64_sve_float_mul_immediate") + (UNSPEC_COND_SMAX "aarch64_sve_float_maxmin_immediate") + (UNSPEC_COND_SMIN "aarch64_sve_float_maxmin_immediate")]) ;; The maximum number of element bits that an instruction can handle. (define_int_attr max_elem_bits [(UNSPEC_UADDV "64") (UNSPEC_SADDV "32") @@ -4463,3 +4521,16 @@ (UNSPECV_SET_FPCR "fpcr")]) (define_int_attr bits_etype [(8 "b") (16 "h") (32 "s") (64 "d")]) + +;; Iterators and attributes for faminmax + +(define_int_iterator FAMINMAX_UNS [UNSPEC_FAMAX UNSPEC_FAMIN]) + +(define_int_attr faminmax_cond_uns_op + [(UNSPEC_COND_SMAX "famax") (UNSPEC_COND_SMIN "famin")]) + +(define_int_attr faminmax_uns_op + [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) + +(define_code_attr faminmax_op + [(smax "famax") (smin "famin")]) diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md index 8f3aab2..6ad9a4b 100644 --- a/gcc/config/aarch64/predicates.md +++ b/gcc/config/aarch64/predicates.md @@ -118,14 +118,17 @@ (define_predicate "aarch64_reg_or_orr_imm" (ior (match_operand 0 "register_operand") (and (match_code "const_vector") - (match_test "aarch64_simd_valid_immediate (op, NULL, - AARCH64_CHECK_ORR)")))) + (match_test "aarch64_simd_valid_orr_imm (op)")))) -(define_predicate "aarch64_reg_or_bic_imm" +(define_predicate "aarch64_reg_or_and_imm" (ior (match_operand 0 "register_operand") (and (match_code "const_vector") - (match_test "aarch64_simd_valid_immediate (op, NULL, - AARCH64_CHECK_BIC)")))) + (match_test "aarch64_simd_valid_and_imm (op)")))) + +(define_predicate "aarch64_reg_or_xor_imm" + (ior (match_operand 0 "register_operand") + (and (match_code "const_vector") + (match_test "aarch64_simd_valid_xor_imm (op)")))) (define_predicate "aarch64_fp_compare_operand" (ior (match_operand 0 "register_operand") @@ -945,11 +948,6 @@ (ior (match_operand 0 "register_operand") (match_operand 0 "aarch64_sve_logical_immediate"))) -(define_predicate "aarch64_orr_imm_sve_advsimd" - (ior (match_operand 0 "aarch64_reg_or_orr_imm") - (and (match_test "TARGET_SVE") - (match_operand 0 "aarch64_sve_logical_operand")))) - (define_predicate "aarch64_sve_gather_offset_b" (ior (match_operand 0 "register_operand") (match_operand 0 "aarch64_sve_gather_immediate_b"))) diff --git a/gcc/config/aarch64/tuning_models/fujitsu_monaka.h b/gcc/config/aarch64/tuning_models/fujitsu_monaka.h new file mode 100644 index 0000000..c3a1e06 --- /dev/null +++ b/gcc/config/aarch64/tuning_models/fujitsu_monaka.h @@ -0,0 +1,65 @@ +/* Tuning 
model description for FUJITSU-MONAKA. + Copyright (C) 2009-2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +#ifndef GCC_AARCH64_H_FUJITSU_MONAKA +#define GCC_AARCH64_H_FUJITSU_MONAKA + +#include "generic.h" +#include "generic_armv9_a.h" + +/* Tuning parameters for FUJITSU-MONAKA processor. It is copied from the + generic one except for the vector width for now. */ +static const struct tune_params fujitsu_monaka_tunings = +{ + &cortexa76_extra_costs, + &generic_armv9_a_addrcost_table, + &generic_armv9_a_regmove_cost, + &generic_armv9_a_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_256, /* sve_width. */ + { 4, /* load_int. */ + 1, /* store_int. */ + 6, /* load_fp. */ + 2, /* store_fp. */ + 6, /* load_pred. */ + 1 /* store_pred. */ + }, /* memmov_cost. */ + 3, /* issue_rate. */ + (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops. */ + "32:16", /* function_align. */ + "4", /* jump_align. */ + "32:16", /* loop_align. */ + 2, /* int_reassoc_width. */ + 4, /* fp_reassoc_width. */ + 2, /* fma_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND + | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ + &generic_prefetch_tune, + AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ + AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ +}; + +#endif /* GCC_AARCH64_H_FUJITSU_MONAKA. */ diff --git a/gcc/config/aarch64/tuning_models/generic.h b/gcc/config/aarch64/tuning_models/generic.h index 101969b..ee2f3ff 100644 --- a/gcc/config/aarch64/tuning_models/generic.h +++ b/gcc/config/aarch64/tuning_models/generic.h @@ -105,8 +105,8 @@ static const sve_vec_cost generic_sve_vector_cost = 2, /* fadda_f64_cost */ 4, /* gather_load_x32_cost */ 2, /* gather_load_x64_cost */ - 12, /* gather_load_x32_init_cost */ - 4, /* gather_load_x64_init_cost */ + 0, /* gather_load_x32_init_cost */ + 0, /* gather_load_x64_init_cost */ 1 /* scatter_store_elt_cost */ }; diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h index 999985e..76b3e4c 100644 --- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h +++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h @@ -207,6 +207,18 @@ static const struct cpu_vector_cost generic_armv9_a_vector_cost = &generic_armv9_a_vec_issue_info /* issue_info */ }; +/* Generic prefetch settings (which disable prefetch). 
*/ +static const cpu_prefetch_tune generic_armv9a_prefetch_tune = +{ + 0, /* num_slots */ + -1, /* l1_cache_size */ + 64, /* l1_cache_line_size */ + -1, /* l2_cache_size */ + true, /* prefetch_dynamic_strides */ + -1, /* minimum_stride */ + -1 /* default_opt_level */ +}; + static const struct tune_params generic_armv9_a_tunings = { &cortexa76_extra_costs, @@ -239,7 +251,7 @@ static const struct tune_params generic_armv9_a_tunings = (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */ - &generic_prefetch_tune, + &generic_armv9a_prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ }; diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h index 52aad7d..e7e37e6b 100644 --- a/gcc/config/aarch64/tuning_models/neoversev2.h +++ b/gcc/config/aarch64/tuning_models/neoversev2.h @@ -206,6 +206,19 @@ static const struct cpu_vector_cost neoversev2_vector_cost = &neoversev2_vec_issue_info /* issue_info */ }; +/* Prefetch settings. Disable software prefetch generation but set L1 cache + line size. */ +static const cpu_prefetch_tune neoversev2_prefetch_tune = +{ + 0, /* num_slots */ + -1, /* l1_cache_size */ + 64, /* l1_cache_line_size */ + -1, /* l2_cache_size */ + true, /* prefetch_dynamic_strides */ + -1, /* minimum_stride */ + -1 /* default_opt_level */ +}; + static const struct tune_params neoversev2_tunings = { &cortexa76_extra_costs, @@ -244,7 +257,7 @@ static const struct tune_params neoversev2_tunings = | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW | AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA), /* tune_flags. */ - &generic_prefetch_tune, + &neoversev2_prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ }; diff --git a/gcc/config/alpha/alpha.cc b/gcc/config/alpha/alpha.cc index 74631a4..d7f5e3b 100644 --- a/gcc/config/alpha/alpha.cc +++ b/gcc/config/alpha/alpha.cc @@ -3269,7 +3269,7 @@ alpha_emit_xfloating_cvt (enum rtx_code orig_code, rtx operands[]) set (OP[1] OP[3]) is valid. Naturally, output operand ordering is little-endian. This is used by *movtf_internal and *movti_internal. */ - + void alpha_split_tmode_pair (rtx operands[4], machine_mode mode, bool fixup_overlap) @@ -4410,7 +4410,7 @@ emit_insxl (machine_mode mode, rtx op1, rtx op2) } /* Expand an atomic fetch-and-operate pattern. CODE is the binary operation - to perform. MEM is the memory on which to operate. VAL is the second + to perform. MEM is the memory on which to operate. VAL is the second operand of the binary operator. BEFORE and AFTER are optional locations to return the value of MEM either before of after the operation. SCRATCH is a scratch register. */ @@ -4594,7 +4594,7 @@ alpha_split_compare_and_swap_12 (rtx operands[]) label2 = gen_rtx_LABEL_REF (DImode, gen_label_rtx ()); emit_insn (gen_load_locked (DImode, scratch, mem)); - + width = GEN_INT (GET_MODE_BITSIZE (mode)); mask = GEN_INT (mode == QImode ? 0xff : 0xffff); emit_insn (gen_extxl (dest, scratch, width, addr)); @@ -4725,7 +4725,7 @@ alpha_split_atomic_exchange_12 (rtx operands[]) emit_label (XEXP (label, 0)); emit_insn (gen_load_locked (DImode, scratch, mem)); - + width = GEN_INT (GET_MODE_BITSIZE (mode)); mask = GEN_INT (mode == QImode ? 
0xff : 0xffff); emit_insn (gen_extxl (dest, scratch, width, addr)); @@ -5019,7 +5019,7 @@ get_trap_mode_suffix (void) gcc_unreachable (); } break; - + default: gcc_unreachable (); } @@ -5056,7 +5056,7 @@ get_round_mode_suffix (void) case ROUND_SUFFIX_C: return "c"; - + default: gcc_unreachable (); } @@ -6151,7 +6151,7 @@ alpha_setup_incoming_varargs (cumulative_args_t pcum, /* Detect whether integer registers or floating-point registers are needed by the detected va_arg statements. See above for how these values are computed. Note that the "escape" value - is VA_LIST_MAX_FPR_SIZE, which is 255, which has both of + is VA_LIST_MAX_FPR_SIZE, which is 255, which has both of these bits set. */ gcc_assert ((VA_LIST_MAX_FPR_SIZE & 3) == 3); @@ -6754,7 +6754,7 @@ alpha_fold_builtin_cmpbge (unsigned HOST_WIDE_INT opint[], long op_const) return NULL; } -/* Fold the builtin for the ZAPNOT instruction. This is essentially a +/* Fold the builtin for the ZAPNOT instruction. This is essentially a specialized form of an AND operation. Other byte manipulation instructions are defined in terms of this instruction, so this is also used as a subroutine for other builtins. @@ -6821,7 +6821,7 @@ alpha_fold_builtin_extxx (tree op[], unsigned HOST_WIDE_INT opint[], else zap_op = op; } - + opint[1] = bytemask; return alpha_fold_builtin_zapnot (zap_op, opint, zap_const); } @@ -7422,7 +7422,7 @@ alpha_vms_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to) HOST_WIDE_INT alpha_vms_initial_elimination_offset (unsigned int from, unsigned int to) -{ +{ /* The only possible attempts we ever expect are ARG or FRAME_PTR to HARD_FRAME or STACK_PTR. We need the alpha_procedure_type to decide on the proper computations and will need the register save area size @@ -7433,7 +7433,7 @@ alpha_vms_initial_elimination_offset (unsigned int from, unsigned int to) /* PT_NULL procedures have no frame of their own and we only allow elimination to the stack pointer. This is the argument pointer and we resolve the soft frame pointer to that as well. */ - + if (alpha_procedure_type == PT_NULL) return 0; @@ -7448,13 +7448,13 @@ alpha_vms_initial_elimination_offset (unsigned int from, unsigned int to) ^ ^ ^ ^ ARG_PTR FRAME_PTR HARD_FRAME_PTR STACK_PTR - + PT_REGISTER procedures are similar in that they may have a frame of their own. They have no regs-sa/pv/outgoing-args area. We first compute offset to HARD_FRAME_PTR, then add what we need to get to STACK_PTR if need be. */ - + { HOST_WIDE_INT offset; HOST_WIDE_INT pv_save_size = alpha_procedure_type == PT_STACK ? 8 : 0; @@ -7473,10 +7473,10 @@ alpha_vms_initial_elimination_offset (unsigned int from, unsigned int to) default: gcc_unreachable (); } - + if (to == STACK_POINTER_REGNUM) offset += ALPHA_ROUND (crtl->outgoing_args_size); - + return offset; } } @@ -8828,7 +8828,7 @@ alpha_handle_trap_shadows (void) suitably aligned. This is very processor-specific. */ /* There are a number of entries in alphaev4_insn_pipe and alphaev5_insn_pipe that are marked "fake". These instructions do not exist on that target, - but it is possible to see these insns with deranged combinations of + but it is possible to see these insns with deranged combinations of command-line options, such as "-mtune=ev4 -mmax". Instead of aborting, choose a result at random. */ @@ -9465,7 +9465,7 @@ And in the noreturn case: after the insn. In case trap is the last insn in the function, emit NOP to guarantee that PC remains inside function boundaries. This workaround is needed to get reliable backtraces. 
*/ - + rtx_insn *insn = prev_active_insn (get_last_insn ()); if (insn && NONJUMP_INSN_P (insn)) @@ -9725,7 +9725,7 @@ alpha_write_linkage (FILE *stream, const char *funname) the section; 0 if the default should be used. */ static void -vms_asm_named_section (const char *name, unsigned int flags, +vms_asm_named_section (const char *name, unsigned int flags, tree decl ATTRIBUTE_UNUSED) { fputc ('\n', asm_out_file); diff --git a/gcc/config/alpha/driver-alpha.cc b/gcc/config/alpha/driver-alpha.cc index 816d06b..16f0e7f 100644 --- a/gcc/config/alpha/driver-alpha.cc +++ b/gcc/config/alpha/driver-alpha.cc @@ -33,7 +33,7 @@ along with GCC; see the file COPYING3. If not see /* Bit defines for amask instruction. */ #define AMASK_BWX 0x1 /* byte/word extension. */ -#define AMASK_FIX 0x2 /* sqrt and f <-> i conversions +#define AMASK_FIX 0x2 /* sqrt and f <-> i conversions extension. */ #define AMASK_CIX 0x4 /* count extension. */ #define AMASK_MVI 0x100 /* multimedia extension. */ diff --git a/gcc/config/alpha/elf.h b/gcc/config/alpha/elf.h index a10454a..d4f1d40 100644 --- a/gcc/config/alpha/elf.h +++ b/gcc/config/alpha/elf.h @@ -25,7 +25,7 @@ along with GCC; see the file COPYING3. If not see #define ASM_SPEC "%{G*} %{relax:-relax} %{mcpu=*:-m%*}" /* Do not output a .file directive at the beginning of the input file. */ - + #undef TARGET_ASM_FILE_START_FILE_DIRECTIVE #define TARGET_ASM_FILE_START_FILE_DIRECTIVE false diff --git a/gcc/config/alpha/vms.h b/gcc/config/alpha/vms.h index 8038f0e..4226804 100644 --- a/gcc/config/alpha/vms.h +++ b/gcc/config/alpha/vms.h @@ -188,8 +188,8 @@ typedef struct {int num_args; enum avms_arg_type atypes[6];} avms_arg_info; #define ASM_OUTPUT_CASE_LABEL(FILE,PREFIX,NUM,TABLEINSN) \ { ASM_OUTPUT_ALIGN (FILE, 3); (*targetm.asm_out.internal_label) (FILE, PREFIX, NUM); } -/* This says how to output assembler code to declare an - uninitialized external linkage data object. */ +/* This says how to output assembler code to declare an + uninitialized external linkage data object. */ #define COMMON_ASM_OP "\t.comm\t" diff --git a/gcc/config/arc/arc.cc b/gcc/config/arc/arc.cc index c800226..95d3285 100644 --- a/gcc/config/arc/arc.cc +++ b/gcc/config/arc/arc.cc @@ -30,6 +30,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -721,7 +722,7 @@ static rtx arc_legitimize_address_0 (rtx, rtx, machine_mode mode); arc_no_speculation_in_delay_slots_p #undef TARGET_LRA_P -#define TARGET_LRA_P arc_lra_p +#define TARGET_LRA_P hook_bool_void_true #define TARGET_REGISTER_PRIORITY arc_register_priority /* Stores with scaled offsets have different displacement ranges. */ #define TARGET_DIFFERENT_ADDR_DISPLACEMENT_P hook_bool_void_true @@ -4228,7 +4229,7 @@ enum arc_shift_alg { SHIFT_MOVE, /* Register-to-register move. */ SHIFT_LOOP, /* Zero-overhead loop implementation. */ - SHIFT_INLINE, /* Mmultiple LSHIFTs and LSHIFT-PLUSs. */ + SHIFT_INLINE, /* Mmultiple LSHIFTs and LSHIFT-PLUSs. */ SHIFT_AND_ROT, /* Bitwise AND, then ROTATERTs. */ SHIFT_SWAP, /* SWAP then multiple LSHIFTs/LSHIFT-PLUSs. */ SHIFT_AND_SWAP_ROT /* Bitwise AND, then SWAP, then ROTATERTs. */ @@ -9674,7 +9675,7 @@ arc_delegitimize_address (rtx orig_x) rtx gen_acc1 (void) { - return gen_rtx_REG (SImode, TARGET_BIG_ENDIAN ? 56: 57); + return gen_rtx_REG (SImode, TARGET_BIG_ENDIAN ? 56 : 57); } /* Return a REG rtx for acc2. N.B. 
the gcc-internal representation may @@ -9684,7 +9685,7 @@ gen_acc1 (void) rtx gen_acc2 (void) { - return gen_rtx_REG (SImode, TARGET_BIG_ENDIAN ? 57: 56); + return gen_rtx_REG (SImode, TARGET_BIG_ENDIAN ? 57 : 56); } /* When estimating sizes during arc_reorg, when optimizing for speed, there @@ -10156,14 +10157,6 @@ arc_eh_uses (int regno) return false; } -/* Return true if we use LRA instead of reload pass. */ - -bool -arc_lra_p (void) -{ - return arc_lra_flag; -} - /* ??? Should we define TARGET_REGISTER_PRIORITY? We might perfer to use q registers, because some insn are shorter with them. OTOH we already have separate alternatives for this purpose, and other diff --git a/gcc/config/arc/arc.h b/gcc/config/arc/arc.h index 0a1ecb7..1d3bc37 100644 --- a/gcc/config/arc/arc.h +++ b/gcc/config/arc/arc.h @@ -608,8 +608,8 @@ extern enum reg_class arc_regno_reg_class[]; needed to represent mode MODE in a register of class CLASS. */ #define CLASS_MAX_NREGS(CLASS, MODE) \ -(( GET_MODE_SIZE (MODE) == 16 && CLASS == SIMD_VR_REGS) ? 1: \ -((GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)) +((GET_MODE_SIZE (MODE) == 16 && CLASS == SIMD_VR_REGS) ? 1 \ + : ((GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)) #define SMALL_INT(X) ((unsigned) ((X) + 0x100) < 0x200) #define SMALL_INT_RANGE(X, OFFSET, SHIFT) \ @@ -868,9 +868,9 @@ extern int arc_initial_elimination_offset(int from, int to); /* Recognize any constant value that is a valid address. */ #define CONSTANT_ADDRESS_P(X) \ - (flag_pic ? (arc_legitimate_pic_addr_p (X) || LABEL_P (X)): \ - (GET_CODE (X) == LABEL_REF || GET_CODE (X) == SYMBOL_REF \ - || GET_CODE (X) == CONST_INT || GET_CODE (X) == CONST)) + (flag_pic ? (arc_legitimate_pic_addr_p (X) || LABEL_P (X)) \ + : (GET_CODE (X) == LABEL_REF || GET_CODE (X) == SYMBOL_REF \ + || GET_CODE (X) == CONST_INT || GET_CODE (X) == CONST)) /* Is the argument a const_int rtx, containing an exact power of 2 */ #define IS_POWEROF2_P(X) (! ( (X) & ((X) - 1)) && (X)) @@ -1660,8 +1660,4 @@ enum /* The default option for BI/BIH instructions. */ #define DEFAULT_BRANCH_INDEX 0 -#ifndef TARGET_LRA -#define TARGET_LRA arc_lra_p() -#endif - #endif /* GCC_ARC_H */ diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt index 5abb297..7b93183 100644 --- a/gcc/config/arc/arc.opt +++ b/gcc/config/arc/arc.opt @@ -401,8 +401,8 @@ Pass -marclinux_prof option through to linker. ;; lra is still unproven for ARC, so allow to fall back to reload with -mno-lra. mlra -Target Var(arc_lra_flag) Init(1) Save -Use LRA instead of reload. +Target Ignore +Does nothing. Preserved for backward compatibility. mlra-priority-none Target RejectNegative Var(arc_lra_priority_tag, ARC_LRA_PRIORITY_NONE) diff --git a/gcc/config/arc/simdext.md b/gcc/config/arc/simdext.md index 4e51a23..0696f0a 100644 --- a/gcc/config/arc/simdext.md +++ b/gcc/config/arc/simdext.md @@ -1643,7 +1643,7 @@ ;; We can use dmac as well here. To be investigated which version ;; brings more. 
-(define_expand "sdot_prodv2hi" +(define_expand "sdot_prodsiv2hi" [(match_operand:SI 0 "register_operand" "") (match_operand:V2HI 1 "register_operand" "") (match_operand:V2HI 2 "register_operand" "") @@ -1656,7 +1656,7 @@ DONE; }) -(define_expand "udot_prodv2hi" +(define_expand "udot_prodsiv2hi" [(match_operand:SI 0 "register_operand" "") (match_operand:V2HI 1 "register_operand" "") (match_operand:V2HI 2 "register_operand" "") @@ -1669,7 +1669,7 @@ DONE; }) -(define_expand "sdot_prodv4hi" +(define_expand "sdot_prodv2siv4hi" [(match_operand:V2SI 0 "register_operand" "") (match_operand:V4HI 1 "register_operand" "") (match_operand:V4HI 2 "register_operand" "") @@ -1688,7 +1688,7 @@ DONE; }) -(define_expand "udot_prodv4hi" +(define_expand "udot_prodv2siv4hi" [(match_operand:V2SI 0 "register_operand" "") (match_operand:V4HI 1 "register_operand" "") (match_operand:V4HI 2 "register_operand" "") diff --git a/gcc/config/arm/aarch-common.cc b/gcc/config/arm/aarch-common.cc index aa405af..44012fe 100644 --- a/gcc/config/arm/aarch-common.cc +++ b/gcc/config/arm/aarch-common.cc @@ -23,6 +23,7 @@ #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/arm/aout.h b/gcc/config/arm/aout.h index db9e8dd..0c32c1e 100644 --- a/gcc/config/arm/aout.h +++ b/gcc/config/arm/aout.h @@ -1,7 +1,7 @@ /* Definitions of target machine for GNU compiler, for ARM with a.out Copyright (C) 1995-2024 Free Software Foundation, Inc. Contributed by Richard Earnshaw (rearnsha@armltd.co.uk). - + This file is part of GCC. GCC is free software; you can redistribute it and/or modify it @@ -165,7 +165,7 @@ #define ASM_GENERATE_INTERNAL_LABEL(STRING, PREFIX, NUM) \ sprintf (STRING, "*%s%s%u", LOCAL_LABEL_PREFIX, PREFIX, (unsigned int)(NUM)) #endif - + /* Output an element of a dispatch table. */ #define ASM_OUTPUT_ADDR_VEC_ELT(STREAM, VALUE) \ do \ @@ -174,7 +174,7 @@ asm_fprintf (STREAM, "\t.word\t%LL%d\n", VALUE); \ } \ while (0) - + /* Thumb-2 always uses addr_diff_elf so that the Table Branch instructions can be used. For non-pic code where the offsets do not suitable for @@ -266,7 +266,7 @@ fprintf (STREAM, "\t.space\t%d\n", (int) (NBYTES)) /* Align output to a power of two. Horrible /bin/as. */ -#ifndef ASM_OUTPUT_ALIGN +#ifndef ASM_OUTPUT_ALIGN #define ASM_OUTPUT_ALIGN(STREAM, POWER) \ do \ { \ @@ -292,7 +292,7 @@ } \ while (0) #endif - + /* Output a local common block. /bin/as can't do this, so hack a `.space' into the bss segment. Note that this is *bad* practice, which is guaranteed NOT to work since it doesn't define STATIC @@ -308,7 +308,7 @@ } \ while (0) #endif - + /* Output a zero-initialized block. 
*/ #ifndef ASM_OUTPUT_ALIGNED_BSS #define ASM_OUTPUT_ALIGNED_BSS(STREAM, DECL, NAME, SIZE, ALIGN) \ diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index c9d50bf..6ee1563 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -477,19 +477,6 @@ arm_ternop_unone_unone_none_none_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_ternop_unone_unone_none_none_qualifiers) static enum arm_type_qualifiers -arm_ternop_unone_none_unone_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_unsigned, qualifier_none, qualifier_unsigned, - qualifier_immediate }; -#define TERNOP_UNONE_NONE_UNONE_IMM_QUALIFIERS \ - (arm_ternop_unone_none_unone_imm_qualifiers) - -static enum arm_type_qualifiers -arm_ternop_none_none_unone_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_immediate }; -#define TERNOP_NONE_NONE_UNONE_IMM_QUALIFIERS \ - (arm_ternop_none_none_unone_imm_qualifiers) - -static enum arm_type_qualifiers arm_ternop_unone_unone_none_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_none, qualifier_immediate }; @@ -624,16 +611,6 @@ arm_quadop_unone_unone_unone_none_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] (arm_quadop_unone_unone_unone_none_pred_qualifiers) static enum arm_type_qualifiers -arm_strs_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_void, qualifier_pointer, qualifier_none }; -#define STRS_QUALIFIERS (arm_strs_qualifiers) - -static enum arm_type_qualifiers -arm_stru_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_void, qualifier_pointer, qualifier_unsigned }; -#define STRU_QUALIFIERS (arm_stru_qualifiers) - -static enum arm_type_qualifiers arm_strss_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_pointer, qualifier_unsigned, qualifier_none}; @@ -657,17 +634,6 @@ arm_strsbu_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define STRSBU_QUALIFIERS (arm_strsbu_qualifiers) static enum arm_type_qualifiers -arm_strs_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_void, qualifier_pointer, qualifier_none, qualifier_predicate}; -#define STRS_P_QUALIFIERS (arm_strs_p_qualifiers) - -static enum arm_type_qualifiers -arm_stru_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_void, qualifier_pointer, qualifier_unsigned, - qualifier_predicate}; -#define STRU_P_QUALIFIERS (arm_stru_p_qualifiers) - -static enum arm_type_qualifiers arm_strsu_p_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_void, qualifier_pointer, qualifier_unsigned, qualifier_unsigned, qualifier_predicate}; @@ -702,16 +668,6 @@ arm_ldrgs_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define LDRGS_QUALIFIERS (arm_ldrgs_qualifiers) static enum arm_type_qualifiers -arm_ldrs_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_pointer}; -#define LDRS_QUALIFIERS (arm_ldrs_qualifiers) - -static enum arm_type_qualifiers -arm_ldru_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_unsigned, qualifier_pointer}; -#define LDRU_QUALIFIERS (arm_ldru_qualifiers) - -static enum arm_type_qualifiers arm_ldrgbs_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_unsigned, qualifier_immediate}; #define LDRGBS_QUALIFIERS (arm_ldrgbs_qualifiers) @@ -746,23 +702,6 @@ arm_ldrgu_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define LDRGU_Z_QUALIFIERS (arm_ldrgu_z_qualifiers) static enum arm_type_qualifiers -arm_ldrs_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_none, qualifier_pointer, qualifier_predicate}; -#define LDRS_Z_QUALIFIERS (arm_ldrs_z_qualifiers) - -static enum arm_type_qualifiers 
-arm_ldru_z_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_unsigned, qualifier_pointer, qualifier_predicate}; -#define LDRU_Z_QUALIFIERS (arm_ldru_z_qualifiers) - -static enum arm_type_qualifiers -arm_quinop_unone_unone_unone_unone_imm_pred_qualifiers[SIMD_MAX_BUILTIN_ARGS] - = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned, - qualifier_unsigned, qualifier_immediate, qualifier_predicate }; -#define QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED_QUALIFIERS \ - (arm_quinop_unone_unone_unone_unone_imm_pred_qualifiers) - -static enum arm_type_qualifiers arm_ldrgbwbxu_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_unsigned, qualifier_unsigned, qualifier_immediate}; #define LDRGBWBXU_QUALIFIERS (arm_ldrgbwbxu_qualifiers) @@ -908,6 +847,13 @@ typedef struct { enum arm_type_qualifiers *qualifiers; } arm_builtin_datum; +constexpr insn_code CODE_FOR_neon_sdotv8qi = CODE_FOR_neon_sdotv2siv8qi; +constexpr insn_code CODE_FOR_neon_udotv8qi = CODE_FOR_neon_udotv2siv8qi; +constexpr insn_code CODE_FOR_neon_usdotv8qi = CODE_FOR_neon_usdotv2siv8qi; +constexpr insn_code CODE_FOR_neon_sdotv16qi = CODE_FOR_neon_sdotv4siv16qi; +constexpr insn_code CODE_FOR_neon_udotv16qi = CODE_FOR_neon_udotv4siv16qi; +constexpr insn_code CODE_FOR_neon_usdotv16qi = CODE_FOR_neon_usdotv4siv16qi; + #define CF(N,X) CODE_FOR_neon_##N##X #define VAR1(T, N, A) \ diff --git a/gcc/config/arm/arm-mve-builtins-base.cc b/gcc/config/arm/arm-mve-builtins-base.cc index e0ae593..2c8ff46 100644 --- a/gcc/config/arm/arm-mve-builtins-base.cc +++ b/gcc/config/arm/arm-mve-builtins-base.cc @@ -30,6 +30,7 @@ #include "basic-block.h" #include "function.h" #include "gimple.h" +#include "emit-rtl.h" #include "arm-mve-builtins.h" #include "arm-mve-builtins-shapes.h" #include "arm-mve-builtins-base.h" @@ -39,6 +40,59 @@ using namespace arm_mve; namespace { +/* Implements vdup_* intrinsics. */ +class vdupq_impl : public quiet<function_base> +{ +public: + CONSTEXPR vdupq_impl (int unspec_for_m_n_sint, + int unspec_for_m_n_uint, + int unspec_for_m_n_fp) + : m_unspec_for_m_n_sint (unspec_for_m_n_sint), + m_unspec_for_m_n_uint (unspec_for_m_n_uint), + m_unspec_for_m_n_fp (unspec_for_m_n_fp) + {} + int m_unspec_for_m_n_sint; + int m_unspec_for_m_n_uint; + int m_unspec_for_m_n_fp; + + rtx expand (function_expander &e) const override + { + gcc_assert (e.mode_suffix_id == MODE_n); + + insn_code code; + machine_mode mode = e.vector_mode (0); + + switch (e.pred) + { + case PRED_none: + /* No predicate, _n suffix. */ + code = code_for_mve_vdupq_n (mode); + return e.use_exact_insn (code); + + case PRED_m: + case PRED_x: + /* "m" or "x" predicate, _n suffix. */ + if (e.type_suffix (0).integer_p) + if (e.type_suffix (0).unsigned_p) + code = code_for_mve_q_m_n (m_unspec_for_m_n_uint, + m_unspec_for_m_n_uint, mode); + else + code = code_for_mve_q_m_n (m_unspec_for_m_n_sint, + m_unspec_for_m_n_sint, mode); + else + code = code_for_mve_q_m_n_f (m_unspec_for_m_n_fp, mode); + + if (e.pred == PRED_m) + return e.use_cond_insn (code, 0); + else + return e.use_pred_x_insn (code); + + default: + gcc_unreachable (); + } + } +}; + /* Implements vreinterpretq_* intrinsics. 
*/ class vreinterpretq_impl : public quiet<function_base> { @@ -96,16 +150,18 @@ public: expand (function_expander &e) const override { insn_code icode; - if (e.type_suffix (0).float_p) - icode = code_for_mve_vld1q_f(e.vector_mode (0)); - else + switch (e.pred) { - if (e.type_suffix (0).unsigned_p) - icode = code_for_mve_vld1q(VLD1Q_U, - e.vector_mode (0)); - else - icode = code_for_mve_vld1q(VLD1Q_S, - e.vector_mode (0)); + case PRED_none: + icode = code_for_mve_vldrq (e.vector_mode (0)); + break; + + case PRED_z: + icode = code_for_mve_vldrq_z (e.vector_mode (0)); + break; + + default: + gcc_unreachable (); } return e.use_contiguous_load_insn (icode); } @@ -124,21 +180,683 @@ public: expand (function_expander &e) const override { insn_code icode; - if (e.type_suffix (0).float_p) - icode = code_for_mve_vst1q_f(e.vector_mode (0)); - else + switch (e.pred) { - if (e.type_suffix (0).unsigned_p) - icode = code_for_mve_vst1q(VST1Q_U, - e.vector_mode (0)); + case PRED_none: + icode = code_for_mve_vstrq (e.vector_mode (0)); + break; + + case PRED_p: + icode = code_for_mve_vstrq_p (e.vector_mode (0)); + break; + + default: + gcc_unreachable (); + } + return e.use_contiguous_store_insn (icode); + } +}; + +/* Builds the vstrq* intrinsics. */ +class vstrq_impl : public store_truncating +{ +public: + using store_truncating::store_truncating; + + unsigned int call_properties (const function_instance &) const override + { + return CP_WRITE_MEMORY; + } + + rtx expand (function_expander &e) const override + { + insn_code icode; + switch (e.pred) + { + case PRED_none: + if (e.vector_mode (0) == e.memory_vector_mode ()) + /* Non-truncating store case. */ + icode = code_for_mve_vstrq (e.vector_mode (0)); else - icode = code_for_mve_vst1q(VST1Q_S, - e.vector_mode (0)); + /* Truncating store case. + (there is only one possible truncation for each memory mode so only + one mode argument is needed). */ + icode = code_for_mve_vstrq_truncate (e.memory_vector_mode ()); + break; + + case PRED_p: + if (e.vector_mode (0) == e.memory_vector_mode ()) + icode = code_for_mve_vstrq_p (e.vector_mode (0)); + else + icode = code_for_mve_vstrq_p_truncate (e.memory_vector_mode ()); + break; + + default: + gcc_unreachable (); } + return e.use_contiguous_store_insn (icode); } }; +/* Builds the vldrq* intrinsics. */ +class vldrq_impl : public load_extending +{ +public: + using load_extending::load_extending; + + unsigned int call_properties (const function_instance &) const override + { + return CP_READ_MEMORY; + } + + rtx expand (function_expander &e) const override + { + insn_code icode; + switch (e.pred) + { + case PRED_none: + if (e.vector_mode (0) == e.memory_vector_mode ()) + /* Non-extending load case. */ + icode = code_for_mve_vldrq (e.vector_mode (0)); + else + /* Extending load case. + (there is only one extension for each memory mode so only one type + argument is needed). */ + icode = code_for_mve_vldrq_extend (e.memory_vector_mode (), + e.type_suffix (0).unsigned_p + ? ZERO_EXTEND + : SIGN_EXTEND); + break; + + case PRED_z: + if (e.vector_mode (0) == e.memory_vector_mode ()) + icode = code_for_mve_vldrq_z (e.vector_mode (0)); + else + icode = code_for_mve_vldrq_z_extend (e.memory_vector_mode (), + e.type_suffix (0).unsigned_p + ? ZERO_EXTEND + : SIGN_EXTEND); + break; + + default: + gcc_unreachable (); + } + + return e.use_contiguous_load_insn (icode); + } +}; + + /* Implements vctp8q, vctp16q, vctp32q and vctp64q intrinsics. 
*/ +class vctpq_impl : public function_base +{ +public: + CONSTEXPR vctpq_impl (machine_mode mode) + : m_mode (mode) + {} + + /* Mode this intrinsic operates on. */ + machine_mode m_mode; + + rtx + expand (function_expander &e) const override + { + insn_code code; + rtx target; + + if (e.mode_suffix_id != MODE_none) + gcc_unreachable (); + + switch (e.pred) + { + case PRED_none: + /* No predicate, no suffix. */ + code = code_for_mve_vctpq (m_mode, m_mode); + target = e.use_exact_insn (code); + break; + + case PRED_m: + /* No suffix, "m" predicate. */ + code = code_for_mve_vctpq_m (m_mode, m_mode); + target = e.use_cond_insn (code, 0); + break; + + default: + gcc_unreachable (); + } + + rtx HItarget = gen_reg_rtx (HImode); + emit_move_insn (HItarget, gen_lowpart (HImode, target)); + return HItarget; + } +}; + + /* Implements vcvtq intrinsics. */ +class vcvtq_impl : public function_base +{ +public: + rtx + expand (function_expander &e) const override + { + insn_code code; + machine_mode target_mode = e.vector_mode (0); + int unspec; + switch (e.pred) + { + case PRED_none: + switch (e.mode_suffix_id) + { + case MODE_none: + /* No predicate, no suffix. */ + if (e.type_suffix (0).integer_p) + { + unspec = (e.type_suffix (0).unsigned_p + ? VCVTQ_FROM_F_U + : VCVTQ_FROM_F_S); + code = code_for_mve_q_from_f (unspec, unspec, target_mode); + } + else + { + unspec = (e.type_suffix (1).unsigned_p + ? VCVTQ_TO_F_U + : VCVTQ_TO_F_S); + code = code_for_mve_q_to_f (unspec, unspec, target_mode); + } + break; + + case MODE_n: + /* No predicate, _n suffix. */ + if (e.type_suffix (0).integer_p) + { + unspec = (e.type_suffix (0).unsigned_p + ? VCVTQ_N_FROM_F_U + : VCVTQ_N_FROM_F_S); + code = code_for_mve_q_n_from_f (unspec, unspec, target_mode); + } + else + { + unspec = (e.type_suffix (1).unsigned_p + ? VCVTQ_N_TO_F_U + : VCVTQ_N_TO_F_S); + code = code_for_mve_q_n_to_f (unspec, unspec, target_mode); + } + break; + + default: + gcc_unreachable (); + } + return e.use_exact_insn (code); + + case PRED_m: + case PRED_x: + switch (e.mode_suffix_id) + { + case MODE_none: + /* No suffix, "m" or "x" predicate. */ + if (e.type_suffix (0).integer_p) + { + unspec = (e.type_suffix (0).unsigned_p + ? VCVTQ_M_FROM_F_U + : VCVTQ_M_FROM_F_S); + code = code_for_mve_q_m_from_f (unspec, unspec, target_mode); + } + else + { + unspec = (e.type_suffix (1).unsigned_p + ? VCVTQ_M_TO_F_U + : VCVTQ_M_TO_F_S); + code = code_for_mve_q_m_to_f (unspec, unspec, target_mode); + } + break; + + case MODE_n: + /* _n suffix, "m" or "x" predicate. */ + if (e.type_suffix (0).integer_p) + { + unspec = (e.type_suffix (0).unsigned_p + ? VCVTQ_M_N_FROM_F_U + : VCVTQ_M_N_FROM_F_S); + code = code_for_mve_q_m_n_from_f (unspec, unspec, target_mode); + } + else + { + unspec = (e.type_suffix (1).unsigned_p + ? VCVTQ_M_N_TO_F_U + : VCVTQ_M_N_TO_F_S); + code = code_for_mve_q_m_n_to_f (unspec, unspec, target_mode); + } + break; + + default: + gcc_unreachable (); + } + if (e.pred == PRED_m) + return e.use_cond_insn (code, 0); + else + return e.use_pred_x_insn (code); + + default: + gcc_unreachable (); + } + + gcc_unreachable (); + } +}; + + /* Implements vcvt[bt]q_f32_f16 and vcvt[bt]q_f16_f32 + intrinsics. 
*/ +class vcvtxq_impl : public function_base +{ +public: + CONSTEXPR vcvtxq_impl (int unspec_f16_f32, int unspec_for_m_f16_f32, + int unspec_f32_f16, int unspec_for_m_f32_f16) + : m_unspec_f16_f32 (unspec_f16_f32), + m_unspec_for_m_f16_f32 (unspec_for_m_f16_f32), + m_unspec_f32_f16 (unspec_f32_f16), + m_unspec_for_m_f32_f16 (unspec_for_m_f32_f16) + {} + + /* The unspec code associated with vcvt[bt]q. */ + int m_unspec_f16_f32; + int m_unspec_for_m_f16_f32; + int m_unspec_f32_f16; + int m_unspec_for_m_f32_f16; + + rtx + expand (function_expander &e) const override + { + insn_code code; + switch (e.pred) + { + case PRED_none: + /* No predicate. */ + if (e.type_suffix (0).element_bits == 16) + code = code_for_mve_q_f16_f32v8hf (m_unspec_f16_f32); + else + code = code_for_mve_q_f32_f16v4sf (m_unspec_f32_f16); + return e.use_exact_insn (code); + + case PRED_m: + case PRED_x: + /* "m" or "x" predicate. */ + if (e.type_suffix (0).element_bits == 16) + code = code_for_mve_q_m_f16_f32v8hf (m_unspec_for_m_f16_f32); + else + code = code_for_mve_q_m_f32_f16v4sf (m_unspec_for_m_f32_f16); + + if (e.pred == PRED_m) + return e.use_cond_insn (code, 0); + else + return e.use_pred_x_insn (code); + + default: + gcc_unreachable (); + } + + gcc_unreachable (); + } +}; + +/* Map the vidup / vddup function directly to CODE (UNSPEC, M) where M is the + vector mode associated with type suffix 0. We need this special case + because in MODE_wb the builtins derefrence the first parameter and update + its contents. We also have to insert the two additional parameters needed + by the builtins compared to the intrinsics. In wrapping mode, we have to + match the 'hack' to make sure the 'wrap' parameters is in odd register. */ +class viddup_impl : public function_base +{ +public: + CONSTEXPR viddup_impl (bool inc_dec, bool wrap) + : m_inc_dec (inc_dec), m_wrap (wrap) + {} + + /* Increment (true) or decrement (false). */ + bool m_inc_dec; + /* v[id]wdup (true) or v[id]dup (false). */ + bool m_wrap; + + unsigned int + call_properties (const function_instance &fi) const override + { + if (fi.mode_suffix_id == MODE_wb) + return CP_WRITE_MEMORY | CP_READ_MEMORY; + else + return 0; + } + + tree + memory_scalar_type (const function_instance &) const override + { + return get_typenode_from_name (UINT32_TYPE); + } + + rtx + expand (function_expander &e) const override + { + machine_mode mode = e.vector_mode (0); + insn_code code; + rtx insns, offset_ptr; + rtx new_offset; + int offset_arg_no; + + if (! e.type_suffix (0).integer_p) + gcc_unreachable (); + + if ((e.mode_suffix_id != MODE_n) + && (e.mode_suffix_id != MODE_wb)) + gcc_unreachable (); + + offset_arg_no = (e.pred == PRED_m) ? 1 : 0; + + /* In _wb mode, the start offset is passed via a pointer, + dereference it. 
*/ + if (e.mode_suffix_id == MODE_wb) + { + rtx offset = gen_reg_rtx (SImode); + offset_ptr = e.args[offset_arg_no]; + emit_insn (gen_rtx_SET (offset, gen_rtx_MEM (SImode, offset_ptr))); + e.args[offset_arg_no] = offset; + } + + /* We have to shuffle parameters because the builtin needs additional + arguments: + - the updated "new_offset" + - total increment (incr * number of lanes) in the non-wrapping case + - hack to pass wrap in the top end of DImode operand so that it is + actually in a odd register */ + new_offset = gen_reg_rtx (SImode); + e.args.quick_insert (offset_arg_no, new_offset); + + if (m_wrap) + { + rtx wrap = gen_reg_rtx (DImode); + emit_insn (gen_rtx_SET (gen_rtx_SUBREG (SImode, wrap, 4), + e.args[offset_arg_no + 2])); + emit_insn (gen_rtx_SET (gen_rtx_SUBREG (SImode, wrap, 0), + GEN_INT (0))); + e.args[offset_arg_no + 2] = wrap; + } + else + { + rtx incr = e.args[offset_arg_no + 2]; + rtx total_incr = gen_int_mode (INTVAL (incr) + * GET_MODE_NUNITS (e.vector_mode (0)), + SImode); + e.args.quick_push (total_incr); + } + + /* _wb mode uses the _n builtins and adds code to update the + offset. */ + switch (e.pred) + { + case PRED_none: + /* No predicate. */ + code = m_wrap + ? (m_inc_dec + ? code_for_mve_q_wb_u_insn (VIWDUPQ, mode) + : code_for_mve_q_wb_u_insn (VDWDUPQ, mode)) + : (m_inc_dec + ? code_for_mve_q_u_insn (VIDUPQ, mode) + : code_for_mve_q_u_insn (VDDUPQ, mode)); + insns = e.use_exact_insn (code); + break; + + case PRED_m: + case PRED_x: + /* "m" or "x" predicate. */ + code = m_wrap + ? (m_inc_dec + ? code_for_mve_q_m_wb_u_insn (VIWDUPQ_M, mode) + : code_for_mve_q_m_wb_u_insn (VDWDUPQ_M, mode)) + : (m_inc_dec + ? code_for_mve_q_m_wb_u_insn (VIDUPQ_M, mode) + : code_for_mve_q_m_wb_u_insn (VDDUPQ_M, mode)); + + if (e.pred == PRED_m) + insns = e.use_cond_insn (code, 0); + else + insns = e.use_pred_x_insn (code); + break; + + default: + gcc_unreachable (); + } + + /* Update offset as appropriate. */ + if (e.mode_suffix_id == MODE_wb) + emit_insn (gen_rtx_SET (gen_rtx_MEM (Pmode, offset_ptr), new_offset)); + + return insns; + } +}; + +/* Map the vshlc function directly to CODE (UNSPEC, M) where M is the vector + mode associated with type suffix 0. We need this special case because the + intrinsics derefrence the second parameter and update its contents. */ +class vshlc_impl : public function_base +{ +public: + unsigned int + call_properties (const function_instance &) const override + { + return CP_WRITE_MEMORY | CP_READ_MEMORY; + } + + tree + memory_scalar_type (const function_instance &) const override + { + return get_typenode_from_name (UINT32_TYPE); + } + + rtx + expand (function_expander &e) const override + { + machine_mode mode = e.vector_mode (0); + insn_code code; + rtx insns, carry_ptr, carry, new_carry; + int carry_arg_no; + + if (! e.type_suffix (0).integer_p) + gcc_unreachable (); + + if (e.mode_suffix_id != MODE_none) + gcc_unreachable (); + + carry_arg_no = 1; + + carry = gen_reg_rtx (SImode); + carry_ptr = e.args[carry_arg_no]; + emit_insn (gen_rtx_SET (carry, gen_rtx_MEM (SImode, carry_ptr))); + e.args[carry_arg_no] = carry; + + new_carry = gen_reg_rtx (SImode); + e.args.quick_insert (0, new_carry); + + switch (e.pred) + { + case PRED_none: + /* No predicate. */ + code = e.type_suffix (0).unsigned_p + ? code_for_mve_vshlcq (VSHLCQ_U, mode) + : code_for_mve_vshlcq (VSHLCQ_S, mode); + insns = e.use_exact_insn (code); + break; + + case PRED_m: + /* "m" predicate. */ + code = e.type_suffix (0).unsigned_p + ? 
code_for_mve_vshlcq_m (VSHLCQ_M_U, mode) + : code_for_mve_vshlcq_m (VSHLCQ_M_S, mode); + insns = e.use_cond_insn (code, 0); + break; + + default: + gcc_unreachable (); + } + + /* Update carry. */ + emit_insn (gen_rtx_SET (gen_rtx_MEM (Pmode, carry_ptr), new_carry)); + + return insns; + } +}; + +/* Map the vadc and similar functions directly to CODE (UNSPEC, UNSPEC). Take + care of the implicit carry argument. */ +class vadc_vsbc_impl : public function_base +{ +public: + CONSTEXPR vadc_vsbc_impl (bool init_carry, bool add) + : m_init_carry (init_carry), m_add (add) + {} + + /* Initialize carry with 0 (vadci). */ + bool m_init_carry; + /* Add (true) or Sub (false). */ + bool m_add; + + unsigned int + call_properties (const function_instance &) const override + { + unsigned int flags = CP_WRITE_MEMORY | CP_READ_FPCR; + if (!m_init_carry) + flags |= CP_READ_MEMORY; + return flags; + } + + tree + memory_scalar_type (const function_instance &) const override + { + /* carry is "unsigned int". */ + return get_typenode_from_name ("unsigned int"); + } + + rtx + expand (function_expander &e) const override + { + insn_code code; + rtx insns, carry_ptr, carry_out; + int carry_out_arg_no; + int unspec; + + if (! e.type_suffix (0).integer_p) + gcc_unreachable (); + + if (e.mode_suffix_id != MODE_none) + gcc_unreachable (); + + /* Remove carry from arguments, it is implicit for the builtin. */ + switch (e.pred) + { + case PRED_none: + carry_out_arg_no = 2; + break; + + case PRED_m: + carry_out_arg_no = 3; + break; + + default: + gcc_unreachable (); + } + + carry_ptr = e.args[carry_out_arg_no]; + e.args.ordered_remove (carry_out_arg_no); + + if (!m_init_carry) + { + /* Prepare carry in: + set_fpscr ( (fpscr & ~0x20000000u) + | ((*carry & 1u) << 29) ) */ + rtx carry_in = gen_reg_rtx (SImode); + rtx fpscr = gen_reg_rtx (SImode); + emit_insn (gen_get_fpscr_nzcvqc (fpscr)); + emit_insn (gen_rtx_SET (carry_in, gen_rtx_MEM (SImode, carry_ptr))); + + emit_insn (gen_rtx_SET (carry_in, + gen_rtx_ASHIFT (SImode, + carry_in, + GEN_INT (29)))); + emit_insn (gen_rtx_SET (carry_in, + gen_rtx_AND (SImode, + carry_in, + GEN_INT (0x20000000)))); + emit_insn (gen_rtx_SET (fpscr, + gen_rtx_AND (SImode, + fpscr, + GEN_INT (~0x20000000)))); + emit_insn (gen_rtx_SET (carry_in, + gen_rtx_IOR (SImode, + carry_in, + fpscr))); + emit_insn (gen_set_fpscr_nzcvqc (carry_in)); + } + + switch (e.pred) + { + case PRED_none: + /* No predicate. */ + unspec = m_add + ? (m_init_carry + ? (e.type_suffix (0).unsigned_p + ? VADCIQ_U + : VADCIQ_S) + : (e.type_suffix (0).unsigned_p + ? VADCQ_U + : VADCQ_S)) + : (m_init_carry + ? (e.type_suffix (0).unsigned_p + ? VSBCIQ_U + : VSBCIQ_S) + : (e.type_suffix (0).unsigned_p + ? VSBCQ_U + : VSBCQ_S)); + code = code_for_mve_q_v4si (unspec, unspec); + insns = e.use_exact_insn (code); + break; + + case PRED_m: + /* "m" predicate. */ + unspec = m_add + ? (m_init_carry + ? (e.type_suffix (0).unsigned_p + ? VADCIQ_M_U + : VADCIQ_M_S) + : (e.type_suffix (0).unsigned_p + ? VADCQ_M_U + : VADCQ_M_S)) + : (m_init_carry + ? (e.type_suffix (0).unsigned_p + ? VSBCIQ_M_U + : VSBCIQ_M_S) + : (e.type_suffix (0).unsigned_p + ? VSBCQ_M_U + : VSBCQ_M_S)); + code = code_for_mve_q_m_v4si (unspec, unspec); + insns = e.use_cond_insn (code, 0); + break; + + default: + gcc_unreachable (); + } + + /* Update carry_out. 
*/ + carry_out = gen_reg_rtx (SImode); + emit_insn (gen_get_fpscr_nzcvqc (carry_out)); + emit_insn (gen_rtx_SET (carry_out, + gen_rtx_LSHIFTRT (SImode, + carry_out, + GEN_INT (29)))); + emit_insn (gen_rtx_SET (carry_out, + gen_rtx_AND (SImode, + carry_out, + GEN_INT (1)))); + emit_insn (gen_rtx_SET (gen_rtx_MEM (Pmode, carry_ptr), carry_out)); + + return insns; + } +}; + } /* end anonymous namespace */ namespace arm_mve { @@ -309,12 +1027,15 @@ namespace arm_mve { FUNCTION_PRED_P_S_U (vabavq, VABAVQ) FUNCTION_WITHOUT_N (vabdq, VABDQ) FUNCTION (vabsq, unspec_based_mve_function_exact_insn, (ABS, ABS, ABS, -1, -1, -1, VABSQ_M_S, -1, VABSQ_M_F, -1, -1, -1)) +FUNCTION (vadciq, vadc_vsbc_impl, (true, true)) +FUNCTION (vadcq, vadc_vsbc_impl, (false, true)) FUNCTION_WITH_RTX_M_N (vaddq, PLUS, VADDQ) FUNCTION_PRED_P_S_U (vaddlvaq, VADDLVAQ) FUNCTION_PRED_P_S_U (vaddlvq, VADDLVQ) FUNCTION_PRED_P_S_U (vaddvq, VADDVQ) FUNCTION_PRED_P_S_U (vaddvaq, VADDVAQ) FUNCTION_WITH_RTX_M (vandq, AND, VANDQ) +FUNCTION (vbicq, unspec_based_mve_function_exact_insn_vbic, (VBICQ_N_S, VBICQ_N_U, VBICQ_M_S, VBICQ_M_U, VBICQ_M_F, VBICQ_M_N_S, VBICQ_M_N_U)) FUNCTION_ONLY_N (vbrsrq, VBRSRQ) FUNCTION (vcaddq_rot90, unspec_mve_function_exact_insn_rot, (UNSPEC_VCADD90, UNSPEC_VCADD90, UNSPEC_VCADD90, VCADDQ_ROT90_M, VCADDQ_ROT90_M, VCADDQ_ROT90_M_F)) FUNCTION (vcaddq_rot270, unspec_mve_function_exact_insn_rot, (UNSPEC_VCADD270, UNSPEC_VCADD270, UNSPEC_VCADD270, VCADDQ_ROT270_M, VCADDQ_ROT270_M, VCADDQ_ROT270_M_F)) @@ -339,7 +1060,22 @@ FUNCTION (vcmpltq, unspec_based_mve_function_exact_insn_vcmp, (LT, UNKNOWN, LT, FUNCTION (vcmpcsq, unspec_based_mve_function_exact_insn_vcmp, (UNKNOWN, GEU, UNKNOWN, UNKNOWN, VCMPCSQ_M_U, UNKNOWN, UNKNOWN, VCMPCSQ_M_N_U, UNKNOWN)) FUNCTION (vcmphiq, unspec_based_mve_function_exact_insn_vcmp, (UNKNOWN, GTU, UNKNOWN, UNKNOWN, VCMPHIQ_M_U, UNKNOWN, UNKNOWN, VCMPHIQ_M_N_U, UNKNOWN)) FUNCTION_WITHOUT_M_N (vcreateq, VCREATEQ) -FUNCTION_ONLY_N (vdupq, VDUPQ) +FUNCTION (vctp8q, vctpq_impl, (V16BImode)) +FUNCTION (vctp16q, vctpq_impl, (V8BImode)) +FUNCTION (vctp32q, vctpq_impl, (V4BImode)) +FUNCTION (vctp64q, vctpq_impl, (V2QImode)) +FUNCTION_WITHOUT_N_NO_F (vcvtaq, VCVTAQ) +FUNCTION (vcvtbq, vcvtxq_impl, (VCVTBQ_F16_F32, VCVTBQ_M_F16_F32, VCVTBQ_F32_F16, VCVTBQ_M_F32_F16)) +FUNCTION (vcvtq, vcvtq_impl,) +FUNCTION_WITHOUT_N_NO_F (vcvtmq, VCVTMQ) +FUNCTION_WITHOUT_N_NO_F (vcvtnq, VCVTNQ) +FUNCTION_WITHOUT_N_NO_F (vcvtpq, VCVTPQ) +FUNCTION (vcvttq, vcvtxq_impl, (VCVTTQ_F16_F32, VCVTTQ_M_F16_F32, VCVTTQ_F32_F16, VCVTTQ_M_F32_F16)) +FUNCTION (vddupq, viddup_impl, (false, false)) +FUNCTION (vdupq, vdupq_impl, (VDUPQ_M_N_S, VDUPQ_M_N_U, VDUPQ_M_N_F)) +FUNCTION (vdwdupq, viddup_impl, (false, true)) +FUNCTION (vidupq, viddup_impl, (true, false)) +FUNCTION (viwdupq, viddup_impl, (true, true)) FUNCTION_WITH_RTX_M (veorq, XOR, VEORQ) FUNCTION (vfmaq, unspec_mve_function_exact_insn, (-1, -1, VFMAQ_F, -1, -1, VFMAQ_N_F, -1, -1, VFMAQ_M_F, -1, -1, VFMAQ_M_N_F)) FUNCTION (vfmasq, unspec_mve_function_exact_insn, (-1, -1, -1, -1, -1, VFMASQ_N_F, -1, -1, -1, -1, -1, VFMASQ_M_N_F)) @@ -347,6 +1083,9 @@ FUNCTION (vfmsq, unspec_mve_function_exact_insn, (-1, -1, VFMSQ_F, -1, -1, -1, - FUNCTION_WITH_M_N_NO_F (vhaddq, VHADDQ) FUNCTION_WITH_M_N_NO_F (vhsubq, VHSUBQ) FUNCTION (vld1q, vld1_impl,) +FUNCTION (vldrbq, vldrq_impl, (TYPE_SUFFIX_s8, TYPE_SUFFIX_u8)) +FUNCTION (vldrhq, vldrq_impl, (TYPE_SUFFIX_s16, TYPE_SUFFIX_u16, TYPE_SUFFIX_f16)) +FUNCTION (vldrwq, vldrq_impl, (TYPE_SUFFIX_s32, TYPE_SUFFIX_u32, TYPE_SUFFIX_f32)) 
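As an aside on the vadcq/vsbcq support registered above: the carry plumbing that vadc_vsbc_impl::expand emits around the VADC/VSBC unspecs amounts to plain bit manipulation of FPSCR bit 29. The sketch below is illustrative only and is not part of the patch; the helper names are hypothetical.

#include <stdint.h>

/* Carry-in (vadcq/vsbcq only; vadciq/vsbciq start with the carry
   cleared): fold the low bit of the user's carry word into FPSCR
   bit 29 before the VADC/VSBC instruction.  */
static inline uint32_t
fpscr_with_carry_in (uint32_t fpscr, uint32_t carry)
{
  return (fpscr & ~0x20000000u) | ((carry & 1u) << 29);
}

/* Carry-out: after the instruction, extract FPSCR bit 29 and store it
   back through the carry pointer passed to the intrinsic.  */
static inline uint32_t
carry_out_from_fpscr (uint32_t fpscr)
{
  return (fpscr >> 29) & 1u;
}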
FUNCTION_PRED_P_S (vmaxavq, VMAXAVQ) FUNCTION_WITHOUT_N_NO_U_F (vmaxaq, VMAXAQ) FUNCTION_ONLY_F (vmaxnmaq, VMAXNMAQ) @@ -394,6 +1133,7 @@ FUNCTION_WITH_RTX_M_N (vmulq, MULT, VMULQ) FUNCTION_WITH_RTX_M_N_NO_F (vmvnq, NOT, VMVNQ) FUNCTION (vnegq, unspec_based_mve_function_exact_insn, (NEG, NEG, NEG, -1, -1, -1, VNEGQ_M_S, -1, VNEGQ_M_F, -1, -1, -1)) FUNCTION_WITHOUT_M_N (vpselq, VPSELQ) +FUNCTION (vornq, unspec_based_mve_function_exact_insn_vorn, (-1, -1, VORNQ_M_S, VORNQ_M_U, VORNQ_M_F, -1, -1)) FUNCTION_WITH_RTX_M_N_NO_N_F (vorrq, IOR, VORRQ) FUNCTION_WITHOUT_N_NO_U_F (vqabsq, VQABSQ) FUNCTION_WITH_M_N_NO_F (vqaddq, VQADDQ) @@ -454,6 +1194,9 @@ FUNCTION_WITH_M_N_NO_F (vrshlq, VRSHLQ) FUNCTION_ONLY_N_NO_F (vrshrnbq, VRSHRNBQ) FUNCTION_ONLY_N_NO_F (vrshrntq, VRSHRNTQ) FUNCTION_ONLY_N_NO_F (vrshrq, VRSHRQ) +FUNCTION (vsbciq, vadc_vsbc_impl, (true, false)) +FUNCTION (vsbcq, vadc_vsbc_impl, (false, false)) +FUNCTION (vshlcq, vshlc_impl,) FUNCTION_ONLY_N_NO_F (vshllbq, VSHLLBQ) FUNCTION_ONLY_N_NO_F (vshlltq, VSHLLTQ) FUNCTION_WITH_M_N_R (vshlq, VSHLQ) @@ -463,6 +1206,9 @@ FUNCTION_ONLY_N_NO_F (vshrq, VSHRQ) FUNCTION_ONLY_N_NO_F (vsliq, VSLIQ) FUNCTION_ONLY_N_NO_F (vsriq, VSRIQ) FUNCTION (vst1q, vst1_impl,) +FUNCTION (vstrbq, vstrq_impl, (QImode, opt_scalar_mode ())) +FUNCTION (vstrhq, vstrq_impl, (HImode, HFmode)) +FUNCTION (vstrwq, vstrq_impl, (SImode, SFmode)) FUNCTION_WITH_RTX_M_N (vsubq, MINUS, VSUBQ) FUNCTION (vuninitializedq, vuninitializedq_impl,) diff --git a/gcc/config/arm/arm-mve-builtins-base.def b/gcc/config/arm/arm-mve-builtins-base.def index 90d031e..6166f1b 100644 --- a/gcc/config/arm/arm-mve-builtins-base.def +++ b/gcc/config/arm/arm-mve-builtins-base.def @@ -21,15 +21,18 @@ DEF_MVE_FUNCTION (vabavq, binary_acca_int32, all_integer, p_or_none) DEF_MVE_FUNCTION (vabdq, binary, all_integer, mx_or_none) DEF_MVE_FUNCTION (vabsq, unary, all_signed, mx_or_none) +DEF_MVE_FUNCTION (vadciq, vadc_vsbc, integer_32, m_or_none) +DEF_MVE_FUNCTION (vadcq, vadc_vsbc, integer_32, m_or_none) DEF_MVE_FUNCTION (vaddlvaq, unary_widen_acc, integer_32, p_or_none) DEF_MVE_FUNCTION (vaddlvq, unary_acc, integer_32, p_or_none) DEF_MVE_FUNCTION (vaddq, binary_opt_n, all_integer, mx_or_none) DEF_MVE_FUNCTION (vaddvaq, unary_int32_acc, all_integer, p_or_none) DEF_MVE_FUNCTION (vaddvq, unary_int32, all_integer, p_or_none) DEF_MVE_FUNCTION (vandq, binary, all_integer, mx_or_none) +DEF_MVE_FUNCTION (vbicq, binary_orrq, all_integer, mx_or_none) DEF_MVE_FUNCTION (vbrsrq, binary_imm32, all_integer, mx_or_none) -DEF_MVE_FUNCTION (vcaddq_rot90, binary, all_integer, mx_or_none) DEF_MVE_FUNCTION (vcaddq_rot270, binary, all_integer, mx_or_none) +DEF_MVE_FUNCTION (vcaddq_rot90, binary, all_integer, mx_or_none) DEF_MVE_FUNCTION (vclsq, unary, all_signed, mx_or_none) DEF_MVE_FUNCTION (vclzq, unary, all_integer, mx_or_none) DEF_MVE_FUNCTION (vcmpcsq, cmp, all_unsigned, m_or_none) @@ -41,13 +44,24 @@ DEF_MVE_FUNCTION (vcmpleq, cmp, all_signed, m_or_none) DEF_MVE_FUNCTION (vcmpltq, cmp, all_signed, m_or_none) DEF_MVE_FUNCTION (vcmpneq, cmp, all_integer, m_or_none) DEF_MVE_FUNCTION (vcreateq, create, all_integer_with_64, none) +DEF_MVE_FUNCTION (vctp16q, vctp, none, m_or_none) +DEF_MVE_FUNCTION (vctp32q, vctp, none, m_or_none) +DEF_MVE_FUNCTION (vctp64q, vctp, none, m_or_none) +DEF_MVE_FUNCTION (vctp8q, vctp, none, m_or_none) +DEF_MVE_FUNCTION (vddupq, viddup, all_unsigned, mx_or_none) DEF_MVE_FUNCTION (vdupq, unary_n, all_integer, mx_or_none) +DEF_MVE_FUNCTION (vdwdupq, vidwdup, all_unsigned, mx_or_none) DEF_MVE_FUNCTION 
(veorq, binary, all_integer, mx_or_none) DEF_MVE_FUNCTION (vhaddq, binary_opt_n, all_integer, mx_or_none) -DEF_MVE_FUNCTION (vhcaddq_rot90, binary, all_signed, mx_or_none) DEF_MVE_FUNCTION (vhcaddq_rot270, binary, all_signed, mx_or_none) +DEF_MVE_FUNCTION (vhcaddq_rot90, binary, all_signed, mx_or_none) DEF_MVE_FUNCTION (vhsubq, binary_opt_n, all_integer, mx_or_none) -DEF_MVE_FUNCTION (vld1q, load, all_integer, none) +DEF_MVE_FUNCTION (vidupq, viddup, all_unsigned, mx_or_none) +DEF_MVE_FUNCTION (viwdupq, vidwdup, all_unsigned, mx_or_none) +DEF_MVE_FUNCTION (vld1q, load, all_integer, z_or_none) +DEF_MVE_FUNCTION (vldrbq, load_ext, all_integer, z_or_none) +DEF_MVE_FUNCTION (vldrhq, load_ext, integer_16_32, z_or_none) +DEF_MVE_FUNCTION (vldrwq, load_ext, integer_32, z_or_none) DEF_MVE_FUNCTION (vmaxaq, binary_maxamina, all_signed, m_or_none) DEF_MVE_FUNCTION (vmaxavq, binary_maxavminav, all_signed, p_or_none) DEF_MVE_FUNCTION (vmaxq, binary, all_integer, mx_or_none) @@ -80,12 +94,13 @@ DEF_MVE_FUNCTION (vmovnbq, binary_move_narrow, integer_16_32, m_or_none) DEF_MVE_FUNCTION (vmovntq, binary_move_narrow, integer_16_32, m_or_none) DEF_MVE_FUNCTION (vmulhq, binary, all_integer, mx_or_none) DEF_MVE_FUNCTION (vmullbq_int, binary_widen, all_integer, mx_or_none) -DEF_MVE_FUNCTION (vmulltq_int, binary_widen, all_integer, mx_or_none) DEF_MVE_FUNCTION (vmullbq_poly, binary_widen_poly, poly_8_16, mx_or_none) +DEF_MVE_FUNCTION (vmulltq_int, binary_widen, all_integer, mx_or_none) DEF_MVE_FUNCTION (vmulltq_poly, binary_widen_poly, poly_8_16, mx_or_none) DEF_MVE_FUNCTION (vmulq, binary_opt_n, all_integer, mx_or_none) DEF_MVE_FUNCTION (vmvnq, mvn, all_integer, mx_or_none) DEF_MVE_FUNCTION (vnegq, unary, all_signed, mx_or_none) +DEF_MVE_FUNCTION (vornq, binary_orrq, all_integer, mx_or_none) DEF_MVE_FUNCTION (vorrq, binary_orrq, all_integer, mx_or_none) DEF_MVE_FUNCTION (vpselq, vpsel, all_integer_with_64, none) DEF_MVE_FUNCTION (vqabsq, unary, all_signed, m_or_none) @@ -142,6 +157,9 @@ DEF_MVE_FUNCTION (vrshlq, binary_round_lshift, all_integer, mx_or_none) DEF_MVE_FUNCTION (vrshrnbq, binary_rshift_narrow, integer_16_32, m_or_none) DEF_MVE_FUNCTION (vrshrntq, binary_rshift_narrow, integer_16_32, m_or_none) DEF_MVE_FUNCTION (vrshrq, binary_rshift, all_integer, mx_or_none) +DEF_MVE_FUNCTION (vsbciq, vadc_vsbc, integer_32, m_or_none) +DEF_MVE_FUNCTION (vsbcq, vadc_vsbc, integer_32, m_or_none) +DEF_MVE_FUNCTION (vshlcq, vshlc, all_integer, m_or_none) DEF_MVE_FUNCTION (vshllbq, binary_widen_n, integer_8_16, mx_or_none) DEF_MVE_FUNCTION (vshlltq, binary_widen_n, integer_8_16, mx_or_none) DEF_MVE_FUNCTION (vshlq, binary_lshift, all_integer, mx_or_none) @@ -151,7 +169,10 @@ DEF_MVE_FUNCTION (vshrntq, binary_rshift_narrow, integer_16_32, m_or_none) DEF_MVE_FUNCTION (vshrq, binary_rshift, all_integer, mx_or_none) DEF_MVE_FUNCTION (vsliq, ternary_lshift, all_integer, m_or_none) DEF_MVE_FUNCTION (vsriq, ternary_rshift, all_integer, m_or_none) -DEF_MVE_FUNCTION (vst1q, store, all_integer, none) +DEF_MVE_FUNCTION (vst1q, store, all_integer, p_or_none) +DEF_MVE_FUNCTION (vstrbq, store, all_integer, p_or_none) +DEF_MVE_FUNCTION (vstrhq, store, integer_16_32, p_or_none) +DEF_MVE_FUNCTION (vstrwq, store, integer_32, p_or_none) DEF_MVE_FUNCTION (vsubq, binary_opt_n, all_integer, mx_or_none) DEF_MVE_FUNCTION (vuninitializedq, inherent, all_integer_with_64, none) #undef REQUIRES_FLOAT @@ -161,30 +182,42 @@ DEF_MVE_FUNCTION (vabdq, binary, all_float, mx_or_none) DEF_MVE_FUNCTION (vabsq, unary, all_float, mx_or_none) DEF_MVE_FUNCTION 
(vaddq, binary_opt_n, all_float, mx_or_none) DEF_MVE_FUNCTION (vandq, binary, all_float, mx_or_none) +DEF_MVE_FUNCTION (vbicq, binary_orrq, all_float, mx_or_none) DEF_MVE_FUNCTION (vbrsrq, binary_imm32, all_float, mx_or_none) -DEF_MVE_FUNCTION (vcaddq_rot90, binary, all_float, mx_or_none) DEF_MVE_FUNCTION (vcaddq_rot270, binary, all_float, mx_or_none) +DEF_MVE_FUNCTION (vcaddq_rot90, binary, all_float, mx_or_none) DEF_MVE_FUNCTION (vcmlaq, ternary, all_float, m_or_none) -DEF_MVE_FUNCTION (vcmlaq_rot90, ternary, all_float, m_or_none) DEF_MVE_FUNCTION (vcmlaq_rot180, ternary, all_float, m_or_none) DEF_MVE_FUNCTION (vcmlaq_rot270, ternary, all_float, m_or_none) -DEF_MVE_FUNCTION (vcmulq, binary, all_float, mx_or_none) -DEF_MVE_FUNCTION (vcmulq_rot90, binary, all_float, mx_or_none) -DEF_MVE_FUNCTION (vcmulq_rot180, binary, all_float, mx_or_none) -DEF_MVE_FUNCTION (vcmulq_rot270, binary, all_float, mx_or_none) +DEF_MVE_FUNCTION (vcmlaq_rot90, ternary, all_float, m_or_none) DEF_MVE_FUNCTION (vcmpeqq, cmp, all_float, m_or_none) DEF_MVE_FUNCTION (vcmpgeq, cmp, all_float, m_or_none) DEF_MVE_FUNCTION (vcmpgtq, cmp, all_float, m_or_none) DEF_MVE_FUNCTION (vcmpleq, cmp, all_float, m_or_none) DEF_MVE_FUNCTION (vcmpltq, cmp, all_float, m_or_none) DEF_MVE_FUNCTION (vcmpneq, cmp, all_float, m_or_none) +DEF_MVE_FUNCTION (vcmulq, binary, all_float, mx_or_none) +DEF_MVE_FUNCTION (vcmulq_rot180, binary, all_float, mx_or_none) +DEF_MVE_FUNCTION (vcmulq_rot270, binary, all_float, mx_or_none) +DEF_MVE_FUNCTION (vcmulq_rot90, binary, all_float, mx_or_none) DEF_MVE_FUNCTION (vcreateq, create, all_float, none) +DEF_MVE_FUNCTION (vcvtaq, vcvtx, cvtx, mx_or_none) +DEF_MVE_FUNCTION (vcvtbq, vcvt_f16_f32, cvt_f16_f32, mx_or_none) +DEF_MVE_FUNCTION (vcvtbq, vcvt_f32_f16, cvt_f32_f16, mx_or_none) +DEF_MVE_FUNCTION (vcvtmq, vcvtx, cvtx, mx_or_none) +DEF_MVE_FUNCTION (vcvtnq, vcvtx, cvtx, mx_or_none) +DEF_MVE_FUNCTION (vcvtpq, vcvtx, cvtx, mx_or_none) +DEF_MVE_FUNCTION (vcvtq, vcvt, cvt, mx_or_none) +DEF_MVE_FUNCTION (vcvttq, vcvt_f16_f32, cvt_f16_f32, mx_or_none) +DEF_MVE_FUNCTION (vcvttq, vcvt_f32_f16, cvt_f32_f16, mx_or_none) DEF_MVE_FUNCTION (vdupq, unary_n, all_float, mx_or_none) DEF_MVE_FUNCTION (veorq, binary, all_float, mx_or_none) DEF_MVE_FUNCTION (vfmaq, ternary_opt_n, all_float, m_or_none) DEF_MVE_FUNCTION (vfmasq, ternary_n, all_float, m_or_none) DEF_MVE_FUNCTION (vfmsq, ternary, all_float, m_or_none) -DEF_MVE_FUNCTION (vld1q, load, all_float, none) +DEF_MVE_FUNCTION (vld1q, load, all_float, z_or_none) +DEF_MVE_FUNCTION (vldrhq, load_ext, float_16, z_or_none) +DEF_MVE_FUNCTION (vldrwq, load_ext, float_32, z_or_none) DEF_MVE_FUNCTION (vmaxnmaq, binary, all_float, m_or_none) DEF_MVE_FUNCTION (vmaxnmavq, binary_maxvminv, all_float, p_or_none) DEF_MVE_FUNCTION (vmaxnmq, binary, all_float, mx_or_none) @@ -195,10 +228,11 @@ DEF_MVE_FUNCTION (vminnmq, binary, all_float, mx_or_none) DEF_MVE_FUNCTION (vminnmvq, binary_maxvminv, all_float, p_or_none) DEF_MVE_FUNCTION (vmulq, binary_opt_n, all_float, mx_or_none) DEF_MVE_FUNCTION (vnegq, unary, all_float, mx_or_none) +DEF_MVE_FUNCTION (vornq, binary_orrq, all_float, mx_or_none) DEF_MVE_FUNCTION (vorrq, binary_orrq, all_float, mx_or_none) DEF_MVE_FUNCTION (vpselq, vpsel, all_float, none) DEF_MVE_FUNCTION (vreinterpretq, unary_convert, reinterpret_float, none) -DEF_MVE_FUNCTION (vrev32q, unary, float16, mx_or_none) +DEF_MVE_FUNCTION (vrev32q, unary, float_16, mx_or_none) DEF_MVE_FUNCTION (vrev64q, unary, all_float, mx_or_none) DEF_MVE_FUNCTION (vrndaq, unary, all_float, 
mx_or_none) DEF_MVE_FUNCTION (vrndmq, unary, all_float, mx_or_none) @@ -206,7 +240,9 @@ DEF_MVE_FUNCTION (vrndnq, unary, all_float, mx_or_none) DEF_MVE_FUNCTION (vrndpq, unary, all_float, mx_or_none) DEF_MVE_FUNCTION (vrndq, unary, all_float, mx_or_none) DEF_MVE_FUNCTION (vrndxq, unary, all_float, mx_or_none) -DEF_MVE_FUNCTION (vst1q, store, all_float, none) +DEF_MVE_FUNCTION (vst1q, store, all_float, p_or_none) +DEF_MVE_FUNCTION (vstrhq, store, float_16, p_or_none) +DEF_MVE_FUNCTION (vstrwq, store, float_32, p_or_none) DEF_MVE_FUNCTION (vsubq, binary_opt_n, all_float, mx_or_none) DEF_MVE_FUNCTION (vuninitializedq, inherent, all_float, none) #undef REQUIRES_FLOAT diff --git a/gcc/config/arm/arm-mve-builtins-base.h b/gcc/config/arm/arm-mve-builtins-base.h index c9b52a8..7c866d8 100644 --- a/gcc/config/arm/arm-mve-builtins-base.h +++ b/gcc/config/arm/arm-mve-builtins-base.h @@ -26,12 +26,15 @@ namespace functions { extern const function_base *const vabavq; extern const function_base *const vabdq; extern const function_base *const vabsq; +extern const function_base *const vadciq; +extern const function_base *const vadcq; extern const function_base *const vaddlvaq; extern const function_base *const vaddlvq; extern const function_base *const vaddq; extern const function_base *const vaddvaq; extern const function_base *const vaddvq; extern const function_base *const vandq; +extern const function_base *const vbicq; extern const function_base *const vbrsrq; extern const function_base *const vcaddq_rot270; extern const function_base *const vcaddq_rot90; @@ -54,7 +57,20 @@ extern const function_base *const vcmulq_rot180; extern const function_base *const vcmulq_rot270; extern const function_base *const vcmulq_rot90; extern const function_base *const vcreateq; +extern const function_base *const vctp16q; +extern const function_base *const vctp32q; +extern const function_base *const vctp64q; +extern const function_base *const vctp8q; +extern const function_base *const vcvtaq; +extern const function_base *const vcvtbq; +extern const function_base *const vcvtmq; +extern const function_base *const vcvtnq; +extern const function_base *const vcvtpq; +extern const function_base *const vcvtq; +extern const function_base *const vcvttq; +extern const function_base *const vddupq; extern const function_base *const vdupq; +extern const function_base *const vdwdupq; extern const function_base *const veorq; extern const function_base *const vfmaq; extern const function_base *const vfmasq; @@ -63,7 +79,12 @@ extern const function_base *const vhaddq; extern const function_base *const vhcaddq_rot270; extern const function_base *const vhcaddq_rot90; extern const function_base *const vhsubq; +extern const function_base *const vidupq; +extern const function_base *const viwdupq; extern const function_base *const vld1q; +extern const function_base *const vldrbq; +extern const function_base *const vldrhq; +extern const function_base *const vldrwq; extern const function_base *const vmaxaq; extern const function_base *const vmaxavq; extern const function_base *const vmaxnmaq; @@ -110,6 +131,7 @@ extern const function_base *const vmulltq_poly; extern const function_base *const vmulq; extern const function_base *const vmvnq; extern const function_base *const vnegq; +extern const function_base *const vornq; extern const function_base *const vorrq; extern const function_base *const vpselq; extern const function_base *const vqabsq; @@ -171,6 +193,9 @@ extern const function_base *const vrshlq; extern const function_base *const 
vrshrnbq; extern const function_base *const vrshrntq; extern const function_base *const vrshrq; +extern const function_base *const vsbciq; +extern const function_base *const vsbcq; +extern const function_base *const vshlcq; extern const function_base *const vshllbq; extern const function_base *const vshlltq; extern const function_base *const vshlq; @@ -180,6 +205,9 @@ extern const function_base *const vshrq; extern const function_base *const vsliq; extern const function_base *const vsriq; extern const function_base *const vst1q; +extern const function_base *const vstrbq; +extern const function_base *const vstrhq; +extern const function_base *const vstrwq; extern const function_base *const vsubq; extern const function_base *const vuninitializedq; diff --git a/gcc/config/arm/arm-mve-builtins-functions.h b/gcc/config/arm/arm-mve-builtins-functions.h index ac2a731..0ade215 100644 --- a/gcc/config/arm/arm-mve-builtins-functions.h +++ b/gcc/config/arm/arm-mve-builtins-functions.h @@ -20,6 +20,8 @@ #ifndef GCC_ARM_MVE_BUILTINS_FUNCTIONS_H #define GCC_ARM_MVE_BUILTINS_FUNCTIONS_H +#include "arm-protos.h" + namespace arm_mve { /* Wrap T, which is derived from function_base, and indicate that the @@ -40,17 +42,23 @@ public: }; /* An incomplete function_base for functions that have an associated - rtx_code for signed integers, unsigned integers and floating-point - values for the non-predicated, non-suffixed intrinsic, and unspec - codes, with separate codes for signed integers, unsigned integers - and floating-point values. The class simply records information - about the mapping for derived classes to use. */ + rtx_code or an unspec for signed integers, unsigned integers and + floating-point values for the non-predicated, non-suffixed + intrinsics, and unspec codes, with separate codes for signed + integers, unsigned integers and floating-point values for + predicated and/or suffixed intrinsics. The class simply records + information about the mapping for derived classes to use and + provides a generic expand_unspec () to avoid duplicating expansion + code in derived classes. */ class unspec_based_mve_function_base : public function_base { public: CONSTEXPR unspec_based_mve_function_base (rtx_code code_for_sint, rtx_code code_for_uint, rtx_code code_for_fp, + int unspec_for_sint, + int unspec_for_uint, + int unspec_for_fp, int unspec_for_n_sint, int unspec_for_n_uint, int unspec_for_n_fp, @@ -63,6 +71,9 @@ public: : m_code_for_sint (code_for_sint), m_code_for_uint (code_for_uint), m_code_for_fp (code_for_fp), + m_unspec_for_sint (unspec_for_sint), + m_unspec_for_uint (unspec_for_uint), + m_unspec_for_fp (unspec_for_fp), m_unspec_for_n_sint (unspec_for_n_sint), m_unspec_for_n_uint (unspec_for_n_uint), m_unspec_for_n_fp (unspec_for_n_fp), @@ -83,6 +94,9 @@ public: /* The unspec code associated with signed-integer, unsigned-integer and floating-point operations respectively. It covers the cases with the _n suffix, and/or the _m predicate. */ + int m_unspec_for_sint; + int m_unspec_for_uint; + int m_unspec_for_fp; int m_unspec_for_n_sint; int m_unspec_for_n_uint; int m_unspec_for_n_fp; @@ -92,8 +106,101 @@ public: int m_unspec_for_m_n_sint; int m_unspec_for_m_n_uint; int m_unspec_for_m_n_fp; + + rtx expand_unspec (function_expander &e) const; }; +/* Expand the unspecs, which is common to all intrinsics using + unspec_based_mve_function_base. If some combinations are not + supported for an intrinsics family, they should be handled by the + caller (and not crash here). 
*/ +rtx +unspec_based_mve_function_base::expand_unspec (function_expander &e) const +{ + machine_mode mode = e.vector_mode (0); + insn_code code; + + switch (e.pred) + { + case PRED_none: + switch (e.mode_suffix_id) + { + case MODE_none: + /* No predicate, no suffix. */ + if (e.type_suffix (0).integer_p) + { + int unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_uint + : m_unspec_for_sint); + code = code_for_mve_q (unspec, unspec, mode); + } + else + code = code_for_mve_q_f (m_unspec_for_fp, mode); + break; + + case MODE_n: + /* No predicate, _n suffix. */ + if (e.type_suffix (0).integer_p) + { + int unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_n_uint + : m_unspec_for_n_sint); + code = code_for_mve_q_n (unspec, unspec, mode); + } + else + code = code_for_mve_q_n_f (m_unspec_for_n_fp, mode); + break; + + default: + gcc_unreachable (); + } + return e.use_exact_insn (code); + + case PRED_m: + case PRED_x: + switch (e.mode_suffix_id) + { + case MODE_none: + /* No suffix, "m" or "x" predicate. */ + if (e.type_suffix (0).integer_p) + { + int unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_m_uint + : m_unspec_for_m_sint); + code = code_for_mve_q_m (unspec, unspec, mode); + } + else + code = code_for_mve_q_m_f (m_unspec_for_m_fp, mode); + break; + + case MODE_n: + /* _n suffix, "m" or "x" predicate. */ + if (e.type_suffix (0).integer_p) + { + int unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_m_n_uint + : m_unspec_for_m_n_sint); + code = code_for_mve_q_m_n (unspec, unspec, mode); + } + else + code = code_for_mve_q_m_n_f (m_unspec_for_m_n_fp, mode); + break; + + default: + gcc_unreachable (); + } + + if (e.pred == PRED_m) + return e.use_cond_insn (code, 0); + else + return e.use_pred_x_insn (code); + break; + + default: + gcc_unreachable (); + } +} + /* Map the function directly to CODE (UNSPEC, M) where M is the vector mode associated with type suffix 0, except when there is no predicate and no _n suffix, in which case we use the appropriate @@ -117,6 +224,9 @@ public: : unspec_based_mve_function_base (code_for_sint, code_for_uint, code_for_fp, + -1, + -1, + -1, unspec_for_n_sint, unspec_for_n_uint, unspec_for_n_fp, @@ -137,97 +247,13 @@ public: return e.map_to_rtx_codes (m_code_for_sint, m_code_for_uint, m_code_for_fp); - insn_code code; - switch (e.pred) - { - case PRED_none: - if (e.mode_suffix_id == MODE_n) - /* No predicate, _n suffix. */ - { - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_n (m_unspec_for_n_uint, m_unspec_for_n_uint, e.vector_mode (0)); - else - code = code_for_mve_q_n (m_unspec_for_n_sint, m_unspec_for_n_sint, e.vector_mode (0)); - else - code = code_for_mve_q_n_f (m_unspec_for_n_fp, e.vector_mode (0)); - - return e.use_exact_insn (code); - } - gcc_unreachable (); - break; - - case PRED_m: - switch (e.mode_suffix_id) - { - case MODE_none: - /* No suffix, "m" predicate. */ - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_m (m_unspec_for_m_uint, m_unspec_for_m_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m (m_unspec_for_m_sint, m_unspec_for_m_sint, e.vector_mode (0)); - else - code = code_for_mve_q_m_f (m_unspec_for_m_fp, e.vector_mode (0)); - break; - - case MODE_n: - /* _n suffix, "m" predicate. 
*/ - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_m_n (m_unspec_for_m_n_uint, m_unspec_for_m_n_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m_n (m_unspec_for_m_n_sint, m_unspec_for_m_n_sint, e.vector_mode (0)); - else - code = code_for_mve_q_m_n_f (m_unspec_for_m_n_fp, e.vector_mode (0)); - break; - - default: - gcc_unreachable (); - } - return e.use_cond_insn (code, 0); - - case PRED_x: - switch (e.mode_suffix_id) - { - case MODE_none: - /* No suffix, "x" predicate. */ - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_m (m_unspec_for_m_uint, m_unspec_for_m_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m (m_unspec_for_m_sint, m_unspec_for_m_sint, e.vector_mode (0)); - else - code = code_for_mve_q_m_f (m_unspec_for_m_fp, e.vector_mode (0)); - break; - - case MODE_n: - /* _n suffix, "x" predicate. */ - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_m_n (m_unspec_for_m_n_uint, m_unspec_for_m_n_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m_n (m_unspec_for_m_n_sint, m_unspec_for_m_n_sint, e.vector_mode (0)); - else - code = code_for_mve_q_m_n_f (m_unspec_for_m_n_fp, e.vector_mode (0)); - break; - - default: - gcc_unreachable (); - } - return e.use_pred_x_insn (code); - - default: - gcc_unreachable (); - } - - gcc_unreachable (); + return expand_unspec (e); } }; /* Map the function directly to CODE (UNSPEC, M) where M is the vector mode associated with type suffix 0. */ -class unspec_mve_function_exact_insn : public function_base +class unspec_mve_function_exact_insn : public unspec_based_mve_function_base { public: CONSTEXPR unspec_mve_function_exact_insn (int unspec_for_sint, @@ -242,143 +268,33 @@ public: int unspec_for_m_n_sint, int unspec_for_m_n_uint, int unspec_for_m_n_fp) - : m_unspec_for_sint (unspec_for_sint), - m_unspec_for_uint (unspec_for_uint), - m_unspec_for_fp (unspec_for_fp), - m_unspec_for_n_sint (unspec_for_n_sint), - m_unspec_for_n_uint (unspec_for_n_uint), - m_unspec_for_n_fp (unspec_for_n_fp), - m_unspec_for_m_sint (unspec_for_m_sint), - m_unspec_for_m_uint (unspec_for_m_uint), - m_unspec_for_m_fp (unspec_for_m_fp), - m_unspec_for_m_n_sint (unspec_for_m_n_sint), - m_unspec_for_m_n_uint (unspec_for_m_n_uint), - m_unspec_for_m_n_fp (unspec_for_m_n_fp) + : unspec_based_mve_function_base (UNKNOWN, + UNKNOWN, + UNKNOWN, + unspec_for_sint, + unspec_for_uint, + unspec_for_fp, + unspec_for_n_sint, + unspec_for_n_uint, + unspec_for_n_fp, + unspec_for_m_sint, + unspec_for_m_uint, + unspec_for_m_fp, + unspec_for_m_n_sint, + unspec_for_m_n_uint, + unspec_for_m_n_fp) {} - /* The unspec code associated with signed-integer, unsigned-integer - and floating-point operations respectively. It covers the cases - with the _n suffix, and/or the _m predicate. */ - int m_unspec_for_sint; - int m_unspec_for_uint; - int m_unspec_for_fp; - int m_unspec_for_n_sint; - int m_unspec_for_n_uint; - int m_unspec_for_n_fp; - int m_unspec_for_m_sint; - int m_unspec_for_m_uint; - int m_unspec_for_m_fp; - int m_unspec_for_m_n_sint; - int m_unspec_for_m_n_uint; - int m_unspec_for_m_n_fp; - rtx expand (function_expander &e) const override { - insn_code code; - switch (e.pred) - { - case PRED_none: - switch (e.mode_suffix_id) - { - case MODE_none: - /* No predicate, no suffix. 
*/ - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q (m_unspec_for_uint, m_unspec_for_uint, e.vector_mode (0)); - else - code = code_for_mve_q (m_unspec_for_sint, m_unspec_for_sint, e.vector_mode (0)); - else - code = code_for_mve_q_f (m_unspec_for_fp, e.vector_mode (0)); - break; - - case MODE_n: - /* No predicate, _n suffix. */ - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_n (m_unspec_for_n_uint, m_unspec_for_n_uint, e.vector_mode (0)); - else - code = code_for_mve_q_n (m_unspec_for_n_sint, m_unspec_for_n_sint, e.vector_mode (0)); - else - code = code_for_mve_q_n_f (m_unspec_for_n_fp, e.vector_mode (0)); - break; - - default: - gcc_unreachable (); - } - return e.use_exact_insn (code); - - case PRED_m: - switch (e.mode_suffix_id) - { - case MODE_none: - /* No suffix, "m" predicate. */ - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_m (m_unspec_for_m_uint, m_unspec_for_m_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m (m_unspec_for_m_sint, m_unspec_for_m_sint, e.vector_mode (0)); - else - code = code_for_mve_q_m_f (m_unspec_for_m_fp, e.vector_mode (0)); - break; - - case MODE_n: - /* _n suffix, "m" predicate. */ - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_m_n (m_unspec_for_m_n_uint, m_unspec_for_m_n_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m_n (m_unspec_for_m_n_sint, m_unspec_for_m_n_sint, e.vector_mode (0)); - else - code = code_for_mve_q_m_n_f (m_unspec_for_m_n_fp, e.vector_mode (0)); - break; - - default: - gcc_unreachable (); - } - return e.use_cond_insn (code, 0); - - case PRED_x: - switch (e.mode_suffix_id) - { - case MODE_none: - /* No suffix, "x" predicate. */ - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_m (m_unspec_for_m_uint, m_unspec_for_m_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m (m_unspec_for_m_sint, m_unspec_for_m_sint, e.vector_mode (0)); - else - code = code_for_mve_q_m_f (m_unspec_for_m_fp, e.vector_mode (0)); - break; - - case MODE_n: - /* _n suffix, "x" predicate. */ - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_m_n (m_unspec_for_m_n_uint, m_unspec_for_m_n_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m_n (m_unspec_for_m_n_sint, m_unspec_for_m_n_sint, e.vector_mode (0)); - else - code = code_for_mve_q_m_n_f (m_unspec_for_m_n_fp, e.vector_mode (0)); - break; - - default: - gcc_unreachable (); - } - return e.use_pred_x_insn (code); - - default: - gcc_unreachable (); - } - - gcc_unreachable (); + return expand_unspec (e); } }; /* Map the function directly to CODE (UNSPEC), when there is a non-predicated version and one with the "_p" predicate. */ -class unspec_mve_function_exact_insn_pred_p : public function_base +class unspec_mve_function_exact_insn_pred_p : public unspec_based_mve_function_base { public: CONSTEXPR unspec_mve_function_exact_insn_pred_p (int unspec_for_sint, @@ -387,19 +303,23 @@ public: int unspec_for_p_sint, int unspec_for_p_uint, int unspec_for_p_fp) - : m_unspec_for_sint (unspec_for_sint), - m_unspec_for_uint (unspec_for_uint), - m_unspec_for_fp (unspec_for_fp), + : unspec_based_mve_function_base (UNKNOWN, /* No RTX code. */ + UNKNOWN, + UNKNOWN, + unspec_for_sint, + unspec_for_uint, + unspec_for_fp, + -1, -1, -1, /* No _n intrinsics. */ + -1, -1, -1, /* No _m intrinsics. */ + -1, -1, -1), /* No _m_n intrinsics. 
*/ m_unspec_for_p_sint (unspec_for_p_sint), m_unspec_for_p_uint (unspec_for_p_uint), m_unspec_for_p_fp (unspec_for_p_fp) {} - /* The unspec code associated with signed-integer and unsigned-integer - operations, with no predicate, or with "_p" predicate. */ - int m_unspec_for_sint; - int m_unspec_for_uint; - int m_unspec_for_fp; + /* The unspec code associated with signed-integer and + unsigned-integer or floating-point operations with "_p" + predicate. */ int m_unspec_for_p_sint; int m_unspec_for_p_uint; int m_unspec_for_p_fp; @@ -408,6 +328,7 @@ public: expand (function_expander &e) const override { insn_code code; + int unspec; if (m_unspec_for_sint == VADDLVQ_S || m_unspec_for_sint == VADDLVAQ_S @@ -423,62 +344,49 @@ public: switch (e.pred) { case PRED_none: - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_v4si (m_unspec_for_uint, m_unspec_for_uint); - else - code = code_for_mve_q_v4si (m_unspec_for_sint, m_unspec_for_sint); + unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_uint + : m_unspec_for_sint); + code = code_for_mve_q_v4si (unspec, unspec); return e.use_exact_insn (code); case PRED_p: - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_p_v4si (m_unspec_for_p_uint, m_unspec_for_p_uint); - else - code = code_for_mve_q_p_v4si (m_unspec_for_p_sint, m_unspec_for_p_sint); + unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_p_uint + : m_unspec_for_p_sint); + code = code_for_mve_q_p_v4si (unspec, unspec); return e.use_exact_insn (code); default: gcc_unreachable (); } } - else - { - switch (e.pred) - { - case PRED_none: - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q (m_unspec_for_uint, m_unspec_for_uint, e.vector_mode (0)); - else - code = code_for_mve_q (m_unspec_for_sint, m_unspec_for_sint, e.vector_mode (0)); - else - code = code_for_mve_q_f (m_unspec_for_fp, e.vector_mode (0)); - return e.use_exact_insn (code); - - case PRED_p: - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_p (m_unspec_for_p_uint, m_unspec_for_p_uint, e.vector_mode (0)); - else - code = code_for_mve_q_p (m_unspec_for_p_sint, m_unspec_for_p_sint, e.vector_mode (0)); - else - code = code_for_mve_q_p_f (m_unspec_for_p_fp, e.vector_mode (0)); - - return e.use_exact_insn (code); + if (e.pred == PRED_p) + { + machine_mode mode = e.vector_mode (0); - default: - gcc_unreachable (); + if (e.type_suffix (0).integer_p) + { + unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_p_uint + : m_unspec_for_p_sint); + code = code_for_mve_q_p (unspec, unspec, mode); } + else + code = code_for_mve_q_p_f (m_unspec_for_p_fp, mode); + + return e.use_exact_insn (code); } - gcc_unreachable (); + return expand_unspec (e); } }; /* Map the function directly to CODE (UNSPEC, M) for vshl-like builtins. The difference with unspec_mve_function_exact_insn is that this function handles MODE_r and the related unspecs.. 
*/ -class unspec_mve_function_exact_insn_vshl : public function_base +class unspec_mve_function_exact_insn_vshl : public unspec_based_mve_function_base { public: CONSTEXPR unspec_mve_function_exact_insn_vshl (int unspec_for_sint, @@ -493,31 +401,29 @@ public: int unspec_for_m_r_uint, int unspec_for_r_sint, int unspec_for_r_uint) - : m_unspec_for_sint (unspec_for_sint), - m_unspec_for_uint (unspec_for_uint), - m_unspec_for_n_sint (unspec_for_n_sint), - m_unspec_for_n_uint (unspec_for_n_uint), - m_unspec_for_m_sint (unspec_for_m_sint), - m_unspec_for_m_uint (unspec_for_m_uint), - m_unspec_for_m_n_sint (unspec_for_m_n_sint), - m_unspec_for_m_n_uint (unspec_for_m_n_uint), + : unspec_based_mve_function_base (UNKNOWN, + UNKNOWN, + UNKNOWN, + unspec_for_sint, + unspec_for_uint, + -1, + unspec_for_n_sint, + unspec_for_n_uint, + -1, + unspec_for_m_sint, + unspec_for_m_uint, + -1, + unspec_for_m_n_sint, + unspec_for_m_n_uint, + -1), m_unspec_for_m_r_sint (unspec_for_m_r_sint), m_unspec_for_m_r_uint (unspec_for_m_r_uint), m_unspec_for_r_sint (unspec_for_r_sint), m_unspec_for_r_uint (unspec_for_r_uint) {} - /* The unspec code associated with signed-integer, unsigned-integer - and floating-point operations respectively. It covers the cases - with the _n suffix, and/or the _m predicate. */ - int m_unspec_for_sint; - int m_unspec_for_uint; - int m_unspec_for_n_sint; - int m_unspec_for_n_uint; - int m_unspec_for_m_sint; - int m_unspec_for_m_uint; - int m_unspec_for_m_n_sint; - int m_unspec_for_m_n_uint; + /* The unspec code associated with signed-integer and unsigned-integer + operations with MODE_r with or without PRED_m. */ int m_unspec_for_m_r_sint; int m_unspec_for_m_r_uint; int m_unspec_for_r_sint; @@ -527,101 +433,147 @@ public: expand (function_expander &e) const override { insn_code code; - switch (e.pred) + int unspec; + + if (e.mode_suffix_id == MODE_r) { - case PRED_none: - switch (e.mode_suffix_id) + machine_mode mode = e.vector_mode (0); + switch (e.pred) { - case MODE_none: - /* No predicate, no suffix. */ - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q (m_unspec_for_uint, m_unspec_for_uint, e.vector_mode (0)); - else - code = code_for_mve_q (m_unspec_for_sint, m_unspec_for_sint, e.vector_mode (0)); - break; - - case MODE_n: - /* No predicate, _n suffix. */ - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_n (m_unspec_for_n_uint, m_unspec_for_n_uint, e.vector_mode (0)); - else - code = code_for_mve_q_n (m_unspec_for_n_sint, m_unspec_for_n_sint, e.vector_mode (0)); - break; - - case MODE_r: + case PRED_none: /* No predicate, _r suffix. */ - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_r (m_unspec_for_r_uint, m_unspec_for_r_uint, e.vector_mode (0)); + unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_r_uint + : m_unspec_for_r_sint); + code = code_for_mve_q_r (unspec, unspec, mode); + return e.use_exact_insn (code); + + case PRED_m: + case PRED_x: + /* _r suffix, "m" or "x" predicate. */ + unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_m_r_uint + : m_unspec_for_m_r_sint); + code = code_for_mve_q_m_r (unspec, unspec, mode); + + if (e.pred == PRED_m) + return e.use_cond_insn (code, 0); else - code = code_for_mve_q_r (m_unspec_for_r_sint, m_unspec_for_r_sint, e.vector_mode (0)); - break; + return e.use_pred_x_insn (code); default: gcc_unreachable (); } - return e.use_exact_insn (code); + } - case PRED_m: - switch (e.mode_suffix_id) - { - case MODE_none: - /* No suffix, "m" predicate. 
*/ - if (e.type_suffix (0).unsigned_p - code = code_for_mve_q_m (m_unspec_for_m_uint, m_unspec_for_m_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m (m_unspec_for_m_sint, m_unspec_for_m_sint, e.vector_mode (0)); - break; + return expand_unspec (e); + } +}; - case MODE_n: - /* _n suffix, "m" predicate. */ - if (e.type_suffix (0).unsigned_p - code = code_for_mve_q_m_n (m_unspec_for_m_n_uint, m_unspec_for_m_n_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m_n (m_unspec_for_m_n_sint, m_unspec_for_m_n_sint, e.vector_mode (0)); - break; +/* Map the function directly to CODE (M) for vbic-like builtins. The difference + with unspec_based_mve_function_exact_insn is that this function has vbic + hardcoded for the PRED_none, MODE_none version, rather than using an + RTX. */ +class unspec_based_mve_function_exact_insn_vbic : public unspec_based_mve_function_base +{ +public: + CONSTEXPR unspec_based_mve_function_exact_insn_vbic (int unspec_for_n_sint, + int unspec_for_n_uint, + int unspec_for_m_sint, + int unspec_for_m_uint, + int unspec_for_m_fp, + int unspec_for_m_n_sint, + int unspec_for_m_n_uint) + : unspec_based_mve_function_base (UNKNOWN, + UNKNOWN, + UNKNOWN, + -1, -1, -1, /* No non-predicated, no mode intrinsics. */ + unspec_for_n_sint, + unspec_for_n_uint, + -1, + unspec_for_m_sint, + unspec_for_m_uint, + unspec_for_m_fp, + unspec_for_m_n_sint, + unspec_for_m_n_uint, + -1) + {} - case MODE_r: - /* _r suffix, "m" predicate. */ - if (e.type_suffix (0).unsigned_p - code = code_for_mve_q_m_r (m_unspec_for_m_r_uint, m_unspec_for_m_r_uint, e.vector_mode (0)) + rtx + expand (function_expander &e) const override + { + machine_mode mode = e.vector_mode (0); + insn_code code; + + /* No suffix, no predicate, use the right RTX code. */ + if (e.pred == PRED_none + && e.mode_suffix_id == MODE_none) + { + if (e.type_suffix (0).integer_p) + if (e.type_suffix (0).unsigned_p) + code = code_for_mve_vbicq_u (mode); + else + code = code_for_mve_vbicq_s (mode); else - code = code_for_mve_q_m_r (m_unspec_for_m_r_sint, m_unspec_for_m_r_sint, e.vector_mode (0)); - break; + code = code_for_mve_vbicq_f (mode); - default: - gcc_unreachable (); + return e.use_exact_insn (code); } - return e.use_cond_insn (code, 0); - case PRED_x: - switch (e.mode_suffix_id) - { - case MODE_none: - /* No suffix, "x" predicate. */ - if (e.type_suffix (0).unsigned_p - code = code_for_mve_q_m (m_unspec_for_m_uint, m_unspec_for_m_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m (m_unspec_for_m_sint, m_unspec_for_m_sint, e.vector_mode (0)); - break; + return expand_unspec (e); + } +}; - case MODE_n: - /* _n suffix, "x" predicate. */ - if (e.type_suffix (0).unsigned_p - code = code_for_mve_q_m_n (m_unspec_for_m_n_uint, m_unspec_for_m_n_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m_n (m_unspec_for_m_n_sint, m_unspec_for_m_n_sint, e.vector_mode (0)); - break; +/* Map the function directly to CODE (M) for vorn-like builtins. The difference + with unspec_based_mve_function_exact_insn is that this function has vorn + hardcoded for the PRED_none, MODE_none version, rather than using an + RTX.
*/ +class unspec_based_mve_function_exact_insn_vorn : public unspec_based_mve_function_base +{ +public: + CONSTEXPR unspec_based_mve_function_exact_insn_vorn (int unspec_for_n_sint, + int unspec_for_n_uint, + int unspec_for_m_sint, + int unspec_for_m_uint, + int unspec_for_m_fp, + int unspec_for_m_n_sint, + int unspec_for_m_n_uint) + : unspec_based_mve_function_base (UNKNOWN, + UNKNOWN, + UNKNOWN, + -1, -1, -1, /* No non-predicated, no mode unspec intrinsics. */ + unspec_for_n_sint, + unspec_for_n_uint, + -1, + unspec_for_m_sint, + unspec_for_m_uint, + unspec_for_m_fp, + unspec_for_m_n_sint, + unspec_for_m_n_uint, + -1) + {} - default: - gcc_unreachable (); - } - return e.use_pred_x_insn (code); + rtx + expand (function_expander &e) const override + { + machine_mode mode = e.vector_mode (0); + insn_code code; - default: - gcc_unreachable (); + /* No suffix, no predicate, use the right RTX code. */ + if (e.pred == PRED_none + && e.mode_suffix_id == MODE_none) + { + if (e.type_suffix (0).integer_p) + if (e.type_suffix (0).unsigned_p) + code = code_for_mve_vornq_u (mode); + else + code = code_for_mve_vornq_s (mode); + else + code = code_for_mve_vornq_f (mode); + return e.use_exact_insn (code); } - gcc_unreachable (); + return expand_unspec (e); } }; @@ -641,9 +593,8 @@ public: : unspec_based_mve_function_base (code_for_sint, code_for_uint, code_for_fp, - -1, - -1, - -1, + -1, -1, -1, /* No non-predicated, no mode intrinsics. */ + -1, -1, -1, /* No _n intrinsics. */ unspec_for_m_sint, unspec_for_m_uint, unspec_for_m_fp, @@ -662,24 +613,30 @@ public: /* No suffix, no predicate, use the right RTX code. */ if (e.pred == PRED_none) { + rtx_code r_code; + switch (e.mode_suffix_id) { case MODE_none: if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_vcmpq (m_code_for_uint, mode); - else - code = code_for_mve_vcmpq (m_code_for_sint, mode); + { + r_code = (e.type_suffix (0).unsigned_p + ? m_code_for_uint + : m_code_for_sint); + code = code_for_mve_vcmpq (r_code, mode); + } else code = code_for_mve_vcmpq_f (m_code_for_fp, mode); break; case MODE_n: if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_vcmpq_n (m_code_for_uint, mode); - else - code = code_for_mve_vcmpq_n (m_code_for_sint, mode); + { + r_code = (e.type_suffix (0).unsigned_p + ? m_code_for_uint + : m_code_for_sint); + code = code_for_mve_vcmpq_n (r_code, mode); + } else code = code_for_mve_vcmpq_n_f (m_code_for_fp, mode); break; @@ -691,6 +648,8 @@ public: } else { + int unspec; + switch (e.pred) { case PRED_m: @@ -699,10 +658,12 @@ public: case MODE_none: /* No suffix, "m" predicate. */ if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_vcmpq_m (m_unspec_for_m_uint, m_unspec_for_m_uint, mode); - else - code = code_for_mve_vcmpq_m (m_unspec_for_m_sint, m_unspec_for_m_sint, mode); + { + unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_m_uint + : m_unspec_for_m_sint); + code = code_for_mve_vcmpq_m (unspec, unspec, mode); + } else code = code_for_mve_vcmpq_m_f (m_unspec_for_m_fp, mode); break; @@ -710,10 +671,12 @@ public: case MODE_n: /* _n suffix, "m" predicate. */ if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_vcmpq_m_n (m_unspec_for_m_n_uint, m_unspec_for_m_n_uint, mode); - else - code = code_for_mve_vcmpq_m_n (m_unspec_for_m_n_sint, m_unspec_for_m_n_sint, mode); + { + unspec = (e.type_suffix (0).unsigned_p + ? 
m_unspec_for_m_n_uint + : m_unspec_for_m_n_sint); + code = code_for_mve_vcmpq_m_n (unspec, unspec, mode); + } else code = code_for_mve_vcmpq_m_n_f (m_unspec_for_m_n_fp, mode); break; @@ -738,7 +701,9 @@ public: /* Map the function directly to CODE (UNSPEC, UNSPEC, UNSPEC, M) where M is the vector mode associated with type suffix 0. USed for the operations where there is a "rot90" or "rot270" suffix, depending - on the UNSPEC. */ + on the UNSPEC. We cannot use + unspec_based_mve_function_base::expand_unspec () because we call + code_for_mve_q with one more parameter. */ class unspec_mve_function_exact_insn_rot : public function_base { public: @@ -769,7 +734,9 @@ public: rtx expand (function_expander &e) const override { + machine_mode mode = e.vector_mode (0); insn_code code; + int unspec; switch (e.pred) { @@ -779,12 +746,14 @@ public: case MODE_none: /* No predicate, no suffix. */ if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q (m_unspec_for_uint, m_unspec_for_uint, m_unspec_for_uint, e.vector_mode (0)); - else - code = code_for_mve_q (m_unspec_for_sint, m_unspec_for_sint, m_unspec_for_sint, e.vector_mode (0)); + { + unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_uint + : m_unspec_for_sint); + code = code_for_mve_q (unspec, unspec, unspec, mode); + } else - code = code_for_mve_q_f (m_unspec_for_fp, m_unspec_for_fp, e.vector_mode (0)); + code = code_for_mve_q_f (m_unspec_for_fp, m_unspec_for_fp, mode); break; default: @@ -793,42 +762,30 @@ public: return e.use_exact_insn (code); case PRED_m: + case PRED_x: switch (e.mode_suffix_id) { case MODE_none: - /* No suffix, "m" predicate. */ + /* No suffix, "m" or "x" predicate. */ if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_m (m_unspec_for_m_uint, m_unspec_for_m_uint, m_unspec_for_m_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m (m_unspec_for_m_sint, m_unspec_for_m_sint, m_unspec_for_m_sint, e.vector_mode (0)); + { + unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_m_uint + : m_unspec_for_m_sint); + code = code_for_mve_q_m (unspec, unspec, unspec, mode); + } else - code = code_for_mve_q_m_f (m_unspec_for_m_fp, m_unspec_for_m_fp, e.vector_mode (0)); - break; - - default: - gcc_unreachable (); - } - return e.use_cond_insn (code, 0); + code = code_for_mve_q_m_f (m_unspec_for_m_fp, m_unspec_for_m_fp, mode); - case PRED_x: - switch (e.mode_suffix_id) - { - case MODE_none: - /* No suffix, "x" predicate. */ - if (e.type_suffix (0).integer_p) - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_m (m_unspec_for_m_uint, m_unspec_for_m_uint, m_unspec_for_m_uint, e.vector_mode (0)); - else - code = code_for_mve_q_m (m_unspec_for_m_sint, m_unspec_for_m_sint, m_unspec_for_m_sint, e.vector_mode (0)); + if (e.pred == PRED_m) + return e.use_cond_insn (code, 0); else - code = code_for_mve_q_m_f (m_unspec_for_m_fp, m_unspec_for_m_fp, e.vector_mode (0)); + return e.use_pred_x_insn (code); break; default: gcc_unreachable (); } - return e.use_pred_x_insn (code); default: gcc_unreachable (); @@ -866,7 +823,9 @@ public: rtx expand (function_expander &e) const override { + machine_mode mode = e.vector_mode (0); insn_code code; + int unspec; if (! e.type_suffix (0).integer_p) gcc_unreachable (); @@ -878,30 +837,25 @@ public: { case PRED_none: /* No predicate, no suffix. 
*/ - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_int (m_unspec_for_uint, m_unspec_for_uint, e.vector_mode (0)); - else - code = code_for_mve_q_int (m_unspec_for_sint, m_unspec_for_sint, e.vector_mode (0)); + unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_uint + : m_unspec_for_sint); + code = code_for_mve_q_int (unspec, unspec, mode); return e.use_exact_insn (code); case PRED_m: - /* No suffix, "m" predicate. */ - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_int_m (m_unspec_for_m_uint, m_unspec_for_m_uint, e.vector_mode (0)); - else - code = code_for_mve_q_int_m (m_unspec_for_m_sint, m_unspec_for_m_sint, e.vector_mode (0)); - - return e.use_cond_insn (code, 0); - case PRED_x: - /* No suffix, "x" predicate. */ - if (e.type_suffix (0).unsigned_p) - code = code_for_mve_q_int_m (m_unspec_for_m_uint, m_unspec_for_m_uint, e.vector_mode (0)); + /* No suffix, "m" or "x" predicate. */ + unspec = (e.type_suffix (0).unsigned_p + ? m_unspec_for_m_uint + : m_unspec_for_m_sint); + code = code_for_mve_q_int_m (unspec, unspec, mode); + + if (e.pred == PRED_m) + return e.use_cond_insn (code, 0); else - code = code_for_mve_q_int_m (m_unspec_for_m_sint, m_unspec_for_m_sint, e.vector_mode (0)); - - return e.use_pred_x_insn (code); + return e.use_pred_x_insn (code); default: gcc_unreachable (); @@ -933,6 +887,7 @@ public: rtx expand (function_expander &e) const override { + machine_mode mode = e.vector_mode (0); insn_code code; if (e.mode_suffix_id != MODE_none) @@ -945,18 +900,18 @@ public: { case PRED_none: /* No predicate, no suffix. */ - code = code_for_mve_q_poly (m_unspec_for_poly, m_unspec_for_poly, e.vector_mode (0)); + code = code_for_mve_q_poly (m_unspec_for_poly, m_unspec_for_poly, mode); return e.use_exact_insn (code); case PRED_m: - /* No suffix, "m" predicate. */ - code = code_for_mve_q_poly_m (m_unspec_for_m_poly, m_unspec_for_m_poly, e.vector_mode (0)); - return e.use_cond_insn (code, 0); - case PRED_x: - /* No suffix, "x" predicate. */ - code = code_for_mve_q_poly_m (m_unspec_for_m_poly, m_unspec_for_m_poly, e.vector_mode (0)); - return e.use_pred_x_insn (code); + /* No suffix, "m" or "x" predicate. */ + code = code_for_mve_q_poly_m (m_unspec_for_m_poly, m_unspec_for_m_poly, mode); + + if (e.pred == PRED_m) + return e.use_cond_insn (code, 0); + else + return e.use_pred_x_insn (code); default: gcc_unreachable (); @@ -1003,19 +958,6 @@ public: memory_vector_mode (const function_instance &fi) const override { machine_mode mode = fi.vector_mode (0); - /* Vectors of floating-point are managed in memory as vectors of - integers. */ - switch (mode) - { - case E_V4SFmode: - mode = E_V4SImode; - break; - case E_V8HFmode: - mode = E_V8HImode; - break; - default: - break; - } if (m_vectors_per_tuple != 1) mode = targetm.array_mode (mode, m_vectors_per_tuple).require (); @@ -1024,6 +966,107 @@ public: } }; +/* A function_base that loads elements from memory and extends them + to a wider element. The memory element type is a fixed part of + the function base name. 
*/ +class load_extending : public function_base +{ +public: + CONSTEXPR load_extending (type_suffix_index signed_memory_type, + type_suffix_index unsigned_memory_type, + type_suffix_index float_memory_type) + : m_signed_memory_type (signed_memory_type), + m_unsigned_memory_type (unsigned_memory_type), + m_float_memory_type (float_memory_type) + {} + CONSTEXPR load_extending (type_suffix_index signed_memory_type, + type_suffix_index unsigned_memory_type) + : m_signed_memory_type (signed_memory_type), + m_unsigned_memory_type (unsigned_memory_type), + m_float_memory_type (NUM_TYPE_SUFFIXES) + {} + + unsigned int call_properties (const function_instance &) const override + { + return CP_READ_MEMORY; + } + + tree memory_scalar_type (const function_instance &fi) const override + { + type_suffix_index memory_type_suffix + = (fi.type_suffix (0).integer_p + ? (fi.type_suffix (0).unsigned_p + ? m_unsigned_memory_type + : m_signed_memory_type) + : m_float_memory_type); + return scalar_types[type_suffixes[memory_type_suffix].vector_type]; + } + + machine_mode memory_vector_mode (const function_instance &fi) const override + { + type_suffix_index memory_type_suffix + = (fi.type_suffix (0).integer_p + ? (fi.type_suffix (0).unsigned_p + ? m_unsigned_memory_type + : m_signed_memory_type) + : m_float_memory_type); + machine_mode mem_mode = type_suffixes[memory_type_suffix].vector_mode; + machine_mode reg_mode = fi.vector_mode (0); + + return arm_mve_data_mode (GET_MODE_INNER (mem_mode), + GET_MODE_NUNITS (reg_mode)).require (); + } + + /* The type of the memory elements. This is part of the function base + name rather than a true type suffix. */ + type_suffix_index m_signed_memory_type; + type_suffix_index m_unsigned_memory_type; + type_suffix_index m_float_memory_type; +}; + +/* A function_base that truncates vector elements and stores them to memory. + The memory element width is a fixed part of the function base name. */ +class store_truncating : public function_base +{ +public: + CONSTEXPR store_truncating (scalar_mode to_int_mode, + opt_scalar_mode to_float_mode) + : m_to_int_mode (to_int_mode), m_to_float_mode (to_float_mode) + {} + + unsigned int call_properties (const function_instance &) const override + { + return CP_WRITE_MEMORY; + } + + tree memory_scalar_type (const function_instance &fi) const override + { + /* In truncating stores, the signedness of the memory element is defined + to be the same as the signedness of the vector element. The signedness + doesn't make any difference to the behavior of the function. */ + type_class_index tclass = fi.type_suffix (0).tclass; + unsigned int element_bits + = GET_MODE_BITSIZE (fi.type_suffix (0).integer_p + ? m_to_int_mode + : m_to_float_mode.require ()); + type_suffix_index suffix = find_type_suffix (tclass, element_bits); + return scalar_types[type_suffixes[suffix].vector_type]; + } + + machine_mode memory_vector_mode (const function_instance &fi) const override + { + poly_uint64 nunits = GET_MODE_NUNITS (fi.vector_mode (0)); + scalar_mode mode = (fi.type_suffix (0).integer_p + ? m_to_int_mode + : m_to_float_mode.require ()); + return arm_mve_data_mode (mode, nunits).require (); + } + + /* The mode of a single memory element. 
*/ + scalar_mode m_to_int_mode; + opt_scalar_mode m_to_float_mode; +}; + } /* end namespace arm_mve */ /* Declare the global function base NAME, creating it from an instance diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc b/gcc/config/arm/arm-mve-builtins-shapes.cc index ba20c6a..12e6212 100644 --- a/gcc/config/arm/arm-mve-builtins-shapes.cc +++ b/gcc/config/arm/arm-mve-builtins-shapes.cc @@ -320,6 +320,45 @@ build_16_32 (function_builder &b, const char *signature, } } +/* TYPE is the largest type suffix associated with the arguments of R, but the + result is twice as wide. Return the associated type suffix of + EXPECTED_TCLASS if it exists, otherwise report an appropriate error and + return NUM_TYPE_SUFFIXES. */ +static type_suffix_index +long_type_suffix (function_resolver &r, + type_suffix_index type, + type_class_index expected_tclass) +{ + unsigned int element_bits = type_suffixes[type].element_bits; + if (expected_tclass == function_resolver::SAME_TYPE_CLASS) + expected_tclass = type_suffixes[type].tclass; + + if (type_suffixes[type].integer_p && element_bits < 64) + return find_type_suffix (expected_tclass, element_bits * 2); + + r.report_no_such_form (type); + return NUM_TYPE_SUFFIXES; +} + +/* Return the type suffix half as wide as TYPE with EXPECTED_TCLASS if it + exists, otherwise report an appropriate error and return + NUM_TYPE_SUFFIXES. */ +static type_suffix_index +half_type_suffix (function_resolver &r, + type_suffix_index type, + type_class_index expected_tclass) +{ + unsigned int element_bits = type_suffixes[type].element_bits; + if (expected_tclass == function_resolver::SAME_TYPE_CLASS) + expected_tclass = type_suffixes[type].tclass; + + if (type_suffixes[type].integer_p && element_bits > 8) + return find_type_suffix (expected_tclass, element_bits / 2); + + r.report_no_such_form (type); + return NUM_TYPE_SUFFIXES; +} + /* Declare the function shape NAME, pointing it to an instance of class <NAME>_def. */ #define SHAPE(NAME) \ @@ -330,7 +369,8 @@ build_16_32 (function_builder &b, const char *signature, struct nonoverloaded_base : public function_shape { bool - explicit_type_suffix_p (unsigned int, enum predication_index, enum mode_suffix_index) const override + explicit_type_suffix_p (unsigned int, enum predication_index, + enum mode_suffix_index, type_suffix_info) const override { return true; } @@ -360,7 +400,8 @@ template<unsigned int EXPLICIT_MASK> struct overloaded_base : public function_shape { bool - explicit_type_suffix_p (unsigned int i, enum predication_index, enum mode_suffix_index) const override + explicit_type_suffix_p (unsigned int i, enum predication_index, + enum mode_suffix_index, type_suffix_info) const override { return (EXPLICIT_MASK >> i) & 1; } @@ -475,18 +516,23 @@ struct binary_acca_int32_def : public overloaded_base<0> { unsigned int i, nargs; type_suffix_index type; + const char *first_type_name; + if (!r.check_gp_argument (3, i, nargs) || (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES) return error_mark_node; + first_type_name = (type_suffixes[type].unsigned_p + ? 
"uint32_t" + : "int32_t"); + if (!r.require_scalar_type (0, first_type_name)) + return error_mark_node; + unsigned int last_arg = i + 1; for (i = 1; i < last_arg; i++) if (!r.require_matching_vector_type (i, type)) return error_mark_node; - if (!r.require_integer_immediate (0)) - return error_mark_node; - return r.resolve_to (r.mode_suffix_id, type); } }; @@ -512,18 +558,24 @@ struct binary_acca_int64_def : public overloaded_base<0> { unsigned int i, nargs; type_suffix_index type; + const char *first_type_name; + if (!r.check_gp_argument (3, i, nargs) || (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES) return error_mark_node; + + first_type_name = (type_suffixes[type].unsigned_p + ? "uint64_t" + : "int64_t"); + if (!r.require_scalar_type (0, first_type_name)) + return error_mark_node; + unsigned int last_arg = i + 1; for (i = 1; i < last_arg; i++) if (!r.require_matching_vector_type (i, type)) return error_mark_node; - if (!r.require_integer_immediate (0)) - return error_mark_node; - return r.resolve_to (r.mode_suffix_id, type); } }; @@ -611,7 +663,7 @@ struct binary_lshift_unsigned_def : public overloaded_base<0> bool preserve_user_namespace) const override { b.add_overloaded_functions (group, MODE_n, preserve_user_namespace); - build_all (b, "vu0,vs0,ss32", group, MODE_n, preserve_user_namespace); + build_all (b, "vu0,vs0,su64", group, MODE_n, preserve_user_namespace); } tree @@ -620,6 +672,7 @@ struct binary_lshift_unsigned_def : public overloaded_base<0> unsigned int i, nargs; type_suffix_index type; if (!r.check_gp_argument (2, i, nargs) + || !r.require_integer_immediate (i) || (type = r.infer_vector_type (i-1)) == NUM_TYPE_SUFFIXES) return error_mark_node; @@ -634,10 +687,6 @@ struct binary_lshift_unsigned_def : public overloaded_base<0> return error_mark_node; } - for (; i < nargs; ++i) - if (!r.require_integer_immediate (i)) - return error_mark_node; - return r.resolve_to (r.mode_suffix_id, type); } @@ -769,16 +818,13 @@ struct binary_move_narrow_def : public overloaded_base<0> resolve (function_resolver &r) const override { unsigned int i, nargs; - type_suffix_index type; + type_suffix_index type, narrow_suffix; if (!r.check_gp_argument (2, i, nargs) - || (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES) + || (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES + || ((narrow_suffix = half_type_suffix (r, type, r.SAME_TYPE_CLASS)) + == NUM_TYPE_SUFFIXES)) return error_mark_node; - type_suffix_index narrow_suffix - = find_type_suffix (type_suffixes[type].tclass, - type_suffixes[type].element_bits / 2); - - if (!r.require_matching_vector_type (0, narrow_suffix)) return error_mark_node; @@ -806,15 +852,13 @@ struct binary_move_narrow_unsigned_def : public overloaded_base<0> resolve (function_resolver &r) const override { unsigned int i, nargs; - type_suffix_index type; + type_suffix_index type, narrow_suffix; if (!r.check_gp_argument (2, i, nargs) - || (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES) + || (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES + || ((narrow_suffix = half_type_suffix (r, type, TYPE_unsigned)) + == NUM_TYPE_SUFFIXES)) return error_mark_node; - type_suffix_index narrow_suffix - = find_type_suffix (TYPE_unsigned, - type_suffixes[type].element_bits / 2); - if (!r.require_matching_vector_type (0, narrow_suffix)) return error_mark_node; @@ -865,7 +909,12 @@ SHAPE (binary_opt_n) int16x8_t [__arm_]vorrq_m[_s16](int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) int16x8_t [__arm_]vorrq_x[_s16](int16x8_t a, int16x8_t b, mve_pred16_t p) 
int16x8_t [__arm_]vorrq[_n_s16](int16x8_t a, const int16_t imm) - int16x8_t [__arm_]vorrq_m_n[_s16](int16x8_t a, const int16_t imm, mve_pred16_t p) */ + int16x8_t [__arm_]vorrq_m_n[_s16](int16x8_t a, const int16_t imm, mve_pred16_t p) + + No "_n" forms for floating-point, nor 8-bit integers: + float16x8_t [__arm_]vorrq[_f16](float16x8_t a, float16x8_t b) + float16x8_t [__arm_]vorrq_m[_f16](float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p) + float16x8_t [__arm_]vorrq_x[_f16](float16x8_t a, float16x8_t b, mve_pred16_t p) */ struct binary_orrq_def : public overloaded_base<0> { bool @@ -1090,23 +1139,21 @@ struct binary_rshift_narrow_def : public overloaded_base<0> bool preserve_user_namespace) const override { b.add_overloaded_functions (group, MODE_n, preserve_user_namespace); - build_all (b, "vh0,vh0,v0,ss32", group, MODE_n, preserve_user_namespace); + build_all (b, "vh0,vh0,v0,su64", group, MODE_n, preserve_user_namespace); } tree resolve (function_resolver &r) const override { unsigned int i, nargs; - type_suffix_index type; + type_suffix_index type, narrow_suffix; if (!r.check_gp_argument (3, i, nargs) || (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES + || ((narrow_suffix = half_type_suffix (r, type, r.SAME_TYPE_CLASS)) + == NUM_TYPE_SUFFIXES) || !r.require_integer_immediate (i)) return error_mark_node; - type_suffix_index narrow_suffix - = find_type_suffix (type_suffixes[type].tclass, - type_suffixes[type].element_bits / 2); - if (!r.require_matching_vector_type (0, narrow_suffix)) return error_mark_node; @@ -1137,23 +1184,21 @@ struct binary_rshift_narrow_unsigned_def : public overloaded_base<0> bool preserve_user_namespace) const override { b.add_overloaded_functions (group, MODE_n, preserve_user_namespace); - build_all (b, "vhu0,vhu0,v0,ss32", group, MODE_n, preserve_user_namespace); + build_all (b, "vhu0,vhu0,v0,su64", group, MODE_n, preserve_user_namespace); } tree resolve (function_resolver &r) const override { unsigned int i, nargs; - type_suffix_index type; + type_suffix_index type, narrow_suffix; if (!r.check_gp_argument (3, i, nargs) || (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES + || ((narrow_suffix = half_type_suffix (r, type, TYPE_unsigned)) + == NUM_TYPE_SUFFIXES) || !r.require_integer_immediate (i)) return error_mark_node; - type_suffix_index narrow_suffix - = find_type_suffix (TYPE_unsigned, - type_suffixes[type].element_bits / 2); - if (!r.require_matching_vector_type (0, narrow_suffix)) return error_mark_node; @@ -1190,15 +1235,13 @@ struct binary_widen_def : public overloaded_base<0> resolve (function_resolver &r) const override { unsigned int i, nargs; - type_suffix_index type; + type_suffix_index type, wide_suffix; if (!r.check_gp_argument (2, i, nargs) - || (type = r.infer_vector_type (i - 1)) == NUM_TYPE_SUFFIXES) + || (type = r.infer_vector_type (i - 1)) == NUM_TYPE_SUFFIXES + || ((wide_suffix = long_type_suffix (r, type, r.SAME_TYPE_CLASS)) + == NUM_TYPE_SUFFIXES)) return error_mark_node; - type_suffix_index wide_suffix - = find_type_suffix (type_suffixes[type].tclass, - type_suffixes[type].element_bits * 2); - if (!r.require_matching_vector_type (i, type)) return error_mark_node; @@ -1283,17 +1326,15 @@ struct binary_widen_n_def : public overloaded_base<0> resolve (function_resolver &r) const override { unsigned int i, nargs; - type_suffix_index type; + type_suffix_index type, wide_suffix; tree res; if (!r.check_gp_argument (2, i, nargs) || (type = r.infer_vector_type (i - 1)) == NUM_TYPE_SUFFIXES + || ((wide_suffix = 
long_type_suffix (r, type, r.SAME_TYPE_CLASS)) + == NUM_TYPE_SUFFIXES) || !r.require_integer_immediate (i)) return error_mark_node; - type_suffix_index wide_suffix - = find_type_suffix (type_suffixes[type].tclass, - type_suffixes[type].element_bits * 2); - /* Check the inactive argument has the wide type. */ if (((r.pred == PRED_m) && (r.infer_vector_type (0) == wide_suffix)) || r.pred == PRED_none @@ -1337,15 +1378,13 @@ struct binary_widen_opt_n_def : public overloaded_base<0> resolve (function_resolver &r) const override { unsigned int i, nargs; - type_suffix_index type; + type_suffix_index type, wide_suffix; if (!r.check_gp_argument (2, i, nargs) - || (type = r.infer_vector_type (i - 1)) == NUM_TYPE_SUFFIXES) + || (type = r.infer_vector_type (i - 1)) == NUM_TYPE_SUFFIXES + || ((wide_suffix = long_type_suffix (r, type, r.SAME_TYPE_CLASS)) + == NUM_TYPE_SUFFIXES)) return error_mark_node; - type_suffix_index wide_suffix - = find_type_suffix (type_suffixes[type].tclass, - type_suffixes[type].element_bits * 2); - /* Skip last argument, may be scalar, will be checked below by finish_opt_n_resolution. */ unsigned int last_arg = i--; @@ -1403,12 +1442,6 @@ struct create_def : public nonoverloaded_base { build_all (b, "v0,su64,su64", group, MODE_none, preserve_user_namespace); } - - tree - resolve (function_resolver &r) const override - { - return r.resolve_uniform (0, 2); - } }; SHAPE (create) @@ -1428,7 +1461,9 @@ struct inherent_def : public nonoverloaded_base }; SHAPE (inherent) -/* sv<t0>_t svfoo[_t0](const <t0>_t *) +/* <T0>_t vfoo[_t0](const <s0>_t *) + + where <s0> is the scalar name of <T0>. Example: vld1q. int8x16_t [__arm_]vld1q[_s8](int8_t const *base) @@ -1460,6 +1495,24 @@ struct load_def : public overloaded_base<0> }; SHAPE (load) +/* <T0>_t foo_t0 (const <X>_t *) + + where <X> is determined by the function base name. + + Example: vldrq. + int32x4_t [__arm_]vldrwq_s32 (int32_t const *base) + uint32x4_t [__arm_]vldrhq_z_u32 (uint16_t const *base, mve_pred16_t p) */ +struct load_ext_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group, + bool preserve_user_namespace) const override + { + build_all (b, "t0,al", group, MODE_none, preserve_user_namespace); + } +}; +SHAPE (load_ext) + /* <T0>_t vfoo[_t0](<T0>_t) <T0>_t vfoo_n_t0(<sT0>_t) @@ -1509,14 +1562,18 @@ struct mvn_def : public overloaded_base<0> }; SHAPE (mvn) -/* void vfoo[_t0](<X>_t *, v<t0>[xN]_t) +/* void vfoo[_t0](<X>_t *, <T0>[xN]_t) where <X> might be tied to <t0> (for non-truncating stores) or might depend on the function base name (for truncating stores). Example: vst1q. void [__arm_]vst1q[_s8](int8_t *base, int8x16_t value) - void [__arm_]vst1q_p[_s8](int8_t *base, int8x16_t value, mve_pred16_t p) */ + void [__arm_]vst1q_p[_s8](int8_t *base, int8x16_t value, mve_pred16_t p) + + Example: vstrb. 
+ void [__arm_]vstrbq[_s16](int8_t *base, int16x8_t value) + void [__arm_]vstrbq_p[_s16](int8_t *base, int16x8_t value, mve_pred16_t p) */ struct store_def : public overloaded_base<0> { void @@ -1587,7 +1644,7 @@ struct ternary_lshift_def : public overloaded_base<0> bool preserve_user_namespace) const override { b.add_overloaded_functions (group, MODE_n, preserve_user_namespace); - build_all (b, "v0,v0,v0,ss32", group, MODE_n, preserve_user_namespace); + build_all (b, "v0,v0,v0,su64", group, MODE_n, preserve_user_namespace); } tree @@ -1682,7 +1739,7 @@ struct ternary_rshift_def : public overloaded_base<0> bool preserve_user_namespace) const override { b.add_overloaded_functions (group, MODE_n, preserve_user_namespace); - build_all (b, "v0,v0,v0,ss32", group, MODE_n, preserve_user_namespace); + build_all (b, "v0,v0,v0,su64", group, MODE_n, preserve_user_namespace); } tree @@ -1837,11 +1894,18 @@ struct unary_int32_acc_def : public overloaded_base<0> { unsigned int i, nargs; type_suffix_index type; + const char *first_type_name; + if (!r.check_gp_argument (2, i, nargs) - || !r.require_integer_immediate (0) || (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES) return error_mark_node; + first_type_name = (type_suffixes[type].unsigned_p + ? "uint32_t" + : "int32_t"); + if (!r.require_scalar_type (0, first_type_name)) + return error_mark_node; + return r.resolve_to (r.mode_suffix_id, type); } }; @@ -1857,7 +1921,7 @@ struct unary_n_def : public overloaded_base<0> { bool explicit_type_suffix_p (unsigned int, enum predication_index pred, - enum mode_suffix_index) const override + enum mode_suffix_index, type_suffix_info) const override { return pred != PRED_m; } @@ -1923,16 +1987,14 @@ struct unary_widen_def : public overloaded_base<0> resolve (function_resolver &r) const override { unsigned int i, nargs; - type_suffix_index type; + type_suffix_index type, wide_suffix; tree res; if (!r.check_gp_argument (1, i, nargs) - || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) + || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES + || ((wide_suffix = long_type_suffix (r, type, r.SAME_TYPE_CLASS)) + == NUM_TYPE_SUFFIXES)) return error_mark_node; - type_suffix_index wide_suffix - = find_type_suffix (type_suffixes[type].tclass, - type_suffixes[type].element_bits * 2); - /* Check the inactive argument has the wide type. */ if ((r.pred == PRED_m) && (r.infer_vector_type (0) != wide_suffix)) @@ -1980,6 +2042,425 @@ struct unary_widen_acc_def : public overloaded_base<0> }; SHAPE (unary_widen_acc) +/* <T0>_t vfoo[_t0](T0, T0, uint32_t*) + + Example: vadcq. + int32x4_t [__arm_]vadcq[_s32](int32x4_t a, int32x4_t b, unsigned *carry) + int32x4_t [__arm_]vadcq_m[_s32](int32x4_t inactive, int32x4_t a, int32x4_t b, unsigned *carry, mve_pred16_t p) */ +struct vadc_vsbc_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group, + bool preserve_user_namespace) const override + { + b.add_overloaded_functions (group, MODE_none, preserve_user_namespace); + build_all (b, "v0,v0,v0,as", group, MODE_none, preserve_user_namespace); + } + + tree + resolve (function_resolver &r) const override + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (3, i, nargs) + || (type = r.infer_vector_type (0)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + if (!r.require_matching_vector_type (1, type)) + return error_mark_node; + + /* Check that last arg is a pointer. 
*/ + if (!POINTER_TYPE_P (r.get_argument_type (i))) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } +}; +SHAPE (vadc_vsbc) + +/* mve_pred16_t foo_t0(uint32_t) + + Example: vctp16q. + mve_pred16_t [__arm_]vctp16q(uint32_t a) + mve_pred16_t [__arm_]vctp16q_m(uint32_t a, mve_pred16_t p) */ +struct vctp_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group, + bool preserve_user_namespace) const override + { + build_all (b, "p,su32", group, MODE_none, preserve_user_namespace); + } +}; +SHAPE (vctp) + +/* <T0>_t foo_t0[_t1](<T1>_t) + <T0>_t foo_t0_n[_t1](<T1>_t, const int) + + Example: vcvtq. + float32x4_t [__arm_]vcvtq[_f32_s32](int32x4_t a) + float32x4_t [__arm_]vcvtq_m[_f32_s32](float32x4_t inactive, int32x4_t a, mve_pred16_t p) + float32x4_t [__arm_]vcvtq_x[_f32_s32](int32x4_t a, mve_pred16_t p) + float32x4_t [__arm_]vcvtq_n[_f32_s32](int32x4_t a, const int imm6) + float32x4_t [__arm_]vcvtq_m_n[_f32_s32](float32x4_t inactive, int32x4_t a, const int imm6, mve_pred16_t p) + float32x4_t [__arm_]vcvtq_x_n[_f32_s32](int32x4_t a, const int imm6, mve_pred16_t p) + int32x4_t [__arm_]vcvtq_s32_f32(float32x4_t a) + int32x4_t [__arm_]vcvtq_m[_s32_f32](int32x4_t inactive, float32x4_t a, mve_pred16_t p) + int32x4_t [__arm_]vcvtq_x_s32_f32(float32x4_t a, mve_pred16_t p) + int32x4_t [__arm_]vcvtq_n_s32_f32(float32x4_t a, const int imm6) + int32x4_t [__arm_]vcvtq_m_n[_s32_f32](int32x4_t inactive, float32x4_t a, const int imm6, mve_pred16_t p) + int32x4_t [__arm_]vcvtq_x_n_s32_f32(float32x4_t a, const int imm6, mve_pred16_t p) */ +struct vcvt_def : public overloaded_base<0> +{ + bool + explicit_type_suffix_p (unsigned int i, enum predication_index pred, + enum mode_suffix_index, + type_suffix_info type_info) const override + { + if (pred != PRED_m + && ((i == 0 && type_info.integer_p) + || (i == 1 && type_info.float_p))) + return true; + return false; + } + + bool + explicit_mode_suffix_p (enum predication_index, + enum mode_suffix_index) const override + { + return true; + } + + void + build (function_builder &b, const function_group_info &group, + bool preserve_user_namespace) const override + { + b.add_overloaded_functions (group, MODE_none, preserve_user_namespace); + b.add_overloaded_functions (group, MODE_n, preserve_user_namespace); + build_all (b, "v0,v1", group, MODE_none, preserve_user_namespace); + build_all (b, "v0,v1,su64", group, MODE_n, preserve_user_namespace); + } + + tree + resolve (function_resolver &r) const override + { + unsigned int i, nargs; + type_suffix_index from_type; + tree res; + unsigned int nimm = (r.mode_suffix_id == MODE_none) ? 0 : 1; + + if (!r.check_gp_argument (1 + nimm, i, nargs) + || (from_type + = r.infer_vector_type (i - nimm)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + if (nimm > 0 + && !r.require_integer_immediate (i)) + return error_mark_node; + + type_suffix_index to_type; + + if (type_suffixes[from_type].integer_p) + { + to_type = find_type_suffix (TYPE_float, + type_suffixes[from_type].element_bits); + } + else + { + /* This should not happen: when 'from_type' is float, the type + suffixes are not overloaded (except for "m" predication, + handled above). */ + gcc_assert (r.pred == PRED_m); + + /* Get the return type from the 'inactive' argument. 
*/ + to_type = r.infer_vector_type (0); + } + + if ((res = r.lookup_form (r.mode_suffix_id, to_type, from_type))) + return res; + + return r.report_no_such_form (from_type); + } + + bool + check (function_checker &c) const override + { + if (c.mode_suffix_id == MODE_none) + return true; + + unsigned int bits = c.type_suffix (0).element_bits; + return c.require_immediate_range (1, 1, bits); + } +}; +SHAPE (vcvt) + +/* float16x8_t foo_f16_f32(float16x8_t, float32x4_t) + + Example: vcvttq_f16_f32. + float16x8_t [__arm_]vcvttq_f16_f32(float16x8_t a, float32x4_t b) + float16x8_t [__arm_]vcvttq_m_f16_f32(float16x8_t a, float32x4_t b, mve_pred16_t p) +*/ +struct vcvt_f16_f32_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group, + bool preserve_user_namespace) const override + { + build_all (b, "v0,v0,v1", group, MODE_none, preserve_user_namespace); + } +}; +SHAPE (vcvt_f16_f32) + +/* float32x4_t foo_f32_f16(float16x8_t) + + Example: vcvttq_f32_f16. + float32x4_t [__arm_]vcvttq_f32_f16(float16x8_t a) + float32x4_t [__arm_]vcvttq_m_f32_f16(float32x4_t inactive, float16x8_t a, mve_pred16_t p) + float32x4_t [__arm_]vcvttq_x_f32_f16(float16x8_t a, mve_pred16_t p) +*/ +struct vcvt_f32_f16_def : public nonoverloaded_base +{ + void + build (function_builder &b, const function_group_info &group, + bool preserve_user_namespace) const override + { + build_all (b, "v0,v1", group, MODE_none, preserve_user_namespace); + } +}; +SHAPE (vcvt_f32_f16) + +/* <T0>_t foo_t0[_t1](<T1>_t) + + Example: vcvtaq. + int16x8_t [__arm_]vcvtaq_s16_f16(float16x8_t a) + int16x8_t [__arm_]vcvtaq_m[_s16_f16](int16x8_t inactive, float16x8_t a, mve_pred16_t p) + int16x8_t [__arm_]vcvtaq_x_s16_f16(float16x8_t a, mve_pred16_t p) +*/ +struct vcvtx_def : public overloaded_base<0> +{ + bool + explicit_type_suffix_p (unsigned int, enum predication_index pred, + enum mode_suffix_index, + type_suffix_info) const override + { + return pred != PRED_m; + } + + bool + skip_overload_p (enum predication_index pred, enum mode_suffix_index) + const override + { + return pred != PRED_m; + } + + void + build (function_builder &b, const function_group_info &group, + bool preserve_user_namespace) const override + { + b.add_overloaded_functions (group, MODE_none, preserve_user_namespace); + build_all (b, "v0,v1", group, MODE_none, preserve_user_namespace); + } + + tree + resolve (function_resolver &r) const override + { + unsigned int i, nargs; + type_suffix_index from_type; + tree res; + + if (!r.check_gp_argument (1, i, nargs) + || (from_type + = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + type_suffix_index to_type; + + gcc_assert (r.pred == PRED_m); + + /* Get the return type from the 'inactive' argument. */ + to_type = r.infer_vector_type (0); + + if ((res = r.lookup_form (r.mode_suffix_id, to_type, from_type))) + return res; + + return r.report_no_such_form (from_type); + } +}; +SHAPE (vcvtx) + +/* <T0>_t vfoo[_n]_t0(uint32_t, const int) + <T0>_t vfoo[_wb]_t0(uint32_t *, const int) + + Shape for vector increment or decrement and duplicate operations that take + an integer or pointer to integer first argument and an immediate, and + produce a vector. + + Check that 'imm' is one of 1, 2, 4 or 8. + + Example: vddupq. 
+ uint8x16_t [__arm_]vddupq[_n]_u8(uint32_t a, const int imm) + uint8x16_t [__arm_]vddupq[_wb]_u8(uint32_t *a, const int imm) + uint8x16_t [__arm_]vddupq_m[_n_u8](uint8x16_t inactive, uint32_t a, const int imm, mve_pred16_t p) + uint8x16_t [__arm_]vddupq_m[_wb_u8](uint8x16_t inactive, uint32_t *a, const int imm, mve_pred16_t p) + uint8x16_t [__arm_]vddupq_x[_n]_u8(uint32_t a, const int imm, mve_pred16_t p) + uint8x16_t [__arm_]vddupq_x[_wb]_u8(uint32_t *a, const int imm, mve_pred16_t p) */ +struct viddup_def : public overloaded_base<0> +{ + bool + explicit_type_suffix_p (unsigned int i, enum predication_index pred, + enum mode_suffix_index, + type_suffix_info) const override + { + return ((i == 0) && (pred != PRED_m)); + } + + bool + skip_overload_p (enum predication_index, enum mode_suffix_index mode) const override + { + /* For MODE_wb, share the overloaded instance with MODE_n. */ + if (mode == MODE_wb) + return true; + + return false; + } + + void + build (function_builder &b, const function_group_info &group, + bool preserve_user_namespace) const override + { + b.add_overloaded_functions (group, MODE_none, preserve_user_namespace); + build_all (b, "v0,su32,su64", group, MODE_n, preserve_user_namespace); + build_all (b, "v0,as,su64", group, MODE_wb, preserve_user_namespace); + } + + tree + resolve (function_resolver &r) const override + { + unsigned int i, nargs; + type_suffix_index type_suffix = NUM_TYPE_SUFFIXES; + if (!r.check_gp_argument (2, i, nargs)) + return error_mark_node; + + type_suffix = r.type_suffix_ids[0]; + /* With PRED_m, there is no type suffix, so infer it from the first (inactive) + argument. */ + if (type_suffix == NUM_TYPE_SUFFIXES) + type_suffix = r.infer_vector_type (0); + + unsigned int last_arg = i - 1; + /* Check that last_arg is either scalar or pointer. */ + if (!r.scalar_argument_p (last_arg)) + return error_mark_node; + + if (!r.require_integer_immediate (last_arg + 1)) + return error_mark_node; + + /* With MODE_n we expect a scalar, with MODE_wb we expect a pointer. */ + mode_suffix_index mode_suffix; + if (POINTER_TYPE_P (r.get_argument_type (last_arg))) + mode_suffix = MODE_wb; + else + mode_suffix = MODE_n; + + return r.resolve_to (mode_suffix, type_suffix); + } + + bool + check (function_checker &c) const override + { + return c.require_immediate_one_of (1, 1, 2, 4, 8); + } +}; +SHAPE (viddup) + +/* <T0>_t vfoo[_n]_t0(uint32_t, uint32_t, const int) + <T0>_t vfoo[_wb]_t0(uint32_t *, uint32_t, const int) + + Shape for vector increment or decrement with wrap and duplicate operations + that take an integer or pointer to integer first argument, an integer second + argument and an immediate, and produce a vector. + + Check that 'imm' is one of 1, 2, 4 or 8. + + Example: vdwdupq.
+ uint8x16_t [__arm_]vdwdupq[_n]_u8(uint32_t a, uint32_t b, const int imm) + uint8x16_t [__arm_]vdwdupq[_wb]_u8(uint32_t *a, uint32_t b, const int imm) + uint8x16_t [__arm_]vdwdupq_m[_n_u8](uint8x16_t inactive, uint32_t a, uint32_t b, const int imm, mve_pred16_t p) + uint8x16_t [__arm_]vdwdupq_m[_wb_u8](uint8x16_t inactive, uint32_t *a, uint32_t b, const int imm, mve_pred16_t p) + uint8x16_t [__arm_]vdwdupq_x[_n]_u8(uint32_t a, uint32_t b, const int imm, mve_pred16_t p) + uint8x16_t [__arm_]vdwdupq_x[_wb]_u8(uint32_t *a, uint32_t b, const int imm, mve_pred16_t p) */ +struct vidwdup_def : public overloaded_base<0> +{ + bool + explicit_type_suffix_p (unsigned int i, enum predication_index pred, + enum mode_suffix_index, + type_suffix_info) const override + { + return ((i == 0) && (pred != PRED_m)); + } + + bool + skip_overload_p (enum predication_index, enum mode_suffix_index mode) const override + { + /* For MODE_wb, share the overloaded instance with MODE_n. */ + if (mode == MODE_wb) + return true; + + return false; + } + + void + build (function_builder &b, const function_group_info &group, + bool preserve_user_namespace) const override + { + b.add_overloaded_functions (group, MODE_none, preserve_user_namespace); + build_all (b, "v0,su32,su32,su64", group, MODE_n, preserve_user_namespace); + build_all (b, "v0,as,su32,su64", group, MODE_wb, preserve_user_namespace); + } + + tree + resolve (function_resolver &r) const override + { + unsigned int i, nargs; + type_suffix_index type_suffix = NUM_TYPE_SUFFIXES; + if (!r.check_gp_argument (3, i, nargs)) + return error_mark_node; + + type_suffix = r.type_suffix_ids[0]; + /* With PRED_m, there is no type suffix, so infer it from the first (inactive) + argument. */ + if (type_suffix == NUM_TYPE_SUFFIXES) + type_suffix = r.infer_vector_type (0); + + unsigned int last_arg = i - 2; + /* Check that last_arg is either scalar or pointer. */ + if (!r.scalar_argument_p (last_arg)) + return error_mark_node; + + if (!r.scalar_argument_p (last_arg + 1)) + return error_mark_node; + + if (!r.require_integer_immediate (last_arg + 2)) + return error_mark_node; + + /* With MODE_n we expect a scalar, with MODE_wb we expect a pointer. */ + mode_suffix_index mode_suffix; + if (POINTER_TYPE_P (r.get_argument_type (last_arg))) + mode_suffix = MODE_wb; + else + mode_suffix = MODE_n; + + return r.resolve_to (mode_suffix, type_suffix); + } + + bool + check (function_checker &c) const override + { + return c.require_immediate_one_of (2, 1, 2, 4, 8); + } +}; +SHAPE (vidwdup) + /* <T0>_t vfoo[_t0](<T0>_t, <T0>_t, mve_pred16_t) i.e. a version of the standard ternary shape in which @@ -2019,6 +2500,50 @@ struct vpsel_def : public overloaded_base<0> }; SHAPE (vpsel) +/* <T0>_t vfoo[_t0](T0, uint32_t* , const int) + + Check that 'imm' is in [1..32]. + + Example: vshlcq.
+ uint8x16_t [__arm_]vshlcq[_u8](uint8x16_t a, uint32_t *b, const int imm) + uint8x16_t [__arm_]vshlcq_m[_u8](uint8x16_t a, uint32_t *b, const int imm, mve_pred16_t p) */ +struct vshlc_def : public overloaded_base<0> +{ + void + build (function_builder &b, const function_group_info &group, + bool preserve_user_namespace) const override + { + b.add_overloaded_functions (group, MODE_none, preserve_user_namespace); + build_all (b, "v0,v0,as,su64", group, MODE_none, preserve_user_namespace); + } + + tree + resolve (function_resolver &r) const override + { + unsigned int i, nargs; + type_suffix_index type; + if (!r.check_gp_argument (3, i, nargs) + || (type = r.infer_vector_type (0)) == NUM_TYPE_SUFFIXES) + return error_mark_node; + + /* Check that arg #2 is a pointer. */ + if (!POINTER_TYPE_P (r.get_argument_type (i - 1))) + return error_mark_node; + + if (!r.require_integer_immediate (i)) + return error_mark_node; + + return r.resolve_to (r.mode_suffix_id, type); + } + + bool + check (function_checker &c) const override + { + return c.require_immediate_range (2, 1, 32); + } +}; +SHAPE (vshlc) + } /* end namespace arm_mve */ #undef SHAPE diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h b/gcc/config/arm/arm-mve-builtins-shapes.h index 61aa4fa..db7c631 100644 --- a/gcc/config/arm/arm-mve-builtins-shapes.h +++ b/gcc/config/arm/arm-mve-builtins-shapes.h @@ -62,6 +62,7 @@ namespace arm_mve extern const function_shape *const create; extern const function_shape *const inherent; extern const function_shape *const load; + extern const function_shape *const load_ext; extern const function_shape *const mvn; extern const function_shape *const store; extern const function_shape *const ternary; @@ -77,7 +78,16 @@ namespace arm_mve extern const function_shape *const unary_n; extern const function_shape *const unary_widen; extern const function_shape *const unary_widen_acc; + extern const function_shape *const vadc_vsbc; + extern const function_shape *const vctp; + extern const function_shape *const vcvt; + extern const function_shape *const vcvt_f16_f32; + extern const function_shape *const vcvt_f32_f16; + extern const function_shape *const vcvtx; + extern const function_shape *const viddup; + extern const function_shape *const vidwdup; extern const function_shape *const vpsel; + extern const function_shape *const vshlc; } /* end namespace arm_mve::shapes */ } /* end namespace arm_mve */ diff --git a/gcc/config/arm/arm-mve-builtins.cc b/gcc/config/arm/arm-mve-builtins.cc index 7e82176..af19086 100644 --- a/gcc/config/arm/arm-mve-builtins.cc +++ b/gcc/config/arm/arm-mve-builtins.cc @@ -19,6 +19,7 @@ #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -149,8 +150,10 @@ CONSTEXPR const type_suffix_info type_suffixes[NUM_TYPE_SUFFIXES + 1] = { class ("b", "f", etc.) and a numerical bit count. */ /* _f16. */ -#define TYPES_float16(S, D) \ - S (f16) +#define TYPES_float_16(S, D) S (f16) + +/* _f32. */ +#define TYPES_float_32(S, D) S (f32) /* _f16 _f32. */ #define TYPES_all_float(S, D) \ @@ -205,6 +208,36 @@ CONSTEXPR const type_suffix_info type_suffixes[NUM_TYPE_SUFFIXES + 1] = { #define TYPES_signed_32(S, D) \ S (s32) +/* All the type combinations allowed by vcvtq. */ +#define TYPES_cvt(S, D) \ + D (f16, s16), \ + D (f16, u16), \ + \ + D (f32, s32), \ + D (f32, u32), \ + \ + D (s16, f16), \ + D (s32, f32), \ + \ + D (u16, f16), \ + D (u32, f32) + +/* vcvt[bt]q_f16_f32. */ +#define TYPES_cvt_f16_f32(S, D) \ + D (f16, f32) + +/* vcvt[bt]q_f32_f16.
*/ +#define TYPES_cvt_f32_f16(S, D) \ + D (f32, f16) + +/* All the type combinations allowed by vcvtXq. */ +#define TYPES_cvtx(S, D) \ + D (s16, f16), \ + D (s32, f32), \ + \ + D (u16, f16), \ + D (u32, f32) + #define TYPES_reinterpret_signed1(D, A) \ D (A, s8), D (A, s16), D (A, s32), D (A, s64) @@ -273,7 +306,8 @@ static const type_suffix_pair types_none[] = { DEF_MVE_TYPES_ARRAY (all_integer); DEF_MVE_TYPES_ARRAY (all_integer_with_64); -DEF_MVE_TYPES_ARRAY (float16); +DEF_MVE_TYPES_ARRAY (float_16); +DEF_MVE_TYPES_ARRAY (float_32); DEF_MVE_TYPES_ARRAY (all_float); DEF_MVE_TYPES_ARRAY (all_signed); DEF_MVE_TYPES_ARRAY (all_unsigned); @@ -284,6 +318,10 @@ DEF_MVE_TYPES_ARRAY (integer_32); DEF_MVE_TYPES_ARRAY (poly_8_16); DEF_MVE_TYPES_ARRAY (signed_16_32); DEF_MVE_TYPES_ARRAY (signed_32); +DEF_MVE_TYPES_ARRAY (cvt); +DEF_MVE_TYPES_ARRAY (cvt_f16_f32); +DEF_MVE_TYPES_ARRAY (cvt_f32_f16); +DEF_MVE_TYPES_ARRAY (cvtx); DEF_MVE_TYPES_ARRAY (reinterpret_integer); DEF_MVE_TYPES_ARRAY (reinterpret_float); @@ -308,6 +346,11 @@ static const predication_index preds_p_or_none[] = { PRED_p, PRED_none, NUM_PREDS }; +/* Used by functions that have the z predicated form, in addition to + an unpredicated form. */ +static const predication_index preds_z_or_none[] + = {PRED_z, PRED_none, NUM_PREDS}; + /* A list of all MVE ACLE functions. */ static CONSTEXPR const function_group_info function_groups[] = { #define DEF_MVE_FUNCTION(NAME, SHAPE, TYPES, PREDS) \ @@ -596,6 +639,20 @@ report_not_enum (location_t location, tree fndecl, unsigned int argno, " a valid %qT value", actual, argno + 1, fndecl, enumtype); } +/* Report that LOCATION has a call to FNDECL in which argument ARGNO has + the value ACTUAL, whereas the function requires one of VALUE0..3. + ARGNO counts from zero. */ +static void +report_not_one_of (location_t location, tree fndecl, unsigned int argno, + HOST_WIDE_INT actual, HOST_WIDE_INT value0, + HOST_WIDE_INT value1, HOST_WIDE_INT value2, + HOST_WIDE_INT value3) +{ + error_at (location, "passing %wd to argument %d of %qE, which expects" + " %wd, %wd, %wd or %wd", actual, argno + 1, fndecl, value0, value1, + value2, value3); +} + /* Checks that the mve.fp extension is enabled, given that REQUIRES_FLOAT indicates whether it is required or not for function FNDECL. Report an error against LOCATION if not. 
*/ @@ -703,6 +760,7 @@ function_instance::has_inactive_argument () const return false; if (mode_suffix_id == MODE_r + || (base == functions::vbicq && mode_suffix_id == MODE_n) || base == functions::vcmlaq || base == functions::vcmlaq_rot90 || base == functions::vcmlaq_rot180 @@ -715,6 +773,12 @@ function_instance::has_inactive_argument () const || base == functions::vcmpltq || base == functions::vcmpcsq || base == functions::vcmphiq + || base == functions::vctp16q + || base == functions::vctp32q + || base == functions::vctp64q + || base == functions::vctp8q + || (base == functions::vcvtbq && type_suffix (0).element_bits == 16) + || (base == functions::vcvttq && type_suffix (0).element_bits == 16) || base == functions::vfmaq || base == functions::vfmasq || base == functions::vfmsq @@ -755,6 +819,7 @@ function_instance::has_inactive_argument () const || (base == functions::vrshlq && mode_suffix_id == MODE_n) || base == functions::vrshrnbq || base == functions::vrshrntq + || base == functions::vshlcq || base == functions::vshrnbq || base == functions::vshrntq || base == functions::vsliq @@ -823,7 +888,8 @@ function_builder::get_name (const function_instance &instance, for (unsigned int i = 0; i < 2; ++i) if (!overloaded_p || instance.shape->explicit_type_suffix_p (i, instance.pred, - instance.mode_suffix_id)) + instance.mode_suffix_id, + instance.type_suffix (i))) append_name (instance.type_suffix (i).string); return finish_name (); } @@ -1001,9 +1067,11 @@ function_builder::add_overloaded_functions (const function_group_info &group, for (unsigned int pi = 0; group.preds[pi] != NUM_PREDS; ++pi) { unsigned int explicit_type0 - = (*group.shape)->explicit_type_suffix_p (0, group.preds[pi], mode); + = (*group.shape)->explicit_type_suffix_p (0, group.preds[pi], mode, + type_suffixes[NUM_TYPE_SUFFIXES]); unsigned int explicit_type1 - = (*group.shape)->explicit_type_suffix_p (1, group.preds[pi], mode); + = (*group.shape)->explicit_type_suffix_p (1, group.preds[pi], mode, + type_suffixes[NUM_TYPE_SUFFIXES]); if ((*group.shape)->skip_overload_p (group.preds[pi], mode)) continue; @@ -1601,6 +1669,7 @@ function_resolver::check_gp_argument (unsigned int nops, case PRED_p: case PRED_x: + case PRED_z: /* Add final predicate. */ nargs = nops + 1; break; @@ -1925,6 +1994,36 @@ function_checker::require_immediate_enum (unsigned int rel_argno, tree type) return false; } +/* Check that argument REL_ARGNO is an integer constant expression that + has one of the given values. */ +bool +function_checker::require_immediate_one_of (unsigned int rel_argno, + HOST_WIDE_INT value0, + HOST_WIDE_INT value1, + HOST_WIDE_INT value2, + HOST_WIDE_INT value3) +{ + unsigned int argno = m_base_arg + rel_argno; + if (!argument_exists_p (argno)) + return true; + + HOST_WIDE_INT actual; + if (!require_immediate (argno, actual)) + return false; + + if (actual != value0 + && actual != value1 + && actual != value2 + && actual != value3) + { + report_not_one_of (location, fndecl, argno, actual, + value0, value1, value2, value3); + return false; + } + + return true; +} + /* Check that argument REL_ARGNO is an integer constant expression in the range [MIN, MAX]. REL_ARGNO counts from the end of the predication arguments. 
*/ @@ -2237,6 +2336,8 @@ function_expander::use_contiguous_load_insn (insn_code icode) add_output_operand (icode); add_mem_operand (mem_mode, get_contiguous_base ()); + if (pred == PRED_z) + add_input_operand (icode, args[1]); return generate_insn (icode); } @@ -2249,6 +2350,8 @@ function_expander::use_contiguous_store_insn (insn_code icode) add_mem_operand (mem_mode, get_contiguous_base ()); add_input_operand (icode, args[1]); + if (pred == PRED_p) + add_input_operand (icode, args[2]); return generate_insn (icode); } diff --git a/gcc/config/arm/arm-mve-builtins.def b/gcc/config/arm/arm-mve-builtins.def index 24ebb33..265cc7b 100644 --- a/gcc/config/arm/arm-mve-builtins.def +++ b/gcc/config/arm/arm-mve-builtins.def @@ -36,6 +36,7 @@ DEF_MVE_MODE (n, none, none, none) DEF_MVE_MODE (offset, none, none, bytes) DEF_MVE_MODE (r, none, none, none) +DEF_MVE_MODE (wb, none, none, none) #define REQUIRES_FLOAT false DEF_MVE_TYPE (mve_pred16_t, boolean_type_node) diff --git a/gcc/config/arm/arm-mve-builtins.h b/gcc/config/arm/arm-mve-builtins.h index f282236..2e48d91 100644 --- a/gcc/config/arm/arm-mve-builtins.h +++ b/gcc/config/arm/arm-mve-builtins.h @@ -401,7 +401,7 @@ public: bool require_integer_immediate (unsigned int); bool require_derived_scalar_type (unsigned int, type_class_index, unsigned int = SAME_SIZE); - + bool check_num_arguments (unsigned int); bool check_gp_argument (unsigned int, unsigned int &, unsigned int &); tree resolve_unary (type_class_index = SAME_TYPE_CLASS, @@ -433,6 +433,8 @@ public: bool require_immediate_enum (unsigned int, tree); bool require_immediate_lane_index (unsigned int, unsigned int = 1); + bool require_immediate_one_of (unsigned int, HOST_WIDE_INT, HOST_WIDE_INT, + HOST_WIDE_INT, HOST_WIDE_INT); bool require_immediate_range (unsigned int, HOST_WIDE_INT, HOST_WIDE_INT); bool check (); @@ -571,9 +573,13 @@ public: class function_shape { public: - virtual bool explicit_type_suffix_p (unsigned int, enum predication_index, enum mode_suffix_index) const = 0; - virtual bool explicit_mode_suffix_p (enum predication_index, enum mode_suffix_index) const = 0; - virtual bool skip_overload_p (enum predication_index, enum mode_suffix_index) const = 0; + virtual bool explicit_type_suffix_p (unsigned int, enum predication_index, + enum mode_suffix_index, + type_suffix_info) const = 0; + virtual bool explicit_mode_suffix_p (enum predication_index, + enum mode_suffix_index) const = 0; + virtual bool skip_overload_p (enum predication_index, + enum mode_suffix_index) const = 0; /* Define all functions associated with the given group. */ virtual void build (function_builder &, diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 50cae2b..7311ad4 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -210,6 +210,7 @@ extern bool arm_pad_reg_upward (machine_mode, tree, int); #endif extern int arm_apply_result_size (void); extern opt_machine_mode arm_get_mask_mode (machine_mode mode); +extern bool arm_noce_conversion_profitable_p (rtx_insn *,struct noce_if_info *); #endif /* RTX_CODE */ @@ -615,4 +616,7 @@ void arm_initialize_isa (sbitmap, const enum isa_feature *); const char * arm_gen_far_branch (rtx *, int, const char * , const char *); bool arm_mve_immediate_check(rtx, machine_mode, bool); + +opt_machine_mode arm_mve_data_mode (scalar_mode, poly_uint64); + #endif /* ! 
GCC_ARM_PROTOS_H */ diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index d54564a..6f11b6c 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -23,6 +23,7 @@ #define IN_TARGET_CODE 1 #include "config.h" +#define INCLUDE_MEMORY #define INCLUDE_STRING #include "system.h" #include "coretypes.h" @@ -75,6 +76,7 @@ #include "opts.h" #include "aarch-common.h" #include "aarch-common-protos.h" +#include "machmode.h" /* This file should be included last. */ #include "target-def.h" @@ -814,6 +816,9 @@ static const scoped_attribute_specs *const arm_attribute_table[] = #undef TARGET_MODES_TIEABLE_P #define TARGET_MODES_TIEABLE_P arm_modes_tieable_p +#undef TARGET_NOCE_CONVERSION_PROFITABLE_P +#define TARGET_NOCE_CONVERSION_PROFITABLE_P arm_noce_conversion_profitable_p + #undef TARGET_CAN_CHANGE_MODE_CLASS #define TARGET_CAN_CHANGE_MODE_CLASS arm_can_change_mode_class @@ -8006,10 +8011,11 @@ arm_function_ok_for_sibcall (tree decl, tree exp) && DECL_WEAK (decl)) return false; - /* We cannot tailcall an indirect call by descriptor if all the call-clobbered - general registers are live (r0-r3 and ip). This can happen when: - - IP contains the static chain, or - - IP is needed for validating the PAC signature. */ + /* Indirect tailcalls need a call-clobbered register to hold the function + address. But we only have r0-r3 and ip in that class. If r0-r3 all hold + function arguments, then we can only use IP. But IP may be needed in the + epilogue (for PAC validation), or for passing the static chain. We have + to disable the tail call if nothing is available. */ if (!decl && ((CALL_EXPR_BY_DESCRIPTOR (exp) && !flag_trampolines) || arm_current_function_pac_enabled_p())) @@ -8021,18 +8027,33 @@ arm_function_ok_for_sibcall (tree decl, tree exp) arm_init_cumulative_args (&cum, fntype, NULL_RTX, NULL_TREE); cum_v = pack_cumulative_args (&cum); - for (tree t = TYPE_ARG_TYPES (fntype); t; t = TREE_CHAIN (t)) + tree arg; + call_expr_arg_iterator iter; + unsigned used_regs = 0; + + /* Layout each actual argument in turn. If it is allocated to + core regs, note which regs have been allocated. */ + FOR_EACH_CALL_EXPR_ARG (arg, iter, exp) { - tree type = TREE_VALUE (t); - if (!VOID_TYPE_P (type)) + tree type = TREE_TYPE (arg); + function_arg_info arg_info (type, /*named=*/true); + rtx reg = arm_function_arg (cum_v, arg_info); + if (reg && REG_P (reg) + && REGNO (reg) <= LAST_ARG_REGNUM) { - function_arg_info arg (type, /*named=*/true); - arm_function_arg_advance (cum_v, arg); + /* Avoid any chance of UB here. We don't care if TYPE + is very large since it will use up all the argument regs. */ + unsigned nregs = MIN (ARM_NUM_REGS2 (GET_MODE (reg), type), + LAST_ARG_REGNUM + 1); + used_regs |= ((1 << nregs) - 1) << REGNO (reg); } + arm_function_arg_advance (cum_v, arg_info); } - function_arg_info arg (integer_type_node, /*named=*/true); - if (!arm_function_arg (cum_v, arg)) + /* We've used all the argument regs, and we know IP is live during the + epilogue for some reason, so we can't tailcall. 
*/ + if ((used_regs & ((1 << (LAST_ARG_REGNUM + 1)) - 1)) + == ((1 << (LAST_ARG_REGNUM + 1)) - 1)) return false; } @@ -11891,7 +11912,7 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code, case CONST_DOUBLE: if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT - && (mode == SFmode || !TARGET_VFP_SINGLE)) + && (mode == SFmode || mode == HFmode || !TARGET_VFP_SINGLE)) { if (vfp3_const_double_rtx (x)) { @@ -11916,12 +11937,18 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code, return true; case CONST_VECTOR: - /* Fixme. */ if (((TARGET_NEON && TARGET_HARD_FLOAT && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))) || TARGET_HAVE_MVE) && simd_immediate_valid_for_move (x, mode, NULL, NULL)) *cost = COSTS_N_INSNS (1); + else if (TARGET_HAVE_MVE) + { + /* 128-bit vector requires two vldr.64 on MVE. */ + *cost = COSTS_N_INSNS (2); + if (speed_p) + *cost += extra_cost->ldst.loadd * 2; + } else *cost = COSTS_N_INSNS (4); return true; @@ -15341,9 +15368,9 @@ arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase, HOST_WIDE_INT srcoffset, dstoffset; HOST_WIDE_INT src_autoinc, dst_autoinc; rtx mem, addr; - + gcc_assert (interleave_factor >= 1 && interleave_factor <= 4); - + /* Use hard registers if we have aligned source or destination so we can use load/store multiple with contiguous registers. */ if (dst_aligned || src_aligned) @@ -15357,7 +15384,7 @@ arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase, src = copy_addr_to_reg (XEXP (srcbase, 0)); srcoffset = dstoffset = 0; - + /* Calls to arm_gen_load_multiple and arm_gen_store_multiple update SRC/DST. For copying the last bytes we want to subtract this offset again. */ src_autoinc = dst_autoinc = 0; @@ -15411,14 +15438,14 @@ arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase, remaining -= block_size_bytes; } - + /* Copy any whole words left (note these aren't interleaved with any subsequent halfword/byte load/stores in the interests of simplicity). */ - + words = remaining / UNITS_PER_WORD; gcc_assert (words < interleave_factor); - + if (src_aligned && words > 1) { emit_insn (arm_gen_load_multiple (regnos, words, src, TRUE, srcbase, @@ -15464,11 +15491,11 @@ arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase, } remaining -= words * UNITS_PER_WORD; - + gcc_assert (remaining < 4); - + /* Copy a halfword if necessary. */ - + if (remaining >= 2) { halfword_tmp = gen_reg_rtx (SImode); @@ -15492,11 +15519,11 @@ arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase, remaining -= 2; srcoffset += 2; } - + gcc_assert (remaining < 2); - + /* Copy last byte. */ - + if ((remaining & 1) != 0) { byte_tmp = gen_reg_rtx (SImode); @@ -15517,9 +15544,9 @@ arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase, remaining--; srcoffset++; } - + /* Store last halfword if we haven't done so already. */ - + if (halfword_tmp) { addr = plus_constant (Pmode, dst, dstoffset - dst_autoinc); @@ -15538,7 +15565,7 @@ arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase, emit_move_insn (mem, gen_lowpart (QImode, byte_tmp)); dstoffset++; } - + gcc_assert (remaining == 0 && srcoffset == dstoffset); } @@ -15557,7 +15584,7 @@ arm_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg, rtx *loop_mem) { *loop_reg = copy_addr_to_reg (XEXP (mem, 0)); - + /* Although the new mem does not refer to a known location, it does keep up to LENGTH bytes of alignment. 
*/ *loop_mem = change_address (mem, BLKmode, *loop_reg); @@ -15577,14 +15604,14 @@ arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length, { rtx src_reg, dest_reg, final_src, test; HOST_WIDE_INT leftover; - + leftover = length % bytes_per_iter; length -= leftover; - + /* Create registers and memory references for use within the loop. */ arm_adjust_block_mem (src, bytes_per_iter, &src_reg, &src); arm_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest); - + /* Calculate the value that SRC_REG should have after the last iteration of the loop. */ final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length), @@ -15593,7 +15620,7 @@ arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length, /* Emit the start of the loop. */ rtx_code_label *label = gen_label_rtx (); emit_label (label); - + /* Emit the loop body. */ arm_block_move_unaligned_straight (dest, src, bytes_per_iter, interleave_factor); @@ -15601,11 +15628,11 @@ arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length, /* Move on to the next block. */ emit_move_insn (src_reg, plus_constant (Pmode, src_reg, bytes_per_iter)); emit_move_insn (dest_reg, plus_constant (Pmode, dest_reg, bytes_per_iter)); - + /* Emit the loop condition. */ test = gen_rtx_NE (VOIDmode, src_reg, final_src); emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label)); - + /* Mop up any left-over bytes. */ if (leftover) arm_block_move_unaligned_straight (dest, src, leftover, interleave_factor); @@ -15619,7 +15646,7 @@ static int arm_cpymemqi_unaligned (rtx *operands) { HOST_WIDE_INT length = INTVAL (operands[2]); - + if (optimize_size) { bool src_aligned = MEM_ALIGN (operands[1]) >= BITS_PER_WORD; @@ -15630,7 +15657,7 @@ arm_cpymemqi_unaligned (rtx *operands) resulting code can be smaller. */ unsigned int interleave_factor = (src_aligned || dst_aligned) ? 2 : 1; HOST_WIDE_INT bytes_per_iter = (src_aligned || dst_aligned) ? 8 : 4; - + if (length > 12) arm_block_move_unaligned_loop (operands[0], operands[1], length, interleave_factor, bytes_per_iter); @@ -15648,7 +15675,7 @@ arm_cpymemqi_unaligned (rtx *operands) else arm_block_move_unaligned_straight (operands[0], operands[1], length, 4); } - + return 1; } @@ -24710,11 +24737,11 @@ arm_print_operand (FILE *stream, rtx x, int code) asm_fprintf (stream, "[%r", REGNO (XEXP (addr, 0))); inc_val = GET_MODE_SIZE (GET_MODE (x)); if (code == POST_INC || code == POST_DEC) - asm_fprintf (stream, "], #%s%d",(code == POST_INC) - ? "": "-", inc_val); + asm_fprintf (stream, "], #%s%d", (code == POST_INC) + ? "" : "-", inc_val); else - asm_fprintf (stream, ", #%s%d]!",(code == PRE_INC) - ? "": "-", inc_val); + asm_fprintf (stream, ", #%s%d]!", (code == PRE_INC) + ? 
"" : "-", inc_val); } else if (code == POST_MODIFY || code == PRE_MODIFY) { @@ -24723,9 +24750,9 @@ arm_print_operand (FILE *stream, rtx x, int code) if (postinc_reg && CONST_INT_P (postinc_reg)) { if (code == POST_MODIFY) - asm_fprintf (stream, "], #%wd",INTVAL (postinc_reg)); + asm_fprintf (stream, "], #%wd", INTVAL (postinc_reg)); else - asm_fprintf (stream, ", #%wd]!",INTVAL (postinc_reg)); + asm_fprintf (stream, ", #%wd]!", INTVAL (postinc_reg)); } } else if (code == PLUS) @@ -31139,10 +31166,10 @@ int vfp3_const_double_for_fract_bits (rtx operand) { REAL_VALUE_TYPE r0; - + if (!CONST_DOUBLE_P (operand)) return 0; - + r0 = *CONST_DOUBLE_REAL_VALUE (operand); if (exact_real_inverse (DFmode, &r0) && !REAL_VALUE_NEGATIVE (r0)) @@ -32404,7 +32431,7 @@ arm_autoinc_modes_ok_p (machine_mode mode, enum arm_auto_incmodes code) else return false; } - + return true; case ARM_POST_DEC: @@ -32421,10 +32448,10 @@ arm_autoinc_modes_ok_p (machine_mode mode, enum arm_auto_incmodes code) return false; return true; - + default: return false; - + } return false; @@ -32435,7 +32462,7 @@ arm_autoinc_modes_ok_p (machine_mode mode, enum arm_auto_incmodes code) Additionally, the default expansion code is not available or suitable for post-reload insn splits (this can occur when the register allocator chooses not to do a shift in NEON). - + This function is used in both initial expand and post-reload splits, and handles all kinds of 64-bit shifts. @@ -33505,7 +33532,7 @@ arm_asan_shadow_offset (void) /* This is a temporary fix for PR60655. Ideally we need to handle most of these cases in the generic part but - currently we reject minus (..) (sym_ref). We try to + currently we reject minus (..) (sym_ref). We try to ameliorate the case with minus (sym_ref1) (sym_ref2) where they are in the same section. */ @@ -33828,7 +33855,7 @@ arm_valid_target_attribute_tree (tree args, struct gcc_options *opts, return build_target_option_node (opts, opts_set); } -static void +static void add_attribute (const char * mode, tree *attributes) { size_t len = strlen (mode); @@ -33859,7 +33886,7 @@ arm_insert_attributes (tree fndecl, tree * attributes) /* Nested definitions must inherit mode. */ if (current_function_decl) { - mode = TARGET_THUMB ? "thumb" : "arm"; + mode = TARGET_THUMB ? "thumb" : "arm"; add_attribute (mode, attributes); return; } @@ -35214,6 +35241,32 @@ arm_mve_dlstp_check_inc_counter (loop *loop, rtx_insn* vctp_insn, return vctp_insn; } +/* Helper function to 'arm_mve_dlstp_check_dec_counter' to make sure DEC_INSN + is of the expected form: + (set (reg a) (plus (reg a) (const_int))) + where (reg a) is the same as CONDCOUNT. + Return a rtx with the set if it is in the right format or NULL_RTX + otherwise. */ + +static rtx +check_dec_insn (rtx_insn *dec_insn, rtx condcount) +{ + if (!NONDEBUG_INSN_P (dec_insn)) + return NULL_RTX; + rtx dec_set = single_set (dec_insn); + if (!dec_set + || !REG_P (SET_DEST (dec_set)) + || GET_CODE (SET_SRC (dec_set)) != PLUS + || !REG_P (XEXP (SET_SRC (dec_set), 0)) + || !CONST_INT_P (XEXP (SET_SRC (dec_set), 1)) + || REGNO (SET_DEST (dec_set)) + != REGNO (XEXP (SET_SRC (dec_set), 0)) + || REGNO (SET_DEST (dec_set)) != REGNO (condcount)) + return NULL_RTX; + + return dec_set; +} + /* Helper function to `arm_mve_loop_valid_for_dlstp`. 
In the case of a counter that is decrementing, ensure that it is decrementing by the right amount in each iteration and that the target condition is what @@ -35230,30 +35283,19 @@ arm_mve_dlstp_check_dec_counter (loop *loop, rtx_insn* vctp_insn, loop latch. Here we simply need to verify that this counter is the same reg that is also used in the vctp_insn and that it is not otherwise modified. */ - rtx_insn *dec_insn = BB_END (loop->latch); + rtx dec_set = check_dec_insn (BB_END (loop->latch), condcount); /* If not in the loop latch, try to find the decrement in the loop header. */ - if (!NONDEBUG_INSN_P (dec_insn)) + if (dec_set == NULL_RTX) { df_ref temp = df_bb_regno_only_def_find (loop->header, REGNO (condcount)); /* If we haven't been able to find the decrement, bail out. */ if (!temp) return NULL; - dec_insn = DF_REF_INSN (temp); - } + dec_set = check_dec_insn (DF_REF_INSN (temp), condcount); - rtx dec_set = single_set (dec_insn); - - /* Next, ensure that it is a PLUS of the form: - (set (reg a) (plus (reg a) (const_int))) - where (reg a) is the same as condcount. */ - if (!dec_set - || !REG_P (SET_DEST (dec_set)) - || !REG_P (XEXP (SET_SRC (dec_set), 0)) - || !CONST_INT_P (XEXP (SET_SRC (dec_set), 1)) - || REGNO (SET_DEST (dec_set)) - != REGNO (XEXP (SET_SRC (dec_set), 0)) - || REGNO (SET_DEST (dec_set)) != REGNO (condcount)) - return NULL; + if (dec_set == NULL_RTX) + return NULL; + } decrementnum = INTVAL (XEXP (SET_SRC (dec_set), 1)); @@ -36057,6 +36099,90 @@ arm_get_mask_mode (machine_mode mode) return default_get_mask_mode (mode); } +/* Helper function to determine whether SEQ represents a sequence of + instructions representing the Armv8.1-M Mainline conditional arithmetic + instructions: csinc, csneg and csinv. The cinc instruction is generated + using a different mechanism. */ + +static bool +arm_is_v81m_cond_insn (rtx_insn *seq) +{ + rtx_insn *curr_insn = seq; + rtx set = NULL_RTX; + /* The pattern may start with a simple set with register operands. Skip + through any of those. */ + while (curr_insn) + { + set = single_set (curr_insn); + if (!set + || !REG_P (SET_DEST (set))) + return false; + + if (!REG_P (SET_SRC (set))) + break; + curr_insn = NEXT_INSN (curr_insn); + } + + if (!set) + return false; + + /* The next instruction should be one of: + NEG: for csneg, + PLUS: for csinc, + NOT: for csinv. */ + if (GET_CODE (SET_SRC (set)) != NEG + && GET_CODE (SET_SRC (set)) != PLUS + && GET_CODE (SET_SRC (set)) != NOT) + return false; + + curr_insn = NEXT_INSN (curr_insn); + if (!curr_insn) + return false; + + /* The next instruction should be a COMPARE. */ + set = single_set (curr_insn); + if (!set + || !REG_P (SET_DEST (set)) + || GET_CODE (SET_SRC (set)) != COMPARE) + return false; + + curr_insn = NEXT_INSN (curr_insn); + if (!curr_insn) + return false; + + /* And the last instruction should be an IF_THEN_ELSE. */ + set = single_set (curr_insn); + if (!set + || !REG_P (SET_DEST (set)) + || GET_CODE (SET_SRC (set)) != IF_THEN_ELSE) + return false; + + return !NEXT_INSN (curr_insn); +} + +/* For Armv8.1-M Mainline we have both conditional execution through IT blocks, + as well as conditional arithmetic instructions controlled by + TARGET_COND_ARITH. To generate the latter we rely on a special part of the + "ce" pass that generates code for targets that don't support conditional + execution of general instructions known as "noce". These transformations + happen before 'reload_completed'. 
However, "noce" also triggers for some + unwanted patterns [PR 116444] that prevent "ce" optimisations after reload. + To make sure we can get both we use the TARGET_NOCE_CONVERSION_PROFITABLE_P + hook to only allow "noce" to generate the patterns that are profitable. */ + +bool +arm_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *) +{ + if (!TARGET_COND_ARITH + || reload_completed) + return true; + + if (arm_is_v81m_cond_insn (seq)) + return true; + + return false; +} + /* Output assembly to read the thread pointer from the appropriate TPIDR register into DEST. If PRED_P also emit the %? that can be used to output the predication code. */ @@ -36087,4 +36213,18 @@ arm_output_load_tpidr (rtx dst, bool pred_p) return ""; } +/* Return the MVE vector mode that has NUNITS elements of mode INNER_MODE. */ +opt_machine_mode +arm_mve_data_mode (scalar_mode inner_mode, poly_uint64 nunits) +{ + enum mode_class mclass + = (SCALAR_FLOAT_MODE_P (inner_mode) ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT); + machine_mode mode; + FOR_EACH_MODE_IN_CLASS (mode, mclass) + if (inner_mode == GET_MODE_INNER (mode) + && known_eq (nunits, GET_MODE_NUNITS (mode))) + return mode; + return opt_machine_mode (); +} + #include "gt-arm.h" diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 0cd5d73..13a90d8 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -394,9 +394,11 @@ emission of floating point pcs attributes. */ TARGET_MODE_CHECK that also takes into account the selected CPU and architecture. */ #define OPTION_DEFAULT_SPECS \ - {"arch", "%{!march=*:%{!mcpu=*:-march=%(VALUE)}}" }, \ - {"cpu", "%{!march=*:%{!mcpu=*:-mcpu=%(VALUE)}}" }, \ - {"tune", "%{!mcpu=*:%{!mtune=*:-mtune=%(VALUE)}}" }, \ + {"arch", "%{!march=*|march=unset:"\ + "%{!mcpu=*|mcpu=unset:%<march=* %<mcpu=* -march=%(VALUE)}}" }, \ + {"tune", "%{!mcpu=*|mcpu=unset:%{!mtune=*:-mtune=%(VALUE)}}" }, \ + {"cpu", "%{!march=*|march=unset:"\ + "%{!mcpu=*|mcpu=unset:%<march=* %<mcpu=* -mcpu=%(VALUE)}}" }, \ {"float", "%{!mfloat-abi=*:-mfloat-abi=%(VALUE)}" }, \ {"fpu", "%{!mfpu=*:-mfpu=%(VALUE)}"}, \ {"abi", "%{!mabi=*:-mabi=%(VALUE)}"}, \ @@ -1424,7 +1426,7 @@ extern const char *fp_sysreg_names[NB_FP_SYSREGS]; but prevents the compiler from extending the lifetime of these registers. */ #define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \ - arm_small_register_classes_for_mode_p + arm_small_register_classes_for_mode_p /* Must leave BASE_REGS reloads alone */ #define THUMB_SECONDARY_INPUT_RELOAD_CLASS(CLASS, MODE, X) \ @@ -1477,7 +1479,7 @@ extern const char *fp_sysreg_names[NB_FP_SYSREGS]; /* Return the maximum number of consecutive registers needed to represent mode MODE in a register of class CLASS. - ARM regs are UNITS_PER_WORD bits. + ARM regs are UNITS_PER_WORD bits. FIXME: Is this true for iWMMX? */ #define CLASS_MAX_NREGS(CLASS, MODE) \ (CLASS == VPR_REG) \ @@ -1645,14 +1647,14 @@ machine_function; #define ARM_Q_BIT_READ (arm_q_bit_access ()) #define ARM_GE_BITS_READ (arm_ge_bits_access ()) -/* As in the machine_function, a global set of call-via labels, for code +/* As in the machine_function, a global set of call-via labels, for code that is in text_section. */ extern GTY(()) rtx thumb_call_via_label[14]; /* The number of potential ways of assigning to a co-processor. */ #define ARM_NUM_COPROC_SLOTS 1 -/* Enumeration of procedure calling standard variants. We don't really +/* Enumeration of procedure calling standard variants. We don't really support all of these yet. 
*/ enum arm_pcs { @@ -2538,6 +2540,11 @@ const char *arm_be8_option (int argc, const char **argv); #define TARGET_MODE_SPECS \ " %{!marm:%{!mthumb:%:target_mode_check(%{march=*:arch %*;mcpu=*:cpu %*;:})}}" +/* Cleanup any stray -march=/-mcpu= if either is followed by "unset". */ +#define ARCH_CPU_CLEANUP_SPECS \ + " %{march=unset:%<march=*} " \ + " %{mcpu=unset:%<mcpu=*} " + /* Generate a canonical string to represent the architecture selected. */ #define ARCH_CANONICAL_SPECS \ " -march=%:canon_arch(%{mcpu=*: cpu %*} " \ @@ -2559,6 +2566,7 @@ const char *arm_be8_option (int argc, const char **argv); individual rules so that any option suppression (%<opt...)is completed before starting subsequent rules. */ #define DRIVER_SELF_SPECS \ + ARCH_CPU_CLEANUP_SPECS, \ MCPU_MTUNE_NATIVE_SPECS, \ TARGET_MODE_SPECS, \ MULTILIB_ARCH_CANONICAL_SPECS, \ diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index ae1b543..8ffdbc7 100644 --- a/gcc/config/arm/arm_mve.h +++ b/gcc/config/arm/arm_mve.h @@ -42,17 +42,9 @@ #ifndef __ARM_MVE_PRESERVE_USER_NAMESPACE #define vst4q(__addr, __value) __arm_vst4q(__addr, __value) -#define vornq(__a, __b) __arm_vornq(__a, __b) -#define vbicq(__a, __b) __arm_vbicq(__a, __b) -#define vbicq_m_n(__a, __imm, __p) __arm_vbicq_m_n(__a, __imm, __p) -#define vshlcq(__a, __b, __imm) __arm_vshlcq(__a, __b, __imm) -#define vbicq_m(__inactive, __a, __b, __p) __arm_vbicq_m(__inactive, __a, __b, __p) -#define vornq_m(__inactive, __a, __b, __p) __arm_vornq_m(__inactive, __a, __b, __p) #define vstrbq_scatter_offset(__base, __offset, __value) __arm_vstrbq_scatter_offset(__base, __offset, __value) -#define vstrbq(__addr, __value) __arm_vstrbq(__addr, __value) #define vstrwq_scatter_base(__addr, __offset, __value) __arm_vstrwq_scatter_base(__addr, __offset, __value) #define vldrbq_gather_offset(__base, __offset) __arm_vldrbq_gather_offset(__base, __offset) -#define vstrbq_p(__addr, __value, __p) __arm_vstrbq_p(__addr, __value, __p) #define vstrbq_scatter_offset_p(__base, __offset, __value, __p) __arm_vstrbq_scatter_offset_p(__base, __offset, __value, __p) #define vstrwq_scatter_base_p(__addr, __offset, __value, __p) __arm_vstrwq_scatter_base_p(__addr, __offset, __value, __p) #define vldrbq_gather_offset_z(__base, __offset, __p) __arm_vldrbq_gather_offset_z(__base, __offset, __p) @@ -72,10 +64,6 @@ #define vstrhq_scatter_offset_p(__base, __offset, __value, __p) __arm_vstrhq_scatter_offset_p(__base, __offset, __value, __p) #define vstrhq_scatter_shifted_offset(__base, __offset, __value) __arm_vstrhq_scatter_shifted_offset(__base, __offset, __value) #define vstrhq_scatter_shifted_offset_p(__base, __offset, __value, __p) __arm_vstrhq_scatter_shifted_offset_p(__base, __offset, __value, __p) -#define vstrhq(__addr, __value) __arm_vstrhq(__addr, __value) -#define vstrhq_p(__addr, __value, __p) __arm_vstrhq_p(__addr, __value, __p) -#define vstrwq(__addr, __value) __arm_vstrwq(__addr, __value) -#define vstrwq_p(__addr, __value, __p) __arm_vstrwq_p(__addr, __value, __p) #define vstrdq_scatter_base_p(__addr, __offset, __value, __p) __arm_vstrdq_scatter_base_p(__addr, __offset, __value, __p) #define vstrdq_scatter_base(__addr, __offset, __value) __arm_vstrdq_scatter_base(__addr, __offset, __value) #define vstrdq_scatter_offset_p(__base, __offset, __value, __p) __arm_vstrdq_scatter_offset_p(__base, __offset, __value, __p) @@ -87,70 +75,15 @@ #define vstrwq_scatter_shifted_offset_p(__base, __offset, __value, __p) __arm_vstrwq_scatter_shifted_offset_p(__base, __offset, __value, __p) #define 
vstrwq_scatter_shifted_offset(__base, __offset, __value) __arm_vstrwq_scatter_shifted_offset(__base, __offset, __value) #define vuninitializedq(__v) __arm_vuninitializedq(__v) -#define vddupq_m(__inactive, __a, __imm, __p) __arm_vddupq_m(__inactive, __a, __imm, __p) -#define vddupq_u8(__a, __imm) __arm_vddupq_u8(__a, __imm) -#define vddupq_u32(__a, __imm) __arm_vddupq_u32(__a, __imm) -#define vddupq_u16(__a, __imm) __arm_vddupq_u16(__a, __imm) -#define vdwdupq_m(__inactive, __a, __b, __imm, __p) __arm_vdwdupq_m(__inactive, __a, __b, __imm, __p) -#define vdwdupq_u8(__a, __b, __imm) __arm_vdwdupq_u8(__a, __b, __imm) -#define vdwdupq_u32(__a, __b, __imm) __arm_vdwdupq_u32(__a, __b, __imm) -#define vdwdupq_u16(__a, __b, __imm) __arm_vdwdupq_u16(__a, __b, __imm) -#define vidupq_m(__inactive, __a, __imm, __p) __arm_vidupq_m(__inactive, __a, __imm, __p) -#define vidupq_u8(__a, __imm) __arm_vidupq_u8(__a, __imm) -#define vidupq_u32(__a, __imm) __arm_vidupq_u32(__a, __imm) -#define vidupq_u16(__a, __imm) __arm_vidupq_u16(__a, __imm) -#define viwdupq_m(__inactive, __a, __b, __imm, __p) __arm_viwdupq_m(__inactive, __a, __b, __imm, __p) -#define viwdupq_u8(__a, __b, __imm) __arm_viwdupq_u8(__a, __b, __imm) -#define viwdupq_u32(__a, __b, __imm) __arm_viwdupq_u32(__a, __b, __imm) -#define viwdupq_u16(__a, __b, __imm) __arm_viwdupq_u16(__a, __b, __imm) #define vstrdq_scatter_base_wb(__addr, __offset, __value) __arm_vstrdq_scatter_base_wb(__addr, __offset, __value) #define vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrdq_scatter_base_wb_p(__addr, __offset, __value, __p) #define vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) __arm_vstrwq_scatter_base_wb_p(__addr, __offset, __value, __p) #define vstrwq_scatter_base_wb(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb(__addr, __offset, __value) -#define vddupq_x_u8(__a, __imm, __p) __arm_vddupq_x_u8(__a, __imm, __p) -#define vddupq_x_u16(__a, __imm, __p) __arm_vddupq_x_u16(__a, __imm, __p) -#define vddupq_x_u32(__a, __imm, __p) __arm_vddupq_x_u32(__a, __imm, __p) -#define vdwdupq_x_u8(__a, __b, __imm, __p) __arm_vdwdupq_x_u8(__a, __b, __imm, __p) -#define vdwdupq_x_u16(__a, __b, __imm, __p) __arm_vdwdupq_x_u16(__a, __b, __imm, __p) -#define vdwdupq_x_u32(__a, __b, __imm, __p) __arm_vdwdupq_x_u32(__a, __b, __imm, __p) -#define vidupq_x_u8(__a, __imm, __p) __arm_vidupq_x_u8(__a, __imm, __p) -#define vidupq_x_u16(__a, __imm, __p) __arm_vidupq_x_u16(__a, __imm, __p) -#define vidupq_x_u32(__a, __imm, __p) __arm_vidupq_x_u32(__a, __imm, __p) -#define viwdupq_x_u8(__a, __b, __imm, __p) __arm_viwdupq_x_u8(__a, __b, __imm, __p) -#define viwdupq_x_u16(__a, __b, __imm, __p) __arm_viwdupq_x_u16(__a, __b, __imm, __p) -#define viwdupq_x_u32(__a, __b, __imm, __p) __arm_viwdupq_x_u32(__a, __b, __imm, __p) -#define vbicq_x(__a, __b, __p) __arm_vbicq_x(__a, __b, __p) -#define vornq_x(__a, __b, __p) __arm_vornq_x(__a, __b, __p) -#define vadciq(__a, __b, __carry_out) __arm_vadciq(__a, __b, __carry_out) -#define vadciq_m(__inactive, __a, __b, __carry_out, __p) __arm_vadciq_m(__inactive, __a, __b, __carry_out, __p) -#define vadcq(__a, __b, __carry) __arm_vadcq(__a, __b, __carry) -#define vadcq_m(__inactive, __a, __b, __carry, __p) __arm_vadcq_m(__inactive, __a, __b, __carry, __p) -#define vsbciq(__a, __b, __carry_out) __arm_vsbciq(__a, __b, __carry_out) -#define vsbciq_m(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m(__inactive, __a, __b, __carry_out, __p) -#define vsbcq(__a, __b, __carry) __arm_vsbcq(__a, __b, __carry) -#define 
vsbcq_m(__inactive, __a, __b, __carry, __p) __arm_vsbcq_m(__inactive, __a, __b, __carry, __p) -#define vst1q_p(__addr, __value, __p) __arm_vst1q_p(__addr, __value, __p) #define vst2q(__addr, __value) __arm_vst2q(__addr, __value) -#define vld1q_z(__base, __p) __arm_vld1q_z(__base, __p) #define vld2q(__addr) __arm_vld2q(__addr) #define vld4q(__addr) __arm_vld4q(__addr) #define vsetq_lane(__a, __b, __idx) __arm_vsetq_lane(__a, __b, __idx) #define vgetq_lane(__a, __idx) __arm_vgetq_lane(__a, __idx) -#define vshlcq_m(__a, __b, __imm, __p) __arm_vshlcq_m(__a, __b, __imm, __p) -#define vcvttq_f32(__a) __arm_vcvttq_f32(__a) -#define vcvtbq_f32(__a) __arm_vcvtbq_f32(__a) -#define vcvtq(__a) __arm_vcvtq(__a) -#define vcvtq_n(__a, __imm6) __arm_vcvtq_n(__a, __imm6) -#define vcvtaq_m(__inactive, __a, __p) __arm_vcvtaq_m(__inactive, __a, __p) -#define vcvtq_m(__inactive, __a, __p) __arm_vcvtq_m(__inactive, __a, __p) -#define vcvtbq_m(__a, __b, __p) __arm_vcvtbq_m(__a, __b, __p) -#define vcvttq_m(__a, __b, __p) __arm_vcvttq_m(__a, __b, __p) -#define vcvtmq_m(__inactive, __a, __p) __arm_vcvtmq_m(__inactive, __a, __p) -#define vcvtnq_m(__inactive, __a, __p) __arm_vcvtnq_m(__inactive, __a, __p) -#define vcvtpq_m(__inactive, __a, __p) __arm_vcvtpq_m(__inactive, __a, __p) -#define vcvtq_m_n(__inactive, __a, __imm6, __p) __arm_vcvtq_m_n(__inactive, __a, __imm6, __p) -#define vcvtq_x(__a, __p) __arm_vcvtq_x(__a, __p) -#define vcvtq_x_n(__a, __imm6, __p) __arm_vcvtq_x_n(__a, __imm6, __p) #define vst4q_s8( __addr, __value) __arm_vst4q_s8( __addr, __value) @@ -161,167 +94,23 @@ #define vst4q_u32( __addr, __value) __arm_vst4q_u32( __addr, __value) #define vst4q_f16( __addr, __value) __arm_vst4q_f16( __addr, __value) #define vst4q_f32( __addr, __value) __arm_vst4q_f32( __addr, __value) -#define vcvttq_f32_f16(__a) __arm_vcvttq_f32_f16(__a) -#define vcvtbq_f32_f16(__a) __arm_vcvtbq_f32_f16(__a) -#define vcvtq_f16_s16(__a) __arm_vcvtq_f16_s16(__a) -#define vcvtq_f32_s32(__a) __arm_vcvtq_f32_s32(__a) -#define vcvtq_f16_u16(__a) __arm_vcvtq_f16_u16(__a) -#define vcvtq_f32_u32(__a) __arm_vcvtq_f32_u32(__a) -#define vcvtaq_s16_f16(__a) __arm_vcvtaq_s16_f16(__a) -#define vcvtaq_s32_f32(__a) __arm_vcvtaq_s32_f32(__a) -#define vcvtnq_s16_f16(__a) __arm_vcvtnq_s16_f16(__a) -#define vcvtnq_s32_f32(__a) __arm_vcvtnq_s32_f32(__a) -#define vcvtpq_s16_f16(__a) __arm_vcvtpq_s16_f16(__a) -#define vcvtpq_s32_f32(__a) __arm_vcvtpq_s32_f32(__a) -#define vcvtmq_s16_f16(__a) __arm_vcvtmq_s16_f16(__a) -#define vcvtmq_s32_f32(__a) __arm_vcvtmq_s32_f32(__a) -#define vcvtq_s16_f16(__a) __arm_vcvtq_s16_f16(__a) -#define vcvtq_s32_f32(__a) __arm_vcvtq_s32_f32(__a) -#define vcvtq_u16_f16(__a) __arm_vcvtq_u16_f16(__a) -#define vcvtq_u32_f32(__a) __arm_vcvtq_u32_f32(__a) -#define vcvtpq_u16_f16(__a) __arm_vcvtpq_u16_f16(__a) -#define vcvtpq_u32_f32(__a) __arm_vcvtpq_u32_f32(__a) -#define vcvtnq_u16_f16(__a) __arm_vcvtnq_u16_f16(__a) -#define vcvtnq_u32_f32(__a) __arm_vcvtnq_u32_f32(__a) -#define vcvtmq_u16_f16(__a) __arm_vcvtmq_u16_f16(__a) -#define vcvtmq_u32_f32(__a) __arm_vcvtmq_u32_f32(__a) -#define vcvtaq_u16_f16(__a) __arm_vcvtaq_u16_f16(__a) -#define vcvtaq_u32_f32(__a) __arm_vcvtaq_u32_f32(__a) -#define vctp16q(__a) __arm_vctp16q(__a) -#define vctp32q(__a) __arm_vctp32q(__a) -#define vctp64q(__a) __arm_vctp64q(__a) -#define vctp8q(__a) __arm_vctp8q(__a) #define vpnot(__a) __arm_vpnot(__a) -#define vcvtq_n_f16_s16(__a, __imm6) __arm_vcvtq_n_f16_s16(__a, __imm6) -#define vcvtq_n_f32_s32(__a, __imm6) __arm_vcvtq_n_f32_s32(__a, __imm6) 
-#define vcvtq_n_f16_u16(__a, __imm6) __arm_vcvtq_n_f16_u16(__a, __imm6) -#define vcvtq_n_f32_u32(__a, __imm6) __arm_vcvtq_n_f32_u32(__a, __imm6) -#define vcvtq_n_s16_f16(__a, __imm6) __arm_vcvtq_n_s16_f16(__a, __imm6) -#define vcvtq_n_s32_f32(__a, __imm6) __arm_vcvtq_n_s32_f32(__a, __imm6) -#define vcvtq_n_u16_f16(__a, __imm6) __arm_vcvtq_n_u16_f16(__a, __imm6) -#define vcvtq_n_u32_f32(__a, __imm6) __arm_vcvtq_n_u32_f32(__a, __imm6) -#define vornq_u8(__a, __b) __arm_vornq_u8(__a, __b) -#define vbicq_u8(__a, __b) __arm_vbicq_u8(__a, __b) -#define vornq_s8(__a, __b) __arm_vornq_s8(__a, __b) -#define vbicq_s8(__a, __b) __arm_vbicq_s8(__a, __b) -#define vornq_u16(__a, __b) __arm_vornq_u16(__a, __b) -#define vbicq_u16(__a, __b) __arm_vbicq_u16(__a, __b) -#define vornq_s16(__a, __b) __arm_vornq_s16(__a, __b) -#define vbicq_s16(__a, __b) __arm_vbicq_s16(__a, __b) -#define vornq_u32(__a, __b) __arm_vornq_u32(__a, __b) -#define vbicq_u32(__a, __b) __arm_vbicq_u32(__a, __b) -#define vornq_s32(__a, __b) __arm_vornq_s32(__a, __b) -#define vbicq_s32(__a, __b) __arm_vbicq_s32(__a, __b) -#define vbicq_n_u16(__a, __imm) __arm_vbicq_n_u16(__a, __imm) -#define vornq_f16(__a, __b) __arm_vornq_f16(__a, __b) -#define vbicq_f16(__a, __b) __arm_vbicq_f16(__a, __b) -#define vbicq_n_s16(__a, __imm) __arm_vbicq_n_s16(__a, __imm) -#define vbicq_n_u32(__a, __imm) __arm_vbicq_n_u32(__a, __imm) -#define vornq_f32(__a, __b) __arm_vornq_f32(__a, __b) -#define vbicq_f32(__a, __b) __arm_vbicq_f32(__a, __b) -#define vbicq_n_s32(__a, __imm) __arm_vbicq_n_s32(__a, __imm) -#define vctp8q_m(__a, __p) __arm_vctp8q_m(__a, __p) -#define vctp64q_m(__a, __p) __arm_vctp64q_m(__a, __p) -#define vctp32q_m(__a, __p) __arm_vctp32q_m(__a, __p) -#define vctp16q_m(__a, __p) __arm_vctp16q_m(__a, __p) -#define vcvttq_f16_f32(__a, __b) __arm_vcvttq_f16_f32(__a, __b) -#define vcvtbq_f16_f32(__a, __b) __arm_vcvtbq_f16_f32(__a, __b) -#define vbicq_m_n_s16(__a, __imm, __p) __arm_vbicq_m_n_s16(__a, __imm, __p) -#define vbicq_m_n_s32(__a, __imm, __p) __arm_vbicq_m_n_s32(__a, __imm, __p) -#define vbicq_m_n_u16(__a, __imm, __p) __arm_vbicq_m_n_u16(__a, __imm, __p) -#define vbicq_m_n_u32(__a, __imm, __p) __arm_vbicq_m_n_u32(__a, __imm, __p) -#define vcvtaq_m_s16_f16(__inactive, __a, __p) __arm_vcvtaq_m_s16_f16(__inactive, __a, __p) -#define vcvtaq_m_u16_f16(__inactive, __a, __p) __arm_vcvtaq_m_u16_f16(__inactive, __a, __p) -#define vcvtaq_m_s32_f32(__inactive, __a, __p) __arm_vcvtaq_m_s32_f32(__inactive, __a, __p) -#define vcvtaq_m_u32_f32(__inactive, __a, __p) __arm_vcvtaq_m_u32_f32(__inactive, __a, __p) -#define vcvtq_m_f16_s16(__inactive, __a, __p) __arm_vcvtq_m_f16_s16(__inactive, __a, __p) -#define vcvtq_m_f16_u16(__inactive, __a, __p) __arm_vcvtq_m_f16_u16(__inactive, __a, __p) -#define vcvtq_m_f32_s32(__inactive, __a, __p) __arm_vcvtq_m_f32_s32(__inactive, __a, __p) -#define vcvtq_m_f32_u32(__inactive, __a, __p) __arm_vcvtq_m_f32_u32(__inactive, __a, __p) -#define vshlcq_s8(__a, __b, __imm) __arm_vshlcq_s8(__a, __b, __imm) -#define vshlcq_u8(__a, __b, __imm) __arm_vshlcq_u8(__a, __b, __imm) -#define vshlcq_s16(__a, __b, __imm) __arm_vshlcq_s16(__a, __b, __imm) -#define vshlcq_u16(__a, __b, __imm) __arm_vshlcq_u16(__a, __b, __imm) -#define vshlcq_s32(__a, __b, __imm) __arm_vshlcq_s32(__a, __b, __imm) -#define vshlcq_u32(__a, __b, __imm) __arm_vshlcq_u32(__a, __b, __imm) -#define vcvtbq_m_f16_f32(__a, __b, __p) __arm_vcvtbq_m_f16_f32(__a, __b, __p) -#define vcvtbq_m_f32_f16(__inactive, __a, __p) __arm_vcvtbq_m_f32_f16(__inactive, __a, __p) 
-#define vcvttq_m_f16_f32(__a, __b, __p) __arm_vcvttq_m_f16_f32(__a, __b, __p) -#define vcvttq_m_f32_f16(__inactive, __a, __p) __arm_vcvttq_m_f32_f16(__inactive, __a, __p) -#define vcvtmq_m_s16_f16(__inactive, __a, __p) __arm_vcvtmq_m_s16_f16(__inactive, __a, __p) -#define vcvtnq_m_s16_f16(__inactive, __a, __p) __arm_vcvtnq_m_s16_f16(__inactive, __a, __p) -#define vcvtpq_m_s16_f16(__inactive, __a, __p) __arm_vcvtpq_m_s16_f16(__inactive, __a, __p) -#define vcvtq_m_s16_f16(__inactive, __a, __p) __arm_vcvtq_m_s16_f16(__inactive, __a, __p) -#define vcvtmq_m_u16_f16(__inactive, __a, __p) __arm_vcvtmq_m_u16_f16(__inactive, __a, __p) -#define vcvtnq_m_u16_f16(__inactive, __a, __p) __arm_vcvtnq_m_u16_f16(__inactive, __a, __p) -#define vcvtpq_m_u16_f16(__inactive, __a, __p) __arm_vcvtpq_m_u16_f16(__inactive, __a, __p) -#define vcvtq_m_u16_f16(__inactive, __a, __p) __arm_vcvtq_m_u16_f16(__inactive, __a, __p) -#define vcvtmq_m_s32_f32(__inactive, __a, __p) __arm_vcvtmq_m_s32_f32(__inactive, __a, __p) -#define vcvtnq_m_s32_f32(__inactive, __a, __p) __arm_vcvtnq_m_s32_f32(__inactive, __a, __p) -#define vcvtpq_m_s32_f32(__inactive, __a, __p) __arm_vcvtpq_m_s32_f32(__inactive, __a, __p) -#define vcvtq_m_s32_f32(__inactive, __a, __p) __arm_vcvtq_m_s32_f32(__inactive, __a, __p) -#define vcvtmq_m_u32_f32(__inactive, __a, __p) __arm_vcvtmq_m_u32_f32(__inactive, __a, __p) -#define vcvtnq_m_u32_f32(__inactive, __a, __p) __arm_vcvtnq_m_u32_f32(__inactive, __a, __p) -#define vcvtpq_m_u32_f32(__inactive, __a, __p) __arm_vcvtpq_m_u32_f32(__inactive, __a, __p) -#define vcvtq_m_u32_f32(__inactive, __a, __p) __arm_vcvtq_m_u32_f32(__inactive, __a, __p) -#define vcvtq_m_n_f16_u16(__inactive, __a, __imm6, __p) __arm_vcvtq_m_n_f16_u16(__inactive, __a, __imm6, __p) -#define vcvtq_m_n_f16_s16(__inactive, __a, __imm6, __p) __arm_vcvtq_m_n_f16_s16(__inactive, __a, __imm6, __p) -#define vcvtq_m_n_f32_u32(__inactive, __a, __imm6, __p) __arm_vcvtq_m_n_f32_u32(__inactive, __a, __imm6, __p) -#define vcvtq_m_n_f32_s32(__inactive, __a, __imm6, __p) __arm_vcvtq_m_n_f32_s32(__inactive, __a, __imm6, __p) -#define vbicq_m_s8(__inactive, __a, __b, __p) __arm_vbicq_m_s8(__inactive, __a, __b, __p) -#define vbicq_m_s32(__inactive, __a, __b, __p) __arm_vbicq_m_s32(__inactive, __a, __b, __p) -#define vbicq_m_s16(__inactive, __a, __b, __p) __arm_vbicq_m_s16(__inactive, __a, __b, __p) -#define vbicq_m_u8(__inactive, __a, __b, __p) __arm_vbicq_m_u8(__inactive, __a, __b, __p) -#define vbicq_m_u32(__inactive, __a, __b, __p) __arm_vbicq_m_u32(__inactive, __a, __b, __p) -#define vbicq_m_u16(__inactive, __a, __b, __p) __arm_vbicq_m_u16(__inactive, __a, __b, __p) -#define vornq_m_s8(__inactive, __a, __b, __p) __arm_vornq_m_s8(__inactive, __a, __b, __p) -#define vornq_m_s32(__inactive, __a, __b, __p) __arm_vornq_m_s32(__inactive, __a, __b, __p) -#define vornq_m_s16(__inactive, __a, __b, __p) __arm_vornq_m_s16(__inactive, __a, __b, __p) -#define vornq_m_u8(__inactive, __a, __b, __p) __arm_vornq_m_u8(__inactive, __a, __b, __p) -#define vornq_m_u32(__inactive, __a, __b, __p) __arm_vornq_m_u32(__inactive, __a, __b, __p) -#define vornq_m_u16(__inactive, __a, __b, __p) __arm_vornq_m_u16(__inactive, __a, __b, __p) -#define vbicq_m_f32(__inactive, __a, __b, __p) __arm_vbicq_m_f32(__inactive, __a, __b, __p) -#define vbicq_m_f16(__inactive, __a, __b, __p) __arm_vbicq_m_f16(__inactive, __a, __b, __p) -#define vcvtq_m_n_s32_f32(__inactive, __a, __imm6, __p) __arm_vcvtq_m_n_s32_f32(__inactive, __a, __imm6, __p) -#define vcvtq_m_n_s16_f16(__inactive, __a, __imm6, 
__p) __arm_vcvtq_m_n_s16_f16(__inactive, __a, __imm6, __p) -#define vcvtq_m_n_u32_f32(__inactive, __a, __imm6, __p) __arm_vcvtq_m_n_u32_f32(__inactive, __a, __imm6, __p) -#define vcvtq_m_n_u16_f16(__inactive, __a, __imm6, __p) __arm_vcvtq_m_n_u16_f16(__inactive, __a, __imm6, __p) -#define vornq_m_f32(__inactive, __a, __b, __p) __arm_vornq_m_f32(__inactive, __a, __b, __p) -#define vornq_m_f16(__inactive, __a, __b, __p) __arm_vornq_m_f16(__inactive, __a, __b, __p) -#define vstrbq_s8( __addr, __value) __arm_vstrbq_s8( __addr, __value) -#define vstrbq_u8( __addr, __value) __arm_vstrbq_u8( __addr, __value) -#define vstrbq_u16( __addr, __value) __arm_vstrbq_u16( __addr, __value) #define vstrbq_scatter_offset_s8( __base, __offset, __value) __arm_vstrbq_scatter_offset_s8( __base, __offset, __value) #define vstrbq_scatter_offset_u8( __base, __offset, __value) __arm_vstrbq_scatter_offset_u8( __base, __offset, __value) #define vstrbq_scatter_offset_u16( __base, __offset, __value) __arm_vstrbq_scatter_offset_u16( __base, __offset, __value) -#define vstrbq_s16( __addr, __value) __arm_vstrbq_s16( __addr, __value) -#define vstrbq_u32( __addr, __value) __arm_vstrbq_u32( __addr, __value) #define vstrbq_scatter_offset_s16( __base, __offset, __value) __arm_vstrbq_scatter_offset_s16( __base, __offset, __value) #define vstrbq_scatter_offset_u32( __base, __offset, __value) __arm_vstrbq_scatter_offset_u32( __base, __offset, __value) -#define vstrbq_s32( __addr, __value) __arm_vstrbq_s32( __addr, __value) #define vstrbq_scatter_offset_s32( __base, __offset, __value) __arm_vstrbq_scatter_offset_s32( __base, __offset, __value) #define vstrwq_scatter_base_s32(__addr, __offset, __value) __arm_vstrwq_scatter_base_s32(__addr, __offset, __value) #define vstrwq_scatter_base_u32(__addr, __offset, __value) __arm_vstrwq_scatter_base_u32(__addr, __offset, __value) #define vldrbq_gather_offset_u8(__base, __offset) __arm_vldrbq_gather_offset_u8(__base, __offset) #define vldrbq_gather_offset_s8(__base, __offset) __arm_vldrbq_gather_offset_s8(__base, __offset) -#define vldrbq_s8(__base) __arm_vldrbq_s8(__base) -#define vldrbq_u8(__base) __arm_vldrbq_u8(__base) #define vldrbq_gather_offset_u16(__base, __offset) __arm_vldrbq_gather_offset_u16(__base, __offset) #define vldrbq_gather_offset_s16(__base, __offset) __arm_vldrbq_gather_offset_s16(__base, __offset) -#define vldrbq_s16(__base) __arm_vldrbq_s16(__base) -#define vldrbq_u16(__base) __arm_vldrbq_u16(__base) #define vldrbq_gather_offset_u32(__base, __offset) __arm_vldrbq_gather_offset_u32(__base, __offset) #define vldrbq_gather_offset_s32(__base, __offset) __arm_vldrbq_gather_offset_s32(__base, __offset) -#define vldrbq_s32(__base) __arm_vldrbq_s32(__base) -#define vldrbq_u32(__base) __arm_vldrbq_u32(__base) #define vldrwq_gather_base_s32(__addr, __offset) __arm_vldrwq_gather_base_s32(__addr, __offset) #define vldrwq_gather_base_u32(__addr, __offset) __arm_vldrwq_gather_base_u32(__addr, __offset) -#define vstrbq_p_s8( __addr, __value, __p) __arm_vstrbq_p_s8( __addr, __value, __p) -#define vstrbq_p_s32( __addr, __value, __p) __arm_vstrbq_p_s32( __addr, __value, __p) -#define vstrbq_p_s16( __addr, __value, __p) __arm_vstrbq_p_s16( __addr, __value, __p) -#define vstrbq_p_u8( __addr, __value, __p) __arm_vstrbq_p_u8( __addr, __value, __p) -#define vstrbq_p_u32( __addr, __value, __p) __arm_vstrbq_p_u32( __addr, __value, __p) -#define vstrbq_p_u16( __addr, __value, __p) __arm_vstrbq_p_u16( __addr, __value, __p) #define vstrbq_scatter_offset_p_s8( __base, __offset, __value, __p) 
__arm_vstrbq_scatter_offset_p_s8( __base, __offset, __value, __p) #define vstrbq_scatter_offset_p_s32( __base, __offset, __value, __p) __arm_vstrbq_scatter_offset_p_s32( __base, __offset, __value, __p) #define vstrbq_scatter_offset_p_s16( __base, __offset, __value, __p) __arm_vstrbq_scatter_offset_p_s16( __base, __offset, __value, __p) @@ -336,12 +125,6 @@ #define vldrbq_gather_offset_z_u16(__base, __offset, __p) __arm_vldrbq_gather_offset_z_u16(__base, __offset, __p) #define vldrbq_gather_offset_z_u32(__base, __offset, __p) __arm_vldrbq_gather_offset_z_u32(__base, __offset, __p) #define vldrbq_gather_offset_z_s8(__base, __offset, __p) __arm_vldrbq_gather_offset_z_s8(__base, __offset, __p) -#define vldrbq_z_s16(__base, __p) __arm_vldrbq_z_s16(__base, __p) -#define vldrbq_z_u8(__base, __p) __arm_vldrbq_z_u8(__base, __p) -#define vldrbq_z_s8(__base, __p) __arm_vldrbq_z_s8(__base, __p) -#define vldrbq_z_s32(__base, __p) __arm_vldrbq_z_s32(__base, __p) -#define vldrbq_z_u16(__base, __p) __arm_vldrbq_z_u16(__base, __p) -#define vldrbq_z_u32(__base, __p) __arm_vldrbq_z_u32(__base, __p) #define vldrwq_gather_base_z_u32(__addr, __offset, __p) __arm_vldrwq_gather_base_z_u32(__addr, __offset, __p) #define vldrwq_gather_base_z_s32(__addr, __offset, __p) __arm_vldrwq_gather_base_z_s32(__addr, __offset, __p) #define vldrhq_gather_offset_s32(__base, __offset) __arm_vldrhq_gather_offset_s32(__base, __offset) @@ -360,22 +143,6 @@ #define vldrhq_gather_shifted_offset_z_s16(__base, __offset, __p) __arm_vldrhq_gather_shifted_offset_z_s16(__base, __offset, __p) #define vldrhq_gather_shifted_offset_z_u32(__base, __offset, __p) __arm_vldrhq_gather_shifted_offset_z_u32(__base, __offset, __p) #define vldrhq_gather_shifted_offset_z_u16(__base, __offset, __p) __arm_vldrhq_gather_shifted_offset_z_u16(__base, __offset, __p) -#define vldrhq_s32(__base) __arm_vldrhq_s32(__base) -#define vldrhq_s16(__base) __arm_vldrhq_s16(__base) -#define vldrhq_u32(__base) __arm_vldrhq_u32(__base) -#define vldrhq_u16(__base) __arm_vldrhq_u16(__base) -#define vldrhq_z_s32(__base, __p) __arm_vldrhq_z_s32(__base, __p) -#define vldrhq_z_s16(__base, __p) __arm_vldrhq_z_s16(__base, __p) -#define vldrhq_z_u32(__base, __p) __arm_vldrhq_z_u32(__base, __p) -#define vldrhq_z_u16(__base, __p) __arm_vldrhq_z_u16(__base, __p) -#define vldrwq_s32(__base) __arm_vldrwq_s32(__base) -#define vldrwq_u32(__base) __arm_vldrwq_u32(__base) -#define vldrwq_z_s32(__base, __p) __arm_vldrwq_z_s32(__base, __p) -#define vldrwq_z_u32(__base, __p) __arm_vldrwq_z_u32(__base, __p) -#define vldrhq_f16(__base) __arm_vldrhq_f16(__base) -#define vldrhq_z_f16(__base, __p) __arm_vldrhq_z_f16(__base, __p) -#define vldrwq_f32(__base) __arm_vldrwq_f32(__base) -#define vldrwq_z_f32(__base, __p) __arm_vldrwq_z_f32(__base, __p) #define vldrdq_gather_base_s64(__addr, __offset) __arm_vldrdq_gather_base_s64(__addr, __offset) #define vldrdq_gather_base_u64(__addr, __offset) __arm_vldrdq_gather_base_u64(__addr, __offset) #define vldrdq_gather_base_z_s64(__addr, __offset, __p) __arm_vldrdq_gather_base_z_s64(__addr, __offset, __p) @@ -406,7 +173,6 @@ #define vldrwq_gather_shifted_offset_z_f32(__base, __offset, __p) __arm_vldrwq_gather_shifted_offset_z_f32(__base, __offset, __p) #define vldrwq_gather_shifted_offset_z_s32(__base, __offset, __p) __arm_vldrwq_gather_shifted_offset_z_s32(__base, __offset, __p) #define vldrwq_gather_shifted_offset_z_u32(__base, __offset, __p) __arm_vldrwq_gather_shifted_offset_z_u32(__base, __offset, __p) -#define vstrhq_f16(__addr, __value) 
__arm_vstrhq_f16(__addr, __value) #define vstrhq_scatter_offset_s32( __base, __offset, __value) __arm_vstrhq_scatter_offset_s32( __base, __offset, __value) #define vstrhq_scatter_offset_s16( __base, __offset, __value) __arm_vstrhq_scatter_offset_s16( __base, __offset, __value) #define vstrhq_scatter_offset_u32( __base, __offset, __value) __arm_vstrhq_scatter_offset_u32( __base, __offset, __value) @@ -423,21 +189,6 @@ #define vstrhq_scatter_shifted_offset_p_s16( __base, __offset, __value, __p) __arm_vstrhq_scatter_shifted_offset_p_s16( __base, __offset, __value, __p) #define vstrhq_scatter_shifted_offset_p_u32( __base, __offset, __value, __p) __arm_vstrhq_scatter_shifted_offset_p_u32( __base, __offset, __value, __p) #define vstrhq_scatter_shifted_offset_p_u16( __base, __offset, __value, __p) __arm_vstrhq_scatter_shifted_offset_p_u16( __base, __offset, __value, __p) -#define vstrhq_s32(__addr, __value) __arm_vstrhq_s32(__addr, __value) -#define vstrhq_s16(__addr, __value) __arm_vstrhq_s16(__addr, __value) -#define vstrhq_u32(__addr, __value) __arm_vstrhq_u32(__addr, __value) -#define vstrhq_u16(__addr, __value) __arm_vstrhq_u16(__addr, __value) -#define vstrhq_p_f16(__addr, __value, __p) __arm_vstrhq_p_f16(__addr, __value, __p) -#define vstrhq_p_s32(__addr, __value, __p) __arm_vstrhq_p_s32(__addr, __value, __p) -#define vstrhq_p_s16(__addr, __value, __p) __arm_vstrhq_p_s16(__addr, __value, __p) -#define vstrhq_p_u32(__addr, __value, __p) __arm_vstrhq_p_u32(__addr, __value, __p) -#define vstrhq_p_u16(__addr, __value, __p) __arm_vstrhq_p_u16(__addr, __value, __p) -#define vstrwq_f32(__addr, __value) __arm_vstrwq_f32(__addr, __value) -#define vstrwq_s32(__addr, __value) __arm_vstrwq_s32(__addr, __value) -#define vstrwq_u32(__addr, __value) __arm_vstrwq_u32(__addr, __value) -#define vstrwq_p_f32(__addr, __value, __p) __arm_vstrwq_p_f32(__addr, __value, __p) -#define vstrwq_p_s32(__addr, __value, __p) __arm_vstrwq_p_s32(__addr, __value, __p) -#define vstrwq_p_u32(__addr, __value, __p) __arm_vstrwq_p_u32(__addr, __value, __p) #define vstrdq_scatter_base_p_s64(__addr, __offset, __value, __p) __arm_vstrdq_scatter_base_p_s64(__addr, __offset, __value, __p) #define vstrdq_scatter_base_p_u64(__addr, __offset, __value, __p) __arm_vstrdq_scatter_base_p_u64(__addr, __offset, __value, __p) #define vstrdq_scatter_base_s64(__addr, __offset, __value) __arm_vstrdq_scatter_base_s64(__addr, __offset, __value) @@ -478,54 +229,6 @@ #define vuninitializedq_s64(void) __arm_vuninitializedq_s64(void) #define vuninitializedq_f16(void) __arm_vuninitializedq_f16(void) #define vuninitializedq_f32(void) __arm_vuninitializedq_f32(void) -#define vddupq_m_n_u8(__inactive, __a, __imm, __p) __arm_vddupq_m_n_u8(__inactive, __a, __imm, __p) -#define vddupq_m_n_u32(__inactive, __a, __imm, __p) __arm_vddupq_m_n_u32(__inactive, __a, __imm, __p) -#define vddupq_m_n_u16(__inactive, __a, __imm, __p) __arm_vddupq_m_n_u16(__inactive, __a, __imm, __p) -#define vddupq_m_wb_u8(__inactive, __a, __imm, __p) __arm_vddupq_m_wb_u8(__inactive, __a, __imm, __p) -#define vddupq_m_wb_u16(__inactive, __a, __imm, __p) __arm_vddupq_m_wb_u16(__inactive, __a, __imm, __p) -#define vddupq_m_wb_u32(__inactive, __a, __imm, __p) __arm_vddupq_m_wb_u32(__inactive, __a, __imm, __p) -#define vddupq_n_u8(__a, __imm) __arm_vddupq_n_u8(__a, __imm) -#define vddupq_n_u32(__a, __imm) __arm_vddupq_n_u32(__a, __imm) -#define vddupq_n_u16(__a, __imm) __arm_vddupq_n_u16(__a, __imm) -#define vddupq_wb_u8( __a, __imm) __arm_vddupq_wb_u8( __a, __imm) -#define vddupq_wb_u16( 
__a, __imm) __arm_vddupq_wb_u16( __a, __imm) -#define vddupq_wb_u32( __a, __imm) __arm_vddupq_wb_u32( __a, __imm) -#define vdwdupq_m_n_u8(__inactive, __a, __b, __imm, __p) __arm_vdwdupq_m_n_u8(__inactive, __a, __b, __imm, __p) -#define vdwdupq_m_n_u32(__inactive, __a, __b, __imm, __p) __arm_vdwdupq_m_n_u32(__inactive, __a, __b, __imm, __p) -#define vdwdupq_m_n_u16(__inactive, __a, __b, __imm, __p) __arm_vdwdupq_m_n_u16(__inactive, __a, __b, __imm, __p) -#define vdwdupq_m_wb_u8(__inactive, __a, __b, __imm, __p) __arm_vdwdupq_m_wb_u8(__inactive, __a, __b, __imm, __p) -#define vdwdupq_m_wb_u32(__inactive, __a, __b, __imm, __p) __arm_vdwdupq_m_wb_u32(__inactive, __a, __b, __imm, __p) -#define vdwdupq_m_wb_u16(__inactive, __a, __b, __imm, __p) __arm_vdwdupq_m_wb_u16(__inactive, __a, __b, __imm, __p) -#define vdwdupq_n_u8(__a, __b, __imm) __arm_vdwdupq_n_u8(__a, __b, __imm) -#define vdwdupq_n_u32(__a, __b, __imm) __arm_vdwdupq_n_u32(__a, __b, __imm) -#define vdwdupq_n_u16(__a, __b, __imm) __arm_vdwdupq_n_u16(__a, __b, __imm) -#define vdwdupq_wb_u8( __a, __b, __imm) __arm_vdwdupq_wb_u8( __a, __b, __imm) -#define vdwdupq_wb_u32( __a, __b, __imm) __arm_vdwdupq_wb_u32( __a, __b, __imm) -#define vdwdupq_wb_u16( __a, __b, __imm) __arm_vdwdupq_wb_u16( __a, __b, __imm) -#define vidupq_m_n_u8(__inactive, __a, __imm, __p) __arm_vidupq_m_n_u8(__inactive, __a, __imm, __p) -#define vidupq_m_n_u32(__inactive, __a, __imm, __p) __arm_vidupq_m_n_u32(__inactive, __a, __imm, __p) -#define vidupq_m_n_u16(__inactive, __a, __imm, __p) __arm_vidupq_m_n_u16(__inactive, __a, __imm, __p) -#define vidupq_m_wb_u8(__inactive, __a, __imm, __p) __arm_vidupq_m_wb_u8(__inactive, __a, __imm, __p) -#define vidupq_m_wb_u16(__inactive, __a, __imm, __p) __arm_vidupq_m_wb_u16(__inactive, __a, __imm, __p) -#define vidupq_m_wb_u32(__inactive, __a, __imm, __p) __arm_vidupq_m_wb_u32(__inactive, __a, __imm, __p) -#define vidupq_n_u8(__a, __imm) __arm_vidupq_n_u8(__a, __imm) -#define vidupq_n_u32(__a, __imm) __arm_vidupq_n_u32(__a, __imm) -#define vidupq_n_u16(__a, __imm) __arm_vidupq_n_u16(__a, __imm) -#define vidupq_wb_u8( __a, __imm) __arm_vidupq_wb_u8( __a, __imm) -#define vidupq_wb_u16( __a, __imm) __arm_vidupq_wb_u16( __a, __imm) -#define vidupq_wb_u32( __a, __imm) __arm_vidupq_wb_u32( __a, __imm) -#define viwdupq_m_n_u8(__inactive, __a, __b, __imm, __p) __arm_viwdupq_m_n_u8(__inactive, __a, __b, __imm, __p) -#define viwdupq_m_n_u32(__inactive, __a, __b, __imm, __p) __arm_viwdupq_m_n_u32(__inactive, __a, __b, __imm, __p) -#define viwdupq_m_n_u16(__inactive, __a, __b, __imm, __p) __arm_viwdupq_m_n_u16(__inactive, __a, __b, __imm, __p) -#define viwdupq_m_wb_u8(__inactive, __a, __b, __imm, __p) __arm_viwdupq_m_wb_u8(__inactive, __a, __b, __imm, __p) -#define viwdupq_m_wb_u32(__inactive, __a, __b, __imm, __p) __arm_viwdupq_m_wb_u32(__inactive, __a, __b, __imm, __p) -#define viwdupq_m_wb_u16(__inactive, __a, __b, __imm, __p) __arm_viwdupq_m_wb_u16(__inactive, __a, __b, __imm, __p) -#define viwdupq_n_u8(__a, __b, __imm) __arm_viwdupq_n_u8(__a, __b, __imm) -#define viwdupq_n_u32(__a, __b, __imm) __arm_viwdupq_n_u32(__a, __b, __imm) -#define viwdupq_n_u16(__a, __b, __imm) __arm_viwdupq_n_u16(__a, __b, __imm) -#define viwdupq_wb_u8( __a, __b, __imm) __arm_viwdupq_wb_u8( __a, __b, __imm) -#define viwdupq_wb_u32( __a, __b, __imm) __arm_viwdupq_wb_u32( __a, __b, __imm) -#define viwdupq_wb_u16( __a, __b, __imm) __arm_viwdupq_wb_u16( __a, __b, __imm) #define vldrdq_gather_base_wb_s64(__addr, __offset) __arm_vldrdq_gather_base_wb_s64(__addr, 
__offset) #define vldrdq_gather_base_wb_u64(__addr, __offset) __arm_vldrdq_gather_base_wb_u64(__addr, __offset) #define vldrdq_gather_base_wb_z_s64(__addr, __offset, __p) __arm_vldrdq_gather_base_wb_z_s64(__addr, __offset, __p) @@ -546,136 +249,30 @@ #define vstrwq_scatter_base_wb_s32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_s32(__addr, __offset, __value) #define vstrwq_scatter_base_wb_u32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_u32(__addr, __offset, __value) #define vstrwq_scatter_base_wb_f32(__addr, __offset, __value) __arm_vstrwq_scatter_base_wb_f32(__addr, __offset, __value) -#define vddupq_x_n_u8(__a, __imm, __p) __arm_vddupq_x_n_u8(__a, __imm, __p) -#define vddupq_x_n_u16(__a, __imm, __p) __arm_vddupq_x_n_u16(__a, __imm, __p) -#define vddupq_x_n_u32(__a, __imm, __p) __arm_vddupq_x_n_u32(__a, __imm, __p) -#define vddupq_x_wb_u8(__a, __imm, __p) __arm_vddupq_x_wb_u8(__a, __imm, __p) -#define vddupq_x_wb_u16(__a, __imm, __p) __arm_vddupq_x_wb_u16(__a, __imm, __p) -#define vddupq_x_wb_u32(__a, __imm, __p) __arm_vddupq_x_wb_u32(__a, __imm, __p) -#define vdwdupq_x_n_u8(__a, __b, __imm, __p) __arm_vdwdupq_x_n_u8(__a, __b, __imm, __p) -#define vdwdupq_x_n_u16(__a, __b, __imm, __p) __arm_vdwdupq_x_n_u16(__a, __b, __imm, __p) -#define vdwdupq_x_n_u32(__a, __b, __imm, __p) __arm_vdwdupq_x_n_u32(__a, __b, __imm, __p) -#define vdwdupq_x_wb_u8(__a, __b, __imm, __p) __arm_vdwdupq_x_wb_u8(__a, __b, __imm, __p) -#define vdwdupq_x_wb_u16(__a, __b, __imm, __p) __arm_vdwdupq_x_wb_u16(__a, __b, __imm, __p) -#define vdwdupq_x_wb_u32(__a, __b, __imm, __p) __arm_vdwdupq_x_wb_u32(__a, __b, __imm, __p) -#define vidupq_x_n_u8(__a, __imm, __p) __arm_vidupq_x_n_u8(__a, __imm, __p) -#define vidupq_x_n_u16(__a, __imm, __p) __arm_vidupq_x_n_u16(__a, __imm, __p) -#define vidupq_x_n_u32(__a, __imm, __p) __arm_vidupq_x_n_u32(__a, __imm, __p) -#define vidupq_x_wb_u8(__a, __imm, __p) __arm_vidupq_x_wb_u8(__a, __imm, __p) -#define vidupq_x_wb_u16(__a, __imm, __p) __arm_vidupq_x_wb_u16(__a, __imm, __p) -#define vidupq_x_wb_u32(__a, __imm, __p) __arm_vidupq_x_wb_u32(__a, __imm, __p) -#define viwdupq_x_n_u8(__a, __b, __imm, __p) __arm_viwdupq_x_n_u8(__a, __b, __imm, __p) -#define viwdupq_x_n_u16(__a, __b, __imm, __p) __arm_viwdupq_x_n_u16(__a, __b, __imm, __p) -#define viwdupq_x_n_u32(__a, __b, __imm, __p) __arm_viwdupq_x_n_u32(__a, __b, __imm, __p) -#define viwdupq_x_wb_u8(__a, __b, __imm, __p) __arm_viwdupq_x_wb_u8(__a, __b, __imm, __p) -#define viwdupq_x_wb_u16(__a, __b, __imm, __p) __arm_viwdupq_x_wb_u16(__a, __b, __imm, __p) -#define viwdupq_x_wb_u32(__a, __b, __imm, __p) __arm_viwdupq_x_wb_u32(__a, __b, __imm, __p) -#define vbicq_x_s8(__a, __b, __p) __arm_vbicq_x_s8(__a, __b, __p) -#define vbicq_x_s16(__a, __b, __p) __arm_vbicq_x_s16(__a, __b, __p) -#define vbicq_x_s32(__a, __b, __p) __arm_vbicq_x_s32(__a, __b, __p) -#define vbicq_x_u8(__a, __b, __p) __arm_vbicq_x_u8(__a, __b, __p) -#define vbicq_x_u16(__a, __b, __p) __arm_vbicq_x_u16(__a, __b, __p) -#define vbicq_x_u32(__a, __b, __p) __arm_vbicq_x_u32(__a, __b, __p) -#define vornq_x_s8(__a, __b, __p) __arm_vornq_x_s8(__a, __b, __p) -#define vornq_x_s16(__a, __b, __p) __arm_vornq_x_s16(__a, __b, __p) -#define vornq_x_s32(__a, __b, __p) __arm_vornq_x_s32(__a, __b, __p) -#define vornq_x_u8(__a, __b, __p) __arm_vornq_x_u8(__a, __b, __p) -#define vornq_x_u16(__a, __b, __p) __arm_vornq_x_u16(__a, __b, __p) -#define vornq_x_u32(__a, __b, __p) __arm_vornq_x_u32(__a, __b, __p) -#define vcvtaq_x_s16_f16(__a, __p) __arm_vcvtaq_x_s16_f16(__a, __p) 
-#define vcvtaq_x_s32_f32(__a, __p) __arm_vcvtaq_x_s32_f32(__a, __p) -#define vcvtaq_x_u16_f16(__a, __p) __arm_vcvtaq_x_u16_f16(__a, __p) -#define vcvtaq_x_u32_f32(__a, __p) __arm_vcvtaq_x_u32_f32(__a, __p) -#define vcvtnq_x_s16_f16(__a, __p) __arm_vcvtnq_x_s16_f16(__a, __p) -#define vcvtnq_x_s32_f32(__a, __p) __arm_vcvtnq_x_s32_f32(__a, __p) -#define vcvtnq_x_u16_f16(__a, __p) __arm_vcvtnq_x_u16_f16(__a, __p) -#define vcvtnq_x_u32_f32(__a, __p) __arm_vcvtnq_x_u32_f32(__a, __p) -#define vcvtpq_x_s16_f16(__a, __p) __arm_vcvtpq_x_s16_f16(__a, __p) -#define vcvtpq_x_s32_f32(__a, __p) __arm_vcvtpq_x_s32_f32(__a, __p) -#define vcvtpq_x_u16_f16(__a, __p) __arm_vcvtpq_x_u16_f16(__a, __p) -#define vcvtpq_x_u32_f32(__a, __p) __arm_vcvtpq_x_u32_f32(__a, __p) -#define vcvtmq_x_s16_f16(__a, __p) __arm_vcvtmq_x_s16_f16(__a, __p) -#define vcvtmq_x_s32_f32(__a, __p) __arm_vcvtmq_x_s32_f32(__a, __p) -#define vcvtmq_x_u16_f16(__a, __p) __arm_vcvtmq_x_u16_f16(__a, __p) -#define vcvtmq_x_u32_f32(__a, __p) __arm_vcvtmq_x_u32_f32(__a, __p) -#define vcvtbq_x_f32_f16(__a, __p) __arm_vcvtbq_x_f32_f16(__a, __p) -#define vcvttq_x_f32_f16(__a, __p) __arm_vcvttq_x_f32_f16(__a, __p) -#define vcvtq_x_f16_u16(__a, __p) __arm_vcvtq_x_f16_u16(__a, __p) -#define vcvtq_x_f16_s16(__a, __p) __arm_vcvtq_x_f16_s16(__a, __p) -#define vcvtq_x_f32_s32(__a, __p) __arm_vcvtq_x_f32_s32(__a, __p) -#define vcvtq_x_f32_u32(__a, __p) __arm_vcvtq_x_f32_u32(__a, __p) -#define vcvtq_x_n_f16_s16(__a, __imm6, __p) __arm_vcvtq_x_n_f16_s16(__a, __imm6, __p) -#define vcvtq_x_n_f16_u16(__a, __imm6, __p) __arm_vcvtq_x_n_f16_u16(__a, __imm6, __p) -#define vcvtq_x_n_f32_s32(__a, __imm6, __p) __arm_vcvtq_x_n_f32_s32(__a, __imm6, __p) -#define vcvtq_x_n_f32_u32(__a, __imm6, __p) __arm_vcvtq_x_n_f32_u32(__a, __imm6, __p) -#define vcvtq_x_s16_f16(__a, __p) __arm_vcvtq_x_s16_f16(__a, __p) -#define vcvtq_x_s32_f32(__a, __p) __arm_vcvtq_x_s32_f32(__a, __p) -#define vcvtq_x_u16_f16(__a, __p) __arm_vcvtq_x_u16_f16(__a, __p) -#define vcvtq_x_u32_f32(__a, __p) __arm_vcvtq_x_u32_f32(__a, __p) -#define vcvtq_x_n_s16_f16(__a, __imm6, __p) __arm_vcvtq_x_n_s16_f16(__a, __imm6, __p) -#define vcvtq_x_n_s32_f32(__a, __imm6, __p) __arm_vcvtq_x_n_s32_f32(__a, __imm6, __p) -#define vcvtq_x_n_u16_f16(__a, __imm6, __p) __arm_vcvtq_x_n_u16_f16(__a, __imm6, __p) -#define vcvtq_x_n_u32_f32(__a, __imm6, __p) __arm_vcvtq_x_n_u32_f32(__a, __imm6, __p) -#define vbicq_x_f16(__a, __b, __p) __arm_vbicq_x_f16(__a, __b, __p) -#define vbicq_x_f32(__a, __b, __p) __arm_vbicq_x_f32(__a, __b, __p) -#define vornq_x_f16(__a, __b, __p) __arm_vornq_x_f16(__a, __b, __p) -#define vornq_x_f32(__a, __b, __p) __arm_vornq_x_f32(__a, __b, __p) -#define vadciq_s32(__a, __b, __carry_out) __arm_vadciq_s32(__a, __b, __carry_out) -#define vadciq_u32(__a, __b, __carry_out) __arm_vadciq_u32(__a, __b, __carry_out) -#define vadciq_m_s32(__inactive, __a, __b, __carry_out, __p) __arm_vadciq_m_s32(__inactive, __a, __b, __carry_out, __p) -#define vadciq_m_u32(__inactive, __a, __b, __carry_out, __p) __arm_vadciq_m_u32(__inactive, __a, __b, __carry_out, __p) -#define vadcq_s32(__a, __b, __carry) __arm_vadcq_s32(__a, __b, __carry) -#define vadcq_u32(__a, __b, __carry) __arm_vadcq_u32(__a, __b, __carry) -#define vadcq_m_s32(__inactive, __a, __b, __carry, __p) __arm_vadcq_m_s32(__inactive, __a, __b, __carry, __p) -#define vadcq_m_u32(__inactive, __a, __b, __carry, __p) __arm_vadcq_m_u32(__inactive, __a, __b, __carry, __p) -#define vsbciq_s32(__a, __b, __carry_out) __arm_vsbciq_s32(__a, __b, __carry_out) -#define 
vsbciq_u32(__a, __b, __carry_out) __arm_vsbciq_u32(__a, __b, __carry_out) -#define vsbciq_m_s32(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m_s32(__inactive, __a, __b, __carry_out, __p) -#define vsbciq_m_u32(__inactive, __a, __b, __carry_out, __p) __arm_vsbciq_m_u32(__inactive, __a, __b, __carry_out, __p) -#define vsbcq_s32(__a, __b, __carry) __arm_vsbcq_s32(__a, __b, __carry) -#define vsbcq_u32(__a, __b, __carry) __arm_vsbcq_u32(__a, __b, __carry) -#define vsbcq_m_s32(__inactive, __a, __b, __carry, __p) __arm_vsbcq_m_s32(__inactive, __a, __b, __carry, __p) -#define vsbcq_m_u32(__inactive, __a, __b, __carry, __p) __arm_vsbcq_m_u32(__inactive, __a, __b, __carry, __p) -#define vst1q_p_u8(__addr, __value, __p) __arm_vst1q_p_u8(__addr, __value, __p) -#define vst1q_p_s8(__addr, __value, __p) __arm_vst1q_p_s8(__addr, __value, __p) #define vst2q_s8(__addr, __value) __arm_vst2q_s8(__addr, __value) #define vst2q_u8(__addr, __value) __arm_vst2q_u8(__addr, __value) -#define vld1q_z_u8(__base, __p) __arm_vld1q_z_u8(__base, __p) -#define vld1q_z_s8(__base, __p) __arm_vld1q_z_s8(__base, __p) #define vld2q_s8(__addr) __arm_vld2q_s8(__addr) #define vld2q_u8(__addr) __arm_vld2q_u8(__addr) #define vld4q_s8(__addr) __arm_vld4q_s8(__addr) #define vld4q_u8(__addr) __arm_vld4q_u8(__addr) -#define vst1q_p_u16(__addr, __value, __p) __arm_vst1q_p_u16(__addr, __value, __p) -#define vst1q_p_s16(__addr, __value, __p) __arm_vst1q_p_s16(__addr, __value, __p) #define vst2q_s16(__addr, __value) __arm_vst2q_s16(__addr, __value) #define vst2q_u16(__addr, __value) __arm_vst2q_u16(__addr, __value) -#define vld1q_z_u16(__base, __p) __arm_vld1q_z_u16(__base, __p) -#define vld1q_z_s16(__base, __p) __arm_vld1q_z_s16(__base, __p) #define vld2q_s16(__addr) __arm_vld2q_s16(__addr) #define vld2q_u16(__addr) __arm_vld2q_u16(__addr) #define vld4q_s16(__addr) __arm_vld4q_s16(__addr) #define vld4q_u16(__addr) __arm_vld4q_u16(__addr) -#define vst1q_p_u32(__addr, __value, __p) __arm_vst1q_p_u32(__addr, __value, __p) -#define vst1q_p_s32(__addr, __value, __p) __arm_vst1q_p_s32(__addr, __value, __p) #define vst2q_s32(__addr, __value) __arm_vst2q_s32(__addr, __value) #define vst2q_u32(__addr, __value) __arm_vst2q_u32(__addr, __value) -#define vld1q_z_u32(__base, __p) __arm_vld1q_z_u32(__base, __p) -#define vld1q_z_s32(__base, __p) __arm_vld1q_z_s32(__base, __p) #define vld2q_s32(__addr) __arm_vld2q_s32(__addr) #define vld2q_u32(__addr) __arm_vld2q_u32(__addr) #define vld4q_s32(__addr) __arm_vld4q_s32(__addr) #define vld4q_u32(__addr) __arm_vld4q_u32(__addr) #define vld4q_f16(__addr) __arm_vld4q_f16(__addr) #define vld2q_f16(__addr) __arm_vld2q_f16(__addr) -#define vld1q_z_f16(__base, __p) __arm_vld1q_z_f16(__base, __p) #define vst2q_f16(__addr, __value) __arm_vst2q_f16(__addr, __value) -#define vst1q_p_f16(__addr, __value, __p) __arm_vst1q_p_f16(__addr, __value, __p) #define vld4q_f32(__addr) __arm_vld4q_f32(__addr) #define vld2q_f32(__addr) __arm_vld2q_f32(__addr) -#define vld1q_z_f32(__base, __p) __arm_vld1q_z_f32(__base, __p) #define vst2q_f32(__addr, __value) __arm_vst2q_f32(__addr, __value) -#define vst1q_p_f32(__addr, __value, __p) __arm_vst1q_p_f32(__addr, __value, __p) #define vsetq_lane_f16(__a, __b, __idx) __arm_vsetq_lane_f16(__a, __b, __idx) #define vsetq_lane_f32(__a, __b, __idx) __arm_vsetq_lane_f32(__a, __b, __idx) #define vsetq_lane_s16(__a, __b, __idx) __arm_vsetq_lane_s16(__a, __b, __idx) @@ -712,12 +309,6 @@ #define urshrl(__p0, __p1) __arm_urshrl(__p0, __p1) #define lsll(__p0, __p1) __arm_lsll(__p0, __p1) #define 
asrl(__p0, __p1) __arm_asrl(__p0, __p1) -#define vshlcq_m_s8(__a, __b, __imm, __p) __arm_vshlcq_m_s8(__a, __b, __imm, __p) -#define vshlcq_m_u8(__a, __b, __imm, __p) __arm_vshlcq_m_u8(__a, __b, __imm, __p) -#define vshlcq_m_s16(__a, __b, __imm, __p) __arm_vshlcq_m_s16(__a, __b, __imm, __p) -#define vshlcq_m_u16(__a, __b, __imm, __p) __arm_vshlcq_m_u16(__a, __b, __imm, __p) -#define vshlcq_m_s32(__a, __b, __imm, __p) __arm_vshlcq_m_s32(__a, __b, __imm, __p) -#define vshlcq_m_u32(__a, __b, __imm, __p) __arm_vshlcq_m_u32(__a, __b, __imm, __p) #endif /* For big-endian, GCC's vector indices are reversed within each 64 bits @@ -788,345 +379,11 @@ __arm_vst4q_u32 (uint32_t * __addr, uint32x4x4_t __value) __extension__ extern __inline mve_pred16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vctp16q (uint32_t __a) -{ - return __builtin_mve_vctp16qv8bi (__a); -} - -__extension__ extern __inline mve_pred16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vctp32q (uint32_t __a) -{ - return __builtin_mve_vctp32qv4bi (__a); -} - -__extension__ extern __inline mve_pred16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vctp64q (uint32_t __a) -{ - return __builtin_mve_vctp64qv2qi (__a); -} - -__extension__ extern __inline mve_pred16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vctp8q (uint32_t __a) -{ - return __builtin_mve_vctp8qv16bi (__a); -} - -__extension__ extern __inline mve_pred16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vpnot (mve_pred16_t __a) { return __builtin_mve_vpnotv16bi (__a); } -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_u8 (uint8x16_t __a, uint8x16_t __b) -{ - return __builtin_mve_vornq_uv16qi (__a, __b); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_u8 (uint8x16_t __a, uint8x16_t __b) -{ - return __builtin_mve_vbicq_uv16qi (__a, __b); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_s8 (int8x16_t __a, int8x16_t __b) -{ - return __builtin_mve_vornq_sv16qi (__a, __b); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_s8 (int8x16_t __a, int8x16_t __b) -{ - return __builtin_mve_vbicq_sv16qi (__a, __b); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_u16 (uint16x8_t __a, uint16x8_t __b) -{ - return __builtin_mve_vornq_uv8hi (__a, __b); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_u16 (uint16x8_t __a, uint16x8_t __b) -{ - return __builtin_mve_vbicq_uv8hi (__a, __b); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_s16 (int16x8_t __a, int16x8_t __b) -{ - return __builtin_mve_vornq_sv8hi (__a, __b); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_s16 (int16x8_t __a, int16x8_t __b) -{ - return __builtin_mve_vbicq_sv8hi (__a, __b); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_u32 (uint32x4_t __a, uint32x4_t __b) -{ - return 
__builtin_mve_vornq_uv4si (__a, __b); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_u32 (uint32x4_t __a, uint32x4_t __b) -{ - return __builtin_mve_vbicq_uv4si (__a, __b); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_s32 (int32x4_t __a, int32x4_t __b) -{ - return __builtin_mve_vornq_sv4si (__a, __b); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_s32 (int32x4_t __a, int32x4_t __b) -{ - return __builtin_mve_vbicq_sv4si (__a, __b); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_n_u16 (uint16x8_t __a, const int __imm) -{ - return __builtin_mve_vbicq_n_uv8hi (__a, __imm); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_n_s16 (int16x8_t __a, const int __imm) -{ - return __builtin_mve_vbicq_n_sv8hi (__a, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_n_u32 (uint32x4_t __a, const int __imm) -{ - return __builtin_mve_vbicq_n_uv4si (__a, __imm); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_n_s32 (int32x4_t __a, const int __imm) -{ - return __builtin_mve_vbicq_n_sv4si (__a, __imm); -} - -__extension__ extern __inline mve_pred16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vctp8q_m (uint32_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vctp8q_mv16bi (__a, __p); -} - -__extension__ extern __inline mve_pred16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vctp64q_m (uint32_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vctp64q_mv2qi (__a, __p); -} - -__extension__ extern __inline mve_pred16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vctp32q_m (uint32_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vctp32q_mv4bi (__a, __p); -} - -__extension__ extern __inline mve_pred16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vctp16q_m (uint32_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vctp16q_mv8bi (__a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_n_s16 (int16x8_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_n_sv8hi (__a, __imm, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_n_s32 (int32x4_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_n_sv4si (__a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_n_u16 (uint16x8_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_n_uv8hi (__a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_n_u32 (uint32x4_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_n_uv4si (__a, __imm, __p); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -__arm_vshlcq_s8 (int8x16_t __a, uint32_t * __b, const int __imm) -{ - int8x16_t __res = __builtin_mve_vshlcq_vec_sv16qi (__a, *__b, __imm); - *__b = __builtin_mve_vshlcq_carry_sv16qi (__a, *__b, __imm); - return __res; -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_u8 (uint8x16_t __a, uint32_t * __b, const int __imm) -{ - uint8x16_t __res = __builtin_mve_vshlcq_vec_uv16qi (__a, *__b, __imm); - *__b = __builtin_mve_vshlcq_carry_uv16qi (__a, *__b, __imm); - return __res; -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_s16 (int16x8_t __a, uint32_t * __b, const int __imm) -{ - int16x8_t __res = __builtin_mve_vshlcq_vec_sv8hi (__a, *__b, __imm); - *__b = __builtin_mve_vshlcq_carry_sv8hi (__a, *__b, __imm); - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_u16 (uint16x8_t __a, uint32_t * __b, const int __imm) -{ - uint16x8_t __res = __builtin_mve_vshlcq_vec_uv8hi (__a, *__b, __imm); - *__b = __builtin_mve_vshlcq_carry_uv8hi (__a, *__b, __imm); - return __res; -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_s32 (int32x4_t __a, uint32_t * __b, const int __imm) -{ - int32x4_t __res = __builtin_mve_vshlcq_vec_sv4si (__a, *__b, __imm); - *__b = __builtin_mve_vshlcq_carry_sv4si (__a, *__b, __imm); - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_u32 (uint32x4_t __a, uint32_t * __b, const int __imm) -{ - uint32x4_t __res = __builtin_mve_vshlcq_vec_uv4si (__a, *__b, __imm); - *__b = __builtin_mve_vshlcq_carry_uv4si (__a, *__b, __imm); - return __res; -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_s8 (int8x16_t __inactive, int8x16_t __a, int8x16_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_sv16qi (__inactive, __a, __b, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_sv4si (__inactive, __a, __b, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_s16 (int16x8_t __inactive, int16x8_t __a, int16x8_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_sv8hi (__inactive, __a, __b, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_u8 (uint8x16_t __inactive, uint8x16_t __a, uint8x16_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_uv16qi (__inactive, __a, __b, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_uv4si (__inactive, __a, __b, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_u16 (uint16x8_t __inactive, uint16x8_t __a, uint16x8_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_uv8hi 
(__inactive, __a, __b, __p); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m_s8 (int8x16_t __inactive, int8x16_t __a, int8x16_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_sv16qi (__inactive, __a, __b, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_sv4si (__inactive, __a, __b, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m_s16 (int16x8_t __inactive, int16x8_t __a, int16x8_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_sv8hi (__inactive, __a, __b, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m_u8 (uint8x16_t __inactive, uint8x16_t __a, uint8x16_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_uv16qi (__inactive, __a, __b, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_uv4si (__inactive, __a, __b, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m_u16 (uint16x8_t __inactive, uint16x8_t __a, uint16x8_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_uv8hi (__inactive, __a, __b, __p); -} - __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrbq_scatter_offset_s8 (int8_t * __base, uint8x16_t __offset, int8x16_t __value) @@ -1171,48 +428,6 @@ __arm_vstrbq_scatter_offset_u16 (uint8_t * __base, uint16x8_t __offset, uint16x8 __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_s8 (int8_t * __addr, int8x16_t __value) -{ - __builtin_mve_vstrbq_sv16qi ((__builtin_neon_qi *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_s32 (int8_t * __addr, int32x4_t __value) -{ - __builtin_mve_vstrbq_sv4si ((__builtin_neon_qi *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_s16 (int8_t * __addr, int16x8_t __value) -{ - __builtin_mve_vstrbq_sv8hi ((__builtin_neon_qi *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_u8 (uint8_t * __addr, uint8x16_t __value) -{ - __builtin_mve_vstrbq_uv16qi ((__builtin_neon_qi *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_u32 (uint8_t * __addr, uint32x4_t __value) -{ - __builtin_mve_vstrbq_uv4si ((__builtin_neon_qi *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_u16 (uint8_t * __addr, uint16x8_t __value) -{ - __builtin_mve_vstrbq_uv8hi ((__builtin_neon_qi *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrwq_scatter_base_s32 
(uint32x4_t __addr, const int __offset, int32x4_t __value) { __builtin_mve_vstrwq_scatter_base_sv4si (__addr, __offset, __value); @@ -1239,20 +454,6 @@ __arm_vldrbq_gather_offset_s8 (int8_t const * __base, uint8x16_t __offset) return __builtin_mve_vldrbq_gather_offset_sv16qi ((__builtin_neon_qi *) __base, __offset); } -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrbq_s8 (int8_t const * __base) -{ - return __builtin_mve_vldrbq_sv16qi ((__builtin_neon_qi *) __base); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrbq_u8 (uint8_t const * __base) -{ - return __builtin_mve_vldrbq_uv16qi ((__builtin_neon_qi *) __base); -} - __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vldrbq_gather_offset_u16 (uint8_t const * __base, uint16x8_t __offset) @@ -1267,20 +468,6 @@ __arm_vldrbq_gather_offset_s16 (int8_t const * __base, uint16x8_t __offset) return __builtin_mve_vldrbq_gather_offset_sv8hi ((__builtin_neon_qi *) __base, __offset); } -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrbq_s16 (int8_t const * __base) -{ - return __builtin_mve_vldrbq_sv8hi ((__builtin_neon_qi *) __base); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrbq_u16 (uint8_t const * __base) -{ - return __builtin_mve_vldrbq_uv8hi ((__builtin_neon_qi *) __base); -} - __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vldrbq_gather_offset_u32 (uint8_t const * __base, uint32x4_t __offset) @@ -1297,20 +484,6 @@ __arm_vldrbq_gather_offset_s32 (int8_t const * __base, uint32x4_t __offset) __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrbq_s32 (int8_t const * __base) -{ - return __builtin_mve_vldrbq_sv4si ((__builtin_neon_qi *) __base); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrbq_u32 (uint8_t const * __base) -{ - return __builtin_mve_vldrbq_uv4si ((__builtin_neon_qi *) __base); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vldrwq_gather_base_s32 (uint32x4_t __addr, const int __offset) { return __builtin_mve_vldrwq_gather_base_sv4si (__addr, __offset); @@ -1325,48 +498,6 @@ __arm_vldrwq_gather_base_u32 (uint32x4_t __addr, const int __offset) __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_p_s8 (int8_t * __addr, int8x16_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrbq_p_sv16qi ((__builtin_neon_qi *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_p_s32 (int8_t * __addr, int32x4_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrbq_p_sv4si ((__builtin_neon_qi *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_p_s16 (int8_t * __addr, int16x8_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrbq_p_sv8hi ((__builtin_neon_qi *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -__arm_vstrbq_p_u8 (uint8_t * __addr, uint8x16_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrbq_p_uv16qi ((__builtin_neon_qi *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_p_u32 (uint8_t * __addr, uint32x4_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrbq_p_uv4si ((__builtin_neon_qi *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_p_u16 (uint8_t * __addr, uint16x8_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrbq_p_uv8hi ((__builtin_neon_qi *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrbq_scatter_offset_p_s8 (int8_t * __base, uint8x16_t __offset, int8x16_t __value, mve_pred16_t __p) { __builtin_mve_vstrbq_scatter_offset_p_sv16qi ((__builtin_neon_qi *) __base, __offset, __value, __p); @@ -1463,48 +594,6 @@ __arm_vldrbq_gather_offset_z_u16 (uint8_t const * __base, uint16x8_t __offset, m return __builtin_mve_vldrbq_gather_offset_z_uv8hi ((__builtin_neon_qi *) __base, __offset, __p); } -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrbq_z_s8 (int8_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrbq_z_sv16qi ((__builtin_neon_qi *) __base, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrbq_z_s32 (int8_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrbq_z_sv4si ((__builtin_neon_qi *) __base, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrbq_z_s16 (int8_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrbq_z_sv8hi ((__builtin_neon_qi *) __base, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrbq_z_u8 (uint8_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrbq_z_uv16qi ((__builtin_neon_qi *) __base, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrbq_z_u32 (uint8_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrbq_z_uv4si ((__builtin_neon_qi *) __base, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrbq_z_u16 (uint8_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrbq_z_uv8hi ((__builtin_neon_qi *) __base, __p); -} - __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vldrwq_gather_base_z_s32 (uint32x4_t __addr, const int __offset, mve_pred16_t __p) @@ -1631,91 +720,6 @@ __arm_vldrhq_gather_shifted_offset_z_u16 (uint16_t const * __base, uint16x8_t __ return __builtin_mve_vldrhq_gather_shifted_offset_z_uv8hi ((__builtin_neon_hi *) __base, __offset, __p); } -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrhq_s32 (int16_t const * __base) -{ - return __builtin_mve_vldrhq_sv4si ((__builtin_neon_hi *) __base); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrhq_s16 
(int16_t const * __base) -{ - return __builtin_mve_vldrhq_sv8hi ((__builtin_neon_hi *) __base); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrhq_u32 (uint16_t const * __base) -{ - return __builtin_mve_vldrhq_uv4si ((__builtin_neon_hi *) __base); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrhq_u16 (uint16_t const * __base) -{ - return __builtin_mve_vldrhq_uv8hi ((__builtin_neon_hi *) __base); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrhq_z_s32 (int16_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrhq_z_sv4si ((__builtin_neon_hi *) __base, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrhq_z_s16 (int16_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrhq_z_sv8hi ((__builtin_neon_hi *) __base, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrhq_z_u32 (uint16_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrhq_z_uv4si ((__builtin_neon_hi *) __base, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrhq_z_u16 (uint16_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrhq_z_uv8hi ((__builtin_neon_hi *) __base, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrwq_s32 (int32_t const * __base) -{ - return __builtin_mve_vldrwq_sv4si ((__builtin_neon_si *) __base); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrwq_u32 (uint32_t const * __base) -{ - return __builtin_mve_vldrwq_uv4si ((__builtin_neon_si *) __base); -} - - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrwq_z_s32 (int32_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrwq_z_sv4si ((__builtin_neon_si *) __base, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrwq_z_u32 (uint32_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrwq_z_uv4si ((__builtin_neon_si *) __base, __p); -} - __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vldrdq_gather_base_s64 (uint64x2_t __addr, const int __offset) @@ -1971,90 +975,6 @@ __arm_vstrhq_scatter_shifted_offset_p_u16 (uint16_t * __base, uint16x8_t __offse __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_s32 (int16_t * __addr, int32x4_t __value) -{ - __builtin_mve_vstrhq_sv4si ((__builtin_neon_hi *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_s16 (int16_t * __addr, int16x8_t __value) -{ - __builtin_mve_vstrhq_sv8hi ((__builtin_neon_hi *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_u32 (uint16_t * __addr, uint32x4_t __value) -{ - __builtin_mve_vstrhq_uv4si ((__builtin_neon_hi *) __addr, 
__value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_u16 (uint16_t * __addr, uint16x8_t __value) -{ - __builtin_mve_vstrhq_uv8hi ((__builtin_neon_hi *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_p_s32 (int16_t * __addr, int32x4_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrhq_p_sv4si ((__builtin_neon_hi *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_p_s16 (int16_t * __addr, int16x8_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrhq_p_sv8hi ((__builtin_neon_hi *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_p_u32 (uint16_t * __addr, uint32x4_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrhq_p_uv4si ((__builtin_neon_hi *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_p_u16 (uint16_t * __addr, uint16x8_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrhq_p_uv8hi ((__builtin_neon_hi *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrwq_s32 (int32_t * __addr, int32x4_t __value) -{ - __builtin_mve_vstrwq_sv4si ((__builtin_neon_si *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrwq_u32 (uint32_t * __addr, uint32x4_t __value) -{ - __builtin_mve_vstrwq_uv4si ((__builtin_neon_si *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrwq_p_s32 (int32_t * __addr, int32x4_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrwq_p_sv4si ((__builtin_neon_si *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrwq_p_u32 (uint32_t * __addr, uint32x4_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrwq_p_uv4si ((__builtin_neon_si *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrdq_scatter_base_p_s64 (uint64x2_t __addr, const int __offset, int64x2_t __value, mve_pred16_t __p) { __builtin_mve_vstrdq_scatter_base_p_sv2di (__addr, __offset, __value, __p); @@ -2193,415 +1113,6 @@ __arm_vstrwq_scatter_shifted_offset_u32 (uint32_t * __base, uint32x4_t __offset, __builtin_mve_vstrwq_scatter_shifted_offset_uv4si ((__builtin_neon_si *) __base, __offset, __value); } -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vddupq_m_n_uv16qi (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vddupq_m_n_uv4si (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_m_n_u16 (uint16x8_t 
__inactive, uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vddupq_m_n_uv8hi (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p) -{ - uint8x16_t __res = __builtin_mve_vddupq_m_n_uv16qi (__inactive, * __a, __imm, __p); - *__a -= __imm * 16u; - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p) -{ - uint16x8_t __res = __builtin_mve_vddupq_m_n_uv8hi (__inactive, *__a, __imm, __p); - *__a -= __imm * 8u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p) -{ - uint32x4_t __res = __builtin_mve_vddupq_m_n_uv4si (__inactive, *__a, __imm, __p); - *__a -= __imm * 4u; - return __res; -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_n_u8 (uint32_t __a, const int __imm) -{ - return __builtin_mve_vddupq_n_uv16qi (__a, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_n_u32 (uint32_t __a, const int __imm) -{ - return __builtin_mve_vddupq_n_uv4si (__a, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_n_u16 (uint32_t __a, const int __imm) -{ - return __builtin_mve_vddupq_n_uv8hi (__a, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, __a, __c, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_vdwdupq_m_n_uv4si (__inactive, __a, __c, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_m_n_u16 (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, __a, __c, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint8x16_t __res = __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, *__a, __c, __imm, __p); - *__a = __builtin_mve_vdwdupq_m_wb_uv16qi (__inactive, *__a, __c, __imm, __p); - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ 
- uint64_t __c = ((uint64_t) __b) << 32; - uint32x4_t __res = __builtin_mve_vdwdupq_m_n_uv4si (__inactive, *__a, __c, __imm, __p); - *__a = __builtin_mve_vdwdupq_m_wb_uv4si (__inactive, *__a, __c, __imm, __p); - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint16x8_t __res = __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, *__a, __c, __imm, __p); - *__a = __builtin_mve_vdwdupq_m_wb_uv8hi (__inactive, *__a, __c, __imm, __p); - return __res; -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_n_u8 (uint32_t __a, uint32_t __b, const int __imm) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_vdwdupq_n_uv16qi (__a, __c, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_n_u32 (uint32_t __a, uint32_t __b, const int __imm) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_vdwdupq_n_uv4si (__a, __c, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_n_u16 (uint32_t __a, uint32_t __b, const int __imm) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_vdwdupq_n_uv8hi (__a, __c, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_wb_u8 (uint32_t * __a, uint32_t __b, const int __imm) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint8x16_t __res = __builtin_mve_vdwdupq_n_uv16qi (*__a, __c, __imm); - *__a = __builtin_mve_vdwdupq_wb_uv16qi (*__a, __c, __imm); - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_wb_u32 (uint32_t * __a, uint32_t __b, const int __imm) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint32x4_t __res = __builtin_mve_vdwdupq_n_uv4si (*__a, __c, __imm); - *__a = __builtin_mve_vdwdupq_wb_uv4si (*__a, __c, __imm); - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_wb_u16 (uint32_t * __a, uint32_t __b, const int __imm) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint16x8_t __res = __builtin_mve_vdwdupq_n_uv8hi (*__a, __c, __imm); - *__a = __builtin_mve_vdwdupq_wb_uv8hi (*__a, __c, __imm); - return __res; -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vidupq_m_n_uv16qi (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vidupq_m_n_uv4si (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_m_n_u16 (uint16x8_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vidupq_m_n_uv8hi (__inactive, __a, __imm, __p); -} - -__extension__ extern 
__inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_n_u8 (uint32_t __a, const int __imm) -{ - return __builtin_mve_vidupq_n_uv16qi (__a, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p) -{ - uint8x16_t __res = __builtin_mve_vidupq_m_n_uv16qi (__inactive, *__a, __imm, __p); - *__a += __imm * 16u; - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p) -{ - uint16x8_t __res = __builtin_mve_vidupq_m_n_uv8hi (__inactive, *__a, __imm, __p); - *__a += __imm * 8u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p) -{ - uint32x4_t __res = __builtin_mve_vidupq_m_n_uv4si (__inactive, *__a, __imm, __p); - *__a += __imm * 4u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_n_u32 (uint32_t __a, const int __imm) -{ - return __builtin_mve_vidupq_n_uv4si (__a, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_n_u16 (uint32_t __a, const int __imm) -{ - return __builtin_mve_vidupq_n_uv8hi (__a, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_wb_u8 (uint32_t * __a, const int __imm) -{ - uint8x16_t __res = __builtin_mve_vidupq_n_uv16qi (*__a, __imm); - *__a += __imm * 16u; - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_wb_u16 (uint32_t * __a, const int __imm) -{ - uint16x8_t __res = __builtin_mve_vidupq_n_uv8hi (*__a, __imm); - *__a += __imm * 8u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_wb_u32 (uint32_t * __a, const int __imm) -{ - uint32x4_t __res = __builtin_mve_vidupq_n_uv4si (*__a, __imm); - *__a += __imm * 4u; - return __res; -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_wb_u8 (uint32_t * __a, const int __imm) -{ - uint8x16_t __res = __builtin_mve_vddupq_n_uv16qi (*__a, __imm); - *__a -= __imm * 16u; - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_wb_u16 (uint32_t * __a, const int __imm) -{ - uint16x8_t __res = __builtin_mve_vddupq_n_uv8hi (*__a, __imm); - *__a -= __imm * 8u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_wb_u32 (uint32_t * __a, const int __imm) -{ - uint32x4_t __res = __builtin_mve_vddupq_n_uv4si (*__a, __imm); - *__a -= __imm * 4u; - return __res; -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - 
uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_viwdupq_m_n_uv16qi (__inactive, __a, __c, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_viwdupq_m_n_uv4si (__inactive, __a, __c, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_m_n_u16 (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_viwdupq_m_n_uv8hi (__inactive, __a, __c, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint8x16_t __res = __builtin_mve_viwdupq_m_n_uv16qi (__inactive, *__a, __c, __imm, __p); - *__a = __builtin_mve_viwdupq_m_wb_uv16qi (__inactive, *__a, __c, __imm, __p); - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint32x4_t __res = __builtin_mve_viwdupq_m_n_uv4si (__inactive, *__a, __c, __imm, __p); - *__a = __builtin_mve_viwdupq_m_wb_uv4si (__inactive, *__a, __c, __imm, __p); - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint16x8_t __res = __builtin_mve_viwdupq_m_n_uv8hi (__inactive, *__a, __c, __imm, __p); - *__a = __builtin_mve_viwdupq_m_wb_uv8hi (__inactive, *__a, __c, __imm, __p); - return __res; -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_n_u8 (uint32_t __a, uint32_t __b, const int __imm) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_viwdupq_n_uv16qi (__a, __c, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_n_u32 (uint32_t __a, uint32_t __b, const int __imm) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_viwdupq_n_uv4si (__a, __c, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_n_u16 (uint32_t __a, uint32_t __b, const int __imm) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_viwdupq_n_uv8hi (__a, __c, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_wb_u8 (uint32_t * __a, uint32_t __b, const int __imm) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint8x16_t __res = __builtin_mve_viwdupq_n_uv16qi (*__a, __c, __imm); - *__a = __builtin_mve_viwdupq_wb_uv16qi (*__a, __c, __imm); - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
-__arm_viwdupq_wb_u32 (uint32_t * __a, uint32_t __b, const int __imm) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint32x4_t __res = __builtin_mve_viwdupq_n_uv4si (*__a, __c, __imm); - *__a = __builtin_mve_viwdupq_wb_uv4si (*__a, __c, __imm); - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_wb_u16 (uint32_t * __a, uint32_t __b, const int __imm) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint16x8_t __res = __builtin_mve_viwdupq_n_uv8hi (*__a, __c, __imm); - *__a = __builtin_mve_viwdupq_wb_uv8hi (*__a, __c, __imm); - return __res; -} - - __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vldrdq_gather_base_wb_s64 (uint64x2_t * __addr, const int __offset) @@ -2738,472 +1249,6 @@ __arm_vstrwq_scatter_base_wb_u32 (uint32x4_t * __addr, const int __offset, uint3 *__addr = __builtin_mve_vstrwq_scatter_base_wb_uv4si (*__addr, __offset, __value); } -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_x_n_u8 (uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vddupq_m_n_uv16qi (__arm_vuninitializedq_u8 (), __a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_x_n_u16 (uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vddupq_m_n_uv8hi (__arm_vuninitializedq_u16 (), __a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_x_n_u32 (uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vddupq_m_n_uv4si (__arm_vuninitializedq_u32 (), __a, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_x_wb_u8 (uint32_t *__a, const int __imm, mve_pred16_t __p) -{ - uint8x16_t __arg1 = __arm_vuninitializedq_u8 (); - uint8x16_t __res = __builtin_mve_vddupq_m_n_uv16qi (__arg1, * __a, __imm, __p); - *__a -= __imm * 16u; - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_x_wb_u16 (uint32_t *__a, const int __imm, mve_pred16_t __p) -{ - uint16x8_t __arg1 = __arm_vuninitializedq_u16 (); - uint16x8_t __res = __builtin_mve_vddupq_m_n_uv8hi (__arg1, *__a, __imm, __p); - *__a -= __imm * 8u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_x_wb_u32 (uint32_t *__a, const int __imm, mve_pred16_t __p) -{ - uint32x4_t __arg1 = __arm_vuninitializedq_u32 (); - uint32x4_t __res = __builtin_mve_vddupq_m_n_uv4si (__arg1, *__a, __imm, __p); - *__a -= __imm * 4u; - return __res; -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_x_n_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_vdwdupq_m_n_uv16qi (__arm_vuninitializedq_u8 (), __a, __c, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_x_n_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return 
__builtin_mve_vdwdupq_m_n_uv8hi (__arm_vuninitializedq_u16 (), __a, __c, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_x_n_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_vdwdupq_m_n_uv4si (__arm_vuninitializedq_u32 (), __a, __c, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_x_wb_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint8x16_t __arg1 = __arm_vuninitializedq_u8 (); - uint8x16_t __res = __builtin_mve_vdwdupq_m_n_uv16qi (__arg1, *__a, __c, __imm, __p); - *__a = __builtin_mve_vdwdupq_m_wb_uv16qi (__arg1, *__a, __c, __imm, __p); - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_x_wb_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint16x8_t __arg1 = __arm_vuninitializedq_u16 (); - uint16x8_t __res = __builtin_mve_vdwdupq_m_n_uv8hi (__arg1, *__a, __c, __imm, __p); - *__a = __builtin_mve_vdwdupq_m_wb_uv8hi (__arg1, *__a, __c, __imm, __p); - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_x_wb_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint32x4_t __arg1 = __arm_vuninitializedq_u32 (); - uint32x4_t __res = __builtin_mve_vdwdupq_m_n_uv4si (__arg1, *__a, __c, __imm, __p); - *__a = __builtin_mve_vdwdupq_m_wb_uv4si (__arg1, *__a, __c, __imm, __p); - return __res; -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_x_n_u8 (uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vidupq_m_n_uv16qi (__arm_vuninitializedq_u8 (), __a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_x_n_u16 (uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vidupq_m_n_uv8hi (__arm_vuninitializedq_u16 (), __a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_x_n_u32 (uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __builtin_mve_vidupq_m_n_uv4si (__arm_vuninitializedq_u32 (), __a, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_x_wb_u8 (uint32_t *__a, const int __imm, mve_pred16_t __p) -{ - uint8x16_t __arg1 = __arm_vuninitializedq_u8 (); - uint8x16_t __res = __builtin_mve_vidupq_m_n_uv16qi (__arg1, *__a, __imm, __p); - *__a += __imm * 16u; - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_x_wb_u16 (uint32_t *__a, const int __imm, mve_pred16_t __p) -{ - uint16x8_t __arg1 = __arm_vuninitializedq_u16 (); - uint16x8_t __res = __builtin_mve_vidupq_m_n_uv8hi (__arg1, *__a, __imm, __p); - *__a += __imm * 8u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
-__arm_vidupq_x_wb_u32 (uint32_t *__a, const int __imm, mve_pred16_t __p) -{ - uint32x4_t __arg1 = __arm_vuninitializedq_u32 (); - uint32x4_t __res = __builtin_mve_vidupq_m_n_uv4si (__arg1, *__a, __imm, __p); - *__a += __imm * 4u; - return __res; -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_x_n_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_viwdupq_m_n_uv16qi (__arm_vuninitializedq_u8 (), __a, __c, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_x_n_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_viwdupq_m_n_uv8hi (__arm_vuninitializedq_u16 (), __a, __c, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_x_n_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - return __builtin_mve_viwdupq_m_n_uv4si (__arm_vuninitializedq_u32 (), __a, __c, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_x_wb_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint8x16_t __arg1 = __arm_vuninitializedq_u8 (); - uint8x16_t __res = __builtin_mve_viwdupq_m_n_uv16qi (__arg1, *__a, __c, __imm, __p); - *__a = __builtin_mve_viwdupq_m_wb_uv16qi (__arg1, *__a, __c, __imm, __p); - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_x_wb_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint16x8_t __arg1 = __arm_vuninitializedq_u16 (); - uint16x8_t __res = __builtin_mve_viwdupq_m_n_uv8hi (__arg1, *__a, __c, __imm, __p); - *__a = __builtin_mve_viwdupq_m_wb_uv8hi (__arg1, *__a, __c, __imm, __p); - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_x_wb_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - uint64_t __c = ((uint64_t) __b) << 32; - uint32x4_t __arg1 = __arm_vuninitializedq_u32 (); - uint32x4_t __res = __builtin_mve_viwdupq_m_n_uv4si (__arg1, *__a, __c, __imm, __p); - *__a = __builtin_mve_viwdupq_m_wb_uv4si (__arg1, *__a, __c, __imm, __p); - return __res; -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x_s8 (int8x16_t __a, int8x16_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_sv16qi (__arm_vuninitializedq_s8 (), __a, __b, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x_s16 (int16x8_t __a, int16x8_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_sv8hi (__arm_vuninitializedq_s16 (), __a, __b, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x_s32 (int32x4_t __a, int32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_sv4si (__arm_vuninitializedq_s32 (), __a, __b, __p); -} - -__extension__ extern __inline 
uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x_u8 (uint8x16_t __a, uint8x16_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_uv16qi (__arm_vuninitializedq_u8 (), __a, __b, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x_u16 (uint16x8_t __a, uint16x8_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_uv8hi (__arm_vuninitializedq_u16 (), __a, __b, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x_u32 (uint32x4_t __a, uint32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_uv4si (__arm_vuninitializedq_u32 (), __a, __b, __p); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x_s8 (int8x16_t __a, int8x16_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_sv16qi (__arm_vuninitializedq_s8 (), __a, __b, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x_s16 (int16x8_t __a, int16x8_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_sv8hi (__arm_vuninitializedq_s16 (), __a, __b, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x_s32 (int32x4_t __a, int32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_sv4si (__arm_vuninitializedq_s32 (), __a, __b, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x_u8 (uint8x16_t __a, uint8x16_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_uv16qi (__arm_vuninitializedq_u8 (), __a, __b, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x_u16 (uint16x8_t __a, uint16x8_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_uv8hi (__arm_vuninitializedq_u16 (), __a, __b, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x_u32 (uint32x4_t __a, uint32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_uv4si (__arm_vuninitializedq_u32 (), __a, __b, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadciq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry_out) -{ - int32x4_t __res = __builtin_mve_vadciq_sv4si (__a, __b); - *__carry_out = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadciq_u32 (uint32x4_t __a, uint32x4_t __b, unsigned * __carry_out) -{ - uint32x4_t __res = __builtin_mve_vadciq_uv4si (__a, __b); - *__carry_out = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadciq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry_out, mve_pred16_t __p) -{ - int32x4_t __res = __builtin_mve_vadciq_m_sv4si (__inactive, __a, __b, __p); - *__carry_out = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadciq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry_out, mve_pred16_t __p) -{ - uint32x4_t __res = __builtin_mve_vadciq_m_uv4si (__inactive, __a, __b, __p); - *__carry_out = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry) -{ - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); - int32x4_t __res = __builtin_mve_vadcq_sv4si (__a, __b); - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq_u32 (uint32x4_t __a, uint32x4_t __b, unsigned * __carry) -{ - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); - uint32x4_t __res = __builtin_mve_vadcq_uv4si (__a, __b); - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p) -{ - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); - int32x4_t __res = __builtin_mve_vadcq_m_sv4si (__inactive, __a, __b, __p); - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p) -{ - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); - uint32x4_t __res = __builtin_mve_vadcq_m_uv4si (__inactive, __a, __b, __p); - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbciq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry_out) -{ - int32x4_t __res = __builtin_mve_vsbciq_sv4si (__a, __b); - *__carry_out = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbciq_u32 (uint32x4_t __a, uint32x4_t __b, unsigned * __carry_out) -{ - uint32x4_t __res = __builtin_mve_vsbciq_uv4si (__a, __b); - *__carry_out = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbciq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry_out, mve_pred16_t __p) -{ - int32x4_t __res = __builtin_mve_vsbciq_m_sv4si (__inactive, __a, __b, __p); - *__carry_out = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbciq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry_out, mve_pred16_t __p) -{ - uint32x4_t __res = 
__builtin_mve_vsbciq_m_uv4si (__inactive, __a, __b, __p); - *__carry_out = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbcq_s32 (int32x4_t __a, int32x4_t __b, unsigned * __carry) -{ - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); - int32x4_t __res = __builtin_mve_vsbcq_sv4si (__a, __b); - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbcq_u32 (uint32x4_t __a, uint32x4_t __b, unsigned * __carry) -{ - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); - uint32x4_t __res = __builtin_mve_vsbcq_uv4si (__a, __b); - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbcq_m_s32 (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p) -{ - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); - int32x4_t __res = __builtin_mve_vsbcq_m_sv4si (__inactive, __a, __b, __p); - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbcq_m_u32 (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p) -{ - __builtin_arm_set_fpscr_nzcvqc((__builtin_arm_get_fpscr_nzcvqc () & ~0x20000000u) | ((*__carry & 0x1u) << 29)); - uint32x4_t __res = __builtin_mve_vsbcq_m_uv4si (__inactive, __a, __b, __p); - *__carry = (__builtin_arm_get_fpscr_nzcvqc () >> 29) & 0x1u; - return __res; -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p_u8 (uint8_t * __addr, uint8x16_t __value, mve_pred16_t __p) -{ - return __arm_vstrbq_p_u8 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p_s8 (int8_t * __addr, int8x16_t __value, mve_pred16_t __p) -{ - return __arm_vstrbq_p_s8 (__addr, __value, __p); -} - __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vst2q_s8 (int8_t * __addr, int8x16x2_t __value) @@ -3222,20 +1267,6 @@ __arm_vst2q_u8 (uint8_t * __addr, uint8x16x2_t __value) __builtin_mve_vst2qv16qi ((__builtin_neon_qi *) __addr, __rv.__o); } -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z_u8 (uint8_t const *__base, mve_pred16_t __p) -{ - return __arm_vldrbq_z_u8 ( __base, __p); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z_s8 (int8_t const *__base, mve_pred16_t __p) -{ - return __arm_vldrbq_z_s8 ( __base, __p); -} - __extension__ extern __inline int8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vld2q_s8 (int8_t const * __addr) @@ -3274,20 +1305,6 @@ __arm_vld4q_u8 (uint8_t const * __addr) __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -__arm_vst1q_p_u16 (uint16_t * __addr, uint16x8_t __value, mve_pred16_t __p) -{ - return __arm_vstrhq_p_u16 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p_s16 (int16_t * __addr, int16x8_t __value, mve_pred16_t __p) -{ - return __arm_vstrhq_p_s16 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vst2q_s16 (int16_t * __addr, int16x8x2_t __value) { union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv; @@ -3304,20 +1321,6 @@ __arm_vst2q_u16 (uint16_t * __addr, uint16x8x2_t __value) __builtin_mve_vst2qv8hi ((__builtin_neon_hi *) __addr, __rv.__o); } -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z_u16 (uint16_t const *__base, mve_pred16_t __p) -{ - return __arm_vldrhq_z_u16 ( __base, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z_s16 (int16_t const *__base, mve_pred16_t __p) -{ - return __arm_vldrhq_z_s16 ( __base, __p); -} - __extension__ extern __inline int16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vld2q_s16 (int16_t const * __addr) @@ -3356,20 +1359,6 @@ __arm_vld4q_u16 (uint16_t const * __addr) __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p_u32 (uint32_t * __addr, uint32x4_t __value, mve_pred16_t __p) -{ - return __arm_vstrwq_p_u32 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p_s32 (int32_t * __addr, int32x4_t __value, mve_pred16_t __p) -{ - return __arm_vstrwq_p_s32 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vst2q_s32 (int32_t * __addr, int32x4x2_t __value) { union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv; @@ -3386,20 +1375,6 @@ __arm_vst2q_u32 (uint32_t * __addr, uint32x4x2_t __value) __builtin_mve_vst2qv4si ((__builtin_neon_si *) __addr, __rv.__o); } -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z_u32 (uint32_t const *__base, mve_pred16_t __p) -{ - return __arm_vldrwq_z_u32 ( __base, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z_s32 (int32_t const *__base, mve_pred16_t __p) -{ - return __arm_vldrwq_z_s32 ( __base, __p); -} - __extension__ extern __inline int32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vld2q_s32 (int32_t const * __addr) @@ -3684,60 +1659,6 @@ __arm_srshr (int32_t value, const int shift) return __builtin_mve_srshr_si (value, shift); } -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_m_s8 (int8x16_t __a, uint32_t * __b, const int __imm, mve_pred16_t __p) -{ - int8x16_t __res = __builtin_mve_vshlcq_m_vec_sv16qi (__a, *__b, __imm, __p); - *__b = __builtin_mve_vshlcq_m_carry_sv16qi (__a, *__b, __imm, __p); - return __res; -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_m_u8 (uint8x16_t __a, uint32_t * __b, const int __imm, mve_pred16_t 
__p) -{ - uint8x16_t __res = __builtin_mve_vshlcq_m_vec_uv16qi (__a, *__b, __imm, __p); - *__b = __builtin_mve_vshlcq_m_carry_uv16qi (__a, *__b, __imm, __p); - return __res; -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_m_s16 (int16x8_t __a, uint32_t * __b, const int __imm, mve_pred16_t __p) -{ - int16x8_t __res = __builtin_mve_vshlcq_m_vec_sv8hi (__a, *__b, __imm, __p); - *__b = __builtin_mve_vshlcq_m_carry_sv8hi (__a, *__b, __imm, __p); - return __res; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_m_u16 (uint16x8_t __a, uint32_t * __b, const int __imm, mve_pred16_t __p) -{ - uint16x8_t __res = __builtin_mve_vshlcq_m_vec_uv8hi (__a, *__b, __imm, __p); - *__b = __builtin_mve_vshlcq_m_carry_uv8hi (__a, *__b, __imm, __p); - return __res; -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_m_s32 (int32x4_t __a, uint32_t * __b, const int __imm, mve_pred16_t __p) -{ - int32x4_t __res = __builtin_mve_vshlcq_m_vec_sv4si (__a, *__b, __imm, __p); - *__b = __builtin_mve_vshlcq_m_carry_sv4si (__a, *__b, __imm, __p); - return __res; -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_m_u32 (uint32x4_t __a, uint32_t * __b, const int __imm, mve_pred16_t __p) -{ - uint32x4_t __res = __builtin_mve_vshlcq_m_vec_uv4si (__a, *__b, __imm, __p); - *__b = __builtin_mve_vshlcq_m_carry_uv4si (__a, *__b, __imm, __p); - return __res; -} - #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point. */ __extension__ extern __inline void @@ -3758,595 +1679,6 @@ __arm_vst4q_f32 (float32_t * __addr, float32x4x4_t __value) __builtin_mve_vst4qv4sf (__addr, __rv.__o); } -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvttq_f32_f16 (float16x8_t __a) -{ - return __builtin_mve_vcvttq_f32_f16v4sf (__a); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtbq_f32_f16 (float16x8_t __a) -{ - return __builtin_mve_vcvtbq_f32_f16v4sf (__a); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_f16_s16 (int16x8_t __a) -{ - return __builtin_mve_vcvtq_to_f_sv8hf (__a); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_f32_s32 (int32x4_t __a) -{ - return __builtin_mve_vcvtq_to_f_sv4sf (__a); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_f16_u16 (uint16x8_t __a) -{ - return __builtin_mve_vcvtq_to_f_uv8hf (__a); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_f32_u32 (uint32x4_t __a) -{ - return __builtin_mve_vcvtq_to_f_uv4sf (__a); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_s16_f16 (float16x8_t __a) -{ - return __builtin_mve_vcvtq_from_f_sv8hi (__a); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_s32_f32 (float32x4_t __a) -{ - return __builtin_mve_vcvtq_from_f_sv4si (__a); -} - -__extension__ extern __inline 
uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_u16_f16 (float16x8_t __a) -{ - return __builtin_mve_vcvtq_from_f_uv8hi (__a); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_u32_f32 (float32x4_t __a) -{ - return __builtin_mve_vcvtq_from_f_uv4si (__a); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_u16_f16 (float16x8_t __a) -{ - return __builtin_mve_vcvtpq_uv8hi (__a); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_u32_f32 (float32x4_t __a) -{ - return __builtin_mve_vcvtpq_uv4si (__a); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_u16_f16 (float16x8_t __a) -{ - return __builtin_mve_vcvtnq_uv8hi (__a); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_u32_f32 (float32x4_t __a) -{ - return __builtin_mve_vcvtnq_uv4si (__a); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_u16_f16 (float16x8_t __a) -{ - return __builtin_mve_vcvtmq_uv8hi (__a); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_u32_f32 (float32x4_t __a) -{ - return __builtin_mve_vcvtmq_uv4si (__a); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_u16_f16 (float16x8_t __a) -{ - return __builtin_mve_vcvtaq_uv8hi (__a); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_u32_f32 (float32x4_t __a) -{ - return __builtin_mve_vcvtaq_uv4si (__a); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_s16_f16 (float16x8_t __a) -{ - return __builtin_mve_vcvtaq_sv8hi (__a); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_s32_f32 (float32x4_t __a) -{ - return __builtin_mve_vcvtaq_sv4si (__a); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_s16_f16 (float16x8_t __a) -{ - return __builtin_mve_vcvtnq_sv8hi (__a); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_s32_f32 (float32x4_t __a) -{ - return __builtin_mve_vcvtnq_sv4si (__a); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_s16_f16 (float16x8_t __a) -{ - return __builtin_mve_vcvtpq_sv8hi (__a); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_s32_f32 (float32x4_t __a) -{ - return __builtin_mve_vcvtpq_sv4si (__a); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_s16_f16 (float16x8_t __a) -{ - return __builtin_mve_vcvtmq_sv8hi (__a); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_s32_f32 (float32x4_t 
__a) -{ - return __builtin_mve_vcvtmq_sv4si (__a); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_n_f16_s16 (int16x8_t __a, const int __imm6) -{ - return __builtin_mve_vcvtq_n_to_f_sv8hf (__a, __imm6); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_n_f32_s32 (int32x4_t __a, const int __imm6) -{ - return __builtin_mve_vcvtq_n_to_f_sv4sf (__a, __imm6); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_n_f16_u16 (uint16x8_t __a, const int __imm6) -{ - return __builtin_mve_vcvtq_n_to_f_uv8hf (__a, __imm6); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_n_f32_u32 (uint32x4_t __a, const int __imm6) -{ - return __builtin_mve_vcvtq_n_to_f_uv4sf (__a, __imm6); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_n_s16_f16 (float16x8_t __a, const int __imm6) -{ - return __builtin_mve_vcvtq_n_from_f_sv8hi (__a, __imm6); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_n_s32_f32 (float32x4_t __a, const int __imm6) -{ - return __builtin_mve_vcvtq_n_from_f_sv4si (__a, __imm6); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_n_u16_f16 (float16x8_t __a, const int __imm6) -{ - return __builtin_mve_vcvtq_n_from_f_uv8hi (__a, __imm6); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_n_u32_f32 (float32x4_t __a, const int __imm6) -{ - return __builtin_mve_vcvtq_n_from_f_uv4si (__a, __imm6); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_f16 (float16x8_t __a, float16x8_t __b) -{ - return __builtin_mve_vornq_fv8hf (__a, __b); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_f16 (float16x8_t __a, float16x8_t __b) -{ - return __builtin_mve_vbicq_fv8hf (__a, __b); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_f32 (float32x4_t __a, float32x4_t __b) -{ - return __builtin_mve_vornq_fv4sf (__a, __b); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_f32 (float32x4_t __a, float32x4_t __b) -{ - return __builtin_mve_vbicq_fv4sf (__a, __b); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvttq_f16_f32 (float16x8_t __a, float32x4_t __b) -{ - return __builtin_mve_vcvttq_f16_f32v8hf (__a, __b); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtbq_f16_f32 (float16x8_t __a, float32x4_t __b) -{ - return __builtin_mve_vcvtbq_f16_f32v8hf (__a, __b); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_m_s16_f16 (int16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtaq_m_sv8hi (__inactive, __a, __p); -} - -__extension__ 
extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_m_u16_f16 (uint16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtaq_m_uv8hi (__inactive, __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_m_s32_f32 (int32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtaq_m_sv4si (__inactive, __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_m_u32_f32 (uint32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtaq_m_uv4si (__inactive, __a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_f16_s16 (float16x8_t __inactive, int16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_to_f_sv8hf (__inactive, __a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_f16_u16 (float16x8_t __inactive, uint16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_to_f_uv8hf (__inactive, __a, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_f32_s32 (float32x4_t __inactive, int32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_to_f_sv4sf (__inactive, __a, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_f32_u32 (float32x4_t __inactive, uint32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_to_f_uv4sf (__inactive, __a, __p); -} - - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtbq_m_f16_f32 (float16x8_t __a, float32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vcvtbq_m_f16_f32v8hf (__a, __b, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtbq_m_f32_f16 (float32x4_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtbq_m_f32_f16v4sf (__inactive, __a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvttq_m_f16_f32 (float16x8_t __a, float32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vcvttq_m_f16_f32v8hf (__a, __b, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvttq_m_f32_f16 (float32x4_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvttq_m_f32_f16v4sf (__inactive, __a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_m_s16_f16 (int16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtmq_m_sv8hi (__inactive, __a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_m_s16_f16 (int16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtnq_m_sv8hi (__inactive, __a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
-__arm_vcvtpq_m_s16_f16 (int16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtpq_m_sv8hi (__inactive, __a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_s16_f16 (int16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_from_f_sv8hi (__inactive, __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_m_u16_f16 (uint16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtmq_m_uv8hi (__inactive, __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_m_u16_f16 (uint16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtnq_m_uv8hi (__inactive, __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_m_u16_f16 (uint16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtpq_m_uv8hi (__inactive, __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_u16_f16 (uint16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_from_f_uv8hi (__inactive, __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_m_s32_f32 (int32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtmq_m_sv4si (__inactive, __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_m_s32_f32 (int32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtnq_m_sv4si (__inactive, __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_m_s32_f32 (int32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtpq_m_sv4si (__inactive, __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_s32_f32 (int32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_from_f_sv4si (__inactive, __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_m_u32_f32 (uint32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtmq_m_uv4si (__inactive, __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_m_u32_f32 (uint32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtnq_m_uv4si (__inactive, __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_m_u32_f32 (uint32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtpq_m_uv4si (__inactive, __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_u32_f32 (uint32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return 
__builtin_mve_vcvtq_m_from_f_uv4si (__inactive, __a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n_f16_u16 (float16x8_t __inactive, uint16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_to_f_uv8hf (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n_f16_s16 (float16x8_t __inactive, int16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_to_f_sv8hf (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n_f32_u32 (float32x4_t __inactive, uint32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_to_f_uv4sf (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n_f32_s32 (float32x4_t __inactive, int32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_to_f_sv4sf (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_f32 (float32x4_t __inactive, float32x4_t __a, float32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_fv4sf (__inactive, __a, __b, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_f16 (float16x8_t __inactive, float16x8_t __a, float16x8_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_fv8hf (__inactive, __a, __b, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n_s32_f32 (int32x4_t __inactive, float32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_from_f_sv4si (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n_s16_f16 (int16x8_t __inactive, float16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_from_f_sv8hi (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n_u32_f32 (uint32x4_t __inactive, float32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_from_f_uv4si (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n_u16_f16 (uint16x8_t __inactive, float16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_from_f_uv8hi (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m_f32 (float32x4_t __inactive, float32x4_t __a, float32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_fv4sf (__inactive, __a, __b, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m_f16 (float16x8_t __inactive, float16x8_t __a, float16x8_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_fv8hf (__inactive, 
__a, __b, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrwq_f32 (float32_t const * __base) -{ - return __builtin_mve_vldrwq_fv4sf((__builtin_neon_si *) __base); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrwq_z_f32 (float32_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrwq_z_fv4sf((__builtin_neon_si *) __base, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrhq_z_f16 (float16_t const * __base, mve_pred16_t __p) -{ - return __builtin_mve_vldrhq_z_fv8hf((__builtin_neon_hi *) __base, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vldrhq_f16 (float16_t const * __base) -{ - return __builtin_mve_vldrhq_fv8hf((__builtin_neon_hi *) __base); -} - __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vldrhq_gather_offset_f16 (float16_t const * __base, uint16x8_t __offset) @@ -4419,34 +1751,6 @@ __arm_vldrwq_gather_shifted_offset_z_f32 (float32_t const * __base, uint32x4_t _ __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrwq_p_f32 (float32_t * __addr, float32x4_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrwq_p_fv4sf ((__builtin_neon_si *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrwq_f32 (float32_t * __addr, float32x4_t __value) -{ - __builtin_mve_vstrwq_fv4sf ((__builtin_neon_si *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_f16 (float16_t * __addr, float16x8_t __value) -{ - __builtin_mve_vstrhq_fv8hf ((__builtin_neon_hi *) __addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_p_f16 (float16_t * __addr, float16x8_t __value, mve_pred16_t __p) -{ - __builtin_mve_vstrhq_p_fv8hf ((__builtin_neon_hi *) __addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrhq_scatter_offset_f16 (float16_t * __base, uint16x8_t __offset, float16x8_t __value) { __builtin_mve_vstrhq_scatter_offset_fv8hf ((__builtin_neon_hi *) __base, __offset, __value); @@ -4549,272 +1853,6 @@ __arm_vstrwq_scatter_base_wb_p_f32 (uint32x4_t * __addr, const int __offset, flo *__addr = __builtin_mve_vstrwq_scatter_base_wb_p_fv4sf (*__addr, __offset, __value, __p); } -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_x_s16_f16 (float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtaq_m_sv8hi (__arm_vuninitializedq_s16 (), __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_x_s32_f32 (float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtaq_m_sv4si (__arm_vuninitializedq_s32 (), __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_x_u16_f16 (float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtaq_m_uv8hi 
(__arm_vuninitializedq_u16 (), __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_x_u32_f32 (float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtaq_m_uv4si (__arm_vuninitializedq_u32 (), __a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_x_s16_f16 (float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtnq_m_sv8hi (__arm_vuninitializedq_s16 (), __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_x_s32_f32 (float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtnq_m_sv4si (__arm_vuninitializedq_s32 (), __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_x_u16_f16 (float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtnq_m_uv8hi (__arm_vuninitializedq_u16 (), __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_x_u32_f32 (float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtnq_m_uv4si (__arm_vuninitializedq_u32 (), __a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_x_s16_f16 (float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtpq_m_sv8hi (__arm_vuninitializedq_s16 (), __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_x_s32_f32 (float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtpq_m_sv4si (__arm_vuninitializedq_s32 (), __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_x_u16_f16 (float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtpq_m_uv8hi (__arm_vuninitializedq_u16 (), __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_x_u32_f32 (float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtpq_m_uv4si (__arm_vuninitializedq_u32 (), __a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_x_s16_f16 (float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtmq_m_sv8hi (__arm_vuninitializedq_s16 (), __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_x_s32_f32 (float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtmq_m_sv4si (__arm_vuninitializedq_s32 (), __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_x_u16_f16 (float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtmq_m_uv8hi (__arm_vuninitializedq_u16 (), __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_x_u32_f32 (float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtmq_m_uv4si (__arm_vuninitializedq_u32 (), __a, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtbq_x_f32_f16 
(float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtbq_m_f32_f16v4sf (__arm_vuninitializedq_f32 (), __a, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvttq_x_f32_f16 (float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvttq_m_f32_f16v4sf (__arm_vuninitializedq_f32 (), __a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_f16_u16 (uint16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_to_f_uv8hf (__arm_vuninitializedq_f16 (), __a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_f16_s16 (int16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_to_f_sv8hf (__arm_vuninitializedq_f16 (), __a, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_f32_s32 (int32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_to_f_sv4sf (__arm_vuninitializedq_f32 (), __a, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_f32_u32 (uint32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_to_f_uv4sf (__arm_vuninitializedq_f32 (), __a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_n_f16_s16 (int16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_to_f_sv8hf (__arm_vuninitializedq_f16 (), __a, __imm6, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_n_f16_u16 (uint16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_to_f_uv8hf (__arm_vuninitializedq_f16 (), __a, __imm6, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_n_f32_s32 (int32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_to_f_sv4sf (__arm_vuninitializedq_f32 (), __a, __imm6, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_n_f32_u32 (uint32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_to_f_uv4sf (__arm_vuninitializedq_f32 (), __a, __imm6, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_s16_f16 (float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_from_f_sv8hi (__arm_vuninitializedq_s16 (), __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_s32_f32 (float32x4_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_from_f_sv4si (__arm_vuninitializedq_s32 (), __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_u16_f16 (float16x8_t __a, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_from_f_uv8hi (__arm_vuninitializedq_u16 (), __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_u32_f32 (float32x4_t __a, 
mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_from_f_uv4si (__arm_vuninitializedq_u32 (), __a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_n_s16_f16 (float16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_from_f_sv8hi (__arm_vuninitializedq_s16 (), __a, __imm6, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_n_s32_f32 (float32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_from_f_sv4si (__arm_vuninitializedq_s32 (), __a, __imm6, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_n_u16_f16 (float16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_from_f_uv8hi (__arm_vuninitializedq_u16 (), __a, __imm6, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_n_u32_f32 (float32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __builtin_mve_vcvtq_m_n_from_f_uv4si (__arm_vuninitializedq_u32 (), __a, __imm6, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x_f16 (float16x8_t __a, float16x8_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_fv8hf (__arm_vuninitializedq_f16 (), __a, __b, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x_f32 (float32x4_t __a, float32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vbicq_m_fv4sf (__arm_vuninitializedq_f32 (), __a, __b, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x_f16 (float16x8_t __a, float16x8_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_fv8hf (__arm_vuninitializedq_f16 (), __a, __b, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x_f32 (float32x4_t __a, float32x4_t __b, mve_pred16_t __p) -{ - return __builtin_mve_vornq_m_fv4sf (__arm_vuninitializedq_f32 (), __a, __b, __p); -} - __extension__ extern __inline float16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vld4q_f16 (float16_t const * __addr) @@ -4833,13 +1871,6 @@ __arm_vld2q_f16 (float16_t const * __addr) return __rv.__i; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z_f16 (float16_t const *__base, mve_pred16_t __p) -{ - return __arm_vldrhq_z_f16 (__base, __p); -} - __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vst2q_f16 (float16_t * __addr, float16x8x2_t __value) @@ -4849,13 +1880,6 @@ __arm_vst2q_f16 (float16_t * __addr, float16x8x2_t __value) __builtin_mve_vst2qv8hf (__addr, __rv.__o); } -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p_f16 (float16_t * __addr, float16x8_t __value, mve_pred16_t __p) -{ - return __arm_vstrhq_p_f16 (__addr, __value, __p); -} - __extension__ extern __inline float32x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vld4q_f32 (float32_t const * __addr) @@ 
-4874,13 +1898,6 @@ __arm_vld2q_f32 (float32_t const * __addr) return __rv.__i; } -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z_f32 (float32_t const *__base, mve_pred16_t __p) -{ - return __arm_vldrwq_z_f32 (__base, __p); -} - __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vst2q_f32 (float32_t * __addr, float32x4x2_t __value) @@ -4890,13 +1907,6 @@ __arm_vst2q_f32 (float32_t * __addr, float32x4x2_t __value) __builtin_mve_vst2qv4sf (__addr, __rv.__o); } -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p_f32 (float32_t * __addr, float32x4_t __value, mve_pred16_t __p) -{ - return __arm_vstrwq_p_f32 (__addr, __value, __p); -} - __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vsetq_lane_f16 (float16_t __a, float16x8_t __b, const int __idx) @@ -4975,272 +1985,6 @@ __arm_vst4q (uint32_t * __addr, uint32x4x4_t __value) __arm_vst4q_u32 (__addr, __value); } -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq (uint8x16_t __a, uint8x16_t __b) -{ - return __arm_vornq_u8 (__a, __b); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq (uint8x16_t __a, uint8x16_t __b) -{ - return __arm_vbicq_u8 (__a, __b); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq (int8x16_t __a, int8x16_t __b) -{ - return __arm_vornq_s8 (__a, __b); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq (int8x16_t __a, int8x16_t __b) -{ - return __arm_vbicq_s8 (__a, __b); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq (uint16x8_t __a, uint16x8_t __b) -{ - return __arm_vornq_u16 (__a, __b); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq (uint16x8_t __a, uint16x8_t __b) -{ - return __arm_vbicq_u16 (__a, __b); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq (int16x8_t __a, int16x8_t __b) -{ - return __arm_vornq_s16 (__a, __b); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq (int16x8_t __a, int16x8_t __b) -{ - return __arm_vbicq_s16 (__a, __b); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq (uint32x4_t __a, uint32x4_t __b) -{ - return __arm_vornq_u32 (__a, __b); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq (uint32x4_t __a, uint32x4_t __b) -{ - return __arm_vbicq_u32 (__a, __b); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq (int32x4_t __a, int32x4_t __b) -{ - return __arm_vornq_s32 (__a, __b); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq (int32x4_t __a, int32x4_t __b) -{ - return __arm_vbicq_s32 (__a, __b); -} - 
-__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq (uint16x8_t __a, const int __imm) -{ - return __arm_vbicq_n_u16 (__a, __imm); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq (int16x8_t __a, const int __imm) -{ - return __arm_vbicq_n_s16 (__a, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq (uint32x4_t __a, const int __imm) -{ - return __arm_vbicq_n_u32 (__a, __imm); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq (int32x4_t __a, const int __imm) -{ - return __arm_vbicq_n_s32 (__a, __imm); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_n (int16x8_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vbicq_m_n_s16 (__a, __imm, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_n (int32x4_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vbicq_m_n_s32 (__a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_n (uint16x8_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vbicq_m_n_u16 (__a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m_n (uint32x4_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vbicq_m_n_u32 (__a, __imm, __p); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq (int8x16_t __a, uint32_t * __b, const int __imm) -{ - return __arm_vshlcq_s8 (__a, __b, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq (uint8x16_t __a, uint32_t * __b, const int __imm) -{ - return __arm_vshlcq_u8 (__a, __b, __imm); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq (int16x8_t __a, uint32_t * __b, const int __imm) -{ - return __arm_vshlcq_s16 (__a, __b, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq (uint16x8_t __a, uint32_t * __b, const int __imm) -{ - return __arm_vshlcq_u16 (__a, __b, __imm); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq (int32x4_t __a, uint32_t * __b, const int __imm) -{ - return __arm_vshlcq_s32 (__a, __b, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq (uint32x4_t __a, uint32_t * __b, const int __imm) -{ - return __arm_vshlcq_u32 (__a, __b, __imm); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m (int8x16_t __inactive, int8x16_t __a, int8x16_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_m_s8 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m (int32x4_t __inactive, int32x4_t __a, 
int32x4_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_m_s32 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m (int16x8_t __inactive, int16x8_t __a, int16x8_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_m_s16 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m (uint8x16_t __inactive, uint8x16_t __a, uint8x16_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_m_u8 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_m_u32 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m (uint16x8_t __inactive, uint16x8_t __a, uint16x8_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_m_u16 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m (int8x16_t __inactive, int8x16_t __a, int8x16_t __b, mve_pred16_t __p) -{ - return __arm_vornq_m_s8 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, mve_pred16_t __p) -{ - return __arm_vornq_m_s32 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m (int16x8_t __inactive, int16x8_t __a, int16x8_t __b, mve_pred16_t __p) -{ - return __arm_vornq_m_s16 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m (uint8x16_t __inactive, uint8x16_t __a, uint8x16_t __b, mve_pred16_t __p) -{ - return __arm_vornq_m_u8 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, mve_pred16_t __p) -{ - return __arm_vornq_m_u32 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m (uint16x8_t __inactive, uint16x8_t __a, uint16x8_t __b, mve_pred16_t __p) -{ - return __arm_vornq_m_u16 (__inactive, __a, __b, __p); -} - __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrbq_scatter_offset (int8_t * __base, uint8x16_t __offset, int8x16_t __value) @@ -5285,48 +2029,6 @@ __arm_vstrbq_scatter_offset (uint8_t * __base, uint16x8_t __offset, uint16x8_t _ __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq (int8_t * __addr, int8x16_t __value) -{ - __arm_vstrbq_s8 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq (int8_t * __addr, int32x4_t __value) -{ - __arm_vstrbq_s32 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq (int8_t * __addr, 
int16x8_t __value) -{ - __arm_vstrbq_s16 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq (uint8_t * __addr, uint8x16_t __value) -{ - __arm_vstrbq_u8 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq (uint8_t * __addr, uint32x4_t __value) -{ - __arm_vstrbq_u32 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq (uint8_t * __addr, uint16x8_t __value) -{ - __arm_vstrbq_u16 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrwq_scatter_base (uint32x4_t __addr, const int __offset, int32x4_t __value) { __arm_vstrwq_scatter_base_s32 (__addr, __offset, __value); @@ -5383,48 +2085,6 @@ __arm_vldrbq_gather_offset (int8_t const * __base, uint32x4_t __offset) __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_p (int8_t * __addr, int8x16_t __value, mve_pred16_t __p) -{ - __arm_vstrbq_p_s8 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_p (int8_t * __addr, int32x4_t __value, mve_pred16_t __p) -{ - __arm_vstrbq_p_s32 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_p (int8_t * __addr, int16x8_t __value, mve_pred16_t __p) -{ - __arm_vstrbq_p_s16 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_p (uint8_t * __addr, uint8x16_t __value, mve_pred16_t __p) -{ - __arm_vstrbq_p_u8 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_p (uint8_t * __addr, uint32x4_t __value, mve_pred16_t __p) -{ - __arm_vstrbq_p_u32 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrbq_p (uint8_t * __addr, uint16x8_t __value, mve_pred16_t __p) -{ - __arm_vstrbq_p_u16 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrbq_scatter_offset_p (int8_t * __base, uint8x16_t __offset, int8x16_t __value, mve_pred16_t __p) { __arm_vstrbq_scatter_offset_p_s8 (__base, __offset, __value, __p); @@ -5859,90 +2519,6 @@ __arm_vstrhq_scatter_shifted_offset_p (uint16_t * __base, uint16x8_t __offset, u __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq (int16_t * __addr, int32x4_t __value) -{ - __arm_vstrhq_s32 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq (int16_t * __addr, int16x8_t __value) -{ - __arm_vstrhq_s16 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq (uint16_t * __addr, uint32x4_t __value) -{ - __arm_vstrhq_u32 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq (uint16_t * __addr, uint16x8_t __value) -{ - 
__arm_vstrhq_u16 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_p (int16_t * __addr, int32x4_t __value, mve_pred16_t __p) -{ - __arm_vstrhq_p_s32 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_p (int16_t * __addr, int16x8_t __value, mve_pred16_t __p) -{ - __arm_vstrhq_p_s16 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_p (uint16_t * __addr, uint32x4_t __value, mve_pred16_t __p) -{ - __arm_vstrhq_p_u32 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_p (uint16_t * __addr, uint16x8_t __value, mve_pred16_t __p) -{ - __arm_vstrhq_p_u16 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrwq (int32_t * __addr, int32x4_t __value) -{ - __arm_vstrwq_s32 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrwq (uint32_t * __addr, uint32x4_t __value) -{ - __arm_vstrwq_u32 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrwq_p (int32_t * __addr, int32x4_t __value, mve_pred16_t __p) -{ - __arm_vstrwq_p_s32 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrwq_p (uint32_t * __addr, uint32x4_t __value, mve_pred16_t __p) -{ - __arm_vstrwq_p_u32 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrdq_scatter_base_p (uint64x2_t __addr, const int __offset, int64x2_t __value, mve_pred16_t __p) { __arm_vstrdq_scatter_base_p_s64 (__addr, __offset, __value, __p); @@ -6081,342 +2657,6 @@ __arm_vstrwq_scatter_shifted_offset (uint32_t * __base, uint32x4_t __offset, uin __arm_vstrwq_scatter_shifted_offset_u32 (__base, __offset, __value); } -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_m (uint8x16_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vddupq_m_n_u8 (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_m (uint32x4_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vddupq_m_n_u32 (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_m (uint16x8_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vddupq_m_n_u16 (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_m (uint8x16_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vddupq_m_wb_u8 (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_m (uint16x8_t __inactive, uint32_t * __a, const int __imm, 
mve_pred16_t __p) -{ - return __arm_vddupq_m_wb_u16 (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_m (uint32x4_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vddupq_m_wb_u32 (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_u8 (uint32_t __a, const int __imm) -{ - return __arm_vddupq_n_u8 (__a, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_u32 (uint32_t __a, const int __imm) -{ - return __arm_vddupq_n_u32 (__a, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_u16 (uint32_t __a, const int __imm) -{ - return __arm_vddupq_n_u16 (__a, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_m (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vdwdupq_m_n_u8 (__inactive, __a, __b, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_m (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vdwdupq_m_n_u32 (__inactive, __a, __b, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_m (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vdwdupq_m_n_u16 (__inactive, __a, __b, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_m (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vdwdupq_m_wb_u8 (__inactive, __a, __b, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_m (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vdwdupq_m_wb_u32 (__inactive, __a, __b, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_m (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vdwdupq_m_wb_u16 (__inactive, __a, __b, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_u8 (uint32_t __a, uint32_t __b, const int __imm) -{ - return __arm_vdwdupq_n_u8 (__a, __b, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_u32 (uint32_t __a, uint32_t __b, const int __imm) -{ - return __arm_vdwdupq_n_u32 (__a, __b, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_u16 (uint32_t __a, uint32_t __b, const int __imm) -{ - return __arm_vdwdupq_n_u16 (__a, __b, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
-__arm_vdwdupq_u8 (uint32_t * __a, uint32_t __b, const int __imm) -{ - return __arm_vdwdupq_wb_u8 (__a, __b, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_u32 (uint32_t * __a, uint32_t __b, const int __imm) -{ - return __arm_vdwdupq_wb_u32 (__a, __b, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_u16 (uint32_t * __a, uint32_t __b, const int __imm) -{ - return __arm_vdwdupq_wb_u16 (__a, __b, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_m (uint8x16_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vidupq_m_n_u8 (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_m (uint32x4_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vidupq_m_n_u32 (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_m (uint16x8_t __inactive, uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vidupq_m_n_u16 (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_u8 (uint32_t __a, const int __imm) -{ - return __arm_vidupq_n_u8 (__a, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_m (uint8x16_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vidupq_m_wb_u8 (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_m (uint16x8_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vidupq_m_wb_u16 (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_m (uint32x4_t __inactive, uint32_t * __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vidupq_m_wb_u32 (__inactive, __a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_u32 (uint32_t __a, const int __imm) -{ - return __arm_vidupq_n_u32 (__a, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_u16 (uint32_t __a, const int __imm) -{ - return __arm_vidupq_n_u16 (__a, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_u8 (uint32_t * __a, const int __imm) -{ - return __arm_vidupq_wb_u8 (__a, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_u16 (uint32_t * __a, const int __imm) -{ - return __arm_vidupq_wb_u16 (__a, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_u32 (uint32_t * __a, const int __imm) -{ - return __arm_vidupq_wb_u32 (__a, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_u8 (uint32_t * __a, const int __imm) -{ - return __arm_vddupq_wb_u8 (__a, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_u16 (uint32_t * __a, const int __imm) -{ - return __arm_vddupq_wb_u16 (__a, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_u32 (uint32_t * __a, const int __imm) -{ - return __arm_vddupq_wb_u32 (__a, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_m (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_viwdupq_m_n_u8 (__inactive, __a, __b, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_m (uint32x4_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_viwdupq_m_n_u32 (__inactive, __a, __b, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_m (uint16x8_t __inactive, uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_viwdupq_m_n_u16 (__inactive, __a, __b, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_m (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_viwdupq_m_wb_u8 (__inactive, __a, __b, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_m (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_viwdupq_m_wb_u32 (__inactive, __a, __b, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_m (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_viwdupq_m_wb_u16 (__inactive, __a, __b, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_u8 (uint32_t __a, uint32_t __b, const int __imm) -{ - return __arm_viwdupq_n_u8 (__a, __b, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_u32 (uint32_t __a, uint32_t __b, const int __imm) -{ - return __arm_viwdupq_n_u32 (__a, __b, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_u16 (uint32_t __a, uint32_t __b, const int __imm) -{ - return __arm_viwdupq_n_u16 (__a, __b, __imm); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_u8 (uint32_t * __a, uint32_t __b, const int __imm) -{ - return __arm_viwdupq_wb_u8 (__a, __b, __imm); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_u32 (uint32_t * __a, uint32_t __b, const int __imm) -{ - return __arm_viwdupq_wb_u32 (__a, __b, __imm); -} - -__extension__ extern __inline uint16x8_t -__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_u16 (uint32_t * __a, uint32_t __b, const int __imm) -{ - return __arm_viwdupq_wb_u16 (__a, __b, __imm); -} - __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrdq_scatter_base_wb (uint64x2_t * __addr, const int __offset, int64x2_t __value) @@ -6473,384 +2713,6 @@ __arm_vstrwq_scatter_base_wb (uint32x4_t * __addr, const int __offset, uint32x4_ __arm_vstrwq_scatter_base_wb_u32 (__addr, __offset, __value); } -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_x_u8 (uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vddupq_x_n_u8 (__a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_x_u16 (uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vddupq_x_n_u16 (__a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_x_u32 (uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vddupq_x_n_u32 (__a, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_x_u8 (uint32_t *__a, const int __imm, mve_pred16_t __p) -{ - return __arm_vddupq_x_wb_u8 (__a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_x_u16 (uint32_t *__a, const int __imm, mve_pred16_t __p) -{ - return __arm_vddupq_x_wb_u16 (__a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vddupq_x_u32 (uint32_t *__a, const int __imm, mve_pred16_t __p) -{ - return __arm_vddupq_x_wb_u32 (__a, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_x_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vdwdupq_x_n_u8 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_x_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vdwdupq_x_n_u16 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_x_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vdwdupq_x_n_u32 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_x_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vdwdupq_x_wb_u8 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_x_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vdwdupq_x_wb_u16 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vdwdupq_x_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vdwdupq_x_wb_u32 (__a, __b, __imm, __p); -} - -__extension__ extern 
__inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_x_u8 (uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vidupq_x_n_u8 (__a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_x_u16 (uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vidupq_x_n_u16 (__a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_x_u32 (uint32_t __a, const int __imm, mve_pred16_t __p) -{ - return __arm_vidupq_x_n_u32 (__a, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_x_u8 (uint32_t *__a, const int __imm, mve_pred16_t __p) -{ - return __arm_vidupq_x_wb_u8 (__a, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_x_u16 (uint32_t *__a, const int __imm, mve_pred16_t __p) -{ - return __arm_vidupq_x_wb_u16 (__a, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vidupq_x_u32 (uint32_t *__a, const int __imm, mve_pred16_t __p) -{ - return __arm_vidupq_x_wb_u32 (__a, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_x_u8 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_viwdupq_x_n_u8 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_x_u16 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_viwdupq_x_n_u16 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_x_u32 (uint32_t __a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_viwdupq_x_n_u32 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_x_u8 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_viwdupq_x_wb_u8 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_x_u16 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_viwdupq_x_wb_u16 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_viwdupq_x_u32 (uint32_t *__a, uint32_t __b, const int __imm, mve_pred16_t __p) -{ - return __arm_viwdupq_x_wb_u32 (__a, __b, __imm, __p); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x (int8x16_t __a, int8x16_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_x_s8 (__a, __b, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x (int16x8_t __a, int16x8_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_x_s16 (__a, __b, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x (int32x4_t 
__a, int32x4_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_x_s32 (__a, __b, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x (uint8x16_t __a, uint8x16_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_x_u8 (__a, __b, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x (uint16x8_t __a, uint16x8_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_x_u16 (__a, __b, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x (uint32x4_t __a, uint32x4_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_x_u32 (__a, __b, __p); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x (int8x16_t __a, int8x16_t __b, mve_pred16_t __p) -{ - return __arm_vornq_x_s8 (__a, __b, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x (int16x8_t __a, int16x8_t __b, mve_pred16_t __p) -{ - return __arm_vornq_x_s16 (__a, __b, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x (int32x4_t __a, int32x4_t __b, mve_pred16_t __p) -{ - return __arm_vornq_x_s32 (__a, __b, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x (uint8x16_t __a, uint8x16_t __b, mve_pred16_t __p) -{ - return __arm_vornq_x_u8 (__a, __b, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x (uint16x8_t __a, uint16x8_t __b, mve_pred16_t __p) -{ - return __arm_vornq_x_u16 (__a, __b, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x (uint32x4_t __a, uint32x4_t __b, mve_pred16_t __p) -{ - return __arm_vornq_x_u32 (__a, __b, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadciq (int32x4_t __a, int32x4_t __b, unsigned * __carry_out) -{ - return __arm_vadciq_s32 (__a, __b, __carry_out); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadciq (uint32x4_t __a, uint32x4_t __b, unsigned * __carry_out) -{ - return __arm_vadciq_u32 (__a, __b, __carry_out); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadciq_m (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry_out, mve_pred16_t __p) -{ - return __arm_vadciq_m_s32 (__inactive, __a, __b, __carry_out, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadciq_m (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry_out, mve_pred16_t __p) -{ - return __arm_vadciq_m_u32 (__inactive, __a, __b, __carry_out, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq (int32x4_t __a, int32x4_t __b, unsigned * __carry) -{ - return __arm_vadcq_s32 (__a, __b, __carry); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -__arm_vadcq (uint32x4_t __a, uint32x4_t __b, unsigned * __carry) -{ - return __arm_vadcq_u32 (__a, __b, __carry); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq_m (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p) -{ - return __arm_vadcq_m_s32 (__inactive, __a, __b, __carry, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vadcq_m (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p) -{ - return __arm_vadcq_m_u32 (__inactive, __a, __b, __carry, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbciq (int32x4_t __a, int32x4_t __b, unsigned * __carry_out) -{ - return __arm_vsbciq_s32 (__a, __b, __carry_out); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbciq (uint32x4_t __a, uint32x4_t __b, unsigned * __carry_out) -{ - return __arm_vsbciq_u32 (__a, __b, __carry_out); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbciq_m (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry_out, mve_pred16_t __p) -{ - return __arm_vsbciq_m_s32 (__inactive, __a, __b, __carry_out, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbciq_m (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry_out, mve_pred16_t __p) -{ - return __arm_vsbciq_m_u32 (__inactive, __a, __b, __carry_out, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbcq (int32x4_t __a, int32x4_t __b, unsigned * __carry) -{ - return __arm_vsbcq_s32 (__a, __b, __carry); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbcq (uint32x4_t __a, uint32x4_t __b, unsigned * __carry) -{ - return __arm_vsbcq_u32 (__a, __b, __carry); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbcq_m (int32x4_t __inactive, int32x4_t __a, int32x4_t __b, unsigned * __carry, mve_pred16_t __p) -{ - return __arm_vsbcq_m_s32 (__inactive, __a, __b, __carry, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vsbcq_m (uint32x4_t __inactive, uint32x4_t __a, uint32x4_t __b, unsigned * __carry, mve_pred16_t __p) -{ - return __arm_vsbcq_m_u32 (__inactive, __a, __b, __carry, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p (uint8_t * __addr, uint8x16_t __value, mve_pred16_t __p) -{ - __arm_vst1q_p_u8 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p (int8_t * __addr, int8x16_t __value, mve_pred16_t __p) -{ - __arm_vst1q_p_s8 (__addr, __value, __p); -} - __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vst2q (int8_t * __addr, int8x16x2_t __value) @@ -6865,20 +2727,6 @@ __arm_vst2q (uint8_t * __addr, uint8x16x2_t __value) __arm_vst2q_u8 (__addr, 
__value); } -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z (uint8_t const *__base, mve_pred16_t __p) -{ - return __arm_vld1q_z_u8 (__base, __p); -} - -__extension__ extern __inline int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z (int8_t const *__base, mve_pred16_t __p) -{ - return __arm_vld1q_z_s8 (__base, __p); -} - __extension__ extern __inline int8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vld2q (int8_t const * __addr) @@ -6909,20 +2757,6 @@ __arm_vld4q (uint8_t const * __addr) __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p (uint16_t * __addr, uint16x8_t __value, mve_pred16_t __p) -{ - __arm_vst1q_p_u16 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p (int16_t * __addr, int16x8_t __value, mve_pred16_t __p) -{ - __arm_vst1q_p_s16 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vst2q (int16_t * __addr, int16x8x2_t __value) { __arm_vst2q_s16 (__addr, __value); @@ -6935,20 +2769,6 @@ __arm_vst2q (uint16_t * __addr, uint16x8x2_t __value) __arm_vst2q_u16 (__addr, __value); } -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z (uint16_t const *__base, mve_pred16_t __p) -{ - return __arm_vld1q_z_u16 (__base, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z (int16_t const *__base, mve_pred16_t __p) -{ - return __arm_vld1q_z_s16 (__base, __p); -} - __extension__ extern __inline int16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vld2q (int16_t const * __addr) @@ -6979,20 +2799,6 @@ __arm_vld4q (uint16_t const * __addr) __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p (uint32_t * __addr, uint32x4_t __value, mve_pred16_t __p) -{ - __arm_vst1q_p_u32 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p (int32_t * __addr, int32x4_t __value, mve_pred16_t __p) -{ - __arm_vst1q_p_s32 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vst2q (int32_t * __addr, int32x4x2_t __value) { __arm_vst2q_s32 (__addr, __value); @@ -7005,20 +2811,6 @@ __arm_vst2q (uint32_t * __addr, uint32x4x2_t __value) __arm_vst2q_u32 (__addr, __value); } -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z (uint32_t const *__base, mve_pred16_t __p) -{ - return __arm_vld1q_z_u32 (__base, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z (int32_t const *__base, mve_pred16_t __p) -{ - return __arm_vld1q_z_s32 (__base, __p); -} - __extension__ extern __inline int32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vld2q (int32_t const * __addr) @@ -7159,48 +2951,6 @@ __arm_vgetq_lane (uint64x2_t __a, const int __idx) return __arm_vgetq_lane_u64 (__a, __idx); } -__extension__ extern __inline 
int8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_m (int8x16_t __a, uint32_t * __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vshlcq_m_s8 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_m (uint8x16_t __a, uint32_t * __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vshlcq_m_u8 (__a, __b, __imm, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_m (int16x8_t __a, uint32_t * __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vshlcq_m_s16 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_m (uint16x8_t __a, uint32_t * __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vshlcq_m_u16 (__a, __b, __imm, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_m (int32x4_t __a, uint32_t * __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vshlcq_m_s32 (__a, __b, __imm, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vshlcq_m (uint32x4_t __a, uint32_t * __b, const int __imm, mve_pred16_t __p) -{ - return __arm_vshlcq_m_u32 (__a, __b, __imm, __p); -} - #if (__ARM_FEATURE_MVE & 2) /* MVE Floating point. */ __extension__ extern __inline void @@ -7217,384 +2967,6 @@ __arm_vst4q (float32_t * __addr, float32x4x4_t __value) __arm_vst4q_f32 (__addr, __value); } -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvttq_f32 (float16x8_t __a) -{ - return __arm_vcvttq_f32_f16 (__a); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtbq_f32 (float16x8_t __a) -{ - return __arm_vcvtbq_f32_f16 (__a); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq (int16x8_t __a) -{ - return __arm_vcvtq_f16_s16 (__a); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq (int32x4_t __a) -{ - return __arm_vcvtq_f32_s32 (__a); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq (uint16x8_t __a) -{ - return __arm_vcvtq_f16_u16 (__a); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq (uint32x4_t __a) -{ - return __arm_vcvtq_f32_u32 (__a); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_n (int16x8_t __a, const int __imm6) -{ - return __arm_vcvtq_n_f16_s16 (__a, __imm6); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_n (int32x4_t __a, const int __imm6) -{ - return __arm_vcvtq_n_f32_s32 (__a, __imm6); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_n (uint16x8_t __a, const int __imm6) -{ - return __arm_vcvtq_n_f16_u16 (__a, __imm6); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -__arm_vcvtq_n (uint32x4_t __a, const int __imm6) -{ - return __arm_vcvtq_n_f32_u32 (__a, __imm6); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq (float16x8_t __a, float16x8_t __b) -{ - return __arm_vornq_f16 (__a, __b); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq (float16x8_t __a, float16x8_t __b) -{ - return __arm_vbicq_f16 (__a, __b); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq (float32x4_t __a, float32x4_t __b) -{ - return __arm_vornq_f32 (__a, __b); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq (float32x4_t __a, float32x4_t __b) -{ - return __arm_vbicq_f32 (__a, __b); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_m (int16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtaq_m_s16_f16 (__inactive, __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_m (uint16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtaq_m_u16_f16 (__inactive, __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_m (int32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtaq_m_s32_f32 (__inactive, __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtaq_m (uint32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtaq_m_u32_f32 (__inactive, __a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m (float16x8_t __inactive, int16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtq_m_f16_s16 (__inactive, __a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m (float16x8_t __inactive, uint16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtq_m_f16_u16 (__inactive, __a, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m (float32x4_t __inactive, int32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtq_m_f32_s32 (__inactive, __a, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m (float32x4_t __inactive, uint32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtq_m_f32_u32 (__inactive, __a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtbq_m (float16x8_t __a, float32x4_t __b, mve_pred16_t __p) -{ - return __arm_vcvtbq_m_f16_f32 (__a, __b, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtbq_m (float32x4_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtbq_m_f32_f16 (__inactive, __a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
-__arm_vcvttq_m (float16x8_t __a, float32x4_t __b, mve_pred16_t __p) -{ - return __arm_vcvttq_m_f16_f32 (__a, __b, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvttq_m (float32x4_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvttq_m_f32_f16 (__inactive, __a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_m (int16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtmq_m_s16_f16 (__inactive, __a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_m (int16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtnq_m_s16_f16 (__inactive, __a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_m (int16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtpq_m_s16_f16 (__inactive, __a, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m (int16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtq_m_s16_f16 (__inactive, __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_m (uint16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtmq_m_u16_f16 (__inactive, __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_m (uint16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtnq_m_u16_f16 (__inactive, __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_m (uint16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtpq_m_u16_f16 (__inactive, __a, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m (uint16x8_t __inactive, float16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtq_m_u16_f16 (__inactive, __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_m (int32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtmq_m_s32_f32 (__inactive, __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_m (int32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtnq_m_s32_f32 (__inactive, __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_m (int32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtpq_m_s32_f32 (__inactive, __a, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m (int32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtq_m_s32_f32 (__inactive, __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtmq_m (uint32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return 
__arm_vcvtmq_m_u32_f32 (__inactive, __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtnq_m (uint32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtnq_m_u32_f32 (__inactive, __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtpq_m (uint32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtpq_m_u32_f32 (__inactive, __a, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m (uint32x4_t __inactive, float32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtq_m_u32_f32 (__inactive, __a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n (float16x8_t __inactive, uint16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __arm_vcvtq_m_n_f16_u16 (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n (float16x8_t __inactive, int16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __arm_vcvtq_m_n_f16_s16 (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n (float32x4_t __inactive, uint32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __arm_vcvtq_m_n_f32_u32 (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n (float32x4_t __inactive, int32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __arm_vcvtq_m_n_f32_s32 (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m (float32x4_t __inactive, float32x4_t __a, float32x4_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_m_f32 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_m (float16x8_t __inactive, float16x8_t __a, float16x8_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_m_f16 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n (int32x4_t __inactive, float32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __arm_vcvtq_m_n_s32_f32 (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n (int16x8_t __inactive, float16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __arm_vcvtq_m_n_s16_f16 (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n (uint32x4_t __inactive, float32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __arm_vcvtq_m_n_u32_f32 (__inactive, __a, __imm6, __p); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_m_n (uint16x8_t __inactive, float16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __arm_vcvtq_m_n_u16_f16 (__inactive, __a, __imm6, __p); 
-} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m (float32x4_t __inactive, float32x4_t __a, float32x4_t __b, mve_pred16_t __p) -{ - return __arm_vornq_m_f32 (__inactive, __a, __b, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_m (float16x8_t __inactive, float16x8_t __a, float16x8_t __b, mve_pred16_t __p) -{ - return __arm_vornq_m_f16 (__inactive, __a, __b, __p); -} - __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vldrhq_gather_offset (float16_t const * __base, uint16x8_t __offset) @@ -7653,34 +3025,6 @@ __arm_vldrwq_gather_shifted_offset_z (float32_t const * __base, uint32x4_t __off __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrwq_p (float32_t * __addr, float32x4_t __value, mve_pred16_t __p) -{ - __arm_vstrwq_p_f32 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrwq (float32_t * __addr, float32x4_t __value) -{ - __arm_vstrwq_f32 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq (float16_t * __addr, float16x8_t __value) -{ - __arm_vstrhq_f16 (__addr, __value); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vstrhq_p (float16_t * __addr, float16x8_t __value, mve_pred16_t __p) -{ - __arm_vstrhq_p_f16 (__addr, __value, __p); -} - -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vstrhq_scatter_offset (float16_t * __base, uint16x8_t __offset, float16x8_t __value) { __arm_vstrhq_scatter_offset_f16 (__base, __offset, __value); @@ -7763,90 +3107,6 @@ __arm_vstrwq_scatter_base_wb_p (uint32x4_t * __addr, const int __offset, float32 __arm_vstrwq_scatter_base_wb_p_f32 (__addr, __offset, __value, __p); } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x (uint16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtq_x_f16_u16 (__a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x (int16x8_t __a, mve_pred16_t __p) -{ - return __arm_vcvtq_x_f16_s16 (__a, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x (int32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtq_x_f32_s32 (__a, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x (uint32x4_t __a, mve_pred16_t __p) -{ - return __arm_vcvtq_x_f32_u32 (__a, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_n (int16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __arm_vcvtq_x_n_f16_s16 (__a, __imm6, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_n (uint16x8_t __a, const int __imm6, mve_pred16_t __p) -{ - return __arm_vcvtq_x_n_f16_u16 (__a, __imm6, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_n (int32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __arm_vcvtq_x_n_f32_s32 (__a, __imm6, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vcvtq_x_n (uint32x4_t __a, const int __imm6, mve_pred16_t __p) -{ - return __arm_vcvtq_x_n_f32_u32 (__a, __imm6, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x (float16x8_t __a, float16x8_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_x_f16 (__a, __b, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vbicq_x (float32x4_t __a, float32x4_t __b, mve_pred16_t __p) -{ - return __arm_vbicq_x_f32 (__a, __b, __p); -} - -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x (float16x8_t __a, float16x8_t __b, mve_pred16_t __p) -{ - return __arm_vornq_x_f16 (__a, __b, __p); -} - -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vornq_x (float32x4_t __a, float32x4_t __b, mve_pred16_t __p) -{ - return __arm_vornq_x_f32 (__a, __b, __p); -} - __extension__ extern __inline float16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vld4q (float16_t const * __addr) @@ -7861,13 +3121,6 @@ __arm_vld2q (float16_t const * __addr) return __arm_vld2q_f16 (__addr); } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z (float16_t const *__base, mve_pred16_t __p) -{ - return __arm_vld1q_z_f16 (__base, __p); -} - __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vst2q (float16_t * __addr, float16x8x2_t __value) @@ -7875,13 +3128,6 @@ __arm_vst2q (float16_t * __addr, float16x8x2_t __value) __arm_vst2q_f16 (__addr, __value); } -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p (float16_t * __addr, float16x8_t __value, mve_pred16_t __p) -{ - __arm_vst1q_p_f16 (__addr, __value, __p); -} - __extension__ extern __inline float32x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vld4q (float32_t const * __addr) @@ -7896,13 +3142,6 @@ __arm_vld2q (float32_t const * __addr) return __arm_vld2q_f32 (__addr); } -__extension__ extern __inline float32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vld1q_z (float32_t const *__base, mve_pred16_t __p) -{ - return __arm_vld1q_z_f32 (__base, __p); -} - __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vst2q (float32_t * __addr, float32x4x2_t __value) @@ -7910,13 +3149,6 @@ __arm_vst2q (float32_t * __addr, float32x4x2_t __value) __arm_vst2q_f32 (__addr, __value); } -__extension__ extern __inline void -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -__arm_vst1q_p (float32_t * __addr, float32x4_t __value, mve_pred16_t __p) -{ - __arm_vst1q_p_f32 (__addr, __value, __p); -} - __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __arm_vsetq_lane (float16_t __a, float16x8_t __b, const int __idx) @@ -8268,177 +3500,6 @@ extern void *__ARM_undef; int 
(*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8x4_t]: __arm_vst4q_f16 (__ARM_mve_coerce_f16_ptr(__p0, float16_t *), __ARM_mve_coerce(__p1, float16x8x4_t)), \ int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4x4_t]: __arm_vst4q_f32 (__ARM_mve_coerce_f32_ptr(__p0, float32_t *), __ARM_mve_coerce(__p1, float32x4x4_t)));}) -#define __arm_vcvtbq_f32(p0) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_float16x8_t]: __arm_vcvtbq_f32_f16 (__ARM_mve_coerce(__p0, float16x8_t)));}) - -#define __arm_vcvttq_f32(p0) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_float16x8_t]: __arm_vcvttq_f32_f16 (__ARM_mve_coerce(__p0, float16x8_t)));}) - -#define __arm_vcvtq(p0) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int16x8_t]: __arm_vcvtq_f16_s16 (__ARM_mve_coerce(__p0, int16x8_t)), \ - int (*)[__ARM_mve_type_int32x4_t]: __arm_vcvtq_f32_s32 (__ARM_mve_coerce(__p0, int32x4_t)), \ - int (*)[__ARM_mve_type_uint16x8_t]: __arm_vcvtq_f16_u16 (__ARM_mve_coerce(__p0, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint32x4_t]: __arm_vcvtq_f32_u32 (__ARM_mve_coerce(__p0, uint32x4_t)));}) - -#define __arm_vcvtq_n(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int16x8_t]: __arm_vcvtq_n_f16_s16 (__ARM_mve_coerce(__p0, int16x8_t), p1), \ - int (*)[__ARM_mve_type_int32x4_t]: __arm_vcvtq_n_f32_s32 (__ARM_mve_coerce(__p0, int32x4_t), p1), \ - int (*)[__ARM_mve_type_uint16x8_t]: __arm_vcvtq_n_f16_u16 (__ARM_mve_coerce(__p0, uint16x8_t), p1), \ - int (*)[__ARM_mve_type_uint32x4_t]: __arm_vcvtq_n_f32_u32 (__ARM_mve_coerce(__p0, uint32x4_t), p1));}) - -#define __arm_vbicq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vbicq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce_i_scalar (__p1, int)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vbicq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce_i_scalar (__p1, int)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vbicq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_i_scalar (__p1, int)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vbicq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_i_scalar (__p1, int)), \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vbicq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vbicq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vbicq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vbicq_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vbicq_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vbicq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t)), \ - int 
(*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vbicq_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vbicq_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t)));}) - -#define __arm_vornq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vornq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vornq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vornq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vornq_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vornq_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vornq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t)), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vornq_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t)), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vornq_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t)));}) - -#define __arm_vbicq_m_n(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int16x8_t]: __arm_vbicq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), p1, p2), \ - int (*)[__ARM_mve_type_int32x4_t]: __arm_vbicq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), p1, p2), \ - int (*)[__ARM_mve_type_uint16x8_t]: __arm_vbicq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), p1, p2), \ - int (*)[__ARM_mve_type_uint32x4_t]: __arm_vbicq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), p1, p2));}) - -#define __arm_vshlcq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int8x16_t]: __arm_vshlcq_s8 (__ARM_mve_coerce(__p0, int8x16_t), p1, p2), \ - int (*)[__ARM_mve_type_int16x8_t]: __arm_vshlcq_s16 (__ARM_mve_coerce(__p0, int16x8_t), p1, p2), \ - int (*)[__ARM_mve_type_int32x4_t]: __arm_vshlcq_s32 (__ARM_mve_coerce(__p0, int32x4_t), p1, p2), \ - int (*)[__ARM_mve_type_uint8x16_t]: __arm_vshlcq_u8 (__ARM_mve_coerce(__p0, uint8x16_t), p1, p2), \ - int (*)[__ARM_mve_type_uint16x8_t]: __arm_vshlcq_u16 (__ARM_mve_coerce(__p0, uint16x8_t), p1, p2), \ - int (*)[__ARM_mve_type_uint32x4_t]: __arm_vshlcq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), p1, p2));}) - -#define __arm_vcvtaq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcvtaq_m_s16_f16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcvtaq_m_s32_f32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2), \ - int 
(*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcvtaq_m_u16_f16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcvtaq_m_u32_f32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) - -#define __arm_vcvtq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_int16x8_t]: __arm_vcvtq_m_f16_s16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_int32x4_t]: __arm_vcvtq_m_f32_s32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vcvtq_m_f16_u16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vcvtq_m_f32_u32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcvtq_m_s16_f16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcvtq_m_s32_f32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcvtq_m_u16_f16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcvtq_m_u32_f32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) - -#define __arm_vcvtq_m_n(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcvtq_m_n_s16_f16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2, p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcvtq_m_n_s32_f32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2, p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcvtq_m_n_u16_f16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2, p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcvtq_m_n_u32_f32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2, p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_int16x8_t]: __arm_vcvtq_m_n_f16_s16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, int16x8_t), p2, p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_int32x4_t]: __arm_vcvtq_m_n_f32_s32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2, p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vcvtq_m_n_f16_u16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), p2, p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vcvtq_m_n_f32_u32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2, p3));}) - -#define __arm_vcvtbq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int 
(*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float16x8_t]: __arm_vcvtbq_m_f32_f16 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float32x4_t]: __arm_vcvtbq_m_f16_f32 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) - -#define __arm_vcvttq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float16x8_t]: __arm_vcvttq_m_f32_f16 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float32x4_t]: __arm_vcvttq_m_f16_f32 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) - -#define __arm_vcvtmq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcvtmq_m_s16_f16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcvtmq_m_s32_f32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcvtmq_m_u16_f16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcvtmq_m_u32_f32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) - -#define __arm_vcvtnq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcvtnq_m_s16_f16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcvtnq_m_s32_f32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcvtnq_m_u16_f16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcvtnq_m_u32_f32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) - -#define __arm_vcvtpq_m(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcvtpq_m_s16_f16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcvtpq_m_s32_f32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_float16x8_t]: __arm_vcvtpq_m_u16_f16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_float32x4_t]: __arm_vcvtpq_m_u32_f32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, float32x4_t), p2));}) - -#define __arm_vbicq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = 
(p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vbicq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vbicq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vbicq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vbicq_m_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vbicq_m_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vbicq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vbicq_m_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vbicq_m_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3));}) - -#define __arm_vornq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vornq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vornq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vornq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vornq_m_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vornq_m_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vornq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3), \ - int 
(*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vornq_m_f16 (__ARM_mve_coerce(__p0, float16x8_t), __ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vornq_m_f32 (__ARM_mve_coerce(__p0, float32x4_t), __ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3));}) - -#define __arm_vld1q_z(p0,p1) ( \ - _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), p1), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), p1), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), p1), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), p1), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), p1), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), p1), \ - int (*)[__ARM_mve_type_float16_t_ptr]: __arm_vld1q_z_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *), p1), \ - int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vld1q_z_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *), p1))) - #define __arm_vld2q(p0) ( \ _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \ @@ -8517,17 +3578,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vldrwq_gather_shifted_offset_z_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), p1, p2), \ int (*)[__ARM_mve_type_float32_t_ptr]: __arm_vldrwq_gather_shifted_offset_z_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *), p1, p2))) -#define __arm_vst1q_p(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]: __arm_vst1q_p_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vst1q_p_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vst1q_p_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vst1q_p_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vst1q_p_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vst1q_p_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2), \ - int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8_t]: __arm_vst1q_p_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *), __ARM_mve_coerce(__p1, float16x8_t), p2), \ - int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]: __arm_vst1q_p_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *), __ARM_mve_coerce(__p1, float32x4_t), p2));}) - #define __arm_vst2q(p0,p1) ({ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ int 
(*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x2_t]: __arm_vst2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x2_t)), \ @@ -8539,22 +3589,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8x2_t]: __arm_vst2q_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *), __ARM_mve_coerce(__p1, float16x8x2_t)), \ int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4x2_t]: __arm_vst2q_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *), __ARM_mve_coerce(__p1, float32x4x2_t)));}) -#define __arm_vstrhq(p0,p1) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vstrhq_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrhq_s32 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vstrhq_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrhq_u32 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint32x4_t)), \ - int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8_t]: __arm_vstrhq_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *), __ARM_mve_coerce(__p1, float16x8_t)));}) - -#define __arm_vstrhq_p(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vstrhq_p_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrhq_p_s32 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vstrhq_p_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrhq_p_u32 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2), \ - int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8_t]: __arm_vstrhq_p_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *), __ARM_mve_coerce(__p1, float16x8_t), p2));}) - #define __arm_vstrhq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ @@ -8591,18 +3625,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrhq_scatter_shifted_offset_u32 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t)), \ int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_float16x8_t]: __arm_vstrhq_scatter_shifted_offset_f16 (__ARM_mve_coerce_f16_ptr(p0, float16_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, float16x8_t)));}) -#define __arm_vstrwq_p(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_p_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, 
int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_p_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2), \ - int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]: __arm_vstrwq_p_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *), __ARM_mve_coerce(__p1, float32x4_t), p2));}) - -#define __arm_vstrwq(p0,p1) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4_t)), \ - int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4_t]: __arm_vstrwq_f32 (__ARM_mve_coerce_f32_ptr(p0, float32_t *), __ARM_mve_coerce(__p1, float32x4_t)));}) - #define __arm_vstrhq_scatter_offset(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ @@ -8718,44 +3740,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t]: __arm_vstrwq_scatter_base_wb_p_u32 (p0, p1, __ARM_mve_coerce(__p2, uint32x4_t), p3), \ int (*)[__ARM_mve_type_float32x4_t]: __arm_vstrwq_scatter_base_wb_p_f32 (p0, p1, __ARM_mve_coerce(__p2, float32x4_t), p3));}) -#define __arm_vbicq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vbicq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vbicq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vbicq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vbicq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vbicq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vbicq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vbicq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vbicq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3));}) - -#define __arm_vcvtq_x(p1,p2) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16x8_t]: __arm_vcvtq_x_f16_s16 (__ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int32x4_t]: __arm_vcvtq_x_f32_s32 (__ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint16x8_t]: __arm_vcvtq_x_f16_u16 (__ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_uint32x4_t]: __arm_vcvtq_x_f32_u32 (__ARM_mve_coerce(__p1, uint32x4_t), p2));}) - -#define __arm_vcvtq_x_n(p1,p2,p3) ({ __typeof(p1) __p1 = 
(p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16x8_t]: __arm_vcvtq_x_n_f16_s16 (__ARM_mve_coerce(__p1, int16x8_t), p2, p3), \ - int (*)[__ARM_mve_type_int32x4_t]: __arm_vcvtq_x_n_f32_s32 (__ARM_mve_coerce(__p1, int32x4_t), p2, p3), \ - int (*)[__ARM_mve_type_uint16x8_t]: __arm_vcvtq_x_n_f16_u16 (__ARM_mve_coerce(__p1, uint16x8_t), p2, p3), \ - int (*)[__ARM_mve_type_uint32x4_t]: __arm_vcvtq_x_n_f32_u32 (__ARM_mve_coerce(__p1, uint32x4_t), p2, p3));}) - -#define __arm_vornq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vornq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vornq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vornq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vornq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vornq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vornq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3), \ - int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: __arm_vornq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, float16x8_t), p3), \ - int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: __arm_vornq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, float32x4_t), p3));}) - #define __arm_vgetq_lane(p0,p1) ({ __typeof(p0) __p0 = (p0); \ _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ int (*)[__ARM_mve_type_int8x16_t]: __arm_vgetq_lane_s8 (__ARM_mve_coerce(__p0, int8x16_t), p1), \ @@ -8804,68 +3788,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: __arm_vst4q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x4_t)), \ int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: __arm_vst4q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x4_t)));}) -#define __arm_vornq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vornq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vornq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vornq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vornq_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vornq_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int 
(*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vornq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t)));}) - -#define __arm_vbicq(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vbicq_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce_i_scalar (__p1, int)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vbicq_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce_i_scalar (__p1, int)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vbicq_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_i_scalar (__p1, int)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vbicq_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_i_scalar (__p1, int)), \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vbicq_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vbicq_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vbicq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vbicq_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vbicq_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vbicq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t)));}) - -#define __arm_vshlcq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int8x16_t]: __arm_vshlcq_s8 (__ARM_mve_coerce(__p0, int8x16_t), p1, p2), \ - int (*)[__ARM_mve_type_int16x8_t]: __arm_vshlcq_s16 (__ARM_mve_coerce(__p0, int16x8_t), p1, p2), \ - int (*)[__ARM_mve_type_int32x4_t]: __arm_vshlcq_s32 (__ARM_mve_coerce(__p0, int32x4_t), p1, p2), \ - int (*)[__ARM_mve_type_uint8x16_t]: __arm_vshlcq_u8 (__ARM_mve_coerce(__p0, uint8x16_t), p1, p2), \ - int (*)[__ARM_mve_type_uint16x8_t]: __arm_vshlcq_u16 (__ARM_mve_coerce(__p0, uint16x8_t), p1, p2), \ - int (*)[__ARM_mve_type_uint32x4_t]: __arm_vshlcq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), p1, p2));}) - -#define __arm_vbicq_m_n(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int16x8_t]: __arm_vbicq_m_n_s16 (__ARM_mve_coerce(__p0, int16x8_t), p1, p2), \ - int (*)[__ARM_mve_type_int32x4_t]: __arm_vbicq_m_n_s32 (__ARM_mve_coerce(__p0, int32x4_t), p1, p2), \ - int (*)[__ARM_mve_type_uint16x8_t]: __arm_vbicq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), p1, p2), \ - int (*)[__ARM_mve_type_uint32x4_t]: __arm_vbicq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), p1, p2));}) - -#define __arm_vbicq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vbicq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), 
__ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vbicq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vbicq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vbicq_m_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vbicq_m_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vbicq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) - -#define __arm_vornq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vornq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t), __ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vornq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t), __ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vornq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vornq_m_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vornq_m_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vornq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) - #define __arm_vstrwq_scatter_base(p0,p1,p2) ({ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int32x4_t]: __arm_vstrwq_scatter_base_s32(p0, p1, __ARM_mve_coerce(__p2, int32x4_t)), \ @@ -8933,15 +3855,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vldrwq_gather_shifted_offset_z_s32 (__ARM_mve_coerce_s32_ptr(__p0, int32_t *), p1, p2), \ int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vldrwq_gather_shifted_offset_z_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, p2));}) -#define __arm_vst1q_p(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]: __arm_vst1q_p_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), 
__ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vst1q_p_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vst1q_p_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vst1q_p_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vst1q_p_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vst1q_p_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - #define __arm_vst2q(p0,p1) ({ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x2_t]: __arm_vst2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x2_t)), \ @@ -8951,20 +3864,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x2_t]: __arm_vst2q_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x2_t)), \ int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x2_t]: __arm_vst2q_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x2_t)));}) -#define __arm_vstrhq(p0,p1) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vstrhq_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrhq_s32 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vstrhq_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrhq_u32 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint32x4_t)));}) - -#define __arm_vstrhq_p(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vstrhq_p_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrhq_p_s32 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vstrhq_p_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrhq_p_u32 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - #define __arm_vstrhq_scatter_offset_p(p0,p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ @@ -8997,17 +3896,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vstrhq_scatter_shifted_offset_u16 
(__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t)), \ int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vstrhq_scatter_shifted_offset_u32 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t)));}) - -#define __arm_vstrwq(p0,p1) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4_t)));}) - -#define __arm_vstrwq_p(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrwq_p_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrwq_p_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - #define __arm_vstrdq_scatter_base_p(p0,p1,p2,p3) ({ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_base_p_s64 (p0, p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \ @@ -9085,34 +3973,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint32x4_t]: __arm_vuninitializedq_u32 (), \ int (*)[__ARM_mve_type_uint64x2_t]: __arm_vuninitializedq_u64 ());}) -#define __arm_vornq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vornq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vornq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vornq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vornq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vornq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vornq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) - -#define __arm_vbicq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: __arm_vbicq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, int8x16_t), p3), \ - int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: __arm_vbicq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, int16x8_t), p3), \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vbicq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3), \ - int 
(*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: __arm_vbicq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, uint8x16_t), p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: __arm_vbicq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, uint16x8_t), p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vbicq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3));}) - -#define __arm_vld1q_z(p0,p1) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld1q_z_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), p1), \ - int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld1q_z_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *), p1), \ - int (*)[__ARM_mve_type_int32_t_ptr]: __arm_vld1q_z_s32 (__ARM_mve_coerce_s32_ptr(p0, int32_t *), p1), \ - int (*)[__ARM_mve_type_uint8_t_ptr]: __arm_vld1q_z_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), p1), \ - int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld1q_z_u16 (__ARM_mve_coerce_u16_ptr(p0, uint16_t *), p1), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld1q_z_u32 (__ARM_mve_coerce_u32_ptr(p0, uint32_t *), p1))) - #define __arm_vld2q(p0) ( _Generic( (int (*)[__ARM_mve_typeid(p0)])0, \ int (*)[__ARM_mve_type_int8_t_ptr]: __arm_vld2q_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *)), \ int (*)[__ARM_mve_type_int16_t_ptr]: __arm_vld2q_s16 (__ARM_mve_coerce_s16_ptr(p0, int16_t *)), \ @@ -9156,73 +4016,6 @@ extern void *__ARM_undef; #endif /* MVE Integer. */ - -#define __arm_vdwdupq_x_u8(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_x_n_u8 ((uint32_t) __p1, p2, p3, p4), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_x_wb_u8 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));}) - -#define __arm_vdwdupq_x_u16(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_x_n_u16 ((uint32_t) __p1, p2, p3, p4), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_x_wb_u16 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));}) - -#define __arm_vdwdupq_x_u32(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_x_n_u32 ((uint32_t) __p1, p2, p3, p4), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_x_wb_u32 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));}) - -#define __arm_viwdupq_x_u8(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_x_n_u8 ((uint32_t) __p1, p2, p3, p4), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_x_wb_u8 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));}) - -#define __arm_viwdupq_x_u16(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_x_n_u16 ((uint32_t) __p1, p2, p3, p4), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_x_wb_u16 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));}) - -#define __arm_viwdupq_x_u32(p1,p2,p3,p4) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_x_n_u32 ((uint32_t) __p1, p2, p3, p4), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_x_wb_u32 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));}) - 
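A minimal sketch, illustrative only and not part of the patch: the deleted blocks in this hunk are the _Generic-based dispatch macros that gave the MVE intrinsics their type-generic C interface, and the example below shows how one such macro resolved a call before its removal. It assumes a target compiled with MVE enabled (for instance -march=armv8.1-m.main+mve -mfloat-abi=hard); the function name bic_x_example is hypothetical.

#include <arm_mve.h>

/* With two int8x16_t operands, the deleted __arm_vbicq_x macro's
   _Generic selection matched the
   [__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t] branch and the
   call expanded to __arm_vbicq_x_s8 (a, b, p).  */
int8x16_t
bic_x_example (int8x16_t a, int8x16_t b, mve_pred16_t p)
{
  return __arm_vbicq_x (a, b, p);
}

The same call is presumably still accepted after this change, since these header wrappers appear to be superseded by implementations in the MVE builtins framework rather than dropped from the intrinsics API.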
-#define __arm_vidupq_x_u8(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vidupq_x_n_u8 ((uint32_t) __p1, p2, p3), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_x_wb_u8 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));}) - -#define __arm_vddupq_x_u8(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vddupq_x_n_u8 ((uint32_t) __p1, p2, p3), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_x_wb_u8 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));}) - -#define __arm_vidupq_x_u16(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vidupq_x_n_u16 ((uint32_t) __p1, p2, p3), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_x_wb_u16 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));}) - -#define __arm_vddupq_x_u16(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vddupq_x_n_u16 ((uint32_t) __p1, p2, p3), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_x_wb_u16 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));}) - -#define __arm_vidupq_x_u32(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vidupq_x_n_u32 ((uint32_t) __p1, p2, p3), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_x_wb_u32 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));}) - -#define __arm_vddupq_x_u32(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vddupq_x_n_u32 ((uint32_t) __p1, p2, p3), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_x_wb_u32 (__ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));}) - -#define __arm_vadciq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadciq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadciq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - #define __arm_vstrdq_scatter_base_wb_p(p0,p1,p2,p3) ({ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_base_wb_p_s64 (p0, p1, __ARM_mve_coerce(__p2, int64x2_t), p3), \ @@ -9249,58 +4042,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_int64_t_ptr]: __arm_vldrdq_gather_shifted_offset_z_s64 (__ARM_mve_coerce_s64_ptr(p0, int64_t *), p1, p2), \ int (*)[__ARM_mve_type_uint64_t_ptr]: __arm_vldrdq_gather_shifted_offset_z_u64 (__ARM_mve_coerce_u64_ptr(p0, uint64_t *), p1, p2))) -#define __arm_vadciq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadciq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3, p4), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadciq_m_u32 
(__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3, p4));}) - -#define __arm_vadciq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadciq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadciq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - -#define __arm_vadcq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadcq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3, p4), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadcq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3, p4));}) - -#define __arm_vadcq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vadcq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vadcq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - -#define __arm_vsbciq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vsbciq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3, p4), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vsbciq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3, p4));}) - -#define __arm_vsbciq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vsbciq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vsbciq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - -#define __arm_vsbcq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - __typeof(p2) __p2 = (p2); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vsbcq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, int32x4_t), p3, p4), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vsbcq_m_u32 (__ARM_mve_coerce(__p0, 
uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, uint32x4_t), p3, p4));}) - -#define __arm_vsbcq(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: __arm_vsbcq_s32 (__ARM_mve_coerce(__p0, int32x4_t), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: __arm_vsbcq_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - #define __arm_vldrbq_gather_offset_z(p0,p1,p2) ({ __typeof(p1) __p1 = (p1); \ _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vldrbq_gather_offset_z_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ @@ -9319,134 +4060,6 @@ extern void *__ARM_undef; int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vldrbq_gather_offset_u16(__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \ int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vldrbq_gather_offset_u32(__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t)));}) -#define __arm_vidupq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vidupq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), (uint32_t) __p1, p2, p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vidupq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), (uint32_t) __p1, p2, p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vidupq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), (uint32_t) __p1, p2, p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_m_wb_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_m_wb_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_m_wb_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));}) - -#define __arm_vddupq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vddupq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), (uint32_t) __p1, p2, p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vddupq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), (uint32_t) __p1, p2, p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vddupq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), (uint32_t) __p1, p2, p3), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_m_wb_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_m_wb_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_m_wb_u32 (__ARM_mve_coerce(__p0, 
uint32x4_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3));}) - -#define __arm_vidupq_u16(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vidupq_n_u16 ((uint32_t) __p0, p1), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_wb_u16 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1));}) - -#define __arm_vidupq_u32(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vidupq_n_u32 ((uint32_t) __p0, p1), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_wb_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1));}) - -#define __arm_vidupq_u8(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vidupq_n_u8 ((uint32_t) __p0, p1), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vidupq_wb_u8 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1));}) - -#define __arm_vddupq_u16(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vddupq_n_u16 ((uint32_t) __p0, p1), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_wb_u16 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1));}) - -#define __arm_vddupq_u32(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vddupq_n_u32 ((uint32_t) __p0, p1), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_wb_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1));}) - -#define __arm_vddupq_u8(p0,p1) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vddupq_n_u8 ((uint32_t) __p0, p1), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vddupq_wb_u8 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1));}) - -#define __arm_viwdupq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_viwdupq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_viwdupq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_viwdupq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_m_wb_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_m_wb_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_m_wb_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));}) - -#define __arm_viwdupq_u16(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_n_u16 (__ARM_mve_coerce_i_scalar(__p0, int), p1, (const int) p2), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_wb_u16 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, (const int) p2));}) - -#define __arm_viwdupq_u32(p0,p1,p2) 
({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_n_u32 (__ARM_mve_coerce_i_scalar(__p0, int), p1, p2), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_wb_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, p2));}) - -#define __arm_viwdupq_u8(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_viwdupq_n_u8 (__ARM_mve_coerce_i_scalar(__p0, int), p1, p2), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_viwdupq_wb_u8 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, p2));}) - -#define __arm_vdwdupq_m(p0,p1,p2,p3,p4) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vdwdupq_m_n_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: __arm_vdwdupq_m_n_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: __arm_vdwdupq_m_n_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_i_scalar(__p1, int), p2, p3, p4), \ - int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_m_wb_u8 (__ARM_mve_coerce(__p0, uint8x16_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4), \ - int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_m_wb_u16 (__ARM_mve_coerce(__p0, uint16x8_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4), \ - int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_m_wb_u32 (__ARM_mve_coerce(__p0, uint32x4_t), __ARM_mve_coerce_u32_ptr(__p1, uint32_t *), p2, p3, p4));}) - -#define __arm_vdwdupq_u16(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_n_u16 (__ARM_mve_coerce_i_scalar(__p0, int), p1, p2), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_wb_u16 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, p2));}) - -#define __arm_vdwdupq_u32(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_n_u32 (__ARM_mve_coerce_i_scalar(__p0, int), p1, p2), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_wb_u32 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, p2));}) - -#define __arm_vdwdupq_u8(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int_n]: __arm_vdwdupq_n_u8 (__ARM_mve_coerce_i_scalar(__p0, int), p1, p2), \ - int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vdwdupq_wb_u8 (__ARM_mve_coerce_u32_ptr(__p0, uint32_t *), p1, p2));}) - -#define __arm_vshlcq_m(p0,p1,p2,p3) ({ __typeof(p0) __p0 = (p0); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \ - int (*)[__ARM_mve_type_int8x16_t]: __arm_vshlcq_m_s8 (__ARM_mve_coerce(__p0, int8x16_t), p1, p2, p3), \ - int (*)[__ARM_mve_type_int16x8_t]: __arm_vshlcq_m_s16 (__ARM_mve_coerce(__p0, int16x8_t), p1, p2, p3), \ - int (*)[__ARM_mve_type_int32x4_t]: __arm_vshlcq_m_s32 (__ARM_mve_coerce(__p0, int32x4_t), p1, p2, p3), \ - int (*)[__ARM_mve_type_uint8x16_t]: __arm_vshlcq_m_u8 (__ARM_mve_coerce(__p0, uint8x16_t), p1, p2, p3), \ - int (*)[__ARM_mve_type_uint16x8_t]: __arm_vshlcq_m_u16 (__ARM_mve_coerce(__p0, uint16x8_t), 
p1, p2, p3), \ - int (*)[__ARM_mve_type_uint32x4_t]: __arm_vshlcq_m_u32 (__ARM_mve_coerce(__p0, uint32x4_t), p1, p2, p3));}) - -#define __arm_vstrbq(p0,p1) ({ __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]: __arm_vstrbq_s8 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int8x16_t)), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vstrbq_s16 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int16x8_t)), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrbq_s32 (__ARM_mve_coerce_s8_ptr(p0, int8_t *), __ARM_mve_coerce(__p1, int32x4_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_u8 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_u16 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t)), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_u32 (__ARM_mve_coerce_u8_ptr(p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t)));}) - -#define __arm_vstrbq_p(p0,p1,p2) ({ __typeof(p0) __p0 = (p0); \ - __typeof(p1) __p1 = (p1); \ - _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16_t]: __arm_vstrbq_p_s8 (__ARM_mve_coerce_s8_ptr(__p0, int8_t *), __ARM_mve_coerce(__p1, int8x16_t), p2), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int16x8_t]: __arm_vstrbq_p_s16 (__ARM_mve_coerce_s8_ptr(__p0, int8_t *), __ARM_mve_coerce(__p1, int16x8_t), p2), \ - int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int32x4_t]: __arm_vstrbq_p_s32 (__ARM_mve_coerce_s8_ptr(__p0, int8_t *), __ARM_mve_coerce(__p1, int32x4_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16_t]: __arm_vstrbq_p_u8 (__ARM_mve_coerce_u8_ptr(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint16x8_t]: __arm_vstrbq_p_u16 (__ARM_mve_coerce_u8_ptr(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint16x8_t), p2), \ - int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint32x4_t]: __arm_vstrbq_p_u32 (__ARM_mve_coerce_u8_ptr(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint32x4_t), p2));}) - #define __arm_vstrdq_scatter_base(p0,p1,p2) ({ __typeof(p2) __p2 = (p2); \ _Generic( (int (*)[__ARM_mve_typeid(__p2)])0, \ int (*)[__ARM_mve_type_int64x2_t]: __arm_vstrdq_scatter_base_s64 (p0, p1, __ARM_mve_coerce(__p2, int64x2_t)), \ diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def index f141aab..5a0c760 100644 --- a/gcc/config/arm/arm_mve_builtins.def +++ b/gcc/config/arm/arm_mve_builtins.def @@ -27,7 +27,7 @@ VAR2 (UNOP_NONE_NONE, vrndmq_f, v8hf, v4sf) VAR2 (UNOP_NONE_NONE, vrndaq_f, v8hf, v4sf) VAR2 (UNOP_NONE_NONE, vrev64q_f, v8hf, v4sf) VAR2 (UNOP_NONE_NONE, vnegq_f, v8hf, v4sf) -VAR2 (UNOP_NONE_NONE, vdupq_n_f, v8hf, v4sf) +VAR5 (UNOP_NONE_NONE, vdupq_n, v8hf, v4sf, v16qi, v8hi, v4si) VAR2 (UNOP_NONE_NONE, vabsq_f, v8hf, v4sf) VAR1 (UNOP_NONE_NONE, vrev32q_f, v8hf) VAR1 (UNOP_NONE_NONE, vcvttq_f32_f16, v4sf) @@ -39,7 +39,6 @@ VAR3 (UNOP_SNONE_SNONE, vqnegq_s, v16qi, v8hi, v4si) VAR3 (UNOP_SNONE_SNONE, vqabsq_s, v16qi, v8hi, v4si) VAR3 (UNOP_SNONE_SNONE, vnegq_s, v16qi, v8hi, v4si) VAR3 (UNOP_SNONE_SNONE, vmvnq_s, v16qi, v8hi, v4si) -VAR3 (UNOP_SNONE_SNONE, vdupq_n_s, v16qi, v8hi, 
v4si) VAR3 (UNOP_SNONE_SNONE, vclzq_s, v16qi, v8hi, v4si) VAR3 (UNOP_SNONE_SNONE, vclsq_s, v16qi, v8hi, v4si) VAR3 (UNOP_SNONE_SNONE, vaddvq_s, v16qi, v8hi, v4si) @@ -57,7 +56,6 @@ VAR1 (UNOP_SNONE_SNONE, vrev16q_s, v16qi) VAR1 (UNOP_SNONE_SNONE, vaddlvq_s, v4si) VAR3 (UNOP_UNONE_UNONE, vrev64q_u, v16qi, v8hi, v4si) VAR3 (UNOP_UNONE_UNONE, vmvnq_u, v16qi, v8hi, v4si) -VAR3 (UNOP_UNONE_UNONE, vdupq_n_u, v16qi, v8hi, v4si) VAR3 (UNOP_UNONE_UNONE, vclzq_u, v16qi, v8hi, v4si) VAR3 (UNOP_UNONE_UNONE, vaddvq_u, v16qi, v8hi, v4si) VAR2 (UNOP_UNONE_UNONE, vrev32q_u, v16qi, v8hi) @@ -288,15 +286,11 @@ VAR1 (TERNOP_UNONE_UNONE_UNONE_UNONE, vrmlaldavhaq_u, v4si) VAR2 (TERNOP_NONE_NONE_UNONE_PRED, vcvtq_m_to_f_u, v8hf, v4sf) VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtq_m_to_f_s, v8hf, v4sf) VAR2 (TERNOP_PRED_NONE_NONE_PRED, vcmpeqq_m_f, v8hf, v4sf) -VAR3 (TERNOP_UNONE_NONE_UNONE_IMM, vshlcq_carry_s, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vshlcq_carry_u, v16qi, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_IMM, vqrshrunbq_n_s, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_NONE_NONE, vabavq_s, v16qi, v8hi, v4si) VAR3 (TERNOP_UNONE_UNONE_UNONE_UNONE, vabavq_u, v16qi, v8hi, v4si) VAR2 (TERNOP_UNONE_UNONE_NONE_PRED, vcvtaq_m_u, v8hi, v4si) VAR2 (TERNOP_NONE_NONE_NONE_PRED, vcvtaq_m_s, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vshlcq_vec_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_NONE_NONE_UNONE_IMM, vshlcq_vec_s, v16qi, v8hi, v4si) VAR4 (TERNOP_UNONE_UNONE_UNONE_PRED, vpselq_u, v16qi, v8hi, v4si, v2di) VAR4 (TERNOP_NONE_NONE_NONE_PRED, vpselq_s, v16qi, v8hi, v4si, v2di) VAR3 (TERNOP_UNONE_UNONE_UNONE_PRED, vrev64q_m_u, v16qi, v8hi, v4si) @@ -669,20 +663,14 @@ VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vandq_m_f, v8hf, v4sf) VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vaddq_m_n_f, v8hf, v4sf) VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vaddq_m_f, v8hf, v4sf) VAR2 (QUADOP_NONE_NONE_NONE_NONE_PRED, vabdq_m_f, v8hf, v4sf) -VAR3 (STRS, vstrbq_s, v16qi, v8hi, v4si) -VAR3 (STRU, vstrbq_u, v16qi, v8hi, v4si) VAR3 (STRSS, vstrbq_scatter_offset_s, v16qi, v8hi, v4si) VAR3 (STRSU, vstrbq_scatter_offset_u, v16qi, v8hi, v4si) VAR1 (STRSBS, vstrwq_scatter_base_s, v4si) VAR1 (STRSBU, vstrwq_scatter_base_u, v4si) VAR3 (LDRGU, vldrbq_gather_offset_u, v16qi, v8hi, v4si) VAR3 (LDRGS, vldrbq_gather_offset_s, v16qi, v8hi, v4si) -VAR3 (LDRS, vldrbq_s, v16qi, v8hi, v4si) -VAR3 (LDRU, vldrbq_u, v16qi, v8hi, v4si) VAR1 (LDRGBS, vldrwq_gather_base_s, v4si) VAR1 (LDRGBU, vldrwq_gather_base_u, v4si) -VAR3 (STRS_P, vstrbq_p_s, v16qi, v8hi, v4si) -VAR3 (STRU_P, vstrbq_p_u, v16qi, v8hi, v4si) VAR3 (STRSS_P, vstrbq_scatter_offset_p_s, v16qi, v8hi, v4si) VAR3 (STRSU_P, vstrbq_scatter_offset_p_u, v16qi, v8hi, v4si) VAR1 (STRSBS_P, vstrwq_scatter_base_p_s, v4si) @@ -691,15 +679,6 @@ VAR1 (LDRGBS_Z, vldrwq_gather_base_z_s, v4si) VAR1 (LDRGBU_Z, vldrwq_gather_base_z_u, v4si) VAR3 (LDRGS_Z, vldrbq_gather_offset_z_s, v16qi, v8hi, v4si) VAR3 (LDRGU_Z, vldrbq_gather_offset_z_u, v16qi, v8hi, v4si) -VAR3 (LDRS_Z, vldrbq_z_s, v16qi, v8hi, v4si) -VAR3 (LDRU_Z, vldrbq_z_u, v16qi, v8hi, v4si) -VAR3 (LDRU, vld1q_u, v16qi, v8hi, v4si) -VAR3 (LDRS, vld1q_s, v16qi, v8hi, v4si) -VAR2 (LDRU_Z, vldrhq_z_u, v8hi, v4si) -VAR2 (LDRU, vldrhq_u, v8hi, v4si) -VAR2 (LDRS_Z, vldrhq_z_s, v8hi, v4si) -VAR2 (LDRS, vldrhq_s, v8hi, v4si) -VAR2 (LDRS, vld1q_f, v8hf, v4sf) VAR2 (LDRGU_Z, vldrhq_gather_shifted_offset_z_u, v8hi, v4si) VAR2 (LDRGU_Z, vldrhq_gather_offset_z_u, v8hi, v4si) VAR2 (LDRGU, vldrhq_gather_shifted_offset_u, v8hi, v4si) @@ -708,14 +687,6 @@ VAR2 (LDRGS_Z, 
vldrhq_gather_shifted_offset_z_s, v8hi, v4si) VAR2 (LDRGS_Z, vldrhq_gather_offset_z_s, v8hi, v4si) VAR2 (LDRGS, vldrhq_gather_shifted_offset_s, v8hi, v4si) VAR2 (LDRGS, vldrhq_gather_offset_s, v8hi, v4si) -VAR1 (LDRS, vldrhq_f, v8hf) -VAR1 (LDRS_Z, vldrhq_z_f, v8hf) -VAR1 (LDRS, vldrwq_f, v4sf) -VAR1 (LDRS, vldrwq_s, v4si) -VAR1 (LDRU, vldrwq_u, v4si) -VAR1 (LDRS_Z, vldrwq_z_f, v4sf) -VAR1 (LDRS_Z, vldrwq_z_s, v4si) -VAR1 (LDRU_Z, vldrwq_z_u, v4si) VAR1 (LDRGBS, vldrdq_gather_base_s, v2di) VAR1 (LDRGBS, vldrwq_gather_base_f, v4sf) VAR1 (LDRGBS_Z, vldrdq_gather_base_z_s, v2di) @@ -746,13 +717,6 @@ VAR1 (LDRGU_Z, vldrdq_gather_offset_z_u, v2di) VAR1 (LDRGU_Z, vldrdq_gather_shifted_offset_z_u, v2di) VAR1 (LDRGU_Z, vldrwq_gather_offset_z_u, v4si) VAR1 (LDRGU_Z, vldrwq_gather_shifted_offset_z_u, v4si) -VAR3 (STRU, vst1q_u, v16qi, v8hi, v4si) -VAR3 (STRS, vst1q_s, v16qi, v8hi, v4si) -VAR2 (STRU_P, vstrhq_p_u, v8hi, v4si) -VAR2 (STRU, vstrhq_u, v8hi, v4si) -VAR2 (STRS_P, vstrhq_p_s, v8hi, v4si) -VAR2 (STRS, vstrhq_s, v8hi, v4si) -VAR2 (STRS, vst1q_f, v8hf, v4sf) VAR2 (STRSU_P, vstrhq_scatter_shifted_offset_p_u, v8hi, v4si) VAR2 (STRSU_P, vstrhq_scatter_offset_p_u, v8hi, v4si) VAR2 (STRSU, vstrhq_scatter_shifted_offset_u, v8hi, v4si) @@ -761,14 +725,6 @@ VAR2 (STRSS_P, vstrhq_scatter_shifted_offset_p_s, v8hi, v4si) VAR2 (STRSS_P, vstrhq_scatter_offset_p_s, v8hi, v4si) VAR2 (STRSS, vstrhq_scatter_shifted_offset_s, v8hi, v4si) VAR2 (STRSS, vstrhq_scatter_offset_s, v8hi, v4si) -VAR1 (STRS, vstrhq_f, v8hf) -VAR1 (STRS_P, vstrhq_p_f, v8hf) -VAR1 (STRS, vstrwq_f, v4sf) -VAR1 (STRS, vstrwq_s, v4si) -VAR1 (STRU, vstrwq_u, v4si) -VAR1 (STRS_P, vstrwq_p_f, v4sf) -VAR1 (STRS_P, vstrwq_p_s, v4si) -VAR1 (STRU_P, vstrwq_p_u, v4si) VAR1 (STRSBS, vstrdq_scatter_base_s, v2di) VAR1 (STRSBS, vstrwq_scatter_base_f, v4sf) VAR1 (STRSBS_P, vstrdq_scatter_base_p_s, v2di) @@ -799,18 +755,6 @@ VAR1 (STRSU_P, vstrdq_scatter_offset_p_u, v2di) VAR1 (STRSU_P, vstrdq_scatter_shifted_offset_p_u, v2di) VAR1 (STRSU_P, vstrwq_scatter_offset_p_u, v4si) VAR1 (STRSU_P, vstrwq_scatter_shifted_offset_p_u, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, viwdupq_wb_u, v16qi, v4si, v8hi) -VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vdwdupq_wb_u, v16qi, v4si, v8hi) -VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED, viwdupq_m_wb_u, v16qi, v8hi, v4si) -VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED, vdwdupq_m_wb_u, v16qi, v8hi, v4si) -VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED, viwdupq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUINOP_UNONE_UNONE_UNONE_UNONE_IMM_PRED, vdwdupq_m_n_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_IMM, vddupq_n_u, v16qi, v8hi, v4si) -VAR3 (BINOP_UNONE_UNONE_IMM, vidupq_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vddupq_m_n_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vidupq_m_n_u, v16qi, v8hi, v4si) -VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, vdwdupq_n_u, v16qi, v4si, v8hi) -VAR3 (TERNOP_UNONE_UNONE_UNONE_IMM, viwdupq_n_u, v16qi, v4si, v8hi) VAR1 (STRSBWBU, vstrwq_scatter_base_wb_u, v4si) VAR1 (STRSBWBU, vstrdq_scatter_base_wb_u, v2di) VAR1 (STRSBWBU_P, vstrwq_scatter_base_wb_p_u, v4si) @@ -874,7 +818,3 @@ VAR1 (UQSHL, urshr_, si) VAR1 (UQSHL, urshrl_, di) VAR1 (UQSHL, uqshl_, si) VAR1 (UQSHL, uqshll_, di) -VAR3 (QUADOP_NONE_NONE_UNONE_IMM_PRED, vshlcq_m_vec_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_NONE_NONE_UNONE_IMM_PRED, vshlcq_m_carry_s, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlcq_m_vec_u, v16qi, v8hi, v4si) -VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_PRED, vshlcq_m_carry_u, v16qi, v8hi, 
v4si) diff --git a/gcc/config/arm/bpabi.h b/gcc/config/arm/bpabi.h index 7a279f3..fd4b65a 100644 --- a/gcc/config/arm/bpabi.h +++ b/gcc/config/arm/bpabi.h @@ -1,6 +1,6 @@ /* Configuration file for ARM BPABI targets. Copyright (C) 2004-2024 Free Software Foundation, Inc. - Contributed by CodeSourcery, LLC + Contributed by CodeSourcery, LLC This file is part of GCC. @@ -55,7 +55,7 @@ #define TARGET_FIX_V4BX_SPEC " %{mcpu=arm8|mcpu=arm810|mcpu=strongarm*"\ "|march=armv4|mcpu=fa526|mcpu=fa626:--fix-v4bx}" -#define TARGET_FDPIC_ASM_SPEC "" +#define TARGET_FDPIC_ASM_SPEC "%{mfdpic: --fdpic}" #define BE8_LINK_SPEC \ "%{!r:%{!mbe32:%:be8_linkopt(%{mlittle-endian:little}" \ diff --git a/gcc/config/arm/elf.h b/gcc/config/arm/elf.h index 97230d1..5f176de 100644 --- a/gcc/config/arm/elf.h +++ b/gcc/config/arm/elf.h @@ -3,7 +3,7 @@ Copyright (C) 1995-2024 Free Software Foundation, Inc. Contributed by Philip Blundell <philb@gnu.org> and Catherine Moore <clm@cygnus.com> - + This file is part of GCC. GCC is free software; you can redistribute it and/or modify it @@ -111,7 +111,7 @@ #ifndef LINK_SPEC #define LINK_SPEC "%{mbig-endian:-EB} %{mlittle-endian:-EL} -X" #endif - + /* Run-time Target Specification. */ #ifndef TARGET_DEFAULT #define TARGET_DEFAULT (MASK_APCS_FRAME) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index b9ff01c..22f8c18 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -283,6 +283,14 @@ (define_mode_iterator MVE_V8HF [V8HF]) (define_mode_iterator MVE_V16QI [V16QI]) +;; Types for MVE truncating stores and widening loads +(define_mode_iterator MVE_w_narrow_TYPE [V8QI V4QI V4HI]) +(define_mode_attr MVE_w_narrow_type [(V8QI "v8qi") (V4QI "v4qi") (V4HI "v4hi")]) +(define_mode_attr MVE_wide_n_TYPE [(V8QI "V8HI") (V4QI "V4SI") (V4HI "V4SI")]) +(define_mode_attr MVE_wide_n_type [(V8QI "v8hi") (V4QI "v4si") (V4HI "v4si")]) +(define_mode_attr MVE_wide_n_sz_elem [(V8QI "16") (V4QI "32") (V4HI "32")]) +(define_mode_attr MVE_wide_n_VPRED [(V8QI "V8BI") (V4QI "V4BI") (V4HI "V4BI")]) + ;;---------------------------------------------------------------------------- ;; Code iterators ;;---------------------------------------------------------------------------- @@ -444,6 +452,7 @@ VANDQ_M_S VANDQ_M_U VBICQ_M_S VBICQ_M_U VEORQ_M_S VEORQ_M_U + VORNQ_M_S VORNQ_M_U VORRQ_M_S VORRQ_M_U ]) @@ -594,6 +603,7 @@ VANDQ_M_F VBICQ_M_F VEORQ_M_F + VORNQ_M_F VORRQ_M_F ]) @@ -939,6 +949,10 @@ (VABDQ_S "vabd") (VABDQ_U "vabd") (VABDQ_F "vabd") (VABSQ_M_F "vabs") (VABSQ_M_S "vabs") + (VADCIQ_M_S "vadci") (VADCIQ_M_U "vadci") + (VADCIQ_S "vadci") (VADCIQ_U "vadci") + (VADCQ_M_S "vadc") (VADCQ_M_U "vadc") + (VADCQ_S "vadc") (VADCQ_U "vadc") (VADDLVAQ_P_S "vaddlva") (VADDLVAQ_P_U "vaddlva") (VADDLVAQ_S "vaddlva") (VADDLVAQ_U "vaddlva") (VADDLVQ_P_S "vaddlv") (VADDLVQ_P_U "vaddlv") @@ -964,6 +978,26 @@ (VCMLAQ_M_F "vcmla") (VCMLAQ_ROT90_M_F "vcmla") (VCMLAQ_ROT180_M_F "vcmla") (VCMLAQ_ROT270_M_F "vcmla") (VCMULQ_M_F "vcmul") (VCMULQ_ROT90_M_F "vcmul") (VCMULQ_ROT180_M_F "vcmul") (VCMULQ_ROT270_M_F "vcmul") (VCREATEQ_S "vcreate") (VCREATEQ_U "vcreate") (VCREATEQ_F "vcreate") + (VCVTAQ_M_S "vcvta") (VCVTAQ_M_U "vcvta") + (VCVTAQ_S "vcvta") (VCVTAQ_U "vcvta") + (VCVTBQ_F16_F32 "vcvtb") (VCVTTQ_F16_F32 "vcvtt") + (VCVTBQ_F32_F16 "vcvtb") (VCVTTQ_F32_F16 "vcvtt") + (VCVTBQ_M_F16_F32 "vcvtb") (VCVTTQ_M_F16_F32 "vcvtt") + (VCVTBQ_M_F32_F16 "vcvtb") (VCVTTQ_M_F32_F16 "vcvtt") + (VCVTMQ_M_S "vcvtm") (VCVTMQ_M_U "vcvtm") + (VCVTMQ_S "vcvtm") (VCVTMQ_U "vcvtm") + (VCVTNQ_M_S 
"vcvtn") (VCVTNQ_M_U "vcvtn") + (VCVTNQ_S "vcvtn") (VCVTNQ_U "vcvtn") + (VCVTPQ_M_S "vcvtp") (VCVTPQ_M_U "vcvtp") + (VCVTPQ_S "vcvtp") (VCVTPQ_U "vcvtp") + (VCVTQ_FROM_F_S "vcvt") (VCVTQ_FROM_F_U "vcvt") + (VCVTQ_M_FROM_F_S "vcvt") (VCVTQ_M_FROM_F_U "vcvt") + (VCVTQ_M_N_FROM_F_S "vcvt") (VCVTQ_M_N_FROM_F_U "vcvt") + (VCVTQ_M_N_TO_F_S "vcvt") (VCVTQ_M_N_TO_F_U "vcvt") + (VCVTQ_M_TO_F_S "vcvt") (VCVTQ_M_TO_F_U "vcvt") + (VCVTQ_N_FROM_F_S "vcvt") (VCVTQ_N_FROM_F_U "vcvt") + (VCVTQ_N_TO_F_S "vcvt") (VCVTQ_N_TO_F_U "vcvt") + (VCVTQ_TO_F_S "vcvt") (VCVTQ_TO_F_U "vcvt") (VDUPQ_M_N_S "vdup") (VDUPQ_M_N_U "vdup") (VDUPQ_M_N_F "vdup") (VDUPQ_N_S "vdup") (VDUPQ_N_U "vdup") (VDUPQ_N_F "vdup") (VEORQ_M_S "veor") (VEORQ_M_U "veor") (VEORQ_M_F "veor") @@ -985,6 +1019,10 @@ (VHSUBQ_M_S "vhsub") (VHSUBQ_M_U "vhsub") (VHSUBQ_N_S "vhsub") (VHSUBQ_N_U "vhsub") (VHSUBQ_S "vhsub") (VHSUBQ_U "vhsub") + (VIDUPQ "vidup") (VDDUPQ "vddup") + (VIDUPQ_M "vidup") (VDDUPQ_M "vddup") + (VIWDUPQ "viwdup") (VDWDUPQ "vdwdup") + (VIWDUPQ_M "viwdup") (VDWDUPQ_M "vdwdup") (VMAXAQ_M_S "vmaxa") (VMAXAQ_S "vmaxa") (VMAXAVQ_P_S "vmaxav") @@ -1074,6 +1112,7 @@ (VMVNQ_N_S "vmvn") (VMVNQ_N_U "vmvn") (VNEGQ_M_F "vneg") (VNEGQ_M_S "vneg") + (VORNQ_M_S "vorn") (VORNQ_M_U "vorn") (VORNQ_M_F "vorn") (VORRQ_M_N_S "vorr") (VORRQ_M_N_U "vorr") (VORRQ_M_S "vorr") (VORRQ_M_U "vorr") (VORRQ_M_F "vorr") (VORRQ_N_S "vorr") (VORRQ_N_U "vorr") @@ -1208,6 +1247,10 @@ (VRSHRNTQ_N_S "vrshrnt") (VRSHRNTQ_N_U "vrshrnt") (VRSHRQ_M_N_S "vrshr") (VRSHRQ_M_N_U "vrshr") (VRSHRQ_N_S "vrshr") (VRSHRQ_N_U "vrshr") + (VSBCIQ_M_S "vsbci") (VSBCIQ_M_U "vsbci") + (VSBCIQ_S "vsbci") (VSBCIQ_U "vsbci") + (VSBCQ_M_S "vsbc") (VSBCQ_M_U "vsbc") + (VSBCQ_S "vsbc") (VSBCQ_U "vsbc") (VSHLLBQ_M_N_S "vshllb") (VSHLLBQ_M_N_U "vshllb") (VSHLLBQ_N_S "vshllb") (VSHLLBQ_N_U "vshllb") (VSHLLTQ_M_N_S "vshllt") (VSHLLTQ_M_N_U "vshllt") @@ -1317,6 +1360,9 @@ (VRNDXQ_F "vrintx") (VRNDXQ_M_F "vrintx") ]) +(define_int_attr viddupq_op [ (VIDUPQ "plus") (VDDUPQ "minus")]) +(define_int_attr viddupq_m_op [ (VIDUPQ_M "plus") (VDDUPQ_M "minus")]) + ;; plus and minus are the only SHIFTABLE_OPS for which Thumb2 allows ;; a stack pointer operand. The minus operation is a candidate for an rsub ;; and hence only plus is supported. 
@@ -1769,6 +1815,10 @@ (V2SF "s") (V4SF "s") (V2SF "s") (V4SF "s")]) +(define_mode_attr MVE_elem_ch [(V4QI "b") (V8QI "b") (V16QI "b") + (V4HI "h") (V8HI "h") (V8HF "h") + (V4SI "w") (V4SF "w")]) + (define_mode_attr VH_elem_ch [(V4HI "s") (V8HI "s") (V4HF "s") (V8HF "s") (HF "s")]) @@ -2472,19 +2522,16 @@ (VMLALDAVAXQ_P_S "s") (VMLALDAVAQ_P_S "s") (VMLALDAVAQ_P_U "u") (VSTRWQSB_S "s") (VSTRWQSB_U "u") (VSTRBQSO_S "s") - (VSTRBQSO_U "u") (VSTRBQ_S "s") (VSTRBQ_U "u") - (VLDRBQGO_S "s") (VLDRBQGO_U "u") (VLDRBQ_S "s") - (VLDRBQ_U "u") (VLDRWQGB_S "s") (VLDRWQGB_U "u") - (VLD1Q_S "s") (VLD1Q_U "u") (VLDRHQGO_S "s") + (VSTRBQSO_U "u") + (VLDRBQGO_S "s") (VLDRBQGO_U "u") (VLDRWQGB_S "s") + (VLDRWQGB_U "u") (VLDRHQGO_S "s") (VLDRHQGO_U "u") (VLDRHQGSO_S "s") (VLDRHQGSO_U "u") - (VLDRHQ_S "s") (VLDRHQ_U "u") (VLDRWQ_S "s") - (VLDRWQ_U "u") (VLDRDQGB_S "s") (VLDRDQGB_U "u") + (VLDRDQGB_S "s") (VLDRDQGB_U "u") (VLDRDQGO_S "s") (VLDRDQGO_U "u") (VLDRDQGSO_S "s") (VLDRDQGSO_U "u") (VLDRWQGO_S "s") (VLDRWQGO_U "u") - (VLDRWQGSO_S "s") (VLDRWQGSO_U "u") (VST1Q_S "s") - (VST1Q_U "u") (VSTRHQSO_S "s") (VSTRHQSO_U "u") - (VSTRHQSSO_S "s") (VSTRHQSSO_U "u") (VSTRHQ_S "s") - (VSTRHQ_U "u") (VSTRWQ_S "s") (VSTRWQ_U "u") + (VLDRWQGSO_S "s") (VLDRWQGSO_U "u") + (VSTRHQSO_S "s") (VSTRHQSO_U "u") + (VSTRHQSSO_S "s") (VSTRHQSSO_U "u") (VSTRDQSB_S "s") (VSTRDQSB_U "u") (VSTRDQSO_S "s") (VSTRDQSO_U "u") (VSTRDQSSO_S "s") (VSTRDQSSO_U "u") (VSTRWQSO_U "u") (VSTRWQSO_S "s") (VSTRWQSSO_U "u") @@ -2720,14 +2767,10 @@ (define_int_iterator VREV64Q [VREV64Q_S VREV64Q_U]) (define_int_iterator VCVTQ_FROM_F [VCVTQ_FROM_F_S VCVTQ_FROM_F_U]) (define_int_iterator VREV16Q [VREV16Q_U VREV16Q_S]) -(define_int_iterator VCVTAQ [VCVTAQ_U VCVTAQ_S]) (define_int_iterator VDUPQ_N [VDUPQ_N_U VDUPQ_N_S]) (define_int_iterator VADDVQ [VADDVQ_U VADDVQ_S]) (define_int_iterator VREV32Q [VREV32Q_U VREV32Q_S]) (define_int_iterator VMOVLxQ [VMOVLBQ_S VMOVLBQ_U VMOVLTQ_U VMOVLTQ_S]) -(define_int_iterator VCVTPQ [VCVTPQ_S VCVTPQ_U]) -(define_int_iterator VCVTNQ [VCVTNQ_S VCVTNQ_U]) -(define_int_iterator VCVTMQ [VCVTMQ_S VCVTMQ_U]) (define_int_iterator VADDLVQ [VADDLVQ_U VADDLVQ_S]) (define_int_iterator VCVTQ_N_TO_F [VCVTQ_N_TO_F_S VCVTQ_N_TO_F_U]) (define_int_iterator VCREATEQ [VCREATEQ_U VCREATEQ_S]) @@ -2783,7 +2826,6 @@ (define_int_iterator VSHLLxQ_N [VSHLLBQ_N_S VSHLLBQ_N_U VSHLLTQ_N_S VSHLLTQ_N_U]) (define_int_iterator VRMLALDAVHQ [VRMLALDAVHQ_U VRMLALDAVHQ_S]) (define_int_iterator VBICQ_M_N [VBICQ_M_N_S VBICQ_M_N_U]) -(define_int_iterator VCVTAQ_M [VCVTAQ_M_S VCVTAQ_M_U]) (define_int_iterator VCVTQ_M_TO_F [VCVTQ_M_TO_F_S VCVTQ_M_TO_F_U]) (define_int_iterator VQRSHRNBQ_N [VQRSHRNBQ_N_U VQRSHRNBQ_N_S]) (define_int_iterator VABAVQ [VABAVQ_S VABAVQ_U]) @@ -2833,9 +2875,6 @@ (define_int_iterator VMVNQ_M_N [VMVNQ_M_N_U VMVNQ_M_N_S]) (define_int_iterator VQSHRNTQ_N [VQSHRNTQ_N_U VQSHRNTQ_N_S]) (define_int_iterator VSHRNTQ_N [VSHRNTQ_N_S VSHRNTQ_N_U]) -(define_int_iterator VCVTMQ_M [VCVTMQ_M_S VCVTMQ_M_U]) -(define_int_iterator VCVTNQ_M [VCVTNQ_M_S VCVTNQ_M_U]) -(define_int_iterator VCVTPQ_M [VCVTPQ_M_S VCVTPQ_M_U]) (define_int_iterator VCVTQ_M_N_FROM_F [VCVTQ_M_N_FROM_F_S VCVTQ_M_N_FROM_F_U]) (define_int_iterator VCVTQ_M_FROM_F [VCVTQ_M_FROM_F_U VCVTQ_M_FROM_F_S]) (define_int_iterator VRMLALDAVHQ_P [VRMLALDAVHQ_P_S VRMLALDAVHQ_P_U]) @@ -2899,25 +2938,17 @@ (define_int_iterator VSHRNTQ_M_N [VSHRNTQ_M_N_S VSHRNTQ_M_N_U]) (define_int_iterator VSTRWSBQ [VSTRWQSB_S VSTRWQSB_U]) (define_int_iterator VSTRBSOQ [VSTRBQSO_S VSTRBQSO_U]) -(define_int_iterator 
VSTRBQ [VSTRBQ_S VSTRBQ_U]) (define_int_iterator VLDRBGOQ [VLDRBQGO_S VLDRBQGO_U]) -(define_int_iterator VLDRBQ [VLDRBQ_S VLDRBQ_U]) (define_int_iterator VLDRWGBQ [VLDRWQGB_S VLDRWQGB_U]) -(define_int_iterator VLD1Q [VLD1Q_S VLD1Q_U]) (define_int_iterator VLDRHGOQ [VLDRHQGO_S VLDRHQGO_U]) (define_int_iterator VLDRHGSOQ [VLDRHQGSO_S VLDRHQGSO_U]) -(define_int_iterator VLDRHQ [VLDRHQ_S VLDRHQ_U]) -(define_int_iterator VLDRWQ [VLDRWQ_S VLDRWQ_U]) (define_int_iterator VLDRDGBQ [VLDRDQGB_S VLDRDQGB_U]) (define_int_iterator VLDRDGOQ [VLDRDQGO_S VLDRDQGO_U]) (define_int_iterator VLDRDGSOQ [VLDRDQGSO_S VLDRDQGSO_U]) (define_int_iterator VLDRWGOQ [VLDRWQGO_S VLDRWQGO_U]) (define_int_iterator VLDRWGSOQ [VLDRWQGSO_S VLDRWQGSO_U]) -(define_int_iterator VST1Q [VST1Q_S VST1Q_U]) (define_int_iterator VSTRHSOQ [VSTRHQSO_S VSTRHQSO_U]) (define_int_iterator VSTRHSSOQ [VSTRHQSSO_S VSTRHQSSO_U]) -(define_int_iterator VSTRHQ [VSTRHQ_S VSTRHQ_U]) -(define_int_iterator VSTRWQ [VSTRWQ_S VSTRWQ_U]) (define_int_iterator VSTRDSBQ [VSTRDQSB_S VSTRDQSB_U]) (define_int_iterator VSTRDSOQ [VSTRDQSO_S VSTRDQSO_U]) (define_int_iterator VSTRDSSOQ [VSTRDQSSO_S VSTRDQSSO_U]) @@ -2927,19 +2958,25 @@ (define_int_iterator VLDRWGBWBQ [VLDRWQGBWB_S VLDRWQGBWB_U]) (define_int_iterator VSTRDSBWBQ [VSTRDQSBWB_S VSTRDQSBWB_U]) (define_int_iterator VLDRDGBWBQ [VLDRDQGBWB_S VLDRDQGBWB_U]) -(define_int_iterator VADCIQ [VADCIQ_U VADCIQ_S]) -(define_int_iterator VADCIQ_M [VADCIQ_M_U VADCIQ_M_S]) -(define_int_iterator VSBCQ [VSBCQ_U VSBCQ_S]) -(define_int_iterator VSBCQ_M [VSBCQ_M_U VSBCQ_M_S]) -(define_int_iterator VSBCIQ [VSBCIQ_U VSBCIQ_S]) -(define_int_iterator VSBCIQ_M [VSBCIQ_M_U VSBCIQ_M_S]) -(define_int_iterator VADCQ [VADCQ_U VADCQ_S]) -(define_int_iterator VADCQ_M [VADCQ_M_U VADCQ_M_S]) +(define_int_iterator VxCIQ [VADCIQ_U VADCIQ_S VSBCIQ_U VSBCIQ_S]) +(define_int_iterator VxCIQ_M [VADCIQ_M_U VADCIQ_M_S VSBCIQ_M_U VSBCIQ_M_S]) +(define_int_iterator VxCQ [VADCQ_U VADCQ_S VSBCQ_U VSBCQ_S]) +(define_int_iterator VxCQ_M [VADCQ_M_U VADCQ_M_S VSBCQ_M_U VSBCQ_M_S]) (define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48]) (define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48]) (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U]) (define_int_iterator VQSHLUQ_M_N [VQSHLUQ_M_N_S]) (define_int_iterator VQSHLUQ_N [VQSHLUQ_N_S]) +(define_int_iterator VCVTxQ_F16_F32 [VCVTBQ_F16_F32 VCVTTQ_F16_F32]) +(define_int_iterator VCVTxQ_F32_F16 [VCVTBQ_F32_F16 VCVTTQ_F32_F16]) +(define_int_iterator VCVTxQ_M_F16_F32 [VCVTBQ_M_F16_F32 VCVTTQ_M_F16_F32]) +(define_int_iterator VCVTxQ_M_F32_F16 [VCVTBQ_M_F32_F16 VCVTTQ_M_F32_F16]) +(define_int_iterator VCVTxQ [VCVTAQ_S VCVTAQ_U VCVTMQ_S VCVTMQ_U VCVTNQ_S VCVTNQ_U VCVTPQ_S VCVTPQ_U]) +(define_int_iterator VCVTxQ_M [VCVTAQ_M_S VCVTAQ_M_U VCVTMQ_M_S VCVTMQ_M_U VCVTNQ_M_S VCVTNQ_M_U VCVTPQ_M_S VCVTPQ_M_U]) +(define_int_iterator VIDDUPQ [VIDUPQ VDDUPQ]) +(define_int_iterator VIDDUPQ_M [VIDUPQ_M VDDUPQ_M]) +(define_int_iterator VIDWDUPQ [VIWDUPQ VDWDUPQ]) +(define_int_iterator VIDWDUPQ_M [VIWDUPQ_M VDWDUPQ_M]) (define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32 DLSTP64]) (define_int_iterator LETP [LETP8 LETP16 LETP32 diff --git a/gcc/config/arm/linux-eabi.h b/gcc/config/arm/linux-eabi.h index eef791f..b12e4ff 100644 --- a/gcc/config/arm/linux-eabi.h +++ b/gcc/config/arm/linux-eabi.h @@ -1,6 +1,6 @@ /* Configuration file for ARM GNU/Linux EABI targets. Copyright (C) 2004-2024 Free Software Foundation, Inc. - Contributed by CodeSourcery, LLC + Contributed by CodeSourcery, LLC This file is part of GCC. 
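[Editor's note: the bpabi.h hunk above makes -mfdpic pass --fdpic to the assembler, and the linux-eabi.h hunk that follows switches the linker emulation in the same way. A minimal sketch of how the "%{mfdpic: ... ;: ...}" spec conditional resolves (illustrative C only, not GCC driver code; the helper name is made up, and the emulation strings are the ones named in the hunk below):

/* Illustrative only: mirrors SUBTARGET_EXTRA_LINK_SPEC's
   "%{mfdpic: FDPIC_EMULATION ;: DEFAULT_EMULATION}" selection.  */
static const char *
pick_arm_linker_emulation (int mfdpic, int big_endian)
{
  if (mfdpic)
    return big_endian ? "armelfb_linux_fdpiceabi" : "armelf_linux_fdpiceabi";
  return big_endian ? "armelfb_linux_eabi" : "armelf_linux_eabi";
}

End of editor's note.]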
@@ -46,12 +46,15 @@ #undef TARGET_LINKER_EMULATION #if TARGET_BIG_ENDIAN_DEFAULT #define TARGET_LINKER_EMULATION "armelfb_linux_eabi" +#define TARGET_FDPIC_LINKER_EMULATION "armelfb_linux_fdpiceabi" #else #define TARGET_LINKER_EMULATION "armelf_linux_eabi" +#define TARGET_FDPIC_LINKER_EMULATION "armelf_linux_fdpiceabi" #endif #undef SUBTARGET_EXTRA_LINK_SPEC -#define SUBTARGET_EXTRA_LINK_SPEC " -m " TARGET_LINKER_EMULATION +#define SUBTARGET_EXTRA_LINK_SPEC " -m %{mfdpic: " \ + TARGET_FDPIC_LINKER_EMULATION ";:" TARGET_LINKER_EMULATION "}" /* GNU/Linux on ARM currently supports three dynamic linkers: - ld-linux.so.2 - for the legacy ABI diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 706a45c..e54153e 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -94,13 +94,16 @@ (set_attr "thumb2_pool_range" "*,*,*,*,1018,*,*,*") (set_attr "neg_pool_range" "*,*,*,*,996,*,*,*")]) -(define_insn "mve_vdup<mode>" - [(set (match_operand:MVE_vecs 0 "s_register_operand" "=w") - (vec_duplicate:MVE_vecs +;; +;; [vdupq_n_u, vdupq_n_s, vdupq_n_f] +;; +(define_insn "@mve_vdupq_n<mode>" + [(set (match_operand:MVE_VLD_ST 0 "s_register_operand" "=w") + (vec_duplicate:MVE_VLD_ST (match_operand:<V_elem> 1 "s_register_operand" "r")))] "TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT" "vdup.<V_sz_elem>\t%q0, %1" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vdup<mode>")) + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vdupq_n<mode>")) (set_attr "length" "4") (set_attr "type" "mve_move")]) @@ -189,21 +192,6 @@ ]) ;; -;; [vdupq_n_f]) -;; -(define_insn "@mve_<mve_insn>q_n_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:<V_elem> 1 "s_register_operand" "r")] - MVE_FP_N_VDUPQ_ONLY)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "<mve_insn>.%#<V_sz_elem>\t%q0, %1" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_n_f<mode>")) - (set_attr "type" "mve_move") -]) - -;; ;; [vrev32q_f]) ;; (define_insn "@mve_<mve_insn>q_f<mode>" @@ -217,48 +205,35 @@ [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_f<mode>")) (set_attr "type" "mve_move") ]) -;; -;; [vcvttq_f32_f16]) -;; -(define_insn "mve_vcvttq_f32_f16v4sf" - [ - (set (match_operand:V4SF 0 "s_register_operand" "=w") - (unspec:V4SF [(match_operand:V8HF 1 "s_register_operand" "w")] - VCVTTQ_F32_F16)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcvtt.f32.f16\t%q0, %q1" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvttq_f32_f16v4sf")) - (set_attr "type" "mve_move") -]) ;; -;; [vcvtbq_f32_f16]) +;; [vcvtbq_f32_f16] +;; [vcvttq_f32_f16] ;; -(define_insn "mve_vcvtbq_f32_f16v4sf" +(define_insn "@mve_<mve_insn>q_f32_f16v4sf" [ (set (match_operand:V4SF 0 "s_register_operand" "=w") (unspec:V4SF [(match_operand:V8HF 1 "s_register_operand" "w")] - VCVTBQ_F32_F16)) + VCVTxQ_F32_F16)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcvtb.f32.f16\t%q0, %q1" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtbq_f32_f16v4sf")) + "<mve_insn>.f32.f16\t%q0, %q1" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_f32_f16v4sf")) (set_attr "type" "mve_move") ]) ;; -;; [vcvtq_to_f_s, vcvtq_to_f_u]) +;; [vcvtq_to_f_s, vcvtq_to_f_u] ;; -(define_insn "mve_vcvtq_to_f_<supf><mode>" +(define_insn "@mve_<mve_insn>q_to_f_<supf><mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:<MVE_CNVT> 1 "s_register_operand" "w")] VCVTQ_TO_F)) 
] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcvt.f%#<V_sz_elem>.<supf>%#<V_sz_elem>\t%q0, %q1" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtq_to_f_<supf><mode>")) + "<mve_insn>.f%#<V_sz_elem>.<supf>%#<V_sz_elem>\t%q0, %q1" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_to_f_<supf><mode>")) (set_attr "type" "mve_move") ]) @@ -278,17 +253,17 @@ ]) ;; -;; [vcvtq_from_f_s, vcvtq_from_f_u]) +;; [vcvtq_from_f_s, vcvtq_from_f_u] ;; -(define_insn "mve_vcvtq_from_f_<supf><mode>" +(define_insn "@mve_<mve_insn>q_from_f_<supf><mode>" [ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:<MVE_CNVT> 1 "s_register_operand" "w")] VCVTQ_FROM_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcvt.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q1" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtq_from_f_<supf><mode>")) + "<mve_insn>.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q1" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_from_f_<supf><mode>")) (set_attr "type" "mve_move") ]) @@ -329,21 +304,6 @@ ) ;; -;; [vdupq_n_u, vdupq_n_s]) -;; -(define_insn "@mve_<mve_insn>q_n_<supf><mode>" - [ - (set (match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:<V_elem> 1 "s_register_operand" "r")] - VDUPQ_N)) - ] - "TARGET_HAVE_MVE" - "<mve_insn>.%#<V_sz_elem>\t%q0, %1" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_n_<supf><mode>")) - (set_attr "type" "mve_move") -]) - -;; ;; [vclzq_u, vclzq_s]) ;; (define_insn "@mve_vclzq_s<mode>" @@ -429,62 +389,20 @@ ]) ;; -;; [vcvtpq_s, vcvtpq_u]) -;; -(define_insn "mve_vcvtpq_<supf><mode>" - [ - (set (match_operand:MVE_5 0 "s_register_operand" "=w") - (unspec:MVE_5 [(match_operand:<MVE_CNVT> 1 "s_register_operand" "w")] - VCVTPQ)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcvtp.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q1" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtpq_<supf><mode>")) - (set_attr "type" "mve_move") -]) - -;; -;; [vcvtnq_s, vcvtnq_u]) +;; [vcvtaq_u, vcvtaq_s] +;; [vcvtmq_s, vcvtmq_u] +;; [vcvtnq_s, vcvtnq_u] +;; [vcvtpq_s, vcvtpq_u] ;; -(define_insn "mve_vcvtnq_<supf><mode>" - [ - (set (match_operand:MVE_5 0 "s_register_operand" "=w") - (unspec:MVE_5 [(match_operand:<MVE_CNVT> 1 "s_register_operand" "w")] - VCVTNQ)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcvtn.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q1" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtnq_<supf><mode>")) - (set_attr "type" "mve_move") -]) - -;; -;; [vcvtmq_s, vcvtmq_u]) -;; -(define_insn "mve_vcvtmq_<supf><mode>" - [ - (set (match_operand:MVE_5 0 "s_register_operand" "=w") - (unspec:MVE_5 [(match_operand:<MVE_CNVT> 1 "s_register_operand" "w")] - VCVTMQ)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcvtm.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q1" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtmq_<supf><mode>")) - (set_attr "type" "mve_move") -]) - -;; -;; [vcvtaq_u, vcvtaq_s]) -;; -(define_insn "mve_vcvtaq_<supf><mode>" +(define_insn "@mve_<mve_insn>q_<supf><mode>" [ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:<MVE_CNVT> 1 "s_register_operand" "w")] - VCVTAQ)) + VCVTxQ)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcvta.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q1" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtaq_<supf><mode>")) + 
"<mve_insn>.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q1" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_<supf><mode>")) (set_attr "type" "mve_move") ]) @@ -537,7 +455,7 @@ ;; ;; [vctp8q vctp16q vctp32q vctp64q]) ;; -(define_insn "mve_vctp<MVE_vctp>q<MVE_vpred>" +(define_insn "@mve_vctp<MVE_vctp>q<MVE_vpred>" [ (set (match_operand:MVE_7 0 "vpr_register_operand" "=Up") (unspec:MVE_7 [(match_operand:SI 1 "s_register_operand" "r")] @@ -581,9 +499,9 @@ ]) ;; -;; [vcvtq_n_to_f_s, vcvtq_n_to_f_u]) +;; [vcvtq_n_to_f_s, vcvtq_n_to_f_u] ;; -(define_insn "mve_vcvtq_n_to_f_<supf><mode>" +(define_insn "@mve_<mve_insn>q_n_to_f_<supf><mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:<MVE_CNVT> 1 "s_register_operand" "w") @@ -591,8 +509,8 @@ VCVTQ_N_TO_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcvt.f<V_sz_elem>.<supf><V_sz_elem>\t%q0, %q1, %2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtq_n_to_f_<supf><mode>")) + "<mve_insn>.f<V_sz_elem>.<supf><V_sz_elem>\t%q0, %q1, %2" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_n_to_f_<supf><mode>")) (set_attr "type" "mve_move") ]) @@ -679,9 +597,9 @@ ]) ;; -;; [vcvtq_n_from_f_s, vcvtq_n_from_f_u]) +;; [vcvtq_n_from_f_s, vcvtq_n_from_f_u] ;; -(define_insn "mve_vcvtq_n_from_f_<supf><mode>" +(define_insn "@mve_<mve_insn>q_n_from_f_<supf><mode>" [ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:<MVE_CNVT> 1 "s_register_operand" "w") @@ -689,8 +607,8 @@ VCVTQ_N_FROM_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcvt.<supf><V_sz_elem>.f<V_sz_elem>\t%q0, %q1, %2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtq_n_from_f_<supf><mode>")) + "<mve_insn>.<supf><V_sz_elem>.f<V_sz_elem>\t%q0, %q1, %2" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_n_from_f_<supf><mode>")) (set_attr "type" "mve_move") ]) @@ -858,7 +776,7 @@ ;; ;; [vbicq_s, vbicq_u]) ;; -(define_insn "mve_vbicq_u<mode>" +(define_insn "@mve_vbicq_u<mode>" [ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (and:MVE_2 (not:MVE_2 (match_operand:MVE_2 2 "s_register_operand" "w")) @@ -870,7 +788,7 @@ (set_attr "type" "mve_move") ]) -(define_expand "mve_vbicq_s<mode>" +(define_expand "@mve_vbicq_s<mode>" [ (set (match_operand:MVE_2 0 "s_register_operand") (and:MVE_2 (not:MVE_2 (match_operand:MVE_2 2 "s_register_operand")) @@ -1076,9 +994,9 @@ ]) ;; -;; [vornq_u, vornq_s]) +;; [vornq_u, vornq_s] ;; -(define_insn "mve_vornq_s<mode>" +(define_insn "@mve_vornq_s<mode>" [ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (ior:MVE_2 (not:MVE_2 (match_operand:MVE_2 2 "s_register_operand" "w")) @@ -1090,7 +1008,7 @@ (set_attr "type" "mve_move") ]) -(define_expand "mve_vornq_u<mode>" +(define_expand "@mve_vornq_u<mode>" [ (set (match_operand:MVE_2 0 "s_register_operand") (ior:MVE_2 (not:MVE_2 (match_operand:MVE_2 2 "s_register_operand")) @@ -1264,7 +1182,7 @@ ;; ;; [vbicq_f]) ;; -(define_insn "mve_vbicq_f<mode>" +(define_insn "@mve_vbicq_f<mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (and:MVE_0 (not:MVE_0 (match_operand:MVE_0 1 "s_register_operand" "w")) @@ -1327,7 +1245,7 @@ ;; ;; [vctp8q_m vctp16q_m vctp32q_m vctp64q_m]) ;; -(define_insn "mve_vctp<MVE_vctp>q_m<MVE_vpred>" +(define_insn "@mve_vctp<MVE_vctp>q_m<MVE_vpred>" [ (set (match_operand:MVE_7 0 "vpr_register_operand" "=Up") (unspec:MVE_7 [(match_operand:SI 1 "s_register_operand" "r") @@ -1342,34 +1260,19 @@ ]) 
;; -;; [vcvtbq_f16_f32]) +;; [vcvtbq_f16_f32] +;; [vcvttq_f16_f32] ;; -(define_insn "mve_vcvtbq_f16_f32v8hf" +(define_insn "@mve_<mve_insn>q_f16_f32v8hf" [ (set (match_operand:V8HF 0 "s_register_operand" "=w") (unspec:V8HF [(match_operand:V8HF 1 "s_register_operand" "0") (match_operand:V4SF 2 "s_register_operand" "w")] - VCVTBQ_F16_F32)) + VCVTxQ_F16_F32)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcvtb.f16.f32\t%q0, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtbq_f16_f32v8hf")) - (set_attr "type" "mve_move") -]) - -;; -;; [vcvttq_f16_f32]) -;; -(define_insn "mve_vcvttq_f16_f32v8hf" - [ - (set (match_operand:V8HF 0 "s_register_operand" "=w") - (unspec:V8HF [(match_operand:V8HF 1 "s_register_operand" "0") - (match_operand:V4SF 2 "s_register_operand" "w")] - VCVTTQ_F16_F32)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vcvtt.f16.f32\t%q0, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvttq_f16_f32v8hf")) + "<mve_insn>.f16.f32\t%q0, %q2" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_f16_f32v8hf")) (set_attr "type" "mve_move") ]) @@ -1499,9 +1402,9 @@ ]) ;; -;; [vornq_f]) +;; [vornq_f] ;; -(define_insn "mve_vornq_f<mode>" +(define_insn "@mve_vornq_f<mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (ior:MVE_0 (not:MVE_0 (match_operand:MVE_0 2 "s_register_operand" "w")) @@ -1655,26 +1558,29 @@ (set_attr "length""8")]) ;; -;; [vcvtaq_m_u, vcvtaq_m_s]) +;; [vcvtaq_m_u, vcvtaq_m_s] +;; [vcvtmq_m_s, vcvtmq_m_u] +;; [vcvtnq_m_s, vcvtnq_m_u] +;; [vcvtpq_m_u, vcvtpq_m_s] ;; -(define_insn "mve_vcvtaq_m_<supf><mode>" +(define_insn "@mve_<mve_insn>q_m_<supf><mode>" [ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") (match_operand:<MVE_CNVT> 2 "s_register_operand" "w") (match_operand:<MVE_VPRED> 3 "vpr_register_operand" "Up")] - VCVTAQ_M)) + VCVTxQ_M)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vcvtat.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtaq_<supf><mode>")) + "vpst\;<mve_insn>t.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q2" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_<supf><mode>")) (set_attr "type" "mve_move") (set_attr "length""8")]) ;; -;; [vcvtq_m_to_f_s, vcvtq_m_to_f_u]) +;; [vcvtq_m_to_f_s, vcvtq_m_to_f_u] ;; -(define_insn "mve_vcvtq_m_to_f_<supf><mode>" +(define_insn "@mve_<mve_insn>q_m_to_f_<supf><mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") @@ -1683,8 +1589,8 @@ VCVTQ_M_TO_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vcvtt.f%#<V_sz_elem>.<supf>%#<V_sz_elem>\t%q0, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtq_to_f_<supf><mode>")) + "vpst\;<mve_insn>t.f%#<V_sz_elem>.<supf>%#<V_sz_elem>\t%q0, %q2" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_to_f_<supf><mode>")) (set_attr "type" "mve_move") (set_attr "length""8")]) @@ -1758,35 +1664,7 @@ ;; ;; [vshlcq_u vshlcq_s] ;; -(define_expand "mve_vshlcq_vec_<supf><mode>" - [(match_operand:MVE_2 0 "s_register_operand") - (match_operand:MVE_2 1 "s_register_operand") - (match_operand:SI 2 "s_register_operand") - (match_operand:SI 3 "mve_imm_32") - (unspec:MVE_2 [(const_int 0)] VSHLCQ)] - "TARGET_HAVE_MVE" -{ - rtx ignore_wb = gen_reg_rtx (SImode); - emit_insn(gen_mve_vshlcq_<supf><mode>(operands[0], ignore_wb, 
operands[1], - operands[2], operands[3])); - DONE; -}) - -(define_expand "mve_vshlcq_carry_<supf><mode>" - [(match_operand:SI 0 "s_register_operand") - (match_operand:MVE_2 1 "s_register_operand") - (match_operand:SI 2 "s_register_operand") - (match_operand:SI 3 "mve_imm_32") - (unspec:MVE_2 [(const_int 0)] VSHLCQ)] - "TARGET_HAVE_MVE" -{ - rtx ignore_vec = gen_reg_rtx (<MODE>mode); - emit_insn(gen_mve_vshlcq_<supf><mode>(ignore_vec, operands[0], operands[1], - operands[2], operands[3])); - DONE; -}) - -(define_insn "mve_vshlcq_<supf><mode>" +(define_insn "@mve_vshlcq_<supf><mode>" [(set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 2 "s_register_operand" "0") (match_operand:SI 3 "s_register_operand" "1") @@ -1903,7 +1781,7 @@ ] "TARGET_HAVE_MVE" "vpst\;<mve_insn>t.%#<V_sz_elem>\t%q0, %2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_n_<supf><mode>")) + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_n<mode>")) (set_attr "type" "mve_move") (set_attr "length""8")]) @@ -2237,74 +2115,42 @@ (set_attr "length""8")]) ;; -;; [vcvtbq_m_f16_f32]) +;; [vcvtbq_m_f16_f32] +;; [vcvttq_m_f16_f32] ;; -(define_insn "mve_vcvtbq_m_f16_f32v8hf" +(define_insn "@mve_<mve_insn>q_m_f16_f32v8hf" [ (set (match_operand:V8HF 0 "s_register_operand" "=w") (unspec:V8HF [(match_operand:V8HF 1 "s_register_operand" "0") (match_operand:V4SF 2 "s_register_operand" "w") - (match_operand:<MVE_VPRED> 3 "vpr_register_operand" "Up")] - VCVTBQ_M_F16_F32)) + (match_operand:V4BI 3 "vpr_register_operand" "Up")] + VCVTxQ_M_F16_F32)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vcvtbt.f16.f32\t%q0, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtbq_f16_f32v8hf")) + "vpst\;<mve_insn>t.f16.f32\t%q0, %q2" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_f16_f32v8hf")) (set_attr "type" "mve_move") (set_attr "length""8")]) ;; -;; [vcvtbq_m_f32_f16]) +;; [vcvtbq_m_f32_f16] +;; [vcvttq_m_f32_f16] ;; -(define_insn "mve_vcvtbq_m_f32_f16v4sf" +(define_insn "@mve_<mve_insn>q_m_f32_f16v4sf" [ (set (match_operand:V4SF 0 "s_register_operand" "=w") (unspec:V4SF [(match_operand:V4SF 1 "s_register_operand" "0") (match_operand:V8HF 2 "s_register_operand" "w") - (match_operand:<MVE_VPRED> 3 "vpr_register_operand" "Up")] - VCVTBQ_M_F32_F16)) + (match_operand:V8BI 3 "vpr_register_operand" "Up")] + VCVTxQ_M_F32_F16)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vcvtbt.f32.f16\t%q0, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtbq_f32_f16v4sf")) + "vpst\;<mve_insn>t.f32.f16\t%q0, %q2" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_f32_f16v4sf")) (set_attr "type" "mve_move") (set_attr "length""8")]) ;; -;; [vcvttq_m_f16_f32]) -;; -(define_insn "mve_vcvttq_m_f16_f32v8hf" - [ - (set (match_operand:V8HF 0 "s_register_operand" "=w") - (unspec:V8HF [(match_operand:V8HF 1 "s_register_operand" "0") - (match_operand:V4SF 2 "s_register_operand" "w") - (match_operand:<MVE_VPRED> 3 "vpr_register_operand" "Up")] - VCVTTQ_M_F16_F32)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vcvttt.f16.f32\t%q0, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvttq_f16_f32v8hf")) - (set_attr "type" "mve_move") - (set_attr "length""8")]) - -;; -;; [vcvttq_m_f32_f16]) -;; -(define_insn "mve_vcvttq_m_f32_f16v4sf" - [ - (set (match_operand:V4SF 0 "s_register_operand" "=w") - (unspec:V4SF [(match_operand:V4SF 1 
"s_register_operand" "0") - (match_operand:V8HF 2 "s_register_operand" "w") - (match_operand:<MVE_VPRED> 3 "vpr_register_operand" "Up")] - VCVTTQ_M_F32_F16)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vcvttt.f32.f16\t%q0, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvttq_f32_f16v4sf")) - (set_attr "type" "mve_move") - (set_attr "length""8")]) - -;; ;; [vdupq_m_n_f]) ;; (define_insn "@mve_<mve_insn>q_m_n_f<mode>" @@ -2317,7 +2163,7 @@ ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" "vpst\;<mve_insn>t.%#<V_sz_elem>\t%q0, %2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_n_f<mode>")) + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_n<mode>")) (set_attr "type" "mve_move") (set_attr "length""8")]) @@ -2599,61 +2445,11 @@ (set_attr "type" "mve_move") (set_attr "length""8")]) -;; -;; [vcvtmq_m_s, vcvtmq_m_u]) -;; -(define_insn "mve_vcvtmq_m_<supf><mode>" - [ - (set (match_operand:MVE_5 0 "s_register_operand" "=w") - (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") - (match_operand:<MVE_CNVT> 2 "s_register_operand" "w") - (match_operand:<MVE_VPRED> 3 "vpr_register_operand" "Up")] - VCVTMQ_M)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vcvtmt.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtmq_<supf><mode>")) - (set_attr "type" "mve_move") - (set_attr "length""8")]) - -;; -;; [vcvtpq_m_u, vcvtpq_m_s]) -;; -(define_insn "mve_vcvtpq_m_<supf><mode>" - [ - (set (match_operand:MVE_5 0 "s_register_operand" "=w") - (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") - (match_operand:<MVE_CNVT> 2 "s_register_operand" "w") - (match_operand:<MVE_VPRED> 3 "vpr_register_operand" "Up")] - VCVTPQ_M)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vcvtpt.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtpq_<supf><mode>")) - (set_attr "type" "mve_move") - (set_attr "length""8")]) ;; -;; [vcvtnq_m_s, vcvtnq_m_u]) +;; [vcvtq_m_n_from_f_s, vcvtq_m_n_from_f_u] ;; -(define_insn "mve_vcvtnq_m_<supf><mode>" - [ - (set (match_operand:MVE_5 0 "s_register_operand" "=w") - (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") - (match_operand:<MVE_CNVT> 2 "s_register_operand" "w") - (match_operand:<MVE_VPRED> 3 "vpr_register_operand" "Up")] - VCVTNQ_M)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vcvtnt.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtnq_<supf><mode>")) - (set_attr "type" "mve_move") - (set_attr "length""8")]) - -;; -;; [vcvtq_m_n_from_f_s, vcvtq_m_n_from_f_u]) -;; -(define_insn "mve_vcvtq_m_n_from_f_<supf><mode>" +(define_insn "@mve_<mve_insn>q_m_n_from_f_<supf><mode>" [ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") @@ -2663,8 +2459,8 @@ VCVTQ_M_N_FROM_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vcvtt.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q2, %3" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtq_n_from_f_<supf><mode>")) + "vpst\;<mve_insn>t.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q2, %3" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_n_from_f_<supf><mode>")) (set_attr "type" "mve_move") (set_attr "length""8")]) @@ -2686,9 +2482,9 @@ (set_attr "length""8")]) ;; -;; [vcvtq_m_from_f_u, vcvtq_m_from_f_s]) 
+;; [vcvtq_m_from_f_u, vcvtq_m_from_f_s] ;; -(define_insn "mve_vcvtq_m_from_f_<supf><mode>" +(define_insn "@mve_<mve_insn>q_m_from_f_<supf><mode>" [ (set (match_operand:MVE_5 0 "s_register_operand" "=w") (unspec:MVE_5 [(match_operand:MVE_5 1 "s_register_operand" "0") @@ -2697,8 +2493,8 @@ VCVTQ_M_FROM_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vcvtt.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtq_from_f_<supf><mode>")) + "vpst\;<mve_insn>t.<supf>%#<V_sz_elem>.f%#<V_sz_elem>\t%q0, %q2" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_from_f_<supf><mode>")) (set_attr "type" "mve_move") (set_attr "length""8")]) @@ -2757,9 +2553,9 @@ (set_attr "length" "8")]) ;; -;; [vcvtq_m_n_to_f_u, vcvtq_m_n_to_f_s]) +;; [vcvtq_m_n_to_f_u, vcvtq_m_n_to_f_s] ;; -(define_insn "mve_vcvtq_m_n_to_f_<supf><mode>" +(define_insn "@mve_<mve_insn>q_m_n_to_f_<supf><mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") @@ -2769,8 +2565,8 @@ VCVTQ_M_N_TO_F)) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vcvtt.f%#<V_sz_elem>.<supf>%#<V_sz_elem>\t%q0, %q2, %3" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vcvtq_n_to_f_<supf><mode>")) + "vpst\;<mve_insn>t.f%#<V_sz_elem>.<supf>%#<V_sz_elem>\t%q0, %q2, %3" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_n_to_f_<supf><mode>")) (set_attr "type" "mve_move") (set_attr "length""8")]) @@ -2859,6 +2655,7 @@ ;; [vandq_m_u, vandq_m_s] ;; [vbicq_m_u, vbicq_m_s] ;; [veorq_m_u, veorq_m_s] +;; [vornq_m_u, vornq_m_s] ;; [vorrq_m_u, vorrq_m_s] ;; (define_insn "@mve_<mve_insn>q_m_<supf><mode>" @@ -2986,24 +2783,6 @@ (set_attr "length""8")]) ;; -;; [vornq_m_u, vornq_m_s]) -;; -(define_insn "mve_vornq_m_<supf><mode>" - [ - (set (match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") - (match_operand:MVE_2 2 "s_register_operand" "w") - (match_operand:MVE_2 3 "s_register_operand" "w") - (match_operand:<MVE_VPRED> 4 "vpr_register_operand" "Up")] - VORNQ_M)) - ] - "TARGET_HAVE_MVE" - "vpst\;vornt\t%q0, %q2, %q3" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vornq_<supf><mode>")) - (set_attr "type" "mve_move") - (set_attr "length""8")]) - -;; ;; [vqshlq_m_n_s, vqshlq_m_n_u] ;; [vshlq_m_n_s, vshlq_m_n_u] ;; @@ -3257,6 +3036,7 @@ ;; [vandq_m_f] ;; [vbicq_m_f] ;; [veorq_m_f] +;; [vornq_m_f] ;; [vorrq_m_f] ;; (define_insn "@mve_<mve_insn>q_m_f<mode>" @@ -3336,44 +3116,201 @@ (set_attr "type" "mve_move") (set_attr "length""8")]) -;; -;; [vornq_m_f]) -;; -(define_insn "mve_vornq_m_f<mode>" - [ - (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0") - (match_operand:MVE_0 2 "s_register_operand" "w") - (match_operand:MVE_0 3 "s_register_operand" "w") - (match_operand:<MVE_VPRED> 4 "vpr_register_operand" "Up")] - VORNQ_M_F)) +;; Vector stores +;; [vstrbq_s8, vstrhq_s16, vstrwq_s32, +;; vstrbq_u8, vstrhq_u16, vstrwq_u32, +;; vst1q ] +(define_insn "@mve_vstrq_<mode>" + [(set (match_operand:MVE_VLD_ST 0 "mve_memory_operand" "=Ux") + (unspec:MVE_VLD_ST + [(match_operand:MVE_VLD_ST 1 "s_register_operand" "w")] + VSTRQ)) ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" - "vpst\;vornt\t%q0, %q2, %q3" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vornq_f<mode>")) - (set_attr "type" "mve_move") - (set_attr 
"length""8")]) + "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode)) + || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))" +{ + rtx ops[2]; + int regno = REGNO (operands[1]); + ops[1] = gen_rtx_REG (TImode, regno); + ops[0] = operands[0]; + output_asm_insn ("vstr<MVE_elem_ch>.<V_sz_elem>\t%q1, %E0",ops); + return ""; +} + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrq_<mode>")) + (set_attr "length" "4")]) -;; -;; [vstrbq_s vstrbq_u] -;; -(define_insn "mve_vstrbq_<supf><mode>" - [(set (match_operand:<MVE_B_ELEM> 0 "mve_memory_operand" "=Ux") - (unspec:<MVE_B_ELEM> [(match_operand:MVE_2 1 "s_register_operand" "w")] - VSTRBQ)) +;; Predicated vector stores +;; [vstrbq_p_s8, vstrhq_p_s16, vstrwq_p_s32, +;; vstrbq_p_u8, vstrhq_p_u16, vstrwq_p_u32, +;; vst1q_p ] +(define_insn "@mve_vstrq_p_<mode>" + [(set (match_operand:MVE_VLD_ST 0 "mve_memory_operand" "=Ux") + (unspec:MVE_VLD_ST [ + (match_operand:MVE_VLD_ST 1 "s_register_operand" "w") + (match_operand:<MVE_VPRED> 2 "vpr_register_operand" "Up") + (match_dup 0) + ] VSTRQ_P)) ] - "TARGET_HAVE_MVE" + "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode)) + || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))" { - rtx ops[2]; - int regno = REGNO (operands[1]); - ops[1] = gen_rtx_REG (TImode, regno); - ops[0] = operands[0]; - output_asm_insn("vstrb.<V_sz_elem>\t%q1, %E0",ops); - return ""; + rtx ops[2]; + int regno = REGNO (operands[1]); + ops[1] = gen_rtx_REG (TImode, regno); + ops[0] = operands[0]; + output_asm_insn ("vpst\;vstr<MVE_elem_ch>t.<V_sz_elem>\t%q1, %E0",ops); + return ""; +} + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrq_<mode>")) + (set_attr "type" "mve_move") + (set_attr "length" "8")]) + +;; Truncating vector stores +;; [vstrbq_s16, vstrbq_s32, vstrhq_s32, +;; vstrbq_u16, vstrbq_u32, vstrhq_u32] +(define_insn "@mve_vstrq_truncate_<mode>" + [(set (match_operand:MVE_w_narrow_TYPE 0 "mve_memory_operand" "=Ux") + (unspec:MVE_w_narrow_TYPE + [(truncate:MVE_w_narrow_TYPE + (match_operand:<MVE_wide_n_TYPE> 1 "s_register_operand" "w"))] + VSTRQ_TRUNC + ))] + "TARGET_HAVE_MVE" +{ + rtx ops[2]; + int regno = REGNO (operands[1]); + ops[1] = gen_rtx_REG (TImode, regno); + ops[0] = operands[0]; + output_asm_insn ("vstr<MVE_elem_ch>.<MVE_wide_n_sz_elem>\t%q1, %E0",ops); + return ""; +} + [(set (attr "mve_unpredicated_insn") + (symbol_ref "CODE_FOR_mve_vstrq_truncate_<mode>")) + (set_attr "length" "4")]) + +;; Predicated truncating vector stores +;; [vstrbq_p_s16, vstrbq_p_s32, vstrhq_p_s32, +;; vstrbq_p_u16, vstrbq_p_u32, vstrhq_p_u32] +(define_insn "@mve_vstrq_p_truncate_<mode>" + [(set (match_operand:MVE_w_narrow_TYPE 0 "mve_memory_operand" "=Ux") + (unspec:MVE_w_narrow_TYPE [ + (truncate:MVE_w_narrow_TYPE + (match_operand:<MVE_wide_n_TYPE> 1 "s_register_operand" "w")) + (match_operand:<MVE_wide_n_VPRED> 2 "vpr_register_operand" "Up") + (match_dup 0) + ] VSTRQ_TRUNC_P))] + "TARGET_HAVE_MVE" +{ + rtx ops[2]; + int regno = REGNO (operands[1]); + ops[1] = gen_rtx_REG (TImode, regno); + ops[0] = operands[0]; + output_asm_insn ( + "vpst\;vstr<MVE_elem_ch>t.<MVE_wide_n_sz_elem>\t%q1, %E0", + ops + ); + return ""; +} + [(set (attr "mve_unpredicated_insn") + (symbol_ref "CODE_FOR_mve_vstrq_truncate_<mode>")) + (set_attr "type" "mve_move") + (set_attr "length" "8")]) + +;; Vector Loads +;; [vldrbq_s8, vldrhq_s16, vldrwq_s32, +;; vldrbq_u8, vldrhq_u16, vldrwq_u32, +;; vld1q ] +(define_insn "@mve_vldrq_<mode>" + [(set (match_operand:MVE_VLD_ST 0 "s_register_operand" "=w") + (unspec:MVE_VLD_ST 
+ [(match_operand:MVE_VLD_ST 1 "mve_memory_operand" "Ux")] + VLDRQ))] + "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode)) + || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))" +{ + rtx ops[2]; + int regno = REGNO (operands[0]); + ops[0] = gen_rtx_REG (TImode, regno); + ops[1] = operands[1]; + output_asm_insn ("vldr<MVE_elem_ch>.<V_sz_elem>\t%q0, %E1",ops); + return ""; + } + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrq_<mode>")) + (set_attr "length" "4")]) + +;; Predicated vector loads +;; [vldrbq_z_s8, vldrhq_z_s16, vldrwq_z_s32, +;; vldrbq_z_u8, vldrhq_z_u16, vldrwq_z_u32, +;; vld1q_z ] +(define_insn "@mve_vldrq_z_<mode>" + [(set (match_operand:MVE_VLD_ST 0 "s_register_operand" "=w") + (unspec:MVE_VLD_ST [ + (match_operand:MVE_VLD_ST 1 "mve_memory_operand" "Ux") + (match_operand:<MVE_VPRED> 2 "vpr_register_operand" "Up") + ] VLDRQ_Z))] + "(TARGET_HAVE_MVE && VALID_MVE_SI_MODE (<MODE>mode)) + || (TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (<MODE>mode))" +{ + rtx ops[2]; + int regno = REGNO (operands[0]); + ops[0] = gen_rtx_REG (TImode, regno); + ops[1] = operands[1]; + output_asm_insn ("vpst\;vldr<MVE_elem_ch>t.<V_sz_elem>\t%q0, %E1",ops); + return ""; +} + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrq_<mode>")) + (set_attr "type" "mve_move") + (set_attr "length" "8")]) + +;; Extending vector loads +;; [vldrbq_s16, vldrbq_s32, vldrhq_s32, +;; vldrbq_u16, vldrbq_u32, vldrhq_u32] +(define_insn "@mve_vldrq_extend_<mode><US>" + [(set (match_operand:<MVE_wide_n_TYPE> 0 "s_register_operand" "=w") + (unspec:<MVE_wide_n_TYPE> + [(SE:<MVE_wide_n_TYPE> + (match_operand:MVE_w_narrow_TYPE 1 "mve_memory_operand" "Ux"))] + VLDRQ_EXT))] + "TARGET_HAVE_MVE" +{ + rtx ops[2]; + int regno = REGNO (operands[0]); + ops[0] = gen_rtx_REG (TImode, regno); + ops[1] = operands[1]; + output_asm_insn ("vldr<MVE_elem_ch>.<US><MVE_wide_n_sz_elem>\t%q0, %E1",ops); + return ""; } - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrbq_<supf><mode>")) + [(set (attr "mve_unpredicated_insn") + (symbol_ref "CODE_FOR_mve_vldrq_extend_<mode><US>")) (set_attr "length" "4")]) +;; Predicated extending vector loads +;; [vldrbq_z_s16, vldrbq_z_s32, vldrhq_z_s32, +;; vldrbq_z_u16, vldrbq_z_u32, vldrhq_z_u32] +(define_insn "@mve_vldrq_z_extend_<mode><US>" + [(set (match_operand:<MVE_wide_n_TYPE> 0 "s_register_operand" "=w") + (unspec:<MVE_wide_n_TYPE> [ + (SE:<MVE_wide_n_TYPE> + (match_operand:MVE_w_narrow_TYPE 1 "mve_memory_operand" "Ux")) + (match_operand:<MVE_wide_n_VPRED> 2 "vpr_register_operand" "Up") + ] VLDRQ_EXT_Z))] + "TARGET_HAVE_MVE" +{ + rtx ops[2]; + int regno = REGNO (operands[0]); + ops[0] = gen_rtx_REG (TImode, regno); + ops[1] = operands[1]; + output_asm_insn ( + "vpst\;vldr<MVE_elem_ch>t.<US><MVE_wide_n_sz_elem>\t%q0, %E1", + ops + ); + return ""; +} + [(set (attr "mve_unpredicated_insn") + (symbol_ref "CODE_FOR_mve_vldrq_extend_<mode><US>")) + (set_attr "type" "mve_move") + (set_attr "length" "8")]) + ;; ;; [vstrbq_scatter_offset_s vstrbq_scatter_offset_u] ;; @@ -3451,29 +3388,6 @@ (set_attr "length" "4")]) ;; -;; [vldrbq_s vldrbq_u] -;; -(define_insn "mve_vldrbq_<supf><mode>" - [(set (match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:<MVE_B_ELEM> 1 "mve_memory_operand" "Ux")] - VLDRBQ)) - ] - "TARGET_HAVE_MVE" -{ - rtx ops[2]; - int regno = REGNO (operands[0]); - ops[0] = gen_rtx_REG (TImode, regno); - ops[1] = operands[1]; - if (<V_sz_elem> == 8) - output_asm_insn ("vldrb.<V_sz_elem>\t%q0, %E1",ops); - else 
- output_asm_insn ("vldrb.<supf><V_sz_elem>\t%q0, %E1",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrbq_<supf><mode>")) - (set_attr "length" "4")]) - -;; ;; [vldrwq_gather_base_s vldrwq_gather_base_u] ;; (define_insn "mve_vldrwq_gather_base_<supf>v4si" @@ -3551,25 +3465,6 @@ [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrwq_scatter_base_<supf>v4si")) (set_attr "length" "8")]) -(define_insn "mve_vstrbq_p_<supf><mode>" - [(set (match_operand:<MVE_B_ELEM> 0 "mve_memory_operand" "=Ux") - (unspec:<MVE_B_ELEM> - [(match_operand:MVE_2 1 "s_register_operand" "w") - (match_operand:<MVE_VPRED> 2 "vpr_register_operand" "Up") - (match_dup 0)] - VSTRBQ))] - "TARGET_HAVE_MVE" -{ - rtx ops[2]; - int regno = REGNO (operands[1]); - ops[1] = gen_rtx_REG (TImode, regno); - ops[0] = operands[0]; - output_asm_insn ("vpst\;vstrbt.<V_sz_elem>\t%q1, %E0",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrbq_<supf><mode>")) - (set_attr "length" "8")]) - ;; ;; [vldrbq_gather_offset_z_s vldrbq_gather_offset_z_u] ;; @@ -3597,30 +3492,6 @@ (set_attr "length" "8")]) ;; -;; [vldrbq_z_s vldrbq_z_u] -;; -(define_insn "mve_vldrbq_z_<supf><mode>" - [(set (match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:<MVE_B_ELEM> 1 "mve_memory_operand" "Ux") - (match_operand:<MVE_VPRED> 2 "vpr_register_operand" "Up")] - VLDRBQ)) - ] - "TARGET_HAVE_MVE" -{ - rtx ops[2]; - int regno = REGNO (operands[0]); - ops[0] = gen_rtx_REG (TImode, regno); - ops[1] = operands[1]; - if (<V_sz_elem> == 8) - output_asm_insn ("vpst\;vldrbt.<V_sz_elem>\t%q0, %E1",ops); - else - output_asm_insn ("vpst\;vldrbt.<supf><V_sz_elem>\t%q0, %E1",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrbq_<supf><mode>")) - (set_attr "length" "8")]) - -;; ;; [vldrwq_gather_base_z_s vldrwq_gather_base_z_u] ;; (define_insn "mve_vldrwq_gather_base_z_<supf>v4si" @@ -3643,26 +3514,6 @@ (set_attr "length" "8")]) ;; -;; [vldrhq_f] -;; -(define_insn "mve_vldrhq_fv8hf" - [(set (match_operand:V8HF 0 "s_register_operand" "=w") - (unspec:V8HF [(match_operand:V8HI 1 "mve_memory_operand" "Ux")] - VLDRHQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -{ - rtx ops[2]; - int regno = REGNO (operands[0]); - ops[0] = gen_rtx_REG (TImode, regno); - ops[1] = operands[1]; - output_asm_insn ("vldrh.16\t%q0, %E1",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrhq_fv8hf")) - (set_attr "length" "4")]) - -;; ;; [vldrhq_gather_offset_s vldrhq_gather_offset_u] ;; (define_insn "mve_vldrhq_gather_offset_<supf><mode>" @@ -3763,176 +3614,6 @@ (set_attr "length" "8")]) ;; -;; [vldrhq_s, vldrhq_u] -;; -(define_insn "mve_vldrhq_<supf><mode>" - [(set (match_operand:MVE_5 0 "s_register_operand" "=w") - (unspec:MVE_5 [(match_operand:<MVE_H_ELEM> 1 "mve_memory_operand" "Ux")] - VLDRHQ)) - ] - "TARGET_HAVE_MVE" -{ - rtx ops[2]; - int regno = REGNO (operands[0]); - ops[0] = gen_rtx_REG (TImode, regno); - ops[1] = operands[1]; - if (<V_sz_elem> == 16) - output_asm_insn ("vldrh.16\t%q0, %E1",ops); - else - output_asm_insn ("vldrh.<supf><V_sz_elem>\t%q0, %E1",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrhq_<supf><mode>")) - (set_attr "length" "4")]) - -;; -;; [vldrhq_z_f] -;; -(define_insn "mve_vldrhq_z_fv8hf" - [(set (match_operand:V8HF 0 "s_register_operand" "=w") - (unspec:V8HF [(match_operand:V8HI 1 "mve_memory_operand" "Ux") - 
(match_operand:<MVE_VPRED> 2 "vpr_register_operand" "Up")] - VLDRHQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -{ - rtx ops[2]; - int regno = REGNO (operands[0]); - ops[0] = gen_rtx_REG (TImode, regno); - ops[1] = operands[1]; - output_asm_insn ("vpst\;vldrht.16\t%q0, %E1",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrhq_fv8hf")) - (set_attr "length" "8")]) - -;; -;; [vldrhq_z_s vldrhq_z_u] -;; -(define_insn "mve_vldrhq_z_<supf><mode>" - [(set (match_operand:MVE_5 0 "s_register_operand" "=w") - (unspec:MVE_5 [(match_operand:<MVE_H_ELEM> 1 "mve_memory_operand" "Ux") - (match_operand:<MVE_VPRED> 2 "vpr_register_operand" "Up")] - VLDRHQ)) - ] - "TARGET_HAVE_MVE" -{ - rtx ops[2]; - int regno = REGNO (operands[0]); - ops[0] = gen_rtx_REG (TImode, regno); - ops[1] = operands[1]; - if (<V_sz_elem> == 16) - output_asm_insn ("vpst\;vldrht.16\t%q0, %E1",ops); - else - output_asm_insn ("vpst\;vldrht.<supf><V_sz_elem>\t%q0, %E1",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrhq_<supf><mode>")) - (set_attr "length" "8")]) - -;; -;; [vldrwq_f] -;; -(define_insn "mve_vldrwq_fv4sf" - [(set (match_operand:V4SF 0 "s_register_operand" "=w") - (unspec:V4SF [(match_operand:V4SI 1 "mve_memory_operand" "Ux")] - VLDRWQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -{ - rtx ops[2]; - int regno = REGNO (operands[0]); - ops[0] = gen_rtx_REG (TImode, regno); - ops[1] = operands[1]; - output_asm_insn ("vldrw.32\t%q0, %E1",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrwq_fv4sf")) - (set_attr "length" "4")]) - -;; -;; [vldrwq_s vldrwq_u] -;; -(define_insn "mve_vldrwq_<supf>v4si" - [(set (match_operand:V4SI 0 "s_register_operand" "=w") - (unspec:V4SI [(match_operand:V4SI 1 "mve_memory_operand" "Ux")] - VLDRWQ)) - ] - "TARGET_HAVE_MVE" -{ - rtx ops[2]; - int regno = REGNO (operands[0]); - ops[0] = gen_rtx_REG (TImode, regno); - ops[1] = operands[1]; - output_asm_insn ("vldrw.32\t%q0, %E1",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrwq_<supf>v4si")) - (set_attr "length" "4")]) - -;; -;; [vldrwq_z_f] -;; -(define_insn "mve_vldrwq_z_fv4sf" - [(set (match_operand:V4SF 0 "s_register_operand" "=w") - (unspec:V4SF [(match_operand:V4SI 1 "mve_memory_operand" "Ux") - (match_operand:V4BI 2 "vpr_register_operand" "Up")] - VLDRWQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -{ - rtx ops[2]; - int regno = REGNO (operands[0]); - ops[0] = gen_rtx_REG (TImode, regno); - ops[1] = operands[1]; - output_asm_insn ("vpst\;vldrwt.32\t%q0, %E1",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrwq_fv4sf")) - (set_attr "length" "8")]) - -;; -;; [vldrwq_z_s vldrwq_z_u] -;; -(define_insn "mve_vldrwq_z_<supf>v4si" - [(set (match_operand:V4SI 0 "s_register_operand" "=w") - (unspec:V4SI [(match_operand:V4SI 1 "mve_memory_operand" "Ux") - (match_operand:V4BI 2 "vpr_register_operand" "Up")] - VLDRWQ)) - ] - "TARGET_HAVE_MVE" -{ - rtx ops[2]; - int regno = REGNO (operands[0]); - ops[0] = gen_rtx_REG (TImode, regno); - ops[1] = operands[1]; - output_asm_insn ("vpst\;vldrwt.32\t%q0, %E1",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrwq_<supf>v4si")) - (set_attr "length" "8")]) - -(define_expand "@mve_vld1q_f<mode>" - [(match_operand:MVE_0 0 "s_register_operand") - (unspec:MVE_0 [(match_operand:<MVE_CNVT> 1 "mve_memory_operand")] VLD1Q_F) - ] - "TARGET_HAVE_MVE || 
TARGET_HAVE_MVE_FLOAT" -{ - emit_insn (gen_mve_vldr<V_sz_elem1>q_f<mode>(operands[0],operands[1])); - DONE; -}) - -(define_expand "@mve_vld1q_<supf><mode>" - [(match_operand:MVE_2 0 "s_register_operand") - (unspec:MVE_2 [(match_operand:MVE_2 1 "mve_memory_operand")] VLD1Q) - ] - "TARGET_HAVE_MVE" -{ - emit_insn (gen_mve_vldr<V_sz_elem1>q_<supf><mode>(operands[0],operands[1])); - DONE; -}) - -;; ;; [vldrdq_gather_base_s vldrdq_gather_base_u] ;; (define_insn "mve_vldrdq_gather_base_<supf>v2di" @@ -4369,71 +4050,6 @@ (set_attr "length" "8")]) ;; -;; [vstrhq_f] -;; -(define_insn "mve_vstrhq_fv8hf" - [(set (match_operand:V8HI 0 "mve_memory_operand" "=Ux") - (unspec:V8HI [(match_operand:V8HF 1 "s_register_operand" "w")] - VSTRHQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -{ - rtx ops[2]; - int regno = REGNO (operands[1]); - ops[1] = gen_rtx_REG (TImode, regno); - ops[0] = operands[0]; - output_asm_insn ("vstrh.16\t%q1, %E0",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrhq_fv8hf")) - (set_attr "length" "4")]) - -;; -;; [vstrhq_p_f] -;; -(define_insn "mve_vstrhq_p_fv8hf" - [(set (match_operand:V8HI 0 "mve_memory_operand" "=Ux") - (unspec:V8HI - [(match_operand:V8HF 1 "s_register_operand" "w") - (match_operand:V8BI 2 "vpr_register_operand" "Up") - (match_dup 0)] - VSTRHQ_F))] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -{ - rtx ops[2]; - int regno = REGNO (operands[1]); - ops[1] = gen_rtx_REG (TImode, regno); - ops[0] = operands[0]; - output_asm_insn ("vpst\;vstrht.16\t%q1, %E0",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrhq_fv8hf")) - (set_attr "length" "8")]) - -;; -;; [vstrhq_p_s vstrhq_p_u] -;; -(define_insn "mve_vstrhq_p_<supf><mode>" - [(set (match_operand:<MVE_H_ELEM> 0 "mve_memory_operand" "=Ux") - (unspec:<MVE_H_ELEM> - [(match_operand:MVE_5 1 "s_register_operand" "w") - (match_operand:<MVE_VPRED> 2 "vpr_register_operand" "Up") - (match_dup 0)] - VSTRHQ)) - ] - "TARGET_HAVE_MVE" -{ - rtx ops[2]; - int regno = REGNO (operands[1]); - ops[1] = gen_rtx_REG (TImode, regno); - ops[0] = operands[0]; - output_asm_insn ("vpst\;vstrht.<V_sz_elem>\t%q1, %E0",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrhq_<supf><mode>")) - (set_attr "length" "8")]) - -;; ;; [vstrhq_scatter_offset_p_s vstrhq_scatter_offset_p_u] ;; (define_expand "mve_vstrhq_scatter_offset_p_<supf><mode>" @@ -4559,130 +4175,6 @@ (set_attr "length" "4")]) ;; -;; [vstrhq_s, vstrhq_u] -;; -(define_insn "mve_vstrhq_<supf><mode>" - [(set (match_operand:<MVE_H_ELEM> 0 "mve_memory_operand" "=Ux") - (unspec:<MVE_H_ELEM> [(match_operand:MVE_5 1 "s_register_operand" "w")] - VSTRHQ)) - ] - "TARGET_HAVE_MVE" -{ - rtx ops[2]; - int regno = REGNO (operands[1]); - ops[1] = gen_rtx_REG (TImode, regno); - ops[0] = operands[0]; - output_asm_insn ("vstrh.<V_sz_elem>\t%q1, %E0",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrhq_<supf><mode>")) - (set_attr "length" "4")]) - -;; -;; [vstrwq_f] -;; -(define_insn "mve_vstrwq_fv4sf" - [(set (match_operand:V4SI 0 "mve_memory_operand" "=Ux") - (unspec:V4SI [(match_operand:V4SF 1 "s_register_operand" "w")] - VSTRWQ_F)) - ] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -{ - rtx ops[2]; - int regno = REGNO (operands[1]); - ops[1] = gen_rtx_REG (TImode, regno); - ops[0] = operands[0]; - output_asm_insn ("vstrw.32\t%q1, %E0",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrwq_fv4sf")) - 
(set_attr "length" "4")]) - -;; -;; [vstrwq_p_f] -;; -(define_insn "mve_vstrwq_p_fv4sf" - [(set (match_operand:V4SI 0 "mve_memory_operand" "=Ux") - (unspec:V4SI - [(match_operand:V4SF 1 "s_register_operand" "w") - (match_operand:V4BI 2 "vpr_register_operand" "Up") - (match_dup 0)] - VSTRWQ_F))] - "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" -{ - rtx ops[2]; - int regno = REGNO (operands[1]); - ops[1] = gen_rtx_REG (TImode, regno); - ops[0] = operands[0]; - output_asm_insn ("vpst\;vstrwt.32\t%q1, %E0",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrwq_fv4sf")) - (set_attr "length" "8")]) - -;; -;; [vstrwq_p_s vstrwq_p_u] -;; -(define_insn "mve_vstrwq_p_<supf>v4si" - [(set (match_operand:V4SI 0 "mve_memory_operand" "=Ux") - (unspec:V4SI - [(match_operand:V4SI 1 "s_register_operand" "w") - (match_operand:V4BI 2 "vpr_register_operand" "Up") - (match_dup 0)] - VSTRWQ))] - "TARGET_HAVE_MVE" -{ - rtx ops[2]; - int regno = REGNO (operands[1]); - ops[1] = gen_rtx_REG (TImode, regno); - ops[0] = operands[0]; - output_asm_insn ("vpst\;vstrwt.32\t%q1, %E0",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrwq_<supf>v4si")) - (set_attr "length" "8")]) - -;; -;; [vstrwq_s vstrwq_u] -;; -(define_insn "mve_vstrwq_<supf>v4si" - [(set (match_operand:V4SI 0 "mve_memory_operand" "=Ux") - (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "w")] - VSTRWQ)) - ] - "TARGET_HAVE_MVE" -{ - rtx ops[2]; - int regno = REGNO (operands[1]); - ops[1] = gen_rtx_REG (TImode, regno); - ops[0] = operands[0]; - output_asm_insn ("vstrw.32\t%q1, %E0",ops); - return ""; -} - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vstrwq_<supf>v4si")) - (set_attr "length" "4")]) - -(define_expand "@mve_vst1q_f<mode>" - [(match_operand:<MVE_CNVT> 0 "mve_memory_operand") - (unspec:<MVE_CNVT> [(match_operand:MVE_0 1 "s_register_operand")] VST1Q_F) - ] - "TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT" -{ - emit_insn (gen_mve_vstr<V_sz_elem1>q_f<mode>(operands[0],operands[1])); - DONE; -}) - -(define_expand "@mve_vst1q_<supf><mode>" - [(match_operand:MVE_2 0 "mve_memory_operand") - (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand")] VST1Q) - ] - "TARGET_HAVE_MVE" -{ - emit_insn (gen_mve_vstr<V_sz_elem1>q_<supf><mode>(operands[0],operands[1])); - DONE; -}) - -;; ;; [vstrdq_scatter_base_p_s vstrdq_scatter_base_p_u] ;; (define_insn "mve_vstrdq_scatter_base_p_<supf>v2di" @@ -5271,258 +4763,79 @@ (set_attr "length" "4")]) ;; -;; [vidupq_n_u]) ;; -(define_expand "mve_vidupq_n_u<mode>" - [(match_operand:MVE_2 0 "s_register_operand") - (match_operand:SI 1 "s_register_operand") - (match_operand:SI 2 "mve_imm_selective_upto_8")] - "TARGET_HAVE_MVE" -{ - rtx temp = gen_reg_rtx (SImode); - emit_move_insn (temp, operands[1]); - rtx inc = gen_int_mode (INTVAL(operands[2]) * <MVE_LANES>, SImode); - emit_insn (gen_mve_vidupq_u<mode>_insn (operands[0], temp, operands[1], - operands[2], inc)); - DONE; -}) - +;; [vddupq_u_insn, vidupq_u_insn] ;; -;; [vidupq_u_insn]) -;; -(define_insn "mve_vidupq_u<mode>_insn" +(define_insn "@mve_<mve_insn>q_u<mode>_insn" [(set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:SI 2 "s_register_operand" "1") (match_operand:SI 3 "mve_imm_selective_upto_8" "Rg")] - VIDUPQ)) + VIDDUPQ)) (set (match_operand:SI 1 "s_register_operand" "=Te") - (plus:SI (match_dup 2) - (match_operand:SI 4 "immediate_operand" "i")))] + (<viddupq_op>:SI (match_dup 2) + (match_operand:SI 4 "immediate_operand" "i")))] 
"TARGET_HAVE_MVE" - "vidup.u%#<V_sz_elem>\t%q0, %1, %3") + "<mve_insn>.u%#<V_sz_elem>\t%q0, %1, %3") ;; -;; [vidupq_m_n_u]) +;; [vddupq_m_wb_u_insn, vidupq_m_wb_u_insn] ;; -(define_expand "mve_vidupq_m_n_u<mode>" - [(match_operand:MVE_2 0 "s_register_operand") - (match_operand:MVE_2 1 "s_register_operand") - (match_operand:SI 2 "s_register_operand") - (match_operand:SI 3 "mve_imm_selective_upto_8") - (match_operand:<MVE_VPRED> 4 "vpr_register_operand")] - "TARGET_HAVE_MVE" -{ - rtx temp = gen_reg_rtx (SImode); - emit_move_insn (temp, operands[2]); - rtx inc = gen_int_mode (INTVAL(operands[3]) * <MVE_LANES>, SImode); - emit_insn (gen_mve_vidupq_m_wb_u<mode>_insn(operands[0], operands[1], temp, - operands[2], operands[3], - operands[4], inc)); - DONE; -}) - -;; -;; [vidupq_m_wb_u_insn]) -;; -(define_insn "mve_vidupq_m_wb_u<mode>_insn" - [(set (match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") - (match_operand:SI 3 "s_register_operand" "2") - (match_operand:SI 4 "mve_imm_selective_upto_8" "Rg") - (match_operand:<MVE_VPRED> 5 "vpr_register_operand" "Up")] - VIDUPQ_M)) - (set (match_operand:SI 2 "s_register_operand" "=Te") - (plus:SI (match_dup 3) - (match_operand:SI 6 "immediate_operand" "i")))] - "TARGET_HAVE_MVE" - "vpst\;\tvidupt.u%#<V_sz_elem>\t%q0, %2, %4" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vidupq_u<mode>_insn")) - (set_attr "length""8")]) - -;; -;; [vddupq_n_u]) -;; -(define_expand "mve_vddupq_n_u<mode>" - [(match_operand:MVE_2 0 "s_register_operand") - (match_operand:SI 1 "s_register_operand") - (match_operand:SI 2 "mve_imm_selective_upto_8")] - "TARGET_HAVE_MVE" -{ - rtx temp = gen_reg_rtx (SImode); - emit_move_insn (temp, operands[1]); - rtx inc = gen_int_mode (INTVAL(operands[2]) * <MVE_LANES>, SImode); - emit_insn (gen_mve_vddupq_u<mode>_insn (operands[0], temp, operands[1], - operands[2], inc)); - DONE; -}) - -;; -;; [vddupq_u_insn]) -;; -(define_insn "mve_vddupq_u<mode>_insn" - [(set (match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:SI 2 "s_register_operand" "1") - (match_operand:SI 3 "immediate_operand" "i")] - VDDUPQ)) - (set (match_operand:SI 1 "s_register_operand" "=Te") - (minus:SI (match_dup 2) - (match_operand:SI 4 "immediate_operand" "i")))] - "TARGET_HAVE_MVE" - "vddup.u%#<V_sz_elem>\t%q0, %1, %3") - -;; -;; [vddupq_m_n_u]) -;; -(define_expand "mve_vddupq_m_n_u<mode>" - [(match_operand:MVE_2 0 "s_register_operand") - (match_operand:MVE_2 1 "s_register_operand") - (match_operand:SI 2 "s_register_operand") - (match_operand:SI 3 "mve_imm_selective_upto_8") - (match_operand:<MVE_VPRED> 4 "vpr_register_operand")] - "TARGET_HAVE_MVE" -{ - rtx temp = gen_reg_rtx (SImode); - emit_move_insn (temp, operands[2]); - rtx inc = gen_int_mode (INTVAL(operands[3]) * <MVE_LANES>, SImode); - emit_insn (gen_mve_vddupq_m_wb_u<mode>_insn(operands[0], operands[1], temp, - operands[2], operands[3], - operands[4], inc)); - DONE; -}) - -;; -;; [vddupq_m_wb_u_insn]) -;; -(define_insn "mve_vddupq_m_wb_u<mode>_insn" +(define_insn "@mve_<mve_insn>q_m_wb_u<mode>_insn" [(set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") (match_operand:SI 3 "s_register_operand" "2") (match_operand:SI 4 "mve_imm_selective_upto_8" "Rg") (match_operand:<MVE_VPRED> 5 "vpr_register_operand" "Up")] - VDDUPQ_M)) + VIDDUPQ_M)) (set (match_operand:SI 2 "s_register_operand" "=Te") - (minus:SI (match_dup 3) - (match_operand:SI 6 
"immediate_operand" "i")))] + (<viddupq_m_op>:SI (match_dup 3) + (match_operand:SI 6 "immediate_operand" "i")))] "TARGET_HAVE_MVE" - "vpst\;vddupt.u%#<V_sz_elem>\t%q0, %2, %4" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vddupq_u<mode>_insn")) + "vpst\;<mve_insn>t.u%#<V_sz_elem>\t%q0, %2, %4" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_u<mode>_insn")) (set_attr "length""8")]) ;; -;; [vdwdupq_n_u]) -;; -(define_expand "mve_vdwdupq_n_u<mode>" - [(match_operand:MVE_2 0 "s_register_operand") - (match_operand:SI 1 "s_register_operand") - (match_operand:DI 2 "s_register_operand") - (match_operand:SI 3 "mve_imm_selective_upto_8")] - "TARGET_HAVE_MVE" -{ - rtx ignore_wb = gen_reg_rtx (SImode); - emit_insn (gen_mve_vdwdupq_wb_u<mode>_insn (operands[0], ignore_wb, - operands[1], operands[2], - operands[3])); - DONE; -}) - -;; -;; [vdwdupq_wb_u]) +;; [vdwdupq_wb_u_insn, viwdupq_wb_u_insn] ;; -(define_expand "mve_vdwdupq_wb_u<mode>" - [(match_operand:SI 0 "s_register_operand") - (match_operand:SI 1 "s_register_operand") - (match_operand:DI 2 "s_register_operand") - (match_operand:SI 3 "mve_imm_selective_upto_8") - (unspec:MVE_2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] - "TARGET_HAVE_MVE" -{ - rtx ignore_vec = gen_reg_rtx (<MODE>mode); - emit_insn (gen_mve_vdwdupq_wb_u<mode>_insn (ignore_vec, operands[0], - operands[1], operands[2], - operands[3])); - DONE; -}) - -;; -;; [vdwdupq_wb_u_insn]) -;; -(define_insn "mve_vdwdupq_wb_u<mode>_insn" +(define_insn "@mve_<mve_insn>q_wb_u<mode>_insn" [(set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:SI 2 "s_register_operand" "1") (subreg:SI (match_operand:DI 3 "s_register_operand" "r") 4) (match_operand:SI 4 "mve_imm_selective_upto_8" "Rg")] - VDWDUPQ)) + VIDWDUPQ)) (set (match_operand:SI 1 "s_register_operand" "=Te") (unspec:SI [(match_dup 2) (subreg:SI (match_dup 3) 4) (match_dup 4)] - VDWDUPQ))] + VIDWDUPQ))] "TARGET_HAVE_MVE" - "vdwdup.u%#<V_sz_elem>\t%q0, %2, %R3, %4" -) - -;; -;; [vdwdupq_m_n_u]) -;; -(define_expand "mve_vdwdupq_m_n_u<mode>" - [(match_operand:MVE_2 0 "s_register_operand") - (match_operand:MVE_2 1 "s_register_operand") - (match_operand:SI 2 "s_register_operand") - (match_operand:DI 3 "s_register_operand") - (match_operand:SI 4 "mve_imm_selective_upto_8") - (match_operand:<MVE_VPRED> 5 "vpr_register_operand")] - "TARGET_HAVE_MVE" -{ - rtx ignore_wb = gen_reg_rtx (SImode); - emit_insn (gen_mve_vdwdupq_m_wb_u<mode>_insn (operands[0], ignore_wb, - operands[1], operands[2], - operands[3], operands[4], - operands[5])); - DONE; -}) - -;; -;; [vdwdupq_m_wb_u]) -;; -(define_expand "mve_vdwdupq_m_wb_u<mode>" - [(match_operand:SI 0 "s_register_operand") - (match_operand:MVE_2 1 "s_register_operand") - (match_operand:SI 2 "s_register_operand") - (match_operand:DI 3 "s_register_operand") - (match_operand:SI 4 "mve_imm_selective_upto_8") - (match_operand:<MVE_VPRED> 5 "vpr_register_operand")] - "TARGET_HAVE_MVE" -{ - rtx ignore_vec = gen_reg_rtx (<MODE>mode); - emit_insn (gen_mve_vdwdupq_m_wb_u<mode>_insn (ignore_vec, operands[0], - operands[1], operands[2], - operands[3], operands[4], - operands[5])); - DONE; -}) + "<mve_insn>.u%#<V_sz_elem>\t%q0, %2, %R3, %4" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_wb_u<mode>_insn")) + (set_attr "type" "mve_move")]) ;; -;; [vdwdupq_m_wb_u_insn]) +;; [vdwdupq_m_wb_u_insn, viwdupq_m_wb_u_insn] ;; -(define_insn "mve_vdwdupq_m_wb_u<mode>_insn" +(define_insn "@mve_<mve_insn>q_m_wb_u<mode>_insn" [(set 
(match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:MVE_2 2 "s_register_operand" "0") - (match_operand:SI 3 "s_register_operand" "1") + (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0") + (match_operand:SI 3 "s_register_operand" "2") (subreg:SI (match_operand:DI 4 "s_register_operand" "r") 4) (match_operand:SI 5 "mve_imm_selective_upto_8" "Rg") (match_operand:<MVE_VPRED> 6 "vpr_register_operand" "Up")] - VDWDUPQ_M)) - (set (match_operand:SI 1 "s_register_operand" "=Te") - (unspec:SI [(match_dup 2) + VIDWDUPQ_M)) + (set (match_operand:SI 2 "s_register_operand" "=Te") + (unspec:SI [(match_dup 1) (match_dup 3) (subreg:SI (match_dup 4) 4) (match_dup 5) (match_dup 6)] - VDWDUPQ_M)) + VIDWDUPQ_M)) ] "TARGET_HAVE_MVE" - "vpst\;vdwdupt.u%#<V_sz_elem>\t%q2, %3, %R4, %5" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vdwdupq_wb_u<mode>_insn")) + "vpst\;<mve_insn>t.u%#<V_sz_elem>\t%q1, %3, %R4, %5" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_wb_u<mode>_insn")) (set_attr "type" "mve_move") (set_attr "length""8")]) @@ -5562,24 +4875,6 @@ }) ;; -;; [viwdupq_wb_u_insn]) -;; -(define_insn "mve_viwdupq_wb_u<mode>_insn" - [(set (match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:SI 2 "s_register_operand" "1") - (subreg:SI (match_operand:DI 3 "s_register_operand" "r") 4) - (match_operand:SI 4 "mve_imm_selective_upto_8" "Rg")] - VIWDUPQ)) - (set (match_operand:SI 1 "s_register_operand" "=Te") - (unspec:SI [(match_dup 2) - (subreg:SI (match_dup 3) 4) - (match_dup 4)] - VIWDUPQ))] - "TARGET_HAVE_MVE" - "viwdup.u%#<V_sz_elem>\t%q0, %2, %R3, %4" -) - -;; ;; [viwdupq_m_n_u]) ;; (define_expand "mve_viwdupq_m_n_u<mode>" @@ -5620,31 +4915,6 @@ }) ;; -;; [viwdupq_m_wb_u_insn]) -;; -(define_insn "mve_viwdupq_m_wb_u<mode>_insn" - [(set (match_operand:MVE_2 0 "s_register_operand" "=w") - (unspec:MVE_2 [(match_operand:MVE_2 2 "s_register_operand" "0") - (match_operand:SI 3 "s_register_operand" "1") - (subreg:SI (match_operand:DI 4 "s_register_operand" "r") 4) - (match_operand:SI 5 "mve_imm_selective_upto_8" "Rg") - (match_operand:<MVE_VPRED> 6 "vpr_register_operand" "Up")] - VIWDUPQ_M)) - (set (match_operand:SI 1 "s_register_operand" "=Te") - (unspec:SI [(match_dup 2) - (match_dup 3) - (subreg:SI (match_dup 4) 4) - (match_dup 5) - (match_dup 6)] - VIWDUPQ_M)) - ] - "TARGET_HAVE_MVE" - "vpst\;\tviwdupt.u%#<V_sz_elem>\t%q2, %3, %R4, %5" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_viwdupq_wb_u<mode>_insn")) - (set_attr "type" "mve_move") - (set_attr "length""8")]) - -;; ;; [vstrwq_scatter_base_wb_s vstrwq_scatter_base_wb_u] ;; (define_insn "mve_vstrwq_scatter_base_wb_<supf>v4si" @@ -6150,160 +5420,89 @@ } [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vldrdq_gather_base_wb_<supf>v2di_insn")) (set_attr "length" "8")]) -;; -;; [vadciq_m_s, vadciq_m_u]) -;; -(define_insn "mve_vadciq_m_<supf>v4si" - [(set (match_operand:V4SI 0 "s_register_operand" "=w") - (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "0") - (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:V4BI 4 "vpr_register_operand" "Up")] - VADCIQ_M)) - (set (reg:SI VFPCC_REGNUM) - (unspec:SI [(const_int 0)] - VADCIQ_M)) - ] - "TARGET_HAVE_MVE" - "vpst\;vadcit.i32\t%q0, %q2, %q3" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vadciq_<supf>v4si")) - (set_attr "type" "mve_move") - (set_attr "length" "8")]) ;; -;; [vadciq_u, 
vadciq_s]) +;; [vadciq_u, vadciq_s] +;; [vsbciq_s, vsbciq_u] ;; -(define_insn "mve_vadciq_<supf>v4si" +(define_insn "@mve_<mve_insn>q_<supf>v4si" [(set (match_operand:V4SI 0 "s_register_operand" "=w") (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w")] - VADCIQ)) + VxCIQ)) (set (reg:SI VFPCC_REGNUM) (unspec:SI [(const_int 0)] - VADCIQ)) + VxCIQ)) ] "TARGET_HAVE_MVE" - "vadci.i32\t%q0, %q1, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vadciq_<supf>v4si")) + "<mve_insn>.i32\t%q0, %q1, %q2" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_<supf>v4si")) (set_attr "type" "mve_move") (set_attr "length" "4")]) ;; -;; [vadcq_m_s, vadcq_m_u]) +;; [vadciq_m_s, vadciq_m_u] +;; [vsbciq_m_u, vsbciq_m_s] ;; -(define_insn "mve_vadcq_m_<supf>v4si" +(define_insn "@mve_<mve_insn>q_m_<supf>v4si" [(set (match_operand:V4SI 0 "s_register_operand" "=w") (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") (match_operand:V4BI 4 "vpr_register_operand" "Up")] - VADCQ_M)) + VxCIQ_M)) (set (reg:SI VFPCC_REGNUM) - (unspec:SI [(reg:SI VFPCC_REGNUM)] - VADCQ_M)) + (unspec:SI [(const_int 0)] + VxCIQ_M)) ] "TARGET_HAVE_MVE" - "vpst\;vadct.i32\t%q0, %q2, %q3" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vadcq_<supf>v4si")) + "vpst\;<mve_insn>t.i32\t%q0, %q2, %q3" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_<supf>v4si")) (set_attr "type" "mve_move") (set_attr "length" "8")]) ;; -;; [vadcq_u, vadcq_s]) +;; [vadcq_u, vadcq_s] +;; [vsbcq_s, vsbcq_u] ;; -(define_insn "mve_vadcq_<supf>v4si" +(define_insn "@mve_<mve_insn>q_<supf>v4si" [(set (match_operand:V4SI 0 "s_register_operand" "=w") (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "w") (match_operand:V4SI 2 "s_register_operand" "w")] - VADCQ)) + VxCQ)) (set (reg:SI VFPCC_REGNUM) (unspec:SI [(reg:SI VFPCC_REGNUM)] - VADCQ)) + VxCQ)) ] "TARGET_HAVE_MVE" - "vadc.i32\t%q0, %q1, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vadcq_<supf>v4si")) + "<mve_insn>.i32\t%q0, %q1, %q2" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_<supf>v4si")) (set_attr "type" "mve_move") (set_attr "length" "4") (set_attr "conds" "set")]) ;; -;; [vsbciq_m_u, vsbciq_m_s]) +;; [vadcq_m_s, vadcq_m_u] +;; [vsbcq_m_u, vsbcq_m_s] ;; -(define_insn "mve_vsbciq_m_<supf>v4si" +(define_insn "@mve_<mve_insn>q_m_<supf>v4si" [(set (match_operand:V4SI 0 "s_register_operand" "=w") - (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "w") - (match_operand:V4SI 2 "s_register_operand" "w") - (match_operand:V4SI 3 "s_register_operand" "w") - (match_operand:V4BI 4 "vpr_register_operand" "Up")] - VSBCIQ_M)) - (set (reg:SI VFPCC_REGNUM) - (unspec:SI [(const_int 0)] - VSBCIQ_M)) - ] - "TARGET_HAVE_MVE" - "vpst\;vsbcit.i32\t%q0, %q2, %q3" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vsbciq_<supf>v4si")) - (set_attr "type" "mve_move") - (set_attr "length" "8")]) - -;; -;; [vsbciq_s, vsbciq_u]) -;; -(define_insn "mve_vsbciq_<supf>v4si" - [(set (match_operand:V4SI 0 "s_register_operand" "=w") - (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "w") - (match_operand:V4SI 2 "s_register_operand" "w")] - VSBCIQ)) - (set (reg:SI VFPCC_REGNUM) - (unspec:SI [(const_int 0)] - VSBCIQ)) - ] - "TARGET_HAVE_MVE" - "vsbci.i32\t%q0, %q1, %q2" - [(set (attr "mve_unpredicated_insn") 
(symbol_ref "CODE_FOR_mve_vsbciq_<supf>v4si")) - (set_attr "type" "mve_move") - (set_attr "length" "4")]) - -;; -;; [vsbcq_m_u, vsbcq_m_s]) -;; -(define_insn "mve_vsbcq_m_<supf>v4si" - [(set (match_operand:V4SI 0 "s_register_operand" "=w") - (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "w") + (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "0") (match_operand:V4SI 2 "s_register_operand" "w") (match_operand:V4SI 3 "s_register_operand" "w") (match_operand:V4BI 4 "vpr_register_operand" "Up")] - VSBCQ_M)) + VxCQ_M)) (set (reg:SI VFPCC_REGNUM) (unspec:SI [(reg:SI VFPCC_REGNUM)] - VSBCQ_M)) + VxCQ_M)) ] "TARGET_HAVE_MVE" - "vpst\;vsbct.i32\t%q0, %q2, %q3" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vsbcq_<supf>v4si")) + "vpst\;<mve_insn>t.i32\t%q0, %q2, %q3" + [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_<mve_insn>q_<supf>v4si")) (set_attr "type" "mve_move") (set_attr "length" "8")]) ;; -;; [vsbcq_s, vsbcq_u]) -;; -(define_insn "mve_vsbcq_<supf>v4si" - [(set (match_operand:V4SI 0 "s_register_operand" "=w") - (unspec:V4SI [(match_operand:V4SI 1 "s_register_operand" "w") - (match_operand:V4SI 2 "s_register_operand" "w")] - VSBCQ)) - (set (reg:SI VFPCC_REGNUM) - (unspec:SI [(reg:SI VFPCC_REGNUM)] - VSBCQ)) - ] - "TARGET_HAVE_MVE" - "vsbc.i32\t%q0, %q1, %q2" - [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_vsbcq_<supf>v4si")) - (set_attr "type" "mve_move") - (set_attr "length" "4")]) - -;; ;; [vst2q]) ;; (define_insn "mve_vst2q<mode>" @@ -6652,39 +5851,7 @@ ;; ;; [vshlcq_m_u vshlcq_m_s] ;; -(define_expand "mve_vshlcq_m_vec_<supf><mode>" - [(match_operand:MVE_2 0 "s_register_operand") - (match_operand:MVE_2 1 "s_register_operand") - (match_operand:SI 2 "s_register_operand") - (match_operand:SI 3 "mve_imm_32") - (match_operand:<MVE_VPRED> 4 "vpr_register_operand") - (unspec:MVE_2 [(const_int 0)] VSHLCQ_M)] - "TARGET_HAVE_MVE" -{ - rtx ignore_wb = gen_reg_rtx (SImode); - emit_insn (gen_mve_vshlcq_m_<supf><mode> (operands[0], ignore_wb, operands[1], - operands[2], operands[3], - operands[4])); - DONE; -}) - -(define_expand "mve_vshlcq_m_carry_<supf><mode>" - [(match_operand:SI 0 "s_register_operand") - (match_operand:MVE_2 1 "s_register_operand") - (match_operand:SI 2 "s_register_operand") - (match_operand:SI 3 "mve_imm_32") - (match_operand:<MVE_VPRED> 4 "vpr_register_operand") - (unspec:MVE_2 [(const_int 0)] VSHLCQ_M)] - "TARGET_HAVE_MVE" -{ - rtx ignore_vec = gen_reg_rtx (<MODE>mode); - emit_insn (gen_mve_vshlcq_m_<supf><mode> (ignore_vec, operands[0], - operands[1], operands[2], - operands[3], operands[4])); - DONE; -}) - -(define_insn "mve_vshlcq_m_<supf><mode>" +(define_insn "@mve_vshlcq_m_<supf><mode>" [(set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 2 "s_register_operand" "0") (match_operand:SI 3 "s_register_operand" "1") diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index fa4a7ae..6892b7b 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2989,7 +2989,7 @@ ;; ... ;; ;; and so the vectorizer provides r, in which the result has to be accumulated. 
-(define_insn "<sup>dot_prod<vsi2qi>" +(define_insn "<sup>dot_prod<mode><vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand" "=w") (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1 "register_operand" "w") @@ -3002,7 +3002,7 @@ ) ;; These instructions map to the __builtins for the Dot Product operations -(define_expand "neon_<sup>dot<vsi2qi>" +(define_expand "neon_<sup>dot<mode><vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand" "=w") (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand") @@ -3013,7 +3013,7 @@ ) ;; These instructions map to the __builtins for the Dot Product operations. -(define_insn "neon_usdot<vsi2qi>" +(define_insn "neon_usdot<mode><vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand" "=w") (plus:VCVTI (unspec:VCVTI @@ -3112,7 +3112,7 @@ ) ;; Auto-vectorizer pattern for usdot -(define_expand "usdot_prod<vsi2qi>" +(define_expand "usdot_prod<mode><vsi2qi>" [(set (match_operand:VCVTI 0 "register_operand") (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1 "register_operand") diff --git a/gcc/config/arm/symbian.h b/gcc/config/arm/symbian.h index 6431d61..4bfe9c8 100644 --- a/gcc/config/arm/symbian.h +++ b/gcc/config/arm/symbian.h @@ -1,6 +1,6 @@ /* Configuration file for Symbian OS on ARM processors. Copyright (C) 2004-2024 Free Software Foundation, Inc. - Contributed by CodeSourcery, LLC + Contributed by CodeSourcery, LLC This file is part of GCC. @@ -29,7 +29,7 @@ Make all symbols hidden by default. Symbian OS expects that all exported symbols will be explicitly marked with - "__declspec(dllexport)". + "__declspec(dllexport)". Enumeration types use 4 bytes, even if the enumerals are small, unless explicitly overridden. @@ -63,7 +63,7 @@ #undef SUBTARGET_ASM_FLOAT_SPEC #define SUBTARGET_ASM_FLOAT_SPEC \ "%{!mfpu=*:-mfpu=vfp} %{!mcpu=*:%{!march=*:-march=armv5t}}" - + /* Define the __symbian__ macro. */ #undef TARGET_OS_CPP_BUILTINS #define TARGET_OS_CPP_BUILTINS() \ diff --git a/gcc/config/arm/unknown-elf.h b/gcc/config/arm/unknown-elf.h index 6b31304..b1a1764 100644 --- a/gcc/config/arm/unknown-elf.h +++ b/gcc/config/arm/unknown-elf.h @@ -91,6 +91,6 @@ /* The libgcc udivmod functions may throw exceptions. If newlib is configured to support long longs in I/O, then printf will depend on udivmoddi4, which will depend on the exception unwind routines, - which will depend on abort, which is defined in libc. */ + which will depend on abort, which is defined in libc. 
*/ #undef LINK_GCC_C_SEQUENCE_SPEC #define LINK_GCC_C_SEQUENCE_SPEC "--start-group %G %{!nolibc:%L} --end-group" diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index f5f4d15..01963d5 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -1150,27 +1150,18 @@ VSTRWQSB_U VSTRBQSO_S VSTRBQSO_U - VSTRBQ_S - VSTRBQ_U + VLDRQ + VLDRQ_Z + VLDRQ_EXT + VLDRQ_EXT_Z VLDRBQGO_S VLDRBQGO_U - VLDRBQ_S - VLDRBQ_U VLDRWQGB_S VLDRWQGB_U - VLD1Q_F - VLD1Q_S - VLD1Q_U - VLDRHQ_F VLDRHQGO_S VLDRHQGO_U VLDRHQGSO_S VLDRHQGSO_U - VLDRHQ_S - VLDRHQ_U - VLDRWQ_F - VLDRWQ_S - VLDRWQ_U VLDRDQGB_S VLDRDQGB_U VLDRDQGO_S @@ -1186,15 +1177,11 @@ VLDRWQGSO_F VLDRWQGSO_S VLDRWQGSO_U - VSTRHQ_F - VST1Q_S - VST1Q_U + VSTRQ + VSTRQ_P + VSTRQ_TRUNC + VSTRQ_TRUNC_P VSTRHQSO_S - VSTRHQ_U - VSTRWQ_S - VSTRWQ_U - VSTRWQ_F - VST1Q_F VSTRDQSB_S VSTRDQSB_U VSTRDQSO_S diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md index 773f556..3212d9c 100644 --- a/gcc/config/arm/vfp.md +++ b/gcc/config/arm/vfp.md @@ -367,7 +367,7 @@ case 8: return \"vmov%?\\t%Q0, %R0, %P1\\t%@ int\"; case 9: - if (TARGET_VFP_SINGLE || TARGET_HAVE_MVE) + if (TARGET_VFP_SINGLE && !TARGET_HAVE_MVE) return \"vmov%?.f32\\t%0, %1\\t%@ int\;vmov%?.f32\\t%p0, %p1\\t%@ int\"; else return \"vmov%?.f64\\t%P0, %P1\\t%@ int\"; @@ -385,7 +385,7 @@ (symbol_ref "arm_count_output_move_double_insns (operands) * 4") (eq_attr "alternative" "9") (if_then_else - (match_test "TARGET_VFP_SINGLE") + (match_test "TARGET_VFP_SINGLE && !TARGET_HAVE_MVE") (const_int 8) (const_int 4))] (const_int 4))) @@ -744,7 +744,7 @@ case 6: case 7: case 9: return output_move_double (operands, true, NULL); case 8: - if (TARGET_VFP_SINGLE) + if (TARGET_VFP_SINGLE && !TARGET_HAVE_MVE) return \"vmov%?.f32\\t%0, %1\;vmov%?.f32\\t%p0, %p1\"; else return \"vmov%?.f64\\t%P0, %P1\"; @@ -758,7 +758,7 @@ (set (attr "length") (cond [(eq_attr "alternative" "6,7,9") (const_int 8) (eq_attr "alternative" "8") (if_then_else - (match_test "TARGET_VFP_SINGLE") + (match_test "TARGET_VFP_SINGLE && !TARGET_HAVE_MVE") (const_int 8) (const_int 4))] (const_int 4))) diff --git a/gcc/config/arm/vxworks.h b/gcc/config/arm/vxworks.h index 7765d92..bafc650 100644 --- a/gcc/config/arm/vxworks.h +++ b/gcc/config/arm/vxworks.h @@ -1,10 +1,10 @@ /* Definitions of target machine for GCC, - for ARM with targeting the VXWorks run time environment. + for ARM with targeting the VXWorks run time environment. Copyright (C) 1999-2024 Free Software Foundation, Inc. Contributed by: Mike Stump <mrs@wrs.com> Brought up to date by CodeSourcery, LLC. - + This file is part of GCC. GCC is free software; you can redistribute it and/or modify diff --git a/gcc/config/avr/avr-arch.h b/gcc/config/avr/avr-arch.h index 69e8db1..072e44d 100644 --- a/gcc/config/avr/avr-arch.h +++ b/gcc/config/avr/avr-arch.h @@ -1,5 +1,4 @@ -/* Definitions of types that are used to store AVR architecture and - device information. +/* Device information for AVR 8-bit microcontrollers. Copyright (C) 2012-2024 Free Software Foundation, Inc. Contributed by Georg-Johann Lay (avr@gjlay.de) @@ -14,7 +13,7 @@ GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. 
*/ diff --git a/gcc/config/avr/avr-c.cc b/gcc/config/avr/avr-c.cc index ca484f2..d3c40d7 100644 --- a/gcc/config/avr/avr-c.cc +++ b/gcc/config/avr/avr-c.cc @@ -1,4 +1,5 @@ -/* Copyright (C) 2009-2024 Free Software Foundation, Inc. +/* Code for the C/C++ front end for AVR 8-bit microcontrollers. + Copyright (C) 2009-2024 Free Software Foundation, Inc. Contributed by Anatoly Sokolov (aesok@post.ru) This file is part of GCC. @@ -7,12 +8,12 @@ it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. - + GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ @@ -50,7 +51,7 @@ static tree avr_resolve_overloaded_builtin (unsigned int iloc, tree fndecl, void *vargs) { tree type0, type1, fold = NULL_TREE; - enum avr_builtin_id id = AVR_BUILTIN_COUNT; + avr_builtin_id id = AVR_BUILTIN_COUNT; location_t loc = (location_t) iloc; vec<tree, va_gc> &args = * (vec<tree, va_gc>*) vargs; @@ -290,7 +291,7 @@ avr_toupper (char *up, const char *lo) /* Worker function for TARGET_CPU_CPP_BUILTINS. */ void -avr_cpu_cpp_builtins (struct cpp_reader *pfile) +avr_cpu_cpp_builtins (cpp_reader *pfile) { builtin_define_std ("AVR"); diff --git a/gcc/config/avr/avr-devices.cc b/gcc/config/avr/avr-devices.cc index 456a6b7..9bb55dc 100644 --- a/gcc/config/avr/avr-devices.cc +++ b/gcc/config/avr/avr-devices.cc @@ -7,12 +7,12 @@ it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. - + GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ @@ -20,6 +20,7 @@ #ifndef IN_GEN_AVR_MMCU_TEXI #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/avr/avr-dimode.md b/gcc/config/avr/avr-dimode.md index c357213..5933721 100644 --- a/gcc/config/avr/avr-dimode.md +++ b/gcc/config/avr/avr-dimode.md @@ -1,5 +1,4 @@ -;; Machine description for GNU compiler, -;; for Atmel AVR micro controllers. +;; Support 64-bit operations for AVR 8-bit microcontrollers. ;; Copyright (C) 1998-2024 Free Software Foundation, Inc. 
;; Contributed by Georg Lay (avr@gjlay.de) ;; @@ -460,11 +459,11 @@ (label_ref (match_operand 3)) (pc)))] "avr_have_dimode" - { + { int icode = (int) GET_CODE (operands[0]); targetm.canonicalize_comparison (&icode, &operands[1], &operands[2], false); - operands[0] = gen_rtx_fmt_ee ((enum rtx_code) icode, + operands[0] = gen_rtx_fmt_ee ((rtx_code) icode, VOIDmode, operands[1], operands[2]); rtx acc_a = gen_rtx_REG (<MODE>mode, ACC_A); @@ -489,7 +488,7 @@ emit_jump_insn (gen_cbranch_<mode>2_split (operands[0], operands[3])); } DONE; - }) + }) (define_insn_and_split "cbranch_<mode>2_split" [(set (pc) diff --git a/gcc/config/avr/avr-fixed.md b/gcc/config/avr/avr-fixed.md index 911b8b2..eb83751a 100644 --- a/gcc/config/avr/avr-fixed.md +++ b/gcc/config/avr/avr-fixed.md @@ -1,5 +1,4 @@ -;; This file contains instructions that support fixed-point operations -;; for Atmel AVR micro controllers. +;; Support fixed-point operations for AVR 8-bit microcontrollers. ;; Copyright (C) 2012-2024 Free Software Foundation, Inc. ;; ;; Contributed by Sean D'Epagnier (sean@depagnier.com) diff --git a/gcc/config/avr/avr-log.cc b/gcc/config/avr/avr-log.cc index d702c5f..5708ac3 100644 --- a/gcc/config/avr/avr-log.cc +++ b/gcc/config/avr/avr-log.cc @@ -1,4 +1,4 @@ -/* Subroutines for log output for Atmel AVR back end. +/* Subroutines for log output for AVR 8-bit microcontrollers. Copyright (C) 2011-2024 Free Software Foundation, Inc. Contributed by Georg-Johann Lay (avr@gjlay.de) @@ -8,12 +8,12 @@ it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. - + GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ diff --git a/gcc/config/avr/avr-mcus.def b/gcc/config/avr/avr-mcus.def index 068875a..f7401ab 100644 --- a/gcc/config/avr/avr-mcus.def +++ b/gcc/config/avr/avr-mcus.def @@ -1,4 +1,4 @@ -/* AVR MCUs. +/* Information on supported AVR 8-bit microcontrollers. Copyright (C) 2009-2024 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/avr/avr-modes.def b/gcc/config/avr/avr-modes.def index e0633d6..e69636a 100644 --- a/gcc/config/avr/avr-modes.def +++ b/gcc/config/avr/avr-modes.def @@ -1,4 +1,5 @@ -/* Copyright (C) 2012-2024 Free Software Foundation, Inc. +/* Extra machine modes for AVR 8-bit microcontrollers. + Copyright (C) 2012-2024 Free Software Foundation, Inc. This file is part of GCC. @@ -18,6 +19,12 @@ FRACTIONAL_INT_MODE (PSI, 24, 3); +/* Used when the N (and Z) flag(s) of SREG are set. + The N flag indicates whether the value is negative. + The Z flag indicates whether the value is zero. */ +CC_MODE (CCN); +CC_MODE (CCZN); + /* Make TA and UTA 64 bits wide. 128 bit wide modes would be insane on a 8-bit machine. This needs special treatment in avr.cc and avr-lib.h. */ diff --git a/gcc/config/avr/avr-passes.cc b/gcc/config/avr/avr-passes.cc new file mode 100644 index 0000000..205b490 --- /dev/null +++ b/gcc/config/avr/avr-passes.cc @@ -0,0 +1,1939 @@ +/* Support for avr-passes.def for AVR 8-bit microcontrollers. + Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +#define IN_TARGET_CODE 1 + +#define INCLUDE_VECTOR +#include "config.h" +#include "system.h" +#include "intl.h" +#include "coretypes.h" +#include "backend.h" +#include "target.h" +#include "rtl.h" +#include "tree.h" +#include "cfghooks.h" +#include "cfganal.h" +#include "df.h" +#include "memmodel.h" +#include "tm_p.h" +#include "optabs.h" +#include "regs.h" +#include "emit-rtl.h" +#include "recog.h" +#include "explow.h" +#include "cfgrtl.h" +#include "context.h" +#include "tree-pass.h" + +namespace +{ + + +////////////////////////////////////////////////////////////////////////////// +// Try to replace 2 cbranch insns with 1 comparison and 2 branches. + +static const pass_data avr_pass_data_ifelse = +{ + RTL_PASS, // type + "", // name (will be patched) + OPTGROUP_NONE, // optinfo_flags + TV_DF_SCAN, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + TODO_df_finish | TODO_df_verify // todo_flags_finish +}; + +class avr_pass_ifelse : public rtl_opt_pass +{ +public: + avr_pass_ifelse (gcc::context *ctxt, const char *name) + : rtl_opt_pass (avr_pass_data_ifelse, ctxt) + { + this->name = name; + } + + bool gate (function *) final override + { + return optimize > 0; + } + + unsigned int execute (function *func) final override; +}; // avr_pass_ifelse + + +/* Return TRUE iff comparison code CODE is explicitly signed. */ + +static bool +avr_strict_signed_p (rtx_code code) +{ + return code == GT || code == GE || code == LT || code == LE; +} + + +/* Return TRUE iff comparison code CODE is explicitly unsigned. */ + +static bool +avr_strict_unsigned_p (rtx_code code) +{ + return code == GTU || code == GEU || code == LTU || code == LEU; +} + +#include "config/avr/ranges.h" + +/* Suppose the inputs represent a code like + + if (x <CMP1> XVAL1) goto way1; + if (x <CMP2> XVAL2) goto way2; + way3:; + + with two integer mode comparisons where XVAL1 and XVAL2 are CONST_INT. + When this can be rewritten in the form + + if (x <cond1> xval) goto way1; + if (x <cond2> xval) goto way2; + way3:; + + then set CMP1 = cond1, CMP2 = cond2, and return xval. Else return NULL_RTX. + When SWAPT is returned true, then way1 and way2 must be swapped. + When the incomping SWAPT is false, the outgoing one will be false, too. */ + +static rtx +avr_2comparisons_rhs (rtx_code &cmp1, rtx xval1, + rtx_code &cmp2, rtx xval2, + machine_mode mode, bool &swapt) +{ + const bool may_swapt = swapt; + swapt = false; + + ////////////////////////////////////////////////////////////////// + // Step 0: Decide about signedness, map xval1/2 to the range + // of [un]signed machine mode. 
+ + const bool signed1_p = avr_strict_signed_p (cmp1); + const bool signed2_p = avr_strict_signed_p (cmp2); + const bool unsigned1_p = avr_strict_unsigned_p (cmp1); + const bool unsigned2_p = avr_strict_unsigned_p (cmp2); + const bool signed_p = signed1_p || signed2_p; + bool unsigned_p = unsigned1_p || unsigned2_p; + + using T = Ranges::scalar_type; + T val1 = INTVAL (xval1); + T val2 = INTVAL (xval2); + + if (signed_p + unsigned_p > 1) + { + // Don't go down that rabbit hole. When the RHSs are the + // same, we can still save one comparison. + return val1 == val2 ? xval1 : NULL_RTX; + } + + // Decide about signedness. When no explicit signedness is present, + // then cases that are close to the unsigned boundary like EQ 0, EQ 1 + // can also be optimized. + if (unsigned_p + || (! signed_p && IN_RANGE (val1, -2, 2))) + { + unsigned_p = true; + val1 = UINTVAL (xval1) & GET_MODE_MASK (mode); + val2 = UINTVAL (xval2) & GET_MODE_MASK (mode); + } + + // No way we can decompose the domain in a usable manner when the + // RHSes are too far apart. + if (! IN_RANGE (val1 - val2, -2, 2)) + return NULL_RTX; + + ////////////////////////////////////////////////////////////////// + // Step 1: Represent the input conditions as truth Ranges. This + // establishes a decomposition / coloring of the domain. + + Ranges dom = Ranges::NBitsRanges (GET_MODE_BITSIZE (mode), unsigned_p, + Ranges::ALL); + Ranges r[4] = { dom, dom.truth (cmp1, val1), dom.truth (cmp2, val2), dom }; + + // r[1] shadows r[2] shadows r[3]. r[0] is just for nice indices. + r[3].minus (r[2]); + r[3].minus (r[1]); + r[2].minus (r[1]); + + ////////////////////////////////////////////////////////////////// + // Step 2: Filter for cases where the domain decomposes into three + // intervals: One to the left, one to the right, and one + // in the middle where the latter holds exactly one value. + + for (int i = 1; i <= 3; ++i) + { + // Keep track of which Ranges is which. + r[i].tag = i; + + gcc_assert (r[i].check ()); + + // Filter for proper intervals. Also return for the empty set, + // since cases where [m_min, m_max] decomposes into two intervals + // or less have been sorted out by the generic optimizers already, + // and hence should not be seen here. And more than two intervals + // at a time cannot be optimized of course. + if (r[i].size () != 1) + return NULL_RTX; + } + + // Bubble-sort the three intervals such that: + // [1] is the left interval, i.e. the one taken by LT[U]. + // [2] is the middle interval, i.e. the one taken by EQ. + // [3] is the right interval, i.e. the one taken by GT[U]. + Ranges::sort2 (r[1], r[3]); + Ranges::sort2 (r[2], r[3]); + Ranges::sort2 (r[1], r[2]); + + if (dump_file) + fprintf (dump_file, + ";; Decomposed: .%d=[%ld, %ld] .%d=[%ld, %ld] .%d=[%ld, %ld]\n", + r[1].tag, (long) r[1].ranges[0].lo, (long) r[1].ranges[0].hi, + r[2].tag, (long) r[2].ranges[0].lo, (long) r[2].ranges[0].hi, + r[3].tag, (long) r[3].ranges[0].lo, (long) r[3].ranges[0].hi); + + // EQ / NE can handle only one value. + if (r[2].cardinality (0) != 1) + return NULL_RTX; + + // Success! This is the sought for xval. + const T val = r[2].ranges[0].lo; + + ////////////////////////////////////////////////////////////////// + // Step 3: Work out which label gets which condition, trying to + // avoid the expensive codes GT[U] and LE[U] if possible. + // Avoiding expensive codes is always possible when labels + // way1 and way2 may be swapped. 
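  // Worked example (editorial illustration, not part of the patch),
  // assuming QImode with cmp1 = GEU, xval1 = 2 and cmp2 = EQ, xval2 = 1:
  //   r[1] = [2, 255]  -- values taken by cmp1
  //   r[2] = [1, 1]    -- values taken by cmp2
  //   r[3] = [0, 0]    -- values taken by neither branch
  // The middle interval holds exactly one value, so the function returns
  // xval = 1 and both branches can be driven by one comparison against 1:
  // when way1 and way2 may be swapped, the conditions become EQ and GEU
  // (with the branch targets exchanged), avoiding the expensive GTU.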
+ + // The xx1 ways have an expensive GT for cmp1 which can be avoided + // by swapping way1 with way2. + swapt = may_swapt && r[3].tag == 1; + if (swapt) + std::swap (r[3], r[2].tag == 2 ? r[2] : r[1]); + + // 6 = 3! ways to assign LT, EQ, GT to the three labels. + const int way = 100 * r[1].tag + 10 * r[2].tag + r[3].tag; + + if (dump_file) + fprintf (dump_file, ";; Success: unsigned=%d, swapt=%d, way=%d, rhs=%ld\n", + unsigned_p, swapt, way, (long) val); + +#define WAY(w, c1, c2) \ + case w: \ + cmp1 = unsigned_p ? unsigned_condition (c1) : c1; \ + cmp2 = unsigned_p ? unsigned_condition (c2) : c2; \ + break; + + switch (way) + { + default: + gcc_unreachable(); + + // cmp1 gets the LT, avoid difficult branches for cmp2. + WAY (123, LT, EQ); + WAY (132, LT, NE); + + // cmp1 gets the EQ, avoid difficult branches for cmp2. + WAY (213, EQ, LT); + WAY (312, EQ, GE); + + // cmp1 gets the difficult GT, unavoidable as we may not swap way1/2. + WAY (231, GT, NE); + WAY (321, GT, EQ); + } + +#undef WAY + + return gen_int_mode (val, mode); +} + + +/* A helper for the next method. Suppose we have two conditional branches + with REG and CONST_INT operands + + if (reg <cond1> xval1) goto label1; + if (reg <cond2> xval2) goto label2; + + If the second comparison is redundant and there are codes <cmp1> + and <cmp2> such that the sequence can be performed as + + REG_CC = compare (reg, xval); + if (REG_CC <cmp1> 0) goto label1; + if (REG_CC <cmp2> 0) goto label2; + + then set COND1 to cmp1, COND2 to cmp2, SWAPT to true when the branch + targets have to be swapped, and return XVAL. Otherwise, return NULL_RTX. + This function may clobber COND1 and COND2 even when it returns NULL_RTX. + + REVERSE_COND1 can be set to reverse condition COND1. This is useful + when the second comparison does not follow the first one, but is + located after label1 like in: + + if (reg <cond1> xval1) goto label1; + ... + label1: + if (reg <cond2> xval2) goto label2; + + In such a case we cannot swap the labels, and we may end up with a + difficult branch -- though one comparison can still be optimized out. + Getting rid of such difficult branches would require to reorder blocks. */ + +static rtx +avr_redundant_compare (rtx xreg1, rtx_code &cond1, rtx xval1, + rtx xreg2, rtx_code &cond2, rtx xval2, + bool reverse_cond1, bool &swapt) +{ + // Make sure we have two REG <cond> CONST_INT comparisons with the same reg. + if (! rtx_equal_p (xreg1, xreg2) + || ! CONST_INT_P (xval1) + || ! CONST_INT_P (xval2)) + return NULL_RTX; + + if (reverse_cond1) + cond1 = reverse_condition (cond1); + + // Allow swapping label1 <-> label2 only when ! reverse_cond1. + swapt = ! reverse_cond1; + rtx_code c1 = cond1; + rtx_code c2 = cond2; + rtx xval = avr_2comparisons_rhs (c1, xval1, + c2, xval2, GET_MODE (xreg1), swapt); + if (! xval) + return NULL_RTX; + + if (dump_file) + { + rtx_code a1 = reverse_cond1 ? reverse_condition (cond1) : cond1; + rtx_code b1 = reverse_cond1 ? reverse_condition (c1) : c1; + const char *s_rev1 = reverse_cond1 ? " reverse_cond1" : ""; + avr_dump (";; cond1: %C %r%s\n", a1, xval1, s_rev1); + avr_dump (";; cond2: %C %r\n", cond2, xval2); + avr_dump (";; => %C %d\n", b1, (int) INTVAL (xval)); + avr_dump (";; => %C %d\n", c2, (int) INTVAL (xval)); + } + + cond1 = c1; + cond2 = c2; + + return xval; +} + + +/* Similar to the function above, but assume that + + if (xreg1 <cond1> xval1) goto label1; + if (xreg2 <cond2> xval2) goto label2; + + are two subsequent REG-REG comparisons. 
When this can be represented as + + REG_CC = compare (reg, xval); + if (REG_CC <cmp1> 0) goto label1; + if (REG_CC <cmp2> 0) goto label2; + + then set XREG1 to reg, COND1 and COND2 accordingly, and return xval. + Otherwise, return NULL_RTX. This optmization can be performed + when { xreg1, xval1 } and { xreg2, xval2 } are equal as sets. + It can be done in such a way that no difficult branches occur. */ + +static rtx +avr_redundant_compare_regs (rtx &xreg1, rtx_code &cond1, rtx &xval1, + rtx &xreg2, rtx_code &cond2, rtx &xval2, + bool reverse_cond1) +{ + bool swapped; + + if (! REG_P (xval1)) + return NULL_RTX; + else if (rtx_equal_p (xreg1, xreg2) + && rtx_equal_p (xval1, xval2)) + swapped = false; + else if (rtx_equal_p (xreg1, xval2) + && rtx_equal_p (xreg2, xval1)) + swapped = true; + else + return NULL_RTX; + + // Found a redundant REG-REG comparison. Assume that the incoming + // representation has been canonicalized by CANONICALIZE_COMPARISON. + // We can always represent this using only one comparison and in such + // a way that no difficult branches are required. + + if (dump_file) + { + const char *s_rev1 = reverse_cond1 ? " reverse_cond1" : ""; + avr_dump (";; %r %C %r%s\n", xreg1, cond1, xval1, s_rev1); + avr_dump (";; %r %C %r\n", xreg2, cond2, xval2); + } + + if (reverse_cond1) + cond1 = reverse_condition (cond1); + + if (swapped) + { + if (cond1 == EQ || cond1 == NE) + { + avr_dump (";; case #21\n"); + std::swap (xreg1, xval1); + } + else + { + std::swap (xreg2, xval2); + cond2 = swap_condition (cond2); + + // The swap may have introduced a difficult comparison. + // In order to get of it, only a few cases need extra care. + if ((cond1 == LT && cond2 == GT) + || (cond1 == LTU && cond2 == GTU)) + { + avr_dump (";; case #22\n"); + cond2 = NE; + } + else + avr_dump (";; case #23\n"); + } + } + else + avr_dump (";; case #20\n"); + + return xval1; +} + + +/* INSN1 and INSN2 are two cbranch insns for the same integer mode. + When FOLLOW_LABEL1 is false, then INSN2 is located in the fallthrough + path of INSN1. When FOLLOW_LABEL1 is true, then INSN2 is located at + the true edge of INSN1, INSN2 is preceded by a barrier, and no other + edge leads to the basic block of INSN2. + + Try to replace INSN1 and INSN2 by a compare insn and two branch insns. + When such a replacement has been performed, then return the insn where the + caller should continue scanning the insn stream. Else, return nullptr. */ + +static rtx_insn * +avr_optimize_2ifelse (rtx_jump_insn *insn1, + rtx_jump_insn *insn2, bool follow_label1) +{ + avr_dump (";; Investigating jump_insn %d and jump_insn %d.\n", + INSN_UID (insn1), INSN_UID (insn2)); + + // Extract the operands of the insns: + // $0 = comparison operator ($1, $2) + // $1 = reg + // $2 = reg or const_int + // $3 = code_label + // $4 = optional SCRATCH for HI, PSI, SI cases. + + const auto &op = recog_data.operand; + + extract_insn (insn1); + rtx xop1[5] = { op[0], op[1], op[2], op[3], op[4] }; + int n_operands = recog_data.n_operands; + + extract_insn (insn2); + rtx xop2[5] = { op[0], op[1], op[2], op[3], op[4] }; + + rtx_code code1 = GET_CODE (xop1[0]); + rtx_code code2 = GET_CODE (xop2[0]); + bool swap_targets = false; + + // Search redundant REG-REG comparison. + rtx xval = avr_redundant_compare_regs (xop1[1], code1, xop1[2], + xop2[1], code2, xop2[2], + follow_label1); + + // Search redundant REG-CONST_INT comparison. + if (! 
xval) + xval = avr_redundant_compare (xop1[1], code1, xop1[2], + xop2[1], code2, xop2[2], + follow_label1, swap_targets); + if (! xval) + { + avr_dump (";; Nothing found for jump_insn %d and jump_insn %d.\n", + INSN_UID (insn1), INSN_UID (insn2)); + return nullptr; + } + + if (follow_label1) + code1 = reverse_condition (code1); + + ////////////////////////////////////////////////////// + // Found a replacement. + + if (dump_file) + { + avr_dump (";; => %C %r\n", code1, xval); + avr_dump (";; => %C %r\n", code2, xval); + + fprintf (dump_file, "\n;; Found chain of jump_insn %d and" + " jump_insn %d, follow_label1=%d:\n", + INSN_UID (insn1), INSN_UID (insn2), follow_label1); + print_rtl_single (dump_file, PATTERN (insn1)); + print_rtl_single (dump_file, PATTERN (insn2)); + } + + rtx_insn *next_insn + = next_nonnote_nondebug_insn (follow_label1 ? insn1 : insn2); + + // Pop the new branch conditions and the new comparison. + // Prematurely split into compare + branch so that we can drop + // the 2nd comparison. The following pass, split2, splits all + // insns for REG_CC, and it should still work as usual even when + // there are already some REG_CC insns around. + + rtx xcond1 = gen_rtx_fmt_ee (code1, VOIDmode, cc_reg_rtx, const0_rtx); + rtx xcond2 = gen_rtx_fmt_ee (code2, VOIDmode, cc_reg_rtx, const0_rtx); + rtx xpat1 = gen_branch (xop1[3], xcond1); + rtx xpat2 = gen_branch (xop2[3], xcond2); + rtx xcompare = NULL_RTX; + machine_mode mode = GET_MODE (xop1[1]); + + if (mode == QImode) + { + gcc_assert (n_operands == 4); + xcompare = gen_cmpqi3 (xop1[1], xval); + } + else + { + gcc_assert (n_operands == 5); + rtx scratch = GET_CODE (xop1[4]) == SCRATCH ? xop2[4] : xop1[4]; + rtx (*gen_cmp)(rtx,rtx,rtx) + = mode == HImode ? gen_gen_comparehi + : mode == PSImode ? gen_gen_comparepsi + : gen_gen_comparesi; // SImode + xcompare = gen_cmp (xop1[1], xval, scratch); + } + + // Emit that stuff. + + rtx_insn *cmp = emit_insn_before (xcompare, insn1); + rtx_jump_insn *branch1 = emit_jump_insn_after (xpat1, insn1); + rtx_jump_insn *branch2 = emit_jump_insn_after (xpat2, insn2); + + JUMP_LABEL (branch1) = xop1[3]; + JUMP_LABEL (branch2) = xop2[3]; + // delete_insn() decrements LABEL_NUSES when deleting a JUMP_INSN, + // but when we pop a new JUMP_INSN, do it by hand. + ++LABEL_NUSES (xop1[3]); + ++LABEL_NUSES (xop2[3]); + + delete_insn (insn1); + delete_insn (insn2); + + if (swap_targets) + { + gcc_assert (! follow_label1); + + basic_block to1 = BLOCK_FOR_INSN (xop1[3]); + basic_block to2 = BLOCK_FOR_INSN (xop2[3]); + edge e1 = find_edge (BLOCK_FOR_INSN (branch1), to1); + edge e2 = find_edge (BLOCK_FOR_INSN (branch2), to2); + gcc_assert (e1); + gcc_assert (e2); + redirect_edge_and_branch (e1, to2); + redirect_edge_and_branch (e2, to1); + } + + // As a side effect, also recog the new insns. + gcc_assert (valid_insn_p (cmp)); + gcc_assert (valid_insn_p (branch1)); + gcc_assert (valid_insn_p (branch2)); + + return next_insn; +} + + +/* Sequences like + + SREG = compare (reg, 1 + val); + if (SREG >= 0) goto label1; + SREG = compare (reg, val); + if (SREG == 0) goto label2; + + can be optimized to + + SREG = compare (reg, val); + if (SREG == 0) goto label2; + if (SREG >= 0) goto label1; + + Almost all cases where one of the comparisons is redundant can + be transformed in such a way that only one comparison is required + and no difficult branches are needed. 
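+
+ Such a pair of comparisons may stem, for instance, from source code like
+
+ if (x >= 1) f1 ();
+ else if (x == 0) f2 ();
+
+ where the same register is compared against two close constants.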
*/ + +unsigned int +avr_pass_ifelse::execute (function *) +{ + rtx_insn *next_insn; + + for (rtx_insn *insn = get_insns(); insn; insn = next_insn) + { + next_insn = next_nonnote_nondebug_insn (insn); + + if (! next_insn) + break; + + // Search for two cbranch insns. The first one is a cbranch. + // Filter for "cbranch<mode>4_insn" with mode in QI, HI, PSI, SI. + + if (! JUMP_P (insn)) + continue; + + int icode1 = recog_memoized (insn); + + if (icode1 != CODE_FOR_cbranchqi4_insn + && icode1 != CODE_FOR_cbranchhi4_insn + && icode1 != CODE_FOR_cbranchpsi4_insn + && icode1 != CODE_FOR_cbranchsi4_insn) + continue; + + rtx_jump_insn *insn1 = as_a<rtx_jump_insn *> (insn); + + // jmp[0]: We can optimize cbranches that follow cbranch insn1. + rtx_insn *jmp[2] = { next_insn, nullptr }; + + // jmp[1]: A cbranch following the label of cbranch insn1. + if (LABEL_NUSES (JUMP_LABEL (insn1)) == 1) + { + rtx_insn *code_label1 = JUMP_LABEL_AS_INSN (insn1); + rtx_insn *barrier = prev_nonnote_nondebug_insn (code_label1); + + // When the target label of insn1 is used exactly once and is + // not a fallthrough, i.e. is preceded by a barrier, then + // consider the insn following that label. + if (barrier && BARRIER_P (barrier)) + jmp[1] = next_nonnote_nondebug_insn (code_label1); + } + + // With almost certainty, only one of the two possible jumps can + // be optimized with insn1, but it's hard to tell which one a priori. + // Just try both. In the unlikely case where both could be optimized, + // prefer jmp[0] because eliminating difficult branches is impeded + // by following label1. + + for (int j = 0; j < 2; ++j) + if (jmp[j] && JUMP_P (jmp[j]) + && recog_memoized (jmp[j]) == icode1) + { + rtx_insn *next + = avr_optimize_2ifelse (insn1, as_a<rtx_jump_insn *> (jmp[j]), + j == 1 /* follow_label1 */); + if (next) + { + next_insn = next; + break; + } + } + + } // loop insns + + return 0; +} + + + +////////////////////////////////////////////////////////////////////////////// +// Optimize results of the casesi expander for modes < SImode. + +static const pass_data avr_pass_data_casesi = +{ + RTL_PASS, // type + "", // name (will be patched) + OPTGROUP_NONE, // optinfo_flags + TV_DF_SCAN, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + 0 // todo_flags_finish +}; + +class avr_pass_casesi : public rtl_opt_pass +{ +public: + avr_pass_casesi (gcc::context *ctxt, const char *name) + : rtl_opt_pass (avr_pass_data_casesi, ctxt) + { + this->name = name; + } + + bool gate (function *) final override + { + return optimize > 0; + } + + unsigned int execute (function *) final override; +}; // avr_pass_casesi + + +/* Make one parallel insn with all the patterns from insns i[0]..i[5]. */ + +static rtx_insn * +avr_parallel_insn_from_insns (rtx_insn *i[5]) +{ + rtvec vec = gen_rtvec (5, PATTERN (i[0]), PATTERN (i[1]), PATTERN (i[2]), + PATTERN (i[3]), PATTERN (i[4])); + start_sequence(); + emit (gen_rtx_PARALLEL (VOIDmode, vec)); + rtx_insn *insn = get_insns(); + end_sequence(); + + return insn; +} + + +/* Return true if we see an insn stream generated by casesi expander together + with an extension to SImode of the switch value. + + If this is the case, fill in the insns from casesi to INSNS[1..5] and + the SImode extension to INSNS[0]. Moreover, extract the operands of + pattern casesi_<mode>_sequence forged from the sequence to recog_data. 
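+
+ A typical origin of such a sequence is a switch on a QImode or HImode
+ value, say "switch (x)" with x of type uint8_t or int16_t, where the
+ switch value has been sign- or zero-extended to SImode right before
+ the casesi expansion.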
*/ + +static bool +avr_is_casesi_sequence (basic_block bb, rtx_insn *insn, rtx_insn *insns[5]) +{ + rtx set_4, set_0; + + /* A first and quick test for a casesi sequences. As a side effect of + the test, harvest respective insns to INSNS[0..4]. */ + + if (!(JUMP_P (insns[4] = insn) + // casesi is the only insn that comes up with UNSPEC_INDEX_JMP, + // hence the following test ensures that we are actually dealing + // with code from casesi. + && (set_4 = single_set (insns[4])) + && UNSPEC == GET_CODE (SET_SRC (set_4)) + && UNSPEC_INDEX_JMP == XINT (SET_SRC (set_4), 1) + + && (insns[3] = prev_real_insn (insns[4])) + && (insns[2] = prev_real_insn (insns[3])) + && (insns[1] = prev_real_insn (insns[2])) + + // Insn prior to casesi. + && (insns[0] = prev_real_insn (insns[1])) + && (set_0 = single_set (insns[0])) + && extend_operator (SET_SRC (set_0), SImode))) + { + return false; + } + + if (dump_file) + { + fprintf (dump_file, ";; Sequence from casesi in " + "[bb %d]:\n\n", bb->index); + for (int i = 0; i < 5; i++) + print_rtl_single (dump_file, insns[i]); + } + + /* We have to deal with quite some operands. Extracting them by hand + would be tedious, therefore wrap the insn patterns into a parallel, + run recog against it and then use insn extract to get the operands. */ + + rtx_insn *xinsn = avr_parallel_insn_from_insns (insns); + + INSN_CODE (xinsn) = recog (PATTERN (xinsn), xinsn, NULL /* num_clobbers */); + + /* Failing to recognize means that someone changed the casesi expander or + that some passes prior to this one performed some unexpected changes. + Gracefully drop such situations instead of aborting. */ + + if (INSN_CODE (xinsn) < 0) + { + if (dump_file) + fprintf (dump_file, ";; Sequence not recognized, giving up.\n\n"); + + return false; + } + + gcc_assert (CODE_FOR_casesi_qi_sequence == INSN_CODE (xinsn) + || CODE_FOR_casesi_hi_sequence == INSN_CODE (xinsn)); + + extract_insn (xinsn); + + // Assert on the anatomy of xinsn's operands we are going to work with. + + gcc_assert (recog_data.n_operands == 11); + gcc_assert (recog_data.n_dups == 4); + + if (dump_file) + { + fprintf (dump_file, ";; Operands extracted:\n"); + for (int i = 0; i < recog_data.n_operands; i++) + avr_fdump (dump_file, ";; $%d = %r\n", i, recog_data.operand[i]); + fprintf (dump_file, "\n"); + } + + return true; +} + + +/* INSNS[1..4] is a sequence as generated by casesi and INSNS[0] is an + extension of an 8-bit or 16-bit integer to SImode. XOP contains the + operands of INSNS as extracted by insn_extract from pattern + casesi_<mode>_sequence: + + $0: SImode reg switch value as result of $9. + $1: Negative of smallest index in switch. + $2: Number of entries in switch. + $3: Label to table. + $4: Label if out-of-bounds. + $5: $0 + $1. + $6: 3-byte PC: subreg:HI ($5) + label_ref ($3) + 2-byte PC: subreg:HI ($5) + $7: HI reg index into table (Z or pseudo) + $8: R24 or const0_rtx (to be clobbered) + $9: Extension to SImode of an 8-bit or 16-bit integer register $10. + $10: QImode or HImode register input of $9. + + Try to optimize this sequence, i.e. use the original HImode / QImode + switch value instead of SImode. */ + +static void +avr_optimize_casesi (rtx_insn *insns[5], rtx *xop) +{ + // Original mode of the switch value; this is QImode or HImode. + machine_mode mode = GET_MODE (xop[10]); + + // How the original switch value was extended to SImode; this is + // SIGN_EXTEND or ZERO_EXTEND. + rtx_code code = GET_CODE (xop[9]); + + // Lower index, upper index (plus one) and range of case calues. 
+ HOST_WIDE_INT low_idx = -INTVAL (xop[1]); + HOST_WIDE_INT num_idx = INTVAL (xop[2]); + HOST_WIDE_INT hig_idx = low_idx + num_idx; + + // Maximum ranges of (un)signed QImode resp. HImode. + unsigned umax = QImode == mode ? 0xff : 0xffff; + int imax = QImode == mode ? 0x7f : 0x7fff; + int imin = -imax - 1; + + // Testing the case range and whether it fits into the range of the + // (un)signed mode. This test should actually always pass because it + // makes no sense to have case values outside the mode range. Notice + // that case labels which are unreachable because they are outside the + // mode of the switch value (e.g. "case -1" for uint8_t) have already + // been thrown away by the middle-end. + + if (SIGN_EXTEND == code + && low_idx >= imin + && hig_idx <= imax) + { + // ok + } + else if (ZERO_EXTEND == code + && low_idx >= 0 + && (unsigned) hig_idx <= umax) + { + // ok + } + else + { + if (dump_file) + fprintf (dump_file, ";; Case ranges too big, giving up.\n\n"); + return; + } + + // Do normalization of switch value $10 and out-of-bound check in its + // original mode instead of in SImode. Use a newly created pseudo. + // This will replace insns[1..2]. + + start_sequence(); + + rtx reg = copy_to_mode_reg (mode, xop[10]); + + rtx (*gen_add)(rtx,rtx,rtx) = QImode == mode ? gen_addqi3 : gen_addhi3; + rtx (*gen_cbranch)(rtx,rtx,rtx,rtx) + = QImode == mode ? gen_cbranchqi4 : gen_cbranchhi4; + + emit_insn (gen_add (reg, reg, gen_int_mode (-low_idx, mode))); + rtx op0 = reg; rtx op1 = gen_int_mode (num_idx, mode); + rtx labelref = copy_rtx (xop[4]); + rtx xbranch = gen_cbranch (gen_rtx_fmt_ee (GTU, VOIDmode, op0, op1), + op0, op1, labelref); + rtx_insn *cbranch = emit_jump_insn (xbranch); + JUMP_LABEL (cbranch) = xop[4]; + ++LABEL_NUSES (xop[4]); + + rtx_insn *seq1 = get_insns(); + rtx_insn *last1 = get_last_insn(); + end_sequence(); + + emit_insn_after (seq1, insns[2]); + + // After the out-of-bounds test and corresponding branch, use a + // 16-bit index. If QImode is used, extend it to HImode first. + // This will replace insns[4]. + + start_sequence(); + + if (QImode == mode) + reg = force_reg (HImode, gen_rtx_fmt_e (code, HImode, reg)); + + rtx pat_4 = AVR_3_BYTE_PC + ? gen_movhi (xop[7], reg) + : gen_addhi3 (xop[7], reg, gen_rtx_LABEL_REF (VOIDmode, xop[3])); + + emit_insn (pat_4); + + rtx_insn *seq2 = get_insns(); + rtx_insn *last2 = get_last_insn(); + end_sequence(); + + emit_insn_after (seq2, insns[3]); + + if (dump_file) + { + fprintf (dump_file, ";; New insns: "); + + for (rtx_insn *insn = seq1; ; insn = NEXT_INSN (insn)) + { + fprintf (dump_file, "%d, ", INSN_UID (insn)); + if (insn == last1) + break; + } + for (rtx_insn *insn = seq2; ; insn = NEXT_INSN (insn)) + { + fprintf (dump_file, "%d%s", INSN_UID (insn), + insn == last2 ? ".\n\n" : ", "); + if (insn == last2) + break; + } + + fprintf (dump_file, ";; Deleting insns: %d, %d, %d.\n\n", + INSN_UID (insns[1]), INSN_UID (insns[2]), INSN_UID (insns[3])); + } + + // Pseudodelete the SImode and subreg of SImode insns. We don't care + // about the extension insns[0]: Its result is now unused and other + // passes will clean it up. 
+ + SET_INSN_DELETED (insns[1]); + SET_INSN_DELETED (insns[2]); + SET_INSN_DELETED (insns[3]); +} + + +unsigned int +avr_pass_casesi::execute (function *func) +{ + basic_block bb; + + FOR_EACH_BB_FN (bb, func) + { + rtx_insn *insn, *insns[5]; + + FOR_BB_INSNS (bb, insn) + { + if (avr_is_casesi_sequence (bb, insn, insns)) + { + avr_optimize_casesi (insns, recog_data.operand); + } + } + } + + return 0; +} + +} // anonymous namespace + +/* Perform some extra checks on operands of casesi_<mode>_sequence. + Not all operand dependencies can be described by means of predicates. + This function performs left over checks and should always return true. + Returning false means that someone changed the casesi expander but did + not adjust casesi_<mode>_sequence. */ + +bool +avr_casei_sequence_check_operands (rtx *xop) +{ + rtx sub_5 = NULL_RTX; + + if (AVR_HAVE_EIJMP_EICALL + // The last clobber op of the tablejump. + && xop[8] == all_regs_rtx[REG_24]) + { + // $6 is: (subreg:SI ($5) 0) + sub_5 = xop[6]; + } + + if (!AVR_HAVE_EIJMP_EICALL + // $6 is: (plus:HI (subreg:SI ($5) 0) + // (label_ref ($3))) + && PLUS == GET_CODE (xop[6]) + && LABEL_REF == GET_CODE (XEXP (xop[6], 1)) + && rtx_equal_p (xop[3], XEXP (XEXP (xop[6], 1), 0)) + // The last clobber op of the tablejump. + && xop[8] == const0_rtx) + { + sub_5 = XEXP (xop[6], 0); + } + + if (sub_5 + && SUBREG_P (sub_5) + && SUBREG_BYTE (sub_5) == 0 + && rtx_equal_p (xop[5], SUBREG_REG (sub_5))) + return true; + + if (dump_file) + fprintf (dump_file, "\n;; Failed condition for casesi_<mode>_sequence\n\n"); + + return false; +} + +namespace +{ + + +////////////////////////////////////////////////////////////////////////////// +// Find more POST_INC and PRE_DEC cases. + +static const pass_data avr_pass_data_fuse_add = +{ + RTL_PASS, // type + "", // name (will be patched) + OPTGROUP_NONE, // optinfo_flags + TV_MACH_DEP, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + TODO_df_finish // todo_flags_finish +}; + +class avr_pass_fuse_add : public rtl_opt_pass +{ +public: + avr_pass_fuse_add (gcc::context *ctxt, const char *name) + : rtl_opt_pass (avr_pass_data_fuse_add, ctxt) + { + this->name = name; + } + + // Cloning is required because we are running one instance of the pass + // before peephole2. and a second one after cprop_hardreg. + opt_pass * clone () final override + { + return make_avr_pass_fuse_add (m_ctxt); + } + + bool gate (function *) final override + { + return optimize && avr_fuse_add > 0; + } + + unsigned int execute (function *) final override; + + struct Some_Insn + { + rtx_insn *insn = nullptr; + rtx dest, src; + bool valid () const { return insn != nullptr; } + void set_deleted () + { + gcc_assert (insn); + SET_INSN_DELETED (insn); + insn = nullptr; + } + }; + + // If .insn is not NULL, then this is a reg:HI += const_int + // of an address register. + struct Add_Insn : Some_Insn + { + rtx addend; + int regno; + Add_Insn () {} + Add_Insn (rtx_insn *insn); + }; + + // If .insn is not NULL, then this sets an address register + // to a constant value. + struct Ldi_Insn : Some_Insn + { + int regno; + Ldi_Insn () {} + Ldi_Insn (rtx_insn *insn); + }; + + // If .insn is not NULL, then this is a load or store insn where the + // address is REG or POST_INC with an address register. 
+ struct Mem_Insn : Some_Insn + { + rtx reg_or_0, mem, addr, addr_reg; + int addr_regno; + rtx_code addr_code; + machine_mode mode; + addr_space_t addr_space; + bool store_p, volatile_p; + Mem_Insn () {} + Mem_Insn (rtx_insn *insn); + }; + + rtx_insn *fuse_ldi_add (Ldi_Insn &prev_ldi, Add_Insn &add); + rtx_insn *fuse_add_add (Add_Insn &prev_add, Add_Insn &add); + rtx_insn *fuse_add_mem (Add_Insn &prev_add, Mem_Insn &mem); + rtx_insn *fuse_mem_add (Mem_Insn &prev_mem, Add_Insn &add); +}; // avr_pass_fuse_add + + +/* Describe properties of AVR's indirect load and store instructions + LD, LDD, ST, STD, LPM, ELPM depending on register number, volatility etc. + Rules for "volatile" accesses are: + + | Xmega | non-Xmega + ------+-----------------+---------------- + load | read LSB first | read LSB first + store | write LSB first | write MSB first +*/ + +struct AVR_LdSt_Props +{ + bool has_postinc, has_predec, has_ldd; + // The insn printers will use POST_INC or PRE_DEC addressing, no matter + // what adressing modes we are feeding into them. + bool want_postinc, want_predec; + + AVR_LdSt_Props (int regno, bool store_p, bool volatile_p, addr_space_t as) + { + bool generic_p = ADDR_SPACE_GENERIC_P (as); + bool flashx_p = ! generic_p && as != ADDR_SPACE_MEMX; + has_postinc = generic_p || (flashx_p && regno == REG_Z); + has_predec = generic_p; + has_ldd = ! AVR_TINY && generic_p && (regno == REG_Y || regno == REG_Z); + want_predec = volatile_p && generic_p && ! AVR_XMEGA && store_p; + want_postinc = volatile_p && generic_p && (AVR_XMEGA || ! store_p); + want_postinc |= flashx_p && regno == REG_Z; + } + + AVR_LdSt_Props (const avr_pass_fuse_add::Mem_Insn &m) + : AVR_LdSt_Props (m.addr_regno, m.store_p, m.volatile_p, m.addr_space) + { + gcc_assert (m.valid ()); + } +}; + + +/* Emit a single_set that clobbers REG_CC. */ + +static rtx_insn * +emit_move_ccc (rtx dest, rtx src) +{ + return emit_insn (gen_gen_move_clobbercc (dest, src)); +} + + +/* Emit a single_set that clobbers REG_CC after insn AFTER. */ + +static rtx_insn * +emit_move_ccc_after (rtx dest, rtx src, rtx_insn *after) +{ + return emit_insn_after (gen_gen_move_clobbercc (dest, src), after); +} + +static bool +reg_seen_between_p (const_rtx reg, const rtx_insn *from, const rtx_insn *to) +{ + return (reg_used_between_p (reg, from, to) + || reg_set_between_p (reg, from, to)); +} + + +static void +avr_maybe_adjust_cfa (rtx_insn *insn, rtx reg, int addend) +{ + if (addend + && frame_pointer_needed + && REGNO (reg) == FRAME_POINTER_REGNUM + && avr_fuse_add == 3) + { + rtx plus = plus_constant (Pmode, reg, addend); + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (reg, plus)); + } +} + + +// If successful, this represents a SET of a pointer register to a constant. +avr_pass_fuse_add::Ldi_Insn::Ldi_Insn (rtx_insn *insn) +{ + rtx set = single_set (insn); + if (!set) + return; + + src = SET_SRC (set); + dest = SET_DEST (set); + + if (REG_P (dest) + && GET_MODE (dest) == Pmode + && IN_RANGE (regno = REGNO (dest), REG_X, REG_Z) + && CONSTANT_P (src)) + { + this->insn = insn; + } +} + +// If successful, this represents a PLUS with CONST_INT of a pointer +// register X, Y or Z. Otherwise, the object is not valid(). +avr_pass_fuse_add::Add_Insn::Add_Insn (rtx_insn *insn) +{ + rtx set = single_set (insn); + if (!set) + return; + + src = SET_SRC (set); + dest = SET_DEST (set); + if (REG_P (dest) + // We are only interested in PLUSes that change address regs. 
+ && GET_MODE (dest) == Pmode + && IN_RANGE (regno = REGNO (dest), REG_X, REG_Z) + && PLUS == GET_CODE (src) + && rtx_equal_p (XEXP (src, 0), dest) + && CONST_INT_P (XEXP (src, 1))) + { + // This is reg:HI += const_int. + addend = XEXP (src, 1); + this->insn = insn; + } +} + +// If successful, this represents a load or store insn where the addressing +// mode uses pointer register X, Y or Z. Otherwise, the object is not valid(). +avr_pass_fuse_add::avr_pass_fuse_add::Mem_Insn::Mem_Insn (rtx_insn *insn) +{ + rtx set = single_set (insn); + if (!set) + return; + + src = SET_SRC (set); + dest = SET_DEST (set); + mode = GET_MODE (dest); + + if (MEM_P (dest) + && (REG_P (src) || src == CONST0_RTX (mode))) + { + reg_or_0 = src; + mem = dest; + } + else if (REG_P (dest) && MEM_P (src)) + { + reg_or_0 = dest; + mem = src; + } + else + return; + + if (avr_mem_memx_p (mem) + || avr_load_libgcc_p (mem)) + return; + + addr = XEXP (mem, 0); + addr_code = GET_CODE (addr); + + if (addr_code == REG) + addr_reg = addr; + else if (addr_code == POST_INC || addr_code == PRE_DEC) + addr_reg = XEXP (addr, 0); + else + return; + + addr_regno = REGNO (addr_reg); + + if (avr_fuse_add == 2 + && frame_pointer_needed + && addr_regno == FRAME_POINTER_REGNUM) + MEM_VOLATILE_P (mem) = 0; + + if (reg_overlap_mentioned_p (reg_or_0, addr) // Can handle CONSTANT_P. + || addr_regno > REG_Z + || avr_mem_memx_p (mem) + // The following optimizations only handle REG and POST_INC, + // so that's all what we allow here. + || (addr_code != REG && addr_code != POST_INC)) + return; + + addr_space = MEM_ADDR_SPACE (mem); + volatile_p = MEM_VOLATILE_P (mem); + store_p = MEM_P (dest); + + // Turn this "valid". + this->insn = insn; +} + +/* Try to combine a Ldi insn with a PLUS CONST_INT addend to one Ldi insn. + If LDI is valid, then it precedes ADD in the same block. + When a replacement is found, a new insn is emitted and the old insns + are pseudo-deleted. The returned insn is the point where the calling + scanner should continue. When no replacement is found, nullptr is + returned and nothing changed. */ + +rtx_insn * +avr_pass_fuse_add::fuse_ldi_add (Ldi_Insn &ldi, Add_Insn &add) +{ + if (! ldi.valid () + || reg_seen_between_p (ldi.dest, ldi.insn, add.insn)) + { + // If something is between the Ldi and the current insn, we can + // set the Ldi invalid to speed future scans. + return ldi.insn = nullptr; + } + + // Found a Ldi with const and a PLUS insns in the same BB, + // and with no interfering insns between them. + + // Emit new Ldi with the sum of the original offsets after the old Ldi. + rtx xval = plus_constant (Pmode, ldi.src, INTVAL (add.addend)); + + rtx_insn *insn = emit_move_ccc_after (ldi.dest, xval, ldi.insn); + avr_dump (";; new Ldi[%d] insn %d after %d: R%d = %r\n\n", ldi.regno, + INSN_UID (insn), INSN_UID (ldi.insn), ldi.regno, xval); + + rtx_insn *next = NEXT_INSN (add.insn); + ldi.set_deleted (); + add.set_deleted (); + + return next; +} + +/* Try to combine two PLUS insns with CONST_INT addend to one such insn. + If PREV_ADD is valid, then it precedes ADD in the same basic block. + When a replacement is found, a new insn is emitted and the old insns + are pseudo-deleted. The returned insn is the point where the calling + scanner should continue. When no replacement is found, nullptr is + returned and nothing changed. */ + +rtx_insn * +avr_pass_fuse_add::fuse_add_add (Add_Insn &prev_add, Add_Insn &add) +{ + if (! 
prev_add.valid () + || reg_seen_between_p (add.dest, prev_add.insn, add.insn)) + { + // If something is between the previous Add and the current insn, + // we can set the previous Add invalid to speed future scans. + return prev_add.insn = nullptr; + } + + // Found two PLUS insns in the same BB, and with no interfering + // insns between them. + rtx plus = plus_constant (Pmode, add.src, INTVAL (prev_add.addend)); + + rtx_insn *next; + if (REG_P (plus)) + { + avr_dump (";; Add[%d] from %d annihilates %d\n\n", add.regno, + INSN_UID (prev_add.insn), INSN_UID (add.insn)); + next = NEXT_INSN (add.insn); + } + else + { + // Emit after the current insn, so that it will be picked + // up as next valid Add insn. + next = emit_move_ccc_after (add.dest, plus, add.insn); + avr_dump (";; #1 new Add[%d] insn %d after %d: R%d += %d\n\n", + add.regno, INSN_UID (next), INSN_UID (add.insn), + add.regno, (int) INTVAL (XEXP (plus, 1))); + gcc_assert (GET_CODE (plus) == PLUS); + } + + add.set_deleted (); + prev_add.set_deleted (); + + return next; +} + +/* Try to combine a PLUS of the address register with a load or store insn. + If ADD is valid, then it precedes MEM in the same basic block. + When a replacement is found, a new insn is emitted and the old insns + are pseudo-deleted. The returned insn is the point where the calling + scanner should continue. When no replacement is found, nullptr is + returned and nothing changed. */ + +rtx_insn * +avr_pass_fuse_add::fuse_add_mem (Add_Insn &add, Mem_Insn &mem) +{ + if (! add.valid () + || reg_seen_between_p (add.dest, add.insn, mem.insn)) + { + // If something is between the Add and the current insn, we can + // set the Add invalid to speed future scans. + return add.insn = nullptr; + } + + AVR_LdSt_Props ap { mem }; + + int msize = GET_MODE_SIZE (mem.mode); + + // The mem insn really wants PRE_DEC. + bool case1 = ((mem.addr_code == REG || mem.addr_code == POST_INC) + && msize > 1 && ap.want_predec && ! ap.has_ldd); + + // The offset can be consumed by a PRE_DEC. + bool case2 = (- INTVAL (add.addend) == msize + && (mem.addr_code == REG || mem.addr_code == POST_INC) + && ap.has_predec && ! ap.want_postinc); + + if (! case1 && ! case2) + return nullptr; + + // Change from REG or POST_INC to PRE_DEC. + rtx xmem = change_address (mem.mem, mem.mode, + gen_rtx_PRE_DEC (Pmode, mem.addr_reg)); + rtx dest = mem.store_p ? xmem : mem.reg_or_0; + rtx src = mem.store_p ? mem.reg_or_0 : xmem; + + rtx_insn *next = emit_move_ccc_after (dest, src, mem.insn); + add_reg_note (next, REG_INC, mem.addr_reg); + avr_dump (";; new Mem[%d] insn %d after %d: %r = %r\n\n", mem.addr_regno, + INSN_UID (next), INSN_UID (mem.insn), dest, src); + + // Changing REG or POST_INC -> PRE_DEC means that the addend before + // the memory access must be increased by the size of the access, + rtx plus = plus_constant (Pmode, add.src, msize); + if (! REG_P (plus)) + { + rtx_insn *insn = emit_move_ccc_after (add.dest, plus, add.insn); + avr_dump (";; #2 new Add[%d] insn %d after %d: R%d += %d\n\n", + add.regno, INSN_UID (insn), INSN_UID (add.insn), + add.regno, (int) INTVAL (XEXP (plus, 1))); + gcc_assert (GET_CODE (plus) == PLUS); + } + else + avr_dump (";; Add[%d] insn %d consumed into %d\n\n", + add.regno, INSN_UID (add.insn), INSN_UID (next)); + + // Changing POST_INC -> PRE_DEC means that the addend after the mem has to be + // the size of the access. The hope is that this new add insn may be unused. 
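+ // E.g. for an HImode access, "X += -2; ... (mem:HI (post_inc X))" becomes
+ // "... (mem:HI (pre_dec X)); X += 2", where the leading addition has been
+ // consumed and the trailing one is emitted below in the hope that it
+ // turns out to be unused.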
+ if (mem.addr_code == POST_INC) + { + plus = plus_constant (Pmode, add.dest, msize); + rtx_insn *next2 = emit_move_ccc_after (add.dest, plus, next); + avr_dump (";; #3 new Add[%d] insn %d after %d: R%d += %d\n\n", add.regno, + INSN_UID (next2), INSN_UID (next), add.regno, msize); + next = next2; + } + + add.set_deleted (); + mem.set_deleted (); + + return next; +} + +/* Try to combine a load or store insn with a PLUS of the address register. + If MEM is valid, then it precedes ADD in the same basic block. + When a replacement is found, a new insn is emitted and the old insns + are pseudo-deleted. The returned insn is the point where the calling + scanner should continue. When no replacement is found, nullptr is + returned and nothing changed. */ + +rtx_insn * +avr_pass_fuse_add::fuse_mem_add (Mem_Insn &mem, Add_Insn &add) +{ + if (! mem.valid () + || reg_seen_between_p (add.dest, mem.insn, add.insn)) + { + // If something is between the Mem and the current insn, we can + // set the Mem invalid to speed future scans. + return mem.insn = nullptr; + } + + AVR_LdSt_Props ap { mem }; + + int msize = GET_MODE_SIZE (mem.mode); + + // The add insn can be consumed by a POST_INC. + bool case1 = (mem.addr_code == REG + && INTVAL (add.addend) == msize + && ap.has_postinc && ! ap.want_predec); + + // There are cases where even a partial consumption of the offset is better. + // This are the cases where no LD+offset addressing is available, because + // the address register is obviously used after the mem insn, and a mem insn + // with REG addressing mode will have to restore the address. + bool case2 = (mem.addr_code == REG + && msize > 1 && ap.want_postinc && ! ap.has_ldd); + + if (! case1 && ! case2) + return nullptr; + + // Change addressing mode from REG to POST_INC. + rtx xmem = change_address (mem.mem, mem.mode, + gen_rtx_POST_INC (Pmode, mem.addr_reg)); + rtx dest = mem.store_p ? xmem : mem.reg_or_0; + rtx src = mem.store_p ? mem.reg_or_0 : xmem; + + rtx_insn *insn = emit_move_ccc_after (dest, src, mem.insn); + add_reg_note (insn, REG_INC, mem.addr_reg); + avr_dump (";; new Mem[%d] insn %d after %d: %r = %r\n\n", add.regno, + INSN_UID (insn), INSN_UID (mem.insn), dest, src); + + rtx_insn *next = NEXT_INSN (add.insn); + + // Changing REG -> POST_INC means that the post addend must be + // decreased by the size of the access. + rtx plus = plus_constant (Pmode, add.src, -msize); + if (! REG_P (plus)) + { + next = emit_move_ccc_after (mem.addr_reg, plus, add.insn); + avr_dump (";; #4 new Add[%d] insn %d after %d: R%d += %d\n\n", + add.regno, INSN_UID (next), INSN_UID (add.insn), + add.regno, (int) INTVAL (XEXP (plus, 1))); + gcc_assert (GET_CODE (plus) == PLUS); + } + else + avr_dump (";; Add[%d] insn %d consumed into %d\n\n", + add.regno, INSN_UID (add.insn), INSN_UID (insn)); + + add.set_deleted (); + mem.set_deleted (); + + return next; +} + +/* Try to post-reload combine PLUS with CONST_INt of pointer registers with: + - Sets to a constant address. + - PLUS insn of that kind. + - Indirect loads and stores. + In almost all cases, combine opportunities arise from the preparation + done by `avr_split_fake_addressing_move', but in some rare cases combinations + are found for the ordinary cores, too. + As we consider at most one Mem insn per try, there may still be missed + optimizations like POST_INC + PLUS + POST_INC might be performed + as PRE_DEC + PRE_DEC for two adjacent locations. 
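+
+ For illustration, on Reduced Tiny an HImode load from Z+10 is split by
+ avr_split_fake_addressing_move into "Z += 10; ... (mem:HI (post_inc Z));
+ Z += -12"; the trailing addition can then cancel against the leading
+ addition of the next split access, which is what fuse_add_add achieves.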
*/ + +unsigned int +avr_pass_fuse_add::execute (function *func) +{ + df_note_add_problem (); + df_analyze (); + + int n_add = 0, n_mem = 0, n_ldi = 0; + basic_block bb; + + FOR_EACH_BB_FN (bb, func) + { + Ldi_Insn prev_ldi_insns[REG_32]; + Add_Insn prev_add_insns[REG_32]; + Mem_Insn prev_mem_insns[REG_32]; + rtx_insn *insn, *curr; + + avr_dump ("\n;; basic block %d\n\n", bb->index); + + FOR_BB_INSNS_SAFE (bb, insn, curr) + { + rtx_insn *next = nullptr; + Ldi_Insn ldi_insn { insn }; + Add_Insn add_insn { insn }; + Mem_Insn mem_insn { insn }; + + if (add_insn.valid ()) + { + // Found reg:HI += const_int + avr_dump (";; insn %d: Add[%d]: R%d += %d\n\n", + INSN_UID (add_insn.insn), add_insn.regno, + add_insn.regno, (int) INTVAL (add_insn.addend)); + Ldi_Insn &prev_ldi_insn = prev_ldi_insns[add_insn.regno]; + Add_Insn &prev_add_insn = prev_add_insns[add_insn.regno]; + Mem_Insn &prev_mem_insn = prev_mem_insns[add_insn.regno]; + if ((next = fuse_ldi_add (prev_ldi_insn, add_insn))) + curr = next, n_ldi += 1; + else if ((next = fuse_add_add (prev_add_insn, add_insn))) + curr = next, n_add += 1; + else if ((next = fuse_mem_add (prev_mem_insn, add_insn))) + curr = next, n_mem += 1; + else + prev_add_insn = add_insn; + } + else if (mem_insn.valid ()) + { + int addr_regno = REGNO (mem_insn.addr_reg); + avr_dump (";; insn %d: Mem[%d]: %r = %r\n\n", + INSN_UID (mem_insn.insn), addr_regno, + mem_insn.dest, mem_insn.src); + Add_Insn &prev_add_insn = prev_add_insns[addr_regno]; + if ((next = fuse_add_mem (prev_add_insn, mem_insn))) + curr = next, n_mem += 1; + else + prev_mem_insns[addr_regno] = mem_insn; + } + else if (ldi_insn.valid ()) + { + if (! CONST_INT_P (ldi_insn.src)) + avr_dump (";; insn %d: Ldi[%d]: R%d = %r\n\n", + INSN_UID (ldi_insn.insn), ldi_insn.regno, + ldi_insn.regno, ldi_insn.src); + prev_ldi_insns[ldi_insn.regno] = ldi_insn; + } + } // for insns + } // for BBs + + avr_dump (";; Function %f: Found %d changes: %d ldi, %d add, %d mem.\n", + n_ldi + n_add + n_mem, n_ldi, n_add, n_mem); + + return 0; +} + + + +////////////////////////////////////////////////////////////////////////////// +// Determine whether an ISR may use the __gcc_isr pseudo-instruction. + +static const pass_data avr_pass_data_pre_proep = +{ + RTL_PASS, // type + "", // name (will be patched) + OPTGROUP_NONE, // optinfo_flags + TV_DF_SCAN, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + 0 // todo_flags_finish +}; + +class avr_pass_pre_proep : public rtl_opt_pass +{ +public: + avr_pass_pre_proep (gcc::context *ctxt, const char *name) + : rtl_opt_pass (avr_pass_data_pre_proep, ctxt) + { + this->name = name; + } + + void compute_maybe_gasisr (function *); + + unsigned int execute (function *fun) final override + { + if (avr_gasisr_prologues + // Whether this function is an ISR worth scanning at all. + && !fun->machine->is_no_gccisr + && (fun->machine->is_interrupt + || fun->machine->is_signal) + && !cfun->machine->is_naked + // Paranoia: Non-local gotos and labels that might escape. + && !cfun->calls_setjmp + && !cfun->has_nonlocal_label + && !cfun->has_forced_label_in_static) + { + compute_maybe_gasisr (fun); + } + + return 0; + } + +}; // avr_pass_pre_proep + + +/* Set fun->machine->gasisr.maybe provided we don't find anything that + prohibits GAS generating parts of ISR prologues / epilogues for us. */ + +void +avr_pass_pre_proep::compute_maybe_gasisr (function *fun) +{ + // Don't use BB iterators so that we see JUMP_TABLE_DATA. 
+ + for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn)) + { + // Transparent calls always use [R]CALL and are filtered out by GAS. + // ISRs don't use -mcall-prologues, hence what remains to be filtered + // out are open coded (tail) calls. + + if (CALL_P (insn)) + return; + + // __tablejump2__ clobbers something and is targeted by JMP so + // that GAS won't see its usage. + + if (AVR_HAVE_JMP_CALL + && JUMP_TABLE_DATA_P (insn)) + return; + + // Non-local gotos not seen in *FUN. + + if (JUMP_P (insn) + && find_reg_note (insn, REG_NON_LOCAL_GOTO, NULL_RTX)) + return; + } + + fun->machine->gasisr.maybe = 1; +} + + + +////////////////////////////////////////////////////////////////////////////// +// Late recomputation of notes so we can use `reg_unused_after()' and friends. + +static const pass_data avr_pass_data_recompute_notes = +{ + RTL_PASS, // type + "", // name (will be patched) + OPTGROUP_NONE, // optinfo_flags + TV_DF_SCAN, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + TODO_df_finish | TODO_df_verify // todo_flags_finish +}; + +class avr_pass_recompute_notes : public rtl_opt_pass +{ +public: + avr_pass_recompute_notes (gcc::context *ctxt, const char *name) + : rtl_opt_pass (avr_pass_data_recompute_notes, ctxt) + { + this->name = name; + } + + unsigned int execute (function *) final override + { + df_note_add_problem (); + df_analyze (); + + return 0; + } +}; // avr_pass_recompute_notes + +} // anonymous namespace + + + +////////////////////////////////////////////////////////////////////////////// +// Function visible and used outside this module. + +/* During reload, we allow much more addresses than Reduced Tiny actually + supports. Split them after reload in order to get closer to the + core's capabilities. This sets the stage for pass .avr-fuse-add. */ + +bool +avr_split_fake_addressing_move (rtx_insn * /*insn*/, rtx *xop) +{ + bool store_p = false; + rtx mem, reg_or_0; + + if (REG_P (xop[0]) && MEM_P (xop[1])) + { + reg_or_0 = xop[0]; + mem = xop[1]; + } + else if (MEM_P (xop[0]) + && (REG_P (xop[1]) + || xop[1] == CONST0_RTX (GET_MODE (xop[0])))) + { + mem = xop[0]; + reg_or_0 = xop[1]; + store_p = true; + } + else + return false; + + machine_mode mode = GET_MODE (mem); + rtx base, addr = XEXP (mem, 0); + rtx_code addr_code = GET_CODE (addr); + + if (REG_P (reg_or_0) + && reg_overlap_mentioned_p (reg_or_0, addr)) + return false; + else if (addr_code == PLUS || addr_code == PRE_DEC || addr_code == POST_INC) + base = XEXP (addr, 0); + else if (addr_code == REG) + base = addr; + else + return false; + + if (REGNO (base) > REG_Z) + return false; + + if (! AVR_TINY + // Only keep base registers that can't do PLUS addressing. + && ((REGNO (base) != REG_X + && ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (mem))) + || avr_load_libgcc_p (mem) + || avr_mem_memx_p (mem))) + return false; + + bool volatile_p = MEM_VOLATILE_P (mem); + bool mem_volatile_p = false; + if (frame_pointer_needed + && REGNO (base) == FRAME_POINTER_REGNUM) + { + if (avr_fuse_add < 2 + // Be a projection (we always split PLUS). + || (avr_fuse_add == 2 && volatile_p && addr_code != PLUS)) + return false; + + // Changing the frame pointer locally may confuse later passes + // like .dse2 which don't track changes of FP, not even when + // respective CFA notes are present. An example is pr22141-1.c. 
+ if (avr_fuse_add == 2) + mem_volatile_p = true; + } + + rtx_code new_code = UNKNOWN; + HOST_WIDE_INT add = 0, sub = 0; + int msize = GET_MODE_SIZE (mode); + + AVR_LdSt_Props ap { (int) REGNO (base), store_p, volatile_p, + ADDR_SPACE_GENERIC }; + + switch (addr_code) + { + default: + return false; + + case PLUS: + add = INTVAL (XEXP (addr, 1)); + if (msize == 1) + { + new_code = REG; + sub = -add; + } + else if (ap.want_predec) + { + // volatile stores prefer PRE_DEC (MSB first) + sub = -add; + add += msize; + new_code = PRE_DEC; + } + else + { + new_code = POST_INC; + sub = -add - msize; + } + break; + + case POST_INC: + // volatile stores prefer PRE_DEC (MSB first) + if (msize > 1 && ap.want_predec) + { + add = msize; + new_code = PRE_DEC; + sub = msize; + break; + } + return false; + + case PRE_DEC: + // volatile loads prefer POST_INC (LSB first) + if (msize > 1 && ap.want_postinc) + { + add = -msize; + new_code = POST_INC; + sub = -msize; + break; + } + return false; + + case REG: + if (msize == 1) + return false; + + if (ap.want_predec) + { + add = msize; + new_code = PRE_DEC; + sub = 0; + } + else + { + add = 0; + new_code = POST_INC; + sub = -msize; + } + break; + } // switch addr_code + + rtx_insn *insn; + + if (add) + { + insn = emit_move_ccc (base, plus_constant (Pmode, base, add)); + avr_maybe_adjust_cfa (insn, base, add); + } + + rtx new_addr = new_code == REG + ? base + : gen_rtx_fmt_e (new_code, Pmode, base); + + rtx new_mem = change_address (mem, mode, new_addr); + if (mem_volatile_p) + MEM_VOLATILE_P (new_mem) = 1; + + insn = emit_move_ccc (store_p ? new_mem : reg_or_0, + store_p ? reg_or_0 : new_mem); + if (auto_inc_p (new_addr)) + { + add_reg_note (insn, REG_INC, base); + int off = new_code == POST_INC ? msize : -msize; + avr_maybe_adjust_cfa (insn, base, off); + } + + if (sub) + { + insn = emit_move_ccc (base, plus_constant (Pmode, base, sub)); + avr_maybe_adjust_cfa (insn, base, sub); + } + + return true; +} + + + +// Functions make_<pass-name> (gcc::context*) where <pass-name> is +// according to the pass declaration in avr-passes.def. GCC's pass +// manager uses these function to create the respective pass object. + +// Optimize results of the casesi expander for modes < SImode. + +rtl_opt_pass * +make_avr_pass_casesi (gcc::context *ctxt) +{ + return new avr_pass_casesi (ctxt, "avr-casesi"); +} + +// Try to replace 2 cbranch insns with 1 comparison and 2 branches. + +rtl_opt_pass * +make_avr_pass_ifelse (gcc::context *ctxt) +{ + return new avr_pass_ifelse (ctxt, "avr-ifelse"); +} + +// Determine whether an ISR may use the __gcc_isr pseudo-instruction. + +rtl_opt_pass * +make_avr_pass_pre_proep (gcc::context *ctxt) +{ + return new avr_pass_pre_proep (ctxt, "avr-pre-proep"); +} + +// Find more POST_INC and PRE_DEC cases. + +rtl_opt_pass * +make_avr_pass_fuse_add (gcc::context *ctxt) +{ + return new avr_pass_fuse_add (ctxt, "avr-fuse-add"); +} + +// Late recomputation of notes so we can use `reg_unused_after()' and friends. + +rtl_opt_pass * +make_avr_pass_recompute_notes (gcc::context *ctxt) +{ + return new avr_pass_recompute_notes (ctxt, "avr-notes-free-cfg"); +} diff --git a/gcc/config/avr/avr-passes.def b/gcc/config/avr/avr-passes.def index 748260e..d39bdd8 100644 --- a/gcc/config/avr/avr-passes.def +++ b/gcc/config/avr/avr-passes.def @@ -1,4 +1,4 @@ -/* Description of target passes for AVR. +/* Description of target passes for AVR 8-bit microcontrollers. Copyright (C) 2016-2024 Free Software Foundation, Inc. */ /* This file is part of GCC. 
@@ -20,12 +20,33 @@ /* A post reload optimization pass that fuses PLUS insns with CONST_INT addend with a load or store insn to get POST_INC or PRE_DEC addressing. It can also fuse two PLUSes to a single one, which may occur due to - splits from `avr_split_tiny_move'. We do this in an own pass because - it can find more cases than peephole2, for example when there are - unrelated insns between the interesting ones. */ + splits from `avr_split_fake_addressing_move'. We do this in an own + pass because it can find more cases than peephole2, for example when + there are unrelated insns between the interesting ones. */ INSERT_PASS_BEFORE (pass_peephole2, 1, avr_pass_fuse_add); +/* There are cases where avr-fuse-add doesn't find POST_INC cases because + the RTL code at that time is too long-winded, and moves registers back and + forth (which seems to be the same reason for why pass auto_inc_dec cannot + find POST_INC, either). Some of that long-windedness is cleaned up very + late in pass cprop_hardreg, which opens up new opportunities to find post + increments. An example is the following function from AVR-LibC's qsort: + + void swapfunc (char *a, char *b, int n) + { + do + { + char tmp = *a; + *a++ = *b; + *b++ = tmp; + } while (--n > 0); + } + + Hence, run avr-fuse-add twice; the second time after cprop_hardreg. */ + +INSERT_PASS_AFTER (pass_cprop_hardreg, 1, avr_pass_fuse_add); + /* An analysis pass that runs prior to prologue / epilogue generation. Computes cfun->machine->gasisr.maybe which is used in prologue and epilogue generation provided -mgas-isr-prologues is on. */ @@ -47,9 +68,9 @@ INSERT_PASS_BEFORE (pass_free_cfg, 1, avr_pass_recompute_notes); tries to fix such situations by operating on the original mode. This reduces code size and register pressure. - The assertion is that the code generated by casesi is unaltered and a + The assertion is that the code generated by casesi is unaltered and a sign-extend or zero-extend from QImode or HImode precedes the casesi - insns withaout any insns in between. */ + insns without any insns in between. */ INSERT_PASS_AFTER (pass_expand, 1, avr_pass_casesi); diff --git a/gcc/config/avr/avr-protos.h b/gcc/config/avr/avr-protos.h index 34298b9..96708eb 100644 --- a/gcc/config/avr/avr-protos.h +++ b/gcc/config/avr/avr-protos.h @@ -1,5 +1,4 @@ -/* Prototypes for exported functions defined in avr.cc - +/* Prototypes for tm_p.h for AVR 8-bit microcontrollers. Copyright (C) 2000-2024 Free Software Foundation, Inc. 
Contributed by Denis Chertykov (chertykov@gmail.com) @@ -21,7 +20,7 @@ extern int avr_function_arg_regno_p (int r); -extern void avr_cpu_cpp_builtins (struct cpp_reader * pfile); +extern void avr_cpu_cpp_builtins (cpp_reader * pfile); extern enum reg_class avr_regno_reg_class (int r); extern void asm_globalize_label (FILE *file, const char *name); extern void avr_adjust_reg_alloc_order (void); @@ -55,7 +54,7 @@ extern const char *avr_out_tsthi (rtx_insn *, rtx*, int*); extern const char *avr_out_tstpsi (rtx_insn *, rtx*, int*); extern const char *avr_out_compare (rtx_insn *, rtx*, int*); extern const char *avr_out_compare64 (rtx_insn *, rtx*, int*); -extern const char *ret_cond_branch (rtx x, int len, int reverse); +extern const char *avr_cond_branch (rtx_insn *, rtx *); extern const char *avr_out_movpsi (rtx_insn *, rtx*, int*); extern const char *avr_out_sign_extend (rtx_insn *, rtx*, int*); extern const char *avr_out_insert_notbit (rtx_insn *, rtx*, int*); @@ -63,7 +62,11 @@ extern const char *avr_out_insv (rtx_insn *, rtx*, int*); extern const char *avr_out_extr (rtx_insn *, rtx*, int*); extern const char *avr_out_extr_not (rtx_insn *, rtx*, int*); extern const char *avr_out_plus_set_ZN (rtx*, int*); -extern const char *avr_out_cmp_ext (rtx*, enum rtx_code, int*); +extern const char *avr_out_plus_set_N (rtx*, int*); +extern const char *avr_out_op8_set_ZN (rtx_code, rtx*, int*); +extern int avr_len_op8_set_ZN (rtx_code, rtx*); +extern bool avr_op8_ZN_operator (rtx); +extern const char *avr_out_cmp_ext (rtx*, rtx_code, int*); extern const char *ashlqi3_out (rtx_insn *insn, rtx operands[], int *len); extern const char *ashlhi3_out (rtx_insn *insn, rtx operands[], int *len); @@ -91,7 +94,6 @@ extern void avr_expand_epilogue (bool); extern bool avr_emit_cpymemhi (rtx*); extern void avr_emit_xior_with_shift (rtx_insn*, rtx*, int); extern int avr_epilogue_uses (int regno); -extern bool avr_split_tiny_move (rtx_insn *insn, rtx *operands); extern void avr_output_addr_vec (rtx_insn*, rtx); extern const char *avr_out_sbxx_branch (rtx_insn *insn, rtx operands[]); @@ -113,7 +115,8 @@ extern const char* output_reload_inhi (rtx*, rtx, int*); extern const char* output_reload_insisf (rtx*, rtx, int*); extern const char* avr_out_reload_inpsi (rtx*, rtx, int*); extern const char* avr_out_lpm (rtx_insn *, rtx*, int*); -extern void avr_notice_update_cc (rtx body, rtx_insn *insn); +extern const char* avr_out_cmp_lsr (rtx_insn *, rtx*, int*); +extern void avr_maybe_cmp_lsr (rtx *); extern int reg_unused_after (rtx_insn *insn, rtx reg); extern int avr_jump_mode (rtx x, rtx_insn *insn, int = 0); extern int test_hard_reg_class (enum reg_class rclass, rtx x); @@ -121,11 +124,11 @@ extern int jump_over_one_insn_p (rtx_insn *insn, rtx dest); extern void avr_final_prescan_insn (rtx_insn *insn, rtx *operand, int num_operands); -extern RTX_CODE avr_normalize_condition (RTX_CODE condition); +extern rtx_code avr_normalize_condition (rtx_code condition); extern void out_shift_with_cnt (const char *templ, rtx_insn *insn, rtx operands[], int *len, int t_len); -extern enum reg_class avr_mode_code_base_reg_class (machine_mode, addr_space_t, RTX_CODE, RTX_CODE); -extern bool avr_regno_mode_code_ok_for_base_p (int, machine_mode, addr_space_t, RTX_CODE, RTX_CODE); +extern enum reg_class avr_mode_code_base_reg_class (machine_mode, addr_space_t, rtx_code, rtx_code); +extern bool avr_regno_mode_code_ok_for_base_p (int, machine_mode, addr_space_t, rtx_code, rtx_code); extern rtx avr_incoming_return_addr_rtx (void); extern rtx 
avr_legitimize_reload_address (rtx*, machine_mode, int, int, int, int, rtx (*)(rtx,int)); extern bool avr_adiw_reg_p (rtx); @@ -134,9 +137,8 @@ extern bool avr_mem_memx_p (rtx); extern bool avr_load_libgcc_p (rtx); extern bool avr_xload_libgcc_p (machine_mode); extern rtx avr_eval_addr_attrib (rtx x); -extern bool avr_casei_sequence_check_operands (rtx *xop); -extern bool avr_float_lib_compare_returns_bool (machine_mode, enum rtx_code); +extern bool avr_float_lib_compare_returns_bool (machine_mode, rtx_code); static inline unsigned regmask (machine_mode mode, unsigned regno) @@ -154,6 +156,8 @@ extern rtx zero_reg_rtx; extern rtx all_regs_rtx[32]; extern rtx rampz_rtx; extern rtx cc_reg_rtx; +extern rtx ccn_reg_rtx; +extern rtx cczn_reg_rtx; #endif /* RTX_CODE */ @@ -163,6 +167,8 @@ extern void asm_output_float (FILE *file, REAL_VALUE_TYPE n); extern bool avr_have_dimode; +/* From avr-passes.cc */ + namespace gcc { class context; } class rtl_opt_pass; @@ -171,6 +177,10 @@ extern rtl_opt_pass *make_avr_pass_pre_proep (gcc::context *); extern rtl_opt_pass *make_avr_pass_recompute_notes (gcc::context *); extern rtl_opt_pass *make_avr_pass_casesi (gcc::context *); extern rtl_opt_pass *make_avr_pass_ifelse (gcc::context *); +#ifdef RTX_CODE +extern bool avr_casei_sequence_check_operands (rtx *xop); +extern bool avr_split_fake_addressing_move (rtx_insn *insn, rtx *operands); +#endif /* RTX_CODE */ /* From avr-log.cc */ diff --git a/gcc/config/avr/avr.cc b/gcc/config/avr/avr.cc index c520b98..f62ea8a 100644 --- a/gcc/config/avr/avr.cc +++ b/gcc/config/avr/avr.cc @@ -1,4 +1,4 @@ -/* Subroutines for insn-output.cc for ATMEL AVR micro controllers +/* Subroutines for insn-output.cc for AVR 8-bit microcontrollers Copyright (C) 1998-2024 Free Software Foundation, Inc. Contributed by Denis Chertykov (chertykov@gmail.com) @@ -8,12 +8,12 @@ it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. - + GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ @@ -50,11 +50,10 @@ #include "explow.h" #include "expr.h" #include "langhooks.h" -#include "cfgrtl.h" #include "builtins.h" -#include "context.h" #include "tree-pass.h" -#include "print-rtl.h" +#include "context.h" +#include "pass_manager.h" #include "rtl-iter.h" /* This file should be included last. */ @@ -154,17 +153,6 @@ static const char *out_movqi_mr_r (rtx_insn *, rtx[], int *); static const char *out_movhi_mr_r (rtx_insn *, rtx[], int *); static const char *out_movsi_mr_r (rtx_insn *, rtx[], int *); -static int get_sequence_length (rtx_insn *insns); -static int sequent_regs_live (void); -static const char *ptrreg_to_str (int); -static const char *cond_string (enum rtx_code); -static int avr_num_arg_regs (machine_mode, const_tree); -static int avr_operand_rtx_cost (rtx, machine_mode, enum rtx_code, - int, bool); -static void output_reload_in_const (rtx *, rtx, int *, bool); -static struct machine_function *avr_init_machine_status (void); -static bool _reg_unused_after (rtx_insn *insn, rtx reg, bool look_at_insn); - /* Prototypes for hook implementors if needed before their implementation. 
*/ @@ -196,7 +184,11 @@ rtx zero_reg_rtx; /* Condition Code register RTX (reg:CC REG_CC) */ extern GTY(()) rtx cc_reg_rtx; +extern GTY(()) rtx ccn_reg_rtx; +extern GTY(()) rtx cczn_reg_rtx; rtx cc_reg_rtx; +rtx ccn_reg_rtx; +rtx cczn_reg_rtx; /* RTXs for all general purpose registers as QImode */ extern GTY(()) rtx all_regs_rtx[REG_32]; @@ -222,7 +214,7 @@ static GTY(()) rtx xstring_e; /* Current architecture. */ const avr_arch_t *avr_arch; -enum avr_arch_id avr_arch_index; +avr_arch_id avr_arch_index; /* Unnamed sections associated to __attribute__((progmem)) aka. PROGMEM or to address space __flash* or __memx. Only used as singletons inside @@ -255,820 +247,144 @@ avr_tolower (char *lo, const char *up) } -/* Constraint helper function. XVAL is a CONST_INT or a CONST_DOUBLE. - Return true if the least significant N_BYTES bytes of XVAL all have a - popcount in POP_MASK and false, otherwise. POP_MASK represents a subset - of integers which contains an integer N iff bit N of POP_MASK is set. */ +/* Return chunk of mode MODE of X as an rtx. N specifies the subreg + byte at which the chunk starts. N must be an integral multiple + of the mode size. */ -bool -avr_popcount_each_byte (rtx xval, int n_bytes, int pop_mask) +static rtx +avr_chunk (machine_mode mode, rtx x, int n) { - machine_mode mode = GET_MODE (xval); - - if (VOIDmode == mode) - mode = SImode; - - for (int i = 0; i < n_bytes; i++) - { - rtx xval8 = simplify_gen_subreg (QImode, xval, mode, i); - unsigned int val8 = UINTVAL (xval8) & GET_MODE_MASK (QImode); - - if ((pop_mask & (1 << popcount_hwi (val8))) == 0) - return false; - } - - return true; + gcc_assert (n % GET_MODE_SIZE (mode) == 0); + machine_mode xmode = GET_MODE (x) == VOIDmode ? DImode : GET_MODE (x); + return simplify_gen_subreg (mode, x, xmode, n); } -/* Constraint helper function. XVAL is a CONST_INT. Return true if we - can perform XOR without a clobber reg, provided the operation is on - a d-register. This means each byte is in { 0, 0xff, 0x80 }. */ +/* Return the N-th byte of X as an rtx. */ -bool -avr_xor_noclobber_dconst (rtx xval, int n_bytes) +static rtx +avr_byte (rtx x, int n) { - machine_mode mode = GET_MODE (xval); - - if (VOIDmode == mode) - mode = SImode; - - for (int i = 0; i < n_bytes; ++i) - { - rtx xval8 = simplify_gen_subreg (QImode, xval, mode, i); - unsigned int val8 = UINTVAL (xval8) & GET_MODE_MASK (QImode); - - if (val8 != 0 && val8 != 0xff && val8 != 0x80) - return false; - } - - return true; + return avr_chunk (QImode, x, n); } -/* Access some RTX as INT_MODE. If X is a CONST_FIXED we can get - the bit representation of X by "casting" it to CONST_INT. */ +/* Return the sub-word of X starting at byte number N. */ -rtx -avr_to_int_mode (rtx x) +static rtx +avr_word (rtx x, int n) { - machine_mode mode = GET_MODE (x); - - return VOIDmode == mode - ? x - : simplify_gen_subreg (int_mode_for_mode (mode).require (), x, mode, 0); + return avr_chunk (HImode, x, n); } -/* Return true if hard register REG supports the ADIW and SBIW instructions. */ +/* Return the N-th byte of compile-time constant X as an int8_t. */ -bool -avr_adiw_reg_p (rtx reg) +static int8_t +avr_int8 (rtx x, int n) { - return (AVR_HAVE_ADIW - && test_hard_reg_class (ADDW_REGS, reg)); -} + gcc_assert (CONST_INT_P (x) || CONST_FIXED_P (x) || CONST_DOUBLE_P (x)); - -static bool -ra_in_progress () -{ - return avr_lra_p ? 
lra_in_progress : reload_in_progress; + return (int8_t) trunc_int_for_mode (INTVAL (avr_byte (x, n)), QImode); } +/* Return the N-th byte of compile-time constant X as an uint8_t. */ -namespace { - -static const pass_data avr_pass_data_recompute_notes = +static uint8_t +avr_uint8 (rtx x, int n) { - RTL_PASS, // type - "", // name (will be patched) - OPTGROUP_NONE, // optinfo_flags - TV_DF_SCAN, // tv_id - 0, // properties_required - 0, // properties_provided - 0, // properties_destroyed - 0, // todo_flags_start - TODO_df_finish | TODO_df_verify // todo_flags_finish -}; - - -class avr_pass_recompute_notes : public rtl_opt_pass -{ -public: - avr_pass_recompute_notes (gcc::context *ctxt, const char *name) - : rtl_opt_pass (avr_pass_data_recompute_notes, ctxt) - { - this->name = name; - } - - virtual unsigned int execute (function *) - { - df_note_add_problem (); - df_analyze (); - - return 0; - } -}; // avr_pass_recompute_notes - -static const pass_data avr_pass_data_casesi = -{ - RTL_PASS, // type - "", // name (will be patched) - OPTGROUP_NONE, // optinfo_flags - TV_DF_SCAN, // tv_id - 0, // properties_required - 0, // properties_provided - 0, // properties_destroyed - 0, // todo_flags_start - 0 // todo_flags_finish -}; - - -class avr_pass_casesi : public rtl_opt_pass -{ -public: - avr_pass_casesi (gcc::context *ctxt, const char *name) - : rtl_opt_pass (avr_pass_data_casesi, ctxt) - { - this->name = name; - } - - void avr_rest_of_handle_casesi (function *); - - virtual bool gate (function *) { return optimize > 0; } - - virtual unsigned int execute (function *func) - { - avr_rest_of_handle_casesi (func); - - return 0; - } -}; // avr_pass_casesi - - -static const pass_data avr_pass_data_ifelse = -{ - RTL_PASS, // type - "", // name (will be patched) - OPTGROUP_NONE, // optinfo_flags - TV_DF_SCAN, // tv_id - 0, // properties_required - 0, // properties_provided - 0, // properties_destroyed - 0, // todo_flags_start - TODO_df_finish | TODO_df_verify // todo_flags_finish -}; - -class avr_pass_ifelse : public rtl_opt_pass -{ -public: - avr_pass_ifelse (gcc::context *ctxt, const char *name) - : rtl_opt_pass (avr_pass_data_ifelse, ctxt) - { - this->name = name; - } - - void avr_rest_of_handle_ifelse (function *); - - virtual bool gate (function *) { return optimize > 0; } - - virtual unsigned int execute (function *func) - { - avr_rest_of_handle_ifelse (func); + return (uint8_t) avr_int8 (x, n); +} - return 0; - } -}; // avr_pass_ifelse -} // anon namespace +/* Return the sub-word of compile-time constant X that starts + at byte N as an int16_t. */ -rtl_opt_pass * -make_avr_pass_recompute_notes (gcc::context *ctxt) +static int16_t +avr_int16 (rtx x, int n) { - return new avr_pass_recompute_notes (ctxt, "avr-notes-free-cfg"); -} + gcc_assert (CONST_INT_P (x) || CONST_FIXED_P (x) || CONST_DOUBLE_P (x)); -rtl_opt_pass * -make_avr_pass_casesi (gcc::context *ctxt) -{ - return new avr_pass_casesi (ctxt, "avr-casesi"); + return (int16_t) trunc_int_for_mode (INTVAL (avr_word (x, n)), HImode); } -rtl_opt_pass * -make_avr_pass_ifelse (gcc::context *ctxt) -{ - return new avr_pass_ifelse (ctxt, "avr-ifelse"); -} +/* Return the sub-word of compile-time constant X that starts + at byte N as an uint16_t. */ - -/* Make one parallel insn with all the patterns from insns i[0]..i[5]. 
*/ - -static rtx_insn * -avr_parallel_insn_from_insns (rtx_insn *i[5]) +static uint16_t +avr_uint16 (rtx x, int n) { - rtvec vec = gen_rtvec (5, PATTERN (i[0]), PATTERN (i[1]), PATTERN (i[2]), - PATTERN (i[3]), PATTERN (i[4])); - start_sequence(); - emit (gen_rtx_PARALLEL (VOIDmode, vec)); - rtx_insn *insn = get_insns(); - end_sequence(); - - return insn; + return (uint16_t) avr_int16 (x, n); } -/* Return true if we see an insn stream generated by casesi expander together - with an extension to SImode of the switch value. - - If this is the case, fill in the insns from casesi to INSNS[1..5] and - the SImode extension to INSNS[0]. Moreover, extract the operands of - pattern casesi_<mode>_sequence forged from the sequence to recog_data. */ +/* Constraint helper function. XVAL is a CONST_INT or a CONST_DOUBLE. + Return true if the least significant N_BYTES bytes of XVAL all have a + popcount in POP_MASK and false, otherwise. POP_MASK represents a subset + of integers which contains an integer N iff bit N of POP_MASK is set. */ -static bool -avr_is_casesi_sequence (basic_block bb, rtx_insn *insn, rtx_insn *insns[5]) +bool +avr_popcount_each_byte (rtx xval, int n_bytes, int pop_mask) { - rtx set_4, set_0; - - /* A first and quick test for a casesi sequences. As a side effect of - the test, harvest respective insns to INSNS[0..4]. */ - - if (!(JUMP_P (insns[4] = insn) - // casesi is the only insn that comes up with UNSPEC_INDEX_JMP, - // hence the following test ensures that we are actually dealing - // with code from casesi. - && (set_4 = single_set (insns[4])) - && UNSPEC == GET_CODE (SET_SRC (set_4)) - && UNSPEC_INDEX_JMP == XINT (SET_SRC (set_4), 1) - - && (insns[3] = prev_real_insn (insns[4])) - && (insns[2] = prev_real_insn (insns[3])) - && (insns[1] = prev_real_insn (insns[2])) - - // Insn prior to casesi. - && (insns[0] = prev_real_insn (insns[1])) - && (set_0 = single_set (insns[0])) - && extend_operator (SET_SRC (set_0), SImode))) - { - return false; - } - - if (dump_file) - { - fprintf (dump_file, ";; Sequence from casesi in " - "[bb %d]:\n\n", bb->index); - for (int i = 0; i < 5; i++) - print_rtl_single (dump_file, insns[i]); - } - - /* We have to deal with quite some operands. Extracting them by hand - would be tedious, therefore wrap the insn patterns into a parallel, - run recog against it and then use insn extract to get the operands. */ - - rtx_insn *xinsn = avr_parallel_insn_from_insns (insns); - - INSN_CODE (xinsn) = recog (PATTERN (xinsn), xinsn, NULL /* num_clobbers */); - - /* Failing to recognize means that someone changed the casesi expander or - that some passes prior to this one performed some unexpected changes. - Gracefully drop such situations instead of aborting. */ - - if (INSN_CODE (xinsn) < 0) + for (int i = 0; i < n_bytes; i++) { - if (dump_file) - fprintf (dump_file, ";; Sequence not recognized, giving up.\n\n"); - - return false; - } - - gcc_assert (CODE_FOR_casesi_qi_sequence == INSN_CODE (xinsn) - || CODE_FOR_casesi_hi_sequence == INSN_CODE (xinsn)); - - extract_insn (xinsn); + unsigned int val8 = avr_uint8 (xval, i); - // Assert on the anatomy of xinsn's operands we are going to work with. 
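For orientation, the kind of source that feeds this matcher is a dense switch on a narrow integer. A minimal C sketch follows (illustrative only, not taken from the patch; whether GCC actually emits a jump table depends on optimization level and case density). The point of the optimization further down is to keep the normalization and bounds check in QImode/HImode instead of first widening the switch value to SImode.

#include <stdint.h>

uint8_t
dispatch (uint8_t cmd)
{
  /* Dense case values typically expand through the casesi expander; the
     uint8_t switch value would otherwise be zero-extended to SImode
     before the range check and table index are formed.  */
  switch (cmd)
    {
    case 4: return 10;
    case 5: return 11;
    case 6: return 13;
    case 7: return 17;
    case 8: return 19;
    case 9: return 23;
    default: return 0;
    }
}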
- - gcc_assert (recog_data.n_operands == 11); - gcc_assert (recog_data.n_dups == 4); - - if (dump_file) - { - fprintf (dump_file, ";; Operands extracted:\n"); - for (int i = 0; i < recog_data.n_operands; i++) - avr_fdump (dump_file, ";; $%d = %r\n", i, recog_data.operand[i]); - fprintf (dump_file, "\n"); + if ((pop_mask & (1 << popcount_hwi (val8))) == 0) + return false; } return true; } -/* Perform some extra checks on operands of casesi_<mode>_sequence. - Not all operand dependencies can be described by means of predicates. - This function performs left over checks and should always return true. - Returning false means that someone changed the casesi expander but did - not adjust casesi_<mode>_sequence. */ +/* Constraint helper function. XVAL is a CONST_INT. Return true if we + can perform XOR without a clobber reg, provided the operation is on + a d-register. This means each byte is in { 0, 0xff, 0x80 }. */ bool -avr_casei_sequence_check_operands (rtx *xop) +avr_xor_noclobber_dconst (rtx xval, int n_bytes) { - rtx sub_5 = NULL_RTX; - - if (AVR_HAVE_EIJMP_EICALL - // The last clobber op of the tablejump. - && xop[8] == all_regs_rtx[REG_24]) + for (int i = 0; i < n_bytes; ++i) { - // $6 is: (subreg:SI ($5) 0) - sub_5 = xop[6]; - } + unsigned int val8 = avr_uint8 (xval, i); - if (!AVR_HAVE_EIJMP_EICALL - // $6 is: (plus:HI (subreg:SI ($5) 0) - // (label_ref ($3))) - && PLUS == GET_CODE (xop[6]) - && LABEL_REF == GET_CODE (XEXP (xop[6], 1)) - && rtx_equal_p (xop[3], XEXP (XEXP (xop[6], 1), 0)) - // The last clobber op of the tablejump. - && xop[8] == const0_rtx) - { - sub_5 = XEXP (xop[6], 0); + if (val8 != 0 && val8 != 0xff && val8 != 0x80) + return false; } - if (sub_5 - && SUBREG_P (sub_5) - && SUBREG_BYTE (sub_5) == 0 - && rtx_equal_p (xop[5], SUBREG_REG (sub_5))) - return true; - - if (dump_file) - fprintf (dump_file, "\n;; Failed condition for casesi_<mode>_sequence\n\n"); - - return false; + return true; } -/* INSNS[1..4] is a sequence as generated by casesi and INSNS[0] is an - extension of an 8-bit or 16-bit integer to SImode. XOP contains the - operands of INSNS as extracted by insn_extract from pattern - casesi_<mode>_sequence: - - $0: SImode reg switch value as result of $9. - $1: Negative of smallest index in switch. - $2: Number of entries in switch. - $3: Label to table. - $4: Label if out-of-bounds. - $5: $0 + $1. - $6: 3-byte PC: subreg:HI ($5) + label_ref ($3) - 2-byte PC: subreg:HI ($5) - $7: HI reg index into table (Z or pseudo) - $8: R24 or const0_rtx (to be clobbered) - $9: Extension to SImode of an 8-bit or 16-bit integer register $10. - $10: QImode or HImode register input of $9. - - Try to optimize this sequence, i.e. use the original HImode / QImode - switch value instead of SImode. */ - -static void -avr_optimize_casesi (rtx_insn *insns[5], rtx *xop) -{ - // Original mode of the switch value; this is QImode or HImode. - machine_mode mode = GET_MODE (xop[10]); - - // How the original switch value was extended to SImode; this is - // SIGN_EXTEND or ZERO_EXTEND. - enum rtx_code code = GET_CODE (xop[9]); - - // Lower index, upper index (plus one) and range of case calues. - HOST_WIDE_INT low_idx = -INTVAL (xop[1]); - HOST_WIDE_INT num_idx = INTVAL (xop[2]); - HOST_WIDE_INT hig_idx = low_idx + num_idx; - - // Maximum ranges of (un)signed QImode resp. HImode. - unsigned umax = QImode == mode ? 0xff : 0xffff; - int imax = QImode == mode ? 0x7f : 0x7fff; - int imin = -imax - 1; - - // Testing the case range and whether it fits into the range of the - // (un)signed mode. 
This test should actually always pass because it - // makes no sense to have case values outside the mode range. Notice - // that case labels which are unreachable because they are outside the - // mode of the switch value (e.g. "case -1" for uint8_t) have already - // been thrown away by the middle-end. - - if (SIGN_EXTEND == code - && low_idx >= imin - && hig_idx <= imax) - { - // ok - } - else if (ZERO_EXTEND == code - && low_idx >= 0 - && (unsigned) hig_idx <= umax) - { - // ok - } - else - { - if (dump_file) - fprintf (dump_file, ";; Case ranges too big, giving up.\n\n"); - return; - } - - // Do normalization of switch value $10 and out-of-bound check in its - // original mode instead of in SImode. Use a newly created pseudo. - // This will replace insns[1..2]. - - start_sequence(); - - rtx reg = copy_to_mode_reg (mode, xop[10]); - - rtx (*gen_add)(rtx,rtx,rtx) = QImode == mode ? gen_addqi3 : gen_addhi3; - rtx (*gen_cbranch)(rtx,rtx,rtx,rtx) - = QImode == mode ? gen_cbranchqi4 : gen_cbranchhi4; - - emit_insn (gen_add (reg, reg, gen_int_mode (-low_idx, mode))); - rtx op0 = reg; rtx op1 = gen_int_mode (num_idx, mode); - rtx labelref = copy_rtx (xop[4]); - rtx xbranch = gen_cbranch (gen_rtx_fmt_ee (GTU, VOIDmode, op0, op1), - op0, op1, labelref); - rtx_insn *cbranch = emit_jump_insn (xbranch); - JUMP_LABEL (cbranch) = xop[4]; - ++LABEL_NUSES (xop[4]); - - rtx_insn *seq1 = get_insns(); - rtx_insn *last1 = get_last_insn(); - end_sequence(); - - emit_insn_after (seq1, insns[2]); - - // After the out-of-bounds test and corresponding branch, use a - // 16-bit index. If QImode is used, extend it to HImode first. - // This will replace insns[4]. - - start_sequence(); - - if (QImode == mode) - reg = force_reg (HImode, gen_rtx_fmt_e (code, HImode, reg)); - - rtx pat_4 = AVR_3_BYTE_PC - ? gen_movhi (xop[7], reg) - : gen_addhi3 (xop[7], reg, gen_rtx_LABEL_REF (VOIDmode, xop[3])); - - emit_insn (pat_4); - - rtx_insn *seq2 = get_insns(); - rtx_insn *last2 = get_last_insn(); - end_sequence(); - - emit_insn_after (seq2, insns[3]); - - if (dump_file) - { - fprintf (dump_file, ";; New insns: "); - - for (rtx_insn *insn = seq1; ; insn = NEXT_INSN (insn)) - { - fprintf (dump_file, "%d, ", INSN_UID (insn)); - if (insn == last1) - break; - } - for (rtx_insn *insn = seq2; ; insn = NEXT_INSN (insn)) - { - fprintf (dump_file, "%d%s", INSN_UID (insn), - insn == last2 ? ".\n\n" : ", "); - if (insn == last2) - break; - } - - fprintf (dump_file, ";; Deleting insns: %d, %d, %d.\n\n", - INSN_UID (insns[1]), INSN_UID (insns[2]), INSN_UID (insns[3])); - } - - // Pseudodelete the SImode and subreg of SImode insns. We don't care - // about the extension insns[0]: Its result is now unused and other - // passes will clean it up. - - SET_INSN_DELETED (insns[1]); - SET_INSN_DELETED (insns[2]); - SET_INSN_DELETED (insns[3]); -} - +/* Access some RTX as INT_MODE. If X is a CONST_FIXED we can get + the bit representation of X by "casting" it to CONST_INT. */ -void -avr_pass_casesi::avr_rest_of_handle_casesi (function *func) +rtx +avr_to_int_mode (rtx x) { - basic_block bb; - - FOR_EACH_BB_FN (bb, func) - { - rtx_insn *insn, *insns[5]; + machine_mode mode = GET_MODE (x); - FOR_BB_INSNS (bb, insn) - { - if (avr_is_casesi_sequence (bb, insn, insns)) - { - avr_optimize_casesi (insns, recog_data.operand); - } - } - } + return VOIDmode == mode + ? x + : simplify_gen_subreg (int_mode_for_mode (mode).require (), x, mode, 0); } -/* A helper for the next method. 
Suppose we have two conditional branches - - if (reg <cond1> xval1) goto label1; - if (reg <cond2> xval2) goto label2; - - If the second comparison is redundant and there is a code <cond> such - that the sequence can be performed as - - REG_CC = compare (reg, xval1); - if (REG_CC <cond1> 0) goto label1; - if (REG_CC <cond> 0) goto label2; - - then return <cond>. Otherwise, return UNKNOWN. - xval1 and xval2 are CONST_INT, and mode is the scalar int mode in which - the comparison will be carried out. reverse_cond1 can be set to reverse - condition cond1. This is useful if the second comparison does not follow - the first one, but is located after label1 like in: - - if (reg <cond1> xval1) goto label1; - ... - label1: - if (reg <cond2> xval2) goto label2; */ - -static enum rtx_code -avr_redundant_compare (enum rtx_code cond1, rtx xval1, - enum rtx_code cond2, rtx xval2, - machine_mode mode, bool reverse_cond1) -{ - HOST_WIDE_INT ival1 = INTVAL (xval1); - HOST_WIDE_INT ival2 = INTVAL (xval2); - - unsigned HOST_WIDE_INT mask = GET_MODE_MASK (mode); - unsigned HOST_WIDE_INT uval1 = mask & UINTVAL (xval1); - unsigned HOST_WIDE_INT uval2 = mask & UINTVAL (xval2); - - if (reverse_cond1) - cond1 = reverse_condition (cond1); - - if (cond1 == EQ) - { - //////////////////////////////////////////////// - // A sequence like - // if (reg == val) goto label1; - // if (reg > val) goto label2; - // can be re-written using the same, simple comparison like in: - // REG_CC = compare (reg, val) - // if (REG_CC == 0) goto label1; - // if (REG_CC >= 0) goto label2; - if (ival1 == ival2 - && (cond2 == GT || cond2 == GTU)) - return avr_normalize_condition (cond2); - - // Similar, but the input sequence is like - // if (reg == val) goto label1; - // if (reg >= val) goto label2; - if (ival1 == ival2 - && (cond2 == GE || cond2 == GEU)) - return cond2; - - // Similar, but the input sequence is like - // if (reg == val) goto label1; - // if (reg >= val + 1) goto label2; - if ((cond2 == GE && ival2 == 1 + ival1) - || (cond2 == GEU && uval2 == 1 + uval1)) - return cond2; - - // Similar, but the input sequence is like - // if (reg == val) goto label1; - // if (reg > val - 1) goto label2; - if ((cond2 == GT && ival2 == ival1 - 1) - || (cond2 == GTU && uval2 == uval1 - 1)) - return avr_normalize_condition (cond2); - - ///////////////////////////////////////////////////////// - // A sequence like - // if (reg == val) goto label1; - // if (reg < 1 + val) goto label2; - // can be re-written as - // REG_CC = compare (reg, val) - // if (REG_CC == 0) goto label1; - // if (REG_CC < 0) goto label2; - if ((cond2 == LT && ival2 == 1 + ival1) - || (cond2 == LTU && uval2 == 1 + uval1)) - return cond2; - - // Similar, but with an input sequence like - // if (reg == val) goto label1; - // if (reg <= val) goto label2; - if (ival1 == ival2 - && (cond2 == LE || cond2 == LEU)) - return avr_normalize_condition (cond2); - - // Similar, but with an input sequence like - // if (reg == val) goto label1; - // if (reg < val) goto label2; - if (ival1 == ival2 - && (cond2 == LT || cond2 == LTU)) - return cond2; - - // Similar, but with an input sequence like - // if (reg == val) goto label1; - // if (reg <= val - 1) goto label2; - if ((cond2 == LE && ival2 == ival1 - 1) - || (cond2 == LEU && uval2 == uval1 - 1)) - return avr_normalize_condition (cond2); - - } // cond1 == EQ +/* Return true if hard register REG supports the ADIW and SBIW instructions. 
*/ - return UNKNOWN; +bool +avr_adiw_reg_p (rtx reg) +{ + return (AVR_HAVE_ADIW + && test_hard_reg_class (ADDW_REGS, reg)); } -/* If-else decision trees generated for switch / case may produce sequences - like - - SREG = compare (reg, val); - if (SREG == 0) goto label1; - SREG = compare (reg, 1 + val); - if (SREG >= 0) goto label2; - - which can be optimized to - - SREG = compare (reg, val); - if (SREG == 0) goto label1; - if (SREG >= 0) goto label2; - - The optimal place for such a pass would be directly after expand, but - it's not possible for a jump insn to target more than one code label. - Hence, run a mini pass right before split2 which introduces REG_CC. */ - -void -avr_pass_ifelse::avr_rest_of_handle_ifelse (function *) +static bool +ra_in_progress () { - rtx_insn *next_insn; - - for (rtx_insn *insn = get_insns(); insn; insn = next_insn) - { - next_insn = next_nonnote_nondebug_insn (insn); - - if (! next_insn) - break; - - // Search for two cbranch insns. The first one is a cbranch. - // Filter for "cbranch<mode>4_insn" with mode in QI, HI, PSI, SI. - - if (! JUMP_P (insn)) - continue; - - int icode1 = recog_memoized (insn); - - if (icode1 != CODE_FOR_cbranchqi4_insn - && icode1 != CODE_FOR_cbranchhi4_insn - && icode1 != CODE_FOR_cbranchpsi4_insn - && icode1 != CODE_FOR_cbranchsi4_insn) - continue; - - rtx_jump_insn *insn1 = as_a<rtx_jump_insn *> (insn); - rtx_jump_insn *insn2 = nullptr; - bool follow_label1 = false; - - // Extract the operands of the first insn: - // $0 = comparison operator ($1, $2) - // $1 = reg - // $2 = reg or const_int - // $3 = code_label - // $4 = optional SCRATCH for HI, PSI, SI cases. - - const auto &op = recog_data.operand; - - extract_insn (insn1); - rtx xop1[5] = { op[0], op[1], op[2], op[3], op[4] }; - int n_operands = recog_data.n_operands; - - // For now, we can optimize cbranches that follow an EQ cbranch, - // and cbranches that follow the label of a NE cbranch. - - if (GET_CODE (xop1[0]) == EQ - && JUMP_P (next_insn) - && recog_memoized (next_insn) == icode1) - { - // The 2nd cbranch insn follows insn1, i.e. is located in the - // fallthrough path of insn1. - - insn2 = as_a<rtx_jump_insn *> (next_insn); - } - else if (GET_CODE (xop1[0]) == NE) - { - // insn1 might branch to a label followed by a cbranch. - - rtx target1 = JUMP_LABEL (insn1); - rtx_insn *code_label1 = JUMP_LABEL_AS_INSN (insn1); - rtx_insn *next = next_nonnote_nondebug_insn (code_label1); - rtx_insn *barrier = prev_nonnote_nondebug_insn (code_label1); - - if (// Target label of insn1 is used exactly once and - // is not a fallthru, i.e. is preceded by a barrier. - LABEL_NUSES (target1) == 1 - && barrier - && BARRIER_P (barrier) - // Following the target label is a cbranch of the same kind. - && next - && JUMP_P (next) - && recog_memoized (next) == icode1) - { - follow_label1 = true; - insn2 = as_a<rtx_jump_insn *> (next); - } - } - - if (! insn2) - continue; - - // Also extract operands of insn2, and filter for REG + CONST_INT - // comparsons against the same register. - - extract_insn (insn2); - rtx xop2[5] = { op[0], op[1], op[2], op[3], op[4] }; - - if (! rtx_equal_p (xop1[1], xop2[1]) - || ! CONST_INT_P (xop1[2]) - || ! 
CONST_INT_P (xop2[2])) - continue; - - machine_mode mode = GET_MODE (xop1[1]); - enum rtx_code code1 = GET_CODE (xop1[0]); - enum rtx_code code2 = GET_CODE (xop2[0]); - - code2 = avr_redundant_compare (code1, xop1[2], code2, xop2[2], - mode, follow_label1); - if (code2 == UNKNOWN) - continue; - - ////////////////////////////////////////////////////// - // Found a replacement. - - if (dump_file) - { - fprintf (dump_file, "\n;; Found chain of jump_insn %d and" - " jump_insn %d, follow_label1=%d:\n", - INSN_UID (insn1), INSN_UID (insn2), follow_label1); - print_rtl_single (dump_file, PATTERN (insn1)); - print_rtl_single (dump_file, PATTERN (insn2)); - } - - if (! follow_label1) - next_insn = next_nonnote_nondebug_insn (insn2); - - // Pop the new branch conditions and the new comparison. - // Prematurely split into compare + branch so that we can drop - // the 2nd comparison. The following pass, split2, splits all - // insns for REG_CC, and it should still work as usual even when - // there are already some REG_CC insns around. - - rtx xcond1 = gen_rtx_fmt_ee (code1, VOIDmode, cc_reg_rtx, const0_rtx); - rtx xcond2 = gen_rtx_fmt_ee (code2, VOIDmode, cc_reg_rtx, const0_rtx); - rtx xpat1 = gen_branch (xop1[3], xcond1); - rtx xpat2 = gen_branch (xop2[3], xcond2); - rtx xcompare = NULL_RTX; - - if (mode == QImode) - { - gcc_assert (n_operands == 4); - xcompare = gen_cmpqi3 (xop1[1], xop1[2]); - } - else - { - gcc_assert (n_operands == 5); - rtx (*gen_cmp)(rtx,rtx,rtx) - = mode == HImode ? gen_gen_comparehi - : mode == PSImode ? gen_gen_comparepsi - : gen_gen_comparesi; // SImode - xcompare = gen_cmp (xop1[1], xop1[2], xop1[4]); - } - - // Emit that stuff. - - rtx_insn *cmp = emit_insn_before (xcompare, insn1); - rtx_jump_insn *branch1 = emit_jump_insn_before (xpat1, insn1); - rtx_jump_insn *branch2 = emit_jump_insn_before (xpat2, insn2); - - JUMP_LABEL (branch1) = xop1[3]; - JUMP_LABEL (branch2) = xop2[3]; - // delete_insn() decrements LABEL_NUSES when deleting a JUMP_INSN, but - // when we pop a new JUMP_INSN, do it by hand. - ++LABEL_NUSES (xop1[3]); - ++LABEL_NUSES (xop2[3]); - - delete_insn (insn1); - delete_insn (insn2); - - // As a side effect, also recog the new insns. - gcc_assert (valid_insn_p (cmp)); - gcc_assert (valid_insn_p (branch1)); - gcc_assert (valid_insn_p (branch2)); - } // loop insns + return avr_lra_p ? lra_in_progress : reload_in_progress; } @@ -1189,17 +505,24 @@ avr_option_override (void) avr_addr.sp_l = 0x3D + avr_arch->sfr_offset; avr_addr.sp_h = avr_addr.sp_l + 1; - init_machine_status = avr_init_machine_status; + init_machine_status = []() + { + return ggc_cleared_alloc<machine_function> (); + }; avr_log_set_avr_log(); -} -/* Function to set up the backend function structure. */ + /* As long as peep2_rescan is not implemented, see + http://gcc.gnu.org/ml/gcc-patches/2011-10/msg02819.html + we add a second peephole2 run to get best results. 
*/ + { + opt_pass *extra_peephole2 + = g->get_passes ()->get_pass_peephole2 ()->clone (); + register_pass_info peep2_2_info + = { extra_peephole2, "peephole2", 1, PASS_POS_INSERT_AFTER }; -static struct machine_function * -avr_init_machine_status (void) -{ - return ggc_cleared_alloc<machine_function> (); + register_pass (&peep2_2_info); + } } @@ -1216,7 +539,9 @@ avr_init_expanders (void) tmp_reg_rtx = all_regs_rtx[AVR_TMP_REGNO]; zero_reg_rtx = all_regs_rtx[AVR_ZERO_REGNO]; - cc_reg_rtx = gen_rtx_REG (CCmode, REG_CC); + cc_reg_rtx = gen_rtx_REG (CCmode, REG_CC); + ccn_reg_rtx = gen_rtx_REG (CCNmode, REG_CC); + cczn_reg_rtx = gen_rtx_REG (CCZNmode, REG_CC); lpm_addr_reg_rtx = gen_rtx_REG (HImode, REG_Z); @@ -1239,10 +564,10 @@ avr_init_expanders (void) /* Implement `REGNO_REG_CLASS'. */ /* Return register class for register R. */ -enum reg_class +reg_class avr_regno_reg_class (int r) { - static const enum reg_class reg_class_tab[] = + static const reg_class reg_class_tab[] = { R0_REG, /* r1 - r15 */ @@ -1896,682 +1221,9 @@ sequent_regs_live (void) } -namespace { -static const pass_data avr_pass_data_fuse_add = -{ - RTL_PASS, // type - "", // name (will be patched) - OPTGROUP_NONE, // optinfo_flags - TV_DF_SCAN, // tv_id - 0, // properties_required - 0, // properties_provided - 0, // properties_destroyed - 0, // todo_flags_start - TODO_df_finish // todo_flags_finish -}; - - -class avr_pass_fuse_add : public rtl_opt_pass -{ -public: - avr_pass_fuse_add (gcc::context *ctxt, const char *name) - : rtl_opt_pass (avr_pass_data_fuse_add, ctxt) - { - this->name = name; - } - - virtual bool gate (function *) { return optimize && avr_fuse_add > 0; } - - virtual unsigned int execute (function *); - - struct Some_Insn - { - rtx_insn *insn = nullptr; - rtx dest, src; - bool valid () const { return insn != nullptr; } - void set_deleted () - { - gcc_assert (insn); - SET_INSN_DELETED (insn); - insn = nullptr; - } - }; - - // If .insn is not NULL, then this is a reg:HI += const_int - // of an address register. - struct Add_Insn : Some_Insn - { - rtx addend; - int regno; - Add_Insn () {} - Add_Insn (rtx_insn *insn); - }; - - // If .insn is not NULL, then this sets an address register - // to a constant value. - struct Ldi_Insn : Some_Insn - { - int regno; - Ldi_Insn () {} - Ldi_Insn (rtx_insn *insn); - }; - - // If .insn is not NULL, then this is a load or store insn where the - // address is REG or POST_INC with an address register. - struct Mem_Insn : Some_Insn - { - rtx reg_or_0, mem, addr, addr_reg; - int addr_regno; - enum rtx_code addr_code; - machine_mode mode; - addr_space_t addr_space; - bool store_p, volatile_p; - Mem_Insn () {} - Mem_Insn (rtx_insn *insn); - }; - - rtx_insn *fuse_ldi_add (Ldi_Insn &prev_ldi, Add_Insn &add); - rtx_insn *fuse_add_add (Add_Insn &prev_add, Add_Insn &add); - rtx_insn *fuse_add_mem (Add_Insn &prev_add, Mem_Insn &mem); - rtx_insn *fuse_mem_add (Mem_Insn &prev_mem, Add_Insn &add); -}; // avr_pass_fuse_add - -} // anon namespace - -rtl_opt_pass * -make_avr_pass_fuse_add (gcc::context *ctxt) -{ - return new avr_pass_fuse_add (ctxt, "avr-fuse-add"); -} - -/* Describe properties of AVR's indirect load and store instructions - LD, LDD, ST, STD, LPM, ELPM depending on register number, volatility etc. 
- Rules for "volatile" accesses are: - - | Xmega | non-Xmega - ------+-----------------+---------------- - load | read LSB first | read LSB first - store | write LSB first | write MSB first -*/ - -struct AVR_LdSt_Props -{ - bool has_postinc, has_predec, has_ldd; - // The insn printers will use POST_INC or PRE_DEC addressing, no matter - // what adressing modes we are feeding into them. - bool want_postinc, want_predec; - - AVR_LdSt_Props (int regno, bool store_p, bool volatile_p, addr_space_t as) - { - bool generic_p = ADDR_SPACE_GENERIC_P (as); - bool flashx_p = ! generic_p && as != ADDR_SPACE_MEMX; - has_postinc = generic_p || (flashx_p && regno == REG_Z); - has_predec = generic_p; - has_ldd = ! AVR_TINY && generic_p && (regno == REG_Y || regno == REG_Z); - want_predec = volatile_p && generic_p && ! AVR_XMEGA && store_p; - want_postinc = volatile_p && generic_p && (AVR_XMEGA || ! store_p); - want_postinc |= flashx_p && regno == REG_Z; - } - - AVR_LdSt_Props (const avr_pass_fuse_add::Mem_Insn &m) - : AVR_LdSt_Props (m.addr_regno, m.store_p, m.volatile_p, m.addr_space) - { - gcc_assert (m.valid ()); - } -}; - -/* Emit a single_set that clobbers REG_CC. */ - -static rtx_insn * -emit_move_ccc (rtx dest, rtx src) -{ - return emit_insn (gen_gen_move_clobbercc (dest, src)); -} - -/* Emit a single_set that clobbers REG_CC after insn AFTER. */ - -static rtx_insn * -emit_move_ccc_after (rtx dest, rtx src, rtx_insn *after) -{ - return emit_insn_after (gen_gen_move_clobbercc (dest, src), after); -} - -static bool -reg_seen_between_p (const_rtx reg, const rtx_insn *from, const rtx_insn *to) -{ - return (reg_used_between_p (reg, from, to) - || reg_set_between_p (reg, from, to)); -} - - -static void -avr_maybe_adjust_cfa (rtx_insn *insn, rtx reg, int addend) -{ - if (addend - && frame_pointer_needed - && REGNO (reg) == FRAME_POINTER_REGNUM - && avr_fuse_add == 3) - { - rtx plus = plus_constant (Pmode, reg, addend); - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (reg, plus)); - } -} - - -// If successful, this represents a SET of a pointer register to a constant. -avr_pass_fuse_add::Ldi_Insn::Ldi_Insn (rtx_insn *insn) -{ - rtx set = single_set (insn); - if (!set) - return; - - src = SET_SRC (set); - dest = SET_DEST (set); - - if (REG_P (dest) - && GET_MODE (dest) == Pmode - && IN_RANGE (regno = REGNO (dest), REG_X, REG_Z) - && CONSTANT_P (src)) - { - this->insn = insn; - } -} - -// If successful, this represents a PLUS with CONST_INT of a pointer -// register X, Y or Z. Otherwise, the object is not valid(). -avr_pass_fuse_add::Add_Insn::Add_Insn (rtx_insn *insn) -{ - rtx set = single_set (insn); - if (!set) - return; - - src = SET_SRC (set); - dest = SET_DEST (set); - if (REG_P (dest) - // We are only interested in PLUSes that change address regs. - && GET_MODE (dest) == Pmode - && IN_RANGE (regno = REGNO (dest), REG_X, REG_Z) - && PLUS == GET_CODE (src) - && rtx_equal_p (XEXP (src, 0), dest) - && CONST_INT_P (XEXP (src, 1))) - { - // This is reg:HI += const_int. - addend = XEXP (src, 1); - this->insn = insn; - } -} - -// If successful, this represents a load or store insn where the addressing -// mode uses pointer register X, Y or Z. Otherwise, the object is not valid(). 
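As a rough source-level illustration of what feeds this pass (a sketch under stated assumptions, not code from the patch): Reduced Tiny has no LDD/STD with displacement, so consecutive accesses through one pointer are split into explicit pointer adjustments plus plain LD/ST, and the Ldi/Add/Mem records above exist to re-combine those pieces into post-increment or pre-decrement addressing.

#include <stdint.h>

void
copy4 (uint8_t *dst, const uint8_t *src)
{
  /* Each dst[i] / src[i] access needs its address in X, Y or Z; after
     splitting, the pointer adjustments and the loads/stores are separate
     insns that fuse-add tries to merge into X+ / -X style accesses.  */
  for (int i = 0; i < 4; ++i)
    dst[i] = src[i];
}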
-avr_pass_fuse_add::Mem_Insn::Mem_Insn (rtx_insn *insn) -{ - rtx set = single_set (insn); - if (!set) - return; - - src = SET_SRC (set); - dest = SET_DEST (set); - mode = GET_MODE (dest); - - if (MEM_P (dest) - && (REG_P (src) || src == CONST0_RTX (mode))) - { - reg_or_0 = src; - mem = dest; - } - else if (REG_P (dest) && MEM_P (src)) - { - reg_or_0 = dest; - mem = src; - } - else - return; - - if (avr_mem_memx_p (mem) - || avr_load_libgcc_p (mem)) - return; - - addr = XEXP (mem, 0); - addr_code = GET_CODE (addr); - - if (addr_code == REG) - addr_reg = addr; - else if (addr_code == POST_INC || addr_code == PRE_DEC) - addr_reg = XEXP (addr, 0); - else - return; - - addr_regno = REGNO (addr_reg); - - if (avr_fuse_add == 2 - && frame_pointer_needed - && addr_regno == FRAME_POINTER_REGNUM) - MEM_VOLATILE_P (mem) = 0; - - if (reg_overlap_mentioned_p (reg_or_0, addr) // Can handle CONSTANT_P. - || addr_regno > REG_Z - || avr_mem_memx_p (mem) - // The following optimizations only handle REG and POST_INC, - // so that's all what we allow here. - || (addr_code != REG && addr_code != POST_INC)) - return; - - addr_space = MEM_ADDR_SPACE (mem); - volatile_p = MEM_VOLATILE_P (mem); - store_p = MEM_P (dest); - - // Turn this "valid". - this->insn = insn; -} - -/* Try to combine a Ldi insn with a PLUS CONST_INT addend to one Ldi insn. - If LDI is valid, then it precedes ADD in the same block. - When a replacement is found, a new insn is emitted and the old insns - are pseudo-deleted. The returned insn is the point where the calling - scanner should continue. When no replacement is found, nullptr is - returned and nothing changed. */ - -rtx_insn * -avr_pass_fuse_add::fuse_ldi_add (Ldi_Insn &ldi, Add_Insn &add) -{ - if (! ldi.valid () - || reg_seen_between_p (ldi.dest, ldi.insn, add.insn)) - { - // If something is between the Ldi and the current insn, we can - // set the Ldi invalid to speed future scans. - return ldi.insn = nullptr; - } - - // Found a Ldi with const and a PLUS insns in the same BB, - // and with no interfering insns between them. - - // Emit new Ldi with the sum of the original offsets after the old Ldi. - rtx xval = plus_constant (Pmode, ldi.src, INTVAL (add.addend)); - - rtx_insn *insn = emit_move_ccc_after (ldi.dest, xval, ldi.insn); - avr_dump (";; new Ldi[%d] insn %d after %d: R%d = %r\n\n", ldi.regno, - INSN_UID (insn), INSN_UID (ldi.insn), ldi.regno, xval); - - rtx_insn *next = NEXT_INSN (add.insn); - ldi.set_deleted (); - add.set_deleted (); - - return next; -} - -/* Try to combine two PLUS insns with CONST_INT addend to one such insn. - If PREV_ADD is valid, then it precedes ADD in the same basic block. - When a replacement is found, a new insn is emitted and the old insns - are pseudo-deleted. The returned insn is the point where the calling - scanner should continue. When no replacement is found, nullptr is - returned and nothing changed. */ - -rtx_insn * -avr_pass_fuse_add::fuse_add_add (Add_Insn &prev_add, Add_Insn &add) -{ - if (! prev_add.valid () - || reg_seen_between_p (add.dest, prev_add.insn, add.insn)) - { - // If something is between the previous Add and the current insn, - // we can set the previous Add invalid to speed future scans. - return prev_add.insn = nullptr; - } - - // Found two PLUS insns in the same BB, and with no interfering - // insns between them. 
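A concrete, simplified illustration (hypothetical source, not from the patch): once displacement addressing has been split into explicit adjustments, the restore add of one access and the set-up add of the next access hit the same pointer register with constant addends, so they merge; when the sum is zero the pair disappears entirely.

#include <stdint.h>

void
poke (uint8_t *p)
{
  /* On Reduced Tiny each store becomes roughly "add; st X+; sub"; the
     trailing sub of the first store and the leading add of the second
     are constant adds of the same register and cancel out here.  */
  p[6] = 1;
  p[7] = 2;
}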
- rtx plus = plus_constant (Pmode, add.src, INTVAL (prev_add.addend)); - - rtx_insn *next; - if (REG_P (plus)) - { - avr_dump (";; Add[%d] from %d annihilates %d\n\n", add.regno, - INSN_UID (prev_add.insn), INSN_UID (add.insn)); - next = NEXT_INSN (add.insn); - } - else - { - // Emit after the current insn, so that it will be picked - // up as next valid Add insn. - next = emit_move_ccc_after (add.dest, plus, add.insn); - avr_dump (";; #1 new Add[%d] insn %d after %d: R%d += %d\n\n", - add.regno, INSN_UID (next), INSN_UID (add.insn), - add.regno, (int) INTVAL (XEXP (plus, 1))); - gcc_assert (GET_CODE (plus) == PLUS); - } - - add.set_deleted (); - prev_add.set_deleted (); - - return next; -} - -/* Try to combine a PLUS of the address register with a load or store insn. - If ADD is valid, then it precedes MEM in the same basic block. - When a replacement is found, a new insn is emitted and the old insns - are pseudo-deleted. The returned insn is the point where the calling - scanner should continue. When no replacement is found, nullptr is - returned and nothing changed. */ - -rtx_insn * -avr_pass_fuse_add::fuse_add_mem (Add_Insn &add, Mem_Insn &mem) -{ - if (! add.valid () - || reg_seen_between_p (add.dest, add.insn, mem.insn)) - { - // If something is between the Add and the current insn, we can - // set the Add invalid to speed future scans. - return add.insn = nullptr; - } - - AVR_LdSt_Props ap { mem }; - - int msize = GET_MODE_SIZE (mem.mode); - - // The mem insn really wants PRE_DEC. - bool case1 = ((mem.addr_code == REG || mem.addr_code == POST_INC) - && msize > 1 && ap.want_predec && ! ap.has_ldd); - - // The offset can be consumed by a PRE_DEC. - bool case2 = (- INTVAL (add.addend) == msize - && (mem.addr_code == REG || mem.addr_code == POST_INC) - && ap.has_predec && ! ap.want_postinc); - - if (! case1 && ! case2) - return nullptr; - - // Change from REG or POST_INC to PRE_DEC. - rtx xmem = change_address (mem.mem, mem.mode, - gen_rtx_PRE_DEC (Pmode, mem.addr_reg)); - rtx dest = mem.store_p ? xmem : mem.reg_or_0; - rtx src = mem.store_p ? mem.reg_or_0 : xmem; - - rtx_insn *next = emit_move_ccc_after (dest, src, mem.insn); - add_reg_note (next, REG_INC, mem.addr_reg); - avr_dump (";; new Mem[%d] insn %d after %d: %r = %r\n\n", mem.addr_regno, - INSN_UID (next), INSN_UID (mem.insn), dest, src); - - // Changing REG or POST_INC -> PRE_DEC means that the addend before - // the memory access must be increased by the size of the access, - rtx plus = plus_constant (Pmode, add.src, msize); - if (! REG_P (plus)) - { - rtx_insn *insn = emit_move_ccc_after (add.dest, plus, add.insn); - avr_dump (";; #2 new Add[%d] insn %d after %d: R%d += %d\n\n", - add.regno, INSN_UID (insn), INSN_UID (add.insn), - add.regno, (int) INTVAL (XEXP (plus, 1))); - gcc_assert (GET_CODE (plus) == PLUS); - } - else - avr_dump (";; Add[%d] insn %d consumed into %d\n\n", - add.regno, INSN_UID (add.insn), INSN_UID (next)); - - // Changing POST_INC -> PRE_DEC means that the addend after the mem has to be - // the size of the access. The hope is that this new add insn may be unused. - if (mem.addr_code == POST_INC) - { - plus = plus_constant (Pmode, add.dest, msize); - rtx_insn *next2 = emit_move_ccc_after (add.dest, plus, next); - avr_dump (";; #3 new Add[%d] insn %d after %d: R%d += %d\n\n", add.regno, - INSN_UID (next2), INSN_UID (next), add.regno, msize); - next = next2; - } - - add.set_deleted (); - mem.set_deleted (); - - return next; -} - -/* Try to combine a load or store insn with a PLUS of the address register. 
- If MEM is valid, then it precedes ADD in the same basic block. - When a replacement is found, a new insn is emitted and the old insns - are pseudo-deleted. The returned insn is the point where the calling - scanner should continue. When no replacement is found, nullptr is - returned and nothing changed. */ - -rtx_insn * -avr_pass_fuse_add::fuse_mem_add (Mem_Insn &mem, Add_Insn &add) -{ - if (! mem.valid () - || reg_seen_between_p (add.dest, mem.insn, add.insn)) - { - // If something is between the Mem and the current insn, we can - // set the Mem invalid to speed future scans. - return mem.insn = nullptr; - } - - AVR_LdSt_Props ap { mem }; - - int msize = GET_MODE_SIZE (mem.mode); - - // The add insn can be consumed by a POST_INC. - bool case1 = (mem.addr_code == REG - && INTVAL (add.addend) == msize - && ap.has_postinc && ! ap.want_predec); - - // There are cases where even a partial consumption of the offset is better. - // This are the cases where no LD+offset addressing is available, because - // the address register is obviously used after the mem insn, and a mem insn - // with REG addressing mode will have to restore the address. - bool case2 = (mem.addr_code == REG - && msize > 1 && ap.want_postinc && ! ap.has_ldd); - - if (! case1 && ! case2) - return nullptr; - - // Change addressing mode from REG to POST_INC. - rtx xmem = change_address (mem.mem, mem.mode, - gen_rtx_POST_INC (Pmode, mem.addr_reg)); - rtx dest = mem.store_p ? xmem : mem.reg_or_0; - rtx src = mem.store_p ? mem.reg_or_0 : xmem; - - rtx_insn *insn = emit_move_ccc_after (dest, src, mem.insn); - add_reg_note (insn, REG_INC, mem.addr_reg); - avr_dump (";; new Mem[%d] insn %d after %d: %r = %r\n\n", add.regno, - INSN_UID (insn), INSN_UID (mem.insn), dest, src); - - rtx_insn *next = NEXT_INSN (add.insn); - - // Changing REG -> POST_INC means that the post addend must be - // decreased by the size of the access. - rtx plus = plus_constant (Pmode, add.src, -msize); - if (! REG_P (plus)) - { - next = emit_move_ccc_after (mem.addr_reg, plus, add.insn); - avr_dump (";; #4 new Add[%d] insn %d after %d: R%d += %d\n\n", - add.regno, INSN_UID (next), INSN_UID (add.insn), - add.regno, (int) INTVAL (XEXP (plus, 1))); - gcc_assert (GET_CODE (plus) == PLUS); - } - else - avr_dump (";; Add[%d] insn %d consumed into %d\n\n", - add.regno, INSN_UID (add.insn), INSN_UID (insn)); - - add.set_deleted (); - mem.set_deleted (); - - return next; -} - -/* Try to post-reload combine PLUS with CONST_INt of pointer registers with: - - Sets to a constant address. - - PLUS insn of that kind. - - Indirect loads and stores. - In almost all cases, combine opportunities arise from the preparation - done by `avr_split_tiny_move', but in some rare cases combinations are - found for the ordinary cores, too. - As we consider at most one Mem insn per try, there may still be missed - optimizations like POST_INC + PLUS + POST_INC might be performed - as PRE_DEC + PRE_DEC for two adjacent locations. 
*/ - -unsigned int -avr_pass_fuse_add::execute (function *func) -{ - df_note_add_problem (); - df_analyze (); - - int n_add = 0, n_mem = 0, n_ldi = 0; - basic_block bb; - - FOR_EACH_BB_FN (bb, func) - { - Ldi_Insn prev_ldi_insns[REG_32]; - Add_Insn prev_add_insns[REG_32]; - Mem_Insn prev_mem_insns[REG_32]; - rtx_insn *insn, *curr; - - avr_dump ("\n;; basic block %d\n\n", bb->index); - - FOR_BB_INSNS_SAFE (bb, insn, curr) - { - rtx_insn *next = nullptr; - Ldi_Insn ldi_insn { insn }; - Add_Insn add_insn { insn }; - Mem_Insn mem_insn { insn }; - - if (add_insn.valid ()) - { - // Found reg:HI += const_int - avr_dump (";; insn %d: Add[%d]: R%d += %d\n\n", - INSN_UID (add_insn.insn), add_insn.regno, - add_insn.regno, (int) INTVAL (add_insn.addend)); - Ldi_Insn &prev_ldi_insn = prev_ldi_insns[add_insn.regno]; - Add_Insn &prev_add_insn = prev_add_insns[add_insn.regno]; - Mem_Insn &prev_mem_insn = prev_mem_insns[add_insn.regno]; - if ((next = fuse_ldi_add (prev_ldi_insn, add_insn))) - curr = next, n_ldi += 1; - else if ((next = fuse_add_add (prev_add_insn, add_insn))) - curr = next, n_add += 1; - else if ((next = fuse_mem_add (prev_mem_insn, add_insn))) - curr = next, n_mem += 1; - else - prev_add_insn = add_insn; - } - else if (mem_insn.valid ()) - { - int addr_regno = REGNO (mem_insn.addr_reg); - avr_dump (";; insn %d: Mem[%d]: %r = %r\n\n", - INSN_UID (mem_insn.insn), addr_regno, - mem_insn.dest, mem_insn.src); - Add_Insn &prev_add_insn = prev_add_insns[addr_regno]; - if ((next = fuse_add_mem (prev_add_insn, mem_insn))) - curr = next, n_mem += 1; - else - prev_mem_insns[addr_regno] = mem_insn; - } - else if (ldi_insn.valid ()) - { - if (! CONST_INT_P (ldi_insn.src)) - avr_dump (";; insn %d: Ldi[%d]: R%d = %r\n\n", - INSN_UID (ldi_insn.insn), ldi_insn.regno, - ldi_insn.regno, ldi_insn.src); - prev_ldi_insns[ldi_insn.regno] = ldi_insn; - } - } // for insns - } // for BBs - - avr_dump (";; Function %f: Found %d changes: %d ldi, %d add, %d mem.\n", - n_ldi + n_add + n_mem, n_ldi, n_add, n_mem); - - return 0; -} - - -namespace { -static const pass_data avr_pass_data_pre_proep = -{ - RTL_PASS, // type - "", // name (will be patched) - OPTGROUP_NONE, // optinfo_flags - TV_DF_SCAN, // tv_id - 0, // properties_required - 0, // properties_provided - 0, // properties_destroyed - 0, // todo_flags_start - 0 // todo_flags_finish -}; - - -class avr_pass_pre_proep : public rtl_opt_pass -{ -public: - avr_pass_pre_proep (gcc::context *ctxt, const char *name) - : rtl_opt_pass (avr_pass_data_pre_proep, ctxt) - { - this->name = name; - } - - void compute_maybe_gasisr (function *); - - virtual unsigned int execute (function *fun) - { - if (avr_gasisr_prologues - // Whether this function is an ISR worth scanning at all. - && !fun->machine->is_no_gccisr - && (fun->machine->is_interrupt - || fun->machine->is_signal) - && !cfun->machine->is_naked - // Paranoia: Non-local gotos and labels that might escape. - && !cfun->calls_setjmp - && !cfun->has_nonlocal_label - && !cfun->has_forced_label_in_static) - { - compute_maybe_gasisr (fun); - } - - return 0; - } - -}; // avr_pass_pre_proep - -} // anon namespace - -rtl_opt_pass * -make_avr_pass_pre_proep (gcc::context *ctxt) -{ - return new avr_pass_pre_proep (ctxt, "avr-pre-proep"); -} - - -/* Set fun->machine->gasisr.maybe provided we don't find anything that - prohibits GAS generating parts of ISR prologues / epilogues for us. */ - -void -avr_pass_pre_proep::compute_maybe_gasisr (function *fun) -{ - // Don't use BB iterators so that we see JUMP_TABLE_DATA. 
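For reference, a handler of the shape this scan accepts (an illustrative sketch; the avr-libc ISR macro and the TIMER0_OVF_vect vector name are assumptions that depend on the device, and -mgas-isr-prologues must be in effect): it contains no calls, no jump tables and no non-local gotos, so the prologue and epilogue can be delegated to the assembler's __gcc_isr support.

#include <stdint.h>
#include <avr/interrupt.h>

volatile uint8_t ticks;

/* Nothing in this body triggers an early return from the scan above, so
   gasisr.maybe is set and GAS emits the register save/restore chunks
   through __gcc_isr instead of GCC open-coding them.  */
ISR (TIMER0_OVF_vect)
{
  ++ticks;
}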
- - for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn)) - { - // Transparent calls always use [R]CALL and are filtered out by GAS. - // ISRs don't use -mcall-prologues, hence what remains to be filtered - // out are open coded (tail) calls. - - if (CALL_P (insn)) - return; - - // __tablejump2__ clobbers something and is targeted by JMP so - // that GAS won't see its usage. - - if (AVR_HAVE_JMP_CALL - && JUMP_TABLE_DATA_P (insn)) - return; - - // Non-local gotos not seen in *FUN. - - if (JUMP_P (insn) - && find_reg_note (insn, REG_NON_LOCAL_GOTO, NULL_RTX)) - return; - } - - fun->machine->gasisr.maybe = 1; -} - - /* Obtain the length sequence of insns. */ -int +static int get_sequence_length (rtx_insn *insns) { int length = 0; @@ -3478,7 +2130,7 @@ avr_address_tiny_absdata_p (rtx x, machine_mode mode) static inline bool avr_reg_ok_for_addr_p (rtx reg, addr_space_t as, - RTX_CODE outer_code, bool strict) + rtx_code outer_code, bool strict) { return (REG_P (reg) && (avr_regno_mode_code_ok_for_base_p (REGNO (reg), QImode, @@ -3767,14 +2419,13 @@ ptrreg_to_str (int regno) return NULL; } -/* Return the condition name as a string. - Used in conditional jump constructing */ + +/* Return the condition name as a string to be used in a BR** instruction. + Used in conditional jump constructing. */ static const char * -cond_string (enum rtx_code code) +avr_cond_string (rtx_code code, bool cc_overflow_unusable) { - bool cc_overflow_unusable = false; - switch (code) { case NE: @@ -3782,15 +2433,9 @@ cond_string (enum rtx_code code) case EQ: return "eq"; case GE: - if (cc_overflow_unusable) - return "pl"; - else - return "ge"; + return cc_overflow_unusable ? "pl" : "ge"; case LT: - if (cc_overflow_unusable) - return "mi"; - else - return "lt"; + return cc_overflow_unusable ? "mi" : "lt"; case GEU: return "sh"; case LTU: @@ -3820,6 +2465,7 @@ avr_address_tiny_pm_p (rtx x) return false; } + /* Implement `TARGET_PRINT_OPERAND_ADDRESS'. */ /* Output ADDR to FILE as address. 
*/ @@ -4071,7 +2717,7 @@ avr_print_operand (FILE *file, rtx x, int code) " with data memory address")) { output_addr_const (stderr, x); - fprintf(stderr,"\n"); + fprintf (stderr, "\n"); } /* Use normal symbol for direct address no linker trampoline needed */ output_addr_const (file, x); @@ -4103,10 +2749,11 @@ avr_print_operand (FILE *file, rtx x, int code) } else if (GET_CODE (x) == CONST_STRING) fputs (XSTR (x, 0), file); - else if (code == 'j') - fputs (cond_string (GET_CODE (x)), file); - else if (code == 'k') - fputs (cond_string (reverse_condition (GET_CODE (x))), file); + else if (code == 'j' || code == 'L') + fputs (avr_cond_string (GET_CODE (x), code == 'L'), file); + else if (code == 'k' || code == 'K') + fputs (avr_cond_string (reverse_condition (GET_CODE (x)), code == 'K'), + file); else avr_print_operand_address (file, VOIDmode, x); } @@ -4121,7 +2768,7 @@ avr_print_operand (FILE *file, rtx x, int code) static bool avr_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, unsigned int align, - enum by_pieces_operation op, bool speed_p) + by_pieces_operation op, bool speed_p) { if (op != MOVE_BY_PIECES || (speed_p && size > MOVE_MAX_PIECES)) @@ -4130,6 +2777,7 @@ avr_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, return size <= MOVE_MAX_PIECES; } + /* Choose mode for jump insn: 1 - relative jump in range -63 <= x <= 62 ; 2 - relative jump in range -2046 <= x <= 2045 ; @@ -4156,16 +2804,21 @@ avr_jump_mode (rtx x, rtx_insn *insn, int extra) return 2; } -/* Return an AVR condition jump commands. - X is a comparison RTX. - LEN is a number returned by avr_jump_mode function. - If REVERSE nonzero then condition code in X must be reversed. */ + +/* Return the asm code for conditional branch INSN, where XOP[0] is the jump + target label and XOP[1] is a comparison operator of REG_CC against 0. */ const char * -ret_cond_branch (rtx x, int len, int reverse) +avr_cond_branch (rtx_insn *insn, rtx *xop) { - RTX_CODE cond = reverse ? reverse_condition (GET_CODE (x)) : GET_CODE (x); - bool cc_overflow_unusable = false; + machine_mode ccmode = GET_MODE (XEXP (xop[1], 0)); + rtx_code cond = GET_CODE (xop[1]); + bool cc_overflow_unusable = ccmode != CCmode; + int len = avr_jump_mode (xop[0], insn); + + if (ccmode == CCNmode) + // The N flag can only do < 0 and >= 0. + gcc_assert (cond == GE || cond == LT); switch (cond) { @@ -4227,33 +2880,20 @@ ret_cond_branch (rtx x, int len, int reverse) "brsh .+4" CR_TAB "jmp %0")); default: - if (reverse) - { - switch (len) - { - case 1: - return "br%k1 %0"; - case 2: - return ("br%j1 .+2" CR_TAB - "rjmp %0"); - default: - return ("br%j1 .+4" CR_TAB - "jmp %0"); - } - } - else + switch (len) { - switch (len) - { - case 1: - return "br%j1 %0"; - case 2: - return ("br%k1 .+2" CR_TAB - "rjmp %0"); - default: - return ("br%k1 .+4" CR_TAB - "jmp %0"); - } + case 1: + return cc_overflow_unusable + ? "br%L1 %0" + : "br%j1 %0"; + case 2: + return cc_overflow_unusable + ? "br%K1 .+2" CR_TAB "rjmp %0" + : "br%k1 .+2" CR_TAB "rjmp %0"; + default: + return cc_overflow_unusable + ? "br%K1 .+4" CR_TAB "jmp %0" + : "br%k1 .+4" CR_TAB "jmp %0"; } } return ""; @@ -4337,6 +2977,7 @@ avr_init_cumulative_args (CUMULATIVE_ARGS *cum, tree fntype, rtx libname, cfun->machine->sibcall_fails = 0; } + /* Returns the number of registers to allocate for a function argument. */ static int @@ -4503,6 +3144,152 @@ avr_xload_libgcc_p (machine_mode mode) } +/* Return true when INSN has a REG_UNUSED note for hard reg REG. 
+ rtlanal.cc::find_reg_note() uses == to compare XEXP (link, 0) + therefore use a custom function. */ + +static bool +avr_insn_has_reg_unused_note_p (rtx_insn *insn, rtx reg) +{ + for (rtx link = REG_NOTES (insn); link; link = XEXP (link, 1)) + if (REG_NOTE_KIND (link) == REG_UNUSED + && REG_P (XEXP (link, 0)) + && REGNO (reg) >= REGNO (XEXP (link, 0)) + && END_REGNO (reg) <= END_REGNO (XEXP (link, 0))) + return true; + + return false; +} + + +/* A helper for the next function. + Return nonzero if REG is not used after INSN. + We assume REG is a reload reg, and therefore does + not live past labels. It may live past calls or jumps though. */ + +static bool +_reg_unused_after (rtx_insn *insn, rtx reg, bool look_at_insn) +{ + if (look_at_insn) + { + /* If the reg is set by this instruction, then it is safe for our + case. Disregard the case where this is a store to memory, since + we are checking a register used in the store address. */ + rtx set = single_set (insn); + if (set && !MEM_P (SET_DEST (set)) + && reg_overlap_mentioned_p (reg, SET_DEST (set))) + return 1; + + /* This case occurs when fuse-add introduced a POST_INC addressing, + but the address register is unused after. */ + if (set) + { + rtx mem = MEM_P (SET_SRC (set)) ? SET_SRC (set) : SET_DEST (set); + if (MEM_P (mem) + && reg_overlap_mentioned_p (reg, XEXP (mem, 0)) + && avr_insn_has_reg_unused_note_p (insn, reg)) + return 1; + } + } + + while ((insn = NEXT_INSN (insn))) + { + rtx set; + rtx_code code = GET_CODE (insn); + +#if 0 + /* If this is a label that existed before reload, then the register + if dead here. However, if this is a label added by reorg, then + the register may still be live here. We can't tell the difference, + so we just ignore labels completely. */ + if (code == CODE_LABEL) + return 1; + /* else */ +#endif + + if (!INSN_P (insn)) + continue; + + if (code == JUMP_INSN) + return 0; + + /* If this is a sequence, we must handle them all at once. + We could have for instance a call that sets the target register, + and an insn in a delay slot that uses the register. In this case, + we must return 0. 
*/ + else if (code == INSN && GET_CODE (PATTERN (insn)) == SEQUENCE) + { + rtx_sequence *seq = as_a <rtx_sequence *> (PATTERN (insn)); + int retval = 0; + + for (int i = 0; i < seq->len (); i++) + { + rtx_insn *this_insn = seq->insn (i); + rtx set = single_set (this_insn); + + if (CALL_P (this_insn)) + code = CALL_INSN; + else if (JUMP_P (this_insn)) + { + if (INSN_ANNULLED_BRANCH_P (this_insn)) + return 0; + code = JUMP_INSN; + } + + if (set && reg_overlap_mentioned_p (reg, SET_SRC (set))) + return 0; + if (set && reg_overlap_mentioned_p (reg, SET_DEST (set))) + { + if (!MEM_P (SET_DEST (set))) + retval = 1; + else + return 0; + } + if (set == 0 + && reg_overlap_mentioned_p (reg, PATTERN (this_insn))) + return 0; + } + if (retval == 1) + return 1; + else if (code == JUMP_INSN) + return 0; + } + + if (code == CALL_INSN) + { + rtx tem; + for (tem = CALL_INSN_FUNCTION_USAGE (insn); tem; tem = XEXP (tem, 1)) + if (GET_CODE (XEXP (tem, 0)) == USE + && REG_P (XEXP (XEXP (tem, 0), 0)) + && reg_overlap_mentioned_p (reg, XEXP (XEXP (tem, 0), 0))) + return 0; + if (call_used_or_fixed_reg_p (REGNO (reg))) + return 1; + } + + set = single_set (insn); + + if (set && reg_overlap_mentioned_p (reg, SET_SRC (set))) + return 0; + if (set && reg_overlap_mentioned_p (reg, SET_DEST (set))) + return !MEM_P (SET_DEST (set)); + if (set == 0 && reg_overlap_mentioned_p (reg, PATTERN (insn))) + return 0; + } + return 1; +} + + +/* Return nonzero if register REG dead after INSN. */ + +int +reg_unused_after (rtx_insn *insn, rtx reg) +{ + return (dead_or_set_p (insn, reg) + || (REG_P (reg) && _reg_unused_after (insn, reg, true))); +} + + /* Fixme: This is a hack because secondary reloads don't works as expected. Find an unused d-register to be used as scratch in INSN. @@ -4624,7 +3411,7 @@ avr_out_lpm_no_lpmx (rtx_insn *insn, rtx *xop, int *plen) for (int i = 0; i < n_bytes; ++i) { - rtx reg = simplify_gen_subreg (QImode, dest, GET_MODE (dest), i); + rtx reg = avr_byte (dest, i); if (i > 0) avr_asm_len ("adiw %2,1", xop, plen, 1); @@ -4672,7 +3459,7 @@ avr_out_lpm (rtx_insn *insn, rtx *op, int *plen) } rtx addr = XEXP (src, 0); - RTX_CODE code = GET_CODE (addr); + rtx_code code = GET_CODE (addr); gcc_assert (REG_P (dest)); gcc_assert (REG == code || POST_INC == code); @@ -4848,6 +3635,232 @@ avr_out_xload (rtx_insn * /*insn*/, rtx *op, int *plen) } +/* A helper for `output_reload_insisf' and `output_reload_inhi'. */ +/* Set register OP[0] to compile-time constant OP[1]. + CLOBBER_REG is a QI clobber register or NULL_RTX. + LEN == NULL: output instructions. + LEN != NULL: set *LEN to the length of the instruction sequence + (in words) printed with LEN = NULL. + If CLEAR_P is true, OP[0] had been cleard to Zero already. + If CLEAR_P is false, nothing is known about OP[0]. + + The effect on cc0 is as follows: + + Load 0 to any register except ZERO_REG : NONE + Load ld register with any value : NONE + Anything else: : CLOBBER */ + +static void +output_reload_in_const (rtx *op, rtx clobber_reg, int *len, bool clear_p) +{ + rtx src = op[1]; + rtx dest = op[0]; + rtx xval, xdest[4]; + int ival[4]; + int clobber_val = 1234; + bool cooked_clobber_p = false; + bool set_p = false; + machine_mode mode = GET_MODE (dest); + int n_bytes = GET_MODE_SIZE (mode); + + gcc_assert (REG_P (dest) + && CONSTANT_P (src)); + + if (len) + *len = 0; + + /* (REG:SI 14) is special: It's neither in LD_REGS nor in NO_LD_REGS + but has some subregs that are in LD_REGS. Use the MSB (REG:QI 17). 
*/ + + if (REGNO (dest) < REG_16 + && END_REGNO (dest) > REG_16) + { + clobber_reg = all_regs_rtx[END_REGNO (dest) - 1]; + } + + /* We might need a clobber reg but don't have one. Look at the value to + be loaded more closely. A clobber is only needed if it is a symbol + or contains a byte that is neither 0, -1 or a power of 2. */ + + if (NULL_RTX == clobber_reg + && !test_hard_reg_class (LD_REGS, dest) + && (! (CONST_INT_P (src) || CONST_FIXED_P (src) || CONST_DOUBLE_P (src)) + || !avr_popcount_each_byte (src, n_bytes, + (1 << 0) | (1 << 1) | (1 << 8)))) + { + /* We have no clobber register but need one. Cook one up. + That's cheaper than loading from constant pool. */ + + cooked_clobber_p = true; + clobber_reg = all_regs_rtx[REG_Z + 1]; + avr_asm_len ("mov __tmp_reg__,%0", &clobber_reg, len, 1); + } + + /* Now start filling DEST from LSB to MSB. */ + + for (int n = 0; n < n_bytes; n++) + { + bool done_byte = false; + rtx xop[3]; + + /* Crop the n-th destination byte. */ + + xdest[n] = avr_byte (dest, n); + int ldreg_p = test_hard_reg_class (LD_REGS, xdest[n]); + + if (!CONST_INT_P (src) + && !CONST_FIXED_P (src) + && !CONST_DOUBLE_P (src)) + { + static const char *const asm_code[][2] = + { + { "ldi %2,lo8(%1)" CR_TAB "mov %0,%2", "ldi %0,lo8(%1)" }, + { "ldi %2,hi8(%1)" CR_TAB "mov %0,%2", "ldi %0,hi8(%1)" }, + { "ldi %2,hlo8(%1)" CR_TAB "mov %0,%2", "ldi %0,hlo8(%1)" }, + { "ldi %2,hhi8(%1)" CR_TAB "mov %0,%2", "ldi %0,hhi8(%1)" } + }; + + xop[0] = xdest[n]; + xop[1] = src; + xop[2] = clobber_reg; + + avr_asm_len (asm_code[n][ldreg_p], xop, len, ldreg_p ? 1 : 2); + + continue; + } + + /* Crop the n-th source byte. */ + + xval = avr_byte (src, n); + ival[n] = INTVAL (xval); + + /* Look if we can reuse the low word by means of MOVW. */ + + if (n == 2 + && n_bytes >= 4 + && AVR_HAVE_MOVW) + { + int lo16 = avr_int16 (src, 0); + int hi16 = avr_int16 (src, 2); + + if (lo16 == hi16) + { + if (lo16 != 0 || ! clear_p) + avr_asm_len ("movw %C0,%A0", &op[0], len, 1); + + break; + } + } + + /* Don't use CLR so that cc0 is set as expected. */ + + if (ival[n] == 0) + { + if (!clear_p) + avr_asm_len (ldreg_p ? "ldi %0,0" + : AVR_ZERO_REGNO == REGNO (xdest[n]) ? "clr %0" + : "mov %0,__zero_reg__", + &xdest[n], len, 1); + continue; + } + + if (clobber_val == ival[n] + && REGNO (clobber_reg) == REGNO (xdest[n])) + { + continue; + } + + /* LD_REGS can use LDI to move a constant value */ + + if (ldreg_p) + { + xop[0] = xdest[n]; + xop[1] = xval; + avr_asm_len ("ldi %0,lo8(%1)", xop, len, 1); + continue; + } + + /* Try to reuse value already loaded in some lower byte. */ + + for (int j = 0; j < n; j++) + if (ival[j] == ival[n]) + { + xop[0] = xdest[n]; + xop[1] = xdest[j]; + + avr_asm_len ("mov %0,%1", xop, len, 1); + done_byte = true; + break; + } + + if (done_byte) + continue; + + /* Need no clobber reg for -1: Use CLR/DEC */ + + if (ival[n] == -1) + { + if (!clear_p) + avr_asm_len ("clr %0", &xdest[n], len, 1); + + avr_asm_len ("dec %0", &xdest[n], len, 1); + continue; + } + else if (ival[n] == 1) + { + if (!clear_p) + avr_asm_len ("clr %0", &xdest[n], len, 1); + + avr_asm_len ("inc %0", &xdest[n], len, 1); + continue; + } + + /* Use T flag or INC to manage powers of 2 if we have + no clobber reg. 
*/ + + if (NULL_RTX == clobber_reg + && single_one_operand (xval, QImode)) + { + xop[0] = xdest[n]; + xop[1] = GEN_INT (exact_log2 (ival[n] & GET_MODE_MASK (QImode))); + + gcc_assert (constm1_rtx != xop[1]); + + if (!set_p) + { + set_p = true; + avr_asm_len ("set", xop, len, 1); + } + + if (!clear_p) + avr_asm_len ("clr %0", xop, len, 1); + + avr_asm_len ("bld %0,%1", xop, len, 1); + continue; + } + + /* We actually need the LD_REGS clobber reg. */ + + gcc_assert (NULL_RTX != clobber_reg); + + xop[0] = xdest[n]; + xop[1] = xval; + xop[2] = clobber_reg; + clobber_val = ival[n]; + + avr_asm_len ("ldi %2,lo8(%1)" CR_TAB + "mov %0,%2", xop, len, 2); + } + + /* If we cooked up a clobber reg above, restore it. */ + + if (cooked_clobber_p) + { + avr_asm_len ("mov %0,__tmp_reg__", &clobber_reg, len, 1); + } +} + + const char * output_movqi (rtx_insn *insn, rtx operands[], int *plen) { @@ -4951,7 +3964,7 @@ output_movhi (rtx_insn *insn, rtx xop[], int *plen) } /* REG_P (src) */ else if (CONSTANT_P (src)) { - return output_reload_inhi (xop, NULL, plen); + return output_reload_inhi (xop, NULL_RTX, plen); } else if (MEM_P (src)) { @@ -5588,7 +4601,7 @@ avr_out_movsi_mr_r_reg_no_disp_tiny (rtx_insn *insn, rtx op[], int *l) /* "ld r26,-X" is undefined */ if (reg_unused_after (insn, base)) { - return *l = 7, ("mov __tmp_reg__, %B1" CR_TAB + return *l = 7, ("mov __tmp_reg__,%B1" CR_TAB "st %0,%A1" CR_TAB TINY_ADIW (%E0, %F0, 1) CR_TAB "st %0+,__tmp_reg__" CR_TAB @@ -5597,7 +4610,7 @@ avr_out_movsi_mr_r_reg_no_disp_tiny (rtx_insn *insn, rtx op[], int *l) } else { - return *l = 9, ("mov __tmp_reg__, %B1" CR_TAB + return *l = 9, ("mov __tmp_reg__,%B1" CR_TAB "st %0,%A1" CR_TAB TINY_ADIW (%E0, %F0, 1) CR_TAB "st %0+,__tmp_reg__" CR_TAB @@ -5695,7 +4708,7 @@ out_movsi_mr_r (rtx_insn *insn, rtx op[], int *l) { if (io_address_operand (base, SImode)) { - return *l=4,("out %i0, %A1" CR_TAB + return *l=4,("out %i0,%A1" CR_TAB "out %i0+1,%B1" CR_TAB "out %i0+2,%C1" CR_TAB "out %i0+3,%D1"); @@ -6786,182 +5799,6 @@ out_movhi_mr_r (rtx_insn *insn, rtx op[], int *plen) } -/* During reload, we allow much more addresses than Reduced Tiny actually - supports. Split them after reload in order to get closer to the - core's capabilities. This sets the stage for pass .avr-fuse-add. */ - -bool -avr_split_tiny_move (rtx_insn * /*insn*/, rtx *xop) -{ - bool store_p = false; - rtx mem, reg_or_0; - - if (REG_P (xop[0]) && MEM_P (xop[1])) - { - reg_or_0 = xop[0]; - mem = xop[1]; - } - else if (MEM_P (xop[0]) - && (REG_P (xop[1]) - || xop[1] == CONST0_RTX (GET_MODE (xop[0])))) - { - mem = xop[0]; - reg_or_0 = xop[1]; - store_p = true; - } - else - return false; - - machine_mode mode = GET_MODE (mem); - rtx base, addr = XEXP (mem, 0); - enum rtx_code addr_code = GET_CODE (addr); - - if (REG_P (reg_or_0) - && reg_overlap_mentioned_p (reg_or_0, addr)) - return false; - else if (addr_code == PLUS || addr_code == PRE_DEC || addr_code == POST_INC) - base = XEXP (addr, 0); - else if (addr_code == REG) - base = addr; - else - return false; - - if (REGNO (base) > REG_Z) - return false; - - if (! AVR_TINY - // Only keep base registers that can't do PLUS addressing. - && ((REGNO (base) != REG_X - && ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (mem))) - || avr_load_libgcc_p (mem) - || avr_mem_memx_p (mem))) - return false; - - bool volatile_p = MEM_VOLATILE_P (mem); - bool mem_volatile_p = false; - if (frame_pointer_needed - && REGNO (base) == FRAME_POINTER_REGNUM) - { - if (avr_fuse_add < 2 - // Be a projection (we always split PLUS). 
- || (avr_fuse_add == 2 && volatile_p && addr_code != PLUS)) - return false; - - // Changing the frame pointer locally may confuse later passes - // like .dse2 which don't track changes of FP, not even when - // respective CFA notes are present. An example is pr22141-1.c. - if (avr_fuse_add == 2) - mem_volatile_p = true; - } - - enum rtx_code new_code = UNKNOWN; - HOST_WIDE_INT add = 0, sub = 0; - int msize = GET_MODE_SIZE (mode); - - AVR_LdSt_Props ap { REGNO (base), store_p, volatile_p, ADDR_SPACE_GENERIC }; - - switch (addr_code) - { - default: - return false; - - case PLUS: - add = INTVAL (XEXP (addr, 1)); - if (msize == 1) - { - new_code = REG; - sub = -add; - } - else if (ap.want_predec) - { - // volatile stores prefer PRE_DEC (MSB first) - sub = -add; - add += msize; - new_code = PRE_DEC; - } - else - { - new_code = POST_INC; - sub = -add - msize; - } - break; - - case POST_INC: - // volatile stores prefer PRE_DEC (MSB first) - if (msize > 1 && ap.want_predec) - { - add = msize; - new_code = PRE_DEC; - sub = msize; - break; - } - return false; - - case PRE_DEC: - // volatile loads prefer POST_INC (LSB first) - if (msize > 1 && ap.want_postinc) - { - add = -msize; - new_code = POST_INC; - sub = -msize; - break; - } - return false; - - case REG: - if (msize == 1) - return false; - - if (ap.want_predec) - { - add = msize; - new_code = PRE_DEC; - sub = 0; - } - else - { - add = 0; - new_code = POST_INC; - sub = -msize; - } - break; - } // switch addr_code - - rtx_insn *insn; - - if (add) - { - insn = emit_move_ccc (base, plus_constant (Pmode, base, add)); - avr_maybe_adjust_cfa (insn, base, add); - } - - rtx new_addr = new_code == REG - ? base - : gen_rtx_fmt_e (new_code, Pmode, base); - - rtx new_mem = change_address (mem, mode, new_addr); - if (mem_volatile_p) - MEM_VOLATILE_P (new_mem) = 1; - - insn = emit_move_ccc (store_p ? new_mem : reg_or_0, - store_p ? reg_or_0 : new_mem); - if (auto_inc_p (new_addr)) - { - add_reg_note (insn, REG_INC, base); - int off = new_code == POST_INC ? msize : -msize; - avr_maybe_adjust_cfa (insn, base, off); - } - - if (sub) - { - insn = emit_move_ccc (base, plus_constant (Pmode, base, sub)); - avr_maybe_adjust_cfa (insn, base, sub); - } - - return true; -} - - /* Implement `TARGET_FRAME_POINTER_REQUIRED'. */ /* Return 1 if frame pointer for current function required. */ @@ -6982,7 +5819,7 @@ avr_frame_pointer_required_p (void) For now, just look at the next insn, which misses some opportunities like following jumps. */ -static RTX_CODE +static rtx_code compare_condition (rtx_insn *insn) { rtx set; @@ -7009,7 +5846,7 @@ compare_condition (rtx_insn *insn) static bool compare_sign_p (rtx_insn *insn) { - RTX_CODE cond = compare_condition (insn); + rtx_code cond = compare_condition (insn); return (cond == GE || cond == LT); } @@ -7019,7 +5856,7 @@ compare_sign_p (rtx_insn *insn) static bool compare_eq_p (rtx_insn *insn) { - RTX_CODE cond = compare_condition (insn); + rtx_code cond = compare_condition (insn); return (cond == EQ || cond == NE); } @@ -7032,7 +5869,7 @@ compare_eq_p (rtx_insn *insn) static void avr_canonicalize_comparison (int *icode, rtx *op0, rtx *op1, bool op0_fixed) { - enum rtx_code code = (enum rtx_code) *icode; + rtx_code code = (rtx_code) *icode; machine_mode mode = GET_MODE (*op0); bool signed_p = code == GT || code == LE; @@ -7105,18 +5942,116 @@ avr_canonicalize_comparison (int *icode, rtx *op0, rtx *op1, bool op0_fixed) } } -/* Implement TARGET_C_MODE_FOR_FLOATING_TYPE. 
Return SFmode or DFmode - for TI_{LONG_,}DOUBLE_TYPE which is for {long,} double type, go with - the default one for the others. */ -static machine_mode -avr_c_mode_for_floating_type (enum tree_index ti) +/* Try to turn a GEU or LTU comparison of register XOP[1] into an + NE / EQ comparison of the higher bytes of XOP[1] against 0. + XOP[1] has scalar int or scalar fixed-point mode of 2, 3 or 4 bytes. + XOP[2] is a compile-time constant, and XOP[0] = XOP[1] <comp> XOP[2] + is the comparison operator. XOP[3] is the branch label, and XOP[4] + is a QImode scratch operand. + When XOP[1] (viewed as a CONST_INT) is an integral power of 256, + then a GTU or LTU comparison can be turned into a NE or EQ comparison + of the high bytes against zero. For example, the C code + + if (x >= 1) + ccc = 0; + + where x is an unsigned _Accum may be compiled as: + + or r24,r25 ; *cmpsi_lsr + breq .L1 ; branch + sts ccc,__zero_reg__ ; movqi_insn + .L1: + + In the case of success, the operands will be such that they comprise + a *cmp<mode>_lsr insn, where mode is HI, PSI or SI, and XOP[0] will be + a NE or EQ branch condition. Otherwise, XOP[] is unchanged. */ + +void +avr_maybe_cmp_lsr (rtx *xop) { - if (ti == TI_DOUBLE_TYPE) - return avr_double == 32 ? SFmode : DFmode; - if (ti == TI_LONG_DOUBLE_TYPE) - return avr_long_double == 32 ? SFmode : DFmode; - return default_mode_for_floating_type (ti); + rtx_code comp = GET_CODE (xop[0]); + + if ((comp == GEU || comp == LTU) + && (CONST_INT_P (xop[2]) || CONST_FIXED_P (xop[2]))) + { + rtx xreg = avr_to_int_mode (xop[1]); + rtx xval = avr_to_int_mode (xop[2]); + machine_mode imode = GET_MODE (xreg); + auto uval = UINTVAL (xval) & GET_MODE_MASK (imode); + int shift = exact_log2 (uval); + + if (shift == 8 || shift == 16 || shift == 24) + { + // Operands such that the compare becomes *cmp<mode>_lsr. + xop[1] = gen_rtx_LSHIFTRT (imode, xreg, GEN_INT (shift)); + xop[2] = const0_rtx; + xop[4] = gen_rtx_SCRATCH (QImode); + // Branch condition. + xop[0] = gen_rtx_fmt_ee (comp == GEU ? NE : EQ, + VOIDmode, xop[1], xop[2]); + } + } +} + + +/* Output an EQ / NE compare of HI, PSI or SI register XOP[0] against 0, + where only the bits starting at XOP[1] are relevant. XOP[1] is a + const_int that is 8, 16 or 24. Return "". + PLEN == 0: Output instructions. + PLEN != 0: Set *PLEN to the length of the sequence in words. */ + +const char * +avr_out_cmp_lsr (rtx_insn *insn, rtx *xop, int *plen) +{ + rtx xreg = xop[0]; + const int n_bytes = GET_MODE_SIZE (GET_MODE (xreg)); + const int shift = INTVAL (xop[1]); + const rtx_code cond = compare_condition (insn); + + gcc_assert (shift == 8 || shift == 16 || shift == 24); + gcc_assert (shift < 8 * n_bytes); + gcc_assert (cond == UNKNOWN || cond == NE || cond == EQ); + + const bool used_p = ! reg_unused_after (insn, xreg); + + if (plen) + *plen = 0; + + if (shift / 8 == n_bytes - 1) + { + rtx xmsb = avr_byte (xreg, n_bytes - 1); + avr_asm_len ("tst %0", &xmsb, plen, 1); + } + else if (n_bytes == 4 + && shift <= 16 + && AVR_HAVE_ADIW + && REGNO (xreg) >= REG_22 + // The sequence also works when xreg is unused after, + // but SBIW is slower than OR. 
+ && used_p) + { + avr_asm_len ("sbiw %C0,0", &xreg, plen, 1); + if (shift == 8) + avr_asm_len ("cpc %B0,__zero_reg__", &xreg, plen, 1); + } + else + { + rtx op[2] = { avr_byte (xreg, shift / 8), tmp_reg_rtx }; + if (used_p) + { + avr_asm_len ("mov %1,%0", op, plen, 1); + op[0] = tmp_reg_rtx; + } + + for (int i = 1 + shift / 8; i < n_bytes; ++i) + { + op[1] = avr_byte (xreg, i); + avr_asm_len ("or %0,%1", op, plen, 1); + } + } + + return ""; } @@ -7153,9 +6088,6 @@ avr_out_compare (rtx_insn *insn, rtx *xop, int *plen) xval = avr_to_int_mode (xop[1]); } - /* MODE of the comparison. */ - machine_mode mode = GET_MODE (xreg); - gcc_assert (REG_P (xreg)); gcc_assert ((CONST_INT_P (xval) && n_bytes <= 4) || (const_double_operand (xval, VOIDmode) && n_bytes == 8)); @@ -7163,13 +6095,16 @@ avr_out_compare (rtx_insn *insn, rtx *xop, int *plen) if (plen) *plen = 0; + const rtx_code cond = compare_condition (insn); + const bool eqne_p = cond == EQ || cond == NE; + /* Comparisons == +/-1 and != +/-1 can be done similar to camparing against 0 by ORing the bytes. This is one instruction shorter. Notice that 64-bit comparisons are always against reg:ALL8 18 (ACC_A) and therefore don't use this. */ - if (!test_hard_reg_class (LD_REGS, xreg) - && compare_eq_p (insn) + if (eqne_p + && ! test_hard_reg_class (LD_REGS, xreg) && reg_unused_after (insn, xreg)) { if (xval == const1_rtx) @@ -7198,69 +6133,65 @@ avr_out_compare (rtx_insn *insn, rtx *xop, int *plen) } } - /* Comparisons == -1 and != -1 of a d-register that's used after the - comparison. (If it's unused after we use CPI / SBCI or ADIW sequence - from below.) Instead of CPI Rlo,-1 / LDI Rx,-1 / CPC Rhi,Rx we can - use CPI Rlo,-1 / CPC Rhi,Rlo which is 1 instruction shorter: - If CPI is true then Rlo contains -1 and we can use Rlo instead of Rx - when CPC'ing the high part. If CPI is false then CPC cannot render - the result to true. This also works for the more generic case where - the constant is of the form 0xabab. */ + /* Comparisons == and != may change the order in which the sub-bytes are + being compared. Start with the high 16 bits so we can use SBIW. */ - if (n_bytes == 2 - && xval != const0_rtx - && test_hard_reg_class (LD_REGS, xreg) - && compare_eq_p (insn) - && !reg_unused_after (insn, xreg)) + if (n_bytes == 4 + && eqne_p + && AVR_HAVE_ADIW + && REGNO (xreg) >= REG_22 + && (xval == const0_rtx + || (IN_RANGE (avr_int16 (xval, 2), 0, 63) + && reg_unused_after (insn, xreg)))) { - rtx xlo8 = simplify_gen_subreg (QImode, xval, mode, 0); - rtx xhi8 = simplify_gen_subreg (QImode, xval, mode, 1); + xop[2] = avr_word (xval, 2); + return avr_asm_len ("sbiw %C0,%2" CR_TAB + "sbci %B0,hi8(%1)" CR_TAB + "sbci %A0,lo8(%1)", xop, plen, 3); + } - if (INTVAL (xlo8) == INTVAL (xhi8)) - { - xop[0] = xreg; - xop[1] = xlo8; + bool changed[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; - return avr_asm_len ("cpi %A0,%1" CR_TAB - "cpc %B0,%A0", xop, plen, 2); - } - } + /* The >= and < comparisons may skip the lower bytes when the according bytes + of the constant are all zeros. In that case, the comparison may start + at a byte other than the LSB. */ - for (int i = 0; i < n_bytes; i++) + const int start = ((cond == GEU || cond == LTU || cond == GE || cond == LT) + && INTVAL (xval) != 0) + ? ctz_hwi (INTVAL (xval)) / 8 + : 0; + + for (int i = start; i < n_bytes; i++) { /* We compare byte-wise. 
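The start computation above means that for the >= / < cases a constant whose low bytes are all zero lets the byte-wise compare begin above the LSB, since those register bytes cannot affect the outcome. An illustrative C source pattern, assuming the usual 16-bit int of avr-gcc (the function name is made up):

/* Illustrative only: the low byte of 0x0200 is zero, so the unsigned test
   x >= 0x0200 is decided by the high byte alone, and the byte-wise compare
   may start at byte 1 instead of the LSB.  */
unsigned char
at_least_512 (unsigned int x)   /* unsigned int is 16 bits on avr-gcc */
{
  return x >= 0x0200u;
}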
*/ - rtx reg8 = simplify_gen_subreg (QImode, xreg, mode, i); - rtx xval8 = simplify_gen_subreg (QImode, xval, mode, i); + xop[0] = avr_byte (xreg, i); + xop[1] = avr_byte (xval, i); /* 8-bit value to compare with this byte. */ - unsigned int val8 = UINTVAL (xval8) & GET_MODE_MASK (QImode); - - /* Registers R16..R31 can operate with immediate. */ - bool ld_reg_p = test_hard_reg_class (LD_REGS, reg8); - - xop[0] = reg8; - xop[1] = gen_int_mode (val8, QImode); + unsigned int val8 = avr_uint8 (xval, i); /* Word registers >= R24 can use SBIW/ADIW with 0..63. */ - if (i == 0 - && avr_adiw_reg_p (reg8)) + if (i == start + && i % 2 == 0 + && n_bytes - start >= 2 + && avr_adiw_reg_p (xop[0])) { - int val16 = trunc_int_for_mode (INTVAL (xval), HImode); + int val16 = avr_int16 (xval, i); if (IN_RANGE (val16, 0, 63) && (val8 == 0 || reg_unused_after (insn, xreg))) { avr_asm_len ("sbiw %0,%1", xop, plen, 1); - + changed[i] = changed[i + 1] = val8 != 0; i++; continue; } - if (n_bytes == 2 - && IN_RANGE (val16, -63, -1) - && compare_eq_p (insn) + if (IN_RANGE (val16, -63, -1) + && eqne_p + && n_bytes - start == 2 && reg_unused_after (insn, xreg)) { return avr_asm_len ("adiw %0,%n1", xop, plen, 1); @@ -7271,7 +6202,7 @@ avr_out_compare (rtx_insn *insn, rtx *xop, int *plen) if (val8 == 0) { - avr_asm_len (i == 0 + avr_asm_len (i == start ? "cp %0,__zero_reg__" : "cpc %0,__zero_reg__", xop, plen, 1); continue; @@ -7282,9 +6213,9 @@ avr_out_compare (rtx_insn *insn, rtx *xop, int *plen) instruction; the only difference is that comparisons don't write the result back to the target register. */ - if (ld_reg_p) + if (test_hard_reg_class (LD_REGS, xop[0])) { - if (i == 0) + if (i == start) { avr_asm_len ("cpi %0,%1", xop, plen, 1); continue; @@ -7292,10 +6223,37 @@ avr_out_compare (rtx_insn *insn, rtx *xop, int *plen) else if (reg_unused_after (insn, xreg)) { avr_asm_len ("sbci %0,%1", xop, plen, 1); + changed[i] = true; continue; } } + /* When byte comparisons for an EQ or NE comparison look like + compare (x[i], C) + compare (x[j], C) + then we can instead use + compare (x[i], C) + compare (x[j], x[i]) + which is shorter, and the outcome of the comparison is the same. */ + + if (eqne_p) + { + bool found = false; + + for (int j = start; j < i && ! found; ++j) + if (val8 == avr_uint8 (xval, j) + // Make sure that we didn't clobber x[j] above. + && ! changed[j]) + { + rtx op[] = { xop[0], avr_byte (xreg, j) }; + avr_asm_len ("cpc %0,%1", op, plen, 1); + found = true; + } + + if (found) + continue; + } + /* Must load the value into the scratch register. */ gcc_assert (REG_P (xop[2])); @@ -7304,7 +6262,7 @@ avr_out_compare (rtx_insn *insn, rtx *xop, int *plen) avr_asm_len ("ldi %2,%1", xop, plen, 1); clobber_val = (int) val8; - avr_asm_len (i == 0 + avr_asm_len (i == start ? "cp %0,%2" : "cpc %0,%2", xop, plen, 1); } @@ -7405,7 +6363,7 @@ avr_out_tstsi (rtx_insn *insn, rtx *op, int *plen) PLEN == 0: Print instructions. */ const char * -avr_out_cmp_ext (rtx xop[], enum rtx_code code, int *plen) +avr_out_cmp_ext (rtx xop[], rtx_code code, int *plen) { // The smaller reg is the one that's to be extended. Get its index as z. int z = GET_MODE_SIZE (GET_MODE (xop[1])) < GET_MODE_SIZE (GET_MODE (xop[0])); @@ -7425,7 +6383,7 @@ avr_out_cmp_ext (rtx xop[], enum rtx_code code, int *plen) { // Sign-extend the high-byte of zreg to tmp_reg. 
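The byte-reuse rule introduced above generalizes the removed 0xabab special case: whenever a constant byte repeats and the earlier register byte is still intact, the later byte is CPC'd against that register byte instead of materializing the immediate a second time. An illustrative source pattern (16-bit int assumed, name made up):

/* Illustrative only: both constant bytes are 0x2a, so after comparing the
   low byte against 42 the high byte can be compared against the low
   register byte (CPC) rather than needing the immediate again.  */
unsigned char
is_magic (unsigned int x)       /* 16-bit unsigned int assumed */
{
  return x == 0x2a2au;
}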
int zmsb = GET_MODE_SIZE (zmode) - 1; - rtx xzmsb = simplify_gen_subreg (QImode, zreg, zmode, zmsb); + rtx xzmsb = avr_byte (zreg, zmsb); avr_asm_len ("mov __tmp_reg__,%0" CR_TAB "rol __tmp_reg__" CR_TAB @@ -7448,10 +6406,8 @@ avr_out_cmp_ext (rtx xop[], enum rtx_code code, int *plen) for (int b = 1; b < n_bytes; ++b) { rtx regs[2]; - regs[1 - z] = simplify_gen_subreg (QImode, reg, mode, b); - regs[z] = (b < GET_MODE_SIZE (zmode) - ? simplify_gen_subreg (QImode, zreg, zmode, b) - : zex); + regs[1 - z] = avr_byte (reg, b); + regs[z] = b < GET_MODE_SIZE (zmode) ? avr_byte (zreg, b) : zex; avr_asm_len ("cpc %0,%1", regs, plen, 1); } @@ -8994,6 +7950,34 @@ lshrsi3_out (rtx_insn *insn, rtx operands[], int *len) } +/* When INSN is a PARALLEL with two SETs, a SET of REG_CC and a SET of a + GPR, then return the second SET and set *CCMODE to the first SET's mode. + Otherwise, return single_set and set *CCMODE to VOIDmode. */ + +static rtx +avr_cc_set (rtx_insn *insn, machine_mode *ccmode) +{ + // single_set() not only depends on the anatomy of an insn but also + // on REG_UNUSED notes, thus we have to analyze by hand so that the + // result only depends on the pattern. + + rtx pat = PATTERN (insn); + + if (GET_CODE (pat) == PARALLEL + && XVECLEN (pat, 0) == 2 + && GET_CODE (XVECEXP (pat, 0, 0)) == SET + && GET_CODE (XVECEXP (pat, 0, 1)) == SET) + { + rtx ccset = XVECEXP (pat, 0, 0); + *ccmode = GET_MODE (SET_DEST (ccset)); + return XVECEXP (pat, 0, 1); + } + + *ccmode = VOIDmode; + return single_set (insn); +} + + /* Output addition of registers YOP[0] and YOP[1] YOP[0] += extend (YOP[1]) @@ -9002,8 +7986,11 @@ lshrsi3_out (rtx_insn *insn, rtx operands[], int *len) YOP[0] -= extend (YOP[2]) - where the integer modes satisfy SI >= YOP[0].mode > YOP[1/2].mode >= QI, - and the extension may be sign- or zero-extend. Returns "". + where the integer modes satisfy SI >= YOP[0].mode >= YOP[1/2].mode >= QI, + and the extension may be sign-extend, zero-extend or reg (no extend). + INSN is either a single_set or a true parallel insn. In the latter case, + INSN has two SETs: A SET of REG_CC and a SET like in the single_set case. + Returns "". If PLEN == NULL output the instructions. If PLEN != NULL set *PLEN to the length of the sequence in words. */ @@ -9012,19 +7999,24 @@ const char * avr_out_plus_ext (rtx_insn *insn, rtx *yop, int *plen) { rtx regs[2]; + machine_mode ccmode; + + /* Ouch! Whether or not an insn is a single_set does not only depend + on the anatomy of the pattern, but also on REG_UNUSED notes. + Hence we have to dig by hand... */ - const rtx src = SET_SRC (single_set (insn)); - const RTX_CODE add = GET_CODE (src); + const rtx src = SET_SRC (avr_cc_set (insn, &ccmode)); + const rtx_code add = GET_CODE (src); gcc_assert (GET_CODE (src) == PLUS || GET_CODE (src) == MINUS); // Use XOP[] in the remainder with XOP[0] = YOP[0] and XOP[1] = YOP[1/2]. rtx xop[2] = { yop[0], yop[add == PLUS ? 1 : 2] }; const rtx xreg = XEXP (src, add == PLUS ? 1 : 0); const rtx xext = XEXP (src, add == PLUS ? 
0 : 1); - const RTX_CODE ext = GET_CODE (xext); + const rtx_code ext = GET_CODE (xext); gcc_assert (REG_P (xreg) - && (ext == ZERO_EXTEND || ext == SIGN_EXTEND)); + && (ext == ZERO_EXTEND || ext == SIGN_EXTEND || ext == REG)); const int n_bytes0 = GET_MODE_SIZE (GET_MODE (xop[0])); const int n_bytes1 = GET_MODE_SIZE (GET_MODE (xop[1])); @@ -9044,7 +8036,9 @@ avr_out_plus_ext (rtx_insn *insn, rtx *yop, int *plen) if (ext == SIGN_EXTEND && (n_bytes0 > 1 + n_bytes1 - || reg_overlap_mentioned_p (msb1, xop[0]))) + || reg_overlap_mentioned_p (msb1, xop[0]) + // The insn also wants to set SREG.N and SREG.Z. + || ccmode == CCZNmode)) { // Sign-extending more than one byte: Set tmp_reg to 0 or -1 // depending on $1.msb. Same for the pathological case where @@ -9111,8 +8105,8 @@ avr_out_plus_ext (rtx_insn *insn, rtx *yop, int *plen) fixed-point rounding, cf. `avr_out_round'. */ static void -avr_out_plus_1 (rtx insn, rtx *xop, int *plen, enum rtx_code code, - enum rtx_code code_sat, int sign, bool out_label) +avr_out_plus_1 (rtx insn, rtx *xop, int *plen, rtx_code code, + rtx_code code_sat, int sign, bool out_label) { /* MODE of the operation. */ machine_mode mode = GET_MODE (xop[0]); @@ -9150,8 +8144,8 @@ avr_out_plus_1 (rtx insn, rtx *xop, int *plen, enum rtx_code code, for (int i = 0; i < n_bytes; i++) { /* We operate byte-wise on the destination. */ - op[0] = simplify_gen_subreg (QImode, xop[0], mode, i); - op[1] = simplify_gen_subreg (QImode, xop[2], mode, i); + op[0] = avr_byte (xop[0], i); + op[1] = avr_byte (xop[2], i); if (i == 0) avr_asm_len (code == PLUS ? "add %0,%1" : "sub %0,%1", @@ -9187,8 +8181,7 @@ avr_out_plus_1 (rtx insn, rtx *xop, int *plen, enum rtx_code code, if (SS_PLUS == code_sat && MINUS == code && sign < 0 - && 0x80 == (INTVAL (simplify_gen_subreg (QImode, xval, imode, n_bytes-1)) - & GET_MODE_MASK (QImode))) + && 0x80 == avr_uint8 (xval, n_bytes - 1)) { /* We compute x + 0x80 by means of SUB instructions. We negated the constant subtrahend above and are left with x - (-128) so that we @@ -9197,7 +8190,7 @@ avr_out_plus_1 (rtx insn, rtx *xop, int *plen, enum rtx_code code, where this must be done is when NEG overflowed in case [2s] because the V computation needs the right sign of the subtrahend. */ - rtx msb = simplify_gen_subreg (QImode, xop[0], mode, n_bytes - 1); + rtx msb = avr_byte (xop[0], n_bytes - 1); avr_asm_len ("subi %0,128" CR_TAB "brmi 0f", &msb, plen, 2); @@ -9209,8 +8202,8 @@ avr_out_plus_1 (rtx insn, rtx *xop, int *plen, enum rtx_code code, for (int i = 0; i < n_bytes; i++) { /* We operate byte-wise on the destination. */ - rtx reg8 = simplify_gen_subreg (QImode, xop[0], mode, i); - rtx xval8 = simplify_gen_subreg (QImode, xval, imode, i); + rtx reg8 = avr_byte (xop[0], i); + rtx xval8 = avr_byte (xval, i); /* 8-bit value to operate with this byte. */ unsigned int val8 = UINTVAL (xval8) & GET_MODE_MASK (QImode); @@ -9228,8 +8221,7 @@ avr_out_plus_1 (rtx insn, rtx *xop, int *plen, enum rtx_code code, && i + 2 <= n_bytes && avr_adiw_reg_p (reg8)) { - rtx xval16 = simplify_gen_subreg (HImode, xval, imode, i); - unsigned int val16 = UINTVAL (xval16) & GET_MODE_MASK (HImode); + unsigned int val16 = avr_uint16 (xval, i); /* Registers R24, X, Y, Z can use ADIW/SBIW with constants < 64 i.e. operate word-wise. */ @@ -9384,10 +8376,8 @@ avr_out_plus_1 (rtx insn, rtx *xop, int *plen, enum rtx_code code, The cases a - b actually perform a - (-(-b)) if B is CONST. */ - op[0] = simplify_gen_subreg (QImode, xop[0], mode, n_bytes-1); - op[1] = n_bytes > 1 - ? 
simplify_gen_subreg (QImode, xop[0], mode, n_bytes-2) - : NULL_RTX; + op[0] = avr_byte (xop[0], n_bytes - 1); + op[1] = n_bytes > 1 ? avr_byte (xop[0], n_bytes - 2) : NULL_RTX; bool need_copy = true; int len_call = 1 + AVR_HAVE_JMP_CALL; @@ -9420,7 +8410,7 @@ avr_out_plus_1 (rtx insn, rtx *xop, int *plen, enum rtx_code code, { /* [1s,reg] */ - op[2] = simplify_gen_subreg (QImode, xop[2], mode, n_bytes-1); + op[2] = avr_byte (xop[2], n_bytes - 1); if (n_bytes == 1) avr_asm_len ("ldi %0,0x80" CR_TAB @@ -9436,7 +8426,7 @@ avr_out_plus_1 (rtx insn, rtx *xop, int *plen, enum rtx_code code, { /* [3s,reg] */ - op[2] = simplify_gen_subreg (QImode, xop[2], mode, n_bytes-1); + op[2] = avr_byte (xop[2], n_bytes - 1); if (n_bytes == 1) avr_asm_len ("ldi %0,0x7f" CR_TAB @@ -9577,7 +8567,7 @@ avr_out_plus_1 (rtx insn, rtx *xop, int *plen, enum rtx_code code, are additions/subtraction for pointer modes, i.e. HImode and PSImode. */ static const char * -avr_out_plus_symbol (rtx *xop, enum rtx_code code, int *plen) +avr_out_plus_symbol (rtx *xop, rtx_code code, int *plen) { machine_mode mode = GET_MODE (xop[0]); @@ -9633,8 +8623,8 @@ avr_out_plus (rtx insn, rtx *xop, int *plen, bool out_label) machine_mode mode = GET_MODE (xdest); scalar_int_mode imode = int_mode_for_mode (mode).require (); int n_bytes = GET_MODE_SIZE (mode); - enum rtx_code code_sat = GET_CODE (SET_SRC (xpattern)); - enum rtx_code code + rtx_code code_sat = GET_CODE (SET_SRC (xpattern)); + rtx_code code = (PLUS == code_sat || SS_PLUS == code_sat || US_PLUS == code_sat ? PLUS : MINUS); @@ -9677,8 +8667,7 @@ avr_out_plus (rtx insn, rtx *xop, int *plen, bool out_label) /* Saturation will need the sign of the original operand. */ - rtx xmsb = simplify_gen_subreg (QImode, op[2], imode, n_bytes-1); - int sign = INTVAL (xmsb) < 0 ? -1 : 1; + int sign = avr_int8 (op[2], n_bytes - 1) < 0 ? -1 : 1; /* If we subtract and the subtrahend is a constant, then negate it so that avr_out_plus_1 can be used. */ @@ -9702,10 +8691,51 @@ avr_out_plus (rtx insn, rtx *xop, int *plen, bool out_label) } +/* Output an addition with a compile-time constant that sets SREG.N: + + XOP[0] += XOP[1] + + where XOP[0] is a HI, PSI or SI register, and XOP[1] is a register or a + compile-time constant. XOP[2] is SCRATCH or a QI clobber reg. Return "". + + If PLEN == NULL output the instructions. + If PLEN != NULL set *PLEN to the length of the sequence in words. */ + +const char * +avr_out_plus_set_N (rtx *xop, int *plen) +{ + gcc_assert (xop[1] != const0_rtx); + + // The output function for vanilla additions, avr_out_plus_1, can be + // used because it always issues an operation on the MSB (except when + // the addend is zero). + + rtx op[] = { xop[0], xop[0], xop[1], xop[2] }; + + if (REG_P (xop[1])) + { + avr_out_plus_1 (NULL_RTX, op, plen, PLUS, UNKNOWN, 0, false); + } + else + { + int len_plus, len_minus; + + avr_out_plus_1 (NULL_RTX, op, &len_plus, PLUS, UNKNOWN, 0, false); + avr_out_plus_1 (NULL_RTX, op, &len_minus, MINUS, UNKNOWN, 0, false); + + avr_out_plus_1 (NULL_RTX, op, plen, len_minus < len_plus ? MINUS : PLUS, + UNKNOWN, 0, false); + } + + return ""; +} + + /* Output an instruction sequence for addition of REG in XOP[0] and CONST_INT in XOP[1] in such a way that SREG.Z and SREG.N are set according to the - result. XOP[2] might be a d-regs clobber register. If XOP[2] is SCRATCH, - then the addition can be performed without a clobber reg. Return "". + result. The mode is HI, PSI or SI. XOP[2] might be a d-regs clobber + register. 
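Apart from the small ADIW window handled first, the body of avr_out_plus_set_ZN below adds its constant by negating it and emitting SUBI/SBCI, since the AVR ISA has an immediate subtract but no general immediate add. A tiny C illustration of why that is equivalent (hypothetical function, not part of the patch):

#include <stdint.h>

/* Illustrative only: adding 5 to a byte equals subtracting its two's
   complement 251, which is the transformation the SUBI/SBCI-based
   sequences rely on.  */
uint8_t
add5 (uint8_t x)
{
  return (uint8_t) (x - (uint8_t) -5);   /* same as x + 5 modulo 256 */
}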
If XOP[2] is SCRATCH, then the addition can be performed + without a clobber reg. Return "". If PLEN == NULL, then output the instructions. If PLEN != NULL, then set *PLEN to the length of the sequence in words. */ @@ -9725,21 +8755,12 @@ avr_out_plus_set_ZN (rtx *xop, int *plen) // Number of bytes to operate on. int n_bytes = GET_MODE_SIZE (mode); - if (n_bytes == 1) - { - if (INTVAL (xval) == 1) - return avr_asm_len ("inc %0", xop, plen, 1); - - if (INTVAL (xval) == -1) - return avr_asm_len ("dec %0", xop, plen, 1); - } - if (n_bytes == 2 && avr_adiw_reg_p (xreg) && IN_RANGE (INTVAL (xval), 1, 63)) { // Add 16-bit value in [1..63] to a w register. - return avr_asm_len ("adiw %0, %1", xop, plen, 1); + return avr_asm_len ("adiw %0,%1", xop, plen, 1); } // Addition won't work; subtract the negative of XVAL instead. @@ -9758,17 +8779,17 @@ avr_out_plus_set_ZN (rtx *xop, int *plen) // SBIW'ed in one go. for (int i = 0; i < n_bytes; ++i) { - op[0] = simplify_gen_subreg (QImode, xreg, mode, i); + op[0] = avr_byte (xreg, i); if (i == 0 && n_bytes >= 2 && avr_adiw_reg_p (op[0])) { - op[1] = simplify_gen_subreg (HImode, xval, mode, 0); + op[1] = avr_word (xval, 0); if (IN_RANGE (INTVAL (op[1]), 0, 63)) { // SBIW can handle the lower 16 bits. - avr_asm_len ("sbiw %0, %1", op, plen, 1); + avr_asm_len ("sbiw %0,%1", op, plen, 1); // Next byte has already been handled: Skip it. ++i; @@ -9776,14 +8797,14 @@ avr_out_plus_set_ZN (rtx *xop, int *plen) } } - op[1] = simplify_gen_subreg (QImode, xval, mode, i); + op[1] = avr_byte (xval, i); if (test_hard_reg_class (LD_REGS, op[0])) { // d-regs can subtract immediates. avr_asm_len (i == 0 - ? "subi %0, %1" - : "sbci %0, %1", op, plen, 1); + ? "subi %0,%1" + : "sbci %0,%1", op, plen, 1); } else { @@ -9792,8 +8813,8 @@ avr_out_plus_set_ZN (rtx *xop, int *plen) { // Any register can subtract 0. avr_asm_len (i == 0 - ? "sub %0, __zero_reg__" - : "sbc %0, __zero_reg__", op, plen, 1); + ? "sub %0,__zero_reg__" + : "sbc %0,__zero_reg__", op, plen, 1); } else { @@ -9803,13 +8824,13 @@ avr_out_plus_set_ZN (rtx *xop, int *plen) { // Load partial xval to QI clobber reg and memoize for later. gcc_assert (REG_P (op[2])); - avr_asm_len ("ldi %2, %1", op, plen, 1); + avr_asm_len ("ldi %2,%1", op, plen, 1); clobber_val = val8; } avr_asm_len (i == 0 - ? "sub %0, %2" - : "sbc %0, %2", op, plen, 1); + ? "sub %0,%2" + : "sbc %0,%2", op, plen, 1); } } } // Loop bytes. @@ -9818,6 +8839,136 @@ avr_out_plus_set_ZN (rtx *xop, int *plen) } +/* A helper worker for `op8_ZN_operator'. Allow + + OP0 <code> OP1 + + QImode operations that set SREG.N and SREG.Z in a usable way. + these are: + + * OP0 is a QImode register, and + * OP1 is a QImode register or CONST_INT, and + + the allowed operations is one of: + + * SHIFTs with a const_int offset in { 1, 2, 3 }. + * MINUS and XOR with a register operand + * IOR and AND with a register operand, or d-reg + const_int + * PLUS with a register operand, or d-reg + const_int, + or a const_int in { -2, -1, 1, 2 }. */ + +bool +avr_op8_ZN_operator (rtx op) +{ + const rtx_code code = GET_CODE (op); + rtx op0 = XEXP (op, 0); + rtx op1 = XEXP (op, 1); + + if (! register_operand (op0, QImode) + || ! 
(register_operand (op1, QImode) + || const_int_operand (op1, QImode))) + return false; + + const bool reg1_p = REG_P (op1); + const bool ld_reg0_p = test_hard_reg_class (LD_REGS, op0); + + switch (code) + { + default: + break; + + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + return const_1_to_3_operand (op1, QImode); + + case MINUS: + case XOR: + return reg1_p; + + case IOR: + case AND: + return reg1_p || ld_reg0_p; + + case PLUS: + return reg1_p || ld_reg0_p || abs1_abs2_operand (op1, QImode); + } + + return false; +} + + +/* Output a QImode instruction sequence for + + XOP[0] = XOP[0] <CODE> XOP[2] + + where XOP[0] is a register, and the possible operands and CODEs + are according to `avr_op8_ZN_operator' from above. Return "". + + If PLEN == NULL, then output the instructions. + If PLEN != NULL, then set *PLEN to the length of the sequence in words. */ + +const char * +avr_out_op8_set_ZN (rtx_code code, rtx *xop, int *plen) +{ + const bool reg2_p = REG_P (xop[2]); + const int ival = CONST_INT_P (xop[2]) ? (int) INTVAL (xop[2]) : 0; + + gcc_assert (op8_ZN_operator (gen_rtx_fmt_ee (code, QImode, xop[0], xop[2]), + QImode)); + if (plen) + *plen = 0; + + const char *tpl = nullptr; + int times = 1; + + if (code == ASHIFT) + tpl = "lsl %0", times = ival; + else if (code == LSHIFTRT) + tpl = "lsr %0", times = ival; + else if (code == ASHIFTRT) + tpl = "asr %0", times = ival; + else if (code == MINUS) + tpl = "sub %0,%2"; + else if (code == XOR) + tpl = "eor %0,%2"; + else if (code == AND) + tpl = reg2_p ? "and %0,%2" : "andi %0,lo8(%2)"; + else if (code == IOR) + tpl = reg2_p ? "or %0,%2" : "ori %0,lo8(%2)"; + else if (code == PLUS) + { + if (ival + && ! test_hard_reg_class (LD_REGS, xop[0])) + { + tpl = ival > 0 ? "inc %0" : "dec %0"; + times = std::abs (ival); + } + else + tpl = reg2_p ? "add %0,%2" : "subi %0,lo8(%n2)"; + } + else + gcc_unreachable(); + + for (int i = 0; i < times; ++i) + avr_asm_len (tpl, xop, plen, 1); + + return ""; +} + + +/* Used in the "length" attribute of insn "*op8.for.cczn.<code>". */ + +int +avr_len_op8_set_ZN (rtx_code code, rtx *xop) +{ + int len; + (void) avr_out_op8_set_ZN (code, xop, &len); + + return len; +} + + /* Output bit operation (IOR, AND, XOR) with register XOP[0] and compile time constant XOP[2]: @@ -9834,7 +8985,7 @@ avr_out_bitop (rtx insn, rtx *xop, int *plen) { /* CODE and MODE of the operation. */ rtx xpattern = INSN_P (insn) ? single_set (as_a <rtx_insn *> (insn)) : insn; - enum rtx_code code = GET_CODE (SET_SRC (xpattern)); + rtx_code code = GET_CODE (SET_SRC (xpattern)); machine_mode mode = GET_MODE (xop[0]); /* Number of bytes to operate on. */ @@ -9861,11 +9012,10 @@ avr_out_bitop (rtx insn, rtx *xop, int *plen) for (int i = 0; i < n_bytes; i++) { /* We operate byte-wise on the destination. */ - rtx reg8 = simplify_gen_subreg (QImode, xop[0], mode, i); - rtx xval8 = simplify_gen_subreg (QImode, xop[2], mode, i); + rtx reg8 = avr_byte (xop[0], i); /* 8-bit value to operate with this byte. */ - unsigned int val8 = UINTVAL (xval8) & GET_MODE_MASK (QImode); + unsigned int val8 = avr_uint8 (xop[2], i); /* Number of bits set in the current byte of the constant. */ int pop8 = popcount_hwi (val8); @@ -9984,12 +9134,12 @@ void avr_emit_xior_with_shift (rtx_insn *insn, rtx *xop, int bitoff) { rtx src = SET_SRC (single_set (insn)); - RTX_CODE xior = GET_CODE (src); + rtx_code xior = GET_CODE (src); gcc_assert (xior == XOR || xior == IOR); gcc_assert (bitoff % 8 == 0); // Work out the shift offset in bytes; negative for shift right. 
- RTX_CODE shift = GET_CODE (XEXP (src, 0)); + rtx_code shift = GET_CODE (XEXP (src, 0)); int byteoff = 0?0 : shift == ASHIFT ? bitoff / 8 : shift == LSHIFTRT ? -bitoff / 8 @@ -10214,7 +9364,7 @@ avr_out_insv (rtx_insn *insn, rtx xop[], int *plen) } // Any of ASHIFT, LSHIFTRT, ASHIFTRT. - enum rtx_code code = GET_CODE (XEXP (xsrc, 0)); + rtx_code code = GET_CODE (XEXP (xsrc, 0)); int shift = code == ASHIFT ? INTVAL (xop2) : -INTVAL (xop2); // Determines the position of the output bit. @@ -10231,11 +9381,9 @@ avr_out_insv (rtx_insn *insn, rtx xop[], int *plen) rtx op[4] = { // Output - simplify_gen_subreg (QImode, xop[0], mode, obit / 8), - GEN_INT (obit & 7), + avr_byte (xop[0], obit / 8), GEN_INT (obit & 7), // Input - simplify_gen_subreg (QImode, xop[1], mode, ibit / 8), - GEN_INT (ibit & 7) + avr_byte (xop[1], ibit / 8), GEN_INT (ibit & 7) }; obit &= 7; ibit &= 7; @@ -10322,7 +9470,7 @@ avr_out_insv (rtx_insn *insn, rtx xop[], int *plen) { for (int b = 0; b < n_bytes; ++b) { - rtx byte = simplify_gen_subreg (QImode, xop[0], mode, b); + rtx byte = avr_byte (xop[0], b); if (REGNO (byte) != REGNO (op[0])) avr_asm_len ("clr %0", &byte, plen, 1); } @@ -10342,7 +9490,7 @@ avr_out_insv (rtx_insn *insn, rtx xop[], int *plen) else for (int b = 0; b < n_bytes; ++b) { - rtx byte = simplify_gen_subreg (QImode, xop[0], mode, b); + rtx byte = avr_byte (xop[0], b); avr_asm_len ("clr %0", &byte, plen, 1); } @@ -10366,7 +9514,7 @@ avr_out_extr (rtx_insn *insn, rtx xop[], int *plen) if (GET_MODE (src) != QImode) { - src = xop[1] = simplify_gen_subreg (QImode, src, GET_MODE (src), bit / 8); + src = xop[1] = avr_byte (src, bit / 8); bit %= 8; xop[2] = GEN_INT (bit); } @@ -10493,7 +9641,7 @@ const char * avr_out_fract (rtx_insn *insn, rtx operands[], bool intsigned, int *plen) { rtx xop[6]; - RTX_CODE shift = UNKNOWN; + rtx_code shift = UNKNOWN; bool sign_in_carry = false; bool msb_in_carry = false; bool lsb_in_tmp_reg = false; @@ -11251,7 +10399,7 @@ avr_adjust_insn_length (rtx_insn *insn, int len) /* Read from insn attribute "adjust_len" if/how length is to be adjusted. */ - enum attr_adjust_len adjust_len = get_attr_adjust_len (insn); + attr_adjust_len adjust_len = get_attr_adjust_len (insn); if (adjust_len == ADJUST_LEN_NO) { @@ -11301,6 +10449,7 @@ avr_adjust_insn_length (rtx_insn *insn, int len) case ADJUST_LEN_COMPARE64: avr_out_compare64 (insn, op, &len); break; case ADJUST_LEN_CMP_UEXT: avr_out_cmp_ext (op, ZERO_EXTEND, &len); break; case ADJUST_LEN_CMP_SEXT: avr_out_cmp_ext (op, SIGN_EXTEND, &len); break; + case ADJUST_LEN_CMP_LSR: avr_out_cmp_lsr (insn, op, &len); break; case ADJUST_LEN_LSHRQI: lshrqi3_out (insn, op, &len); break; case ADJUST_LEN_LSHRHI: lshrhi3_out (insn, op, &len); break; @@ -11322,6 +10471,7 @@ avr_adjust_insn_length (rtx_insn *insn, int len) case ADJUST_LEN_INSERT_BITS: avr_out_insert_bits (op, &len); break; case ADJUST_LEN_ADD_SET_ZN: avr_out_plus_set_ZN (op, &len); break; + case ADJUST_LEN_ADD_SET_N: avr_out_plus_set_N (op, &len); break; case ADJUST_LEN_INSV_NOTBIT: avr_out_insert_notbit (insn, op, &len); break; @@ -11333,151 +10483,6 @@ avr_adjust_insn_length (rtx_insn *insn, int len) } -/* Return true when INSN has a REG_UNUSED note for hard reg REG. - rtlanal.cc::find_reg_note() uses == to compare XEXP (link, 0) - therefore use a custom function. 
*/ - -static bool -avr_insn_has_reg_unused_note_p (rtx_insn *insn, rtx reg) -{ - for (rtx link = REG_NOTES (insn); link; link = XEXP (link, 1)) - if (REG_NOTE_KIND (link) == REG_UNUSED - && REG_P (XEXP (link, 0)) - && REGNO (reg) >= REGNO (XEXP (link, 0)) - && END_REGNO (reg) <= END_REGNO (XEXP (link, 0))) - return true; - - return false; -} - - -/* Return nonzero if register REG dead after INSN. */ - -int -reg_unused_after (rtx_insn *insn, rtx reg) -{ - return (dead_or_set_p (insn, reg) - || (REG_P (reg) && _reg_unused_after (insn, reg, true))); -} - -/* A helper for the previous function. - Return nonzero if REG is not used after INSN. - We assume REG is a reload reg, and therefore does - not live past labels. It may live past calls or jumps though. */ - -bool -_reg_unused_after (rtx_insn *insn, rtx reg, bool look_at_insn) -{ - if (look_at_insn) - { - /* If the reg is set by this instruction, then it is safe for our - case. Disregard the case where this is a store to memory, since - we are checking a register used in the store address. */ - rtx set = single_set (insn); - if (set && !MEM_P (SET_DEST (set)) - && reg_overlap_mentioned_p (reg, SET_DEST (set))) - return 1; - - /* This case occurs when fuse-add introduced a POST_INC addressing, - but the address register is unused after. */ - if (set) - { - rtx mem = MEM_P (SET_SRC (set)) ? SET_SRC (set) : SET_DEST (set); - if (MEM_P (mem) - && reg_overlap_mentioned_p (reg, XEXP (mem, 0)) - && avr_insn_has_reg_unused_note_p (insn, reg)) - return 1; - } - } - - while ((insn = NEXT_INSN (insn))) - { - rtx set; - enum rtx_code code = GET_CODE (insn); - -#if 0 - /* If this is a label that existed before reload, then the register - if dead here. However, if this is a label added by reorg, then - the register may still be live here. We can't tell the difference, - so we just ignore labels completely. */ - if (code == CODE_LABEL) - return 1; - /* else */ -#endif - - if (!INSN_P (insn)) - continue; - - if (code == JUMP_INSN) - return 0; - - /* If this is a sequence, we must handle them all at once. - We could have for instance a call that sets the target register, - and an insn in a delay slot that uses the register. In this case, - we must return 0. 
*/ - else if (code == INSN && GET_CODE (PATTERN (insn)) == SEQUENCE) - { - rtx_sequence *seq = as_a <rtx_sequence *> (PATTERN (insn)); - int retval = 0; - - for (int i = 0; i < seq->len (); i++) - { - rtx_insn *this_insn = seq->insn (i); - rtx set = single_set (this_insn); - - if (CALL_P (this_insn)) - code = CALL_INSN; - else if (JUMP_P (this_insn)) - { - if (INSN_ANNULLED_BRANCH_P (this_insn)) - return 0; - code = JUMP_INSN; - } - - if (set && reg_overlap_mentioned_p (reg, SET_SRC (set))) - return 0; - if (set && reg_overlap_mentioned_p (reg, SET_DEST (set))) - { - if (!MEM_P (SET_DEST (set))) - retval = 1; - else - return 0; - } - if (set == 0 - && reg_overlap_mentioned_p (reg, PATTERN (this_insn))) - return 0; - } - if (retval == 1) - return 1; - else if (code == JUMP_INSN) - return 0; - } - - if (code == CALL_INSN) - { - rtx tem; - for (tem = CALL_INSN_FUNCTION_USAGE (insn); tem; tem = XEXP (tem, 1)) - if (GET_CODE (XEXP (tem, 0)) == USE - && REG_P (XEXP (XEXP (tem, 0), 0)) - && reg_overlap_mentioned_p (reg, XEXP (XEXP (tem, 0), 0))) - return 0; - if (call_used_or_fixed_reg_p (REGNO (reg))) - return 1; - } - - set = single_set (insn); - - if (set && reg_overlap_mentioned_p (reg, SET_SRC (set))) - return 0; - if (set && reg_overlap_mentioned_p (reg, SET_DEST (set))) - return !MEM_P (SET_DEST (set)); - if (set == 0 && reg_overlap_mentioned_p (reg, PATTERN (insn))) - return 0; - } - return 1; -} - - /* Implement `TARGET_ASM_INTEGER'. */ /* Target hook for assembling integer objects. The AVR version needs special handling for references to certain labels. */ @@ -11517,10 +10522,7 @@ avr_assemble_integer (rtx x, unsigned int size, int aligned_p) /* varasm fails to handle big fixed modes that don't fit in hwi. */ for (unsigned n = 0; n < size; n++) - { - rtx xn = simplify_gen_subreg (QImode, x, GET_MODE (x), n); - default_assemble_integer (xn, 1, aligned_p); - } + default_assemble_integer (avr_byte (x, n), 1, aligned_p); return true; } @@ -11540,7 +10542,7 @@ avr_assemble_integer (rtx x, unsigned int size, int aligned_p) static unsigned char avr_class_max_nregs (reg_class_t rclass, machine_mode mode) { - if (rclass == CC_REG && mode == CCmode) + if (rclass == CC_REG && GET_MODE_CLASS (mode) == MODE_CC) return 1; return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); @@ -12602,6 +11604,7 @@ avr_asm_select_section (tree decl, int reloc, unsigned HOST_WIDE_INT align) return sect; } + /* Implement `TARGET_ASM_FILE_START'. */ /* Outputs some text at the start of each assembler file. */ @@ -12846,16 +11849,16 @@ avr_cbranch_cost (rtx x) } -/* Mutually recursive subroutine of avr_rtx_cost for calculating the +/* Mutually recursive subroutine of `avr_rtx_cost' for calculating the cost of an RTX operand given its context. X is the rtx of the operand, MODE is its mode, and OUTER is the rtx_code of this operand's parent operator. 
*/ static int -avr_operand_rtx_cost (rtx x, machine_mode mode, enum rtx_code outer, +avr_operand_rtx_cost (rtx x, machine_mode mode, rtx_code outer, int opno, bool speed) { - enum rtx_code code = GET_CODE (x); + rtx_code code = GET_CODE (x); switch (code) { @@ -12887,7 +11890,7 @@ static bool avr_rtx_costs_1 (rtx x, machine_mode mode, int outer_code, int /*opno*/, int *total, bool speed) { - enum rtx_code code = GET_CODE (x); + rtx_code code = GET_CODE (x); HOST_WIDE_INT val; switch (code) @@ -13208,8 +12211,8 @@ avr_rtx_costs_1 (rtx x, machine_mode mode, int outer_code, { rtx op0 = XEXP (x, 0); rtx op1 = XEXP (x, 1); - enum rtx_code code0 = GET_CODE (op0); - enum rtx_code code1 = GET_CODE (op1); + rtx_code code0 = GET_CODE (op0); + rtx_code code1 = GET_CODE (op1); bool ex0 = SIGN_EXTEND == code0 || ZERO_EXTEND == code0; bool ex1 = SIGN_EXTEND == code1 || ZERO_EXTEND == code1; @@ -13899,7 +12902,7 @@ avr_insn_cost (rtx_insn *insn, bool speed) subrtx_iterator::array_type array; FOR_EACH_SUBRTX (iter, array, SET_SRC (set), NONCONST) { - enum rtx_code code = GET_CODE (*iter); + rtx_code code = GET_CODE (*iter); not_bit_p |= code == NOT || code == XOR || code == GE; } @@ -13992,8 +12995,8 @@ extra_constraint_Q (rtx x) /* Convert condition code CONDITION to the valid AVR condition code. */ -RTX_CODE -avr_normalize_condition (RTX_CODE condition) +rtx_code +avr_normalize_condition (rtx_code condition) { switch (condition) { @@ -14068,7 +13071,7 @@ avr_function_value (const_tree type, const_tree /*fn_decl_or_type*/, } int -test_hard_reg_class (enum reg_class rclass, rtx x) +test_hard_reg_class (reg_class rclass, rtx x) { int regno = true_regnum (x); if (regno < 0) @@ -14081,7 +13084,7 @@ test_hard_reg_class (enum reg_class rclass, rtx x) } -/* Helper for jump_over_one_insn_p: Test if INSN is a 2-word instruction +/* Helper for `jump_over_one_insn_p': Test if INSN is a 2-word instruction and thus is suitable to be skipped by CPSE, SBRC, etc. */ static bool @@ -14095,7 +13098,10 @@ avr_2word_insn_p (rtx_insn *insn) switch (INSN_CODE (insn)) { default: - return false; + return (recog_memoized (insn) >= 0 + // Transparent calls may be skipped. + && (get_attr_type (insn) == TYPE_XCALL + || get_attr_adjust_len (insn) == ADJUST_LEN_CALL)); case CODE_FOR_movqi_insn: case CODE_FOR_movuqq_insn: @@ -14154,7 +13160,7 @@ jump_over_one_insn_p (rtx_insn *insn, rtx dest) static unsigned int avr_hard_regno_nregs (unsigned int regno, machine_mode mode) { - if (regno == REG_CC && mode == CCmode) + if (regno == REG_CC && GET_MODE_CLASS (mode) == MODE_CC) return 1; return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); @@ -14169,7 +13175,7 @@ static bool avr_hard_regno_mode_ok (unsigned int regno, machine_mode mode) { if (regno == REG_CC) - return mode == CCmode; + return GET_MODE_CLASS (mode) == MODE_CC; /* NOTE: 8-bit values must not be disallowed for R28 or R29. Disallowing QI et al. in these regs might lead to code like @@ -14227,9 +13233,9 @@ avr_hard_regno_call_part_clobbered (unsigned, unsigned regno, /* Implement `MODE_CODE_BASE_REG_CLASS'. 
*/ -enum reg_class +reg_class avr_mode_code_base_reg_class (machine_mode /*mode*/, addr_space_t as, - RTX_CODE outer_code, RTX_CODE /*index_code*/) + rtx_code outer_code, rtx_code /*index_code*/) { if (!ADDR_SPACE_GENERIC_P (as)) { @@ -14252,8 +13258,8 @@ avr_mode_code_base_reg_class (machine_mode /*mode*/, addr_space_t as, bool avr_regno_mode_code_ok_for_base_p (int regno, machine_mode /*mode*/, - addr_space_t as, RTX_CODE outer_code, - RTX_CODE /*index_code*/) + addr_space_t as, rtx_code outer_code, + rtx_code /*index_code*/) { bool ok = false; @@ -14316,232 +13322,6 @@ avr_regno_mode_code_ok_for_base_p (int regno, machine_mode /*mode*/, } -/* A helper for `output_reload_insisf' and `output_reload_inhi'. */ -/* Set 32-bit register OP[0] to compile-time constant OP[1]. - CLOBBER_REG is a QI clobber register or NULL_RTX. - LEN == NULL: output instructions. - LEN != NULL: set *LEN to the length of the instruction sequence - (in words) printed with LEN = NULL. - If CLEAR_P is true, OP[0] had been cleard to Zero already. - If CLEAR_P is false, nothing is known about OP[0]. - - The effect on cc0 is as follows: - - Load 0 to any register except ZERO_REG : NONE - Load ld register with any value : NONE - Anything else: : CLOBBER */ - -static void -output_reload_in_const (rtx *op, rtx clobber_reg, int *len, bool clear_p) -{ - rtx src = op[1]; - rtx dest = op[0]; - rtx xval, xdest[4]; - int ival[4]; - int clobber_val = 1234; - bool cooked_clobber_p = false; - bool set_p = false; - machine_mode mode = GET_MODE (dest); - int n_bytes = GET_MODE_SIZE (mode); - - gcc_assert (REG_P (dest) - && CONSTANT_P (src)); - - if (len) - *len = 0; - - /* (REG:SI 14) is special: It's neither in LD_REGS nor in NO_LD_REGS - but has some subregs that are in LD_REGS. Use the MSB (REG:QI 17). */ - - if (REGNO (dest) < REG_16 - && REGNO (dest) + GET_MODE_SIZE (mode) > REG_16) - { - clobber_reg = all_regs_rtx[REGNO (dest) + n_bytes - 1]; - } - - /* We might need a clobber reg but don't have one. Look at the value to - be loaded more closely. A clobber is only needed if it is a symbol - or contains a byte that is neither 0, -1 or a power of 2. */ - - if (NULL_RTX == clobber_reg - && !test_hard_reg_class (LD_REGS, dest) - && (! (CONST_INT_P (src) || CONST_FIXED_P (src) || CONST_DOUBLE_P (src)) - || !avr_popcount_each_byte (src, n_bytes, - (1 << 0) | (1 << 1) | (1 << 8)))) - { - /* We have no clobber register but need one. Cook one up. - That's cheaper than loading from constant pool. */ - - cooked_clobber_p = true; - clobber_reg = all_regs_rtx[REG_Z + 1]; - avr_asm_len ("mov __tmp_reg__,%0", &clobber_reg, len, 1); - } - - /* Now start filling DEST from LSB to MSB. */ - - for (int n = 0; n < n_bytes; n++) - { - bool done_byte = false; - rtx xop[3]; - - /* Crop the n-th destination byte. */ - - xdest[n] = simplify_gen_subreg (QImode, dest, mode, n); - int ldreg_p = test_hard_reg_class (LD_REGS, xdest[n]); - - if (!CONST_INT_P (src) - && !CONST_FIXED_P (src) - && !CONST_DOUBLE_P (src)) - { - static const char *const asm_code[][2] = - { - { "ldi %2,lo8(%1)" CR_TAB "mov %0,%2", "ldi %0,lo8(%1)" }, - { "ldi %2,hi8(%1)" CR_TAB "mov %0,%2", "ldi %0,hi8(%1)" }, - { "ldi %2,hlo8(%1)" CR_TAB "mov %0,%2", "ldi %0,hlo8(%1)" }, - { "ldi %2,hhi8(%1)" CR_TAB "mov %0,%2", "ldi %0,hhi8(%1)" } - }; - - xop[0] = xdest[n]; - xop[1] = src; - xop[2] = clobber_reg; - - avr_asm_len (asm_code[n][ldreg_p], xop, len, ldreg_p ? 1 : 2); - - continue; - } - - /* Crop the n-th source byte. 
*/ - - xval = simplify_gen_subreg (QImode, src, mode, n); - ival[n] = INTVAL (xval); - - /* Look if we can reuse the low word by means of MOVW. */ - - if (n == 2 - && n_bytes >= 4 - && AVR_HAVE_MOVW) - { - rtx lo16 = simplify_gen_subreg (HImode, src, mode, 0); - rtx hi16 = simplify_gen_subreg (HImode, src, mode, 2); - - if (INTVAL (lo16) == INTVAL (hi16)) - { - if (INTVAL (lo16) != 0 || !clear_p) - avr_asm_len ("movw %C0,%A0", &op[0], len, 1); - - break; - } - } - - /* Don't use CLR so that cc0 is set as expected. */ - - if (ival[n] == 0) - { - if (!clear_p) - avr_asm_len (ldreg_p ? "ldi %0,0" - : AVR_ZERO_REGNO == REGNO (xdest[n]) ? "clr %0" - : "mov %0,__zero_reg__", - &xdest[n], len, 1); - continue; - } - - if (clobber_val == ival[n] - && REGNO (clobber_reg) == REGNO (xdest[n])) - { - continue; - } - - /* LD_REGS can use LDI to move a constant value */ - - if (ldreg_p) - { - xop[0] = xdest[n]; - xop[1] = xval; - avr_asm_len ("ldi %0,lo8(%1)", xop, len, 1); - continue; - } - - /* Try to reuse value already loaded in some lower byte. */ - - for (int j = 0; j < n; j++) - if (ival[j] == ival[n]) - { - xop[0] = xdest[n]; - xop[1] = xdest[j]; - - avr_asm_len ("mov %0,%1", xop, len, 1); - done_byte = true; - break; - } - - if (done_byte) - continue; - - /* Need no clobber reg for -1: Use CLR/DEC */ - - if (ival[n] == -1) - { - if (!clear_p) - avr_asm_len ("clr %0", &xdest[n], len, 1); - - avr_asm_len ("dec %0", &xdest[n], len, 1); - continue; - } - else if (ival[n] == 1) - { - if (!clear_p) - avr_asm_len ("clr %0", &xdest[n], len, 1); - - avr_asm_len ("inc %0", &xdest[n], len, 1); - continue; - } - - /* Use T flag or INC to manage powers of 2 if we have - no clobber reg. */ - - if (NULL_RTX == clobber_reg - && single_one_operand (xval, QImode)) - { - xop[0] = xdest[n]; - xop[1] = GEN_INT (exact_log2 (ival[n] & GET_MODE_MASK (QImode))); - - gcc_assert (constm1_rtx != xop[1]); - - if (!set_p) - { - set_p = true; - avr_asm_len ("set", xop, len, 1); - } - - if (!clear_p) - avr_asm_len ("clr %0", xop, len, 1); - - avr_asm_len ("bld %0,%1", xop, len, 1); - continue; - } - - /* We actually need the LD_REGS clobber reg. */ - - gcc_assert (NULL_RTX != clobber_reg); - - xop[0] = xdest[n]; - xop[1] = xval; - xop[2] = clobber_reg; - clobber_val = ival[n]; - - avr_asm_len ("ldi %2,lo8(%1)" CR_TAB - "mov %0,%2", xop, len, 2); - } - - /* If we cooked up a clobber reg above, restore it. */ - - if (cooked_clobber_p) - { - avr_asm_len ("mov %0,__tmp_reg__", &clobber_reg, len, 1); - } -} - - /* Reload the constant OP[1] into the HI register OP[0]. CLOBBER_REG is a QI clobber reg needed to move vast majority of consts into a NO_LD_REGS register. If CLOBBER_REG is NULL_RTX we either don't @@ -14825,10 +13605,14 @@ avr_hard_regno_rename_ok (unsigned int old_reg, unsigned int new_reg) const char * avr_out_sbxx_branch (rtx_insn *insn, rtx operands[]) { - enum rtx_code comp = GET_CODE (operands[0]); + rtx_code comp = GET_CODE (operands[0]); bool long_jump = get_attr_length (insn) >= 4; bool reverse = long_jump || jump_over_one_insn_p (insn, operands[3]); + // PR116953: jump_over_one_insn_p may call extract on the next insn, + // clobbering recog_data.operand. Thus, restore recog_data. 
+ extract_constrain_insn_cached (insn); + if (comp == GE) comp = EQ; else if (comp == LT) @@ -15723,7 +14507,7 @@ avr_has_nibble_0xf (rtx ival) typedef struct { /* tree code of binary function G */ - enum tree_code code; + tree_code code; /* The constant second argument of G */ int arg; @@ -15986,7 +14770,7 @@ struct GTY(()) avr_builtin_description that a built-in's ID can be used to access the built-in by means of avr_bdesc[ID] */ -static GTY(()) struct avr_builtin_description +static GTY(()) avr_builtin_description avr_bdesc[AVR_BUILTIN_COUNT] = { #define DEF_BUILTIN(NAME, N_ARGS, TYPE, ICODE, LIBNAME) \ @@ -16282,7 +15066,7 @@ avr_expand_builtin (tree exp, rtx target, rtx /*subtarget*/, tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); const char *bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); unsigned int id = DECL_MD_FUNCTION_CODE (fndecl); - const struct avr_builtin_description *d = &avr_bdesc[id]; + const avr_builtin_description *d = &avr_bdesc[id]; tree arg0; rtx op0; @@ -16678,10 +15462,25 @@ avr_md_asm_adjust (vec<rtx> &/*outputs*/, vec<rtx> &/*inputs*/, } +/* Implement `TARGET_C_MODE_FOR_FLOATING_TYPE'. Return SFmode or DFmode + for TI_{LONG_,}DOUBLE_TYPE which is for {long,} double type, go with + the default one for the others. */ + +static machine_mode +avr_c_mode_for_floating_type (tree_index ti) +{ + if (ti == TI_DOUBLE_TYPE) + return avr_double == 32 ? SFmode : DFmode; + if (ti == TI_LONG_DOUBLE_TYPE) + return avr_long_double == 32 ? SFmode : DFmode; + return default_mode_for_floating_type (ti); +} + + /* Worker function for `FLOAT_LIB_COMPARE_RETURNS_BOOL'. */ bool -avr_float_lib_compare_returns_bool (machine_mode mode, enum rtx_code) +avr_float_lib_compare_returns_bool (machine_mode mode, rtx_code) { if (mode == DFmode) { @@ -16910,7 +15709,7 @@ avr_use_lra_p () #undef TARGET_C_MODE_FOR_FLOATING_TYPE #define TARGET_C_MODE_FOR_FLOATING_TYPE avr_c_mode_for_floating_type -struct gcc_target targetm = TARGET_INITIALIZER; +gcc_target targetm = TARGET_INITIALIZER; #include "gt-avr.h" diff --git a/gcc/config/avr/avr.h b/gcc/config/avr/avr.h index 56b7f39..3ef1897 100644 --- a/gcc/config/avr/avr.h +++ b/gcc/config/avr/avr.h @@ -1,5 +1,5 @@ /* Definitions of target machine for GNU compiler, - for ATMEL AVR at90s8515, ATmega103/103L, ATmega603/603L microcontrollers. + for AVR 8-bit microcontrollers. Copyright (C) 1998-2024 Free Software Foundation, Inc. Contributed by Denis Chertykov (chertykov@gmail.com) @@ -308,18 +308,25 @@ enum reg_class { #define STATIC_CHAIN_REGNUM ((AVR_TINY) ? 18 :2) -#define ELIMINABLE_REGS { \ +#define RELOAD_ELIMINABLE_REGS { \ { ARG_POINTER_REGNUM, STACK_POINTER_REGNUM }, \ { ARG_POINTER_REGNUM, FRAME_POINTER_REGNUM }, \ { FRAME_POINTER_REGNUM, STACK_POINTER_REGNUM }, \ { FRAME_POINTER_REGNUM + 1, STACK_POINTER_REGNUM + 1 } } +#define ELIMINABLE_REGS \ + { \ + { ARG_POINTER_REGNUM, STACK_POINTER_REGNUM }, \ + { ARG_POINTER_REGNUM, FRAME_POINTER_REGNUM }, \ + { FRAME_POINTER_REGNUM, STACK_POINTER_REGNUM } \ + } + #define INITIAL_ELIMINATION_OFFSET(FROM, TO, OFFSET) \ OFFSET = avr_initial_elimination_offset (FROM, TO) #define RETURN_ADDR_RTX(count, tem) avr_return_addr_rtx (count, tem) -/* Don't use Push rounding. expr.cc: emit_single_push_insn is broken +/* Don't use Push rounding. expr.cc: emit_single_push_insn is broken for POST_DEC targets (PR27386). 
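The TARGET_C_MODE_FOR_FLOATING_TYPE hook moved above keeps double (and long double) in 32-bit SFmode unless the corresponding option selects 64 bits. A quick, illustrative way to observe the effect when building for avr:

#include <stdio.h>

/* Illustrative only: built with avr-gcc while double is in 32-bit SFmode,
   both sizes print as 4; selecting a 64-bit double makes the second one 8.  */
int
main (void)
{
  printf ("float: %u, double: %u\n",
          (unsigned) sizeof (float), (unsigned) sizeof (double));
  return 0;
}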
*/ /*#define PUSH_ROUNDING(NPUSHED) (NPUSHED)*/ @@ -478,7 +485,7 @@ typedef struct avr_args /* Set MOVE_RATIO to 3 to allow memory moves upto 4 bytes to happen by pieces when optimizing for speed, like it did when MOVE_MAX_PIECES - was 4. When optimizing for size, allow memory moves upto 2 bytes. + was 4. When optimizing for size, allow memory moves upto 2 bytes. Also see avr_use_by_pieces_infrastructure_p. */ #define MOVE_RATIO(speed) ((speed) ? 3 : 2) @@ -561,19 +568,19 @@ struct GTY(()) machine_function -1 when "signal" attribute(s) with arguments are present but none without argument. */ int is_signal; - + /* 'true' - if current function is a non-blocking interrupt service routine as specified by the "isr_noblock" attribute. */ int is_noblock; - /* 'true' - if current function is a 'task' function + /* 'true' - if current function is a 'task' function as specified by the "OS_task" attribute. */ int is_OS_task; - /* 'true' - if current function is a 'main' function + /* 'true' - if current function is a 'main' function as specified by the "OS_main" attribute. */ int is_OS_main; - + /* Current function stack size. */ int stack_usage; diff --git a/gcc/config/avr/avr.md b/gcc/config/avr/avr.md index c10709e..aae8a69 100644 --- a/gcc/config/avr/avr.md +++ b/gcc/config/avr/avr.md @@ -1,5 +1,5 @@ ;; Machine description for GNU compiler, -;; for ATMEL AVR micro controllers. +;; for AVR 8-bit microcontrollers. ;; Copyright (C) 1998-2024 Free Software Foundation, Inc. ;; Contributed by Denis Chertykov (chertykov@gmail.com) @@ -87,7 +87,6 @@ UNSPEC_FMUL UNSPEC_FMULS UNSPEC_FMULSU - UNSPEC_COPYSIGN UNSPEC_INSERT_BITS UNSPEC_ROUND ]) @@ -171,7 +170,7 @@ ashlsi, ashrsi, lshrsi, ashlpsi, ashrpsi, lshrpsi, insert_bits, insv_notbit, insv, - add_set_ZN, cmp_uext, cmp_sext, + add_set_ZN, add_set_N, cmp_uext, cmp_sext, cmp_lsr, no" (const_string "no")) @@ -261,6 +260,7 @@ (define_mode_iterator QIDI [QI HI PSI SI DI]) (define_mode_iterator QIPSI [QI HI PSI]) (define_mode_iterator HISI [HI PSI SI]) +(define_mode_iterator HI_SI [HI SI]) ;; Ordered integral and fixed-point modes of specific sizes. (define_mode_iterator ALL1 [QI QQ UQQ]) @@ -277,6 +277,10 @@ (define_mode_iterator ALLs234 [HI SI PSI HQ HA SQ SA]) +(define_mode_iterator ALLCC [CC CCN CCZN]) + +(define_mode_attr CCname [(CC "") (CCN "_N") (CCZN "_ZN")]) + ;; All supported move-modes (define_mode_iterator MOVMODE [QI QQ UQQ HI HQ UHQ HA UHA @@ -320,6 +324,9 @@ (define_code_iterator xior [xor ior]) (define_code_iterator eqne [eq ne]) (define_code_iterator gelt [ge lt]) +(define_code_iterator eqnegtle [eq ne gt le]) +(define_code_iterator cmp_signed [eq ne ge lt gt le]) +(define_code_iterator op8_ZN [plus minus and ior xor ashift ashiftrt lshiftrt]) (define_code_iterator ss_addsub [ss_plus ss_minus]) (define_code_iterator us_addsub [us_plus us_minus]) @@ -985,41 +992,10 @@ (clobber (reg:CC REG_CC))])]) -;; For LPM loads from AS1 we split -;; R = *Z -;; to -;; R = *Z++ -;; Z = Z - sizeof (R) -;; -;; so that the second instruction can be optimized out. 
- -(define_split ; "split-lpmx" - [(set (match_operand:HISI 0 "register_operand" "") - (match_operand:HISI 1 "memory_operand" ""))] - "reload_completed - && AVR_HAVE_LPMX - && avr_mem_flash_p (operands[1]) - && REG_P (XEXP (operands[1], 0)) - && !reg_overlap_mentioned_p (XEXP (operands[1], 0), operands[0])" - [(set (match_dup 0) - (match_dup 2)) - (set (match_dup 3) - (plus:HI (match_dup 3) - (match_dup 4)))] - { - rtx addr = XEXP (operands[1], 0); - - operands[2] = replace_equiv_address (operands[1], - gen_rtx_POST_INC (Pmode, addr)); - operands[3] = addr; - operands[4] = gen_int_mode (-<SIZE>, HImode); - }) - - ;; Legitimate address and stuff allows way more addressing modes than ;; Reduced Tiny actually supports. Split them now so that we get ;; closer to real instructions which may result in some optimization -;; opportunities. +;; opportunities. This applies also to fake X + offset addressing. (define_split [(parallel [(set (match_operand:MOVMODE 0 "nonimmediate_operand") (match_operand:MOVMODE 1 "general_operand")) @@ -1032,7 +1008,7 @@ && (MEM_P (operands[0]) || MEM_P (operands[1]))" [(scratch)] { - if (avr_split_tiny_move (curr_insn, operands)) + if (avr_split_fake_addressing_move (curr_insn, operands)) DONE; FAIL; }) @@ -6655,6 +6631,34 @@ (set_attr "adjust_len" "tstsi,*,compare,compare")]) +;; "*cmphi_lsr" +;; "*cmpsi_lsr" +;; "*cmppsi_lsr" +(define_insn_and_split "*cmp<mode>_lsr" + [(set (reg:CC REG_CC) + (compare:CC (lshiftrt:HISI (match_operand:HISI 0 "register_operand" "r") + (match_operand:QI 1 "const_8_16_24_operand" "n")) + (const_int 0))) + (clobber (scratch:QI))] + "reload_completed" + { + return avr_out_cmp_lsr (insn, operands, NULL); + } + "&& 1" + [;; "cmpqi3" + (set (reg:CC REG_CC) + (compare:CC (match_dup 0) + (const_int 0)))] + { + // When the comparison is just one byte, then cmpqi3. + if (INTVAL (operands[1]) / 8 == <SIZE> - 1) + operands[0] = simplify_gen_subreg (QImode, operands[0], <MODE>mode, <SIZE> - 1); + else + FAIL; + } + [(set_attr "adjust_len" "cmp_lsr")]) + + ;; A helper for avr_pass_ifelse::avr_rest_of_handle_ifelse(). (define_expand "gen_compare<mode>" [(parallel [(set (reg:CC REG_CC) @@ -6684,7 +6688,7 @@ int icode = (int) GET_CODE (operands[0]); targetm.canonicalize_comparison (&icode, &operands[1], &operands[2], false); - PUT_CODE (operands[0], (enum rtx_code) icode); + PUT_CODE (operands[0], (rtx_code) icode); }) (define_expand "cbranch<mode>4" @@ -6701,7 +6705,7 @@ int icode = (int) GET_CODE (operands[0]); targetm.canonicalize_comparison (&icode, &operands[1], &operands[2], false); - PUT_CODE (operands[0], (enum rtx_code) icode); + PUT_CODE (operands[0], (rtx_code) icode); }) @@ -6748,20 +6752,9 @@ (label_ref (match_dup 3)) (pc)))] { - // Unsigned >= 65536 and < 65536 can be performed by testing the - // high word against 0. - if ((GET_CODE (operands[0]) == LTU - || GET_CODE (operands[0]) == GEU) - && const_operand (operands[2], <MODE>mode) - && INTVAL (avr_to_int_mode (operands[2])) == 65536) - { - // "cmphi3" of the high word against 0. - operands[0] = copy_rtx (operands[0]); - PUT_CODE (operands[0], GET_CODE (operands[0]) == GEU ? NE : EQ); - operands[1] = simplify_gen_subreg (HImode, operands[1], <MODE>mode, 2); - operands[2] = const0_rtx; - operands[4] = gen_rtx_SCRATCH (QImode); - } + // Unsigned >= 256^n and < 256^n can be performed by testing the + // higher bytes against 0 (*cmpsi_lsr). 
+ avr_maybe_cmp_lsr (operands);
 })
;; "cbranchpsi4_insn"
@@ -6784,7 +6777,12 @@
 (if_then_else (match_op_dup 0 [(reg:CC REG_CC) (const_int 0)]) (label_ref (match_dup 3))
- (pc)))])
+ (pc)))]
+ {
+ // Unsigned >= 256^n and < 256^n can be performed by testing the
+ // higher bytes against 0 (*cmppsi_lsr).
+ avr_maybe_cmp_lsr (operands);
+ })
;; "cbranchhi4_insn"
;; "cbranchhq4_insn" "cbranchuhq4_insn" "cbranchha4_insn" "cbranchuha4_insn"
@@ -6810,21 +6808,11 @@
 (pc)))]
 {
 // Unsigned >= 256 and < 256 can be performed by testing the
- // high byte against 0.
- if ((GET_CODE (operands[0]) == LTU
- || GET_CODE (operands[0]) == GEU)
- && const_operand (operands[2], <MODE>mode)
- && INTVAL (avr_to_int_mode (operands[2])) == 256)
- {
- rtx_code code = GET_CODE (operands[0]) == GEU ? NE : EQ;
- rtx hi8 = simplify_gen_subreg (QImode, operands[1], <MODE>mode, 1);
- rtx cmp = gen_rtx_fmt_ee (code, VOIDmode, cc_reg_rtx, const0_rtx);
- emit (gen_cmpqi3 (hi8, const0_rtx));
- emit (gen_branch (operands[3], cmp));
- DONE;
- }
+ // high byte against 0 (*cmphi_lsr).
+ avr_maybe_cmp_lsr (operands);
 })
+
 ;; Combiner pattern to compare sign- or zero-extended register against
 ;; a wider register, like comparing uint8_t against uint16_t.
 (define_insn_and_split "*cbranch<HISI:mode>.<code><QIPSI:mode>.0"
@@ -6889,6 +6877,469 @@
 })
+
+;; Try optimize decrement-and-branch. When we have an addition followed
+;; by a comparison of the result against zero, we can output the addition
+;; in such a way that SREG.N and SREG.Z are set according to the result.
+;; The comparisons are split2 from their cbranch insns and before
+;; peephole2 patterns like for swapped_tst and sbrx_branch have been applied.
+
+;; We do NOT use cmpelim / SELECT_CC_MODE because it has many shortcomings
+;; and is by no means equipollent to the removed cc0 framework -- at least
+;; with regard to the avr backend: Whether or not the result of a comparison
+;; can be obtained as a byproduct of an operation might depend on the
+;; availability of a scratch register: There are cases where we need a
+;; scratch register to optimize away a comparison, and where the operation
+;; without a comparison does not require a scratch. With the peep2 approach
+;; below, we can get a scratch from the peep2 framework without increasing
+;; the register pressure, whereas cmpelim doesn't offer such a feature.
+;; When no scratch is available, then we just don't perform the optimization,
+;; i.e. the comparison against 0 won't be optimized away, which is preferred
+;; over increasing the register pressure -- in many cases without reason --
+;; which might result in additional spills.
+;; What we definitely do not want is to pop a scratch without need, and
+;; in some arithmetic insn we won't know whether it might also be considered
+;; for CCmode generation, at least not prior to register allocation:
+;; CCmode only comes into existence after register allocation.
+;; cmpelim has more shortcomings, for example some comparisons may not
+;; be available, and it does not handle several of the forms supported below,
+;; just to mention two. A solution for the former would be to return VOIDmode
+;; in SELECT_CC_MODE, but cmpelim doesn't handle that. Anyway, it's pointless
+;; to speculate about how other shortcomings could be fixed when the scratch
+;; problem is unsolved in cmpelim.
+;; Apart from that, compare-elim.cc lists some demands that are not
+;; compatible with this backend.
For example, it assumes that when an insn +;; can set the condition code, it is always of the form compare:CCM, i.e. +;; all comparisons are supported. This is not the case for AVR, see the +;; peep2 conditions below. There is no way (at least not a documented one) +;; to express that in SELECT_CC_MODE. +;; Apart from that passes running before register allocation (and thus +;; before split2) have #ifdef SELECT_CC_MODE, and nowhere there is an +;; explanation on how to handle that. +;; Skipping cmpelim is accomplished by not defining TARGET_FLAGS_REGNUM. + +;; Note: reload1.cc::do_output_reload() does not support output reloads +;; for JUMP_INSNs, hence letting combine doing decrement-and-branch might +;; run into an ICE. Doing reloads by hand is too painful, hence, stick with +;; RTL peepholes for now. + +(define_expand "gen_add_for_<code>_<mode>" + [;; "*add.for.cczn.<mode>" + (parallel [(set (reg:CCZN REG_CC) + (compare:CCZN (plus:HISI (match_operand:HISI 0 "register_operand") + (match_operand:HISI 1 "const_int_operand")) + (const_int 0))) + (set (match_dup 0) + (plus:HISI (match_dup 0) + (match_dup 1))) + (clobber (match_operand:QI 3))]) + ;; "branch_ZN" + (set (pc) + (if_then_else (eqnegtle (reg:CCZN REG_CC) + (const_int 0)) + (label_ref (match_dup 2)) + (pc)))]) + +(define_expand "gen_add_for_<code>_<mode>" + [;; "*add.for.ccn.<mode>" + (parallel [(set (reg:CCN REG_CC) + (compare:CCN (plus:HISI (match_operand:HISI 0 "register_operand") + (match_operand:HISI 1 "nonmemory_operand")) + (const_int 0))) + (set (match_dup 0) + (plus:HISI (match_dup 0) + (match_dup 1))) + (clobber (match_operand:QI 3))]) + ;; "branch_N" + (set (pc) + (if_then_else (gelt (reg:CCN REG_CC) + (const_int 0)) + (label_ref (match_dup 2)) + (pc)))]) + + +;; 1/3: Additions without a scratch register. +(define_peephole2 + [(parallel [(set (match_operand:HISI 0 "register_operand") + (plus:HISI (match_dup 0) + (match_operand:HISI 1 "nonmemory_operand"))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 0) + (match_operand:HISI 3 "const0_operand"))) + (clobber (scratch:QI))]) + (set (pc) + (if_then_else (cmp_signed (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 2)) + (pc)))] + "// Multi-byte reg-reg additions only set the N flag. + (<CODE> == GE || <CODE> == LT || ! REG_P (operands[1])) + // Needs a const or a d-reg. + && (REG_P (operands[1]) || d_register_operand (operands[0], <MODE>mode)) + && peep2_regno_dead_p (3, REG_CC)" + [(scratch)] + { + emit (gen_gen_add_for_<code>_<mode> (operands[0], operands[1], operands[2], + gen_rtx_SCRATCH (QImode))); + DONE; + }) + +;; 2/3: Additions with a scratch register from the insn. +(define_peephole2 + [(parallel [(set (match_operand:HISI 0 "register_operand") + (plus:HISI (match_dup 0) + (match_operand:HISI 1 "nonmemory_operand"))) + (clobber (match_operand:QI 3 "scratch_or_d_register_operand")) + (clobber (reg:CC REG_CC))]) + (parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 0) + (match_operand:HISI 4 "const0_operand"))) + (clobber (scratch:QI))]) + (set (pc) + (if_then_else (cmp_signed (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 2)) + (pc)))] + "// Multi-byte reg-reg additions only set the N flag. + (<CODE> == GE || <CODE> == LT || ! REG_P (operands[1])) + && peep2_regno_dead_p (3, REG_CC)" + [(scratch)] + { + rtx scratch = operands[3]; + + // We need either a d-register or a scratch register + // when $1 is not a register. + if (! REG_P (operands[1]) + && ! REG_P (scratch) + && ! 
d_register_operand (operands[0], <MODE>mode)) + FAIL; + + emit (gen_gen_add_for_<code>_<mode> (operands[0], operands[1], operands[2], + scratch)); + DONE; + }) + +;; 3/3: Additions with a scratch register from peephole2. +(define_peephole2 + [(match_scratch:QI 3 "d") + (parallel [(set (match_operand:HISI 0 "register_operand") + (plus:HISI (match_dup 0) + (match_operand:HISI 1 "const_int_operand"))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 0) + (match_operand:HISI 4 "const0_operand"))) + (clobber (scratch:QI))]) + (set (pc) + (if_then_else (cmp_signed (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 2)) + (pc)))] + "peep2_regno_dead_p (3, REG_CC)" + [(scratch)] + { + emit (gen_gen_add_for_<code>_<mode> (operands[0], operands[1], operands[2], + operands[3])); + DONE; + }) + +;; Result of the above three peepholes is an addition that also +;; performs a signed comparison (of the result) against zero. +;; FIXME: Using (match_dup 0) instead of operands[3/4] makes rnregs +;; barf in regrename.cc::merge_overlapping_regs(). For now, use the +;; fix from PR50788: Constrain as "0". + +;; "*add.for.cczn.hi" "*add.for.cczn.psi" "*add.for.cczn.si" +(define_insn "*add.for.cczn.<mode>" + [(set (reg:CCZN REG_CC) + (compare:CCZN + (plus:HISI (match_operand:HISI 3 "register_operand" "0 ,0") + (match_operand:HISI 1 "const_int_operand" "n ,n")) + (const_int 0))) + (set (match_operand:HISI 0 "register_operand" "=d ,r") + (plus:HISI (match_operand:HISI 4 "register_operand" "0 ,0") + (match_operand:HISI 5 "const_int_operand" "1 ,1"))) + (clobber (match_scratch:QI 2 "=X ,&d"))] + "reload_completed" + { + return avr_out_plus_set_ZN (operands, nullptr); + } + [(set (attr "length") + (symbol_ref "<SIZE> * (1 + REG_P (operands[2]))")) + (set_attr "adjust_len" "add_set_ZN")]) + +;; "*add.for.ccn.hi" "*add.for.ccn.psi" "*add.for.ccn.si" +(define_insn "*add.for.ccn.<mode>" + [(set (reg:CCN REG_CC) + (compare:CCN + (plus:HISI (match_operand:HISI 3 "register_operand" "0 ,0 ,0") + (match_operand:HISI 1 "nonmemory_operand" "n ,n ,r")) + (const_int 0))) + (set (match_operand:HISI 0 "register_operand" "=d ,r ,r") + (plus:HISI (match_operand:HISI 4 "register_operand" "0 ,0 ,0") + (match_operand:HISI 5 "nonmemory_operand" "1 ,1 ,1"))) + (clobber (match_scratch:QI 2 "=X ,&d,X"))] + "reload_completed" + { + return avr_out_plus_set_N (operands, nullptr); + } + [(set (attr "length") + (symbol_ref "<SIZE> * (1 + REG_P (operands[2]))")) + (set_attr "adjust_len" "add_set_N")]) + + +;; 1/3: Subtractions with REG subtrahend set Z and N in a meaningful way. +;; The QI and PSI cases are handled below because they don't have a scratch:QI. 
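;; Illustrative sketch (editorial note, not part of this patch): the kind
;; of source these subtraction peepholes are aimed at looks like
;;
;;     int16_t a, b;
;;     a -= b;
;;     if (a <= 0)
;;       ...
;;
;; The SUB/SBC sequence already leaves SREG.Z and SREG.N describing the
;; result, so the explicit comparison against zero can be dropped and the
;; branch taken on CCZN instead.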
+(define_peephole2 + [(parallel [(set (match_operand:HI_SI 0 "register_operand") + (minus:HI_SI (match_dup 0) + (match_operand:HI_SI 1 "register_operand"))) + (clobber (scratch:QI)) + (clobber (reg:CC REG_CC))]) + (parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 0) + (match_operand:HI_SI 3 "const0_operand"))) + (clobber (scratch:QI))]) + (set (pc) + (if_then_else (cmp_signed (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 2)) + (pc)))] + "peep2_regno_dead_p (3, REG_CC)" + [;; "*sub.for.cczn.<mode>" + (parallel [(set (reg:CCZN REG_CC) + (compare:CCZN (minus:HI_SI (match_dup 0) + (match_dup 1)) + (const_int 0))) + (set (match_dup 0) + (minus:HI_SI (match_dup 0) + (match_dup 1)))]) + ;; "branch_ZN" + (set (pc) + (if_then_else (cmp_signed (reg:CCZN REG_CC) + (const_int 0)) + (label_ref (match_dup 2)) + (pc)))]) + +;; 2/3: Subtractions with a PSImode REG: no scratch:QI. +(define_peephole2 + [(parallel [(set (match_operand:PSI 0 "register_operand") + (minus:PSI (match_dup 0) + (match_operand:PSI 1 "register_operand"))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 0) + (match_operand:PSI 3 "const0_operand"))) + (clobber (scratch:QI))]) + (set (pc) + (if_then_else (cmp_signed (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 2)) + (pc)))] + "peep2_regno_dead_p (3, REG_CC)" + [;; "*sub.for.cczn.psi" + (parallel [(set (reg:CCZN REG_CC) + (compare:CCZN (minus:PSI (match_dup 0) + (match_dup 1)) + (const_int 0))) + (set (match_dup 0) + (minus:PSI (match_dup 0) + (match_dup 1)))]) + ;; "branch_ZN" + (set (pc) + (if_then_else (cmp_signed (reg:CCZN REG_CC) + (const_int 0)) + (label_ref (match_dup 2)) + (pc)))]) + +;; 3/3: Subtractions that extend the subtrahend. +(define_peephole2 + [(parallel [(set (match_operand:HISI 0 "register_operand") + (minus:HISI (match_dup 0) + (any_extend:HISI (match_operand:QIPSI 1 "register_operand")))) + (clobber (reg:CC REG_CC))]) + (parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 0) + (match_operand:HISI 3 "const0_operand"))) + (clobber (scratch:QI))]) + (set (pc) + (if_then_else (cmp_signed (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 2)) + (pc)))] + "<HISI:SIZE> > <QIPSI:SIZE> + && peep2_regno_dead_p (3, REG_CC)" + [;; "*sub-extend<QIPSI:mode>.for.cczn.<HISI:mode>" + (parallel [(set (reg:CCZN REG_CC) + (compare:CCZN (minus:HISI (match_dup 0) + (any_extend:HISI (match_dup 1))) + (const_int 0))) + (set (match_dup 0) + (minus:HISI (match_dup 0) + (any_extend:HISI (match_dup 1))))]) + ;; "branch_ZN" + (set (pc) + (if_then_else (cmp_signed (reg:CCZN REG_CC) + (const_int 0)) + (label_ref (match_dup 2)) + (pc)))]) + +;; "*sub.for.cczn.hi" +;; "*sub.for.cczn.psi" +;; "*sub.for.cczn.si" +(define_insn "*sub.for.cczn.<mode>" + [(set (reg:CCZN REG_CC) + (compare:CCZN (minus:HISI (match_operand:HISI 3 "register_operand" "1") + (match_operand:HISI 4 "register_operand" "2")) + (const_int 0))) + (set (match_operand:HISI 0 "register_operand" "=r") + (minus:HISI (match_operand:HISI 1 "register_operand" "0") + (match_operand:HISI 2 "register_operand" "r")))] + "reload_completed" + { + return avr_out_plus_ext (insn, operands, nullptr); + } + [(set_attr "length" "<SIZE>")]) + + +(define_insn "*sub-extend<QIPSI:mode>.for.cczn.<HISI:mode>" + [(set (reg:CCZN REG_CC) + (compare:CCZN (minus:HISI (match_operand:HISI 3 "register_operand" "0") + (any_extend:HISI + (match_operand:QIPSI 4 "register_operand" "2"))) + (const_int 0))) + (set (match_operand:HISI 0 "register_operand" "=r") + (minus:HISI 
(match_operand:HISI 1 "register_operand" "0") + (any_extend:HISI (match_operand:QIPSI 2 "register_operand" "r"))))] + "reload_completed + && <HISI:SIZE> > <QIPSI:SIZE>" + { + return avr_out_plus_ext (insn, operands, nullptr); + } + [(set (attr "length") + (symbol_ref "<HISI:SIZE> + 3 * (<CODE> == SIGN_EXTEND)"))]) + + +;; Operations other that PLUS can set the condition code in +;; a meaningful way, too. + +;; 1/1 Left shift sets the N bit. +(define_peephole2 + [(parallel [(set (match_operand:HISI 0 "register_operand") + (ashift:HISI (match_dup 0) + (const_int 1))) + (clobber (match_operand:QI 3 "scratch_operand")) + (clobber (reg:CC REG_CC))]) + (parallel [(set (reg:CC REG_CC) + (compare:CC (match_dup 0) + (const_int 0))) + (clobber (scratch:QI))]) + (set (pc) + (if_then_else (gelt (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 2)) + (pc)))] + "peep2_regno_dead_p (3, REG_CC)" + [;; "*ashift.for.ccn.<mode>" + (parallel [(set (reg:CCN REG_CC) + (compare:CCN (ashift:HISI (match_dup 0) + (const_int 1)) + (const_int 0))) + (set (match_dup 0) + (ashift:HISI (match_dup 0) + (const_int 1)))]) + ;; "branch_N" + (set (pc) + (if_then_else (gelt (reg:CCN REG_CC) + (const_int 0)) + (label_ref (match_operand 2)) + (pc)))]) + +(define_insn "*ashift.for.ccn.<mode>" + [(set (reg:CCN REG_CC) + (compare:CCN (ashift:HISI (match_operand:HISI 2 "register_operand" "0") + (const_int 1)) + (const_int 0))) + (set (match_operand:HISI 0 "register_operand" "=r") + (ashift:HISI (match_operand:HISI 1 "register_operand" "0") + (const_int 1)))] + "reload_completed" + { + output_asm_insn ("lsl %A0", operands); + output_asm_insn ("rol %B0", operands); + if (<SIZE> >= 3) output_asm_insn ("rol %C0", operands); + if (<SIZE> >= 4) output_asm_insn ("rol %D0", operands); + return ""; + } + [(set_attr "length" "<SIZE>")]) + + +;; 1/1 QImode operations that set Z and N in a meaningful way. +(define_peephole2 + [(parallel [(set (match_operand:QI 0 "register_operand") + (match_operator:QI 2 "op8_ZN_operator" [(match_dup 0) + (match_operand:QI 1)])) + (clobber (reg:CC REG_CC))]) + (set (reg:CC REG_CC) + (compare:CC (match_dup 0) + (match_operand:QI 4 "const0_operand"))) + (set (pc) + (if_then_else (cmp_signed (reg:CC REG_CC) + (const_int 0)) + (label_ref (match_operand 3)) + (pc)))] + "peep2_regno_dead_p (3, REG_CC)" + [;; "*op8.for.cczn.<code>" + (parallel [(set (reg:CCZN REG_CC) + (compare:CCZN (match_op_dup 2 [(match_dup 0) + (match_dup 1)]) + (const_int 0))) + (set (match_dup 0) + (match_op_dup 2 [(match_dup 0) + (match_dup 1)]))]) + ;; "branch_ZN" + (set (pc) + (if_then_else (cmp_signed (reg:CCZN REG_CC) + (const_int 0)) + (label_ref (match_operand 3)) + (pc)))]) + +;; Constraints and predicate for the insn below. This is what op8_ZN_operator +;; allows. Constraints are written in such a way that all cases have two +;; alternatives (shifts, XOR and MINUS have effectively just one alternative). +;; Note again that due to nregs, match_dup's won't work. 
+(define_code_attr c0_op8 + [(xor "r,r") (minus "r,r") (ashift "r,r") (ashiftrt "r,r") (lshiftrt "r,r") + (and "d,r") (ior "d,r") (plus "d,r")]) + +(define_code_attr c2_op8 + [(xor "r,r") (minus "r,r") (and "n,r") (ior "n,r") (plus "n,r P N K Cm2") + (ashift "P K,C03") (ashiftrt "P K,C03") (lshiftrt "P K,C03")]) + +(define_code_attr p2_op8 + [(ashift "const_1_to_3") (ashiftrt "const_1_to_3") (lshiftrt "const_1_to_3") + (xor "register") (minus "register") + (plus "nonmemory") (and "nonmemory") (ior "nonmemory")]) + +;; Result of the peephole2 above: An 8-bit operation that sets Z and N. +;; The allowed operations are: PLUS, MINUS, AND, IOR, XOR and SHIFTs +;; with operands according to op8_ZN_operator. +(define_insn "*op8.for.cczn.<code>" + [(set (reg:CCZN REG_CC) + (compare:CCZN (op8_ZN:QI (match_operand:QI 3 "register_operand" "0,0") + (match_operand:QI 4 "<p2_op8>_operand" "2,2")) + (const_int 0))) + (set (match_operand:QI 0 "register_operand" "=<c0_op8>") + (op8_ZN:QI (match_operand:QI 1 "register_operand" "0,0") + (match_operand:QI 2 "<p2_op8>_operand" "<c2_op8>")))] + "reload_completed" + { + return avr_out_op8_set_ZN (<CODE>, operands, nullptr); + } + [(set (attr "length") + (symbol_ref "avr_len_op8_set_ZN (<CODE>, operands)"))]) + + ;; Test a single bit in a QI/HI/SImode register. ;; Combine will create zero-extract patterns for single-bit tests. ;; Permit any mode in source pattern by using VOIDmode. @@ -7050,32 +7501,25 @@ ;; Compare with 0 (test) jumps ;; ************************************************************************ -(define_insn "branch" +;; "branch" +;; "branch_N" +;; "branch_ZN" +(define_insn "branch<CCname>" [(set (pc) - (if_then_else (match_operator 1 "simple_comparison_operator" - [(reg:CC REG_CC) + (if_then_else (match_operator 1 "ordered_comparison_operator" + [(reg:ALLCC REG_CC) (const_int 0)]) (label_ref (match_operand 0)) (pc)))] "reload_completed" { - return ret_cond_branch (operands[1], avr_jump_mode (operands[0], insn), 0); - } - [(set_attr "type" "branch")]) - - -(define_insn "difficult_branch" - [(set (pc) - (if_then_else (match_operator 1 "difficult_comparison_operator" - [(reg:CC REG_CC) - (const_int 0)]) - (label_ref (match_operand 0 "" "")) - (pc)))] - "reload_completed" - { - return ret_cond_branch (operands[1], avr_jump_mode (operands[0], insn), 0); + return avr_cond_branch (insn, operands); } - [(set_attr "type" "branch1")]) + [(set (attr "type") + (if_then_else + (match_test "simple_comparison_operator (operands[1], VOIDmode)") + (const_string "branch") + (const_string "branch1")))]) ;; ************************************************************************** @@ -8839,12 +9283,18 @@ ;; Copysign (define_insn "copysignsf3" - [(set (match_operand:SF 0 "register_operand" "=r") - (unspec:SF [(match_operand:SF 1 "register_operand" "0") - (match_operand:SF 2 "register_operand" "r")] - UNSPEC_COPYSIGN))] + [(set (match_operand:SF 0 "register_operand" "=r") + (copysign:SF (match_operand:SF 1 "register_operand" "0") + (match_operand:SF 2 "nonmemory_operand" "rF")))] "" - "bst %D2,7\;bld %D0,7" + { + if (const_double_operand (operands[2], SFmode)) + { + rtx xmsb = simplify_gen_subreg (QImode, operands[2], SFmode, 3); + return INTVAL (xmsb) < 0 ? "set\;bld %D0,7" : "clt\;bld %D0,7"; + } + return "bst %D2,7\;bld %D0,7"; + } [(set_attr "length" "2")]) ;; Swap Bytes (change byte-endianness) @@ -9555,173 +10005,6 @@ (clobber (reg:CC REG_CC))])]) -;; Try optimize decrement-and-branch. 
When we have an addition followed -;; by a comparison of the result against zero, we can output the addition -;; in such a way that SREG.N and SREG.Z are set according to the result. - -;; { -1, +1 } for QImode, otherwise the empty set. -(define_mode_attr p1m1 [(QI "N P") - (HI "Yxx") (PSI "Yxx") (SI "Yxx")]) - -;; FIXME: reload1.cc::do_output_reload() does not support output reloads -;; for JUMP_INSNs, hence letting combine doing decrement-and-branch like -;; the following might run into ICE. Doing reloads by hand is too painful... -; -; (define_insn_and_split "*add.for.eqne.<mode>.cbranch" -; [(set (pc) -; (if_then_else (eqne (match_operand:QISI 1 "register_operand" "0") -; (match_operand:QISI 2 "const_int_operand" "n")) -; (label_ref (match_operand 4)) -; (pc))) -; (set (match_operand:QISI 0 "register_operand" "=r") -; (plus:QISI (match_dup 1) -; (match_operand:QISI 3 "const_int_operand" "n")))] -; ;; No clobber for now as combine might not have one handy. -; ;; We pop a scatch in split1. -; "!reload_completed -; && const0_rtx == simplify_binary_operation (PLUS, <MODE>mode, -; operands[2], operands[3])" -; { gcc_unreachable(); } -; "&& 1" -; [(parallel [(set (pc) -; (if_then_else (eqne (match_dup 1) -; (match_dup 2)) -; (label_ref (match_dup 4)) -; (pc))) -; (set (match_dup 0) -; (plus:QISI (match_dup 1) -; (match_dup 3))) -; (clobber (scratch:QI))])]) -; -;; ...Hence, stick with RTL peepholes for now. Unfortunately, there is no -;; canonical form, and if reload shuffles registers around, we might miss -;; opportunities to match a decrement-and-branch. -;; doloop_end doesn't reload either, so doloop_end also won't work. - -(define_expand "gen_add_for_<code>_<mode>" - ; "*add.for.eqne.<mode>" - [(parallel [(set (reg:CC REG_CC) - (compare:CC (plus:QISI (match_operand:QISI 0 "register_operand") - (match_operand:QISI 1 "const_int_operand")) - (const_int 0))) - (set (match_dup 0) - (plus:QISI (match_dup 0) - (match_dup 1))) - (clobber (match_operand:QI 3))]) - ; "branch" - (set (pc) - (if_then_else (eqne (reg:CC REG_CC) - (const_int 0)) - (label_ref (match_dup 2)) - (pc)))]) - - -;; 1/3: A version without clobber: d-reg or 8-bit adds +/-1. -(define_peephole2 - [(parallel [(set (match_operand:QISI 0 "register_operand") - (plus:QISI (match_dup 0) - (match_operand:QISI 1 "const_int_operand"))) - (clobber (reg:CC REG_CC))]) - (set (reg:CC REG_CC) - (compare:CC (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (eqne (reg:CC REG_CC) - (const_int 0)) - (label_ref (match_operand 2)) - (pc)))] - "peep2_regno_dead_p (3, REG_CC) - && (d_register_operand (operands[0], <MODE>mode) - || (<MODE>mode == QImode - && (INTVAL (operands[1]) == 1 - || INTVAL (operands[1]) == -1)))" - [(scratch)] - { - emit (gen_gen_add_for_<code>_<mode> (operands[0], operands[1], operands[2], - gen_rtx_SCRATCH (QImode))); - DONE; - }) - -;; 2/3: A version with clobber from the insn. -(define_peephole2 - [(parallel [(set (match_operand:QISI 0 "register_operand") - (plus:QISI (match_dup 0) - (match_operand:QISI 1 "const_int_operand"))) - (clobber (match_operand:QI 3 "scratch_or_d_register_operand")) - (clobber (reg:CC REG_CC))]) - (parallel [(set (reg:CC REG_CC) - (compare:CC (match_dup 0) - (const_int 0))) - (clobber (match_operand:QI 4 "scratch_or_d_register_operand"))]) - (set (pc) - (if_then_else (eqne (reg:CC REG_CC) - (const_int 0)) - (label_ref (match_operand 2)) - (pc)))] - "peep2_regno_dead_p (3, REG_CC)" - [(scratch)] - { - rtx scratch = REG_P (operands[3]) ? 
operands[3] : operands[4]; - - // We need either a d-register or a scratch register to clobber. - if (! REG_P (scratch) - && ! d_register_operand (operands[0], <MODE>mode) - && ! (QImode == <MODE>mode - && (INTVAL (operands[1]) == 1 - || INTVAL (operands[1]) == -1))) - { - FAIL; - } - emit (gen_gen_add_for_<code>_<mode> (operands[0], operands[1], operands[2], - scratch)); - DONE; - }) - -;; 3/3 A version with a clobber from peephole2. -(define_peephole2 - [(match_scratch:QI 3 "d") - (parallel [(set (match_operand:QISI 0 "register_operand") - (plus:QISI (match_dup 0) - (match_operand:QISI 1 "const_int_operand"))) - (clobber (reg:CC REG_CC))]) - (set (reg:CC REG_CC) - (compare:CC (match_dup 0) - (const_int 0))) - (set (pc) - (if_then_else (eqne (reg:CC REG_CC) - (const_int 0)) - (label_ref (match_operand 2)) - (pc)))] - "peep2_regno_dead_p (3, REG_CC)" - [(scratch)] - { - emit (gen_gen_add_for_<code>_<mode> (operands[0], operands[1], operands[2], - operands[3])); - DONE; - }) - -;; Result of the above three peepholes is an addition that also -;; performs an EQ or NE comparison (of the result) against zero. -;; FIXME: Using (match_dup 0) instead of operands[3/4] makes rnregs -;; barf in regrename.cc::merge_overlapping_regs(). For now, use the -;; fix from PR50788: Constrain as "0". -(define_insn "*add.for.eqne.<mode>" - [(set (reg:CC REG_CC) - (compare:CC - (plus:QISI (match_operand:QISI 3 "register_operand" "0,0 ,0") - (match_operand:QISI 1 "const_int_operand" "n,<p1m1>,n")) - (const_int 0))) - (set (match_operand:QISI 0 "register_operand" "=d,*r ,r") - (plus:QISI (match_operand:QISI 4 "register_operand" "0,0 ,0") - (match_dup 1))) - (clobber (match_scratch:QI 2 "=X,X ,&d"))] - "reload_completed" - { - return avr_out_plus_set_ZN (operands, nullptr); - } - [(set_attr "adjust_len" "add_set_ZN")]) - - ;; Swapping both comparison and branch condition. This can turn difficult ;; branches to easy ones. And in some cases, a comparison against one can ;; be turned into a comparison against zero. @@ -9749,7 +10032,7 @@ (pc)))] { rtx xval = avr_to_int_mode (operands[2]); - enum rtx_code code = GET_CODE (operands[0]); + rtx_code code = GET_CODE (operands[0]); if (code == GT && xval == const0_rtx) code = LT; @@ -9789,7 +10072,7 @@ (pc)))] { rtx xval = avr_to_int_mode (operands[2]); - enum rtx_code code = GET_CODE (operands[0]); + rtx_code code = GET_CODE (operands[0]); if (code == GT && xval == const0_rtx) code = LT; diff --git a/gcc/config/avr/avr.opt b/gcc/config/avr/avr.opt index 444ed7e..625323f 100644 --- a/gcc/config/avr/avr.opt +++ b/gcc/config/avr/avr.opt @@ -1,4 +1,4 @@ -; Options for the ATMEL AVR port of the compiler. +; Options for AVR 8-bit microcontrollers. ; Copyright (C) 2005-2024 Free Software Foundation, Inc. ; diff --git a/gcc/config/avr/avrlibc.h b/gcc/config/avr/avrlibc.h index fb4ffed..60fce70 100644 --- a/gcc/config/avr/avrlibc.h +++ b/gcc/config/avr/avrlibc.h @@ -1,5 +1,4 @@ -/* Definitions of target machine for the GNU compiler collection - for Atmel AVR micro controller if configured for AVR-Libc. +/* Definitions for AVR 8-bit microcontrollers if configured for AVR-LibC. Copyright (C) 2012-2024 Free Software Foundation, Inc. Contributed by Georg-Johann Lay (avr@gjlay.de) diff --git a/gcc/config/avr/constraints.md b/gcc/config/avr/constraints.md index 963e23a..9512302 100644 --- a/gcc/config/avr/constraints.md +++ b/gcc/config/avr/constraints.md @@ -1,4 +1,4 @@ -;; Constraint definitions for ATMEL AVR micro controllers. 
+;; Insn constraint definitions for AVR 8-bit microcontrollers. ;; Copyright (C) 2006-2024 Free Software Foundation, Inc. ;; ;; This file is part of GCC. diff --git a/gcc/config/avr/driver-avr.cc b/gcc/config/avr/driver-avr.cc index 92e875a..3eefcab 100644 --- a/gcc/config/avr/driver-avr.cc +++ b/gcc/config/avr/driver-avr.cc @@ -1,4 +1,4 @@ -/* Subroutines for the gcc driver. +/* Subroutines for the gcc driver for AVR 8-bit microcontrollers. Copyright (C) 2009-2024 Free Software Foundation, Inc. Contributed by Georg-Johann Lay <avr@gjlay.de> @@ -20,6 +20,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/avr/elf.h b/gcc/config/avr/elf.h index 0112aa3..1e769f6 100644 --- a/gcc/config/avr/elf.h +++ b/gcc/config/avr/elf.h @@ -1,4 +1,5 @@ -/* Copyright (C) 2011-2024 Free Software Foundation, Inc. +/* Overrides for elfos.h for AVR 8-bit microcontrollers. + Copyright (C) 2011-2024 Free Software Foundation, Inc. Contributed by Georg-Johann Lay (avr@gjlay.de) This file is part of GCC. @@ -7,12 +8,12 @@ it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. - + GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ diff --git a/gcc/config/avr/gen-avr-mmcu-specs.cc b/gcc/config/avr/gen-avr-mmcu-specs.cc index bb94bea..4bdc0c2 100644 --- a/gcc/config/avr/gen-avr-mmcu-specs.cc +++ b/gcc/config/avr/gen-avr-mmcu-specs.cc @@ -1,4 +1,5 @@ -/* Copyright (C) 1998-2024 Free Software Foundation, Inc. +/* Build device-specs for AVR 8-bit microcontrollers. + Copyright (C) 1998-2024 Free Software Foundation, Inc. Contributed by Joern Rennecke This file is part of GCC. @@ -7,12 +8,12 @@ it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. - + GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ diff --git a/gcc/config/avr/gen-avr-mmcu-texi.cc b/gcc/config/avr/gen-avr-mmcu-texi.cc index 70aa430..df2620f 100644 --- a/gcc/config/avr/gen-avr-mmcu-texi.cc +++ b/gcc/config/avr/gen-avr-mmcu-texi.cc @@ -1,4 +1,5 @@ -/* Copyright (C) 2012-2024 Free Software Foundation, Inc. +/* Build texi documentation for option -mmcu for AVR 8-bit microcontrollers. + Copyright (C) 2012-2024 Free Software Foundation, Inc. Contributed by Georg-Johann Lay (avr@gjlay.de) This file is part of GCC. @@ -7,12 +8,12 @@ it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. - + GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. - + You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ diff --git a/gcc/config/avr/predicates.md b/gcc/config/avr/predicates.md index 5b49481..c44ebff 100644 --- a/gcc/config/avr/predicates.md +++ b/gcc/config/avr/predicates.md @@ -1,4 +1,4 @@ -;; Predicate definitions for ATMEL AVR micro controllers. +;; Insn predicate definitions for AVR 8-bit microcontrollers. ;; Copyright (C) 2006-2024 Free Software Foundation, Inc. ;; ;; This file is part of GCC. @@ -147,6 +147,11 @@ (and (match_code "const_int") (match_test "IN_RANGE (INTVAL (op), 2, 7)"))) +;; Return true if OP is constant integer 1..3 for MODE. +(define_predicate "const_1_to_3_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 1, 3)"))) + ;; Return 1 if OP is constant integer 1..6 for MODE. (define_predicate "const_1_to_6_operand" (and (match_code "const_int") @@ -162,6 +167,12 @@ (and (match_code "const_int") (match_test "IN_RANGE (INTVAL (op), -255, -1)"))) +;; Return true if OP is a CONST_INT in { -2, -1, 1, 2 }. +(define_predicate "abs1_abs2_operand" + (and (match_code "const_int") + (match_test "INTVAL (op) != 0") + (match_test "IN_RANGE (INTVAL (op), -2, 2)"))) + ;; Returns true if OP is either the constant zero or a register. (define_predicate "reg_or_0_operand" (ior (match_operand 0 "register_operand") @@ -242,10 +253,30 @@ (and (match_operand 0 "comparison_operator") (not (match_code "gt,gtu,le,leu")))) +;; True for EQ, NE, GE, LT, GT, LE +(define_predicate "signed_comparison_operator" + (match_code "eq,ne,ge,lt,gt,le")) + ;; True for SIGN_EXTEND, ZERO_EXTEND. (define_predicate "extend_operator" (match_code "sign_extend,zero_extend")) +;; True for 8-bit operations that set SREG.N and SREG.Z in a +;; usable way: +;; * OP0 is a QImode register, and +;; * OP1 is a QImode register or CONST_INT, and +;; +;; the allowed operations is one of: +;; +;; * SHIFTs with a const_int offset in { 1, 2, 3 }. +;; * MINUS and XOR with a register operand +;; * IOR and AND with a register operand, or d-reg + const_int +;; * PLUS with a register operand, or d-reg + const_int, +;; or a const_int in { -2, -1, 1, 2 }. */ +(define_predicate "op8_ZN_operator" + (and (match_code "plus,minus,ashift,ashiftrt,lshiftrt,and,ior,xor") + (match_test "avr_op8_ZN_operator (op)"))) + ;; Return true if OP is a valid call operand. (define_predicate "call_insn_operand" (and (match_code "mem") diff --git a/gcc/config/avr/ranges.h b/gcc/config/avr/ranges.h new file mode 100644 index 0000000..89f6896 --- /dev/null +++ b/gcc/config/avr/ranges.h @@ -0,0 +1,278 @@ +/* Subsets of a finite interval over Z. + Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +/* A class that represents the union of finitely many intervals. 
+ The domain over which the intervals are defined is a finite integer + interval [m_min, m_max], usually the range of some [u]intN_t. + Supported operations are: + - Complement w.r.t. the domain (invert) + - Union (union_) + - Intersection (intersect) + - Difference / Setminus (minus). + Ranges is closed under all operations: The result of all operations + is a Ranges over the same domain. (As opposed to value-range.h which + may ICE for some operations, see below). + + The representation is unique in the sense that when we have two + Ranges A and B, then + 1) A == B <==> A.size == B.size && Ai == Bi for all i. + + The representation is normalized: + 2) Ai != {} ;; There are no empty intervals. + 3) Ai.hi < A{i+1}.lo ;; The Ai's are in increasing order and separated + ;; by at least one value (non-adjacent). + The sub-intervals Ai are maintained as a std::vector. + The computation of union and intersection scales like A.size * B.size + i.e. Ranges is only eligible for GCC when size() has a fixed upper + bound independent of the program being compiled (or there are other + means to guarantee that the complexity is linearistic). + In the context of AVR, we have size() <= 3. + + The reason why we don't use value-range.h's irange or int_range is that + these use the integers Z as their domain, which makes computations like + invert() quite nasty as they may ICE for common cases. Doing all + these special cases (like one sub-interval touches the domain bounds) + makes using value-range.h more laborious (and instable) than using our + own mini Ranger. */ + +struct Ranges +{ + // This is good enough as it covers (un)signed SImode. + using T = HOST_WIDE_INT; + typedef T scalar_type; + + // Non-empty ranges. Empty sets are only used transiently; + // Ranges.ranges[] doesn't use them. + struct SubRange + { + // Lower and upper bound, inclusively. + T lo, hi; + + SubRange intersect (const SubRange &r) const + { + if (lo >= r.lo && hi <= r.hi) + return *this; + else if (r.lo >= lo && r.hi <= hi) + return r; + else if (lo > r.hi || hi < r.lo) + return SubRange { 1, 0 }; + else + return SubRange { std::max (lo, r.lo), std::min (hi, r.hi) }; + } + + T cardinality () const + { + return std::max<T> (0, hi - lo + 1); + } + }; + + // Finitely many intervals over [m_min, m_max] that are normalized: + // No empty sets, increasing order, separated by at least one value. + T m_min, m_max; + std::vector<SubRange> ranges; + + // Not used anywhere in Ranges; can be used elsewhere. + // May be clobbered by set operations. + int tag = -1; + + enum initial_range { EMPTY, ALL }; + + Ranges (T mi, T ma, initial_range ir) + : m_min (mi), m_max (ma) + { + if (ir == ALL) + push (mi, ma); + } + + // Domain is the range of some [u]intN_t. + static Ranges NBitsRanges (int n_bits, bool unsigned_p, initial_range ir) + { + T mask = ((T) 1 << n_bits) - 1; + gcc_assert (mask > 0); + T ma = mask >> ! unsigned_p; + return Ranges (unsigned_p ? 0 : -ma - 1, ma, ir); + } + + static void sort2 (Ranges &a, Ranges &b) + { + if (a.size () && b.size ()) + if (a.ranges[0].lo > b.ranges[0].lo) + std::swap (a, b); + } + + void print (FILE *file) const + { + if (file) + { + fprintf (file, " .tag%d=#%d={", tag, size ()); + for (const auto &r : ranges) + fprintf (file, "[ %ld, %ld ]", (long) r.lo, (long) r.hi); + fprintf (file, "}\n"); + } + } + + // The number of sub-intervals in .ranges. + int size () const + { + return (int) ranges.size (); + } + + // Append [LO, HI] & [m_min, m_max] to .ranges provided the + // former is non-empty. 
+ void push (T lo, T hi) + { + lo = std::max (lo, m_min); + hi = std::min (hi, m_max); + + if (lo <= hi) + ranges.push_back (SubRange { lo, hi }); + } + + // Append R to .ranges provided the former is non-empty. + void push (const SubRange &r) + { + push (r.lo, r.hi); + } + + // Cardinality of the n-th interval. + T cardinality (int n) const + { + return n < size () ? ranges[n].cardinality () : 0; + } + + // Check that *this is normalized: .ranges are non-empty, non-overlapping, + // non-adjacent and increasing. + bool check () const + { + bool bad = size () && (ranges[0].lo < m_min + || ranges[size () - 1].hi > m_max); + + for (int n = 0; n < size (); ++n) + { + bad |= ranges[n].lo > ranges[n].hi; + bad |= n > 0 && ranges[n - 1].hi >= ranges[n].lo; + } + + if (bad) + print (dump_file); + + return ! bad; + } + + // Intersect A and B according to (U Ai) & (U Bj) = U (Ai & Bj) + // This has quadratic complexity, but also the nice property that + // when A and B are normalized, then the result is too. + void intersect (const Ranges &r) + { + gcc_assert (m_min == r.m_min && m_max == r.m_max); + + if (this == &r) + return; + + std::vector<SubRange> rs; + std::swap (rs, ranges); + + for (const auto &a : rs) + for (const auto &b : r.ranges) + push (a.intersect (b)); + } + + // Complement w.r.t. the domain [m_min, m_max]. + void invert () + { + std::vector<SubRange> rs; + std::swap (rs, ranges); + + if (rs.size () == 0) + push (m_min, m_max); + else + { + push (m_min, rs[0].lo - 1); + + for (size_t n = 1; n < rs.size (); ++n) + push (rs[n - 1].hi + 1, rs[n].lo - 1); + + push (rs[rs.size () - 1].hi + 1, m_max); + } + } + + // Set-minus. + void minus (const Ranges &r) + { + gcc_assert (m_min == r.m_min && m_max == r.m_max); + + Ranges sub = r; + sub.invert (); + intersect (sub); + } + + // Union of sets. Not needed in avr.cc but added for completeness. + // DeMorgan this for simplicity. + void union_ (const Ranges &r) + { + gcc_assert (m_min == r.m_min && m_max == r.m_max); + + if (this != &r) + { + invert (); + minus (r); + invert (); + } + } + + // Get the truth Ranges for x <cmp> val. For example, + // LT 3 will return [m_min, 2]. + Ranges truth (rtx_code cmp, T val, bool strict = true) + { + if (strict) + { + if (avr_strict_signed_p (cmp)) + gcc_assert (m_min == -m_max - 1); + else if (avr_strict_unsigned_p (cmp)) + gcc_assert (m_min == 0); + + gcc_assert (IN_RANGE (val, m_min, m_max)); + } + + bool rev = cmp == NE || cmp == LTU || cmp == LT || cmp == GTU || cmp == GT; + if (rev) + cmp = reverse_condition (cmp); + + T lo = m_min; + T hi = m_max; + + if (cmp == EQ) + lo = hi = val; + else if (cmp == LEU || cmp == LE) + hi = val; + else if (cmp == GEU || cmp == GE) + lo = val; + else + gcc_unreachable (); + + Ranges rs (m_min, m_max, Ranges::EMPTY); + rs.push (lo, hi); + + if (rev) + rs.invert (); + + return rs; + } + +}; // struct Ranges diff --git a/gcc/config/avr/specs.h b/gcc/config/avr/specs.h index 0ccc37b..a1e2c38 100644 --- a/gcc/config/avr/specs.h +++ b/gcc/config/avr/specs.h @@ -1,4 +1,4 @@ -/* Specs definitions for Atmel AVR back end. +/* Specs definitions for AVR 8-bit microcontrollers. Copyright (C) 2012-2024 Free Software Foundation, Inc. 
Contributed by Georg-Johann Lay (avr@gjlay.de) diff --git a/gcc/config/avr/stdfix.h b/gcc/config/avr/stdfix.h index e130a26..9a594f9 100644 --- a/gcc/config/avr/stdfix.h +++ b/gcc/config/avr/stdfix.h @@ -108,12 +108,12 @@ typedef long long unsigned int uint_uk_t; /* The Embedded-C paper specifies results only for rounding points 0 < RP < FBIT - + As an extension, the following functions work as expected with rounding points -IBIT < RP < FBIT - + For example, rounding an accum with a rounding point of -1 will result in an even integer value. */ diff --git a/gcc/config/avr/t-avr b/gcc/config/avr/t-avr index 449512a..3da1328 100644 --- a/gcc/config/avr/t-avr +++ b/gcc/config/avr/t-avr @@ -59,8 +59,14 @@ avr-log.o: $(srcdir)/config/avr/avr-log.cc \ $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TREE_H) $(INPUT_H) dumpfile.h $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $< +avr-passes.o: $(srcdir)/config/avr/avr-passes.cc \ + $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TREE_H) $(INPUT_H) + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) $< + avr.o avr-c.o: $(srcdir)/config/avr/builtins.def +avr-passes.o: $(srcdir)/config/avr/ranges.h + # This overrides stdfix.h from USER_H which we supply and include # in our own stdfix.h as stdfix-gcc.h. diff --git a/gcc/config/bfin/bfin-protos.h b/gcc/config/bfin/bfin-protos.h index bd49329..053fc4a 100644 --- a/gcc/config/bfin/bfin-protos.h +++ b/gcc/config/bfin/bfin-protos.h @@ -71,7 +71,7 @@ extern char *bfin_asm_long (void); extern char *bfin_asm_short (void); extern int log2constp (unsigned HOST_WIDE_INT); -extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx); +extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx); extern HOST_WIDE_INT bfin_initial_elimination_offset (int, int); extern int effective_address_32bit_p (rtx, machine_mode); diff --git a/gcc/config/bfin/bfin.cc b/gcc/config/bfin/bfin.cc index 3e40f2c..13d2e10 100644 --- a/gcc/config/bfin/bfin.cc +++ b/gcc/config/bfin/bfin.cc @@ -97,14 +97,14 @@ bfin_globalize_label (FILE *stream, const char *name) fputc ('\n',stream); } -static void -output_file_start (void) +static void +output_file_start (void) { FILE *file = asm_out_file; int i; fprintf (file, ".file \"%s\";\n", LOCATION_FILE (input_location)); - + for (i = 0; arg_regs[i] >= 0; i++) ; max_arg_registers = i; /* how many arg reg used */ @@ -417,7 +417,7 @@ expand_prologue_reg_save (rtx spreg, int saveall, bool is_inthandler) } } for (i = REG_P7 + 1; i < REG_CC; i++) - if (saveall + if (saveall || (is_inthandler && (df_regs_ever_live_p (i) || (!leaf_function_p () && call_used_or_fixed_reg_p (i))))) @@ -548,7 +548,7 @@ expand_epilogue_reg_restore (rtx spreg, bool saveall, bool is_inthandler) it. Normally, this macro will push all remaining incoming registers on the - stack and set PRETEND_SIZE to the length of the registers pushed. + stack and set PRETEND_SIZE to the length of the registers pushed. Blackfin specific : - VDSP C compiler manual (our ABI) says that a variable args function @@ -590,7 +590,7 @@ setup_incoming_varargs (cumulative_args_t cum, be accessed via the stack pointer) in functions that seem suitable. 
*/ static bool -bfin_frame_pointer_required (void) +bfin_frame_pointer_required (void) { e_funkind fkind = funkind (TREE_TYPE (current_function_decl)); @@ -906,7 +906,7 @@ do_unlink (rtx spreg, HOST_WIDE_INT frame_size, bool all, int epilogue_p) if (stack_frame_needed_p ()) emit_insn (gen_unlink ()); - else + else { rtx postinc = gen_rtx_MEM (Pmode, gen_rtx_POST_INC (Pmode, spreg)); @@ -968,7 +968,7 @@ expand_interrupt_handler_prologue (rtx spreg, e_funkind fkind, bool all) emit_insn (gen_movsi_low (p5reg, p5reg, chipid)); emit_insn (gen_dummy_load (p5reg, bfin_cc_rtx)); } - + if (lookup_attribute ("nesting", attrs)) { rtx srcreg = gen_rtx_REG (Pmode, ret_regs[fkind]); @@ -1046,7 +1046,7 @@ bfin_load_pic_reg (rtx dest) pic reg, since the caller always passes a usable one. */ if (local_info_node && local_info_node->local) return pic_offset_table_rtx; - + if (OPTION_SET_P (bfin_library_id)) addr = plus_constant (Pmode, pic_offset_table_rtx, -4 - bfin_library_id * 4); @@ -1236,7 +1236,7 @@ bfin_delegitimize_address (rtx orig_x) 32-bit instruction. */ int -effective_address_32bit_p (rtx op, machine_mode mode) +effective_address_32bit_p (rtx op, machine_mode mode) { HOST_WIDE_INT offset; @@ -1312,7 +1312,7 @@ print_address_operand (FILE *file, rtx x) case PRE_DEC: fprintf (file, "--"); - output_address (VOIDmode, XEXP (x, 0)); + output_address (VOIDmode, XEXP (x, 0)); break; case POST_INC: output_address (VOIDmode, XEXP (x, 0)); @@ -1390,7 +1390,7 @@ print_operand (FILE *file, rtx x, char code) output_operand_lossage ("invalid %%j value"); } break; - + case 'J': /* reverse logic */ switch (GET_CODE(x)) { @@ -1491,7 +1491,7 @@ print_operand (FILE *file, rtx x, char code) else output_operand_lossage ("invalid operand for code '%c'", code); } - else + else fprintf (file, "%s", reg_names[REGNO (x)]); break; @@ -1620,7 +1620,7 @@ print_operand (FILE *file, rtx x, char code) /* Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function whose data type is FNTYPE. - For a library call, FNTYPE is 0. + For a library call, FNTYPE is 0. VDSP C Compiler manual, our ABI says that first 3 words of arguments will use R0, R1 and R2. */ @@ -1718,7 +1718,7 @@ bfin_arg_partial_bytes (cumulative_args_t cum, const function_arg_info &arg) { int bytes = arg.promoted_size_in_bytes (); int bytes_left = get_cumulative_args (cum)->nregs * UNITS_PER_WORD; - + if (bytes == -1) return 0; @@ -1759,7 +1759,7 @@ bfin_struct_value_rtx (tree fntype ATTRIBUTE_UNUSED, /* Return true when register may be used to pass function parameters. */ -bool +bool function_arg_regno_p (int n) { int i; @@ -2701,7 +2701,7 @@ bfin_valid_reg_p (unsigned int regno, int strict, machine_mode mode, /* Recognize an RTL expression that is a valid memory address for an instruction. The MODE argument is the machine mode for the MEM expression - that wants to use this address. + that wants to use this address. 
Blackfin addressing modes are as follows: @@ -2710,7 +2710,7 @@ bfin_valid_reg_p (unsigned int regno, int strict, machine_mode mode, B [ Preg + uimm15 ] W [ Preg + uimm16m2 ] - [ Preg + uimm17m4 ] + [ Preg + uimm17m4 ] [preg++] [preg--] @@ -2888,8 +2888,8 @@ bfin_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, else *total = cost2; return true; - - case ASHIFT: + + case ASHIFT: case ASHIFTRT: case LSHIFTRT: if (mode == DImode) @@ -2904,7 +2904,7 @@ bfin_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, *total += rtx_cost (op0, mode, code, 0, speed); return true; - + case IOR: case AND: case XOR: @@ -3152,11 +3152,11 @@ output_push_multiple (rtx insn, rtx *operands) { char buf[80]; int ok; - + /* Validate the insn again, and compute first_[dp]reg_to_save. */ ok = analyze_push_multiple_operation (PATTERN (insn)); gcc_assert (ok); - + if (first_dreg_to_save == 8) sprintf (buf, "[--sp] = ( p5:%d );\n", first_preg_to_save); else if (first_preg_to_save == 6) @@ -3176,7 +3176,7 @@ output_pop_multiple (rtx insn, rtx *operands) { char buf[80]; int ok; - + /* Validate the insn again, and compute first_[dp]reg_to_save. */ ok = analyze_pop_multiple_operation (PATTERN (insn)); gcc_assert (ok); @@ -3856,7 +3856,7 @@ static void hwloop_fail (hwloop_info loop) { rtx insn = loop->loop_end; - + if (DPREG_P (loop->iter_reg)) { /* If loop->iter_reg is a DREG or PREG, we can split it here @@ -3880,7 +3880,7 @@ hwloop_fail (hwloop_info loop) } else { - splitting_loops = 1; + splitting_loops = 1; try_split (PATTERN (insn), safe_as_a <rtx_insn *> (insn), 1); splitting_loops = 0; } @@ -4132,7 +4132,7 @@ workaround_rts_anomaly (void) if (BARRIER_P (insn)) return; - + if (NOTE_P (insn) || LABEL_P (insn)) continue; @@ -4286,7 +4286,7 @@ indirect_call_p (rtx pat) pat = XEXP (pat, 0); gcc_assert (GET_CODE (pat) == MEM); pat = XEXP (pat, 0); - + return REG_P (pat); } @@ -4329,7 +4329,7 @@ workaround_speculation (void) int delay_needed = 0; next = find_next_insn_start (insn); - + if (NOTE_P (insn) || BARRIER_P (insn)) continue; if (JUMP_TABLE_DATA_P (insn)) @@ -4344,7 +4344,7 @@ workaround_speculation (void) pat = PATTERN (insn); if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER) continue; - + if (GET_CODE (pat) == ASM_INPUT || asm_noperands (pat) >= 0) { np_check_regno = -1; @@ -4603,7 +4603,7 @@ add_sched_insns_for_speculation (void) if (GET_CODE (PATTERN (next)) == UNSPEC_VOLATILE && get_attr_type (next) == TYPE_STALL) continue; - emit_insn_before (gen_stall (GEN_INT (1)), next); + emit_insn_before (gen_stall (GEN_INT (1)), next); } } } @@ -4719,7 +4719,7 @@ bfin_comp_type_attributes (const_tree type1, const_tree type2) if (kind1 != kind2) return 0; - + /* Check for mismatched modifiers */ if (!lookup_attribute ("nesting", TYPE_ATTRIBUTES (type1)) != !lookup_attribute ("nesting", TYPE_ATTRIBUTES (type2))) @@ -4744,9 +4744,9 @@ bfin_comp_type_attributes (const_tree type1, const_tree type2) struct attribute_spec.handler. */ static tree -bfin_handle_longcall_attribute (tree *node, tree name, - tree args ATTRIBUTE_UNUSED, - int flags ATTRIBUTE_UNUSED, +bfin_handle_longcall_attribute (tree *node, tree name, + tree args ATTRIBUTE_UNUSED, + int flags ATTRIBUTE_UNUSED, bool *no_add_attrs) { if (TREE_CODE (*node) != FUNCTION_TYPE @@ -5154,7 +5154,7 @@ bfin_init_builtins (void) = build_function_type_list (integer_type_node, build_pointer_type (integer_type_node), NULL_TREE); - + /* Add the remaining MMX insns with somewhat more complicated types. 
*/ def_builtin ("__builtin_bfin_csync", void_ftype_void, BFIN_BUILTIN_CSYNC); def_builtin ("__builtin_bfin_ssync", void_ftype_void, BFIN_BUILTIN_SSYNC); @@ -5746,7 +5746,7 @@ bfin_conditional_register_usage (void) #define TARGET_EXPAND_BUILTIN bfin_expand_builtin #undef TARGET_ASM_GLOBALIZE_LABEL -#define TARGET_ASM_GLOBALIZE_LABEL bfin_globalize_label +#define TARGET_ASM_GLOBALIZE_LABEL bfin_globalize_label #undef TARGET_ASM_FILE_START #define TARGET_ASM_FILE_START output_file_start diff --git a/gcc/config/bfin/bfin.h b/gcc/config/bfin/bfin.h index e957c31..ef0ba70 100644 --- a/gcc/config/bfin/bfin.h +++ b/gcc/config/bfin/bfin.h @@ -295,10 +295,10 @@ extern const char *bfin_library_id_string; /* Define this if the above stack space is to be considered part of the * space allocated by the caller. */ #define OUTGOING_REG_PARM_STACK_SPACE(FNTYPE) 1 - + /* Define this if the maximum size of all the outgoing args is to be accumulated and pushed during the prologue. The amount can be - found in the variable crtl->outgoing_args_size. */ + found in the variable crtl->outgoing_args_size. */ #define ACCUMULATE_OUTGOING_ARGS 1 /*#define DATA_ALIGNMENT(TYPE, BASIC-ALIGN) for arrays.. */ @@ -876,11 +876,11 @@ typedef struct { #define DEFAULT_SIGNED_CHAR 1 /* FLOAT_TYPE_SIZE get poisoned, so add BFIN_ prefix. */ #define BFIN_FLOAT_TYPE_SIZE BITS_PER_WORD -#define SHORT_TYPE_SIZE 16 +#define SHORT_TYPE_SIZE 16 #define CHAR_TYPE_SIZE 8 #define INT_TYPE_SIZE 32 #define LONG_TYPE_SIZE 32 -#define LONG_LONG_TYPE_SIZE 64 +#define LONG_LONG_TYPE_SIZE 64 /* Note: Fix this to depend on target switch. -- lev */ @@ -943,7 +943,7 @@ typedef struct { #define JUMP_TABLES_IN_TEXT_SECTION flag_pic /* Define if operations between registers always perform the operation - on the full register even if a narrower mode is specified. + on the full register even if a narrower mode is specified. #define WORD_REGISTER_OPERATIONS 1 */ @@ -1095,7 +1095,7 @@ extern rtx bfin_cc_rtx, bfin_rets_rtx; #define SET_ASM_OP ".set " /* Debugger register number for a given compiler register number */ -#define DEBUGGER_REGNO(REGNO) (REGNO) +#define DEBUGGER_REGNO(REGNO) (REGNO) #define SIZE_ASM_OP "\t.size\t" diff --git a/gcc/config/bpf/bpf.cc b/gcc/config/bpf/bpf.cc index 2051fa5..aa00d14 100644 --- a/gcc/config/bpf/bpf.cc +++ b/gcc/config/bpf/bpf.cc @@ -19,6 +19,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/bpf/btfext-out.cc b/gcc/config/bpf/btfext-out.cc index b3df7b5..ca6241a 100644 --- a/gcc/config/bpf/btfext-out.cc +++ b/gcc/config/bpf/btfext-out.cc @@ -19,6 +19,7 @@ #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/bpf/core-builtins.cc b/gcc/config/bpf/core-builtins.cc index 86e2e9d..deb368a 100644 --- a/gcc/config/bpf/core-builtins.cc +++ b/gcc/config/bpf/core-builtins.cc @@ -19,6 +19,7 @@ along with GCC; see the file COPYING3. 
If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/c6x/c6x.cc b/gcc/config/c6x/c6x.cc index 4ea3a1e..20a1666 100644 --- a/gcc/config/c6x/c6x.cc +++ b/gcc/config/c6x/c6x.cc @@ -6398,7 +6398,7 @@ c6x_init_builtins (void) tree v2si_ftype_v2hi_v2hi = build_function_type_list (V2SI_type_node, V2HI_type_node, V2HI_type_node, NULL_TREE); - + def_builtin ("__builtin_c6x_sadd", int_ftype_int_int, C6X_BUILTIN_SADD); def_builtin ("__builtin_c6x_ssub", int_ftype_int_int, diff --git a/gcc/config/c6x/c6x.md b/gcc/config/c6x/c6x.md index 5964dd6..ea9ffe8 100644 --- a/gcc/config/c6x/c6x.md +++ b/gcc/config/c6x/c6x.md @@ -3082,7 +3082,7 @@ ;; Widening vector multiply and dot product. ;; See c6x-mult.md.in for the define_insn patterns -(define_expand "sdot_prodv2hi" +(define_expand "sdot_prodsiv2hi" [(match_operand:SI 0 "register_operand" "") (match_operand:V2HI 1 "register_operand" "") (match_operand:V2HI 2 "register_operand" "") diff --git a/gcc/config/cris/cris.cc b/gcc/config/cris/cris.cc index 617fc0a..8173f85 100644 --- a/gcc/config/cris/cris.cc +++ b/gcc/config/cris/cris.cc @@ -2279,7 +2279,7 @@ cris_side_effect_mode_ok (enum rtx_code code, rtx *ops, /* Queue an .ident string in the queue of top-level asm statements. If the front-end is done, we must be being called from toplev.cc. In that case, do nothing. */ -void +void cris_asm_output_ident (const char *string) { if (symtab->state != PARSING) @@ -3597,7 +3597,7 @@ cris_promote_function_mode (const_tree type ATTRIBUTE_UNUSED, if (for_return == 1) return mode; return CRIS_PROMOTED_MODE (mode, *punsignedp, type); -} +} /* Atomic types require alignment to be at least their "natural" size. */ diff --git a/gcc/config/cris/cris.md b/gcc/config/cris/cris.md index c15395b..55f4d10 100644 --- a/gcc/config/cris/cris.md +++ b/gcc/config/cris/cris.md @@ -2418,7 +2418,7 @@ (pc)))] "reload_completed" { - return <MODE>mode == CC_NZmode ? "b<oCC> %l0%#": "b<CC> %l0%#"; + return <MODE>mode == CC_NZmode ? "b<oCC> %l0%#" : "b<CC> %l0%#"; } [(set_attr "slottable" "has_slot")]) @@ -3024,6 +3024,7 @@ ;; Re-compose a decomposed "indirect offset" address for a szext ;; operation. The non-clobbering "addi" is generated by LRA. ;; This and lra_szext_decomposed is covered by cris/rld-legit1.c. +;; (Unfortunately not true when enabling late-combine.) (define_peephole2 ; lra_szext_decomposed_indirect_with_offset [(parallel [(set (match_operand:SI 0 "register_operand") @@ -3046,6 +3047,50 @@ (mem:BW2 (plus:SI (szext:SI (mem:BW (match_dup 1))) (match_dup 2))))) (clobber (reg:CC CRIS_CC0_REGNUM))])]) +;; When enabling late-combine, we get a slightly changed register +;; allocation. The two allocations for the pseudo-registers involved +;; in the matching pattern get "swapped" and the (plus ...) in the +;; pattern above is now a load from a stack-slot. If peephole2 is +;; disabled, we see that the original sequence is actually improved; +;; one less incoming instruction, a load. We need to "undo" that +;; improvement a bit and move that load "back" to before the sequence +;; we combine in lra_szext_decomposed_indirect_with_offset. But that +;; changed again, so there's no define_peephole2 for that sequence +;; here, because it'd be hard or impossible to write a matching +;; test-case. A few commits later, the incoming pattern sequence has +;; changed again: back to the original but with the (plus...) part of +;; the address inside the second memory reference. 
+;; Coverage: cris/rld-legit1.c@r15-1880-gce34fcc572a0dc or +;; r15-3386-gaf1500dd8c00 when adding -flate-combine-instructions. + +(define_peephole2 ; lra_szext_decomposed_indir_plus + [(parallel + [(set (match_operand:SI 0 "register_operand") + (sign_extend:SI (mem:BW (match_operand:SI 1 "register_operand")))) + (clobber (reg:CC CRIS_CC0_REGNUM))]) + (parallel + [(set (match_operand:SI 3 "register_operand") + (szext:SI (mem:BW2 (plus:SI + (match_operand:SI 4 "register_operand") + (match_operand:SI 2 "register_operand"))))) + (clobber (reg:CC CRIS_CC0_REGNUM))])] + "(REGNO (operands[0]) == REGNO (operands[3]) + || peep2_reg_dead_p (3, operands[0])) + && (REGNO (operands[0]) == REGNO (operands[1]) + || peep2_reg_dead_p (3, operands[0])) + && (rtx_equal_p (operands[2], operands[0]) + || rtx_equal_p (operands[4], operands[0]))" + [(parallel + [(set + (match_dup 3) + (szext:SI + (mem:BW2 (plus:SI (szext:SI (mem:BW (match_dup 1))) (match_dup 2))))) + (clobber (reg:CC CRIS_CC0_REGNUM))])] +{ + if (! rtx_equal_p (operands[4], operands[0])) + operands[2] = operands[4]; +}) + ;; Add operations with similar or same decomposed addresses here, when ;; encountered - but only when covered by mentioned test-cases for at ;; least one of the cases generalized in the pattern. diff --git a/gcc/config/darwin-c.cc b/gcc/config/darwin-c.cc index aaec1c3..69f3d0a 100644 --- a/gcc/config/darwin-c.cc +++ b/gcc/config/darwin-c.cc @@ -733,7 +733,7 @@ darwin_cpp_builtins (cpp_reader *pfile) /* Since we do not (at 4.6) support ObjC gc for the NeXT runtime, the following will cause a syntax error if one tries to compile gc attributed - items. However, without this, NeXT system headers cannot be parsed + items. However, without this, NeXT system headers cannot be parsed properly (on systems >= darwin 9). */ if (flag_objc_gc) { @@ -805,24 +805,24 @@ darwin_cfstring_ref_p (const_tree strp) return false; tn = TYPE_NAME (strp); - if (tn) + if (tn) tn = DECL_NAME (tn); - return (tn + return (tn && IDENTIFIER_POINTER (tn) && startswith (IDENTIFIER_POINTER (tn), "CFStringRef")); } /* At present the behavior of this is undefined and it does nothing. */ static void -darwin_check_cfstring_format_arg (tree ARG_UNUSED (format_arg), +darwin_check_cfstring_format_arg (tree ARG_UNUSED (format_arg), tree ARG_UNUSED (args_list)) { } /* The extra format types we recognize. */ EXPORTED_CONST format_kind_info darwin_additional_format_types[] = { - { "CFString", NULL, NULL, NULL, NULL, - NULL, NULL, + { "CFString", NULL, NULL, NULL, NULL, + NULL, NULL, FMT_FLAG_ARG_CONVERT|FMT_FLAG_PARSE_ARG_CONVERT_EXTERNAL, 0, 0, 0, 0, 0, 0, NULL, NULL } diff --git a/gcc/config/darwin-driver.cc b/gcc/config/darwin-driver.cc index eabe9bc..2aa0b0c 100644 --- a/gcc/config/darwin-driver.cc +++ b/gcc/config/darwin-driver.cc @@ -191,8 +191,8 @@ darwin_find_version_from_kernel (void) /* When running on a Darwin system and using that system's headers and libraries, default the -mmacosx-version-min flag to be the version - of the system on which the compiler is running. - + of the system on which the compiler is running. + When building cross or native cross compilers, default to the OSX version of the target (as provided by the most specific target header included in tm.h). This may be overidden by setting the flag explicitly @@ -287,7 +287,7 @@ darwin_driver_init (unsigned int *decoded_options_count, case OPT_arch: /* Support provision of a single -arch xxxx flag as a means of specifying the sub-target/multi-lib. Translate this into -m32/64 - as appropriate. 
*/ + as appropriate. */ if (!strcmp ((*decoded_options)[i].arg, "i386")) seenX86 = true; else if (!strcmp ((*decoded_options)[i].arg, "x86_64")) @@ -307,7 +307,7 @@ darwin_driver_init (unsigned int *decoded_options_count, * sizeof (struct cl_decoded_option))); } --i; - --*decoded_options_count; + --*decoded_options_count; break; case OPT_m32: @@ -370,7 +370,7 @@ darwin_driver_init (unsigned int *decoded_options_count, { if (seenX86_64 || seenM64) { - const char *op = (seenX86_64? "-arch x86_64": "-m64"); + const char *op = (seenX86_64 ? "-arch x86_64" : "-m64"); warning (0, "%qs conflicts with %<-arch i386%> (%qs ignored)", op, op); } @@ -384,7 +384,7 @@ darwin_driver_init (unsigned int *decoded_options_count, " (%<-m32%> ignored)"); if (! seenM64) /* Add -m64 if the User didn't. */ appendM64 = true; - } + } #elif DARWIN_PPC if (seenX86 || seenX86_64) warning (0, "this compiler does not support x86" @@ -393,7 +393,7 @@ darwin_driver_init (unsigned int *decoded_options_count, { if (seenPPC64 || seenM64) { - const char *op = (seenPPC64? "-arch ppc64": "-m64"); + const char *op = (seenPPC64 ? "-arch ppc64" : "-m64"); warning (0, "%qs conflicts with %<-arch ppc%> (%qs ignored)", op, op); } diff --git a/gcc/config/darwin-protos.h b/gcc/config/darwin-protos.h index b67e052..523d7db 100644 --- a/gcc/config/darwin-protos.h +++ b/gcc/config/darwin-protos.h @@ -59,7 +59,7 @@ extern void darwin_set_default_type_attributes (tree); extern int machopic_reloc_rw_mask (void); extern section *machopic_select_section (tree, int, unsigned HOST_WIDE_INT); -extern section *darwin_function_section (tree, enum node_frequency, bool, bool); +extern section *darwin_function_section (tree, enum node_frequency, bool, bool); extern section *darwin_tm_clone_table_section (void); extern void darwin_function_switched_text_sections (FILE *, tree, bool); @@ -105,11 +105,11 @@ extern void darwin_asm_declare_constant_name (FILE *, const char *, extern void darwin_output_aligned_bss (FILE *, tree, const char *, unsigned HOST_WIDE_INT, unsigned int); -extern void darwin_asm_output_aligned_decl_local (FILE *, tree, const char *, - unsigned HOST_WIDE_INT, +extern void darwin_asm_output_aligned_decl_local (FILE *, tree, const char *, + unsigned HOST_WIDE_INT, unsigned int); extern void darwin_asm_output_aligned_decl_common (FILE *, tree, const char *, - unsigned HOST_WIDE_INT, + unsigned HOST_WIDE_INT, unsigned int); extern bool darwin_binds_local_p (const_tree); diff --git a/gcc/config/darwin.cc b/gcc/config/darwin.cc index 9129378..ae821e3 100644 --- a/gcc/config/darwin.cc +++ b/gcc/config/darwin.cc @@ -18,6 +18,7 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -2403,7 +2404,7 @@ darwin_asm_declare_object_name (FILE *file, #ifdef DEBUG_DARWIN_MEM_ALLOCATORS fprintf (file, "# dadon: %s %s (%llu, %u) local %d weak %d" " stat %d com %d pub %d t-const %d t-ro %d init %lx\n", - xname, (TREE_CODE (decl) == VAR_DECL?"var":"const"), + xname, TREE_CODE (decl) == VAR_DECL ? 
"var" : "const", (unsigned long long)size, DECL_ALIGN (decl), local_def, DECL_WEAK (decl), TREE_STATIC (decl), DECL_COMMON (decl), TREE_PUBLIC (decl), TREE_CONSTANT (decl), TREE_READONLY (decl), @@ -2641,7 +2642,7 @@ darwin_emit_common (FILE *fp, const char *name, fputs ("\t.comm\t", fp); assemble_name (fp, name); fprintf (fp, "," HOST_WIDE_INT_PRINT_UNSIGNED, - emit_aligned_common?size:rounded); + emit_aligned_common ? size : rounded); if (l2align && emit_aligned_common) fprintf (fp, ",%u", l2align); fputs ("\n", fp); diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h index 3775990..d2a8061 100644 --- a/gcc/config/darwin.h +++ b/gcc/config/darwin.h @@ -648,6 +648,8 @@ extern GTY(()) int darwin_ms_struct; #define ASM_OPTIONS "%{v} %{w:-W} %{I*}" #endif +#define AS_NEEDS_DASH_FOR_PIPED_INPUT + /* Default Darwin ASM_SPEC, very simple. */ #define ASM_SPEC \ "%{static} -arch %(darwin_arch) " \ @@ -850,7 +852,7 @@ ASM_OPTIONS ASM_MMACOSX_VERSION_MIN_SPEC #define TARGET_ASM_DECLARE_CONSTANT_NAME darwin_asm_declare_constant_name /* Wrap new method names in quotes so the assembler doesn't gag. - Make Objective-C internal symbols local and in doing this, we need + Make Objective-C internal symbols local and in doing this, we need to accommodate the name mangling done by c++ on file scope locals. */ int darwin_label_is_anonymous_local_objc_name (const char *name); @@ -1207,7 +1209,7 @@ void add_framework_path (char *); #undef GTM_SELF_SPECS #define GTM_SELF_SPECS "" -/* Darwin disables section anchors by default. +/* Darwin disables section anchors by default. They should be enabled per arch where support exists in that arch. */ #define TARGET_ASM_OUTPUT_ANCHOR NULL #define DARWIN_SECTION_ANCHORS 0 @@ -1238,7 +1240,7 @@ extern void darwin_driver_init (unsigned int *,struct cl_decoded_option **); #undef STACK_CHECK_STATIC_BUILTIN #define STACK_CHECK_STATIC_BUILTIN 1 -/* When building cross-compilers (and native crosses) we shall default to +/* When building cross-compilers (and native crosses) we shall default to providing an osx-version-min of this unless overridden by the User. 10.5 is the only version that fully supports all our archs so that's the fall-back default. */ diff --git a/gcc/config/elfos.h b/gcc/config/elfos.h index 1338815..ea56a5c 100644 --- a/gcc/config/elfos.h +++ b/gcc/config/elfos.h @@ -43,10 +43,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #undef USER_LABEL_PREFIX #define USER_LABEL_PREFIX "" -/* The biggest alignment supported by ELF in bits. 32-bit ELF - supports section alignment up to (0x80000000 * 8), while - 64-bit ELF supports (0x8000000000000000 * 8). If this macro - is not defined, the default is the largest alignment supported +/* The biggest alignment supported by ELF in bits. 32-bit ELF + supports section alignment up to (0x80000000 * 8), while + 64-bit ELF supports (0x8000000000000000 * 8). If this macro + is not defined, the default is the largest alignment supported by 32-bit ELF and representable on a 32-bit host. Use this macro to limit the alignment which can be specified using the `__attribute__ ((aligned (N)))' construct. 
*/ diff --git a/gcc/config/epiphany/epiphany.cc b/gcc/config/epiphany/epiphany.cc index 79254c2..56e7cf9 100644 --- a/gcc/config/epiphany/epiphany.cc +++ b/gcc/config/epiphany/epiphany.cc @@ -830,7 +830,7 @@ epiphany_rtx_costs (rtx x, machine_mode mode, int outer_code, return false; } - + case SET: { rtx src = SET_SRC (x); diff --git a/gcc/config/epiphany/epiphany.md b/gcc/config/epiphany/epiphany.md index 395ddd5..af6cd39 100644 --- a/gcc/config/epiphany/epiphany.md +++ b/gcc/config/epiphany/epiphany.md @@ -2187,14 +2187,14 @@ if (epiphany_uninterruptible_p (current_function_decl) != target_uninterruptible) { - emit_insn (target_uninterruptible ? gen_gid (): gen_gie ()); + emit_insn (target_uninterruptible ? gen_gid () : gen_gie ()); emit_call_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, gen_rtx_CALL (VOIDmode, operands[0], operands[1]), gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (SImode, GPR_LR))))); - emit_insn (target_uninterruptible ? gen_gie (): gen_gid ()); + emit_insn (target_uninterruptible ? gen_gie () : gen_gid ()); DONE; } }) @@ -2225,13 +2225,13 @@ if (epiphany_uninterruptible_p (current_function_decl) != target_uninterruptible) { - emit_insn (target_uninterruptible ? gen_gid (): gen_gie ()); + emit_insn (target_uninterruptible ? gen_gid () : gen_gie ()); emit_call_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, gen_rtx_CALL (VOIDmode, operands[0], operands[1]), ret_rtx))); - emit_insn (target_uninterruptible ? gen_gie (): gen_gid ()); + emit_insn (target_uninterruptible ? gen_gie () : gen_gid ()); DONE; } }) @@ -2264,7 +2264,7 @@ if (epiphany_uninterruptible_p (current_function_decl) != target_uninterruptible) { - emit_insn (target_uninterruptible ? gen_gid (): gen_gie ()); + emit_insn (target_uninterruptible ? gen_gid () : gen_gie ()); emit_call_insn (gen_rtx_PARALLEL (VOIDmode, @@ -2273,7 +2273,7 @@ gen_rtx_CALL (VOIDmode, operands[1], operands[2])), gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (SImode, GPR_LR))))); - emit_insn (target_uninterruptible ? gen_gie (): gen_gid ()); + emit_insn (target_uninterruptible ? gen_gie () : gen_gid ()); DONE; } }) @@ -2307,7 +2307,7 @@ if (epiphany_uninterruptible_p (current_function_decl) != target_uninterruptible) { - emit_insn (target_uninterruptible ? gen_gid (): gen_gie ()); + emit_insn (target_uninterruptible ? gen_gid () : gen_gie ()); emit_call_insn (gen_rtx_PARALLEL (VOIDmode, @@ -2315,7 +2315,7 @@ (operands[0], gen_rtx_CALL (VOIDmode, operands[1], operands[2])), ret_rtx))); - emit_insn (target_uninterruptible ? gen_gie (): gen_gid ()); + emit_insn (target_uninterruptible ? gen_gie () : gen_gid ()); DONE; } }) diff --git a/gcc/config/fr30/fr30.cc b/gcc/config/fr30/fr30.cc index cf93cba..f45e8fc 100644 --- a/gcc/config/fr30/fr30.cc +++ b/gcc/config/fr30/fr30.cc @@ -18,7 +18,7 @@ along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ -/*{{{ Includes */ +/*{{{ Includes */ #define IN_TARGET_CODE 1 @@ -45,13 +45,13 @@ #include "target-def.h" /*}}}*/ -/*{{{ Function Prologues & Epilogues */ +/*{{{ Function Prologues & Epilogues */ /* The FR30 stack looks like this: Before call After call FP ->| | | | - +-----------------------+ +-----------------------+ high + +-----------------------+ +-----------------------+ high | | | | memory | local variables, | | local variables, | | reg save area, etc. | | reg save area, etc. 
| @@ -63,32 +63,32 @@ SP ->| do not fit in regs | | | +-----------------------+ +-----------------------+ | args that used to be | \ - | in regs; only created | | pretend_size - AP-> | for vararg funcs | / - +-----------------------+ - | | \ + | in regs; only created | | pretend_size + AP-> | for vararg funcs | / + +-----------------------+ + | | \ | register save area | | | | | +-----------------------+ | reg_size - | return address | | + | return address | | +-----------------------+ | FP ->| previous frame ptr | / - +-----------------------+ - | | \ - | local variables | | var_size - | | / - +-----------------------+ - | | \ + +-----------------------+ + | | \ + | local variables | | var_size + | | / + +-----------------------+ + | | \ low | room for args to | | - memory | other funcs called | | args_size + memory | other funcs called | | args_size | from this one | | - SP ->| | / - +-----------------------+ - + SP ->| | / + +-----------------------+ + Note, AP is a fake hard register. It will be eliminated in favor of SP or FP as appropriate. - Note, Some or all of the stack sections above may be omitted if they + Note, Some or all of the stack sections above may be omitted if they are not needed. */ /* Structure to be filled in by fr30_compute_frame_size() with register @@ -211,7 +211,7 @@ fr30_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to) } /* Returns the number of bytes offset between FROM_REG and TO_REG - for the current function. As a side effect it fills in the + for the current function. As a side effect it fills in the current_frame_info structure, if the data is available. */ unsigned int fr30_compute_frame_size (int from_reg, int to_reg) @@ -259,10 +259,10 @@ fr30_compute_frame_size (int from_reg, int to_reg) /* Calculate the required distance. */ return_value = 0; - + if (to_reg == STACK_POINTER_REGNUM) return_value += args_size + var_size; - + if (from_reg == ARG_POINTER_REGNUM) return_value += reg_size; @@ -292,7 +292,7 @@ fr30_expand_prologue (void) if (current_frame_info.pretend_size) { int regs_to_save = current_frame_info.pretend_size / UNITS_PER_WORD; - + /* Push argument registers into the pretend arg area. */ for (regno = FIRST_ARG_REGNUM + FR30_NUM_ARG_REGS; regno --, regs_to_save --;) { @@ -317,7 +317,7 @@ fr30_expand_prologue (void) /* Save return address if necessary. */ if (current_frame_info.save_rp) { - insn = emit_insn (gen_movsi_push (gen_rtx_REG (Pmode, + insn = emit_insn (gen_movsi_push (gen_rtx_REG (Pmode, RETURN_POINTER_REGNUM))); RTX_FRAME_RELATED_P (insn) = 1; } @@ -329,12 +329,12 @@ fr30_expand_prologue (void) { int enter_size = current_frame_info.frame_size + UNITS_PER_WORD; rtx pattern; - + insn = emit_insn (gen_enter_func (GEN_INT (enter_size))); RTX_FRAME_RELATED_P (insn) = 1; - + pattern = PATTERN (insn); - + /* Also mark all 3 subexpressions as RTX_FRAME_RELATED_P. */ if (GET_CODE (pattern) == PARALLEL) { @@ -342,7 +342,7 @@ fr30_expand_prologue (void) for (x = XVECLEN (pattern, 0); x--;) { rtx part = XVECEXP (pattern, 0, x); - + /* One of the insns in the ENTER pattern updates the frame pointer. If we do not actually need the frame pointer in this function then this is a side effect @@ -410,7 +410,7 @@ fr30_expand_epilogue (void) /* Perform the inversion operations of the prologue. */ gcc_assert (current_frame_info.initialised); - + /* Pop local variables and arguments off the stack. 
If frame_pointer_needed is TRUE then the frame pointer register has actually been used as a frame pointer, and we can recover @@ -433,18 +433,18 @@ fr30_expand_epilogue (void) emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, tmp)); } } - + if (current_frame_info.save_fp) emit_insn (gen_movsi_pop (frame_pointer_rtx)); - + /* Pop all the registers that were pushed. */ if (current_frame_info.save_rp) emit_insn (gen_movsi_pop (gen_rtx_REG (Pmode, RETURN_POINTER_REGNUM))); - + for (regno = 0; regno < STACK_POINTER_REGNUM; regno ++) if (current_frame_info.gmask & (1 << regno)) emit_insn (gen_movsi_pop (gen_rtx_REG (Pmode, regno))); - + if (current_frame_info.pretend_size) emit_insn (gen_add_to_stack (GEN_INT (current_frame_info.pretend_size))); @@ -494,7 +494,7 @@ fr30_setup_incoming_varargs (cumulative_args_t arg_regs_used_so_far_v, } /*}}}*/ -/*{{{ Printing operands */ +/*{{{ Printing operands */ /* Print a memory address as an operand to reference that memory location. */ @@ -506,7 +506,7 @@ fr30_print_operand_address (FILE *stream, rtx address) case SYMBOL_REF: output_addr_const (stream, address); break; - + default: fprintf (stderr, "code = %x\n", GET_CODE (address)); debug_rtx (address); @@ -521,7 +521,7 @@ void fr30_print_operand (FILE *file, rtx x, int code) { rtx x0; - + switch (code) { case '#': @@ -529,7 +529,7 @@ fr30_print_operand (FILE *file, rtx x, int code) if (dbr_sequence_length () != 0) fputs (":D", file); return; - + case 'p': /* Compute the register name of the second register in a hi/lo register pair. */ @@ -538,7 +538,7 @@ fr30_print_operand (FILE *file, rtx x, int code) else fprintf (file, "r%d", REGNO (x) + 1); return; - + case 'b': /* Convert GCC's comparison operators into FR30 comparison codes. */ switch (GET_CODE (x)) @@ -558,7 +558,7 @@ fr30_print_operand (FILE *file, rtx x, int code) break; } return; - + case 'B': /* Convert GCC's comparison operators into the complimentary FR30 comparison codes. */ @@ -587,7 +587,7 @@ fr30_print_operand (FILE *file, rtx x, int code) else { HOST_WIDE_INT val; - + val = INTVAL (x); val &= 0xff; @@ -595,7 +595,7 @@ fr30_print_operand (FILE *file, rtx x, int code) fprintf (file, HOST_WIDE_INT_PRINT_DEC, val); } return; - + case 'x': if (GET_CODE (x) != CONST_INT || INTVAL (x) < 16 @@ -617,11 +617,11 @@ fr30_print_operand (FILE *file, rtx x, int code) fputs (str, file); } return; - + case 0: /* Handled below. */ break; - + default: fprintf (stderr, "unknown code = %x\n", code); output_operand_lossage ("fr30_print_operand: unknown code"); @@ -636,7 +636,7 @@ fr30_print_operand (FILE *file, rtx x, int code) case MEM: x0 = XEXP (x,0); - + switch (GET_CODE (x0)) { case REG: @@ -677,11 +677,11 @@ fr30_print_operand (FILE *file, rtx x, int code) fprintf (file, "@(r15, #" HOST_WIDE_INT_PRINT_DEC ")", val); } break; - + case SYMBOL_REF: output_address (VOIDmode, x0); break; - + default: fprintf (stderr, "bad MEM code = %x\n", GET_CODE (x0)); debug_rtx (x); @@ -689,7 +689,7 @@ fr30_print_operand (FILE *file, rtx x, int code) break; } break; - + case CONST_DOUBLE : /* We handle SFmode constants here as output_addr_const doesn't. */ if (GET_MODE (x) == SFmode) @@ -740,7 +740,7 @@ fr30_function_value_regno_p (const unsigned int regno) return (regno == RETURN_VALUE_REGNUM); } -/*{{{ Function arguments */ +/*{{{ Function arguments */ /* Return true if we should pass an argument on the stack rather than in registers. 
*/ @@ -787,7 +787,7 @@ fr30_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg) register, partial stack space. */ if (*cum + fr30_num_arg_regs (arg) <= FR30_NUM_ARG_REGS) return 0; - + return (FR30_NUM_ARG_REGS - *cum) * UNITS_PER_WORD; } @@ -814,7 +814,7 @@ fr30_function_arg_advance (cumulative_args_t cum, } /*}}}*/ -/*{{{ Operand predicates */ +/*{{{ Operand predicates */ #ifndef Mmode #define Mmode machine_mode @@ -828,30 +828,30 @@ fr30_check_multiple_regs (rtx *operands, int num_operands, int descending) if (descending) { unsigned int prev_regno = 0; - + while (num_operands --) { if (GET_CODE (operands [num_operands]) != REG) return 0; - + if (REGNO (operands [num_operands]) < prev_regno) return 0; - + prev_regno = REGNO (operands [num_operands]); } } else { unsigned int prev_regno = CONDITION_CODE_REGNUM; - + while (num_operands --) { if (GET_CODE (operands [num_operands]) != REG) return 0; - + if (REGNO (operands [num_operands]) > prev_regno) return 0; - + prev_regno = REGNO (operands [num_operands]); } } @@ -895,13 +895,13 @@ fr30_move_double (rtx * operands) if (src_code == REG) { int reverse = (REGNO (dest) == REGNO (src) + 1); - + /* We normally copy the low-numbered register first. However, if the first register of operand 0 is the same as the second register of operand 1, we must copy in the opposite order. */ emit_insn (gen_rtx_SET (operand_subword (dest, reverse, TRUE, mode), operand_subword (src, reverse, TRUE, mode))); - + emit_insn (gen_rtx_SET (operand_subword (dest, !reverse, TRUE, mode), operand_subword (src, !reverse, TRUE, mode))); @@ -912,9 +912,9 @@ fr30_move_double (rtx * operands) rtx dest0 = operand_subword (dest, 0, TRUE, mode); rtx dest1 = operand_subword (dest, 1, TRUE, mode); rtx new_mem; - + gcc_assert (GET_CODE (addr) == REG); - + /* Copy the address before clobbering it. See PR 34174. */ emit_insn (gen_rtx_SET (dest1, addr)); emit_insn (gen_rtx_SET (dest0, adjust_address (src, SImode, 0))); @@ -923,7 +923,7 @@ fr30_move_double (rtx * operands) new_mem = gen_rtx_MEM (SImode, dest1); MEM_COPY_ATTRIBUTES (new_mem, src); - + emit_insn (gen_rtx_SET (dest1, new_mem)); } else if (src_code == CONST_INT || src_code == CONST_DOUBLE) @@ -932,7 +932,7 @@ fr30_move_double (rtx * operands) split_double (src, &words[0], &words[1]); emit_insn (gen_rtx_SET (operand_subword (dest, 0, TRUE, mode), words[0])); - + emit_insn (gen_rtx_SET (operand_subword (dest, 1, TRUE, mode), words[1])); } @@ -1006,7 +1006,7 @@ fr30_frame_pointer_required (void) target are 32 bit aligned within the trampoline. That allows us to initialize those locations with simple SImode stores. The alternative would be to use HImode stores. */ - + static void fr30_asm_trampoline_template (FILE *f) { diff --git a/gcc/config/fr30/fr30.h b/gcc/config/fr30/fr30.h index 19020fb..6d071a1 100644 --- a/gcc/config/fr30/fr30.h +++ b/gcc/config/fr30/fr30.h @@ -1,6 +1,6 @@ -/*{{{ Comment. */ +/*{{{ Comment. */ -/* Definitions of FR30 target. +/* Definitions of FR30 target. Copyright (C) 1998-2024 Free Software Foundation, Inc. Contributed by Cygnus Solutions. @@ -21,7 +21,7 @@ along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ /*}}}*/ -/*{{{ Run-time target specifications. */ +/*{{{ Run-time target specifications. */ #undef ASM_SPEC #define ASM_SPEC "" @@ -56,7 +56,7 @@ along with GCC; see the file COPYING3. If not see %{static:-Bstatic} %{shared:-shared} %{symbolic:-Bsymbolic}" /*}}}*/ -/*{{{ Storage Layout. */ +/*{{{ Storage Layout. 
*/ #define BITS_BIG_ENDIAN 1 @@ -93,7 +93,7 @@ along with GCC; see the file COPYING3. If not see #define PCC_BITFIELD_TYPE_MATTERS 1 /*}}}*/ -/*{{{ Layout of Source Language Data Types. */ +/*{{{ Layout of Source Language Data Types. */ #define SHORT_TYPE_SIZE 16 #define INT_TYPE_SIZE 32 @@ -115,7 +115,7 @@ along with GCC; see the file COPYING3. If not see #define WCHAR_TYPE_SIZE BITS_PER_WORD /*}}}*/ -/*{{{ REGISTER BASICS. */ +/*{{{ REGISTER BASICS. */ /* Number of hardware registers known to the compiler. They receive numbers 0 through `FIRST_PSEUDO_REGISTER-1'; thus, the first pseudo register's number @@ -139,7 +139,7 @@ along with GCC; see the file COPYING3. If not see /* A call-used register that can be used during the function prologue. */ #define PROLOGUE_TMP_REGNUM COMPILER_SCRATCH_REGISTER - + /* Register numbers used for passing a function's static chain pointer. If register windows are used, the register number as seen by the called function is `STATIC_CHAIN_INCOMING_REGNUM', while the register number as @@ -161,7 +161,7 @@ along with GCC; see the file COPYING3. If not see determines which register this is. On other machines, you can choose any register you wish for this purpose. */ #define FRAME_POINTER_REGNUM 14 - + /* The register number of the stack pointer register, which must also be a fixed register according to `FIXED_REGISTERS'. On most machines, the hardware determines which register this is. */ @@ -231,7 +231,7 @@ along with GCC; see the file COPYING3. If not see } /*}}}*/ -/*{{{ Register Classes. */ +/*{{{ Register Classes. */ /* An enumeral type that must be defined with all the register class names as enumeral values. `NO_REGS' must be first. `ALL_REGS' must be the last @@ -332,7 +332,7 @@ enum reg_class #define CLASS_MAX_NREGS(CLASS, MODE) targetm.hard_regno_nregs (0, MODE) /*}}}*/ -/*{{{ Basic Stack Layout. */ +/*{{{ Basic Stack Layout. */ /* Define this macro if pushing a word onto the stack moves the stack pointer to a smaller address. */ @@ -367,7 +367,7 @@ enum reg_class #define INCOMING_RETURN_ADDR_RTX gen_rtx_REG (SImode, RETURN_POINTER_REGNUM) /*}}}*/ -/*{{{ Register That Address the Stack Frame. */ +/*{{{ Register That Address the Stack Frame. */ /* The register number of the arg pointer register, which is used to access the function's argument list. On some machines, this is the same as the frame @@ -379,7 +379,7 @@ enum reg_class #define ARG_POINTER_REGNUM 20 /*}}}*/ -/*{{{ Eliminating the Frame Pointer and the Arg Pointer. */ +/*{{{ Eliminating the Frame Pointer and the Arg Pointer. */ /* If defined, this macro specifies a table of register pairs used to eliminate unneeded registers that point into the stack frame. If it is not defined, @@ -417,7 +417,7 @@ enum reg_class (OFFSET) = fr30_compute_frame_size (FROM, TO) /*}}}*/ -/*{{{ Passing Function Arguments on the Stack. */ +/*{{{ Passing Function Arguments on the Stack. */ /* If defined, the maximum amount of space required for outgoing arguments will be computed and placed into the variable @@ -430,10 +430,10 @@ enum reg_class #define ACCUMULATE_OUTGOING_ARGS 1 /*}}}*/ -/*{{{ Function Arguments in Registers. */ +/*{{{ Function Arguments in Registers. */ /* The number of register assigned to holding function arguments. */ - + #define FR30_NUM_ARG_REGS 4 /* A C type for declaring a variable that is used as the first argument of @@ -478,7 +478,7 @@ enum reg_class ((REGNO) >= FIRST_ARG_REGNUM && ((REGNO) < FIRST_ARG_REGNUM + FR30_NUM_ARG_REGS)) /*}}}*/ -/*{{{ How Large Values are Returned. 
*/ +/*{{{ How Large Values are Returned. */ /* Define this macro to be 1 if all structure and union return values must be in memory. Since this results in slower code, this should be defined only @@ -490,7 +490,7 @@ enum reg_class #define DEFAULT_PCC_STRUCT_RETURN 1 /*}}}*/ -/*{{{ Generating Code for Profiling. */ +/*{{{ Generating Code for Profiling. */ /* A C statement or compound statement to output to FILE some assembler code to call the profiling subroutine `mcount'. Before calling, the assembler code @@ -512,7 +512,7 @@ enum reg_class } /*}}}*/ -/*{{{ Trampolines for Nested Functions. */ +/*{{{ Trampolines for Nested Functions. */ /* A C expression for the size in bytes of the trampoline, as an integer. */ #define TRAMPOLINE_SIZE 18 @@ -523,7 +523,7 @@ enum reg_class #define TRAMPOLINE_ALIGNMENT 32 /*}}}*/ -/*{{{ Addressing Modes. */ +/*{{{ Addressing Modes. */ /* A number, the maximum number of registers that can appear in a valid memory address. Note that it is up to you to specify a value equal to the maximum @@ -536,15 +536,15 @@ enum reg_class /* On the FR30 we only have one real addressing mode - an address in a register. There are three special cases however: - + * indexed addressing using small positive offsets from the stack pointer - + * indexed addressing using small signed offsets from the frame pointer * register plus register addressing using R13 as the base register. At the moment we only support the first two of these special cases. */ - + #ifdef REG_OK_STRICT #define GO_IF_LEGITIMATE_ADDRESS(MODE, X, LABEL) \ do \ @@ -617,7 +617,7 @@ enum reg_class #define REG_OK_FOR_INDEX_P(X) REG_OK_FOR_BASE_P (X) /*}}}*/ -/*{{{ Describing Relative Costs of Operations */ +/*{{{ Describing Relative Costs of Operations */ /* Define this macro as a C expression which is nonzero if accessing less than a word of memory (i.e. a `char' or a `short') is no faster than accessing a @@ -633,7 +633,7 @@ enum reg_class #define SLOW_BYTE_ACCESS 1 /*}}}*/ -/*{{{ Dividing the output into sections. */ +/*{{{ Dividing the output into sections. */ /* A C expression whose value is a string containing the assembler operation that should precede instructions and read-only data. Normally `".text"' is @@ -668,13 +668,13 @@ enum reg_class #define ASM_APP_OFF "#NO_APP\n" /*}}}*/ -/*{{{ Output and Generation of Labels. */ +/*{{{ Output and Generation of Labels. */ /* Globalizing directive for a label. */ #define GLOBAL_ASM_OP "\t.globl " /*}}}*/ -/*{{{ Output of Assembler Instructions. */ +/*{{{ Output of Assembler Instructions. */ /* A C compound statement to output to stdio stream STREAM the assembler syntax for an instruction operand X. X is an RTL expression. @@ -713,7 +713,7 @@ enum reg_class #define IMMEDIATE_PREFIX "" /*}}}*/ -/*{{{ Output of Dispatch Tables. */ +/*{{{ Output of Dispatch Tables. */ /* This macro should be provided on machines where the addresses in a dispatch table are relative to the table's own address. @@ -741,7 +741,7 @@ fprintf (STREAM, "\t.word .L%d-.L%d\n", VALUE, REL) fprintf (STREAM, "\t.word .L%d\n", VALUE) /*}}}*/ -/*{{{ Assembler Commands for Alignment. */ +/*{{{ Assembler Commands for Alignment. */ /* A C statement to output to the stdio stream STREAM an assembler command to advance the location counter to a multiple of 2 to the POWER bytes. POWER @@ -750,7 +750,7 @@ fprintf (STREAM, "\t.word .L%d\n", VALUE) fprintf ((STREAM), "\t.p2align %d\n", (POWER)) /*}}}*/ -/*{{{ Miscellaneous Parameters. */ +/*{{{ Miscellaneous Parameters. */ /* An alias for a machine mode name. 
This is the machine mode that elements of a jump-table should have. */ diff --git a/gcc/config/freebsd-spec.h b/gcc/config/freebsd-spec.h index f43056b..42ee998 100644 --- a/gcc/config/freebsd-spec.h +++ b/gcc/config/freebsd-spec.h @@ -22,10 +22,10 @@ a copy of the GCC Runtime Library Exception along with this program; see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -/* Common FreeBSD configuration. +/* Common FreeBSD configuration. All FreeBSD architectures should include this file, which will specify their commonalities. - Adapted from gcc/config/freebsd.h by + Adapted from gcc/config/freebsd.h by David O'Brien <obrien@FreeBSD.org> Loren J. Rittle <ljrittle@acm.org>. */ @@ -49,7 +49,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see /* Define the default FreeBSD-specific per-CPU hook code. */ #define FBSD_TARGET_CPU_CPP_BUILTINS() do {} while (0) -/* Provide a CPP_SPEC appropriate for FreeBSD. We just deal with the GCC +/* Provide a CPP_SPEC appropriate for FreeBSD. We just deal with the GCC option `-posix', and PIC issues. */ #define FBSD_CPP_SPEC " \ @@ -58,10 +58,10 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see %{posix:-D_POSIX_SOURCE}" /* Provide a STARTFILE_SPEC appropriate for FreeBSD. Here we add - the magical crtbegin.o file (see crtstuff.c) which provides part - of the support for getting C++ file-scope static object constructed + the magical crtbegin.o file (see crtstuff.c) which provides part + of the support for getting C++ file-scope static object constructed before entering `main'. */ - + #define FBSD_STARTFILE_SPEC \ "%{!shared: \ %{pg:gcrt1.o%s} %{!pg:%{p:gcrt1.o%s} \ @@ -71,9 +71,9 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see crti.o%s %{static:crtbeginT.o%s;shared|pie:crtbeginS.o%s;:crtbegin.o%s}" /* Provide a ENDFILE_SPEC appropriate for FreeBSD. Here we tack on - the magical crtend.o file (see crtstuff.c) which provides part of - the support for getting C++ file-scope static object constructed - before entering `main', followed by a normal "finalizer" file, + the magical crtend.o file (see crtstuff.c) which provides part of + the support for getting C++ file-scope static object constructed + before entering `main', followed by a normal "finalizer" file, `crtn.o'. */ #define FBSD_ENDFILE_SPEC \ diff --git a/gcc/config/freebsd.h b/gcc/config/freebsd.h index 7add672..2643348 100644 --- a/gcc/config/freebsd.h +++ b/gcc/config/freebsd.h @@ -17,11 +17,11 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ -/* Common FreeBSD configuration. +/* Common FreeBSD configuration. All FreeBSD architectures should include this file, which will specify their commonalities. - Adapted from gcc/config/i386/freebsd-elf.h by - David O'Brien <obrien@FreeBSD.org>. + Adapted from gcc/config/i386/freebsd-elf.h by + David O'Brien <obrien@FreeBSD.org>. Further work by David O'Brien <obrien@FreeBSD.org> and Loren J. Rittle <ljrittle@acm.org>. */ diff --git a/gcc/config/frv/frv.cc b/gcc/config/frv/frv.cc index ac6fda6..216ad5f 100644 --- a/gcc/config/frv/frv.cc +++ b/gcc/config/frv/frv.cc @@ -2484,7 +2484,7 @@ frv_print_operand_address (FILE * stream, machine_mode /* mode */, rtx x) See gcc/testsuite/gcc.dg/asm-4.c for an example. 
*/ frv_print_operand_memory_reference (stream, x, 0); return; - + default: break; } @@ -6311,7 +6311,7 @@ frv_secondary_reload_class (enum reg_class rclass, /* This hook exists to catch the case where secondary_reload_class() is called from init_reg_autoinc() in regclass.c - before the reload optabs have been initialised. */ - + static reg_class_t frv_secondary_reload (bool in_p, rtx x, reg_class_t reload_class_i, machine_mode reload_mode, @@ -6682,7 +6682,7 @@ frv_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED, default: break; - case QUAD_REGS: + case QUAD_REGS: case GPR_REGS: case GR8_REGS: case GR9_REGS: diff --git a/gcc/config/ft32/ft32.cc b/gcc/config/ft32/ft32.cc index 3c6e5fb..80345dc 100644 --- a/gcc/config/ft32/ft32.cc +++ b/gcc/config/ft32/ft32.cc @@ -831,19 +831,6 @@ ft32_target_case_values_threshold (void) ft32_addr_space_legitimate_address_p -// Enabling LRA gives the infamous -// internal compiler error: Max. number of generated reload insns per insn is achieved (90) -// errors e.g. when compiling sieve.c - -static bool -ft32_lra_p (void) -{ - return ft32_lra_flag; -} - -#undef TARGET_LRA_P -#define TARGET_LRA_P ft32_lra_p - static bool reg_ok_for_base_p (rtx r, bool strict) { diff --git a/gcc/config/ft32/ft32.opt b/gcc/config/ft32/ft32.opt index cecc548..419c82a 100644 --- a/gcc/config/ft32/ft32.opt +++ b/gcc/config/ft32/ft32.opt @@ -23,8 +23,8 @@ Target Mask(SIM) Target the software simulator. mlra -Target Var(ft32_lra_flag) Init(0) Save -Use LRA instead of reload. +Target RejectNegative Ignore +Ignored, but preserved for backward compatibility. mnodiv Target Mask(NODIV) diff --git a/gcc/config/gcn/gcn-devices.def b/gcc/config/gcn/gcn-devices.def new file mode 100644 index 0000000..1305e0f --- /dev/null +++ b/gcc/config/gcn/gcn-devices.def @@ -0,0 +1,196 @@ +/* Copyright (C) 2024 Free Software Foundation, Inc. + + This file is free software; you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your option) + any later version. + + This file is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +/* GCN Device Configurations. + + This file contains all the device-specific information needed for both + GCC and Libgomp. Please respect the formatting and field comments as + this file is read by Awk scripts in addition to the C++ preprocessor. + + To add a new device: + 1. Add a new GCN_DEVICE instance below. + 2. Add the name to the list in config.gcc. + 3. Allow gcn-tables.opt to regenerate. + 4. Implement target-specific metadata and new features using + PROCESSOR_<NAME> (or a new ISA feature flag). + 5. Consider adding to the set of device-specific tests in the libgomp + testsuite. + + New ISA variants are defined in gcn-opts.h. Please use the feature macros + in any conditionals, rather than depending on specific devices or ISAs + directly. + + GCN_DEVICE field descriptions: + 0 "name" (text, external) + Lower case device name used in -march=name, diagnostics, + assembler directives, etc. + 1 "NAME" (text, external) + Upper case device name used in macros. 
+ 2 "ELF" (hex integer, external) + Magic number used assigned to this device for use in elf_flags. + 3 "ISA" (enum gcn_isa, internal) + ISA variant for instruction selection, etc. + 4 "XNACK default" (enum hsaco_attr_type, internal) + Default value for the -mattr=[-+]xnack setting. May need to correspond + to the assembler expectations for this device. + 5 "SRAM_ECC default" (enum hsaco_attr_type, internal) + Default value for the -mattr=[-+]sram-ecc setting. Only really used + to ensure that the binary is in a known state mkoffload can match. + 6 "WAVE64 mode" (enum hsaco_attr_type, internal) + Set "on" for devices where this needs to be configured, "unsupported" + otherwise (meaning no special treatment needed). GCC does not support + wave32 mode. + 7 "CU mode" (enum hsaco_attr_type, internal) + Set "on" for devices that have this feature, "unsupported" otherwise + (meaning that CU mode is not optional on the device). GCC does not + support CU mode off. + 8 "Max ISA VGPRs" (integer, internal) + Define how many registers there are in the VGPR register file, for the + purposes of calculating maximum occupancy. Some devices have AVGPRs + in the same register file, some have more registers than are + addressable from a single kernel. Used by libgomp's plugin-gcn.c. + 9 "Generic Processor Version" (unsigned, external) + Used as version field for generic processor support. For non-generic + code it is 0; otherwise, between 1 and 255. Initially, it is 1 for + each generic device, but incremented (for a given generic device) if + an new device of that series requires a code change; + cf. EF_AMDGPU_GENERIC_VERSION_V. The version shall be the same as + generated by the used llvm-mc assembler. + 10 "Architecture Family Name" (string, external) + Used to #define '__GFX<...>__'. + + Fields marked "external", above, have values defined elsewhere (HSA, ROCM, + LLVM, ELF, etc.) and must have matching definitions here. Fields marked + "internal" are defined and used only in GCC (although some may have + user-visible effects) and may be refactored as needed. 
*/ + +/* GCN GFX9 (Vega) */ + +GCN_DEVICE(gfx900, GFX900, 0x2c, ISA_GCN5, + /* XNACK default */ HSACO_ATTR_OFF, + /* SRAM_ECC default */ HSACO_ATTR_UNSUPPORTED, + /* WAVE64 mode */ HSACO_ATTR_UNSUPPORTED, + /* CU mode */ HSACO_ATTR_UNSUPPORTED, + /* Max ISA VGPRs */ 256, + /* Generic code obj version */ 0, /* non-generic */ + /* Architecture Family */ GFX9 + ) + +GCN_DEVICE(gfx906, GFX906, 0x2f, ISA_GCN5, + /* XNACK default */ HSACO_ATTR_OFF, + /* SRAM_ECC default */ HSACO_ATTR_UNSUPPORTED, + /* WAVE64 mode */ HSACO_ATTR_UNSUPPORTED, + /* CU mode */ HSACO_ATTR_UNSUPPORTED, + /* Max ISA VGPRs */ 256, + /* Generic code obj version */ 0, /* non-generic */ + /* Architecture Family */ GFX9 + ) + +GCN_DEVICE(gfx908, GFX908, 0x30, ISA_CDNA1, + /* XNACK default */ HSACO_ATTR_OFF, + /* SRAM_ECC default */ HSACO_ATTR_ANY, + /* WAVE64 mode */ HSACO_ATTR_UNSUPPORTED, + /* CU mode */ HSACO_ATTR_UNSUPPORTED, + /* Max ISA VGPRs */ 256, + /* Generic code obj version */ 0, /* non-generic */ + /* Architecture Family */ GFX9 + ) + +GCN_DEVICE(gfx90a, GFX90A, 0x3f, ISA_CDNA2, + /* XNACK default */ HSACO_ATTR_ANY, + /* SRAM_ECC default */ HSACO_ATTR_ANY, + /* WAVE64 mode */ HSACO_ATTR_UNSUPPORTED, + /* CU mode */ HSACO_ATTR_UNSUPPORTED, + /* Max ISA VGPRs */ 512, + /* Generic code obj version */ 0, /* non-generic */ + /* Architecture Family */ GFX9 + ) + +GCN_DEVICE(gfx90c, GFX90C, 0x32, ISA_GCN5, + /* XNACK default */ HSACO_ATTR_ANY, + /* SRAM_ECC default */ HSACO_ATTR_UNSUPPORTED, + /* WAVE64 mode */ HSACO_ATTR_UNSUPPORTED, + /* CU mode */ HSACO_ATTR_UNSUPPORTED, + /* Max ISA VGPRs */ 256, + /* Generic code obj version */ 0, /* non-generic */ + /* Architecture Family */ GFX9 + ) + +/* GCN GFX10.3 (RDNA 2) */ + +GCN_DEVICE(gfx1030, GFX1030, 0x36, ISA_RDNA2, + /* XNACK default */ HSACO_ATTR_UNSUPPORTED, + /* SRAM_ECC default */ HSACO_ATTR_UNSUPPORTED, + /* WAVE64 mode */ HSACO_ATTR_ON, + /* CU mode */ HSACO_ATTR_ON, + /* Max ISA VGPRs */ 512, /* 512 SIMD32 = 256 wavefrontsize64. */ + /* Generic code obj version */ 0, /* non-generic */ + /* Architecture Family */ GFX10 + ) + +GCN_DEVICE(gfx1036, GFX1036, 0x45, ISA_RDNA2, + /* XNACK default */ HSACO_ATTR_UNSUPPORTED, + /* SRAM_ECC default */ HSACO_ATTR_UNSUPPORTED, + /* WAVE64 mode */ HSACO_ATTR_ON, + /* CU mode */ HSACO_ATTR_ON, + /* Max ISA VGPRs */ 512, /* 512 SIMD32 = 256 wavefrontsize64. */ + /* Generic code obj version */ 0, /* non-generic */ + /* Architecture Family */ GFX10 + ) + +GCN_DEVICE(gfx10-3-generic, GFX10_3_GENERIC, 0x053, ISA_RDNA2, + /* XNACK default */ HSACO_ATTR_UNSUPPORTED, + /* SRAM_ECC default */ HSACO_ATTR_UNSUPPORTED, + /* WAVE64 mode */ HSACO_ATTR_ON, + /* CU mode */ HSACO_ATTR_ON, + /* Max ISA VGPRs */ 512, /* 512 SIMD32 = 256 wavefrontsize64. */ + /* Generic code obj version */ 1, + /* Architecture Family */ GFX10 + ) + +/* GCN GFX11 (RDNA 3) */ + +GCN_DEVICE(gfx1100, GFX1100, 0x41, ISA_RDNA3, + /* XNACK default */ HSACO_ATTR_UNSUPPORTED, + /* SRAM_ECC default */ HSACO_ATTR_UNSUPPORTED, + /* WAVE64 mode */ HSACO_ATTR_ON, + /* CU mode */ HSACO_ATTR_ON, + /* Max ISA VGPRs */ 1536, /* 1536 SIMD32 = 768 wavefrontsize64. 
*/ + /* Generic code obj version */ 0, /* non-generic */ + /* Architecture Family */ GFX11 + ) + +GCN_DEVICE(gfx1103, GFX1103, 0x44, ISA_RDNA3, + /* XNACK default */ HSACO_ATTR_UNSUPPORTED, + /* SRAM_ECC default */ HSACO_ATTR_UNSUPPORTED, + /* WAVE64 mode */ HSACO_ATTR_ON, + /* CU mode */ HSACO_ATTR_ON, + /* Max ISA VGPRs */ 1536, + /* Generic code obj version */ 0, /* non-generic */ + /* Architecture Family */ GFX11 + ) + +GCN_DEVICE(gfx11-generic, GFX11_GENERIC, 0x054, ISA_RDNA3, + /* XNACK default */ HSACO_ATTR_UNSUPPORTED, + /* SRAM_ECC default */ HSACO_ATTR_UNSUPPORTED, + /* WAVE64 mode */ HSACO_ATTR_ON, + /* CU mode */ HSACO_ATTR_ON, + /* Max ISA VGPRs */ 1536, + /* Generic code obj version */ 1, + /* Architecture Family */ GFX11 + ) + +#undef GCN_DEVICE diff --git a/gcc/config/gcn/gcn-hsa.h b/gcc/config/gcn/gcn-hsa.h index 0322055..d87d2fa 100644 --- a/gcc/config/gcn/gcn-hsa.h +++ b/gcc/config/gcn/gcn-hsa.h @@ -75,39 +75,16 @@ extern unsigned int gcn_local_sym_hash (const char *name); supported for gcn. */ #define GOMP_SELF_SPECS "" -/* Explicitly set the ABI version; in principle, we could use just the - default; however, when debugging symbols are turned on, mkoffload.cc - writes a new AMD GPU object file and the ABI version needs to be the - same. - LLVM <= 17 defaults to 4 while LLVM >= 18 defaults to 5. - GCC supports LLVM >= 13.0.1 and only LLVM >= 14 supports version 5. - Note that Fiji is only supported with LLVM <= 17 as version 3 is no longer - supported in LLVM >= 18. */ -#define ABI_VERSION_SPEC "march=fiji:--amdhsa-code-object-version=3;" \ - "!march=*|march=*:--amdhsa-code-object-version=4" - -/* Note that the XNACK and SRAM-ECC settings must match those in mkoffload.cc - as the latter creates new ELF object file when debugging is enabled and - the ELF flags (e_flags) of that generated file must be identical to those - generated by the compiler. */ - -#define NO_XNACK "march=fiji:;march=gfx1030:;march=gfx1036:;march=gfx1100:;march=gfx1103:;" \ - /* These match the defaults set in gcn.cc. */ \ - "!mxnack*|mxnack=default:%{march=gfx900|march=gfx906|march=gfx908:-mattr=-xnack};" -#define NO_SRAM_ECC "!march=*:;march=fiji:;march=gfx900:;march=gfx906:;march=gfx90c:;" - -/* In HSACOv4 no attribute setting means the binary supports "any" hardware - configuration. The name of the attribute also changed. */ -#define SRAMOPT "msram-ecc=on:-mattr=+sramecc;msram-ecc=off:-mattr=-sramecc" -#define XNACKOPT "mxnack=on:-mattr=+xnack;mxnack=off:-mattr=-xnack" +#include "gcn-device-macros.h" /* Use LLVM assembler and linker options. */ #define ASM_SPEC "-triple=amdgcn--amdhsa " \ "%{march=*:-mcpu=%*} " \ - "%{" ABI_VERSION_SPEC "} " \ - "%{" NO_XNACK XNACKOPT "} " \ - "%{" NO_SRAM_ECC SRAMOPT "} " \ - "%{march=gfx1030|march=gfx1036|march=gfx1100|march=gfx1103:-mattr=+wavefrontsize64} " \ - "%{march=gfx1030|march=gfx1036|march=gfx1100|march=gfx1103:-mattr=+cumode} " \ + ABI_VERSION_OPT \ + XNACKOPT \ + SRAMOPT \ + WAVE64OPT \ + CUMODEOPT \ "-filetype=obj" #define LINK_SPEC "--pie --export-dynamic" #define LIB_SPEC "-lc" diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h index 24e856b..0026bec 100644 --- a/gcc/config/gcn/gcn-opts.h +++ b/gcc/config/gcn/gcn-opts.h @@ -17,36 +17,18 @@ #ifndef GCN_OPTS_H #define GCN_OPTS_H -/* Which processor to generate code or schedule for. */ +/* Create constants for PROCESSOR_GFX???. 
*/ enum processor_type { - PROCESSOR_FIJI, // gfx803 - PROCESSOR_VEGA10, // gfx900 - PROCESSOR_VEGA20, // gfx906 - PROCESSOR_GFX908, - PROCESSOR_GFX90a, - PROCESSOR_GFX90c, - PROCESSOR_GFX1030, - PROCESSOR_GFX1036, - PROCESSOR_GFX1100, - PROCESSOR_GFX1103 +#define GCN_DEVICE(name, NAME, ...) \ + PROCESSOR_ ## NAME, +#include "gcn-devices.def" + PROCESSOR_COUNT }; -#define TARGET_FIJI (gcn_arch == PROCESSOR_FIJI) -#define TARGET_VEGA10 (gcn_arch == PROCESSOR_VEGA10) -#define TARGET_VEGA20 (gcn_arch == PROCESSOR_VEGA20) -#define TARGET_GFX908 (gcn_arch == PROCESSOR_GFX908) -#define TARGET_GFX90a (gcn_arch == PROCESSOR_GFX90a) -#define TARGET_GFX90c (gcn_arch == PROCESSOR_GFX90c) -#define TARGET_GFX1030 (gcn_arch == PROCESSOR_GFX1030) -#define TARGET_GFX1036 (gcn_arch == PROCESSOR_GFX1036) -#define TARGET_GFX1100 (gcn_arch == PROCESSOR_GFX1100) -#define TARGET_GFX1103 (gcn_arch == PROCESSOR_GFX1103) - /* Set in gcn_option_override. */ extern enum gcn_isa { ISA_UNKNOWN, - ISA_GCN3, ISA_GCN5, ISA_RDNA2, ISA_RDNA3, @@ -54,10 +36,7 @@ extern enum gcn_isa { ISA_CDNA2 } gcn_isa; -#define TARGET_GCN3 (gcn_isa == ISA_GCN3) -#define TARGET_GCN3_PLUS (gcn_isa >= ISA_GCN3) #define TARGET_GCN5 (gcn_isa == ISA_GCN5) -#define TARGET_GCN5_PLUS (gcn_isa >= ISA_GCN5) #define TARGET_CDNA1 (gcn_isa == ISA_CDNA1) #define TARGET_CDNA1_PLUS (gcn_isa >= ISA_CDNA1) #define TARGET_CDNA2 (gcn_isa == ISA_CDNA2) @@ -67,35 +46,30 @@ extern enum gcn_isa { #define TARGET_RDNA3 (gcn_isa == ISA_RDNA3) -#define TARGET_M0_LDS_LIMIT (TARGET_GCN3) #define TARGET_PACKED_WORK_ITEMS (TARGET_CDNA2_PLUS || TARGET_RDNA3) -#define TARGET_XNACK (flag_xnack != HSACO_ATTR_OFF) +#define TARGET_XNACK (flag_xnack == HSACO_ATTR_ON \ + || flag_xnack == HSACO_ATTR_ANY) enum hsaco_attr_type { + HSACO_ATTR_UNSUPPORTED, HSACO_ATTR_OFF, HSACO_ATTR_ON, HSACO_ATTR_ANY, HSACO_ATTR_DEFAULT }; -/* There are global address instructions. */ -#define TARGET_GLOBAL_ADDRSPACE TARGET_GCN5_PLUS /* Device has an AVGPR register file. */ #define TARGET_AVGPRS TARGET_CDNA1_PLUS /* There are load/store instructions for AVGPRS. */ #define TARGET_AVGPR_MEMOPS TARGET_CDNA2_PLUS /* AVGPRS may have their own register file, or be combined with VGPRS. */ #define TARGET_AVGPR_COMBINED TARGET_CDNA2_PLUS -/* flat_load/store allows offsets. */ -#define TARGET_FLAT_OFFSETS TARGET_GCN5_PLUS /* global_load/store has reduced offset. */ #define TARGET_11BIT_GLOBAL_OFFSET TARGET_RDNA2_PLUS /* The work item details are all encoded into v0. */ //#define TARGET_PACKED_WORK_ITEMS TARGET_PACKED_WORK_ITEMS -/* m0 must be initialized in order to use LDS. */ -//#define TARGET_M0_LDS_LIMIT TARGET_M0_LDS_LIMIT /* CDNA2 load/store costs are reduced. * TODO: what does this mean? */ #define TARGET_CDNA2_MEM_COSTS TARGET_CDNA2_PLUS @@ -114,14 +88,12 @@ enum hsaco_attr_type : 4) /* This mostly affects the metadata. */ #define TARGET_ARCHITECTED_FLAT_SCRATCH TARGET_RDNA3 -/* Assembler uses s_add_co not just s_add. */ -#define TARGET_EXPLICIT_CARRY TARGET_GCN5_PLUS -/* mulsi3 permits immediate. */ -#define TARGET_MULTIPLY_IMMEDIATE TARGET_GCN5_PLUS /* Device has Sub-DWord Addressing instrucions. */ #define TARGET_SDWA (!TARGET_RDNA3) /* Different devices uses different cache control instructions. */ #define TARGET_WBINVL1_CACHE (!TARGET_RDNA2_PLUS) #define TARGET_GLn_CACHE TARGET_RDNA2_PLUS +/* Some devices have TGSPLIT, which needs at least metadata. 
*/ +#define TARGET_TGSPLIT TARGET_CDNA2_PLUS #endif diff --git a/gcc/config/gcn/gcn-run.cc b/gcc/config/gcn/gcn-run.cc index 2f3ed2d..64d29b3 100644 --- a/gcc/config/gcn/gcn-run.cc +++ b/gcc/config/gcn/gcn-run.cc @@ -426,7 +426,7 @@ load_image (const char *filename) /* Locate the "_init_array" function, and read the kernel's properties. */ hsa_executable_symbol_t symbol; - XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, + XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array.kd", device, 0, &symbol), "Find '_init_array' function"); diff --git a/gcc/config/gcn/gcn-tables.opt b/gcc/config/gcn/gcn-tables.opt new file mode 100644 index 0000000..bb71089 --- /dev/null +++ b/gcc/config/gcn/gcn-tables.opt @@ -0,0 +1,58 @@ +; -*- buffer-read-only: t -*- +; Generated automatically by gen-opt-tables.awk from gcn-devices.def. +; Do not edit. + +; Copyright (C) 2024 Free Software Foundation, Inc. + +; This file is part of GCC. + +; GCC is free software; you can redistribute it and/or modify +; it under the terms of the GNU General Public License as +; published by the Free Software Foundation; either version 3, +; or (at your option) any later version. + +; GCC is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. + +; You should have received a copy of the GNU General Public +; License along with GCC; see the file COPYING3. If not see +; <http://www.gnu.org/licenses/>. + +Enum +Name(gpu_type) Type(enum processor_type) +GCN GPU type to use: + +EnumValue +Enum(gpu_type) String(gfx900) Value(PROCESSOR_GFX900) + +EnumValue +Enum(gpu_type) String(gfx906) Value(PROCESSOR_GFX906) + +EnumValue +Enum(gpu_type) String(gfx908) Value(PROCESSOR_GFX908) + +EnumValue +Enum(gpu_type) String(gfx90a) Value(PROCESSOR_GFX90A) + +EnumValue +Enum(gpu_type) String(gfx90c) Value(PROCESSOR_GFX90C) + +EnumValue +Enum(gpu_type) String(gfx1030) Value(PROCESSOR_GFX1030) + +EnumValue +Enum(gpu_type) String(gfx1036) Value(PROCESSOR_GFX1036) + +EnumValue +Enum(gpu_type) String(gfx10-3-generic) Value(PROCESSOR_GFX10_3_GENERIC) + +EnumValue +Enum(gpu_type) String(gfx1100) Value(PROCESSOR_GFX1100) + +EnumValue +Enum(gpu_type) String(gfx1103) Value(PROCESSOR_GFX1103) + +EnumValue +Enum(gpu_type) String(gfx11-generic) Value(PROCESSOR_GFX11_GENERIC) diff --git a/gcc/config/gcn/gcn-tables.opt.urls b/gcc/config/gcn/gcn-tables.opt.urls new file mode 100644 index 0000000..b13ed90 --- /dev/null +++ b/gcc/config/gcn/gcn-tables.opt.urls @@ -0,0 +1,2 @@ +; Autogenerated by regenerate-opt-urls.py from gcc/config/gcn/gcn-tables.opt and generated HTML + diff --git a/gcc/config/gcn/gcn-tree.cc b/gcc/config/gcn/gcn-tree.cc index 6a7485a..ad674c3 100644 --- a/gcc/config/gcn/gcn-tree.cc +++ b/gcc/config/gcn/gcn-tree.cc @@ -1,17 +1,17 @@ /* Copyright (C) 2017-2024 Free Software Foundation, Inc. This file is part of GCC. - + GCC is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. - + GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
- + You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ @@ -184,7 +184,7 @@ gcn_lockless_update (location_t loc, gimple_stmt_iterator *gsi, } /* Helper function for gcn_reduction_update. - + Insert code to lockfully update *PTR with *PTR OP VAR just before GSI. This is necessary for types larger than 64 bits, where there is no cmp&swap instruction to implement a lockless scheme. We use @@ -488,7 +488,7 @@ gcn_goacc_reduction_teardown (gcall *call) } /* Implement TARGET_GOACC_REDUCTION. - + Expand calls to the GOACC REDUCTION internal function, into a sequence of gimple instructions. */ diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index b24cf9b..cb2f4a7 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -452,7 +452,7 @@ [(set (match_operand:V_1REG 0 "nonimmediate_operand") (match_operand:V_1REG 1 "general_operand"))] "" - {@ [cons: =0, 1; attrs: type, length, gcn_version] + {@ [cons: =0, 1; attrs: type, length, cdna] [v ,vA;vop1 ,4,* ] v_mov_b32\t%0, %1 [v ,B ;vop1 ,8,* ] ^ [v ,a ;vop3p_mai,8,* ] v_accvgpr_read_b32\t%0, %1 @@ -519,7 +519,7 @@ return \"v_accvgpr_mov_b32\t%H0, %H1\;v_accvgpr_mov_b32\t%L0, %L1\";" [(set_attr "type" "vmult,vmult,vmult,vmult") (set_attr "length" "16,16,16,8") - (set_attr "gcn_version" "*,*,*,cdna2")]) + (set_attr "cdna" "*,*,*,cdna2")]) (define_insn "mov<mode>_exec" [(set (match_operand:V_2REG 0 "nonimmediate_operand" "= v, v, v, v, m") @@ -565,7 +565,7 @@ [(set (match_operand:V_4REG 0 "nonimmediate_operand") (match_operand:V_4REG 1 "general_operand"))] "" - {@ [cons: =0, 1; attrs: type, length, gcn_version] + {@ [cons: =0, 1; attrs: type, length, cdna] [v ,vDB;vmult,16,* ] v_mov_b32\t%L0, %L1\; v_mov_b32\t%H0, %H1\; v_mov_b32\t%J0, %J1\; v_mov_b32\t%K0, %K1 [v ,a ;vmult,32,* ] v_accvgpr_read_b32\t%L0, %L1\; v_accvgpr_read_b32\t%H0, %H1\; v_accvgpr_read_b32\t%J0, %J1\; v_accvgpr_read_b32\t%K0, %K1 [$a,v ;vmult,32,* ] v_accvgpr_write_b32\t%L0, %L1\;v_accvgpr_write_b32\t%H0, %H1\;v_accvgpr_write_b32\t%J0, %J1\;v_accvgpr_write_b32\t%K0, %K1 @@ -662,7 +662,7 @@ UNSPEC_SGPRBASE)) (clobber (match_operand:<VnDI> 2 "register_operand"))] "lra_in_progress || reload_completed" - {@ [cons: =0, 1, =2; attrs: type, length, gcn_version] + {@ [cons: =0, 1, =2; attrs: type, length, cdna] [v,vA,&v;vop1,4 ,* ] v_mov_b32\t%0, %1 [v,vB,&v;vop1,8 ,* ] ^ [v,m ,&v;* ,12,* ] # @@ -689,7 +689,7 @@ #" [(set_attr "type" "vmult,*,*,*,*") (set_attr "length" "8,12,12,12,12") - (set_attr "gcn_version" "*,*,*,cdna2,cdna2")]) + (set_attr "cdna" "*,*,*,cdna2,cdna2")]) (define_insn "@mov<mode>_sgprbase" [(set (match_operand:V_4REG 0 "nonimmediate_operand") @@ -1156,23 +1156,16 @@ (mem:BLK (scratch))] UNSPEC_GATHER))] "(AS_FLAT_P (INTVAL (operands[3])) - && ((TARGET_GCN3 && INTVAL(operands[2]) == 0) - || ((unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x1000))) - || (AS_GLOBAL_P (INTVAL (operands[3])) - && (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))" + && ((unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x1000)) + || (AS_GLOBAL_P (INTVAL (operands[3])) + && (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))" { addr_space_t as = INTVAL (operands[3]); const char *glc = INTVAL (operands[4]) ? 
" glc" : ""; static char buf[200]; if (AS_FLAT_P (as)) - { - if (TARGET_FLAT_OFFSETS) - sprintf (buf, "flat_load%%o0\t%%0, %%1 offset:%%2%s\;s_waitcnt\t0", - glc); - else - sprintf (buf, "flat_load%%o0\t%%0, %%1%s\;s_waitcnt\t0", glc); - } + sprintf (buf, "flat_load%%o0\t%%0, %%1 offset:%%2%s\;s_waitcnt\t0", glc); else if (AS_GLOBAL_P (as)) sprintf (buf, "global_load%%o0\t%%0, %%1, off offset:%%2%s\;" "s_waitcnt\tvmcnt(0)", glc); @@ -1183,7 +1176,7 @@ } [(set_attr "type" "flat") (set_attr "length" "12") - (set_attr "gcn_version" "*,cdna2,*,cdna2") + (set_attr "cdna" "*,cdna2,*,cdna2") (set_attr "xnack" "off,off,on,on")]) (define_insn "gather<mode>_insn_1offset_ds<exec>" @@ -1207,7 +1200,7 @@ } [(set_attr "type" "ds") (set_attr "length" "12") - (set_attr "gcn_version" "*,cdna2")]) + (set_attr "cdna" "*,cdna2")]) (define_insn "gather<mode>_insn_2offsets<exec>" [(set (match_operand:V_MOV 0 "register_operand" "=v,a,&v,&a") @@ -1241,7 +1234,7 @@ } [(set_attr "type" "flat") (set_attr "length" "12") - (set_attr "gcn_version" "*,cdna2,*,cdna2") + (set_attr "cdna" "*,cdna2,*,cdna2") (set_attr "xnack" "off,off,on,on")]) (define_expand "scatter_store<mode><vnsi>" @@ -1290,8 +1283,7 @@ UNSPEC_SCATTER))] "(AS_FLAT_P (INTVAL (operands[3])) && (INTVAL(operands[1]) == 0 - || (TARGET_FLAT_OFFSETS - && (unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x1000))) + || ((unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x1000))) || (AS_GLOBAL_P (INTVAL (operands[3])) && (((unsigned HOST_WIDE_INT)INTVAL(operands[1]) + 0x1000) < 0x2000))" { @@ -1300,12 +1292,7 @@ static char buf[200]; if (AS_FLAT_P (as)) - { - if (TARGET_FLAT_OFFSETS) sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s", glc); - else - sprintf (buf, "flat_store%%s2\t%%0, %%2%s", glc); - } else if (AS_GLOBAL_P (as)) sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s", glc); else @@ -1315,7 +1302,7 @@ } [(set_attr "type" "flat") (set_attr "length" "12") - (set_attr "gcn_version" "*,cdna2")]) + (set_attr "cdna" "*,cdna2")]) (define_insn "scatter<mode>_insn_1offset_ds<exec_scatter>" [(set (mem:BLK (scratch)) @@ -1338,7 +1325,7 @@ } [(set_attr "type" "ds") (set_attr "length" "12") - (set_attr "gcn_version" "*,cdna2")]) + (set_attr "cdna" "*,cdna2")]) (define_insn "scatter<mode>_insn_2offsets<exec_scatter>" [(set (mem:BLK (scratch)) @@ -1370,7 +1357,7 @@ } [(set_attr "type" "flat") (set_attr "length" "12") - (set_attr "gcn_version" "*,cdna2")]) + (set_attr "cdna" "*,cdna2")]) ;; }}} ;; {{{ Permutations @@ -1476,7 +1463,7 @@ (clobber (reg:DI VCC_REG))] "" {@ [cons: =0, %1, 2; attrs: type, length] - [v,v,vSvA;vop2,4] v_add%^_u32\t%0, vcc, %2, %1 + [v,v,vSvA;vop2,4] v_add_co_u32\t%0, vcc, %2, %1 [v,v,vSvB;vop2,8] ^ }) @@ -1489,7 +1476,7 @@ (clobber (reg:DI VCC_REG))] "" {@ [cons: =0, 1, 2; attrs: type, length] - [v,v,SvA;vop2,4] v_add%^_u32\t%0, vcc, %2, %1 + [v,v,SvA;vop2,4] v_add_co_u32\t%0, vcc, %2, %1 [v,v,SvB;vop2,8] ^ }) @@ -1503,7 +1490,7 @@ (match_dup 1)))] "" {@ [cons: =0, %1, 2, =3; attrs: type, length] - [v,v,vSvA,cV;vop2 ,4] v_add%^_u32\t%0, %3, %2, %1 + [v,v,vSvA,cV;vop2 ,4] v_add_co_u32\t%0, %3, %2, %1 [v,v,vSvB,cV;vop2 ,8] ^ [v,v,vSvA,Sg;vop3b,8] ^ }) @@ -1523,7 +1510,7 @@ (vec_duplicate:V_SI (match_dup 2))))] "" {@ [cons: =0, 1, 2, =3; attrs: type, length] - [v,SvA,v,cV;vop2 ,4] v_add%^_u32\t%0, %3, %1, %2 + [v,SvA,v,cV;vop2 ,4] v_add_co_u32\t%0, %3, %1, %2 [v,SvB,v,cV;vop2 ,8] ^ [v,SvA,v,Sg;vop3b,8] ^ }) @@ -1560,7 +1547,7 @@ (match_dup 1)) (match_dup 1))))] "" - "{v_addc%^_u32|v_add_co_ci_u32}\t%0, %4, %2, %1, %3" + 
"{v_addc_co_u32|v_add_co_ci_u32}\t%0, %4, %2, %1, %3" [(set_attr "type" "vop2,vop3b") (set_attr "length" "4,8")]) @@ -1572,8 +1559,8 @@ (clobber (reg:DI VCC_REG))] "" "@ - v_sub%^_u32\t%0, vcc, %1, %2 - v_subrev%^_u32\t%0, vcc, %2, %1" + v_sub_co_u32\t%0, vcc, %1, %2 + v_subrev_co_u32\t%0, vcc, %2, %1" [(set_attr "type" "vop2") (set_attr "length" "8,8")]) @@ -1587,10 +1574,10 @@ (match_dup 1)))] "" "@ - v_sub%^_u32\t%0, %3, %1, %2 - v_sub%^_u32\t%0, %3, %1, %2 - v_subrev%^_u32\t%0, %3, %2, %1 - v_subrev%^_u32\t%0, %3, %2, %1" + v_sub_co_u32\t%0, %3, %1, %2 + v_sub_co_u32\t%0, %3, %1, %2 + v_subrev_co_u32\t%0, %3, %2, %1 + v_subrev_co_u32\t%0, %3, %2, %1" [(set_attr "type" "vop2,vop3b,vop2,vop3b") (set_attr "length" "8")]) @@ -1625,10 +1612,10 @@ (match_dup 1))))] "" "@ - {v_subb%^_u32|v_sub_co_ci_u32}\t%0, %4, %1, %2, %3 - {v_subb%^_u32|v_sub_co_ci_u32}\t%0, %4, %1, %2, %3 - {v_subbrev%^_u32|v_subrev_co_ci_u32}\t%0, %4, %2, %1, %3 - {v_subbrev%^_u32|v_subrev_co_ci_u32}\t%0, %4, %2, %1, %3" + {v_subb_co_u32|v_sub_co_ci_u32}\t%0, %4, %1, %2, %3 + {v_subb_co_u32|v_sub_co_ci_u32}\t%0, %4, %1, %2, %3 + {v_subbrev_co_u32|v_subrev_co_ci_u32}\t%0, %4, %2, %1, %3 + {v_subbrev_co_u32|v_subrev_co_ci_u32}\t%0, %4, %2, %1, %3" [(set_attr "type" "vop2,vop3b,vop2,vop3b") (set_attr "length" "4,8,4,8")]) @@ -4297,10 +4284,7 @@ (match_operand:V_1REG 2 "register_operand" "v") (match_operand:SI 3 "const_int_operand" "n")] REDUC_UNSPEC))] - ; GCN3 requires a carry out, GCN5 not - "!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode) - && <reduc_unspec> == UNSPEC_PLUS_DPP_SHR) - && TARGET_DPP_FULL" + "TARGET_DPP_FULL" { return gcn_expand_dpp_shr_insn (<MODE>mode, "<reduc_insn>", <reduc_unspec>, INTVAL (operands[3])); @@ -4347,7 +4331,7 @@ (clobber (reg:DI VCC_REG))] "TARGET_DPP_FULL" { - return gcn_expand_dpp_shr_insn (<VnSI>mode, "v_add%^_u32", + return gcn_expand_dpp_shr_insn (<VnSI>mode, "v_add_co_u32", UNSPEC_PLUS_CARRY_DPP_SHR, INTVAL (operands[3])); } @@ -4365,7 +4349,7 @@ (clobber (reg:DI VCC_REG))] "TARGET_DPP_FULL" { - return gcn_expand_dpp_shr_insn (<MODE>mode, "v_addc%^_u32", + return gcn_expand_dpp_shr_insn (<MODE>mode, "v_addc_co_u32", UNSPEC_PLUS_CARRY_IN_DPP_SHR, INTVAL (operands[3])); } diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index 17316a7..d078392 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -68,12 +68,12 @@ static bool ext_gcn_constants_init = 0; /* Holds the ISA variant, derived from the command line parameters. */ -enum gcn_isa gcn_isa = ISA_GCN3; /* Default to GCN3. */ +enum gcn_isa gcn_isa = ISA_GCN5; /* Default to GCN5. */ /* Reserve this much space for LDS (for propagating variables from worker-single mode to worker-partitioned mode), per workgroup. Global analysis could calculate an exact bound, but we don't do that yet. - + We want to permit full occupancy, so size accordingly. */ /* Use this as a default, but allow it to grow if the user requests a large @@ -98,6 +98,15 @@ static hash_map<tree, int> lds_allocs; #define MAX_NORMAL_VGPR_COUNT 24 #define MAX_NORMAL_AVGPR_COUNT 24 +/* Import all the data from gcn-devices.def. + The PROCESSOR_GFXnnn should be indices for this table. */ +const struct gcn_device_def gcn_devices[] = { +#define GCN_DEVICE(name, NAME, ELF, ISA, XNACK, SRAMECC, WAVE64, CU, VGPRS, GEN_VER,ARCH_FAM) \ + {PROCESSOR_ ## NAME, #name, #NAME, ISA, XNACK, SRAMECC, WAVE64, CU, VGPRS, \ + GEN_VER, #ARCH_FAM}, +#include "gcn-devices.def" +}; + /* }}} */ /* {{{ Initialization and options. 
*/ @@ -118,7 +127,7 @@ gcn_init_machine_status (void) } /* Implement TARGET_OPTION_OVERRIDE. - + Override option settings where defaults are variable, or we have specific needs to consider. */ @@ -133,18 +142,8 @@ gcn_option_override (void) if (!flag_pic) flag_pic = flag_pie; - gcn_isa = (gcn_arch == PROCESSOR_FIJI ? ISA_GCN3 - : gcn_arch == PROCESSOR_VEGA10 ? ISA_GCN5 - : gcn_arch == PROCESSOR_VEGA20 ? ISA_GCN5 - : gcn_arch == PROCESSOR_GFX908 ? ISA_CDNA1 - : gcn_arch == PROCESSOR_GFX90a ? ISA_CDNA2 - : gcn_arch == PROCESSOR_GFX90c ? ISA_GCN5 - : gcn_arch == PROCESSOR_GFX1030 ? ISA_RDNA2 - : gcn_arch == PROCESSOR_GFX1036 ? ISA_RDNA2 - : gcn_arch == PROCESSOR_GFX1100 ? ISA_RDNA3 - : gcn_arch == PROCESSOR_GFX1103 ? ISA_RDNA3 - : ISA_UNKNOWN); - gcc_assert (gcn_isa != ISA_UNKNOWN); + gcc_assert (gcn_arch >= 0 && gcn_arch < PROCESSOR_COUNT); + gcn_isa = gcn_devices[gcn_arch].isa; /* Reserve 1Kb (somewhat arbitrarily) of LDS space for reduction results and worker broadcasts. */ @@ -164,23 +163,14 @@ gcn_option_override (void) acc_lds_size = 32768; } - /* gfx803 "Fiji", gfx1030 and gfx1100 do not support XNACK. */ - if (gcn_arch == PROCESSOR_FIJI - || gcn_arch == PROCESSOR_GFX1030 - || gcn_arch == PROCESSOR_GFX1036 - || gcn_arch == PROCESSOR_GFX1100 - || gcn_arch == PROCESSOR_GFX1103) + /* gfx1030 and gfx1100 do not support XNACK. */ + if (gcn_devices[gcn_arch].xnack_default == HSACO_ATTR_UNSUPPORTED) { if (flag_xnack == HSACO_ATTR_ON) error ("%<-mxnack=on%> is incompatible with %<-march=%s%>", - (gcn_arch == PROCESSOR_FIJI ? "fiji" - : gcn_arch == PROCESSOR_GFX1030 ? "gfx1030" - : gcn_arch == PROCESSOR_GFX1036 ? "gfx1036" - : gcn_arch == PROCESSOR_GFX1100 ? "gfx1100" - : gcn_arch == PROCESSOR_GFX1103 ? "gfx1103" - : NULL)); - /* Allow HSACO_ATTR_ANY silently because that's the default. */ - flag_xnack = HSACO_ATTR_OFF; + gcn_devices[gcn_arch].name); + /* Allow HSACO_ATTR_ANY silently. */ + flag_xnack = HSACO_ATTR_UNSUPPORTED; } /* There's no need for XNACK on devices without USM, and there are register @@ -188,24 +178,10 @@ gcn_option_override (void) available. FIXME: can the regalloc mean the default can be really "any"? */ if (flag_xnack == HSACO_ATTR_DEFAULT) - switch (gcn_arch) - { - case PROCESSOR_FIJI: - case PROCESSOR_VEGA10: - case PROCESSOR_VEGA20: - case PROCESSOR_GFX908: - flag_xnack = HSACO_ATTR_OFF; - break; - case PROCESSOR_GFX90a: - case PROCESSOR_GFX90c: - flag_xnack = HSACO_ATTR_ANY; - break; - default: - gcc_unreachable (); - } + flag_xnack = gcn_devices[gcn_arch].xnack_default; if (flag_sram_ecc == HSACO_ATTR_DEFAULT) - flag_sram_ecc = HSACO_ATTR_ANY; + flag_sram_ecc = gcn_devices[gcn_arch].sramecc_default; } /* }}} */ @@ -270,7 +246,7 @@ static const long default_requested_args /* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())). This function also sets the default values for some arguments. - + Return true on success, with ARGS populated. */ static bool @@ -367,7 +343,7 @@ gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args, } /* Referenced by TARGET_ATTRIBUTE_TABLE. - + Validates target specific attributes. */ static tree @@ -397,7 +373,7 @@ gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name, } /* Implement TARGET_ATTRIBUTE_TABLE. - + Create target-specific __attribute__ types. */ TARGET_GNU_ATTRIBUTES (gcn_attribute_table, { @@ -515,7 +491,7 @@ VnMODE (int n, machine_mode mode) } /* Implement TARGET_CLASS_MAX_NREGS. - + Return the number of hard registers needed to hold a value of MODE in a register of class RCLASS. 
*/ @@ -550,7 +526,7 @@ gcn_class_max_nregs (reg_class_t rclass, machine_mode mode) } /* Implement TARGET_HARD_REGNO_NREGS. - + Return the number of hard registers needed to hold a value of MODE in REGNO. */ @@ -561,7 +537,7 @@ gcn_hard_regno_nregs (unsigned int regno, machine_mode mode) } /* Implement TARGET_HARD_REGNO_MODE_OK. - + Return true if REGNO can hold value in MODE. */ bool @@ -642,7 +618,7 @@ gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode) } /* Implement REGNO_REG_CLASS via gcn.h. - + Return smallest class containing REGNO. */ enum reg_class @@ -677,7 +653,7 @@ gcn_regno_reg_class (int regno) } /* Implement TARGET_CAN_CHANGE_MODE_CLASS. - + GCC assumes that lowpart contains first part of value as stored in memory. This is not the case for vector registers. */ @@ -709,7 +685,7 @@ gcn_can_change_mode_class (machine_mode from, machine_mode to, } /* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P. - + When this hook returns true for MODE, the compiler allows registers explicitly used in the rtl to be used as spill registers but prevents the compiler from extending the lifetime of these @@ -723,7 +699,7 @@ gcn_small_register_classes_for_mode_p (machine_mode mode) } /* Implement TARGET_CLASS_LIKELY_SPILLED_P. - + Returns true if pseudos that have been assigned to registers of class RCLASS would likely be spilled because registers of RCLASS are needed for spill registers. */ @@ -736,7 +712,7 @@ gcn_class_likely_spilled_p (reg_class_t rclass) } /* Implement TARGET_MODES_TIEABLE_P. - + Returns true if a value of MODE1 is accessible in MODE2 without copying. */ @@ -758,7 +734,7 @@ gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2) } /* Implement TARGET_TRULY_NOOP_TRUNCATION. - + Returns true if it is safe to “convert” a value of INPREC bits to one of OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on it as if it had only OUTPREC bits. */ @@ -845,7 +821,7 @@ gcn_can_split_p (machine_mode, rtx op) } /* Implement TARGET_SPILL_CLASS. - + Return class of registers which could be used for pseudo of MODE and of class RCLASS for spilling instead of memory. Return NO_REGS if it is not possible or non-profitable. */ @@ -861,7 +837,7 @@ gcn_spill_class (reg_class_t c, machine_mode /*mode */ ) } /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS. - + Change allocno class for given pseudo from allocno and best class calculated by IRA. */ @@ -1156,7 +1132,7 @@ gcn_constant64_p (rtx x) } /* Implement TARGET_LEGITIMATE_CONSTANT_P. - + Returns true if X is a legitimate constant for a MODE immediate operand. */ bool @@ -1249,7 +1225,7 @@ gcn_gen_undef (machine_mode mode) GEN_VNM - create accessor functions for all sizes of all modes GEN_VN_NOEXEC - for insns without "_exec" variants GEN_VNM_NOEXEC - likewise - + E.g. add<mode>3 GEN_VNM (add, 3, A(rtx dest, rtx s1, rtx s2), A(dest, s1, s2) @@ -1532,8 +1508,7 @@ gcn_flat_address_p (rtx x, machine_mode mode) if (!vec_mode && gcn_vec_address_register_p (x, DImode, false)) return true; - if (TARGET_FLAT_OFFSETS - && GET_CODE (x) == PLUS + if (GET_CODE (x) == PLUS && gcn_vec_address_register_p (XEXP (x, 0), DImode, false) && CONST_INT_P (XEXP (x, 1))) return true; @@ -1631,7 +1606,7 @@ gcn_global_address_p (rtx addr) } /* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P. - + Recognizes RTL expressions that are valid memory addresses for an instruction. The MODE argument is the machine mode for the MEM expression that wants to use this address. 
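For reference, the flat/global offset rule that the gcn-valu.md gather/scatter conditions and the gcn.cc address hooks above converge on, once the GCN3 no-offset path is removed, can be sketched as standalone C++. The names below (addr_form, legitimate_flat_address_p's simplified signature) are hypothetical illustrations only; the real checks live in gcn_addr_space_legitimate_address_p and in the insn conditions shown earlier, which also distinguish scalar from vector base registers.

  #include <cstdint>

  /* Hypothetical summary of an address: a base register plus an
     optional constant byte offset.  */
  struct addr_form { bool has_const_offset; int64_t offset; };

  /* FLAT instructions take an unsigned 12-bit offset field.  */
  bool flat_offset_ok (int64_t offset)
  {
    return (uint64_t) offset < 0x1000;
  }

  /* GLOBAL instructions take a signed 13-bit offset field, hence the
     shifted unsigned comparison used in the insn conditions.  */
  bool global_offset_ok (int64_t offset)
  {
    return ((uint64_t) offset + 0x1000) < 0x2000;
  }

  /* A bare base register is always acceptable; with an offset, the
     limit depends on whether the access uses the global or the flat
     address space.  */
  bool legitimate_flat_address_p (const addr_form &a, bool global_as)
  {
    if (!a.has_const_offset)
      return true;
    return global_as ? global_offset_ok (a.offset)
                     : flat_offset_ok (a.offset);
  }

With GCN3 gone there is no remaining case where a flat access must be forced through a bare register, which is why the TARGET_FLAT_OFFSETS tests and the force_reg fallback in gcn_addr_space_legitimize_address could be dropped in the hunks that follow.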
@@ -1644,10 +1619,6 @@ static bool gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict, addr_space_t as, code_helper = ERROR_MARK) { - /* All vector instructions need to work on addresses in registers. */ - if (!TARGET_FLAT_OFFSETS && (vgpr_vector_mode_p (mode) && !REG_P (x))) - return false; - if (AS_SCALAR_FLAT_P (as)) { if (mode == QImode || mode == HImode) @@ -1693,15 +1664,13 @@ gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict, return gcn_address_register_p (x, SImode, strict); else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as)) { - if (!TARGET_FLAT_OFFSETS || GET_CODE (x) == REG) + if (GET_CODE (x) == REG) return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) ? gcn_address_register_p (x, DImode, strict) : gcn_vec_address_register_p (x, DImode, strict)); else { - gcc_assert (TARGET_FLAT_OFFSETS); - if (GET_CODE (x) == PLUS) { rtx x1 = XEXP (x, 1); @@ -1725,8 +1694,6 @@ gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict, } else if (AS_GLOBAL_P (as)) { - gcc_assert (TARGET_FLAT_OFFSETS); - if (GET_CODE (x) == REG) return (gcn_address_register_p (x, DImode, strict) || (!VECTOR_MODE_P (mode) @@ -1819,7 +1786,7 @@ gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict, } /* Implement TARGET_ADDR_SPACE_POINTER_MODE. - + Return the appropriate mode for a named address pointer. */ static scalar_int_mode @@ -1842,7 +1809,7 @@ gcn_addr_space_pointer_mode (addr_space_t addrspace) } /* Implement TARGET_ADDR_SPACE_ADDRESS_MODE. - + Return the appropriate mode for a named address space address. */ static scalar_int_mode @@ -1852,7 +1819,7 @@ gcn_addr_space_address_mode (addr_space_t addrspace) } /* Implement TARGET_ADDR_SPACE_SUBSET_P. - + Determine if one named address space is a subset of another. */ static bool @@ -1930,7 +1897,7 @@ gcn_addr_space_debug (addr_space_t as) /* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h - + Retun true if REGNO is OK for memory adressing. */ bool @@ -1963,7 +1930,7 @@ gcn_regno_mode_code_ok_for_base_p (int regno, } /* Implement MODE_CODE_BASE_REG_CLASS via gcn.h. - + Return a suitable register class for memory addressing. */ reg_class @@ -1994,7 +1961,7 @@ gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc, } /* Implement REGNO_OK_FOR_INDEX_P via gcn.h. - + Return true if REGNO is OK for index of memory addressing. */ bool @@ -2203,7 +2170,7 @@ gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode, case ADDR_SPACE_FLAT: case ADDR_SPACE_FLAT_SCRATCH: case ADDR_SPACE_GLOBAL: - return !TARGET_FLAT_OFFSETS ? force_reg (DImode, x) : x; + return x; case ADDR_SPACE_LDS: case ADDR_SPACE_GDS: /* FIXME: LDS support offsets, handle them!. */ @@ -2241,13 +2208,6 @@ gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem, rtx mem_base = XEXP (mem, 0); rtx mem_index = NULL_RTX; - if (!TARGET_FLAT_OFFSETS) - { - /* gcn_addr_space_legitimize_address should have put the address in a - register. If not, it is too late to do anything about it. */ - gcc_assert (REG_P (mem_base)); - } - if (GET_CODE (mem_base) == PLUS) { mem_index = XEXP (mem_base, 1); @@ -2669,7 +2629,7 @@ gcn_valid_move_p (machine_mode mode, rtx dest, rtx src) /* {{{ Functions and ABI. */ /* Implement TARGET_FUNCTION_VALUE. - + Define how to find the value returned by a function. The register location is always the same, but the mode depends on VALTYPE. 
*/ @@ -2688,7 +2648,7 @@ gcn_function_value (const_tree valtype, const_tree, bool) } /* Implement TARGET_FUNCTION_VALUE_REGNO_P. - + Return true if N is a possible register number for the function return value. */ @@ -2734,7 +2694,7 @@ gcn_strict_argument_naming (cumulative_args_t cum_v) } /* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED. - + See comment on gcn_strict_argument_naming. */ static bool @@ -2744,7 +2704,7 @@ gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v) } /* Implement TARGET_FUNCTION_ARG. - + Return an RTX indicating whether a function argument is passed in a register and if so, which register. */ @@ -2806,7 +2766,7 @@ gcn_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) } /* Implement TARGET_FUNCTION_ARG_ADVANCE. - + Updates the summarizer variable pointed to by CUM_V to advance past an argument in the argument list. */ @@ -2844,7 +2804,7 @@ gcn_function_arg_advance (cumulative_args_t cum_v, } /* Implement TARGET_ARG_PARTIAL_BYTES. - + Returns the number of bytes at the beginning of an argument that must be put in registers. The value must be zero for arguments that are passed entirely in registers or that are entirely pushed on the stack. */ @@ -2896,7 +2856,7 @@ gcn_detect_incoming_pointer_arg (tree fndecl) } /* Implement INIT_CUMULATIVE_ARGS, via gcn.h. - + Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function whose data type is FNTYPE. For a library call, FNTYPE is 0. */ @@ -2973,7 +2933,7 @@ gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype)) } /* Implement TARGET_PROMOTE_FUNCTION_MODE. - + Return the mode to use for outgoing function arguments. */ machine_mode @@ -2989,7 +2949,7 @@ gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode, } /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. - + Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle ARGS_GROW_DOWNWARDS. */ @@ -3050,27 +3010,7 @@ gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, case omp_device_arch: return strcmp (name, "amdgcn") == 0 || strcmp (name, "gcn") == 0; case omp_device_isa: - if (strcmp (name, "fiji") == 0 || strcmp (name, "gfx803") == 0) - return gcn_arch == PROCESSOR_FIJI; - if (strcmp (name, "gfx900") == 0) - return gcn_arch == PROCESSOR_VEGA10; - if (strcmp (name, "gfx906") == 0) - return gcn_arch == PROCESSOR_VEGA20; - if (strcmp (name, "gfx908") == 0) - return gcn_arch == PROCESSOR_GFX908; - if (strcmp (name, "gfx90a") == 0) - return gcn_arch == PROCESSOR_GFX90a; - if (strcmp (name, "gfx90c") == 0) - return gcn_arch == PROCESSOR_GFX90c; - if (strcmp (name, "gfx1030") == 0) - return gcn_arch == PROCESSOR_GFX1030; - if (strcmp (name, "gfx1036") == 0) - return gcn_arch == PROCESSOR_GFX1036; - if (strcmp (name, "gfx1100") == 0) - return gcn_arch == PROCESSOR_GFX1100; - if (strcmp (name, "gfx1103") == 0) - return gcn_arch == PROCESSOR_GFX1103; - return 0; + return strcmp (name, gcn_devices[gcn_arch].name) == 0; default: gcc_unreachable (); } @@ -3114,7 +3054,7 @@ gcn_compute_frame_offsets (void) /* Insert code into the prologue or epilogue to store or load any callee-save register to/from the stack. - + Helper function for gcn_expand_prologue and gcn_expand_epilogue. */ static void @@ -3562,17 +3502,6 @@ gcn_expand_prologue () /* Ensure that the scheduler doesn't do anything unexpected. */ emit_insn (gen_blockage ()); - if (TARGET_M0_LDS_LIMIT) - { - /* m0 is initialized for the usual LDS DS and FLAT memory case. 
- The low-part is the address of the topmost addressable byte, which is - size-1. The high-part is an offset and should be zero. */ - emit_move_insn (gen_rtx_REG (SImode, M0_REG), - gen_int_mode (LDS_SIZE, SImode)); - - emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG))); - } - if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp) { /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */ @@ -3682,10 +3611,10 @@ gcn_frame_pointer_rqd (void) } /* Implement TARGET_CAN_ELIMINATE. - + Return true if the compiler is allowed to try to replace register number FROM_REG with register number TO_REG. - + FIXME: is the default "true" not enough? Should this be a negative set? */ bool @@ -3696,7 +3625,7 @@ gcn_can_eliminate_p (int /*from_reg */ , int to_reg) } /* Implement INITIAL_ELIMINATION_OFFSET. - + Returns the initial difference between the specified pair of registers, in terms of stack position. */ @@ -3763,7 +3692,7 @@ gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg) } /* Implement HARD_REGNO_CALLER_SAVE_MODE. - + Which mode is required for saving NREGS of a pseudo-register in call-clobbered hard register REGNO. */ @@ -3872,7 +3801,7 @@ gcn_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0, rtx op1, /* {{{ Miscellaneous. */ /* Implement TARGET_CANNOT_COPY_INSN_P. - + Return true if INSN must not be duplicated. */ static bool @@ -3964,7 +3893,7 @@ gcn_emutls_var_init (tree, tree decl, tree) /* {{{ Costs. */ /* Implement TARGET_RTX_COSTS. - + Compute a (partial) cost for rtx X. Return true if the complete cost has been computed, and false if subexpressions should be scanned. In either case, *TOTAL contains the cost result. */ @@ -4001,7 +3930,7 @@ gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool) } /* Implement TARGET_MEMORY_MOVE_COST. - + Return the cost of moving data of mode M between a register and memory. A value of 2 is the default; this cost is relative to those in `REGISTER_MOVE_COST'. @@ -4063,7 +3992,7 @@ gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in) } /* Implement TARGET_REGISTER_MOVE_COST. - + Return the cost of moving data from a register in class CLASS1 to one in class CLASS2. Base value is 2. */ @@ -4186,7 +4115,7 @@ struct gcn_builtin_description gcn_builtins[] = { static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX]; /* Implement TARGET_BUILTIN_DECL. - + Return the GCN builtin for CODE. */ tree @@ -4238,7 +4167,7 @@ gcn_init_builtin_types (void) } /* Implement TARGET_INIT_BUILTINS. - + Set up all builtin functions for this target. */ static void @@ -4526,7 +4455,7 @@ gcn_init_libfuncs (void) /* Expand the CMP_SWAP GCN builtins. We have our own versions that do not require taking the address of any object, other than the memory cell being operated on. - + Helper function for gcn_expand_builtin_1. */ static rtx @@ -5030,7 +4959,7 @@ gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ , } /* Implement TARGET_EXPAND_BUILTIN. - + Expand an expression EXP that calls a built-in function, with result going to TARGET if that's convenient (and in mode MODE if that's convenient). SUBTARGET may be used as the target for computing one of EXP's operands. @@ -5070,7 +4999,7 @@ gcn_vectorize_get_mask_mode (machine_mode) /* Return an RTX that references a vector with the i-th lane containing PERM[i]*4. - + Helper function for gcn_vectorize_vec_perm_const. 
*/ static rtx @@ -5107,9 +5036,9 @@ gcn_make_vec_perm_address (unsigned int *perm, int nelt) } /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. - + Return true if permutation with SEL is possible. - + If DST/SRC0/SRC1 are non-null, emit the instructions to perform the permutations. */ @@ -5200,7 +5129,7 @@ gcn_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, } /* Implements TARGET_VECTOR_MODE_SUPPORTED_P. - + Return nonzero if vector MODE is supported with at least move instructions. */ @@ -5597,8 +5526,7 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec) || unspec == UNSPEC_UMAX_DPP_SHR); bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR && GET_MODE_CLASS (mode) == MODE_VECTOR_INT - /* FIXME: why GCN3? */ - && (TARGET_GCN3 || scalar_mode == DImode); + && scalar_mode == DImode; if (use_plus_carry) unspec = UNSPEC_PLUS_CARRY_DPP_SHR; @@ -6195,7 +6123,7 @@ gcn_md_reorg (void) CLEAR_REG_SET (&live); /* "Manually Inserted Wait States (NOPs)." - + GCN hardware detects most kinds of register dependencies, but there are some exceptions documented in the ISA manual. This pass detects the missed cases, and inserts the documented number of NOPs @@ -6506,7 +6434,7 @@ gcn_fork_join (gcall *call, const int dims[], bool is_fork) /* Implement ??????? FIXME make this a real hook. - + Adjust FNDECL such that options inherited from the host compiler are made appropriate for the accelerator compiler. */ @@ -6569,7 +6497,7 @@ gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo, /* {{{ ASM Output. */ /* Implement TARGET_ASM_FILE_START. - + Print assembler file header text. */ static void @@ -6579,68 +6507,20 @@ output_file_start (void) configuration. */ const char *xnack = (flag_xnack == HSACO_ATTR_ON ? ":xnack+" : flag_xnack == HSACO_ATTR_OFF ? ":xnack-" - : ""); + : "" /* Unsupported or "any". */); const char *sram_ecc = (flag_sram_ecc == HSACO_ATTR_ON ? ":sramecc+" : flag_sram_ecc == HSACO_ATTR_OFF ? ":sramecc-" - : ""); - - const char *cpu; - switch (gcn_arch) - { - case PROCESSOR_FIJI: - cpu = "gfx803"; - xnack = ""; - sram_ecc = ""; - break; - case PROCESSOR_VEGA10: - cpu = "gfx900"; - sram_ecc = ""; - break; - case PROCESSOR_VEGA20: - cpu = "gfx906"; - sram_ecc = ""; - break; - case PROCESSOR_GFX908: - cpu = "gfx908"; - break; - case PROCESSOR_GFX90a: - cpu = "gfx90a"; - break; - case PROCESSOR_GFX90c: - cpu = "gfx90c"; - sram_ecc = ""; - break; - case PROCESSOR_GFX1030: - cpu = "gfx1030"; - xnack = ""; - sram_ecc = ""; - break; - case PROCESSOR_GFX1036: - cpu = "gfx1036"; - xnack = ""; - sram_ecc = ""; - break; - case PROCESSOR_GFX1100: - cpu = "gfx1100"; - xnack = ""; - sram_ecc = ""; - break; - case PROCESSOR_GFX1103: - cpu = "gfx1103"; - xnack = ""; - sram_ecc = ""; - break; - default: gcc_unreachable (); - } + : "" /* Unsupported or "any". */); + const char *cpu = gcn_devices[gcn_arch].name; fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s%s%s\"\n", cpu, sram_ecc, xnack); } /* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h. - + Print the initial definition of a function name. - + For GCN kernel entry points this includes all the HSA meta-data, special alignment constraints that don't apply to regular functions, and magic comments that pass information to mkoffload. 
*/ @@ -6778,11 +6658,13 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, if (!TARGET_ARCHITECTED_FLAT_SCRATCH) fprintf (file, "\t .amdhsa_reserve_flat_scratch\t0\n"); - if (gcn_arch == PROCESSOR_GFX90a) + if (TARGET_AVGPR_COMBINED) fprintf (file, - "\t .amdhsa_accum_offset\t%i\n" - "\t .amdhsa_tg_split\t0\n", + "\t .amdhsa_accum_offset\t%i\n", vgpr); /* The AGPRs come after the VGPRs. */ + if (TARGET_TGSPLIT) + fprintf (file, + "\t .amdhsa_tg_split\t0\n"); fputs ("\t.end_amdhsa_kernel\n", file); #if 1 @@ -6813,7 +6695,7 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, (TARGET_WAVE64_COMPAT ? " ; wavefrontsize64 counts double on SIMD32" : "")); - if (gcn_arch == PROCESSOR_GFX90a || gcn_arch == PROCESSOR_GFX908) + if (TARGET_AVGPRS) fprintf (file, " .agpr_count: %i\n", avgpr); fputs (" .end_amdgpu_metadata\n", file); #endif @@ -6855,7 +6737,7 @@ gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align) } /* Implement TARGET_ASM_FUNCTION_PROLOGUE. - + Emits custom text into the assembler file at the head of each function. */ static void @@ -7019,7 +6901,7 @@ gcn_asm_output_symbol_ref (FILE *file, rtx x) } /* Implement TARGET_CONSTANT_ALIGNMENT. - + Returns the alignment in bits of a constant that is being placed in memory. CONSTANT is the constant and BASIC_ALIGN is the alignment that the object would ordinarily have. */ @@ -7070,15 +6952,10 @@ print_operand_address (FILE *file, rtx mem) if (GET_CODE (addr) == REG) print_reg (file, addr); else - { - gcc_assert (TARGET_FLAT_OFFSETS); - print_reg (file, XEXP (addr, 0)); - } + print_reg (file, XEXP (addr, 0)); } else if (AS_GLOBAL_P (as)) { - gcc_assert (TARGET_GLOBAL_ADDRSPACE); - rtx base = addr; rtx vgpr_offset = NULL_RTX; @@ -7190,7 +7067,6 @@ print_operand_address (FILE *file, rtx mem) E - print conditional code for v_cmp (eq_u64/ne_u64...) A - print address in formatting suitable for given address space. O - print offset:n for data share operations. - ^ - print "_co" suffix for GCN5 mnemonics g - print "glc", if appropriate for given MEM L - print low-part of a multi-reg value H - print second part of a multi-reg value (high-part of 2-reg value) @@ -7439,8 +7315,6 @@ print_operand (FILE *file, rtx x, int code) rtx x0 = XEXP (x, 0); if (AS_GLOBAL_P (MEM_ADDR_SPACE (x))) { - gcc_assert (TARGET_GLOBAL_ADDRSPACE); - fprintf (file, ", "); rtx base = x0; @@ -7809,10 +7683,6 @@ print_operand (FILE *file, rtx x, int code) else output_addr_const (file, x); return; - case '^': - if (TARGET_EXPLICIT_CARRY) - fputs ("_co", file); - return; case 'g': gcc_assert (xcode == MEM); if (MEM_VOLATILE_P (x)) @@ -7825,7 +7695,7 @@ print_operand (FILE *file, rtx x, int code) } /* Implement DEBUGGER_REGNO macro. - + Return the DWARF register number that corresponds to the GCC internal REGNO. */ @@ -7864,7 +7734,7 @@ gcn_dwarf_register_number (unsigned int regno) } /* Implement TARGET_DWARF_REGISTER_SPAN. - + DImode and Vector DImode require additional registers. */ static rtx diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h index bd2afa6..30a144b 100644 --- a/gcc/config/gcn/gcn.h +++ b/gcc/config/gcn/gcn.h @@ -16,13 +16,28 @@ #include "config/gcn/gcn-opts.h" +extern const struct gcn_device_def { + enum processor_type id; + const char *name; + const char *NAME; + enum gcn_isa isa; + + /* Features. 
*/ + enum hsaco_attr_type xnack_default; + enum hsaco_attr_type sramecc_default; + enum hsaco_attr_type wave64_default; + enum hsaco_attr_type cumode_default; + int max_isa_vgprs; + unsigned generic_version; + const char *arch_family; +} gcn_devices[]; + #define TARGET_CPU_CPP_BUILTINS() \ do \ { \ + builtin_define ("__AMDGPU__"); \ builtin_define ("__AMDGCN__"); \ - if (TARGET_GCN3) \ - builtin_define ("__GCN3__"); \ - else if (TARGET_GCN5) \ + if (TARGET_GCN5) \ builtin_define ("__GCN5__"); \ else if (TARGET_CDNA1) \ builtin_define ("__CDNA1__"); \ @@ -34,31 +49,27 @@ builtin_define ("__RDNA3__"); \ else \ gcc_unreachable (); \ - if (TARGET_FIJI) \ - { \ - builtin_define ("__fiji__"); \ - builtin_define ("__gfx803__"); \ - } \ - else if (TARGET_VEGA10) \ - builtin_define ("__gfx900__"); \ - else if (TARGET_VEGA20) \ - builtin_define ("__gfx906__"); \ - else if (TARGET_GFX908) \ - builtin_define ("__gfx908__"); \ - else if (TARGET_GFX90a) \ - builtin_define ("__gfx90a__"); \ - else if (TARGET_GFX90c) \ - builtin_define ("__gfx90c__"); \ - else if (TARGET_GFX1030) \ - builtin_define ("__gfx1030__"); \ - else if (TARGET_GFX1036) \ - builtin_define ("__gfx1036__"); \ - else if (TARGET_GFX1100) \ - builtin_define ("__gfx1100__"); \ - else if (TARGET_GFX1103) \ - builtin_define ("__gfx1103__"); \ - else \ - gcc_unreachable (); \ + char *name = (char *)xmalloc (strlen (gcn_devices[gcn_arch].name) + 5); \ + sprintf (name, "__%s__", gcn_devices[gcn_arch].name); \ + char *p; \ + if (gcn_devices[gcn_arch].generic_version) \ + while ((p = strchr(name, '-'))) \ + *p = '_'; \ + builtin_define (name); \ + name = (char *)xmalloc (strlen (gcn_devices[gcn_arch].arch_family) + 5); \ + sprintf (name, "__%s__", gcn_devices[gcn_arch].arch_family); \ + builtin_define (name); \ + name = (char *)xmalloc (strlen ("__amdgcn_target_id__") + \ + strlen (gcn_devices[gcn_arch].name) + 4); \ + sprintf (name, "__amdgcn_target_id__=\"%s\"", gcn_devices[gcn_arch].name); \ + builtin_define (name); \ + name = (char *)xmalloc (strlen ("__amdgcn_processor__") + \ + strlen (gcn_devices[gcn_arch].name) + 4); \ + sprintf (name, "__amdgcn_processor__=\"%s\"", gcn_devices[gcn_arch].name); \ + if (gcn_devices[gcn_arch].generic_version) \ + while ((p = strchr(name, '-'))) \ + *p = '_'; \ + builtin_define (name); \ } while (0) #define ASSEMBLER_DIALECT (TARGET_RDNA2_PLUS ? 1 : 0) @@ -198,7 +209,7 @@ STATIC_ASSERT (LAST_AVGPR_REG + 1 - FIRST_AVGPR_REG == 256); #define HARD_FRAME_POINTER_IS_ARG_POINTER 0 #define HARD_FRAME_POINTER_IS_FRAME_POINTER 0 -#define SGPR_REGNO_P(N) ((N) >= FIRST_SGPR_REG && (N) <= LAST_SGPR_REG) +#define SGPR_REGNO_P(N) (/*(N) >= FIRST_SGPR_REG &&*/ (N) <= LAST_SGPR_REG) #define VGPR_REGNO_P(N) ((N) >= FIRST_VGPR_REG && (N) <= LAST_VGPR_REG) #define AVGPR_REGNO_P(N) ((N) >= FIRST_AVGPR_REG && (N) <= LAST_AVGPR_REG) #define SSRC_REGNO_P(N) ((N) <= SCC_REG && (N) != VCCZ_REG) @@ -581,8 +592,7 @@ enum gcn_address_spaces c_register_addr_space ("__global", ADDR_SPACE_GLOBAL); \ } while (0); -#define STACK_ADDR_SPACE \ - (TARGET_GCN5_PLUS ? ADDR_SPACE_GLOBAL : ADDR_SPACE_FLAT) +#define STACK_ADDR_SPACE ADDR_SPACE_GLOBAL #define DEFAULT_ADDR_SPACE \ ((cfun && cfun->machine && !cfun->machine->use_flat_addressing) \ ? ADDR_SPACE_GLOBAL : ADDR_SPACE_FLAT) diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index f223ec9..8b6ae85 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -286,7 +286,7 @@ ; Disable alternatives that only apply to specific ISA variants. 
-(define_attr "gcn_version" "gcn3,gcn5,cdna2" (const_string "gcn3")) +(define_attr "cdna" "any,cdna2" (const_string "any")) (define_attr "rdna" "any,no,yes" (const_string "any")) (define_attr "xnack" "na,off,on" (const_string "na")) @@ -298,10 +298,7 @@ (and (eq_attr "rdna" "yes") (eq (symbol_ref "TARGET_RDNA2_PLUS") (const_int 0))) (const_int 0) - (and (eq_attr "gcn_version" "gcn5") - (eq (symbol_ref "TARGET_GCN5_PLUS") (const_int 0))) - (const_int 0) - (and (eq_attr "gcn_version" "cdna2") + (and (eq_attr "cdna" "cdna2") (eq (symbol_ref "TARGET_CDNA2_PLUS") (const_int 0))) (const_int 0) (and (eq_attr "xnack" "off") @@ -568,7 +565,7 @@ [(set (match_operand:SISF 0 "nonimmediate_operand") (match_operand:SISF 1 "gcn_load_operand"))] "" - {@ [cons: =0, 1; attrs: type, exec, length, gcn_version, xnack] + {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack] [SD ,SSA ;sop1 ,* ,4 ,* ,* ] s_mov_b32\t%0, %1 [SD ,J ;sopk ,* ,4 ,* ,* ] s_movk_i32\t%0, %1 [SD ,B ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1 @@ -609,7 +606,7 @@ [(set (match_operand:QIHI 0 "nonimmediate_operand") (match_operand:QIHI 1 "gcn_load_operand"))] "gcn_valid_move_p (<MODE>mode, operands[0], operands[1])" - {@ [cons: =0, 1; attrs: type, exec, length, gcn_version, xnack] + {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack] [SD ,SSA ;sop1 ,* ,4 ,* ,* ] s_mov_b32\t%0, %1 [SD ,J ;sopk ,* ,4 ,* ,* ] s_movk_i32\t%0, %1 [SD ,B ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1 @@ -642,7 +639,7 @@ [(set (match_operand:DIDF 0 "nonimmediate_operand") (match_operand:DIDF 1 "general_operand"))] "GET_CODE(operands[1]) != SYMBOL_REF" - {@ [cons: =0, 1; attrs: type, length, gcn_version, xnack] + {@ [cons: =0, 1; attrs: type, length, cdna, xnack] [SD ,SSA ;sop1 ,4 ,* ,* ] s_mov_b64\t%0, %1 [SD ,C ;sop1 ,8 ,* ,* ] ^ [SD ,DB ;mult ,* ,* ,* ] # @@ -707,7 +704,7 @@ [(set (match_operand:TI 0 "nonimmediate_operand") (match_operand:TI 1 "general_operand" ))] "" - {@ [cons: =0, 1; attrs: type, delayeduse, length, gcn_version, xnack] + {@ [cons: =0, 1; attrs: type, delayeduse, length, cdna, xnack] [SD ,SSB;mult ,* ,* ,* ,* ] # [RS ,Sm ;smem ,* ,12,* ,* ] s_store_dwordx4\t%1, %A0 [Sm ,RS ;smem ,yes,12,* ,off] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0) @@ -1137,7 +1134,7 @@ s_add_i32\t%0, %1, %2 s_addk_i32\t%0, %2 s_add_i32\t%0, %1, %2 - v_add%^_u32\t%0, vcc, %2, %1" + v_add_co_u32\t%0, vcc, %2, %1" [(set_attr "type" "sop2,sopk,sop2,vop2") (set_attr "length" "4,4,8,8")]) @@ -1217,7 +1214,7 @@ "" "@ s_add_u32\t%0, %1, %2 - v_add%^_u32\t%0, vcc, %2, %1" + v_add_co_u32\t%0, vcc, %2, %1" [(set_attr "type" "sop2,vop2") (set_attr "length" "8,8")]) @@ -1232,7 +1229,7 @@ "INTVAL (operands[2]) == -INTVAL (operands[3])" "@ s_add_u32\t%0, %1, %2 - v_add%^_u32\t%0, vcc, %2, %1" + v_add_co_u32\t%0, vcc, %2, %1" [(set_attr "type" "sop2,vop2") (set_attr "length" "4")]) @@ -1254,7 +1251,7 @@ "" "@ s_addc_u32\t%0, %1, %2 - {v_addc%^_u32|v_add_co_ci_u32}\t%0, vcc, %2, %1, vcc" + {v_addc_co_u32|v_add_co_ci_u32}\t%0, vcc, %2, %1, vcc" [(set_attr "type" "sop2,vop2") (set_attr "length" "8,4")]) @@ -1270,7 +1267,7 @@ "" "@ s_addc_u32\t%0, %1, 0 - {v_addc%^_u32|v_add_co_ci_u32}\t%0, vcc, 0, %1, vcc" + {v_addc_co_u32|v_add_co_ci_u32}\t%0, vcc, 0, %1, vcc" [(set_attr "type" "sop2,vop2") (set_attr "length" "4")]) @@ -1299,8 +1296,8 @@ rtx new_operands[4] = { operands[0], operands[1], operands[2], gen_rtx_REG (DImode, CC_SAVE_REG) }; - output_asm_insn ("v_add%^_u32\t%L0, %3, %L2, %L1", new_operands); - output_asm_insn ("{v_addc%^_u32|v_add_co_ci_u32}\t%H0, %3, %H2, %H1, %3", + 
output_asm_insn ("v_add_co_u32\t%L0, %3, %L2, %L1", new_operands); + output_asm_insn ("{v_addc_co_u32|v_add_co_ci_u32}\t%H0, %3, %H2, %H1, %3", new_operands); } else @@ -1332,8 +1329,8 @@ "@ s_sub_i32\t%0, %1, %2 s_sub_i32\t%0, %1, %2 - v_subrev%^_u32\t%0, vcc, %2, %1 - v_sub%^_u32\t%0, vcc, %1, %2" + v_subrev_co_u32\t%0, vcc, %2, %1 + v_sub_co_u32\t%0, vcc, %1, %2" [(set_attr "type" "sop2,sop2,vop2,vop2") (set_attr "length" "4,8,8,8")]) @@ -1462,11 +1459,6 @@ (const_int 32))))] "" { - if (can_create_pseudo_p () - && !TARGET_MULTIPLY_IMMEDIATE - && !gcn_inline_immediate_operand (operands[2], SImode)) - operands[2] = force_reg (SImode, operands[2]); - if (REG_P (operands[2])) emit_insn (gen_<su>mulsi3_highpart_reg (operands[0], operands[1], operands[2])); @@ -1492,8 +1484,7 @@ s_mul_hi<sgnsuffix>0\t%0, %1, %2 v_mul_hi<sgnsuffix>0\t%0, %2, %1" [(set_attr "type" "sop2,vop3a") - (set_attr "length" "4,8") - (set_attr "gcn_version" "gcn5,*")]) + (set_attr "length" "4,8")]) (define_insn "<su>mulsi3_highpart_imm" [(set (match_operand:SI 0 "register_operand" "=Sg,Sg,v") @@ -1504,15 +1495,13 @@ (match_operand:SI 1 "register_operand" "Sg,Sg,v")) (match_operand:DI 2 "gcn_32bit_immediate_operand" "A, B,A")) (const_int 32))))] - "TARGET_MULTIPLY_IMMEDIATE - || gcn_inline_immediate_operand (operands[2], SImode)" + "" "@ s_mul_hi<sgnsuffix>0\t%0, %1, %2 s_mul_hi<sgnsuffix>0\t%0, %1, %2 v_mul_hi<sgnsuffix>0\t%0, %2, %1" [(set_attr "type" "sop2,sop2,vop3a") - (set_attr "length" "4,8,8") - (set_attr "gcn_version" "gcn5,gcn5,*")]) + (set_attr "length" "4,8,8")]) (define_expand "<su>mulsidi3" [(set (match_operand:DI 0 "register_operand" "") @@ -1522,11 +1511,6 @@ (match_operand:SI 2 "nonmemory_operand" ""))))] "" { - if (can_create_pseudo_p () - && !TARGET_MULTIPLY_IMMEDIATE - && !gcn_inline_immediate_operand (operands[2], SImode)) - operands[2] = force_reg (SImode, operands[2]); - if (REG_P (operands[2])) emit_insn (gen_<su>mulsidi3_reg (operands[0], operands[1], operands[2])); else @@ -1551,8 +1535,7 @@ emit_insn (gen_mulsi3 (dstlo, operands[1], operands[2])); emit_insn (gen_<su>mulsi3_highpart (dsthi, operands[1], operands[2])); DONE; - } - [(set_attr "gcn_version" "gcn5,*")]) + }) (define_insn_and_split "<su>mulsidi3_imm" [(set (match_operand:DI 0 "register_operand" "=&Sg,&Sg,&v") @@ -1560,8 +1543,7 @@ (match_operand:SI 1 "register_operand" "Sg, Sg, v")) (match_operand:DI 2 "gcn_32bit_immediate_operand" "A, B, A")))] - "TARGET_MULTIPLY_IMMEDIATE - || gcn_inline_immediate_operand (operands[2], SImode)" + "" "#" "&& reload_completed" [(const_int 0)] @@ -1571,8 +1553,7 @@ emit_insn (gen_mulsi3 (dstlo, operands[1], operands[2])); emit_insn (gen_<su>mulsi3_highpart (dsthi, operands[1], operands[2])); DONE; - } - [(set_attr "gcn_version" "gcn5,gcn5,*")]) + }) (define_insn_and_split "muldi3" [(set (match_operand:DI 0 "register_operand" "=&Sg,&Sg, &v,&v") @@ -1606,8 +1587,7 @@ add = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (3, add, clob1, clob2)); emit_insn (add); DONE; - } - [(set_attr "gcn_version" "gcn5,gcn5,*,*")]) + }) (define_insn "<u>mulhisi3" [(set (match_operand:SI 0 "register_operand" "=v") @@ -1994,8 +1974,7 @@ flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\t0 global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)" [(set_attr "type" "smem,flat,flat") - (set_attr "length" "12") - (set_attr "gcn_version" "gcn5,*,gcn5")]) + (set_attr "length" "12")]) ; FIXME: These patterns are disabled because the instructions don't ; seem to work as advertised. 
Specifically, OMP "team distribute" @@ -2016,8 +1995,7 @@ flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0 global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)" [(set_attr "type" "smem,flat,flat") - (set_attr "length" "12") - (set_attr "gcn_version" "gcn5,*,gcn5")]) + (set_attr "length" "12")]) (define_mode_attr x2 [(SI "DI") (DI "TI")]) (define_mode_attr size [(SI "4") (DI "8")]) @@ -2064,7 +2042,6 @@ global_atomic_cmpswap<X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)" [(set_attr "type" "smem,flat,flat") (set_attr "length" "12") - (set_attr "gcn_version" "gcn5,*,gcn5") (set_attr "delayeduse" "*,yes,yes")]) (define_insn "sync_compare_and_swap<mode>_lds_insn" @@ -2174,7 +2151,6 @@ } [(set_attr "type" "smem,flat,flat") (set_attr "length" "28") - (set_attr "gcn_version" "gcn5,*,gcn5") (set_attr "rdna" "no,*,*")]) (define_insn "atomic_store<mode>" @@ -2249,7 +2225,6 @@ } [(set_attr "type" "smem,flat,flat") (set_attr "length" "28") - (set_attr "gcn_version" "gcn5,*,gcn5") (set_attr "rdna" "no,*,*")]) (define_insn "atomic_exchange<mode>" @@ -2362,7 +2337,6 @@ } [(set_attr "type" "smem,flat,flat") (set_attr "length" "28") - (set_attr "gcn_version" "gcn5,*,gcn5") (set_attr "rdna" "no,*,*")]) ;; }}} diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt index 3317c49..57e344e 100644 --- a/gcc/config/gcn/gcn.opt +++ b/gcc/config/gcn/gcn.opt @@ -21,46 +21,12 @@ HeaderInclude config/gcn/gcn-opts.h -Enum -Name(gpu_type) Type(enum processor_type) -GCN GPU type to use: - -EnumValue -Enum(gpu_type) String(fiji) Value(PROCESSOR_FIJI) - -EnumValue -Enum(gpu_type) String(gfx900) Value(PROCESSOR_VEGA10) - -EnumValue -Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA20) - -EnumValue -Enum(gpu_type) String(gfx908) Value(PROCESSOR_GFX908) - -EnumValue -Enum(gpu_type) String(gfx90a) Value(PROCESSOR_GFX90a) - -EnumValue -Enum(gpu_type) String(gfx90c) Value(PROCESSOR_GFX90c) - -EnumValue -Enum(gpu_type) String(gfx1030) Value(PROCESSOR_GFX1030) - -EnumValue -Enum(gpu_type) String(gfx1036) Value(PROCESSOR_GFX1036) - -EnumValue -Enum(gpu_type) String(gfx1100) Value(PROCESSOR_GFX1100) - -EnumValue -Enum(gpu_type) String(gfx1103) Value(PROCESSOR_GFX1103) - march= -Target RejectNegative Negative(march=) Joined ToLower Enum(gpu_type) Var(gcn_arch) Init(PROCESSOR_FIJI) +Target RejectNegative Negative(march=) Joined ToLower Enum(gpu_type) Var(gcn_arch) Init(PROCESSOR_GFX900) Specify the name of the target GPU. mtune= -Target RejectNegative Negative(mtune=) Joined ToLower Enum(gpu_type) Var(gcn_tune) Init(PROCESSOR_FIJI) +Target RejectNegative Negative(mtune=) Joined ToLower Enum(gpu_type) Var(gcn_tune) Init(PROCESSOR_GFX900) Specify the name of the target GPU. m32 @@ -117,7 +83,7 @@ Target RejectNegative Joined ToLower Enum(hsaco_attr_type) Var(flag_xnack) Init( Compile for devices requiring XNACK enabled. Default \"any\" if USM is supported. msram-ecc= -Target RejectNegative Joined ToLower Enum(hsaco_attr_type) Var(flag_sram_ecc) Init(HSACO_ATTR_ANY) +Target RejectNegative Joined ToLower Enum(hsaco_attr_type) Var(flag_sram_ecc) Init(HSACO_ATTR_DEFAULT) Compile for devices with the SRAM ECC feature enabled, or not. Default \"any\". 
-param=gcn-preferred-vectorization-factor= diff --git a/gcc/config/gcn/gen-gcn-device-macros.awk b/gcc/config/gcn/gen-gcn-device-macros.awk new file mode 100644 index 0000000..5ecc5c4 --- /dev/null +++ b/gcc/config/gcn/gen-gcn-device-macros.awk @@ -0,0 +1,129 @@ +# Generate $objdir/gcn-device-macros.h from gcn-devices.def +# +# Copyright (C) 2024 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# <http://www.gnu.org/licenses/>. + +BEGIN { + FS= "[(,] *" + + print "/* Generated by gen-gcn-device-macros.awk from gcn-devices.def." + print " Do not edit. */" + + list="" + generic_list="" +} + +/^GCN_DEVICE\(/ { + gfx=$2 + NAME=$3 + list=(list " OPT_" NAME) + + print "" + next +} + +/XNACK default.*HSACO_ATTR_UNSUPPORTED/ { + printf "\n#define XNACK_%s \"march=%s:;\"", NAME, gfx + next +} + +/XNACK default.*HSACO_ATTR_OFF/ { + printf "\n#define XNACK_%s \"march=%s:%{!mxnack*|mxnack=default|mxnack=off:-mattr=-xnack;mxnack=on:-mattr=+xnack};\"", NAME, gfx + next +} + +/XNACK default.*HSACO_ATTR_ANY/ { + printf "\n#define XNACK_%s \"march=%s:%{mxnack=off:-mattr=-xnack;mxnack=on:-mattr=+xnack};\"", NAME, gfx + next +} + +/XNACK default.*HSACO/ { + print FILENAME ":" NR ": error: unhandled HSACO default at line (gen-gcn-device-macros.awk)" > "/dev/stderr" + exit 1 +} + +/SRAM_ECC default.*HSACO_ATTR_UNSUPPORTED/ { + printf "\n#define SRAM_%s \"march=%s:;\"", NAME, gfx + next +} + +/SRAM_ECC default.*HSACO_ATTR_ANY/ { + printf "\n#define SRAM_%s \"march=%s:%{msram-ecc=on:-mattr=+sramecc;msram-ecc=off:-mattr=-sramecc};\"", NAME, gfx + next +} + +/SRAM_ECC default.*HSACO/ { + print FILENAME ":" NR ": error: unhandled HSACO default at line (gen-gcn-device-macros.awk)" > "/dev/stderr" + exit 1 +} + +/WAVE64 mode.*HSACO_ATTR_UNSUPPORTED/ { + printf "\n#define WAVE64_%s \"march=%s:;\"", NAME, gfx + next +} + +/WAVE64 mode.*HSACO_ATTR_ON/ { + printf "\n#define WAVE64_%s \"march=%s:-mattr=+wavefrontsize64;\"", NAME, gfx + next +} + +/WAVE64 mode.*HSACO/ { + print FILENAME ":" NR ": error: unhandled HSACO default at line (gen-gcn-device-macros.awk)" > "/dev/stderr" + exit 1 +} + +/CU mode.*HSACO_ATTR_UNSUPPORTED/ { + printf "\n#define CU_%s \"march=%s:;\"", NAME, gfx + next +} + +/CU mode.*HSACO_ATTR_ON/ { + printf "\n#define CU_%s \"march=%s:-mattr=+cumode;\"", NAME, gfx + next +} + +/CU mode.*HSACO/ { + print FILENAME ":" NR ": error: unhandled HSACO default at line (gen-gcn-device-macros.awk)" > "/dev/stderr" + exit 1 +} + +/Generic code obj version/ { + match($0,/Generic code obj version[^\/]*\/[\t ]*([0-9]+)/,m) + if (m[1] > 0) { + printf "\n#define GENERIC_%s \"march=%s:--amdhsa-code-object-version=6;\"", NAME, gfx + generic_list=(generic_list " GENERIC_" NAME) + } + next +} + +# ABI Version: In principle, the LLVM default would work. However, +# however, when debugging symbols are turned on, mkoffload.cc +# writes a new AMD GPU object file and the ABI version needs to be the +# same. 
- LLVM <= 17 defaults to 4 while LLVM >= 18 defaults to 5. +# GCC supports LLVM >= 13.0.1 and only LLVM >= 14 supports version 5. +# Code object V6 is supported since LLVM 19. + +END { + print "" + print "" + printf "#define ABI_VERSION_OPT \"%%{\"%s \"!march=*|march=*:--amdhsa-code-object-version=4} \"\n", generic_list + printf "#define XNACKOPT \"%%{\"%s \":%%eexpected march\\n} \"\n", gensub (/OPT/, "XNACK", "g", list) + printf "#define SRAMOPT \"%%{\"%s \":%%eexpected march\\n} \"\n", gensub (/OPT/, "SRAM", "g", list) + printf "#define WAVE64OPT \"%%{\"%s \":%%eexpected march\\n} \"\n", gensub (/OPT/, "WAVE64", "g", list) + printf "#define CUMODEOPT \"%%{\"%s \":%%eexpected march\\n} \"\n", gensub (/OPT/, "CU", "g", list) +} diff --git a/gcc/config/gcn/gen-opt-tables.awk b/gcc/config/gcn/gen-opt-tables.awk new file mode 100644 index 0000000..9fbe4cf --- /dev/null +++ b/gcc/config/gcn/gen-opt-tables.awk @@ -0,0 +1,55 @@ +# Generate gcn-tables.opt from gcn-devices.def +# +# Copyright (C) 2024 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# <http://www.gnu.org/licenses/>. + +BEGIN { + FS= "[(,] *" + + print "; -*- buffer-read-only: t -*-" + print "; Generated automatically by gen-opt-tables.awk from gcn-devices.def." + print "; Do not edit." + print "" + print "; Copyright (C) 2024 Free Software Foundation, Inc." + print "" + print "; This file is part of GCC." + print "" + print "; GCC is free software; you can redistribute it and/or modify" + print "; it under the terms of the GNU General Public License as" + print "; published by the Free Software Foundation; either version 3," + print "; or (at your option) any later version." + print "" + print "; GCC is distributed in the hope that it will be useful," + print "; but WITHOUT ANY WARRANTY; without even the implied warranty of" + print "; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the" + print "; GNU General Public License for more details." + print "" + print "; You should have received a copy of the GNU General Public" + print "; License along with GCC; see the file COPYING3. If not see" + print "; <http://www.gnu.org/licenses/>." + print "" + print "Enum" + print "Name(gpu_type) Type(enum processor_type)" + print "GCN GPU type to use:" +} + +/^GCN_DEVICE\(/ { + print "" + print "EnumValue" + print "Enum(gpu_type) String(" $2 ") Value(PROCESSOR_" $3 ")" +} diff --git a/gcc/config/gcn/mkoffload.cc b/gcc/config/gcn/mkoffload.cc index 810298a..17a3342 100644 --- a/gcc/config/gcn/mkoffload.cc +++ b/gcc/config/gcn/mkoffload.cc @@ -24,6 +24,7 @@ This is not a complete assembler. We presume the source is well formed from the compiler and can die horribly if it is not. 
*/ +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -48,27 +49,16 @@ #define ELFABIVERSION_AMDGPU_HSA_V3 1 #undef ELFABIVERSION_AMDGPU_HSA_V4 #define ELFABIVERSION_AMDGPU_HSA_V4 2 +#undef ELFABIVERSION_AMDGPU_HSA_V6 +#define ELFABIVERSION_AMDGPU_HSA_V6 4 -#undef EF_AMDGPU_MACH_AMDGCN_GFX803 -#define EF_AMDGPU_MACH_AMDGCN_GFX803 0x2a -#undef EF_AMDGPU_MACH_AMDGCN_GFX900 -#define EF_AMDGPU_MACH_AMDGCN_GFX900 0x2c -#undef EF_AMDGPU_MACH_AMDGCN_GFX906 -#define EF_AMDGPU_MACH_AMDGCN_GFX906 0x2f -#undef EF_AMDGPU_MACH_AMDGCN_GFX908 -#define EF_AMDGPU_MACH_AMDGCN_GFX908 0x30 -#undef EF_AMDGPU_MACH_AMDGCN_GFX90a -#define EF_AMDGPU_MACH_AMDGCN_GFX90a 0x3f -#undef EF_AMDGPU_MACH_AMDGCN_GFX90c -#define EF_AMDGPU_MACH_AMDGCN_GFX90c 0x32 -#undef EF_AMDGPU_MACH_AMDGCN_GFX1030 -#define EF_AMDGPU_MACH_AMDGCN_GFX1030 0x36 -#undef EF_AMDGPU_MACH_AMDGCN_GFX1036 -#define EF_AMDGPU_MACH_AMDGCN_GFX1036 0x45 -#undef EF_AMDGPU_MACH_AMDGCN_GFX1100 -#define EF_AMDGPU_MACH_AMDGCN_GFX1100 0x41 -#undef EF_AMDGPU_MACH_AMDGCN_GFX1103 -#define EF_AMDGPU_MACH_AMDGCN_GFX1103 0x44 +/* Extract the EF_AMDGPU_MACH_AMDGCN_GFXnnn from the def file. */ +enum elf_arch_code { +#define GCN_DEVICE(name, NAME, ELF_ARCH, ...) \ + EF_AMDGPU_MACH_AMDGCN_ ## NAME = ELF_ARCH, +#include "gcn-devices.def" +#undef GCN_DEVICE +}; #define EF_AMDGPU_FEATURE_XNACK_V4 0x300 /* Mask. */ #define EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 0x000 @@ -82,6 +72,9 @@ #define EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 0x800 #define EF_AMDGPU_FEATURE_SRAMECC_ON_V4 0xc00 +#define EF_AMDGPU_GENERIC_VERSION_V 0xff000000 /* Mask. */ +#define EF_AMDGPU_GENERIC_VERSION_OFFSET 24 + #define SET_XNACK_ON(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_XNACK_V4) \ | EF_AMDGPU_FEATURE_XNACK_ON_V4) #define SET_XNACK_ANY(VAR) VAR = ((VAR & ~EF_AMDGPU_FEATURE_XNACK_V4) \ @@ -113,6 +106,12 @@ == EF_AMDGPU_FEATURE_SRAMECC_ON_V4) #define TEST_SRAM_ECC_UNSET(VAR) ((VAR & EF_AMDGPU_FEATURE_SRAMECC_V4) == 0) +#define GET_GENERIC_VERSION(VAR) ((VAR & EF_AMDGPU_GENERIC_VERSION_V) \ + >> EF_AMDGPU_GENERIC_VERSION_OFFSET) +#define SET_GENERIC_VERSION(VAR,GEN_VER) \ + VAR = ((VAR & ~EF_AMDGPU_GENERIC_VERSION_V) \ + | (GEN_VER << EF_AMDGPU_GENERIC_VERSION_OFFSET)) + #ifndef R_AMDGPU_NONE #define R_AMDGPU_NONE 0 #define R_AMDGPU_ABS32_LO 1 /* (S + A) & 0xFFFFFFFF */ @@ -135,6 +134,8 @@ static const char *gcn_dumpbase; static struct obstack files_to_cleanup; enum offload_abi offload_abi = OFFLOAD_ABI_UNSET; +const char *offload_abi_host_opts = NULL; + uint32_t elf_arch = EF_AMDGPU_MACH_AMDGCN_GFX900; // Default GPU architecture. uint32_t elf_flags = EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4; @@ -182,44 +183,6 @@ xputenv (const char *string) putenv (CONST_CAST (char *, string)); } -/* Read the whole input file. It will be NUL terminated (but - remember, there could be a NUL in the file itself. */ - -static const char * -read_file (FILE *stream, size_t *plen) -{ - size_t alloc = 16384; - size_t base = 0; - char *buffer; - - if (!fseek (stream, 0, SEEK_END)) - { - /* Get the file size. */ - long s = ftell (stream); - if (s >= 0) - alloc = s + 100; - fseek (stream, 0, SEEK_SET); - } - buffer = XNEWVEC (char, alloc); - - for (;;) - { - size_t n = fread (buffer + base, 1, alloc - base - 1, stream); - - if (!n) - break; - base += n; - if (base + 1 == alloc) - { - alloc *= 2; - buffer = XRESIZEVEC (char, buffer, alloc); - } - } - buffer[base] = 0; - *plen = base; - return buffer; -} - /* Parse STR, saving found tokens into PVALUES and return their number. 
Tokens are assumed to be delimited by ':'. */ @@ -352,18 +315,14 @@ copy_early_debug_info (const char *infile, const char *outfile) /* We only support host relocations of x86_64, for now. */ gcc_assert (ehdr.e_machine == EM_X86_64); - /* Fiji devices use HSACOv3 regardless of the assembler. */ - uint32_t elf_flags_actual = (elf_arch == EF_AMDGPU_MACH_AMDGCN_GFX803 - ? 0 : elf_flags); - /* Patch the correct elf architecture flag into the file. */ ehdr.e_ident[7] = ELFOSABI_AMDGPU_HSA; - ehdr.e_ident[8] = (elf_arch == EF_AMDGPU_MACH_AMDGCN_GFX803 - ? ELFABIVERSION_AMDGPU_HSA_V3 + ehdr.e_ident[8] = (GET_GENERIC_VERSION (elf_flags) + ? ELFABIVERSION_AMDGPU_HSA_V6 : ELFABIVERSION_AMDGPU_HSA_V4); ehdr.e_type = ET_REL; ehdr.e_machine = EM_AMDGPU; - ehdr.e_flags = elf_arch | elf_flags_actual; + ehdr.e_flags = elf_arch | elf_flags; /* Load the section headers so we can walk them later. */ Elf64_Shdr *sections = (Elf64_Shdr *)xmalloc (sizeof (Elf64_Shdr) @@ -657,9 +616,11 @@ process_asm (FILE *in, FILE *out, FILE *cfile) struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *); struct regcount *regcounts = XOBFINISH (®counts_os, struct regcount *); - fprintf (cfile, "#include <stdlib.h>\n"); - fprintf (cfile, "#include <stdint.h>\n"); - fprintf (cfile, "#include <stdbool.h>\n\n"); + if (gcn_stack_size) + { + fprintf (cfile, "#include <stdlib.h>\n"); + fprintf (cfile, "#include <stdbool.h>\n\n"); + } fprintf (cfile, "static const int gcn_num_vars = %d;\n\n", var_count); fprintf (cfile, "static const int gcn_num_ind_funcs = %d;\n\n", ind_fn_count); @@ -725,35 +686,28 @@ process_asm (FILE *in, FILE *out, FILE *cfile) /* Embed an object file into a C source file. */ static void -process_obj (FILE *in, FILE *cfile, uint32_t omp_requires) +process_obj (const char *fname_in, FILE *cfile, uint32_t omp_requires) { - size_t len = 0; - const char *input = read_file (in, &len); - /* Dump out an array containing the binary. - FIXME: do this with objcopy. */ - fprintf (cfile, "static unsigned char gcn_code[] = {"); - for (size_t i = 0; i < len; i += 17) - { - fprintf (cfile, "\n\t"); - for (size_t j = i; j < i + 17 && j < len; j++) - fprintf (cfile, "%3u,", (unsigned char) input[j]); - } - fprintf (cfile, "\n};\n\n"); + If the file is empty, a parse error is shown as the argument to is_empty + is an undeclared identifier. 
*/ + fprintf (cfile, + "static unsigned char gcn_code[] = {\n" + "#embed \"%s\" if_empty (error_file_is_empty)\n" + "};\n\n", fname_in); fprintf (cfile, "static const struct gcn_image {\n" - " size_t size;\n" + " __SIZE_TYPE__ size;\n" " void *image;\n" "} gcn_image = {\n" - " %zu,\n" + " sizeof(gcn_code),\n" " gcn_code\n" - "};\n\n", - len); + "};\n\n"); fprintf (cfile, "static const struct gcn_data {\n" - " uintptr_t omp_requires_mask;\n" + " __UINTPTR_TYPE__ omp_requires_mask;\n" " const struct gcn_image *gcn_image;\n" " unsigned kernel_count;\n" " const struct hsa_kernel_description *kernel_infos;\n" @@ -827,17 +781,10 @@ compile_native (const char *infile, const char *outfile, const char *compiler, obstack_ptr_grow (&argv_obstack, gcn_dumpbase); obstack_ptr_grow (&argv_obstack, "-dumpbase-ext"); obstack_ptr_grow (&argv_obstack, ".c"); - switch (offload_abi) - { - case OFFLOAD_ABI_LP64: - obstack_ptr_grow (&argv_obstack, "-m64"); - break; - case OFFLOAD_ABI_ILP32: - obstack_ptr_grow (&argv_obstack, "-m32"); - break; - default: - gcc_unreachable (); - } + if (!offload_abi_host_opts) + fatal_error (input_location, + "%<-foffload-abi-host-opts%> not specified."); + obstack_ptr_grow (&argv_obstack, offload_abi_host_opts); obstack_ptr_grow (&argv_obstack, infile); obstack_ptr_grow (&argv_obstack, "-c"); obstack_ptr_grow (&argv_obstack, "-o"); @@ -853,27 +800,15 @@ compile_native (const char *infile, const char *outfile, const char *compiler, static int get_arch (const char *str, const char *with_arch_str) { - if (strcmp (str, "fiji") == 0) - return EF_AMDGPU_MACH_AMDGCN_GFX803; - else if (strcmp (str, "gfx900") == 0) - return EF_AMDGPU_MACH_AMDGCN_GFX900; - else if (strcmp (str, "gfx906") == 0) - return EF_AMDGPU_MACH_AMDGCN_GFX906; - else if (strcmp (str, "gfx908") == 0) - return EF_AMDGPU_MACH_AMDGCN_GFX908; - else if (strcmp (str, "gfx90a") == 0) - return EF_AMDGPU_MACH_AMDGCN_GFX90a; - else if (strcmp (str, "gfx90c") == 0) - return EF_AMDGPU_MACH_AMDGCN_GFX90c; - else if (strcmp (str, "gfx1030") == 0) - return EF_AMDGPU_MACH_AMDGCN_GFX1030; - else if (strcmp (str, "gfx1036") == 0) - return EF_AMDGPU_MACH_AMDGCN_GFX1036; - else if (strcmp (str, "gfx1100") == 0) - return EF_AMDGPU_MACH_AMDGCN_GFX1100; - else if (strcmp (str, "gfx1103") == 0) - return EF_AMDGPU_MACH_AMDGCN_GFX1103; - + /* Use the def file to map the name to the elf_arch_code. */ + if (!str) ; +#define GCN_DEVICE(name, NAME, ELF, ...) \ + else if (strcmp (str, #name) == 0) \ + return ELF; +#include "gcn-devices.def" +#undef GCN_DEVICE + + /* else */ error ("unrecognized argument in option %<-march=%s%>", str); /* The suggestions are based on the configured multilib support; the compiler @@ -1008,6 +943,15 @@ main (int argc, char **argv) "unrecognizable argument of option %<" STR "%>"); } #undef STR + else if (startswith (argv[i], "-foffload-abi-host-opts=")) + { + if (offload_abi_host_opts) + fatal_error (input_location, + "%<-foffload-abi-host-opts%> specified " + "multiple times"); + offload_abi_host_opts + = argv[i] + strlen ("-foffload-abi-host-opts="); + } else if (strcmp (argv[i], "-fopenmp") == 0) fopenmp = true; else if (strcmp (argv[i], "-fopenacc") == 0) @@ -1072,46 +1016,49 @@ main (int argc, char **argv) gcc_unreachable (); } - /* This must match gcn-hsa.h's settings for NO_XNACK, NO_SRAM_ECC - and ASM_SPEC. */ + /* Set the default ELF flags for XNACK. 
*/ switch (elf_arch) { - case EF_AMDGPU_MACH_AMDGCN_GFX803: - case EF_AMDGPU_MACH_AMDGCN_GFX1030: - case EF_AMDGPU_MACH_AMDGCN_GFX1036: - case EF_AMDGPU_MACH_AMDGCN_GFX1100: - case EF_AMDGPU_MACH_AMDGCN_GFX1103: - SET_XNACK_UNSET (elf_flags); - SET_SRAM_ECC_UNSET (elf_flags); - break; - case EF_AMDGPU_MACH_AMDGCN_GFX900: - SET_XNACK_OFF (elf_flags); - SET_SRAM_ECC_UNSET (elf_flags); - break; - case EF_AMDGPU_MACH_AMDGCN_GFX906: - SET_XNACK_OFF (elf_flags); - SET_SRAM_ECC_ANY (elf_flags); - break; - case EF_AMDGPU_MACH_AMDGCN_GFX908: - SET_XNACK_OFF (elf_flags); - if (TEST_SRAM_ECC_UNSET (elf_flags)) - SET_SRAM_ECC_ANY (elf_flags); - break; - case EF_AMDGPU_MACH_AMDGCN_GFX90a: - if (TEST_XNACK_UNSET (elf_flags)) - SET_XNACK_ANY (elf_flags); - if (TEST_SRAM_ECC_UNSET (elf_flags)) - SET_SRAM_ECC_ANY (elf_flags); - break; - case EF_AMDGPU_MACH_AMDGCN_GFX90c: - if (TEST_XNACK_UNSET (elf_flags)) - SET_XNACK_ANY (elf_flags); - SET_SRAM_ECC_UNSET (elf_flags); - break; +#define GCN_DEVICE(name, NAME, ELF, ISA, XNACK, SRAM, ...) \ + case ELF: XNACK; break; +#define HSACO_ATTR_UNSUPPORTED SET_XNACK_UNSET (elf_flags) +#define HSACO_ATTR_OFF SET_XNACK_OFF (elf_flags) +#define HSACO_ATTR_ANY \ + if (TEST_XNACK_UNSET (elf_flags)) SET_XNACK_ANY (elf_flags) +#include "gcn-devices.def" +#undef HSACO_ATTR_UNSUPPORTED +#undef HSACO_ATTR_OFF +#undef HSACO_ATTR_ANY + default: + fatal_error (input_location, "unhandled architecture"); + } + + /* Set the default ELF flags for SRAM_ECC. */ + switch (elf_arch) + { +#define GCN_DEVICE(name, NAME, ELF, ISA, XNACK, SRAM, ...) \ + case ELF: SRAM; break; +#define HSACO_ATTR_UNSUPPORTED SET_SRAM_ECC_UNSET (elf_flags) +#define HSACO_ATTR_OFF SET_SRAM_ECC_OFF (elf_flags) +#define HSACO_ATTR_ANY \ + if (TEST_SRAM_ECC_UNSET (elf_flags)) SET_SRAM_ECC_ANY (elf_flags) +#include "gcn-devices.def" +#undef HSACO_ATTR_UNSUPPORTED +#undef HSACO_ATTR_OFF +#undef HSACO_ATTR_ANY default: fatal_error (input_location, "unhandled architecture"); } + /* Set the generic version. */ + switch (elf_arch) + { +#define GCN_DEVICE(name, NAME, ELF, ISA, XNACK, SRAMECC, WAVE64, CU, VGPRS, GEN_VER, ...) \ + case ELF: if (GEN_VER) SET_GENERIC_VERSION (elf_flags, GEN_VER); break; +#include "gcn-devices.def" +#undef GCN_DEVICE + } + /* Build arguments for compiler pass. 
*/ struct obstack cc_argv_obstack; obstack_init (&cc_argv_obstack); @@ -1312,13 +1259,7 @@ main (int argc, char **argv) fork_execute (ld_argv[0], CONST_CAST (char **, ld_argv), true, ".ld_args"); obstack_free (&ld_argv_obstack, NULL); - in = fopen (gcn_o_name, "r"); - if (!in) - fatal_error (input_location, "cannot open intermediate gcn obj file"); - - process_obj (in, cfile, omp_requires); - - fclose (in); + process_obj (gcn_o_name, cfile, omp_requires); xputenv (concat ("GCC_EXEC_PREFIX=", execpath, NULL)); xputenv (concat ("COMPILER_PATH=", cpath, NULL)); diff --git a/gcc/config/gcn/t-gcn-hsa b/gcc/config/gcn/t-gcn-hsa index 5fc34a1..5de32ae 100644 --- a/gcc/config/gcn/t-gcn-hsa +++ b/gcc/config/gcn/t-gcn-hsa @@ -49,3 +49,10 @@ gcn-tree.o: $(srcdir)/config/gcn/gcn-tree.cc $(COMPILE) $< $(POSTCOMPILE) ALL_HOST_OBJS += gcn-tree.o + +$(srcdir)/config/gcn/gcn-tables.opt: $(srcdir)/config/gcn/gcn-devices.def $(srcdir)/config/gcn/gen-opt-tables.awk + $(AWK) -f $(srcdir)/config/gcn/gen-opt-tables.awk $< > $@ + +gcn-device-macros.h: $(srcdir)/config/gcn/gcn-devices.def $(srcdir)/config/gcn/gen-gcn-device-macros.awk + $(AWK) -f $(srcdir)/config/gcn/gen-gcn-device-macros.awk $< > $@ +generated_files += gcn-device-macros.h diff --git a/gcc/config/gcn/t-omp-device b/gcc/config/gcn/t-omp-device index b92e19b..cae6bd3 100644 --- a/gcc/config/gcn/t-omp-device +++ b/gcc/config/gcn/t-omp-device @@ -1,4 +1,4 @@ -omp-device-properties-gcn: $(srcdir)/config/gcn/gcn.cc +omp-device-properties-gcn: $(srcdir)/config/gcn/gcn-devices.def echo kind: gpu > $@ echo arch: amdgcn gcn >> $@ - echo isa: fiji gfx803 gfx900 gfx906 gfx908 gfx90a gfx90c gfx1030 gfx1036 gfx1100 gfx1103 >> $@ + echo isa: `grep -o -P '(?<=GCN_DEVICE\()gfx[0-9a-f]+(?=,)' $<` >> $@ diff --git a/gcc/config/h8300/h8300.cc b/gcc/config/h8300/h8300.cc index 7ab26f2..17c6e91 100644 --- a/gcc/config/h8300/h8300.cc +++ b/gcc/config/h8300/h8300.cc @@ -317,7 +317,7 @@ h8300_option_override (void) "%<-msx%> - option ignored"); } -#ifdef H8300_LINUX +#ifdef H8300_LINUX if ((TARGET_NORMAL_MODE)) { error ("%<-mn%> is not supported for linux targets"); @@ -811,7 +811,7 @@ h8300_expand_prologue (void) if (h8300_monitor_function_p (current_function_decl)) /* The monitor function act as normal functions, which means it - can accept parameters and return values. In addition to this, + can accept parameters and return values. In addition to this, interrupts are masked in prologue and return with "rte" in epilogue. 
*/ emit_insn (gen_monitor_prologue ()); @@ -1484,7 +1484,7 @@ h8300_print_operand (FILE *file, rtx x, int code) if ((exact_log2 ((bitint >> 8) & 0xff)) == -1) bitint = exact_log2 (bitint & 0xff); else - bitint = exact_log2 ((bitint >> 8) & 0xff); + bitint = exact_log2 ((bitint >> 8) & 0xff); gcc_assert (bitint >= 0); fprintf (file, "#%d", bitint); break; @@ -1493,7 +1493,7 @@ h8300_print_operand (FILE *file, rtx x, int code) if ((exact_log2 ((bitint >> 8) & 0xff)) == -1 ) bitint = exact_log2 (bitint & 0xff); else - bitint = (exact_log2 ((bitint >> 8) & 0xff)); + bitint = (exact_log2 ((bitint >> 8) & 0xff)); gcc_assert (bitint >= 0); fprintf (file, "#%d", bitint); break; @@ -2358,7 +2358,7 @@ h8300_bitfield_length (rtx op, rtx op2) if (GET_CODE (op) == REG) op = op2; gcc_assert (GET_CODE (op) != REG); - + size = GET_MODE_SIZE (GET_MODE (op)); operand_length = h8300_classify_operand (op, size, &opclass); @@ -2521,7 +2521,7 @@ h8300_insn_length_from_table (rtx_insn *insn, rtx * operands) case LENGTH_TABLE_BITFIELD: return h8300_bitfield_length (operands[0], operands[1]); - + case LENGTH_TABLE_BITBRANCH: return h8300_bitfield_length (operands[1], operands[2]) - 2; @@ -4100,7 +4100,7 @@ output_a_shift (rtx operands[4], rtx_code code) /* This case must be taken care of by one of the two splitters that convert a variable shift into a loop. */ gcc_assert (GET_CODE (operands[2]) == CONST_INT); - + n = INTVAL (operands[2]); /* If the count is negative, make it 0. */ @@ -4113,7 +4113,7 @@ output_a_shift (rtx operands[4], rtx_code code) n = GET_MODE_BITSIZE (mode); get_shift_alg (shift_type, shift_mode, n, &info); - + switch (info.alg) { case SHIFT_SPECIAL: @@ -4134,7 +4134,7 @@ output_a_shift (rtx operands[4], rtx_code code) for (; n > 0; n--) output_asm_insn (info.shift1, operands); return ""; - + case SHIFT_ROT_AND: { int m = GET_MODE_BITSIZE (mode) - n; @@ -4146,18 +4146,18 @@ output_a_shift (rtx operands[4], rtx_code code) /* Not all possibilities of rotate are supported. They shouldn't be generated, but let's watch for 'em. */ gcc_assert (info.shift1); - + /* Emit two bit rotates first. */ if (info.shift2 != NULL) { for (; m > 1; m -= 2) output_asm_insn (info.shift2, operands); } - + /* Now single bit rotates for any residual. */ for (; m > 0; m--) output_asm_insn (info.shift1, operands); - + /* Now mask off the high bits. */ switch (mode) { @@ -4201,7 +4201,7 @@ output_a_shift (rtx operands[4], rtx_code code) fprintf (asm_out_file, "\tbne .Llt%d\n", loopend_lab); } return ""; - + default: gcc_unreachable (); } @@ -4381,7 +4381,7 @@ compute_a_shift_cc (rtx operands[3], rtx_code code) enum shift_mode shift_mode; struct shift_info info; int n; - + switch (mode) { case E_QImode: @@ -4415,7 +4415,7 @@ compute_a_shift_cc (rtx operands[3], rtx_code code) /* This case must be taken care of by one of the two splitters that convert a variable shift into a loop. */ gcc_assert (GET_CODE (operands[2]) == CONST_INT); - + n = INTVAL (operands[2]); /* If the count is negative, make it 0. */ @@ -4426,9 +4426,9 @@ compute_a_shift_cc (rtx operands[3], rtx_code code) do the intuitive thing. */ else if ((unsigned int) n > GET_MODE_BITSIZE (mode)) n = GET_MODE_BITSIZE (mode); - + get_shift_alg (shift_type, shift_mode, n, &info); - + switch (info.alg) { case SHIFT_SPECIAL: @@ -4441,11 +4441,11 @@ compute_a_shift_cc (rtx operands[3], rtx_code code) case SHIFT_INLINE: return (info.cc_inline == OLD_CC_SET_ZN || info.cc_inline == OLD_CC_SET_ZNV); - + case SHIFT_ROT_AND: /* This case always ends with an and instruction. 
*/ return true; - + case SHIFT_LOOP: /* A loop to shift by a "large" constant value. If we have shift-by-2 insns, use them. */ @@ -4454,10 +4454,10 @@ compute_a_shift_cc (rtx operands[3], rtx_code code) if (n % 2) return (info.cc_inline == OLD_CC_SET_ZN || info.cc_inline == OLD_CC_SET_ZNV); - + } return false; - + default: gcc_unreachable (); } diff --git a/gcc/config/host-darwin.h b/gcc/config/host-darwin.h index ecf454e..23752c4 100644 --- a/gcc/config/host-darwin.h +++ b/gcc/config/host-darwin.h @@ -18,7 +18,7 @@ <http://www.gnu.org/licenses/>. */ extern void * darwin_gt_pch_get_address (size_t sz, int fd); -extern int darwin_gt_pch_use_address (void *&addr, size_t sz, int fd, +extern int darwin_gt_pch_use_address (void *&addr, size_t sz, int fd, size_t off); #undef HOST_HOOKS_GT_PCH_GET_ADDRESS diff --git a/gcc/config/host-linux.cc b/gcc/config/host-linux.cc index 1cec56e..c1214b3 100644 --- a/gcc/config/host-linux.cc +++ b/gcc/config/host-linux.cc @@ -105,7 +105,7 @@ #endif /* Determine a location where we might be able to reliably allocate SIZE - bytes. FD is the PCH file, though we should return with the file + bytes. FD is the PCH file, though we should return with the file unmapped. */ static void * diff --git a/gcc/config/host-netbsd.cc b/gcc/config/host-netbsd.cc index 690c636..59df05a 100644 --- a/gcc/config/host-netbsd.cc +++ b/gcc/config/host-netbsd.cc @@ -62,7 +62,7 @@ netbsd_gt_pch_get_address (size_t size, int fd) return addr; } -/* Map SIZE bytes of FD+OFFSET at BASE. Return 1 if we succeeded at +/* Map SIZE bytes of FD+OFFSET at BASE. Return 1 if we succeeded at mapping the data at BASE, -1 if we couldn't. */ static int diff --git a/gcc/config/host-openbsd.cc b/gcc/config/host-openbsd.cc index 8a3e75d..9d6c927 100644 --- a/gcc/config/host-openbsd.cc +++ b/gcc/config/host-openbsd.cc @@ -62,7 +62,7 @@ openbsd_gt_pch_get_address (size_t size, int fd) return addr; } -/* Map SIZE bytes of FD+OFFSET at BASE. Return 1 if we succeeded at +/* Map SIZE bytes of FD+OFFSET at BASE. Return 1 if we succeeded at mapping the data at BASE, -1 if we couldn't. */ static int diff --git a/gcc/config/host-solaris.cc b/gcc/config/host-solaris.cc index f8afed7..0e0325b 100644 --- a/gcc/config/host-solaris.cc +++ b/gcc/config/host-solaris.cc @@ -39,7 +39,7 @@ mmap_fixed (void *addr, size_t len, int prot, int flags, int fd, off_t off) void *base; base = mmap ((caddr_t) addr, len, prot, flags, fd, off); - + if (base != addr) { size_t page_size = getpagesize(); @@ -101,7 +101,7 @@ sol_gt_pch_get_address (size_t size, int fd) return addr; } -/* Map SIZE bytes of FD+OFFSET at BASE. Return 1 if we succeeded at +/* Map SIZE bytes of FD+OFFSET at BASE. Return 1 if we succeeded at mapping the data at BASE, -1 if we couldn't. */ static int diff --git a/gcc/config/i386/amxavx512intrin.h b/gcc/config/i386/amxavx512intrin.h new file mode 100644 index 0000000..146a981 --- /dev/null +++ b/gcc/config/i386/amxavx512intrin.h @@ -0,0 +1,189 @@ +/* Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <amxavx512intrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AMXAVX512INTRIN_H_INCLUDED +#define _AMXAVX512INTRIN_H_INCLUDED + +#if !defined(__AMX_AVX512__) +#pragma GCC push_options +#pragma GCC target("amx-avx512") +#define __DISABLE_AMX_AVX512__ +#endif /* __AMX_AVX512__ */ + +#if defined(__x86_64__) +#define _tile_cvtrowd2ps_internal(src,A) \ +({ \ + __m512 dst; \ + __asm__ volatile \ + ("{tcvtrowd2ps\t%1, %%tmm"#src", %0|tcvtrowd2ps\t%0, %%tmm"#src", %1}" \ + : "=v" (dst) : "r" ((unsigned) (A))); \ + dst; \ +}) + +#define _tile_cvtrowd2psi_internal(src,imm) \ +({ \ + __m512 dst; \ + __asm__ volatile \ + ("{tcvtrowd2ps\t$"#imm", %%tmm"#src", %0|tcvtrowd2ps\t%0, %%tmm"#src", "#imm"}" \ + : "=v" (dst) :); \ + dst; \ +}) + +#define _tile_cvtrowps2pbf16h_internal(src,A) \ +({ \ + __m512bh dst; \ + __asm__ volatile \ + ("{tcvtrowps2pbf16h\t%1, %%tmm"#src", %0|tcvtrowps2pbf16h\t%0, %%tmm"#src", %1}" \ + : "=v" (dst) : "r" ((unsigned) (A))); \ + dst; \ +}) + +#define _tile_cvtrowps2pbf16hi_internal(src,imm) \ +({ \ + __m512bh dst; \ + __asm__ volatile \ + ("{tcvtrowps2pbf16h\t$"#imm", %%tmm"#src", %0|tcvtrowps2pbf16h\t%0, %%tmm"#src", "#imm"}" \ + : "=v" (dst) :); \ + dst; \ +}) + +#define _tile_cvtrowps2pbf16l_internal(src,A) \ +({ \ + __m512bh dst; \ + __asm__ volatile \ + ("{tcvtrowps2pbf16l\t%1, %%tmm"#src", %0|tcvtrowps2pbf16l\t%0, %%tmm"#src", %1}" \ + : "=v" (dst) : "r" ((unsigned) (A))); \ + dst; \ +}) + +#define _tile_cvtrowps2pbf16li_internal(src,imm) \ +({ \ + __m512bh dst; \ + __asm__ volatile \ + ("{tcvtrowps2pbf16l\t$"#imm", %%tmm"#src", %0|tcvtrowps2pbf16l\t%0, %%tmm"#src", "#imm"}" \ + : "=v" (dst) :); \ + dst; \ +}) + +#define _tile_cvtrowps2phh_internal(src,A) \ +({ \ + __m512h dst; \ + __asm__ volatile \ + ("{tcvtrowps2phh\t%1, %%tmm"#src", %0|tcvtrowps2phh\t%0, %%tmm"#src", %1}" \ + : "=v" (dst) : "r" ((unsigned) (A))); \ + dst; \ +}) + +#define _tile_cvtrowps2phhi_internal(src,imm) \ +({ \ + __m512h dst; \ + __asm__ volatile \ + ("{tcvtrowps2phh\t$"#imm", %%tmm"#src", %0|tcvtrowps2phh\t%0, %%tmm"#src", "#imm"}" \ + : "=v" (dst) :); \ + dst; \ +}) + +#define _tile_cvtrowps2phl_internal(src,A) \ +({ \ + __m512h dst; \ + __asm__ volatile \ + ("{tcvtrowps2phl\t%1, %%tmm"#src", %0|tcvtrowps2phl\t%0, %%tmm"#src", %1}" \ + : "=v" (dst) : "r" ((unsigned) (A))); \ + dst; \ +}) + +#define _tile_cvtrowps2phli_internal(src,imm) \ +({ \ + __m512h dst; \ + __asm__ volatile \ + ("{tcvtrowps2phl\t$"#imm", %%tmm"#src", %0|tcvtrowps2phl\t%0, %%tmm"#src", "#imm"}" \ + : "=v" (dst) :); \ + dst; \ +}) + +#define _tile_movrow_internal(src,A) \ +({ \ + __m512 dst; \ + __asm__ volatile \ + ("{tilemovrow\t%1, %%tmm"#src", %0|tilemovrow\t%0, %%tmm"#src", %1}" \ + : "=v" (dst) : "r" ((unsigned) (A))); \ + dst; \ +}) + +#define _tile_movrowi_internal(src,imm) \ +({ \ + __m512 dst; \ + __asm__ volatile \ + ("{tilemovrow\t$"#imm", %%tmm"#src", %0|tilemovrow\t%0, %%tmm"#src", "#imm"}" \ + : "=v" (dst) :); \ + dst; \ +}) + +#define _tile_cvtrowd2ps(src,A) \ + _tile_cvtrowd2ps_internal (src,A) + 
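(Aside, not part of the patch: a minimal usage sketch for the amxavx512intrin.h row-conversion wrappers defined above. It assumes the tile palette has already been configured with ldtilecfg, that tmm0 holds 32-bit integer data, and that the translation unit is built with AMX tile and AMX-AVX512 support, e.g. -mamx-tile -mamx-avx512 -mavx512f; the helper name below is illustrative only.)

#include <immintrin.h>

/* Illustrative only: convert one row of tmm0 to 16 floats and store it.
   _tile_cvtrowd2ps takes a literal tile number and a runtime row index.  */
void
dump_tile0_row (unsigned row, float *out)
{
  __m512 v = _tile_cvtrowd2ps (0, row);   /* row ROW of tmm0 as packed floats */
  _mm512_storeu_ps (out, v);
}
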
+#define _tile_cvtrowd2psi(src,imm) \ + _tile_cvtrowd2psi_internal (src,imm) + +#define _tile_cvtrowps2pbf16h(src,A) \ + _tile_cvtrowps2pbf16h_internal (src,A) + +#define _tile_cvtrowps2pbf16hi(src,imm) \ + _tile_cvtrowps2pbf16hi_internal (src,imm) + +#define _tile_cvtrowps2pbf16l(src,A) \ + _tile_cvtrowps2pbf16l_internal (src,A) + +#define _tile_cvtrowps2pbf16li(src,imm) \ + _tile_cvtrowps2pbf16li_internal (src,imm) + +#define _tile_cvtrowps2phh(src,A) \ + _tile_cvtrowps2phh_internal (src,A) + +#define _tile_cvtrowps2phhi(src,imm) \ + _tile_cvtrowps2phhi_internal (src,imm) + +#define _tile_cvtrowps2phl(src,A) \ + _tile_cvtrowps2phl_internal (src,A) + +#define _tile_cvtrowps2phli(src,imm) \ + _tile_cvtrowps2phli_internal (src,imm) + +#define _tile_movrow(src,A) \ + _tile_movrow_internal (src,A) + +#define _tile_movrowi(src,imm) \ + _tile_movrowi_internal (src,imm) + +#endif + +#ifdef __DISABLE_AMX_AVX512__ +#undef __DISABLE_AMX_AVX512__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_AVX512__ */ + +#endif /* _AMXAVX512INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/amxfp8intrin.h b/gcc/config/i386/amxfp8intrin.h new file mode 100644 index 0000000..7e6fca4 --- /dev/null +++ b/gcc/config/i386/amxfp8intrin.h @@ -0,0 +1,67 @@ +/* Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <amxfp8intrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef _AMXFP8INTRIN_H_INCLUDED +#define _AMXFP8INTRIN_H_INCLUDED + +#if defined(__x86_64__) +#define _tile_dpbf8ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdpbf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dpbhf8ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdpbhf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbhf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dphbf8ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdphbf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdphbf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dphf8ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdphf8ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdphf8ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dpbf8ps(dst,src1,src2) \ + _tile_dpbf8ps_internal (dst,src1,src2) + +#define _tile_dpbhf8ps(dst,src1,src2) \ + _tile_dpbhf8ps_internal (dst,src1,src2) + +#define _tile_dphbf8ps(dst,src1,src2) \ + _tile_dphbf8ps_internal (dst,src1,src2) + +#define _tile_dphf8ps(dst,src1,src2) \ + _tile_dphf8ps_internal (dst,src1,src2) + +#endif + +#ifdef __DISABLE_AMX_FP8__ +#undef __DISABLE_AMX_FP8__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_FP8__ */ + +#endif /* _AMXFP8INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/amxmovrsintrin.h b/gcc/config/i386/amxmovrsintrin.h new file mode 100644 index 0000000..47c0868 --- /dev/null +++ b/gcc/config/i386/amxmovrsintrin.h @@ -0,0 +1,111 @@ +/* Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <amxmovrsintrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef _AMX_MOVRSINTRIN_H_INCLUDED +#define _AMX_MOVRSINTRIN_H_INCLUDED + +#if defined(__x86_64__) + +#if !defined(__AMX_MOVRS__) +#pragma GCC push_options +#pragma GCC target("amx-movrs") +#define __DISABLE_AMX_MOVRS__ +#endif /* __AMX_MOVRS__ */ + +#define _tile_loaddrs_internal(tdst, base, stride) \ +__asm__ volatile \ + ("{tileloaddrs\t(%0,%1,1), %%tmm"#tdst \ + "|tileloaddrs\t%%tmm"#tdst", [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((long) (stride))) + +#define _tile_loaddrst1_internal(tdst, base, stride) \ +__asm__ volatile \ + ("{tileloaddrst1\t(%0,%1,1), %%tmm"#tdst \ + "|tileloaddrst1\t%%tmm"#tdst", [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((long) (stride))) + +#define _tile_loaddrs(tdst, base, stride) \ + _tile_loaddrs_internal(tdst, base, stride) + +#define _tile_loaddrst1(tdst, base, stride) \ + _tile_loaddrst1_internal(tdst, base, stride) + +#ifdef __DISABLE_AMX_MOVRS__ +#undef __DISABLE_AMX_MOVRS__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_MOVRS__ */ + +#if !defined(__AMX_MOVRS__) || !defined (__AMX_TRANSPOSE__) +#pragma GCC push_options +#pragma GCC target("amx-movrs,amx-transpose") +#define __DISABLE_AMX_MOVRS_TRANSPOSE__ +#endif /* __AMX_MOVRS_TRANSPOSE__ */ + +#define _tile_2rpntlvwz0rs_internal(tdst, base, stride) \ + __asm__ volatile \ + ("{t2rpntlvwz0rs\t(%0,%1,1), %%tmm"#tdst \ + "|t2rpntlvwz0rs\t%%tmm"#tdst", [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((long) (stride))) + +#define _tile_2rpntlvwz0rst1_internal(tdst, base, stride) \ + __asm__ volatile \ + ("{t2rpntlvwz0rst1\t(%0,%1,1), %%tmm"#tdst \ + "|t2rpntlvwz0rst1\t%%tmm"#tdst", [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((long) (stride))) + +#define _tile_2rpntlvwz1rs_internal(tdst, base, stride) \ + __asm__ volatile \ + ("{t2rpntlvwz1rs\t(%0,%1,1), %%tmm"#tdst \ + "|t2rpntlvwz1rs\t%%tmm"#tdst", [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((long) (stride))) + +#define _tile_2rpntlvwz1rst1_internal(tdst, base, stride) \ + __asm__ volatile \ + ("{t2rpntlvwz1rst1\t(%0,%1,1), %%tmm"#tdst \ + "|t2rpntlvwz1rst1\t%%tmm"#tdst", [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((long) (stride))) + +#define _tile_2rpntlvwz0rs(tdst, base, stride) \ + _tile_2rpntlvwz0rs_internal(tdst, base, stride) + +#define _tile_2rpntlvwz0rst1(tdst, base, stride) \ + _tile_2rpntlvwz0rst1_internal(tdst, base, stride) + +#define _tile_2rpntlvwz1rs(tdst, base, stride) \ + _tile_2rpntlvwz1rs_internal(tdst, base, stride) + +#define _tile_2rpntlvwz1rst1(tdst, base, stride) \ + _tile_2rpntlvwz1rst1_internal(tdst, base, stride) + +#ifdef __DISABLE_AMX_MOVRS_TRANSPOSE__ +#undef __DISABLE_AMX_MOVRS_TRANSPOSE__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_MOVRS_TRANSPOSE__ */ + +#endif /* __x86_64__ */ + +#endif /* _AMX_MOVRSINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/amxtf32intrin.h b/gcc/config/i386/amxtf32intrin.h new file mode 100644 index 0000000..450a33e --- /dev/null +++ b/gcc/config/i386/amxtf32intrin.h @@ -0,0 +1,47 @@ +/* Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of GCC. + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <amxtf32intrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AMXTF32INTRIN_H_INCLUDED +#define _AMXTF32INTRIN_H_INCLUDED + +#if !defined(__AMX_TF32__) +#pragma GCC push_options +#pragma GCC target("amx-tf32") +#define __DISABLE_AMX_TF32__ +#endif /* __AMX_TF32__ */ + +#if defined(__x86_64__) +#define _tile_mmultf32ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{tmmultf32ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tmmultf32ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_mmultf32ps(src1_dst,src2,src3) \ + _tile_mmultf32ps_internal (src1_dst, src2, src3) + +#endif + +#ifdef __DISABLE_AMX_TF32__ +#undef __DISABLE_AMX_TF32__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_TF32__ */ + +#endif /* _AMXTF32INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/amxtransposeintrin.h b/gcc/config/i386/amxtransposeintrin.h new file mode 100644 index 0000000..06bdd37 --- /dev/null +++ b/gcc/config/i386/amxtransposeintrin.h @@ -0,0 +1,177 @@ +/* Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <amxtransposeintrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef _AMXTRANSPOSEINTRIN_H_INCLUDED +#define _AMXTRANSPOSEINTRIN_H_INCLUDED + +#if !defined(__AMX_TRANSPOSE__) +#pragma GCC push_options +#pragma GCC target("amx-transpose") +#define __DISABLE_AMX_TRANSPOSE__ +#endif /* __AMX_TRANSPOSE__ */ + +#if defined(__x86_64__) +#define _tile_transposed_internal(dst,src) \ + __asm__ volatile\ + ("{ttransposed\t%%tmm"#src", %%tmm"#dst"|ttransposed\t%%tmm"#dst", %%tmm"#src"}" ::) + +#define _tile_2rpntlvwz0_internal(dst,base,stride) \ + __asm__ volatile\ + ("{t2rpntlvwz0\t(%0,%1,1), %%tmm"#dst"|t2rpntlvwz0\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((long) (stride))) + +#define _tile_2rpntlvwz0t1_internal(dst,base,stride) \ + __asm__ volatile\ + ("{t2rpntlvwz0t1\t(%0,%1,1), %%tmm"#dst"|t2rpntlvwz0t1\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*)(base)), "r" ((long)(stride))) + +#define _tile_2rpntlvwz1_internal(dst,base,stride) \ + __asm__ volatile\ + ("{t2rpntlvwz1\t(%0,%1,1), %%tmm"#dst"|t2rpntlvwz1\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*)(base)), "r" ((long)(stride))) + +#define _tile_2rpntlvwz1t1_internal(dst,base,stride) \ + __asm__ volatile\ + ("{t2rpntlvwz1t1\t(%0,%1,1), %%tmm"#dst"|t2rpntlvwz1t1\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*)(base)), "r" ((long)(stride))) + +#define _tile_transposed(dst,src) \ + _tile_transposed_internal (dst, src) + +#define _tile_2rpntlvwz0(dst,base,stride) \ + _tile_2rpntlvwz0_internal (dst, base, stride) + +#define _tile_2rpntlvwz0t1(dst,base,stride) \ + _tile_2rpntlvwz0t1_internal (dst, base, stride) + +#define _tile_2rpntlvwz1(dst,base,stride) \ + _tile_2rpntlvwz1_internal (dst, base, stride) + +#define _tile_2rpntlvwz1t1(dst,base,stride) \ + _tile_2rpntlvwz1t1_internal (dst, base, stride) + +#if !defined(__AMX_BF16__) +#pragma GCC push_options +#pragma GCC target("amx-bf16") +#define __DISABLE_AMX_BF16__ +#endif /* __AMX_BF16__ */ + +#define _tile_tdpbf16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{ttdpbf16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttdpbf16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_tdpbf16ps(src1_dst,src2,src3) \ + _tile_tdpbf16ps_internal (src1_dst, src2, src3) + +#ifdef __DISABLE_AMX_BF16__ +#undef __DISABLE_AMX_BF16__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_BF16__ */ + +#if !defined(__AMX_FP16__) +#pragma GCC push_options +#pragma GCC target("amx-fp16") +#define __DISABLE_AMX_FP16__ +#endif /* __AMX_FP16__ */ + +#define _tile_tdpfp16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{ttdpfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttdpfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_tdpfp16ps(src1_dst,src2,src3) \ + _tile_tdpfp16ps_internal (src1_dst, src2, src3) + +#ifdef __DISABLE_AMX_FP16__ +#undef __DISABLE_AMX_FP16__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_FP16__ */ + +#if !defined(__AMX_COMPLEX__) +#pragma GCC push_options +#pragma GCC target("amx-complex") +#define __DISABLE_AMX_COMPLEX__ +#endif /* __AMX_COMPLEX__ */ + +#define _tile_conjtcmmimfp16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{tconjtcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tconjtcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_conjtfp16_internal(dst,src) \ + __asm__ volatile\ + ("{tconjtfp16\t%%tmm"#src", %%tmm"#dst"|tconjtfp16\t%%tmm"#dst", %%tmm"#src"}" ::) + +#define _tile_tcmmimfp16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{ttcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", 
%%tmm"#src1_dst"|ttcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_tcmmrlfp16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{ttcmmrlfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttcmmrlfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_conjtcmmimfp16ps(src1_dst,src2,src3) \ + _tile_conjtcmmimfp16ps_internal (src1_dst, src2, src3) + +#define _tile_conjtfp16(dst,src) \ + _tile_conjtfp16_internal (dst, src) + +#define _tile_tcmmimfp16ps(src1_dst,src2,src3) \ + _tile_tcmmimfp16ps_internal (src1_dst, src2, src3) + +#define _tile_tcmmrlfp16ps(src1_dst,src2,src3) \ + _tile_tcmmrlfp16ps_internal (src1_dst, src2, src3) + +#ifdef __DISABLE_AMX_COMPLEX__ +#undef __DISABLE_AMX_COMPLEX__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_COMPLEX__ */ + +#if !defined(__AMX_TF32__) +#pragma GCC push_options +#pragma GCC target("amx-tf32") +#define __DISABLE_AMX_TF32__ +#endif /* __AMX_TF32__ */ + +#define _tile_tmmultf32ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{ttmmultf32ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|ttmmultf32ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_tmmultf32ps(src1_dst,src2,src3) \ + _tile_tmmultf32ps_internal (src1_dst, src2, src3) + +#ifdef __DISABLE_AMX_TF32__ +#undef __DISABLE_AMX_TF32__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_TF32__ */ + +#endif /* __x86_64__ */ + +#ifdef __DISABLE_AMX_TRANSPOSE__ +#undef __DISABLE_AMX_TRANSPOSE__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_TRANSPOSE__ */ + +#endif /* _AMXTRANSPOSEINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/avx10_2-512convertintrin.h b/gcc/config/i386/avx10_2-512convertintrin.h index dfbdfc3..178b5ff 100644 --- a/gcc/config/i386/avx10_2-512convertintrin.h +++ b/gcc/config/i386/avx10_2-512convertintrin.h @@ -276,7 +276,7 @@ _mm512_cvtne2ph_pbf8 (__m512h __A, __m512h __B) extern __inline__ __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtne2ph_pbf8 (__m512i __W, __mmask64 __U, +_mm512_mask_cvtne2ph_pbf8 (__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { return (__m512i) __builtin_ia32_vcvtne2ph2bf8512_mask ((__v32hf) __A, @@ -375,7 +375,7 @@ _mm512_cvtnes2ph_phf8 (__m512h __A, __m512h __B) extern __inline__ __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm512_mask_cvtnes2ph_phf8 (__m512i __W, __mmask64 __U, +_mm512_mask_cvtnes2ph_phf8 (__m512i __W, __mmask64 __U, __m512h __A, __m512h __B) { return (__m512i) __builtin_ia32_vcvtne2ph2hf8s512_mask ((__v32hf) __A, diff --git a/gcc/config/i386/avx10_2bf16intrin.h b/gcc/config/i386/avx10_2bf16intrin.h index f36fb8e..012c704 100644 --- a/gcc/config/i386/avx10_2bf16intrin.h +++ b/gcc/config/i386/avx10_2bf16intrin.h @@ -1270,14 +1270,14 @@ _mm_cmp_pbh_mask (__m128bh __A, __m128bh __B, const int __imm) #else #define _mm256_mask_cmp_pbh_mask(A, B, C, D) \ ((__mmask16) __builtin_ia32_cmppbf16256_mask ((B), (C), (D), (A))) - + #define _mm256_cmp_pbh_mask(A, B, C) \ ((__mmask16) __builtin_ia32_cmppbf16256_mask ((A), (B), (C), \ (__mmask16) (-1))) #define _mm_mask_cmp_pbh_mask(A, B, C, D) \ ((__mmask8) __builtin_ia32_cmppbf16128_mask ((B), (C), (D), (A))) - + #define _mm_cmp_pbh_mask(A, B, C) \ ((__mmask8) __builtin_ia32_cmppbf16128_mask ((A), (B), (C), \ (__mmask8) (-1))) diff --git a/gcc/config/i386/avx10_2convertintrin.h b/gcc/config/i386/avx10_2convertintrin.h index 8d2c1a5..08e34d5 100644 --- a/gcc/config/i386/avx10_2convertintrin.h +++ 
b/gcc/config/i386/avx10_2convertintrin.h @@ -429,7 +429,7 @@ _mm_cvtne2ph_pbf8 (__m128h __A, __m128h __B) extern __inline__ __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtne2ph_pbf8 (__m128i __W, __mmask16 __U, +_mm_mask_cvtne2ph_pbf8 (__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { return (__m128i) __builtin_ia32_vcvtne2ph2bf8128_mask ((__v8hf) __A, @@ -462,7 +462,7 @@ _mm256_cvtne2ph_pbf8 (__m256h __A, __m256h __B) extern __inline__ __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm256_mask_cvtne2ph_pbf8 (__m256i __W, __mmask32 __U, +_mm256_mask_cvtne2ph_pbf8 (__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) { return (__m256i) __builtin_ia32_vcvtne2ph2bf8256_mask ((__v16hf) __A, @@ -495,7 +495,7 @@ _mm_cvtnes2ph_pbf8 (__m128h __A, __m128h __B) extern __inline__ __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -_mm_mask_cvtnes2ph_pbf8 (__m128i __W, __mmask16 __U, +_mm_mask_cvtnes2ph_pbf8 (__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) { return (__m128i) __builtin_ia32_vcvtne2ph2bf8s128_mask ((__v8hf) __A, diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h index d9890c6..a4ab501 100644 --- a/gcc/config/i386/avx512dqintrin.h +++ b/gcc/config/i386/avx512dqintrin.h @@ -120,7 +120,7 @@ _cvtmask8_u32 (__mmask8 __A) { return (unsigned int) __builtin_ia32_kmovb ((__mmask8 ) __A); } - + extern __inline __mmask8 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _cvtu32_mask8 (unsigned int __A) diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h index 1869a92..c3096b7 100644 --- a/gcc/config/i386/avx512fp16intrin.h +++ b/gcc/config/i386/avx512fp16intrin.h @@ -3961,11 +3961,11 @@ _mm512_fpclass_ph_mask (__m512h __A, const int __imm) #else #define _mm512_mask_fpclass_ph_mask(u, x, c) \ ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \ - (int) (c),(__mmask8)(u))) + (int) (c),(__mmask32)(u))) #define _mm512_fpclass_ph_mask(x, c) \ ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \ - (int) (c),(__mmask8)-1)) + (int) (c),(__mmask32)-1)) #endif /* __OPIMTIZE__ */ /* Intrinsics vgetexpph. */ diff --git a/gcc/config/i386/biarch64.h b/gcc/config/i386/biarch64.h index e7c14bc..7cbfb2a 100644 --- a/gcc/config/i386/biarch64.h +++ b/gcc/config/i386/biarch64.h @@ -1,6 +1,6 @@ /* Make configure files to produce biarch compiler defaulting to 64bit mode. This file must be included very first, while the OS specific file later - to overwrite otherwise wrong defaults. + to overwrite otherwise wrong defaults. Copyright (C) 2001-2024 Free Software Foundation, Inc. Contributed by Bo Thorsen <bo@suse.de>. 
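(Aside, not part of the patch: the avx512fp16intrin.h hunk just above widens the mask casts in the macro forms of _mm512_mask_fpclass_ph_mask and _mm512_fpclass_ph_mask, used when __OPTIMIZE__ is not defined, from __mmask8 to __mmask32. A __m512h vector has 32 half-precision lanes, so the old 8-bit cast would truncate the user mask, and the (__mmask8)-1 default, to its low 8 bits. A minimal sketch of the intended use follows; it is illustrative only, the helper name is hypothetical, 0x81 selects the QNaN|SNaN fpclass categories, and it assumes a build with -mavx512fp16.)

#include <immintrin.h>

/* Illustrative only: flag NaN lanes, but only among elements 16..31.
   The upper 16 mask bits are exactly what an 8-bit cast would have lost.  */
__mmask32
nan_lanes_upper_half (__m512h x)
{
  return _mm512_mask_fpclass_ph_mask ((__mmask32) 0xFFFF0000u, x, 0x81);
}
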
diff --git a/gcc/config/i386/cmpccxaddintrin.h b/gcc/config/i386/cmpccxaddintrin.h index 39f368f..9349fb0 100644 --- a/gcc/config/i386/cmpccxaddintrin.h +++ b/gcc/config/i386/cmpccxaddintrin.h @@ -72,11 +72,11 @@ _cmpccxadd_epi64 (long long *__A, long long __B, long long __C, } #else #define _cmpccxadd_epi32(A,B,C,D) \ - __builtin_ia32_cmpccxadd ((int *) (A), (int) (B), (int) (C), \ + __builtin_ia32_cmpccxadd ((A), (int) (B), (int) (C), \ (_CMPCCX_ENUM) (D)) #define _cmpccxadd_epi64(A,B,C,D) \ - __builtin_ia32_cmpccxadd64 ((long long *) (A), (long long) (B), \ - (long long) (C), (_CMPCCX_ENUM) (D)) + __builtin_ia32_cmpccxadd64 ((A), (long long) (B), (long long) (C), \ + (_CMPCCX_ENUM) (D)) #endif #ifdef __DISABLE_CMPCCXADD__ diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index 1e8060e..55351e3 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -5,16 +5,16 @@ * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 3, or (at your option) any * later version. - * + * * This file is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. - * + * * Under Section 7 of GPL version 3, you are granted additional * permissions described in the GCC Runtime Library Exception, version * 3.1, as published by the Free Software Foundation. - * + * * You should have received a copy of the GNU General Public License and * a copy of the GCC Runtime Library Exception along with this program; * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see @@ -138,6 +138,7 @@ #define bit_AMX_FP16 (1 << 21) #define bit_HRESET (1 << 22) #define bit_AVXIFMA (1 << 23) +#define bit_MOVRS (1 << 31) /* %edx */ #define bit_AVXVNNIINT8 (1 << 4) @@ -162,6 +163,14 @@ #define bit_AESKLE ( 1<<0 ) #define bit_WIDEKL ( 1<<2 ) +/* AMX sub leaf (%eax == 0x1e, %ecx == 1) */ +/* %eax */ +#define bit_AMX_FP8 (1 << 4) +#define bit_AMX_TRANSPOSE (1 << 5) +#define bit_AMX_TF32 (1 << 6) +#define bit_AMX_AVX512 (1 << 7) +#define bit_AMX_MOVRS (1 << 8) + /* AVX10 sub leaf (%eax == 0x24) */ /* %ebx */ #define bit_AVX10_256 (1 << 17) diff --git a/gcc/config/i386/cygming.h b/gcc/config/i386/cygming.h index 9c8c7e3..bd1259f 100644 --- a/gcc/config/i386/cygming.h +++ b/gcc/config/i386/cygming.h @@ -192,7 +192,7 @@ along with GCC; see the file COPYING3. If not see in_section = NULL) /* Older versions of gas don't handle 'r' as data. - Explicitly set data flag with 'd'. */ + Explicitly set data flag with 'd'. */ #define READONLY_DATA_SECTION_ASM_OP "\t.section .rdata,\"dr\"" /* Don't allow flag_pic to propagate since gas may produce invalid code diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h index 63de10c..795d3a5 100644 --- a/gcc/config/i386/cygwin.h +++ b/gcc/config/i386/cygwin.h @@ -137,7 +137,7 @@ along with GCC; see the file COPYING3. If not see do not use them unnecessarily in gthr-posix.h. */ #define GTHREAD_USE_WEAK 0 -/* Every program on cygwin links against cygwin1.dll which contains +/* Every program on cygwin links against cygwin1.dll which contains the pthread routines. There is no need to explicitly link them and the -pthread flag is accepted only for compatibility. 
*/ #undef GOMP_SELF_SPECS diff --git a/gcc/config/i386/djgpp.h b/gcc/config/i386/djgpp.h index 1b5dfb7..e8c3934 100644 --- a/gcc/config/i386/djgpp.h +++ b/gcc/config/i386/djgpp.h @@ -98,7 +98,7 @@ along with GCC; see the file COPYING3. If not see while (0) #endif -/* This is how to tell assembler that a symbol is weak */ +/* This is how to tell assembler that a symbol is weak */ #undef ASM_WEAKEN_LABEL #define ASM_WEAKEN_LABEL(FILE,NAME) \ do { fputs ("\t.weak\t", FILE); assemble_name (FILE, NAME); \ diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc index 445f564..f0ce017 100644 --- a/gcc/config/i386/driver-i386.cc +++ b/gcc/config/i386/driver-i386.cc @@ -19,6 +19,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -252,7 +253,7 @@ decode_caches_intel (unsigned reg, bool xeon_mp, /* Detect cache parameters using CPUID function 2. */ static void -detect_caches_cpuid2 (bool xeon_mp, +detect_caches_cpuid2 (bool xeon_mp, struct cache_desc *level1, struct cache_desc *level2) { unsigned regs[4]; @@ -295,7 +296,7 @@ detect_caches_cpuid4 (struct cache_desc *level1, struct cache_desc *level2, int count; for (count = 0;; count++) - { + { __cpuid_count(4, count, eax, ebx, ecx, edx); switch (eax & 0x1f) { @@ -688,7 +689,7 @@ const char *host_detect_local_cpu (int argc, const char **argv) cpu = "haswell"; /* Assume Sandy Bridge. */ else - cpu = "sandybridge"; + cpu = "sandybridge"; } else if (has_feature (FEATURE_SSE4_2)) { diff --git a/gcc/config/i386/freebsd.h b/gcc/config/i386/freebsd.h index 583c752..2965048 100644 --- a/gcc/config/i386/freebsd.h +++ b/gcc/config/i386/freebsd.h @@ -48,17 +48,17 @@ along with GCC; see the file COPYING3. If not see #undef SIZE_TYPE #define SIZE_TYPE (TARGET_64BIT ? "long unsigned int" : "unsigned int") - + #undef PTRDIFF_TYPE #define PTRDIFF_TYPE (TARGET_64BIT ? "long int" : "int") - + #undef WCHAR_TYPE_SIZE #define WCHAR_TYPE_SIZE (TARGET_64BIT ? 32 : BITS_PER_WORD) #undef SUBTARGET_EXTRA_SPECS /* i386.h bogusly defines it. */ #define SUBTARGET_EXTRA_SPECS \ { "fbsd_dynamic_linker", FBSD_DYNAMIC_LINKER } - + /* Use the STARTFILE_SPEC from config/freebsd-spec.h. */ #undef STARTFILE_SPEC diff --git a/gcc/config/i386/gas.h b/gcc/config/i386/gas.h index ae77e27..fbf1686 100644 --- a/gcc/config/i386/gas.h +++ b/gcc/config/i386/gas.h @@ -50,7 +50,7 @@ along with GCC; see the file COPYING3. If not see doubt or guess work, and since this file is used for both a.out and other file formats, we use one of them. */ -#ifdef HAVE_GAS_BALIGN_AND_P2ALIGN +#ifdef HAVE_GAS_BALIGN_AND_P2ALIGN #undef ASM_OUTPUT_ALIGN #define ASM_OUTPUT_ALIGN(FILE,LOG) \ if ((LOG)!=0) fprintf ((FILE), "\t.balign %d\n", 1 << (LOG)) diff --git a/gcc/config/i386/gmm_malloc.h b/gcc/config/i386/gmm_malloc.h index 7e2ff62..6d4f30c 100644 --- a/gcc/config/i386/gmm_malloc.h +++ b/gcc/config/i386/gmm_malloc.h @@ -29,7 +29,7 @@ #include <errno.h> #endif -static __inline__ void * +static __inline__ void * _mm_malloc (size_t __size, size_t __align) { void * __malloc_ptr; @@ -50,7 +50,7 @@ _mm_malloc (size_t __size, size_t __align) /* Assume malloc'd pointer is aligned at least to sizeof (void*). If necessary, add another sizeof (void*) to store the value returned by malloc. Effectively this enforces a minimum alignment - of sizeof double. */ + of sizeof double. 
*/ if (__align < 2 * sizeof (void *)) __align = 2 * sizeof (void *); @@ -62,7 +62,7 @@ _mm_malloc (size_t __size, size_t __align) __aligned_ptr = (void *) (((size_t) __malloc_ptr + __align) & ~((size_t) (__align) - 1)); - /* Store the original pointer just before p. */ + /* Store the original pointer just before p. */ ((void **) __aligned_ptr)[-1] = __malloc_ptr; return __aligned_ptr; diff --git a/gcc/config/i386/gnu-user.h b/gcc/config/i386/gnu-user.h index da6f64b..a7673eb 100644 --- a/gcc/config/i386/gnu-user.h +++ b/gcc/config/i386/gnu-user.h @@ -41,16 +41,16 @@ along with GCC; see the file COPYING3. If not see #undef SIZE_TYPE #define SIZE_TYPE "unsigned int" - + #undef PTRDIFF_TYPE #define PTRDIFF_TYPE "int" - + #undef WCHAR_TYPE #define WCHAR_TYPE "long int" - + #undef WCHAR_TYPE_SIZE #define WCHAR_TYPE_SIZE BITS_PER_WORD - + /* Provide a LINK_SPEC appropriate for GNU userspace. Here we provide support for the special GCC options -static and -shared, which allow us to link things in one of these three modes by applying the appropriate diff --git a/gcc/config/i386/host-cygwin.cc b/gcc/config/i386/host-cygwin.cc index c72999c..a693d5b 100644 --- a/gcc/config/i386/host-cygwin.cc +++ b/gcc/config/i386/host-cygwin.cc @@ -59,7 +59,7 @@ cygwin_gt_pch_get_address (size_t sz, int fd) /* Cygwin requires that the underlying file be at least as large as the requested mapping. */ if ((size_t) p < sz) - { + { if (ftruncate (fd, sz) == -1) fatal_error (input_location, "cannot extend PCH file: %m"); } diff --git a/gcc/config/i386/host-mingw32.cc b/gcc/config/i386/host-mingw32.cc index 4256398..b010be7 100644 --- a/gcc/config/i386/host-mingw32.cc +++ b/gcc/config/i386/host-mingw32.cc @@ -47,7 +47,7 @@ static inline void w32_error(const char*, const char*, int, const char*); /* Granularity for reserving address space. */ static size_t va_granularity = 0x10000; -/* Print out the GetLastError() translation. */ +/* Print out the GetLastError() translation. */ static inline void w32_error (const char* function, const char* file, int line, const char* my_msg) @@ -93,7 +93,7 @@ mingw32_gt_pch_get_address (size_t size, int) for NT system dlls is in 0x70000000 to 0x78000000 range. If we allocate at bottom we need to reserve the address as early as possible and at the same point in each invocation. */ - + res = VirtualAlloc (NULL, size, MEM_RESERVE | MEM_TOP_DOWN, PAGE_NOACCESS); @@ -103,11 +103,11 @@ mingw32_gt_pch_get_address (size_t size, int) /* We do not need the address space for now, so free it. */ VirtualFree (res, 0, MEM_RELEASE); - return res; + return res; } /* ADDR is an address returned by gt_pch_get_address. Attempt to allocate - SIZE bytes at the same address and load it with the data from FD at + SIZE bytes at the same address and load it with the data from FD at OFFSET. Return -1 if we couldn't allocate memory at ADDR, return 0 if the memory is allocated but the data not loaded, return 1 if done. */ @@ -117,10 +117,10 @@ mingw32_gt_pch_use_address (void *&addr, size_t size, int fd, { void * mmap_addr; HANDLE mmap_handle; - + /* Apparently, MS Vista puts unnamed file mapping objects into Global namespace when running an application in a Terminal Server - session. This causes failure since, by default, applications + session. This causes failure since, by default, applications don't get SeCreateGlobalPrivilege. We don't need global memory sharing so explicitly put object into Local namespace. 
@@ -140,10 +140,10 @@ mingw32_gt_pch_use_address (void *&addr, size_t size, int fd, version_info.dwOSVersionInfoSize = sizeof (version_info); if (size == 0) - return 0; + return 0; /* Offset must be also be a multiple of allocation granularity for - this to work. We can't change the offset. */ + this to work. We can't change the offset. */ if ((offset & (va_granularity - 1)) != 0) return -1; @@ -166,7 +166,7 @@ mingw32_gt_pch_use_address (void *&addr, size_t size, int fd, if (mmap_handle == NULL) { w32_error (__FUNCTION__, __FILE__, __LINE__, "CreateFileMapping"); - return -1; + return -1; } /* Retry five times, as here might occure a race with multiple gcc's @@ -180,7 +180,7 @@ mingw32_gt_pch_use_address (void *&addr, size_t size, int fd, if (r != 4) Sleep (500); } - + if (mmap_addr != addr) { w32_error (__FUNCTION__, __FILE__, __LINE__, "MapViewOfFileEx"); diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 290f6e6..fff29b8 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -1507,3 +1507,12 @@ DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, INT, V8DF, UQI, INT) DEF_FUNCTION_TYPE (V32HF, V32HF, V32HF, INT, V32HF, USI, INT) DEF_FUNCTION_TYPE (V16HF, V16HF, V16HF, INT, V16HF, UHI, INT) DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, INT, V16SF, UHI, INT) + +# SM4 builtins +DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI) + +# MOVRS builtins +DEF_FUNCTION_TYPE (CHAR, PCCHAR) +DEF_FUNCTION_TYPE (SHORT, PCSHORT) +DEF_FUNCTION_TYPE (INT, PCINT) +DEF_FUNCTION_TYPE (INT64, PCINT64) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 151ccf4..c484e6d 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -505,6 +505,24 @@ BDESC (0, OPTION_MASK_ISA2_WIDEKL, CODE_FOR_nothing, "__builtin_ia32_aesencwide2 BDESC (0, OPTION_MASK_ISA2_PREFETCHI, CODE_FOR_prefetchi, "__builtin_ia32_prefetchi", IX86_BUILTIN_PREFETCHI, UNKNOWN, (int) VOID_FTYPE_PCVOID_INT) BDESC (0, 0, CODE_FOR_nothing, "__builtin_ia32_prefetch", IX86_BUILTIN_PREFETCH, UNKNOWN, (int) VOID_FTYPE_PCVOID_INT_INT_INT) +/* MOVRS */ +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS, CODE_FOR_movrsqi, "__builtin_ia32_movrsqi", IX86_BUILTIN_MOVRSQI, UNKNOWN, (int) CHAR_FTYPE_PCCHAR) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS, CODE_FOR_movrshi, "__builtin_ia32_movrshi", IX86_BUILTIN_MOVRSHI, UNKNOWN, (int) SHORT_FTYPE_PCSHORT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS, CODE_FOR_movrssi, "__builtin_ia32_movrssi", IX86_BUILTIN_MOVRSSI, UNKNOWN, (int) INT_FTYPE_PCINT) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS, CODE_FOR_movrsdi, "__builtin_ia32_movrsdi", IX86_BUILTIN_MOVRSDI, UNKNOWN, (int) INT64_FTYPE_PCINT64) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS | OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_vmovrsbv64qi_mask, "__builtin_ia32_vmovrsb512_mask", IX86_BUILTIN_VMOVRSB_512, UNKNOWN, (int) V64QI_FTYPE_PCV64QI_V64QI_UDI) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS | OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_vmovrsdv16si_mask, "__builtin_ia32_vmovrsd512_mask", IX86_BUILTIN_VMOVRSD_512, UNKNOWN, (int) V16SI_FTYPE_PCV16SI_V16SI_UHI) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS | OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_avx10_2_vmovrsqv8di_mask, "__builtin_ia32_vmovrsq512_mask", IX86_BUILTIN_VMOVRSQ_512, UNKNOWN, (int) V8DI_FTYPE_PCV8DI_V8DI_UQI) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS | OPTION_MASK_ISA2_AVX10_2_512, 
CODE_FOR_avx10_2_vmovrswv32hi_mask, "__builtin_ia32_vmovrsw512_mask", IX86_BUILTIN_VMOVRSW_512, UNKNOWN, (int) V32HI_FTYPE_PCV32HI_V32HI_USI) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_vmovrsbv32qi_mask, "__builtin_ia32_vmovrsb256_mask", IX86_BUILTIN_VMOVRSB_256, UNKNOWN, (int) V32QI_FTYPE_PCV32QI_V32QI_USI) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_vmovrsdv8si_mask, "__builtin_ia32_vmovrsd256_mask", IX86_BUILTIN_VMOVRSD_256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI_UQI) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_vmovrsqv4di_mask, "__builtin_ia32_vmovrsq256_mask", IX86_BUILTIN_VMOVRSQ_256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI_UQI) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_vmovrswv16hi_mask, "__builtin_ia32_vmovrsw256_mask", IX86_BUILTIN_VMOVRSW_256, UNKNOWN, (int) V16HI_FTYPE_PCV16HI_V16HI_UHI) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_vmovrsbv16qi_mask, "__builtin_ia32_vmovrsb128_mask", IX86_BUILTIN_VMOVRSB_128, UNKNOWN, (int) V16QI_FTYPE_PCV16QI_V16QI_UHI) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_vmovrsdv4si_mask, "__builtin_ia32_vmovrsd128_mask", IX86_BUILTIN_VMOVRSD_128, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI_UQI) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_vmovrsqv2di_mask, "__builtin_ia32_vmovrsq128_mask", IX86_BUILTIN_VMOVRSQ_128, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI_UQI) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_MOVRS | OPTION_MASK_ISA2_AVX10_2_256, CODE_FOR_avx10_2_vmovrswv8hi_mask, "__builtin_ia32_vmovrsw128_mask", IX86_BUILTIN_VMOVRSW_128, UNKNOWN, (int) V8HI_FTYPE_PCV8HI_V8HI_UQI) + BDESC_END (SPECIAL_ARGS, PURE_ARGS) /* AVX */ @@ -1668,8 +1686,10 @@ BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_SM3, CODE_FOR_vsm3rnds2, "__builtin /* SM4 */ BDESC (0, OPTION_MASK_ISA2_SM4, CODE_FOR_vsm4key4_v4si, "__builtin_ia32_vsm4key4128", IX86_BUILTIN_VSM4KEY4128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI) BDESC (0, OPTION_MASK_ISA2_SM4, CODE_FOR_vsm4key4_v8si, "__builtin_ia32_vsm4key4256", IX86_BUILTIN_VSM4KEY4256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI) +BDESC (0, OPTION_MASK_ISA2_SM4 | OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vsm4key4_v16si, "__builtin_ia32_vsm4key4512", IX86_BUILTIN_VSM4KEY4512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI) BDESC (0, OPTION_MASK_ISA2_SM4, CODE_FOR_vsm4rnds4_v4si, "__builtin_ia32_vsm4rnds4128", IX86_BUILTIN_VSM4RNDS4128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI) BDESC (0, OPTION_MASK_ISA2_SM4, CODE_FOR_vsm4rnds4_v8si, "__builtin_ia32_vsm4rnds4256", IX86_BUILTIN_VSM4RNDS4256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI) +BDESC (0, OPTION_MASK_ISA2_SM4 | OPTION_MASK_ISA2_AVX10_2_512, CODE_FOR_vsm4rnds4_v16si, "__builtin_ia32_vsm4rnds4512", IX86_BUILTIN_VSM4RNDS4512, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI) /* SHA512 */ BDESC (0, OPTION_MASK_ISA2_SHA512, CODE_FOR_vsha512msg1, "__builtin_ia32_vsha512msg1", IX86_BUILTIN_VSHA512MSG1, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI) @@ -2819,17 +2839,17 @@ BDESC (0, OPTION_MASK_ISA2_RDPID, CODE_FOR_rdpid, "__builtin_ia32_rdpid", IX86_B /* VAES. 
*/ BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) -BDESC (0, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) +BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) -BDESC (0, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesdeclast_v64qi, "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) +BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesdeclast_v64qi, "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) -BDESC (0, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) +BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) -BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) -BDESC (0, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) +BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) /* BF16 */ BDESC (0, OPTION_MASK_ISA2_AVX512BF16 | OPTION_MASK_ISA2_EVEX512, CODE_FOR_avx512f_cvtne2ps2bf16_v32bf, "__builtin_ia32_cvtne2ps2bf16_v32bf", IX86_BUILTIN_CVTNE2PS2BF16_V32BF, UNKNOWN, (int) V32BF_FTYPE_V16SF_V16SF) diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc index 4286eeb..e12e758 100644 --- a/gcc/config/i386/i386-builtins.cc +++ b/gcc/config/i386/i386-builtins.cc @@ -18,6 +18,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -1849,7 +1850,7 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) target_node = ix86_valid_target_attribute_tree (decl, attrs, &global_options, &global_options_set, 0); - + gcc_assert (target_node); if (target_node == error_mark_node) return 0; @@ -1932,14 +1933,14 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) cl_target_option_restore (&global_options, &global_options_set, &cur_target); - + if (predicate_list && arg_str == NULL) { error_at (DECL_SOURCE_LOCATION (decl), "no dispatcher found for the versioning attributes"); return 0; } - + if (predicate_list) { predicate_decl = ix86_builtins [(int) builtin_fn]; @@ -2007,7 +2008,7 @@ get_builtin_code_for_version (tree decl, tree *predicate_list) *predicate_list = predicate_chain; } - return priority; + return priority; } /* This builds the processor_model struct type defined in diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc index 72435fe..da60da4 100644 --- a/gcc/config/i386/i386-c.cc +++ b/gcc/config/i386/i386-c.cc @@ -741,6 +741,18 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__AVX10_2_256__"); if (isa_flag2 & OPTION_MASK_ISA2_AVX10_2_512) def_or_undef (parse_in, "__AVX10_2_512__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_AVX512) + def_or_undef (parse_in, "__AMX_AVX512__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_TF32) + def_or_undef (parse_in, "__AMX_TF32__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_TRANSPOSE) + def_or_undef (parse_in, "__AMX_TRANSPOSE__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_FP8) + def_or_undef (parse_in, "__AMX_FP8__"); + if (isa_flag2 & OPTION_MASK_ISA2_MOVRS) + def_or_undef (parse_in, "__MOVRS__"); + if (isa_flag2 & OPTION_MASK_ISA2_AMX_MOVRS) + def_or_undef (parse_in, "__AMX_MOVRS__"); if (TARGET_IAMCU) { def_or_undef (parse_in, "__iamcu"); diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index d692008..515334a 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -18,6 +18,7 @@ along with GCC; see the file COPYING3. 
If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -1009,7 +1010,7 @@ ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[]) t = gen_reg_rtx (V4SFmode); else t = op0; - + if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) emit_move_insn (t, CONST0_RTX (V4SFmode)); else @@ -1684,7 +1685,7 @@ ix86_emit_binop (enum rtx_code code, machine_mode mode, op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src)); clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); } @@ -2916,6 +2917,11 @@ ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1) switch (ix86_fp_comparison_strategy (code)) { case IX86_FPCMP_COMI: + tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); + if (TARGET_AVX10_2_256 && (code == EQ || code == NE)) + tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_OPTCOMX); + if (unordered_compare) + tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); cmp_mode = CCFPmode; emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp)); break; @@ -3090,6 +3096,8 @@ ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16) { tmp = SUBREG_REG (op0); + if (GET_MODE (tmp) == V8HFmode || GET_MODE (tmp) == V8BFmode) + tmp = gen_lowpart (V8HImode, tmp); tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST); } else @@ -3139,12 +3147,17 @@ ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */ void -ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1) +ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2) { gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH); + rtx zero = NULL_RTX; + if (op2 != const0_rtx + && (TARGET_IEEE_FP || TARGET_ZERO_EXTEND_WITH_AND) + && GET_MODE (dest) == SImode) + zero = force_reg (SImode, const0_rtx); rtx gt = ix86_expand_fp_compare (GT, op0, op1); - rtx l0 = gen_label_rtx (); - rtx l1 = gen_label_rtx (); + rtx l0 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX; + rtx l1 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX; rtx l2 = TARGET_IEEE_FP ? 
gen_label_rtx () : NULL_RTX; rtx lend = gen_label_rtx (); rtx tmp; @@ -3158,32 +3171,185 @@ ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1) jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); add_reg_br_prob_note (jmp, profile_probability:: very_unlikely ()); } - rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode, - gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq, - gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx); - jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - add_reg_br_prob_note (jmp, profile_probability::unlikely ()); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt, - gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx); - jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - add_reg_br_prob_note (jmp, profile_probability::even ()); - emit_move_insn (dest, constm1_rtx); - emit_jump (lend); - emit_label (l0); - emit_move_insn (dest, const0_rtx); - emit_jump (lend); - emit_label (l1); - emit_move_insn (dest, const1_rtx); + if (op2 == const0_rtx) + { + rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode, + gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq, + gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx); + jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + add_reg_br_prob_note (jmp, profile_probability::unlikely ()); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt, + gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx); + jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + add_reg_br_prob_note (jmp, profile_probability::even ()); + emit_move_insn (dest, constm1_rtx); + emit_jump (lend); + emit_label (l0); + emit_move_insn (dest, const0_rtx); + emit_jump (lend); + emit_label (l1); + emit_move_insn (dest, const1_rtx); + } + else + { + rtx lt_tmp = NULL_RTX; + if (GET_MODE (dest) != SImode || !TARGET_ZERO_EXTEND_WITH_AND) + { + lt_tmp = gen_reg_rtx (QImode); + ix86_expand_setcc (lt_tmp, UNLT, gen_rtx_REG (CCFPmode, FLAGS_REG), + const0_rtx); + if (GET_MODE (dest) != QImode) + { + tmp = gen_reg_rtx (GET_MODE (dest)); + emit_insn (gen_rtx_SET (tmp, + gen_rtx_ZERO_EXTEND (GET_MODE (dest), + lt_tmp))); + lt_tmp = tmp; + } + } + rtx gt_tmp; + if (zero) + { + /* If TARGET_IEEE_FP and dest has SImode, emit SImode clear + before the floating point comparison and use setcc_si_slp + pattern to hide it from the combiner, so that it doesn't + undo it. Similarly for TARGET_ZERO_EXTEND_WITH_AND, where + the ZERO_EXTEND normally emitted would need to be AND + with flags clobber. */ + tmp = ix86_expand_compare (GT, XEXP (gt, 0), const0_rtx); + PUT_MODE (tmp, QImode); + emit_insn (gen_setcc_si_slp (zero, tmp, zero)); + gt_tmp = zero; + } + else + { + gt_tmp = gen_reg_rtx (QImode); + ix86_expand_setcc (gt_tmp, GT, XEXP (gt, 0), const0_rtx); + if (GET_MODE (dest) != QImode) + { + tmp = gen_reg_rtx (GET_MODE (dest)); + emit_insn (gen_rtx_SET (tmp, + gen_rtx_ZERO_EXTEND (GET_MODE (dest), + gt_tmp))); + gt_tmp = tmp; + } + } + if (lt_tmp) + { + tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp, + dest, 0, OPTAB_DIRECT); + if (!rtx_equal_p (tmp, dest)) + emit_move_insn (dest, tmp); + } + else + { + /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't + do ZERO_EXTEND without clobbering flags. */ + tmp = ix86_expand_compare (UNLT, XEXP (gt, 0), const0_rtx); + PUT_MODE (tmp, SImode); + emit_insn (gen_subsi3_carry (dest, gt_tmp, + force_reg (GET_MODE (dest), const0_rtx), + XEXP (gt, 0), tmp)); + } + } emit_jump (lend); if (l2) { emit_label (l2); - emit_move_insn (dest, const2_rtx); + emit_move_insn (dest, op2 == const0_rtx ? 
const2_rtx : op2); } emit_label (lend); } +/* Expand integral op0 <=> op1, i.e. + dest = op0 == op1 ? 0 : op0 < op1 ? -1 : 1. */ + +void +ix86_expand_int_spaceship (rtx dest, rtx op0, rtx op1, rtx op2) +{ + gcc_assert (INTVAL (op2)); + rtx zero1 = NULL_RTX, zero2 = NULL_RTX; + if (TARGET_ZERO_EXTEND_WITH_AND && GET_MODE (dest) == SImode) + { + zero1 = force_reg (SImode, const0_rtx); + if (INTVAL (op2) != 1) + zero2 = force_reg (SImode, const0_rtx); + } + + /* Not using ix86_expand_int_compare here, so that it doesn't swap + operands nor optimize CC mode - we need a mode usable for both + LT and GT resp. LTU and GTU comparisons with the same unswapped + operands. */ + rtx flags = gen_rtx_REG (INTVAL (op2) != 1 ? CCGCmode : CCmode, FLAGS_REG); + rtx tmp = gen_rtx_COMPARE (GET_MODE (flags), op0, op1); + emit_insn (gen_rtx_SET (flags, tmp)); + rtx lt_tmp = NULL_RTX; + if (zero2) + { + /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid + ZERO_EXTEND. */ + tmp = ix86_expand_compare (LT, flags, const0_rtx); + PUT_MODE (tmp, QImode); + emit_insn (gen_setcc_si_slp (zero2, tmp, zero2)); + lt_tmp = zero2; + } + else if (!zero1) + { + lt_tmp = gen_reg_rtx (QImode); + ix86_expand_setcc (lt_tmp, INTVAL (op2) != 1 ? LT : LTU, flags, + const0_rtx); + if (GET_MODE (dest) != QImode) + { + tmp = gen_reg_rtx (GET_MODE (dest)); + emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest), + lt_tmp))); + lt_tmp = tmp; + } + } + rtx gt_tmp; + if (zero1) + { + /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid + ZERO_EXTEND. */ + tmp = ix86_expand_compare (INTVAL (op2) != 1 ? GT : GTU, flags, + const0_rtx); + PUT_MODE (tmp, QImode); + emit_insn (gen_setcc_si_slp (zero1, tmp, zero1)); + gt_tmp = zero1; + } + else + { + gt_tmp = gen_reg_rtx (QImode); + ix86_expand_setcc (gt_tmp, INTVAL (op2) != 1 ? GT : GTU, flags, + const0_rtx); + if (GET_MODE (dest) != QImode) + { + tmp = gen_reg_rtx (GET_MODE (dest)); + emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest), + gt_tmp))); + gt_tmp = tmp; + } + } + if (lt_tmp) + { + tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp, dest, + 0, OPTAB_DIRECT); + if (!rtx_equal_p (tmp, dest)) + emit_move_insn (dest, tmp); + } + else + { + /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't + do ZERO_EXTEND without clobbering flags. */ + tmp = ix86_expand_compare (LTU, flags, const0_rtx); + PUT_MODE (tmp, SImode); + emit_insn (gen_subsi3_carry (dest, gt_tmp, + force_reg (GET_MODE (dest), const0_rtx), + flags, tmp)); + } +} + /* Expand comparison setting or clearing carry flag. Return true when successful and set pop for the operation. */ static bool @@ -4031,6 +4197,8 @@ ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode, return true; else if (GET_MODE_INNER (cmp_mode) == HFmode) return true; + else if (GET_MODE_INNER (cmp_mode) == BFmode) + return true; /* When op_true is NULL, op_false must be NULL, or vice versa. 
*/ gcc_assert (!op_true == !op_false); @@ -4247,23 +4415,23 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) switch (mode) { case E_V2SFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_mmx_blendvps; break; case E_V4SFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_sse4_1_blendvps; break; case E_V2DFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_sse4_1_blendvpd; break; case E_SFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_sse4_1_blendvss; break; case E_DFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_sse4_1_blendvsd; break; case E_V8QImode: @@ -4271,7 +4439,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V4HFmode: case E_V4BFmode: case E_V2SImode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) { gen = gen_mmx_pblendvb_v8qi; blend_mode = V8QImode; @@ -4281,14 +4449,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V2HImode: case E_V2HFmode: case E_V2BFmode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) { gen = gen_mmx_pblendvb_v4qi; blend_mode = V4QImode; } break; case E_V2QImode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) gen = gen_mmx_pblendvb_v2qi; break; case E_V16QImode: @@ -4298,18 +4466,18 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V4SImode: case E_V2DImode: case E_V1TImode: - if (TARGET_SSE4_1) + if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1) { gen = gen_sse4_1_pblendvb; blend_mode = V16QImode; } break; case E_V8SFmode: - if (TARGET_AVX) + if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV) gen = gen_avx_blendvps256; break; case E_V4DFmode: - if (TARGET_AVX) + if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV) gen = gen_avx_blendvpd256; break; case E_V32QImode: @@ -4318,7 +4486,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V16BFmode: case E_V8SImode: case E_V4DImode: - if (TARGET_AVX2) + if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV) { gen = gen_avx2_pblendvb; blend_mode = V32QImode; @@ -5590,7 +5758,7 @@ ix86_expand_vec_perm (rtx operands[]) if (TARGET_XOP) { - /* The XOP VPPERM insn supports three inputs. By ignoring the + /* The XOP VPPERM insn supports three inputs. By ignoring the one_operand_shuffle special case, we avoid creating another set of constant vectors in memory. */ one_operand_shuffle = false; @@ -7418,7 +7586,7 @@ ix86_expand_v1ti_ashiftrt (rtx operands[]) rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3)); rtx tmp8 = gen_reg_rtx (V1TImode); emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64))); - + rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3)); rtx tmp10 = gen_reg_rtx (V2DImode); emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits))); @@ -8414,7 +8582,7 @@ expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem, DONE_LABEL is a label after the whole copying sequence. The label is created on demand if *DONE_LABEL is NULL. MIN_SIZE is minimal size of block copied. This value gets adjusted for new - bounds after the initial copies. + bounds after the initial copies. DESTMEM/SRCMEM are memory expressions pointing to the copies block, DESTPTR/SRCPTR are pointers to the block. 
DYNAMIC_CHECK indicate whether @@ -8723,7 +8891,7 @@ expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg, return dst; } -/* Return true if ALG can be used in current context. +/* Return true if ALG can be used in current context. Assume we expand memset if MEMSET is true. */ static bool alg_usable_p (enum stringop_alg alg, bool memset, bool have_as) @@ -9089,7 +9257,7 @@ ix86_copy_addr_to_reg (rtx addr) with specified algorithm. 4) Epilogue: code copying tail of the block that is too small to be - handled by main body (or up to size guarded by prologue guard). + handled by main body (or up to size guarded by prologue guard). Misaligned move sequence @@ -9307,7 +9475,7 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, /* Do the cheap promotion to allow better CSE across the main loop and epilogue (ie one load of the big constant in the - front of all code. + front of all code. For now the misaligned move sequences do not have fast path without broadcasting. */ if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used))) @@ -11247,6 +11415,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16QI_FTYPE_V8HI_V8HI: case V16HF_FTYPE_V16HF_V16HF: case V16SF_FTYPE_V16SF_V16SF: + case V16SI_FTYPE_V16SI_V16SI: case V8QI_FTYPE_V8QI_V8QI: case V8QI_FTYPE_V4HI_V4HI: case V8HI_FTYPE_V8HI_V8HI: @@ -12741,7 +12910,7 @@ ix86_expand_round_builtin (const struct builtin_description *d, /* Skip erasing embedded rounding for below expanders who generates multiple insns. In ix86_erase_embedded_rounding the pattern will be transformed to a single set, and emit_insn - appends the set insead of insert it to chain. So the insns + appends the set instead of insert it to chain. So the insns emitted inside define_expander would be ignored. */ switch (icode) { @@ -12858,6 +13027,10 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, klass = load; memory = 0; break; + case CHAR_FTYPE_PCCHAR: + case SHORT_FTYPE_PCSHORT: + case INT_FTYPE_PCINT: + case INT64_FTYPE_PCINT64: case UINT64_FTYPE_PUNSIGNED: case V2DI_FTYPE_PV2DI: case V4DI_FTYPE_PV4DI: @@ -13570,13 +13743,13 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, case IX86_BUILTIN_LDMXCSR: op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); - target = assign_386_stack_local (SImode, SLOT_TEMP); + target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode)); emit_move_insn (target, op0); emit_insn (gen_sse_ldmxcsr (target)); return 0; case IX86_BUILTIN_STMXCSR: - target = assign_386_stack_local (SImode, SLOT_TEMP); + target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode)); emit_insn (gen_sse_stmxcsr (target)); return copy_to_mode_reg (SImode, target); @@ -13625,7 +13798,7 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, if (!REG_P (op2)) op2 = copy_to_mode_reg (SImode, op2); - emit_insn (fcode == IX86_BUILTIN_MONITOR + emit_insn (fcode == IX86_BUILTIN_MONITOR ? gen_sse3_monitor (Pmode, op0, op1, op2) : gen_monitorx (Pmode, op0, op1, op2)); return 0; @@ -15928,7 +16101,7 @@ static const ix86_vec_bcast_map_simode_t ix86_vec_bcast_map_simode[] = { }; /* Comparator for bsearch on ix86_vec_bcast_map. 
*/ -static int +static int ix86_vec_bcast_map_simode_cmp (const void *key, const void *entry) { return (*(const unsigned int*)key) @@ -18096,6 +18269,8 @@ quarter: else if (use_vec_merge) { do_vec_merge: + if (!nonimmediate_operand (val, inner_mode)) + val = force_reg (inner_mode, val); tmp = gen_rtx_VEC_DUPLICATE (mode, val); tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (HOST_WIDE_INT_1U << elt)); @@ -25142,7 +25317,7 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) /* Multiply lower parts and add all */ t5 = gen_reg_rtx (V2DImode); - emit_insn (gen_vec_widen_umult_even_v4si (t5, + emit_insn (gen_vec_widen_umult_even_v4si (t5, gen_lowpart (V4SImode, op1), gen_lowpart (V4SImode, op2))); force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT); @@ -25337,7 +25512,7 @@ ix86_expand_pextr (rtx *operands) return false; dst = SUBREG_REG (dst); } - + if (SUBREG_P (src)) { pos += SUBREG_BYTE (src) * BITS_PER_UNIT; @@ -25573,7 +25748,7 @@ ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0, rtx op1, rtx *quot_p, rtx *rem_p) { - rtx rem = assign_386_stack_local (mode, SLOT_TEMP); + rtx rem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL, mode, op0, mode, op1, mode, @@ -25952,7 +26127,7 @@ ix86_gen_bcst_mem (machine_mode mode, rtx x) && !CONST_DOUBLE_P (cst) && !CONST_FIXED_P (cst)) return NULL_RTX; - + int n_elts = GET_MODE_NUNITS (mode); if (CONST_VECTOR_NUNITS (x) != n_elts) return NULL_RTX; @@ -26120,7 +26295,7 @@ do_mem_operand: /* Return TRUE if OP (in mode MODE) is the leaf of a ternary logic expression, such as a register or a memory reference. */ - + bool ix86_ternlog_leaf_p (rtx op, machine_mode mode) { @@ -26309,7 +26484,7 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, return target; } break; - + case 0x22: /* ~b&c */ if ((!op0 || !side_effects_p (op0)) && op1 && register_operand (op1, mode) @@ -26382,7 +26557,7 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, return target; } break; - + case 0x5a: /* a^c */ if (op0 && ix86_ternlog_leaf_p (op0, mode) && op2 && ix86_ternlog_leaf_p (op2, mode) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index ca902ec..e2e8521 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -18,6 +18,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -350,7 +351,7 @@ scalar_chain::mark_dual_mode_def (df_ref def) return; n_sse_to_integer++; } - + if (dump_file) fprintf (dump_file, " Mark r%d def in insn %d as requiring both modes in chain #%d\n", @@ -1503,6 +1504,23 @@ general_scalar_chain::convert_insn (rtx_insn *insn) df_insn_rescan (insn); } +/* Helper function to compute gain for loading an immediate constant. + Typically, two movabsq for TImode vs. vmovdqa for V1TImode, but + with numerous special cases. */ + +static int +timode_immed_const_gain (rtx cst) +{ + /* movabsq vs. movabsq+vmovq+vunpacklqdq. */ + if (CONST_WIDE_INT_P (cst) + && CONST_WIDE_INT_NUNITS (cst) == 2 + && CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1)) + return optimize_insn_for_size_p () ? -COSTS_N_BYTES (9) + : -COSTS_N_INSNS (2); + /* 2x movabsq ~ vmovdqa. */ + return 0; +} + /* Compute a gain for chain conversion. 
*/ int @@ -1549,7 +1567,14 @@ timode_scalar_chain::compute_convert_gain () case CONST_INT: if (MEM_P (dst) && standard_sse_constant_p (src, V1TImode)) - igain = optimize_insn_for_size_p() ? COSTS_N_BYTES (11) : 1; + igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (11) : 1; + break; + + case CONST_WIDE_INT: + /* 2 x mov vs. vmovdqa. */ + if (MEM_P (dst)) + igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (3) + : COSTS_N_INSNS (1); break; case NOT: @@ -1562,6 +1587,8 @@ timode_scalar_chain::compute_convert_gain () case IOR: if (!MEM_P (dst)) igain = COSTS_N_INSNS (1); + if (CONST_SCALAR_INT_P (XEXP (src, 1))) + igain += timode_immed_const_gain (XEXP (src, 1)); break; case ASHIFT: @@ -2304,14 +2331,16 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn) || CONST_SCALAR_INT_P (XEXP (src, 1)) || timode_mem_p (XEXP (src, 1)))) return true; - return REG_P (XEXP (src, 0)) + return (REG_P (XEXP (src, 0)) + || timode_mem_p (XEXP (src, 0))) && (REG_P (XEXP (src, 1)) || CONST_SCALAR_INT_P (XEXP (src, 1)) || timode_mem_p (XEXP (src, 1))); case IOR: case XOR: - return REG_P (XEXP (src, 0)) + return (REG_P (XEXP (src, 0)) + || timode_mem_p (XEXP (src, 0))) && (REG_P (XEXP (src, 1)) || CONST_SCALAR_INT_P (XEXP (src, 1)) || timode_mem_p (XEXP (src, 1))); @@ -3629,7 +3658,7 @@ ix86_compare_version_priority (tree decl1, tree decl2) /* V1 and V2 point to function versions with different priorities based on the target ISA. This function compares their priorities. */ - + static int feature_compare (const void *v1, const void *v2) { @@ -3678,7 +3707,7 @@ add_condition_to_bb (tree function_decl, tree version_decl, convert_expr = build1 (CONVERT_EXPR, ptr_type_node, build_fold_addr_expr (version_decl)); result_var = create_tmp_var (ptr_type_node); - convert_stmt = gimple_build_assign (result_var, convert_expr); + convert_stmt = gimple_build_assign (result_var, convert_expr); return_stmt = gimple_build_return (result_var); if (predicate_chain == NULL_TREE) @@ -3705,7 +3734,7 @@ add_condition_to_bb (tree function_decl, tree version_decl, gimple_seq_add_stmt (&gseq, call_cond_stmt); predicate_chain = TREE_CHAIN (predicate_chain); - + if (and_expr_var == NULL) and_expr_var = cond_var; else @@ -3746,7 +3775,7 @@ add_condition_to_bb (tree function_decl, tree version_decl, gimple_set_bb (return_stmt, bb2); bb3 = e23->dest; - make_edge (bb1, bb3, EDGE_FALSE_VALUE); + make_edge (bb1, bb3, EDGE_FALSE_VALUE); remove_edge (e23); make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0); @@ -3907,7 +3936,7 @@ ix86_mangle_function_version_assembler_name (tree decl, tree id) return ret; } -tree +tree ix86_mangle_decl_assembler_name (tree decl, tree id) { /* For function version, add the target suffix to the assembler name. */ @@ -3937,7 +3966,7 @@ ix86_get_function_versions_dispatcher (void *decl) tree dispatch_decl = NULL; struct cgraph_function_version_info *default_version_info = NULL; - + gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn)); node = cgraph_node::get (fn); @@ -3945,7 +3974,7 @@ ix86_get_function_versions_dispatcher (void *decl) node_v = node->function_version (); gcc_assert (node_v != NULL); - + if (node_v->dispatcher_resolver != NULL) return node_v->dispatcher_resolver; @@ -4101,7 +4130,7 @@ make_resolver_func (const tree default_decl, provide the code to dispatch the right function at run-time. NODE points to the dispatcher decl whose body will be created. 
*/ -tree +tree ix86_generate_version_dispatcher_body (void *node_p) { tree resolver_decl; diff --git a/gcc/config/i386/i386-isa.def b/gcc/config/i386/i386-isa.def index bfb33ba..6cff3a2 100644 --- a/gcc/config/i386/i386-isa.def +++ b/gcc/config/i386/i386-isa.def @@ -123,3 +123,9 @@ DEF_PTA(AVX10_1_256) DEF_PTA(AVX10_1_512) DEF_PTA(AVX10_2_256) DEF_PTA(AVX10_2_512) +DEF_PTA(AMX_AVX512) +DEF_PTA(AMX_TF32) +DEF_PTA(AMX_TRANSPOSE) +DEF_PTA(AMX_FP8) +DEF_PTA(MOVRS) +DEF_PTA(AMX_MOVRS) diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index f79257c..239269e 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -18,6 +18,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -262,7 +263,13 @@ static struct ix86_target_opts isa2_opts[] = { "-mavx10.1-256", OPTION_MASK_ISA2_AVX10_1_256 }, { "-mavx10.1-512", OPTION_MASK_ISA2_AVX10_1_512 }, { "-mavx10.2-256", OPTION_MASK_ISA2_AVX10_2_256 }, - { "-mavx10.2-512", OPTION_MASK_ISA2_AVX10_2_512 } + { "-mavx10.2-512", OPTION_MASK_ISA2_AVX10_2_512 }, + { "-mamx-avx512", OPTION_MASK_ISA2_AMX_AVX512 }, + { "-mamx-tf32", OPTION_MASK_ISA2_AMX_TF32 }, + { "-mamx-transpose", OPTION_MASK_ISA2_AMX_TRANSPOSE }, + { "-mamx-fp8", OPTION_MASK_ISA2_AMX_FP8 }, + { "-mmovrs", OPTION_MASK_ISA2_MOVRS }, + { "-mamx-movrs", OPTION_MASK_ISA2_AMX_MOVRS } }; static struct ix86_target_opts isa_opts[] = { @@ -754,7 +761,7 @@ static unsigned HOST_WIDE_INT initial_ix86_arch_features[X86_ARCH_LAST] = { ~m_386, }; -/* This table must be in sync with enum processor_type in i386.h. */ +/* This table must be in sync with enum processor_type in i386.h. */ static const struct processor_costs *processor_cost_table[] = { &generic_cost, @@ -1131,6 +1138,12 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], IX86_ATTR_ISA ("avx10.2", OPT_mavx10_2_256), IX86_ATTR_ISA ("avx10.2-256", OPT_mavx10_2_256), IX86_ATTR_ISA ("avx10.2-512", OPT_mavx10_2_512), + IX86_ATTR_ISA ("amx-avx512", OPT_mamx_avx512), + IX86_ATTR_ISA ("amx-tf32", OPT_mamx_tf32), + IX86_ATTR_ISA ("amx-transpose", OPT_mamx_transpose), + IX86_ATTR_ISA ("amx-fp8", OPT_mamx_fp8), + IX86_ATTR_ISA ("movrs", OPT_mmovrs), + IX86_ATTR_ISA ("amx-movrs", OPT_mamx_movrs), /* enum options */ IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), @@ -1545,9 +1558,9 @@ ix86_valid_target_attribute_p (tree fndecl, tree old_optimize = build_optimization_node (&global_options, &global_options_set); - /* Get the optimization options of the current function. */ + /* Get the optimization options of the current function. */ tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); - + if (!func_optimize) func_optimize = old_optimize; @@ -2836,7 +2849,7 @@ ix86_option_override_internal (bool main_args_p, /* For all chips supporting SSE2, -mfpmath=sse performs better than fpmath=387. The second is however default at many targets since the extra 80bit precision of temporaries is considered to be part of ABI. - Overwrite the default at least for -ffast-math. + Overwrite the default at least for -ffast-math. TODO: -mfpmath=both seems to produce same performing code with bit smaller binaries. It is however not clear if register allocation is ready for this setting. 
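As a quick illustration of how the new AMX/MOVRS ISA bits added above surface to users: the preprocessor macro names come from the i386-c.cc hunk, the option and attribute names from the isa2_opts and IX86_ATTR_ISA additions in i386-options.cc. The test program below is only a sketch and is not part of the patch; it would be compiled with, e.g., "gcc -mmovrs -mamx-tf32 -mamx-transpose test.c".

#include <stdio.h>

int
main (void)
{
  /* Each of these macros is defined only when the corresponding -m
     option is enabled on the command line (see the def_or_undef calls
     added in ix86_target_macros_internal).  */
#ifdef __MOVRS__
  puts ("__MOVRS__ is defined (-mmovrs)");
#endif
#ifdef __AMX_TF32__
  puts ("__AMX_TF32__ is defined (-mamx-tf32)");
#endif
#ifdef __AMX_TRANSPOSE__
  puts ("__AMX_TRANSPOSE__ is defined (-mamx-transpose)");
#endif
#ifdef __AMX_MOVRS__
  puts ("__AMX_MOVRS__ is defined (-mamx-movrs)");
#endif
  return 0;
}

The same ISA names are also accepted per function via the target attribute, e.g. __attribute__ ((target ("amx-fp8,amx-movrs"))), which is what the IX86_ATTR_ISA entries above wire into ix86_valid_target_attribute_inner_p.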
@@ -3680,8 +3693,8 @@ char * ix86_offload_options (void) { if (TARGET_LP64) - return xstrdup ("-foffload-abi=lp64"); - return xstrdup ("-foffload-abi=ilp32"); + return xstrdup ("-foffload-abi=lp64 -foffload-abi-host-opts=-m64"); + return xstrdup ("-foffload-abi=ilp32 -foffload-abi-host-opts=-m32"); } /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall", diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h index c7ec0d9..35542b2 100644 --- a/gcc/config/i386/i386-opts.h +++ b/gcc/config/i386/i386-opts.h @@ -29,7 +29,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see enum stringop_alg { #undef DEF_ALG -#define DEF_ALG(alg, name) alg, +#define DEF_ALG(alg, name) alg, #include "stringop.def" last_alg diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 3a7bc94..c1f9147 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -164,7 +164,8 @@ extern bool ix86_expand_fp_vec_cmp (rtx[]); extern void ix86_expand_sse_movcc (rtx, rtx, rtx, rtx); extern void ix86_expand_sse_extend (rtx, rtx, bool); extern void ix86_expand_sse_unpack (rtx, rtx, bool, bool); -extern void ix86_expand_fp_spaceship (rtx, rtx, rtx); +extern void ix86_expand_fp_spaceship (rtx, rtx, rtx, rtx); +extern void ix86_expand_int_spaceship (rtx, rtx, rtx, rtx); extern bool ix86_expand_int_addcc (rtx[]); extern void ix86_expand_carry (rtx arg); extern rtx_insn *ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool); diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 224a78c..473e4cb 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -17,6 +17,7 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ +#define INCLUDE_MEMORY #define INCLUDE_STRING #define IN_TARGET_CODE 1 @@ -180,7 +181,7 @@ enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] = /* The "default" register map used in 32bit mode. */ -int const debugger_register_map[FIRST_PSEUDO_REGISTER] = +unsigned int const debugger_register_map[FIRST_PSEUDO_REGISTER] = { /* general regs */ 0, 2, 1, 3, 6, 7, 4, 5, @@ -211,7 +212,7 @@ int const debugger_register_map[FIRST_PSEUDO_REGISTER] = /* The "default" register map used in 64bit mode. */ -int const debugger64_register_map[FIRST_PSEUDO_REGISTER] = +unsigned int const debugger64_register_map[FIRST_PSEUDO_REGISTER] = { /* general regs */ 0, 1, 2, 3, 4, 5, 6, 7, @@ -293,7 +294,7 @@ int const debugger64_register_map[FIRST_PSEUDO_REGISTER] = 17 for %st(6) (gcc regno = 14) 18 for %st(7) (gcc regno = 15) */ -int const svr4_debugger_register_map[FIRST_PSEUDO_REGISTER] = +unsigned int const svr4_debugger_register_map[FIRST_PSEUDO_REGISTER] = { /* general regs */ 0, 2, 1, 3, 6, 7, 5, 4, @@ -511,7 +512,7 @@ ix86_conditional_register_usage (void) /* See the definition of CALL_USED_REGISTERS in i386.h. */ c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI); - + CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]); for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) @@ -1939,7 +1940,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ The midde-end can't deal with the vector types > 16 bytes. In this case, we return the original mode and warn ABI change if CUM isn't - NULL. + NULL. If INT_RETURN is true, warn ABI change if the vector mode isn't available for function return value. 
*/ @@ -4269,7 +4270,7 @@ ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl, if (fntype_or_decl && DECL_P (fntype_or_decl)) fn = fntype_or_decl; fntype = fn ? TREE_TYPE (fn) : fntype_or_decl; - + if (ix86_function_type_abi (fntype) == MS_ABI) { if (TARGET_64BIT) @@ -4387,7 +4388,7 @@ ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) /* Unless ABI prescibes otherwise, MMX/3dNow values are returned in MM0 if available. */ - + if (size == 8) return TARGET_VECT8_RETURNS || !TARGET_MMX; @@ -4506,7 +4507,7 @@ ix86_build_builtin_va_list (void) /* For SYSV_ABI we use an array of one record. */ sysv_va_list_type_node = ix86_build_builtin_va_list_64 (); - + /* For MS_ABI we use plain pointer to argument area. */ tree char_ptr_type = build_pointer_type (char_type_node); tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE, @@ -4907,13 +4908,31 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs); - need_temp = (!REG_P (container) + bool container_in_reg = false; + if (REG_P (container)) + container_in_reg = true; + else if (GET_CODE (container) == PARALLEL + && GET_MODE (container) == BLKmode + && XVECLEN (container, 0) == 1) + { + /* Check if it is a PARALLEL BLKmode container of an EXPR_LIST + expression in a TImode register. In this case, temp isn't + needed. Otherwise, the TImode variable will be put in the + GPR save area which guarantees only 8-byte alignment. */ + rtx x = XVECEXP (container, 0, 0); + if (GET_CODE (x) == EXPR_LIST + && REG_P (XEXP (x, 0)) + && XEXP (x, 1) == const0_rtx) + container_in_reg = true; + } + + need_temp = (!container_in_reg && ((needed_intregs && TYPE_ALIGN (type) > 64) || TYPE_ALIGN (type) > 128)); /* In case we are passing structure, verify that it is consecutive block on the register save area. If not we need to do moves. */ - if (!need_temp && !REG_P (container)) + if (!need_temp && !container_in_reg) { /* Verify that all registers are strictly consecutive */ if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0)))) @@ -5445,7 +5464,7 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) && float_vector_all_ones_operand (x, mode))) { enum attr_mode insn_mode = get_attr_mode (insn); - + switch (insn_mode) { case MODE_XI: @@ -5965,7 +5984,7 @@ ix86_frame_pointer_required (void) /* SSE saves require frame-pointer when stack is misaligned. */ if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128) return true; - + /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER turns off the frame pointer by default. Turn it back on now if we've not got a leaf function. */ @@ -6561,8 +6580,8 @@ gen_push2 (rtx mem, rtx reg1, rtx reg2, bool ppx_p = false) if (REG_P (reg2) && GET_MODE (reg2) != word_mode) reg2 = gen_rtx_REG (word_mode, REGNO (reg2)); - return ppx_p ? gen_push2p_di (mem, reg1, reg2): - gen_push2_di (mem, reg1, reg2); + return ppx_p ? gen_push2p_di (mem, reg1, reg2) + : gen_push2_di (mem, reg1, reg2); } /* Return >= 0 if there is an unused call-clobbered register available @@ -10772,7 +10791,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) addr = XEXP (addr, 0); if (CONST_INT_P (addr)) return false; - } + } else if (GET_CODE (addr) == AND && const_32bit_mask (XEXP (addr, 1), DImode)) { @@ -12450,7 +12469,7 @@ ix86_tls_address_pattern_p (rtx op) } /* Rewrite *LOC so that it refers to a default TLS address space. 
*/ -void +static void ix86_rewrite_tls_address_1 (rtx *loc) { subrtx_ptr_iterator::array_type array; @@ -12472,6 +12491,13 @@ ix86_rewrite_tls_address_1 (rtx *loc) if (GET_CODE (u) == UNSPEC && XINT (u, 1) == UNSPEC_TP) { + /* NB: Since address override only applies to the + (reg32) part in fs:(reg32), return if address + override is used. */ + if (Pmode != word_mode + && REG_P (XEXP (*x, 1 - i))) + return; + addr_space_t as = DEFAULT_TLS_SEG_REG; *x = XEXP (*x, 1 - i); @@ -13621,7 +13647,7 @@ ix86_print_operand (FILE *file, rtx x, int code) case 2: putc ('w', file); break; - + case 4: putc ('l', file); break; @@ -14881,9 +14907,19 @@ ix86_dirflag_mode_needed (rtx_insn *insn) static bool ix86_check_avx_upper_register (const_rtx exp) { - return (SSE_REG_P (exp) - && !EXT_REX_SSE_REG_P (exp) - && GET_MODE_BITSIZE (GET_MODE (exp)) > 128); + /* construct_container may return a parallel with expr_list + which contains the real reg and mode */ + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, exp, NONCONST) + { + const_rtx x = *iter; + if (SSE_REG_P (x) + && !EXT_REX_SSE_REG_P (x) + && GET_MODE_BITSIZE (GET_MODE (x)) > 128) + return true; + } + + return false; } /* Check if a 256bit or 512bit AVX register is referenced in stores. */ @@ -14891,7 +14927,9 @@ ix86_check_avx_upper_register (const_rtx exp) static void ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) { - if (ix86_check_avx_upper_register (dest)) + if (SSE_REG_P (dest) + && !EXT_REX_SSE_REG_P (dest) + && GET_MODE_BITSIZE (GET_MODE (dest)) > 128) { bool *used = (bool *) data; *used = true; @@ -14950,14 +14988,14 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) return AVX_U128_CLEAN; } - subrtx_iterator::array_type array; - rtx set = single_set (insn); if (set) { rtx dest = SET_DEST (set); rtx src = SET_SRC (set); - if (ix86_check_avx_upper_register (dest)) + if (SSE_REG_P (dest) + && !EXT_REX_SSE_REG_P (dest) + && GET_MODE_BITSIZE (GET_MODE (dest)) > 128) { /* This is an YMM/ZMM load. Return AVX_U128_DIRTY if the source isn't zero. */ @@ -14968,9 +15006,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) } else { - FOR_EACH_SUBRTX (iter, array, src, NONCONST) - if (ix86_check_avx_upper_register (*iter)) - return AVX_U128_DIRTY; + if (ix86_check_avx_upper_register (src)) + return AVX_U128_DIRTY; } /* This isn't YMM/ZMM load/store. */ @@ -14981,9 +15018,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) Hardware changes state only when a 256bit register is written to, but we need to prevent the compiler from moving optimal insertion point above eventual read from 256bit or 512 bit register. */ - FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) - if (ix86_check_avx_upper_register (*iter)) - return AVX_U128_DIRTY; + if (ix86_check_avx_upper_register (PATTERN (insn))) + return AVX_U128_DIRTY; return AVX_U128_ANY; } @@ -15972,7 +16008,7 @@ ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[]) if (optimize_size > 1 && parts.scale > 1 && !parts.base - && (!parts.disp || parts.disp == const0_rtx)) + && (!parts.disp || parts.disp == const0_rtx)) return true; /* Check we need to optimize. 
*/ @@ -16176,6 +16212,8 @@ ix86_build_const_vector (machine_mode mode, bool vect, rtx value) case E_V32BFmode: case E_V16BFmode: case E_V8BFmode: + case E_V4BFmode: + case E_V2BFmode: n_elt = GET_MODE_NUNITS (mode); v = rtvec_alloc (n_elt); scalar_mode = GET_MODE_INNER (mode); @@ -16215,6 +16253,8 @@ ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert) case E_V32BFmode: case E_V16BFmode: case E_V8BFmode: + case E_V4BFmode: + case E_V2BFmode: vec_mode = mode; imode = HImode; break; @@ -16623,6 +16663,11 @@ ix86_fp_compare_code_to_integer (enum rtx_code code) return LEU; case LTGT: return NE; + case EQ: + case NE: + if (TARGET_AVX10_2_256) + return code; + /* FALLTHRU. */ default: return UNKNOWN; } @@ -17124,7 +17169,7 @@ ix86_output_call_insn (rtx_insn *insn, rtx call_op) seh_nop_p = true; break; } - + /* If we get to another real insn, we don't need the nop. */ if (INSN_P (i)) break; @@ -17714,7 +17759,7 @@ ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail) using topological ordering in the region. */ if (rgn == CONTAINING_RGN (e->src->index) && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index)) - add_dependee_for_func_arg (first_arg, e->src); + add_dependee_for_func_arg (first_arg, e->src); } } insn = first_arg; @@ -18086,7 +18131,7 @@ ix86_local_alignment (tree exp, machine_mode mode, other unit cannot rely on the alignment. Exclude va_list type. It is the common case of local array where - we cannot benefit from the alignment. + we cannot benefit from the alignment. TODO: Probably one should optimize for size only when var is not escaping. */ if (TARGET_64BIT && optimize_function_for_speed_p (cfun) @@ -18469,6 +18514,8 @@ ix86_fold_builtin (tree fndecl, int n_args, = (enum ix86_builtins) DECL_MD_FUNCTION_CODE (fndecl); enum rtx_code rcode; bool is_vshift; + enum tree_code tcode; + bool is_scalar; unsigned HOST_WIDE_INT mask; switch (fn_code) @@ -18918,6 +18965,131 @@ ix86_fold_builtin (tree fndecl, int n_args, } break; + case IX86_BUILTIN_MINSS: + case IX86_BUILTIN_MINSH_MASK: + tcode = LT_EXPR; + is_scalar = true; + goto do_minmax; + + case IX86_BUILTIN_MAXSS: + case IX86_BUILTIN_MAXSH_MASK: + tcode = GT_EXPR; + is_scalar = true; + goto do_minmax; + + case IX86_BUILTIN_MINPS: + case IX86_BUILTIN_MINPD: + case IX86_BUILTIN_MINPS256: + case IX86_BUILTIN_MINPD256: + case IX86_BUILTIN_MINPS512: + case IX86_BUILTIN_MINPD512: + case IX86_BUILTIN_MINPS128_MASK: + case IX86_BUILTIN_MINPD128_MASK: + case IX86_BUILTIN_MINPS256_MASK: + case IX86_BUILTIN_MINPD256_MASK: + case IX86_BUILTIN_MINPH128_MASK: + case IX86_BUILTIN_MINPH256_MASK: + case IX86_BUILTIN_MINPH512_MASK: + tcode = LT_EXPR; + is_scalar = false; + goto do_minmax; + + case IX86_BUILTIN_MAXPS: + case IX86_BUILTIN_MAXPD: + case IX86_BUILTIN_MAXPS256: + case IX86_BUILTIN_MAXPD256: + case IX86_BUILTIN_MAXPS512: + case IX86_BUILTIN_MAXPD512: + case IX86_BUILTIN_MAXPS128_MASK: + case IX86_BUILTIN_MAXPD128_MASK: + case IX86_BUILTIN_MAXPS256_MASK: + case IX86_BUILTIN_MAXPD256_MASK: + case IX86_BUILTIN_MAXPH128_MASK: + case IX86_BUILTIN_MAXPH256_MASK: + case IX86_BUILTIN_MAXPH512_MASK: + tcode = GT_EXPR; + is_scalar = false; + do_minmax: + gcc_assert (n_args >= 2); + if (TREE_CODE (args[0]) != VECTOR_CST + || TREE_CODE (args[1]) != VECTOR_CST) + break; + mask = HOST_WIDE_INT_M1U; + if (n_args > 2) + { + gcc_assert (n_args >= 4); + /* This is masked minmax. 
*/ + if (TREE_CODE (args[3]) != INTEGER_CST + || TREE_SIDE_EFFECTS (args[2])) + break; + mask = TREE_INT_CST_LOW (args[3]); + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])); + mask |= HOST_WIDE_INT_M1U << elems; + if (mask != HOST_WIDE_INT_M1U + && TREE_CODE (args[2]) != VECTOR_CST) + break; + if (n_args >= 5) + { + if (!tree_fits_uhwi_p (args[4])) + break; + if (tree_to_uhwi (args[4]) != 4 + && tree_to_uhwi (args[4]) != 8) + break; + } + if (mask == (HOST_WIDE_INT_M1U << elems)) + return args[2]; + } + /* Punt on NaNs, unless exceptions are disabled. */ + if (HONOR_NANS (args[0]) + && (n_args < 5 || tree_to_uhwi (args[4]) != 8)) + for (int i = 0; i < 2; ++i) + { + unsigned count = vector_cst_encoded_nelts (args[i]); + for (unsigned j = 0; j < count; ++j) + if (tree_expr_nan_p (VECTOR_CST_ENCODED_ELT (args[i], j))) + return NULL_TREE; + } + { + tree res = const_binop (tcode, + truth_type_for (TREE_TYPE (args[0])), + args[0], args[1]); + if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST) + break; + res = fold_ternary (VEC_COND_EXPR, TREE_TYPE (args[0]), res, + args[0], args[1]); + if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST) + break; + if (mask != HOST_WIDE_INT_M1U) + { + unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])); + vec_perm_builder sel (nelts, nelts, 1); + for (unsigned int i = 0; i < nelts; i++) + if (mask & (HOST_WIDE_INT_1U << i)) + sel.quick_push (i); + else + sel.quick_push (nelts + i); + vec_perm_indices indices (sel, 2, nelts); + res = fold_vec_perm (TREE_TYPE (args[0]), res, args[2], + indices); + if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST) + break; + } + if (is_scalar) + { + unsigned nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])); + vec_perm_builder sel (nelts, nelts, 1); + sel.quick_push (0); + for (unsigned int i = 1; i < nelts; i++) + sel.quick_push (nelts + i); + vec_perm_indices indices (sel, 2, nelts); + res = fold_vec_perm (TREE_TYPE (args[0]), res, args[0], + indices); + if (res == NULL_TREE || TREE_CODE (res) != VECTOR_CST) + break; + } + return res; + } + default: break; } @@ -19463,6 +19635,74 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) } return true; + case IX86_BUILTIN_MINPS: + case IX86_BUILTIN_MINPD: + case IX86_BUILTIN_MINPS256: + case IX86_BUILTIN_MINPD256: + case IX86_BUILTIN_MINPS512: + case IX86_BUILTIN_MINPD512: + case IX86_BUILTIN_MINPS128_MASK: + case IX86_BUILTIN_MINPD128_MASK: + case IX86_BUILTIN_MINPS256_MASK: + case IX86_BUILTIN_MINPD256_MASK: + case IX86_BUILTIN_MINPH128_MASK: + case IX86_BUILTIN_MINPH256_MASK: + case IX86_BUILTIN_MINPH512_MASK: + tcode = LT_EXPR; + goto do_minmax; + + case IX86_BUILTIN_MAXPS: + case IX86_BUILTIN_MAXPD: + case IX86_BUILTIN_MAXPS256: + case IX86_BUILTIN_MAXPD256: + case IX86_BUILTIN_MAXPS512: + case IX86_BUILTIN_MAXPD512: + case IX86_BUILTIN_MAXPS128_MASK: + case IX86_BUILTIN_MAXPD128_MASK: + case IX86_BUILTIN_MAXPS256_MASK: + case IX86_BUILTIN_MAXPD256_MASK: + case IX86_BUILTIN_MAXPH128_MASK: + case IX86_BUILTIN_MAXPH256_MASK: + case IX86_BUILTIN_MAXPH512_MASK: + tcode = GT_EXPR; + do_minmax: + gcc_assert (n_args >= 2); + /* Without SSE4.1 we often aren't able to pattern match it back to the + desired instruction. */ + if (!gimple_call_lhs (stmt) || !optimize || !TARGET_SSE4_1) + break; + arg0 = gimple_call_arg (stmt, 0); + arg1 = gimple_call_arg (stmt, 1); + elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); + /* For masked minmax, only optimize if the mask is all ones. 
*/ + if (n_args > 2 + && !ix86_masked_all_ones (elems, gimple_call_arg (stmt, 3))) + break; + if (n_args >= 5) + { + tree arg4 = gimple_call_arg (stmt, 4); + if (!tree_fits_uhwi_p (arg4)) + break; + if (tree_to_uhwi (arg4) == 4) + /* Ok. */; + else if (tree_to_uhwi (arg4) != 8) + /* Invalid round argument. */ + break; + else if (HONOR_NANS (arg0)) + /* Lowering to comparison would raise exceptions which + shouldn't be raised. */ + break; + } + { + tree type = truth_type_for (TREE_TYPE (arg0)); + tree cmpres = gimple_build (&stmts, tcode, type, arg0, arg1); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + g = gimple_build_assign (gimple_call_lhs (stmt), + VEC_COND_EXPR, cmpres, arg0, arg1); + gsi_replace (gsi, g, false); + } + return true; + default: break; } @@ -22041,7 +22281,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, *total = COSTS_N_BYTES (1); else if (TARGET_SLOW_STC) *total = COSTS_N_INSNS (2); - else + else *total = COSTS_N_INSNS (1); return true; } @@ -24195,6 +24435,13 @@ ix86_stack_protect_guard (void) return default_stack_protect_guard (); } +static bool +ix86_stack_protect_runtime_enabled_p (void) +{ + /* Naked functions should not enable stack protector. */ + return !ix86_function_naked (current_function_decl); +} + /* For 32-bit code we can save PIC register setup by using __stack_chk_fail_local hidden function instead of calling __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC @@ -24382,7 +24629,7 @@ ix86_canonical_va_list_type (tree type) return ms_va_list_type_node; if ((TREE_CODE (type) == ARRAY_TYPE - && integer_zerop (array_type_nelts (type))) + && integer_zerop (array_type_nelts_minus_one (type))) || POINTER_TYPE_P (type)) { tree elem_type = TREE_TYPE (type); @@ -24467,13 +24714,17 @@ ix86_reassociation_width (unsigned int op, machine_mode mode) if (width == 1) return 1; - /* Integer vector instructions execute in FP unit + /* Znver1-4 Integer vector instructions execute in FP unit and can execute 3 additions and one multiplication per cycle. */ if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2 - || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4 - || ix86_tune == PROCESSOR_ZNVER5) + || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4) && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS) return 1; + /* Znver5 can do 2 integer multiplications per cycle with latency + of 3. */ + if (ix86_tune == PROCESSOR_ZNVER5 + && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS) + width = 6; /* Account for targets that splits wide vectors into multiple parts. */ if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256) @@ -24554,6 +24805,14 @@ ix86_preferred_simd_mode (scalar_mode mode) } return word_mode; + case E_BFmode: + if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) + return V32BFmode; + else if (TARGET_AVX && !TARGET_PREFER_AVX128) + return V16BFmode; + else + return V8BFmode; + case E_SFmode: if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256) return V16SFmode; @@ -24632,7 +24891,8 @@ ix86_get_mask_mode (machine_mode data_mode) /* AVX512FP16 only supports vector comparison to kmask for _Float16. */ || (TARGET_AVX512VL && TARGET_AVX512FP16 - && GET_MODE_INNER (data_mode) == E_HFmode)) + && GET_MODE_INNER (data_mode) == E_HFmode) + || (TARGET_AVX10_2_256 && GET_MODE_INNER (data_mode) == E_BFmode)) { if (elem_size == 4 || elem_size == 8 @@ -24765,12 +25025,15 @@ private: where we know it's not loaded from memory. 
*/ unsigned m_num_gpr_needed[3]; unsigned m_num_sse_needed[3]; + /* Number of 256-bit vector permutation. */ + unsigned m_num_avx256_vec_perm[3]; }; ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar) : vector_costs (vinfo, costing_for_scalar), m_num_gpr_needed (), - m_num_sse_needed () + m_num_sse_needed (), + m_num_avx256_vec_perm () { } @@ -24938,13 +25201,21 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, (AGU and load ports). Try to account for this by scaling the construction cost by the number of elements involved. */ if ((kind == vec_construct || kind == vec_to_scalar) - && stmt_info - && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type - || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type) - && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE - && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) - != INTEGER_CST)) - || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)) + && ((stmt_info + && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type + || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type) + && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE + && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) + != INTEGER_CST)) + || (STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) + == VMAT_GATHER_SCATTER))) + || (node + && ((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE + && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF + (SLP_TREE_REPRESENTATIVE (node)))) + != INTEGER_CST)) + || (SLP_TREE_MEMORY_ACCESS_TYPE (node) + == VMAT_GATHER_SCATTER))))) { stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); @@ -25004,6 +25275,10 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, if (stmt_cost == -1) stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); + if (kind == vec_perm && vectype + && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32) + m_num_avx256_vec_perm[where]++; + /* Penalize DFmode vector operations for Bonnell. */ if (TARGET_CPU_P (BONNELL) && kind == vector_stmt && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode) @@ -25073,6 +25348,11 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) ix86_vect_estimate_reg_pressure (); + for (int i = 0; i != 3; i++) + if (m_num_avx256_vec_perm[i] + && TARGET_AVX256_AVOID_VEC_PERM) + m_costs[i] = INT_MAX; + vector_costs::finish_cost (scalar_costs); } @@ -25544,7 +25824,7 @@ extract_base_offset_in_addr (rtx mem, rtx *base, rtx *symbase, rtx *offset) gcc_assert (MEM_P (mem)); addr = XEXP (mem, 0); - + if (GET_CODE (addr) == CONST) addr = XEXP (addr, 0); @@ -26568,6 +26848,10 @@ ix86_libgcc_floating_mode_supported_p #undef TARGET_STACK_PROTECT_GUARD #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard +#undef TARGET_STACK_PROTECT_RUNTIME_ENABLED_P +#define TARGET_STACK_PROTECT_RUNTIME_ENABLED_P \ + ix86_stack_protect_runtime_enabled_p + #if !TARGET_MACHO #undef TARGET_STACK_PROTECT_FAIL #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index eabb324..5193440 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -238,7 +238,7 @@ extern const struct processor_costs *ix86_cost; extern const struct processor_costs ix86_size_cost; #define ix86_cur_cost() \ - (optimize_insn_for_size_p () ? &ix86_size_cost: ix86_cost) + (optimize_insn_for_size_p () ? 
&ix86_size_cost : ix86_cost) /* Macros used in the machine description to test the flags. */ @@ -430,6 +430,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS] #define TARGET_FUSE_ALU_AND_BRANCH \ ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH] +#define TARGET_FUSE_MOV_AND_ALU \ + ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU] #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU] #define TARGET_AVOID_LEA_FOR_ADDR \ ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR] @@ -437,6 +439,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL] #define TARGET_AVX256_SPLIT_REGS \ ix86_tune_features[X86_TUNE_AVX256_SPLIT_REGS] +#define TARGET_AVX256_AVOID_VEC_PERM \ + ix86_tune_features[X86_TUNE_AVX256_AVOID_VEC_PERM] #define TARGET_AVX512_SPLIT_REGS \ ix86_tune_features[X86_TUNE_AVX512_SPLIT_REGS] #define TARGET_GENERAL_REGS_SSE_SPILL \ @@ -460,6 +464,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC] #define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC] #define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR] +#define TARGET_SSE_MOVCC_USE_BLENDV \ + ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { @@ -539,7 +545,7 @@ extern GTY(()) tree x86_mfence; #define TARGET_SUBTARGET64_ISA_DEFAULT \ (OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2) -/* Replace MACH-O, ifdefs by in-line tests, where possible. +/* Replace MACH-O, ifdefs by in-line tests, where possible. (a) Macros defined in config/i386/darwin.h */ #define TARGET_MACHO 0 #define TARGET_MACHO_SYMBOL_STUBS 0 @@ -897,7 +903,10 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); and give entire struct the alignment of an int. */ /* Required on the 386 since it doesn't have bit-field insns. */ #define PCC_BITFIELD_TYPE_MATTERS 1 - + +#define VECTOR_STORE_FLAG_VALUE(MODE) \ + (GET_MODE_CLASS (MODE) == MODE_VECTOR_INT ? constm1_rtx : NULL_RTX) + /* Standard register usage. */ /* This processor has special stack-like registers. See reg-stack.cc @@ -1565,11 +1574,11 @@ enum reg_class /* If defined, the maximum amount of space required for outgoing arguments will be computed and placed into the variable `crtl->outgoing_args_size'. No space will be pushed onto the stack for each call; instead, the - function prologue should increase the stack frame size by this amount. + function prologue should increase the stack frame size by this amount. In 32bit mode enabling argument accumulation results in about 5% code size growth because move instructions are less compact than push. In 64bit - mode the difference is less drastic but visible. + mode the difference is less drastic but visible. FIXME: Unlike earlier implementations, the size of unwind info seems to actually grow with accumulation. Is that because accumulated args @@ -2089,9 +2098,9 @@ do { \ #define DEBUGGER_REGNO(N) \ (TARGET_64BIT ? 
debugger64_register_map[(N)] : debugger_register_map[(N)]) -extern int const debugger_register_map[FIRST_PSEUDO_REGISTER]; -extern int const debugger64_register_map[FIRST_PSEUDO_REGISTER]; -extern int const svr4_debugger_register_map[FIRST_PSEUDO_REGISTER]; +extern unsigned int const debugger_register_map[FIRST_PSEUDO_REGISTER]; +extern unsigned int const debugger64_register_map[FIRST_PSEUDO_REGISTER]; +extern unsigned int const svr4_debugger_register_map[FIRST_PSEUDO_REGISTER]; /* Before the prologue, RA is at 0(%esp). */ #define INCOMING_RETURN_ADDR_RTX \ @@ -2424,6 +2433,18 @@ constexpr wide_int_bitmask PTA_CLEARWATERFOREST = PTA_SIERRAFOREST | PTA_AVXVNNIINT16 | PTA_SHA512 | PTA_SM3 | PTA_SM4 | PTA_USER_MSR | PTA_PREFETCHI; constexpr wide_int_bitmask PTA_PANTHERLAKE = PTA_ARROWLAKE_S | PTA_PREFETCHI; + +constexpr wide_int_bitmask PTA_BDVER1 = PTA_64BIT | PTA_MMX | PTA_SSE + | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 + | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4 + | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE; +constexpr wide_int_bitmask PTA_BDVER2 = PTA_BDVER1 | PTA_BMI | PTA_TBM + | PTA_F16C | PTA_FMA; +constexpr wide_int_bitmask PTA_BDVER3 = PTA_BDVER2 | PTA_XSAVEOPT + | PTA_FSGSBASE; +constexpr wide_int_bitmask PTA_BDVER4 = PTA_BDVER3 | PTA_AVX2 | PTA_BMI2 + | PTA_RDRND | PTA_MOVBE | PTA_MWAITX; + constexpr wide_int_bitmask PTA_ZNVER1 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2 | PTA_BMI | PTA_BMI2 @@ -2441,13 +2462,21 @@ constexpr wide_int_bitmask PTA_ZNVER4 = PTA_ZNVER3 | PTA_AVX512F | PTA_AVX512DQ | PTA_AVX512VNNI | PTA_AVX512BITALG | PTA_AVX512VPOPCNTDQ | PTA_EVEX512; constexpr wide_int_bitmask PTA_ZNVER5 = PTA_ZNVER4 | PTA_AVXVNNI | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_AVX512VP2INTERSECT | PTA_PREFETCHI; -constexpr wide_int_bitmask PTA_LUJIAZUI = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 - | PTA_SSE3 | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES - | PTA_PCLMUL | PTA_BMI | PTA_BMI2 | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT - | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE | PTA_ADX | PTA_RDSEED | PTA_POPCNT; -constexpr wide_int_bitmask PTA_YONGFENG = PTA_LUJIAZUI | PTA_AVX | PTA_AVX2 | PTA_F16C - | PTA_FMA | PTA_SHA | PTA_LZCNT; +constexpr wide_int_bitmask PTA_BTVER1 = PTA_64BIT | PTA_MMX | PTA_SSE + | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 + | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE; +constexpr wide_int_bitmask PTA_BTVER2 = PTA_BTVER1 | PTA_SSE4_1 | PTA_SSE4_2 + | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_BMI | PTA_F16C | PTA_MOVBE + | PTA_XSAVEOPT; + +constexpr wide_int_bitmask PTA_LUJIAZUI = PTA_64BIT | PTA_MMX | PTA_SSE + | PTA_SSE2 | PTA_SSE3 | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 + | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_BMI | PTA_BMI2 | PTA_PRFCHW + | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND | PTA_MOVBE + | PTA_ADX | PTA_RDSEED | PTA_POPCNT; +constexpr wide_int_bitmask PTA_YONGFENG = PTA_LUJIAZUI | PTA_AVX | PTA_AVX2 + | PTA_F16C | PTA_FMA | PTA_SHA | PTA_LZCNT; #ifndef GENERATOR_FILE @@ -2510,8 +2539,7 @@ enum ix86_fpcmp_strategy { enum ix86_stack_slot { - SLOT_TEMP = 0, - SLOT_CW_STORED, + SLOT_CW_STORED = 0, SLOT_CW_ROUNDEVEN, SLOT_CW_TRUNC, SLOT_CW_FLOOR, diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index b56a51b..effab29 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -117,6 +117,8 
@@ UNSPEC_STC UNSPEC_PUSHFL UNSPEC_POPFL + UNSPEC_OPTCOMX + UNSPEC_SETCC_SI_SLP ;; For SSE/MMX support: UNSPEC_FIX_NOTRUNC @@ -362,6 +364,9 @@ ;; For AMX-TILE UNSPECV_LDTILECFG UNSPECV_STTILECFG + + ;; For MOVRS support + UNSPECV_MOVRS ]) ;; Constants to represent rounding modes in the ROUND instruction @@ -538,10 +543,10 @@ str,bitmanip, fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp, fxch,fistp,fisttp,frndint, - sse,ssemov,sseadd,sseadd1,sseiadd,sseiadd1, + sse,ssemov,ssemov2,sseadd,sseadd1,sseiadd,sseiadd1, ssemul,sseimul,ssediv,sselog,sselog1, sseishft,sseishft1,ssecmp,ssecomi, - ssecvt,ssecvt1,sseicvt,sseins, + ssecvt,ssecvt1,sseicvt,sseicvt2,sseins, sseshuf,sseshuf1,ssemuladd,sse4arg, lwp,mskmov,msklog, mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft" @@ -559,10 +564,10 @@ (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp, fxch,fistp,fisttp,frndint") (const_string "i387") - (eq_attr "type" "sse,ssemov,sseadd,sseadd1,sseiadd,sseiadd1, + (eq_attr "type" "sse,ssemov,ssemov2,sseadd,sseadd1,sseiadd,sseiadd1, ssemul,sseimul,ssediv,sselog,sselog1, sseishft,sseishft1,ssecmp,ssecomi, - ssecvt,ssecvt1,sseicvt,sseins, + ssecvt,ssecvt1,sseicvt,sseicvt2,sseins, sseshuf,sseshuf1,ssemuladd,sse4arg,mskmov") (const_string "sse") (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft") @@ -857,6 +862,9 @@ mmx,mmxmov,mmxcmp,mmxcvt,mskmov,msklog") (match_operand 2 "memory_operand")) (const_string "load") + (and (eq_attr "type" "ssemov2,sseicvt2") + (match_operand 2 "memory_operand")) + (const_string "load") (and (eq_attr "type" "icmov,ssemuladd,sse4arg") (match_operand 3 "memory_operand")) (const_string "load") @@ -1736,7 +1744,7 @@ (compare:CC (match_operand:XF 1 "nonmemory_operand") (match_operand:XF 2 "nonmemory_operand"))) (set (pc) (if_then_else - (match_operator 0 "ix86_fp_comparison_operator" + (match_operator 0 "ix86_fp_comparison_operator_xf" [(reg:CC FLAGS_REG) (const_int 0)]) (label_ref (match_operand 3)) @@ -1753,7 +1761,7 @@ (compare:CC (match_operand:XF 2 "nonmemory_operand") (match_operand:XF 3 "nonmemory_operand"))) (set (match_operand:QI 0 "register_operand") - (match_operator 1 "ix86_fp_comparison_operator" + (match_operator 1 "ix86_fp_comparison_operator_xf" [(reg:CC FLAGS_REG) (const_int 0)]))] "TARGET_80387" @@ -2017,6 +2025,32 @@ (set_attr "bdver1_decode" "double") (set_attr "znver1_decode" "double")]) +(define_insn "*cmpx<unord><MODEF:mode>" + [(set (reg:CCFP FLAGS_REG) + (unspec:CCFP [ + (compare:CCFP + (match_operand:MODEF 0 "register_operand" "v") + (match_operand:MODEF 1 "nonimmediate_operand" "vm"))] + UNSPEC_OPTCOMX))] + "TARGET_AVX10_2_256" + "%v<unord>comx<MODEF:ssemodesuffix>\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecomi") + (set_attr "prefix" "evex") + (set_attr "mode" "<MODEF:MODE>")]) + +(define_insn "*cmpx<unord>hf" + [(set (reg:CCFP FLAGS_REG) + (unspec:CCFP [ + (compare:CCFP + (match_operand:HF 0 "register_operand" "v") + (match_operand:HF 1 "nonimmediate_operand" "vm"))] + UNSPEC_OPTCOMX))] + "TARGET_AVX10_2_256" + "v<unord>comxsh\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecomi") + (set_attr "prefix" "evex") + (set_attr "mode" "HF")]) + (define_insn "*cmpi<unord><MODEF:mode>" [(set (reg:CCFP FLAGS_REG) (compare:CCFP @@ -3658,7 +3692,7 @@ [(set (match_operand:TI 0 "nonimmediate_operand" "=ro,r,r,&r") (any_or_plus:TI (and:TI - (match_operand:TI 1 "nonimmediate_operand" "r,m,r,m") + (match_operand:TI 1 "nonimmediate_operand" "r,o,r,o") (match_operand:TI 3 "const_scalar_int_operand" "n,n,n,n")) (zero_extend:TI (match_operand:DI 2 "nonimmediate_operand" 
"r,r,m,m"))))] @@ -3680,7 +3714,7 @@ [(set (match_operand:DI 0 "nonimmediate_operand" "=ro,r,r,&r") (any_or_plus:DI (and:DI - (match_operand:DI 1 "nonimmediate_operand" "r,m,r,m") + (match_operand:DI 1 "nonimmediate_operand" "r,o,r,o") (match_operand:DI 3 "const_int_operand" "n,n,n,n")) (zero_extend:DI (match_operand:SI 2 "nonimmediate_operand" "r,r,m,m"))))] @@ -5642,16 +5676,18 @@ (set_attr "mode" "HF")]) (define_insn "truncsfbf2" - [(set (match_operand:BF 0 "register_operand" "=x, v") + [(set (match_operand:BF 0 "register_operand" "=x,x,v,Yv") (float_truncate:BF - (match_operand:SF 1 "register_operand" "x,v")))] - "((TARGET_AVX512BF16 && TARGET_AVX512VL) || TARGET_AVXNECONVERT) - && !HONOR_NANS (BFmode) && flag_unsafe_math_optimizations" + (match_operand:SF 1 "register_operand" "0,x,v,Yv")))] + "TARGET_SSE2 && flag_unsafe_math_optimizations && !HONOR_NANS (BFmode)" "@ + psrld\t{$16, %0|%0, 16} %{vex%} vcvtneps2bf16\t{%1, %0|%0, %1} - vcvtneps2bf16\t{%1, %0|%0, %1}" - [(set_attr "isa" "avxneconvert,avx512bf16vl") - (set_attr "prefix" "vex,evex")]) + vcvtneps2bf16\t{%1, %0|%0, %1} + vpsrld\t{$16, %1, %0|%0, %1, 16}" + [(set_attr "isa" "noavx,avxneconvert,avx512bf16vl,avx") + (set_attr "prefix" "orig,vex,evex,vex") + (set_attr "type" "sseishft1,ssecvt,ssecvt,sseishft1")]) ;; Signed conversion to DImode. @@ -6249,7 +6285,7 @@ { emit_insn (gen_floatunssi<mode>2_i387_with_xmm (operands[0], operands[1], - assign_386_stack_local (DImode, SLOT_TEMP))); + assign_stack_temp (DImode, GET_MODE_SIZE (DImode)))); DONE; } if (!TARGET_AVX512F) @@ -6434,7 +6470,7 @@ (plus:<DWI> (zero_extend:<DWI> (match_operand:DWIH 2 "nonimmediate_operand" "rm,r,rm,r")) - (match_operand:<DWI> 1 "nonimmediate_operand" "0,0,r,m"))) + (match_operand:<DWI> 1 "nonimmediate_operand" "0,0,r,o"))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (UNKNOWN, <DWI>mode, operands, TARGET_APX_NDD)" "#" @@ -7676,7 +7712,7 @@ (eq:CCO (plus:<QPWI> (sign_extend:<QPWI> - (match_operand:<DWI> 1 "nonimmediate_operand" "%0,rjM")) + (match_operand:<DWI> 1 "nonimmediate_operand" "%0,rjO")) (match_operand:<QPWI> 3 "const_scalar_int_operand" "n,n")) (sign_extend:<QPWI> (plus:<DWI> @@ -19251,6 +19287,27 @@ [(set_attr "type" "setcc") (set_attr "mode" "QI")]) +(define_expand "setcc_si_slp" + [(set (match_operand:SI 0 "register_operand") + (unspec:SI + [(match_operand:QI 1) + (match_operand:SI 2 "register_operand")] UNSPEC_SETCC_SI_SLP))]) + +(define_insn_and_split "*setcc_si_slp" + [(set (match_operand:SI 0 "register_operand" "=q") + (unspec:SI + [(match_operator:QI 1 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)]) + (match_operand:SI 2 "register_operand" "0")] UNSPEC_SETCC_SI_SLP))] + "ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) (match_dup 2)) + (set (strict_low_part (match_dup 3)) (match_dup 1))] +{ + operands[3] = gen_lowpart (QImode, operands[0]); +}) + ;; In general it is not safe to assume too much about CCmode registers, ;; so simplify-rtx stops when it sees a second one. 
Under certain ;; conditions this is safe on x86, so help combine not create @@ -19746,6 +19803,32 @@ operands[8] = gen_lowpart (QImode, operands[4]); ix86_expand_clear (operands[4]); }) + +(define_peephole2 + [(set (match_operand 4 "flags_reg_operand") (match_operand 0)) + (set (strict_low_part (match_operand:QI 5 "register_operand")) + (match_operator:QI 6 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)])) + (set (match_operand:QI 1 "register_operand") + (match_operator:QI 2 "ix86_comparison_operator" + [(reg FLAGS_REG) (const_int 0)])) + (set (match_operand 3 "any_QIreg_operand") + (zero_extend (match_dup 1)))] + "(peep2_reg_dead_p (4, operands[1]) + || operands_match_p (operands[1], operands[3])) + && ! reg_overlap_mentioned_p (operands[3], operands[0]) + && ! reg_overlap_mentioned_p (operands[3], operands[5]) + && ! reg_overlap_mentioned_p (operands[1], operands[5]) + && peep2_regno_dead_p (0, FLAGS_REG)" + [(set (match_dup 4) (match_dup 0)) + (set (strict_low_part (match_dup 5)) + (match_dup 6)) + (set (strict_low_part (match_dup 7)) + (match_dup 2))] +{ + operands[7] = gen_lowpart (QImode, operands[3]); + ix86_expand_clear (operands[3]); +}) ;; Call instructions. @@ -28448,19 +28531,21 @@ [(prefetch (match_operand 0 "address_operand") (match_operand:SI 1 "const_int_operand") (match_operand:SI 2 "const_int_operand"))] - "TARGET_3DNOW || TARGET_PREFETCH_SSE || TARGET_PRFCHW" + "TARGET_3DNOW || TARGET_PREFETCH_SSE || TARGET_PRFCHW + || TARGET_MOVRS" { - bool write = operands[1] != const0_rtx; + int write = INTVAL (operands[1]); int locality = INTVAL (operands[2]); gcc_assert (IN_RANGE (locality, 0, 3)); + gcc_assert (IN_RANGE (write, 0, 2)); /* Use 3dNOW prefetch in case we are asking for write prefetch not supported by SSE counterpart (non-SSE2 athlon machines) or the SSE prefetch is not available (K6 machines). Otherwise use SSE prefetch as it allows specifying of locality. 
*/ - if (write) + if (write == 1) { if (TARGET_PRFCHW) operands[2] = GEN_INT (3); @@ -28468,11 +28553,24 @@ operands[2] = GEN_INT (3); else if (TARGET_PREFETCH_SSE) operands[1] = const0_rtx; - else + else if (write == 0) { gcc_assert (TARGET_3DNOW); operands[2] = GEN_INT (3); } + else + { + if (TARGET_MOVRS) + ; + else if (TARGET_PREFETCH_SSE) + operands[1] = const0_rtx; + else + { + gcc_assert (TARGET_3DNOW); + operands[1] = const0_rtx; + operands[2] = GEN_INT (3); + } + } } else { @@ -28543,6 +28641,18 @@ (symbol_ref "memory_address_length (operands[0], false)")) (set_attr "memory" "none")]) +(define_insn "*prefetch_rst2" + [(prefetch (match_operand 0 "address_operand" "p") + (const_int 2) + (const_int 1))] + "TARGET_MOVRS" + "prefetchrst2\t%a0" + [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "prefetch") + (set (attr "length_address") + (symbol_ref "memory_address_length (operands[0], false)")) + (set_attr "memory" "none")]) + (define_insn "sse4_2_crc32<mode>" [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI @@ -29464,24 +29574,40 @@ (set_attr "length" "4")]) ;; Spaceship optimization -(define_expand "spaceship<mode>3" +(define_expand "spaceship<mode>4" [(match_operand:SI 0 "register_operand") (match_operand:MODEF 1 "cmp_fp_expander_operand") - (match_operand:MODEF 2 "cmp_fp_expander_operand")] + (match_operand:MODEF 2 "cmp_fp_expander_operand") + (match_operand:SI 3 "const_int_operand")] "(TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)) && (TARGET_CMOVE || (TARGET_SAHF && TARGET_USE_SAHF))" { - ix86_expand_fp_spaceship (operands[0], operands[1], operands[2]); + ix86_expand_fp_spaceship (operands[0], operands[1], operands[2], + operands[3]); DONE; }) -(define_expand "spaceshipxf3" +(define_expand "spaceshipxf4" [(match_operand:SI 0 "register_operand") (match_operand:XF 1 "nonmemory_operand") - (match_operand:XF 2 "nonmemory_operand")] + (match_operand:XF 2 "nonmemory_operand") + (match_operand:SI 3 "const_int_operand")] "TARGET_80387 && (TARGET_CMOVE || (TARGET_SAHF && TARGET_USE_SAHF))" { - ix86_expand_fp_spaceship (operands[0], operands[1], operands[2]); + ix86_expand_fp_spaceship (operands[0], operands[1], operands[2], + operands[3]); + DONE; +}) + +(define_expand "spaceship<mode>4" + [(match_operand:SI 0 "register_operand") + (match_operand:SWI 1 "nonimmediate_operand") + (match_operand:SWI 2 "<general_operand>") + (match_operand:SI 3 "const_int_operand")] + "" +{ + ix86_expand_int_spaceship (operands[0], operands[1], operands[2], + operands[3]); DONE; }) @@ -29575,6 +29701,17 @@ (set_attr "prefix" "maybe_evex") (set_attr "memory" "store")]) +(define_insn "movrs<mode>" + [(set (match_operand:SWI1248x 0 "register_operand" "=r") + (unspec_volatile:SWI1248x + [(match_operand:SWI1248x 1 "memory_operand" "m")] + UNSPECV_MOVRS))] + "TARGET_MOVRS && TARGET_64BIT" + "movrs<imodesuffix>\t{%1, %0|%0, %1}" + [(set_attr "prefix" "orig") + (set_attr "type" "other") + (set_attr "mode" "<MODE>")]) + (include "mmx.md") (include "sse.md") (include "sync.md") diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index fe16e44..99e86f5 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1313,7 +1313,7 @@ Enable vectorization for scatter instruction. mapxf Target Mask(ISA2_APX_F) Var(ix86_isa_flags2) Save Support code generation for APX features, including EGPR, PUSH2POP2, -NDD and PPX. +NDD, PPX, NF, CCMP and ZU. 
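(Illustrative sketch, not part of the patch: the i386.md hunk above adds a prefetchrst2 pattern, reached through __builtin_ia32_prefetch with a write hint of 2 and locality 1, plus a scalar movrs<mode> load, and the new movrsintrin.h further down wraps these as _m_prefetchrs, _movrs_i32 and the _mm*_loadrs_* family. A minimal usage sketch follows before the remaining i386.opt entries; the file layout, function names and build assumptions (x86-64, a compiler carrying this patch, -mmovrs/-mavx10.2 style targets) are illustrative, not taken from the diff.)

/* Sketch of exercising the new MOVRS support; names such as
   sum_read_shared and load_live_bytes are made up for illustration.  */
#include <stdio.h>
#include <immintrin.h>

__attribute__ ((target ("movrs")))
int
sum_read_shared (const int *p, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    {
      /* Read-shared prefetch hint; with MOVRS enabled the prefetch
         expander above should emit PREFETCHRST2 (write hint 2,
         locality 1, matching _m_prefetchrs in movrsintrin.h).  */
      _m_prefetchrs ((void *) (p + i + 16));
      /* Scalar read-shared load; should match the movrs<mode>
         pattern (MOVRSL for SImode, 64-bit only per its condition).  */
      s += _movrs_i32 (p + i);
    }
  return s;
}

__attribute__ ((target ("movrs,avx10.2")))
__m256i
load_live_bytes (const void *p, __mmask32 live)
{
  /* Masked 256-bit read-shared load, zeroing inactive lanes.  */
  return _mm256_maskz_loadrs_epi8 (live, p);
}

int
main (void)
{
  int buf[64];
  for (int i = 0; i < 64; i++)
    buf[i] = i;
  printf ("%d\n", sum_read_shared (buf, 64));
  return 0;
}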
mapx-features= Target Undocumented Joined Enum(apx_features) EnumSet Var(ix86_apx_features) Init(apx_none) Save @@ -1389,3 +1389,28 @@ mavx10.2 Target Alias(mavx10.2-256) Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX10.1 and AVX10.2 built-in functions and code generation. + +mamx-avx512 +Target Mask(ISA2_AMX_AVX512) Var(ix86_isa_flags2) Save +Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX10.1-512, +AVX10.2-512 and AMX-AVX512 built-in functions and code generation. + +mamx-tf32 +Target Mask(ISA2_AMX_TF32) Var(ix86_isa_flags2) Save +Support AMX-TF32 built-in functions and code generation. + +mamx-transpose +Target Mask(ISA2_AMX_TRANSPOSE) Var(ix86_isa_flags2) Save +Support AMX-TRANSPOSE built-in functions and code generation. + +mamx-fp8 +Target Mask(ISA2_AMX_FP8) Var(ix86_isa_flags2) Save +Support AMX-FP8 built-in functions and code generation. + +mmovrs +Target Mask(ISA2_MOVRS) Var(ix86_isa_flags2) Save +Support MOVRS built-in functions and code generation. + +mamx-movrs +Target Mask(ISA2_AMX_MOVRS) Var(ix86_isa_flags2) Save +Support AMX-MOVRS built-in functions and code generation. diff --git a/gcc/config/i386/i386.opt.urls b/gcc/config/i386/i386.opt.urls index fc70616..dbd59ec 100644 --- a/gcc/config/i386/i386.opt.urls +++ b/gcc/config/i386/i386.opt.urls @@ -613,3 +613,21 @@ UrlSuffix(gcc/x86-Options.html#index-mavx10_002e2-512) mavx10.2 UrlSuffix(gcc/x86-Options.html#index-mavx10_002e2) +mamx-avx512 +UrlSuffix(gcc/x86-Options.html#index-mamx-avx512) + +mamx-tf32 +UrlSuffix(gcc/x86-Options.html#index-mamx-tf32) + +mamx-transpose +UrlSuffix(gcc/x86-Options.html#index-mamx-transpose) + +mamx-fp8 +UrlSuffix(gcc/x86-Options.html#index-mamx-fp8) + +mmovrs +UrlSuffix(gcc/x86-Options.html#index-mmovrs) + +mamx-movrs +UrlSuffix(gcc/x86-Options.html#index-mamx-movrs) + diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h index 6b8035e..7e957b8 100644 --- a/gcc/config/i386/immintrin.h +++ b/gcc/config/i386/immintrin.h @@ -132,6 +132,14 @@ #include <amxcomplexintrin.h> +#include <amxavx512intrin.h> + +#include <amxtf32intrin.h> + +#include <amxtransposeintrin.h> + +#include <amxfp8intrin.h> + #include <prfchwintrin.h> #include <keylockerintrin.h> @@ -162,4 +170,7 @@ #include <avx10_2copyintrin.h> +#include <movrsintrin.h> + +#include <amxmovrsintrin.h> #endif /* _IMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index cb26975..506f4ca 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -70,6 +70,9 @@ ;; 8-byte and 4-byte HImode vector modes (define_mode_iterator VI2_32_64 [(V4HI "TARGET_MMX_WITH_SSE") V2HI]) +;; 8-byte, 4-byte and 2-byte QImode vector modes +(define_mode_iterator VI1_16_32_64 [(V8QI "TARGET_MMX_WITH_SSE") V4QI V2QI]) + ;; 4-byte and 2-byte integer vector modes (define_mode_iterator VI_16_32 [V4QI V2QI V2HI]) @@ -121,7 +124,7 @@ ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr mmxintvecmode [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI") - (V4HF "V4HI") (V2HF "V2HI")]) + (V4HF "V4HI") (V2HF "V2HI") (V4BF "V4HI") (V2BF "V2HI")]) (define_mode_attr mmxintvecmodelower [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi") @@ -1958,6 +1961,8 @@ (define_mode_iterator VHF_32_64 [V2HF (V4HF "TARGET_MMX_WITH_SSE")]) +(define_mode_iterator VBF_32_64 [V2BF (V4BF "TARGET_MMX_WITH_SSE")]) + (define_expand "divv4hf3" [(set (match_operand:V4HF 0 "register_operand") (div:V4HF @@ -2036,6 +2041,26 @@ DONE; }) +;; VDIVNEPBF16 does not generate 
floating point exceptions. +(define_expand "<insn><mode>3" + [(set (match_operand:VBF_32_64 0 "register_operand") + (plusminusmultdiv:VBF_32_64 + (match_operand:VBF_32_64 1 "nonimmediate_operand") + (match_operand:VBF_32_64 2 "nonimmediate_operand")))] + "TARGET_AVX10_2_256" +{ + rtx op0 = gen_reg_rtx (V8BFmode); + rtx op1 = lowpart_subreg (V8BFmode, + force_reg (<MODE>mode, operands[1]), <MODE>mode); + rtx op2 = lowpart_subreg (V8BFmode, + force_reg (<MODE>mode, operands[2]), <MODE>mode); + + emit_insn (gen_<insn>v8bf3 (op0, op1, op2)); + + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode)); + DONE; +}) + (define_expand "divv2hf3" [(set (match_operand:V2HF 0 "register_operand") (div:V2HF @@ -2076,6 +2101,25 @@ DONE; }) +(define_expand "<code><mode>3" + [(set (match_operand:VBF_32_64 0 "register_operand") + (smaxmin:VBF_32_64 + (match_operand:VBF_32_64 1 "nonimmediate_operand") + (match_operand:VBF_32_64 2 "nonimmediate_operand")))] + "TARGET_AVX10_2_256" +{ + rtx op0 = gen_reg_rtx (V8BFmode); + rtx op1 = lowpart_subreg (V8BFmode, + force_reg (<MODE>mode, operands[1]), <MODE>mode); + rtx op2 = lowpart_subreg (V8BFmode, + force_reg (<MODE>mode, operands[2]), <MODE>mode); + + emit_insn (gen_<code>v8bf3 (op0, op1, op2)); + + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode)); + DONE; +}) + (define_expand "sqrt<mode>2" [(set (match_operand:VHF_32_64 0 "register_operand") (sqrt:VHF_32_64 @@ -2091,18 +2135,37 @@ DONE; }) +(define_expand "sqrt<mode>2" + [(set (match_operand:VBF_32_64 0 "register_operand") + (sqrt:VBF_32_64 (match_operand:VBF_32_64 1 "vector_operand")))] + "TARGET_AVX10_2_256" +{ + rtx op0 = gen_reg_rtx (V8BFmode); + rtx op1 = lowpart_subreg (V8BFmode, + force_reg (<MODE>mode, operands[1]), <MODE>mode); + + emit_insn (gen_sqrtv8bf2 (op0, op1)); + + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode)); + DONE; +}) + +(define_mode_iterator VHBF_32_64 + [V2BF (V4BF "TARGET_MMX_WITH_SSE") + V2HF (V4HF "TARGET_MMX_WITH_SSE")]) + (define_expand "<code><mode>2" - [(set (match_operand:VHF_32_64 0 "register_operand") - (absneg:VHF_32_64 - (match_operand:VHF_32_64 1 "register_operand")))] + [(set (match_operand:VHBF_32_64 0 "register_operand") + (absneg:VHBF_32_64 + (match_operand:VHBF_32_64 1 "register_operand")))] "TARGET_SSE" "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;") (define_insn_and_split "*mmx_<code><mode>" - [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x") - (absneg:VHF_32_64 - (match_operand:VHF_32_64 1 "register_operand" "0,x,x"))) - (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))] + [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x") + (absneg:VHBF_32_64 + (match_operand:VHBF_32_64 1 "register_operand" "0,x,x"))) + (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))] "TARGET_SSE" "#" "&& reload_completed" @@ -2115,11 +2178,11 @@ [(set_attr "isa" "noavx,noavx,avx")]) (define_insn_and_split "*mmx_nabs<mode>2" - [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x") - (neg:VHF_32_64 - (abs:VHF_32_64 - (match_operand:VHF_32_64 1 "register_operand" "0,x,x")))) - (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))] + [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x") + (neg:VHBF_32_64 + (abs:VHBF_32_64 + (match_operand:VHBF_32_64 1 "register_operand" "0,x,x")))) + (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))] "TARGET_SSE" "#" "&& reload_completed" @@ -2230,6 +2293,23 @@ DONE; }) +;;This instruction does 
not generate floating point exceptions +(define_expand "vec_cmp<mode>qi" + [(set (match_operand:QI 0 "register_operand") + (match_operator:QI 1 "" + [(match_operand:VBF_32_64 2 "register_operand") + (match_operand:VBF_32_64 3 "nonimmediate_operand")]))] + "TARGET_AVX10_2_256" +{ + rtx op2 = lowpart_subreg (V8BFmode, + force_reg (<MODE>mode, operands[2]), <MODE>mode); + rtx op3 = lowpart_subreg (V8BFmode, + force_reg (<MODE>mode, operands[3]), <MODE>mode); + + emit_insn (gen_vec_cmpv8bfqi (operands[0], operands[1], op2, op3)); + DONE; +}) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel half-precision floating point rounding operations. @@ -2410,11 +2490,11 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define_insn "*mmx_andnot<mode>3" - [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x") - (and:VHF_32_64 - (not:VHF_32_64 - (match_operand:VHF_32_64 1 "register_operand" "0,x")) - (match_operand:VHF_32_64 2 "register_operand" "x,x")))] + [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x") + (and:VHBF_32_64 + (not:VHBF_32_64 + (match_operand:VHBF_32_64 1 "register_operand" "0,x")) + (match_operand:VHBF_32_64 2 "register_operand" "x,x")))] "TARGET_SSE" "@ andnps\t{%2, %0|%0, %2} @@ -2425,10 +2505,10 @@ (set_attr "mode" "V4SF")]) (define_insn "<code><mode>3" - [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x") - (any_logic:VHF_32_64 - (match_operand:VHF_32_64 1 "register_operand" "%0,x") - (match_operand:VHF_32_64 2 "register_operand" " x,x")))] + [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x") + (any_logic:VHBF_32_64 + (match_operand:VHBF_32_64 1 "register_operand" "%0,x") + (match_operand:VHBF_32_64 2 "register_operand" " x,x")))] "TARGET_SSE" "@ <logic>ps\t{%2, %0|%0, %2} @@ -2440,14 +2520,14 @@ (define_expand "copysign<mode>3" [(set (match_dup 4) - (and:VHF_32_64 - (not:VHF_32_64 (match_dup 3)) - (match_operand:VHF_32_64 1 "register_operand"))) + (and:VHBF_32_64 + (not:VHBF_32_64 (match_dup 3)) + (match_operand:VHBF_32_64 1 "register_operand"))) (set (match_dup 5) - (and:VHF_32_64 (match_dup 3) - (match_operand:VHF_32_64 2 "register_operand"))) - (set (match_operand:VHF_32_64 0 "register_operand") - (ior:VHF_32_64 (match_dup 4) (match_dup 5)))] + (and:VHBF_32_64 (match_dup 3) + (match_operand:VHBF_32_64 2 "register_operand"))) + (set (match_operand:VHBF_32_64 0 "register_operand") + (ior:VHBF_32_64 (match_dup 4) (match_dup 5)))] "TARGET_SSE" { operands[3] = ix86_build_signbit_mask (<MODE>mode, true, false); @@ -2458,11 +2538,11 @@ (define_expand "xorsign<mode>3" [(set (match_dup 4) - (and:VHF_32_64 (match_dup 3) - (match_operand:VHF_32_64 2 "register_operand"))) - (set (match_operand:VHF_32_64 0 "register_operand") - (xor:VHF_32_64 (match_dup 4) - (match_operand:VHF_32_64 1 "register_operand")))] + (and:VHBF_32_64 (match_dup 3) + (match_operand:VHBF_32_64 2 "register_operand"))) + (set (match_operand:VHBF_32_64 0 "register_operand") + (xor:VHBF_32_64 (match_dup 4) + (match_operand:VHBF_32_64 1 "register_operand")))] "TARGET_SSE" { operands[3] = ix86_build_signbit_mask (<MODE>mode, true, false); @@ -2474,7 +2554,7 @@ [(set (match_operand:<mmxintvecmode> 0 "register_operand") (lshiftrt:<mmxintvecmode> (subreg:<mmxintvecmode> - (match_operand:VHF_32_64 1 "register_operand") 0) + (match_operand:VHBF_32_64 1 "register_operand") 0) (match_dup 2)))] "TARGET_SSE2" { @@ -2632,6 +2712,86 @@ DONE; }) +(define_expand "fma<mode>4" + [(set (match_operand:VBF_32_64 0 "register_operand") + (fma:VBF_32_64 
+ (match_operand:VBF_32_64 1 "nonimmediate_operand") + (match_operand:VBF_32_64 2 "nonimmediate_operand") + (match_operand:VBF_32_64 3 "nonimmediate_operand")))] + "TARGET_AVX10_2_256" +{ + rtx op0 = gen_reg_rtx (V8BFmode); + rtx op1 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[1]), <MODE>mode); + rtx op2 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[2]), <MODE>mode); + rtx op3 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[3]), <MODE>mode); + + emit_insn (gen_fmav8bf4 (op0, op1, op2, op3)); + + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode)); + DONE; +}) + +(define_expand "fms<mode>4" + [(set (match_operand:VBF_32_64 0 "register_operand") + (fma:VBF_32_64 + (match_operand:VBF_32_64 1 "nonimmediate_operand") + (match_operand:VBF_32_64 2 "nonimmediate_operand") + (neg:VBF_32_64 + (match_operand:VBF_32_64 3 "nonimmediate_operand"))))] + "TARGET_AVX10_2_256" +{ + rtx op0 = gen_reg_rtx (V8BFmode); + rtx op1 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[1]), <MODE>mode); + rtx op2 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[2]), <MODE>mode); + rtx op3 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[3]), <MODE>mode); + + emit_insn (gen_fmsv8bf4 (op0, op1, op2, op3)); + + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode)); + DONE; +}) + +(define_expand "fnma<mode>4" + [(set (match_operand:VBF_32_64 0 "register_operand") + (fma:VBF_32_64 + (neg:VBF_32_64 + (match_operand:VBF_32_64 1 "nonimmediate_operand")) + (match_operand:VBF_32_64 2 "nonimmediate_operand") + (match_operand:VBF_32_64 3 "nonimmediate_operand")))] + "TARGET_AVX10_2_256" +{ + rtx op0 = gen_reg_rtx (V8BFmode); + rtx op1 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[1]), <MODE>mode); + rtx op2 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[2]), <MODE>mode); + rtx op3 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[3]), <MODE>mode); + + emit_insn (gen_fnmav8bf4 (op0, op1, op2, op3)); + + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode)); + DONE; +}) + +(define_expand "fnms<mode>4" + [(set (match_operand:VBF_32_64 0 "register_operand") + (fma:VBF_32_64 + (neg:VBF_32_64 + (match_operand:VBF_32_64 1 "nonimmediate_operand")) + (match_operand:VBF_32_64 2 "nonimmediate_operand") + (neg:VBF_32_64 + (match_operand:VBF_32_64 3 "nonimmediate_operand"))))] + "TARGET_AVX10_2_256" +{ + rtx op0 = gen_reg_rtx (V8BFmode); + rtx op1 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[1]), <MODE>mode); + rtx op2 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[2]), <MODE>mode); + rtx op3 = lowpart_subreg (V8BFmode, force_reg (<MODE>mode, operands[3]), <MODE>mode); + + emit_insn (gen_fnmsv8bf4 (op0, op1, op2, op3)); + + emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode)); + DONE; +}) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel half-precision floating point complex type operations @@ -3061,7 +3221,7 @@ (set_attr "type" "mmxadd,sseadd,sseadd") (set_attr "mode" "DI,TI,TI")]) -(define_insn "*<insn><mode>3" +(define_insn "<insn><mode>3" [(set (match_operand:VI_16_32 0 "register_operand" "=x,Yw") (sat_plusminus:VI_16_32 (match_operand:VI_16_32 1 "register_operand" "<comm>0,Yw") @@ -4307,6 +4467,13 @@ operands[0] = lowpart_subreg (V16QImode, operands[0], <MODE>mode); }) +(define_expand "andn<mode>3" + [(set (match_operand:MMXMODEI 0 "register_operand") + (and:MMXMODEI + (not:MMXMODEI 
(match_operand:MMXMODEI 2 "register_operand")) + (match_operand:MMXMODEI 1 "register_operand")))] + "TARGET_MMX_WITH_SSE") + (define_insn "mmx_andnot<mode>3" [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") (and:MMXMODEI @@ -6344,7 +6511,7 @@ DONE; }) -(define_expand "usdot_prodv8qi" +(define_expand "usdot_prodv2siv8qi" [(match_operand:V2SI 0 "register_operand") (match_operand:V8QI 1 "register_operand") (match_operand:V8QI 2 "register_operand") @@ -6363,7 +6530,7 @@ rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode); rtx op0 = gen_reg_rtx (V4SImode); - emit_insn (gen_usdot_prodv16qi (op0, op1, op2, op3)); + emit_insn (gen_usdot_prodv4siv16qi (op0, op1, op2, op3)); emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode)); } else @@ -6377,7 +6544,7 @@ emit_move_insn (op3, CONST0_RTX (V4SImode)); emit_insn (gen_zero_extendv8qiv8hi2 (op1, operands[1])); emit_insn (gen_extendv8qiv8hi2 (op2, operands[2])); - emit_insn (gen_sdot_prodv8hi (op0, op1, op2, op3)); + emit_insn (gen_sdot_prodv4siv8hi (op0, op1, op2, op3)); /* vec_perm (op0, 2, 3, 0, 1); */ emit_insn (gen_sse2_pshufd (op0_1, op0, GEN_INT (78))); @@ -6388,7 +6555,7 @@ DONE; }) -(define_expand "sdot_prodv8qi" +(define_expand "sdot_prodv2siv8qi" [(match_operand:V2SI 0 "register_operand") (match_operand:V8QI 1 "register_operand") (match_operand:V8QI 2 "register_operand") @@ -6406,7 +6573,7 @@ rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode); rtx op0 = gen_reg_rtx (V4SImode); - emit_insn (gen_sdot_prodv16qi (op0, op1, op2, op3)); + emit_insn (gen_sdot_prodv4siv16qi (op0, op1, op2, op3)); emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode)); } else @@ -6420,7 +6587,7 @@ emit_move_insn (op3, CONST0_RTX (V4SImode)); emit_insn (gen_extendv8qiv8hi2 (op1, operands[1])); emit_insn (gen_extendv8qiv8hi2 (op2, operands[2])); - emit_insn (gen_sdot_prodv8hi (op0, op1, op2, op3)); + emit_insn (gen_sdot_prodv4siv8hi (op0, op1, op2, op3)); /* vec_perm (op0, 2, 3, 0, 1); */ emit_insn (gen_sse2_pshufd (op0_1, op0, GEN_INT (78))); @@ -6432,7 +6599,7 @@ }) -(define_expand "udot_prodv8qi" +(define_expand "udot_prodv2siv8qi" [(match_operand:V2SI 0 "register_operand") (match_operand:V8QI 1 "register_operand") (match_operand:V8QI 2 "register_operand") @@ -6450,7 +6617,7 @@ rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode); rtx op0 = gen_reg_rtx (V4SImode); - emit_insn (gen_udot_prodv16qi (op0, op1, op2, op3)); + emit_insn (gen_udot_prodv4siv16qi (op0, op1, op2, op3)); emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode)); } else @@ -6464,7 +6631,7 @@ emit_move_insn (op3, CONST0_RTX (V4SImode)); emit_insn (gen_zero_extendv8qiv8hi2 (op1, operands[1])); emit_insn (gen_zero_extendv8qiv8hi2 (op2, operands[2])); - emit_insn (gen_sdot_prodv8hi (op0, op1, op2, op3)); + emit_insn (gen_sdot_prodv4siv8hi (op0, op1, op2, op3)); /* vec_perm (op0, 2, 3, 0, 1); */ emit_insn (gen_sse2_pshufd (op0_1, op0, GEN_INT (78))); @@ -6476,7 +6643,7 @@ }) -(define_expand "usdot_prodv4hi" +(define_expand "usdot_prodv2siv4hi" [(match_operand:V2SI 0 "register_operand") (match_operand:V4HI 1 "register_operand") (match_operand:V4HI 2 "register_operand") @@ -6492,12 +6659,12 @@ rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode); rtx op0 = gen_reg_rtx (V4SImode); - emit_insn (gen_usdot_prodv8hi (op0, op1, op2, op3)); + emit_insn (gen_usdot_prodv4siv8hi (op0, op1, op2, op3)); emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode)); DONE; }) -(define_expand "udot_prodv4hi" +(define_expand 
"udot_prodv2siv4hi" [(match_operand:V2SI 0 "register_operand") (match_operand:V4HI 1 "register_operand") (match_operand:V4HI 2 "register_operand") @@ -6513,12 +6680,12 @@ rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode); rtx op0 = gen_reg_rtx (V4SImode); - emit_insn (gen_udot_prodv8hi (op0, op1, op2, op3)); + emit_insn (gen_udot_prodv4siv8hi (op0, op1, op2, op3)); emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode)); DONE; }) -(define_expand "sdot_prodv4hi" +(define_expand "sdot_prodv2siv4hi" [(match_operand:V2SI 0 "register_operand") (match_operand:V4HI 1 "register_operand") (match_operand:V4HI 2 "register_operand") @@ -6534,7 +6701,7 @@ rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode); rtx op0 = gen_reg_rtx (V4SImode); - emit_insn (gen_sdot_prodv8hi (op0, op1, op2, op3)); + emit_insn (gen_sdot_prodv4siv8hi (op0, op1, op2, op3)); emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode)); DONE; }) @@ -6646,3 +6813,24 @@ [(set_attr "type" "mmx") (set_attr "modrm" "0") (set_attr "memory" "none")]) + +(define_insn "popcount<mode>2" + [(set (match_operand:VI1_16_32_64 0 "register_operand" "=v") + (popcount:VI1_16_32_64 + (match_operand:VI1_16_32_64 1 "register_operand" "v")))] + "TARGET_AVX512VL && TARGET_AVX512BITALG" + "vpopcntb\t{%1, %0|%0, %1}") + +(define_insn "popcount<mode>2" + [(set (match_operand:VI2_32_64 0 "register_operand" "=v") + (popcount:VI2_32_64 + (match_operand:VI2_32_64 1 "register_operand" "v")))] + "TARGET_AVX512VL && TARGET_AVX512BITALG" + "vpopcntw\t{%1, %0|%0, %1}") + +(define_insn "popcountv2si2" + [(set (match_operand:V2SI 0 "register_operand" "=v") + (popcount:V2SI + (match_operand:V2SI 1 "register_operand" "v")))] + "TARGET_AVX512VPOPCNTDQ && TARGET_AVX512VL && TARGET_MMX_WITH_SSE" + "vpopcntd\t{%1, %0|%0, %1}") diff --git a/gcc/config/i386/movrsintrin.h b/gcc/config/i386/movrsintrin.h new file mode 100644 index 0000000..b89ce1c --- /dev/null +++ b/gcc/config/i386/movrsintrin.h @@ -0,0 +1,453 @@ +/* Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined _IMMINTRIN_H_INCLUDED +# error "Never use <movrsintrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef _MOVRSINTRIN_H_INCLUDED +#define _MOVRSINTRIN_H_INCLUDED + +#ifndef __MOVRS__ +#pragma GCC push_options +#pragma GCC target("movrs") +#define __DISABLE_MOVRS__ +#endif /* __MOVRS__ */ + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_prefetchrs (void* __P) +{ + __builtin_ia32_prefetch (__P, 2, 1, 0 /* _MM_HINT_RST2 */); +} + +#ifdef __x86_64__ + +extern __inline char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_movrs_i8 (void const * __P) +{ + return (char) __builtin_ia32_movrsqi ((const char *) __P); +} + +extern __inline short +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_movrs_i16 (void const * __P) +{ + return (short) __builtin_ia32_movrshi ((const short *) __P); +} + +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_movrs_i32 (void const * __P) +{ + return (int) __builtin_ia32_movrssi ((const int *) __P); +} + +extern __inline long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_movrs_i64 (void const * __P) +{ + return (long long) __builtin_ia32_movrsdi ((const long long *) __P); +} + +#endif /* __x86_64__ */ + +#ifdef __DISABLE_MOVRS__ +#undef __DISABLE_MOVRS__ +#pragma GCC pop_options +#endif /* __DISABLE_MOVRS__ */ + +#ifdef __x86_64__ + +#if !defined (__AVX10_2_256__) || !defined (__MOVRS__) +#pragma GCC push_options +#pragma GCC target("avx10.2,movrs") +#define __DISABLE_MOVRS_AVX10_2__ +#endif /* __MOVRS_AVX10_2__ */ + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadrs_epi8 (void const *__A) +{ + return (__m256i) __builtin_ia32_vmovrsb256_mask ((const __v32qi *) __A, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadrs_epi8 (__m256i __D, __mmask32 __U, void const *__A) +{ + return (__m256i) __builtin_ia32_vmovrsb256_mask ((const __v32qi *) __A, + (__v32qi) __D, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadrs_epi8 (__mmask32 __U, void const *__A) +{ + return (__m256i) __builtin_ia32_vmovrsb256_mask ((const __v32qi *) __A, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadrs_epi32 (void const *__A) +{ + return (__m256i) __builtin_ia32_vmovrsd256_mask ((const __v8si *) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadrs_epi32 (__m256i __D, __mmask8 __U, void const *__A) +{ + return (__m256i) __builtin_ia32_vmovrsd256_mask ((const __v8si *) __A, + (__v8si) __D, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadrs_epi32 (__mmask8 __U, void const *__A) +{ + return (__m256i) __builtin_ia32_vmovrsd256_mask ((const __v8si *) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadrs_epi64 (void const *__A) +{ + return (__m256i) __builtin_ia32_vmovrsq256_mask ((const __v4di *) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_mask_loadrs_epi64 (__m256i __D, __mmask8 __U, void const *__A) +{ + return (__m256i) __builtin_ia32_vmovrsq256_mask ((const __v4di *) __A, + (__v4di) __D, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadrs_epi64 (__mmask8 __U, void const *__A) +{ + return (__m256i) __builtin_ia32_vmovrsq256_mask ((const __v4di *) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadrs_epi16 (void const *__A) +{ + return (__m256i) __builtin_ia32_vmovrsw256_mask ((const __v16hi *) __A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadrs_epi16 (__m256i __D, __mmask16 __U, void const *__A) +{ + return (__m256i) __builtin_ia32_vmovrsw256_mask ((const __v16hi *) __A, + (__v16hi) __D, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadrs_epi16 (__mmask16 __U, void const *__A) +{ + return (__m256i) __builtin_ia32_vmovrsw256_mask ((const __v16hi *) __A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadrs_epi8 (void const *__A) +{ + return (__m128i) __builtin_ia32_vmovrsb128_mask ((const __v16qi *) __A, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadrs_epi8 (__m128i __D, __mmask16 __U, void const *__A) +{ + return (__m128i) __builtin_ia32_vmovrsb128_mask ((const __v16qi *) __A, + (__v16qi) __D, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadrs_epi8 (__mmask16 __U, void const *__A) +{ + return (__m128i) __builtin_ia32_vmovrsb128_mask ((const __v16qi *) __A, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadrs_epi32 (void const *__A) +{ + return (__m128i) __builtin_ia32_vmovrsd128_mask ((const __v4si *) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadrs_epi32 (__m128i __D, __mmask8 __U, void const *__A) +{ + return (__m128i) __builtin_ia32_vmovrsd128_mask ((const __v4si *) __A, + (__v4si) __D, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadrs_epi32 (__mmask8 __U, void const *__A) +{ + return (__m128i) __builtin_ia32_vmovrsd128_mask ((const __v4si *) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadrs_epi64 (void const *__A) +{ + return (__m128i) __builtin_ia32_vmovrsq128_mask ((const __v2di *) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadrs_epi64 (__m128i __D, __mmask8 __U, void const *__A) +{ + return (__m128i) __builtin_ia32_vmovrsq128_mask ((const __v2di *) __A, + (__v2di) __D, + 
(__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadrs_epi64 (__mmask8 __U, void const *__A) +{ + return (__m128i) __builtin_ia32_vmovrsq128_mask ((const __v2di *) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadrs_epi16 (void const *__A) +{ + return (__m128i) __builtin_ia32_vmovrsw128_mask ((const __v8hi *) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadrs_epi16 (__m128i __D, __mmask8 __U, void const *__A) +{ + return (__m128i) __builtin_ia32_vmovrsw128_mask ((const __v8hi *) __A, + (__v8hi) __D, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadrs_epi16 (__mmask8 __U, void const *__A) +{ + return (__m128i) __builtin_ia32_vmovrsw128_mask ((const __v8hi *) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +#ifdef __DISABLE_MOVRS_AVX10_2__ +#undef __DISABLE_MOVRS_AVX10_2__ +#pragma GCC pop_options +#endif /* __DISABLE_MOVRS_AVX10_2__ */ + +#if !defined (__AVX10_2_512__) || !defined (__MOVRS__) +#pragma GCC push_options +#pragma GCC target("avx10.2-512,movrs") +#define __DISABLE_MOVRS_AVX10_2_512__ +#endif /* __MOVRS_AVX10_2_512__ */ + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadrs_epi8 (void const *__A) +{ + return (__m512i) __builtin_ia32_vmovrsb512_mask ((const __v64qi *) __A, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadrs_epi8 (__m512i __D, __mmask64 __U, void const *__A) +{ + return (__m512i) __builtin_ia32_vmovrsb512_mask ((const __v64qi *) __A, + (__v64qi) __D, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadrs_epi8 (__mmask64 __U, void const *__A) +{ + return (__m512i) __builtin_ia32_vmovrsb512_mask ((const __v64qi *) __A, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadrs_epi32 (void const *__A) +{ + return (__m512i) __builtin_ia32_vmovrsd512_mask ((const __v16si *) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadrs_epi32 (__m512i __D, __mmask16 __U, void const *__A) +{ + return (__m512i) __builtin_ia32_vmovrsd512_mask ((const __v16si *) __A, + (__v16si) __D, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadrs_epi32 (__mmask16 __U, void const *__A) +{ + return (__m512i) __builtin_ia32_vmovrsd512_mask ((const __v16si *) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadrs_epi64 (void const *__A) +{ + return (__m512i) __builtin_ia32_vmovrsq512_mask ((const __v8di *) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_loadrs_epi64 (__m512i __D, __mmask8 __U, void const *__A) +{ + return (__m512i) __builtin_ia32_vmovrsq512_mask ((const __v8di *) __A, + (__v8di) __D, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadrs_epi64 (__mmask8 __U, void const *__A) +{ + return (__m512i) __builtin_ia32_vmovrsq512_mask ((const __v8di *) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadrs_epi16 (void const *__A) +{ + return (__m512i) __builtin_ia32_vmovrsw512_mask ((const __v32hi *) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadrs_epi16 (__m512i __D, __mmask32 __U, void const *__A) +{ + return (__m512i) __builtin_ia32_vmovrsw512_mask ((const __v32hi *) __A, + (__v32hi) __D, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadrs_epi16 (__mmask32 __U, void const *__A) +{ + return (__m512i) __builtin_ia32_vmovrsw512_mask ((const __v32hi *) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +#ifdef __DISABLE_MOVRS_AVX10_2_512__ +#undef __DISABLE_MOVRS_AVX10_2_512__ +#pragma GCC pop_options +#endif /* __DISABLE_MOVRS_AVX10_2_512__ */ + +#endif /* __x86_64__ */ + +#endif /* _MOVRSINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/openbsdelf.h b/gcc/config/i386/openbsdelf.h index 3b7c27b..01de0fc 100644 --- a/gcc/config/i386/openbsdelf.h +++ b/gcc/config/i386/openbsdelf.h @@ -1,5 +1,5 @@ /* Configuration for an OpenBSD i386 target. - + Copyright (C) 2005-2024 Free Software Foundation, Inc. This file is part of GCC. @@ -67,7 +67,7 @@ along with GCC; see the file COPYING3. If not see The icky part is not here, but in <machine/profile.h>. */ #undef FUNCTION_PROFILER #define FUNCTION_PROFILER(FILE, LABELNO) \ - fputs (flag_pic ? "\tcall __mcount@PLT\n": "\tcall __mcount\n", FILE); + fputs (flag_pic ? "\tcall __mcount@PLT\n" : "\tcall __mcount\n", FILE); #undef LINK_SPEC #define LINK_SPEC \ diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index ab6a2e1..053312b 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1633,7 +1633,13 @@ }) ;; Return true if this comparison only requires testing one flag bit. +;; VCOMX/VUCOMX set ZF, SF, OF, differently from COMI/UCOMI. (define_predicate "ix86_trivial_fp_comparison_operator" + (if_then_else (match_test "TARGET_AVX10_2_256") + (match_code "gt,ge,unlt,unle,eq,uneq,ne,ltgt,ordered,unordered") + (match_code "gt,ge,unlt,unle,uneq,ltgt,ordered,unordered"))) + +(define_predicate "ix86_trivial_fp_comparison_operator_xf" (match_code "gt,ge,unlt,unle,uneq,ltgt,ordered,unordered")) ;; Return true if we know how to do this comparison. Others require @@ -1645,6 +1651,12 @@ (match_operand 0 "comparison_operator") (match_operand 0 "ix86_trivial_fp_comparison_operator"))) +(define_predicate "ix86_fp_comparison_operator_xf" + (if_then_else (match_test "ix86_fp_comparison_strategy (GET_CODE (op)) + == IX86_FPCMP_ARITH") + (match_operand 0 "comparison_operator") + (match_operand 0 "ix86_trivial_fp_comparison_operator_xf"))) + ;; Return true if we can perform this comparison on TImode operands. 
(define_predicate "ix86_timode_comparison_operator" (if_then_else (match_test "TARGET_64BIT") diff --git a/gcc/config/i386/sm4intrin.h b/gcc/config/i386/sm4intrin.h index 4c212cc..e2d78f0 100644 --- a/gcc/config/i386/sm4intrin.h +++ b/gcc/config/i386/sm4intrin.h @@ -67,4 +67,29 @@ _mm256_sm4rnds4_epi32 (__m256i __A, __m256i __B) #pragma GCC pop_options #endif /* __DISABLE_SM4__ */ +#if !defined (__SM4__) || !defined (__AVX10_2_512__) +#pragma GCC push_options +#pragma GCC target("sm4,avx10.2-512") +#define __DISABLE_SM4_512__ +#endif /* __SM4_512__ */ + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sm4key4_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vsm4key4512 ((__v16si) __A, (__v16si) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sm4rnds4_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vsm4rnds4512 ((__v16si) __A, (__v16si) __B); +} + +#ifdef __DISABLE_SM4_512__ +#undef __DISABLE_SM4_512__ +#pragma GCC pop_options +#endif /* __DISABLE_SM4_512__ */ + #endif /* _SM4INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/smmintrin.h b/gcc/config/i386/smmintrin.h index 4c315fe..c2d8277 100644 --- a/gcc/config/i386/smmintrin.h +++ b/gcc/config/i386/smmintrin.h @@ -385,7 +385,7 @@ _mm_extract_ps (__m128 __X, const int __N) by index N. */ #define _MM_EXTRACT_FLOAT(D, S, N) \ { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); } - + /* Extract specified single precision float element into the lower part of __m128. */ #define _MM_PICK_OUT_PS(X, N) \ diff --git a/gcc/config/i386/sol2.h b/gcc/config/i386/sol2.h index b93bc4c..f51fb2e 100644 --- a/gcc/config/i386/sol2.h +++ b/gcc/config/i386/sol2.h @@ -80,7 +80,7 @@ along with GCC; see the file COPYING3. If not see #define ASM_CPU_SPEC "%(asm_cpu_default) " ASM_XBRACE_COMMENT_SPEC /* Don't include ASM_PIC_SPEC. While the Solaris 10+ assembler accepts -K PIC, - it gives many warnings: + it gives many warnings: Absolute relocation is used for symbol "<symbol>" GNU as doesn't recognize -K at all. 
*/ #undef ASM_SPEC diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index da91d39..15ed8ff 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -251,6 +251,9 @@ UNSPEC_UFIX_SATURATION UNSPEC_MINMAXNEPBF16 UNSPEC_MINMAX + + ;; For MOVRS suppport + UNSPEC_VMOVRS ]) (define_c_enum "unspecv" [ @@ -391,6 +394,19 @@ (V8DF "TARGET_AVX512F && TARGET_EVEX512") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) +(define_mode_iterator VF_BHSD + [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512") + (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") + (V16SF "TARGET_AVX512F && TARGET_EVEX512") + (V8SF "TARGET_AVX") V4SF + (V8DF "TARGET_AVX512F && TARGET_EVEX512") + (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2") + (V32BF "TARGET_AVX10_2_512") + (V16BF "TARGET_AVX10_2_256") + (V8BF "TARGET_AVX10_2_256") + ]) + ;; 128-, 256- and 512-bit float vector modes for bitwise operations (define_mode_iterator VFB [(V32BF "TARGET_AVX512F && TARGET_EVEX512") @@ -434,9 +450,12 @@ (define_mode_iterator VF2_AVX10_2 [(V8DF "TARGET_AVX10_2_512") V4DF V2DF]) -;; All DFmode & HFmode vector float modes -(define_mode_iterator VF2H - [(V32HF "TARGET_AVX512FP16 && TARGET_EVEX512") +;; All DFmode & HFmode & BFmode vector float modes +(define_mode_iterator VF2HB + [(V32BF "TARGET_AVX10_2_512") + (V16BF "TARGET_AVX10_2_256") + (V8BF "TARGET_AVX10_2_256") + (V32HF "TARGET_AVX512FP16 && TARGET_EVEX512") (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V8DF "TARGET_AVX512F && TARGET_EVEX512") (V4DF "TARGET_AVX") V2DF]) @@ -492,6 +511,12 @@ (V16SI "TARGET_EVEX512") (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL") (V8DI "TARGET_EVEX512") (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")]) +(define_mode_iterator VI1248_AVX10_2 + [(V64QI "TARGET_AVX10_2_512") V32QI V16QI + (V32HI "TARGET_AVX10_2_512") V16HI V8HI + (V16SI "TARGET_AVX10_2_512") V8SI V4SI + (V8DI "TARGET_AVX10_2_512") V4DI V2DI]) + (define_mode_iterator VF_AVX512VL [(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL") (V8DF "TARGET_EVEX512") (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")]) @@ -610,6 +635,10 @@ (define_mode_iterator VI1_AVX512VNNI [(V64QI "TARGET_AVX512VNNI && TARGET_EVEX512") (V32QI "TARGET_AVX2") V16QI]) +(define_mode_iterator VI1_AVX512VNNIBW + [(V64QI "(TARGET_AVX512BW || TARGET_AVX512VNNI) && TARGET_EVEX512") + (V32QI "TARGET_AVX2") V16QI]) + (define_mode_iterator VI12_256_512_AVX512VL [(V64QI "TARGET_EVEX512") (V32QI "TARGET_AVX512VL") (V32HI "TARGET_EVEX512") (V16HI "TARGET_AVX512VL")]) @@ -627,6 +656,9 @@ [(V32HI "(TARGET_AVX512BW || TARGET_AVX512VNNI) && TARGET_EVEX512") (V16HI "TARGET_AVX2") V8HI]) +(define_mode_iterator VI2_AVX10_2 + [(V32HI "TARGET_AVX10_2_512") V16HI V8HI]) + (define_mode_iterator VI4_AVX [(V8SI "TARGET_AVX") V4SI]) @@ -1280,6 +1312,12 @@ (V8HF "w") (V8BF "w") (V4SF "k") (V2DF "q") (HF "w") (BF "w") (SF "k") (DF "q")]) +;; Pointer size override for 16-bit upper-convert modes (Intel asm dialect) +(define_mode_attr iptrh + [(V32HI "") (V16SI "") (V8DI "") + (V16HI "") (V8SI "") (V4DI "q") + (V8HI "") (V4SI "q") (V2DI "k")]) + ;; Mapping of vector modes to VPTERNLOG suffix (define_mode_attr ternlogsuffix [(V8DI "q") (V4DI "q") (V2DI "q") @@ -2400,6 +2438,91 @@ DONE; }) +;; Optimize cmp + movcc with mask register by kortest + movcc. 
+(define_insn_and_split "*kortest_cmp<SWI1248_AVX512BWDQ_64:mode>_movqicc" + [(set (match_operand:QI 0 "register_operand" "=r,r,r,r,r,r") + (if_then_else:QI + (match_operator 1 "bt_comparison_operator" + [(match_operand:SWI1248_AVX512BWDQ_64 4 "register_operand" + "?k,<SWI1248_AVX512BWDQ_64:r>,?k, <SWI1248_AVX512BWDQ_64:r>,?k,r") + (const_int -1)]) + (match_operand:QI 2 "register_operand" "r,r,0,0,r,r") + (match_operand:QI 3 "register_operand" " 0,0,r,r,r,r"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_AVX512BW && TARGET_CMOVE && !TARGET_PARTIAL_REG_STALL" + "#" + "&& reload_completed" + [(set (match_dup 0) + (if_then_else:SI + (match_dup 5) + (match_dup 2) + (match_dup 3)))] +{ + rtx flag_reg; + if (MASK_REGNO_P (REGNO (operands[4]))) + { + emit_insn (gen_kortest<SWI1248_AVX512BWDQ_64:mode>_ccc (operands[4], operands[4])); + flag_reg = gen_rtx_REG (CCCmode, FLAGS_REG); + } + else + { + flag_reg = gen_rtx_REG (CCZmode, FLAGS_REG); + emit_insn (gen_rtx_SET (flag_reg, + gen_rtx_COMPARE (CCZmode, + operands[4], + constm1_rtx))); + } + operands[5] = gen_rtx_fmt_ee (GET_CODE (operands[1]), VOIDmode, + flag_reg,const0_rtx); + operands[0] = gen_lowpart (SImode, operands[0]); + operands[2] = gen_lowpart (SImode, operands[2]); + operands[3] = gen_lowpart (SImode, operands[3]); +} + [(set_attr "isa" "*,*,*,*,apx_ndd,apx_ndd") + (set_attr "type" "icmov") + (set_attr "mode" "QI")]) + +(define_insn_and_split "*kortest_cmp<SWI1248_AVX512BWDQ_64:mode>_mov<SWI248:mode>cc" + [(set (match_operand:SWI248 0 "register_operand" "=r,r,r,r,r,r,r,r") + (if_then_else:SWI248 + (match_operator 1 "bt_comparison_operator" + [(match_operand:SWI1248_AVX512BWDQ_64 4 "register_operand" + "?k,<SWI1248_AVX512BWDQ_64:r>,?k, <SWI1248_AVX512BWDQ_64:r>,?k,r,?k, r") + (const_int -1)]) + (match_operand:SWI248 2 "nonimmediate_operand" "rm,rm, 0, 0,rm,rm, r, r") + (match_operand:SWI248 3 "nonimmediate_operand" " 0, 0,rm,rm, r, r,rm,rm"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_AVX512BW && TARGET_CMOVE + && !(MEM_P (operands[2]) && MEM_P (operands[3]))" + "#" + "&& reload_completed" + [(set (match_dup 0) + (if_then_else:SWI248 + (match_dup 5) + (match_dup 2) + (match_dup 3)))] +{ + rtx flag_reg; + if (MASK_REGNO_P (REGNO (operands[4]))) + { + emit_insn (gen_kortest<SWI1248_AVX512BWDQ_64:mode>_ccc (operands[4], operands[4])); + flag_reg = gen_rtx_REG (CCCmode, FLAGS_REG); + } + else + { + flag_reg = gen_rtx_REG (CCZmode, FLAGS_REG); + emit_insn (gen_rtx_SET (flag_reg, + gen_rtx_COMPARE (CCZmode, + operands[4], + constm1_rtx))); + } + operands[5] = gen_rtx_fmt_ee (GET_CODE (operands[1]), VOIDmode, + flag_reg,const0_rtx); +} + [(set_attr "isa" "*,*,*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd") + (set_attr "type" "icmov") + (set_attr "mode" "<SWI248:MODE>")]) + (define_insn "kunpckhi" [(set (match_operand:HI 0 "register_operand" "=k") (ior:HI @@ -2520,10 +2643,10 @@ }) (define_expand "<insn><mode>3<mask_name><round_name>" - [(set (match_operand:VFH 0 "register_operand") - (plusminus:VFH - (match_operand:VFH 1 "<round_nimm_predicate>") - (match_operand:VFH 2 "<round_nimm_predicate>")))] + [(set (match_operand:VF_BHSD 0 "register_operand") + (plusminus:VF_BHSD + (match_operand:VF_BHSD 1 "<round_nimm_predicate>") + (match_operand:VF_BHSD 2 "<round_nimm_predicate>")))] "TARGET_SSE && <mask_mode512bit_condition> && <round_mode_condition>" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") @@ -2609,10 +2732,10 @@ }) (define_expand "mul<mode>3<mask_name><round_name>" - [(set (match_operand:VFH 0 "register_operand") - (mult:VFH - 
(match_operand:VFH 1 "<round_nimm_predicate>") - (match_operand:VFH 2 "<round_nimm_predicate>")))] + [(set (match_operand:VF_BHSD 0 "register_operand") + (mult:VF_BHSD + (match_operand:VF_BHSD 1 "<round_nimm_predicate>") + (match_operand:VF_BHSD 2 "<round_nimm_predicate>")))] "TARGET_SSE && <mask_mode512bit_condition> && <round_mode_condition>" "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);") @@ -2727,6 +2850,26 @@ } }) +(define_expand "div<mode>3" + [(set (match_operand:VBF_AVX10_2 0 "register_operand") + (div:VBF_AVX10_2 + (match_operand:VBF_AVX10_2 1 "register_operand") + (match_operand:VBF_AVX10_2 2 "vector_operand")))] + "TARGET_AVX10_2_256" +{ + if (TARGET_RECIP_VEC_DIV + && optimize_insn_for_speed_p () + && flag_finite_math_only + && flag_unsafe_math_optimizations) + { + rtx op = gen_reg_rtx (<MODE>mode); + operands[2] = force_reg (<MODE>mode, operands[2]); + emit_insn (gen_avx10_2_rcppbf16_<mode> (op, operands[2])); + emit_insn (gen_avx10_2_mulnepbf16_<mode> (operands[0], operands[1], op)); + DONE; + } +}) + (define_expand "cond_div<mode>" [(set (match_operand:VFH 0 "register_operand") (vec_merge:VFH @@ -2893,8 +3036,8 @@ (set_attr "mode" "<MODE>")]) (define_expand "sqrt<mode>2" - [(set (match_operand:VF2H 0 "register_operand") - (sqrt:VF2H (match_operand:VF2H 1 "vector_operand")))] + [(set (match_operand:VF2HB 0 "register_operand") + (sqrt:VF2HB (match_operand:VF2HB 1 "vector_operand")))] "TARGET_SSE2") (define_expand "sqrt<mode>2" @@ -3226,7 +3369,7 @@ u = UNSPEC_IEEE_MAX; if (MEM_P (operands[2])) - force_reg (<MODE>mode, operands[2]); + operands[2] = force_reg (<MODE>mode, operands[2]); rtvec v = gen_rtvec (2, operands[2], operands[1]); rtx tmp = gen_rtx_UNSPEC (<MODE>mode, v, u); emit_move_insn (operands[0], tmp); @@ -3290,7 +3433,27 @@ (const_string "*"))) (set_attr "mode" "<ssescalarmode>")]) -(define_insn "<sse>_vm<code><mode>3<mask_scalar_name><round_saeonly_scalar_name>" +(define_expand "<sse>_vm<code><mode>3<mask_scalar_name><round_saeonly_scalar_name>" + [(set (match_operand:VFH_128 0 "register_operand") + (vec_merge:VFH_128 + (smaxmin:VFH_128 + (match_operand:VFH_128 1 "register_operand") + (match_operand:VFH_128 2 "nonimmediate_operand")) + (match_dup 1) + (const_int 1)))] + "TARGET_SSE" +{ + if (!flag_finite_math_only || flag_signed_zeros) + { + emit_insn (gen_<sse>_ieee_vm<maxmin_float><mode>3<mask_scalar_name><round_saeonly_scalar_name> + (operands[0], operands[1], operands[2] + <mask_scalar_operand_arg34> + <round_saeonly_scalar_mask_arg3>)); + DONE; + } +}) + +(define_insn "*<sse>_vm<code><mode>3<mask_scalar_name><round_saeonly_scalar_name>" [(set (match_operand:VFH_128 0 "register_operand" "=x,v") (vec_merge:VFH_128 (smaxmin:VFH_128 @@ -3308,6 +3471,25 @@ (set_attr "prefix" "<round_saeonly_scalar_prefix>") (set_attr "mode" "<ssescalarmode>")]) +(define_insn "<sse>_ieee_vm<ieee_maxmin><mode>3<mask_scalar_name><round_saeonly_scalar_name>" + [(set (match_operand:VFH_128 0 "register_operand" "=x,v") + (vec_merge:VFH_128 + (unspec:VFH_128 + [(match_operand:VFH_128 1 "register_operand" "0,v") + (match_operand:VFH_128 2 "nonimmediate_operand" "xm,<round_saeonly_scalar_constraint>")] + IEEE_MAXMIN) + (match_dup 1) + (const_int 1)))] + "TARGET_SSE" + "@ + <ieee_maxmin><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2} + v<ieee_maxmin><ssescalarmodesuffix>\t{<round_saeonly_scalar_mask_op3>%2, %1, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %1, %<iptr>2<round_saeonly_scalar_mask_op3>}" + [(set_attr "isa" "noavx,avx") + (set_attr "type" "sse") + (set_attr 
"btver2_sse_attr" "maxmin") + (set_attr "prefix" "<round_saeonly_scalar_prefix>") + (set_attr "mode" "<ssescalarmode>")]) + (define_mode_attr addsub_cst [(V4DF "5") (V2DF "1") (V4SF "5") (V8SF "85")]) @@ -4216,32 +4398,19 @@ ;; Since vpcmpd implicitly clear the upper bits of dest, transform ;; vpcmpd + zero_extend to vpcmpd since the instruction -(define_insn_and_split "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>" - [(set (match_operand:SWI248x 0 "register_operand") +(define_insn "*<avx512>_cmp<V48H_AVX512VL:mode>3_zero_extend<SWI248x:mode>" + [(set (match_operand:SWI248x 0 "register_operand" "=k") (zero_extend:SWI248x (unspec:<V48H_AVX512VL:avx512fmaskmode> - [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand") - (match_operand:V48H_AVX512VL 2 "nonimmediate_operand") - (match_operand:SI 3 "const_0_to_7_operand")] + [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand" "v") + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_7_operand" "n")] UNSPEC_PCMP)))] "TARGET_AVX512F && (!VALID_MASK_AVX512BW_MODE (<SWI248x:MODE>mode) || TARGET_AVX512BW) - && ix86_pre_reload_split () && (GET_MODE_NUNITS (<V48H_AVX512VL:MODE>mode) < GET_MODE_PRECISION (<SWI248x:MODE>mode))" - "#" - "&& 1" - [(set (match_dup 0) - (unspec:<V48H_AVX512VL:avx512fmaskmode> - [(match_dup 1) - (match_dup 2) - (match_dup 3)] - UNSPEC_PCMP))] -{ - operands[1] = force_reg (<V48H_AVX512VL:MODE>mode, operands[1]); - operands[0] = lowpart_subreg (<V48H_AVX512VL:avx512fmaskmode>mode, - operands[0], <SWI248x:MODE>mode); -} + "v<ssecmpintprefix>cmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "ssecmp") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") @@ -4269,21 +4438,22 @@ "#" "&& 1" [(set (match_dup 0) - (unspec:<V48H_AVX512VL:avx512fmaskmode> - [(match_dup 1) - (match_dup 2) - (match_dup 3)] - UNSPEC_PCMP)) - (set (match_dup 4) (match_dup 0))] + (zero_extend:SWI248x + (unspec:<V48H_AVX512VL:avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_PCMP))) + (set (match_dup 4) (match_dup 5))] { - operands[1] = force_reg (<V48H_AVX512VL:MODE>mode, operands[1]); - operands[0] = lowpart_subreg (<V48H_AVX512VL:avx512fmaskmode>mode, + operands[5] = lowpart_subreg (<V48H_AVX512VL:avx512fmaskmode>mode, operands[0], <SWI248x:MODE>mode); -} - [(set_attr "type" "ssecmp") - (set_attr "length_immediate" "1") - (set_attr "prefix" "evex") - (set_attr "mode" "<V48H_AVX512VL:sseinsnmode>")]) + if (SUBREG_P (operands[5])) + { + SUBREG_PROMOTED_VAR_P (operands[5]) = 1; + SUBREG_PROMOTED_SET (operands[5], 1); + } +}) (define_insn_and_split "*<avx512>_cmp<mode>3" [(set (match_operand:<avx512fmaskmode> 0 "register_operand") @@ -4318,31 +4488,18 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn_and_split "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>" - [(set (match_operand:SWI248x 0 "register_operand") +(define_insn "*<avx512>_cmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>" + [(set (match_operand:SWI248x 0 "register_operand" "=k") (zero_extend:SWI248x (unspec:<VI12_AVX512VL:avx512fmaskmode> - [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand") - (match_operand:VI12_AVX512VL 2 "nonimmediate_operand") - (match_operand:SI 3 "const_0_to_7_operand")] + [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_7_operand" "n")] UNSPEC_PCMP)))] "TARGET_AVX512BW - && ix86_pre_reload_split () - && 
(GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) - < GET_MODE_PRECISION (<SWI248x:MODE>mode))" - "#" - "&& 1" - [(set (match_dup 0) - (unspec:<VI12_AVX512VL:avx512fmaskmode> - [(match_dup 1) - (match_dup 2) - (match_dup 3)] - UNSPEC_PCMP))] -{ - operands[1] = force_reg (<VI12_AVX512VL:MODE>mode, operands[1]); - operands[0] = lowpart_subreg (<VI12_AVX512VL:avx512fmaskmode>mode, - operands[0], <SWI248x:MODE>mode); -} + && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) + < GET_MODE_PRECISION (<SWI248x:MODE>mode))" + "vpcmp<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "ssecmp") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") @@ -4369,16 +4526,21 @@ "#" "&& 1" [(set (match_dup 0) - (unspec:<VI12_AVX512VL:avx512fmaskmode> - [(match_dup 1) - (match_dup 2) - (match_dup 3)] - UNSPEC_PCMP)) - (set (match_dup 4) (match_dup 0))] + (zero_extend:SWI248x + (unspec:<VI12_AVX512VL:avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_PCMP))) + (set (match_dup 4) (match_dup 5))] { - operands[1] = force_reg (<VI12_AVX512VL:MODE>mode, operands[1]); - operands[0] = lowpart_subreg (<VI12_AVX512VL:avx512fmaskmode>mode, + operands[5] = lowpart_subreg (<VI12_AVX512VL:avx512fmaskmode>mode, operands[0], <SWI248x:MODE>mode); + if (SUBREG_P (operands[5])) + { + SUBREG_PROMOTED_VAR_P (operands[5]) = 1; + SUBREG_PROMOTED_SET (operands[5], 1); + } } [(set_attr "type" "ssecmp") (set_attr "length_immediate" "1") @@ -4436,31 +4598,18 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn_and_split "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>" - [(set (match_operand:SWI248x 0 "register_operand") +(define_insn "*<avx512>_ucmp<VI12_AVX512VL:mode>3_zero_extend<SWI248x:mode>" + [(set (match_operand:SWI248x 0 "register_operand" "=k") (zero_extend:SWI248x (unspec:<VI12_AVX512VL:avx512fmaskmode> - [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand") - (match_operand:VI12_AVX512VL 2 "nonimmediate_operand") - (match_operand:SI 3 "const_0_to_7_operand")] + [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v") + (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_7_operand" "n")] UNSPEC_UNSIGNED_PCMP)))] "TARGET_AVX512BW - && ix86_pre_reload_split () && (GET_MODE_NUNITS (<VI12_AVX512VL:MODE>mode) < GET_MODE_PRECISION (<SWI248x:MODE>mode))" - "#" - "&& 1" - [(set (match_dup 0) - (unspec:<VI12_AVX512VL:avx512fmaskmode> - [(match_dup 1) - (match_dup 2) - (match_dup 3)] - UNSPEC_UNSIGNED_PCMP))] -{ - operands[1] = force_reg (<VI12_AVX512VL:MODE>mode, operands[1]); - operands[0] = lowpart_subreg (<VI12_AVX512VL:avx512fmaskmode>mode, - operands[0], <SWI248x:MODE>mode); -} + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "ssecmp") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") @@ -4488,16 +4637,21 @@ "#" "&& 1" [(set (match_dup 0) - (unspec:<VI12_AVX512VL:avx512fmaskmode> - [(match_dup 1) - (match_dup 2) - (match_dup 3)] - UNSPEC_UNSIGNED_PCMP)) - (set (match_dup 4) (match_dup 0))] -{ - operands[1] = force_reg (<VI12_AVX512VL:MODE>mode, operands[1]); - operands[0] = lowpart_subreg (<VI12_AVX512VL:avx512fmaskmode>mode, + (zero_extend:SWI248x + (unspec:<VI12_AVX512VL:avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_UNSIGNED_PCMP))) + (set (match_dup 4) (match_dup 5))] +{ + operands[5] = lowpart_subreg (<VI12_AVX512VL:avx512fmaskmode>mode, operands[0], <SWI248x:MODE>mode); + if (SUBREG_P (operands[5])) + { + SUBREG_PROMOTED_VAR_P 
(operands[5]) = 1; + SUBREG_PROMOTED_SET (operands[5], 1); + } } [(set_attr "type" "ssecmp") (set_attr "length_immediate" "1") @@ -4533,32 +4687,19 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn_and_split "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>" - [(set (match_operand:SWI248x 0 "register_operand") +(define_insn "*<avx512>_ucmp<VI48_AVX512VL:mode>3_zero_extend<SWI248x:mode>" + [(set (match_operand:SWI248x 0 "register_operand" "=k") (zero_extend:SWI248x (unspec:<VI48_AVX512VL:avx512fmaskmode> - [(match_operand:VI48_AVX512VL 1 "nonimmediate_operand") - (match_operand:VI48_AVX512VL 2 "nonimmediate_operand") - (match_operand:SI 3 "const_0_to_7_operand")] + [(match_operand:VI48_AVX512VL 1 "nonimmediate_operand" "v") + (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_7_operand" "n")] UNSPEC_UNSIGNED_PCMP)))] "TARGET_AVX512F && (!VALID_MASK_AVX512BW_MODE (<SWI248x:MODE>mode) || TARGET_AVX512BW) - && ix86_pre_reload_split () && (GET_MODE_NUNITS (<VI48_AVX512VL:MODE>mode) < GET_MODE_PRECISION (<SWI248x:MODE>mode))" - "#" - "&& 1" - [(set (match_dup 0) - (unspec:<VI48_AVX512VL:avx512fmaskmode> - [(match_dup 1) - (match_dup 2) - (match_dup 3)] - UNSPEC_UNSIGNED_PCMP))] -{ - operands[1] = force_reg (<VI48_AVX512VL:MODE>mode, operands[1]); - operands[0] = lowpart_subreg (<VI48_AVX512VL:avx512fmaskmode>mode, - operands[0], <SWI248x:MODE>mode); -} + "vpcmpu<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "ssecmp") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") @@ -4586,16 +4727,21 @@ "#" "&& 1" [(set (match_dup 0) - (unspec:<VI48_AVX512VL:avx512fmaskmode> - [(match_dup 1) - (match_dup 2) - (match_dup 3)] - UNSPEC_UNSIGNED_PCMP)) - (set (match_dup 4) (match_dup 0))] -{ - operands[1] = force_reg (<VI48_AVX512VL:MODE>mode, operands[1]); - operands[0] = lowpart_subreg (<VI48_AVX512VL:avx512fmaskmode>mode, + (zero_extend:SWI248x + (unspec:<VI48_AVX512VL:avx512fmaskmode> + [(match_dup 1) + (match_dup 2) + (match_dup 3)] + UNSPEC_UNSIGNED_PCMP))) + (set (match_dup 4) (match_dup 5))] +{ + operands[5] = lowpart_subreg (<VI48_AVX512VL:avx512fmaskmode>mode, operands[0], <SWI248x:MODE>mode); + if (SUBREG_P (operands[5])) + { + SUBREG_PROMOTED_VAR_P (operands[5]) = 1; + SUBREG_PROMOTED_SET (operands[5], 1); + } } [(set_attr "type" "ssecmp") (set_attr "length_immediate" "1") @@ -4754,6 +4900,19 @@ DONE; }) +(define_expand "vec_cmp<mode><avx512fmaskmodelower>" + [(set (match_operand:<avx512fmaskmode> 0 "register_operand") + (match_operator:<avx512fmaskmode> 1 "" + [(match_operand:VBF_AVX10_2 2 "register_operand") + (match_operand:VBF_AVX10_2 3 "nonimmediate_operand")]))] + "TARGET_AVX10_2_256" +{ + bool ok = ix86_expand_mask_vec_cmp (operands[0], GET_CODE (operands[1]), + operands[2], operands[3]); + gcc_assert (ok); + DONE; +}) + (define_expand "vec_cmp<mode><sseintvecmodelower>" [(set (match_operand:<sseintvecmode> 0 "register_operand") (match_operator:<sseintvecmode> 1 "" @@ -5637,7 +5796,10 @@ (HF "TARGET_AVX512FP16") (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL") (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL") - (V32HF "TARGET_AVX512FP16 && TARGET_EVEX512")]) + (V32HF "TARGET_AVX512FP16 && TARGET_EVEX512") + (V8BF "TARGET_AVX10_2_256") + (V16BF "TARGET_AVX10_2_256") + (V32BF "TARGET_AVX10_2_512")]) (define_expand "fma<mode>4" [(set (match_operand:FMAMODEM 0 "register_operand") @@ -5797,7 +5959,7 @@ [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VFH_AVX512VL 
(fma:VFH_AVX512VL - (match_operand:VFH_AVX512VL 1 "register_operand" "0,0") + (match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "0,0") (match_operand:VFH_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v") (match_operand:VFH_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>")) (match_dup 1) @@ -5816,7 +5978,7 @@ (fma:VFH_AVX512VL (match_operand:VFH_AVX512VL 1 "<round_nimm_predicate>" "%v") (match_operand:VFH_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>") - (match_operand:VFH_AVX512VL 3 "register_operand" "0")) + (match_operand:VFH_AVX512VL 3 "nonimmediate_operand" "0")) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] "TARGET_AVX512F && <round_mode_condition>" @@ -5901,7 +6063,7 @@ [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VFH_AVX512VL (fma:VFH_AVX512VL - (match_operand:VFH_AVX512VL 1 "register_operand" "0,0") + (match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "0,0") (match_operand:VFH_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v") (neg:VFH_AVX512VL (match_operand:VFH_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>"))) @@ -5922,7 +6084,7 @@ (match_operand:VFH_AVX512VL 1 "<round_nimm_predicate>" "%v") (match_operand:VFH_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>") (neg:VFH_AVX512VL - (match_operand:VFH_AVX512VL 3 "register_operand" "0"))) + (match_operand:VFH_AVX512VL 3 "nonimmediate_operand" "0"))) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] "TARGET_AVX512F && <round_mode_condition>" @@ -6008,7 +6170,7 @@ (vec_merge:VFH_AVX512VL (fma:VFH_AVX512VL (neg:VFH_AVX512VL - (match_operand:VFH_AVX512VL 1 "register_operand" "0,0")) + (match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "0,0")) (match_operand:VFH_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v") (match_operand:VFH_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>")) (match_dup 1) @@ -6028,7 +6190,7 @@ (neg:VFH_AVX512VL (match_operand:VFH_AVX512VL 1 "<round_nimm_predicate>" "%v")) (match_operand:VFH_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>") - (match_operand:VFH_AVX512VL 3 "register_operand" "0")) + (match_operand:VFH_AVX512VL 3 "nonimmediate_operand" "0")) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] "TARGET_AVX512F && <round_mode_condition>" @@ -6117,7 +6279,7 @@ (vec_merge:VFH_AVX512VL (fma:VFH_AVX512VL (neg:VFH_AVX512VL - (match_operand:VFH_AVX512VL 1 "register_operand" "0,0")) + (match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "0,0")) (match_operand:VFH_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v") (neg:VFH_AVX512VL (match_operand:VFH_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>"))) @@ -6139,7 +6301,7 @@ (match_operand:VFH_AVX512VL 1 "<round_nimm_predicate>" "%v")) (match_operand:VFH_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>") (neg:VFH_AVX512VL - (match_operand:VFH_AVX512VL 3 "register_operand" "0"))) + (match_operand:VFH_AVX512VL 3 "nonimmediate_operand" "0"))) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] "TARGET_AVX512F && <round_mode_condition>" @@ -6271,9 +6433,9 @@ [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v") (vec_merge:VFH_AVX512VL (unspec:VFH_AVX512VL - [(match_operand:VFH_AVX512VL 1 "register_operand" "v") + [(match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "v") (match_operand:VFH_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>") - (match_operand:VFH_AVX512VL 3 "register_operand" "0")] + 
(match_operand:VFH_AVX512VL 3 "nonimmediate_operand" "0")] UNSPEC_FMADDSUB) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] @@ -6323,7 +6485,7 @@ [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v,v") (vec_merge:VFH_AVX512VL (unspec:VFH_AVX512VL - [(match_operand:VFH_AVX512VL 1 "register_operand" "0,0") + [(match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "0,0") (match_operand:VFH_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>,v") (neg:VFH_AVX512VL (match_operand:VFH_AVX512VL 3 "<round_nimm_predicate>" "v,<round_constraint>"))] @@ -6342,10 +6504,10 @@ [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v") (vec_merge:VFH_AVX512VL (unspec:VFH_AVX512VL - [(match_operand:VFH_AVX512VL 1 "register_operand" "v") + [(match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "v") (match_operand:VFH_AVX512VL 2 "<round_nimm_predicate>" "<round_constraint>") (neg:VFH_AVX512VL - (match_operand:VFH_AVX512VL 3 "register_operand" "0"))] + (match_operand:VFH_AVX512VL 3 "nonimmediate_operand" "0"))] UNSPEC_FMADDSUB) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] @@ -6362,7 +6524,7 @@ [(set (match_operand:VFH_128 0 "register_operand") (vec_merge:VFH_128 (fma:VFH_128 - (match_operand:VFH_128 1 "register_operand") + (match_operand:VFH_128 1 "nonimmediate_operand") (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>") (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>")) (match_dup 1) @@ -6373,7 +6535,7 @@ [(set (match_operand:VFH_128 0 "register_operand") (vec_merge:VFH_128 (fma:VFH_128 - (match_operand:VFH_128 1 "register_operand") + (match_operand:VFH_128 1 "nonimmediate_operand") (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>") (neg:VFH_128 (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>"))) @@ -6386,8 +6548,8 @@ (vec_merge:VFH_128 (fma:VFH_128 (neg:VFH_128 - (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>")) - (match_operand:VFH_128 1 "register_operand") + (match_operand:VFH_128 1 "nonimmediate_operand")) + (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>") (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>")) (match_dup 1) (const_int 1)))] @@ -6398,8 +6560,8 @@ (vec_merge:VFH_128 (fma:VFH_128 (neg:VFH_128 - (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>")) - (match_operand:VFH_128 1 "register_operand") + (match_operand:VFH_128 1 "nonimmediate_operand")) + (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>") (neg:VFH_128 (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>"))) (match_dup 1) @@ -6410,7 +6572,7 @@ [(set (match_operand:VFH_128 0 "register_operand" "=v,v") (vec_merge:VFH_128 (fma:VFH_128 - (match_operand:VFH_128 1 "register_operand" "0,0") + (match_operand:VFH_128 1 "nonimmediate_operand" "0,0") (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>, v") (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) (match_dup 1) @@ -6427,7 +6589,7 @@ [(set (match_operand:VFH_128 0 "register_operand" "=v,v") (vec_merge:VFH_128 (fma:VFH_128 - (match_operand:VFH_128 1 "register_operand" "0,0") + (match_operand:VFH_128 1 "nonimmediate_operand" "0,0") (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") (neg:VFH_128 (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) @@ -6446,8 +6608,8 @@ (vec_merge:VFH_128 (fma:VFH_128 (neg:VFH_128 - (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) - (match_operand:VFH_128 1 "register_operand" 
"0,0") + (match_operand:VFH_128 1 "nonimmediate_operand" "0,0")) + (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) (match_dup 1) (const_int 1)))] @@ -6464,8 +6626,8 @@ (vec_merge:VFH_128 (fma:VFH_128 (neg:VFH_128 - (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) - (match_operand:VFH_128 1 "register_operand" "0,0") + (match_operand:VFH_128 1 "nonimmediate_operand" "0,0")) + (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") (neg:VFH_128 (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) (match_dup 1) @@ -6483,7 +6645,7 @@ (vec_merge:VFH_128 (vec_merge:VFH_128 (fma:VFH_128 - (match_operand:VFH_128 1 "register_operand" "0,0") + (match_operand:VFH_128 1 "nonimmediate_operand" "0,0") (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) (match_dup 1) @@ -6505,7 +6667,7 @@ (fma:VFH_128 (match_operand:VFH_128 1 "<round_nimm_scalar_predicate>" "%v") (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>") - (match_operand:VFH_128 3 "register_operand" "0")) + (match_operand:VFH_128 3 "nonimmediate_operand" "0")) (match_dup 3) (match_operand:QI 4 "register_operand" "Yk")) (match_dup 3) @@ -6535,7 +6697,7 @@ (vec_merge:VFH_128 (vec_merge:VFH_128 (fma:VFH_128 - (match_operand:VFH_128 1 "register_operand" "0,0") + (match_operand:VFH_128 1 "nonimmediate_operand" "0,0") (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) (match_operand:VFH_128 4 "const0_operand") @@ -6555,7 +6717,7 @@ (vec_merge:VFH_128 (vec_merge:VFH_128 (fma:VFH_128 - (match_operand:VFH_128 1 "register_operand" "0,0") + (match_operand:VFH_128 1 "nonimmediate_operand" "0,0") (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") (neg:VFH_128 (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) @@ -6579,7 +6741,7 @@ (match_operand:VFH_128 1 "<round_nimm_scalar_predicate>" "%v") (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>") (neg:VFH_128 - (match_operand:VFH_128 3 "register_operand" "0"))) + (match_operand:VFH_128 3 "nonimmediate_operand" "0"))) (match_dup 3) (match_operand:QI 4 "register_operand" "Yk")) (match_dup 3) @@ -6595,7 +6757,7 @@ (vec_merge:VFH_128 (vec_merge:VFH_128 (fma:VFH_128 - (match_operand:VFH_128 1 "register_operand" "0,0") + (match_operand:VFH_128 1 "nonimmediate_operand" "0,0") (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") (neg:VFH_128 (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) @@ -6617,8 +6779,8 @@ (vec_merge:VFH_128 (fma:VFH_128 (neg:VFH_128 - (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) - (match_operand:VFH_128 1 "register_operand" "0,0") + (match_operand:VFH_128 1 "nonimmediate_operand" "0,0")) + (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) (match_dup 1) (match_operand:QI 4 "register_operand" "Yk,Yk")) @@ -6640,7 +6802,7 @@ (neg:VFH_128 (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>")) (match_operand:VFH_128 1 
"<round_nimm_scalar_predicate>" "%v") - (match_operand:VFH_128 3 "register_operand" "0")) + (match_operand:VFH_128 3 "nonimmediate_operand" "0")) (match_dup 3) (match_operand:QI 4 "register_operand" "Yk")) (match_dup 3) @@ -6672,7 +6834,7 @@ (fma:VFH_128 (neg:VFH_128 (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) - (match_operand:VFH_128 1 "register_operand" "0,0") + (match_operand:VFH_128 1 "nonimmediate_operand" "0,0") (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>")) (match_operand:VFH_128 4 "const0_operand") (match_operand:QI 5 "register_operand" "Yk,Yk")) @@ -6692,8 +6854,8 @@ (vec_merge:VFH_128 (fma:VFH_128 (neg:VFH_128 - (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) - (match_operand:VFH_128 1 "register_operand" "0,0") + (match_operand:VFH_128 1 "nonimmediate_operand" "0,0")) + (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v") (neg:VFH_128 (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) (match_dup 1) @@ -6717,7 +6879,7 @@ (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>")) (match_operand:VFH_128 1 "<round_nimm_scalar_predicate>" "%v") (neg:VFH_128 - (match_operand:VFH_128 3 "register_operand" "0"))) + (match_operand:VFH_128 3 "nonimmediate_operand" "0"))) (match_dup 3) (match_operand:QI 4 "register_operand" "Yk")) (match_dup 3) @@ -6735,7 +6897,7 @@ (fma:VFH_128 (neg:VFH_128 (match_operand:VFH_128 2 "<round_nimm_scalar_predicate>" "<round_constraint>,v")) - (match_operand:VFH_128 1 "register_operand" "0,0") + (match_operand:VFH_128 1 "nonimmediate_operand" "0,0") (neg:VFH_128 (match_operand:VFH_128 3 "<round_nimm_scalar_predicate>" "v,<round_constraint>"))) (match_operand:VFH_128 4 "const0_operand") @@ -7450,7 +7612,7 @@ [(match_operand:<ssePHmode> 1 "<round_nimm_predicate>" "<round_constraint>")] UNSPEC_US_FIX_NOTRUNC))] "TARGET_AVX512FP16 && <round_mode_condition>" - "vcvtph2<sseintconvertsignprefix><sseintconvert>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %1<round_mask_op2>}" + "vcvtph2<sseintconvertsignprefix><sseintconvert>\t{<round_mask_op2>%1, %0<mask_operand2>|%0<mask_operand2>, %<iptrh>1<round_mask_op2>}" [(set_attr "type" "ssecvt") (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) @@ -8817,7 +8979,7 @@ cvtsi2sd{l}\t{%2, %0|%0, %2} vcvtsi2sd{l}\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,noavx,avx") - (set_attr "type" "sseicvt") + (set_attr "type" "sseicvt2") (set_attr "athlon_decode" "double,direct,*") (set_attr "amdfam10_decode" "vector,double,*") (set_attr "bdver1_decode" "double,direct,*") @@ -8839,7 +9001,7 @@ cvtsi2sd{q}\t{%2, %0|%0, %2} vcvtsi2sd{q}\t{%2, <round_op3>%1, %0|%0, %1<round_op3>, %2}" [(set_attr "isa" "noavx,noavx,avx") - (set_attr "type" "sseicvt") + (set_attr "type" "sseicvt2") (set_attr "athlon_decode" "double,direct,*") (set_attr "amdfam10_decode" "vector,double,*") (set_attr "bdver1_decode" "double,direct,*") @@ -10897,7 +11059,7 @@ vmovlps\t{%H2, %1, %0|%0, %1, %H2} %vmovhps\t{%2, %0|%q0, %2}" [(set_attr "isa" "noavx,avx,noavx,avx,*") - (set_attr "type" "ssemov") + (set_attr "type" "ssemov2") (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex,maybe_vex") (set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")]) @@ -11459,7 +11621,7 @@ vmovlhps\t{%2, %1, %0|%0, %1, %2} %vmovlps\t{%2, %H0|%H0, %2}" [(set_attr "isa" "noavx,avx,noavx,avx,*") - (set_attr "type" "ssemov") + (set_attr "type" "ssemov2") (set_attr "prefix" 
"orig,maybe_evex,orig,maybe_evex,maybe_vex") (set_attr "mode" "V2SF,V2SF,V4SF,V4SF,V2SF")]) @@ -11512,7 +11674,7 @@ vmovlps\t{%2, %1, %0|%0, %1, %q2} %vmovlps\t{%2, %0|%q0, %2}" [(set_attr "isa" "noavx,avx,noavx,avx,*") - (set_attr "type" "sseshuf,sseshuf,ssemov,ssemov,ssemov") + (set_attr "type" "sseshuf,sseshuf,ssemov2,ssemov2,ssemov") (set (attr "length_immediate") (if_then_else (eq_attr "alternative" "0,1") (const_string "1") @@ -11668,7 +11830,7 @@ movhps\t{%2, %0|%0, %q2} vmovhps\t{%2, %1, %0|%0, %1, %q2}" [(set_attr "isa" "noavx,avx,noavx,avx") - (set_attr "type" "ssemov") + (set_attr "type" "ssemov2") (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex") (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")]) @@ -11749,6 +11911,8 @@ (const_string "imov") (eq_attr "alternative" "14") (const_string "fmov") + (eq_attr "alternative" "4,6") + (const_string "ssemov2") ] (const_string "ssemov"))) (set (attr "addr") @@ -12114,7 +12278,7 @@ movlpd\t{%2, %0|%0, %2} vmovlpd\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,avx,noavx,avx") - (set_attr "type" "ssemov") + (set_attr "type" "ssemov2") (set_attr "mode" "DF")]) (define_expand "vec_set<mode>" @@ -14565,7 +14729,7 @@ # #" [(set_attr "isa" "noavx,avx,noavx,avx,*,*,*") - (set_attr "type" "ssemov,ssemov,sselog,sselog,ssemov,fmov,imov") + (set_attr "type" "ssemov2,ssemov2,sselog,sselog,ssemov,fmov,imov") (set (attr "prefix_data16") (if_then_else (eq_attr "alternative" "0") (const_string "1") @@ -14635,6 +14799,8 @@ (const_string "fmov") (eq_attr "alternative" "10") (const_string "imov") + (eq_attr "alternative" "0,1,2") + (const_string "ssemov2") ] (const_string "ssemov"))) (set (attr "prefix_data16") @@ -14687,7 +14853,7 @@ (if_then_else (eq_attr "alternative" "5") (const_string "sselog") - (const_string "ssemov"))) + (const_string "ssemov2"))) (set (attr "prefix_data16") (if_then_else (and (eq_attr "alternative" "2,4") @@ -14759,7 +14925,7 @@ (if_then_else (eq_attr "alternative" "0,1,2") (const_string "sselog") - (const_string "ssemov"))) + (const_string "ssemov2"))) (set (attr "prefix_data16") (if_then_else (eq_attr "alternative" "3") (const_string "1") @@ -15288,7 +15454,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn "*avx512vl_<code>v2div2qi2_mask_store_1" +(define_insn "avx512vl_<code>v2div2qi2_mask_store_1" [(set (match_operand:V2QI 0 "memory_operand" "=m") (vec_merge:V2QI (any_truncate:V2QI @@ -15302,28 +15468,19 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn_and_split "avx512vl_<code>v2div2qi2_mask_store_2" - [(set (match_operand:HI 0 "memory_operand") - (subreg:HI - (vec_merge:V2QI - (any_truncate:V2QI - (match_operand:V2DI 1 "register_operand")) - (vec_select:V2QI - (subreg:V4QI - (vec_concat:V2HI - (match_dup 0) - (const_int 0)) 0) - (parallel [(const_int 0) (const_int 1)])) - (match_operand:QI 2 "register_operand")) 0))] - "TARGET_AVX512VL && ix86_pre_reload_split ()" - "#" - "&& 1" - [(set (match_dup 0) - (vec_merge:V2QI - (any_truncate:V2QI (match_dup 1)) - (match_dup 0) - (match_dup 2)))] - "operands[0] = adjust_address_nv (operands[0], V2QImode, 0);") +(define_expand "avx512vl_<code>v2div2qi2_mask_store_2" + [(match_operand:HI 0 "memory_operand") + (any_truncate:V2QI + (match_operand:V2DI 1 "register_operand")) + (match_operand:QI 2 "register_operand")] + "TARGET_AVX512VL" +{ + operands[0] = adjust_address_nv (operands[0], V2QImode, 0); + emit_insn (gen_avx512vl_<code>v2div2qi2_mask_store_1 (operands[0], + operands[1], + operands[2])); + DONE; +}) (define_insn 
"*avx512vl_<code><mode>v4qi2_store_1" [(set (match_operand:V4QI 0 "memory_operand" "=m") @@ -15392,7 +15549,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn "*avx512vl_<code><mode>v4qi2_mask_store_1" +(define_insn "avx512vl_<code><mode>v4qi2_mask_store_1" [(set (match_operand:V4QI 0 "memory_operand" "=m") (vec_merge:V4QI (any_truncate:V4QI @@ -15406,29 +15563,19 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn_and_split "avx512vl_<code><mode>v4qi2_mask_store_2" - [(set (match_operand:SI 0 "memory_operand") - (subreg:SI - (vec_merge:V4QI - (any_truncate:V4QI - (match_operand:VI4_128_8_256 1 "register_operand")) - (vec_select:V4QI - (subreg:V8QI - (vec_concat:V2SI - (match_dup 0) - (const_int 0)) 0) - (parallel [(const_int 0) (const_int 1) - (const_int 2) (const_int 3)])) - (match_operand:QI 2 "register_operand")) 0))] - "TARGET_AVX512VL && ix86_pre_reload_split ()" - "#" - "&& 1" - [(set (match_dup 0) - (vec_merge:V4QI - (any_truncate:V4QI (match_dup 1)) - (match_dup 0) - (match_dup 2)))] - "operands[0] = adjust_address_nv (operands[0], V4QImode, 0);") +(define_expand "avx512vl_<code><mode>v4qi2_mask_store_2" + [(match_operand:SI 0 "memory_operand") + (any_truncate:V4QI + (match_operand:VI4_128_8_256 1 "register_operand")) + (match_operand:QI 2 "register_operand")] + "TARGET_AVX512VL" +{ + operands[0] = adjust_address_nv (operands[0], V4QImode, 0); + emit_insn (gen_avx512vl_<code><mode>v4qi2_mask_store_1 (operands[0], + operands[1], + operands[2])); + DONE; +}) (define_mode_iterator VI2_128_BW_4_256 [(V8HI "TARGET_AVX512BW") V8SI]) @@ -15500,7 +15647,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn "*avx512vl_<code><mode>v8qi2_mask_store_1" +(define_insn "avx512vl_<code><mode>v8qi2_mask_store_1" [(set (match_operand:V8QI 0 "memory_operand" "=m") (vec_merge:V8QI (any_truncate:V8QI @@ -15514,31 +15661,19 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn_and_split "avx512vl_<code><mode>v8qi2_mask_store_2" - [(set (match_operand:DI 0 "memory_operand") - (subreg:DI - (vec_merge:V8QI - (any_truncate:V8QI - (match_operand:VI2_128_BW_4_256 1 "register_operand")) - (vec_select:V8QI - (subreg:V16QI - (vec_concat:V2DI - (match_dup 0) - (const_int 0)) 0) - (parallel [(const_int 0) (const_int 1) - (const_int 2) (const_int 3) - (const_int 4) (const_int 5) - (const_int 6) (const_int 7)])) - (match_operand:QI 2 "register_operand")) 0))] - "TARGET_AVX512VL && ix86_pre_reload_split ()" - "#" - "&& 1" - [(set (match_dup 0) - (vec_merge:V8QI - (any_truncate:V8QI (match_dup 1)) - (match_dup 0) - (match_dup 2)))] - "operands[0] = adjust_address_nv (operands[0], V8QImode, 0);") +(define_expand "avx512vl_<code><mode>v8qi2_mask_store_2" + [(match_operand:DI 0 "memory_operand") + (any_truncate:V8QI + (match_operand:VI2_128_BW_4_256 1 "register_operand")) + (match_operand:QI 2 "register_operand")] + "TARGET_AVX512VL" +{ + operands[0] = adjust_address_nv (operands[0], V8QImode, 0); + emit_insn (gen_avx512vl_<code><mode>v8qi2_mask_store_1 (operands[0], + operands[1], + operands[2])); + DONE; +}) (define_mode_iterator PMOV_SRC_MODE_4 [(V4DI "TARGET_AVX2") V2DI V4SI]) (define_mode_attr pmov_dst_4 @@ -15666,7 +15801,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn "*avx512vl_<code><mode>v4hi2_mask_store_1" +(define_insn "avx512vl_<code><mode>v4hi2_mask_store_1" [(set (match_operand:V4HI 0 "memory_operand" "=m") (vec_merge:V4HI (any_truncate:V4HI @@ -15684,30 +15819,19 @@ (set_attr "prefix" "evex") (set_attr "mode" 
"TI")]) -(define_insn_and_split "avx512vl_<code><mode>v4hi2_mask_store_2" - [(set (match_operand:DI 0 "memory_operand") - (subreg:DI - (vec_merge:V4HI - (any_truncate:V4HI - (match_operand:VI4_128_8_256 1 "register_operand")) - (vec_select:V4HI - (subreg:V8HI - (vec_concat:V2DI - (match_dup 0) - (const_int 0)) 0) - (parallel [(const_int 0) (const_int 1) - (const_int 2) (const_int 3)])) - (match_operand:QI 2 "register_operand")) 0))] - "TARGET_AVX512VL && ix86_pre_reload_split ()" - "#" - "&& 1" - [(set (match_dup 0) - (vec_merge:V4HI - (any_truncate:V4HI (match_dup 1)) - (match_dup 0) - (match_dup 2)))] - "operands[0] = adjust_address_nv (operands[0], V4HImode, 0);") - +(define_expand "avx512vl_<code><mode>v4hi2_mask_store_2" + [(match_operand:DI 0 "memory_operand") + (any_truncate:V4HI + (match_operand:VI4_128_8_256 1 "register_operand")) + (match_operand:QI 2 "register_operand")] + "TARGET_AVX512VL" +{ + operands[0] = adjust_address_nv (operands[0], V4HImode, 0); + emit_insn (gen_avx512vl_<code><mode>v4hi2_mask_store_1 (operands[0], + operands[1], + operands[2])); + DONE; +}) (define_insn "*avx512vl_<code>v2div2hi2_store_1" [(set (match_operand:V2HI 0 "memory_operand" "=m") @@ -15768,7 +15892,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn "*avx512vl_<code>v2div2hi2_mask_store_1" +(define_insn "avx512vl_<code>v2div2hi2_mask_store_1" [(set (match_operand:V2HI 0 "memory_operand" "=m") (vec_merge:V2HI (any_truncate:V2HI @@ -15782,28 +15906,19 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn_and_split "avx512vl_<code>v2div2hi2_mask_store_2" - [(set (match_operand:SI 0 "memory_operand") - (subreg:SI - (vec_merge:V2HI - (any_truncate:V2HI - (match_operand:V2DI 1 "register_operand")) - (vec_select:V2HI - (subreg:V4HI - (vec_concat:V2SI - (match_dup 0) - (const_int 0)) 0) - (parallel [(const_int 0) (const_int 1)])) - (match_operand:QI 2 "register_operand")) 0))] - "TARGET_AVX512VL && ix86_pre_reload_split ()" - "#" - "&& 1" - [(set (match_dup 0) - (vec_merge:V2HI - (any_truncate:V2HI (match_dup 1)) - (match_dup 0) - (match_dup 2)))] - "operands[0] = adjust_address_nv (operands[0], V2HImode, 0);") +(define_expand "avx512vl_<code>v2div2hi2_mask_store_2" + [(match_operand:SI 0 "memory_operand") + (any_truncate:V2HI + (match_operand:V2DI 1 "register_operand")) + (match_operand:QI 2 "register_operand")] + "TARGET_AVX512VL" +{ + operands[0] = adjust_address_nv (operands[0], V2HImode, 0); + emit_insn (gen_avx512vl_<code>v2div2hi2_mask_store_1 (operands[0], + operands[1], + operands[2])); + DONE; +}) (define_expand "truncv2div2si2" [(set (match_operand:V2SI 0 "register_operand") @@ -15923,7 +16038,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn "*avx512vl_<code>v2div2si2_mask_store_1" +(define_insn "avx512vl_<code>v2div2si2_mask_store_1" [(set (match_operand:V2SI 0 "memory_operand" "=m") (vec_merge:V2SI (any_truncate:V2SI @@ -15937,28 +16052,19 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn_and_split "avx512vl_<code>v2div2si2_mask_store_2" - [(set (match_operand:DI 0 "memory_operand") - (subreg:DI - (vec_merge:V2SI - (any_truncate:V2SI - (match_operand:V2DI 1 "register_operand")) - (vec_select:V2SI - (subreg:V4SI - (vec_concat:V2DI - (match_dup 0) - (const_int 0)) 0) - (parallel [(const_int 0) (const_int 1)])) - (match_operand:QI 2 "register_operand")) 0))] - "TARGET_AVX512VL && ix86_pre_reload_split ()" - "#" - "&& 1" - [(set (match_dup 0) - (vec_merge:V2SI - (any_truncate:V2SI (match_dup 1)) - (match_dup 0) - 
(match_dup 2)))] - "operands[0] = adjust_address_nv (operands[0], V2SImode, 0);") +(define_expand "avx512vl_<code>v2div2si2_mask_store_2" + [(match_operand:DI 0 "memory_operand") + (any_truncate:V2SI + (match_operand:V2DI 1 "register_operand")) + (match_operand:QI 2 "register_operand")] + "TARGET_AVX512VL" +{ + operands[0] = adjust_address_nv (operands[0], V2SImode, 0); + emit_insn (gen_avx512vl_<code>v2div2si2_mask_store_1 (operands[0], + operands[1], + operands[2])); + DONE; +}) (define_expand "truncv8div8qi2" [(set (match_operand:V8QI 0 "register_operand") @@ -16057,7 +16163,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn "*avx512f_<code>v8div16qi2_mask_store_1" +(define_insn "avx512f_<code>v8div16qi2_mask_store_1" [(set (match_operand:V8QI 0 "memory_operand" "=m") (vec_merge:V8QI (any_truncate:V8QI @@ -16071,31 +16177,19 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn_and_split "avx512f_<code>v8div16qi2_mask_store_2" - [(set (match_operand:DI 0 "memory_operand") - (subreg:DI - (vec_merge:V8QI - (any_truncate:V8QI - (match_operand:V8DI 1 "register_operand")) - (vec_select:V8QI - (subreg:V16QI - (vec_concat:V2DI - (match_dup 0) - (const_int 0)) 0) - (parallel [(const_int 0) (const_int 1) - (const_int 2) (const_int 3) - (const_int 4) (const_int 5) - (const_int 6) (const_int 7)])) - (match_operand:QI 2 "register_operand")) 0))] - "TARGET_AVX512F && TARGET_EVEX512 && ix86_pre_reload_split ()" - "#" - "&& 1" - [(set (match_dup 0) - (vec_merge:V8QI - (any_truncate:V8QI (match_dup 1)) - (match_dup 0) - (match_dup 2)))] - "operands[0] = adjust_address_nv (operands[0], V8QImode, 0);") +(define_expand "avx512f_<code>v8div16qi2_mask_store_2" + [(match_operand:DI 0 "memory_operand") + (any_truncate:V8QI + (match_operand:V8DI 1 "register_operand")) + (match_operand:QI 2 "register_operand")] + "TARGET_AVX512F && TARGET_EVEX512" +{ + operands[0] = adjust_address_nv (operands[0], V8QImode, 0); + emit_insn (gen_avx512f_<code>v8div16qi2_mask_store_1 (operands[0], + operands[1], + operands[2])); + DONE; +}) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -16875,7 +16969,7 @@ (define_mode_attr SDOT_VPDP_SUF [(V32HI "v16si") (V16HI "v8si") (V8HI "v4si")]) -(define_expand "sdot_prod<mode>" +(define_expand "sdot_prod<sseunpackmodelower><mode>" [(match_operand:<sseunpackmode> 0 "register_operand") (match_operand:VI2_AVX512VNNIBW 1 "register_operand") (match_operand:VI2_AVX512VNNIBW 2 "register_operand") @@ -16910,7 +17004,7 @@ ;; Normally we use widen_mul_even/odd, but combine can't quite get it all ;; back together when madd is available. 
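;; As a point of reference, the renamed expander below now spells out both the
;; destination and the source mode (sdot_prodv2div4si: V4SI operands accumulated
;; into a V2DI result).  A hedged scalar sketch of the semantics it provides --
;; the reference function name is illustrative only, it is not the instruction
;; sequence the expander actually emits, and it assumes the usual pairing of
;; adjacent input lanes per output lane:
;;
;;   static void
;;   sdot_prodv2div4si_ref (long long out[2], const int a[4],
;;                          const int b[4], const long long acc[2])
;;   {
;;     /* Each 64-bit lane accumulates the products of the two
;;        corresponding signed 32-bit lanes.  */
;;     for (int i = 0; i < 2; i++)
;;       out[i] = acc[i]
;;                + (long long) a[2 * i] * b[2 * i]
;;                + (long long) a[2 * i + 1] * b[2 * i + 1];
;;   }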
-(define_expand "sdot_prodv4si" +(define_expand "sdot_prodv2div4si" [(match_operand:V2DI 0 "register_operand") (match_operand:V4SI 1 "register_operand") (match_operand:V4SI 2 "register_operand") @@ -17849,8 +17943,8 @@ (match_operand:VI_128_256 1 "vector_all_ones_operand") (match_operand:VI_128_256 2 "const0_operand") (unspec:<avx512fmaskmode> - [(match_operand:VI_128_256 3 "nonimmediate_operand") - (match_operand:VI_128_256 4 "nonimmediate_operand") + [(match_operand:VI_128_256 3 "nonimm_or_0_operand") + (match_operand:VI_128_256 4 "nonimm_or_0_operand") (match_operand:SI 5 "const_0_to_7_operand")] UNSPEC_PCMP)))] "TARGET_AVX512VL && ix86_pre_reload_split () @@ -17869,6 +17963,12 @@ { if (INTVAL (operands[5]) == 1) std::swap (operands[3], operands[4]); + + if (operands[3] == CONST0_RTX (<MODE>mode)) + operands[3] = force_reg (<MODE>mode, operands[3]); + if (operands[4] == CONST0_RTX (<MODE>mode)) + operands[4] = force_reg (<MODE>mode, operands[4]); + enum rtx_code code = INTVAL (operands[5]) ? GT : EQ; emit_move_insn (operands[0], gen_rtx_fmt_ee (code, <MODE>mode, operands[3], operands[4])); @@ -18036,7 +18136,7 @@ [(match_operand:VI_128_256 3 "nonimmediate_operand") (match_operand:VI_128_256 4 "nonimmediate_operand") (match_operand:SI 5 "const_0_to_7_operand")] - UNSPEC_PCMP_ITER)))] + UNSPEC_PCMP)))] "TARGET_AVX512VL && ix86_pre_reload_split () /* NE is commutative. */ && (INTVAL (operands[5]) == 4 @@ -18059,6 +18159,31 @@ DONE; }) +(define_insn_and_split "*avx2_pcmp<mode>3_8" + [(set (match_operand:VI_128_256 0 "register_operand") + (vec_merge:VI_128_256 + (match_operand:VI_128_256 1 "const0_operand") + (match_operand:VI_128_256 2 "vector_all_ones_operand") + (unspec:<avx512fmaskmode> + [(match_operand:VI_128_256 3 "nonimmediate_operand") + (match_operand:VI_128_256 4 "nonimmediate_operand") + (match_operand:SI 5 "const_0_to_7_operand")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512VL && ix86_pre_reload_split () + /* NE is commutative. 
*/ + && INTVAL (operands[5]) == 4" + + "#" + "&& 1" + [(const_int 0)] +{ + if (MEM_P (operands[3])) + operands[3] = force_reg (<MODE>mode, operands[3]); + emit_move_insn (operands[0], gen_rtx_fmt_ee (EQ, <MODE>mode, + operands[3], operands[4])); + DONE; +}) + (define_expand "<avx512>_eq<mode>3<mask_scalar_merge_name>" [(set (match_operand:<avx512fmaskmode> 0 "register_operand") (unspec:<avx512fmaskmode> @@ -18330,6 +18455,13 @@ (match_operand:VI_AVX2 2 "vector_operand")))] "TARGET_SSE2") +(define_expand "andn<mode>3" + [(set (match_operand:VI 0 "register_operand") + (and:VI + (not:VI (match_operand:VI 2 "register_operand")) + (match_operand:VI 1 "register_operand")))] + "TARGET_SSE2") + (define_expand "<sse2_avx2>_andnot<mode>3_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand") (vec_merge:VI48_AVX512VL @@ -21439,7 +21571,7 @@ movhps\t{%2, %0|%0, %q2} vmovhps\t{%2, %1, %0|%0, %1, %q2}" [(set_attr "isa" "sse2_noavx,avx,noavx,noavx,avx") - (set_attr "type" "sselog,sselog,ssemov,ssemov,ssemov") + (set_attr "type" "sselog,sselog,ssemov,ssemov2,ssemov2") (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) @@ -21547,7 +21679,7 @@ (if_then_else (eq_attr "alternative" "0,1,2,3,4,5") (const_string "sselog") - (const_string "ssemov"))) + (const_string "ssemov2"))) (set (attr "addr") (if_then_else (eq_attr "alternative" "0,1") (const_string "gpr16") @@ -29714,7 +29846,7 @@ UNSPEC_FPCLASS) (const_int 1)))] "TARGET_AVX512DQ || VALID_AVX512FP16_REG_MODE(<MODE>mode)" - "vfpclass<ssescalarmodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %1, %2}"; + "vfpclass<ssescalarmodesuffix>\t{%2, %1, %0<mask_scalar_merge_operand3>|%0<mask_scalar_merge_operand3>, %<iptr>1, %2}"; [(set_attr "type" "sse") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") @@ -29950,25 +30082,29 @@ (set_attr "mode" "OI")]) (define_insn "vsm4key4_<mode>" - [(set (match_operand:VI4_AVX 0 "register_operand" "=x") - (unspec:VI4_AVX - [(match_operand:VI4_AVX 1 "register_operand" "x") - (match_operand:VI4_AVX 2 "vector_operand" "xBm")] + [(set (match_operand:VI4_AVX10_2 0 "register_operand" "=x,v") + (unspec:VI4_AVX10_2 + [(match_operand:VI4_AVX10_2 1 "register_operand" "x,v") + (match_operand:VI4_AVX10_2 2 "vector_operand" "xBm,vBm")] UNSPEC_SM4KEY4))] "TARGET_SM4" "vsm4key4\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "other") + (set_attr "prefix" "maybe_evex") + (set_attr "isa" "avx,avx10_2") (set_attr "mode" "<sseinsnmode>")]) (define_insn "vsm4rnds4_<mode>" - [(set (match_operand:VI4_AVX 0 "register_operand" "=x") - (unspec:VI4_AVX - [(match_operand:VI4_AVX 1 "register_operand" "x") - (match_operand:VI4_AVX 2 "vector_operand" "xBm")] + [(set (match_operand:VI4_AVX10_2 0 "register_operand" "=x,v") + (unspec:VI4_AVX10_2 + [(match_operand:VI4_AVX10_2 1 "register_operand" "x,v") + (match_operand:VI4_AVX10_2 2 "vector_operand" "xBm,vBm")] UNSPEC_SM4RNDS4))] "TARGET_SM4" "vsm4rnds4\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "other") + (set_attr "prefix" "maybe_evex") + (set_attr "isa" "avx,avx10_2") (set_attr "mode" "<sseinsnmode>")]) (define_insn_and_split "avx512f_<castmode><avxsizesuffix>_<castmode>" @@ -30365,7 +30501,7 @@ [(set_attr ("prefix") ("evex")) (set_attr "mode" "<sseinsnmode>")]) -(define_expand "usdot_prod<mode>" +(define_expand "usdot_prod<ssedvecmodelower><mode>" [(match_operand:<ssedvecmode> 0 "register_operand") (match_operand:VI1_AVX512 1 "register_operand") (match_operand:VI1_AVX512 2 "register_operand") @@ -30403,10 
+30539,11 @@ rtx sum = gen_reg_rtx (<ssedvecmode>mode); emit_move_insn (sum, CONST0_RTX (<ssedvecmode>mode)); - emit_insn (gen_sdot_prod<sseunpackmodelower> (res1, op1_lo, - op2_lo, sum)); - emit_insn (gen_sdot_prod<sseunpackmodelower> (res2, op1_hi, - op2_hi, operands[3])); + emit_insn (gen_sdot_prod<ssedvecmodelower><sseunpackmodelower> (res1, + op1_lo, op2_lo, sum)); + emit_insn (gen_sdot_prod<ssedvecmodelower><sseunpackmodelower> (res2, + op1_hi, op2_hi, + operands[3])); emit_insn (gen_add<ssedvecmodelower>3 (operands[0], res1, res2)); } DONE; @@ -30696,7 +30833,7 @@ UNSPEC_VAESDEC))] "TARGET_VAES" { - if (which_alternative == 0 && <MODE>mode == V16QImode) + if (!TARGET_AES && <MODE>mode == V16QImode) return "%{evex%} vaesdec\t{%2, %1, %0|%0, %1, %2}"; else return "vaesdec\t{%2, %1, %0|%0, %1, %2}"; @@ -30710,7 +30847,7 @@ UNSPEC_VAESDECLAST))] "TARGET_VAES" { - if (which_alternative == 0 && <MODE>mode == V16QImode) + if (!TARGET_AES && <MODE>mode == V16QImode) return "%{evex%} vaesdeclast\t{%2, %1, %0|%0, %1, %2}"; else return "vaesdeclast\t{%2, %1, %0|%0, %1, %2}"; @@ -30724,7 +30861,7 @@ UNSPEC_VAESENC))] "TARGET_VAES" { - if (which_alternative == 0 && <MODE>mode == V16QImode) + if (!TARGET_AES && <MODE>mode == V16QImode) return "%{evex%} vaesenc\t{%2, %1, %0|%0, %1, %2}"; else return "vaesenc\t{%2, %1, %0|%0, %1, %2}"; @@ -30738,7 +30875,7 @@ UNSPEC_VAESENCLAST))] "TARGET_VAES" { - if (which_alternative == 0 && <MODE>mode == V16QImode) + if (!TARGET_AES && <MODE>mode == V16QImode) return "%{evex%} vaesenclast\t{%2, %1, %0|%0, %1, %2}"; else return "vaesenclast\t{%2, %1, %0|%0, %1, %2}"; @@ -31230,14 +31367,15 @@ (UNSPEC_VPDPBSUD "bsud") (UNSPEC_VPDPBSUDS "bsuds") (UNSPEC_VPDPBUUD "buud") (UNSPEC_VPDPBUUDS "buuds")]) -(define_expand "sdot_prod<mode>" +(define_expand "sdot_prod<ssedvecmodelower><mode>" [(match_operand:<ssedvecmode> 0 "register_operand") - (match_operand:VI1_AVX2 1 "register_operand") - (match_operand:VI1_AVX2 2 "register_operand") + (match_operand:VI1_AVX512VNNIBW 1 "register_operand") + (match_operand:VI1_AVX512VNNIBW 2 "register_operand") (match_operand:<ssedvecmode> 3 "register_operand")] "TARGET_SSE2" { - if (TARGET_AVXVNNIINT8) + if ((<MODE_SIZE> == 64 && TARGET_AVX10_2_512) + || (<MODE_SIZE> < 64 && (TARGET_AVXVNNIINT8 || TARGET_AVX10_2_256))) { operands[1] = lowpart_subreg (<ssedvecmode>mode, force_reg (<MODE>mode, operands[1]), @@ -31266,54 +31404,26 @@ rtx sum = gen_reg_rtx (<ssedvecmode>mode); emit_move_insn (sum, CONST0_RTX (<ssedvecmode>mode)); - emit_insn (gen_sdot_prod<sseunpackmodelower> (res1, op1_lo, - op2_lo, sum)); - emit_insn (gen_sdot_prod<sseunpackmodelower> (res2, op1_hi, - op2_hi, operands[3])); + emit_insn (gen_sdot_prod<ssedvecmodelower><sseunpackmodelower> (res1, + op1_lo, op2_lo, sum)); + emit_insn (gen_sdot_prod<ssedvecmodelower><sseunpackmodelower> (res2, + op1_hi, op2_hi, + operands[3])); emit_insn (gen_add<ssedvecmodelower>3 (operands[0], res1, res2)); } DONE; }) -(define_expand "sdot_prodv64qi" - [(match_operand:V16SI 0 "register_operand") - (match_operand:V64QI 1 "register_operand") - (match_operand:V64QI 2 "register_operand") - (match_operand:V16SI 3 "register_operand")] - "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512" -{ - /* Emulate with vpdpwssd. 
*/ - rtx op1_lo = gen_reg_rtx (V32HImode); - rtx op1_hi = gen_reg_rtx (V32HImode); - rtx op2_lo = gen_reg_rtx (V32HImode); - rtx op2_hi = gen_reg_rtx (V32HImode); - - emit_insn (gen_vec_unpacks_lo_v64qi (op1_lo, operands[1])); - emit_insn (gen_vec_unpacks_lo_v64qi (op2_lo, operands[2])); - emit_insn (gen_vec_unpacks_hi_v64qi (op1_hi, operands[1])); - emit_insn (gen_vec_unpacks_hi_v64qi (op2_hi, operands[2])); - - rtx res1 = gen_reg_rtx (V16SImode); - rtx res2 = gen_reg_rtx (V16SImode); - rtx sum = gen_reg_rtx (V16SImode); - - emit_move_insn (sum, CONST0_RTX (V16SImode)); - emit_insn (gen_sdot_prodv32hi (res1, op1_lo, op2_lo, sum)); - emit_insn (gen_sdot_prodv32hi (res2, op1_hi, op2_hi, operands[3])); - - emit_insn (gen_addv16si3 (operands[0], res1, res2)); - DONE; -}) - -(define_expand "udot_prod<mode>" +(define_expand "udot_prod<ssedvecmodelower><mode>" [(match_operand:<ssedvecmode> 0 "register_operand") - (match_operand:VI1_AVX2 1 "register_operand") - (match_operand:VI1_AVX2 2 "register_operand") + (match_operand:VI1_AVX512VNNIBW 1 "register_operand") + (match_operand:VI1_AVX512VNNIBW 2 "register_operand") (match_operand:<ssedvecmode> 3 "register_operand")] "TARGET_SSE2" { - if (TARGET_AVXVNNIINT8) + if ((<MODE_SIZE> == 64 && TARGET_AVX10_2_512) + || (<MODE_SIZE> < 64 && (TARGET_AVXVNNIINT8 || TARGET_AVX10_2_256))) { operands[1] = lowpart_subreg (<ssedvecmode>mode, force_reg (<MODE>mode, operands[1]), @@ -31342,46 +31452,17 @@ rtx sum = gen_reg_rtx (<ssedvecmode>mode); emit_move_insn (sum, CONST0_RTX (<ssedvecmode>mode)); - emit_insn (gen_sdot_prod<sseunpackmodelower> (res1, op1_lo, - op2_lo, sum)); - emit_insn (gen_sdot_prod<sseunpackmodelower> (res2, op1_hi, - op2_hi, operands[3])); + emit_insn (gen_sdot_prod<ssedvecmodelower><sseunpackmodelower> (res1, + op1_lo, op2_lo, sum)); + emit_insn (gen_sdot_prod<ssedvecmodelower><sseunpackmodelower> (res2, + op1_hi, op2_hi, + operands[3])); emit_insn (gen_add<ssedvecmodelower>3 (operands[0], res1, res2)); } DONE; }) -(define_expand "udot_prodv64qi" - [(match_operand:V16SI 0 "register_operand") - (match_operand:V64QI 1 "register_operand") - (match_operand:V64QI 2 "register_operand") - (match_operand:V16SI 3 "register_operand")] - "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512" -{ - /* Emulate with vpdpwssd. 
*/ - rtx op1_lo = gen_reg_rtx (V32HImode); - rtx op1_hi = gen_reg_rtx (V32HImode); - rtx op2_lo = gen_reg_rtx (V32HImode); - rtx op2_hi = gen_reg_rtx (V32HImode); - - emit_insn (gen_vec_unpacku_lo_v64qi (op1_lo, operands[1])); - emit_insn (gen_vec_unpacku_lo_v64qi (op2_lo, operands[2])); - emit_insn (gen_vec_unpacku_hi_v64qi (op1_hi, operands[1])); - emit_insn (gen_vec_unpacku_hi_v64qi (op2_hi, operands[2])); - - rtx res1 = gen_reg_rtx (V16SImode); - rtx res2 = gen_reg_rtx (V16SImode); - rtx sum = gen_reg_rtx (V16SImode); - - emit_move_insn (sum, CONST0_RTX (V16SImode)); - emit_insn (gen_sdot_prodv32hi (res1, op1_lo, op2_lo, sum)); - emit_insn (gen_sdot_prodv32hi (res2, op1_hi, op2_hi, operands[3])); - - emit_insn (gen_addv16si3 (operands[0], res1, res2)); - DONE; -}) - (define_insn "vpdp<vpdotprodtype>_<mode>" [(set (match_operand:VI4_AVX 0 "register_operand" "=v") (unspec:VI4_AVX @@ -31755,12 +31836,12 @@ (UNSPEC_VPDPWSUD "wsud") (UNSPEC_VPDPWSUDS "wsuds") (UNSPEC_VPDPWUUD "wuud") (UNSPEC_VPDPWUUDS "wuuds")]) -(define_expand "usdot_prod<mode>" +(define_expand "usdot_prod<sseunpackmodelower><mode>" [(match_operand:<sseunpackmode> 0 "register_operand") - (match_operand:VI2_AVX2 1 "register_operand") - (match_operand:VI2_AVX2 2 "register_operand") + (match_operand:VI2_AVX10_2 1 "register_operand") + (match_operand:VI2_AVX10_2 2 "register_operand") (match_operand:<sseunpackmode> 3 "register_operand")] - "TARGET_AVXVNNIINT16" + "TARGET_AVXVNNIINT16 || TARGET_AVX10_2_256" { operands[1] = lowpart_subreg (<sseunpackmode>mode, force_reg (<MODE>mode, operands[1]), @@ -31773,12 +31854,12 @@ DONE; }) -(define_expand "udot_prod<mode>" +(define_expand "udot_prod<sseunpackmodelower><mode>" [(match_operand:<sseunpackmode> 0 "register_operand") - (match_operand:VI2_AVX2 1 "register_operand") - (match_operand:VI2_AVX2 2 "register_operand") + (match_operand:VI2_AVX10_2 1 "register_operand") + (match_operand:VI2_AVX10_2 2 "register_operand") (match_operand:<sseunpackmode> 3 "register_operand")] - "TARGET_AVXVNNIINT16" + "TARGET_AVXVNNIINT16 || TARGET_AVX10_2_256" { operands[1] = lowpart_subreg (<sseunpackmode>mode, force_reg (<MODE>mode, operands[1]), @@ -31916,6 +31997,13 @@ "vscalefpbf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" [(set_attr "prefix" "evex")]) +(define_expand "<code><mode>3" + [(set (match_operand:VBF_AVX10_2 0 "register_operand") + (smaxmin:VBF_AVX10_2 + (match_operand:VBF_AVX10_2 1 "register_operand") + (match_operand:VBF_AVX10_2 2 "nonimmediate_operand")))] + "TARGET_AVX10_2_256") + (define_insn "avx10_2_<code>pbf16_<mode><mask_name>" [(set (match_operand:VBF_AVX10_2 0 "register_operand" "=v") (smaxmin:VBF_AVX10_2 @@ -31988,7 +32076,7 @@ (fma:VBF_AVX10_2 (match_operand:VBF_AVX10_2 1 "nonimmediate_operand" "%v") (match_operand:VBF_AVX10_2 2 "nonimmediate_operand" "vm") - (match_operand:VBF_AVX10_2 3 "register_operand" "0")) + (match_operand:VBF_AVX10_2 3 "nonimmediate_operand" "0")) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] "TARGET_AVX10_2_256" @@ -32053,7 +32141,7 @@ (neg:VBF_AVX10_2 (match_operand:VBF_AVX10_2 1 "nonimmediate_operand" "%v")) (match_operand:VBF_AVX10_2 2 "nonimmediate_operand" "vm") - (match_operand:VBF_AVX10_2 3 "register_operand" "0")) + (match_operand:VBF_AVX10_2 3 "nonimmediate_operand" "0")) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] "TARGET_AVX10_2_256" @@ -32118,7 +32206,7 @@ (match_operand:VBF_AVX10_2 1 "nonimmediate_operand" "%v") (match_operand:VBF_AVX10_2 2 "nonimmediate_operand" 
"vm") (neg:VBF_AVX10_2 - (match_operand:VBF_AVX10_2 3 "register_operand" "0"))) + (match_operand:VBF_AVX10_2 3 "nonimmediate_operand" "0"))) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] "TARGET_AVX10_2_256" @@ -32186,7 +32274,7 @@ (match_operand:VBF_AVX10_2 1 "nonimmediate_operand" "%v")) (match_operand:VBF_AVX10_2 2 "nonimmediate_operand" "vm") (neg:VBF_AVX10_2 - (match_operand:VBF_AVX10_2 3 "register_operand" "0"))) + (match_operand:VBF_AVX10_2 3 "nonimmediate_operand" "0"))) (match_dup 3) (match_operand:<avx512fmaskmode> 4 "register_operand" "Yk")))] "TARGET_AVX10_2_256" @@ -32274,8 +32362,12 @@ (define_insn "avx10_2_comsbf16_v8bf" [(set (reg:CCFP FLAGS_REG) (unspec:CCFP - [(match_operand:V8BF 0 "register_operand" "v") - (match_operand:V8BF 1 "nonimmediate_operand" "vm")] + [(vec_select:BF + (match_operand:V8BF 0 "register_operand" "v") + (parallel [(const_int 0)])) + (vec_select:BF + (match_operand:V8BF 1 "nonimmediate_operand" "vm") + (parallel [(const_int 0)]))] UNSPEC_VCOMSBF16))] "TARGET_AVX10_2_256" "vcomsbf16\t{%1, %0|%0, %1}" @@ -32485,3 +32577,15 @@ "vminmax<ssescalarmodesuffix>\t{%3, <round_saeonly_scalar_mask_op4>%2, %1, %0<mask_scalar_operand4>|%0<mask_scalar_operand4>, %1, %2<round_saeonly_scalar_mask_op4>, %3}" [(set_attr "prefix" "evex") (set_attr "mode" "<ssescalarmode>")]) + +(define_insn "avx10_2_vmovrs<ssemodesuffix><mode><mask_name>" + [(set (match_operand:VI1248_AVX10_2 0 "register_operand" "=v") + (unspec:VI1248_AVX10_2 + [(match_operand:VI1248_AVX10_2 1 "memory_operand" "m")] + UNSPEC_VMOVRS))] + "TARGET_AVX10_2_256 && TARGET_MOVRS" + "vmovrs<ssemodesuffix>\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "evex") + (set_attr "memory" "load") + (set_attr "mode" "<sseinsnmode>")]) diff --git a/gcc/config/i386/stringop.def b/gcc/config/i386/stringop.def index 07de314..7de514d 100644 --- a/gcc/config/i386/stringop.def +++ b/gcc/config/i386/stringop.def @@ -13,7 +13,7 @@ but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. -You should have received a copy of the GNU General Public License +You should have received a copy of the GNU General Public License along with GCC; see the files COPYING3. If not, see <http://www.gnu.org/licenses/>. 
*/ diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index ca53413..3722b0d 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -366,6 +366,8 @@ (define_subst_attr "mask_scalarcz_operand4" "mask_scalarcz" "" "%{%5%}%N4") (define_subst_attr "mask_scalar4_dest_false_dep_for_glc_cond" "mask_scalar" "1" "operands[4] == CONST0_RTX(<MODE>mode)") (define_subst_attr "mask_scalarc_dest_false_dep_for_glc_cond" "mask_scalarc" "1" "operands[3] == CONST0_RTX(V8HFmode)") +(define_subst_attr "mask_scalar_operand_arg34" "mask_scalar" "" ", operands[3], operands[4]") +(define_subst_attr "mask_scalar_expand_op3" "mask_scalar" "3" "5") (define_subst "mask_scalar" [(set (match_operand:SUBST_V 0) @@ -473,6 +475,7 @@ (define_subst_attr "round_saeonly_scalar_constraint" "round_saeonly_scalar" "vm" "v") (define_subst_attr "round_saeonly_scalar_prefix" "round_saeonly_scalar" "vex" "evex") (define_subst_attr "round_saeonly_scalar_nimm_predicate" "round_saeonly_scalar" "nonimmediate_operand" "register_operand") +(define_subst_attr "round_saeonly_scalar_mask_arg3" "round_saeonly_scalar" "" ", operands[<mask_scalar_expand_op3>]") (define_subst "round_saeonly_scalar" [(set (match_operand:SUBST_V 0) diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md index f2b3ba0..f03d418 100644 --- a/gcc/config/i386/sync.md +++ b/gcc/config/i386/sync.md @@ -170,7 +170,7 @@ if (<MODE>mode == DImode && !TARGET_64BIT) emit_insn (gen_atomic_loaddi_fpu (operands[0], operands[1], - assign_386_stack_local (DImode, SLOT_TEMP))); + assign_stack_temp (DImode, GET_MODE_SIZE (DImode)))); else { rtx dst = operands[0]; @@ -251,7 +251,7 @@ out to be significantly larger than this plus a barrier. */ emit_insn (gen_atomic_storedi_fpu (operands[0], operands[1], - assign_386_stack_local (DImode, SLOT_TEMP))); + assign_stack_temp (DImode, GET_MODE_SIZE (DImode)))); } else { diff --git a/gcc/config/i386/wmmintrin.h b/gcc/config/i386/wmmintrin.h index 34ddd3e..c9ebbb9 100644 --- a/gcc/config/i386/wmmintrin.h +++ b/gcc/config/i386/wmmintrin.h @@ -38,17 +38,17 @@ #define __DISABLE_AES__ #endif /* __AES__ */ -/* Performs 1 round of AES decryption of the first m128i using +/* Performs 1 round of AES decryption of the first m128i using the second m128i as a round key. */ #define _mm_aesdec_si128(X, Y) \ (__m128i) __builtin_ia32_aesdec128 ((__v2di) (X), (__v2di) (Y)) -/* Performs the last round of AES decryption of the first m128i +/* Performs the last round of AES decryption of the first m128i using the second m128i as a round key. */ #define _mm_aesdeclast_si128(X, Y) \ (__m128i) __builtin_ia32_aesdeclast128 ((__v2di) (X), (__v2di) (Y)) -/* Performs 1 round of AES encryption of the first m128i using +/* Performs 1 round of AES encryption of the first m128i using the second m128i as a round key. */ #define _mm_aesenc_si128(X, Y) \ (__m128i) __builtin_ia32_aesenc128 ((__v2di) (X), (__v2di) (Y)) @@ -58,7 +58,7 @@ #define _mm_aesenclast_si128(X, Y) \ (__m128i) __builtin_ia32_aesenclast128 ((__v2di) (X), (__v2di) (Y)) -/* Performs the InverseMixColumn operation on the source m128i +/* Performs the InverseMixColumn operation on the source m128i and stores the result into m128i destination. 
*/ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_aesimc_si128 (__m128i __X) diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 2bfaee5..1b3227a 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -2034,6 +2034,7 @@ struct processor_costs znver5_cost = { COSTS_N_INSNS (1), /* cost of a lea instruction. */ COSTS_N_INSNS (1), /* variable shift costs. */ COSTS_N_INSNS (1), /* constant shift costs. */ + /* mul has latency 3, executes in 3 integer units. */ {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ COSTS_N_INSNS (3), /* HI. */ COSTS_N_INSNS (3), /* SI. */ @@ -2041,6 +2042,8 @@ struct processor_costs znver5_cost = { COSTS_N_INSNS (3)}, /* other. */ 0, /* cost of multiply per each bit set. */ + /* integer divide has latency of 8 cycles + plus 1 for every 9 bits of quotient. */ {COSTS_N_INSNS (10), /* cost of a divide/mod for QI. */ COSTS_N_INSNS (11), /* HI. */ COSTS_N_INSNS (13), /* SI. */ @@ -2048,7 +2051,7 @@ struct processor_costs znver5_cost = { COSTS_N_INSNS (16)}, /* other. */ COSTS_N_INSNS (1), /* cost of movsx. */ COSTS_N_INSNS (1), /* cost of movzx. */ - 8, /* "large" insn. */ + 15, /* "large" insn. */ 9, /* MOVE_RATIO. */ 6, /* CLEAR_RATIO */ {6, 6, 6}, /* cost of loading integer registers @@ -2065,12 +2068,13 @@ struct processor_costs znver5_cost = { 2, 2, 2, /* cost of moving XMM,YMM,ZMM register. */ 6, /* cost of moving SSE register to integer. */ - /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops, - throughput 5. Approx 7 uops do not depend on vector size and every load - is 5 uops. */ + + /* TODO: gather and scatter instructions are currently disabled in + x86-tune.def. In some cases they are however a win, see PR116582. + We however need a good cost model for them. */ 14, 10, /* Gather load static, per_elt. */ 14, 20, /* Gather store static, per_elt. */ - 32, /* size of l1 cache. */ + 48, /* size of l1 cache. */ 1024, /* size of l2 cache. */ 64, /* size of prefetch block. */ /* New AMD processors never drop prefetches; if they cannot be performed @@ -2080,6 +2084,8 @@ struct processor_costs znver5_cost = { time). */ 100, /* number of parallel prefetches. */ 3, /* Branch cost. */ + /* TODO: x87 latencies are still based on znver4. + Probably not very important these days. */ COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */ COSTS_N_INSNS (7), /* cost of FMUL instruction. */ /* Latency of fdiv is 8-15. */ @@ -2089,27 +2095,38 @@ /* Latency of fsqrt is 4-10. */ COSTS_N_INSNS (25), /* cost of FSQRT instruction. */ + /* SSE instructions have typical throughput 4 and latency 1. */ COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ - COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ + /* ADDSS has throughput 2 and latency 2 + (in some cases when source is another addition). */ + COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ + /* MULSS has throughput 2 and latency 3. */ COSTS_N_INSNS (3), /* cost of MULSS instruction. */ COSTS_N_INSNS (3), /* cost of MULSD instruction. */ + /* FMA has throughput 2 and latency 4. */ COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ + /* DIVSS has throughput 0.4 and latency 10. */ COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ - /* 9-13. */ + /* DIVSD has throughput 0.25 and latency 13. */ COSTS_N_INSNS (13), /* cost of DIVSD instruction.
*/ + /* SQRTSS has throughput 0.22 and latency 14. */ COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ + /* SQRTSD has throughput 0.13 and latency 20. */ COSTS_N_INSNS (20), /* cost of SQRTSD instruction. */ - /* Zen can execute 4 integer operations per cycle. FP operations - take 3 cycles and it can execute 2 integer additions and 2 - multiplications thus reassociation may make sense up to with of 6. - SPEC2k6 bencharks suggests - that 4 works better than 6 probably due to register pressure. - - Integer vector operations are taken by FP unit and execute 3 vector - plus/minus operations per cycle but only one multiply. This is adjusted - in ix86_reassociation_width. */ - 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ + /* Zen5 can execute: + - integer ops: 6 per cycle, at most 3 multiplications. + latency 1 for additions, 3 for multiplications (pipelined) + + Setting width of 9 for multiplication is probably excessive + for register pressure. + - fp ops: 2 additions per cycle, latency 2-3 + 2 multiplications per cycle, latency 3 + - vector integer ops: 4 additions, latency 1 + 2 multiplications, latency 4 + We increase width to 6 for multiplications + in ix86_reassociation_width. */ + 6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */ znver2_memcpy, znver2_memset, COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc index d77298b..4ebdf11 100644 --- a/gcc/config/i386/x86-tune-sched.cc +++ b/gcc/config/i386/x86-tune-sched.cc @@ -67,7 +67,6 @@ ix86_issue_rate (void) case PROCESSOR_ZNVER2: case PROCESSOR_ZNVER3: case PROCESSOR_ZNVER4: - case PROCESSOR_ZNVER5: case PROCESSOR_CORE2: case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: @@ -91,6 +90,13 @@ ix86_issue_rate (void) return 5; case PROCESSOR_SAPPHIRERAPIDS: + /* For znver5 the decoder can handle 4 or 8 instructions per cycle, + op cache 12 instructions/cycle, dispatch 8 instructions, + integer rename 8 instructions and FP 6 instructions. + + The scheduler, without understanding the out-of-order nature of the CPU, + is unlikely to be able to fill all of these. */ + case PROCESSOR_ZNVER5: return 6; default: @@ -434,6 +440,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, enum attr_unit unit = get_attr_unit (insn); int loadcost; + /* TODO: On znver5 complex addressing modes have + greater latency. */ if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) loadcost = 4; else @@ -563,6 +571,60 @@ ix86_macro_fusion_p () return TARGET_FUSE_CMP_AND_BRANCH; } +static bool +ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu) +{ + /* Validate mov: + - It should be reg-reg move with opcode 0x89 or 0x8B. */ rtx set1 = PATTERN (mov); + if (GET_CODE (set1) != SET + || !GENERAL_REG_P (SET_SRC (set1)) + || !GENERAL_REG_P (SET_DEST (set1))) + return false; + rtx reg = SET_DEST (set1); + /* - it should have 0x89 or 0x8B opcode. */ + if (!INTEGRAL_MODE_P (GET_MODE (reg)) + || GET_MODE_SIZE (GET_MODE (reg)) < 2 + || GET_MODE_SIZE (GET_MODE (reg)) > 8) + return false; + /* Validate ALU. */ + if (GET_CODE (PATTERN (alu)) != PARALLEL) + return false; + rtx set2 = XVECEXP (PATTERN (alu), 0, 0); + if (GET_CODE (set2) != SET) + return false; + /* Match one of: + ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR + We may also add an insn attribute to handle some of the sporadic + cases where we output those with different RTX expressions.
*/ + + if (GET_CODE (SET_SRC (set2)) != PLUS + && GET_CODE (SET_SRC (set2)) != MINUS + && GET_CODE (SET_SRC (set2)) != XOR + && GET_CODE (SET_SRC (set2)) != AND + && GET_CODE (SET_SRC (set2)) != IOR + && GET_CODE (SET_SRC (set2)) != NOT + && GET_CODE (SET_SRC (set2)) != ASHIFT + && GET_CODE (SET_SRC (set2)) != ASHIFTRT + && GET_CODE (SET_SRC (set2)) != LSHIFTRT) + return false; + rtx op0 = XEXP (SET_SRC (set2), 0); + rtx op1 = GET_CODE (SET_SRC (set2)) != NOT ? XEXP (SET_SRC (set2), 1) : NULL; + /* One of the operands should be the register. */ + if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg))) + std::swap (op0, op1); + if (!REG_P (op0) || REGNO (op0) != REGNO (reg)) + return false; + if (op1 + && !REG_P (op1) + && !x86_64_immediate_operand (op1, VOIDmode)) + return false; + /* Only one of the two parameters may be the move destination. */ + if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg)) + return false; + return true; +} + /* Check whether current microarchitecture support macro fusion for insn pair "CONDGEN + CONDJMP". Refer to "Intel Architectures Optimization Reference Manual". */ @@ -570,6 +632,9 @@ ix86_macro_fusion_p () bool ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp) { + if (TARGET_FUSE_MOV_AND_ALU + && ix86_fuse_mov_alu_p (condgen, condjmp)) + return true; rtx src, dest; enum rtx_code ccode; rtx compare_set = NULL_RTX, test_if, cond; diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 3d29bff..6ebb2fd 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -143,10 +143,18 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional jump instruction when the alu instruction produces the CCFLAG consumed by - the conditional jump instruction. */ + the conditional jump instruction. + + TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR and AND. + There are also limitations on the immediates and displacements supported. */ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", - m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC) + m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5) +/* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is a reg-reg mov + and the destination is used by the alu. The alu must be one of + ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR. */ +DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", + m_ZNVER5 | m_GRANITERAPIDS | m_GRANITERAPIDS_D) /*****************************************************************************/ /* Function prologue, epilogue and function calling sequences. */ /*****************************************************************************/ @@ -476,55 +484,64 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2 elements. */ DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts", - ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID + ~(m_ZNVER | m_CORE_HYBRID | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2 elements. */ DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts", - ~(m_ZNVER4)) + ~(m_ZNVER4 | m_ZNVER5)) /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4 elements.
*/ DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts", - ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID + ~(m_ZNVER | m_CORE_HYBRID | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4 elements. */ DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts", - ~(m_ZNVER4)) + ~(m_ZNVER4 | m_ZNVER5)) /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more elements. */ DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts", - ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM + ~(m_ZNVER | m_CORE_HYBRID | m_CORE_ATOM | m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS)) /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more elements. */ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts", - ~(m_ZNVER4)) + ~(m_ZNVER4 | m_ZNVER5)) /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or smaller FMA chain. */ -DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 +DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER | m_YONGFENG | m_SHIJIDADAO | m_GENERIC) /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or smaller FMA chain. */ -DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 | m_ZNVER4 - | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC) +DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", + m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID + | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC) /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or smaller FMA chain. */ -DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE) +DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5) /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd for v2df vector reduction. */ DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD, "v2df_reduction_prefer_haddpd", m_NONE) +/* X86_TUNE_SSE_MOVCC_USE_BLENDV: Prefer blendv instructions to + 3-instruction sequence (op1 & mask) | (op2 & ~mask) + for vector condition move. + For Crestmont, 4-operand vex blendv instructions come from MSROM + which is slow. */ +DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV, + "sse_movcc_use_blendv", ~m_CORE_ATOM) + /*****************************************************************************/ /* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ /*****************************************************************************/ @@ -541,7 +558,7 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal" /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops. */ DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs",m_BDVER | m_BTVER2 - | m_ZNVER1) + | m_ZNVER1 | m_CORE_ATOM) /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for the auto-vectorizer. */ @@ -552,6 +569,11 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2 instructions in the auto-vectorizer. */ DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512) +/* X86_TUNE_AVX256_AVOID_VEC_PERM: Avoid using 256-bit cross-lane + vector permutation instructions in the auto-vectorizer. */ +DEF_TUNE (X86_TUNE_AVX256_AVOID_VEC_PERM, + "avx256_avoid_vec_perm", m_CORE_ATOM) + /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 ops. 
*/ DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4) diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h index 7f10f96..0864b2b 100644 --- a/gcc/config/i386/xmmintrin.h +++ b/gcc/config/i386/xmmintrin.h @@ -38,6 +38,7 @@ enum _mm_hint { _MM_HINT_IT0 = 19, _MM_HINT_IT1 = 18, + _MM_HINT_RST2 = 9, /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit. */ _MM_HINT_ET0 = 7, _MM_HINT_T0 = 3, @@ -52,12 +53,12 @@ enum _mm_hint extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_prefetch (const void *__P, enum _mm_hint __I) { - __builtin_ia32_prefetch (__P, (__I & 0x4) >> 2, + __builtin_ia32_prefetch (__P, (__I & 0xC) >> 2, __I & 0x3, (__I & 0x10) >> 4); } #else #define _mm_prefetch(P, I) \ - __builtin_ia32_prefetch ((P), ((I) & 0x4) >> 2, ((I) & 0x3), ((I) & 0x10) >> 4) + __builtin_ia32_prefetch ((P), ((I) & 0xC) >> 2, ((I) & 0x3), ((I) & 0x10) >> 4) #endif #ifndef __SSE__ diff --git a/gcc/config/ia64/freebsd.h b/gcc/config/ia64/freebsd.h index c1eb8d5..bc8366b 100644 --- a/gcc/config/ia64/freebsd.h +++ b/gcc/config/ia64/freebsd.h @@ -36,7 +36,7 @@ along with GCC; see the file COPYING3. If not see /************************[ Target stuff ]***********************************/ -/* Define the actual types of some ANSI-mandated types. +/* Define the actual types of some ANSI-mandated types. Needs to agree with <machine/ansi.h>. GCC defaults come from c-decl.cc, c-common.cc, and config/<arch>/<arch>.h. */ diff --git a/gcc/config/ia64/ia64.cc b/gcc/config/ia64/ia64.cc index cd6ed89..4acbd82 100644 --- a/gcc/config/ia64/ia64.cc +++ b/gcc/config/ia64/ia64.cc @@ -352,7 +352,7 @@ struct expand_vec_perm_d machine_mode vmode; unsigned char nelt; bool one_operand_p; - bool testing_p; + bool testing_p; }; static bool ia64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d); @@ -619,9 +619,6 @@ static const scoped_attribute_specs *const ia64_attribute_table[] = #undef TARGET_LEGITIMATE_ADDRESS_P #define TARGET_LEGITIMATE_ADDRESS_P ia64_legitimate_address_p -#undef TARGET_LRA_P -#define TARGET_LRA_P hook_bool_void_false - #undef TARGET_CANNOT_FORCE_CONST_MEM #define TARGET_CANNOT_FORCE_CONST_MEM ia64_cannot_force_const_mem @@ -818,7 +815,7 @@ ia64_vms_common_object_attribute (tree *node, tree name, tree args, tree id; gcc_assert (DECL_P (decl)); - + DECL_COMMON (decl) = 1; id = TREE_VALUE (args); if (TREE_CODE (id) != IDENTIFIER_NODE && TREE_CODE (id) != STRING_CST) @@ -1045,7 +1042,7 @@ ia64_legitimate_address_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x, return true; else if ((GET_CODE (x) == POST_INC || GET_CODE (x) == POST_DEC) && ia64_legitimate_address_reg (XEXP (x, 0), strict) - && XEXP (x, 0) != arg_pointer_rtx) + && XEXP (x, 0) != arg_pointer_rtx) return true; else if (GET_CODE (x) == POST_MODIFY && ia64_legitimate_address_reg (XEXP (x, 0), strict) @@ -1333,7 +1330,7 @@ ia64_expand_move (rtx op0, rtx op1) { machine_mode mode = GET_MODE (op0); - if (!reload_in_progress && !reload_completed && !ia64_move_ok (op0, op1)) + if (!lra_in_progress && !reload_completed && !ia64_move_ok (op0, op1)) op1 = force_reg (mode, op1); if ((mode == Pmode || mode == ptr_mode) && symbolic_operand (op1, VOIDmode)) @@ -1359,7 +1356,7 @@ ia64_expand_move (rtx op0, rtx op1) else if (aligned_offset_symbol_operand (sym, mode)) { HOST_WIDE_INT addend_lo, addend_hi; - + addend_lo = ((addend & 0x3fff) ^ 0x2000) - 0x2000; addend_hi = addend - addend_lo; @@ -1444,7 +1441,7 @@ ia64_split_tmode (rtx out[2], rtx in, bool reversed, bool dead) case CONST_DOUBLE: /* 
Cannot occur reversed. */ gcc_assert (!reversed); - + if (GET_MODE (in) != TFmode) split_double (in, &out[0], &out[1]); else @@ -1499,7 +1496,7 @@ ia64_split_tmode (rtx out[2], rtx in, bool reversed, bool dead) case POST_INC: gcc_assert (!reversed && !dead); - + /* Just do the increment in two steps. */ out[0] = adjust_automodify_address (in, DImode, 0, 0); out[1] = adjust_automodify_address (in, DImode, 0, 8); @@ -1507,7 +1504,7 @@ ia64_split_tmode (rtx out[2], rtx in, bool reversed, bool dead) case POST_DEC: gcc_assert (!reversed && !dead); - + /* Add 8, subtract 24. */ base = XEXP (base, 0); out[0] = adjust_automodify_address @@ -1595,7 +1592,7 @@ ia64_split_tmode_move (rtx operands[]) the appropriate order so that the pointer is not destroyed too early. Also we must not generate a postmodify for that second load, or rws_access_regno will die. And we must not generate a - postmodify for the second load if the destination register + postmodify for the second load if the destination register overlaps with the base register. */ if (GET_CODE (operands[1]) == MEM && reg_overlap_mentioned_p (operands[0], operands[1])) @@ -1780,7 +1777,7 @@ ia64_expand_movxf_movrf (machine_mode mode, rtx operands[]) } } - if (!reload_in_progress && !reload_completed) + if (!lra_in_progress && !reload_completed) { operands[1] = spill_xfmode_rfmode_operand (operands[1], 0, mode); @@ -1841,7 +1838,7 @@ ia64_expand_compare (rtx *expr, rtx *op0, rtx *op1) int magic; enum rtx_code ncode; rtx ret; - + gcc_assert (cmptf_libfunc && GET_MODE (*op1) == TFmode); switch (code) { @@ -2865,7 +2862,7 @@ ia64_compute_frame_size (HOST_WIDE_INT size) if (df_regs_ever_live_p (AR_PFS_REGNUM)) { SET_HARD_REG_BIT (mask, AR_PFS_REGNUM); - current_frame_info.r[reg_save_ar_pfs] + current_frame_info.r[reg_save_ar_pfs] = find_gr_spill (reg_save_ar_pfs, 1); if (current_frame_info.r[reg_save_ar_pfs] == 0) { @@ -2880,8 +2877,8 @@ ia64_compute_frame_size (HOST_WIDE_INT size) it is absolutely critical that FP get the only hard register that's guaranteed to be free, so we allocated it first. If all three did happen to be allocated hard regs, and are consecutive, rearrange them - into the preferred order now. - + into the preferred order now. + If we have already emitted code for any of those registers, then it's already too late to change. */ min_regno = MIN (current_frame_info.r[reg_fp], @@ -2935,7 +2932,7 @@ ia64_compute_frame_size (HOST_WIDE_INT size) { df_set_regs_ever_live (AR_UNAT_REGNUM, true); SET_HARD_REG_BIT (mask, AR_UNAT_REGNUM); - current_frame_info.r[reg_save_ar_unat] + current_frame_info.r[reg_save_ar_unat] = find_gr_spill (reg_save_ar_unat, spill_size == 0); if (current_frame_info.r[reg_save_ar_unat] == 0) { @@ -2947,7 +2944,7 @@ ia64_compute_frame_size (HOST_WIDE_INT size) if (df_regs_ever_live_p (AR_LC_REGNUM)) { SET_HARD_REG_BIT (mask, AR_LC_REGNUM); - current_frame_info.r[reg_save_ar_lc] + current_frame_info.r[reg_save_ar_lc] = find_gr_spill (reg_save_ar_lc, spill_size == 0); if (current_frame_info.r[reg_save_ar_lc] == 0) { @@ -3534,7 +3531,7 @@ ia64_expand_prologue (void) ia64_emit_probe_stack_range (get_stack_check_protect (), size, bs_size); } - if (dump_file) + if (dump_file) { fprintf (dump_file, "ia64 frame related registers " "recorded in current_frame_info.r[]:\n"); @@ -4148,7 +4145,7 @@ ia64_expand_epilogue (int sibcall_p) names of r2 and HARD_FRAME_POINTER_REGNUM, so we have to make sure we're using the string "r2" when emitting the register name for the assembler. 
*/ - if (current_frame_info.r[reg_fp] + if (current_frame_info.r[reg_fp] && current_frame_info.r[reg_fp] == GR_REG (2)) fp = HARD_FRAME_POINTER_REGNUM; @@ -4275,7 +4272,7 @@ ia64_hard_regno_rename_ok (int from, int to) unsigned int r; for (r = reg_fp; r <= reg_save_ar_lc; r++) - if (to == current_frame_info.r[r] + if (to == current_frame_info.r[r] || from == current_frame_info.r[r] || to == emitted_frame_related_regs[r] || from == emitted_frame_related_regs[r]) @@ -4884,7 +4881,7 @@ ia64_function_arg_1 (cumulative_args_t cum_v, const function_arg_info &arg, } return gen_rtx_PARALLEL (arg.mode, gen_rtvec_v (i, loc)); } - + /* Integral and aggregates go in general registers. If we have run out of FR registers, then FP values must also go in general registers. This can happen when we have a SFmode HFA. */ @@ -5234,7 +5231,7 @@ ia64_function_value (const_tree valtype, if (fn_decl_or_type && !DECL_P (fn_decl_or_type)) func = NULL; - + mode = TYPE_MODE (valtype); hfa_mode = hfa_element_mode (valtype, 0); @@ -5880,7 +5877,7 @@ ia64_preferred_reload_class (rtx x, reg_class_t rclass) of the f/f case when reloading (set (reg fX) (mem/v)). */ if (MEM_P (x) && MEM_VOLATILE_P (x)) return NO_REGS; - + /* Force all unrecognized constants into the constant pool. */ if (CONSTANT_P (x)) return NO_REGS; @@ -6490,7 +6487,7 @@ update_set_flags (rtx x, struct reg_flags *pflags) doloop_end_internal, (3) The destination is an fp register, in which case this is an fselect instruction. - (4) The condition has (unspec [(reg)] UNSPEC_LDC), in which case + (4) The condition has (unspec [(reg)] UNSPEC_LDC), in which case this is a check load. In all cases, nothing we do in this function applies. */ return; @@ -6542,12 +6539,12 @@ set_src_needs_barrier (rtx x, struct reg_flags flags, int pred) } if (ia64_spec_check_src_p (src)) - /* Avoid checking one register twice (in condition + /* Avoid checking one register twice (in condition and in 'then' section) for ldc pattern. */ { gcc_assert (REG_P (XEXP (src, 2))); need_barrier = rtx_needs_barrier (XEXP (src, 2), flags, pred); - + /* We process MEM below. 
*/ src = XEXP (src, 1); } @@ -7438,7 +7435,7 @@ static void ia64_sched_init_global (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose ATTRIBUTE_UNUSED, int max_ready ATTRIBUTE_UNUSED) -{ +{ gcc_assert (pending_data_specs == 0); } @@ -7643,7 +7640,7 @@ ia64_variable_issue (FILE *dump ATTRIBUTE_UNUSED, if (reload_completed) { int needed = group_barrier_needed (insn); - + gcc_assert (!needed); if (CALL_P (insn)) init_insn_group_barriers (); @@ -7777,7 +7774,7 @@ ia64_dfa_new_cycle (FILE *dump, int verbose, rtx_insn *insn, int last_clock, static void ia64_h_i_d_extended (void) { - if (stops_p != NULL) + if (stops_p != NULL) { int new_clocks_length = get_max_uid () * 3 / 2; stops_p = (char *) xrecalloc (stops_p, new_clocks_length, clocks_length, 1); @@ -7864,7 +7861,7 @@ static void ia64_clear_sched_context (void *_sc) { ia64_sched_context_t sc = (ia64_sched_context_t) _sc; - + free (sc->prev_cycle_state); sc->prev_cycle_state = NULL; } @@ -8043,13 +8040,13 @@ ia64_set_sched_flags (spec_info_t spec_info) || (mflag_sched_ar_in_data_spec && reload_completed))) mask |= BE_IN_DATA; } - + if (mflag_sched_control_spec && (!sel_sched_p () || reload_completed)) { mask |= BEGIN_CONTROL; - + if (!sel_sched_p () && mflag_sched_in_control_spec) mask |= BE_IN_CONTROL; } @@ -8062,9 +8059,9 @@ ia64_set_sched_flags (spec_info_t spec_info) if (mask & BE_IN_SPEC) *flags |= NEW_BBS; - + spec_info->flags = 0; - + if ((mask & CONTROL_SPEC) && sel_sched_p () && mflag_sel_sched_dont_check_control_spec) spec_info->flags |= SEL_SCHED_SPEC_DONT_CHECK_CONTROL; @@ -8073,7 +8070,7 @@ ia64_set_sched_flags (spec_info_t spec_info) spec_info->dump = sched_dump; else spec_info->dump = 0; - + if (mflag_sched_count_spec_in_critical_path) spec_info->flags |= COUNT_SPEC_IN_CRITICAL_PATH; } @@ -8320,10 +8317,10 @@ insn_can_be_in_speculative_p (rtx insn ATTRIBUTE_UNUSED, return 0. */ static int ia64_speculate_insn (rtx_insn *insn, ds_t ts, rtx *new_pat) -{ +{ int mode_no; int res; - + gcc_assert (!(ts & ~SPECULATIVE)); if (ia64_spec_check_p (insn)) @@ -8510,12 +8507,12 @@ ia64_gen_spec_check (rtx_insn *insn, rtx_insn *label, ds_t ds) gcc_assert (!ia64_needs_block_p (ds)); op1 = copy_rtx (recog_data.operand[1]); } - + gen_check = get_spec_check_gen_function (ds, mode_no, label == NULL_RTX, true); check_pat = gen_check (copy_rtx (recog_data.operand[0]), op1); - + pat = PATTERN (insn); if (GET_CODE (pat) == COND_EXEC) check_pat = gen_rtx_COND_EXEC (VOIDmode, copy_rtx (COND_EXEC_TEST (pat)), @@ -8547,14 +8544,14 @@ ia64_spec_check_src_p (rtx src) t = XEXP (src, 0); if (GET_CODE (t) == NE) { - t = XEXP (t, 0); + t = XEXP (t, 0); if (GET_CODE (t) == UNSPEC) { int code; - + code = XINT (t, 1); - + if (code == UNSPEC_LDCCLR || code == UNSPEC_LDCNC || code == UNSPEC_CHKACLR @@ -9266,7 +9263,7 @@ bundling (FILE *dump, int verbose, rtx_insn *prev_head_insn, rtx_insn *tail) INSN_UID (insn)); } } - + /* We should find a solution because the 2nd insn scheduling has found one. 
*/ gcc_assert (index_to_bundle_states [insn_num]); @@ -9646,7 +9643,7 @@ ia64_st_address_bypass_p (rtx_insn *producer, rtx_insn *consumer) if (GET_CODE (reg) == SUBREG) reg = SUBREG_REG (reg); gcc_assert (GET_CODE (reg) == REG); - + dest = ia64_single_set (consumer); gcc_assert (dest); mem = SET_DEST (dest); @@ -9670,12 +9667,12 @@ ia64_ld_address_bypass_p (rtx_insn *producer, rtx_insn *consumer) if (GET_CODE (reg) == SUBREG) reg = SUBREG_REG (reg); gcc_assert (GET_CODE (reg) == REG); - + src = ia64_single_set (consumer); gcc_assert (src); mem = SET_SRC (src); gcc_assert (mem); - + if (GET_CODE (mem) == UNSPEC && XVECLEN (mem, 0) > 0) mem = XVECEXP (mem, 0, 0); else if (GET_CODE (mem) == IF_THEN_ELSE) @@ -9684,7 +9681,7 @@ ia64_ld_address_bypass_p (rtx_insn *producer, rtx_insn *consumer) gcc_assert (XINT (XEXP (XEXP (mem, 0), 0), 1) == UNSPEC_LDCCLR); mem = XEXP (mem, 1); } - + while (GET_CODE (mem) == SUBREG || GET_CODE (mem) == ZERO_EXTEND) mem = XEXP (mem, 0); @@ -9892,7 +9889,7 @@ ia64_reorg (void) emit_all_insn_group_barriers (dump_file); df_analyze (); - + /* A call must not be the last instruction in a function, so that the return address is still within the function, so that unwinding works properly. Note that IA-64 differs from dwarf2 on this point. */ @@ -10092,9 +10089,9 @@ process_cfa_adjust_cfa (FILE *out_file, rtx pat, rtx insn, { rtx op0 = XEXP (src, 0); rtx op1 = XEXP (src, 1); - + gcc_assert (op0 == dest && GET_CODE (op1) == CONST_INT); - + if (INTVAL (op1) < 0) { gcc_assert (!frame_pointer_needed); @@ -10922,7 +10919,7 @@ ia64_struct_retval_addr_is_first_parm_p (tree fntype) these return values. */ return (abi_version_at_least (2) && ret_type - && TYPE_MODE (ret_type) == BLKmode + && TYPE_MODE (ret_type) == BLKmode && TREE_ADDRESSABLE (ret_type) && lang_GNU_CXX ()); } @@ -11514,7 +11511,7 @@ expand_vec_perm_shrp (struct expand_vec_perm_d *d) static bool expand_vec_perm_1 (struct expand_vec_perm_d *d) -{ +{ unsigned i, nelt = d->nelt; unsigned char perm2[MAX_VECT_LEN]; @@ -11551,8 +11548,8 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_shrp (d)) return true; - /* ??? Look for deposit-like permutations where most of the result - comes from one vector unchanged and the rest comes from a + /* ??? Look for deposit-like permutations where most of the result + comes from one vector unchanged and the rest comes from a sequential hunk of the other vector. 
*/ return false; @@ -11639,7 +11636,7 @@ expand_vec_perm_interleave_2 (struct expand_vec_perm_d *d) h1 = h0 << nelt2; h2 = h0 << nelt; h3 = h0 << (nelt + nelt2); - + if ((contents & (h0 | h2)) == contents) /* punpck even halves */ { for (i = 0; i < nelt; ++i) @@ -11904,7 +11901,7 @@ ia64_expand_vec_setv2sf (rtx operands[3]) struct expand_vec_perm_d d; unsigned int which; bool ok; - + d.target = operands[0]; d.op0 = operands[0]; d.op1 = gen_reg_rtx (V2SFmode); diff --git a/gcc/config/ia64/ia64.md b/gcc/config/ia64/ia64.md index 698e302..d485acc 100644 --- a/gcc/config/ia64/ia64.md +++ b/gcc/config/ia64/ia64.md @@ -2318,7 +2318,7 @@ (match_operand:DI 3 "register_operand" "f")) (match_operand:DI 4 "nonmemory_operand" "rI"))) (clobber (match_scratch:DI 5 "=f"))] - "reload_in_progress" + "lra_in_progress" "#" [(set_attr "itanium_class" "unknown")]) @@ -3407,7 +3407,7 @@ (match_operand:DI 2 "shladd_operand" "n")) (match_operand:DI 3 "nonmemory_operand" "r")) (match_operand:DI 4 "nonmemory_operand" "rI")))] - "reload_in_progress" + "lra_in_progress" "* gcc_unreachable ();" "reload_completed" [(set (match_dup 0) (plus:DI (mult:DI (match_dup 1) (match_dup 2)) diff --git a/gcc/config/ia64/predicates.md b/gcc/config/ia64/predicates.md index 01a4eff..85f5380e 100644 --- a/gcc/config/ia64/predicates.md +++ b/gcc/config/ia64/predicates.md @@ -347,7 +347,7 @@ allows reload the opportunity to avoid spilling addresses to the stack, and instead simply substitute in the value from a REG_EQUIV. We'll split this up again when splitting the insn. */ - if (reload_in_progress || reload_completed) + if (lra_in_progress || reload_completed) return true; /* Some symbol types we allow to use with any offset. */ diff --git a/gcc/config/iq2000/iq2000.cc b/gcc/config/iq2000/iq2000.cc index 136675d..42935d3 100644 --- a/gcc/config/iq2000/iq2000.cc +++ b/gcc/config/iq2000/iq2000.cc @@ -1572,7 +1572,7 @@ final_prescan_insn (rtx_insn *insn, rtx opvec[] ATTRIBUTE_UNUSED, rtx_insn *nop_insn = emit_insn_after (gen_nop (), insn); INSN_ADDRESSES_NEW (nop_insn, -1); } - + if (TARGET_STATS && (JUMP_P (insn) || CALL_P (insn))) dslots_jump_total ++; @@ -1684,7 +1684,7 @@ compute_frame_size (HOST_WIDE_INT size) gp_reg_rounded = IQ2000_STACK_ALIGN (gp_reg_size); total_size += gp_reg_rounded + IQ2000_STACK_ALIGN (fp_reg_size); - /* The gp reg is caller saved, so there is no need for leaf routines + /* The gp reg is caller saved, so there is no need for leaf routines (total_size == extra_size) to save the gp reg. */ if (total_size == extra_size && ! 
profile_flag) @@ -1751,18 +1751,18 @@ iq2000_initial_elimination_offset (int from, int to ATTRIBUTE_UNUSED) { int offset; - compute_frame_size (get_frame_size ()); - if ((from) == FRAME_POINTER_REGNUM) - (offset) = 0; - else if ((from) == ARG_POINTER_REGNUM) - (offset) = (cfun->machine->total_size); - else if ((from) == RETURN_ADDRESS_POINTER_REGNUM) + compute_frame_size (get_frame_size ()); + if ((from) == FRAME_POINTER_REGNUM) + (offset) = 0; + else if ((from) == ARG_POINTER_REGNUM) + (offset) = (cfun->machine->total_size); + else if ((from) == RETURN_ADDRESS_POINTER_REGNUM) { - if (leaf_function_p ()) - (offset) = 0; - else (offset) = cfun->machine->gp_sp_offset - + ((UNITS_PER_WORD - (POINTER_SIZE / BITS_PER_UNIT)) - * (BYTES_BIG_ENDIAN != 0)); + if (leaf_function_p ()) + (offset) = 0; + else (offset) = cfun->machine->gp_sp_offset + + ((UNITS_PER_WORD - (POINTER_SIZE / BITS_PER_UNIT)) + * (BYTES_BIG_ENDIAN != 0)); } else gcc_unreachable (); @@ -1771,7 +1771,7 @@ iq2000_initial_elimination_offset (int from, int to ATTRIBUTE_UNUSED) } /* Common code to emit the insns (or to write the instructions to a file) - to save/restore registers. + to save/restore registers. Other parts of the code assume that IQ2000_TEMP1_REGNUM (aka large_reg) is not modified within save_restore_insns. */ @@ -1891,7 +1891,7 @@ save_restore_insns (int store_p) if (store_p) iq2000_emit_frame_related_store (mem_rtx, reg_rtx, gp_offset); - else + else { emit_move_insn (reg_rtx, mem_rtx); } @@ -2632,7 +2632,7 @@ expand_one_builtin (enum insn_code icode, rtx target, tree exp, default: gcc_unreachable (); } - + if (! pat) return 0; emit_insn (pat); @@ -2663,7 +2663,7 @@ iq2000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, { default: break; - + case IQ2000_BUILTIN_ADO16: return expand_one_builtin (CODE_FOR_ado16, target, exp, code, 2); @@ -2672,10 +2672,10 @@ iq2000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, code[2] = CONST_INT; code[3] = CONST_INT; return expand_one_builtin (CODE_FOR_ram, target, exp, code, 4); - + case IQ2000_BUILTIN_CHKHDR: return expand_one_builtin (CODE_FOR_chkhdr, target, exp, code, 2); - + case IQ2000_BUILTIN_PKRL: return expand_one_builtin (CODE_FOR_pkrl, target, exp, code, 2); @@ -2822,7 +2822,7 @@ iq2000_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, case IQ2000_BUILTIN_SYSCALL: return expand_one_builtin (CODE_FOR_syscall, target, exp, code, 0); } - + return NULL_RTX; } @@ -2843,39 +2843,39 @@ iq2000_setup_incoming_varargs (cumulative_args_t cum_v, int *pretend_size, int no_rtl) { CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - unsigned int iq2000_off = ! cum->last_arg_fp; - unsigned int iq2000_fp_off = cum->last_arg_fp; + unsigned int iq2000_off = ! 
cum->last_arg_fp; + unsigned int iq2000_fp_off = cum->last_arg_fp; if ((cum->arg_words < MAX_ARGS_IN_REGISTERS - iq2000_off)) { - int iq2000_save_gp_regs - = MAX_ARGS_IN_REGISTERS - cum->arg_words - iq2000_off; - int iq2000_save_fp_regs - = (MAX_ARGS_IN_REGISTERS - cum->fp_arg_words - iq2000_fp_off); + int iq2000_save_gp_regs + = MAX_ARGS_IN_REGISTERS - cum->arg_words - iq2000_off; + int iq2000_save_fp_regs + = (MAX_ARGS_IN_REGISTERS - cum->fp_arg_words - iq2000_fp_off); - if (iq2000_save_gp_regs < 0) - iq2000_save_gp_regs = 0; - if (iq2000_save_fp_regs < 0) - iq2000_save_fp_regs = 0; + if (iq2000_save_gp_regs < 0) + iq2000_save_gp_regs = 0; + if (iq2000_save_fp_regs < 0) + iq2000_save_fp_regs = 0; - *pretend_size = ((iq2000_save_gp_regs * UNITS_PER_WORD) - + (iq2000_save_fp_regs * UNITS_PER_FPREG)); + *pretend_size = ((iq2000_save_gp_regs * UNITS_PER_WORD) + + (iq2000_save_fp_regs * UNITS_PER_FPREG)); - if (! (no_rtl)) + if (! (no_rtl)) { - if (cum->arg_words < MAX_ARGS_IN_REGISTERS - iq2000_off) + if (cum->arg_words < MAX_ARGS_IN_REGISTERS - iq2000_off) { - rtx ptr, mem; + rtx ptr, mem; ptr = plus_constant (Pmode, virtual_incoming_args_rtx, - (iq2000_save_gp_regs * UNITS_PER_WORD)); - mem = gen_rtx_MEM (BLKmode, ptr); - move_block_from_reg - (cum->arg_words + GP_ARG_FIRST + iq2000_off, - mem, + mem = gen_rtx_MEM (BLKmode, ptr); + move_block_from_reg + (cum->arg_words + GP_ARG_FIRST + iq2000_off, + mem, iq2000_save_gp_regs); - } - } + } + } } } @@ -3297,7 +3297,7 @@ iq2000_rtx_costs (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED, * total = COSTS_N_INSNS (2 * num_words); break; } - + case FFS: * total = COSTS_N_INSNS (6); break; @@ -3316,7 +3316,7 @@ iq2000_rtx_costs (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED, * total = COSTS_N_INSNS ((GET_CODE (XEXP (x, 1)) == CONST_INT) ? 4 : 12); else * total = COSTS_N_INSNS (1); - break; + break; case ABS: if (mode == SFmode || mode == DFmode) @@ -3324,7 +3324,7 @@ iq2000_rtx_costs (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED, else * total = COSTS_N_INSNS (4); break; - + case PLUS: case MINUS: if (mode == SFmode || mode == DFmode) @@ -3334,7 +3334,7 @@ iq2000_rtx_costs (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED, else * total = COSTS_N_INSNS (1); break; - + case NEG: * total = (mode == DImode) ? 4 : 1; break; @@ -3357,16 +3357,16 @@ iq2000_rtx_costs (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED, else * total = COSTS_N_INSNS (69); break; - + case UDIV: case UMOD: * total = COSTS_N_INSNS (69); break; - + case SIGN_EXTEND: * total = COSTS_N_INSNS (2); break; - + case ZERO_EXTEND: * total = COSTS_N_INSNS (1); break; @@ -3374,7 +3374,7 @@ iq2000_rtx_costs (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED, case CONST_INT: * total = 0; break; - + case LABEL_REF: * total = COSTS_N_INSNS (2); break; @@ -3399,19 +3399,19 @@ iq2000_rtx_costs (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED, case SYMBOL_REF: * total = COSTS_N_INSNS (SYMBOL_REF_FLAG (x) ? 1 : 2); break; - + case CONST_DOUBLE: { rtx high, low; - + split_double (x, & high, & low); - + * total = COSTS_N_INSNS ( (high == CONST0_RTX (GET_MODE (high)) || low == CONST0_RTX (GET_MODE (low))) ? 2 : 4); break; } - + default: return false; } diff --git a/gcc/config/iq2000/iq2000.h b/gcc/config/iq2000/iq2000.h index c3562be..08801ce 100644 --- a/gcc/config/iq2000/iq2000.h +++ b/gcc/config/iq2000/iq2000.h @@ -1,4 +1,4 @@ -/* Definitions of target machine for GNU compiler. +/* Definitions of target machine for GNU compiler. 
Vitesse IQ2000 processors Copyright (C) 2003-2024 Free Software Foundation, Inc. @@ -60,7 +60,7 @@ /* Storage Layout. */ #define BITS_BIG_ENDIAN 0 -#define BYTES_BIG_ENDIAN 1 +#define BYTES_BIG_ENDIAN 1 #define WORDS_BIG_ENDIAN 1 #define BITS_PER_WORD 32 #define MAX_BITS_PER_WORD 64 @@ -295,7 +295,7 @@ typedef struct iq2000_args init_cumulative_args (& CUM, FNTYPE, LIBNAME) \ #define FUNCTION_ARG_REGNO_P(N) \ - (((N) >= GP_ARG_FIRST && (N) <= GP_ARG_LAST)) + (((N) >= GP_ARG_FIRST && (N) <= GP_ARG_LAST)) /* On the IQ2000, R2 and R3 are the only register thus used. */ diff --git a/gcc/config/kopensolaris-gnu.h b/gcc/config/kopensolaris-gnu.h index e7f6198..880aa27 100644 --- a/gcc/config/kopensolaris-gnu.h +++ b/gcc/config/kopensolaris-gnu.h @@ -18,7 +18,7 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ -#undef GNU_USER_TARGET_OS_CPP_BUILTINS +#undef GNU_USER_TARGET_OS_CPP_BUILTINS #define GNU_USER_TARGET_OS_CPP_BUILTINS() \ do \ { \ diff --git a/gcc/config/lm32/lm32-protos.h b/gcc/config/lm32/lm32-protos.h index fed0de6..861a406 100644 --- a/gcc/config/lm32/lm32-protos.h +++ b/gcc/config/lm32/lm32-protos.h @@ -20,13 +20,13 @@ <http://www.gnu.org/licenses/>. */ extern int lm32_return_in_memory (tree type); -extern void lm32_declare_object (FILE *stream, char *name, char *init_string, +extern void lm32_declare_object (FILE *stream, char *name, char *init_string, char *final_string, int size); extern void lm32_expand_prologue (void); extern void lm32_expand_epilogue (void); extern void lm32_print_operand (FILE *file, rtx op, int letter); extern void lm32_print_operand_address (FILE *file, rtx addr); -extern HOST_WIDE_INT lm32_compute_initial_elimination_offset (int from, +extern HOST_WIDE_INT lm32_compute_initial_elimination_offset (int from, int to); extern int lm32_can_use_return (void); extern rtx lm32_return_addr_rtx (int count, rtx frame); diff --git a/gcc/config/lm32/lm32.cc b/gcc/config/lm32/lm32.cc index 594f733..206b3f8 100644 --- a/gcc/config/lm32/lm32.cc +++ b/gcc/config/lm32/lm32.cc @@ -158,20 +158,20 @@ emit_add (rtx dest, rtx src0, rtx src1) } /* Generate the code to compare (and possibly branch) two integer values - TEST_CODE is the comparison code we are trying to emulate + TEST_CODE is the comparison code we are trying to emulate (or implement directly) - RESULT is where to store the result of the comparison, + RESULT is where to store the result of the comparison, or null to emit a branch CMP0 CMP1 are the two comparison operands DESTINATION is the destination of the branch, or null to only compare */ static void -gen_int_relational (enum rtx_code code, - rtx result, - rtx cmp0, - rtx cmp1, - rtx destination) +gen_int_relational (enum rtx_code code, + rtx result, + rtx cmp0, + rtx cmp1, + rtx destination) { machine_mode mode; int branch_p; @@ -183,7 +183,7 @@ gen_int_relational (enum rtx_code code, /* Is this a branch or compare. */ branch_p = (destination != 0); - /* Instruction set doesn't support LE or LT, so swap operands and use + /* Instruction set doesn't support LE or LT, so swap operands and use GE, GT. 
*/ switch (code) { @@ -270,7 +270,7 @@ lm32_expand_scc (rtx operands[]) rtx op0 = operands[2]; rtx op1 = operands[3]; - gen_int_relational (code, target, op0, op1, NULL_RTX); + gen_int_relational (code, target, op0, op1, NULL_RTX); } /* Compare OPERANDS[1] with OPERANDS[2] using comparison code @@ -284,7 +284,7 @@ lm32_expand_conditional_branch (rtx operands[]) rtx op1 = operands[2]; rtx destination = operands[3]; - gen_int_relational (code, NULL_RTX, op0, op1, destination); + gen_int_relational (code, NULL_RTX, op0, op1, destination); } /* Generate and emit RTL to save or restore callee save registers. */ @@ -304,10 +304,10 @@ expand_save_restore (struct lm32_frame_info *info, int op) { rtx offset_rtx; rtx mem; - + offset_rtx = GEN_INT (offset); if (satisfies_constraint_K (offset_rtx)) - { + { mem = gen_rtx_MEM (word_mode, gen_rtx_PLUS (Pmode, stack_pointer_rtx, @@ -316,23 +316,23 @@ expand_save_restore (struct lm32_frame_info *info, int op) else { /* r10 is caller saved so it can be used as a temp reg. */ - rtx r10; - + rtx r10; + r10 = gen_rtx_REG (word_mode, 10); insn = emit_move_insn (r10, offset_rtx); if (op == 0) RTX_FRAME_RELATED_P (insn) = 1; insn = emit_add (r10, r10, stack_pointer_rtx); if (op == 0) - RTX_FRAME_RELATED_P (insn) = 1; + RTX_FRAME_RELATED_P (insn) = 1; mem = gen_rtx_MEM (word_mode, r10); - } - + } + if (op == 0) insn = emit_move_insn (mem, gen_rtx_REG (word_mode, regno)); else insn = emit_move_insn (gen_rtx_REG (word_mode, regno), mem); - + /* only prologue instructions which set the sp fp or save a register should be marked as frame related. */ if (op == 0) @@ -391,11 +391,11 @@ lm32_expand_prologue (void) { /* Move sp to fp. */ insn = emit_move_insn (frame_pointer_rtx, stack_pointer_rtx); - RTX_FRAME_RELATED_P (insn) = 1; + RTX_FRAME_RELATED_P (insn) = 1; - /* Add offset - Don't use total_size, as that includes pretend_size, + /* Add offset - Don't use total_size, as that includes pretend_size, which isn't part of this frame? */ - insn = emit_add (frame_pointer_rtx, + insn = emit_add (frame_pointer_rtx, frame_pointer_rtx, GEN_INT (current_frame_info.args_size + current_frame_info.callee_size + @@ -513,7 +513,7 @@ lm32_print_operand (FILE * file, rtx op, int letter) fprintf (file, "%s", reg_names[regnum]); } else if (code == HIGH) - output_addr_const (file, XEXP (op, 0)); + output_addr_const (file, XEXP (op, 0)); else if (code == MEM) output_address (GET_MODE (op), XEXP (op, 0)); else if (letter == 'z' && GET_CODE (op) == CONST_INT && INTVAL (op) == 0) @@ -1129,7 +1129,7 @@ lm32_rtx_costs (rtx x, machine_mode mode, int outer_code, *total = COSTS_N_INSNS (2); return true; } - /* Fall through. */ + /* Fall through. 
*/ default: if (satisfies_constraint_K (x)) @@ -1194,32 +1194,32 @@ lm32_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to) static bool lm32_legitimate_address_p (machine_mode mode ATTRIBUTE_UNUSED, rtx x, bool strict, code_helper) -{ - /* (rM) */ +{ + /* (rM) */ if (strict && REG_P (x) && STRICT_REG_OK_FOR_BASE_P (x)) return true; if (!strict && REG_P (x) && NONSTRICT_REG_OK_FOR_BASE_P (x)) return true; - - /* (rM)+literal) */ - if (GET_CODE (x) == PLUS - && REG_P (XEXP (x, 0)) + + /* (rM)+literal) */ + if (GET_CODE (x) == PLUS + && REG_P (XEXP (x, 0)) && ((strict && STRICT_REG_OK_FOR_BASE_P (XEXP (x, 0))) - || (!strict && NONSTRICT_REG_OK_FOR_BASE_P (XEXP (x, 0)))) - && GET_CODE (XEXP (x, 1)) == CONST_INT + || (!strict && NONSTRICT_REG_OK_FOR_BASE_P (XEXP (x, 0)))) + && GET_CODE (XEXP (x, 1)) == CONST_INT && satisfies_constraint_K (XEXP ((x), 1))) return true; - - /* gp(sym) */ - if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_SMALL_P (x)) + + /* gp(sym) */ + if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_SMALL_P (x)) return true; - - return false; + + return false; } -/* Check a move is not memory to memory. */ +/* Check a move is not memory to memory. */ -bool +bool lm32_move_ok (machine_mode mode, rtx operands[2]) { if (memory_operand (operands[0], mode)) return register_or_zero_operand (operands[1], mode); diff --git a/gcc/config/lm32/lm32.h b/gcc/config/lm32/lm32.h index e761e14..ecad4cd 100644 --- a/gcc/config/lm32/lm32.h +++ b/gcc/config/lm32/lm32.h @@ -52,7 +52,7 @@ %{muser-enabled} \ " -/* Let link script define all link options. +/* Let link script define all link options. Default to using simulator link script. */ #undef STARTFILE_SPEC @@ -162,7 +162,7 @@ do { \ enum reg_class { - NO_REGS, + NO_REGS, GENERAL_REGS, ALL_REGS, LIM_REG_CLASSES diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in b/gcc/config/loongarch/genopts/loongarch.opt.in index d00950c..d5bbf01 100644 --- a/gcc/config/loongarch/genopts/loongarch.opt.in +++ b/gcc/config/loongarch/genopts/loongarch.opt.in @@ -301,3 +301,7 @@ default value is 4. ; CPUCFG independently, so we use bit flags to specify them. TargetVariable HOST_WIDE_INT la_isa_evolution = 0 + +mannotate-tablejump +Target Mask(ANNOTATE_TABLEJUMP) Save +Annotate table jump instruction (jr {reg}) to correlate it with the jump table. diff --git a/gcc/config/loongarch/loongarch-builtins.cc b/gcc/config/loongarch/loongarch-builtins.cc index cf92770..64529da 100644 --- a/gcc/config/loongarch/loongarch-builtins.cc +++ b/gcc/config/loongarch/loongarch-builtins.cc @@ -20,6 +20,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/loongarch/loongarch-c.cc b/gcc/config/loongarch/loongarch-c.cc index c676364..97e1baf 100644 --- a/gcc/config/loongarch/loongarch-c.cc +++ b/gcc/config/loongarch/loongarch-c.cc @@ -116,7 +116,7 @@ loongarch_cpu_cpp_builtins (cpp_reader *pfile) max_v_major = major > max_v_major ? major : max_v_major; max_v_minor = major == max_v_major - ? (minor > max_v_minor ? minor : max_v_minor): max_v_minor; + ? (minor > max_v_minor ? minor : max_v_minor) : max_v_minor; } /* Find the minimum ISA version required to run the target program. 
*/ diff --git a/gcc/config/loongarch/loongarch-cpu.cc b/gcc/config/loongarch/loongarch-cpu.cc index 8564ebb..343751b 100644 --- a/gcc/config/loongarch/loongarch-cpu.cc +++ b/gcc/config/loongarch/loongarch-cpu.cc @@ -265,11 +265,11 @@ fill_native_cpu_config (struct loongarch_target *tgt) l1u_present |= cpucfg_cache[16] & 3; /* bit[1:0]: unified l1 */ l1d_present |= cpucfg_cache[16] & 4; /* bit[2:2]: l1d */ l1_szword = l1d_present ? 18 : (l1u_present ? 17 : 0); - l1_szword = l1_szword ? cpucfg_cache[l1_szword]: 0; + l1_szword = l1_szword ? cpucfg_cache[l1_szword] : 0; l2d_present |= cpucfg_cache[16] & 24; /* bit[4:3]: unified l2 */ l2d_present |= cpucfg_cache[16] & 128; /* bit[7:7]: l2d */ - l2_szword = l2d_present ? cpucfg_cache[19]: 0; + l2_szword = l2d_present ? cpucfg_cache[19] : 0; native_cache.l1d_line_size = 1 << ((l1_szword & 0x7f000000) >> 24); /* bit[30:24]: log2(line) */ diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index c7a0210..f956ee4 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -21,6 +21,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md index f70ca85..bd08250 100644 --- a/gcc/config/loongarch/loongarch.md +++ b/gcc/config/loongarch/loongarch.md @@ -3496,12 +3496,22 @@ DONE; }) +(define_mode_attr mode_size [(DI "8") (SI "4")]) + (define_insn "@tablejump<mode>" [(set (pc) (match_operand:P 0 "register_operand" "e")) (use (label_ref (match_operand 1 "" "")))] "" - "jr\t%0" + { + return TARGET_ANNOTATE_TABLEJUMP + ? "1:jr\t%0\n\t" + ".pushsection\t.discard.tablejump_annotate\n\t" + "\t.<mode_size>byte\t1b\n\t" + "\t.<mode_size>byte\t%1\n\t" + ".popsection" + : "jr\t%0"; + } [(set_attr "type" "jump") (set_attr "mode" "none")]) diff --git a/gcc/config/loongarch/loongarch.opt b/gcc/config/loongarch/loongarch.opt index 91cb523..fae5754 100644 --- a/gcc/config/loongarch/loongarch.opt +++ b/gcc/config/loongarch/loongarch.opt @@ -310,6 +310,10 @@ default value is 4. TargetVariable HOST_WIDE_INT la_isa_evolution = 0 +mannotate-tablejump +Target Mask(ANNOTATE_TABLEJUMP) Save +Annotate table jump instruction (jr {reg}) to correlate it with the jump table. + mfrecipe Target Mask(ISA_FRECIPE) Var(la_isa_evolution) Support frecipe.{s/d} and frsqrte.{s/d} instructions. 
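A quick illustration of the new LoongArch -mannotate-tablejump option introduced above (not part of the patch itself): when a switch statement is lowered to a jump table, the @tablejump<mode> pattern now labels the indirect jump ("1: jr $reg") and records the address of that jump together with the address of its jump table in the .discard.tablejump_annotate section (.8byte entries on 64-bit, .4byte on 32-bit), so later tooling can correlate the two. The sketch below only shows the kind of C input that typically produces such a table jump; the function name is invented for the example, and whether GCC actually emits a jump table here depends on the optimization level and case density.

/* Sketch: compile on LoongArch with something like
     gcc -O2 -mannotate-tablejump -S dispatch.c
   A dense switch like this is normally turned into a jump table; with the
   option enabled, the resulting "jr" gets a local label and a pair of
   entries in .discard.tablejump_annotate pointing at the jump and at the
   table, per the define_insn above.  */
int
dispatch (int op)
{
  switch (op)
    {
    case 0: return 10;
    case 1: return 11;
    case 2: return 12;
    case 3: return 13;
    case 4: return 14;
    case 5: return 15;
    case 6: return 16;
    case 7: return 17;
    default: return -1;
    }
}

The .discard prefix suggests the section is meant to be read by post-processing tools and then dropped from the final image at link time; that is an inference from the section name, not something stated in the patch.
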
diff --git a/gcc/config/loongarch/loongarch.opt.urls b/gcc/config/loongarch/loongarch.opt.urls index f7545f6..571c504 100644 --- a/gcc/config/loongarch/loongarch.opt.urls +++ b/gcc/config/loongarch/loongarch.opt.urls @@ -72,6 +72,9 @@ UrlSuffix(gcc/LoongArch-Options.html#index-mpass-mrelax-to-as) mtls-dialect= UrlSuffix(gcc/LoongArch-Options.html#index-mtls-dialect-1) +mannotate-tablejump +UrlSuffix(gcc/LoongArch-Options.html#index-mannotate-tablejump) + mfrecipe UrlSuffix(gcc/LoongArch-Options.html#index-mfrecipe) diff --git a/gcc/config/m32c/m32c.cc b/gcc/config/m32c/m32c.cc index 38abf17..d27538e 100644 --- a/gcc/config/m32c/m32c.cc +++ b/gcc/config/m32c/m32c.cc @@ -873,7 +873,7 @@ m32c_matches_constraint_p (rtx value, int constraint) && A0_OR_PSEUDO (patternr[5]) && GET_MODE (patternr[5]) == HImode) || RTX_IS ("ms"))); - case CONSTRAINT_Sd: + case CONSTRAINT_Sd: { /* This is the common "src/dest" address */ rtx r; @@ -2790,7 +2790,7 @@ m32c_print_operand (FILE * file, rtx x, int code) #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P #define TARGET_PRINT_OPERAND_PUNCT_VALID_P m32c_print_operand_punct_valid_p -static bool +static bool m32c_print_operand_punct_valid_p (unsigned char c) { if (c == '&' || c == '!') @@ -3036,7 +3036,7 @@ m32c_insert_attributes (tree node ATTRIBUTE_UNUSED, { TREE_THIS_VOLATILE (node) = true; } - } + } } /* Hash table of pragma info. */ @@ -3227,7 +3227,7 @@ m32c_immd_dbl_mov (rtx * operands ATTRIBUTE_UNUSED, /* ??? This relied on the now-defunct MEM_SCALAR and MEM_IN_STRUCT_P flags. */ return false; -} +} /* Expanders */ @@ -4099,7 +4099,7 @@ m32c_emit_prologue (void) if (flag_stack_usage_info) current_function_static_stack_size = frame_size; - + if (frame_size > 254) { extra_frame_size = frame_size - 254; diff --git a/gcc/config/m32r/m32r.cc b/gcc/config/m32r/m32r.cc index c45a7d6..4742ceb 100644 --- a/gcc/config/m32r/m32r.cc +++ b/gcc/config/m32r/m32r.cc @@ -307,7 +307,7 @@ init_reg_tables (void) for (i = 0; i < NUM_MACHINE_MODES; i++) { machine_mode m = (machine_mode) i; - + switch (GET_MODE_CLASS (m)) { case MODE_INT: diff --git a/gcc/config/m32r/m32r.h b/gcc/config/m32r/m32r.h index 7be8dfd..06d5d04 100644 --- a/gcc/config/m32r/m32r.h +++ b/gcc/config/m32r/m32r.h @@ -222,7 +222,7 @@ #define UNITS_PER_WORD 4 /* Define this macro if it is advisable to hold scalars in registers - in a wider mode than that declared by the program. In such cases, + in a wider mode than that declared by the program. In such cases, the value is constrained to be within the bounds of the declared type, but kept valid in the wider mode. The signedness of the extension may differ from that of the type. */ @@ -303,7 +303,7 @@ #endif #define FIRST_PSEUDO_REGISTER (M32R_NUM_REGISTERS + SUBTARGET_NUM_REGISTERS) - + /* 1 for registers that have pervasive standard uses and are not available for the register allocator. diff --git a/gcc/config/m68k/linux.h b/gcc/config/m68k/linux.h index b711f49..fad360f 100644 --- a/gcc/config/m68k/linux.h +++ b/gcc/config/m68k/linux.h @@ -90,7 +90,7 @@ along with GCC; see the file COPYING3. If not see /* Currently, JUMP_TABLES_IN_TEXT_SECTION must be defined in order to keep switch tables in the text section. */ - + #define JUMP_TABLES_IN_TEXT_SECTION 1 /* Use the default action for outputting the case label. 
*/ diff --git a/gcc/config/m68k/m68k.cc b/gcc/config/m68k/m68k.cc index 21c9498..d642bcb 100644 --- a/gcc/config/m68k/m68k.cc +++ b/gcc/config/m68k/m68k.cc @@ -772,7 +772,7 @@ m68k_get_function_kind (tree func) tree a; gcc_assert (TREE_CODE (func) == FUNCTION_DECL); - + a = lookup_attribute ("interrupt", DECL_ATTRIBUTES (func)); if (a != NULL_TREE) return m68k_fk_interrupt_handler; @@ -1400,7 +1400,7 @@ static bool m68k_ok_for_sibcall_p (tree decl, tree exp) { enum m68k_function_kind kind; - + /* We cannot use sibcalls for nested functions because we use the static chain register for indirect calls. */ if (CALL_EXPR_STATIC_CHAIN (exp)) @@ -1436,7 +1436,7 @@ m68k_ok_for_sibcall_p (tree decl, tree exp) the same. */ if (decl && m68k_get_function_kind (decl) == kind) return true; - + return false; } @@ -1503,12 +1503,14 @@ m68k_legitimize_address (rtx x, rtx oldx, machine_mode mode) #define COPY_ONCE(Y) if (!copied) { Y = copy_rtx (Y); copied = ch = 1; } - if (GET_CODE (XEXP (x, 0)) == MULT) + if (GET_CODE (XEXP (x, 0)) == MULT + || GET_CODE (XEXP (x, 0)) == ASHIFT) { COPY_ONCE (x); XEXP (x, 0) = force_operand (XEXP (x, 0), 0); } - if (GET_CODE (XEXP (x, 1)) == MULT) + if (GET_CODE (XEXP (x, 1)) == MULT + || GET_CODE (XEXP (x, 1)) == ASHIFT) { COPY_ONCE (x); XEXP (x, 1) = force_operand (XEXP (x, 1), 0); @@ -1731,7 +1733,7 @@ m68k_asm_final_postscan_insn (FILE *, rtx_insn *insn, rtx [], int) return; } -/* Output a dbCC; jCC sequence. Note we do not handle the +/* Output a dbCC; jCC sequence. Note we do not handle the floating point version of this sequence (Fdbcc). OPERANDS are as in the two peepholes. CODE is the code returned by m68k_output_branch_<mode>. */ @@ -2069,16 +2071,29 @@ m68k_decompose_index (rtx x, bool strict_p, struct m68k_address *address) /* Check for a scale factor. */ scale = 1; - if ((TARGET_68020 || TARGET_COLDFIRE) - && GET_CODE (x) == MULT - && GET_CODE (XEXP (x, 1)) == CONST_INT - && (INTVAL (XEXP (x, 1)) == 2 - || INTVAL (XEXP (x, 1)) == 4 - || (INTVAL (XEXP (x, 1)) == 8 - && (TARGET_COLDFIRE_FPU || !TARGET_COLDFIRE)))) + if (TARGET_68020 || TARGET_COLDFIRE) { - scale = INTVAL (XEXP (x, 1)); - x = XEXP (x, 0); + if (GET_CODE (x) == MULT + && GET_CODE (XEXP (x, 1)) == CONST_INT + && (INTVAL (XEXP (x, 1)) == 2 + || INTVAL (XEXP (x, 1)) == 4 + || (INTVAL (XEXP (x, 1)) == 8 + && (TARGET_COLDFIRE_FPU || !TARGET_COLDFIRE)))) + { + scale = INTVAL (XEXP (x, 1)); + x = XEXP (x, 0); + } + /* LRA uses ASHIFT instead of MULT outside of MEM. */ + else if (GET_CODE (x) == ASHIFT + && GET_CODE (XEXP (x, 1)) == CONST_INT + && (INTVAL (XEXP (x, 1)) == 1 + || INTVAL (XEXP (x, 1)) == 2 + || (INTVAL (XEXP (x, 1)) == 3 + && (TARGET_COLDFIRE_FPU || !TARGET_COLDFIRE)))) + { + scale = 1 << INTVAL (XEXP (x, 1)); + x = XEXP (x, 0); + } } /* Check for a word extension. */ @@ -2246,8 +2261,10 @@ m68k_decompose_address (machine_mode mode, rtx x, ??? do_tablejump creates these addresses before placing the target label, so we have to assume that unplaced labels are jump table references. It seems unlikely that we would ever generate indexed - accesses to unplaced labels in other cases. */ + accesses to unplaced labels in other cases. Do not accept it in + PIC mode, since the label address will need to be loaded from memory. 
*/ if (GET_CODE (x) == PLUS + && !flag_pic && m68k_jump_table_ref_p (XEXP (x, 1)) && m68k_decompose_index (XEXP (x, 0), strict_p, address)) { @@ -2335,7 +2352,8 @@ m68k_legitimate_mem_p (rtx x, struct m68k_address *address) { return (MEM_P (x) && m68k_decompose_address (GET_MODE (x), XEXP (x, 0), - reload_in_progress || reload_completed, + (reload_in_progress || lra_in_progress + || reload_completed), address)); } @@ -2610,19 +2628,19 @@ m68k_wrap_symbol_into_got_ref (rtx x, enum m68k_reloc reloc, rtx temp_reg) /* Legitimize PIC addresses. If the address is already position-independent, we return ORIG. Newly generated position-independent addresses go to REG. If we need more - than one register, we lose. + than one register, we lose. An address is legitimized by making an indirect reference through the Global Offset Table with the name of the symbol - used as an offset. + used as an offset. - The assembler and linker are responsible for placing the + The assembler and linker are responsible for placing the address of the symbol in the GOT. The function prologue is responsible for initializing a5 to the starting address of the GOT. The assembler is also responsible for translating a symbol name - into a constant displacement from the start of the GOT. + into a constant displacement from the start of the GOT. A quick example may make things a little clearer: @@ -2642,9 +2660,9 @@ m68k_wrap_symbol_into_got_ref (rtx x, enum m68k_reloc reloc, rtx temp_reg) movel a5@(_foo:w), a0 movel #12345, a0@ - - That (in a nutshell) is how *all* symbol and label references are + + That (in a nutshell) is how *all* symbol and label references are handled. */ rtx @@ -2673,7 +2691,7 @@ legitimize_pic_address (rtx orig, machine_mode mode ATTRIBUTE_UNUSED, /* legitimize both operands of the PLUS */ gcc_assert (GET_CODE (XEXP (orig, 0)) == PLUS); - + base = legitimize_pic_address (XEXP (XEXP (orig, 0), 0), Pmode, reg); orig = legitimize_pic_address (XEXP (XEXP (orig, 0), 1), Pmode, base == reg ? 0 : reg); @@ -2735,13 +2753,13 @@ m68k_call_tls_get_addr (rtx x, rtx eqv, enum m68k_reloc reloc) is the simpliest way of generating a call. The difference between __tls_get_addr() and libcall is that the result is returned in D0 instead of A0. To workaround this, we use m68k_libcall_value_in_a0_p - which temporarily switches returning the result to A0. */ + which temporarily switches returning the result to A0. */ m68k_libcall_value_in_a0_p = true; a0 = emit_library_call_value (m68k_get_tls_get_addr (), NULL_RTX, LCT_PURE, Pmode, x, Pmode); m68k_libcall_value_in_a0_p = false; - + insns = get_insns (); end_sequence (); @@ -2769,7 +2787,7 @@ m68k_get_m68k_read_tp (void) /* Emit instruction sequence that calls __m68k_read_tp. A pseudo register with result of __m68k_read_tp call is returned. */ -static rtx +static rtx m68k_call_m68k_read_tp (void) { rtx a0; @@ -2783,7 +2801,7 @@ m68k_call_m68k_read_tp (void) is the simpliest way of generating a call. The difference between __m68k_read_tp() and libcall is that the result is returned in D0 instead of A0. To workaround this, we use m68k_libcall_value_in_a0_p - which temporarily switches returning the result to A0. */ + which temporarily switches returning the result to A0. */ /* Emit the call sequence. */ m68k_libcall_value_in_a0_p = true; @@ -2822,7 +2840,7 @@ m68k_legitimize_tls_address (rtx orig) rtx eqv; rtx a0; rtx x; - + /* Attach a unique REG_EQUIV, to allow the RTL optimizers to share the LDM result with other LD model accesses. 
*/ eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), @@ -3068,12 +3086,17 @@ m68k_rtx_costs (rtx x, machine_mode mode, int outer_code, /* An lea costs about three times as much as a simple add. */ if (mode == SImode && GET_CODE (XEXP (x, 1)) == REG - && GET_CODE (XEXP (x, 0)) == MULT - && GET_CODE (XEXP (XEXP (x, 0), 0)) == REG - && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT - && (INTVAL (XEXP (XEXP (x, 0), 1)) == 2 - || INTVAL (XEXP (XEXP (x, 0), 1)) == 4 - || INTVAL (XEXP (XEXP (x, 0), 1)) == 8)) + && ((GET_CODE (XEXP (x, 0)) == MULT + && GET_CODE (XEXP (XEXP (x, 0), 0)) == REG + && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT + && (INTVAL (XEXP (XEXP (x, 0), 1)) == 2 + || INTVAL (XEXP (XEXP (x, 0), 1)) == 4 + || INTVAL (XEXP (XEXP (x, 0), 1)) == 8)) + || (GET_CODE (XEXP (x, 0)) == ASHIFT + && GET_CODE (XEXP (XEXP (x, 0), 0)) == REG + && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT + && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) + <= 3)))) { /* lea an@(dx:l:i),am */ *total = COSTS_N_INSNS (TARGET_COLDFIRE ? 2 : 3); @@ -3877,11 +3900,13 @@ emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) rtx tem; if (scratch_reg - && reload_in_progress && GET_CODE (operand0) == REG + && (reload_in_progress || lra_in_progress) + && GET_CODE (operand0) == REG && REGNO (operand0) >= FIRST_PSEUDO_REGISTER) operand0 = reg_equiv_mem (REGNO (operand0)); else if (scratch_reg - && reload_in_progress && GET_CODE (operand0) == SUBREG + && (reload_in_progress || lra_in_progress) + && GET_CODE (operand0) == SUBREG && GET_CODE (SUBREG_REG (operand0)) == REG && REGNO (SUBREG_REG (operand0)) >= FIRST_PSEUDO_REGISTER) { @@ -3894,11 +3919,13 @@ emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) } if (scratch_reg - && reload_in_progress && GET_CODE (operand1) == REG + && (reload_in_progress || lra_in_progress) + && GET_CODE (operand1) == REG && REGNO (operand1) >= FIRST_PSEUDO_REGISTER) operand1 = reg_equiv_mem (REGNO (operand1)); else if (scratch_reg - && reload_in_progress && GET_CODE (operand1) == SUBREG + && (reload_in_progress || lra_in_progress) + && GET_CODE (operand1) == SUBREG && GET_CODE (SUBREG_REG (operand1)) == REG && REGNO (SUBREG_REG (operand1)) >= FIRST_PSEUDO_REGISTER) { @@ -3910,11 +3937,13 @@ emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) operand1 = alter_subreg (&temp, true); } - if (scratch_reg && reload_in_progress && GET_CODE (operand0) == MEM + if (scratch_reg && (reload_in_progress || lra_in_progress) + && GET_CODE (operand0) == MEM && ((tem = find_replacement (&XEXP (operand0, 0))) != XEXP (operand0, 0))) operand0 = gen_rtx_MEM (GET_MODE (operand0), tem); - if (scratch_reg && reload_in_progress && GET_CODE (operand1) == MEM + if (scratch_reg && (reload_in_progress || lra_in_progress) + && GET_CODE (operand1) == MEM && ((tem = find_replacement (&XEXP (operand1, 0))) != XEXP (operand1, 0))) operand1 = gen_rtx_MEM (GET_MODE (operand1), tem); @@ -4819,7 +4848,7 @@ output_move_const_single (rtx *operands) to get the desired constant. */ /* This code has been fixed for cross-compilation. 
*/ - + static int inited_68881_table = 0; static const char *const strings_68881[7] = { @@ -4887,7 +4916,7 @@ standard_68881_constant_p (rtx x) if (real_identical (r, &values_68881[i])) return (codes_68881[i]); } - + if (GET_MODE (x) == SFmode) return 0; @@ -5176,7 +5205,7 @@ m68k_delegitimize_address (rtx orig_x) unspec = XEXP (addr.offset, 0); if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1))) unspec = XEXP (unspec, 0); - if (GET_CODE (unspec) != UNSPEC + if (GET_CODE (unspec) != UNSPEC || (XINT (unspec, 1) != UNSPEC_RELOC16 && XINT (unspec, 1) != UNSPEC_RELOC32)) return orig_x; @@ -5197,7 +5226,7 @@ m68k_delegitimize_address (rtx orig_x) x = replace_equiv_address_nv (orig_x, x); return x; } - + /* A C compound statement to output to stdio stream STREAM the assembler syntax for an instruction operand that is a memory @@ -5211,7 +5240,7 @@ m68k_delegitimize_address (rtx orig_x) It is possible for PIC to generate a (plus (label_ref...) (reg...)) and we handle that just like we would a (plus (symbol_ref...) (reg...)). - This routine is responsible for distinguishing between -fpic and -fPIC + This routine is responsible for distinguishing between -fpic and -fPIC style relocations in an address. When generating -fpic code the offset is output in word mode (e.g. movel a5@(_foo:w), a0). When generating -fPIC code the offset is output in long mode (e.g. movel a5@(_foo:l), a0) */ @@ -6632,7 +6661,7 @@ m68k_sched_variable_issue (FILE *sched_dump ATTRIBUTE_UNUSED, case CPU_CFV3: insn_size = sched_get_attr_size_int (insn); - + /* ColdFire V3 and V4 cores have instruction buffers that can accumulate up to 8 instructions regardless of instructions' sizes. So we should take care not to "prefetch" 24 one-word diff --git a/gcc/config/m68k/m68k.md b/gcc/config/m68k/m68k.md index e5c2528..1c9a6bf 100644 --- a/gcc/config/m68k/m68k.md +++ b/gcc/config/m68k/m68k.md @@ -957,11 +957,12 @@ /* The source is an address which requires PIC relocation. Call legitimize_pic_address with the source, mode, and a relocation register (a new pseudo, or the final destination if reload_in_progress - is set). Then fall through normally */ - rtx temp = reload_in_progress ? operands[0] : gen_reg_rtx (Pmode); + or lra_in_progress is set). Then fall through normally */ + rtx temp = ((reload_in_progress || lra_in_progress) + ? operands[0] : gen_reg_rtx (Pmode)); operands[1] = legitimize_pic_address (operands[1], SImode, temp); } - else if (flag_pic && TARGET_PCREL && ! reload_in_progress) + else if (flag_pic && TARGET_PCREL && ! (reload_in_progress || lra_in_progress)) { /* Don't allow writes to memory except via a register; the m68k doesn't consider PC-relative addresses to be writable. */ @@ -1452,7 +1453,7 @@ "" { /* We can't rewrite operands during reload. */ - if (! reload_in_progress) + if (! (reload_in_progress || lra_in_progress)) { if (CONSTANT_P (operands[1])) { diff --git a/gcc/config/m68k/m68kelf.h b/gcc/config/m68k/m68kelf.h index 0af1951..f53e40f 100644 --- a/gcc/config/m68k/m68kelf.h +++ b/gcc/config/m68k/m68kelf.h @@ -104,7 +104,7 @@ do { \ #define DEBUGGER_REGNO(REGNO) (REGNO) #if 0 -/* SVR4 m68k assembler is bitching on the `comm i,1,1' which askes for +/* SVR4 m68k assembler is bitching on the `comm i,1,1' which askes for 1 byte alignment. Don't generate alignment for COMMON seems to be safer until we the assembler is fixed. */ #undef ASM_OUTPUT_ALIGNED_COMMON @@ -126,7 +126,7 @@ do { \ /* Currently, JUMP_TABLES_IN_TEXT_SECTION must be defined in order to keep switch tables in the text section. 
*/ - + #define JUMP_TABLES_IN_TEXT_SECTION 1 /* In m68k svr4, using swbeg is the standard way to do switch diff --git a/gcc/config/m68k/netbsd-elf.h b/gcc/config/m68k/netbsd-elf.h index 6fc5ad1..3d2043b 100644 --- a/gcc/config/m68k/netbsd-elf.h +++ b/gcc/config/m68k/netbsd-elf.h @@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. If not see } \ while (0) -/* Don't try using XFmode on the 68010. */ +/* Don't try using XFmode on the 68010. */ #undef LONG_DOUBLE_TYPE_MODE #define LONG_DOUBLE_TYPE_MODE (TARGET_68020 ? XFmode : DFmode) diff --git a/gcc/config/m68k/predicates.md b/gcc/config/m68k/predicates.md index 46fc379..787e544 100644 --- a/gcc/config/m68k/predicates.md +++ b/gcc/config/m68k/predicates.md @@ -237,6 +237,7 @@ || (TARGET_68881 && (!standard_68881_constant_p (op) || reload_in_progress + || lra_in_progress || reload_completed))); }) diff --git a/gcc/config/mcore/mcore-elf.h b/gcc/config/mcore/mcore-elf.h index 6c522c7..c367bad 100644 --- a/gcc/config/mcore/mcore-elf.h +++ b/gcc/config/mcore/mcore-elf.h @@ -1,4 +1,4 @@ -/* Definitions of MCore target. +/* Definitions of MCore target. Copyright (C) 1998-2024 Free Software Foundation, Inc. Contributed by Cygnus Solutions. @@ -78,7 +78,7 @@ along with GCC; see the file COPYING3. If not see ASM_OUTPUT_LABEL(FILE, NAME); \ } \ while (0) - + /* Output the size directive for a decl in rest_of_decl_compilation in the case where we did not do so before the initializer. Once we find the error_mark_node, we know that the value of @@ -121,5 +121,5 @@ along with GCC; see the file COPYING3. If not see #define CTORS_SECTION_ASM_OP "\t.section\t.ctors,\"aw\"" #undef DTORS_SECTION_ASM_OP #define DTORS_SECTION_ASM_OP "\t.section\t.dtors,\"aw\"" - + #endif /* __MCORE_ELF_H__ */ diff --git a/gcc/config/mcore/mcore.cc b/gcc/config/mcore/mcore.cc index ee58c8f..99c0a6c 100644 --- a/gcc/config/mcore/mcore.cc +++ b/gcc/config/mcore/mcore.cc @@ -293,12 +293,12 @@ output_stack_adjust (int direction, int size) emit_insn (gen_movsi (nval, val)); val = nval; } - + if (direction > 0) insn = gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, val); else insn = gen_subsi3 (stack_pointer_rtx, stack_pointer_rtx, val); - + emit_insn (insn); } } @@ -311,7 +311,7 @@ calc_live_regs (int * count) { int reg; int live_regs_mask = 0; - + * count = 0; for (reg = 0; reg < FIRST_PSEUDO_REGISTER; reg++) @@ -336,7 +336,7 @@ mcore_print_operand_address (FILE * stream, machine_mode /*mode*/, rtx x) case REG: fprintf (stream, "(%s)", reg_names[REGNO (x)]); break; - + case PLUS: { rtx base = XEXP (x, 0); @@ -463,25 +463,25 @@ mcore_const_costs (rtx exp, enum rtx_code code) HOST_WIDE_INT val = INTVAL (exp); /* Easy constants. */ - if ( CONST_OK_FOR_I (val) - || CONST_OK_FOR_M (val) - || CONST_OK_FOR_N (val) + if ( CONST_OK_FOR_I (val) + || CONST_OK_FOR_M (val) + || CONST_OK_FOR_N (val) || (code == PLUS && CONST_OK_FOR_L (val))) - return 1; + return 1; else if (code == AND && ( CONST_OK_FOR_M (~val) || CONST_OK_FOR_N (~val))) return 2; - else if (code == PLUS - && ( CONST_OK_FOR_I (-val) - || CONST_OK_FOR_M (-val) - || CONST_OK_FOR_N (-val))) - return 2; + else if (code == PLUS + && ( CONST_OK_FOR_I (-val) + || CONST_OK_FOR_M (-val) + || CONST_OK_FOR_N (-val))) + return 2; - return 5; + return 5; } -/* What does an and instruction cost - we do this b/c immediates may +/* What does an and instruction cost - we do this b/c immediates may have been relaxed. We want to ensure that cse will cse relaxed immeds out. Otherwise we'll get bad code (multiple reloads of the same const). 
*/ @@ -494,7 +494,7 @@ mcore_and_cost (rtx x) return 2; val = INTVAL (XEXP (x, 1)); - + /* Do it directly. */ if (CONST_OK_FOR_K (val) || CONST_OK_FOR_M (~val)) return 2; @@ -530,7 +530,7 @@ mcore_ior_cost (rtx x) /* Takes two instructions to load. */ else if (TARGET_HARDLIT && mcore_const_ok_for_inline (val)) return 4; - + /* Takes a lrw to load. */ return 5; } @@ -572,7 +572,7 @@ mcore_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code, case FIX: *total = COSTS_N_INSNS (100); return true; - + default: return false; } @@ -590,7 +590,7 @@ mcore_gen_compare (enum rtx_code code, rtx op0, rtx op1) if (GET_CODE (op1) == CONST_INT) { HOST_WIDE_INT val = INTVAL (op1); - + switch (code) { case GTU: @@ -610,12 +610,12 @@ mcore_gen_compare (enum rtx_code code, rtx op0, rtx op1) code = code == LE ? LT : GE; } break; - + default: break; } } - + if (CONSTANT_P (op1) && GET_CODE (op1) != CONST_INT) op1 = force_reg (SImode, op1); @@ -628,7 +628,7 @@ mcore_gen_compare (enum rtx_code code, rtx op0, rtx op1) code = NE; invert = true; /* FALLTHRU */ - + case NE: /* Use normal condition, cmpne. */ if (GET_CODE (op1) == CONST_INT && ! CONST_OK_FOR_K (INTVAL (op1))) op1 = force_reg (SImode, op1); @@ -638,7 +638,7 @@ mcore_gen_compare (enum rtx_code code, rtx op0, rtx op1) code = GT; invert = true; /* FALLTHRU */ - + case GT: /* Use normal condition, reversed cmplt. */ if (GET_CODE (op1) == CONST_INT) op1 = force_reg (SImode, op1); @@ -648,9 +648,9 @@ mcore_gen_compare (enum rtx_code code, rtx op0, rtx op1) code = LT; invert = true; /* FALLTHRU */ - + case LT: /* Use normal condition, cmplt. */ - if (GET_CODE (op1) == CONST_INT && + if (GET_CODE (op1) == CONST_INT && /* covered by btsti x,31. */ INTVAL (op1) != 0 && ! CONST_OK_FOR_J (INTVAL (op1))) @@ -663,7 +663,7 @@ mcore_gen_compare (enum rtx_code code, rtx op0, rtx op1) code = LEU; invert = true; /* FALLTHRU */ - + case LEU: /* Use normal condition, reversed cmphs. */ if (GET_CODE (op1) == CONST_INT && INTVAL (op1) != 0) op1 = force_reg (SImode, op1); @@ -673,7 +673,7 @@ mcore_gen_compare (enum rtx_code code, rtx op0, rtx op1) code = GEU; invert = true; /* FALLTHRU */ - + case GEU: /* Use normal condition, cmphs. */ if (GET_CODE (op1) == CONST_INT && INTVAL (op1) != 0) op1 = force_reg (SImode, op1); @@ -712,13 +712,13 @@ mcore_output_call (rtx operands[], int index) { static char buffer[20]; rtx addr = operands [index]; - + if (REG_P (addr)) { if (TARGET_CG_DATA) { gcc_assert (mcore_current_function_name); - + ASM_OUTPUT_CG_EDGE (asm_out_file, mcore_current_function_name, "unknown", 1); } @@ -731,11 +731,11 @@ mcore_output_call (rtx operands[], int index) { gcc_assert (mcore_current_function_name); gcc_assert (GET_CODE (addr) == SYMBOL_REF); - + ASM_OUTPUT_CG_EDGE (asm_out_file, mcore_current_function_name, XSTR (addr, 0), 0); } - + sprintf (buffer, "jbsr\t%%%d", index); } @@ -749,15 +749,15 @@ const_ok_for_mcore (HOST_WIDE_INT value) { if (value >= 0 && value <= 127) return 1; - + /* Try exact power of two. */ if (CONST_OK_FOR_M (value)) return 1; - + /* Try exact power of two - 1. 
*/ if (CONST_OK_FOR_N (value) && value != -1) return 1; - + return 0; } @@ -767,7 +767,7 @@ int mcore_const_ok_for_inline (HOST_WIDE_INT value) { HOST_WIDE_INT x, y; - + return try_constant_tricks (value, & x, & y) > 0; } @@ -778,12 +778,12 @@ mcore_const_trick_uses_not (HOST_WIDE_INT value) { HOST_WIDE_INT x, y; - return try_constant_tricks (value, & x, & y) == 2; -} + return try_constant_tricks (value, & x, & y) == 2; +} /* Try tricks to load a constant inline and return the trick number if success (0 is non-inlinable). - + 0: not inlinable 1: single instruction (do the usual thing) 2: single insn followed by a 'not' @@ -805,8 +805,8 @@ try_constant_tricks (HOST_WIDE_INT value, HOST_WIDE_INT * x, HOST_WIDE_INT * y) if (const_ok_for_mcore (value)) return 1; /* Do the usual thing. */ - - if (! TARGET_HARDLIT) + + if (! TARGET_HARDLIT) return 0; if (const_ok_for_mcore (~value)) @@ -912,13 +912,13 @@ try_constant_tricks (HOST_WIDE_INT value, HOST_WIDE_INT * x, HOST_WIDE_INT * y) return 11; } - + return 0; } /* Check whether reg is dead at first. This is done by searching ahead for either the next use (i.e., reg is live), a death note, or a set of - reg. Don't just use dead_or_set_p() since reload does not always mark + reg. Don't just use dead_or_set_p() since reload does not always mark deaths (especially if PRESERVE_DEATH_NOTES_REGNO_P is not defined). We can ignore subregs by extracting the actual register. BRC */ @@ -1032,11 +1032,11 @@ mcore_output_bseti (rtx dst, int mask) if ((mask & 0x1) == 0x1) { out_operands[1] = GEN_INT (bit); - + output_asm_insn ("bseti\t%0,%1", out_operands); } mask >>= 1; - } + } return ""; } @@ -1056,12 +1056,12 @@ mcore_output_bclri (rtx dst, int mask) if ((mask & 0x1) == 0x0) { out_operands[1] = GEN_INT (bit); - + output_asm_insn ("bclri\t%0,%1", out_operands); } - + mask >>= 1; - } + } return ""; } @@ -1098,7 +1098,7 @@ mcore_output_cmov (rtx operands[], int cmp_t, const char * test) /* First output the test if folded into the pattern. */ - if (test) + if (test) output_asm_insn (test, operands); /* Load the constant - for now, only support constants that can be @@ -1111,7 +1111,7 @@ mcore_output_cmov (rtx operands[], int cmp_t, const char * test) output_asm_insn ("bgeni\t%0,%P1", out_operands); else if (CONST_OK_FOR_N (load_value)) output_asm_insn ("bmaski\t%0,%N1", out_operands); - + /* Output the constant adjustment. */ if (load_value > adjust_value) { @@ -1131,7 +1131,7 @@ mcore_output_cmov (rtx operands[], int cmp_t, const char * test) return ""; } -/* Outputs the peephole for moving a constant that gets not'ed followed +/* Outputs the peephole for moving a constant that gets not'ed followed by an and (i.e. combine the not and the and into andn). BRC */ const char * @@ -1152,15 +1152,15 @@ mcore_output_andn (rtx insn ATTRIBUTE_UNUSED, rtx operands[]) if (x >= 0 && x <= 127) load_op = "movi\t%0,%1"; - + /* Try exact power of two. */ else if (CONST_OK_FOR_M (x)) load_op = "bgeni\t%0,%P1"; - + /* Try exact power of two - 1. */ else if (CONST_OK_FOR_N (x)) load_op = "bmaski\t%0,%N1"; - + else { load_op = "BADMOVI-andn\t%0, %1"; @@ -1193,14 +1193,14 @@ output_inline_const (machine_mode mode, rtx operands[]) turned into lrw's. Our caller uses try_constant_tricks to back off to an lrw rather than calling this routine. */ gcc_assert (trick_no != 0); - + if (trick_no == 1) x = value; /* operands: 0 = dst, 1 = load immed., 2 = immed. adjustment. 
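The mcore constant code above picks a single instruction for an immediate when one exists (movi for 0..127, bgeni for an exact power of two, bmaski for a power of two minus one) and otherwise tries tricks such as loading the complement and applying a not, which is trick 2 in try_constant_tricks. A sketch of the first two trick levels, with the predicate bodies reconstructed from the comments rather than taken from the real CONST_OK_FOR_* macros:

#include <cassert>

/* Can VAL be materialized with one instruction?  Mirrors the checks
   described above: movi for 0..127, bgeni for an exact power of two,
   bmaski for a power of two minus one (but not -1).  */
static bool
loadable_in_one_insn (long long val)
{
  if (val >= 0 && val <= 127)
    return true;                                   /* movi   */
  if (val > 0 && (val & (val - 1)) == 0)
    return true;                                   /* bgeni  */
  if (val > 0 && ((val + 1) & val) == 0)
    return true;                                   /* bmaski */
  return false;
}

/* Trick numbering as in try_constant_tricks: 1 = single insn,
   2 = single insn followed by a not, 0 = give up (simplified; the
   real routine knows several more tricks).  */
static int
classify_constant (long long val)
{
  if (loadable_in_one_insn (val))
    return 1;
  if (loadable_in_one_insn (~val))
    return 2;
  return 0;
}

int
main ()
{
  assert (classify_constant (100) == 1);      /* movi 100            */
  assert (classify_constant (0x1000) == 1);   /* bgeni, 1 << 12      */
  /* ~(-129) == 128, a power of two, so "load 128; not" works.  */
  assert (classify_constant (-129) == 2);
  return 0;
}
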
*/ out_operands[0] = operands[0]; out_operands[1] = GEN_INT (x); - + if (trick_no > 2) out_operands[2] = GEN_INT (y); @@ -1212,20 +1212,20 @@ output_inline_const (machine_mode mode, rtx operands[]) if (x >= 0 && x <= 127) sprintf (load_op, "movi\t%s,%%1", dst_fmt); - + /* Try exact power of two. */ else if (CONST_OK_FOR_M (x)) sprintf (load_op, "bgeni\t%s,%%P1", dst_fmt); - + /* Try exact power of two - 1. */ else if (CONST_OK_FOR_N (x)) sprintf (load_op, "bmaski\t%s,%%N1", dst_fmt); - + else { sprintf (load_op, "BADMOVI-inline_const %s, %%1", dst_fmt); gcc_unreachable (); - } + } switch (trick_no) { @@ -1266,7 +1266,7 @@ output_inline_const (machine_mode mode, rtx operands[]) default: return ""; } - + output_asm_insn (buf, out_operands); return ""; @@ -1284,15 +1284,15 @@ mcore_output_move (rtx insn ATTRIBUTE_UNUSED, rtx operands[], if (GET_CODE (dst) == REG) { if (GET_CODE (src) == REG) - { + { if (REGNO (src) == CC_REG) /* r-c */ - return "mvc\t%0"; - else + return "mvc\t%0"; + else return "mov\t%0,%1"; /* r-r*/ } else if (GET_CODE (src) == MEM) { - if (GET_CODE (XEXP (src, 0)) == LABEL_REF) + if (GET_CODE (XEXP (src, 0)) == LABEL_REF) return "lrw\t%0,[%1]"; /* a-R */ else switch (GET_MODE (src)) /* r-m */ @@ -1310,7 +1310,7 @@ mcore_output_move (rtx insn ATTRIBUTE_UNUSED, rtx operands[], else if (GET_CODE (src) == CONST_INT) { HOST_WIDE_INT x, y; - + if (CONST_OK_FOR_I (INTVAL (src))) /* r-I */ return "movi\t%0,%1"; else if (CONST_OK_FOR_M (INTVAL (src))) /* r-M */ @@ -1319,7 +1319,7 @@ mcore_output_move (rtx insn ATTRIBUTE_UNUSED, rtx operands[], return "bmaski\t%0,%N1\t// %1 %x1"; else if (try_constant_tricks (INTVAL (src), &x, &y)) /* R-P */ return output_inline_const (SImode, operands); /* 1-2 insns */ - else + else return "lrw\t%0,%x1\t// %1"; /* Get it from literal pool. */ } else @@ -1357,7 +1357,7 @@ mcore_output_movedouble (rtx operands[], machine_mode mode ATTRIBUTE_UNUSED) { int dstreg = REGNO (dst); int srcreg = REGNO (src); - + /* Ensure the second source not overwritten. */ if (srcreg + 1 == dstreg) return "mov %R0,%R1\n\tmov %0,%1"; @@ -1369,10 +1369,10 @@ mcore_output_movedouble (rtx operands[], machine_mode mode ATTRIBUTE_UNUSED) rtx memexp = XEXP (src, 0); int dstreg = REGNO (dst); int basereg = -1; - + if (GET_CODE (memexp) == LABEL_REF) return "lrw\t%0,[%1]\n\tlrw\t%R0,[%R1]"; - else if (GET_CODE (memexp) == REG) + else if (GET_CODE (memexp) == REG) basereg = REGNO (memexp); else if (GET_CODE (memexp) == PLUS) { @@ -1391,7 +1391,7 @@ mcore_output_movedouble (rtx operands[], machine_mode mode ATTRIBUTE_UNUSED) { /* Just load them in reverse order. */ return "ldw\t%R0,%R1\n\tldw\t%0,%1"; - + /* XXX: alternative: move basereg to basereg+1 and then fall through. */ } @@ -1449,7 +1449,7 @@ mcore_arith_S_operand (rtx op) { if (GET_CODE (op) == CONST_INT && CONST_OK_FOR_M (~INTVAL (op))) return 1; - + return 0; } @@ -1484,7 +1484,7 @@ mcore_expand_insv (rtx operands[]) gen_rtx_IOR (SImode, operands[0], GEN_INT (mask)))); } - + return 1; } @@ -1496,7 +1496,7 @@ mcore_expand_insv (rtx operands[]) if (width == 8 && posn % 8 == 0) /* Byte sized and aligned; let caller break it up. */ return 0; - + if (width == 16 && posn % 16 == 0) /* Short sized and aligned; let caller break it up. */ return 0; @@ -1539,7 +1539,7 @@ mcore_expand_insv (rtx operands[]) bits. 
*/ if (width + posn != (int) GET_MODE_SIZE (SImode)) { - ereg = force_reg (SImode, GEN_INT ((1 << width) - 1)); + ereg = force_reg (SImode, GEN_INT ((1 << width) - 1)); emit_insn (gen_rtx_SET (sreg, gen_rtx_AND (SImode, sreg, ereg))); } @@ -1547,7 +1547,7 @@ mcore_expand_insv (rtx operands[]) if (posn != 0) emit_insn (gen_rtx_SET (sreg, gen_rtx_ASHIFT (SImode, sreg, GEN_INT (posn)))); - + emit_insn (gen_rtx_SET (operands[0], gen_rtx_IOR (SImode, operands[0], sreg))); @@ -1630,7 +1630,7 @@ block_move_sequence (rtx dst_mem, rtx src_mem, int size, int align) if (active[phase]) { active[phase] = false; - + x = adjust_address (dst_mem, mode[phase], offset_st); emit_insn (gen_rtx_SET (x, temp[phase])); @@ -1712,11 +1712,11 @@ layout_mcore_frame (struct mcore_frame * infp) /* Might have to spill bytes to re-assemble a big argument that was passed partially in registers and partially on the stack. */ nbytes = crtl->args.pretend_args_size; - + /* Determine how much space for spilled anonymous args (e.g., stdarg). */ if (current_function_anonymous_args) nbytes += (NPARM_REGS - number_of_regs_before_varargs) * UNITS_PER_WORD; - + infp->arg_size = nbytes; /* How much space to save non-volatile registers we stomp. */ @@ -1730,7 +1730,7 @@ layout_mcore_frame (struct mcore_frame * infp) /* Make sure we have a whole number of words for the locals. */ if (infp->local_size % STACK_BYTES) infp->local_size = (infp->local_size + STACK_BYTES - 1) & ~ (STACK_BYTES -1); - + /* Only thing we know we have to pad is the outbound space, since we've aligned our locals assuming that base of locals is aligned. */ infp->pad_local = 0; @@ -1765,23 +1765,23 @@ layout_mcore_frame (struct mcore_frame * infp) step = localregarg + infp->pad_reg; infp->reg_offset = infp->local_size; - + if (outbounds + step <= ADDI_REACH && !frame_pointer_needed) { step += outbounds; infp->reg_offset += outbounds; outbounds = 0; } - + infp->arg_offset = step - 4; infp->growth[growths++] = step; infp->reg_growth = growths; infp->local_growth = growths; - + /* If we haven't already folded it in. */ if (outbounds) infp->growth[growths++] = outbounds; - + goto finish; } @@ -1803,7 +1803,7 @@ layout_mcore_frame (struct mcore_frame * infp) step = ADDI_REACH; /* As much up front as we can. */ if (step > all) step = all; - + /* XXX: Consider whether step will still be aligned; we believe so. */ infp->arg_offset = step - 4; infp->growth[growths++] = step; @@ -1829,7 +1829,7 @@ layout_mcore_frame (struct mcore_frame * infp) /* Finish off if we need to do so. */ if (outbounds) infp->growth[growths++] = outbounds; - + goto finish; } @@ -1845,28 +1845,28 @@ layout_mcore_frame (struct mcore_frame * infp) if (infp->local_size % STACK_BYTES) infp->pad_local = STACK_BYTES - (infp->local_size % STACK_BYTES); - + step = infp->local_size + infp->pad_local; - + if (!frame_pointer_needed) { step += outbounds; outbounds = 0; } - + infp->growth[growths++] = step; infp->local_growth = growths; /* If there's any left to be done. */ if (outbounds) infp->growth[growths++] = outbounds; - + goto finish; } /* XXX: optimizations that we'll want to play with.... -- regarg is not aligned, but it's a small number of registers; - use some of localsize so that regarg is aligned and then + use some of localsize so that regarg is aligned and then save the registers. */ /* Simple encoding; plods down the stack buying the pieces as it goes. @@ -1875,27 +1875,27 @@ layout_mcore_frame (struct mcore_frame * infp) -- but it is safe for all alignments. 
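mcore_expand_insv, part of the hunk above, synthesizes a bit-field store from scalar operations: mask the source down to the field width, shift it to the field position, and IOR it into a destination whose field has been cleared. The same sequence as a plain helper, assuming a 32-bit word (the field-clearing AND is folded in here; in the expander it is a separate insn emitted earlier):

#include <cassert>
#include <cstdint>

/* Insert the low WIDTH bits of VALUE into WORD at bit position POSN
   using the mask/shift/or sequence from mcore_expand_insv.  */
static uint32_t
insert_bitfield (uint32_t word, uint32_t value, int posn, int width)
{
  uint32_t mask = (width == 32) ? ~0u : ((1u << width) - 1);
  word &= ~(mask << posn);          /* clear the destination field   */
  value &= mask;                    /* keep only WIDTH source bits   */
  return word | (value << posn);    /* shift into place and merge    */
}

int
main ()
{
  /* Store the 4-bit value 0xA at bit 8 of 0xFFFF00FF.  */
  assert (insert_bitfield (0xFFFF00FF, 0xA, 8, 4) == 0xFFFF0AFF);
  /* A full-width insert simply replaces the word.  */
  assert (insert_bitfield (0x12345678, 0xDEADBEEF, 0, 32) == 0xDEADBEEF);
  return 0;
}
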
*/ if (regarg % STACK_BYTES != 0) infp->pad_reg = STACK_BYTES - (regarg % STACK_BYTES); - + infp->growth[growths++] = infp->arg_size + infp->reg_size + infp->pad_reg; infp->reg_growth = growths; infp->arg_offset = infp->growth[0] - 4; infp->reg_offset = 0; - + if (frame_pointer_needed) { if (infp->local_size % STACK_BYTES != 0) infp->pad_local = STACK_BYTES - (infp->local_size % STACK_BYTES); - + infp->growth[growths++] = infp->local_size + infp->pad_local; infp->local_growth = growths; - + infp->growth[growths++] = outbounds; } else { if ((infp->local_size + outbounds) % STACK_BYTES != 0) infp->pad_local = STACK_BYTES - ((infp->local_size + outbounds) % STACK_BYTES); - + infp->growth[growths++] = infp->local_size + infp->pad_local + outbounds; infp->local_growth = growths; } @@ -1904,7 +1904,7 @@ layout_mcore_frame (struct mcore_frame * infp) finish: gcc_assert (infp->reg_offset >= 0); gcc_assert (growths <= MAX_STACK_GROWS); - + for (i = 0; i < growths; i++) gcc_assert (!(infp->growth[i] % STACK_BYTES)); } @@ -1956,12 +1956,12 @@ mcore_setup_incoming_varargs (cumulative_args_t args_so_far_v, number_of_regs_before_varargs = *args_so_far; if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl))) number_of_regs_before_varargs += mcore_num_arg_regs (arg.mode, arg.type); - + /* There is a bug somewhere in the arg handling code. Until I can find it this workaround always pushes the last named argument onto the stack. */ number_of_regs_before_varargs = *args_so_far; - + /* The last named argument may be split between argument registers and the stack. Allow for this here. */ if (number_of_regs_before_varargs > NPARM_REGS) @@ -1977,7 +1977,7 @@ mcore_expand_prolog (void) /* Find out what we're doing. */ layout_mcore_frame (&fi); - + space_allocated = fi.arg_size + fi.reg_size + fi.local_size + fi.outbound_size + fi.pad_outbound + fi.pad_local + fi.pad_reg; @@ -1987,17 +1987,17 @@ mcore_expand_prolog (void) rtx x; x = DECL_RTL (current_function_decl); - + gcc_assert (GET_CODE (x) == MEM); - + x = XEXP (x, 0); - + gcc_assert (GET_CODE (x) == SYMBOL_REF); - + free (mcore_current_function_name); - + mcore_current_function_name = xstrdup (XSTR (x, 0)); - + ASM_OUTPUT_CG_NODE (asm_out_file, mcore_current_function_name, space_allocated); if (cfun->calls_alloca) @@ -2017,7 +2017,7 @@ mcore_expand_prolog (void) if (mcore_naked_function_p ()) return; - + /* Handle stdarg+regsaves in one shot: can't be more than 64 bytes. */ output_stack_adjust (-1, fi.growth[growth++]); /* Grows it. */ @@ -2048,7 +2048,7 @@ mcore_expand_prolog (void) { int i; int offs = fi.reg_offset; - + for (i = 15; i >= 0; i--) { if (offs == 0 && i == 15 && ((fi.reg_mask & 0xc000) == 0xc000)) @@ -2084,7 +2084,7 @@ mcore_expand_prolog (void) /* If we haven't already purchased to 'fp'. */ if (growth < fi.local_growth) output_stack_adjust (-1, fi.growth[growth++]); /* Grows it. */ - + emit_insn (gen_movsi (frame_pointer_rtx, stack_pointer_rtx)); /* ... and then go any remaining distance for outbounds, etc. */ @@ -2108,7 +2108,7 @@ mcore_expand_epilog (void) int offs; int growth = MAX_STACK_GROWS - 1 ; - + /* Find out what we're doing. */ layout_mcore_frame(&fi); @@ -2137,9 +2137,9 @@ mcore_expand_epilog (void) register save information back off the stack. 
*/ while (growth >= fi.reg_growth) output_stack_adjust ( 1, fi.growth[growth--]); - + offs = fi.reg_offset; - + for (i = 15; i >= 0; i--) { if (offs == 0 && i == 15 && ((fi.reg_mask & 0xc000) == 0xc000)) @@ -2148,10 +2148,10 @@ mcore_expand_epilog (void) /* Find the starting register. */ first_reg = 15; - + while (fi.reg_mask & (1 << first_reg)) first_reg--; - + first_reg++; emit_insn (gen_load_multiple (gen_rtx_REG (SImode, first_reg), @@ -2257,16 +2257,16 @@ mcore_output_jump_label_table (void) if (pool_size) { fprintf (asm_out_file, "\t.align 2\n"); - + for (i = 0; i < pool_size; i++) { pool_node * p = pool_vector + i; (*targetm.asm_out.internal_label) (asm_out_file, "L", CODE_LABEL_NUMBER (p->label)); - + output_asm_insn (".long %0", &p->value); } - + pool_size = 0; } @@ -2279,7 +2279,7 @@ static cond_type is_cond_candidate (rtx insn) { /* The only things we conditionalize are those that can be directly - changed into a conditional. Only bother with SImode items. If + changed into a conditional. Only bother with SImode items. If we wanted to be a little more aggressive, we could also do other modes such as DImode with reg-reg move or load 0. */ if (NONJUMP_INSN_P (insn)) @@ -2296,7 +2296,7 @@ is_cond_candidate (rtx insn) GET_CODE (dst) != SUBREG) || GET_MODE (dst) != SImode) return COND_NO; - + src = XEXP (pat, 1); if ((GET_CODE (src) == REG || @@ -2304,7 +2304,7 @@ is_cond_candidate (rtx insn) GET_CODE (SUBREG_REG (src)) == REG)) && GET_MODE (src) == SImode) return COND_MOV_INSN; - else if (GET_CODE (src) == CONST_INT && + else if (GET_CODE (src) == CONST_INT && INTVAL (src) == 0) return COND_CLR_INSN; else if (GET_CODE (src) == PLUS && @@ -2330,7 +2330,7 @@ is_cond_candidate (rtx insn) /* Some insns that we don't bother with: (set (rx:DI) (ry:DI)) (set (rx:DI) (const_int 0)) - */ + */ } else if (JUMP_P (insn) @@ -2369,7 +2369,7 @@ emit_new_cond_insn (rtx_insn *insn, int cond) switch (num) { - case COND_MOV_INSN: + case COND_MOV_INSN: case COND_CLR_INSN: if (cond) c_insn = gen_movt0 (dst, src, dst); @@ -2383,7 +2383,7 @@ emit_new_cond_insn (rtx_insn *insn, int cond) else c_insn = gen_incscc_false (dst, dst); break; - + case COND_DEC_INSN: if (cond) c_insn = gen_decscc (dst, dst); @@ -2411,7 +2411,7 @@ emit_new_cond_insn (rtx_insn *insn, int cond) used any more beyond this point for the mcore). */ REG_NOTES (c_insn) = REG_NOTES (insn); } - + if (num == COND_BRANCH_INSN) { /* For jumps, we need to be a little bit careful and emit the new jump @@ -2419,32 +2419,32 @@ emit_new_cond_insn (rtx_insn *insn, int cond) This way, the barrier following the old (uncond) jump will get deleted, but the label won't. */ c_insn = emit_jump_insn_before (c_insn, insn); - + ++ LABEL_NUSES (dst); - + JUMP_LABEL (c_insn) = dst; } else c_insn = emit_insn_after (c_insn, insn); delete_insn (insn); - + return as_a <rtx_insn *> (c_insn); } /* Attempt to change a basic block into a series of conditional insns. This - works by taking the branch at the end of the 1st block and scanning for the + works by taking the branch at the end of the 1st block and scanning for the end of the 2nd block. If all instructions in the 2nd block have cond. versions and the label at the start of block 3 is the same as the target from the branch at block 1, then conditionalize all insn in block 2 using the inverse condition of the branch at block 1. (Note I'm bending the definition of basic block here.) 
- e.g., change: + e.g., change: bt L2 <-- end of block 1 (delete) - mov r7,r8 - addu r7,1 + mov r7,r8 + addu r7,1 br L3 <-- end of block 2 L2: ... <-- start of block 3 (NUSES==1) @@ -2473,7 +2473,7 @@ conditionalize_block (rtx_insn *first) int br_lab_num; int blk_size = 0; - + /* Check that the first insn is a candidate conditional jump. This is the one that we'll eliminate. If not, advance to the next insn to try. */ @@ -2506,12 +2506,12 @@ conditionalize_block (rtx_insn *first) /* Scan forward for the start of block 2: it must start with a label and that label must be the same as the branch target label from block 1. We don't care about whether block 2 actually - ends with a branch or a label (an uncond. branch is + ends with a branch or a label (an uncond. branch is conditionalizable). */ for (insn = NEXT_INSN (first); insn; insn = NEXT_INSN (insn)) { enum rtx_code code; - + code = GET_CODE (insn); /* Look for the label at the start of block 3. */ @@ -2523,7 +2523,7 @@ conditionalize_block (rtx_insn *first) just return the next insn so we can start over from that point. */ if (code != BARRIER && code != NOTE && !is_cond_candidate (insn)) return NEXT_INSN (insn); - + /* Remember the last real insn before the label (i.e. end of block 2). */ if (code == JUMP_INSN || code == INSN) { @@ -2534,16 +2534,16 @@ conditionalize_block (rtx_insn *first) if (!insn) return insn; - - /* It is possible for this optimization to slow performance if the blocks - are long. This really depends upon whether the branch is likely taken + + /* It is possible for this optimization to slow performance if the blocks + are long. This really depends upon whether the branch is likely taken or not. If the branch is taken, we slow performance in many cases. But, - if the branch is not taken, we always help performance (for a single - block, but for a double block (i.e. when the optimization is re-applied) + if the branch is not taken, we always help performance (for a single + block, but for a double block (i.e. when the optimization is re-applied) this is not true since the 'right thing' depends on the overall length of - the collapsed block). As a compromise, don't apply this optimization on + the collapsed block). As a compromise, don't apply this optimization on blocks larger than size 2 (unlikely for the mcore) when speed is important. - the best threshold depends on the latencies of the instructions (i.e., + the best threshold depends on the latencies of the instructions (i.e., the branch penalty). */ if (optimize > 1 && blk_size > 2) return insn; @@ -2552,16 +2552,16 @@ conditionalize_block (rtx_insn *first) it is the destination of the branch from block 1. Also, all instructions in the block 2 are conditionalizable. So, apply the conditionalization and delete the branch. */ - start_blk_3_lab = insn; - - for (insn = NEXT_INSN (end_blk_1_br); insn != start_blk_3_lab; + start_blk_3_lab = insn; + + for (insn = NEXT_INSN (end_blk_1_br); insn != start_blk_3_lab; insn = NEXT_INSN (insn)) { rtx_insn *newinsn; if (insn->deleted ()) continue; - + /* Try to form a conditional variant of the instruction and emit it. */ if ((newinsn = emit_new_cond_insn (insn, cond))) { @@ -2573,7 +2573,7 @@ conditionalize_block (rtx_insn *first) } /* Note whether we will delete the label starting blk 3 when the jump - gets deleted. If so, we want to re-apply this optimization at the + gets deleted. If so, we want to re-apply this optimization at the last real instruction right before the label. 
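The conditionalization pass above only converts blocks of at most two instructions when optimizing for speed, because a conditionalized block always executes: it wins when the branch would have fallen through and loses when the branch would have been taken. A rough expected-cycle comparison under purely illustrative latencies (one cycle per instruction and a made-up two-cycle taken-branch penalty, not real mcore timings):

#include <cstdio>

/* Expected cycles with the original branch: taken pays the penalty and
   skips the block; not taken executes the block plus the trailing
   unconditional jump.  */
static double
cycles_with_branch (int blk_size, double p_taken, int branch_penalty)
{
  return p_taken * branch_penalty + (1.0 - p_taken) * (blk_size + 1);
}

/* Expected cycles after conditionalization: the block's insns always
   execute (as conditional insns) and the branch is gone.  */
static double
cycles_conditionalized (int blk_size)
{
  return blk_size;
}

int
main ()
{
  const int penalty = 2;   /* illustrative only */
  for (int blk_size = 1; blk_size <= 4; blk_size++)
    for (double p = 0.0; p <= 1.0; p += 0.5)
      printf ("blk=%d p(taken)=%.1f  branch=%.1f  cond=%.1f\n",
              blk_size, p,
              cycles_with_branch (blk_size, p, penalty),
              cycles_conditionalized (blk_size));
  return 0;
}

Under these made-up numbers a block of one or two instructions never loses, while larger blocks lose once the branch is taken often enough, which is the intuition behind the blk_size > 2 cutoff above.
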
*/ if (LABEL_NUSES (start_blk_3_lab) == 1) { @@ -2588,7 +2588,7 @@ conditionalize_block (rtx_insn *first) if (! start_blk_3_lab) return end_blk_2_insn; - + /* Return the insn right after the label at the start of block 3. */ return NEXT_INSN (start_blk_3_lab); } @@ -2597,8 +2597,8 @@ conditionalize_block (rtx_insn *first) outer loop that traverses through the insns scanning for a branch that signifies an opportunity to apply the optimization. Note that this optimization is applied late. If we could apply it earlier, - say before cse 2, it may expose more optimization opportunities. - but, the pay back probably isn't really worth the effort (we'd have + say before cse 2, it may expose more optimization opportunities. + but, the pay back probably isn't really worth the effort (we'd have to update all reg/flow/notes/links/etc to make it work - and stick it in before cse 2). */ @@ -2618,10 +2618,10 @@ mcore_reorg (void) { /* Reset this variable. */ current_function_anonymous_args = 0; - + if (optimize == 0) return; - + /* Conditionalize blocks where we can. */ conditionalize_optimization (); @@ -2687,7 +2687,7 @@ mcore_is_same_reg (rtx x, rtx y) /* Strip any and all of the subreg wrappers. */ while (GET_CODE (x) == SUBREG) x = SUBREG_REG (x); - + while (GET_CODE (y) == SUBREG) y = SUBREG_REG (y); @@ -2706,7 +2706,7 @@ mcore_option_override (void) } -/* Compute the number of word sized registers needed to +/* Compute the number of word sized registers needed to hold a function argument of mode MODE and type TYPE. */ int @@ -2745,11 +2745,11 @@ handle_structs_in_regs (machine_mode mode, const_tree type, int reg) && (size % UNITS_PER_WORD != 0) && (reg + mcore_num_arg_regs (mode, type) <= (FIRST_PARM_REG + NPARM_REGS))) { - rtx arg_regs [NPARM_REGS]; + rtx arg_regs [NPARM_REGS]; int nregs; rtx result; rtvec rtvec; - + for (nregs = 0; size > 0; size -= UNITS_PER_WORD) { arg_regs [nregs] = @@ -2762,11 +2762,11 @@ handle_structs_in_regs (machine_mode mode, const_tree type, int reg) gcc_assert (ARRAY_SIZE (arg_regs) == 6); rtvec = gen_rtvec (nregs, arg_regs[0], arg_regs[1], arg_regs[2], arg_regs[3], arg_regs[4], arg_regs[5]); - + result = gen_rtx_PARALLEL (mode, rtvec); return result; } - + return gen_rtx_REG (mode, reg); } @@ -2775,12 +2775,12 @@ mcore_function_value (const_tree valtype, const_tree func) { machine_mode mode; int unsigned_p; - + mode = TYPE_MODE (valtype); /* Since we promote return types, we must promote the mode here too. */ mode = promote_function_mode (valtype, mode, &unsigned_p, func, 1); - + return handle_structs_in_regs (mode, valtype, FIRST_RET_REG); } @@ -2801,7 +2801,7 @@ static rtx mcore_function_arg (cumulative_args_t cum, const function_arg_info &arg) { int arg_reg; - + if (!arg.named || arg.end_marker_p ()) return 0; @@ -2809,7 +2809,7 @@ mcore_function_arg (cumulative_args_t cum, const function_arg_info &arg) return 0; arg_reg = ROUND_REG (*get_cumulative_args (cum), arg.mode); - + if (arg_reg < NPARM_REGS) return handle_structs_in_regs (arg.mode, arg.type, FIRST_PARM_REG + arg_reg); @@ -2852,7 +2852,7 @@ mcore_arg_partial_bytes (cumulative_args_t cum, const function_arg_info &arg) if (targetm.calls.must_pass_in_stack (arg)) return 0; - + /* REG is not the *hardware* register number of the register that holds the argument, it is the *argument* register number. 
So for example, the first argument to a function goes in argument register 0, which @@ -2904,12 +2904,12 @@ mcore_mark_dllexport (tree decl) tree idp; rtlname = XEXP (DECL_RTL (decl), 0); - + if (GET_CODE (rtlname) == MEM) rtlname = XEXP (rtlname, 0); gcc_assert (GET_CODE (rtlname) == SYMBOL_REF); oldname = XSTR (rtlname, 0); - + if (mcore_dllexport_name_p (oldname)) return; /* Already done. */ @@ -2939,12 +2939,12 @@ mcore_mark_dllimport (tree decl) rtx newrtl; rtlname = XEXP (DECL_RTL (decl), 0); - + if (GET_CODE (rtlname) == MEM) rtlname = XEXP (rtlname, 0); gcc_assert (GET_CODE (rtlname) == SYMBOL_REF); oldname = XSTR (rtlname, 0); - + gcc_assert (!mcore_dllexport_name_p (oldname)); if (mcore_dllimport_name_p (oldname)) return; /* Already done. */ @@ -2960,7 +2960,7 @@ mcore_mark_dllimport (tree decl) error ("initialized variable %q+D is marked dllimport", decl); return; } - + /* `extern' needn't be specified with dllimport. Specify `extern' now and hope for the best. Sigh. */ if (VAR_P (decl) @@ -3019,7 +3019,7 @@ mcore_encode_section_info (tree decl, rtx rtl ATTRIBUTE_UNUSED, int first ATTRIB mcore_mark_dllexport (decl); else if (mcore_dllimport_p (decl)) mcore_mark_dllimport (decl); - + /* It might be that DECL has already been marked as dllimport, but a subsequent definition nullified that. The attribute is gone but DECL_RTL still has @i.__imp_foo. We need to remove that. */ @@ -3084,7 +3084,7 @@ mcore_unique_section (tree decl, int reloc ATTRIBUTE_UNUSED) const char * prefix; name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); - + /* Strip off any encoding in name. */ name = (* targetm.strip_name_encoding) (name); @@ -3099,10 +3099,10 @@ mcore_unique_section (tree decl, int reloc ATTRIBUTE_UNUSED) prefix = ".rdata$"; else prefix = ".data$"; - + len = strlen (name) + strlen (prefix); string = XALLOCAVEC (char, len + 1); - + sprintf (string, "%s%s", prefix, name); set_decl_section_name (decl, string); @@ -3124,7 +3124,7 @@ mcore_warn_func_return (tree decl) #ifdef OBJECT_FORMAT_ELF static void -mcore_asm_named_section (const char *name, +mcore_asm_named_section (const char *name, unsigned int flags ATTRIBUTE_UNUSED, tree decl ATTRIBUTE_UNUSED) { @@ -3214,13 +3214,13 @@ mcore_reg_ok_for_base_p (const_rtx reg, bool strict_p) static bool mcore_base_register_rtx_p (const_rtx x, bool strict_p) { - return REG_P(x) && mcore_reg_ok_for_base_p (x, strict_p); + return REG_P(x) && mcore_reg_ok_for_base_p (x, strict_p); } /* A legitimate index for a QI is 0..15, for HI is 0..30, for SI is 0..60, and for DI is 0..56 because we use two SI loads, etc. */ -static bool +static bool mcore_legitimate_index_p (machine_mode mode, const_rtx op) { if (CONST_INT_P (op)) @@ -3237,11 +3237,11 @@ mcore_legitimate_index_p (machine_mode mode, const_rtx op) if (GET_MODE_SIZE (mode) == 1 && ((unsigned HOST_WIDE_INT) INTVAL (op)) <= 15) return true; - } + } return false; } - + /* Worker function for TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P. Allow REG diff --git a/gcc/config/mcore/mcore.h b/gcc/config/mcore/mcore.h index 36dc860..17502e0 100644 --- a/gcc/config/mcore/mcore.h +++ b/gcc/config/mcore/mcore.h @@ -22,7 +22,7 @@ #define GCC_MCORE_H /* RBE: need to move these elsewhere. */ -#undef LIKE_PPC_ABI +#undef LIKE_PPC_ABI #define MCORE_STRUCT_ARGS /* RBE: end of "move elsewhere". */ @@ -80,7 +80,7 @@ #define TARGET_8ALIGN 1 extern char * mcore_current_function_name; - + /* Target machine storage Layout. 
*/ #define PROMOTE_MODE(MODE,UNSIGNEDP,TYPE) \ @@ -136,7 +136,7 @@ extern char * mcore_current_function_name; /* Every structures size must be a multiple of 8 bits. */ #define STRUCTURE_SIZE_BOUNDARY 8 -/* Look at the fundamental type that is used for a bit-field and use +/* Look at the fundamental type that is used for a bit-field and use that to impose alignment on the enclosing structure. struct s {int a:8}; should have same alignment as "int", not "char". */ #define PCC_BITFIELD_TYPE_MATTERS 1 @@ -150,14 +150,14 @@ extern char * mcore_current_function_name; (TREE_CODE (TYPE) == ARRAY_TYPE \ && TYPE_MODE (TREE_TYPE (TYPE)) == QImode \ && (ALIGN) < FASTEST_ALIGNMENT ? FASTEST_ALIGNMENT : (ALIGN)) - + /* Set this nonzero if move instructions will actually fail to work when given unaligned data. */ #define STRICT_ALIGNMENT 1 /* Standard register usage. */ -/* Register allocation for our first guess +/* Register allocation for our first guess r0 stack pointer r1 scratch, target reg for xtrb? @@ -333,7 +333,7 @@ extern const enum reg_class regno_reg_class[FIRST_PSEUDO_REGISTER]; but prevents the compiler from extending the lifetime of these registers. */ #define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P hook_bool_mode_true - + /* The class value for index registers, and the one for base regs. */ #define INDEX_REG_CLASS NO_REGS #define BASE_REG_CLASS GENERAL_REGS @@ -369,7 +369,7 @@ extern const enum reg_class regno_reg_class[FIRST_PSEUDO_REGISTER]; mcore_secondary_reload_class (CLASS, MODE, X) /* Return the maximum number of consecutive registers - needed to represent mode MODE in a register of class CLASS. + needed to represent mode MODE in a register of class CLASS. On MCore this is the size of MODE in words. */ #define CLASS_MAX_NREGS(CLASS, MODE) \ @@ -434,9 +434,9 @@ extern const enum reg_class regno_reg_class[FIRST_PSEUDO_REGISTER]; #define ROUND_ADVANCE(SIZE) \ ((SIZE + UNITS_PER_WORD - 1) / UNITS_PER_WORD) -/* Round a register number up to a proper boundary for an arg of mode - MODE. - +/* Round a register number up to a proper boundary for an arg of mode + MODE. + We round to an even reg for things larger than a word. */ #define ROUND_REG(X, MODE) \ ((TARGET_8ALIGN \ @@ -486,7 +486,7 @@ extern const enum reg_class regno_reg_class[FIRST_PSEUDO_REGISTER]; #define REGNO_OK_FOR_INDEX_P(REGNO) 0 -/* Maximum number of registers that can appear in a valid memory +/* Maximum number of registers that can appear in a valid memory address. */ #define MAX_REGS_PER_ADDRESS 1 @@ -587,7 +587,7 @@ extern const enum reg_class regno_reg_class[FIRST_PSEUDO_REGISTER]; reg_names[STACK_POINTER_REGNUM], \ (STACK_BOUNDARY / BITS_PER_UNIT)) - + /* Output a reference to a label. */ #undef ASM_OUTPUT_LABELREF #define ASM_OUTPUT_LABELREF(STREAM, NAME) \ @@ -614,8 +614,8 @@ extern const enum reg_class regno_reg_class[FIRST_PSEUDO_REGISTER]; 0 a call from src to dst 1 the call is special (e.g. dst is "unknown" or "alloca") 2 the call is special (e.g., the src is a table instead of routine) - - Frame sizes are augmented with timestamps to help later tools + + Frame sizes are augmented with timestamps to help later tools differentiate between static entities with same names in different files. */ extern long mcore_current_compilation_timestamp; @@ -673,7 +673,7 @@ extern long mcore_current_compilation_timestamp; /* This says how to output an assembler line to define a global common symbol, with alignment information. */ -/* XXX - for now we ignore the alignment. */ +/* XXX - for now we ignore the alignment. 
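ROUND_ADVANCE and ROUND_REG in the mcore.h hunk above describe the argument-register layout: sizes round up to whole words, and with TARGET_8ALIGN anything wider than a word starts in an even-numbered argument register. A small model of those two rules, using a fixed 4-byte word and hypothetical helper names (ROUND_REG's full definition is not shown above, so the even-register rule below is a paraphrase of its comment rather than the macro itself):

#include <cassert>

const int UNITS_PER_WORD = 4;

/* Number of argument registers a value of SIZE bytes occupies
   (the ROUND_ADVANCE computation).  */
static int
words_for_size (int size)
{
  return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
}

/* First argument register for a value of SIZE bytes when REG registers
   are already used: values wider than a word are bumped to an even
   register, as ROUND_REG's comment describes.  */
static int
first_reg_for_arg (int reg, int size)
{
  if (size > UNITS_PER_WORD)
    return (reg + 1) & ~1;
  return reg;
}

int
main ()
{
  /* A 12-byte struct needs three words ...  */
  assert (words_for_size (12) == 3);
  /* ... and, following one word-sized argument in reg 0, it is bumped
     from reg 1 to the even reg 2.  */
  assert (first_reg_for_arg (1, 12) == 2);
  /* A char still occupies a full word.  */
  assert (words_for_size (1) == 1);
  return 0;
}
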
*/ #undef ASM_OUTPUT_ALIGNED_COMMON #define ASM_OUTPUT_ALIGNED_COMMON(FILE, NAME, SIZE, ALIGN) \ do \ diff --git a/gcc/config/microblaze/microblaze-c.cc b/gcc/config/microblaze/microblaze-c.cc index e60783e..52cc82b 100644 --- a/gcc/config/microblaze/microblaze-c.cc +++ b/gcc/config/microblaze/microblaze-c.cc @@ -30,10 +30,10 @@ #define builtin_define(TXT) cpp_define (pfile, TXT) #define builtin_assert(TXT) cpp_assert (pfile, TXT) -/* Define preprocessor symbols for MicroBlaze. +/* Define preprocessor symbols for MicroBlaze. Symbols which do not start with __ are deprecated. */ -void +void microblaze_cpp_define (cpp_reader *pfile) { builtin_assert ("cpu=microblaze"); @@ -52,7 +52,7 @@ microblaze_cpp_define (cpp_reader *pfile) builtin_define ("__BIG_ENDIAN__"); builtin_define ("__MICROBLAZEEB__"); } - if (!TARGET_SOFT_MUL) + if (!TARGET_SOFT_MUL) { if (!flag_iso) builtin_define ("HAVE_HW_MUL"); @@ -100,4 +100,4 @@ microblaze_cpp_define (cpp_reader *pfile) builtin_define ("HAVE_HW_FPU_SQRT"); builtin_define ("__HAVE_HW_FPU_SQRT__"); } -} +} diff --git a/gcc/config/microblaze/microblaze-protos.h b/gcc/config/microblaze/microblaze-protos.h index ae97cc2..7a25f03a 100644 --- a/gcc/config/microblaze/microblaze-protos.h +++ b/gcc/config/microblaze/microblaze-protos.h @@ -37,7 +37,7 @@ extern bool microblaze_expand_block_move (rtx, rtx, rtx, rtx); extern void microblaze_expand_divide (rtx *); extern void microblaze_expand_conditional_branch (machine_mode, rtx *); extern void microblaze_expand_conditional_branch_reg (machine_mode, rtx *); -extern void microblaze_expand_conditional_branch_sf (rtx *); +extern void microblaze_expand_conditional_branch_sf (rtx *); extern int microblaze_can_use_return_insn (void); extern void print_operand (FILE *, rtx, int); extern void print_operand_address (FILE *, rtx); @@ -65,6 +65,6 @@ extern void microblaze_eh_return (rtx op0); #endif /* RTX_CODE */ /* Declare functions in microblaze-c.cc. */ -extern void microblaze_cpp_define (struct cpp_reader *); +extern void microblaze_cpp_define (struct cpp_reader *); #endif /* GCC_MICROBLAZE_PROTOS_H */ diff --git a/gcc/config/microblaze/microblaze.cc b/gcc/config/microblaze/microblaze.cc index 98ec611..c036969 100644 --- a/gcc/config/microblaze/microblaze.cc +++ b/gcc/config/microblaze/microblaze.cc @@ -65,8 +65,8 @@ An invalid address. ADDRESS_REG -A natural register or a register + const_int offset address. -The register satisfies microblaze_valid_base_register_p and the +A natural register or a register + const_int offset address. +The register satisfies microblaze_valid_base_register_p and the offset is a const_arith_operand. ADDRESS_REG_INDEX @@ -99,7 +99,7 @@ enum microblaze_address_type /* Classifies symbols SYMBOL_TYPE_GENERAL - + A general symbol. */ enum microblaze_symbol_type { @@ -120,7 +120,7 @@ enum tls_reloc { struct microblaze_address_info { enum microblaze_address_type type; - rtx regA; /* Contains valid values on ADDRESS_REG, ADDRESS_REG_INDEX, + rtx regA; /* Contains valid values on ADDRESS_REG, ADDRESS_REG_INDEX, ADDRESS_SYMBOLIC. */ rtx regB; /* Contains valid values on ADDRESS_REG_INDEX. */ rtx offset; /* Contains valid values on ADDRESS_CONST_INT and ADDRESS_REG. */ @@ -143,7 +143,7 @@ struct GTY(()) microblaze_frame_info { int initialized; /* != 0 if frame size already calculated. */ int num_gp; /* number of gp registers saved. */ long insns_len; /* length of insns. 
*/ - int alloc_stack; /* Flag to indicate if the current function + int alloc_stack; /* Flag to indicate if the current function must not create stack space. (As an optimization). */ }; @@ -158,18 +158,18 @@ static GTY(()) int microblaze_sched_use_dfa = 0; data area takes 2 instructions). */ int microblaze_section_threshold = -1; -/* Prevent scheduling potentially exception causing instructions in +/* Prevent scheduling potentially exception causing instructions in delay slots. -mcpu=v3.00.a or v4.00.a turns this on. */ int microblaze_no_unsafe_delay; /* Set to one if the targeted core has the CLZ insn. */ int microblaze_has_clz = 0; -/* Which CPU pipeline do we use. We haven't really standardized on a CPU - version having only a particular type of pipeline. There can still be - options on the CPU to scale pipeline features up or down. :( - Bad Presentation (??), so we let the MD file rely on the value of - this variable instead Making PIPE_5 the default. It should be backward +/* Which CPU pipeline do we use. We haven't really standardized on a CPU + version having only a particular type of pipeline. There can still be + options on the CPU to scale pipeline features up or down. :( + Bad Presentation (??), so we let the MD file rely on the value of + this variable instead Making PIPE_5 the default. It should be backward optimal with PIPE_3 MicroBlazes. */ enum pipeline_type microblaze_pipe = MICROBLAZE_PIPE_5; @@ -210,7 +210,7 @@ enum reg_class microblaze_regno_to_class[] = }; /* MicroBlaze specific machine attributes. - interrupt_handler - Interrupt handler attribute to add interrupt prologue + interrupt_handler - Interrupt handler attribute to add interrupt prologue and epilogue and use appropriate interrupt return. save_volatiles - Similar to interrupt handler, but use normal return. */ int interrupt_handler; @@ -719,8 +719,8 @@ get_base_reg (rtx x) const_int ADDRESS_REG_INDEX %0 %1 NULL NULL - ADDRESS_SYMBOLIC r0 / NULL NULL symbol - sda_base_reg + ADDRESS_SYMBOLIC r0 / NULL NULL symbol + sda_base_reg ADDRESS_CONST_INT r0 NULL const NULL @@ -1005,7 +1005,7 @@ microblaze_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, result = gen_rtx_PLUS (Pmode, ptr_reg, constant); if (SMALL_INT (constant)) return result; - /* Otherwise we fall through so the code below will fix the + /* Otherwise we fall through so the code below will fix the constant. */ xinsn = result; } @@ -1363,7 +1363,7 @@ microblaze_rtx_costs (rtx x, machine_mode mode, int outer_code ATTRIBUTE_UNUSED, *total -= 2; } else - /* Double the worst cost of shifts when there is no barrel shifter and + /* Double the worst cost of shifts when there is no barrel shifter and the shift amount is in a reg. */ *total = COSTS_N_INSNS (32 * 4); return true; @@ -1498,7 +1498,7 @@ microblaze_address_cost (rtx addr, machine_mode mode ATTRIBUTE_UNUSED, return COSTS_N_INSNS (microblaze_address_insns (addr, GET_MODE (addr))); } -/* Return nonzero if X is an address which needs a temporary register when +/* Return nonzero if X is an address which needs a temporary register when reloaded while generating PIC code. */ int @@ -1680,7 +1680,7 @@ function_arg_partial_bytes (cumulative_args_t cum_v, return 0; } -/* Convert a version number of the form "vX.YY.Z" to an integer encoding +/* Convert a version number of the form "vX.YY.Z" to an integer encoding for easier range comparison. 
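microblaze_version_to_int, whose comment closes the hunk above, flattens a version string of the form "vX.YY.Z" into one integer so that ranges such as v4.00.a .. v5.00.c can be checked with ordinary integer comparisons. One possible encoding, purely illustrative; the parsing, weighting and error handling below are assumptions, not the actual GCC implementation:

#include <cassert>
#include <cctype>
#include <cstdlib>

/* Encode "vX.YY.z" (z a lower-case letter) as X*10000 + YY*100 + (z-'a'),
   returning -1 on malformed input.  The weights are chosen only so that
   later versions compare greater.  */
static int
version_to_int (const char *version)
{
  if (version == nullptr || version[0] != 'v')
    return -1;

  char *end;
  long major = std::strtol (version + 1, &end, 10);
  if (*end != '.')
    return -1;
  long minor = std::strtol (end + 1, &end, 10);
  if (*end != '.' || !std::islower ((unsigned char) end[1]) || end[2] != '\0')
    return -1;

  return (int) (major * 10000 + minor * 100 + (end[1] - 'a'));
}

int
main ()
{
  assert (version_to_int ("v4.00.a") < version_to_int ("v5.00.c"));
  assert (version_to_int ("v10.0") == -1);   /* malformed */
  return 0;
}
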
*/ static int microblaze_version_to_int (const char *version) @@ -1794,7 +1794,7 @@ microblaze_option_override (void) } else { - /* We agree to use 5 pipe-stage model even on area optimized 3 + /* We agree to use 5 pipe-stage model even on area optimized 3 pipe-stage variants. */ #if 0 microblaze_select_flags &= ~(MICROBLAZE_MASK_NO_UNSAFE_DELAY); @@ -1807,7 +1807,7 @@ microblaze_option_override (void) || MICROBLAZE_VERSION_COMPARE (microblaze_select_cpu, "v5.00.c") == 0) { - /* Pattern compares are to be turned on by default only when + /* Pattern compares are to be turned on by default only when compiling for MB v5.00.'z'. */ target_flags |= MASK_PATTERN_COMPARE; } @@ -2039,7 +2039,7 @@ microblaze_must_save_register (int regno) if (microblaze_is_interrupt_variant ()) { - if (df_regs_ever_live_p (regno) + if (df_regs_ever_live_p (regno) || regno == MB_ABI_MSR_SAVE_REG || ((interrupt_handler || fast_interrupt) && (regno == MB_ABI_ASM_TEMP_REGNUM @@ -2109,7 +2109,7 @@ microblaze_must_save_register (int regno) */ static HOST_WIDE_INT -compute_frame_size (HOST_WIDE_INT size) +compute_frame_size (HOST_WIDE_INT size) { int regno; HOST_WIDE_INT total_size; /* # bytes that the entire frame takes up. */ @@ -2207,7 +2207,7 @@ microblaze_can_eliminate (const int from, const int to) } /* Implement INITIAL_ELIMINATION_OFFSET. FROM is either the frame - pointer or argument pointer or the return address pointer. TO is either + pointer or argument pointer or the return address pointer. TO is either the stack pointer or hard frame pointer. */ HOST_WIDE_INT @@ -2240,7 +2240,7 @@ microblaze_initial_elimination_offset (int from, int to) } /* Print operands using format code. - + The MicroBlaze specific codes are: 'X' X is CONST_INT, prints 32 bits in hexadecimal format = "0x%08x", @@ -2267,7 +2267,7 @@ microblaze_initial_elimination_offset (int from, int to) 'j' Print low word of const_double (int or float) value as hex 's' Print -1 if operand is negative, 0 if positive (sign extend) '@' Print the name of the temporary register (rMB_ABI_ASM_TEMP_REGNUM). - '#' Print nop if the delay slot of a branch is not filled. + '#' Print nop if the delay slot of a branch is not filled. */ void @@ -2463,7 +2463,7 @@ print_operand (FILE * file, rtx op, int letter) val[1] = INTVAL (op) & 0x00000000ffffffffLL; if (val[0] == 0 && val[1] < 0) val[0] = -1; - + } fprintf (file, "0x%8.8lx", (letter == 'h') ? val[0] : val[1]); } @@ -2543,19 +2543,19 @@ print_operand (FILE * file, rtx op, int letter) reference whose address is ADDR. ADDR is an RTL expression. Possible address classifications and output formats are, - + ADDRESS_REG "%0, r0" ADDRESS_REG with non-zero "%0, <addr_const>" - offset + offset - ADDRESS_REG_INDEX "rA, RB" + ADDRESS_REG_INDEX "rA, RB" (if rA is r0, rA and rB are swapped) ADDRESS_CONST_INT "r0, <addr_const>" - ADDRESS_SYMBOLIC "rBase, <addr_const>" - (rBase is a base register suitable for the + ADDRESS_SYMBOLIC "rBase, <addr_const>" + (rBase is a base register suitable for the symbol's type) */ @@ -2576,7 +2576,7 @@ print_operand_address (FILE * file, rtx addr) break; case ADDRESS_REG_INDEX: if (REGNO (info.regA) == 0) - /* Make rB == r0 instead of rA == r0. This helps reduce read port + /* Make rB == r0 instead of rA == r0. This helps reduce read port congestion. 
*/ fprintf (file, "%s,%s", reg_names[REGNO (info.regB)], reg_names[REGNO (info.regA)]); @@ -2641,7 +2641,7 @@ print_operand_address (FILE * file, rtx addr) } /* Emit either a label, .comm, or .lcomm directive, and mark that the symbol - is used, so that we don't emit an .extern for it in + is used, so that we don't emit an .extern for it in microblaze_asm_file_end. */ void @@ -2649,7 +2649,7 @@ microblaze_declare_object (FILE * stream, const char *name, const char *section, const char *fmt, int size) { - fputs (section, stream); + fputs (section, stream); assemble_name (stream, name); fprintf (stream, fmt, size); } @@ -2662,7 +2662,7 @@ microblaze_declare_object (FILE * stream, const char *name, #define BITSET_P(VALUE,BIT) (((VALUE) & (1L << (BIT))) != 0) -/* Save or restore instructions based on whether this is the prologue or +/* Save or restore instructions based on whether this is the prologue or epilogue. prologue is 1 for the prologue. */ static void save_restore_insns (int prologue) @@ -2892,7 +2892,7 @@ microblaze_expand_prologue (void) && !cfun->returns_pcc_struct) { tree type = build_pointer_type (fntype); - tree function_result_decl = build_decl (BUILTINS_LOCATION, PARM_DECL, + tree function_result_decl = build_decl (BUILTINS_LOCATION, PARM_DECL, NULL_TREE, type); DECL_ARG_TYPE (function_result_decl) = type; @@ -3108,7 +3108,7 @@ microblaze_expand_epilogue (void) rtx reg_rtx; rtx mem_rtx; - /* In case of interrupt handlers use addki instead of addi for changing the + /* In case of interrupt handlers use addki instead of addi for changing the stack pointer value. */ if (microblaze_can_use_return_insn ()) @@ -3121,9 +3121,9 @@ microblaze_expand_epilogue (void) if (fsiz > 0) { - /* Restore SUB_RETURN_ADDR_REGNUM at first. This is to prevent the - sequence of load-followed by a use (in rtsd) in every prologue. Saves - a load-use stall cycle :) This is also important to handle alloca. + /* Restore SUB_RETURN_ADDR_REGNUM at first. This is to prevent the + sequence of load-followed by a use (in rtsd) in every prologue. Saves + a load-use stall cycle :) This is also important to handle alloca. (See comments for if (frame_pointer_needed) below. */ if (!crtl->is_leaf || interrupt_handler) @@ -3138,11 +3138,11 @@ microblaze_expand_epilogue (void) emit_move_insn (reg_rtx, mem_rtx); } - /* It is important that this is done after we restore the return address - register (above). When alloca is used, we want to restore the - sub-routine return address only from the current stack top and not - from the frame pointer (which we restore below). (frame_pointer + 0) - might have been over-written since alloca allocates memory on the + /* It is important that this is done after we restore the return address + register (above). When alloca is used, we want to restore the + sub-routine return address only from the current stack top and not + from the frame pointer (which we restore below). (frame_pointer + 0) + might have been over-written since alloca allocates memory on the current stack. */ if (frame_pointer_needed) emit_insn (gen_movsi (stack_pointer_rtx, hard_frame_pointer_rtx)); @@ -3186,8 +3186,8 @@ microblaze_can_use_return_insn (void) /* Implement TARGET_SECONDARY_RELOAD. 
*/ static reg_class_t -microblaze_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x ATTRIBUTE_UNUSED, - reg_class_t rclass, machine_mode mode ATTRIBUTE_UNUSED, +microblaze_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x ATTRIBUTE_UNUSED, + reg_class_t rclass, machine_mode mode ATTRIBUTE_UNUSED, secondary_reload_info *sri ATTRIBUTE_UNUSED) { if (rclass == ST_REGS) @@ -3263,7 +3263,7 @@ microblaze_select_section (tree decl, int reloc, unsigned HOST_WIDE_INT align) case SECCAT_RODATA_MERGE_STR: case SECCAT_RODATA_MERGE_STR_INIT: /* MB binutils have various issues with mergeable string sections and - relaxation/relocation. Currently, turning mergeable sections + relaxation/relocation. Currently, turning mergeable sections into regular readonly sections. */ return readonly_data_section; @@ -3274,7 +3274,7 @@ microblaze_select_section (tree decl, int reloc, unsigned HOST_WIDE_INT align) /* Encode info about sections into the RTL based on a symbol's declaration. - The default definition of this hook, default_encode_section_info in + The default definition of this hook, default_encode_section_info in `varasm.cc', sets a number of commonly-useful bits in SYMBOL_REF_FLAGS. */ static void @@ -3521,7 +3521,7 @@ microblaze_eh_return (rtx op0) If the string size is below the threshold, put it into .sdata2. If the front-end is done, we must be being called from toplev.cc. In that case, do nothing. */ -void +void microblaze_asm_output_ident (const char *string) { const char *section_asm_op; @@ -3695,7 +3695,7 @@ microblaze_expand_divide (rtx operands[]) { /* Table lookup software divides. Works for all (nr/dr) where (0 <= nr,dr <= 15). */ - rtx regt1 = gen_reg_rtx (SImode); + rtx regt1 = gen_reg_rtx (SImode); rtx reg18 = gen_rtx_REG (SImode, R_TMP); rtx regqi = gen_reg_rtx (QImode); rtx_code_label *div_label = gen_label_rtx (); @@ -3707,9 +3707,9 @@ microblaze_expand_divide (rtx operands[]) insn = emit_insn (gen_iorsi3 (regt1, operands[1], operands[2])); cjump = emit_jump_insn_after (gen_cbranchsi4 ( - gen_rtx_GTU (SImode, regt1, GEN_INT (15)), + gen_rtx_GTU (SImode, regt1, GEN_INT (15)), regt1, GEN_INT (15), div_label), insn); - LABEL_NUSES (div_label) = 1; + LABEL_NUSES (div_label) = 1; JUMP_LABEL (cjump) = div_label; emit_insn (gen_rtx_CLOBBER (SImode, reg18)); @@ -3718,21 +3718,21 @@ microblaze_expand_divide (rtx operands[]) mem_rtx = gen_rtx_MEM (QImode, gen_rtx_PLUS (Pmode, regt1, div_table_rtx)); - insn = emit_insn (gen_movqi (regqi, mem_rtx)); + insn = emit_insn (gen_movqi (regqi, mem_rtx)); insn = emit_insn (gen_movsi (operands[0], gen_rtx_SUBREG (SImode, regqi, 0))); - jump = emit_jump_insn_after (gen_jump (div_end_label), insn); + jump = emit_jump_insn_after (gen_jump (div_end_label), insn); JUMP_LABEL (jump) = div_end_label; - LABEL_NUSES (div_end_label) = 1; + LABEL_NUSES (div_end_label) = 1; emit_barrier (); emit_label (div_label); - ret = emit_library_call_value (gen_rtx_SYMBOL_REF (Pmode, "__divsi3"), + ret = emit_library_call_value (gen_rtx_SYMBOL_REF (Pmode, "__divsi3"), operands[0], LCT_NORMAL, GET_MODE (operands[0]), operands[1], GET_MODE (operands[1]), operands[2], GET_MODE (operands[2])); if (ret != operands[0]) - emit_move_insn (operands[0], ret); + emit_move_insn (operands[0], ret); emit_label (div_end_label); emit_insn (gen_blockage ()); @@ -4014,7 +4014,7 @@ microblaze_starting_frame_offset (void) #define TARGET_LEGITIMIZE_ADDRESS microblaze_legitimize_address #undef TARGET_LEGITIMATE_ADDRESS_P -#define TARGET_LEGITIMATE_ADDRESS_P microblaze_legitimate_address_p +#define 
TARGET_LEGITIMATE_ADDRESS_P microblaze_legitimate_address_p #undef TARGET_FRAME_POINTER_REQUIRED #define TARGET_FRAME_POINTER_REQUIRED microblaze_frame_pointer_required @@ -4029,7 +4029,7 @@ microblaze_starting_frame_offset (void) #define TARGET_PROMOTE_FUNCTION_MODE default_promote_function_mode_always_promote #undef TARGET_FUNCTION_VALUE -#define TARGET_FUNCTION_VALUE microblaze_function_value +#define TARGET_FUNCTION_VALUE microblaze_function_value #undef TARGET_SECONDARY_RELOAD #define TARGET_SECONDARY_RELOAD microblaze_secondary_reload @@ -4047,7 +4047,7 @@ microblaze_starting_frame_offset (void) #define TARGET_ASM_INIT_SECTIONS microblaze_elf_asm_init_sections #undef TARGET_OPTION_OVERRIDE -#define TARGET_OPTION_OVERRIDE microblaze_option_override +#define TARGET_OPTION_OVERRIDE microblaze_option_override #undef TARGET_LEGITIMATE_CONSTANT_P #define TARGET_LEGITIMATE_CONSTANT_P microblaze_legitimate_constant_p diff --git a/gcc/config/microblaze/microblaze.h b/gcc/config/microblaze/microblaze.h index 5d28abf..a56b9de 100644 --- a/gcc/config/microblaze/microblaze.h +++ b/gcc/config/microblaze/microblaze.h @@ -68,8 +68,8 @@ extern enum pipeline_type microblaze_pipe; /* The default is to not need GOT for TLS. */ #define TLS_NEEDS_GOT 0 -/* What is the default setting for -mcpu= . We set it to v4.00.a even though - we are actually ahead. This is safest version that has generate code +/* What is the default setting for -mcpu= . We set it to v4.00.a even though + we are actually ahead. This is safest version that has generate code compatible for the original ISA */ #define MICROBLAZE_DEFAULT_CPU "v4.00.a" @@ -142,7 +142,7 @@ extern enum pipeline_type microblaze_pipe; #define MB_ABI_SUB_RETURN_ADDR_REGNUM 15 #define MB_ABI_DEBUG_RETURN_ADDR_REGNUM 16 #define MB_ABI_EXCEPTION_RETURN_ADDR_REGNUM 17 -#define MB_ABI_ASM_TEMP_REGNUM 18 +#define MB_ABI_ASM_TEMP_REGNUM 18 /* This is our temp register. */ #define MB_ABI_FRAME_POINTER_REGNUM 19 #define MB_ABI_PIC_ADDR_REGNUM 20 @@ -157,7 +157,7 @@ extern enum pipeline_type microblaze_pipe; #define MB_ABI_STATIC_CHAIN_REGNUM 3 #define MB_ABI_TEMP1_REGNUM 11 #define MB_ABI_TEMP2_REGNUM 12 -#define MB_ABI_MSR_SAVE_REG 11 +#define MB_ABI_MSR_SAVE_REG 11 /* Volatile register used to save MSR in interrupt handlers. */ @@ -177,8 +177,8 @@ extern enum pipeline_type microblaze_pipe; (GP_REG_FIRST + MB_ABI_SUB_RETURN_ADDR_REGNUM) /* Initial state of return address on entry to func = R15. - Actually, the RA is at R15+8, but gcc doesn't know how - to generate this. + Actually, the RA is at R15+8, but gcc doesn't know how + to generate this. NOTE: GDB has a workaround and expects this incorrect value. If this is fixed, a corresponding fix to GDB is needed. */ #define INCOMING_RETURN_ADDR_RTX \ @@ -294,7 +294,7 @@ extern enum pipeline_type microblaze_pipe; rMB_ABI_INTR_RETUREN_ADDR_REGNUM is a fixed register(return address for interrupt), and will not be used for anything else. */ - + #define FRAME_POINTER_REGNUM FRP_REG_NUM #define HARD_FRAME_POINTER_REGNUM \ (GP_REG_FIRST + MB_ABI_FRAME_POINTER_REGNUM) @@ -383,7 +383,7 @@ extern enum reg_class microblaze_regno_to_class[]; && (((VALUE) & 0x0000ffff) != 0 \ || (((VALUE) & ~2147483647) != 0 \ && ((VALUE) & ~2147483647) != ~2147483647))) - + #define PREFERRED_RELOAD_CLASS(X,CLASS) \ ((CLASS) != ALL_REGS \ ? (CLASS) \ @@ -470,7 +470,7 @@ typedef struct microblaze_args int fp_code; /* Mode of FP arguments */ int num_adjusts; /* number of adjustments made */ /* Adjustments made to args pass in regs. */ - /* ??? 
The size is doubled to work around a bug in the code that sets the + /* ??? The size is doubled to work around a bug in the code that sets the adjustments in function_arg. */ rtx adjust[MAX_ARGS_IN_REGISTERS * 2]; } CUMULATIVE_ARGS; @@ -512,7 +512,7 @@ typedef struct microblaze_args #define MAX_REGS_PER_ADDRESS 2 -/* Identify valid constant addresses. Exclude if PIC addr which +/* Identify valid constant addresses. Exclude if PIC addr which needs scratch register. */ #define CONSTANT_ADDRESS_P(X) microblaze_constant_address_p(X) @@ -608,7 +608,7 @@ typedef struct microblaze_args /* ASM_OUTPUT_ALIGNED_COMMON and ASM_OUTPUT_ALIGNED_LOCAL Unfortunately, we still need to set the section explicitly. Somehow, - our binutils assign .comm and .lcomm variables to the "current" section + our binutils assign .comm and .lcomm variables to the "current" section in the assembly file, rather than where they implicitly belong. We need to remove this explicit setting in GCC when binutils can understand sections better. */ @@ -836,11 +836,11 @@ do { \ #undef TARGET_ASM_NAMED_SECTION #define TARGET_ASM_NAMED_SECTION default_elf_asm_named_section -/* Define the strings to put out for each section in the object file. - - Note: For ctors/dtors, we want to give these sections the SHF_WRITE - attribute to allow shared libraries to patch/resolve addresses into - these locations. On Microblaze, there is no concept of shared libraries +/* Define the strings to put out for each section in the object file. + + Note: For ctors/dtors, we want to give these sections the SHF_WRITE + attribute to allow shared libraries to patch/resolve addresses into + these locations. On Microblaze, there is no concept of shared libraries yet, so this is for future use. */ #define TEXT_SECTION_ASM_OP "\t.text" #define DATA_SECTION_ASM_OP "\t.data" @@ -865,7 +865,7 @@ do { \ "\tbrlid r15, " #FUNC "\n\t nop\n" \ TEXT_SECTION_ASM_OP); -/* We need to group -lm as well, since some Newlib math functions +/* We need to group -lm as well, since some Newlib math functions reference __errno! */ #undef LIB_SPEC #define LIB_SPEC \ diff --git a/gcc/config/mingw/winnt-cxx.cc b/gcc/config/mingw/winnt-cxx.cc index f4d7a50..0be8095 100644 --- a/gcc/config/mingw/winnt-cxx.cc +++ b/gcc/config/mingw/winnt-cxx.cc @@ -20,6 +20,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -45,8 +46,8 @@ i386_pe_type_dllimport_p (tree decl) || DECL_TEMPLATE_INSTANTIATION (decl) || DECL_ARTIFICIAL (decl))) return false; - - /* Overrides of the class dllimport decls by out-of-class definitions are + + /* Overrides of the class dllimport decls by out-of-class definitions are handled by tree.cc:merge_dllimport_decl_attributes. */ return true; } @@ -73,16 +74,16 @@ i386_pe_type_dllexport_p (tree decl) return true; } -static inline void maybe_add_dllimport (tree decl) +static inline void maybe_add_dllimport (tree decl) { if (i386_pe_type_dllimport_p (decl)) DECL_DLLIMPORT_P (decl) = 1; } -static inline void maybe_add_dllexport (tree decl) +static inline void maybe_add_dllexport (tree decl) { if (i386_pe_type_dllexport_p (decl)) - { + { tree decl_attrs = DECL_ATTRIBUTES (decl); if (lookup_attribute ("dllexport", decl_attrs) != NULL_TREE) /* Already done. 
*/ @@ -98,8 +99,8 @@ i386_pe_adjust_class_at_definition (tree t) tree member; gcc_assert (CLASS_TYPE_P (t)); - - + + if (lookup_attribute ("dllexport", TYPE_ATTRIBUTES (t)) != NULL_TREE) { tree tmv = TYPE_MAIN_VARIANT (t); @@ -124,7 +125,7 @@ i386_pe_adjust_class_at_definition (tree t) { tree thunk; maybe_add_dllexport (member); - + /* Also add the attribute to its thunks. */ for (thunk = DECL_THUNKS (member); thunk; thunk = TREE_CHAIN (thunk)) @@ -156,13 +157,13 @@ i386_pe_adjust_class_at_definition (tree t) { tree thunk; maybe_add_dllimport (member); - + /* Also add the attribute to its thunks. */ for (thunk = DECL_THUNKS (member); thunk; thunk = DECL_CHAIN (thunk)) maybe_add_dllimport (thunk); } - + /* Check vtables */ for (member = CLASSTYPE_VTABLES (t); member; member = DECL_CHAIN (member)) @@ -172,6 +173,6 @@ i386_pe_adjust_class_at_definition (tree t) /* We leave typeinfo tables alone. We can't mark TI objects as dllimport, since the address of a secondary VTT may be needed for static initialization of a primary VTT. VTT's of - dllimport'd classes should always be link-once COMDAT. */ + dllimport'd classes should always be link-once COMDAT. */ } } diff --git a/gcc/config/mingw/winnt.cc b/gcc/config/mingw/winnt.cc index 803e5f5..9d433da 100644 --- a/gcc/config/mingw/winnt.cc +++ b/gcc/config/mingw/winnt.cc @@ -20,6 +20,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -120,7 +121,7 @@ i386_pe_determine_dllexport_p (tree decl) if (TREE_CODE (decl) == FUNCTION_DECL && DECL_DECLARED_INLINE_P (decl) && !flag_keep_inline_dllexport) - return false; + return false; if (lookup_attribute ("dllexport", DECL_ATTRIBUTES (decl))) return true; @@ -185,11 +186,11 @@ gen_stdcall_or_fastcall_suffix (tree decl, tree id, bool fastcall) tree arg; function_args_iterator args_iter; - gcc_assert (TREE_CODE (decl) == FUNCTION_DECL); + gcc_assert (TREE_CODE (decl) == FUNCTION_DECL); if (prototype_p (type)) { - /* This attribute is ignored for variadic functions. */ + /* This attribute is ignored for variadic functions. */ if (stdarg_p (type)) return NULL_TREE; @@ -235,7 +236,7 @@ i386_pe_maybe_mangle_decl_assembler_name (tree decl, tree id) tree new_id = NULL_TREE; if (TREE_CODE (decl) == FUNCTION_DECL) - { + { unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl)); if ((ccvt & IX86_CALLCVT_STDCALL) != 0) { @@ -279,7 +280,7 @@ i386_pe_assemble_visibility (tree decl, int) tree i386_pe_mangle_decl_assembler_name (tree decl, tree id) { - tree new_id = i386_pe_maybe_mangle_decl_assembler_name (decl, id); + tree new_id = i386_pe_maybe_mangle_decl_assembler_name (decl, id); return (new_id ? new_id : id); } @@ -335,7 +336,7 @@ mingw_pe_encode_section_info (tree decl, rtx rtl, int first) flags |= SYMBOL_FLAG_DLLEXPORT; else if (i386_pe_determine_dllimport_p (decl)) flags |= SYMBOL_FLAG_DLLIMPORT; - + SYMBOL_REF_FLAGS (symbol) = flags; } @@ -367,7 +368,7 @@ i386_pe_binds_local_p (const_tree exp) && DECL_DECLARED_INLINE_P (exp)) return false; #endif - + return default_binds_local_p_1 (exp, 0); } @@ -495,7 +496,7 @@ mingw_pe_asm_named_section (const char *name, unsigned int flags, *f++ ='d'; /* This is necessary for older versions of gas. */ *f++ ='r'; } - else + else { if (flags & SECTION_CODE) *f++ = 'x'; @@ -527,7 +528,7 @@ mingw_pe_asm_named_section (const char *name, unsigned int flags, Instead, have the linker pick one, without warning. 
If 'selectany' attribute has been specified, MS compiler sets 'discard' characteristic, rather than telling linker - to warn of size or content mismatch, so do the same. */ + to warn of size or content mismatch, so do the same. */ bool discard = (flags & SECTION_CODE) || (TREE_CODE (decl) != IDENTIFIER_NODE && lookup_attribute ("selectany", @@ -555,7 +556,7 @@ i386_pe_asm_output_aligned_decl_common (FILE *stream, tree decl, rounded += (BIGGEST_ALIGNMENT / BITS_PER_UNIT) - 1; rounded = (rounded / (BIGGEST_ALIGNMENT / BITS_PER_UNIT) * (BIGGEST_ALIGNMENT / BITS_PER_UNIT)); - + mingw_pe_maybe_record_exported_symbol (decl, name, 1); fprintf (stream, "\t.comm\t"); diff --git a/gcc/config/mips/frame-header-opt.cc b/gcc/config/mips/frame-header-opt.cc index 1e7260e..70abd19 100644 --- a/gcc/config/mips/frame-header-opt.cc +++ b/gcc/config/mips/frame-header-opt.cc @@ -207,7 +207,7 @@ callees_functions_use_frame_header (function *fn) { called_fn = DECL_STRUCT_FUNCTION (called_fn_tree); if (called_fn == NULL - || DECL_WEAK (called_fn_tree) + || DECL_WEAK (called_fn_tree) || has_inlined_assembly (called_fn) || !is_leaf_function (called_fn) || !called_fn->machine->does_not_use_frame_header) diff --git a/gcc/config/mips/loongson-mmi.md b/gcc/config/mips/loongson-mmi.md index dd166bf..4d95873 100644 --- a/gcc/config/mips/loongson-mmi.md +++ b/gcc/config/mips/loongson-mmi.md @@ -394,7 +394,7 @@ "pmaddhw\t%0,%1,%2" [(set_attr "type" "fmul")]) -(define_expand "sdot_prodv4hi" +(define_expand "sdot_prodv2siv4hi" [(match_operand:V2SI 0 "register_operand" "") (match_operand:V4HI 1 "register_operand" "") (match_operand:V4HI 2 "register_operand" "") diff --git a/gcc/config/mips/mips-msa.md b/gcc/config/mips/mips-msa.md index 377c63f..976f296 100644 --- a/gcc/config/mips/mips-msa.md +++ b/gcc/config/mips/mips-msa.md @@ -125,6 +125,9 @@ ;; Only floating-point modes. (define_mode_iterator FMSA [V2DF V4SF]) +;; Only used for reduce_plus_scal: V4SI, V8HI, V16QI have HADD. +(define_mode_iterator MSA_NO_HADD [V2DF V4SF V2DI]) + ;; The attribute gives the integer vector mode with same size. 
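A scalar sketch of the generic reduction used for the MSA_NO_HADD modes follows: log2(nelt) rounds of permute-then-combine, using the same lane-permutation formula as the mips_expand_msa_reduc helper added to mips.cc further down, while V4SI, V8HI and V16QI instead go through their HADD instructions. The sketch is illustrative only; reduce4, add and the fixed 4-lane width are inventions for this example, not code from the patch.

#include <stdio.h>

/* Scalar model of the log2(nelt) permute-and-combine reduction used for
   modes without a horizontal add: in each round, lanes that are GELT
   apart are lined up by a permute and then combined element-wise, so
   every lane ends up holding the reduction of the whole vector.  */
static int
reduce4 (const int in[4], int (*op) (int, int))
{
  int vec[4], swap[4];
  for (int i = 0; i < 4; i++)
    vec[i] = in[i];
  for (int gelt = 1; gelt <= 2; gelt *= 2)
    {
      for (int i = 0; i < 4; i++)
        swap[i] = vec[((i / gelt) % 2) ? i - gelt : i + gelt];
      for (int i = 0; i < 4; i++)
        vec[i] = op (swap[i], vec[i]);
    }
  return vec[0];   /* element 0 is what the expander extracts */
}

static int
add (int a, int b)
{
  return a + b;
}

int
main (void)
{
  int v[4] = { 1, 2, 3, 4 };
  printf ("%d\n", reduce4 (v, add));   /* prints 10 */
  return 0;
}

With { 1, 2, 3, 4 } and addition, the two rounds yield { 3, 3, 7, 7 } and then { 10, 10, 10, 10 }, and lane 0 is extracted as the scalar result.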
(define_mode_attr VIMODE [(V2DF "V2DI") @@ -2802,3 +2805,128 @@ (set_attr "mode" "TI") (set_attr "compact_form" "never") (set_attr "branch_likely" "no")]) + + +;; Vector reduction operation +(define_expand "reduc_smin_scal_<mode>" + [(match_operand:<UNITMODE> 0 "register_operand") + (match_operand:MSA 1 "register_operand")] + "ISA_HAS_MSA" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + mips_expand_msa_reduc (gen_smin<mode>3, tmp, operands[1]); + emit_insn (gen_vec_extract<mode><unitmode> (operands[0], tmp, + const0_rtx)); + DONE; +}) + +(define_expand "reduc_smax_scal_<mode>" + [(match_operand:<UNITMODE> 0 "register_operand") + (match_operand:MSA 1 "register_operand")] + "ISA_HAS_MSA" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + mips_expand_msa_reduc (gen_smax<mode>3, tmp, operands[1]); + emit_insn (gen_vec_extract<mode><unitmode> (operands[0], tmp, + const0_rtx)); + DONE; +}) + +(define_expand "reduc_umin_scal_<mode>" + [(match_operand:<UNITMODE> 0 "register_operand") + (match_operand:IMSA 1 "register_operand")] + "ISA_HAS_MSA" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + mips_expand_msa_reduc (gen_umin<mode>3, tmp, operands[1]); + emit_insn (gen_vec_extract<mode><unitmode> (operands[0], tmp, + const0_rtx)); + DONE; +}) + +(define_expand "reduc_umax_scal_<mode>" + [(match_operand:<UNITMODE> 0 "register_operand") + (match_operand:IMSA 1 "register_operand")] + "ISA_HAS_MSA" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + mips_expand_msa_reduc (gen_umax<mode>3, tmp, operands[1]); + emit_insn (gen_vec_extract<mode><unitmode> (operands[0], tmp, + const0_rtx)); + DONE; +}) + +(define_expand "reduc_plus_scal_<mode>" + [(match_operand:<UNITMODE> 0 "register_operand") + (match_operand:MSA_NO_HADD 1 "register_operand")] + "ISA_HAS_MSA" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + mips_expand_msa_reduc (gen_add<mode>3, tmp, operands[1]); + emit_insn (gen_vec_extract<mode><unitmode> (operands[0], tmp, + const0_rtx)); + DONE; +}) + +(define_expand "reduc_plus_scal_v4si" + [(match_operand:SI 0 "register_operand") + (match_operand:V4SI 1 "register_operand")] + "ISA_HAS_MSA" +{ + rtx tmp = gen_reg_rtx (SImode); + rtx tmp1 = gen_reg_rtx (V2DImode); + emit_insn (gen_msa_hadd_s_d (tmp1, operands[1], operands[1])); + emit_insn (gen_vec_extractv4sisi (operands[0], gen_lowpart (V4SImode, tmp1), + const0_rtx)); + emit_insn (gen_vec_extractv4sisi (tmp, gen_lowpart (V4SImode, tmp1), + GEN_INT (2))); + emit_insn (gen_addsi3 (operands[0], operands[0], tmp)); + DONE; +}) + +(define_expand "reduc_plus_scal_v8hi" + [(match_operand:HI 0 "register_operand") + (match_operand:V8HI 1 "register_operand")] + "ISA_HAS_MSA" +{ + rtx tmp1 = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_reg_rtx (V2DImode); + rtx tmp3 = gen_reg_rtx (V2DImode); + emit_insn (gen_msa_hadd_s_w (tmp1, operands[1], operands[1])); + emit_insn (gen_msa_hadd_s_d (tmp2, tmp1, tmp1)); + mips_expand_msa_reduc (gen_addv2di3, tmp3, tmp2); + emit_insn (gen_vec_extractv8hihi (operands[0], gen_lowpart (V8HImode, tmp3), + const0_rtx)); + DONE; +}) + +(define_expand "reduc_plus_scal_v16qi" + [(match_operand:QI 0 "register_operand") + (match_operand:V16QI 1 "register_operand")] + "ISA_HAS_MSA" +{ + rtx tmp1 = gen_reg_rtx (V8HImode); + rtx tmp2 = gen_reg_rtx (V4SImode); + rtx tmp3 = gen_reg_rtx (V2DImode); + rtx tmp4 = gen_reg_rtx (V2DImode); + emit_insn (gen_msa_hadd_s_h (tmp1, operands[1], operands[1])); + emit_insn (gen_msa_hadd_s_w (tmp2, tmp1, tmp1)); + emit_insn (gen_msa_hadd_s_d (tmp3, tmp2, tmp2)); + mips_expand_msa_reduc (gen_addv2di3, tmp4, tmp3); + emit_insn 
(gen_vec_extractv16qiqi (operands[0], gen_lowpart (V16QImode, tmp4), + const0_rtx)); + DONE; +}) + +(define_expand "reduc_<optab>_scal_<mode>" + [(any_bitwise:<UNITMODE> + (match_operand:<UNITMODE> 0 "register_operand") + (match_operand:IMSA 1 "register_operand"))] + "ISA_HAS_MSA" +{ + rtx tmp = gen_reg_rtx (<MODE>mode); + mips_expand_msa_reduc (gen_<optab><mode>3, tmp, operands[1]); + emit_insn (gen_vec_extract<mode><unitmode> (operands[0], tmp, + const0_rtx)); + DONE; +}) diff --git a/gcc/config/mips/mips-protos.h b/gcc/config/mips/mips-protos.h index 90b4c87..96e084e 100644 --- a/gcc/config/mips/mips-protos.h +++ b/gcc/config/mips/mips-protos.h @@ -352,6 +352,7 @@ extern void mips_expand_atomic_qihi (union mips_gen_fn_ptrs, extern void mips_expand_vector_init (rtx, rtx); extern void mips_expand_vec_unpack (rtx op[2], bool, bool); extern void mips_expand_vec_reduc (rtx, rtx, rtx (*)(rtx, rtx, rtx)); +extern void mips_expand_msa_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx); extern void mips_expand_vec_minmax (rtx, rtx, rtx, rtx (*) (rtx, rtx, rtx), bool); diff --git a/gcc/config/mips/mips.cc b/gcc/config/mips/mips.cc index 6c797b6..3927553 100644 --- a/gcc/config/mips/mips.cc +++ b/gcc/config/mips/mips.cc @@ -23,6 +23,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -2803,7 +2804,7 @@ mips_lwxs_address_p (rtx addr) return false; } -/* Return true if ADDR matches the pattern for the L{B,H,W,D}{,U}X load +/* Return true if ADDR matches the pattern for the L{B,H,W,D}{,U}X load indexed address instruction. Note that such addresses are not considered legitimate in the TARGET_LEGITIMATE_ADDRESS_P sense, because their use is so restricted. */ @@ -4454,7 +4455,7 @@ mips_rtx_costs (rtx x, machine_mode mode, int outer_code, + set_src_cost (XEXP (XEXP (x, 1), 0), mode, speed)); return true; } - + /* Fall through. */ case IOR: @@ -12545,7 +12546,7 @@ mips_output_probe_stack_range (rtx reg1, rtx reg2) /* Probe at TEST_ADDR, test if TEST_ADDR == LAST_ADDR and branch. */ xops[1] = reg2; strcpy (tmp, "%(%<bne\t%0,%1,"); - output_asm_insn (strcat (tmp, &loop_lab[1]), xops); + output_asm_insn (strcat (tmp, &loop_lab[1]), xops); if (TARGET_64BIT) output_asm_insn ("sd\t$0,0(%0)%)", xops); else @@ -13729,7 +13730,7 @@ mips_memory_move_cost (machine_mode mode, reg_class_t rclass, bool in) { return (mips_cost->memory_latency + memory_move_secondary_cost (mode, rclass, in)); -} +} /* Implement TARGET_SECONDARY_MEMORY_NEEDED. @@ -14997,7 +14998,7 @@ bool mips_fmadd_bypass (rtx_insn *out_insn, rtx_insn *in_insn) { int dst_reg, src_reg; - + gcc_assert (get_attr_type (in_insn) == TYPE_FMADD); gcc_assert (get_attr_type (out_insn) == TYPE_FMADD); @@ -22239,6 +22240,47 @@ mips_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, return ok; } +/* Expand a vector reduction. FN is the binary pattern to reduce; + DEST is the destination; IN is the input vector. */ + +void +mips_expand_msa_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in) +{ + rtx swap, vec = in; + machine_mode mode = GET_MODE (in); + unsigned int i, gelt; + const unsigned nelt = GET_MODE_BITSIZE (mode) / GET_MODE_UNIT_BITSIZE (mode); + unsigned char perm[MAX_VECT_LEN]; + + /* We have no SHF.d. 
*/ + if (nelt == 2) + { + perm[0] = 2; + perm[1] = 3; + perm[2] = 0; + perm[3] = 1; + rtx rsi = simplify_gen_subreg (V4SImode, in, mode, 0); + swap = gen_reg_rtx (V4SImode); + mips_expand_vselect (swap, rsi, perm, 4); + emit_move_insn (dest, gen_rtx_SUBREG (mode, swap, 0)); + emit_insn (fn (dest, dest, vec)); + return; + } + + for (gelt=1; gelt<=nelt/2; gelt *= 2) + { + for (i = 0; i<nelt; i++) + perm[i] = ((i/gelt)%2) ? (i-gelt) : (i+gelt); + if (gelt == nelt/2) + swap = dest; + else + swap = gen_reg_rtx (mode); + mips_expand_vselect (swap, vec, perm, nelt); + emit_insn (fn (swap, swap, vec)); + vec = swap; + } +} + /* Implement TARGET_SCHED_REASSOCIATION_WIDTH. */ static int diff --git a/gcc/config/mips/mips.h b/gcc/config/mips/mips.h index 84dd64d..fb696ed 100644 --- a/gcc/config/mips/mips.h +++ b/gcc/config/mips/mips.h @@ -1759,7 +1759,7 @@ FP_ASM_SPEC "\ optimised to use word loads. */ #define LOCAL_ALIGNMENT(TYPE, ALIGN) \ DATA_ALIGNMENT (TYPE, ALIGN) - + #define PAD_VARARGS_DOWN \ (targetm.calls.function_arg_padding (TYPE_MODE (type), type) == PAD_DOWNWARD) diff --git a/gcc/config/mips/mips.md b/gcc/config/mips/mips.md index 737d256..f147667 100644 --- a/gcc/config/mips/mips.md +++ b/gcc/config/mips/mips.md @@ -993,6 +993,10 @@ ;; from the same template. (define_code_iterator any_shift [ashift ashiftrt lshiftrt]) +;; This code iterator allows the three bitwise instructions to be generated +;; from the same template. +(define_code_iterator any_bitwise [and ior xor]) + ;; This code iterator allows unsigned and signed division to be generated ;; from the same template. (define_code_iterator any_div [div udiv]) diff --git a/gcc/config/mips/sde.h b/gcc/config/mips/sde.h index 35ca431..d177b08f 100644 --- a/gcc/config/mips/sde.h +++ b/gcc/config/mips/sde.h @@ -45,7 +45,7 @@ along with GCC; see the file COPYING3. If not see "%{!EB:%{!EL:%(endian_spec)}}", \ \ /* Configuration-independent MIPS rules. */ \ - BASE_DRIVER_SELF_SPECS + BASE_DRIVER_SELF_SPECS /* Use trap rather than break for all but MIPS I ISA. Force -no-mips16, so that MIPS16 assembler code requires an explicit ".set mips16". diff --git a/gcc/config/mmix/mmix.cc b/gcc/config/mmix/mmix.cc index 167aea7..ce01438 100644 --- a/gcc/config/mmix/mmix.cc +++ b/gcc/config/mmix/mmix.cc @@ -761,7 +761,7 @@ mmix_function_value (const_tree valtype, if (!outgoing) return gen_rtx_REG (mode, MMIX_RETURN_VALUE_REGNUM); - + /* Return values that fit in a register need no special handling. There's no register hole when parameters are passed in global registers. */ diff --git a/gcc/config/mn10300/linux.h b/gcc/config/mn10300/linux.h index 8cfe0e1..0e51e95 100644 --- a/gcc/config/mn10300/linux.h +++ b/gcc/config/mn10300/linux.h @@ -18,7 +18,7 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. 
*/ - + #undef PREFERRED_DEBUGGING_TYPE #define PREFERRED_DEBUGGING_TYPE DWARF2_DEBUG @@ -77,7 +77,7 @@ extern int mn10300_protect_label; asm_fprintf (FILE, "+"); \ asm_fprintf (FILE, "%U%s", real_name); \ } \ - while (0) + while (0) #undef SIZE_TYPE #undef PTRDIFF_TYPE diff --git a/gcc/config/mn10300/mn10300.cc b/gcc/config/mn10300/mn10300.cc index 1cf0811..fab4641 100644 --- a/gcc/config/mn10300/mn10300.cc +++ b/gcc/config/mn10300/mn10300.cc @@ -475,7 +475,7 @@ mn10300_print_operand_address (FILE *file, rtx addr) { rtx base = XEXP (addr, 0); rtx index = XEXP (addr, 1); - + if (REG_P (index) && !REG_OK_FOR_INDEX_P (index)) { rtx x = base; @@ -651,7 +651,7 @@ mn10300_get_live_callee_saved_regs (unsigned int * bytes_saved) for (i = 0x04000; i < 0x40000; i <<= 1) if ((mask & i) == 0) ++ count; - + mask |= 0x3c000; } @@ -748,7 +748,7 @@ static inline unsigned int popcount (unsigned int mask) { unsigned int count = 0; - + while (mask) { ++ count; @@ -1333,7 +1333,7 @@ mn10300_preferred_reload_class (rtx x, reg_class_t rclass) if (x == stack_pointer_rtx && rclass != SP_REGS) return (TARGET_AM33 ? GENERAL_REGS : ADDRESS_REGS); else if (MEM_P (x) - || (REG_P (x) + || (REG_P (x) && !HARD_REGISTER_P (x)) || (GET_CODE (x) == SUBREG && REG_P (SUBREG_REG (x)) @@ -1706,7 +1706,7 @@ mn10300_output_add (rtx operands[3], bool need_flags) src2_regnum = true_regnum (src2); src2_class = REGNO_REG_CLASS (src2_regnum); - + if (dest_regnum == src1_regnum) return "add %2,%0"; if (dest_regnum == src2_regnum) @@ -2296,7 +2296,7 @@ mn10300_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED, move cost above. This is not a problem. */ static int -mn10300_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, +mn10300_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, reg_class_t iclass, bool in ATTRIBUTE_UNUSED) { enum reg_class rclass = (enum reg_class) iclass; @@ -2410,7 +2410,7 @@ mn10300_rtx_costs (rtx x, machine_mode mode, int outer_code, } } goto do_arith_costs; - + case MINUS: case AND: case IOR: @@ -2533,7 +2533,7 @@ mn10300_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) 0xdc jmp fnaddr <disp> - Note that the two extra insns are effectively nops; they + Note that the two extra insns are effectively nops; they clobber the flags but do not affect the contents of D0 or D1. */ disp = expand_binop (SImode, sub_optab, fnaddr, @@ -2631,7 +2631,7 @@ mn10300_hard_regno_mode_ok (unsigned int regno, machine_mode mode) || (TARGET_AM33 && REGNO_REG_CLASS (regno) == ADDRESS_REGS) || REGNO_REG_CLASS (regno) == EXTENDED_REGS) return GET_MODE_SIZE (mode) <= 4; - + return false; } @@ -2906,14 +2906,14 @@ mn10300_match_ccmode (rtx insn, machine_mode cc_mode) } /* This function is used to help split: - + (set (reg) (and (reg) (int))) - + into: - + (set (reg) (shift (reg) (int)) (set (reg) (shift (reg) (int)) - + where the shitfs will be shorter than the "and" insn. It returns the number of bits that should be shifted. A positive @@ -3038,7 +3038,7 @@ check_liw_constraints (struct liw_data * pliw1, struct liw_data * pliw2) check its values prior to any changes made by OP. */ if (pliw1->op == LIW_OP_CMP) { - /* Two sequential comparisons means dead code, which ought to + /* Two sequential comparisons means dead code, which ought to have been eliminated given that bundling only happens with optimization. We cannot bundle them in any case. 
*/ gcc_assert (pliw1->op != pliw2->op); @@ -3076,7 +3076,7 @@ check_liw_constraints (struct liw_data * pliw1, struct liw_data * pliw2) || pliw2->op == LIW_OP_OR || pliw2->op == LIW_OP_XOR)) return false; - + pliw2->src = pliw1->src; return true; } @@ -3114,7 +3114,7 @@ mn10300_bundle_liw (void) if (liw1.slot == LIW_OP2 || liw2.slot == LIW_OP1) { struct liw_data temp; - + temp = liw1; liw1 = liw2; liw2 = temp; @@ -3191,7 +3191,7 @@ mn10300_insert_setlb_lcc (rtx_insn *label, rtx_insn *branch) if (GET_MODE (cmp_reg) == CC_FLOATmode) lcc = gen_FLcc (comparison, label); else - lcc = gen_Lcc (comparison, label); + lcc = gen_Lcc (comparison, label); rtx_insn *jump = emit_jump_insn_before (lcc, branch); mark_jump_label (XVECEXP (lcc, 0, 0), jump, 0); @@ -3294,7 +3294,7 @@ mn10300_scan_for_setlb_lcc (void) loop_optimizer_finalize (); - df_finish_pass (false); + df_finish_pass (false); DUMP ("SETLB scan complete", NULL_RTX); } diff --git a/gcc/config/moxie/moxie.cc b/gcc/config/moxie/moxie.cc index 47a14ea..eda7b08 100644 --- a/gcc/config/moxie/moxie.cc +++ b/gcc/config/moxie/moxie.cc @@ -63,12 +63,12 @@ moxie_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) /* Define how to find the value returned by a function. VALTYPE is the data type of the value (as a tree). If the precise function being called is known, FUNC is its - FUNCTION_DECL; otherwise, FUNC is 0. + FUNCTION_DECL; otherwise, FUNC is 0. We always return values in register $r0 for moxie. */ static rtx -moxie_function_value (const_tree valtype, +moxie_function_value (const_tree valtype, const_tree fntype_or_decl ATTRIBUTE_UNUSED, bool outgoing ATTRIBUTE_UNUSED) { @@ -118,12 +118,12 @@ moxie_print_operand_address (FILE *file, machine_mode, rtx x) case REG: fprintf (file, "(%s)", reg_names[REGNO (x)]); break; - + case PLUS: switch (GET_CODE (XEXP (x, 1))) { case CONST_INT: - fprintf (file, "%ld(%s)", + fprintf (file, "%ld(%s)", INTVAL(XEXP (x, 1)), reg_names[REGNO (XEXP (x, 0))]); break; case SYMBOL_REF: @@ -133,7 +133,7 @@ moxie_print_operand_address (FILE *file, machine_mode, rtx x) case CONST: { rtx plus = XEXP (XEXP (x, 1), 0); - if (GET_CODE (XEXP (plus, 0)) == SYMBOL_REF + if (GET_CODE (XEXP (plus, 0)) == SYMBOL_REF && CONST_INT_P (XEXP (plus, 1))) { output_addr_const(file, XEXP (plus, 0)); @@ -234,7 +234,7 @@ moxie_option_override (void) /* Set the per-function-data initializer. */ init_machine_status = moxie_init_machine_status; -#ifdef TARGET_MOXIEBOX +#ifdef TARGET_MOXIEBOX target_flags |= MASK_HAS_MULX; #endif } @@ -267,9 +267,9 @@ moxie_compute_frame (void) if (df_regs_ever_live_p (regno) && (! call_used_or_fixed_reg_p (regno))) cfun->machine->callee_saved_reg_size += 4; - cfun->machine->size_for_adjusting_sp = + cfun->machine->size_for_adjusting_sp = crtl->args.pretend_args_size - + cfun->machine->local_vars_size + + cfun->machine->local_vars_size + (ACCUMULATE_OUTGOING_ARGS ? 
(HOST_WIDE_INT) crtl->outgoing_args_size : 0); } @@ -298,19 +298,19 @@ moxie_expand_prologue (void) if (cfun->machine->size_for_adjusting_sp > 0) { - int i = cfun->machine->size_for_adjusting_sp; + int i = cfun->machine->size_for_adjusting_sp; while ((i >= 255) && (i <= 510)) { - insn = emit_insn (gen_subsi3 (stack_pointer_rtx, - stack_pointer_rtx, + insn = emit_insn (gen_subsi3 (stack_pointer_rtx, + stack_pointer_rtx, GEN_INT (255))); RTX_FRAME_RELATED_P (insn) = 1; i -= 255; } if (i <= 255) { - insn = emit_insn (gen_subsi3 (stack_pointer_rtx, - stack_pointer_rtx, + insn = emit_insn (gen_subsi3 (stack_pointer_rtx, + stack_pointer_rtx, GEN_INT (i))); RTX_FRAME_RELATED_P (insn) = 1; } @@ -319,8 +319,8 @@ moxie_expand_prologue (void) rtx reg = gen_rtx_REG (SImode, MOXIE_R12); insn = emit_move_insn (reg, GEN_INT (i)); RTX_FRAME_RELATED_P (insn) = 1; - insn = emit_insn (gen_subsi3 (stack_pointer_rtx, - stack_pointer_rtx, + insn = emit_insn (gen_subsi3 (stack_pointer_rtx, + stack_pointer_rtx, reg)); RTX_FRAME_RELATED_P (insn) = 1; } @@ -339,8 +339,8 @@ moxie_expand_epilogue (void) if (cfun->machine->callee_saved_reg_size <= 255) { emit_move_insn (reg, hard_frame_pointer_rtx); - emit_insn (gen_subsi3 - (reg, reg, + emit_insn (gen_subsi3 + (reg, reg, GEN_INT (cfun->machine->callee_saved_reg_size))); } else @@ -367,7 +367,7 @@ int moxie_initial_elimination_offset (int from, int to) { int ret; - + if ((from) == FRAME_POINTER_REGNUM && (to) == HARD_FRAME_POINTER_REGNUM) { /* Compute this since we need to use cfun->machine->local_vars_size. */ @@ -392,19 +392,19 @@ moxie_setup_incoming_varargs (cumulative_args_t cum_v, CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); int regno; int regs = 8 - *cum; - + *pretend_size = regs < 0 ? 0 : GET_MODE_SIZE (SImode) * regs; - + if (no_rtl) return; - + for (regno = *cum; regno < 8; regno++) { rtx reg = gen_rtx_REG (SImode, regno); rtx slot = gen_rtx_PLUS (Pmode, gen_rtx_REG (SImode, ARG_POINTER_REGNUM), GEN_INT (UNITS_PER_WORD * (3 + (regno-2)))); - + emit_move_insn (gen_rtx_MEM (SImode, slot), reg); } } @@ -430,7 +430,7 @@ moxie_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) if (*cum < 8) return gen_rtx_REG (arg.mode, *cum); - else + else return NULL_RTX; } @@ -567,7 +567,7 @@ moxie_reg_ok_for_base_p (const_rtx reg, bool strict_p) if (strict_p) return HARD_REGNO_OK_FOR_BASE_P (regno) || HARD_REGNO_OK_FOR_BASE_P (reg_renumber[regno]); - else + else return !HARD_REGISTER_NUM_P (regno) || HARD_REGNO_OK_FOR_BASE_P (regno); } diff --git a/gcc/config/moxie/moxie.h b/gcc/config/moxie/moxie.h index 4857c92..b2977e5 100644 --- a/gcc/config/moxie/moxie.h +++ b/gcc/config/moxie/moxie.h @@ -91,7 +91,7 @@ Special Registers... $pc - 32-bit program counter. - + */ #define REGISTER_NAMES { \ @@ -104,7 +104,7 @@ #define MOXIE_FP 0 #define MOXIE_SP 1 #define MOXIE_R0 2 -#define MOXIE_R1 3 +#define MOXIE_R1 3 #define MOXIE_R2 4 #define MOXIE_R3 5 #define MOXIE_R4 6 @@ -209,7 +209,7 @@ enum reg_class #define ACCUMULATE_OUTGOING_ARGS 1 /* A C statement (sans semicolon) for initializing the variable CUM - for the state at the beginning of the argument list. + for the state at the beginning of the argument list. For moxie, the first arg is passed in register 2 (aka $r0). */ #define INIT_CUMULATIVE_ARGS(CUM,FNTYPE,LIBNAME,FNDECL,N_NAMED_ARGS) \ (CUM = MOXIE_R0) @@ -300,7 +300,7 @@ enum reg_class /* Every structures size must be a multiple of 8 bits. 
*/ #define STRUCTURE_SIZE_BOUNDARY 8 -/* Look at the fundamental type that is used for a bit-field and use +/* Look at the fundamental type that is used for a bit-field and use that to impose alignment on the enclosing structure. struct s {int a:8}; should have same alignment as "int", not "char". */ #define PCC_BITFIELD_TYPE_MATTERS 1 @@ -314,7 +314,7 @@ enum reg_class (TREE_CODE (TYPE) == ARRAY_TYPE \ && TYPE_MODE (TREE_TYPE (TYPE)) == QImode \ && (ALIGN) < FASTEST_ALIGNMENT ? FASTEST_ALIGNMENT : (ALIGN)) - + /* Set this nonzero if move instructions will actually fail to work when given unaligned data. */ #define STRICT_ALIGNMENT 1 @@ -351,7 +351,7 @@ enum reg_class #define ELIMINABLE_REGS \ {{ FRAME_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM }, \ - { ARG_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM }} + { ARG_POINTER_REGNUM, HARD_FRAME_POINTER_REGNUM }} /* This macro returns the initial difference between the specified pair of registers. */ diff --git a/gcc/config/msp430/driver-msp430.cc b/gcc/config/msp430/driver-msp430.cc index a11fd3d..5ffa8ef 100644 --- a/gcc/config/msp430/driver-msp430.cc +++ b/gcc/config/msp430/driver-msp430.cc @@ -20,6 +20,7 @@ #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/netbsd.h b/gcc/config/netbsd.h index de72879..fe33f9a 100644 --- a/gcc/config/netbsd.h +++ b/gcc/config/netbsd.h @@ -131,7 +131,7 @@ along with GCC; see the file COPYING3. If not see #undef TARGET_LIBC_HAS_FUNCTION #define TARGET_LIBC_HAS_FUNCTION no_c99_libc_has_function -/* When building shared libraries, the initialization and finalization +/* When building shared libraries, the initialization and finalization functions for the library are .init and .fini respectively. */ #define COLLECT_SHARED_INIT_FUNC(STREAM,FUNC) \ diff --git a/gcc/config/nios2/elf.h b/gcc/config/nios2/elf.h index 44664c9..3efcdcf 100644 --- a/gcc/config/nios2/elf.h +++ b/gcc/config/nios2/elf.h @@ -1,6 +1,6 @@ /* Definitions of ELF target support for Altera Nios II. Copyright (C) 2012-2024 Free Software Foundation, Inc. - Contributed by Jonah Graham (jgraham@altera.com), + Contributed by Jonah Graham (jgraham@altera.com), Will Reece (wreece@altera.com), and Jeff DaSilva (jdasilva@altera.com). Contributed by Mentor Graphics, Inc. diff --git a/gcc/config/nios2/nios2.cc b/gcc/config/nios2/nios2.cc index a981e50..cb33c67 100644 --- a/gcc/config/nios2/nios2.cc +++ b/gcc/config/nios2/nios2.cc @@ -1,6 +1,6 @@ /* Target machine subroutines for Altera Nios II. Copyright (C) 2012-2024 Free Software Foundation, Inc. - Contributed by Jonah Graham (jgraham@altera.com), + Contributed by Jonah Graham (jgraham@altera.com), Will Reece (wreece@altera.com), and Jeff DaSilva (jdasilva@altera.com). Contributed by Mentor Graphics, Inc. @@ -343,7 +343,7 @@ static bool nios2_fpu_compare_enabled (enum rtx_code cond, machine_mode mode) { if (mode == SFmode) - switch (cond) + switch (cond) { case EQ: return N2FPU_OP_ENABLED_P (fcmpeqs); case NE: return N2FPU_OP_ENABLED_P (fcmpnes); @@ -354,7 +354,7 @@ nios2_fpu_compare_enabled (enum rtx_code cond, machine_mode mode) default: break; } else if (mode == DFmode) - switch (cond) + switch (cond) { case EQ: return N2FPU_OP_ENABLED_P (fcmpeqd); case NE: return N2FPU_OP_ENABLED_P (fcmpned); @@ -388,7 +388,7 @@ nios2_compute_frame_layout (void) if (cfun->machine->initialized) return cfun->machine->total_size; - + /* Calculate space needed for gp registers. 
*/ save_reg_size = 0; for (regno = 0; regno <= LAST_GP_REG; regno++) @@ -434,7 +434,7 @@ nios2_compute_frame_layout (void) { unsigned i; unsigned r; - + for (i = 0; (r = EH_RETURN_DATA_REGNO (i)) != INVALID_REGNUM; i++) if (!(save_mask & (1 << r))) { @@ -552,7 +552,7 @@ nios2_create_cfa_notes (rtx_insn *insn, bool epilogue_p) #define TEMP_REG_NUM 8 /* Emit conditional trap for checking stack limit. SIZE is the number of - additional bytes required. + additional bytes required. GDB prologue analysis depends on this generating a direct comparison to the SP register, so the adjustment to add SIZE needs to be done on @@ -995,7 +995,7 @@ nios2_set_return_address (rtx address, rtx scratch) { unsigned offset = cfun->machine->save_reg_size - 4; rtx base; - + if (frame_pointer_needed) base = hard_frame_pointer_rtx; else @@ -1080,7 +1080,7 @@ static bool prologue_saved_reg_p (unsigned regno) { gcc_assert (GP_REG_P (regno)); - + if (df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno)) return true; @@ -1135,7 +1135,7 @@ nios2_initial_elimination_offset (int from, int to) by the offset from the frame pointer to the stack pointer. */ if (to == HARD_FRAME_POINTER_REGNUM) offset -= (cfun->machine->save_regs_offset - + cfun->machine->fp_save_offset); + + cfun->machine->fp_save_offset); return offset; } @@ -1390,13 +1390,13 @@ nios2_option_override (void) /* Process -mgprel-sec= and -m0rel-sec=. */ if (nios2_gprel_sec) { - if (regcomp (&nios2_gprel_sec_regex, nios2_gprel_sec, + if (regcomp (&nios2_gprel_sec_regex, nios2_gprel_sec, REG_EXTENDED | REG_NOSUB)) error ("%<-mgprel-sec=%> argument is not a valid regular expression"); } if (nios2_r0rel_sec) { - if (regcomp (&nios2_r0rel_sec_regex, nios2_r0rel_sec, + if (regcomp (&nios2_r0rel_sec_regex, nios2_r0rel_sec, REG_EXTENDED | REG_NOSUB)) error ("%<-mr0rel-sec=%> argument is not a valid regular expression"); } @@ -1533,7 +1533,7 @@ nios2_rtx_costs (rtx x, machine_mode mode, *total = COSTS_N_INSNS (5); /* Guess? */ else if (speed) *total = COSTS_N_INSNS (2); /* Latency adjustment. */ - else + else *total = COSTS_N_INSNS (1); if (TARGET_HAS_MULX && GET_MODE (x) == DImode) { @@ -1557,7 +1557,7 @@ nios2_rtx_costs (rtx x, machine_mode mode, *total = COSTS_N_INSNS (5); /* Guess? */ else if (speed) *total = COSTS_N_INSNS (2); /* Latency adjustment. */ - else + else *total = COSTS_N_INSNS (1); return false; } @@ -1569,11 +1569,11 @@ nios2_rtx_costs (rtx x, machine_mode mode, { if (!speed) *total = COSTS_N_INSNS (1); - else + else *total = COSTS_N_INSNS (2); /* Latency adjustment. */ return false; } - + case ZERO_EXTRACT: if (TARGET_HAS_BMX) { @@ -1639,7 +1639,7 @@ nios2_call_tls_get_addr (rtx ti) rtx ret = gen_rtx_REG (Pmode, FIRST_RETVAL_REGNO); rtx fn; rtx_insn *insn; - + if (!nios2_tls_symbol) nios2_tls_symbol = init_one_libfunc ("__tls_get_addr"); @@ -2005,7 +2005,7 @@ nios2_validate_compare (machine_mode mode, rtx *cmp, rtx *op1, rtx *op2) } else if (!reg_or_0_operand (*op2, mode)) *op2 = force_reg (mode, *op2); - + check_rebuild_cmp: if (code == GT || code == GTU || code == LE || code == LEU) { @@ -2057,7 +2057,7 @@ nios2_symbolic_constant_p (rtx x) return false; } -/* Return true if X is an expression of the form +/* Return true if X is an expression of the form (PLUS reg large_constant). 
*/ static bool nios2_plus_large_constant_p (rtx x) @@ -2134,7 +2134,7 @@ nios2_valid_addr_expr_p (rtx base, rtx offset, bool strict_p) && nios2_regno_ok_for_base_p (REGNO (base), strict_p) && (offset == NULL_RTX || nios2_valid_addr_offset_p (offset) - || (nios2_large_constant_allowed () + || (nios2_large_constant_allowed () && nios2_symbolic_constant_p (offset)) || nios2_unspec_reloc_p (offset))); } @@ -2159,7 +2159,7 @@ nios2_legitimate_address_p (machine_mode mode ATTRIBUTE_UNUSED, rtx operand, /* Else, fall through. */ case LABEL_REF: - if (nios2_large_constant_allowed () + if (nios2_large_constant_allowed () && nios2_symbolic_constant_p (operand)) return true; return false; @@ -2182,7 +2182,7 @@ nios2_legitimate_address_p (machine_mode mode ATTRIBUTE_UNUSED, rtx operand, rtx op0 = XEXP (operand, 0); rtx op1 = XEXP (operand, 1); - if (nios2_valid_addr_expr_p (op0, op1, strict_p) + if (nios2_valid_addr_expr_p (op0, op1, strict_p) || nios2_valid_addr_expr_p (op1, op0, strict_p)) return true; } @@ -2192,7 +2192,7 @@ nios2_legitimate_address_p (machine_mode mode ATTRIBUTE_UNUSED, rtx operand, This requires a 16-bit relocation and isn't valid with R2 io-variant load/stores. */ case LO_SUM: - if (TARGET_ARCH_R2 + if (TARGET_ARCH_R2 && (TARGET_BYPASS_CACHE || TARGET_BYPASS_CACHE_VOLATILE)) return false; else @@ -2216,18 +2216,18 @@ nios2_legitimate_address_p (machine_mode mode ATTRIBUTE_UNUSED, rtx operand, the (plus reg symbolic_constant) and (plus reg (const ...)) forms but giving (plus reg symbol_ref) address modes the same cost as those that don't require splitting. Also, from a theoretical point of view: - - This is in line with the recommendation in the GCC internals + - This is in line with the recommendation in the GCC internals documentation to make address forms involving multiple - registers more expensive than single-register forms. - - OTOH it still encourages fwprop1 to propagate constants into + registers more expensive than single-register forms. + - OTOH it still encourages fwprop1 to propagate constants into address expressions more aggressively. - We should discourage splitting (symbol + offset) into hi/lo pairs to allow CSE'ing the symbol when it's used with more than one offset, but not so heavily as to avoid this addressing mode at all. */ static int -nios2_address_cost (rtx address, +nios2_address_cost (rtx address, machine_mode mode ATTRIBUTE_UNUSED, - addr_space_t as ATTRIBUTE_UNUSED, + addr_space_t as ATTRIBUTE_UNUSED, bool speed ATTRIBUTE_UNUSED) { if (nios2_plus_large_constant_p (address)) @@ -2258,7 +2258,7 @@ nios2_large_constant_memory_operand_p (rtx x) } -/* Return true if X is something that needs to be split into a +/* Return true if X is something that needs to be split into a high/lo_sum pair. */ bool nios2_large_constant_p (rtx x) @@ -2269,8 +2269,8 @@ nios2_large_constant_p (rtx x) } /* Given an RTX X that satisfies nios2_large_constant_p, split it into - high and lo_sum parts using TEMP as a scratch register. Emit the high - instruction and return the lo_sum expression. + high and lo_sum parts using TEMP as a scratch register. Emit the high + instruction and return the lo_sum expression. Also handle special cases involving constant integers. 
*/ rtx nios2_split_large_constant (rtx x, rtx temp) @@ -2293,7 +2293,7 @@ nios2_split_large_constant (rtx x, rtx temp) return gen_rtx_PLUS (Pmode, temp, gen_int_mode (low, Pmode)); } } - + emit_insn (gen_rtx_SET (temp, gen_rtx_HIGH (Pmode, copy_rtx (x)))); return gen_rtx_LO_SUM (Pmode, temp, copy_rtx (x)); } @@ -2317,7 +2317,7 @@ nios2_split_plus_large_constant (rtx op0, rtx op1) } /* Given a MEM OP with an address that includes a splittable symbol or - other large constant, emit some instructions to do the split and + other large constant, emit some instructions to do the split and return a new MEM. */ rtx nios2_split_large_constant_memory_operand (rtx op) @@ -2341,7 +2341,7 @@ nios2_small_section_name_p (const char *section) || startswith (section, ".sbss.") || strcmp (section, ".sdata") == 0 || startswith (section, ".sdata.") - || (nios2_gprel_sec + || (nios2_gprel_sec && regexec (&nios2_gprel_sec_regex, section, 0, NULL, 0) == 0)); } @@ -2349,7 +2349,7 @@ nios2_small_section_name_p (const char *section) static bool nios2_r0rel_section_name_p (const char *section) { - return (nios2_r0rel_sec + return (nios2_r0rel_sec && regexec (&nios2_r0rel_sec_regex, section, 0, NULL, 0) == 0); } @@ -2591,7 +2591,7 @@ nios2_legitimize_constant_address (rtx addr) base = nios2_legitimize_tls_address (base); else if (flag_pic) base = nios2_load_pic_address (base, UNSPEC_PIC_SYM, NULL_RTX); - else if (!nios2_large_constant_allowed () + else if (!nios2_large_constant_allowed () && nios2_symbolic_constant_p (addr)) return nios2_split_large_constant (addr, gen_reg_rtx (Pmode)); else if (CONST_INT_P (addr)) @@ -2625,7 +2625,7 @@ nios2_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, machine_mode mode ATTRIBUTE_UNUSED) { rtx op0, op1; - + if (CONSTANT_P (x)) return nios2_legitimize_constant_address (x); @@ -2749,15 +2749,15 @@ nios2_emit_move_sequence (rtx *operands, machine_mode mode) } } else if (gprel_constant_p (from) || r0rel_constant_p (from)) - /* Handled directly by movsi_internal as gp + offset + /* Handled directly by movsi_internal as gp + offset or r0 + offset. */ ; else if (nios2_large_constant_p (from)) /* This case covers either a regular symbol reference or an UNSPEC - representing a 32-bit offset. We split the former + representing a 32-bit offset. We split the former only conditionally and the latter always. */ { - if (!nios2_large_constant_allowed () + if (!nios2_large_constant_allowed () || nios2_large_unspec_reloc_p (from)) { rtx lo = nios2_split_large_constant (from, to); @@ -2767,7 +2767,7 @@ nios2_emit_move_sequence (rtx *operands, machine_mode mode) return true; } } - else + else /* This is a TLS or PIC symbol. */ { from = nios2_legitimize_constant_address (from); @@ -2839,7 +2839,7 @@ nios2_print_operand_punct_valid_p (unsigned char code) z: prints the third register immediate operand in assembly instructions. Outputs const0_rtx as the 'zero' register instead of '0'. - + y: same as 'z', but for specifically for logical instructions, where the processing for immediates are slightly different. 
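A standalone sketch of one common way such an out-of-range constant splits, as background for the constant-integer special case in nios2_split_large_constant above: take the low 16 bits sign-extended, and let the high part absorb a 0x10000 adjustment whenever bit 15 is set, so the two halves still sum back to the original value. split_hi_lo and the sample constant are inventions for this example; the real splitter works on RTL and returns a HIGH/LO_SUM or register-plus-offset form rather than doing scalar arithmetic.

#include <stdint.h>
#include <stdio.h>

/* Split VALUE into HIGH + LOW where LOW is the sign-extended low 16 bits
   and HIGH is therefore a multiple of 0x10000.  This mirrors the
   adjusted-high plus sign-extended-low pairing a 16-bit-immediate target
   needs when a constant is too large for one instruction.  */
static void
split_hi_lo (uint32_t value, uint32_t *high, int32_t *low)
{
  *low = (int16_t) (value & 0xffff);   /* sign-extended low half */
  *high = value - (uint32_t) *low;     /* compensates for the sign bit */
}

int
main (void)
{
  uint32_t hi;
  int32_t lo;
  split_hi_lo (0x12348765u, &hi, &lo);
  /* 0x12348765 = 0x12350000 + (-0x789b): the high part is bumped by
     0x10000 because bit 15 of the constant is set.  */
  printf ("hi=0x%08x lo=%d ok=%d\n", hi, lo,
          hi + (uint32_t) lo == 0x12348765u);
  return 0;
}

The ok flag confirms that the high part plus the sign-extended low half reconstructs the original constant.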
@@ -3292,7 +3292,7 @@ nios2_fpu_insn_asm (enum n2fpu_code code) static char buf[256]; const char *op1, *op2, *op3; int ln = 256, n = 0; - + int N = N2FPU_N (code); int num_operands = N2FPU (code).num_operands; const char *insn_name = N2FPU_NAME (code); @@ -3384,7 +3384,7 @@ nios2_fpu_insn_asm (enum n2fpu_code code) static rtx nios2_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) { - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); rtx return_rtx = NULL_RTX; if (cum->regs_used < NUM_ARG_REGS) @@ -3400,7 +3400,7 @@ nios2_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) static int nios2_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg) { - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); HOST_WIDE_INT param_size = arg.promoted_size_in_bytes (); gcc_assert (param_size >= 0); @@ -3420,7 +3420,7 @@ static void nios2_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &arg) { - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); HOST_WIDE_INT param_size = arg.promoted_size_in_bytes (); gcc_assert (param_size >= 0); @@ -3517,7 +3517,7 @@ nios2_setup_incoming_varargs (cumulative_args_t cum_v, const function_arg_info &arg, int *pretend_size, int second_time) { - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); CUMULATIVE_ARGS local_cum; cumulative_args_t local_cum_v = pack_cumulative_args (&local_cum); int regs_to_push; @@ -3838,8 +3838,8 @@ nios2_expand_builtin_insn (const struct nios2_builtin_desc *d, int n, else { error ("invalid argument to built-in function %s", d->name); - return has_target_p ? gen_reg_rtx (ops[0].mode) : const0_rtx; - } + return has_target_p ? gen_reg_rtx (ops[0].mode) : const0_rtx; + } } /* Expand ldio/stio and ldex/ldsex/stex/stsex form load-store @@ -3954,7 +3954,7 @@ nios2_expand_cache_builtin (tree exp, rtx target ATTRIBUTE_UNUSED, mem = gen_rtx_MEM (SImode, addr); create_input_operand (&ops[0], mem, SImode); - + return nios2_expand_builtin_insn (d, 1, ops, false); } @@ -3968,7 +3968,7 @@ nios2_expand_wrpie_builtin (tree exp, rtx target, val = expand_normal (CALL_EXPR_ARG (exp, 0)); create_input_operand (&ops[1], val, SImode); create_output_operand (&ops[0], target, SImode); - + return nios2_expand_builtin_insn (d, 2, ops, true); } @@ -3982,10 +3982,10 @@ nios2_expand_eni_builtin (tree exp, rtx target ATTRIBUTE_UNUSED, if (INTVAL (imm) != 0 && INTVAL (imm) != 1) { error ("the ENI instruction operand must be either 0 or 1"); - return const0_rtx; + return const0_rtx; } create_integer_operand (&ops[0], INTVAL (imm)); - + return nios2_expand_builtin_insn (d, 1, ops, false); } @@ -4307,7 +4307,7 @@ nios2_valid_target_attribute_rec (tree args) if (ISSPACE (*t)) continue; if (!ISDIGIT (*t)) - { + { error ("%<custom-%s=%> argument should be " "a non-negative integer", N2FPU_NAME (code)); return false; @@ -4323,7 +4323,7 @@ nios2_valid_target_attribute_rec (tree args) error ("%<custom-%s=%> is not recognized as FPU instruction", argstr + 7); return false; - } + } } else { @@ -4683,7 +4683,7 @@ static bool nios2_add_insn_narrow[] = { false, false}; /* Function to classify kinds of add instruction patterns. 
*/ -static enum nios2_add_insn_kind +static enum nios2_add_insn_kind nios2_add_insn_classify (rtx_insn *insn ATTRIBUTE_UNUSED, rtx lhs, rtx rhs1, rtx rhs2) { @@ -5039,7 +5039,7 @@ ldstwm_operation_p (rtx op, bool load_p) { int start, i, end = XVECLEN (op, 0) - 1, last_regno = -1; unsigned int regset = 0; - rtx base_reg, offset; + rtx base_reg, offset; rtx first_elt = XVECEXP (op, 0, 0); bool inc_p = true; bool wb_p = base_reg_adjustment_p (first_elt, &base_reg, &offset); @@ -5413,7 +5413,7 @@ nios2_reorg (void) max_labelno = max_label_num (); min_labelno = get_first_label_num (); label_align = XCNEWVEC (unsigned char, max_labelno - min_labelno + 1); - + /* Iterate on inserting alignment and adjusting branch lengths until no more changes. */ while (changed) @@ -5464,7 +5464,7 @@ nios2_adjust_reg_alloc_order (void) const int cdx_reg_alloc_order[] = { /* Call-clobbered GPRs within CDX 3-bit encoded range. */ - 2, 3, 4, 5, 6, 7, + 2, 3, 4, 5, 6, 7, /* Call-saved GPRs within CDX 3-bit encoded range. */ 16, 17, /* Other call-clobbered GPRs. */ diff --git a/gcc/config/nios2/nios2.h b/gcc/config/nios2/nios2.h index bad72671..88ad166 100644 --- a/gcc/config/nios2/nios2.h +++ b/gcc/config/nios2/nios2.h @@ -1,6 +1,6 @@ /* Definitions of target machine for Altera Nios II. Copyright (C) 2012-2024 Free Software Foundation, Inc. - Contributed by Jonah Graham (jgraham@altera.com), + Contributed by Jonah Graham (jgraham@altera.com), Will Reece (wreece@altera.com), and Jeff DaSilva (jdasilva@altera.com). Contributed by Mentor Graphics, Inc. @@ -127,7 +127,7 @@ 29 r29 ea Exception Return Address 30 r30 ba Breakpoint Return Address 31 r31 ra Return Address - + 32 ctl0 status 33 ctl1 estatus STATUS saved by exception 34 ctl2 bstatus STATUS saved by break @@ -141,7 +141,7 @@ 40 First Pseudo Register In addition, r12 is used as the static chain register and r13, r14, and r15 - are clobbered by PLT code sequences. + are clobbered by PLT code sequences. The definitions for all the hard register numbers are located in nios2.md. */ diff --git a/gcc/config/nvptx/gen-opt.sh b/gcc/config/nvptx/gen-opt.sh index 3f78382..6022f51 100644 --- a/gcc/config/nvptx/gen-opt.sh +++ b/gcc/config/nvptx/gen-opt.sh @@ -38,12 +38,24 @@ echo . $gen_copyright_sh opt +# Not emitting the following here (in addition to having it in 'nvptx.opt'), as +# we'll otherwise run into: +# +# gtyp-input.list:10: file [...]/gcc/config/nvptx/nvptx-opts.h specified more than once for language (all) +# make[2]: *** [Makefile:2981: s-gtype] Error 1 +: || +cat <<EOF + +HeaderInclude +config/nvptx/nvptx-opts.h +EOF + # Separator. echo cat <<EOF Enum -Name(ptx_isa) Type(int) +Name(ptx_isa) Type(enum ptx_isa) Known PTX ISA target architectures (for use with the -misa= option): EOF diff --git a/gcc/config/nvptx/mkoffload.cc b/gcc/config/nvptx/mkoffload.cc index 503b1ab..ddb1c66 100644 --- a/gcc/config/nvptx/mkoffload.cc +++ b/gcc/config/nvptx/mkoffload.cc @@ -29,6 +29,7 @@ #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -61,6 +62,7 @@ static const char *omp_requires_file; static const char *ptx_dumpbase; enum offload_abi offload_abi = OFFLOAD_ABI_UNSET; +const char *offload_abi_host_opts = NULL; /* Delete tempfiles. 
*/ @@ -607,17 +609,10 @@ compile_native (const char *infile, const char *outfile, const char *compiler, obstack_ptr_grow (&argv_obstack, ptx_dumpbase); obstack_ptr_grow (&argv_obstack, "-dumpbase-ext"); obstack_ptr_grow (&argv_obstack, ".c"); - switch (offload_abi) - { - case OFFLOAD_ABI_LP64: - obstack_ptr_grow (&argv_obstack, "-m64"); - break; - case OFFLOAD_ABI_ILP32: - obstack_ptr_grow (&argv_obstack, "-m32"); - break; - default: - gcc_unreachable (); - } + if (!offload_abi_host_opts) + fatal_error (input_location, + "%<-foffload-abi-host-opts%> not specified."); + obstack_ptr_grow (&argv_obstack, offload_abi_host_opts); obstack_ptr_grow (&argv_obstack, infile); obstack_ptr_grow (&argv_obstack, "-c"); obstack_ptr_grow (&argv_obstack, "-o"); @@ -721,6 +716,15 @@ main (int argc, char **argv) "unrecognizable argument of option " STR); } #undef STR + else if (startswith (argv[i], "-foffload-abi-host-opts=")) + { + if (offload_abi_host_opts) + fatal_error (input_location, + "%<-foffload-abi-host-opts%> specified " + "multiple times"); + offload_abi_host_opts + = argv[i] + strlen ("-foffload-abi-host-opts="); + } else if (strcmp (argv[i], "-fopenmp") == 0) fopenmp = true; else if (strcmp (argv[i], "-fopenacc") == 0) diff --git a/gcc/config/nvptx/nvptx-c.cc b/gcc/config/nvptx/nvptx-c.cc index 8538952..516ce90 100644 --- a/gcc/config/nvptx/nvptx-c.cc +++ b/gcc/config/nvptx/nvptx-c.cc @@ -51,10 +51,8 @@ nvptx_cpu_cpp_builtins (void) cpp_define (parse_in, ptx_sm); { - unsigned major - = ptx_version_to_number ((ptx_version)ptx_version_option, true); - unsigned minor - = ptx_version_to_number ((ptx_version)ptx_version_option, false); + unsigned major = ptx_version_to_number (ptx_version_option, true); + unsigned minor = ptx_version_to_number (ptx_version_option, false); cpp_define_formatted (parse_in, "__PTX_ISA_VERSION_MAJOR__=%u", major); cpp_define_formatted (parse_in, "__PTX_ISA_VERSION_MINOR__=%u", minor); } diff --git a/gcc/config/nvptx/nvptx-gen.opt b/gcc/config/nvptx/nvptx-gen.opt index b097caf..84b70d6 100644 --- a/gcc/config/nvptx/nvptx-gen.opt +++ b/gcc/config/nvptx/nvptx-gen.opt @@ -20,7 +20,7 @@ ; <http://www.gnu.org/licenses/>. Enum -Name(ptx_isa) Type(int) +Name(ptx_isa) Type(enum ptx_isa) Known PTX ISA target architectures (for use with the -misa= option): EnumValue diff --git a/gcc/config/nvptx/nvptx-opts.h b/gcc/config/nvptx/nvptx-opts.h index f897532..fb5147c 100644 --- a/gcc/config/nvptx/nvptx-opts.h +++ b/gcc/config/nvptx/nvptx-opts.h @@ -22,6 +22,7 @@ enum ptx_isa { + PTX_ISA_unset, #define NVPTX_SM(XX, SEP) PTX_ISA_SM ## XX SEP #define NVPTX_SM_SEP , #include "nvptx-sm.def" @@ -31,7 +32,8 @@ enum ptx_isa enum ptx_version { - PTX_VERSION_default, + PTX_VERSION_unset, + PTX_VERSION_default = PTX_VERSION_unset, PTX_VERSION_3_0, PTX_VERSION_3_1, PTX_VERSION_4_2, diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index 2a8f713..3072d37 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -20,6 +20,7 @@ #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include <sstream> #include "system.h" @@ -231,8 +232,7 @@ first_ptx_version_supporting_sm (enum ptx_isa sm) static enum ptx_version default_ptx_version_option (void) { - enum ptx_version first - = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option); + enum ptx_version first = first_ptx_version_supporting_sm (ptx_isa_option); /* Pick a version that supports the sm. 
*/ enum ptx_version res = first; @@ -311,20 +311,21 @@ sm_version_to_string (enum ptx_isa sm) static void handle_ptx_version_option (void) { - if (!OPTION_SET_P (ptx_version_option) - || ptx_version_option == PTX_VERSION_default) + if (!OPTION_SET_P (ptx_version_option)) + gcc_checking_assert (ptx_version_option == PTX_VERSION_default); + + if (ptx_version_option == PTX_VERSION_default) { ptx_version_option = default_ptx_version_option (); return; } - enum ptx_version first - = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option); + enum ptx_version first = first_ptx_version_supporting_sm (ptx_isa_option); if (ptx_version_option < first) error ("PTX version (%<-mptx%>) needs to be at least %s to support selected" " %<-misa%> (sm_%s)", ptx_version_to_string (first), - sm_version_to_string ((enum ptx_isa)ptx_isa_option)); + sm_version_to_string (ptx_isa_option)); } /* Implement TARGET_OPTION_OVERRIDE. */ @@ -336,7 +337,9 @@ nvptx_option_override (void) /* Via nvptx 'OPTION_DEFAULT_SPECS', '-misa' always appears on the command line; but handle the case that the compiler is not run via the driver. */ - if (!OPTION_SET_P (ptx_isa_option)) + gcc_checking_assert ((ptx_isa_option == PTX_ISA_unset) + == (!OPTION_SET_P (ptx_isa_option))); + if (ptx_isa_option == PTX_ISA_unset) fatal_error (UNKNOWN_LOCATION, "%<-march=%> must be specified"); handle_ptx_version_option (); @@ -594,7 +597,7 @@ nvptx_emit_forking (unsigned mask, bool is_call) if (mask) { rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX)); - + /* Emit fork at all levels. This helps form SESE regions, as it creates a block with a single successor before entering a partitooned region. That is a good candidate for the end of @@ -902,10 +905,10 @@ write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode) const char *ptx_type = nvptx_ptx_type_from_mode (mode, false); const char *pfx = "\t.reg"; const char *sfx = ";\n"; - + if (for_proto) pfx = "(.param", sfx = "_out) "; - + s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx; } @@ -928,7 +931,7 @@ write_return_type (std::stringstream &s, bool for_proto, tree type) { if (for_proto) return return_in_mem; - + /* Named return values can cause us to return a pointer as well as expect an argument for the return location. This is optimization-level specific, so no caller can make use of @@ -995,8 +998,7 @@ static void write_fn_proto_1 (std::stringstream &s, bool is_defn, const char *name, const_tree decl, bool force_public) { - if (lookup_attribute ("alias", DECL_ATTRIBUTES (decl)) == NULL) - write_fn_marker (s, is_defn, TREE_PUBLIC (decl) || force_public, name); + write_fn_marker (s, is_defn, TREE_PUBLIC (decl) || force_public, name); /* PTX declaration. */ if (DECL_EXTERNAL (decl)) @@ -1055,7 +1057,7 @@ write_fn_proto_1 (std::stringstream &s, bool is_defn, for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--) { tree type = prototyped ? 
TREE_VALUE (args) : TREE_TYPE (args); - + if (not_atomic_weak_arg) argno = write_arg_type (s, -1, argno, type, prototyped); else @@ -1225,7 +1227,7 @@ static void nvptx_maybe_record_fnsym (rtx sym) { tree decl = SYMBOL_REF_DECL (sym); - + if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl)) nvptx_record_needed_fndecl (decl); } @@ -1509,7 +1511,7 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) bool return_in_mem = write_return_type (s, false, result_type); if (return_in_mem) argno = write_arg_type (s, 0, argno, ptr_type_node, true); - + /* Declare and initialize incoming arguments. */ tree args = TYPE_ARG_TYPES (fntype); bool prototyped = true; @@ -1900,7 +1902,7 @@ nvptx_expand_call (rtx retval, rtx address) if (varargs) XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs); - gcc_assert (vec_pos = XVECLEN (pat, 0)); + gcc_assert (vec_pos == XVECLEN (pat, 0)); nvptx_emit_forking (parallel, true); emit_call_insn (pat); @@ -1944,7 +1946,7 @@ static rtx nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src) { rtx res; - + switch (GET_MODE (src)) { case E_DImode: @@ -1965,7 +1967,7 @@ static rtx nvptx_gen_pack (rtx dst, rtx src0, rtx src1) { rtx res; - + switch (GET_MODE (dst)) { case E_DImode: @@ -2068,7 +2070,7 @@ nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind) case E_BImode: { rtx tmp = gen_reg_rtx (SImode); - + start_sequence (); emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx)); emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind)); @@ -2091,7 +2093,7 @@ nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind) end_sequence (); } break; - + default: gcc_unreachable (); } @@ -2131,7 +2133,7 @@ enum propagate_mask /* Generate instruction(s) to spill or fill register REG to/from the worker broadcast array. PM indicates what is to be done, REP how many loop iterations will be executed (0 for not a loop). */ - + static rtx nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep, broadcast_data_t *data, bool vector) @@ -2144,7 +2146,7 @@ nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep, case E_BImode: { rtx tmp = gen_reg_rtx (SImode); - + start_sequence (); if (pm & PM_read) emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx)); @@ -2171,7 +2173,7 @@ nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep, if (data->offset) addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset)); } - + addr = gen_rtx_MEM (mode, addr); if (pm == PM_read) res = gen_rtx_SET (addr, reg); @@ -2184,7 +2186,7 @@ nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep, { /* We're using a ptr, increment it. */ start_sequence (); - + emit_insn (res); emit_insn (gen_adddi3 (data->ptr, data->ptr, GEN_INT (GET_MODE_SIZE (GET_MODE (reg))))); @@ -2257,7 +2259,7 @@ output_init_frag (rtx sym) init_frag.val = 0; init_frag.offset = 0; init_frag.remaining--; - + if (sym) { bool function = (SYMBOL_REF_DECL (sym) @@ -2738,7 +2740,7 @@ nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) fprintf (asm_out_file, "\t\tcall "); if (result != NULL_RTX) fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]); - + if (decl) { char *replaced_dots = NULL; @@ -3000,7 +3002,7 @@ nvptx_print_operand (FILE *file, rtx x, int code) { nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x); /* Same order as nvptx_shuffle_kind. 
*/ - static const char *const kinds[] = + static const char *const kinds[] = {".up", ".down", ".bfly", ".idx"}; fputs (kinds[kind], file); } @@ -3495,7 +3497,7 @@ struct parallel { /* Parent parallel. */ parallel *parent; - + /* Next sibling parallel. */ parallel *next; @@ -3539,7 +3541,7 @@ parallel::parallel (parallel *parent_, unsigned mask_) forked_block = join_block = 0; forked_insn = join_insn = 0; fork_insn = joining_insn = 0; - + if (parent) { next = parent->inner; @@ -3627,7 +3629,7 @@ nvptx_split_blocks (bb_insn_map_t *map) block = elt->second; remap = block; } - + /* Split block before insn. The insn is in the new block */ edge e = split_block (block, PREV_INSN (elt->first)); @@ -3799,7 +3801,7 @@ nvptx_discover_pars (bb_insn_map_t *map) nvptx_dump_pars (par, 0); fprintf (dump_file, "\n"); } - + return par; } @@ -3830,7 +3832,7 @@ nvptx_discover_pars (bb_insn_map_t *map) the node itself and one for the output edges. Such back edges are referred to as 'Brackets'. Cycle equivalent nodes will have the same set of brackets. - + Determining bracket equivalency is done by maintaining a list of brackets in such a manner that the list length and final bracket uniquely identify the set. @@ -3840,7 +3842,7 @@ nvptx_discover_pars (bb_insn_map_t *map) algorithm. Notice it doesn't actually find the set of nodes within a particular region, just unorderd sets of nodes that are the entries and exits of SESE regions. - + After determining cycle equivalency, we need to find the minimal set of SESE regions. Do this with a DFS coloring walk of the complete graph. We're either 'looking' or 'coloring'. When @@ -3931,7 +3933,7 @@ struct bb_sese back.first ? back.first->index : 0, back.second); brackets.safe_push (bracket (back)); } - + void append (bb_sese *child); void remove (const pseudo_node_t &); @@ -4019,10 +4021,10 @@ nvptx_sese_number (int n, int p, int dir, basic_block b, if (dump_file) fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n", b->index, n, p, dir); - + BB_SET_SESE (b, new bb_sese (n, p, dir)); p = n; - + n += 3; list->quick_push (b); @@ -4039,7 +4041,7 @@ nvptx_sese_number (int n, int p, int dir, basic_block b, FOR_EACH_EDGE (e, ei, edges) { basic_block target = *(basic_block *)((char *)e + offset); - + if (target->flags & BB_VISITED) n = nvptx_sese_number (n, p, dir, target, list); } @@ -4117,7 +4119,7 @@ nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, /* Non-parental ancestor node -- a backlink. */ int d = usd * t_sese->dir; int back = t_sese->node + d; - + if (hi_back > back) { hi_back = back; @@ -4152,7 +4154,7 @@ nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, sese->push (pseudo_node_t (nullptr, 0)); } } - + /* If this node leads directly or indirectly to a no-return region of the graph, then fake a backedge to entry node. 
*/ if (!sese->brackets.length () || !edges || !edges->length ()) @@ -4209,7 +4211,7 @@ nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, node_child = t_sese->high; } } - + sese->push (node_child); } } @@ -4232,7 +4234,7 @@ nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t ®ions, gcc_assert (coloring < 0 || (sese && coloring == sese->color)); return; } - + block->flags |= BB_VISITED; if (sese) @@ -4264,7 +4266,7 @@ nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t ®ions, { edge e; edge_iterator ei; - + FOR_EACH_EDGE (e, ei, block->succs) nvptx_sese_color (color_counts, regions, e->dest, coloring); } @@ -4281,7 +4283,7 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t ®ions) basic_block block; int ix; - /* First clear each BB of the whole function. */ + /* First clear each BB of the whole function. */ FOR_ALL_BB_FN (block, cfun) { block->flags &= ~BB_VISITED; @@ -4312,7 +4314,7 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t ®ions) if (dump_file) fprintf (dump_file, "Searching graph starting at %d\n", block->index); - + /* Number the nodes reachable from block initial DFS order. */ int depth = nvptx_sese_number (2, 0, +1, block, &spanlist); @@ -4342,7 +4344,7 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t ®ions) { unsigned count; const char *comma = ""; - + fprintf (dump_file, "Found %d cycle equivalents\n", color_counts.length ()); for (ix = 0; color_counts.iterate (ix, &count); ix++) @@ -4362,7 +4364,7 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t ®ions) } fprintf (dump_file, "\n"); } - + /* Now we've colored every block in the subgraph. We now need to determine the minimal set of SESE regions that cover that subgraph. Do this with a DFS walk of the complete function. @@ -4384,7 +4386,7 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t ®ions) { const char *comma = ""; int len = regions.length (); - + fprintf (dump_file, "SESE regions:"); for (ix = 0; ix != len; ix++) { @@ -4414,7 +4416,7 @@ nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t ®ions) } fprintf (dump_file, "\n\n"); } - + for (ix = 0; blocks.iterate (ix, &block); ix++) delete BB_GET_SESE (block); } @@ -4476,7 +4478,7 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn, idx = gen_reg_rtx (SImode); pred = gen_reg_rtx (BImode); label = gen_label_rtx (); - + emit_insn (gen_rtx_SET (idx, GEN_INT (fs))); /* Allow worker function to initialize anything needed. */ rtx init = fn (tmp, PM_loop_begin, fs, data, vector); @@ -4534,7 +4536,7 @@ warp_prop_gen (rtx reg, propagate_mask pm, { if (!(pm & PM_read_write)) return 0; - + return nvptx_gen_warp_bcast (reg); } @@ -4796,7 +4798,7 @@ verify_neutering_labels (basic_block to, rtx_insn *vector_label, /* Single neutering according to MASK. FROM is the incoming block and TO is the outgoing block. These may be the same block. Insert at start of FROM: - + if (tid.<axis>) goto end. and insert before ending branch of TO (if there is such an insn): @@ -5165,7 +5167,7 @@ nvptx_process_pars (parallel *par) { if (nvptx_optimize) nvptx_optimize_inner (par); - + unsigned inner_mask = par->mask; /* Do the inner parallels first. 
*/ @@ -5231,7 +5233,7 @@ nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer) & (GOMP_DIM_MASK (GOMP_DIM_WORKER) | GOMP_DIM_MASK (GOMP_DIM_VECTOR))); unsigned skip_mask = 0, neuter_mask = 0; - + if (par->inner) nvptx_neuter_pars (par->inner, modes, outer | me); @@ -5292,7 +5294,7 @@ nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer) if (skip_mask) nvptx_skip_par (skip_mask, par); - + if (par->next) nvptx_neuter_pars (par->next, modes, outer); } @@ -5735,7 +5737,7 @@ nvptx_reorg (void) if (dump_file) df_dump (dump_file); - + /* Mark unused regs as unused. */ int max_regs = max_reg_num (); for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++) @@ -5953,13 +5955,11 @@ nvptx_file_start (void) fputs ("// BEGIN PREAMBLE\n", asm_out_file); fputs ("\t.version\t", asm_out_file); - fputs (ptx_version_to_string ((enum ptx_version)ptx_version_option), - asm_out_file); + fputs (ptx_version_to_string (ptx_version_option), asm_out_file); fputs ("\n", asm_out_file); fputs ("\t.target\tsm_", asm_out_file); - fputs (sm_version_to_string ((enum ptx_isa)ptx_isa_option), - asm_out_file); + fputs (sm_version_to_string (ptx_isa_option), asm_out_file); fputs ("\n", asm_out_file); fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode)); @@ -6031,7 +6031,7 @@ nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore) { if (ignore) return target; - + rtx src = expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, mode, EXPAND_NORMAL); if (!REG_P (src)) @@ -6041,7 +6041,7 @@ nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore) NULL_RTX, SImode, EXPAND_NORMAL); rtx op = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, SImode, EXPAND_NORMAL); - + if (!REG_P (idx) && GET_CODE (idx) != CONST_INT) idx = copy_to_mode_reg (SImode, idx); @@ -6060,7 +6060,7 @@ nvptx_expand_brev (tree exp, rtx target, machine_mode mode, int ignore) { if (ignore) return target; - + rtx arg = expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, mode, EXPAND_NORMAL); if (!REG_P (arg)) @@ -6150,7 +6150,7 @@ nvptx_expand_cmp_swap (tree exp, rtx target, machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore)) { machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); - + if (!target) target = gen_reg_rtx (mode); @@ -6167,7 +6167,7 @@ nvptx_expand_cmp_swap (tree exp, rtx target, cmp = copy_to_mode_reg (mode, cmp); if (!REG_P (src)) src = copy_to_mode_reg (mode, src); - + if (mode == SImode) pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx); else @@ -6747,7 +6747,7 @@ nvptx_generate_vector_shuffle (location_t loc, fn = NVPTX_BUILTIN_SHUFFLELL; arg_type = long_long_unsigned_type_node; } - + tree call = nvptx_builtin_decl (fn, true); tree bits = build_int_cst (unsigned_type_node, shift); tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN); @@ -6784,7 +6784,7 @@ static tree nvptx_global_lock_addr () { tree v = global_lock_var; - + if (!v) { tree name = get_identifier ("__reduction_lock"); @@ -6847,7 +6847,7 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, gimple *init_end = gimple_seq_last (init_seq); gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT); - + /* Split the block just after the init stmts. 
*/ basic_block pre_bb = gsi_bb (*gsi); edge pre_edge = split_block (pre_bb, init_end); @@ -6859,7 +6859,7 @@ nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi, tree expect_var = make_ssa_name (arg_type); tree actual_var = make_ssa_name (arg_type); tree write_var = make_ssa_name (arg_type); - + /* Build and insert the reduction calculation. */ gimple_seq red_seq = NULL; tree write_expr = fold_build1 (code, var_type, expect_var); @@ -6961,7 +6961,7 @@ nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, basic_block update_bb = locked_edge->dest; lock_bb = locked_edge->src; *gsi = gsi_for_stmt (gsi_stmt (*gsi)); - + /* Create the lock loop ... */ locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU; locked_edge->probability = profile_probability::even (); @@ -6993,11 +6993,11 @@ nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, tree ref_in = build_simple_mem_ref (ptr); TREE_THIS_VOLATILE (ref_in) = 1; gimplify_assign (acc_in, ref_in, &red_seq); - + tree acc_out = make_ssa_name (var_type); tree update_expr = fold_build2 (op, var_type, ref_in, var); gimplify_assign (acc_out, update_expr, &red_seq); - + tree ref_out = build_simple_mem_ref (ptr); TREE_THIS_VOLATILE (ref_out) = 1; gimplify_assign (ref_out, acc_out, &red_seq); @@ -7060,7 +7060,7 @@ nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa) if (!integer_zerop (ref_to_res)) var = build_simple_mem_ref (ref_to_res); } - + if (level == GOMP_DIM_WORKER || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE)) { @@ -7097,7 +7097,7 @@ nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa) tree init = omp_reduction_init_op (gimple_location (call), rcode, TREE_TYPE (var)); gimple_seq seq = NULL; - + push_gimplify_context (true); if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE) @@ -7122,7 +7122,7 @@ nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa) /* Fixup flags from call_bb to init_bb. */ init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE; init_edge->probability = profile_probability::even (); - + /* Set the initialization stmts. */ gimple_seq init_seq = NULL; tree init_var = make_ssa_name (TREE_TYPE (var)); @@ -7134,7 +7134,7 @@ nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa) gsi_prev (&gsi); edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi)); basic_block dst_bb = inited_edge->dest; - + /* Create false edge from call_bb to dst_bb. 
*/ edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE); nop_edge->probability = profile_probability::even (); @@ -7249,7 +7249,7 @@ nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa) tree var = gimple_call_arg (call, 2); int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); gimple_seq seq = NULL; - + push_gimplify_context (true); if (level == GOMP_DIM_WORKER || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE)) @@ -7276,7 +7276,7 @@ nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa) if (lhs) gimplify_assign (lhs, var, &seq); - + pop_gimplify_context (NULL); gsi_replace_with_seq (&gsi, seq, true); @@ -7583,7 +7583,8 @@ nvptx_mem_local_p (rtx mem) while (0) void -nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value) +nvptx_asm_output_def_from_decls (FILE *stream, tree name, + tree value ATTRIBUTE_UNUSED) { if (nvptx_alias == 0 || !TARGET_PTX_6_3) { @@ -7618,7 +7619,8 @@ nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value) return; } - if (!cgraph_node::get (name)->referred_to_p ()) + cgraph_node *cnode = cgraph_node::get (name); + if (!cnode->referred_to_p ()) /* Prevent "Internal error: reference to deleted section". */ return; @@ -7627,8 +7629,27 @@ nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value) fputs (s.str ().c_str (), stream); tree id = DECL_ASSEMBLER_NAME (name); + + /* Walk alias chain to get reference callgraph node. + The rationale of using ultimate_alias_target here is that + PTX's .alias directive only supports 1-level aliasing where + aliasee is function defined in same module. + + So for the following case: + int foo() { return 42; } + int bar () __attribute__((alias ("foo"))); + int baz () __attribute__((alias ("bar"))); + + should resolve baz to foo: + .visible .func (.param.u32 %value_out) baz; + .alias baz,foo; */ + symtab_node *alias_target_node = cnode->ultimate_alias_target (); + tree alias_target_id = DECL_ASSEMBLER_NAME (alias_target_node->decl); + std::stringstream s_def; + write_fn_marker (s_def, true, TREE_PUBLIC (name), IDENTIFIER_POINTER (id)); + fputs (s_def.str ().c_str (), stream); NVPTX_ASM_OUTPUT_DEF (stream, IDENTIFIER_POINTER (id), - IDENTIFIER_POINTER (value)); + IDENTIFIER_POINTER (alias_target_id)); } #undef NVPTX_ASM_OUTPUT_DEF diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index deb0066..c040740 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -17,6 +17,9 @@ ; along with GCC; see the file COPYING3. If not see ; <http://www.gnu.org/licenses/>. +HeaderInclude +config/nvptx/nvptx-opts.h + ; It's not clear whether this was ever build/tested/used, so this is no longer ; exposed to the user. ;m32 @@ -53,7 +56,7 @@ Target Mask(GOMP) Generate code for OpenMP offloading: enables -msoft-stack and -muniform-simt. misa= -Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) +Target RejectNegative ToLower Joined Enum(ptx_isa) Var(ptx_isa_option) Init(PTX_ISA_unset) Specify the PTX ISA target architecture to use. 
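Editorial aside: the nvptx option hunks above appear to replace implicit "was this option set?" bookkeeping with explicit PTX_ISA_unset / PTX_VERSION_unset sentinels, Init(...) defaults in the .opt files, and enum-typed option variables (so the casts in nvptx.cc can go away). Below is a minimal standalone sketch of that sentinel pattern, not GCC code; every name in it (ptx_isa_sketch, ptx_version_sketch, first_version_supporting, handle_options, and the particular ISA/version pairings) is invented for illustration.

// Standalone sketch (not GCC code) of the "unset sentinel" option pattern:
// the first enumerator names the unset state, the option variable is
// initialized to it, and option handling tests the sentinel instead of a
// separate "was this set" flag.

#include <cstdio>
#include <cstdlib>

enum ptx_isa_sketch { ISA_UNSET, ISA_SM_30, ISA_SM_53, ISA_SM_80 };
enum ptx_version_sketch { VER_UNSET, VER_3_1, VER_4_2, VER_6_0, VER_7_0 };

// Hypothetical stand-ins for the option variables a driver would set.
static ptx_isa_sketch isa_option = ISA_UNSET;
static ptx_version_sketch version_option = VER_UNSET;

// Pick the oldest version that supports the selected ISA.  The pairings
// here are illustrative only, not the real PTX support matrix.
static ptx_version_sketch
first_version_supporting (ptx_isa_sketch isa)
{
  switch (isa)
    {
    case ISA_SM_30: return VER_3_1;
    case ISA_SM_53: return VER_4_2;
    case ISA_SM_80: return VER_7_0;
    default: abort ();
    }
}

static void
handle_options ()
{
  // The ISA must have been given, either by the user or by the driver.
  if (isa_option == ISA_UNSET)
    {
      fprintf (stderr, "error: -march= must be specified\n");
      exit (1);
    }

  // An unset version defaults from the ISA; an explicit one is validated.
  if (version_option == VER_UNSET)
    version_option = first_version_supporting (isa_option);
  else if (version_option < first_version_supporting (isa_option))
    {
      fprintf (stderr, "error: PTX version too old for selected ISA\n");
      exit (1);
    }
}

int
main ()
{
  isa_option = ISA_SM_80;   // as if the driver had passed -march=sm_80
  handle_options ();        // version_option defaults to VER_7_0
  printf ("isa=%d version=%d\n", (int) isa_option, (int) version_option);
  return 0;
}

The design point mirrored here is that an in-band "unset" enumerator keeps the default-and-validate logic in one place, instead of splitting it between an external was-it-set flag and a separately encoded default value.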
march= @@ -118,7 +121,7 @@ march-map=sm_90a Target RejectNegative Alias(misa=,sm_80) Enum -Name(ptx_version) Type(int) +Name(ptx_version) Type(enum ptx_version) Known PTX ISA versions (for use with the -mptx= option): EnumValue @@ -137,7 +140,7 @@ EnumValue Enum(ptx_version) String(_) Value(PTX_VERSION_default) mptx= -Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) +Target RejectNegative ToLower Joined Enum(ptx_version) Var(ptx_version_option) Init(PTX_VERSION_unset) Specify the PTX ISA version to use. minit-regs= diff --git a/gcc/config/openbsd-stdint.h b/gcc/config/openbsd-stdint.h index a6da1da..00ca36c 100644 --- a/gcc/config/openbsd-stdint.h +++ b/gcc/config/openbsd-stdint.h @@ -1,5 +1,5 @@ #define SIG_ATOMIC_TYPE "int" - + #define INT8_TYPE "signed char" #define INT16_TYPE "short int" #define INT32_TYPE "int" @@ -8,7 +8,7 @@ #define UINT16_TYPE "short unsigned int" #define UINT32_TYPE "unsigned int" #define UINT64_TYPE "long long unsigned int" - + #define INT_LEAST8_TYPE "signed char" #define INT_LEAST16_TYPE "short int" #define INT_LEAST32_TYPE "int" @@ -17,7 +17,7 @@ #define UINT_LEAST16_TYPE "short unsigned int" #define UINT_LEAST32_TYPE "unsigned int" #define UINT_LEAST64_TYPE "long long unsigned int" - + #define INT_FAST8_TYPE "int" #define INT_FAST16_TYPE "int" #define INT_FAST32_TYPE "int" @@ -29,6 +29,6 @@ #define INTMAX_TYPE "long long int" #define UINTMAX_TYPE "long long unsigned int" - + #define INTPTR_TYPE "long int" #define UINTPTR_TYPE "long unsigned int" diff --git a/gcc/config/openbsd.h b/gcc/config/openbsd.h index 3493df9..6522527 100644 --- a/gcc/config/openbsd.h +++ b/gcc/config/openbsd.h @@ -17,27 +17,27 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ -/* Common OpenBSD configuration. +/* Common OpenBSD configuration. All OpenBSD architectures include this file, which is intended as - a repository for common defines. + a repository for common defines. Some defines are common to all architectures, a few of them are triggered by OBSD_* guards, so that we won't override architecture defaults by mistakes. - OBSD_HAS_CORRECT_SPECS: + OBSD_HAS_CORRECT_SPECS: another mechanism provides correct specs already. - OBSD_NO_DYNAMIC_LIBRARIES: + OBSD_NO_DYNAMIC_LIBRARIES: no implementation of dynamic libraries. - OBSD_OLD_GAS: + OBSD_OLD_GAS: older flavor of gas which needs help for PIC. OBSD_HAS_DECLARE_FUNCTION_NAME, OBSD_HAS_DECLARE_FUNCTION_SIZE, - OBSD_HAS_DECLARE_OBJECT: + OBSD_HAS_DECLARE_OBJECT: PIC support, FUNCTION_NAME/FUNCTION_SIZE are independent, whereas the corresponding logic for OBJECTS is necessarily coupled. There are also a few `default' defines such as ASM_WEAKEN_LABEL, - intended as common ground for arch that don't provide + intended as common ground for arch that don't provide anything suitable. */ /* OPENBSD_NATIVE is defined only when gcc is configured as part of @@ -104,7 +104,7 @@ while (0) /* CPP_SPEC appropriate for OpenBSD. We deal with -posix and -pthread. XXX the way threads are handled currently is not very satisfying, - since all code must be compiled with -pthread to work. + since all code must be compiled with -pthread to work. This two-stage defines makes it easy to pick that for targets that have subspecs. */ #ifdef CPP_CPU_SPEC @@ -122,8 +122,8 @@ while (0) #define CPP_SPEC OBSD_CPP_SPEC #ifdef OBSD_OLD_GAS -/* ASM_SPEC appropriate for OpenBSD. 
For some architectures, OpenBSD - still uses a special flavor of gas that needs to be told when generating +/* ASM_SPEC appropriate for OpenBSD. For some architectures, OpenBSD + still uses a special flavor of gas that needs to be told when generating pic code. */ #undef ASM_SPEC #define ASM_SPEC "%{" FPIE1_OR_FPIC1_SPEC ":-k} %{" FPIE2_OR_FPIC2_SPEC ":-k -K}" @@ -152,7 +152,7 @@ while (0) /* - we use . - _func instead of a local label, - - we put extra spaces in expressions such as + - we put extra spaces in expressions such as .type _func , @function This is more readable for a human being and confuses c++filt less. */ @@ -161,11 +161,11 @@ while (0) /* Define the strings used for the .type and .size directives. These strings generally do not vary from one system running OpenBSD to another, but if a given system needs to use different pseudo-op - names for these, they may be overridden in the arch specific file. */ + names for these, they may be overridden in the arch specific file. */ /* OpenBSD assembler is hacked to have .type & .size support even in a.out - format object files. Functions size are supported but not activated - yet (look for GRACE_PERIOD_EXPIRED in gas/config/obj-aout.c). + format object files. Functions size are supported but not activated + yet (look for GRACE_PERIOD_EXPIRED in gas/config/obj-aout.c). SET_ASM_OP is needed for attribute alias to work. */ #undef TYPE_ASM_OP @@ -191,12 +191,12 @@ while (0) /* These macros generate the special .type and .size directives which are used to set the corresponding fields of the linker symbol table - entries under OpenBSD. These macros also have to output the starting + entries under OpenBSD. These macros also have to output the starting labels for the relevant functions/objects. */ #ifndef OBSD_HAS_DECLARE_FUNCTION_NAME /* Extra assembler code needed to declare a function properly. - Some assemblers may also need to also have something extra said + Some assemblers may also need to also have something extra said about the function's return value. We allow for that here. */ #undef ASM_DECLARE_FUNCTION_NAME #define ASM_DECLARE_FUNCTION_NAME(FILE, NAME, DECL) \ @@ -238,7 +238,7 @@ while (0) /* Output the size directive for a decl in rest_of_decl_compilation in the case where we did not do so before the initializer. Once we find the error_mark_node, we know that the value of - size_directive_output was set by ASM_DECLARE_OBJECT_NAME + size_directive_output was set by ASM_DECLARE_OBJECT_NAME when it was run for the same decl. */ #undef ASM_FINISH_DECLARE_OBJECT #define ASM_FINISH_DECLARE_OBJECT(FILE, DECL, TOP_LEVEL, AT_END) \ @@ -260,11 +260,11 @@ do { \ /* Those are `generic' ways to weaken/globalize a label. We shouldn't need to override a processor specific definition. Hence, #ifndef ASM_* - In case overriding turns out to be needed, one can always #undef ASM_* + In case overriding turns out to be needed, one can always #undef ASM_* before including this file. */ /* Tell the assembler that a symbol is weak. */ -/* Note: netbsd arm32 assembler needs a .globl here. An override may +/* Note: netbsd arm32 assembler needs a .globl here. An override may be needed when/if we go for arm32 support. */ #ifndef ASM_WEAKEN_LABEL #define ASM_WEAKEN_LABEL(FILE,NAME) \ diff --git a/gcc/config/pa/pa-64.h b/gcc/config/pa/pa-64.h index b676468..c5e8d32 100644 --- a/gcc/config/pa/pa-64.h +++ b/gcc/config/pa/pa-64.h @@ -35,7 +35,7 @@ along with GCC; see the file COPYING3. 
If not see size_t 8 bytes ptrdiff_t 8 bytes wchar 4 bytes - + Make GCC agree with types.h. */ #undef SIZE_TYPE #define SIZE_TYPE "long unsigned int" @@ -91,9 +91,9 @@ along with GCC; see the file COPYING3. If not see the RTL to avoid scheduling related problems. For example, the store and load could be separated by a call to a pure or const function which has no frame and this function might also use SP-16. - We have 14-bit immediates on the 64-bit port, so we use secondary - memory for the copies. */ -#define PA_SECONDARY_MEMORY_NEEDED(MODE, CLASS1, CLASS2) \ - (MAYBE_FP_REG_CLASS_P (CLASS1) != FP_REG_CLASS_P (CLASS2) \ - || MAYBE_FP_REG_CLASS_P (CLASS2) != FP_REG_CLASS_P (CLASS1)) + On the 64-bit port, I couldn't get SECONDARY_MEMORY_NEEDED to work + with LRA, so I modified the move patterns to use SP-40. The HP + compiler also uses this slot in the frame marker for moving data + between the general and floating-point registers. */ +#define PA_SECONDARY_MEMORY_NEEDED(MODE, CLASS1, CLASS2) false diff --git a/gcc/config/pa/pa.cc b/gcc/config/pa/pa.cc index 911b7d9..94ee7db 100644 --- a/gcc/config/pa/pa.cc +++ b/gcc/config/pa/pa.cc @@ -58,7 +58,7 @@ along with GCC; see the file COPYING3. If not see /* This file should be included last. */ #include "target-def.h" -/* Return nonzero if there is a bypass for the output of +/* Return nonzero if there is a bypass for the output of OUT_INSN and the fp store IN_INSN. */ int pa_fpstore_bypass_p (rtx_insn *out_insn, rtx_insn *in_insn) @@ -83,7 +83,7 @@ pa_fpstore_bypass_p (rtx_insn *out_insn, rtx_insn *in_insn) return (GET_MODE_SIZE (store_mode) == GET_MODE_SIZE (other_mode)); } - + #ifndef DO_FRAME_NOTES #ifdef INCOMING_RETURN_ADDR_RTX @@ -209,6 +209,7 @@ static bool pa_can_change_mode_class (machine_mode, machine_mode, reg_class_t); static HOST_WIDE_INT pa_starting_frame_offset (void); static section* pa_elf_select_rtx_section(machine_mode, rtx, unsigned HOST_WIDE_INT) ATTRIBUTE_UNUSED; static void pa_atomic_assign_expand_fenv (tree *, tree *, tree *); +static bool pa_use_lra_p (void); /* The following extra sections are only used for SOM. */ static GTY(()) section *som_readonly_data_section; @@ -412,7 +413,7 @@ static size_t n_deferred_plabels = 0; #define TARGET_LEGITIMATE_ADDRESS_P pa_legitimate_address_p #undef TARGET_LRA_P -#define TARGET_LRA_P hook_bool_void_false +#define TARGET_LRA_P pa_use_lra_p #undef TARGET_HARD_REGNO_NREGS #define TARGET_HARD_REGNO_NREGS pa_hard_regno_nregs @@ -973,7 +974,7 @@ legitimize_pic_address (rtx orig, machine_mode mode, rtx reg) /* During and after reload, we need to generate a REG_LABEL_OPERAND note and update LABEL_NUSES because this is not done automatically. */ - if (reload_in_progress || reload_completed) + if (lra_in_progress || reload_in_progress || reload_completed) { /* Extract LABEL_REF. */ if (GET_CODE (orig) == CONST) @@ -998,7 +999,7 @@ legitimize_pic_address (rtx orig, machine_mode mode, rtx reg) /* Before reload, allocate a temporary register for the intermediate result. This allows the sequence to be deleted when the final result is unused and the insns are trivially dead. */ - tmp_reg = ((reload_in_progress || reload_completed) + tmp_reg = ((lra_in_progress || reload_in_progress || reload_completed) ? 
reg : gen_reg_rtx (Pmode)); if (function_label_operand (orig, VOIDmode)) @@ -1052,7 +1053,7 @@ legitimize_pic_address (rtx orig, machine_mode mode, rtx reg) gcc_assert (reg); gcc_assert (GET_CODE (XEXP (orig, 0)) == PLUS); - + base = legitimize_pic_address (XEXP (XEXP (orig, 0), 0), Pmode, reg); orig = legitimize_pic_address (XEXP (XEXP (orig, 0), 1), Pmode, base == reg ? 0 : reg); @@ -1102,7 +1103,7 @@ legitimize_tls_address (rtx addr) if (GET_CODE (addr) != SYMBOL_REF) return addr; - switch (SYMBOL_REF_TLS_MODEL (addr)) + switch (SYMBOL_REF_TLS_MODEL (addr)) { case TLS_MODEL_GLOBAL_DYNAMIC: tmp = gen_reg_rtx (Pmode); @@ -1125,7 +1126,7 @@ legitimize_tls_address (rtx addr) insn = get_insns (); end_sequence (); t2 = gen_reg_rtx (Pmode); - emit_libcall_block (insn, t2, t1, + emit_libcall_block (insn, t2, t1, gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), UNSPEC_TLSLDBASE)); emit_insn (gen_tld_offset_load (ret, addr, t2)); @@ -1959,11 +1960,13 @@ pa_emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) copy_to_mode_reg (Pmode, XEXP (operand1, 0))); if (scratch_reg - && reload_in_progress && GET_CODE (operand0) == REG + && reload_in_progress + && GET_CODE (operand0) == REG && REGNO (operand0) >= FIRST_PSEUDO_REGISTER) operand0 = reg_equiv_mem (REGNO (operand0)); else if (scratch_reg - && reload_in_progress && GET_CODE (operand0) == SUBREG + && reload_in_progress + && GET_CODE (operand0) == SUBREG && GET_CODE (SUBREG_REG (operand0)) == REG && REGNO (SUBREG_REG (operand0)) >= FIRST_PSEUDO_REGISTER) { @@ -1976,11 +1979,13 @@ pa_emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) } if (scratch_reg - && reload_in_progress && GET_CODE (operand1) == REG + && reload_in_progress + && GET_CODE (operand1) == REG && REGNO (operand1) >= FIRST_PSEUDO_REGISTER) operand1 = reg_equiv_mem (REGNO (operand1)); else if (scratch_reg - && reload_in_progress && GET_CODE (operand1) == SUBREG + && reload_in_progress + && GET_CODE (operand1) == SUBREG && GET_CODE (SUBREG_REG (operand1)) == REG && REGNO (SUBREG_REG (operand1)) >= FIRST_PSEUDO_REGISTER) { @@ -1992,12 +1997,16 @@ pa_emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) operand1 = alter_subreg (&temp, true); } - if (scratch_reg && reload_in_progress && GET_CODE (operand0) == MEM + if (scratch_reg + && (lra_in_progress || reload_in_progress) + && GET_CODE (operand0) == MEM && ((tem = find_replacement (&XEXP (operand0, 0))) != XEXP (operand0, 0))) operand0 = replace_equiv_address (operand0, tem); - if (scratch_reg && reload_in_progress && GET_CODE (operand1) == MEM + if (scratch_reg + && (lra_in_progress || reload_in_progress) + && GET_CODE (operand1) == MEM && ((tem = find_replacement (&XEXP (operand1, 0))) != XEXP (operand1, 0))) operand1 = replace_equiv_address (operand1, tem); @@ -2043,8 +2052,7 @@ pa_emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) op1 = replace_equiv_address (op1, scratch_reg); } } - else if (((TARGET_ELF32 || !TARGET_PA_20) - && symbolic_memory_operand (op1, VOIDmode)) + else if ((!INT14_OK_STRICT && symbolic_memory_operand (op1, VOIDmode)) || IS_LO_SUM_DLT_ADDR_P (XEXP (op1, 0)) || IS_INDEX_ADDR_P (XEXP (op1, 0))) { @@ -2093,8 +2101,7 @@ pa_emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) op0 = replace_equiv_address (op0, scratch_reg); } } - else if (((TARGET_ELF32 || !TARGET_PA_20) - && symbolic_memory_operand (op0, VOIDmode)) + else if ((!INT14_OK_STRICT && symbolic_memory_operand (op0, VOIDmode)) || IS_LO_SUM_DLT_ADDR_P (XEXP (op0, 
0)) || IS_INDEX_ADDR_P (XEXP (op0, 0))) { @@ -2220,7 +2227,7 @@ pa_emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) && !HARD_REGISTER_P (operand0)) copy_reg_pointer (operand0, operand1); } - + /* When MEMs are broken out, the REG_POINTER flag doesn't get set. In some cases, we can set the REG_POINTER flag from the declaration for the MEM. */ @@ -2257,7 +2264,7 @@ pa_emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) else if (GET_CODE (operand0) == MEM) { if (mode == DFmode && operand1 == CONST0_RTX (mode) - && !(reload_in_progress || reload_completed)) + && !(lra_in_progress || reload_in_progress || reload_completed)) { rtx temp = gen_reg_rtx (DFmode); @@ -2271,7 +2278,7 @@ pa_emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) emit_insn (gen_rtx_SET (operand0, operand1)); return 1; } - if (! (reload_in_progress || reload_completed)) + if (! (lra_in_progress || reload_in_progress || reload_completed)) { operands[0] = validize_mem (operand0); operands[1] = operand1 = force_reg (mode, operand1); @@ -2311,7 +2318,7 @@ pa_emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) rtx temp, const_part; /* Figure out what (if any) scratch register to use. */ - if (reload_in_progress || reload_completed) + if (lra_in_progress || reload_in_progress || reload_completed) { scratch_reg = scratch_reg ? scratch_reg : operand0; /* SCRATCH_REG will hold an address and maybe the actual @@ -2369,7 +2376,7 @@ pa_emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) rtx_insn *insn; rtx temp; - if (reload_in_progress || reload_completed) + if (lra_in_progress || reload_in_progress || reload_completed) { temp = scratch_reg ? scratch_reg : operand0; /* TEMP will hold an address and maybe the actual @@ -2413,7 +2420,7 @@ pa_emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) { rtx temp, set; - if (reload_in_progress || reload_completed) + if (lra_in_progress || reload_in_progress || reload_completed) { temp = scratch_reg ? scratch_reg : operand0; /* TEMP will hold an address and maybe the actual @@ -2504,7 +2511,7 @@ pa_emit_move_sequence (rtx *operands, machine_mode mode, rtx scratch_reg) } } - if (reload_in_progress || reload_completed) + if (lra_in_progress || reload_in_progress || reload_completed) temp = scratch_reg ? scratch_reg : operand0; else temp = gen_reg_rtx (mode); @@ -2863,7 +2870,7 @@ pa_output_move_double (rtx *operands) && GET_CODE (operands[0]) == REG); gcc_assert (!reg_overlap_mentioned_p (high_reg, addr)); - + /* No overlap between high target register and address register. (We do this in a non-obvious way to save a register file writeback) */ @@ -2878,7 +2885,7 @@ pa_output_move_double (rtx *operands) operands[0] = XEXP (addr, 0); gcc_assert (GET_CODE (operands[1]) == REG && GET_CODE (operands[0]) == REG); - + gcc_assert (!reg_overlap_mentioned_p (high_reg, addr)); /* No overlap between high target register and address register. (We do this in a non-obvious way to save a @@ -3099,15 +3106,15 @@ pa_output_fp_move_double (rtx *operands) else { rtx xoperands[2]; - + gcc_assert (operands[1] == CONST0_RTX (GET_MODE (operands[0]))); - + /* This is a pain. You have to be prepared to deal with an arbitrary address here including pre/post increment/decrement. so avoid this in the MD. 
*/ gcc_assert (GET_CODE (operands[0]) == REG); - + xoperands[1] = gen_rtx_REG (SImode, REGNO (operands[0]) + 1); xoperands[0] = operands[0]; output_asm_insn ("copy %%r0,%0\n\tcopy %%r0,%1", xoperands); @@ -4076,7 +4083,7 @@ pa_compute_frame_size (poly_int64 size, int *fregs_live) first slot is only used when the frame pointer is needed. */ if (size || frame_pointer_needed) size += pa_starting_frame_offset (); - + /* If the current function calls __builtin_eh_return, then we need to allocate stack space for registers that will hold data for the exception handler. */ @@ -4416,7 +4423,7 @@ pa_expand_prologue (void) to do for functions which make no calls and allocate no frame? Do we need to allocate a frame, or can we just omit the save? For now we'll just omit the save. - + We don't want a note on this insn as the frame marker can move if there is a dynamic stack allocation. */ if (flag_pic && actual_fsize != 0 && !TARGET_64BIT) @@ -4517,7 +4524,7 @@ load_reg (int reg, HOST_WIDE_INT disp, int base) rtx tmpreg = gen_rtx_REG (Pmode, 1); emit_move_insn (tmpreg, delta); - if (TARGET_DISABLE_INDEXING) + if (!TARGET_NO_SPACE_REGS || TARGET_DISABLE_INDEXING) { emit_move_insn (tmpreg, gen_rtx_PLUS (Pmode, tmpreg, basereg)); src = gen_rtx_MEM (word_mode, tmpreg); @@ -5227,7 +5234,7 @@ pa_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, /* A fpload can't be issued until one cycle before a preceding arithmetic operation has finished if the target of the fpload is the destination of the - arithmetic operation. + arithmetic operation. Exception: For PA7100LC, PA7200 and PA7300, the cost is 3 cycles, unless they bundle together. We also @@ -5866,7 +5873,7 @@ pa_output_global_address (FILE *file, rtx x, int round_constant) default: gcc_unreachable (); } - + if (!read_only_operand (base, VOIDmode) && !flag_pic) fputs ("-$global$", file); if (offset) @@ -5926,7 +5933,7 @@ pa_file_start_mcount (const char *aswhat) if (profile_flag) fprintf (asm_out_file, "\t.IMPORT _mcount,%s\n", aswhat); } - + static void pa_elf_file_start (void) { @@ -6410,24 +6417,21 @@ pa_secondary_reload (bool in_p, rtx x, reg_class_t rclass_i, if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG) regno = true_regnum (x); - /* Handle reloads for floating point loads and stores. */ - if ((regno >= FIRST_PSEUDO_REGISTER || regno == -1) - && FP_REG_CLASS_P (rclass)) + /* Handle reloads for floating-point loads and stores. */ + if (regno < 0 && FP_REG_CLASS_P (rclass)) { - if (MEM_P (x)) - { - x = XEXP (x, 0); + if (REG_P (x) || GET_CODE (x) == SUBREG) + return NO_REGS; - /* We don't need a secondary reload for indexed memory addresses. + /* We don't need a secondary reload for indexed memory addresses. - When INT14_OK_STRICT is true, it might appear that we could - directly allow register indirect memory addresses. However, - this doesn't work because we don't support SUBREGs in - floating-point register copies and reload doesn't tell us - when it's going to use a SUBREG. */ - if (IS_INDEX_ADDR_P (x)) - return NO_REGS; - } + When INT14_OK_STRICT is true, it might appear that we could + directly allow register indirect memory addresses. However, + this doesn't work because we don't support SUBREGs in + floating-point register copies and reload doesn't tell us + when it's going to use a SUBREG. */ + if (MEM_P (x) && IS_INDEX_ADDR_P (XEXP (x, 0))) + return NO_REGS; /* Request a secondary reload with a general scratch register for everything else. ??? 
Could symbolic operands be handled @@ -6444,8 +6448,14 @@ pa_secondary_reload (bool in_p, rtx x, reg_class_t rclass_i, if (rclass == SHIFT_REGS) { /* Handle spill. */ - if (regno >= FIRST_PSEUDO_REGISTER || regno < 0) + if (regno < 0) { + if (REG_P (x) || GET_CODE (x) == SUBREG) + return GENERAL_REGS; + + if (TARGET_64BIT && GET_CODE (x) == CONST_INT) + return GENERAL_REGS; + sri->icode = (in_p ? direct_optab_handler (reload_in_optab, mode) : direct_optab_handler (reload_out_optab, mode)); @@ -7110,7 +7120,7 @@ const char * pa_output_lbranch (rtx dest, rtx_insn *insn, int xdelay) { rtx xoperands[4]; - + xoperands[0] = dest; /* First, free up the delay slot. */ @@ -7631,7 +7641,7 @@ pa_output_dbra (rtx *operands, rtx_insn *insn, int which_alternative) } else return "addib,%C2 %1,%0,%3"; - + case 8: /* Handle weird backwards branch with a fulled delay slot which is nullified. */ @@ -7681,7 +7691,7 @@ pa_output_dbra (rtx *operands, rtx_insn *insn, int which_alternative) return pa_output_lbranch (operands[3], insn, xdelay); } - + } /* Deal with gross reload from FP register case. */ else if (which_alternative == 1) @@ -8477,7 +8487,7 @@ pa_output_indirect_call (rtx_insn *insn, rtx call_dest) pa_output_arg_descriptor (insn); if (TARGET_PA_20) return "bve,l,n (%%r22),%%r2\n\tnop"; - return "ble 0(%%sr4,%%r22)\n\tcopy %%r31,%%r2"; + return "ble 0(%%sr4,%%r22)\n\tcopy %%r31,%%r2"; } if (TARGET_PORTABLE_RUNTIME) @@ -8489,7 +8499,7 @@ pa_output_indirect_call (rtx_insn *insn, rtx call_dest) } /* Now the normal case -- we can reach $$dyncall directly or - we're sure that we can get there via a long-branch stub. + we're sure that we can get there via a long-branch stub. No need to check target flags as the length uniquely identifies the remaining cases. */ @@ -9203,7 +9213,7 @@ pa_asm_out_destructor (rtx symbol, int priority) The ASM_OUTPUT_ALIGNED_BSS macro needs to be defined to call this function on the SOM port to prevent uninitialized global data from being placed in the data section. */ - + void pa_asm_output_aligned_bss (FILE *stream, const char *name, @@ -9369,7 +9379,7 @@ forward_branch_p (rtx_insn *insn) gcc_assert (lab != NULL_RTX); if (INSN_ADDRESSES_SET_P ()) - return INSN_ADDRESSES (INSN_UID (lab)) > INSN_ADDRESSES (INSN_UID (insn)); + return INSN_ADDRESSES (INSN_UID (lab)) > INSN_ADDRESSES (INSN_UID (insn)); while (insn) { @@ -9804,8 +9814,8 @@ pa_promote_function_mode (const_tree type ATTRIBUTE_UNUSED, to match the HP Compiler ABI. */ static rtx -pa_function_value (const_tree valtype, - const_tree func ATTRIBUTE_UNUSED, +pa_function_value (const_tree valtype, + const_tree func ATTRIBUTE_UNUSED, bool outgoing ATTRIBUTE_UNUSED) { machine_mode valmode; @@ -10328,7 +10338,7 @@ pa_select_section (tree exp, int reloc, and the function is in a COMDAT group, place the plabel reference in the .data.rel.ro.local section. The linker ignores references to symbols in discarded sections from this section. */ - + static section * pa_elf_select_rtx_section (machine_mode mode, rtx x, unsigned HOST_WIDE_INT align) @@ -10479,7 +10489,7 @@ pa_can_change_mode_class (machine_mode from, machine_mode to, if (COMPLEX_MODE_P (from) || VECTOR_MODE_P (from) || COMPLEX_MODE_P (to) || VECTOR_MODE_P (to)) return false; - + /* There is no way to load QImode or HImode values directly from memory to a FP register. SImode loads to the FP registers are not zero extended. 
On the 64-bit target, this conflicts with the definition @@ -10500,7 +10510,7 @@ pa_can_change_mode_class (machine_mode from, machine_mode to, } /* Implement TARGET_MODES_TIEABLE_P. - + We should return FALSE for QImode and HImode because these modes are not ok in the floating-point registers. However, this prevents tieing these modes to SImode and DImode in the general registers. @@ -10877,6 +10887,7 @@ pa_legitimate_constant_p (machine_mode mode, rtx x) if (TARGET_64BIT && HOST_BITS_PER_WIDE_INT > 32 && GET_CODE (x) == CONST_INT + && !lra_in_progress && !reload_in_progress && !reload_completed && !LEGITIMATE_64BIT_CONST_INT_P (INTVAL (x)) @@ -10922,7 +10933,7 @@ pa_section_type_flags (tree decl, const char *name, int reloc) must provide patterns for doing indexed integer stores, or the move expanders must force the address of an indexed store to a register. We have adopted the latter approach. - + Another function of pa_legitimate_address_p is to ensure that the base register is a valid pointer for indexed instructions. On targets that have non-equivalent space registers, we have to @@ -11009,17 +11020,13 @@ pa_legitimate_address_p (machine_mode mode, rtx x, bool strict, code_helper) } if (!TARGET_DISABLE_INDEXING - /* Only accept the "canonical" INDEX+BASE operand order - on targets with non-equivalent space registers. */ - && (TARGET_NO_SPACE_REGS - ? REG_P (index) - : (base == XEXP (x, 1) && REG_P (index) - && (reload_completed - || (reload_in_progress && HARD_REGISTER_P (base)) - || REG_POINTER (base)) - && (reload_completed - || (reload_in_progress && HARD_REGISTER_P (index)) - || !REG_POINTER (index)))) + /* Currently, the REG_POINTER flag is not set in a variety + of situations (e.g., call arguments and pointer arithmetic). + As a result, we can't reliably determine when unscaled + addresses are legitimate on targets that need space register + selection. */ + && TARGET_NO_SPACE_REGS + && REG_P (index) && MODE_OK_FOR_UNSCALED_INDEXING_P (mode) && (strict ? STRICT_REG_OK_FOR_INDEX_P (index) : REG_OK_FOR_INDEX_P (index)) @@ -11028,14 +11035,14 @@ pa_legitimate_address_p (machine_mode mode, rtx x, bool strict, code_helper) return true; if (!TARGET_DISABLE_INDEXING - && GET_CODE (index) == MULT /* Only accept base operands with the REG_POINTER flag prior to reload on targets with non-equivalent space registers. */ && (TARGET_NO_SPACE_REGS - || (base == XEXP (x, 1) - && (reload_completed - || (reload_in_progress && HARD_REGISTER_P (base)) - || REG_POINTER (base)))) + || reload_completed + || ((lra_in_progress || reload_in_progress) + && HARD_REGISTER_P (base)) + || REG_POINTER (base)) + && GET_CODE (index) == MULT && REG_P (XEXP (index, 0)) && GET_MODE (XEXP (index, 0)) == Pmode && MODE_OK_FOR_SCALED_INDEXING_P (mode) @@ -11063,20 +11070,21 @@ pa_legitimate_address_p (machine_mode mode, rtx x, bool strict, code_helper) { y = XEXP (x, 1); - /* Needed for -fPIC */ + /* UNSPEC_DLTIND14R is always okay. Needed for -fPIC */ if (mode == Pmode && GET_CODE (y) == UNSPEC) return true; /* Before reload, we need support for 14-bit floating point loads and stores, and associated relocations. */ - if ((TARGET_ELF32 || !INT14_OK_STRICT) + if (!INT14_OK_STRICT && !reload_completed && mode != QImode && mode != HImode) return false; - if (CONSTANT_P (y)) + if (CONSTANT_P (y) + || (!flag_pic && symbolic_operand (y, mode))) return true; } return false; @@ -11252,7 +11260,7 @@ pa_function_arg_size (machine_mode mode, const_tree type) { HOST_WIDE_INT size; - size = mode != BLKmode ? 
GET_MODE_SIZE (mode) : int_size_in_bytes (type); + size = mode != BLKmode ? GET_MODE_SIZE (mode) : int_size_in_bytes (type); /* The 64-bit runtime does not restrict the size of stack frames, but the gcc calling conventions limit argument sizes to 1G. Our @@ -11340,4 +11348,12 @@ pa_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) reload_fenv, restore_fnenv), update_call); } +/* Implement TARGET_LRA_P. */ + +static bool +pa_use_lra_p () +{ + return pa_lra_p; +} + #include "gt-pa.h" diff --git a/gcc/config/pa/pa.h b/gcc/config/pa/pa.h index 7e45c35..fa6d05e 100644 --- a/gcc/config/pa/pa.h +++ b/gcc/config/pa/pa.h @@ -226,7 +226,7 @@ typedef struct GTY(()) machine_function } machine_function; /* Define this macro if it is advisable to hold scalars in registers - in a wider mode than that declared by the program. In such cases, + in a wider mode than that declared by the program. In such cases, the value is constrained to be within the bounds of the declared type, but kept valid in the wider mode. The signedness of the extension may differ from that of the type. */ @@ -260,7 +260,7 @@ typedef struct GTY(()) machine_function This needs to be 8 when TARGET_64BIT is true to allow building various TImode routines in libgcc. However, we also need the DImode DIVMOD routines because they are not currently implemented in pa.md. - + The HP runtime specification doesn't provide the alignment requirements and calling conventions for TImode variables. */ #ifdef IN_LIBGCC2 @@ -480,6 +480,9 @@ extern rtx hppa_pic_save_rtx (void); #define INDEX_REG_CLASS GENERAL_REGS #define BASE_REG_CLASS GENERAL_REGS +/* True if register is a general register. */ +#define GENERAL_REGNO_P(N) ((N) >= 1 && (N) <= 31) + #define FP_REG_CLASS_P(CLASS) \ ((CLASS) == FP_REGS || (CLASS) == FPUPPER_REGS) @@ -564,13 +567,13 @@ extern rtx hppa_pic_save_rtx (void); of arguments scanned so far (including the invisible argument, if any, which holds the structure-value-address). Thus, 4 or more means all following args should go on the stack. - + The INCOMING field tracks whether this is an "incoming" or "outgoing" argument. - + The INDIRECT field indicates whether this is an indirect call or not. - + The NARGS_PROTOTYPE field indicates that an argument does not have a prototype when it less than or equal to 0. */ @@ -712,7 +715,7 @@ extern int may_call_alloca; #define MIN_CACHELINE_SIZE 32 -/* Addressing modes, and classification of registers for them. +/* Addressing modes, and classification of registers for them. Using autoincrement addressing modes on PA8000 class machines is not profitable. */ @@ -970,7 +973,7 @@ do { \ /* Higher than the default as we prefer to use simple move insns (better scheduling and delay slot filling) and because our - built-in block move is really a 2X unrolled loop. + built-in block move is really a 2X unrolled loop. Believe it or not, this has to be big enough to allow for copying all arguments passed in registers to avoid infinite recursion during argument @@ -1163,7 +1166,7 @@ do { \ #define ASM_OUTPUT_ADDR_VEC_ELT(FILE, VALUE) \ fprintf (FILE, "\t.word L$%d\n", VALUE) -/* This is how to output an element of a case-vector that is relative. +/* This is how to output an element of a case-vector that is relative. Since we always place jump tables in the text section, the difference is absolute and requires no relocation. 
*/ @@ -1197,7 +1200,7 @@ do { \ #define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ pa_asm_output_aligned_bss (FILE, NAME, SIZE, ALIGN) - + /* This says how to output an assembler line to define a global common symbol with size SIZE (in bytes) and alignment ALIGN (in bits). */ @@ -1211,7 +1214,7 @@ do { \ #define ASM_OUTPUT_ALIGNED_LOCAL(FILE, NAME, SIZE, ALIGN) \ pa_asm_output_aligned_local (FILE, NAME, SIZE, ALIGN) - + /* All HP assemblers use "!" to separate logical lines. */ #define IS_ASM_LOGICAL_LINE_SEPARATOR(C, STR) ((C) == '!') @@ -1295,7 +1298,7 @@ do { \ instructions for non-PIC and PIC, respectively. Import stubs are seven and five instructions for HP-UX and ELF targets, respectively. The default stub group size for ELF targets is 217856 bytes. - FIXME: We need an option to set the maximum offset. */ + FIXME: We need an option to set the maximum offset. */ #define MAX_PCREL17F_OFFSET (TARGET_HPUX ? 198164 : 217856) #define NEED_INDICATE_EXEC_STACK 0 diff --git a/gcc/config/pa/pa.md b/gcc/config/pa/pa.md index 9e410f4..bf59b7f 100644 --- a/gcc/config/pa/pa.md +++ b/gcc/config/pa/pa.md @@ -2222,9 +2222,9 @@ (define_insn "" [(set (match_operand:SI 0 "move_dest_operand" - "=r,r,r,r,r,r,Q,!*q,!r,!*f,*f,T") + "=r,r,r,r,r,r,Q,!*q,!r,!*f,*f,T,?r,?*f") (match_operand:SI 1 "move_src_operand" - "A,r,J,N,K,RQ,rM,!rM,!*q,!*fM,RT,*f"))] + "A,r,J,N,K,RQ,rM,!rM,!*q,!*fM,RT,*f,*f,r"))] "(register_operand (operands[0], SImode) || reg_or_0_operand (operands[1], SImode)) && !TARGET_SOFT_FLOAT @@ -2241,10 +2241,12 @@ {mfctl|mfctl,w} %%sar,%0 fcpy,sgl %f1,%0 fldw%F1 %1,%0 - fstw%F0 %1,%0" - [(set_attr "type" "load,move,move,move,shift,load,store,move,move,fpalu,fpload,fpstore") + fstw%F0 %1,%0 + fstw %1,-40(%%sp)\n\tldw -40(%%sp),%0 + stw %1,-40(%%sp)\n\tfldw -40(%%sp),%0" + [(set_attr "type" "load,move,move,move,shift,load,store,move,move,fpalu,fpload,fpstore,fpstore_load,store_fpload") (set_attr "pa_combine_type" "addmove") - (set_attr "length" "4,4,4,4,4,4,4,4,4,4,4,4")]) + (set_attr "length" "4,4,4,4,4,4,4,4,4,4,4,4,8,8")]) (define_insn "" [(set (match_operand:SI 0 "move_dest_operand" @@ -2280,6 +2282,58 @@ (set_attr "pa_combine_type" "addmove") (set_attr "length" "4")]) +; Rewrite RTL using a REG+D store. This will allow the insn that +; computes the address to be deleted if the register it sets is dead. +(define_peephole2 + [(set (match_operand:SI 0 "register_operand" "") + (plus:SI (match_operand:SI 1 "register_operand" "") + (match_operand:SI 2 "const_int_operand" ""))) + (set (mem:SI (match_dup 0)) + (match_operand:SI 3 "register_operand" ""))] + "!TARGET_64BIT + && !INT14_OK_STRICT + && GENERAL_REGNO_P (REGNO (operands[0])) + && GENERAL_REGNO_P (REGNO (operands[3])) + && REGNO (operands[0]) != REGNO (operands[3]) + && base14_operand (operands[2], E_SImode)" + [(set (mem:SI (plus:SI (match_dup 1) (match_dup 2))) (match_dup 3)) + (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2)))] + "") + +; Rewrite RTL using a REG+D load. This will allow the insn that +; computes the address to be deleted if the register it sets is dead. 
+(define_peephole2 + [(set (match_operand:SI 0 "register_operand" "") + (plus:SI (match_operand:SI 1 "register_operand" "") + (match_operand:SI 2 "const_int_operand" ""))) + (set (match_operand:SI 3 "register_operand" "") + (mem:SI (match_dup 0)))] + "!TARGET_64BIT + && !INT14_OK_STRICT + && GENERAL_REGNO_P (REGNO (operands[0])) + && GENERAL_REGNO_P (REGNO (operands[3])) + && REGNO (operands[0]) != REGNO (operands[3]) + && REGNO (operands[1]) != REGNO (operands[3]) + && base14_operand (operands[2], E_SImode)" + [(set (match_dup 3) (mem:SI (plus:SI (match_dup 1) (match_dup 2)))) + (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2)))] + "") + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand" "") + (plus:SI (match_operand:SI 1 "register_operand" "") + (match_operand:SI 2 "const_int_operand" ""))) + (set (match_operand:SI 3 "register_operand" "") + (mem:SI (match_dup 0)))] + "!TARGET_64BIT + && !INT14_OK_STRICT + && GENERAL_REGNO_P (REGNO (operands[0])) + && GENERAL_REGNO_P (REGNO (operands[3])) + && REGNO (operands[0]) == REGNO (operands[3]) + && base14_operand (operands[2], E_SImode)" + [(set (match_dup 3) (mem:SI (plus:SI (match_dup 1) (match_dup 2))))] + "") + ; Rewrite RTL using an indexed store. This will allow the insn that ; computes the address to be deleted if the register it sets is dead. (define_peephole2 @@ -3866,7 +3920,7 @@ (define_insn "" [(set (match_operand:DF 0 "move_dest_operand" "=f,*r,T,?o,?Q,f,*r,*r,?*r,?f") - (match_operand:DF 1 "reg_or_0_or_nonsymb_mem_operand" + (match_operand:DF 1 "reg_or_0_or_mem_operand" "fG,*rG,f,*r,*r,RT,o,RQ,f,*r"))] "(register_operand (operands[0], DFmode) || reg_or_0_operand (operands[1], DFmode)) @@ -4040,7 +4094,7 @@ (define_insn "" [(set (match_operand:DF 0 "move_dest_operand" "=r,?o,?Q,r,r") - (match_operand:DF 1 "reg_or_0_or_nonsymb_mem_operand" + (match_operand:DF 1 "reg_or_0_or_mem_operand" "rG,r,r,o,RQ"))] "(register_operand (operands[0], DFmode) || reg_or_0_operand (operands[1], DFmode)) @@ -4055,9 +4109,9 @@ (define_insn "" [(set (match_operand:DF 0 "move_dest_operand" - "=!*r,*r,*r,*r,*r,Q,f,f,T") + "=!*r,*r,*r,*r,*r,Q,f,f,T,?*r,?f") (match_operand:DF 1 "move_src_operand" - "!*rG,J,N,K,RQ,*rG,fG,RT,f"))] + "!*rG,J,N,K,RQ,*rG,fG,RT,f,f,*r"))] "(register_operand (operands[0], DFmode) || reg_or_0_operand (operands[1], DFmode)) && !TARGET_SOFT_FLOAT && TARGET_64BIT" @@ -4070,10 +4124,12 @@ std%M0 %r1,%0 fcpy,dbl %f1,%0 fldd%F1 %1,%0 - fstd%F0 %1,%0" - [(set_attr "type" "move,move,move,shift,load,store,fpalu,fpload,fpstore") + fstd%F0 %1,%0 + fstd %1,-40(%%sp)\n\tldd -40(%%sp),%0 + std %1,-40(%%sp)\n\tfldd -40(%%sp),%0" + [(set_attr "type" "move,move,move,shift,load,store,fpalu,fpload,fpstore,fpstore_load,store_fpload") (set_attr "pa_combine_type" "addmove") - (set_attr "length" "4,4,4,4,4,4,4,4,4")]) + (set_attr "length" "4,4,4,4,4,4,4,4,4,8,8")]) (define_insn "" [(set (match_operand:DF 0 "move_dest_operand" @@ -4229,9 +4285,9 @@ (define_insn "" [(set (match_operand:DI 0 "move_dest_operand" - "=r,r,r,r,r,r,Q,!*q,!r,!*f,*f,T") + "=r,r,r,r,r,r,Q,!*q,!r,!*f,*f,T,?r,?*f") (match_operand:DI 1 "move_src_operand" - "A,r,J,N,K,RQ,rM,!rM,!*q,!*fM,RT,*f"))] + "A,r,J,N,K,RQ,rM,!rM,!*q,!*fM,RT,*f,*f,r"))] "(register_operand (operands[0], DImode) || reg_or_0_operand (operands[1], DImode)) && !TARGET_SOFT_FLOAT && TARGET_64BIT" @@ -4247,10 +4303,12 @@ {mfctl|mfctl,w} %%sar,%0 fcpy,dbl %f1,%0 fldd%F1 %1,%0 - fstd%F0 %1,%0" - [(set_attr "type" "load,move,move,move,shift,load,store,move,move,fpalu,fpload,fpstore") + fstd%F0 %1,%0 + 
fstd %1,-40(%%sp)\n\tldd -40(%%sp),%0 + std %1,-40(%%sp)\n\tfldd -40(%%sp),%0" + [(set_attr "type" "load,move,move,move,shift,load,store,move,move,fpalu,fpload,fpstore,fpstore_load,store_fpload") (set_attr "pa_combine_type" "addmove") - (set_attr "length" "4,4,4,4,4,4,4,4,4,4,4,4")]) + (set_attr "length" "4,4,4,4,4,4,4,4,4,4,4,4,8,8")]) (define_insn "" [(set (match_operand:DI 0 "move_dest_operand" @@ -4440,7 +4498,7 @@ (define_insn "" [(set (match_operand:SF 0 "move_dest_operand" "=f,!*r,f,*r,T,Q,?*r,?f") - (match_operand:SF 1 "reg_or_0_or_nonsymb_mem_operand" + (match_operand:SF 1 "reg_or_0_or_mem_operand" "fG,!*rG,RT,RQ,f,*rG,f,*r"))] "(register_operand (operands[0], SFmode) || reg_or_0_operand (operands[1], SFmode)) @@ -4461,9 +4519,9 @@ (define_insn "" [(set (match_operand:SF 0 "move_dest_operand" - "=f,!*r,f,*r,T,Q") - (match_operand:SF 1 "reg_or_0_or_nonsymb_mem_operand" - "fG,!*rG,RT,RQ,f,*rG"))] + "=f,!*r,f,*r,T,Q,?*r,?f") + (match_operand:SF 1 "reg_or_0_or_mem_operand" + "fG,!*rG,RT,RQ,f,*rG,f,*r"))] "(register_operand (operands[0], SFmode) || reg_or_0_operand (operands[1], SFmode)) && !TARGET_SOFT_FLOAT @@ -4474,15 +4532,17 @@ fldw%F1 %1,%0 ldw%M1 %1,%0 fstw%F0 %1,%0 - stw%M0 %r1,%0" - [(set_attr "type" "fpalu,move,fpload,load,fpstore,store") + stw%M0 %r1,%0 + fstw %1,-40(%%sp)\n\tldw -40(%%sp),%0 + stw %1,-40(%%sp)\n\tfldw -40(%%sp),%0" + [(set_attr "type" "fpalu,move,fpload,load,fpstore,store,fpstore_load,store_fpload") (set_attr "pa_combine_type" "addmove") - (set_attr "length" "4,4,4,4,4,4")]) + (set_attr "length" "4,4,4,4,4,4,8,8")]) (define_insn "" [(set (match_operand:SF 0 "move_dest_operand" "=!*r,*r,Q") - (match_operand:SF 1 "reg_or_0_or_nonsymb_mem_operand" + (match_operand:SF 1 "reg_or_0_or_mem_operand" "!*rG,RQ,*rG"))] "(register_operand (operands[0], SFmode) || reg_or_0_operand (operands[1], SFmode)) @@ -4509,6 +4569,54 @@ (define_peephole2 [(set (match_operand:SI 0 "register_operand" "") + (plus:SI (match_operand:SI 1 "register_operand" "") + (match_operand:SI 2 "const_int_operand" ""))) + (set (mem:SF (match_dup 0)) + (match_operand:SF 3 "register_operand" ""))] + "!TARGET_64BIT + && !INT14_OK_STRICT + && GENERAL_REGNO_P (REGNO (operands[0])) + && GENERAL_REGNO_P (REGNO (operands[3])) + && REGNO (operands[0]) != REGNO (operands[3]) + && base14_operand (operands[2], E_SImode)" + [(set (mem:SF (plus:SI (match_dup 1) (match_dup 2))) (match_dup 3)) + (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2)))] + "") + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand" "") + (plus:SI (match_operand:SI 1 "register_operand" "") + (match_operand:SI 2 "const_int_operand" ""))) + (set (match_operand:SF 3 "register_operand" "") + (mem:SF (match_dup 0)))] + "!TARGET_64BIT + && !INT14_OK_STRICT + && GENERAL_REGNO_P (REGNO (operands[0])) + && GENERAL_REGNO_P (REGNO (operands[3])) + && REGNO (operands[0]) != REGNO (operands[3]) + && REGNO (operands[1]) != REGNO (operands[3]) + && base14_operand (operands[2], E_SImode)" + [(set (match_dup 3) (mem:SF (plus:DI (match_dup 1) (match_dup 2)))) + (set (match_dup 0) (plus:SI (match_dup 1) (match_dup 2)))] + "") + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand" "") + (plus:SI (match_operand:SI 1 "register_operand" "") + (match_operand:SI 2 "const_int_operand" ""))) + (set (match_operand:SF 3 "register_operand" "") + (mem:SF (match_dup 0)))] + "!TARGET_64BIT + && !INT14_OK_STRICT + && GENERAL_REGNO_P (REGNO (operands[0])) + && GENERAL_REGNO_P (REGNO (operands[3])) + && REGNO (operands[0]) == REGNO 
(operands[3]) + && base14_operand (operands[2], E_SImode)" + [(set (match_dup 3) (mem:SF (plus:DI (match_dup 1) (match_dup 2))))] + "") + +(define_peephole2 + [(set (match_operand:SI 0 "register_operand" "") (plus:SI (ashift:SI (match_operand:SI 1 "register_operand" "") (const_int 2)) (match_operand:SI 2 "register_operand" ""))) @@ -4615,7 +4723,7 @@ (define_insn "" [(set (match_operand:SF 0 "move_dest_operand" "=r,r,Q") - (match_operand:SF 1 "reg_or_0_or_nonsymb_mem_operand" + (match_operand:SF 1 "reg_or_0_or_mem_operand" "rG,RQ,rG"))] "(register_operand (operands[0], SFmode) || reg_or_0_operand (operands[1], SFmode)) @@ -7311,7 +7419,6 @@ /* Ensure the frame pointer move is not optimized. */ emit_insn (gen_blockage ()); emit_clobber (hard_frame_pointer_rtx); - emit_clobber (frame_pointer_rtx); emit_move_insn (hard_frame_pointer_rtx, fp); emit_use (hard_frame_pointer_rtx); @@ -7326,7 +7433,7 @@ }) (define_insn "indirect_goto" - [(unspec [(match_operand 0 "register_operand" "=r")] UNSPEC_GOTO)] + [(unspec [(match_operand 0 "register_operand" "r")] UNSPEC_GOTO)] "GET_MODE (operands[0]) == word_mode" "bv%* %%r0(%0)" [(set_attr "type" "branch") @@ -9102,7 +9209,6 @@ add,l %2,%3,%3\;bv,n %%r0(%3)" /* Ensure the frame pointer move is not optimized. */ emit_insn (gen_blockage ()); emit_clobber (hard_frame_pointer_rtx); - emit_clobber (frame_pointer_rtx); emit_move_insn (hard_frame_pointer_rtx, fp); emit_use (hard_frame_pointer_rtx); diff --git a/gcc/config/pa/pa.opt b/gcc/config/pa/pa.opt index 6863f91..d4b3063 100644 --- a/gcc/config/pa/pa.opt +++ b/gcc/config/pa/pa.opt @@ -86,6 +86,10 @@ mlong-calls Target Mask(LONG_CALLS) Always generate long calls. +mlra +Target Var(pa_lra_p) Init(0) +Use LRA instead of reload (transitional). + mlong-load-store Target Mask(LONG_LOAD_STORE) Emit long load/store sequences. diff --git a/gcc/config/pa/pa.opt.urls b/gcc/config/pa/pa.opt.urls index 5b8bceb..5516332 100644 --- a/gcc/config/pa/pa.opt.urls +++ b/gcc/config/pa/pa.opt.urls @@ -36,6 +36,8 @@ UrlSuffix(gcc/HPPA-Options.html#index-mlinker-opt) mlong-calls UrlSuffix(gcc/HPPA-Options.html#index-mlong-calls-5) +; skipping UrlSuffix for 'mlra' due to finding no URLs + mlong-load-store UrlSuffix(gcc/HPPA-Options.html#index-mlong-load-store) diff --git a/gcc/config/pa/pa32-regs.h b/gcc/config/pa/pa32-regs.h index 6485ab2..3467e03 100644 --- a/gcc/config/pa/pa32-regs.h +++ b/gcc/config/pa/pa32-regs.h @@ -318,7 +318,7 @@ enum reg_class { NO_REGS, R1_REGS, GENERAL_REGS, FPUPPER_REGS, FP_REGS, /* 1 if N is a possible register number for function argument passing. */ #define FUNCTION_ARG_REGNO_P(N) \ - (((N) >= 23 && (N) <= 26) || (! TARGET_SOFT_FLOAT && (N) >= 32 && (N) <= 39)) + (((N) >= 23 && (N) <= 26) || (! TARGET_SOFT_FLOAT && (N) >= 32 && (N) <= 39)) /* How to refer to registers in assembler output. This sequence is indexed by compiler's hard-register-number (see above). */ diff --git a/gcc/config/pa/predicates.md b/gcc/config/pa/predicates.md index 50dffa1..0defd22 100644 --- a/gcc/config/pa/predicates.md +++ b/gcc/config/pa/predicates.md @@ -300,7 +300,7 @@ (define_predicate "integer_store_memory_operand" (match_code "reg,mem") { - if (reload_in_progress + if ((lra_in_progress || reload_in_progress) && REG_P (op) && REGNO (op) >= FIRST_PSEUDO_REGISTER && reg_renumber [REGNO (op)] < 0) @@ -312,7 +312,7 @@ REG+D instructions in pa_emit_move_sequence. Further, the Q constraint is used in more than simple move instructions. So, we must return true and let reload handle the reload. 
*/ - if (reload_in_progress) + if (lra_in_progress || reload_in_progress) return true; /* Extract CONST_INT operand. */ @@ -326,7 +326,8 @@ if (!MEM_P (op)) return false; - return ((reload_in_progress || memory_address_p (mode, XEXP (op, 0))) + return ((lra_in_progress || reload_in_progress + || memory_address_p (mode, XEXP (op, 0))) && !IS_LO_SUM_DLT_ADDR_P (XEXP (op, 0)) && !IS_INDEX_ADDR_P (XEXP (op, 0))); }) @@ -335,17 +336,18 @@ ;; floating point store. This also implies the operand could be used as ;; the source operand of a floating point load. LO_SUM DLT and indexed ;; memory operands are not allowed. Symbolic operands are accepted for -;; PA 2.0 when TARGET_ELF32 is not true. We accept reloading pseudos -;; and other memory; operands. +;; PA 2.0. We accept reloading pseudos and other memory operands. -;; FIXME: The GNU ELF32 linker clobbers the LSB of the FP register number -;; in PA 2.0 {fldw,fstw} insns with long displacements. This is because -;; R_PARISC_DPREL14WR and other relocations like it are not supported. +;; NOTE: The GNU ELF32 linker clobbered the least significant bit of +;; the target floating-point register in PA 2.0 floating-point loads +;; and stores with long displacements in ld versions prior to 2.42. +;; The global pointer also was not double-word aligned. This broke +;; various DPREL relocations. (define_predicate "floating_point_store_memory_operand" (match_code "reg,mem") { - if (reload_in_progress + if ((lra_in_progress || reload_in_progress) && REG_P (op) && REGNO (op) >= FIRST_PSEUDO_REGISTER && reg_renumber [REGNO (op)] < 0) @@ -365,9 +367,9 @@ if (!MEM_P (op)) return false; - return ((reload_in_progress || memory_address_p (mode, XEXP (op, 0))) - && !((TARGET_ELF32 || !TARGET_PA_20) - && symbolic_memory_operand (op, VOIDmode)) + return ((lra_in_progress || reload_in_progress + || memory_address_p (mode, XEXP (op, 0))) + && (INT14_OK_STRICT || !symbolic_memory_operand (op, VOIDmode)) && !IS_LO_SUM_DLT_ADDR_P (XEXP (op, 0)) && !IS_INDEX_ADDR_P (XEXP (op, 0))); }) @@ -467,9 +469,9 @@ return memory_address_p (mode, XEXP (op, 0)); }) -;; True iff OP is not a symbolic memory operand. +;; True iff OP is a valid memory operand. -(define_predicate "nonsymb_mem_operand" +(define_predicate "mem_operand" (match_code "subreg,mem") { if (GET_CODE (op) == SUBREG) @@ -488,8 +490,7 @@ && REG_P (XEXP (XEXP (op, 0), 1))) return false; - return (!symbolic_memory_operand (op, mode) - && memory_address_p (mode, XEXP (op, 0))); + return (memory_address_p (mode, XEXP (op, 0))); }) ;; True iff OP is anything other than a hard register. @@ -556,7 +557,7 @@ if (register_operand (op, mode)) return true; - if (!reload_in_progress && !reload_completed) + if (!lra_in_progress && !reload_in_progress && !reload_completed) return false; if (! MEM_P (op)) @@ -576,11 +577,11 @@ (ior (match_operand 0 "register_operand") (match_operand 0 "const_0_operand"))) -;; True iff OP is either a register, zero, or a non-symbolic memory operand. +;; True iff OP is either a register, zero, or a memory operand. -(define_predicate "reg_or_0_or_nonsymb_mem_operand" +(define_predicate "reg_or_0_or_mem_operand" (ior (match_operand 0 "reg_or_0_operand") - (match_operand 0 "nonsymb_mem_operand"))) + (match_operand 0 "mem_operand"))) ;; Accept REG and any CONST_INT that can be moved in one instruction ;; into a general register. 
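The pa.md peepholes added above fold a just-computed base-plus-displacement address back into the dependent memory access, re-emitting the address computation afterwards so a later pass can delete it if its result is dead (the second variant drops it immediately when the loaded register and the address register coincide). As a rough illustration only, with made-up registers and a made-up 14-bit displacement rather than anything produced by the patch, the first (load) pattern corresponds to a rewrite like:

    ; before the peephole
    ldo 64(%r26),%r28    ; %r28 = %r26 + 64
    ldw 0(%r28),%r19     ; %r19 = mem[%r28]

    ; after the peephole
    ldw 64(%r26),%r19    ; %r19 = mem[%r26 + 64]
    ldo 64(%r26),%r28    ; re-emitted; deleted later if %r28 is dead

The store and SF/DF variants elsewhere in this patch follow the same shape.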
diff --git a/gcc/config/pa/som.h b/gcc/config/pa/som.h index 1039a6a..39fdefd 100644 --- a/gcc/config/pa/som.h +++ b/gcc/config/pa/som.h @@ -25,7 +25,7 @@ along with GCC; see the file COPYING3. If not see linked executables and shared libraries. */ #define LDD_SUFFIX "chatr" /* Look for lines like "dynamic /usr/lib/X11R5/libX11.sl" - or "static /usr/lib/X11R5/libX11.sl". + or "static /usr/lib/X11R5/libX11.sl". HPUX 10.20 also has lines like "static branch prediction ..." so we filter that out explicitly. @@ -357,7 +357,7 @@ do { \ #define GTHREAD_USE_WEAK 0 /* Shared library suffix. Collect2 strips the version string after - this suffix when generating constructor/destructor names. */ + this suffix when generating constructor/destructor names. */ #define SHLIB_SUFFIX ".sl" /* We don't have named sections. */ diff --git a/gcc/config/pdp11/pdp11.cc b/gcc/config/pdp11/pdp11.cc index 084af21..600a4f3 100644 --- a/gcc/config/pdp11/pdp11.cc +++ b/gcc/config/pdp11/pdp11.cc @@ -50,7 +50,7 @@ along with GCC; see the file COPYING3. If not see /* This file should be included last. */ #include "target-def.h" -/* this is the current value returned by the macro FIRST_PARM_OFFSET +/* this is the current value returned by the macro FIRST_PARM_OFFSET defined in tm.h */ int current_first_parm_offset; @@ -220,7 +220,7 @@ static bool pdp11_scalar_mode_supported_p (scalar_mode); #undef TARGET_SECONDARY_RELOAD #define TARGET_SECONDARY_RELOAD pdp11_secondary_reload -#undef TARGET_REGISTER_MOVE_COST +#undef TARGET_REGISTER_MOVE_COST #define TARGET_REGISTER_MOVE_COST pdp11_register_move_cost #undef TARGET_PREFERRED_RELOAD_CLASS @@ -327,7 +327,7 @@ pdp11_saved_regno (unsigned regno) alloca storage if any. */ void pdp11_expand_prologue (void) -{ +{ HOST_WIDE_INT fsize = get_frame_size (); unsigned regno; rtx x, via_ac = NULL; @@ -339,7 +339,7 @@ pdp11_expand_prologue (void) emit_insn (gen_setd ()); emit_insn (gen_seti ()); } - + /* Save CPU registers. */ for (regno = R0_REGNUM; regno <= PC_REGNUM; regno++) if (pdp11_saved_regno (regno)) @@ -350,7 +350,7 @@ pdp11_expand_prologue (void) } /* Save FPU registers. */ - for (regno = AC0_REGNUM; regno <= AC3_REGNUM; regno++) + for (regno = AC0_REGNUM; regno <= AC3_REGNUM; regno++) if (pdp11_saved_regno (regno)) { x = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx); @@ -388,7 +388,7 @@ pdp11_expand_prologue (void) void pdp11_expand_epilogue (void) -{ +{ HOST_WIDE_INT fsize = get_frame_size (); unsigned regno; rtx x, reg, via_ac = NULL; @@ -476,13 +476,13 @@ pdp11_expand_operands (rtx *operands, rtx exops[][2], bool sameoff = false; enum { REGOP, OFFSOP, MEMOP, PUSHOP, POPOP, CNSTOP, RNDOP } optype; long sval[2]; - + /* If either piece order is accepted and one is pre-decrement while the other is post-increment, set order to be high order word first. That will force the pre-decrement to be turned into a pointer adjust, then offset addressing. Otherwise, if either operand uses pre-decrement, that means - the order is low order first. + the order is low order first. Otherwise, if both operands are registers and destination is higher than source and they overlap, do low order word (highest register number) first. */ @@ -512,7 +512,7 @@ pdp11_expand_operands (rtx *operands, rtx exops[][2], the push increases the offset to each source word. In theory there are other cases like this, for example dest == pop, but those don't occur in real life so ignore those. 
*/ - if (GET_CODE (operands[0]) == MEM + if (GET_CODE (operands[0]) == MEM && GET_CODE (XEXP (operands[0], 0)) == PRE_DEC && REGNO (XEXP (XEXP (operands[0], 0), 0)) == STACK_POINTER_REGNUM && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) @@ -529,7 +529,7 @@ pdp11_expand_operands (rtx *operands, rtx exops[][2], else gcc_assert (useorder == either || useorder == order); - + for (op = 0; op < opcount; op++) { /* First classify the operand. */ @@ -553,10 +553,10 @@ pdp11_expand_operands (rtx *operands, rtx exops[][2], supposed to allow to happen. Return failure for such cases. */ if (optype == RNDOP) return false; - + if (action != NULL) action[op] = no_action; - + /* If the operand uses pre-decrement addressing but we want to get the parts high order first, decrement the former register explicitly @@ -569,7 +569,7 @@ pdp11_expand_operands (rtx *operands, rtx exops[][2], XEXP (XEXP (operands[op], 0), 0)); optype = OFFSOP; } - /* If the operand uses post-increment mode but we want + /* If the operand uses post-increment mode but we want to get the parts low order first, change the operand into ordinary indexing and remember to increment the register explicitly when we're done. */ @@ -588,7 +588,7 @@ pdp11_expand_operands (rtx *operands, rtx exops[][2], REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (operands[op]), sval); } - + for (i = 0; i < words; i++) { if (order == big) @@ -633,18 +633,18 @@ output_move_multiple (rtx *operands) rtx inops[2]; rtx exops[4][2]; rtx adjops[2]; - + pdp11_action action[2]; int i, words; - + words = GET_MODE_BITSIZE (GET_MODE (operands[0])) / 16; adjops[1] = gen_rtx_CONST_INT (HImode, words * 2); inops[0] = operands[0]; inops[1] = operands[1]; - + pdp11_expand_operands (inops, exops, 2, words, action, either); - + /* Check for explicit decrement before. */ if (action[0] == dec_before) { @@ -686,7 +686,7 @@ pdp11_gen_int_label (char *label, const char *prefix, int num) else sprintf (label, "*%s_%u", prefix, num); } - + /* Output an ascii string. */ void output_ascii (FILE *file, const char *p, int size) @@ -694,7 +694,7 @@ output_ascii (FILE *file, const char *p, int size) int i, c; const char *pseudo = "\t.ascii\t"; bool delim = false; - + if (TARGET_DEC_ASM) { if (p[size - 1] == '\0') @@ -769,7 +769,7 @@ pdp11_asm_output_var (FILE *file, const char *name, int size, assemble_name (file, name); fputs (":", file); ASM_OUTPUT_SKIP (file, size); - } + } } /* Special format operators handled here: @@ -782,7 +782,7 @@ static void pdp11_asm_print_operand (FILE *file, rtx x, int code) { long sval[2]; - + if (code == '#') { if (TARGET_DEC_ASM) @@ -955,7 +955,7 @@ pdp11_lra_p (void) /* Register to register moves are cheap if both are general registers. */ -static int +static int pdp11_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED, reg_class_t c1, reg_class_t c2) { @@ -979,7 +979,7 @@ pdp11_rtx_costs (rtx x, machine_mode mode, int outer_code, const int asize = (mode == QImode) ? 2 : GET_MODE_SIZE (mode); rtx src, dest; const char *fmt; - + switch (code) { case CONST_INT: @@ -1026,7 +1026,7 @@ pdp11_rtx_costs (rtx x, machine_mode mode, int outer_code, if (GET_RTX_LENGTH (code) > 1) src = XEXP (x, 1); dest = XEXP (x, 0); - + /* If optimizing for size, claim everything costs 2 per word, plus whatever the operands require. 
*/ if (!speed) @@ -1070,7 +1070,7 @@ pdp11_rtx_costs (rtx x, machine_mode mode, int outer_code, case DIV: *total = 10 * asize * asize; break; - + case MOD: /* Fake value because it's accounted for under DIV, since we use a divmod pattern. */ @@ -1085,14 +1085,14 @@ pdp11_rtx_costs (rtx x, machine_mode mode, int outer_code, case of a one bit shift. */ *total = asize; break; - + default: *total = asize; break; } } } - + /* Now see if we're looking at a SET. If yes, then look at the source to see if this is a move or an arithmetic operation, and continue accordingly to handle the operands. */ @@ -1142,7 +1142,7 @@ pdp11_addr_cost (rtx addr, machine_mode mode, addr_space_t as ATTRIBUTE_UNUSED, bool speed) { int cost = 0; - + if (GET_CODE (addr) != REG) { if (!simple_memory_operand (addr, mode)) @@ -1184,7 +1184,7 @@ pdp11_insn_cost (rtx_insn *insn, bool speed) the actual operation plus a clobber, or the implicit compare plus the actual operation. Find the actual operation. */ pat = PATTERN (insn); - + if (GET_CODE (pat) == PARALLEL) { set = XVECEXP (pat, 0, 0); @@ -1199,7 +1199,7 @@ pdp11_insn_cost (rtx_insn *insn, bool speed) if (GET_CODE (set) != SET) return 0; } - + /* Pick up the SET source and destination RTL. */ dest = XEXP (set, 0); src = XEXP (set, 1); @@ -1242,7 +1242,7 @@ pdp11_insn_cost (rtx_insn *insn, bool speed) src2 = XEXP (src, 1); base_cost += pdp11_addr_cost (src2, mode, ADDR_SPACE_GENERIC, speed); } - + return base_cost; } @@ -1354,7 +1354,7 @@ simple_memory_operand(rtx op, machine_mode mode ATTRIBUTE_UNUSED) /* Decode the address now. */ indirection: - + addr = XEXP (op, 0); switch (GET_CODE (addr)) @@ -1362,27 +1362,27 @@ simple_memory_operand(rtx op, machine_mode mode ATTRIBUTE_UNUSED) case REG: /* (R0) - no extra cost */ return 1; - + case PRE_DEC: case POST_INC: case PRE_MODIFY: case POST_MODIFY: /* -(R0), (R0)+ - cheap! */ return 1; - + case MEM: - /* cheap - is encoded in addressing mode info! + /* cheap - is encoded in addressing mode info! -- except for @(R0), which has to be @0(R0) !!! */ if (GET_CODE (XEXP (addr, 0)) == REG) return 0; - + op=addr; goto indirection; - + case CONST_INT: - case LABEL_REF: + case LABEL_REF: case CONST: case SYMBOL_REF: /* @#address - extra cost */ @@ -1395,7 +1395,7 @@ simple_memory_operand(rtx op, machine_mode mode ATTRIBUTE_UNUSED) default: break; } - + return FALSE; } @@ -1412,7 +1412,7 @@ no_side_effect_operand(rtx op, machine_mode mode ATTRIBUTE_UNUSED) /* Decode the address now. */ indirection: - + addr = XEXP (op, 0); switch (GET_CODE (addr)) @@ -1420,26 +1420,26 @@ no_side_effect_operand(rtx op, machine_mode mode ATTRIBUTE_UNUSED) case REG: /* (R0) - no extra cost */ return 1; - + case PRE_DEC: case POST_INC: case PRE_MODIFY: case POST_MODIFY: return 0; - + case MEM: - /* cheap - is encoded in addressing mode info! + /* cheap - is encoded in addressing mode info! -- except for @(R0), which has to be @0(R0) !!! */ if (GET_CODE (XEXP (addr, 0)) == REG) return 0; - + op=addr; goto indirection; - + case CONST_INT: - case LABEL_REF: + case LABEL_REF: case CONST: case SYMBOL_REF: /* @#address - extra cost */ @@ -1452,7 +1452,7 @@ no_side_effect_operand(rtx op, machine_mode mode ATTRIBUTE_UNUSED) default: break; } - + return FALSE; } @@ -1514,7 +1514,7 @@ pdp11_can_change_mode_class (machine_mode from, So we disallow all mode changes involving FPRs. 
*/ if (FLOAT_MODE_P (from) != FLOAT_MODE_P (to)) return false; - + return !reg_classes_intersect_p (FPU_REGS, rclass); } @@ -1530,7 +1530,7 @@ pdp11_guard_type (void) Given an rtx X being reloaded into a reg required to be in class CLASS, return the class of reg to actually use. In general this is just CLASS; but on some machines - in some cases it is preferable to use a more restrictive class. + in some cases it is preferable to use a more restrictive class. loading is easier into LOAD_FPU_REGS than FPU_REGS! */ @@ -1554,7 +1554,7 @@ pdp11_preferred_reload_class (rtx x, reg_class_t rclass) Given an rtx X being reloaded into a reg required to be in class CLASS, return the class of reg to actually use. In general this is just CLASS; but on some machines - in some cases it is preferable to use a more restrictive class. + in some cases it is preferable to use a more restrictive class. loading is easier into LOAD_FPU_REGS than FPU_REGS! */ @@ -1576,10 +1576,10 @@ pdp11_preferred_output_reload_class (rtx x, reg_class_t rclass) /* TARGET_SECONDARY_RELOAD. - FPU registers AC4 and AC5 (class NO_LOAD_FPU_REGS) require an + FPU registers AC4 and AC5 (class NO_LOAD_FPU_REGS) require an intermediate register (AC0-AC3: LOAD_FPU_REGS). Everything else can be loaded/stored directly. */ -static reg_class_t +static reg_class_t pdp11_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x, reg_class_t reload_class, @@ -1589,7 +1589,7 @@ pdp11_secondary_reload (bool in_p ATTRIBUTE_UNUSED, if (reload_class != NO_LOAD_FPU_REGS || GET_CODE (x) != REG || REGNO_REG_CLASS (REGNO (x)) == LOAD_FPU_REGS) return NO_REGS; - + return LOAD_FPU_REGS; } @@ -1600,11 +1600,11 @@ pdp11_secondary_reload (bool in_p ATTRIBUTE_UNUSED, static bool pdp11_secondary_memory_needed (machine_mode, reg_class_t c1, reg_class_t c2) { - int fromfloat = (c1 == LOAD_FPU_REGS || c1 == NO_LOAD_FPU_REGS || + int fromfloat = (c1 == LOAD_FPU_REGS || c1 == NO_LOAD_FPU_REGS || c1 == FPU_REGS); - int tofloat = (c2 == LOAD_FPU_REGS || c2 == NO_LOAD_FPU_REGS || + int tofloat = (c2 == LOAD_FPU_REGS || c2 == NO_LOAD_FPU_REGS || c2 == FPU_REGS); - + return (fromfloat != tofloat); } @@ -1624,13 +1624,13 @@ pdp11_legitimate_address_p (machine_mode mode, rtx operand, bool strict, /* accept @#address */ if (CONSTANT_ADDRESS_P (operand)) return true; - + switch (GET_CODE (operand)) { case REG: /* accept (R0) */ return !strict || REGNO_OK_FOR_BASE_P (REGNO (operand)); - + case PLUS: /* accept X(R0) */ return GET_CODE (XEXP (operand, 0)) == REG @@ -1672,11 +1672,11 @@ pdp11_legitimate_address_p (machine_mode mode, rtx operand, bool strict, xfoob = XEXP (operand, 0); /* (MEM:xx (MEM:xx ())) is not valid for SI, DI and currently - also forbidden for float, because we have to handle this + also forbidden for float, because we have to handle this in output_move_double and/or output_move_quad() - we could - do it, but currently it's not worth it!!! - now that DFmode cannot go into CPU register file, - maybe I should allow float ... + do it, but currently it's not worth it!!! + now that DFmode cannot go into CPU register file, + maybe I should allow float ... but then I have to handle memory-to-memory moves in movdf ?? */ if (GET_MODE_BITSIZE(mode) > 16) return false; @@ -1722,7 +1722,7 @@ pdp11_legitimate_address_p (machine_mode mode, rtx operand, bool strict, reg number REGNO. 
*/ enum reg_class pdp11_regno_reg_class (int regno) -{ +{ if (regno == ARG_POINTER_REGNUM) return NOTSP_REG; else if (regno == CC_REGNUM || regno == FCC_REGNUM) @@ -1759,9 +1759,9 @@ pdp11_reg_save_size (void) for (regno = AC0_REGNUM; regno <= AC5_REGNUM; regno++) if (pdp11_saved_regno (regno)) offset += 8; - + return offset; -} +} /* Return the offset between two registers, one to be eliminated, and the other its replacement, at the start of a routine. */ @@ -1791,7 +1791,7 @@ output_addr_const_pdp11 (FILE *file, rtx x) { char buf[256]; int i; - + restart: switch (GET_CODE (x)) { @@ -1891,7 +1891,7 @@ pdp11_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) On the pdp11 the value is found in R0 (or ac0??? not without FPU!!!! ) */ static rtx -pdp11_function_value (const_tree valtype, +pdp11_function_value (const_tree valtype, const_tree fntype_or_decl ATTRIBUTE_UNUSED, bool outgoing ATTRIBUTE_UNUSED) { @@ -1935,7 +1935,7 @@ pdp11_expand_shift (rtx *operands, rtx (*shift_sc) (rtx, rtx, rtx), { rtx r, test; rtx_code_label *lb; - + if (CONST_INT_P (operands[2]) && pdp11_small_shift (INTVAL (operands[2]))) emit_insn ((*shift_sc) (operands[0], operands[1], operands[2])); else if (TARGET_40_PLUS) @@ -1988,7 +1988,7 @@ pdp11_assemble_shift (rtx *operands, machine_mode m, int code) inops[0] = operands[0]; pdp11_expand_operands (inops, exops, 1, 2, action, either); } - + if (!small) { /* Loop case, generate the top of loop label. */ @@ -2154,8 +2154,8 @@ pdp11_md_asm_adjust (vec<rtx> & /*outputs*/, vec<rtx> & /*inputs*/, /* Worker function for TARGET_TRAMPOLINE_INIT. - trampoline - how should i do it in separate i+d ? - have some allocate_trampoline magic??? + trampoline - how should i do it in separate i+d ? + have some allocate_trampoline magic??? the following should work for shared I/D: @@ -2252,7 +2252,7 @@ static void pdp11_output_ident (const char *ident) if (!startswith (ident, "GCC:")) fprintf (asm_out_file, "\t.ident\t\"%s\"\n", ident); } - + } /* This emits a (user) label, which gets a "_" prefix except for DEC @@ -2281,7 +2281,7 @@ pdp11_output_def (FILE *file, const char *label1, const char *label2) assemble_name (file, label1); putc (',', file); assemble_name (file, label2); - } + } putc ('\n', file); } @@ -2314,7 +2314,7 @@ pdp11_asm_named_section (const char *name, unsigned int flags, { const char *rwro = (flags & SECTION_WRITE) ? "rw" : "ro"; const char *insdat = (flags & SECTION_CODE) ? "i" : "d"; - + gcc_assert (TARGET_DEC_ASM); fprintf (asm_out_file, "\t.psect\t%s,con,%s,%s\n", name, insdat, rwro); } @@ -2333,12 +2333,12 @@ pdp11_asm_init_sections (void) ".bss"); } } - + static void pdp11_file_start (void) { default_file_start (); - + if (TARGET_DEC_ASM) fprintf (asm_out_file, "\t.enabl\tlsb,reg\n\n"); } diff --git a/gcc/config/pdp11/pdp11.h b/gcc/config/pdp11/pdp11.h index 6c8e045..f6997b6 100644 --- a/gcc/config/pdp11/pdp11.h +++ b/gcc/config/pdp11/pdp11.h @@ -69,7 +69,7 @@ along with GCC; see the file COPYING3. If not see #define SHORT_TYPE_SIZE 16 #define INT_TYPE_SIZE (TARGET_INT16 ? 16 : 32) #define LONG_TYPE_SIZE 32 -#define LONG_LONG_TYPE_SIZE 64 +#define LONG_LONG_TYPE_SIZE 64 /* machine types from ansi */ #define SIZE_TYPE "short unsigned int" /* definition of size_t */ @@ -93,7 +93,7 @@ along with GCC; see the file COPYING3. If not see /* Define that floats are in VAX order, not high word first as for ints. */ #define FLOAT_WORDS_BIG_ENDIAN 0 -/* Width of a word, in units (bytes). +/* Width of a word, in units (bytes). 
UNITS OR BYTES - seems like units */ #define UNITS_PER_WORD 2 @@ -107,7 +107,7 @@ extern const struct real_format pdp11_f_format; extern const struct real_format pdp11_d_format; /* Maximum sized of reasonable data type -- DImode ...*/ -#define MAX_FIXED_MODE_SIZE 64 +#define MAX_FIXED_MODE_SIZE 64 /* Allocation boundary (in *bits*) for storing pointers in memory. */ #define POINTER_BOUNDARY 16 @@ -144,7 +144,7 @@ extern const struct real_format pdp11_d_format; All registers that the compiler knows about must be given numbers, even those that are not normally considered general registers. - we have 8 integer registers, plus 6 float + we have 8 integer registers, plus 6 float (don't use scratch float !) */ /* 1 for registers that have pervasive standard uses @@ -153,7 +153,7 @@ extern const struct real_format pdp11_d_format; On the pdp, these are: Reg 7 = pc; reg 6 = sp; - reg 5 = fp; not necessarily! + reg 5 = fp; not necessarily! */ #define FIXED_REGISTERS \ @@ -203,7 +203,7 @@ extern const struct real_format pdp11_d_format; For any two classes, it is very desirable that there be another class that represents their union. */ - + /* The pdp has a couple of classes: MUL_REGS are used for odd numbered regs, to use in 16-bit multiplication @@ -211,7 +211,7 @@ MUL_REGS are used for odd numbered regs, to use in 16-bit multiplication GENERAL_REGS is all cpu LOAD_FPU_REGS is the first four cpu regs, they are easier to load NO_LOAD_FPU_REGS is ac4 and ac5, currently - difficult to load them -FPU_REGS is all fpu regs +FPU_REGS is all fpu regs CC_REGS is the condition codes (CPU and FPU) */ @@ -292,7 +292,7 @@ enum reg_class /* Return TRUE if the class is a CPU register. */ #define CPU_REG_CLASS(CLASS) \ (CLASS >= NOTR0_REG && CLASS <= GENERAL_REGS) - + /* Return the maximum number of consecutive registers needed to represent mode MODE in a register of class CLASS. */ #define CLASS_MAX_NREGS(CLASS, MODE) \ @@ -316,7 +316,7 @@ enum reg_class #define PUSH_ROUNDING(BYTES) pdp11_push_rounding (BYTES) -/* current_first_parm_offset stores the # of registers pushed on the +/* current_first_parm_offset stores the # of registers pushed on the stack */ extern int current_first_parm_offset; @@ -328,7 +328,7 @@ extern int current_first_parm_offset; If the precise function being called is known, FUNC is its FUNCTION_DECL; otherwise, FUNC is 0. */ #define BASE_RETURN_VALUE_REG(MODE) \ - (FLOAT_MODE_P (MODE) ? AC0_REGNUM : RETVAL_REGNUM) + (FLOAT_MODE_P (MODE) ? AC0_REGNUM : RETVAL_REGNUM) /* 1 if N is a possible register number for function argument passing. - not used on pdp */ @@ -462,7 +462,7 @@ extern int current_first_parm_offset; #define DEFAULT_SIGNED_CHAR 1 /* Max number of bytes we can move from memory to memory - in one reasonably fast instruction. + in one reasonably fast instruction. 
*/ #define MOVE_MAX 2 diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md index 4decaed..1f1849d 100644 --- a/gcc/config/riscv/autovec.md +++ b/gcc/config/riscv/autovec.md @@ -282,7 +282,7 @@ (match_operand:<VM> 2 "vector_mask_operand") (match_operand 3 "autovec_length_operand") (match_operand 4 "const_0_operand")] - "TARGET_VECTOR" + "TARGET_VECTOR_AUTOVEC_SEGMENT" { riscv_vector::expand_lanes_load_store (operands, true); DONE; @@ -295,7 +295,7 @@ (match_operand:<VM> 2 "vector_mask_operand") (match_operand 3 "autovec_length_operand") (match_operand 4 "const_0_operand")] - "TARGET_VECTOR" + "TARGET_VECTOR_AUTOVEC_SEGMENT" { riscv_vector::expand_lanes_load_store (operands, false); DONE; @@ -1454,6 +1454,69 @@ }) ;; ------------------------------------------------------------------------- +;; ---- [INT,FP] Extract a vector from a vector. +;; ------------------------------------------------------------------------- +;; TODO: This can be extended to allow basically any extract mode. +;; For now this helps optimize VLS subregs like (subreg:V2DI (reg:V4DI) 16) +;; that would otherwise need to go via memory. + +(define_expand "vec_extract<mode><vls_half>" + [(set (match_operand:<VLS_HALF> 0 "nonimmediate_operand") + (vec_select:<VLS_HALF> + (match_operand:VLS_HAS_HALF 1 "register_operand") + (parallel + [(match_operand 2 "immediate_operand")])))] + "TARGET_VECTOR" +{ + int sz = GET_MODE_NUNITS (<VLS_HALF>mode).to_constant (); + int part = INTVAL (operands[2]); + + rtx start = GEN_INT (part * sz); + rtx tmp = operands[1]; + + if (part != 0) + { + tmp = gen_reg_rtx (<MODE>mode); + + rtx ops[] = {tmp, operands[1], start}; + riscv_vector::emit_vlmax_insn + (code_for_pred_slide (UNSPEC_VSLIDEDOWN, <MODE>mode), + riscv_vector::BINARY_OP, ops); + } + + emit_move_insn (operands[0], gen_lowpart (<VLS_HALF>mode, tmp)); + DONE; +}) + +(define_expand "vec_extract<mode><vls_quarter>" + [(set (match_operand:<VLS_QUARTER> 0 "nonimmediate_operand") + (vec_select:<VLS_QUARTER> + (match_operand:VLS_HAS_QUARTER 1 "register_operand") + (parallel + [(match_operand 2 "immediate_operand")])))] + "TARGET_VECTOR" +{ + int sz = GET_MODE_NUNITS (<VLS_QUARTER>mode).to_constant (); + int part = INTVAL (operands[2]); + + rtx start = GEN_INT (part * sz); + rtx tmp = operands[1]; + + if (part != 0) + { + tmp = gen_reg_rtx (<MODE>mode); + + rtx ops[] = {tmp, operands[1], start}; + riscv_vector::emit_vlmax_insn + (code_for_pred_slide (UNSPEC_VSLIDEDOWN, <MODE>mode), + riscv_vector::BINARY_OP, ops); + } + + emit_move_insn (operands[0], gen_lowpart (<VLS_QUARTER>mode, tmp)); + DONE; +}) + +;; ------------------------------------------------------------------------- ;; ---- [FP] Binary operations ;; ------------------------------------------------------------------------- ;; Includes: @@ -2649,6 +2712,17 @@ } ) +(define_expand "ssadd<mode>3" + [(match_operand:V_VLSI 0 "register_operand") + (match_operand:V_VLSI 1 "register_operand") + (match_operand:V_VLSI 2 "register_operand")] + "TARGET_VECTOR" + { + riscv_vector::expand_vec_ssadd (operands[0], operands[1], operands[2], <MODE>mode); + DONE; + } +) + (define_expand "ussub<mode>3" [(match_operand:V_VLSI 0 "register_operand") (match_operand:V_VLSI 1 "register_operand") @@ -2660,6 +2734,17 @@ } ) +(define_expand "sssub<mode>3" + [(match_operand:V_VLSI 0 "register_operand") + (match_operand:V_VLSI 1 "register_operand") + (match_operand:V_VLSI 2 "register_operand")] + "TARGET_VECTOR" + { + riscv_vector::expand_vec_sssub (operands[0], operands[1], operands[2], 
<MODE>mode); + DONE; + } +) + (define_expand "ustrunc<mode><v_double_trunc>2" [(match_operand:<V_DOUBLE_TRUNC> 0 "register_operand") (match_operand:VWEXTI 1 "register_operand")] @@ -2694,6 +2779,40 @@ } ) +(define_expand "sstrunc<mode><v_double_trunc>2" + [(match_operand:<V_DOUBLE_TRUNC> 0 "register_operand") + (match_operand:VWEXTI 1 "register_operand")] + "TARGET_VECTOR" + { + riscv_vector::expand_vec_double_sstrunc (operands[0], operands[1], + <MODE>mode); + DONE; + } +) + +(define_expand "sstrunc<mode><v_quad_trunc>2" + [(match_operand:<V_QUAD_TRUNC> 0 "register_operand") + (match_operand:VQEXTI 1 "register_operand")] + "TARGET_VECTOR" + { + riscv_vector::expand_vec_quad_sstrunc (operands[0], operands[1], <MODE>mode, + <V_DOUBLE_TRUNC>mode); + DONE; + } +) + +(define_expand "sstrunc<mode><v_oct_trunc>2" + [(match_operand:<V_OCT_TRUNC> 0 "register_operand") + (match_operand:VOEXTI 1 "register_operand")] + "TARGET_VECTOR" + { + riscv_vector::expand_vec_oct_sstrunc (operands[0], operands[1], <MODE>mode, + <V_DOUBLE_TRUNC>mode, + <V_QUAD_TRUNC>mode); + DONE; + } +) + ;; ========================================================================= ;; == Early break auto-vectorization patterns ;; ========================================================================= @@ -2770,3 +2889,32 @@ DONE; } ) + +;; ========================================================================= +;; == Strided Load/Store +;; ========================================================================= +(define_expand "mask_len_strided_load_<mode>" + [(match_operand:V 0 "register_operand") + (match_operand 1 "pmode_reg_or_0_operand") + (match_operand 2 "pmode_reg_or_0_operand") + (match_operand:<VM> 3 "vector_mask_operand") + (match_operand 4 "autovec_length_operand") + (match_operand 5 "const_0_operand")] + "TARGET_VECTOR" + { + riscv_vector::expand_strided_load (<MODE>mode, operands); + DONE; + }) + +(define_expand "mask_len_strided_store_<mode>" + [(match_operand 0 "pmode_reg_or_0_operand") + (match_operand 1 "pmode_reg_or_0_operand") + (match_operand:V 2 "register_operand") + (match_operand:<VM> 3 "vector_mask_operand") + (match_operand 4 "autovec_length_operand") + (match_operand 5 "const_0_operand")] + "TARGET_VECTOR" + { + riscv_vector::expand_strided_store (<MODE>mode, operands); + DONE; + }) diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md index 3ab6d54..eb5a0bb 100644 --- a/gcc/config/riscv/constraints.md +++ b/gcc/config/riscv/constraints.md @@ -70,6 +70,11 @@ (and (match_code "const_int") (match_test "ival == 8"))) +(define_constraint "P" + "A 5-bit signed immediate for vmv.v.i." + (and (match_code "const_int") + (match_test "IN_RANGE (ival, -16, 15)"))) + (define_constraint "K" "A 5-bit unsigned immediate for CSR access instructions." 
(and (match_code "const_int") diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md index 2844cb0..0816594 100644 --- a/gcc/config/riscv/iterators.md +++ b/gcc/config/riscv/iterators.md @@ -233,6 +233,10 @@ (define_code_iterator any_ge [ge geu]) (define_code_iterator any_lt [lt ltu]) (define_code_iterator any_le [le leu]) +(define_code_iterator any_eq [eq ne]) + +;; Iterators for conditions we can emit a sCC against 0 or a reg directly +(define_code_iterator scc_0 [eq ne gt gtu]) ; atomics code iterator (define_code_iterator any_atomic [plus ior xor and]) @@ -283,6 +287,8 @@ (le "le") (gt "gt") (lt "lt") + (eq "eq") + (ne "ne") (ior "ior") (xor "xor") (and "and") diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md index 9971fab..55bcfa4 100644 --- a/gcc/config/riscv/predicates.md +++ b/gcc/config/riscv/predicates.md @@ -329,7 +329,7 @@ { enum riscv_symbol_type type; return (riscv_symbolic_constant_p (op, &type) - && type == SYMBOL_GOT_DISP && !SYMBOL_REF_WEAK (op) && TARGET_PLT); + && type == SYMBOL_GOT_DISP && !SYMBOL_REF_WEAK (op) && flag_plt); }) (define_predicate "call_insn_operand" diff --git a/gcc/config/riscv/riscv-avlprop.cc b/gcc/config/riscv/riscv-avlprop.cc index 91d80aa..066377c 100644 --- a/gcc/config/riscv/riscv-avlprop.cc +++ b/gcc/config/riscv/riscv-avlprop.cc @@ -65,6 +65,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 #define INCLUDE_ALGORITHM #define INCLUDE_FUNCTIONAL +#define INCLUDE_MEMORY #define INCLUDE_ARRAY #include "config.h" diff --git a/gcc/config/riscv/riscv-c.cc b/gcc/config/riscv/riscv-c.cc index 71112d9..c59f408 100644 --- a/gcc/config/riscv/riscv-c.cc +++ b/gcc/config/riscv/riscv-c.cc @@ -59,7 +59,12 @@ riscv_pragma_intrinsic_flags_pollute (struct pragma_intrinsic_flags *flags) riscv_zvl_flags = riscv_zvl_flags | MASK_ZVL32B | MASK_ZVL64B - | MASK_ZVL128B; + | MASK_ZVL128B + | MASK_ZVL256B + | MASK_ZVL512B + | MASK_ZVL1024B + | MASK_ZVL2048B + | MASK_ZVL4096B; riscv_vector_elen_flags = riscv_vector_elen_flags | MASK_VECTOR_ELEN_32 diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h index 5497d11..1b2a4bc 100644 --- a/gcc/config/riscv/riscv-opts.h +++ b/gcc/config/riscv/riscv-opts.h @@ -157,4 +157,9 @@ enum riscv_tls_type { TLS_DESCRIPTORS }; +/* On some microarchitectures, vector segment loads and stores are excessively + expensive, so predicate the generation of those instructions. */ +#define TARGET_VECTOR_AUTOVEC_SEGMENT \ + (TARGET_VECTOR && riscv_mautovec_segment) + #endif /* ! 
GCC_RISCV_OPTS_H */ diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 926899c..4ed0432 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -134,8 +134,11 @@ extern bool riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int); extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx); extern void riscv_expand_usadd (rtx, rtx, rtx); +extern void riscv_expand_ssadd (rtx, rtx, rtx); extern void riscv_expand_ussub (rtx, rtx, rtx); +extern void riscv_expand_sssub (rtx, rtx, rtx); extern void riscv_expand_ustrunc (rtx, rtx); +extern void riscv_expand_sstrunc (rtx, rtx); #ifdef RTX_CODE extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool *invert_ptr = 0); @@ -621,6 +624,7 @@ enum mask_policy enum tail_policy get_prefer_tail_policy (); enum mask_policy get_prefer_mask_policy (); rtx get_avl_type_rtx (enum avl_type); +opt_machine_mode get_lmul_mode (scalar_mode, int); opt_machine_mode get_vector_mode (scalar_mode, poly_uint64); opt_machine_mode get_tuple_mode (machine_mode, unsigned int); bool simm5_p (rtx); @@ -644,11 +648,17 @@ void expand_vec_lround (rtx, rtx, machine_mode, machine_mode, machine_mode); void expand_vec_lceil (rtx, rtx, machine_mode, machine_mode); void expand_vec_lfloor (rtx, rtx, machine_mode, machine_mode); void expand_vec_usadd (rtx, rtx, rtx, machine_mode); +void expand_vec_ssadd (rtx, rtx, rtx, machine_mode); void expand_vec_ussub (rtx, rtx, rtx, machine_mode); +void expand_vec_sssub (rtx, rtx, rtx, machine_mode); void expand_vec_double_ustrunc (rtx, rtx, machine_mode); +void expand_vec_double_sstrunc (rtx, rtx, machine_mode); void expand_vec_quad_ustrunc (rtx, rtx, machine_mode, machine_mode); +void expand_vec_quad_sstrunc (rtx, rtx, machine_mode, machine_mode); void expand_vec_oct_ustrunc (rtx, rtx, machine_mode, machine_mode, machine_mode); +void expand_vec_oct_sstrunc (rtx, rtx, machine_mode, machine_mode, + machine_mode); #endif bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode, bool, void (*)(rtx *, rtx), enum avl_type); @@ -668,7 +678,7 @@ bool slide1_sew64_helper (int, machine_mode, machine_mode, machine_mode, rtx *); rtx gen_avl_for_scalar_move (rtx); void expand_tuple_move (rtx *); -bool expand_block_move (rtx, rtx, rtx); +bool expand_block_move (rtx, rtx, rtx, bool); machine_mode preferred_simd_mode (scalar_mode); machine_mode get_mask_mode (machine_mode); void expand_vec_series (rtx, rtx, rtx, rtx = 0); @@ -690,6 +700,8 @@ bool expand_strcmp (rtx, rtx, rtx, rtx, unsigned HOST_WIDE_INT, bool); void emit_vec_extract (rtx, rtx, rtx); bool expand_vec_setmem (rtx, rtx, rtx); bool expand_vec_cmpmem (rtx, rtx, rtx, rtx); +void expand_strided_load (machine_mode, rtx *); +void expand_strided_store (machine_mode, rtx *); /* Rounding mode bitfield for fixed point VXRM. 
*/ enum fixed_point_rounding_mode @@ -793,6 +805,8 @@ extern bool riscv_use_divmod_expander (void); void riscv_init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); extern bool riscv_option_valid_attribute_p (tree, tree, tree, int); +extern bool +riscv_process_target_attr (const char *, location_t); extern void riscv_override_options_internal (struct gcc_options *); extern void riscv_option_override (void); diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc index 4bb8bce..20395e1 100644 --- a/gcc/config/riscv/riscv-string.cc +++ b/gcc/config/riscv/riscv-string.cc @@ -966,7 +966,7 @@ riscv_expand_block_move_scalar (rtx dest, rtx src, rtx length) /* This function delegates block-move expansion to either the vector implementation or the scalar one. Return TRUE if successful or FALSE - otherwise. */ + otherwise. Assume that the memory regions do not overlap. */ bool riscv_expand_block_move (rtx dest, rtx src, rtx length) @@ -974,7 +974,7 @@ riscv_expand_block_move (rtx dest, rtx src, rtx length) if ((TARGET_VECTOR && !TARGET_XTHEADVECTOR) && stringop_strategy & STRATEGY_VECTOR) { - bool ok = riscv_vector::expand_block_move (dest, src, length); + bool ok = riscv_vector::expand_block_move (dest, src, length, false); if (ok) return true; } @@ -1051,57 +1051,54 @@ riscv_expand_block_clear (rtx dest, rtx length) namespace riscv_vector { -/* Used by cpymemsi in riscv.md . */ +struct stringop_info { + rtx avl; + bool need_loop; + machine_mode vmode; +}; -bool -expand_block_move (rtx dst_in, rtx src_in, rtx length_in) -{ - /* - memcpy: - mv a3, a0 # Copy destination - loop: - vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b - vle8.v v0, (a1) # Load bytes - add a1, a1, t0 # Bump pointer - sub a2, a2, t0 # Decrement count - vse8.v v0, (a3) # Store bytes - add a3, a3, t0 # Bump pointer - bnez a2, loop # Any more? - ret # Return - */ - gcc_assert (TARGET_VECTOR); +/* If a vectorized stringop should be used populate INFO and return TRUE. + Otherwise return false and leave INFO unchanged. - HOST_WIDE_INT potential_ew - = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD) - / BITS_PER_UNIT); - machine_mode vmode = VOIDmode; + MAX_EW is the maximum element width that the caller wants to use and + LENGTH_IN is the length of the stringop in bytes. + + This is currently used for cpymem and setmem. If expand_vec_cmpmem switches + to using it too then check_vectorise_memory_operation can be removed. +*/ + +static bool +use_vector_stringop_p (struct stringop_info &info, HOST_WIDE_INT max_ew, + rtx length_in) +{ bool need_loop = true; - bool size_p = optimize_function_for_size_p (cfun); - rtx src, dst; - rtx end = gen_reg_rtx (Pmode); - rtx vec; - rtx length_rtx = length_in; + machine_mode vmode = VOIDmode; + /* The number of elements in the stringop. */ + rtx avl = length_in; + HOST_WIDE_INT potential_ew = max_ew; + + if (!TARGET_VECTOR || !(stringop_strategy & STRATEGY_VECTOR)) + return false; if (CONST_INT_P (length_in)) { HOST_WIDE_INT length = INTVAL (length_in); - /* By using LMUL=8, we can copy as many bytes in one go as there - are bits in a vector register. If the entire block thus fits, - we don't need a loop. */ - if (length <= TARGET_MIN_VLEN) - { - need_loop = false; + /* If the VLEN and preferred LMUL allow the entire block to be copied in + one go then no loop is needed. 
*/ + if (known_le (length, BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL)) + { + need_loop = false; - /* If a single scalar load / store pair can do the job, leave it - to the scalar code to do that. */ - /* ??? If fast unaligned access is supported, the scalar code could - use suitably sized scalars irrespective of alignment. If that - gets fixed, we have to adjust the test here. */ + /* If a single scalar load / store pair can do the job, leave it + to the scalar code to do that. */ + /* ??? If fast unaligned access is supported, the scalar code could + use suitably sized scalars irrespective of alignment. If that + gets fixed, we have to adjust the test here. */ - if (pow2p_hwi (length) && length <= potential_ew) - return false; - } + if (pow2p_hwi (length) && length <= potential_ew) + return false; + } /* Find the vector mode to use. Using the largest possible element size is likely to give smaller constants, and thus potentially @@ -1120,14 +1117,17 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in) { scalar_int_mode elem_mode; unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT; - unsigned HOST_WIDE_INT per_iter; - HOST_WIDE_INT nunits; + poly_uint64 per_iter; + poly_int64 nunits; if (need_loop) - per_iter = TARGET_MIN_VLEN; + per_iter = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL; else per_iter = length; - nunits = per_iter / potential_ew; + /* BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL may not be divisible by + this potential_ew. */ + if (!multiple_p (per_iter, potential_ew, &nunits)) + continue; /* Unless we get an implementation that's slow for small element size / non-word-aligned accesses, we assume that the hardware @@ -1138,6 +1138,8 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in) if (length % potential_ew != 0 || !int_mode_for_size (bits, 0).exists (&elem_mode)) continue; + + poly_uint64 mode_units; /* Find the mode to use for the copy inside the loop - or the sole copy, if there is no loop. */ if (!need_loop) @@ -1153,26 +1155,25 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in) pointless. Still, by choosing a lower LMUL factor that still allows an entire transfer, we can reduce register pressure. */ - for (unsigned lmul = 1; lmul <= 4; lmul <<= 1) - if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT - /* Avoid loosing the option of using vsetivli . */ - && (nunits <= 31 * lmul || nunits > 31 * 8) - && multiple_p (BYTES_PER_RISCV_VECTOR * lmul, potential_ew) + for (unsigned lmul = 1; lmul < TARGET_MAX_LMUL; lmul <<= 1) + if (known_le (length * BITS_PER_UNIT, TARGET_MIN_VLEN * lmul) + && multiple_p (BYTES_PER_RISCV_VECTOR * lmul, potential_ew, + &mode_units) && (riscv_vector::get_vector_mode - (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * lmul, - potential_ew)).exists (&vmode))) + (elem_mode, mode_units).exists (&vmode))) break; } - /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes - wide. BYTES_PER_RISCV_VECTOR can't be evenly divided by - the sizes of larger element types; the LMUL factor of 8 can at - the moment be divided by the SEW, with SEW of up to 8 bytes, - but there are reserved encodings so there might be larger - SEW in the future. */ - if (riscv_vector::get_vector_mode - (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * 8, - potential_ew)).exists (&vmode)) + /* Stop searching if a suitable vmode has been found. */ + if (vmode != VOIDmode) + break; + + /* BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL will at least be divisible + by potential_ew 1, so this should succeed eventually. 
*/ + if (multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL, + potential_ew, &mode_units) + && riscv_vector::get_vector_mode (elem_mode, + mode_units).exists (&vmode)) break; /* We may get here if we tried an element size that's larger than @@ -1181,45 +1182,90 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in) gcc_assert (potential_ew > 1); } if (potential_ew > 1) - length_rtx = GEN_INT (length / potential_ew); + avl = GEN_INT (length / potential_ew); } else { - vmode = E_RVVM8QImode; + gcc_assert (get_lmul_mode (QImode, TARGET_MAX_LMUL).exists (&vmode)); } /* A memcpy libcall in the worst case takes 3 instructions to prepare the arguments + 1 for the call. When RVV should take 7 instructions and we're optimizing for size a libcall may be preferable. */ - if (size_p && need_loop) + if (optimize_function_for_size_p (cfun) && need_loop) + return false; + + info.need_loop = need_loop; + info.vmode = vmode; + info.avl = avl; + return true; +} + +/* Used by cpymemsi in riscv.md . */ + +bool +expand_block_move (rtx dst_in, rtx src_in, rtx length_in, bool movmem_p) +{ + /* + memcpy: + mv a3, a0 # Copy destination + loop: + vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b + vle8.v v0, (a1) # Load bytes + add a1, a1, t0 # Bump pointer + sub a2, a2, t0 # Decrement count + vse8.v v0, (a3) # Store bytes + add a3, a3, t0 # Bump pointer + bnez a2, loop # Any more? + ret # Return + */ + struct stringop_info info; + + HOST_WIDE_INT potential_ew + = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD) + / BITS_PER_UNIT); + + if (!use_vector_stringop_p (info, potential_ew, length_in)) return false; - /* length_rtx holds the (remaining) length of the required copy. + /* Inlining general memmove is a pessimisation: we can't avoid having to + decide which direction to go at runtime, which is costly in instruction + count however for situations where the entire move fits in one vector + operation we can do all reads before doing any writes so we don't have to + worry so generate the inline vector code in such situations. */ + if (info.need_loop && movmem_p) + return false; + + rtx src, dst; + rtx vec; + + /* avl holds the (remaining) length of the required copy. cnt holds the length we copy with the current load/store pair. */ - rtx cnt = length_rtx; + rtx cnt = info.avl; rtx label = NULL_RTX; rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0)); rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0)); - if (need_loop) + if (info.need_loop) { - length_rtx = copy_to_mode_reg (Pmode, length_rtx); + info.avl = copy_to_mode_reg (Pmode, info.avl); cnt = gen_reg_rtx (Pmode); label = gen_label_rtx (); emit_label (label); - emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (vmode, cnt, - length_rtx)); + emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (info.vmode, cnt, + info.avl)); } - vec = gen_reg_rtx (vmode); - src = change_address (src_in, vmode, src_addr); - dst = change_address (dst_in, vmode, dst_addr); + vec = gen_reg_rtx (info.vmode); + src = change_address (src_in, info.vmode, src_addr); + dst = change_address (dst_in, info.vmode, dst_addr); /* If we don't need a loop and have a suitable mode to describe the size, just do a load / store pair and leave it up to the later lazy code motion pass to insert the appropriate vsetvli. 
*/ - if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in))) + if (!info.need_loop + && known_eq (GET_MODE_SIZE (info.vmode), INTVAL (length_in))) { emit_move_insn (vec, src); emit_move_insn (dst, vec); @@ -1227,26 +1273,26 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in) else { machine_mode mask_mode = riscv_vector::get_vector_mode - (BImode, GET_MODE_NUNITS (vmode)).require (); + (BImode, GET_MODE_NUNITS (info.vmode)).require (); rtx mask = CONSTM1_RTX (mask_mode); if (!satisfies_constraint_K (cnt)) cnt= force_reg (Pmode, cnt); rtx m_ops[] = {vec, mask, src}; - emit_nonvlmax_insn (code_for_pred_mov (vmode), + emit_nonvlmax_insn (code_for_pred_mov (info.vmode), riscv_vector::UNARY_OP_TAMA, m_ops, cnt); - emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt, + emit_insn (gen_pred_store (info.vmode, dst, mask, vec, cnt, get_avl_type_rtx (riscv_vector::NONVLMAX))); } - if (need_loop) + if (info.need_loop) { emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt))); emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt))); - emit_insn (gen_rtx_SET (length_rtx, gen_rtx_MINUS (Pmode, length_rtx, cnt))); + emit_insn (gen_rtx_SET (info.avl, gen_rtx_MINUS (Pmode, info.avl, cnt))); /* Emit the loop condition. */ - rtx test = gen_rtx_NE (VOIDmode, end, const0_rtx); - emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label)); + rtx test = gen_rtx_NE (VOIDmode, info.avl, const0_rtx); + emit_jump_insn (gen_cbranch4 (Pmode, test, info.avl, const0_rtx, label)); emit_insn (gen_nop ()); } @@ -1557,41 +1603,39 @@ check_vectorise_memory_operation (rtx length_in, HOST_WIDE_INT &lmul_out) bool expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in) { - HOST_WIDE_INT lmul; + stringop_info info; + /* Check we are able and allowed to vectorise this operation; bail if not. */ - if (!check_vectorise_memory_operation (length_in, lmul)) + if (!use_vector_stringop_p (info, 1, length_in) || info.need_loop) return false; - machine_mode vmode - = riscv_vector::get_vector_mode (QImode, BYTES_PER_RISCV_VECTOR * lmul) - .require (); rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0)); - rtx dst = change_address (dst_in, vmode, dst_addr); + rtx dst = change_address (dst_in, info.vmode, dst_addr); - rtx fill_value = gen_reg_rtx (vmode); + rtx fill_value = gen_reg_rtx (info.vmode); rtx broadcast_ops[] = { fill_value, fill_value_in }; /* If the length is exactly vlmax for the selected mode, do that. Otherwise, use a predicated store. 
*/ - if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in))) + if (known_eq (GET_MODE_SIZE (info.vmode), INTVAL (info.avl))) { - emit_vlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP, - broadcast_ops); + emit_vlmax_insn (code_for_pred_broadcast (info.vmode), UNARY_OP, + broadcast_ops); emit_move_insn (dst, fill_value); } else { - if (!satisfies_constraint_K (length_in)) - length_in = force_reg (Pmode, length_in); - emit_nonvlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP, - broadcast_ops, length_in); + if (!satisfies_constraint_K (info.avl)) + info.avl = force_reg (Pmode, info.avl); + emit_nonvlmax_insn (code_for_pred_broadcast (info.vmode), + riscv_vector::UNARY_OP, broadcast_ops, info.avl); machine_mode mask_mode - = riscv_vector::get_vector_mode (BImode, GET_MODE_NUNITS (vmode)) - .require (); + = riscv_vector::get_vector_mode (BImode, GET_MODE_NUNITS (info.vmode)) + .require (); rtx mask = CONSTM1_RTX (mask_mode); - emit_insn (gen_pred_store (vmode, dst, mask, fill_value, length_in, - get_avl_type_rtx (riscv_vector::NONVLMAX))); + emit_insn (gen_pred_store (info.vmode, dst, mask, fill_value, info.avl, + get_avl_type_rtx (riscv_vector::NONVLMAX))); } return true; diff --git a/gcc/config/riscv/riscv-subset.h b/gcc/config/riscv/riscv-subset.h index dace4de..1914a53 100644 --- a/gcc/config/riscv/riscv-subset.h +++ b/gcc/config/riscv/riscv-subset.h @@ -120,5 +120,8 @@ public: extern const riscv_subset_list *riscv_cmdline_subset_list (void); extern void riscv_set_arch_by_subset_list (riscv_subset_list *, struct gcc_options *); +extern bool +riscv_ext_is_subset (struct cl_target_option *, struct cl_target_option *); +extern int riscv_x_target_flags_isa_mask (void); #endif /* ! GCC_RISCV_SUBSET_H */ diff --git a/gcc/config/riscv/riscv-target-attr.cc b/gcc/config/riscv/riscv-target-attr.cc index bf14ade..8ce9607 100644 --- a/gcc/config/riscv/riscv-target-attr.cc +++ b/gcc/config/riscv/riscv-target-attr.cc @@ -304,35 +304,13 @@ num_occurrences_in_str (char c, char *str) return res; } -/* Parse the tree in ARGS that contains the target attribute information +/* Parse the string in ARGS that contains the target attribute information and update the global target options space. */ -static bool -riscv_process_target_attr (tree args, location_t loc) +bool +riscv_process_target_attr (const char *args, location_t loc) { - if (TREE_CODE (args) == TREE_LIST) - { - do - { - tree head = TREE_VALUE (args); - if (head) - { - if (!riscv_process_target_attr (head, loc)) - return false; - } - args = TREE_CHAIN (args); - } while (args); - - return true; - } - - if (TREE_CODE (args) != STRING_CST) - { - error_at (loc, "attribute %<target%> argument not a string"); - return false; - } - - size_t len = strlen (TREE_STRING_POINTER (args)); + size_t len = strlen (args); /* No need to emit warning or error on empty string here, generic code already handle this case. */ @@ -343,7 +321,7 @@ riscv_process_target_attr (tree args, location_t loc) std::unique_ptr<char[]> buf (new char[len+1]); char *str_to_check = buf.get (); - strcpy (str_to_check, TREE_STRING_POINTER (args)); + strcpy (str_to_check, args); /* Used to catch empty spaces between semi-colons i.e. attribute ((target ("attr1;;attr2"))). 
*/ @@ -366,7 +344,7 @@ riscv_process_target_attr (tree args, location_t loc) if (num_attrs != num_semicolons + 1) { error_at (loc, "malformed %<target(\"%s\")%> attribute", - TREE_STRING_POINTER (args)); + args); return false; } @@ -376,6 +354,37 @@ riscv_process_target_attr (tree args, location_t loc) return true; } +/* Parse the tree in ARGS that contains the target attribute information + and update the global target options space. */ + +static bool +riscv_process_target_attr (tree args, location_t loc) +{ + if (TREE_CODE (args) == TREE_LIST) + { + do + { + tree head = TREE_VALUE (args); + if (head) + { + if (!riscv_process_target_attr (head, loc)) + return false; + } + args = TREE_CHAIN (args); + } while (args); + + return true; + } + + if (TREE_CODE (args) != STRING_CST) + { + error_at (loc, "attribute %<target%> argument not a string"); + return false; + } + + return riscv_process_target_attr (TREE_STRING_POINTER (args), loc); +} + /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to process attribute ((target ("..."))). Note, that riscv_set_current_function() has not been called before, diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index c8960366..5e728f0 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -51,6 +51,7 @@ #include "targhooks.h" #include "predict.h" #include "errors.h" +#include "riscv-v.h" using namespace riscv_vector; @@ -436,58 +437,6 @@ emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl) e.emit_insn ((enum insn_code) icode, ops); } -class rvv_builder : public rtx_vector_builder -{ -public: - rvv_builder () : rtx_vector_builder () {} - rvv_builder (machine_mode mode, unsigned int npatterns, - unsigned int nelts_per_pattern) - : rtx_vector_builder (mode, npatterns, nelts_per_pattern) - { - m_inner_mode = GET_MODE_INNER (mode); - m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode); - m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode); - m_mask_mode = get_mask_mode (mode); - - gcc_assert ( - int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode)); - m_int_mode - = get_vector_mode (m_inner_int_mode, GET_MODE_NUNITS (mode)).require (); - } - - bool can_duplicate_repeating_sequence_p (); - bool is_repeating_sequence (); - rtx get_merged_repeating_sequence (); - - bool repeating_sequence_use_merge_profitable_p (); - bool combine_sequence_use_slideup_profitable_p (); - bool combine_sequence_use_merge_profitable_p (); - rtx get_merge_scalar_mask (unsigned int, machine_mode) const; - - bool single_step_npatterns_p () const; - bool npatterns_all_equal_p () const; - bool interleaved_stepped_npatterns_p () const; - bool npatterns_vid_diff_repeated_p () const; - - machine_mode new_mode () const { return m_new_mode; } - scalar_mode inner_mode () const { return m_inner_mode; } - scalar_int_mode inner_int_mode () const { return m_inner_int_mode; } - machine_mode mask_mode () const { return m_mask_mode; } - machine_mode int_mode () const { return m_int_mode; } - unsigned int inner_bits_size () const { return m_inner_bits_size; } - unsigned int inner_bytes_size () const { return m_inner_bytes_size; } - -private: - scalar_mode m_inner_mode; - scalar_int_mode m_inner_int_mode; - machine_mode m_new_mode; - scalar_int_mode m_new_inner_mode; - machine_mode m_mask_mode; - machine_mode m_int_mode; - unsigned int m_inner_bits_size; - unsigned int m_inner_bytes_size; -}; - /* Return true if the vector duplicated by a super element which is the fusion of consecutive elements. 
@@ -845,6 +794,15 @@ const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval) return true; } +/* Returns true if the vector's elements are all duplicates in + range -16 ~ 15 integer or 0.0 floating-point. */ + +bool +valid_vec_immediate_p (rtx x) +{ + return (satisfies_constraint_vi (x) || satisfies_constraint_Wc0 (x)); +} + /* Return a const vector of VAL. The VAL can be either const_int or const_poly_int. */ @@ -1146,30 +1104,92 @@ expand_vec_series (rtx dest, rtx base, rtx step, rtx vid) emit_move_insn (dest, result); } +/* Subroutine of riscv_vector_expand_vector_init. + Works as follows: + (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. + (b) Skip leading elements from BUILDER, which are the same as + element NELTS_REQD - 1. + (c) Insert earlier elements in reverse order in TARGET using vslide1down. */ + static void -expand_const_vector (rtx target, rtx src) +expand_vector_init_insert_elems (rtx target, const rvv_builder &builder, + int nelts_reqd) { machine_mode mode = GET_MODE (target); - if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) + rtx dup = expand_vector_broadcast (mode, builder.elt (0)); + emit_move_insn (target, dup); + int ndups = builder.count_dups (0, nelts_reqd - 1, 1); + for (int i = ndups; i < nelts_reqd; i++) { - rtx elt; - gcc_assert ( - const_vec_duplicate_p (src, &elt) - && (rtx_equal_p (elt, const0_rtx) || rtx_equal_p (elt, const1_rtx))); - rtx ops[] = {target, src}; - emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops); - return; + unsigned int unspec + = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN; + insn_code icode = code_for_pred_slide (unspec, mode); + rtx ops[] = {target, target, builder.elt (i)}; + emit_vlmax_insn (icode, BINARY_OP, ops); } +} + +/* Subroutine of expand_vec_init to handle case + when all trailing elements of builder are same. + This works as follows: + (a) Use expand_insn interface to broadcast last vector element in TARGET. + (b) Insert remaining elements in TARGET using insr. + + ??? The heuristic used is to do above if number of same trailing elements + is greater than leading_ndups, loosely based on + heuristic from mostly_zeros_p. May need fine-tuning. */ +static bool +expand_vector_init_trailing_same_elem (rtx target, + const rtx_vector_builder &builder, + int nelts_reqd) +{ + int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1); + int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1); + machine_mode mode = GET_MODE (target); + + if (trailing_ndups > leading_ndups) + { + rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1)); + for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--) + { + unsigned int unspec + = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; + insn_code icode = code_for_pred_slide (unspec, mode); + rtx tmp = gen_reg_rtx (mode); + rtx ops[] = {tmp, dup, builder.elt (i)}; + emit_vlmax_insn (icode, BINARY_OP, ops); + /* slide1up need source and dest to be different REG. */ + dup = tmp; + } + + emit_move_insn (target, dup); + return true; + } + + return false; +} + +static void +expand_const_vector (rtx target, rtx src) +{ + machine_mode mode = GET_MODE (target); + rtx result = register_operand (target, mode) ? target : gen_reg_rtx (mode); rtx elt; if (const_vec_duplicate_p (src, &elt)) { - rtx tmp = register_operand (target, mode) ? 
target : gen_reg_rtx (mode); + if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) + { + gcc_assert (rtx_equal_p (elt, const0_rtx) + || rtx_equal_p (elt, const1_rtx)); + rtx ops[] = {result, src}; + emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops); + } /* Element in range -16 ~ 15 integer or 0.0 floating-point, we use vmv.v.i instruction. */ - if (satisfies_constraint_vi (src) || satisfies_constraint_Wc0 (src)) + else if (valid_vec_immediate_p (src)) { - rtx ops[] = {tmp, src}; + rtx ops[] = {result, src}; emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops); } else @@ -1186,7 +1206,7 @@ expand_const_vector (rtx target, rtx src) instruction (vsetvl a5, zero). */ if (lra_in_progress) { - rtx ops[] = {tmp, elt}; + rtx ops[] = {result, elt}; emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops); } else @@ -1194,15 +1214,15 @@ expand_const_vector (rtx target, rtx src) struct expand_operand ops[2]; enum insn_code icode = optab_handler (vec_duplicate_optab, mode); gcc_assert (icode != CODE_FOR_nothing); - create_output_operand (&ops[0], tmp, mode); + create_output_operand (&ops[0], result, mode); create_input_operand (&ops[1], elt, GET_MODE_INNER (mode)); expand_insn (icode, 2, ops); - tmp = ops[0].value; + result = ops[0].value; } } - if (tmp != target) - emit_move_insn (target, tmp); + if (result != target) + emit_move_insn (target, result); return; } @@ -1210,7 +1230,10 @@ expand_const_vector (rtx target, rtx src) rtx base, step; if (const_vec_series_p (src, &base, &step)) { - expand_vec_series (target, base, step); + expand_vec_series (result, base, step); + + if (result != target) + emit_move_insn (target, result); return; } @@ -1243,7 +1266,7 @@ expand_const_vector (rtx target, rtx src) all element equal to 0x0706050403020100. */ rtx ele = builder.get_merged_repeating_sequence (); rtx dup = expand_vector_broadcast (builder.new_mode (), ele); - emit_move_insn (target, gen_lowpart (mode, dup)); + emit_move_insn (result, gen_lowpart (mode, dup)); } else { @@ -1272,8 +1295,8 @@ expand_const_vector (rtx target, rtx src) emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()), BINARY_OP, and_ops); - rtx tmp = gen_reg_rtx (builder.mode ()); - rtx dup_ops[] = {tmp, builder.elt (0)}; + rtx tmp1 = gen_reg_rtx (builder.mode ()); + rtx dup_ops[] = {tmp1, builder.elt (0)}; emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), UNARY_OP, dup_ops); for (unsigned int i = 1; i < builder.npatterns (); i++) @@ -1285,12 +1308,12 @@ expand_const_vector (rtx target, rtx src) /* Merge scalar to each i. */ rtx tmp2 = gen_reg_rtx (builder.mode ()); - rtx merge_ops[] = {tmp2, tmp, builder.elt (i), mask}; + rtx merge_ops[] = {tmp2, tmp1, builder.elt (i), mask}; insn_code icode = code_for_pred_merge_scalar (builder.mode ()); emit_vlmax_insn (icode, MERGE_OP, merge_ops); - tmp = tmp2; + tmp1 = tmp2; } - emit_move_insn (target, tmp); + emit_move_insn (result, tmp1); } } else if (CONST_VECTOR_STEPPED_P (src)) @@ -1312,25 +1335,61 @@ expand_const_vector (rtx target, rtx src) /* Generate the variable-length vector following this rule: { a, a, a + step, a + step, a + step * 2, a + step * 2, ...} E.g. { 0, 0, 8, 8, 16, 16, ... } */ - /* We want to create a pattern where value[ix] = floor (ix / + + /* We want to create a pattern where value[idx] = floor (idx / NPATTERNS). As NPATTERNS is always a power of two we can - rewrite this as = ix & -NPATTERNS. */ + rewrite this as = idx & -NPATTERNS. */ /* Step 2: VID AND -NPATTERNS: { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... 
} */ rtx imm = gen_int_mode (-builder.npatterns (), builder.inner_mode ()); - rtx tmp = gen_reg_rtx (builder.mode ()); - rtx and_ops[] = {tmp, vid, imm}; + rtx tmp1 = gen_reg_rtx (builder.mode ()); + rtx and_ops[] = {tmp1, vid, imm}; icode = code_for_pred_scalar (AND, builder.mode ()); emit_vlmax_insn (icode, BINARY_OP, and_ops); + + /* Step 3: Convert to step size 1. */ + rtx tmp2 = gen_reg_rtx (builder.mode ()); + /* log2 (npatterns) to get the shift amount to convert + Eg. { 0, 0, 0, 0, 4, 4, ... } + into { 0, 0, 0, 0, 1, 1, ... }. */ + HOST_WIDE_INT shift_amt = exact_log2 (builder.npatterns ()) ; + rtx shift = gen_int_mode (shift_amt, builder.inner_mode ()); + rtx shift_ops[] = {tmp2, tmp1, shift}; + icode = code_for_pred_scalar (ASHIFTRT, builder.mode ()); + emit_vlmax_insn (icode, BINARY_OP, shift_ops); + + /* Step 4: Multiply to step size n. */ + HOST_WIDE_INT step_size = + INTVAL (builder.elt (builder.npatterns ())) + - INTVAL (builder.elt (0)); + rtx tmp3 = gen_reg_rtx (builder.mode ()); + if (pow2p_hwi (step_size)) + { + /* Power of 2 can be handled with a left shift. */ + HOST_WIDE_INT shift = exact_log2 (step_size); + rtx shift_amount = gen_int_mode (shift, Pmode); + insn_code icode = code_for_pred_scalar (ASHIFT, mode); + rtx ops[] = {tmp3, tmp2, shift_amount}; + emit_vlmax_insn (icode, BINARY_OP, ops); + } + else + { + rtx mult_amt = gen_int_mode (step_size, builder.inner_mode ()); + insn_code icode = code_for_pred_scalar (MULT, builder.mode ()); + rtx ops[] = {tmp3, tmp2, mult_amt}; + emit_vlmax_insn (icode, BINARY_OP, ops); + } + + /* Step 5: Add starting value to all elements. */ HOST_WIDE_INT init_val = INTVAL (builder.elt (0)); if (init_val == 0) - emit_move_insn (target, tmp); + emit_move_insn (result, tmp3); else { rtx dup = gen_const_vector_dup (builder.mode (), init_val); - rtx add_ops[] = {target, tmp, dup}; + rtx add_ops[] = {result, tmp3, dup}; icode = code_for_pred (PLUS, builder.mode ()); emit_vlmax_insn (icode, BINARY_OP, add_ops); } @@ -1360,7 +1419,7 @@ expand_const_vector (rtx target, rtx src) /* Step 2: Generate result = VID + diff. */ rtx vec = v.build (); - rtx add_ops[] = {target, vid, vec}; + rtx add_ops[] = {result, vid, vec}; emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()), BINARY_OP, add_ops); } @@ -1376,24 +1435,24 @@ expand_const_vector (rtx target, rtx src) v.quick_push (builder.elt (i)); rtx new_base = v.build (); - /* Step 2: Generate tmp = VID >> LOG2 (NPATTERNS). */ + /* Step 2: Generate tmp1 = VID >> LOG2 (NPATTERNS). */ rtx shift_count = gen_int_mode (exact_log2 (builder.npatterns ()), builder.inner_mode ()); - rtx tmp = expand_simple_binop (builder.mode (), LSHIFTRT, + rtx tmp1 = expand_simple_binop (builder.mode (), LSHIFTRT, vid, shift_count, NULL_RTX, false, OPTAB_DIRECT); - /* Step 3: Generate tmp2 = tmp * step. */ + /* Step 3: Generate tmp2 = tmp1 * step. */ rtx tmp2 = gen_reg_rtx (builder.mode ()); rtx step = simplify_binary_operation (MINUS, builder.inner_mode (), builder.elt (v.npatterns()), builder.elt (0)); - expand_vec_series (tmp2, const0_rtx, step, tmp); + expand_vec_series (tmp2, const0_rtx, step, tmp1); - /* Step 4: Generate target = tmp2 + new_base. */ - rtx add_ops[] = {target, tmp2, new_base}; + /* Step 4: Generate result = tmp2 + new_base. 
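Reviewer note: steps 2-5 above compute init + step * (vid / npatterns) without a division, relying on npatterns being a power of two. A scalar C model of the per-element arithmetic, assuming 8 elements, npatterns = 4 and step = 8:

#include <stdio.h>

int
main (void)
{
  const int npatterns = 4;       /* power of two */
  const int step = 8;            /* elt[npatterns] - elt[0] */
  const int init = 3;            /* elt[0] */
  const int log2_np = 2;         /* exact_log2 (npatterns) */

  for (int vid = 0; vid < 8; vid++)
    {
      int t1 = vid & -npatterns; /* Step 2: round vid down to a multiple of np */
      int t2 = t1 >> log2_np;    /* Step 3: convert to step size 1 */
      int t3 = t2 * step;        /* Step 4: scale to step size n (shift if pow2) */
      int val = t3 + init;       /* Step 5: add the starting value */
      int ref = init + step * (vid / npatterns);
      printf ("vid=%d val=%d ref=%d\n", vid, val, ref);
    }
  return 0;
}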
*/ + rtx add_ops[] = {result, tmp2, new_base}; emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()), BINARY_OP, add_ops); } @@ -1426,13 +1485,13 @@ expand_const_vector (rtx target, rtx src) if (int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode) && get_vector_mode (new_smode, new_nunits).exists (&new_mode)) { - rtx tmp = gen_reg_rtx (new_mode); + rtx tmp1 = gen_reg_rtx (new_mode); base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode); - expand_vec_series (tmp, base1, gen_int_mode (step1, new_smode)); + expand_vec_series (tmp1, base1, gen_int_mode (step1, new_smode)); if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0)) /* { 1, 0, 2, 0, ... }. */ - emit_move_insn (target, gen_lowpart (mode, tmp)); + emit_move_insn (result, gen_lowpart (mode, tmp1)); else if (known_eq (step2, 0)) { /* { 1, 1, 2, 1, ... }. */ @@ -1442,10 +1501,10 @@ expand_const_vector (rtx target, rtx src) gen_int_mode (builder.inner_bits_size (), new_smode), NULL_RTX, false, OPTAB_DIRECT); rtx tmp2 = gen_reg_rtx (new_mode); - rtx and_ops[] = {tmp2, tmp, scalar}; - emit_vlmax_insn (code_for_pred_scalar (AND, new_mode), - BINARY_OP, and_ops); - emit_move_insn (target, gen_lowpart (mode, tmp2)); + rtx ior_ops[] = {tmp2, tmp1, scalar}; + emit_vlmax_insn (code_for_pred_scalar (IOR, new_mode), + BINARY_OP, ior_ops); + emit_move_insn (result, gen_lowpart (mode, tmp2)); } else { @@ -1459,10 +1518,10 @@ expand_const_vector (rtx target, rtx src) gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX, false, OPTAB_DIRECT); rtx tmp3 = gen_reg_rtx (new_mode); - rtx ior_ops[] = {tmp3, tmp, shifted_tmp2}; + rtx ior_ops[] = {tmp3, tmp1, shifted_tmp2}; emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP, ior_ops); - emit_move_insn (target, gen_lowpart (mode, tmp3)); + emit_move_insn (result, gen_lowpart (mode, tmp3)); } } else @@ -1490,7 +1549,7 @@ expand_const_vector (rtx target, rtx src) rtx mask = gen_reg_rtx (builder.mask_mode ()); expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode)); - rtx ops[] = {target, tmp1, tmp2, mask}; + rtx ops[] = {result, tmp1, tmp2, mask}; emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops); } } @@ -1500,6 +1559,9 @@ expand_const_vector (rtx target, rtx src) } else gcc_unreachable (); + + if (result != target) + emit_move_insn (target, result); } /* Get the frm mode with given CONST_INT rtx, the default mode is @@ -1828,6 +1890,18 @@ get_mask_mode (machine_mode mode) return get_vector_mode (BImode, nunits).require (); } +/* Return the appropriate LMUL mode for MODE. */ + +opt_machine_mode +get_lmul_mode (scalar_mode mode, int lmul) +{ + poly_uint64 lmul_nunits; + unsigned int bytes = GET_MODE_SIZE (mode); + if (multiple_p (BYTES_PER_RISCV_VECTOR * lmul, bytes, &lmul_nunits)) + return get_vector_mode (mode, lmul_nunits); + return E_VOIDmode; +} + /* Return the appropriate M1 mode for MODE. */ static opt_machine_mode @@ -2342,31 +2416,6 @@ preferred_simd_mode (scalar_mode mode) return word_mode; } -/* Subroutine of riscv_vector_expand_vector_init. - Works as follows: - (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. - (b) Skip leading elements from BUILDER, which are the same as - element NELTS_REQD - 1. - (c) Insert earlier elements in reverse order in TARGET using vslide1down. 
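Reviewer note: the { 1, 1, 2, 1, ... } case above builds the interleaved constant by generating one series in double-width elements and then combining the constant second series into the high halves, which is why the hunk switches the combining operation from AND to IOR. A scalar C model of the trick, assuming a little-endian u16/u32 mode pair:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* Target u16 pattern { b1, b2, b1+s1, b2, b1+2*s1, b2, ... }:
     the second series is constant (step 0), so it can be OR-ed
     into the high halves of a double-width series.  */
  const uint16_t b1 = 1, b2 = 100, s1 = 3;
  uint32_t wide[4];
  uint16_t narrow[8];

  for (int j = 0; j < 4; j++)
    wide[j] = (uint32_t) (b1 + j * s1);   /* series in u32 elements */

  for (int j = 0; j < 4; j++)
    wide[j] |= (uint32_t) b2 << 16;       /* the IOR-with-scalar step */

  /* View the u32 vector as u16 elements (little-endian lowpart view).  */
  for (int j = 0; j < 4; j++)
    {
      narrow[2 * j] = (uint16_t) wide[j];
      narrow[2 * j + 1] = (uint16_t) (wide[j] >> 16);
    }

  for (int i = 0; i < 8; i++)
    printf ("%u ", narrow[i]);            /* 1 100 4 100 7 100 10 100 */
  printf ("\n");
  return 0;
}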
*/ - -static void -expand_vector_init_insert_elems (rtx target, const rvv_builder &builder, - int nelts_reqd) -{ - machine_mode mode = GET_MODE (target); - rtx dup = expand_vector_broadcast (mode, builder.elt (0)); - emit_move_insn (target, dup); - int ndups = builder.count_dups (0, nelts_reqd - 1, 1); - for (int i = ndups; i < nelts_reqd; i++) - { - unsigned int unspec - = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN; - insn_code icode = code_for_pred_slide (unspec, mode); - rtx ops[] = {target, target, builder.elt (i)}; - emit_vlmax_insn (icode, BINARY_OP, ops); - } -} - /* Use merge approach to initialize the vector with repeating sequence. v = {a, b, a, b, a, b, a, b}. @@ -2491,47 +2540,6 @@ expand_vector_init_merge_combine_sequence (rtx target, emit_vlmax_insn (icode, MERGE_OP, merge_ops); } -/* Subroutine of expand_vec_init to handle case - when all trailing elements of builder are same. - This works as follows: - (a) Use expand_insn interface to broadcast last vector element in TARGET. - (b) Insert remaining elements in TARGET using insr. - - ??? The heuristic used is to do above if number of same trailing elements - is greater than leading_ndups, loosely based on - heuristic from mostly_zeros_p. May need fine-tuning. */ - -static bool -expand_vector_init_trailing_same_elem (rtx target, - const rtx_vector_builder &builder, - int nelts_reqd) -{ - int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1); - int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1); - machine_mode mode = GET_MODE (target); - - if (trailing_ndups > leading_ndups) - { - rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1)); - for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--) - { - unsigned int unspec - = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; - insn_code icode = code_for_pred_slide (unspec, mode); - rtx tmp = gen_reg_rtx (mode); - rtx ops[] = {tmp, dup, builder.elt (i)}; - emit_vlmax_insn (icode, BINARY_OP, ops); - /* slide1up need source and dest to be different REG. */ - dup = tmp; - } - - emit_move_insn (target, dup); - return true; - } - - return false; -} - /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */ void @@ -3825,6 +3833,58 @@ expand_load_store (rtx *ops, bool is_load) } } +/* Expand MASK_LEN_STRIDED_LOAD. */ +void +expand_strided_load (machine_mode mode, rtx *ops) +{ + rtx v_reg = ops[0]; + rtx base = ops[1]; + rtx stride = ops[2]; + rtx mask = ops[3]; + rtx len = ops[4]; + poly_int64 len_val; + + insn_code icode = code_for_pred_strided_load (mode); + rtx emit_ops[] = {v_reg, mask, gen_rtx_MEM (mode, base), stride}; + + if (poly_int_rtx_p (len, &len_val) + && known_eq (len_val, GET_MODE_NUNITS (mode))) + emit_vlmax_insn (icode, BINARY_OP_TAMA, emit_ops); + else + { + len = satisfies_constraint_K (len) ? len : force_reg (Pmode, len); + emit_nonvlmax_insn (icode, BINARY_OP_TAMA, emit_ops, len); + } +} + +/* Expand MASK_LEN_STRIDED_STORE. */ +void +expand_strided_store (machine_mode mode, rtx *ops) +{ + rtx v_reg = ops[2]; + rtx base = ops[0]; + rtx stride = ops[1]; + rtx mask = ops[3]; + rtx len = ops[4]; + poly_int64 len_val; + rtx vl_type; + + if (poly_int_rtx_p (len, &len_val) + && known_eq (len_val, GET_MODE_NUNITS (mode))) + { + len = gen_reg_rtx (Pmode); + emit_vlmax_vsetvl (mode, len); + vl_type = get_avl_type_rtx (VLMAX); + } + else + { + len = satisfies_constraint_K (len) ? 
len : force_reg (Pmode, len); + vl_type = get_avl_type_rtx (NONVLMAX); + } + + emit_insn (gen_pred_strided_store (mode, gen_rtx_MEM (mode, base), + mask, stride, v_reg, len, vl_type)); +} /* Return true if the operation is the floating-point operation need FRM. */ static bool @@ -4888,6 +4948,15 @@ expand_vec_usadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode) emit_vec_binary_alu (op_0, op_1, op_2, US_PLUS, vec_mode); } +/* Expand the standard name ssadd<mode>3 for vector mode, we can leverage + the vector fixed point vector single-width saturating add directly. */ + +void +expand_vec_ssadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode) +{ + emit_vec_binary_alu (op_0, op_1, op_2, SS_PLUS, vec_mode); +} + /* Expand the standard name usadd<mode>3 for vector mode, we can leverage the vector fixed point vector single-width saturating add directly. */ @@ -4897,6 +4966,15 @@ expand_vec_ussub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode) emit_vec_binary_alu (op_0, op_1, op_2, US_MINUS, vec_mode); } +/* Expand the standard name ssadd<mode>3 for vector mode, we can leverage + the vector fixed point vector single-width saturating add directly. */ + +void +expand_vec_sssub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode) +{ + emit_vec_binary_alu (op_0, op_1, op_2, SS_MINUS, vec_mode); +} + /* Expand the standard name ustrunc<m><n>2 for double vector mode, like DI => SI. we can leverage the vector fixed point vector narrowing fixed-point clip directly. */ @@ -4913,6 +4991,22 @@ expand_vec_double_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode) emit_vlmax_insn (icode, BINARY_OP_VXRM_RNU, ops); } +/* Expand the standard name sstrunc<m><n>2 for double vector mode, like + DI => SI. we can leverage the vector fixed point vector narrowing + fixed-point clip directly. */ + +void +expand_vec_double_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode) +{ + insn_code icode; + rtx zero = CONST0_RTX (Xmode); + enum unspec unspec = UNSPEC_VNCLIP; + rtx ops[] = {op_0, op_1, zero}; + + icode = code_for_pred_narrow_clip_scalar (unspec, vec_mode); + emit_vlmax_insn (icode, BINARY_OP_VXRM_RNU, ops); +} + /* Expand the standard name ustrunc<m><n>2 for double vector mode, like DI => HI. we can leverage the vector fixed point vector narrowing fixed-point clip directly. */ @@ -4927,6 +5021,20 @@ expand_vec_quad_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode, expand_vec_double_ustrunc (op_0, double_rtx, double_mode); } +/* Expand the standard name sstrunc<m><n>2 for quad vector mode, like + DI => HI. we can leverage the vector fixed point vector narrowing + fixed-point clip directly. */ + +void +expand_vec_quad_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode, + machine_mode double_mode) +{ + rtx double_rtx = gen_reg_rtx (double_mode); + + expand_vec_double_sstrunc (double_rtx, op_1, vec_mode); + expand_vec_double_sstrunc (op_0, double_rtx, double_mode); +} + /* Expand the standard name ustrunc<m><n>2 for double vector mode, like DI => QI. we can leverage the vector fixed point vector narrowing fixed-point clip directly. */ @@ -4943,6 +5051,22 @@ expand_vec_oct_ustrunc (rtx op_0, rtx op_1, machine_mode vec_mode, expand_vec_double_ustrunc (op_0, quad_rtx, quad_mode); } +/* Expand the standard name sstrunc<m><n>2 for oct vector mode, like + DI => QI. we can leverage the vector fixed point vector narrowing + fixed-point clip directly. 
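Reviewer note: the quad and oct variants simply chain the double-narrowing clip, which works because saturating to a narrower signed range after saturating to a wider one equals saturating directly. A scalar C check of that equivalence for DI -> QI; since the expanders pass a zero shift amount to the narrowing clip, modelling vnclip as a plain signed clamp is a fair simplification here:

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

static int64_t
clamp (int64_t x, int64_t lo, int64_t hi)
{
  return x < lo ? lo : x > hi ? hi : x;
}

int
main (void)
{
  const int64_t tests[] = { 0, 127, 128, -128, -129, 40000, -40000,
                            2147483648LL, INT64_MAX, INT64_MIN };
  for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++)
    {
      int64_t x = tests[i];
      /* DI -> SI -> HI -> QI, as expand_vec_oct_sstrunc chains it.  */
      int64_t chained = clamp (clamp (clamp (x, INT32_MIN, INT32_MAX),
                                      INT16_MIN, INT16_MAX),
                               INT8_MIN, INT8_MAX);
      /* Direct DI -> QI saturation.  */
      int64_t direct = clamp (x, INT8_MIN, INT8_MAX);
      printf ("%" PRId64 ": chained=%" PRId64 " direct=%" PRId64 "%s\n",
              x, chained, direct, chained == direct ? "" : "  MISMATCH");
    }
  return 0;
}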
*/ + +void +expand_vec_oct_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode, + machine_mode double_mode, machine_mode quad_mode) +{ + rtx double_rtx = gen_reg_rtx (double_mode); + rtx quad_rtx = gen_reg_rtx (quad_mode); + + expand_vec_double_sstrunc (double_rtx, op_1, vec_mode); + expand_vec_double_sstrunc (quad_rtx, double_rtx, double_mode); + expand_vec_double_sstrunc (op_0, quad_rtx, quad_mode); +} + /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as well. */ void diff --git a/gcc/config/riscv/riscv-v.h b/gcc/config/riscv/riscv-v.h new file mode 100644 index 0000000..e7b095f --- /dev/null +++ b/gcc/config/riscv/riscv-v.h @@ -0,0 +1,90 @@ +/* Subroutines used for code generation for RISC-V 'V' Extension for + GNU compiler. + Copyright (C) 2022-2024 Free Software Foundation, Inc. + Contributed by Juzhe Zhong (juzhe.zhong@rivai.ai), RiVAI Technologies Ltd. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +#ifndef GCC_RISCV_V_H +#define GCC_RISCV_V_H + +#include "rtx-vector-builder.h" + +using namespace riscv_vector; + +namespace riscv_vector { + +extern machine_mode get_mask_mode (machine_mode); +extern opt_machine_mode get_vector_mode (scalar_mode, poly_uint64); + +class rvv_builder : public rtx_vector_builder +{ +public: + rvv_builder () : rtx_vector_builder () {} + rvv_builder (machine_mode mode, unsigned int npatterns, + unsigned int nelts_per_pattern) + : rtx_vector_builder (mode, npatterns, nelts_per_pattern) + { + m_inner_mode = GET_MODE_INNER (mode); + m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode); + m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode); + m_mask_mode = get_mask_mode (mode); + + gcc_assert ( + int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode)); + m_int_mode + = get_vector_mode (m_inner_int_mode, GET_MODE_NUNITS (mode)).require (); + } + + bool can_duplicate_repeating_sequence_p (); + bool is_repeating_sequence (); + rtx get_merged_repeating_sequence (); + + bool repeating_sequence_use_merge_profitable_p (); + bool combine_sequence_use_slideup_profitable_p (); + bool combine_sequence_use_merge_profitable_p (); + rtx get_merge_scalar_mask (unsigned int, machine_mode) const; + + bool single_step_npatterns_p () const; + bool npatterns_all_equal_p () const; + bool interleaved_stepped_npatterns_p () const; + bool npatterns_vid_diff_repeated_p () const; + + machine_mode new_mode () const { return m_new_mode; } + scalar_mode inner_mode () const { return m_inner_mode; } + scalar_int_mode inner_int_mode () const { return m_inner_int_mode; } + machine_mode mask_mode () const { return m_mask_mode; } + machine_mode int_mode () const { return m_int_mode; } + unsigned int inner_bits_size () const { return m_inner_bits_size; } + unsigned int inner_bytes_size () const { return m_inner_bytes_size; } + +private: + scalar_mode m_inner_mode; + scalar_int_mode m_inner_int_mode; + machine_mode m_new_mode; + scalar_int_mode m_new_inner_mode; + 
machine_mode m_mask_mode; + machine_mode m_int_mode; + unsigned int m_inner_bits_size; + unsigned int m_inner_bytes_size; +}; + +extern bool valid_vec_immediate_p(rtx); + +} // namespace riscv_vector + +#endif // GCC_RISCV_V_H diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc b/gcc/config/riscv/riscv-vector-builtins-bases.cc index 193392f..b8c337f 100644 --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc @@ -1753,6 +1753,8 @@ public: rtx expand (function_expander &e) const override { + if (!e.target) + return NULL_RTX; tree arg = CALL_EXPR_ARG (e.exp, 0); rtx src = expand_normal (arg); emit_move_insn (gen_lowpart (e.vector_mode (), e.target), src); @@ -1767,6 +1769,8 @@ public: rtx expand (function_expander &e) const override { + if (!e.target) + return NULL_RTX; rtx src = expand_normal (CALL_EXPR_ARG (e.exp, 0)); emit_move_insn (e.target, gen_lowpart (GET_MODE (e.target), src)); return e.target; @@ -2247,7 +2251,7 @@ public: { return (CODE == CLZ || CODE == CTZ) ? false : true; } - + rtx expand (function_expander &e) const override { switch (e.op_info->op) diff --git a/gcc/config/riscv/riscv-vector-builtins-shapes.cc b/gcc/config/riscv/riscv-vector-builtins-shapes.cc index 3339541..22cbbc2 100644 --- a/gcc/config/riscv/riscv-vector-builtins-shapes.cc +++ b/gcc/config/riscv/riscv-vector-builtins-shapes.cc @@ -1268,7 +1268,7 @@ struct crypto_vv_no_op_type_def : public build_base if (overloaded_p && !instance.base->can_be_overloaded_p (instance.pred)) return nullptr; b.append_base_name (instance.base_name); - + if (!overloaded_p) { b.append_name (operand_suffixes[instance.op_info->op]); diff --git a/gcc/config/riscv/riscv-vector-builtins.cc b/gcc/config/riscv/riscv-vector-builtins.cc index 41730c4..458d9b0 100644 --- a/gcc/config/riscv/riscv-vector-builtins.cc +++ b/gcc/config/riscv/riscv-vector-builtins.cc @@ -20,6 +20,7 @@ #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index a80e167..67b9e3e 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -20,6 +20,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #define INCLUDE_STRING #include "config.h" #include "system.h" @@ -193,7 +194,7 @@ compute_local_program_points ( /* Collect the stmts that is vectorized and mark their program point. */ for (i = 0; i < nbbs; i++) { - int point = 1; + unsigned int point = 1; basic_block bb = bbs[i]; vec<stmt_point> program_points = vNULL; if (dump_enabled_p ()) @@ -488,9 +489,15 @@ max_number_of_live_regs (loop_vec_info loop_vinfo, const basic_block bb, pair live_range = (*iter).second; for (i = live_range.first + 1; i <= live_range.second; i++) { - machine_mode mode = TREE_CODE (TREE_TYPE (var)) == BOOLEAN_TYPE - ? BImode - : TYPE_MODE (TREE_TYPE (var)); + machine_mode mode; + if (TREE_CODE (TREE_TYPE (var)) == BOOLEAN_TYPE) + mode = BImode; + /* Constants do not have a mode, just use the biggest so + compute_nregs will return 1. 
*/ + else if (TREE_CODE (var) == INTEGER_CST) + mode = biggest_mode; + else + mode = TYPE_MODE (TREE_TYPE (var)); unsigned int nregs = compute_nregs_for_mode (loop_vinfo, mode, biggest_mode, lmul); live_vars_vec[i] += nregs; diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 017efa8..0b53b20 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -63,6 +63,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 #define INCLUDE_ALGORITHM #define INCLUDE_FUNCTIONAL +#define INCLUDE_MEMORY #define INCLUDE_ARRAY #include "config.h" @@ -1002,6 +1003,9 @@ public: void parse_insn (insn_info *insn) { + /* The VL dest of the insn */ + rtx dest_vl = NULL_RTX; + m_insn = insn; m_bb = insn->bb (); /* Return if it is debug insn for the consistency with optimize == 0. */ @@ -1035,7 +1039,10 @@ public: if (m_avl) { if (vsetvl_insn_p (insn->rtl ()) || has_vlmax_avl ()) - m_vl = ::get_vl (insn->rtl ()); + { + m_vl = ::get_vl (insn->rtl ()); + dest_vl = m_vl; + } if (has_nonvlmax_reg_avl ()) m_avl_def = find_access (insn->uses (), REGNO (m_avl))->def (); @@ -1132,22 +1139,22 @@ public: } /* Determine if dest operand(vl) has been used by non-RVV instructions. */ - if (has_vl ()) + if (dest_vl) { const hash_set<use_info *> vl_uses - = get_all_real_uses (get_insn (), REGNO (get_vl ())); + = get_all_real_uses (get_insn (), REGNO (dest_vl)); for (use_info *use : vl_uses) { gcc_assert (use->insn ()->is_real ()); rtx_insn *rinsn = use->insn ()->rtl (); if (!has_vl_op (rinsn) - || count_regno_occurrences (rinsn, REGNO (get_vl ())) != 1) + || count_regno_occurrences (rinsn, REGNO (dest_vl)) != 1) { m_vl_used_by_non_rvv_insn = true; break; } rtx avl = ::get_avl (rinsn); - if (!avl || !REG_P (avl) || REGNO (get_vl ()) != REGNO (avl)) + if (!avl || !REG_P (avl) || REGNO (dest_vl) != REGNO (avl)) { m_vl_used_by_non_rvv_insn = true; break; @@ -2790,6 +2797,9 @@ pre_vsetvl::fuse_local_vsetvl_info () curr_info.dump (dump_file, " "); } m_dem.merge (prev_info, curr_info); + if (!curr_info.vl_used_by_non_rvv_insn_p () + && vsetvl_insn_p (curr_info.get_insn ()->rtl ())) + m_delete_list.safe_push (curr_info); if (curr_info.get_read_vl_insn ()) prev_info.set_read_vl_insn (curr_info.get_read_vl_insn ()); if (dump_file && (dump_flags & TDF_DETAILS)) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 90a6e93..2e9ac28 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -21,6 +21,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #define INCLUDE_STRING #include "config.h" #include "system.h" @@ -75,6 +76,7 @@ along with GCC; see the file COPYING3. If not see #include "gcse.h" #include "tree-dfa.h" #include "target-globals.h" +#include "riscv-v.h" /* This file should be included last. */ #include "target-def.h" @@ -293,6 +295,9 @@ struct riscv_tune_param bool overlap_op_by_pieces; unsigned int fusible_ops; const struct cpu_vector_cost *vec_costs; + const char *function_align; + const char *jump_align; + const char *loop_align; }; @@ -452,6 +457,9 @@ static const struct riscv_tune_param rocket_tune_info = { false, /* overlap_op_by_pieces */ RISCV_FUSE_NOTHING, /* fusible_ops */ NULL, /* vector cost */ + NULL, /* function_align */ + NULL, /* jump_align */ + NULL, /* loop_align */ }; /* Costs to use when optimizing for Sifive 7 Series. 
*/ @@ -471,6 +479,9 @@ static const struct riscv_tune_param sifive_7_tune_info = { false, /* overlap_op_by_pieces */ RISCV_FUSE_NOTHING, /* fusible_ops */ NULL, /* vector cost */ + NULL, /* function_align */ + NULL, /* jump_align */ + NULL, /* loop_align */ }; /* Costs to use when optimizing for Sifive p400 Series. */ @@ -490,6 +501,9 @@ static const struct riscv_tune_param sifive_p400_tune_info = { false, /* overlap_op_by_pieces */ RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI, /* fusible_ops */ &generic_vector_cost, /* vector cost */ + NULL, /* function_align */ + NULL, /* jump_align */ + NULL, /* loop_align */ }; /* Costs to use when optimizing for Sifive p600 Series. */ @@ -509,6 +523,9 @@ static const struct riscv_tune_param sifive_p600_tune_info = { false, /* overlap_op_by_pieces */ RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI, /* fusible_ops */ &generic_vector_cost, /* vector cost */ + NULL, /* function_align */ + NULL, /* jump_align */ + NULL, /* loop_align */ }; /* Costs to use when optimizing for T-HEAD c906. */ @@ -528,6 +545,9 @@ static const struct riscv_tune_param thead_c906_tune_info = { false, /* overlap_op_by_pieces */ RISCV_FUSE_NOTHING, /* fusible_ops */ NULL, /* vector cost */ + NULL, /* function_align */ + NULL, /* jump_align */ + NULL, /* loop_align */ }; /* Costs to use when optimizing for xiangshan nanhu. */ @@ -547,6 +567,9 @@ static const struct riscv_tune_param xiangshan_nanhu_tune_info = { false, /* overlap_op_by_pieces */ RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH, /* fusible_ops */ NULL, /* vector cost */ + NULL, /* function_align */ + NULL, /* jump_align */ + NULL, /* loop_align */ }; /* Costs to use when optimizing for a generic ooo profile. */ @@ -566,6 +589,9 @@ static const struct riscv_tune_param generic_ooo_tune_info = { true, /* overlap_op_by_pieces */ RISCV_FUSE_NOTHING, /* fusible_ops */ &generic_vector_cost, /* vector cost */ + NULL, /* function_align */ + NULL, /* jump_align */ + NULL, /* loop_align */ }; /* Costs to use when optimizing for size. */ @@ -585,6 +611,9 @@ static const struct riscv_tune_param optimize_size_tune_info = { false, /* overlap_op_by_pieces */ RISCV_FUSE_NOTHING, /* fusible_ops */ NULL, /* vector cost */ + NULL, /* function_align */ + NULL, /* jump_align */ + NULL, /* loop_align */ }; static bool riscv_avoid_shrink_wrapping_separate (); @@ -1229,6 +1258,152 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value, } } + else if (cost > 4 && TARGET_64BIT && can_create_pseudo_p () + && allow_new_pseudos) + { + struct riscv_integer_op alt_codes[RISCV_MAX_INTEGER_OPS]; + int alt_cost; + + unsigned HOST_WIDE_INT loval = value & 0xffffffff; + unsigned HOST_WIDE_INT hival = (value & ~loval) >> 32; + bool bit31 = (loval & 0x80000000) != 0; + int trailing_shift = ctz_hwi (loval) - ctz_hwi (hival); + int leading_shift = clz_hwi (loval) - clz_hwi (hival); + int shiftval = 0; + + /* Adjust the shift into the high half accordingly. */ + if ((trailing_shift > 0 && hival == (loval >> trailing_shift))) + shiftval = 32 - trailing_shift; + else if ((leading_shift > 0 && hival == (loval << leading_shift))) + shiftval = 32 + leading_shift; + + if (shiftval && !bit31) + alt_cost = 2 + riscv_build_integer_1 (alt_codes, sext_hwi (loval, 32), + mode); + + /* For constants where the upper half is a shift of the lower half we + can do a shift followed by an or. */ + if (shiftval && !bit31 && alt_cost < cost) + { + /* We need to save the first constant we build. 
*/ + alt_codes[alt_cost - 3].save_temporary = true; + + /* Now we want to shift the previously generated constant into the + high half. */ + alt_codes[alt_cost - 2].code = ASHIFT; + alt_codes[alt_cost - 2].value = shiftval; + alt_codes[alt_cost - 2].use_uw = false; + alt_codes[alt_cost - 2].save_temporary = false; + + /* And the final step, IOR the two halves together. Since this uses + the saved temporary, use CONCAT similar to what we do for Zbkb. */ + alt_codes[alt_cost - 1].code = CONCAT; + alt_codes[alt_cost - 1].value = 0; + alt_codes[alt_cost - 1].use_uw = false; + alt_codes[alt_cost - 1].save_temporary = false; + + memcpy (codes, alt_codes, sizeof (alt_codes)); + cost = alt_cost; + } + + if (cost > 4 && !bit31 && TARGET_ZBA) + { + int value = 0; + + /* Check for a shNadd. */ + if (hival == loval * 3) + value = 3; + else if (hival == loval * 5) + value = 5; + else if (hival == loval * 9) + value = 9; + + if (value) + alt_cost = 2 + riscv_build_integer_1 (alt_codes, + sext_hwi (loval, 32), mode); + + /* For constants where the upper half is a shNadd of the lower half + we can do a similar transformation. */ + if (value && alt_cost < cost) + { + alt_codes[alt_cost - 3].save_temporary = true; + alt_codes[alt_cost - 2].code = FMA; + alt_codes[alt_cost - 2].value = value; + alt_codes[alt_cost - 2].use_uw = false; + alt_codes[alt_cost - 2].save_temporary = false; + alt_codes[alt_cost - 1].code = CONCAT; + alt_codes[alt_cost - 1].value = 0; + alt_codes[alt_cost - 1].use_uw = false; + alt_codes[alt_cost - 1].save_temporary = false; + + memcpy (codes, alt_codes, sizeof (alt_codes)); + cost = alt_cost; + } + } + + if (cost > 4 && !bit31) + { + int value = hival - loval; + + /* For constants were the halves differ by less than 2048 we can + generate the upper half by using an addi on the lower half then + using a shift 32 followed by an or. */ + if (IN_RANGE (value, -2048, 2047)) + { + alt_cost = 3 + riscv_build_integer_1 (alt_codes, + sext_hwi (loval, 32), mode); + if (alt_cost < cost) + { + alt_codes[alt_cost - 4].save_temporary = true; + alt_codes[alt_cost - 3].code = PLUS; + alt_codes[alt_cost - 3].value = value; + alt_codes[alt_cost - 3].use_uw = false; + alt_codes[alt_cost - 3].save_temporary = false; + alt_codes[alt_cost - 2].code = ASHIFT; + alt_codes[alt_cost - 2].value = 32; + alt_codes[alt_cost - 2].use_uw = false; + alt_codes[alt_cost - 2].save_temporary = false; + alt_codes[alt_cost - 1].code = CONCAT; + alt_codes[alt_cost - 1].value = 0; + alt_codes[alt_cost - 1].use_uw = false; + alt_codes[alt_cost - 1].save_temporary = false; + + memcpy (codes, alt_codes, sizeof (alt_codes)); + cost = alt_cost; + } + } + } + + if (cost > 5 && !bit31) + { + /* For constants where the upper half is the lower half inverted we can flip + it with an xor and do a shift 32 followed by an or. 
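Reviewer note: all four new riscv_build_integer cases share one idea: if the upper 32 bits can be derived from the lower 32 bits by a cheap operation (shift, shNadd, small add, or bitwise not), synthesize the low half once, derive the high half from it, and combine. A small C classifier for the shapes being tested, assuming the bit31-clear precondition from the code above; the helper name is illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Report which "high half derived from low half" shape (if any) a
   64-bit constant matches, mirroring the checks in the hunks above.  */
static const char *
classify_hi_from_lo (uint64_t value)
{
  uint32_t loval = (uint32_t) value;
  uint32_t hival = (uint32_t) (value >> 32);

  if (loval == 0 || (loval & 0x80000000u))      /* bit 31 must be clear */
    return "not handled by these cases";

  for (int sh = 1; sh < 32; sh++)
    if (hival == (loval >> sh) || hival == (loval << sh))
      return "shift + or";
  if (hival == loval * 3 || hival == loval * 5 || hival == loval * 9)
    return "shNadd + or (Zba)";
  int64_t delta = (int64_t) hival - (int64_t) loval;
  if (delta >= -2048 && delta <= 2047)
    return "addi + slli 32 + or";
  if (hival == (~loval & 0xffffffffu))
    return "xor -1 + slli 32 + or";
  return "no shared-half shape";
}

int
main (void)
{
  printf ("%s\n", classify_hi_from_lo (0x0001234000001234ULL)); /* shift */
  printf ("%s\n", classify_hi_from_lo (0x0000500000001000ULL)); /* shNadd */
  printf ("%s\n", classify_hi_from_lo (0x0010000100100000ULL)); /* addi */
  printf ("%s\n", classify_hi_from_lo (0xff0f0f0f00f0f0f0ULL)); /* xor */
  return 0;
}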
*/ + if (hival == (~loval & 0xffffffff)) + { + alt_cost = 3 + riscv_build_integer_1 (alt_codes, + sext_hwi (loval, 32), mode); + if (alt_cost < cost) + { + alt_codes[alt_cost - 4].save_temporary = true; + alt_codes[alt_cost - 3].code = XOR; + alt_codes[alt_cost - 3].value = -1; + alt_codes[alt_cost - 3].use_uw = false; + alt_codes[alt_cost - 3].save_temporary = false; + alt_codes[alt_cost - 2].code = ASHIFT; + alt_codes[alt_cost - 2].value = 32; + alt_codes[alt_cost - 2].use_uw = false; + alt_codes[alt_cost - 2].save_temporary = false; + alt_codes[alt_cost - 1].code = CONCAT; + alt_codes[alt_cost - 1].value = 0; + alt_codes[alt_cost - 1].use_uw = false; + alt_codes[alt_cost - 1].save_temporary = false; + + memcpy (codes, alt_codes, sizeof (alt_codes)); + cost = alt_cost; + } + } + } + } return cost; } @@ -1240,18 +1415,20 @@ static int riscv_split_integer_cost (HOST_WIDE_INT val) { int cost; - unsigned HOST_WIDE_INT loval = sext_hwi (val, 32); - unsigned HOST_WIDE_INT hival = sext_hwi ((val - loval) >> 32, 32); + unsigned HOST_WIDE_INT loval = val & 0xffffffff; + unsigned HOST_WIDE_INT hival = (val & ~loval) >> 32; struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS]; /* This routine isn't used by pattern conditions, so whether or not to allow new pseudos can be a function of where we are in the - RTL pipeline. We shouldn't need scratch pseudos for this case - anyway. */ + RTL pipeline. */ bool allow_new_pseudos = can_create_pseudo_p (); cost = 2 + riscv_build_integer (codes, loval, VOIDmode, allow_new_pseudos); if (loval != hival) cost += riscv_build_integer (codes, hival, VOIDmode, allow_new_pseudos); + else if ((loval & 0x80000000) != 0) + cost = 3 + riscv_build_integer (codes, ~loval & 0xffffffff, + VOIDmode, allow_new_pseudos); return cost; } @@ -1274,11 +1451,16 @@ riscv_integer_cost (HOST_WIDE_INT val, bool allow_new_pseudos) static rtx riscv_split_integer (HOST_WIDE_INT val, machine_mode mode) { - unsigned HOST_WIDE_INT loval = sext_hwi (val, 32); - unsigned HOST_WIDE_INT hival = sext_hwi ((val - loval) >> 32, 32); + unsigned HOST_WIDE_INT loval = val & 0xffffffff; + unsigned HOST_WIDE_INT hival = (val & ~loval) >> 32; rtx hi = gen_reg_rtx (mode), lo = gen_reg_rtx (mode); + rtx x = gen_reg_rtx (mode); + bool eq_neg = (loval == hival) && ((loval & 0x80000000) != 0); - riscv_move_integer (lo, lo, loval, mode); + if (eq_neg) + riscv_move_integer (lo, lo, ~loval & 0xffffffff, mode); + else + riscv_move_integer (lo, lo, loval, mode); if (loval == hival) hi = gen_rtx_ASHIFT (mode, lo, GEN_INT (32)); @@ -1289,7 +1471,13 @@ riscv_split_integer (HOST_WIDE_INT val, machine_mode mode) } hi = force_reg (mode, hi); - return gen_rtx_PLUS (mode, hi, lo); + x = gen_rtx_PLUS (mode, hi, lo); + if (eq_neg) + { + x = force_reg (mode, x); + x = gen_rtx_XOR (mode, x, GEN_INT (-1)); + } + return x; } /* Return true if X is a thread-local symbol. */ @@ -2142,18 +2330,13 @@ riscv_const_insns (rtx x, bool allow_new_pseudos) ...etc. */ if (riscv_v_ext_mode_p (GET_MODE (x))) { - /* const series vector. */ - rtx base, step; - if (const_vec_series_p (x, &base, &step)) - { - /* This is not accurate, we will need to adapt the COST - * accurately according to BASE && STEP. */ - return 1; - } - rtx elt; if (const_vec_duplicate_p (x, &elt)) { + if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL) + /* Duplicate values of 0/1 can be emitted using vmv.v.i. 
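Reviewer note: the new eq_neg path in riscv_split_integer handles constants whose two halves are identical and negative as 32-bit values by materializing the complement of the half (a cheaper positive constant), duplicating it with a shift-and-add, and restoring the original with a final xor -1. A quick C check of the identity, assuming 64-bit two's complement:

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

int
main (void)
{
  /* Both halves identical and with bit 31 set.  */
  uint64_t val = 0xfffff00dfffff00dULL;
  uint32_t loval = (uint32_t) val;

  uint64_t lo = ~loval & 0xffffffffULL;  /* cheap positive constant */
  uint64_t x = (lo << 32) + lo;          /* duplicate into both halves */
  x ^= ~0ULL;                            /* xor -1 restores the value */

  printf ("%#" PRIx64 " -> %#" PRIx64 " (%s)\n", val, x,
          x == val ? "match" : "MISMATCH");
  return 0;
}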
*/ + return 1; + /* We don't allow CONST_VECTOR for DI vector on RV32 system since the ELT constant value can not held within a single register to disable reload a DI @@ -2162,11 +2345,9 @@ riscv_const_insns (rtx x, bool allow_new_pseudos) if (maybe_gt (GET_MODE_SIZE (smode), UNITS_PER_WORD) && !immediate_operand (elt, Pmode)) return 0; - /* Constants from -16 to 15 can be loaded with vmv.v.i. - The Wc0, Wc1 constraints are already covered by the - vi constraint so we do not need to check them here - separately. */ - if (satisfies_constraint_vi (x)) + /* Constants in range -16 ~ 15 integer or 0.0 floating-point + can be emitted using vmv.v.i. */ + if (valid_vec_immediate_p (x)) return 1; /* Any int/FP constants can always be broadcast from a @@ -2186,6 +2367,52 @@ riscv_const_insns (rtx x, bool allow_new_pseudos) return 1 + 4; /*vmv.v.x + memory access. */ } } + + /* const series vector. */ + rtx base, step; + if (const_vec_series_p (x, &base, &step)) + { + /* This cost is not accurate, we will need to adapt the COST + accurately according to BASE && STEP. */ + return 1; + } + + if (CONST_VECTOR_STEPPED_P (x)) + { + /* Some cases are unhandled so we need construct a builder to + detect/allow those cases to be handled by the fallthrough + handler. */ + unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x); + unsigned int npatterns = CONST_VECTOR_NPATTERNS (x); + rvv_builder builder (GET_MODE(x), npatterns, nelts_per_pattern); + for (unsigned int i = 0; i < nelts_per_pattern; i++) + { + for (unsigned int j = 0; j < npatterns; j++) + builder.quick_push (CONST_VECTOR_ELT (x, i * npatterns + j)); + } + builder.finalize (); + + if (builder.single_step_npatterns_p ()) + { + if (builder.npatterns_all_equal_p ()) + { + /* TODO: This cost is not accurate. */ + return 1; + } + else + { + /* TODO: This cost is not accurate. */ + return 1; + } + } + else if (builder.interleaved_stepped_npatterns_p ()) + { + /* TODO: This cost is not accurate. */ + return 1; + } + + /* Fallthrough. */ + } } /* TODO: We may support more const vector in the future. */ @@ -2579,14 +2806,12 @@ riscv_legitimize_tls_address (rtx loc) case TLS_MODEL_GLOBAL_DYNAMIC: if (TARGET_TLSDESC) { - static unsigned seqno; tp = gen_rtx_REG (Pmode, THREAD_POINTER_REGNUM); a0 = gen_rtx_REG (Pmode, GP_ARG_FIRST); dest = gen_reg_rtx (Pmode); - emit_insn (gen_tlsdesc (Pmode, loc, GEN_INT (seqno))); + emit_insn (gen_tlsdesc (Pmode, loc)); emit_insn (gen_add3_insn (dest, a0, tp)); - seqno++; } else { @@ -2810,12 +3035,22 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value, } else if (codes[i].code == CONCAT || codes[i].code == VEC_MERGE) { - rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp; - rtx t2 = codes[i].code == VEC_MERGE ? old_value : x; - gcc_assert (t2); - t2 = gen_lowpart (SImode, t2); - emit_insn (gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2)); - x = t; + if (codes[i].code == CONCAT && !TARGET_ZBKB) + { + /* The two values should have no bits in common, so we can + use PLUS instead of IOR which has a higher chance of + using a compressed instruction. */ + x = gen_rtx_PLUS (mode, x, old_value); + } + else + { + rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp; + rtx t2 = codes[i].code == VEC_MERGE ? 
old_value : x; + gcc_assert (t2); + t2 = gen_lowpart (SImode, t2); + emit_insn (gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2)); + x = t; + } } else x = gen_rtx_fmt_ee (codes[i].code, mode, @@ -3560,7 +3795,12 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN if (outer_code == INSN && register_operand (SET_DEST (x), GET_MODE (SET_DEST (x)))) { - riscv_rtx_costs (SET_SRC (x), mode, outer_code, opno, total, speed); + if (REG_P (SET_SRC (x)) && TARGET_DOUBLE_FLOAT && mode == DFmode) + { + *total = COSTS_N_INSNS (1); + return true; + } + riscv_rtx_costs (SET_SRC (x), mode, SET, opno, total, speed); return true; } @@ -4159,11 +4399,29 @@ riscv_noce_conversion_profitable_p (rtx_insn *seq, riscv_if_info.original_cost += COSTS_N_INSNS (1); riscv_if_info.max_seq_cost += COSTS_N_INSNS (1); } - last_dest = NULL_RTX; + rtx dest = SET_DEST (x); - if (COMPARISON_P (src) + + /* Do something similar for the moves that are likely to + turn into NOP moves by the time the register allocator is + done. These are also side effects of how our sCC expanders + work. We'll want to check and update LAST_DEST here too. */ + if (last_dest && REG_P (dest) - && GET_MODE (dest) == SImode) + && GET_MODE (dest) == SImode + && SUBREG_P (src) + && SUBREG_PROMOTED_VAR_P (src) + && REGNO (SUBREG_REG (src)) == REGNO (last_dest)) + { + riscv_if_info.original_cost += COSTS_N_INSNS (1); + riscv_if_info.max_seq_cost += COSTS_N_INSNS (1); + if (last_dest) + last_dest = dest; + } + else + last_dest = NULL_RTX; + + if (COMPARISON_P (src) && REG_P (dest)) last_dest = dest; } else @@ -4845,13 +5103,31 @@ riscv_expand_int_scc (rtx target, enum rtx_code code, rtx op0, rtx op1, bool *in riscv_extend_comparands (code, &op0, &op1); op0 = force_reg (word_mode, op0); + /* For sub-word targets on rv64, do the computation in DImode + then extract the lowpart for the final target, marking it + as sign extended. Note that it's also properly zero extended, + but it's probably more profitable to expose it as sign extended. */ + rtx t; + if (TARGET_64BIT && GET_MODE (target) == SImode) + t = gen_reg_rtx (DImode); + else + t = target; + if (code == EQ || code == NE) { rtx zie = riscv_zero_if_equal (op0, op1); - riscv_emit_binary (code, target, zie, const0_rtx); + riscv_emit_binary (code, t, zie, const0_rtx); } else - riscv_emit_int_order_test (code, invert_ptr, target, op0, op1); + riscv_emit_int_order_test (code, invert_ptr, t, op0, op1); + + if (t != target) + { + t = gen_lowpart (SImode, t); + SUBREG_PROMOTED_VAR_P (t) = 1; + SUBREG_PROMOTED_SET (t, SRP_SIGNED); + emit_move_insn (target, t); + } } /* Like riscv_expand_int_scc, but for floating-point comparisons. */ @@ -6373,8 +6649,8 @@ riscv_union_memmodels (enum memmodel model1, enum memmodel model2) model1 = memmodel_base (model1); model2 = memmodel_base (model2); - enum memmodel weaker = model1 <= model2 ? model1: model2; - enum memmodel stronger = model1 > model2 ? model1: model2; + enum memmodel weaker = model1 <= model2 ? model1 : model2; + enum memmodel stronger = model1 > model2 ? model1 : model2; switch (stronger) { @@ -7405,6 +7681,73 @@ riscv_compute_frame_info (void) /* Next points the incoming stack pointer and any incoming arguments. */ } +/* Implement TARGET_CAN_INLINE_P. Determine whether inlining the function + CALLER into the function CALLEE is safe. 
Inlining should be rejected if + there is no always_inline attribute and the target options differ except + for differences in ISA extensions or performance tuning options like the + code model, TLS dialect, and stack protector, etc. Inlining is + permissible when the non-ISA extension options are identical and the ISA + extensions of CALLEE are a subset of those of CALLER, thereby improving + the performance of Function Multi-Versioning. */ + +static bool +riscv_can_inline_p (tree caller, tree callee) +{ + /* Do not inline when callee is versioned but caller is not. */ + if (DECL_FUNCTION_VERSIONED (callee) && ! DECL_FUNCTION_VERSIONED (caller)) + return false; + + tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); + tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); + + /* It's safe to inline if callee has no opts. */ + if (! callee_tree) + return true; + + if (! caller_tree) + caller_tree = target_option_default_node; + + struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree); + struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree); + + int isa_flag_mask = riscv_x_target_flags_isa_mask (); + + /* Callee and caller should have the same target options except for ISA. */ + int callee_target_flags = callee_opts->x_target_flags & ~isa_flag_mask; + int caller_target_flags = caller_opts->x_target_flags & ~isa_flag_mask; + + if (callee_target_flags != caller_target_flags) + return false; + + /* Callee's ISA should be a subset of the caller's ISA. */ + if (! riscv_ext_is_subset (caller_opts, callee_opts)) + return false; + + /* If the callee has always_inline set, we can ignore the rest attributes. */ + if (lookup_attribute ("always_inline", DECL_ATTRIBUTES (callee))) + return true; + + if (caller_opts->x_riscv_cmodel != callee_opts->x_riscv_cmodel) + return false; + + if (caller_opts->x_riscv_tls_dialect != callee_opts->x_riscv_tls_dialect) + return false; + + if (caller_opts->x_riscv_stack_protector_guard_reg + != callee_opts->x_riscv_stack_protector_guard_reg) + return false; + + if (caller_opts->x_riscv_stack_protector_guard_offset + != callee_opts->x_riscv_stack_protector_guard_offset) + return false; + + if (caller_opts->x_rvv_vector_strict_align + != callee_opts->x_rvv_vector_strict_align) + return false; + + return true; +} + /* Make sure that we're not trying to eliminate to the wrong hard frame pointer. */ @@ -9971,6 +10314,18 @@ riscv_override_options_internal (struct gcc_options *opts) ? &optimize_size_tune_info : cpu->tune_param; + /* If not optimizing for size, set the default + alignment to what the target wants. */ + if (!opts->x_optimize_size) + { + if (opts->x_flag_align_loops && !opts->x_str_align_loops) + opts->x_str_align_loops = tune_param->loop_align; + if (opts->x_flag_align_jumps && !opts->x_str_align_jumps) + opts->x_str_align_jumps = tune_param->jump_align; + if (opts->x_flag_align_functions && !opts->x_str_align_functions) + opts->x_str_align_functions = tune_param->function_align; + } + /* Use -mtune's setting for slow_unaligned_access, even when optimizing for size. For architectures that trap and emulate unaligned accesses, the performance cost is too great, even for -Os. Similarly, if @@ -10630,6 +10985,17 @@ riscv_can_change_mode_class (machine_mode from, machine_mode to, if (reg_classes_intersect_p (V_REGS, rclass) && !ordered_p (GET_MODE_PRECISION (from), GET_MODE_PRECISION (to))) return false; + + /* Subregs of modes larger than one vector are ambiguous. 
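Reviewer note: outside the ISA bits, riscv_can_inline_p is a plain equality test after masking, and the ISA bits only need to form a subset. A C sketch of the flag logic, assuming the extension state can be modelled as a single bitmask; the real riscv_ext_is_subset works on the parsed subset list, not a mask, so this is a simplification:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified model: TARGET_FLAGS holds both ISA and non-ISA bits,
   ISA_MASK selects the ISA ones, EXTS is an illustrative extension mask.  */
struct opts { uint32_t target_flags; uint64_t exts; };

static bool
can_inline (const struct opts *caller, const struct opts *callee,
            uint32_t isa_mask)
{
  /* Non-ISA target flags must match exactly.  */
  if ((caller->target_flags & ~isa_mask) != (callee->target_flags & ~isa_mask))
    return false;
  /* Callee's extensions must be a subset of the caller's.  */
  return (callee->exts & ~caller->exts) == 0;
}

int
main (void)
{
  struct opts caller  = { 0x13, 0x0f };  /* has extension bits 0..3 */
  struct opts callee1 = { 0x13, 0x05 };  /* subset of caller -> inlinable */
  struct opts callee2 = { 0x13, 0x15 };  /* needs an extra ext -> rejected */
  printf ("%d %d\n", can_inline (&caller, &callee1, 0x3),
          can_inline (&caller, &callee2, 0x3));
  return 0;
}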
+ A V4DImode with rv64gcv_zvl128b could, for example, span two registers/one + register group of two at VLEN = 128 or one register at VLEN >= 256 and + we cannot, statically, determine which part of it to extract. + Therefore prevent that. */ + if (reg_classes_intersect_p (V_REGS, rclass) + && riscv_v_ext_vls_mode_p (from) + && !ordered_p (BITS_PER_RISCV_VECTOR, GET_MODE_PRECISION (from))) + return false; + return !reg_classes_intersect_p (FP_REGS, rclass); } @@ -11445,6 +11811,65 @@ riscv_frm_mode_needed (rtx_insn *cur_insn, int code) return mode; } +/* If the current function needs a single VXRM mode, return it. Else + return VXRM_MODE_NONE. + + This is called on the first insn in the chain and scans the full function + once to collect VXRM mode settings. If a single mode is needed, it will + often be better to set it once at the start of the function rather than + at an anticipation point. */ +static int +singleton_vxrm_need (void) +{ + /* Only needed for vector code. */ + if (!TARGET_VECTOR) + return VXRM_MODE_NONE; + + /* If ENTRY has more than once successor, then don't optimize, just to + keep things simple. */ + if (EDGE_COUNT (ENTRY_BLOCK_PTR_FOR_FN (cfun)->succs) > 1) + return VXRM_MODE_NONE; + + /* Walk the IL noting if VXRM is needed and if there's more than one + mode needed. */ + bool found = false; + int saved_vxrm_mode; + for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn)) + { + if (!INSN_P (insn) || DEBUG_INSN_P (insn)) + continue; + + int code = recog_memoized (insn); + if (code < 0) + continue; + + int vxrm_mode = get_attr_vxrm_mode (insn); + if (vxrm_mode == VXRM_MODE_NONE) + continue; + + /* If this is the first VXRM need, note it. */ + if (!found) + { + saved_vxrm_mode = vxrm_mode; + found = true; + continue; + } + + /* Not the first VXRM need. If this is different than + the saved need, then we're not going to be able to + optimize and we can stop scanning now. */ + if (saved_vxrm_mode != vxrm_mode) + return VXRM_MODE_NONE; + + /* Same mode as we've seen, keep scanning. */ + } + + /* If we got here we scanned the whole function. If we found + some VXRM state, then we can optimize. If we didn't find + VXRM state, then there's nothing to optimize. */ + return found ? saved_vxrm_mode : VXRM_MODE_NONE; +} + /* Return mode that entity must be switched into prior to the execution of insn. */ @@ -11456,6 +11881,16 @@ riscv_mode_needed (int entity, rtx_insn *insn, HARD_REG_SET) switch (entity) { case RISCV_VXRM: + /* If CUR_INSN is the first insn in the function, then determine if we + want to signal a need in ENTRY->succs to allow for aggressive + elimination of subsequent sets of VXRM. */ + if (insn == get_first_nonnote_insn ()) + { + int need = singleton_vxrm_need (); + if (need != VXRM_MODE_NONE) + return need; + } + return code >= 0 ? get_attr_vxrm_mode (insn) : VXRM_MODE_NONE; case RISCV_FRM: return riscv_frm_mode_needed (insn, code); @@ -11842,19 +12277,56 @@ riscv_get_raw_result_mode (int regno) return default_get_reg_raw_mode (regno); } -/* Generate a new rtx of Xmode based on the rtx and mode in define pattern. - The rtx x will be zero extended to Xmode if the mode is HI/QImode, and - the new zero extended Xmode rtx will be returned. - Or the gen_lowpart rtx of Xmode will be returned. */ +/* Generate a REG rtx of Xmode from the given rtx and mode. + The rtx x can be REG (QI/HI/SI/DI) or const_int. + The machine_mode mode is the original mode from define pattern. + + If rtx is REG and Xmode, the RTX x will be returned directly. 
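Reviewer note: singleton_vxrm_need reduces to "collect the rounding-mode requirement of every vector fixed-point insn and see whether it is unique". A compact C model over an array of per-insn requirements, assuming a NONE value marks insns with no requirement (enum names here are local to the sketch):

#include <stdio.h>

enum vxrm_mode { VXRM_MODE_RNU, VXRM_MODE_RNE, VXRM_MODE_RDN,
                 VXRM_MODE_ROD, VXRM_MODE_NONE };

/* Return the single VXRM mode the "function" needs, or VXRM_MODE_NONE
   if it needs none or more than one.  */
static enum vxrm_mode
singleton_need (const enum vxrm_mode *insn_need, int n)
{
  enum vxrm_mode saved = VXRM_MODE_NONE;
  for (int i = 0; i < n; i++)
    {
      if (insn_need[i] == VXRM_MODE_NONE)
        continue;
      if (saved == VXRM_MODE_NONE)
        saved = insn_need[i];        /* first requirement seen */
      else if (saved != insn_need[i])
        return VXRM_MODE_NONE;       /* conflicting requirements */
    }
  return saved;
}

int
main (void)
{
  enum vxrm_mode one[] = { VXRM_MODE_NONE, VXRM_MODE_RNU, VXRM_MODE_RNU };
  enum vxrm_mode two[] = { VXRM_MODE_RNU, VXRM_MODE_RDN };
  printf ("%d %d\n", singleton_need (one, 3), singleton_need (two, 2));
  return 0;
}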
+ + If rtx is REG and non-Xmode, the zero extended to new REG of Xmode will be + returned. + + If rtx is const_int, a new REG rtx will be created to hold the value of + const_int and then returned. + + According to the gccint doc, the constants generated for modes with fewer + bits than in HOST_WIDE_INT must be sign extended to full width. Thus there + will be two cases here, take QImode as example. + + For .SAT_SUB (127, y) in QImode, we have (const_int 127) and one simple + mov from const_int to the new REG rtx is good enough here. + + For .SAT_SUB (254, y) in QImode, we have (const_int -2) after define_expand. + Aka 0xfffffffffffffffe in Xmode of RV64 but we actually need 0xfe in Xmode + of RV64. So we need to cleanup the highest 56 bits of the new REG rtx moved + from the (const_int -2). + + Then the underlying expanding can perform the code generation based on + the REG rtx of Xmode, instead of taking care of these in expand func. */ static rtx riscv_gen_zero_extend_rtx (rtx x, machine_mode mode) { + rtx xmode_reg = gen_reg_rtx (Xmode); + + if (!CONST_INT_P (x)) + { + if (mode == Xmode) + return x; + + riscv_emit_unary (ZERO_EXTEND, xmode_reg, x); + return xmode_reg; + } + if (mode == Xmode) - return x; + emit_move_insn (xmode_reg, x); + else + { + rtx reg_x = gen_reg_rtx (mode); - rtx xmode_reg = gen_reg_rtx (Xmode); - riscv_emit_unary (ZERO_EXTEND, xmode_reg, x); + emit_move_insn (reg_x, x); + riscv_emit_unary (ZERO_EXTEND, xmode_reg, reg_x); + } return xmode_reg; } @@ -11876,7 +12348,7 @@ riscv_expand_usadd (rtx dest, rtx x, rtx y) rtx xmode_sum = gen_reg_rtx (Xmode); rtx xmode_lt = gen_reg_rtx (Xmode); rtx xmode_x = riscv_gen_zero_extend_rtx (x, mode); - rtx xmode_y = gen_lowpart (Xmode, y); + rtx xmode_y = riscv_gen_zero_extend_rtx (y, mode); rtx xmode_dest = gen_reg_rtx (Xmode); /* Step-1: sum = x + y */ @@ -11907,48 +12379,94 @@ riscv_expand_usadd (rtx dest, rtx x, rtx y) emit_move_insn (dest, gen_lowpart (mode, xmode_dest)); } -/* Generate a REG rtx of Xmode from the given rtx and mode. - The rtx x can be REG (QI/HI/SI/DI) or const_int. - The machine_mode mode is the original mode from define pattern. - - If rtx is REG, the gen_lowpart of Xmode will be returned. - - If rtx is const_int, a new REG rtx will be created to hold the value of - const_int and then returned. - - According to the gccint doc, the constants generated for modes with fewer - bits than in HOST_WIDE_INT must be sign extended to full width. Thus there - will be two cases here, take QImode as example. - - For .SAT_SUB (127, y) in QImode, we have (const_int 127) and one simple - mov from const_int to the new REG rtx is good enough here. - - For .SAT_SUB (254, y) in QImode, we have (const_int -2) after define_expand. - Aka 0xfffffffffffffffe in Xmode of RV64 but we actually need 0xfe in Xmode - of RV64. So we need to cleanup the highest 56 bits of the new REG rtx moved - from the (const_int -2). - - Then the underlying expanding can perform the code generation based on - the REG rtx of Xmode, instead of taking care of these in expand func. */ +/* Return a new const RTX of MAX value based on given mode. Only + int scalar mode is allowed. 
*/ static rtx -riscv_gen_unsigned_xmode_reg (rtx x, machine_mode mode) +riscv_gen_sign_max_cst (machine_mode mode) { - if (!CONST_INT_P (x)) - return gen_lowpart (Xmode, x); - - rtx xmode_x = gen_reg_rtx (Xmode); - - if (mode == Xmode) - emit_move_insn (xmode_x, x); - else + switch (mode) { - rtx reg_x = gen_reg_rtx (mode); - emit_move_insn (reg_x, x); - riscv_emit_unary (ZERO_EXTEND, xmode_x, reg_x); + case QImode: + return GEN_INT (INT8_MAX); + case HImode: + return GEN_INT (INT16_MAX); + case SImode: + return GEN_INT (INT32_MAX); + case DImode: + return GEN_INT (INT64_MAX); + default: + gcc_unreachable (); } +} + +/* Implements the signed saturation sub standard name ssadd for int mode. + + z = SAT_ADD(x, y). + => + 1. sum = x + y + 2. xor_0 = x ^ y + 3. xor_1 = x ^ sum + 4. lt = xor_1 < 0 + 5. ge = xor_0 >= 0 + 6. and = ge & lt + 7. lt = x < 0 + 8. neg = -lt + 9. max = INT_MAX + 10. max = max ^ neg + 11. neg = -and + 12. max = max & neg + 13. and = and - 1 + 14. z = sum & and + 15. z = z | max */ + +void +riscv_expand_ssadd (rtx dest, rtx x, rtx y) +{ + machine_mode mode = GET_MODE (dest); + unsigned bitsize = GET_MODE_BITSIZE (mode).to_constant (); + rtx shift_bits = GEN_INT (bitsize - 1); + rtx xmode_x = gen_lowpart (Xmode, x); + rtx xmode_y = gen_lowpart (Xmode, y); + rtx xmode_sum = gen_reg_rtx (Xmode); + rtx xmode_dest = gen_reg_rtx (Xmode); + rtx xmode_xor_0 = gen_reg_rtx (Xmode); + rtx xmode_xor_1 = gen_reg_rtx (Xmode); + rtx xmode_ge = gen_reg_rtx (Xmode); + rtx xmode_lt = gen_reg_rtx (Xmode); + rtx xmode_neg = gen_reg_rtx (Xmode); + rtx xmode_and = gen_reg_rtx (Xmode); + rtx xmode_max = gen_reg_rtx (Xmode); - return xmode_x; + /* Step-1: sum = x + y, xor_0 = x ^ y, xor_1 = x ^ sum. */ + riscv_emit_binary (PLUS, xmode_sum, xmode_x, xmode_y); + riscv_emit_binary (XOR, xmode_xor_0, xmode_x, xmode_y); + riscv_emit_binary (XOR, xmode_xor_1, xmode_x, xmode_sum); + + /* Step-2: lt = xor_1 < 0, ge = xor_0 >= 0, and = ge & lt. */ + riscv_emit_binary (LSHIFTRT, xmode_lt, xmode_xor_1, shift_bits); + riscv_emit_binary (LSHIFTRT, xmode_ge, xmode_xor_0, shift_bits); + riscv_emit_binary (XOR, xmode_ge, xmode_ge, CONST1_RTX (Xmode)); + riscv_emit_binary (AND, xmode_and, xmode_lt, xmode_ge); + riscv_emit_binary (AND, xmode_and, xmode_and, CONST1_RTX (Xmode)); + + /* Step-3: lt = x < 0, neg = -lt */ + riscv_emit_binary (LT, xmode_lt, xmode_x, CONST0_RTX (Xmode)); + riscv_emit_unary (NEG, xmode_neg, xmode_lt); + + /* Step-4: max = 0x7f..., max = max ^ neg, neg = -and, max = max & neg */ + riscv_emit_move (xmode_max, riscv_gen_sign_max_cst (mode)); + riscv_emit_binary (XOR, xmode_max, xmode_max, xmode_neg); + riscv_emit_unary (NEG, xmode_neg, xmode_and); + riscv_emit_binary (AND, xmode_max, xmode_max, xmode_neg); + + /* Step-5: and = and - 1, dest = sum & and */ + riscv_emit_binary (PLUS, xmode_and, xmode_and, CONSTM1_RTX (Xmode)); + riscv_emit_binary (AND, xmode_dest, xmode_sum, xmode_and); + + /* Step-6: xmode_dest = xmode_dest | xmode_max, dest = xmode_dest */ + riscv_emit_binary (IOR, xmode_dest, xmode_dest, xmode_max); + emit_move_insn (dest, gen_lowpart (mode, xmode_dest)); } /* Implements the unsigned saturation sub standard name usadd for int mode. 
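For reference, the fifteen numbered steps in the riscv_expand_ssadd comment above correspond to the scalar C sketch below (illustrative only, 32-bit case; the expander itself works on XLEN-wide pseudos through riscv_emit_binary/riscv_emit_unary, and riscv_expand_sssub later in this hunk uses the same mask-and-select idiom).

#include <stdint.h>

/* Branchless signed saturating add, mirroring steps 1-15 of the
   riscv_expand_ssadd comment (illustrative sketch, not part of the patch).  */
static int32_t
ssadd32_sketch (int32_t x, int32_t y)
{
  uint32_t ux = (uint32_t) x, uy = (uint32_t) y;
  uint32_t sum  = ux + uy;                    /* 1: sum = x + y                  */
  uint32_t xor0 = ux ^ uy;                    /* 2: do x and y differ in sign?   */
  uint32_t xor1 = ux ^ sum;                   /* 3: did the sign change?         */
  uint32_t lt   = xor1 >> 31;                 /* 4: sign of x ^ sum              */
  uint32_t ge   = (xor0 >> 31) ^ 1;           /* 5: x and y have the same sign   */
  uint32_t ovf  = ge & lt;                    /* 6: 1 iff signed overflow        */
  uint32_t neg  = -(uint32_t) (x < 0);        /* 7-8: all-ones if x < 0          */
  uint32_t sat  = (uint32_t) INT32_MAX ^ neg; /* 9-10: INT32_MAX or INT32_MIN    */
  sat &= -ovf;                                /* 11-12: keep SAT only on overflow   */
  sum &= ovf - 1;                             /* 13-14: keep SUM only if no overflow */
  return (int32_t) (sum | sat);               /* 15: z = sum | max               */
}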
@@ -11964,8 +12482,8 @@ void riscv_expand_ussub (rtx dest, rtx x, rtx y) { machine_mode mode = GET_MODE (dest); - rtx xmode_x = riscv_gen_unsigned_xmode_reg (x, mode); - rtx xmode_y = gen_lowpart (Xmode, y); + rtx xmode_x = riscv_gen_zero_extend_rtx (x, mode); + rtx xmode_y = riscv_gen_zero_extend_rtx (y, mode); rtx xmode_lt = gen_reg_rtx (Xmode); rtx xmode_minus = gen_reg_rtx (Xmode); rtx xmode_dest = gen_reg_rtx (Xmode); @@ -11986,6 +12504,75 @@ riscv_expand_ussub (rtx dest, rtx x, rtx y) emit_move_insn (dest, gen_lowpart (mode, xmode_dest)); } +/* Implements the signed saturation sub standard name ssadd for int mode. + + z = SAT_SUB(x, y). + => + 1. minus = x - y + 2. xor_0 = x ^ y + 3. xor_1 = x ^ minus + 4. lt_0 = xor_1 < 0 + 5. lt_1 = xor_0 < 0 + 6. and = lt_0 & lt_1 + 7. lt = x < 0 + 8. neg = -lt + 9. max = INT_MAX + 10. max = max ^ neg + 11. neg = -and + 12. max = max & neg + 13. and = and - 1 + 14. z = minus & and + 15. z = z | max */ + +void +riscv_expand_sssub (rtx dest, rtx x, rtx y) +{ + machine_mode mode = GET_MODE (dest); + unsigned bitsize = GET_MODE_BITSIZE (mode).to_constant (); + rtx shift_bits = GEN_INT (bitsize - 1); + rtx xmode_x = gen_lowpart (Xmode, x); + rtx xmode_y = gen_lowpart (Xmode, y); + rtx xmode_minus = gen_reg_rtx (Xmode); + rtx xmode_xor_0 = gen_reg_rtx (Xmode); + rtx xmode_xor_1 = gen_reg_rtx (Xmode); + rtx xmode_lt_0 = gen_reg_rtx (Xmode); + rtx xmode_lt_1 = gen_reg_rtx (Xmode); + rtx xmode_and = gen_reg_rtx (Xmode); + rtx xmode_lt = gen_reg_rtx (Xmode); + rtx xmode_neg = gen_reg_rtx (Xmode); + rtx xmode_max = gen_reg_rtx (Xmode); + rtx xmode_dest = gen_reg_rtx (Xmode); + + /* Step-1: mins = x - y, xor_0 = x ^ y, xor_1 = x ^ minus. */ + riscv_emit_binary (MINUS, xmode_minus, xmode_x, xmode_y); + riscv_emit_binary (XOR, xmode_xor_0, xmode_x, xmode_y); + riscv_emit_binary (XOR, xmode_xor_1, xmode_x, xmode_minus); + + /* Step-2: and = xor_0 < 0 & xor_1 < 0. */ + riscv_emit_binary (LSHIFTRT, xmode_lt_0, xmode_xor_0, shift_bits); + riscv_emit_binary (LSHIFTRT, xmode_lt_1, xmode_xor_1, shift_bits); + riscv_emit_binary (AND, xmode_and, xmode_lt_0, xmode_lt_1); + riscv_emit_binary (AND, xmode_and, xmode_and, CONST1_RTX (Xmode)); + + /* Step-3: lt = x < 0, neg = -lt. */ + riscv_emit_binary (LT, xmode_lt, xmode_x, CONST0_RTX (Xmode)); + riscv_emit_unary (NEG, xmode_neg, xmode_lt); + + /* Step-4: max = 0x7f..., max = max ^ neg, neg = -and, max = max & neg. */ + riscv_emit_move (xmode_max, riscv_gen_sign_max_cst (mode)); + riscv_emit_binary (XOR, xmode_max, xmode_max, xmode_neg); + riscv_emit_unary (NEG, xmode_neg, xmode_and); + riscv_emit_binary (AND, xmode_max, xmode_max, xmode_neg); + + /* Step-5: and = and - 1, dest = minus & and. */ + riscv_emit_binary (PLUS, xmode_and, xmode_and, CONSTM1_RTX (Xmode)); + riscv_emit_binary (AND, xmode_dest, xmode_minus, xmode_and); + + /* Step-6: dest = dest | max. */ + riscv_emit_binary (IOR, xmode_dest, xmode_dest, xmode_max); + emit_move_insn (dest, gen_lowpart (mode, xmode_dest)); +} + /* Implement the unsigned saturation truncation for int mode. b = SAT_TRUNC (a); @@ -12026,6 +12613,67 @@ riscv_expand_ustrunc (rtx dest, rtx src) emit_move_insn (dest, gen_lowpart (mode, xmode_dest)); } +/* Implement the signed saturation truncation for int mode. + + b = SAT_TRUNC (a); + => + 1. lt = a < max + 2. gt = min < a + 3. mask = lt & gt + 4. trunc_mask = -mask + 5. sat_mask = mask - 1 + 6. lt = a < 0 + 7. neg = -lt + 8. sat = neg ^ max + 9. trunc = src & trunc_mask + 10. sat = sat & sat_mask + 11. 
dest = trunc | sat */ + +void +riscv_expand_sstrunc (rtx dest, rtx src) +{ + machine_mode mode = GET_MODE (dest); + unsigned narrow_prec = GET_MODE_PRECISION (mode).to_constant (); + HOST_WIDE_INT narrow_max = ((int64_t)1 << (narrow_prec - 1)) - 1; // 127 + HOST_WIDE_INT narrow_min = -narrow_max - 1; // -128 + + rtx xmode_narrow_max = gen_reg_rtx (Xmode); + rtx xmode_narrow_min = gen_reg_rtx (Xmode); + rtx xmode_lt = gen_reg_rtx (Xmode); + rtx xmode_gt = gen_reg_rtx (Xmode); + rtx xmode_src = gen_lowpart (Xmode, src); + rtx xmode_dest = gen_reg_rtx (Xmode); + rtx xmode_mask = gen_reg_rtx (Xmode); + rtx xmode_sat = gen_reg_rtx (Xmode); + rtx xmode_trunc = gen_reg_rtx (Xmode); + rtx xmode_sat_mask = gen_reg_rtx (Xmode); + rtx xmode_trunc_mask = gen_reg_rtx (Xmode); + + /* Step-1: lt = src < max, gt = min < src, mask = lt & gt */ + emit_move_insn (xmode_narrow_min, gen_int_mode (narrow_min, Xmode)); + emit_move_insn (xmode_narrow_max, gen_int_mode (narrow_max, Xmode)); + riscv_emit_binary (LT, xmode_lt, xmode_src, xmode_narrow_max); + riscv_emit_binary (LT, xmode_gt, xmode_narrow_min, xmode_src); + riscv_emit_binary (AND, xmode_mask, xmode_lt, xmode_gt); + + /* Step-2: sat_mask = mask - 1, trunc_mask = ~mask */ + riscv_emit_binary (PLUS, xmode_sat_mask, xmode_mask, CONSTM1_RTX (Xmode)); + riscv_emit_unary (NEG, xmode_trunc_mask, xmode_mask); + + /* Step-3: lt = src < 0, lt = -lt, sat = lt ^ narrow_max */ + riscv_emit_binary (LT, xmode_lt, xmode_src, CONST0_RTX (Xmode)); + riscv_emit_unary (NEG, xmode_lt, xmode_lt); + riscv_emit_binary (XOR, xmode_sat, xmode_lt, xmode_narrow_max); + + /* Step-4: xmode_dest = (src & trunc_mask) | (sat & sat_mask) */ + riscv_emit_binary (AND, xmode_trunc, xmode_src, xmode_trunc_mask); + riscv_emit_binary (AND, xmode_sat, xmode_sat, xmode_sat_mask); + riscv_emit_binary (IOR, xmode_dest, xmode_trunc, xmode_sat); + + /* Step-5: dest = xmode_dest */ + emit_move_insn (dest, gen_lowpart (mode, xmode_dest)); +} + /* Implement TARGET_C_MODE_FOR_FLOATING_TYPE. Return TFmode for TI_LONG_DOUBLE_TYPE which is for long double type, go with the default one for the others. */ @@ -12047,6 +12695,22 @@ riscv_stack_clash_protection_alloca_probe_range (void) return STACK_CLASH_CALLER_GUARD; } +static bool +riscv_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size, + unsigned alignment, + enum by_pieces_operation op, bool speed_p) +{ + /* For set/clear with size > UNITS_PER_WORD, by pieces uses vector broadcasts + with UNITS_PER_WORD size pieces. Use setmem<mode> instead which can use + bigger chunks. */ + if (TARGET_VECTOR && stringop_strategy & STRATEGY_VECTOR + && (op == CLEAR_BY_PIECES || op == SET_BY_PIECES) + && speed_p && size > UNITS_PER_WORD) + return false; + + return default_use_by_pieces_infrastructure_p (size, alignment, op, speed_p); +} + /* Initialize the GCC target structure. 
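Similarly, the riscv_expand_sstrunc steps above reduce to the following scalar sketch for an int32_t source saturated into int8_t (illustrative only; the expander materializes the same masks in Xmode registers).

#include <stdint.h>

/* Signed saturating truncation, mirroring steps 1-11 of the
   riscv_expand_sstrunc comment (illustrative sketch).  */
static int8_t
sstrunc8_sketch (int32_t src)
{
  int32_t max = INT8_MAX, min = INT8_MIN;
  int32_t in_range   = (src < max) & (min < src);  /* 1-3: lt, gt, mask        */
  int32_t trunc_mask = -in_range;                  /* 4: all-ones when in range   */
  int32_t sat_mask   = in_range - 1;               /* 5: all-ones when saturating */
  int32_t neg        = -(src < 0);                 /* 6-7: all-ones if src < 0 */
  int32_t sat        = neg ^ max;                  /* 8: INT8_MAX or INT8_MIN  */
  int32_t trunc      = src & trunc_mask;           /* 9  */
  sat &= sat_mask;                                 /* 10 */
  return (int8_t) (trunc | sat);                   /* 11 */
}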
*/ #undef TARGET_ASM_ALIGNED_HI_OP #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t" @@ -12195,6 +12859,9 @@ riscv_stack_clash_protection_alloca_probe_range (void) #undef TARGET_LEGITIMATE_ADDRESS_P #define TARGET_LEGITIMATE_ADDRESS_P riscv_legitimate_address_p +#undef TARGET_CAN_INLINE_P +#define TARGET_CAN_INLINE_P riscv_can_inline_p + #undef TARGET_CAN_ELIMINATE #define TARGET_CAN_ELIMINATE riscv_can_eliminate @@ -12409,6 +13076,9 @@ riscv_stack_clash_protection_alloca_probe_range (void) #undef TARGET_C_MODE_FOR_FLOATING_TYPE #define TARGET_C_MODE_FOR_FLOATING_TYPE riscv_c_mode_for_floating_type +#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P +#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P riscv_use_by_pieces_infrastructure_p + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-riscv.h" diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index ead9786..ca1b832 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -316,7 +316,7 @@ ASM_MISA_SPEC #define FIRST_PSEUDO_REGISTER 128 -/* x0, sp, gp, and tp are fixed. */ +/* x0, ra, sp, gp, and tp are fixed. */ #define FIXED_REGISTERS \ { /* General registers. */ \ @@ -667,6 +667,18 @@ enum reg_class /* True if bit BIT is set in VALUE. */ #define BITSET_P(VALUE, BIT) (((VALUE) & (1ULL << (BIT))) != 0) +/* Returns the smaller (common) number of trailing zeros for VAL1 and VAL2. */ +#define COMMON_TRAILING_ZEROS(VAL1, VAL2) \ + (ctz_hwi (VAL1) < ctz_hwi (VAL2) \ + ? ctz_hwi (VAL1) \ + : ctz_hwi (VAL2)) + +/* Returns true if both VAL1 and VAL2 are SMALL_OPERANDs after shifting by + the common number of trailing zeros. */ +#define SMALL_AFTER_COMMON_TRAILING_SHIFT(VAL1, VAL2) \ + (SMALL_OPERAND ((VAL1) >> COMMON_TRAILING_ZEROS (VAL1, VAL2)) \ + && SMALL_OPERAND ((VAL2) >> COMMON_TRAILING_ZEROS (VAL1, VAL2))) + /* Stack layout; function entry, exit and calling. */ #define STACK_GROWS_DOWNWARD 1 @@ -939,8 +951,6 @@ extern enum riscv_cc get_riscv_cc (const rtx use); #define TARGET_VECTOR_MISALIGN_SUPPORTED \ riscv_vector_unaligned_access_p -#define LOGICAL_OP_NON_SHORT_CIRCUIT 0 - /* Control the assembler format that we output. 
*/ /* Output to assembler file text saying following lines diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md index a94705a..5b7b735 100644 --- a/gcc/config/riscv/riscv.md +++ b/gcc/config/riscv/riscv.md @@ -56,6 +56,8 @@ UNSPEC_FLT_QUIET UNSPEC_FLE_QUIET UNSPEC_COPYSIGN + UNSPEC_FMV_X_W + UNSPEC_FMVH_X_D UNSPEC_RINT UNSPEC_ROUND UNSPEC_FLOOR @@ -1811,7 +1813,15 @@ (define_expand "zero_extendsidi2" [(set (match_operand:DI 0 "register_operand") (zero_extend:DI (match_operand:SI 1 "nonimmediate_operand")))] - "TARGET_64BIT") + "TARGET_64BIT" +{ + if (SUBREG_P (operands[1]) && SUBREG_PROMOTED_VAR_P (operands[1]) + && SUBREG_PROMOTED_UNSIGNED_P (operands[1])) + { + emit_insn (gen_movdi (operands[0], SUBREG_REG (operands[1]))); + DONE; + } +}) (define_insn_and_split "*zero_extendsidi2_internal" [(set (match_operand:DI 0 "register_operand" "=r,r") @@ -1892,7 +1902,15 @@ [(set (match_operand:DI 0 "register_operand" "=r,r") (sign_extend:DI (match_operand:SI 1 "nonimmediate_operand" " r,m")))] - "TARGET_64BIT") + "TARGET_64BIT" +{ + if (SUBREG_P (operands[1]) && SUBREG_PROMOTED_VAR_P (operands[1]) + && SUBREG_PROMOTED_SIGNED_P (operands[1])) + { + emit_insn (gen_movdi (operands[0], SUBREG_REG (operands[1]))); + DONE; + } +}) (define_insn "*extendsidi2_internal" [(set (match_operand:DI 0 "register_operand" "=r,r") @@ -2327,17 +2345,16 @@ (define_insn "@tlsdesc<mode>" [(set (reg:P A0_REGNUM) - (unspec:P - [(match_operand:P 0 "symbolic_operand" "") - (match_operand:P 1 "const_int_operand")] - UNSPEC_TLSDESC)) + (unspec:P + [(match_operand:P 0 "symbolic_operand" "")] + UNSPEC_TLSDESC)) (clobber (reg:P T0_REGNUM))] "TARGET_TLSDESC" { - return ".LT%1: auipc\ta0,%%tlsdesc_hi(%0)\;" - "<load>\tt0,%%tlsdesc_load_lo(.LT%1)(a0)\;" - "addi\ta0,a0,%%tlsdesc_add_lo(.LT%1)\;" - "jalr\tt0,t0,%%tlsdesc_call(.LT%1)"; + return ".LT%=: auipc\ta0,%%tlsdesc_hi(%0)\;" + "<load>\tt0,%%tlsdesc_load_lo(.LT%=)(a0)\;" + "addi\ta0,a0,%%tlsdesc_add_lo(.LT%=)\;" + "jalr\tt0,t0,%%tlsdesc_call(.LT%=)"; } [(set_attr "type" "multi") (set_attr "length" "16") @@ -2627,8 +2644,9 @@ (define_insn "movsidf2_low_rv32" [(set (match_operand:SI 0 "register_operand" "= r") - (truncate:SI - (match_operand:DF 1 "register_operand" "zmvf")))] + (unspec:SI + [(match_operand:DF 1 "register_operand" "zmvf")] + UNSPEC_FMV_X_W))] "TARGET_HARD_FLOAT && !TARGET_64BIT && TARGET_ZFA" "fmv.x.w\t%0,%1" [(set_attr "move_type" "fmove") @@ -2637,11 +2655,10 @@ (define_insn "movsidf2_high_rv32" - [(set (match_operand:SI 0 "register_operand" "= r") - (truncate:SI - (lshiftrt:DF - (match_operand:DF 1 "register_operand" "zmvf") - (const_int 32))))] + [(set (match_operand:SI 0 "register_operand" "= r") + (unspec:SI + [(match_operand:DF 1 "register_operand" "zmvf")] + UNSPEC_FMVH_X_D))] "TARGET_HARD_FLOAT && !TARGET_64BIT && TARGET_ZFA" "fmvh.x.d\t%0,%1" [(set_attr "move_type" "fmove") @@ -2744,12 +2761,6 @@ FAIL; }) -;; Inlining general memmove is a pessimisation: we can't avoid having to decide -;; which direction to go at runtime, which is costly in instruction count -;; however for situations where the entire move fits in one vector operation -;; we can do all reads before doing any writes so we don't have to worry -;; so generate the inline vector code in such situations -;; nb. prefer scalar path for tiny memmoves. 
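A usage note on the movmem<mode> change below: the explicit size window that used to gate the inline path is gone, and riscv_vector::expand_block_move is now called with an extra boolean argument and decides internally whether the move is expanded inline. The kind of call that can take that path looks like this (illustrative only; the exact limits depend on -march and VLEN).

#include <string.h>

/* Small fixed-size overlapping copy; with a vector-enabled -march such as
   rv64gcv this is a candidate for inline vector expansion (illustrative).  */
void
shift_window (char *buf)
{
  memmove (buf, buf + 1, 16);
}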
(define_expand "movmem<mode>" [(parallel [(set (match_operand:BLK 0 "general_operand") (match_operand:BLK 1 "general_operand")) @@ -2757,10 +2768,8 @@ (use (match_operand:SI 3 "const_int_operand"))])] "TARGET_VECTOR" { - if ((INTVAL (operands[2]) >= TARGET_MIN_VLEN / 8) - && (INTVAL (operands[2]) <= TARGET_MIN_VLEN) - && riscv_vector::expand_block_move (operands[0], operands[1], - operands[2])) + if (riscv_vector::expand_block_move (operands[0], operands[1], operands[2], + true)) DONE; else FAIL; @@ -2925,7 +2934,9 @@ ;; for IOR/XOR. It probably doesn't matter for AND. ;; ;; We also don't want to do this if the immediate already fits in a simm12 -;; field. +;; field, or is a single bit operand, or when we might be able to generate +;; a shift-add sequence via the splitter in bitmanip.md +;; in bitmanip.md for masks that are a run of consecutive ones. (define_insn_and_split "<optab>_shift_reverse<X:mode>" [(set (match_operand:X 0 "register_operand" "=r") (any_bitwise:X (ashift:X (match_operand:X 1 "register_operand" "r") @@ -2934,9 +2945,9 @@ "(!SMALL_OPERAND (INTVAL (operands[3])) && SMALL_OPERAND (INTVAL (operands[3]) >> INTVAL (operands[2])) && popcount_hwi (INTVAL (operands[3])) > 1 - && (!TARGET_64BIT - || (exact_log2 ((INTVAL (operands[3]) >> INTVAL (operands[2])) + 1) - == -1)) + && (!(TARGET_64BIT && TARGET_ZBA) + || !consecutive_bits_operand (operands[3], VOIDmode) + || !imm123_operand (operands[2], VOIDmode)) && (INTVAL (operands[3]) & ((1ULL << INTVAL (operands[2])) - 1)) == 0)" "#" "&& 1" @@ -3126,6 +3137,38 @@ } [(set_attr "type" "branch")]) +(define_insn_and_split "*branch<ANYI:mode>_shiftedarith_<optab>_shifted" + [(set (pc) + (if_then_else (any_eq + (and:ANYI (match_operand:ANYI 1 "register_operand" "r") + (match_operand 2 "shifted_const_arith_operand" "i")) + (match_operand 3 "shifted_const_arith_operand" "i")) + (label_ref (match_operand 0 "" "")) + (pc))) + (clobber (match_scratch:X 4 "=&r")) + (clobber (match_scratch:X 5 "=&r"))] + "!SMALL_OPERAND (INTVAL (operands[2])) + && !SMALL_OPERAND (INTVAL (operands[3])) + && SMALL_AFTER_COMMON_TRAILING_SHIFT (INTVAL (operands[2]), + INTVAL (operands[3]))" + "#" + "&& reload_completed" + [(set (match_dup 4) (lshiftrt:X (match_dup 1) (match_dup 7))) + (set (match_dup 4) (and:X (match_dup 4) (match_dup 8))) + (set (match_dup 5) (match_dup 9)) + (set (pc) (if_then_else (any_eq (match_dup 4) (match_dup 5)) + (label_ref (match_dup 0)) (pc)))] +{ + HOST_WIDE_INT mask1 = INTVAL (operands[2]); + HOST_WIDE_INT mask2 = INTVAL (operands[3]); + int trailing_shift = COMMON_TRAILING_ZEROS (mask1, mask2); + + operands[7] = GEN_INT (trailing_shift); + operands[8] = GEN_INT (mask1 >> trailing_shift); + operands[9] = GEN_INT (mask2 >> trailing_shift); +} +[(set_attr "type" "branch")]) + (define_insn_and_split "*branch<ANYI:mode>_shiftedmask_equals_zero" [(set (pc) (if_then_else (match_operator 1 "equality_operator" @@ -4358,11 +4401,22 @@ (define_expand "usadd<mode>3" [(match_operand:ANYI 0 "register_operand") + (match_operand:ANYI 1 "reg_or_int_operand") + (match_operand:ANYI 2 "reg_or_int_operand")] + "" + { + riscv_expand_usadd (operands[0], operands[1], operands[2]); + DONE; + } +) + +(define_expand "ssadd<mode>3" + [(match_operand:ANYI 0 "register_operand") (match_operand:ANYI 1 "register_operand") (match_operand:ANYI 2 "register_operand")] "" { - riscv_expand_usadd (operands[0], operands[1], operands[2]); + riscv_expand_ssadd (operands[0], operands[1], operands[2]); DONE; } ) @@ -4370,7 +4424,7 @@ (define_expand "ussub<mode>3" 
[(match_operand:ANYI 0 "register_operand") (match_operand:ANYI 1 "reg_or_int_operand") - (match_operand:ANYI 2 "register_operand")] + (match_operand:ANYI 2 "reg_or_int_operand")] "" { riscv_expand_ussub (operands[0], operands[1], operands[2]); @@ -4378,6 +4432,17 @@ } ) +(define_expand "sssub<mode>3" + [(match_operand:ANYI 0 "register_operand") + (match_operand:ANYI 1 "register_operand") + (match_operand:ANYI 2 "register_operand")] + "" + { + riscv_expand_sssub (operands[0], operands[1], operands[2]); + DONE; + } +) + (define_expand "ustrunc<mode><anyi_double_truncated>2" [(match_operand:<ANYI_DOUBLE_TRUNCATED> 0 "register_operand") (match_operand:ANYI_DOUBLE_TRUNC 1 "register_operand")] @@ -4388,6 +4453,16 @@ } ) +(define_expand "sstrunc<mode><anyi_double_truncated>2" + [(match_operand:<ANYI_DOUBLE_TRUNCATED> 0 "register_operand") + (match_operand:ANYI_DOUBLE_TRUNC 1 "register_operand")] + "" + { + riscv_expand_sstrunc (operands[0], operands[1]); + DONE; + } +) + (define_expand "ustrunc<mode><anyi_quad_truncated>2" [(match_operand:<ANYI_QUAD_TRUNCATED> 0 "register_operand") (match_operand:ANYI_QUAD_TRUNC 1 "register_operand")] @@ -4398,6 +4473,16 @@ } ) +(define_expand "sstrunc<mode><anyi_quad_truncated>2" + [(match_operand:<ANYI_QUAD_TRUNCATED> 0 "register_operand") + (match_operand:ANYI_QUAD_TRUNC 1 "register_operand")] + "" + { + riscv_expand_sstrunc (operands[0], operands[1]); + DONE; + } +) + (define_expand "ustrunc<mode><anyi_oct_truncated>2" [(match_operand:<ANYI_OCT_TRUNCATED> 0 "register_operand") (match_operand:ANYI_OCT_TRUNC 1 "register_operand")] @@ -4408,6 +4493,16 @@ } ) +(define_expand "sstrunc<mode><anyi_oct_truncated>2" + [(match_operand:<ANYI_OCT_TRUNCATED> 0 "register_operand") + (match_operand:ANYI_OCT_TRUNC 1 "register_operand")] + "" + { + riscv_expand_sstrunc (operands[0], operands[1]); + DONE; + } +) + ;; These are forms of (x << C1) + C2, potentially canonicalized from ;; ((x + C2') << C1. Depending on the cost to load C2 vs C2' we may ;; want to go ahead and recognize this form as C2 may be cheaper to diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt index a8758ab..5bc5d30 100644 --- a/gcc/config/riscv/riscv.opt +++ b/gcc/config/riscv/riscv.opt @@ -34,8 +34,8 @@ Target RejectNegative Joined UInteger Var(riscv_branch_cost) -mbranch-cost=N Set the cost of branches to roughly N instructions. mplt -Target Var(TARGET_PLT) Init(1) -When generating -fpic code, allow the use of PLTs. Ignored for fno-pic. +Target Alias(fplt) +This option is deprecated; use -fplt or -fno-plt instead. mabi= Target RejectNegative Joined Enum(abi_type) Var(riscv_abi) Init(ABI_ILP32) Negative(mabi=) @@ -658,3 +658,7 @@ Specify TLS dialect. mfence-tso Target Var(TARGET_FENCE_TSO) Init(1) Specifies whether the fence.tso instruction should be used. + +mautovec-segment +Target Integer Var(riscv_mautovec_segment) Init(1) +Enable (default) or disable generation of vector segment load/store instructions. diff --git a/gcc/config/riscv/riscv_cmo.h b/gcc/config/riscv/riscv_cmo.h new file mode 100644 index 0000000..3514fd3 --- /dev/null +++ b/gcc/config/riscv/riscv_cmo.h @@ -0,0 +1,84 @@ +/* RISC-V CMO Extension intrinsics include file. + Copyright (C) 2024 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. 
+ + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef __RISCV_CMO_H +#define __RISCV_CMO_H + +#if defined (__riscv_zicbom) + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__riscv_cmo_clean (void *addr) +{ + __builtin_riscv_zicbom_cbo_clean (addr); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__riscv_cmo_flush (void *addr) +{ + __builtin_riscv_zicbom_cbo_flush (addr); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__riscv_cmo_inval (void *addr) +{ + __builtin_riscv_zicbom_cbo_inval (addr); +} + +#endif // __riscv_zicbom + +#if defined (__riscv_zicbop) + +# define rnum 1 + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__riscv_cmo_prefetch (void *addr, const int vs1, const int vs2) +{ + __builtin_prefetch (addr,vs1,vs2); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__riscv_cmo_prefetchi () +{ + return __builtin_riscv_zicbop_cbo_prefetchi (rnum); +} + +#endif // __riscv_zicbop + +#if defined (__riscv_zicboz) + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +__riscv_cmo_zero (void *addr) +{ + __builtin_riscv_zicboz_cbo_zero (addr); +} + +#endif // __riscv_zicboz + +#endif // __RISCV_CMO_H diff --git a/gcc/config/riscv/thead.cc b/gcc/config/riscv/thead.cc index 2f1d83f..707d910 100644 --- a/gcc/config/riscv/thead.cc +++ b/gcc/config/riscv/thead.cc @@ -960,11 +960,11 @@ th_asm_output_opcode (FILE *asm_out_file, const char *p) if (strstr (p, "zero,zero")) return "th.vsetvli\tzero,zero,e%0,%m1"; else - return "th.vsetvli\tzero,%0,e%1,%m2"; + return "th.vsetvli\tzero,%z0,e%1,%m2"; } else { - return "th.vsetvli\t%0,%1,e%2,%m3"; + return "th.vsetvli\t%z0,%z1,e%2,%m3"; } } diff --git a/gcc/config/riscv/thead.md b/gcc/config/riscv/thead.md index 2a3af76..7a76cc8 100644 --- a/gcc/config/riscv/thead.md +++ b/gcc/config/riscv/thead.md @@ -85,7 +85,9 @@ (zero_extract:GPR (match_operand:GPR 1 "register_operand" "r") (match_operand 2 "const_int_operand") (match_operand 3 "const_int_operand")))] - "TARGET_XTHEADBB" + "TARGET_XTHEADBB + && (UINTVAL (operands[2]) + UINTVAL (operands[3]) + <= GET_MODE_BITSIZE (<MODE>mode))" { operands[2] = GEN_INT (INTVAL (operands[2]) + INTVAL (operands[3]) - 1); return "th.extu\t%0,%1,%2,%3"; diff --git a/gcc/config/riscv/vector-crypto.md b/gcc/config/riscv/vector-crypto.md index db372be..db372be 100755..100644 --- a/gcc/config/riscv/vector-crypto.md +++ b/gcc/config/riscv/vector-crypto.md diff --git a/gcc/config/riscv/vector-iterators.md b/gcc/config/riscv/vector-iterators.md index cbbd248..43325d1 100644 --- a/gcc/config/riscv/vector-iterators.md +++ b/gcc/config/riscv/vector-iterators.md @@ -4126,3 +4126,389 @@ (define_mode_attr VSIX16 [ (RVVMF2SI "RVVM8SI") ]) + 
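Since the new riscv_cmo.h above is only a thin wrapper around existing built-ins, a usage sketch is short. It assumes a compilation where the relevant extension macros (__riscv_zicbom, __riscv_zicboz) are defined, e.g. with -march=rv64gc_zicbom_zicboz.

#include <riscv_cmo.h>

/* Write back, then zero, the cache block containing ADDR
   (illustrative only; the cache-block size is implementation defined).  */
void
clean_and_zero (void *addr)
{
#if defined (__riscv_zicbom)
  __riscv_cmo_clean (addr);   /* wraps __builtin_riscv_zicbom_cbo_clean */
#endif
#if defined (__riscv_zicboz)
  __riscv_cmo_zero (addr);    /* wraps __builtin_riscv_zicboz_cbo_zero */
#endif
}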
+(define_mode_iterator VLS_HAS_HALF [ + (V2QI "riscv_vector::vls_mode_valid_p (V2QImode)") + (V4QI "riscv_vector::vls_mode_valid_p (V4QImode)") + (V8QI "riscv_vector::vls_mode_valid_p (V8QImode)") + (V16QI "riscv_vector::vls_mode_valid_p (V16QImode)") + (V2HI "riscv_vector::vls_mode_valid_p (V2HImode)") + (V4HI "riscv_vector::vls_mode_valid_p (V4HImode)") + (V8HI "riscv_vector::vls_mode_valid_p (V8HImode)") + (V16HI "riscv_vector::vls_mode_valid_p (V16HImode)") + (V2SI "riscv_vector::vls_mode_valid_p (V2SImode)") + (V4SI "riscv_vector::vls_mode_valid_p (V4SImode)") + (V8SI "riscv_vector::vls_mode_valid_p (V8SImode)") + (V16SI "riscv_vector::vls_mode_valid_p (V16SImode) && TARGET_MIN_VLEN >= 64") + (V2DI "riscv_vector::vls_mode_valid_p (V2DImode) && TARGET_VECTOR_ELEN_64") + (V4DI "riscv_vector::vls_mode_valid_p (V4DImode) && TARGET_VECTOR_ELEN_64") + (V8DI "riscv_vector::vls_mode_valid_p (V8DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 64") + (V16DI "riscv_vector::vls_mode_valid_p (V16DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 128") + (V2SF "riscv_vector::vls_mode_valid_p (V2SFmode) && TARGET_VECTOR_ELEN_FP_32") + (V4SF "riscv_vector::vls_mode_valid_p (V4SFmode) && TARGET_VECTOR_ELEN_FP_32") + (V8SF "riscv_vector::vls_mode_valid_p (V8SFmode) && TARGET_VECTOR_ELEN_FP_32") + (V16SF "riscv_vector::vls_mode_valid_p (V16SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 64") + (V2DF "riscv_vector::vls_mode_valid_p (V2DFmode) && TARGET_VECTOR_ELEN_FP_64") + (V4DF "riscv_vector::vls_mode_valid_p (V4DFmode) && TARGET_VECTOR_ELEN_FP_64") + (V8DF "riscv_vector::vls_mode_valid_p (V8DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 64") + (V16DF "riscv_vector::vls_mode_valid_p (V16DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 128") + (V32QI "riscv_vector::vls_mode_valid_p (V32QImode)") + (V64QI "riscv_vector::vls_mode_valid_p (V64QImode) && TARGET_MIN_VLEN >= 64") + (V128QI "riscv_vector::vls_mode_valid_p (V128QImode) && TARGET_MIN_VLEN >= 128") + (V256QI "riscv_vector::vls_mode_valid_p (V256QImode) && TARGET_MIN_VLEN >= 256") + (V512QI "riscv_vector::vls_mode_valid_p (V512QImode) && TARGET_MIN_VLEN >= 512") + (V1024QI "riscv_vector::vls_mode_valid_p (V1024QImode) && TARGET_MIN_VLEN >= 1024") + (V2048QI "riscv_vector::vls_mode_valid_p (V2048QImode) && TARGET_MIN_VLEN >= 2048") + (V4096QI "riscv_vector::vls_mode_valid_p (V4096QImode) && TARGET_MIN_VLEN >= 4096") + (V32HI "riscv_vector::vls_mode_valid_p (V32HImode) && TARGET_MIN_VLEN >= 64") + (V64HI "riscv_vector::vls_mode_valid_p (V64HImode) && TARGET_MIN_VLEN >= 128") + (V128HI "riscv_vector::vls_mode_valid_p (V128HImode) && TARGET_MIN_VLEN >= 256") + (V256HI "riscv_vector::vls_mode_valid_p (V256HImode) && TARGET_MIN_VLEN >= 512") + (V512HI "riscv_vector::vls_mode_valid_p (V512HImode) && TARGET_MIN_VLEN >= 1024") + (V1024HI "riscv_vector::vls_mode_valid_p (V1024HImode) && TARGET_MIN_VLEN >= 2048") + (V2048HI "riscv_vector::vls_mode_valid_p (V2048HImode) && TARGET_MIN_VLEN >= 4096") + (V32SI "riscv_vector::vls_mode_valid_p (V32SImode) && TARGET_MIN_VLEN >= 128") + (V64SI "riscv_vector::vls_mode_valid_p (V64SImode) && TARGET_MIN_VLEN >= 256") + (V128SI "riscv_vector::vls_mode_valid_p (V128SImode) && TARGET_MIN_VLEN >= 512") + (V256SI "riscv_vector::vls_mode_valid_p (V256SImode) && TARGET_MIN_VLEN >= 1024") + (V512SI "riscv_vector::vls_mode_valid_p (V512SImode) && TARGET_MIN_VLEN >= 2048") + (V1024SI "riscv_vector::vls_mode_valid_p (V1024SImode) && TARGET_MIN_VLEN >= 4096") + (V32DI 
"riscv_vector::vls_mode_valid_p (V32DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 256") + (V64DI "riscv_vector::vls_mode_valid_p (V64DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 512") + (V128DI "riscv_vector::vls_mode_valid_p (V128DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 1024") + (V256DI "riscv_vector::vls_mode_valid_p (V256DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 2048") + (V512DI "riscv_vector::vls_mode_valid_p (V512DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 4096") + (V32SF "riscv_vector::vls_mode_valid_p (V32SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 128") + (V64SF "riscv_vector::vls_mode_valid_p (V64SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 256") + (V128SF "riscv_vector::vls_mode_valid_p (V128SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 512") + (V256SF "riscv_vector::vls_mode_valid_p (V256SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 1024") + (V512SF "riscv_vector::vls_mode_valid_p (V512SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 2048") + (V1024SF "riscv_vector::vls_mode_valid_p (V1024SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 4096") + (V32DF "riscv_vector::vls_mode_valid_p (V32DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 256") + (V64DF "riscv_vector::vls_mode_valid_p (V64DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 512") + (V128DF "riscv_vector::vls_mode_valid_p (V128DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 1024") + (V256DF "riscv_vector::vls_mode_valid_p (V256DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 2048") + (V512DF "riscv_vector::vls_mode_valid_p (V512DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 4096") +]) + +(define_mode_attr VLS_HALF [ + (V2QI "V1QI") + (V4QI "V2QI") + (V8QI "V4QI") + (V16QI "V8QI") + (V32QI "V16QI") + (V64QI "V32QI") + (V128QI "V64QI") + (V256QI "V128QI") + (V512QI "V256QI") + (V1024QI "V512QI") + (V2048QI "V1024QI") + (V4096QI "V2048QI") + + (V2HI "V1HI") + (V4HI "V2HI") + (V8HI "V4HI") + (V16HI "V8HI") + (V32HI "V16HI") + (V64HI "V32HI") + (V128HI "V64HI") + (V256HI "V128HI") + (V512HI "V256HI") + (V1024HI "V512HI") + (V2048HI "V1024HI") + + (V2SI "V1SI") + (V4SI "V2SI") + (V8SI "V4SI") + (V16SI "V8SI") + (V32SI "V16SI") + (V64SI "V32SI") + (V128SI "V64SI") + (V256SI "V128SI") + (V512SI "V256SI") + (V1024SI "V512SI") + + (V2DI "V1DI") + (V4DI "V2DI") + (V8DI "V4DI") + (V16DI "V8DI") + (V32DI "V16DI") + (V64DI "V32DI") + (V128DI "V64DI") + (V256DI "V128DI") + (V512DI "V256DI") + + (V2SF "V1SF") + (V4SF "V2SF") + (V8SF "V4SF") + (V16SF "V8SF") + (V32SF "V16SF") + (V64SF "V32SF") + (V128SF "V64SF") + (V256SF "V128SF") + (V512SF "V256SF") + (V1024SF "V512SF") + + (V2DF "V1DF") + (V4DF "V2DF") + (V8DF "V4DF") + (V16DF "V8DF") + (V32DF "V16DF") + (V64DF "V32DF") + (V128DF "V64DF") + (V256DF "V128DF") + (V512DF "V256DF") +]) + +(define_mode_attr vls_half [ + (V2QI "v1qi") + (V4QI "v2qi") + (V8QI "v4qi") + (V16QI "v8qi") + (V32QI "v16qi") + (V64QI "v32qi") + (V128QI "v64qi") + (V256QI "v128qi") + (V512QI "v256qi") + (V1024QI "v512qi") + (V2048QI "v1024qi") + (V4096QI "v2048qi") + + (V2HI "v1hi") + (V4HI "v2hi") + (V8HI "v4hi") + (V16HI "v8hi") + (V32HI "v16hi") + (V64HI "v32hi") + (V128HI "v64hi") + (V256HI "v128hi") + (V512HI "v256hi") + (V1024HI "v512hi") + (V2048HI "v1024hi") + + (V2SI "v1si") + (V4SI "v2si") + (V8SI "v4si") + (V16SI "v8si") + (V32SI "v16si") + (V64SI "v32si") + (V128SI "v64si") + (V256SI "v128si") + (V512SI "v256si") + (V1024SI 
"v512si") + + (V2DI "v1di") + (V4DI "v2di") + (V8DI "v4di") + (V16DI "v8di") + (V32DI "v16di") + (V64DI "v32di") + (V128DI "v64di") + (V256DI "v128di") + (V512DI "v256di") + + (V2SF "v1sf") + (V4SF "v2sf") + (V8SF "v4sf") + (V16SF "v8sf") + (V32SF "v16sf") + (V64SF "v32sf") + (V128SF "v64sf") + (V256SF "v128sf") + (V512SF "v256sf") + (V1024SF "v512sf") + + (V2DF "v1df") + (V4DF "v2df") + (V8DF "v4df") + (V16DF "v8df") + (V32DF "v16df") + (V64DF "v32df") + (V128DF "v64df") + (V256DF "v128df") + (V512DF "v256df") +]) + +(define_mode_iterator VLS_HAS_QUARTER [ + (V4QI "riscv_vector::vls_mode_valid_p (V4QImode)") + (V8QI "riscv_vector::vls_mode_valid_p (V8QImode)") + (V16QI "riscv_vector::vls_mode_valid_p (V16QImode)") + (V4HI "riscv_vector::vls_mode_valid_p (V4HImode)") + (V8HI "riscv_vector::vls_mode_valid_p (V8HImode)") + (V16HI "riscv_vector::vls_mode_valid_p (V16HImode)") + (V4SI "riscv_vector::vls_mode_valid_p (V4SImode)") + (V8SI "riscv_vector::vls_mode_valid_p (V8SImode)") + (V16SI "riscv_vector::vls_mode_valid_p (V16SImode) && TARGET_MIN_VLEN >= 64") + (V4DI "riscv_vector::vls_mode_valid_p (V4DImode) && TARGET_VECTOR_ELEN_64") + (V8DI "riscv_vector::vls_mode_valid_p (V8DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 64") + (V16DI "riscv_vector::vls_mode_valid_p (V16DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 128") + (V4SF "riscv_vector::vls_mode_valid_p (V4SFmode) && TARGET_VECTOR_ELEN_FP_32") + (V8SF "riscv_vector::vls_mode_valid_p (V8SFmode) && TARGET_VECTOR_ELEN_FP_32") + (V16SF "riscv_vector::vls_mode_valid_p (V16SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 64") + (V4DF "riscv_vector::vls_mode_valid_p (V4DFmode) && TARGET_VECTOR_ELEN_FP_64") + (V8DF "riscv_vector::vls_mode_valid_p (V8DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 64") + (V16DF "riscv_vector::vls_mode_valid_p (V16DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 128") + (V32QI "riscv_vector::vls_mode_valid_p (V32QImode)") + (V64QI "riscv_vector::vls_mode_valid_p (V64QImode) && TARGET_MIN_VLEN >= 64") + (V128QI "riscv_vector::vls_mode_valid_p (V128QImode) && TARGET_MIN_VLEN >= 128") + (V256QI "riscv_vector::vls_mode_valid_p (V256QImode) && TARGET_MIN_VLEN >= 256") + (V512QI "riscv_vector::vls_mode_valid_p (V512QImode) && TARGET_MIN_VLEN >= 512") + (V1024QI "riscv_vector::vls_mode_valid_p (V1024QImode) && TARGET_MIN_VLEN >= 1024") + (V2048QI "riscv_vector::vls_mode_valid_p (V2048QImode) && TARGET_MIN_VLEN >= 2048") + (V4096QI "riscv_vector::vls_mode_valid_p (V4096QImode) && TARGET_MIN_VLEN >= 4096") + (V32HI "riscv_vector::vls_mode_valid_p (V32HImode) && TARGET_MIN_VLEN >= 64") + (V64HI "riscv_vector::vls_mode_valid_p (V64HImode) && TARGET_MIN_VLEN >= 128") + (V128HI "riscv_vector::vls_mode_valid_p (V128HImode) && TARGET_MIN_VLEN >= 256") + (V256HI "riscv_vector::vls_mode_valid_p (V256HImode) && TARGET_MIN_VLEN >= 512") + (V512HI "riscv_vector::vls_mode_valid_p (V512HImode) && TARGET_MIN_VLEN >= 1024") + (V1024HI "riscv_vector::vls_mode_valid_p (V1024HImode) && TARGET_MIN_VLEN >= 2048") + (V2048HI "riscv_vector::vls_mode_valid_p (V2048HImode) && TARGET_MIN_VLEN >= 4096") + (V32SI "riscv_vector::vls_mode_valid_p (V32SImode) && TARGET_MIN_VLEN >= 128") + (V64SI "riscv_vector::vls_mode_valid_p (V64SImode) && TARGET_MIN_VLEN >= 256") + (V128SI "riscv_vector::vls_mode_valid_p (V128SImode) && TARGET_MIN_VLEN >= 512") + (V256SI "riscv_vector::vls_mode_valid_p (V256SImode) && TARGET_MIN_VLEN >= 1024") + (V512SI "riscv_vector::vls_mode_valid_p (V512SImode) && TARGET_MIN_VLEN 
>= 2048") + (V1024SI "riscv_vector::vls_mode_valid_p (V1024SImode) && TARGET_MIN_VLEN >= 4096") + (V32DI "riscv_vector::vls_mode_valid_p (V32DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 256") + (V64DI "riscv_vector::vls_mode_valid_p (V64DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 512") + (V128DI "riscv_vector::vls_mode_valid_p (V128DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 1024") + (V256DI "riscv_vector::vls_mode_valid_p (V256DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 2048") + (V512DI "riscv_vector::vls_mode_valid_p (V512DImode) && TARGET_VECTOR_ELEN_64 && TARGET_MIN_VLEN >= 4096") + (V32SF "riscv_vector::vls_mode_valid_p (V32SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 128") + (V64SF "riscv_vector::vls_mode_valid_p (V64SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 256") + (V128SF "riscv_vector::vls_mode_valid_p (V128SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 512") + (V256SF "riscv_vector::vls_mode_valid_p (V256SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 1024") + (V512SF "riscv_vector::vls_mode_valid_p (V512SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 2048") + (V1024SF "riscv_vector::vls_mode_valid_p (V1024SFmode) && TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 4096") + (V32DF "riscv_vector::vls_mode_valid_p (V32DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 256") + (V64DF "riscv_vector::vls_mode_valid_p (V64DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 512") + (V128DF "riscv_vector::vls_mode_valid_p (V128DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 1024") + (V256DF "riscv_vector::vls_mode_valid_p (V256DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 2048") + (V512DF "riscv_vector::vls_mode_valid_p (V512DFmode) && TARGET_VECTOR_ELEN_FP_64 && TARGET_MIN_VLEN >= 4096") +]) + +(define_mode_attr VLS_QUARTER [ + (V4QI "V1QI") + (V8QI "V2QI") + (V16QI "V4QI") + (V32QI "V8QI") + (V64QI "V16QI") + (V128QI "V32QI") + (V256QI "V64QI") + (V512QI "V128QI") + (V1024QI "V256QI") + (V2048QI "V512QI") + (V4096QI "V1024QI") + + (V4HI "V1HI") + (V8HI "V2HI") + (V16HI "V4HI") + (V32HI "V8HI") + (V64HI "V16HI") + (V128HI "V32HI") + (V256HI "V64HI") + (V512HI "V128HI") + (V1024HI "V256HI") + (V2048HI "V512HI") + + (V4SI "V1SI") + (V8SI "V2SI") + (V16SI "V4SI") + (V32SI "V8SI") + (V64SI "V16SI") + (V128SI "V32SI") + (V256SI "V64SI") + (V512SI "V128SI") + (V1024SI "V256SI") + + (V4DI "V1DI") + (V8DI "V2DI") + (V16DI "V4DI") + (V32DI "V8DI") + (V64DI "V16DI") + (V128DI "V32DI") + (V256DI "V64DI") + (V512DI "V128DI") + + (V4SF "V1SF") + (V8SF "V2SF") + (V16SF "V4SF") + (V32SF "V8SF") + (V64SF "V16SF") + (V128SF "V32SF") + (V256SF "V64SF") + (V512SF "V128SF") + (V1024SF "V256SF") + + (V4DF "V1DF") + (V8DF "V2DF") + (V16DF "V4DF") + (V32DF "V8DF") + (V64DF "V16DF") + (V128DF "V32DF") + (V256DF "V64DF") + (V512DF "V128DF") +]) + +(define_mode_attr vls_quarter [ + (V4QI "v1qi") + (V8QI "v2qi") + (V16QI "v4qi") + (V32QI "v8qi") + (V64QI "v16qi") + (V128QI "v32qi") + (V256QI "v64qi") + (V512QI "v128qi") + (V1024QI "v256qi") + (V2048QI "v512qi") + (V4096QI "v1024qi") + + (V4HI "v1hi") + (V8HI "v2hi") + (V16HI "v4hi") + (V32HI "v8hi") + (V64HI "v16hi") + (V128HI "v32hi") + (V256HI "v64hi") + (V512HI "v128hi") + (V1024HI "v256hi") + (V2048HI "v512hi") + + (V4SI "v1si") + (V8SI "v2si") + (V16SI "v4si") + (V32SI "v8si") + (V64SI "v16si") + (V128SI "v32si") + (V256SI "v64si") + (V512SI "v128si") + (V1024SI "v256si") + + (V4DI "v1di") + (V8DI "v2di") + (V16DI "v4di") + 
(V32DI "v8di") + (V64DI "v16di") + (V128DI "v32di") + (V256DI "v64di") + (V512DI "v128di") + + (V4SF "v1sf") + (V8SF "v2sf") + (V16SF "v4sf") + (V32SF "v8sf") + (V64SF "v16sf") + (V128SF "v32sf") + (V256SF "v64sf") + (V512SF "v128sf") + (V1024SF "v256sf") + + (V4DF "v1df") + (V8DF "v2df") + (V16DF "v4df") + (V32DF "v8df") + (V64DF "v16df") + (V128DF "v32df") + (V256DF "v64df") + (V512DF "v128df") +]) diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index 6667193..a21288f 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -816,7 +816,7 @@ vfcmp,vfminmax,vfsgnj,vfclass,vfmerge,vfmov,\ vfcvtitof,vfncvtitof,vfncvtftoi,vfncvtftof,vmalu,vmiota,vmidx,\ vimovxv,vfmovfv,vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down,\ - vgather,vcompress,vmov,vnclip,vnshift,vandn,vcpop,vclz,vctz") + vgather,vcompress,vmov,vnclip,vnshift,vandn,vcpop,vclz,vctz,vrol,vror") (const_int 0) (eq_attr "type" "vimovvx,vfmovvf") @@ -2095,6 +2095,16 @@ emit_move_insn (tmp, gen_int_mode (value, Pmode)); operands[3] = gen_rtx_SIGN_EXTEND (<VEL>mode, tmp); } + /* Never load (const_int 0) into a register, that's silly. */ + else if (operands[3] == CONST0_RTX (<VEL>mode)) + ; + /* If we're broadcasting [-16..15] across more than just + element 0, then we can use vmv.v.i directly, thus avoiding + the load of the constant into a GPR. */ + else if (CONST_INT_P (operands[3]) + && IN_RANGE (INTVAL (operands[3]), -16, 15) + && !satisfies_constraint_Wb1 (operands[1])) + ; else operands[3] = force_reg (<VEL>mode, operands[3]); }) @@ -2111,18 +2121,18 @@ (reg:SI VL_REGNUM) (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) (vec_duplicate:V_VLSI - (match_operand:<VEL> 3 "direct_broadcast_operand" " r, r,Wdm,Wdm,Wdm,Wdm, r, r")) - (match_operand:V_VLSI 2 "vector_merge_operand" "vu, 0, vu, 0, vu, 0, vu, 0")))] + (match_operand:<VEL> 3 "direct_broadcast_operand" "rP,rP,Wdm,Wdm,Wdm,Wdm, rJ, rJ")) + (match_operand:V_VLSI 2 "vector_merge_operand" "vu, 0, vu, 0, vu, 0, vu, 0")))] "TARGET_VECTOR" "@ - vmv.v.x\t%0,%3 - vmv.v.x\t%0,%3 + vmv.v.%o3\t%0,%3 + vmv.v.%o3\t%0,%3 vlse<sew>.v\t%0,%3,zero,%1.t vlse<sew>.v\t%0,%3,zero,%1.t vlse<sew>.v\t%0,%3,zero vlse<sew>.v\t%0,%3,zero - vmv.s.x\t%0,%3 - vmv.s.x\t%0,%3" + vmv.s.x\t%0,%z3 + vmv.s.x\t%0,%z3" "(register_operand (operands[3], <VEL>mode) || CONST_POLY_INT_P (operands[3])) && GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)" @@ -4400,10 +4410,10 @@ (sat_int_minus_binop:VI_D (match_operand:VI_D 3 "register_operand" " vr, vr, vr, vr") (vec_duplicate:VI_D - (match_operand:<VEL> 4 "register_operand" " r, r, r, r"))) + (match_operand:<VEL> 4 "reg_or_0_operand" " rJ, rJ, rJ, rJ"))) (match_operand:VI_D 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR" - "v<insn>.vx\t%0,%3,%4%p1" + "v<insn>.vx\t%0,%3,%z4%p1" [(set_attr "type" "<int_binop_insn_type>") (set_attr "mode" "<MODE>")]) @@ -4422,10 +4432,10 @@ (match_operand:VI_D 3 "register_operand" " vr, vr, vr, vr") (vec_duplicate:VI_D (sign_extend:<VEL> - (match_operand:<VSUBEL> 4 "register_operand" " r, r, r, r")))) + (match_operand:<VSUBEL> 4 "reg_or_0_operand" " rJ, rJ, rJ, rJ")))) (match_operand:VI_D 2 "vector_merge_operand" " vu, 0, vu, 0")))] "TARGET_VECTOR && !TARGET_64BIT" - "v<insn>.vx\t%0,%3,%4%p1" + "v<insn>.vx\t%0,%3,%z4%p1" [(set_attr "type" "<int_binop_insn_type>") (set_attr "mode" "<MODE>")]) diff --git a/gcc/config/riscv/xiangshan.md b/gcc/config/riscv/xiangshan.md index 76539d3..eb83bbf 100644 --- a/gcc/config/riscv/xiangshan.md +++ b/gcc/config/riscv/xiangshan.md @@ 
-70,12 +70,17 @@ (define_insn_reservation "xiangshan_jump" 1 (and (eq_attr "tune" "xiangshan") - (eq_attr "type" "jump,call,auipc,unknown,branch,jalr,ret,sfb_alu")) + (eq_attr "type" "jump,call,auipc,unknown,branch,jalr,ret,sfb_alu,trap")) "xs_jmp_rs") (define_insn_reservation "xiangshan_i2f" 3 (and (eq_attr "tune" "xiangshan") - (eq_attr "type" "mtc")) + (eq_attr "type" "mtc,fcvt_i2f")) + "xs_jmp_rs") + +(define_insn_reservation "xiangshan_atomic" 1 + (and (eq_attr "tune" "xiangshan") + (eq_attr "type" "atomic")) "xs_jmp_rs") (define_insn_reservation "xiangshan_mul" 3 @@ -115,7 +120,7 @@ (define_insn_reservation "xiangshan_f2f" 3 (and (eq_attr "tune" "xiangshan") - (eq_attr "type" "fcvt,fmove")) + (eq_attr "type" "fcvt,fcvt_f2i,fmove")) "xs_fmisc_rs") (define_insn_reservation "xiangshan_f2i" 3 diff --git a/gcc/config/riscv/zc.md b/gcc/config/riscv/zc.md index 5b948b4..6dc47da 100644 --- a/gcc/config/riscv/zc.md +++ b/gcc/config/riscv/zc.md @@ -1442,7 +1442,7 @@ (match_operand:X 3 "zcmp_mv_sreg_operand" "r"))] "TARGET_ZCMP && (REGNO (operands[2]) != REGNO (operands[0]))" - { return (REGNO (operands[0]) == A0_REGNUM)?"cm.mva01s\t%1,%3":"cm.mva01s\t%3,%1"; } + { return (REGNO (operands[0]) == A0_REGNUM) ? "cm.mva01s\t%1,%3" : "cm.mva01s\t%3,%1"; } [(set_attr "mode" "<X:MODE>") (set_attr "type" "mvpair")]) @@ -1454,6 +1454,6 @@ "TARGET_ZCMP && (REGNO (operands[0]) != REGNO (operands[2])) && (REGNO (operands[1]) != REGNO (operands[3]))" - { return (REGNO (operands[1]) == A0_REGNUM)?"cm.mvsa01\t%0,%2":"cm.mvsa01\t%2,%0"; } + { return (REGNO (operands[1]) == A0_REGNUM) ? "cm.mvsa01\t%0,%2" : "cm.mvsa01\t%2,%0"; } [(set_attr "mode" "<X:MODE>") (set_attr "type" "mvpair")]) diff --git a/gcc/config/riscv/zicond.md b/gcc/config/riscv/zicond.md index 3876be7..ab1a533 100644 --- a/gcc/config/riscv/zicond.md +++ b/gcc/config/riscv/zicond.md @@ -124,3 +124,115 @@ { operands[2] = GEN_INT (1 << UINTVAL(operands[2])); }) + +;; In some cases gimple can give us a sequence with a logical and +;; of two sCC insns. This can be implemented an sCC feeding a +;; conditional zero. +(define_split + [(set (match_operand:X 0 "register_operand") + (and:X (ne:X (match_operand:X 1 "register_operand") (const_int 0)) + (scc_0:X (match_operand:X 2 "register_operand") + (match_operand:X 3 "reg_or_0_operand")))) + (clobber (match_operand:X 4 "register_operand"))] + "TARGET_ZICOND_LIKE || TARGET_XTHEADCONDMOV" + [(set (match_dup 4) (scc_0:X (match_dup 2) (match_dup 3))) + (set (match_dup 0) (if_then_else:X (eq:X (match_dup 1) (const_int 0)) + (const_int 0) + (match_dup 4)))]) + +;; Similarly but GE/GEU which requires (const_int 1) as an operand. 
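At the source level, the shape these zicond splits target is the logical AND of a zero test with another set-on-compare result, for example (illustrative; whether a given split fires depends on what combine presents).

/* With Zicond (or XTheadCondMov) this can become a single slt feeding
   czero.eqz instead of two sCC results combined with an AND.  */
int
both_conditions (long a, long b, long c)
{
  return (a != 0) & (b < c);
}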
+(define_split + [(set (match_operand:X 0 "register_operand") + (and:X (ne:X (match_operand:X 1 "register_operand") (const_int 0)) + (any_ge:X (match_operand:X 2 "register_operand") + (const_int 1)))) + (clobber (match_operand:X 3 "register_operand"))] + "TARGET_ZICOND_LIKE || TARGET_XTHEADCONDMOV" + [(set (match_dup 3) (any_ge:X (match_dup 2) (const_int 1))) + (set (match_dup 0) (if_then_else:X (eq:X (match_dup 1) (const_int 0)) + (const_int 0) + (match_dup 3)))]) + +;; Similarly but LU/LTU which allows an arith_operand +(define_split + [(set (match_operand:X 0 "register_operand") + (and:X (ne:X (match_operand:X 1 "register_operand") (const_int 0)) + (any_lt:X (match_operand:X 2 "register_operand") + (match_operand:X 3 "arith_operand")))) + (clobber (match_operand:X 4 "register_operand"))] + "TARGET_ZICOND_LIKE || TARGET_XTHEADCONDMOV" + [(set (match_dup 4) (any_lt:X (match_dup 2) (match_dup 3))) + (set (match_dup 0) (if_then_else:X (eq:X (match_dup 1) (const_int 0)) + (const_int 0) + (match_dup 4)))]) + +;; Finally LE/LEU which requires sle_operand. +(define_split + [(set (match_operand:X 0 "register_operand") + (and:X (ne:X (match_operand:X 1 "register_operand") (const_int 0)) + (any_le:X (match_operand:X 2 "register_operand") + (match_operand:X 3 "sle_operand")))) + (clobber (match_operand:X 4 "register_operand"))] + "TARGET_ZICOND_LIKE || TARGET_XTHEADCONDMOV" + [(set (match_dup 4) (any_le:X (match_dup 2) (match_dup 3))) + (set (match_dup 0) (if_then_else:X (eq:X (match_dup 1) (const_int 0)) + (const_int 0) + (match_dup 4)))]) + + +;; Inverted versions from above. I tried to get this to work with +;; iterators, but didn't have any success disambiguating the code attr +;; for the eq/ne flip we have to do. +(define_split + [(set (match_operand:X 0 "register_operand") + (and:X (eq:X (match_operand:X 1 "register_operand") (const_int 0)) + (scc_0:X (match_operand:X 2 "register_operand") + (match_operand:X 3 "reg_or_0_operand")))) + (clobber (match_operand:X 4 "register_operand"))] + "TARGET_ZICOND_LIKE || TARGET_XTHEADCONDMOV" + [(set (match_dup 4) (scc_0:X (match_dup 2) (match_dup 3))) + (set (match_dup 0) (if_then_else:X (ne:X (match_dup 1) (const_int 0)) + (const_int 0) + (match_dup 4)))]) + +;; Similarly but GE/GEU which requires (const_int 1) as an operand. +(define_split + [(set (match_operand:X 0 "register_operand") + (and:X (eq:X (match_operand:X 1 "register_operand") (const_int 0)) + (any_ge:X (match_operand:X 2 "register_operand") + (const_int 1)))) + (clobber (match_operand:X 3 "register_operand"))] + "TARGET_ZICOND_LIKE || TARGET_XTHEADCONDMOV" + [(set (match_dup 3) (any_ge:X (match_dup 2) (const_int 1))) + (set (match_dup 0) (if_then_else:X (ne:X (match_dup 1) (const_int 0)) + (const_int 0) + (match_dup 3)))]) + +;; Similarly but LU/LTU which allows an arith_operand +(define_split + [(set (match_operand:X 0 "register_operand") + (and:X (eq:X (match_operand:X 1 "register_operand") (const_int 0)) + (any_lt:X (match_operand:X 2 "register_operand") + (match_operand:X 3 "arith_operand")))) + (clobber (match_operand:X 4 "register_operand"))] + "TARGET_ZICOND_LIKE || TARGET_XTHEADCONDMOV" + [(set (match_dup 4) (any_lt:X (match_dup 2) (match_dup 3))) + (set (match_dup 0) (if_then_else:X (ne:X (match_dup 1) (const_int 0)) + (const_int 0) + (match_dup 4)))]) + +;; Finally LE/LEU which requires sle_operand. 
+(define_split + [(set (match_operand:X 0 "register_operand") + (and:X (eq:X (match_operand:X 1 "register_operand") (const_int 0)) + (any_le:X (match_operand:X 2 "register_operand") + (match_operand:X 3 "sle_operand")))) + (clobber (match_operand:X 4 "register_operand"))] + "TARGET_ZICOND_LIKE || TARGET_XTHEADCONDMOV" + [(set (match_dup 4) (any_le:X (match_dup 2) (match_dup 3))) + (set (match_dup 0) (if_then_else:X (ne:X (match_dup 1) (const_int 0)) + (const_int 0) + (match_dup 4)))]) + + + diff --git a/gcc/config/rl78/rl78.cc b/gcc/config/rl78/rl78.cc index 25f6606..8ce9331 100644 --- a/gcc/config/rl78/rl78.cc +++ b/gcc/config/rl78/rl78.cc @@ -1675,7 +1675,7 @@ static void rl78_start_function (FILE *file) { int i; - + add_vector_labels (file, "interrupt"); add_vector_labels (file, "vector"); diff --git a/gcc/config/rs6000/aix.h b/gcc/config/rs6000/aix.h index 03d39b1..7f6c45e 100644 --- a/gcc/config/rs6000/aix.h +++ b/gcc/config/rs6000/aix.h @@ -182,7 +182,7 @@ Don't do this until the fixed IBM assembler is more generally available. When this becomes permanently defined, the ASM_OUTPUT_EXTERNAL, ASM_OUTPUT_EXTERNAL_LIBCALL, and RS6000_OUTPUT_BASENAME macros will no - longer be needed. Also, the extern declaration of mcount in + longer be needed. Also, the extern declaration of mcount in rs6000_xcoff_file_start will no longer be needed. */ /* #define ASM_SPEC "-u %(asm_cpu)" */ diff --git a/gcc/config/rs6000/aix71.h b/gcc/config/rs6000/aix71.h index 41037b3..4350dcd 100644 --- a/gcc/config/rs6000/aix71.h +++ b/gcc/config/rs6000/aix71.h @@ -125,7 +125,7 @@ do { \ %{mpe: -I%R/usr/lpp/ppe.poe/include} \ %{pthread: -D_THREAD_SAFE}" -/* The GNU C++ standard library requires that these macros be +/* The GNU C++ standard library requires that these macros be defined. Synchronize with libstdc++ os_defines.h. */ #define CPLUSPLUS_CPP_SPEC_COMMON \ "-D_ALL_SOURCE -D__COMPATMATH__ \ @@ -257,7 +257,7 @@ do { \ #define LD_INIT_SWITCH "-binitfini" #ifndef _AIX52 -extern long long int atoll(const char *); +extern long long int atoll(const char *); #endif /* This target uses the aix64.opt file. */ diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h index c9f9486..dcf0f28 100644 --- a/gcc/config/rs6000/altivec.h +++ b/gcc/config/rs6000/altivec.h @@ -35,7 +35,7 @@ #endif /* If __APPLE_ALTIVEC__ is defined, the compiler supports 'vector', - 'pixel' and 'bool' as context-sensitive AltiVec keywords (in + 'pixel' and 'bool' as context-sensitive AltiVec keywords (in non-AltiVec contexts, they revert to their original meanings, if any), so we do not need to define them as macros. Also, avoid defining them as macros for C++ with strict ANSI, as diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 1f5489b..00dad4b 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -170,6 +170,7 @@ UNSPEC_VSTRIL UNSPEC_SLDB UNSPEC_SRDB + UNSPEC_VECTOR_SHIFT ]) (define_c_enum "unspecv" @@ -2176,6 +2177,56 @@ "vsro %0,%1,%2" [(set_attr "type" "vecperm")]) +;; Optimize V2DI shifts by constants. This relies on the shift instructions +;; only looking at the bits needed to do the shift. This means we can use +;; VSPLTISW or XXSPLTIB to load up the constant, and not worry about the bits +;; that the vector shift instructions will not use. 
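In source terms, the shift patterns introduced below cover constant-count vector shifts such as the following (illustrative; per the VSHIFT_MODE iterator this needs at least a power8-class target for V2DI and power9 for V4SI).

/* Doubleword vector shift by a small constant; the splatted shift count
   can now come from vspltisw/xxspltib instead of a constant-pool load.  */
typedef unsigned long long v2di __attribute__ ((vector_size (16)));

v2di
shift_left_3 (v2di x)
{
  return x << 3;
}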
+(define_mode_iterator VSHIFT_MODE [(V4SI "TARGET_P9_VECTOR") + (V2DI "TARGET_P8_VECTOR")]) + +(define_code_iterator vshift_code [ashift ashiftrt lshiftrt]) +(define_code_attr vshift_attr [(ashift "ashift") + (ashiftrt "ashiftrt") + (lshiftrt "lshiftrt")]) + +(define_insn_and_split "*altivec_<mode>_<vshift_attr>_const" + [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v") + (vshift_code:VSHIFT_MODE + (match_operand:VSHIFT_MODE 1 "register_operand" "v") + (match_operand:VSHIFT_MODE 2 "vector_shift_constant" ""))) + (clobber (match_scratch:VSHIFT_MODE 3 "=&v"))] + "((<MODE>mode == V2DImode && TARGET_P8_VECTOR) + || (<MODE>mode == V4SImode && TARGET_P9_VECTOR))" + "#" + "&& 1" + [(set (match_dup 3) + (unspec:VSHIFT_MODE [(match_dup 4)] UNSPEC_VECTOR_SHIFT)) + (set (match_dup 0) + (vshift_code:VSHIFT_MODE (match_dup 1) + (match_dup 3)))] +{ + if (GET_CODE (operands[3]) == SCRATCH) + operands[3] = gen_reg_rtx (<MODE>mode); + + operands[4] = GET_CODE (operands[2]) == CONST_VECTOR + ? CONST_VECTOR_ELT (operands[2], 0) + : XEXP (operands[2], 0); +}) + +(define_insn "*altivec_<mode>_shift_const" + [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v") + (unspec:VSHIFT_MODE [(match_operand 1 "const_int_operand" "n")] + UNSPEC_VECTOR_SHIFT))] + "TARGET_P8_VECTOR" +{ + if (UINTVAL (operands[1]) <= 15) + return "vspltisw %0,%1"; + else if (TARGET_P9_VECTOR) + return "xxspltib %x0,%1"; + else + gcc_unreachable (); +}) + (define_insn "altivec_vsum4ubs" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v") @@ -3698,7 +3749,7 @@ } }) -(define_expand "udot_prod<mode>" +(define_expand "udot_prodv4si<mode>" [(set (match_operand:V4SI 0 "register_operand" "=v") (plus:V4SI (match_operand:V4SI 3 "register_operand" "v") (unspec:V4SI [(match_operand:VIshort 1 "register_operand" "v") @@ -3710,7 +3761,7 @@ DONE; }) -(define_expand "sdot_prodv8hi" +(define_expand "sdot_prodv4siv8hi" [(set (match_operand:V4SI 0 "register_operand" "=v") (plus:V4SI (match_operand:V4SI 3 "register_operand" "v") (unspec:V4SI [(match_operand:V8HI 1 "register_operand" "v") diff --git a/gcc/config/rs6000/amo.h b/gcc/config/rs6000/amo.h index 6b9e4e0..1303c9d 100644 --- a/gcc/config/rs6000/amo.h +++ b/gcc/config/rs6000/amo.h @@ -46,7 +46,7 @@ enum _AMO_LD { _AMO_LD_CS_NE = 0x10, /* Compare and Swap Not Equal. */ _AMO_LD_INC_BOUNDED = 0x18, /* Fetch and Increment Bounded. */ _AMO_LD_INC_EQUAL = 0x19, /* Fetch and Increment Equal. */ - _AMO_LD_DEC_BOUNDED = 0x1A /* Fetch and Decrement Bounded. */ + _AMO_LD_DEC_BOUNDED = 0x1C /* Fetch and Decrement Bounded. */ }; /* Implementation of the simple LWAT/LDAT operations that take one register and diff --git a/gcc/config/rs6000/darwin.h b/gcc/config/rs6000/darwin.h index e8b1949..bd5a016 100644 --- a/gcc/config/rs6000/darwin.h +++ b/gcc/config/rs6000/darwin.h @@ -487,14 +487,14 @@ default, as kernel code doesn't save/restore those registers. */ #define OS_MISSING_ALTIVEC (flag_mkernel || flag_apple_kext) -/* Darwin has support for section anchors on powerpc*. +/* Darwin has support for section anchors on powerpc*. It is disabled for any section containing a "zero-sized item" (because these are re-written as size=1 to be compatible with the OSX ld64). The re-writing would interfere with the computation of anchor offsets. Therefore, we place zero-sized items in their own sections and make such sections unavailable to section anchoring. 
*/ -#undef TARGET_ASM_OUTPUT_ANCHOR +#undef TARGET_ASM_OUTPUT_ANCHOR #define TARGET_ASM_OUTPUT_ANCHOR darwin_asm_output_anchor #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P diff --git a/gcc/config/rs6000/driver-rs6000.cc b/gcc/config/rs6000/driver-rs6000.cc index f490072..a054827 100644 --- a/gcc/config/rs6000/driver-rs6000.cc +++ b/gcc/config/rs6000/driver-rs6000.cc @@ -19,6 +19,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -628,7 +629,7 @@ host_detect_local_cpu (int argc, const char **argv) arch = strcmp (argv[0], "cpu") == 0; if (!arch && strcmp (argv[0], "tune")) return NULL; - + if (arch) cpu = "powerpc"; diff --git a/gcc/config/rs6000/freebsd.h b/gcc/config/rs6000/freebsd.h index 0f42dc1..0c83e9e 100644 --- a/gcc/config/rs6000/freebsd.h +++ b/gcc/config/rs6000/freebsd.h @@ -50,7 +50,7 @@ /************************[ Target stuff ]***********************************/ -/* Define the actual types of some ANSI-mandated types. +/* Define the actual types of some ANSI-mandated types. Needs to agree with <machine/ansi.h>. GCC defaults come from c-decl.cc, c-common.cc, and config/<arch>/<arch>.h. */ diff --git a/gcc/config/rs6000/freebsd64.h b/gcc/config/rs6000/freebsd64.h index 6740170..627fd42 100644 --- a/gcc/config/rs6000/freebsd64.h +++ b/gcc/config/rs6000/freebsd64.h @@ -237,7 +237,7 @@ extern int dot_symbols; /************************[ Target stuff ]***********************************/ -/* Define the actual types of some ANSI-mandated types. +/* Define the actual types of some ANSI-mandated types. Needs to agree with <machine/ansi.h>. GCC defaults come from c-decl.cc, c-common.cc, and config/<arch>/<arch>.h. */ diff --git a/gcc/config/rs6000/host-darwin.cc b/gcc/config/rs6000/host-darwin.cc index e000177..fa9140d 100644 --- a/gcc/config/rs6000/host-darwin.cc +++ b/gcc/config/rs6000/host-darwin.cc @@ -19,6 +19,7 @@ #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -92,7 +93,7 @@ segv_handler (int sig ATTRIBUTE_UNUSED, || (faulting_insn & 0xFC1F8000) == 0xBC018000 /* stmw xxx, -yyy(%r1) */) { char *shell_name; - + fnotice (stderr, "Out of stack space.\n"); shell_name = getenv ("SHELL"); if (shell_name != NULL) @@ -109,23 +110,23 @@ segv_handler (int sig ATTRIBUTE_UNUSED, { "zsh", "limit stacksize 32m" } }; size_t i; - + for (i = 0; i < ARRAY_SIZE (shell_commands); i++) if (strcmp (shell_commands[i][0], shell_name + 1) == 0) { - fnotice (stderr, + fnotice (stderr, "Try running '%s' in the shell to raise its limit.\n", shell_commands[i][1]); } } - + if (global_dc->m_abort_on_error) fancy_abort (__FILE__, __LINE__, __FUNCTION__); exit (FATAL_EXIT_CODE); } - fprintf (stderr, "[address=%08lx pc=%08x]\n", + fprintf (stderr, "[address=%08lx pc=%08x]\n", uc->uc_mcontext->MC_FLD(es).MC_FLD(dar), uc->uc_mcontext->MC_FLD(ss).MC_FLD(srr0)); internal_error ("segmentation fault"); @@ -147,7 +148,7 @@ darwin_rs6000_extra_signals (void) sigemptyset(&sact.sa_mask); sact.sa_flags = SA_ONSTACK | SA_SIGINFO; sact.sa_sigaction = segv_handler; - if (sigaction (SIGSEGV, &sact, 0) < 0) + if (sigaction (SIGSEGV, &sact, 0) < 0) fatal_error (input_location, "While setting up signal handler: %m"); } diff --git a/gcc/config/rs6000/linux.h b/gcc/config/rs6000/linux.h index 5f6cede..d247f41 100644 --- a/gcc/config/rs6000/linux.h +++ b/gcc/config/rs6000/linux.h @@ -116,7 +116,7 @@ /* We are 32-bit all the time, so optimize a little. 
*/ #undef TARGET_64BIT #define TARGET_64BIT 0 - + /* We don't need to generate entries in .fixup, except when -mrelocatable or -mrelocatable-lib is given. */ #undef RELOCATABLE_NEEDS_FIXUP diff --git a/gcc/config/rs6000/mmintrin.h b/gcc/config/rs6000/mmintrin.h index c7988c1..68c06aa 100644 --- a/gcc/config/rs6000/mmintrin.h +++ b/gcc/config/rs6000/mmintrin.h @@ -743,14 +743,14 @@ _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2) __mu1.as_m64 = __m1; __mu2.as_m64 = __m2; - __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0])? -1: 0; - __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1])? -1: 0; - __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2])? -1: 0; - __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3])? -1: 0; - __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4])? -1: 0; - __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5])? -1: 0; - __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6])? -1: 0; - __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7])? -1: 0; + __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0; + __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0; + __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0; + __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0; + __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0; + __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0; + __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0; + __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0; return (__m64) __res.as_m64; #endif @@ -778,14 +778,14 @@ _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2) __mu1.as_m64 = __m1; __mu2.as_m64 = __m2; - __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0])? -1: 0; - __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1])? -1: 0; - __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2])? -1: 0; - __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3])? -1: 0; - __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4])? -1: 0; - __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5])? -1: 0; - __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6])? -1: 0; - __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7])? -1: 0; + __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0; + __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0; + __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0; + __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0; + __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0; + __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0; + __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0; + __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0; return (__m64) __res.as_m64; #endif @@ -815,10 +815,10 @@ _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2) __mu1.as_m64 = __m1; __mu2.as_m64 = __m2; - __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0])? -1: 0; - __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1])? -1: 0; - __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2])? -1: 0; - __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3])? -1: 0; + __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0; + __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0; + __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0; + __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? 
-1 : 0; return (__m64) __res.as_m64; #endif @@ -846,10 +846,10 @@ _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2) __mu1.as_m64 = __m1; __mu2.as_m64 = __m2; - __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0])? -1: 0; - __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1])? -1: 0; - __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2])? -1: 0; - __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3])? -1: 0; + __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0; + __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0; + __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0; + __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0; return (__m64) __res.as_m64; #endif @@ -879,8 +879,8 @@ _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) __mu1.as_m64 = __m1; __mu2.as_m64 = __m2; - __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0])? -1: 0; - __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1])? -1: 0; + __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0; + __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0; return (__m64) __res.as_m64; #endif @@ -908,8 +908,8 @@ _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2) __mu1.as_m64 = __m1; __mu2.as_m64 = __m2; - __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0])? -1: 0; - __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1])? -1: 0; + __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0; + __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0; return (__m64) __res.as_m64; #endif diff --git a/gcc/config/rs6000/ppu_intrinsics.h b/gcc/config/rs6000/ppu_intrinsics.h index 4d91c72..9ac1caf 100644 --- a/gcc/config/rs6000/ppu_intrinsics.h +++ b/gcc/config/rs6000/ppu_intrinsics.h @@ -34,7 +34,7 @@ #ifdef __cplusplus extern "C" { -#endif +#endif /* * unsigned int __cntlzw(unsigned int) @@ -113,7 +113,7 @@ extern "C" { * void __mtfsb1(int) * double __setflm(double) * - * dcbt intrinsics + * dcbt intrinsics * void __protected_unlimited_stream_set (unsigned int direction, const void *add, unsigned int ID) * void __protected_stream_set (unsigned int direction, const void *add, unsigned int ID) * void __protected_stream_stop_all (void) @@ -178,7 +178,7 @@ typedef int __V4SI __attribute__((vector_size(16))); #ifdef __powerpc64__ #define __mtspr(spr, value) \ __asm__ volatile ("mtspr %0,%1" : : "n" (spr), "r" (value)) - + #define __mfspr(spr) __extension__ \ ({ unsigned long long result; \ __asm__ volatile ("mfspr %0,%1" : "=r" (result) : "n" (spr)); \ @@ -211,7 +211,7 @@ typedef int __V4SI __attribute__((vector_size(16))); #define __dcbf(base) \ __asm__ volatile ("dcbf %y0" : "=Z" (*(__V4SI*) (base)) : : "memory") - + #define __dcbz(base) \ __asm__ volatile ("dcbz %y0" : "=Z" (*(__V4SI*) (base)) : : "memory") @@ -226,7 +226,7 @@ typedef int __V4SI __attribute__((vector_size(16))); #define __icbi(base) \ __asm__ volatile ("icbi %y0" : "=Z" (*(__V4SI*) (base)) : : "memory") - + #define __dcbt_TH1000(EATRUNC, D, UG, ID) \ __asm__ volatile ("dcbt %y0,8" \ : "=Z" (*(__V4SI*) (__SIZE_TYPE__)((((__SIZE_TYPE__) (EATRUNC)) & ~0x7F) \ @@ -390,7 +390,7 @@ typedef int __V4SI __attribute__((vector_size(16))); #define __mtfsf(mask,value) \ __asm__ volatile ("mtfsf %0,%1" : : "n" (mask), "d" ((double) (value))) - + #define __mtfsfi(bits,field) \ __asm__ volatile ("mtfsfi %0,%1" : : "n" (bits), "n" (field)) @@ -406,10 +406,10 @@ typedef int __V4SI __attribute__((vector_size(16))); /* __builtin_fabs may perform unnecessary rounding. 
*/ -/* Rename __fabs and __fabsf to work around internal prototypes defined - in bits/mathcalls.h with some glibc versions. */ -#define __fabs __ppu_fabs -#define __fabsf __ppu_fabsf +/* Rename __fabs and __fabsf to work around internal prototypes defined + in bits/mathcalls.h with some glibc versions. */ +#define __fabs __ppu_fabs +#define __fabsf __ppu_fabsf static __inline__ double __fabs(double x) __attribute__((always_inline)); static __inline__ double diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 7f0b4ab..0b78901 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -861,6 +861,69 @@ return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode); }) +;; Return 1 if the operand is a V2DI or V4SI const_vector, where each element +;; is the same constant, and the constant can be used for a shift operation. +;; This is to prevent sub-optimal code, that needs to load up the constant and +;; then zero extend it 32 or 64-bit vectors or load up the constant from the +;; literal pool. +;; +;; For V4SImode, we only recognize shifts by 16..31 on ISA 3.0, since shifts by +;; 1..15 can be handled by the normal VSPLTISW and vector shift instruction. +;; For V2DImode, we do this all of the time, since there is no convenient +;; instruction to load up a vector long long splatted constant. +;; +;; If we can use XXSPLTIB, then allow constants up to 63. If not, we restrict +;; the constant to 0..15 that can be loaded with VSPLTISW. V4SI shifts are +;; only optimized for ISA 3.0 when the shift value is >= 16 and <= 31. Values +;; between 0 and 15 can use a normal VSPLTISW to load the value, and it doesn't +;; need this optimization. +(define_predicate "vector_shift_constant" + (match_code "const_vector,vec_duplicate") +{ + unsigned HOST_WIDE_INT min_value; + + if (mode == V2DImode) + { + min_value = 0; + if (!TARGET_P8_VECTOR) + return 0; + } + else if (mode == V4SImode) + { + min_value = 16; + if (!TARGET_P9_VECTOR) + return 0; + } + else + return 0; + + unsigned HOST_WIDE_INT max_value = TARGET_P9_VECTOR ? 63 : 15; + + if (GET_CODE (op) == CONST_VECTOR) + { + unsigned HOST_WIDE_INT first = UINTVAL (CONST_VECTOR_ELT (op, 0)); + unsigned nunits = GET_MODE_NUNITS (mode); + unsigned i; + + if (!IN_RANGE (first, min_value, max_value)) + return 0; + + for (i = 1; i < nunits; i++) + if (first != UINTVAL (CONST_VECTOR_ELT (op, i))) + return 0; + + return 1; + } + else + { + rtx op0 = XEXP (op, 0); + if (!CONST_INT_P (op0)) + return 0; + + return IN_RANGE (UINTVAL (op0), min_value, max_value); + } +}) + ;; Return 1 if operand is 0.0. (define_predicate "zero_fp_constant" (and (match_code "const_double") diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc index 04882c3..4dc80e5 100644 --- a/gcc/config/rs6000/rs6000-c.cc +++ b/gcc/config/rs6000/rs6000-c.cc @@ -22,6 +22,7 @@ #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -808,7 +809,7 @@ is_float128_p (tree t) && TARGET_LONG_DOUBLE_128 && t == long_double_type_node)); } - + /* Return true iff ARGTYPE can be compatibly passed as PARMTYPE. 
*/ static bool diff --git a/gcc/config/rs6000/rs6000-call.cc b/gcc/config/rs6000/rs6000-call.cc index a039ff7..549fa57 100644 --- a/gcc/config/rs6000/rs6000-call.cc +++ b/gcc/config/rs6000/rs6000-call.cc @@ -407,15 +407,15 @@ rs6000_discover_homogeneous_aggregate (machine_mode mode, const_tree type, The AIX ABI for the RS/6000 specifies that all structures are returned in memory. The Darwin ABI does the same. - + For the Darwin 64 Bit ABI, a function result can be returned in registers or in memory, depending on the size of the return data type. If it is returned in registers, the value occupies the same registers as it would if it were the first and only function argument. Otherwise, the function places its result in memory at the location pointed to by GPR3. - - The SVR4 ABI specifies that structures <= 8 bytes are returned in r3/r4, + + The SVR4 ABI specifies that structures <= 8 bytes are returned in r3/r4, but a draft put them in memory, and GCC used to implement the draft instead of the final standard. Therefore, aix_struct_return controls this instead of DEFAULT_ABI; V.4 targets needing backward @@ -1045,10 +1045,10 @@ int rs6000_darwin64_struct_check_p (machine_mode mode, const_tree type) { return rs6000_darwin64_abi - && ((mode == BLKmode - && TREE_CODE (type) == RECORD_TYPE + && ((mode == BLKmode + && TREE_CODE (type) == RECORD_TYPE && int_size_in_bytes (type) > 0) - || (type && TREE_CODE (type) == RECORD_TYPE + || (type && TREE_CODE (type) == RECORD_TYPE && int_size_in_bytes (type) == 8)) ? 1 : 0; } @@ -1178,7 +1178,7 @@ rs6000_function_arg_advance_1 (CUMULATIVE_ARGS *cum, machine_mode mode, { fprintf (stderr, "function_adv: words = %2d, align=%d, size=%d", cum->words, TYPE_ALIGN (type), size); - fprintf (stderr, + fprintf (stderr, "nargs = %4d, proto = %d, mode = %4s (darwin64 abi)\n", cum->nargs_prototype, cum->prototype, GET_MODE_NAME (mode)); @@ -2568,9 +2568,9 @@ rs6000_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, /* We need to deal with the fact that the darwin ppc64 ABI is defined by an earlier version of gcc, with the property that it always applied alignment adjustments to the va-args (even for zero-sized types). The cheapest way - to deal with this is to replicate the effect of the part of - std_gimplify_va_arg_expr that carries out the align adjust, for the case - of relevance. + to deal with this is to replicate the effect of the part of + std_gimplify_va_arg_expr that carries out the align adjust, for the case + of relevance. We don't need to check for pass-by-reference because of the test above. We can return a simplifed answer, since we know there's no offset to add. 
*/ diff --git a/gcc/config/rs6000/rs6000-internal.h b/gcc/config/rs6000/rs6000-internal.h index 3a6cc31..0f6e1fd 100644 --- a/gcc/config/rs6000/rs6000-internal.h +++ b/gcc/config/rs6000/rs6000-internal.h @@ -149,7 +149,7 @@ extern machine_mode rs6000_promote_function_mode (const_tree type ATTRIBUTE_UNUS machine_mode mode, int *punsignedp ATTRIBUTE_UNUSED, const_tree, int); -extern bool rs6000_return_in_memory (const_tree type, +extern bool rs6000_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED); extern bool rs6000_return_in_msb (const_tree valtype); extern bool rs6000_pass_by_reference (cumulative_args_t, diff --git a/gcc/config/rs6000/rs6000-logue.cc b/gcc/config/rs6000/rs6000-logue.cc index fdb6414..c87058b 100644 --- a/gcc/config/rs6000/rs6000-logue.cc +++ b/gcc/config/rs6000/rs6000-logue.cc @@ -1376,7 +1376,7 @@ rs6000_emit_eh_reg_restore (rtx source, rtx scratch) /* Freeze lr_save_p. We've just emitted rtl that depends on the state of lr_save_p so any change from here on would be a bug. In particular, stop rs6000_ra_ever_killed from considering the SET - of lr we may have added just above. */ + of lr we may have added just above. */ cfun->machine->lr_save_state = info->lr_save_p + 1; } @@ -1462,7 +1462,7 @@ rs6000_emit_stack_tie (rtx fp, bool hard_frame_needed) /* Allocate SIZE_INT bytes on the stack using a store with update style insn and set the appropriate attributes for the generated insn. Return the first insn which adjusts the stack pointer or the last insn before - the stack adjustment loop. + the stack adjustment loop. SIZE_INT is used to create the CFI note for the allocation. @@ -1487,7 +1487,7 @@ rs6000_emit_allocate_stack_1 (HOST_WIDE_INT size_int, rtx orig_sp) try_split (PATTERN (insn), insn, 0); size_rtx = tmp_reg; } - + if (TARGET_32BIT) insn = emit_insn (gen_movsi_update_stack (stack_pointer_rtx, stack_pointer_rtx, @@ -4689,7 +4689,7 @@ rs6000_emit_epilogue (enum epilogue_type epilogue_type) if (newptr_regno != 1 && REGNO (frame_reg_rtx) != newptr_regno) frame_reg_rtx = gen_rtx_REG (Pmode, newptr_regno); - + if (end_save + ptr_off != 0) { rtx offset = GEN_INT (end_save + ptr_off); diff --git a/gcc/config/rs6000/rs6000-overload.def b/gcc/config/rs6000/rs6000-overload.def index 87495ad..7d9e31c 100644 --- a/gcc/config/rs6000/rs6000-overload.def +++ b/gcc/config/rs6000/rs6000-overload.def @@ -4403,12 +4403,20 @@ XXEVAL XXEVAL_VUQ [VEC_TEST_LSBB_ALL_ONES, vec_test_lsbb_all_ones, __builtin_vec_xvtlsbb_all_ones] + signed int __builtin_vec_xvtlsbb_all_ones (vsc); + XVTLSBB_ONES LSBB_ALL_ONES_VSC signed int __builtin_vec_xvtlsbb_all_ones (vuc); - XVTLSBB_ONES + XVTLSBB_ONES LSBB_ALL_ONES_VUC + signed int __builtin_vec_xvtlsbb_all_ones (vbc); + XVTLSBB_ONES LSBB_ALL_ONES_VBC [VEC_TEST_LSBB_ALL_ZEROS, vec_test_lsbb_all_zeros, __builtin_vec_xvtlsbb_all_zeros] + signed int __builtin_vec_xvtlsbb_all_zeros (vsc); + XVTLSBB_ZEROS LSBB_ALL_ZEROS_VSC signed int __builtin_vec_xvtlsbb_all_zeros (vuc); - XVTLSBB_ZEROS + XVTLSBB_ZEROS LSBB_ALL_ZEROS_VUC + signed int __builtin_vec_xvtlsbb_all_zeros (vbc); + XVTLSBB_ZEROS LSBB_ALL_ZEROS_VBC [VEC_TRUNC, vec_trunc, __builtin_vec_trunc] vf __builtin_vec_trunc (vf); diff --git a/gcc/config/rs6000/rs6000-p8swap.cc b/gcc/config/rs6000/rs6000-p8swap.cc index 05fb760..614cecc 100644 --- a/gcc/config/rs6000/rs6000-p8swap.cc +++ b/gcc/config/rs6000/rs6000-p8swap.cc @@ -133,7 +133,7 @@ already in a register. 
In some cases, this mask may be a constant that we can discover with ud-chains, in which case the above transformation is ok. However, the common usage here is for the - mask to be produced by an UNSPEC_LVSL, in which case the mask + mask to be produced by an UNSPEC_LVSL, in which case the mask cannot be known at compile time. In such a case we would have to generate several instructions to compute M' as above at run time, and a cost model is needed again. @@ -634,7 +634,7 @@ v2df_reduction_p (rtx op) { if (GET_MODE (op) != V2DFmode) return false; - + enum rtx_code code = GET_CODE (op); if (code != PLUS && code != SMIN && code != SMAX) return false; @@ -913,7 +913,7 @@ insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn, return 0; if (GET_CODE (XEXP (lhs, 0)) == AND) return 0; - + *special = SH_NOSWAP_ST; return 1; } @@ -1355,7 +1355,7 @@ adjust_vperm (rtx_insn *insn) break; } gcc_assert (swap_insn); - + /* Find the load. */ insn_info = DF_INSN_INFO_GET (swap_insn); rtx_insn *load_insn = 0; @@ -2094,7 +2094,7 @@ alignment_with_canonical_addr (rtx align) return gen_rtx_AND (GET_MODE (align), canon, GEN_INT (-16)); } -/* Check whether an rtx is an alignment mask, and if so, return +/* Check whether an rtx is an alignment mask, and if so, return a fully-expanded rtx for the masking operation. */ static rtx alignment_mask (rtx_insn *insn) @@ -2397,7 +2397,7 @@ recombine_lvx_stvx_patterns (function *fun) remove_insn (to_delete[i].replace_insn); to_delete[i].replace_insn->set_deleted (); } - + free (to_delete); } diff --git a/gcc/config/rs6000/rs6000-string.cc b/gcc/config/rs6000/rs6000-string.cc index 55b4133..de618da 100644 --- a/gcc/config/rs6000/rs6000-string.cc +++ b/gcc/config/rs6000/rs6000-string.cc @@ -1337,7 +1337,7 @@ expand_compare_loop (rtx operands[]) { /* If remainder length < word length, branch to final cleanup compare. */ - + if (!bytes_is_const) { do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size), @@ -2695,7 +2695,7 @@ gen_lvx_v4si_move (rtx dest, rtx src) if (MEM_P (dest)) return gen_altivec_stvx_v4si_internal (dest, src); - else + else return gen_altivec_lvx_v4si_internal (dest, src); } @@ -2918,7 +2918,7 @@ expand_block_move (rtx operands[], bool might_overlap) emit_insn (stores[i]); num_reg = 0; } - + } return 1; diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 94c0db4..950fd94 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -22,6 +22,7 @@ #define IN_TARGET_CODE 1 #include "config.h" +#define INCLUDE_MEMORY #include "system.h" #include "coretypes.h" #include "backend.h" @@ -4317,10 +4318,10 @@ rs6000_option_override_internal (bool global_init_p) } } - /* Set the Darwin64 ABI as default for 64-bit Darwin. + /* Set the Darwin64 ABI as default for 64-bit Darwin. So far, the only darwin64 targets are also MACH-O. */ if (TARGET_MACHO - && DEFAULT_ABI == ABI_DARWIN + && DEFAULT_ABI == ABI_DARWIN && TARGET_64BIT) { if (main_target_opt != NULL && !main_target_opt->x_rs6000_darwin64_abi) @@ -4945,7 +4946,7 @@ rs6000_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_pac } /* Return true if the vector misalignment factor is supported by the - target. */ + target. */ static bool rs6000_builtin_support_vector_misalignment (machine_mode mode, const_tree type, @@ -8073,7 +8074,7 @@ rs6000_split_vec_extract_var (rtx dest, rtx src, rtx element, rtx tmp_gpr, /* Return alignment of TYPE. Existing alignment is ALIGN. 
HOW selects whether the alignment is abi mandated, optional, or both abi and optional alignment. */ - + unsigned int rs6000_data_alignment (tree type, unsigned int align, enum data_align how) { @@ -8698,7 +8699,7 @@ virtual_stack_registers_memory_p (rtx op) to determine whether -mcmodel=medium code can use TOC pointer relative addressing for OP. This means the alignment of the TOC pointer must also be taken into account, and unfortunately that is - only 8 bytes. */ + only 8 bytes. */ #ifndef POWERPC64_TOC_POINTER_ALIGNMENT #define POWERPC64_TOC_POINTER_ALIGNMENT 8 @@ -8846,8 +8847,8 @@ static const_rtx tocrel_base_oac, tocrel_offset_oac; /* Return true if OP is a toc pointer relative address (the output of create_TOC_reference). If STRICT, do not match non-split - -mcmodel=large/medium toc pointer relative addresses. If the pointers - are non-NULL, place base and offset pieces in TOCREL_BASE_RET and + -mcmodel=large/medium toc pointer relative addresses. If the pointers + are non-NULL, place base and offset pieces in TOCREL_BASE_RET and TOCREL_OFFSET_RET respectively. */ bool @@ -9574,7 +9575,7 @@ rs6000_legitimize_tls_address_aix (rtx addr, enum tls_model model) tocref = create_TOC_reference (modaddr, NULL_RTX); rtx modmem = gen_const_mem (Pmode, tocref); set_mem_alias_set (modmem, get_TOC_alias_set ()); - + rtx modreg = gen_reg_rtx (Pmode); emit_insn (gen_rtx_SET (modreg, modmem)); @@ -10138,13 +10139,13 @@ rs6000_offsettable_memref_p (rtx op, machine_mode reg_mode, bool strict) This takes into account how many parallel operations we can actually do of a given type, and also the latency. P8: - int add/sub 6/cycle + int add/sub 6/cycle mul 2/cycle vect add/sub/mul 2/cycle fp add/sub/mul 2/cycle dfp 1/cycle */ - + static int rs6000_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED, machine_mode mode) @@ -10159,7 +10160,7 @@ rs6000_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED, return 1; if (VECTOR_MODE_P (mode)) return 4; - if (INTEGRAL_MODE_P (mode)) + if (INTEGRAL_MODE_P (mode)) return 1; if (FLOAT_MODE_P (mode)) return 4; @@ -14480,7 +14481,7 @@ print_operand (FILE *file, rtx x, int code) ? reg - 32 : reg - FIRST_ALTIVEC_REGNO + 32); -#ifdef TARGET_REGNAMES +#ifdef TARGET_REGNAMES if (TARGET_REGNAMES) fprintf (file, "%%vs%d", vsx_reg); else @@ -17329,9 +17330,9 @@ static char * rs6000_offload_options (void) { if (TARGET_64BIT) - return xstrdup ("-foffload-abi=lp64"); + return xstrdup ("-foffload-abi=lp64 -foffload-abi-host-opts=-m64"); else - return xstrdup ("-foffload-abi=ilp32"); + return xstrdup ("-foffload-abi=ilp32 -foffload-abi-host-opts=-m32"); } @@ -21226,7 +21227,7 @@ rs6000_darwin_file_start (void) darwin_file_start (); /* Determine the argument to -mcpu=. Default to G3 if not specified. */ - + if (rs6000_default_cpu != 0 && rs6000_default_cpu[0] != '\0') cpu_id = rs6000_default_cpu; @@ -22509,7 +22510,7 @@ rs6000_rtx_costs (rtx x, machine_mode mode, int outer_code, return false; } /* fall through */ - + case ASHIFTRT: case LSHIFTRT: case ROTATE: @@ -23082,7 +23083,7 @@ rs6000_emit_swdiv (rtx dst, rtx n, rtx d, bool note_p) for (i = 0, xprev = x1, eprev = e0; i < passes - 2; ++i, xprev = xnext, eprev = enext) { - + /* enext = eprev * eprev */ enext = gen_reg_rtx (mode); emit_insn (gen_mul (enext, eprev, eprev)); @@ -23349,7 +23350,7 @@ rs6000_emit_parity (rtx dst, rtx src) vperm 9,10,11,12 - places the desired result in vr9. However, in LE mode the + places the desired result in vr9. 
However, in LE mode the vector contents will be vr10 = 00000003 00000002 00000001 00000000 @@ -23566,7 +23567,7 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1, one_vec = true; break; } - + /* Look for splat patterns. */ if (one_vec) { @@ -23968,7 +23969,7 @@ rs6000_function_value (const_tree valtype, int n_elts; /* Special handling for structs in darwin64. */ - if (TARGET_MACHO + if (TARGET_MACHO && rs6000_darwin64_struct_check_p (TYPE_MODE (valtype), valtype)) { CUMULATIVE_ARGS valcum; @@ -24825,7 +24826,7 @@ rs6000_valid_attribute_p (tree fndecl, IDENTIFIER_POINTER (tname)); else fprintf (stderr, "function: unknown\n"); - + fprintf (stderr, "args:"); rs6000_debug_target_options (args, " "); fprintf (stderr, "\n"); @@ -25094,7 +25095,7 @@ static void rs6000_function_specific_restore (struct gcc_options *opts, struct gcc_options */* opts_set */, struct cl_target_option *ptr) - + { opts->x_rs6000_isa_flags = ptr->x_rs6000_isa_flags; opts->x_rs6000_isa_flags_explicit = ptr->x_rs6000_isa_flags_explicit; @@ -26748,7 +26749,7 @@ is_lfs_stfs_insn (rtx_insn *insn) rtx set = XVECEXP (pattern, 0, 0); if (GET_CODE (set) != SET) return false; - + rtx clobber = XVECEXP (pattern, 0, 1); if (GET_CODE (clobber) != CLOBBER) return false; diff --git a/gcc/config/rs6000/si2vmx.h b/gcc/config/rs6000/si2vmx.h index d0a1a28..fb03bdc 100644 --- a/gcc/config/rs6000/si2vmx.h +++ b/gcc/config/rs6000/si2vmx.h @@ -3,7 +3,7 @@ This file is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your option) + Software Foundation; either version 3 of the License, or (at your option) any later version. This file is distributed in the hope that it will be useful, but WITHOUT @@ -30,7 +30,7 @@ /* Specify a default halt action for spu_hcmpeq and spu_hcmpgt intrinsics. - * Users can override the action by defining it prior to including this + * Users can override the action by defining it prior to including this * header file. */ #ifndef SPU_HALT_ACTION @@ -38,7 +38,7 @@ #endif /* Specify a default stop action for the spu_stop intrinsic. - * Users can override the action by defining it prior to including this + * Users can override the action by defining it prior to including this * header file. */ #ifndef SPU_STOP_ACTION @@ -47,7 +47,7 @@ /* Specify a default action for unsupported intrinsic. - * Users can override the action by defining it prior to including this + * Users can override the action by defining it prior to including this * header file. 
*/ #ifndef SPU_UNSUPPORTED_ACTION @@ -55,7 +55,7 @@ #endif -/* Casting intrinsics - from scalar to quadword +/* Casting intrinsics - from scalar to quadword */ static __inline qword si_from_uchar(unsigned char c) { @@ -274,7 +274,7 @@ static __inline qword si_absdb(qword a, qword b) return ((qword)(dc)); } -/* Add intrinsics +/* Add intrinsics */ #define si_a(_a, _b) ((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b)))) @@ -282,14 +282,14 @@ static __inline qword si_absdb(qword a, qword b) static __inline qword si_ai(qword a, int b) { - return ((qword)(vec_add((vec_int4)(a), + return ((qword)(vec_add((vec_int4)(a), vec_splat((vec_int4)(si_from_int(b)), 0)))); } static __inline qword si_ahi(qword a, short b) { - return ((qword)(vec_add((vec_short8)(a), + return ((qword)(vec_add((vec_short8)(a), vec_splat((vec_short8)(si_from_short(b)), 1)))); } @@ -325,13 +325,13 @@ static __inline qword si_dfa(qword a, qword b) static __inline qword si_andbi(qword a, signed char b) { - return ((qword)(vec_and((vec_char16)(a), + return ((qword)(vec_and((vec_char16)(a), vec_splat((vec_char16)(si_from_char(b)), 3)))); } static __inline qword si_andhi(qword a, signed short b) { - return ((qword)(vec_and((vec_short8)(a), + return ((qword)(vec_and((vec_short8)(a), vec_splat((vec_short8)(si_from_short(b)), 1)))); } @@ -373,8 +373,8 @@ static __inline qword si_andi(qword a, signed int b) static __inline qword si_fcmeq(qword a, qword b) { vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000}); - - return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb), + + return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb), vec_andc((vec_float4)(b), msb)))); } @@ -408,11 +408,11 @@ static __inline qword si_dfcmeq(qword a, qword b) biteq = (vec_uint4) vec_cmpeq((vec_uint4)aabs,(vec_uint4)babs); biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v)); - /* + /* B) Check if a is NaN, store in high word - + B1) If the high word is greater than max_exp (indicates a NaN) - B2) If the low word is greater than 0 + B2) If the low word is greater than 0 */ a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask); @@ -435,7 +435,7 @@ static __inline qword si_dfcmeq(qword a, qword b) static __inline qword si_fcmgt(qword a, qword b) { vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000}); - + return ((qword)(vec_cmpgt(vec_andc((vec_float4)(a), msb), vec_andc((vec_float4)(b), msb)))); } @@ -454,7 +454,7 @@ static __inline qword si_dfcmgt(qword a, qword b) /* Shift 4 bytes */ x.i[3] = 4 << 3; - // absolute value of a,b + // absolute value of a,b vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask); vec_uint4 babs = vec_and((vec_uint4)b, sign_mask); @@ -470,7 +470,7 @@ static __inline qword si_dfcmgt(qword a, qword b) b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf)); b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi); - // A) Check if the exponents are different + // A) Check if the exponents are different vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aabs,babs); // B) Check if high word equal, and low word greater @@ -478,7 +478,7 @@ static __inline qword si_dfcmgt(qword a, qword b) vec_uint4 eq = (vec_uint4)vec_cmpeq(aabs, babs); vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v)); - // If either A or B is true, return true (unless NaNs detected) + // If either A or B is true, return true (unless NaNs detected) vec_uint4 r = vec_or(gt_hi, eqgt); // splat the high words of the comparison step @@ -513,19 +513,19 @@ static 
__inline qword si_fceq(qword a, qword b) static __inline qword si_ceqbi(qword a, signed char b) { - return ((qword)(vec_cmpeq((vec_char16)(a), + return ((qword)(vec_cmpeq((vec_char16)(a), vec_splat((vec_char16)(si_from_char(b)), 3)))); } static __inline qword si_ceqhi(qword a, signed short b) { - return ((qword)(vec_cmpeq((vec_short8)(a), + return ((qword)(vec_cmpeq((vec_short8)(a), vec_splat((vec_short8)(si_from_short(b)), 1)))); } static __inline qword si_ceqi(qword a, signed int b) { - return ((qword)(vec_cmpeq((vec_int4)(a), + return ((qword)(vec_cmpeq((vec_int4)(a), vec_splat((vec_int4)(si_from_int(b)), 0)))); } @@ -560,11 +560,11 @@ static __inline qword si_dfceq(qword a, qword b) aabs = vec_and((vec_uint4)a,sign_mask); babs = vec_and((vec_uint4)b,sign_mask); - /* + /* B) Check if a is NaN, store in high word - + B1) If the high word is greater than max_exp (indicates a NaN) - B2) If the low word is greater than 0 + B2) If the low word is greater than 0 */ a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask); @@ -583,7 +583,7 @@ static __inline qword si_dfceq(qword a, qword b) result = vec_andc(result, anan); /* Promote high words to 64 bits and return */ - return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote))); + return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote))); } @@ -639,7 +639,7 @@ static __inline qword si_dfcgt(qword a, qword b) /* Shift 4 bytes */ x.i[3] = 4 << 3; - // absolute value of a,b + // absolute value of a,b vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask); vec_uint4 babs = vec_and((vec_uint4)b, sign_mask); @@ -680,7 +680,7 @@ static __inline qword si_dfcgt(qword a, qword b) // pick the one we want vec_int4 bval=(vec_int4)vec_sel((vec_uchar16)babs, (vec_uchar16)bneg, (vec_uchar16)bsel); - // A) Check if the exponents are different + // A) Check if the exponents are different vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aval,bval); // B) Check if high word equal, and low word greater @@ -688,7 +688,7 @@ static __inline qword si_dfcgt(qword a, qword b) vec_uint4 eq = (vec_uint4)vec_cmpeq(aval, bval); vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v)); - // If either A or B is true, return true (unless NaNs detected) + // If either A or B is true, return true (unless NaNs detected) vec_uint4 r = vec_or(gt_hi, eqgt); // splat the high words of the comparison step @@ -700,25 +700,25 @@ static __inline qword si_dfcgt(qword a, qword b) static __inline qword si_cgtbi(qword a, signed char b) { - return ((qword)(vec_cmpgt((vec_char16)(a), + return ((qword)(vec_cmpgt((vec_char16)(a), vec_splat((vec_char16)(si_from_char(b)), 3)))); } static __inline qword si_cgthi(qword a, signed short b) { - return ((qword)(vec_cmpgt((vec_short8)(a), + return ((qword)(vec_cmpgt((vec_short8)(a), vec_splat((vec_short8)(si_from_short(b)), 1)))); } static __inline qword si_cgti(qword a, signed int b) { - return ((qword)(vec_cmpgt((vec_int4)(a), + return ((qword)(vec_cmpgt((vec_int4)(a), vec_splat((vec_int4)(si_from_int(b)), 0)))); } static __inline qword si_clgtbi(qword a, unsigned char b) { - return ((qword)(vec_cmpgt((vec_uchar16)(a), + return ((qword)(vec_cmpgt((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_uchar(b)), 3)))); } @@ -730,7 +730,7 @@ static __inline qword si_clgthi(qword a, unsigned short b) static __inline qword si_clgti(qword a, unsigned int b) { - return ((qword)(vec_cmpgt((vec_uint4)(a), + return ((qword)(vec_cmpgt((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint(b)), 0)))); } @@ -742,7 +742,7 @@ static __inline qword si_dftsv(qword 
a, char b) vec_uint4 sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0)); sign = (vec_uint4)vec_perm((vec_uchar16)sign,(vec_uchar16)sign,splat_hi); vec_uint4 aabs = vec_and((vec_uint4)a,sign_mask); - + union { vec_uchar16 v; int i[4]; @@ -750,7 +750,7 @@ static __inline qword si_dftsv(qword a, char b) /* Shift 4 bytes */ x.i[3] = 4 << 3; - + /* Nan or +inf or -inf */ if (b & 0x70) { @@ -761,21 +761,21 @@ static __inline qword si_dftsv(qword a, char b) { vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask); a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf)); - a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi); + a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi); result = vec_or(result, a_nan); } - /* inf */ + /* inf */ if (b & 0x30) { a_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)a_inf,x.v), a_inf); - a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi); + a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi); /* +inf */ if (b & 0x20) result = vec_or(vec_andc(a_inf, sign), result); /* -inf */ if (b & 0x10) result = vec_or(vec_and(a_inf, sign), result); - } + } } /* 0 or denorm */ if (b & 0xF) @@ -860,7 +860,7 @@ static __inline qword si_clz(qword a) cnt = vec_add(cnt, vec_and(tmp1, vec_cmpeq(cnt, eight))); cnt = vec_add(cnt, vec_and(tmp2, vec_cmpeq(cnt, sixteen))); cnt = vec_add(cnt, vec_and(tmp3, vec_cmpeq(cnt, twentyfour))); - + return (qword)((vec_sr((vec_uint4)(cnt), (vec_uint4)(twentyfour)))); } @@ -901,7 +901,7 @@ static __inline qword si_xsbh(qword a) vec_char16 av; av = (vec_char16)(a); - return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15, + return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15, 0, 0, 0, 0, 0, 0, 0, 0}))))); } @@ -910,9 +910,9 @@ static __inline qword si_xshw(qword a) vec_short8 av; av = (vec_short8)(a); - return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7, + return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7, 10,11,14,15, - 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}))))); } @@ -921,10 +921,10 @@ static __inline qword si_xswd(qword a) vec_int4 av; av = (vec_int4)(a); - return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})), - ((vec_uchar16){20, 21, 22, 23, - 4, 5, 6, 7, - 28, 29, 30, 31, + return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})), + ((vec_uchar16){20, 21, 22, 23, + 4, 5, 6, 7, + 28, 29, 30, 31, 12, 13, 14, 15})))); } @@ -984,7 +984,7 @@ static __inline qword si_gb(qword a) } -/* Compare and halt +/* Compare and halt */ static __inline void si_heq(qword a, qword b) { @@ -1066,8 +1066,8 @@ static __inline void si_hlgti(qword a, unsigned int b) */ static __inline qword si_mpya(qword a, qword b, qword c) { - return ((qword)(vec_msum(vec_and((vec_short8)(a), - ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})), + return ((qword)(vec_msum(vec_and((vec_short8)(a), + ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})), (vec_short8)(b), (vec_int4)(c)))); } @@ -1116,7 +1116,7 @@ static __inline qword si_fsmh(qword a) in = (vec_uchar16)(a); mask = (vec_short8)(vec_splat(in, 3)); - return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})), + return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})), vec_splat_u16(15)))); } @@ -1155,7 +1155,7 @@ static __inline qword si_mpyhhau(qword a, qword b, qword c) */ static __inline qword 
si_fms(qword a, qword b, qword c) { - return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), + return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), vec_sub(((vec_float4){0.0f}), (vec_float4)(c))))); } @@ -1231,13 +1231,13 @@ static __inline qword si_mpyu(qword a, qword b) static __inline qword si_mpyi(qword a, short b) { - return ((qword)(vec_mulo((vec_short8)(a), + return ((qword)(vec_mulo((vec_short8)(a), vec_splat((vec_short8)(si_from_short(b)), 1)))); } static __inline qword si_mpyui(qword a, unsigned short b) { - return ((qword)(vec_mulo((vec_ushort8)(a), + return ((qword)(vec_mulo((vec_ushort8)(a), vec_splat((vec_ushort8)(si_from_ushort(b)), 1)))); } @@ -1313,19 +1313,19 @@ static __inline qword si_or(qword a, qword b) static __inline qword si_orbi(qword a, unsigned char b) { - return ((qword)(vec_or((vec_uchar16)(a), + return ((qword)(vec_or((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_uchar(b)), 3)))); } static __inline qword si_orhi(qword a, unsigned short b) { - return ((qword)(vec_or((vec_ushort8)(a), + return ((qword)(vec_or((vec_ushort8)(a), vec_splat((vec_ushort8)(si_from_ushort(b)), 1)))); } static __inline qword si_ori(qword a, unsigned int b) { - return ((qword)(vec_or((vec_uint4)(a), + return ((qword)(vec_or((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint(b)), 0)))); } @@ -1384,13 +1384,13 @@ static __inline qword si_rot(qword a, qword b) static __inline qword si_rothi(qword a, int b) { - return ((qword)(vec_rl((vec_ushort8)(a), + return ((qword)(vec_rl((vec_ushort8)(a), vec_splat((vec_ushort8)(si_from_int(b)), 1)))); } static __inline qword si_roti(qword a, int b) { - return ((qword)(vec_rl((vec_uint4)(a), + return ((qword)(vec_rl((vec_uint4)(a), vec_splat((vec_uint4)(si_from_int(b)), 0)))); } @@ -1526,7 +1526,7 @@ static __inline qword si_rotqbyi(qword a, int count) vec_uchar16 v; int i[4]; } left, right; - + count <<= 3; left.i[3] = count; right.i[3] = 0 - count; @@ -1536,7 +1536,7 @@ static __inline qword si_rotqbyi(qword a, int count) static __inline qword si_rotqby(qword a, qword count) { vec_uchar16 left, right; - + left = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3)); right = vec_sub(vec_splat_u8(0), left); return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right)))); @@ -1560,7 +1560,7 @@ static __inline qword si_rotqbii(qword a, int count) { vec_uchar16 x, y; vec_uchar16 result; - + x = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3); y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))), (vec_uint4)vec_sub(vec_splat_u8(8), x))); @@ -1572,11 +1572,11 @@ static __inline qword si_rotqbi(qword a, qword count) { vec_uchar16 x, y; vec_uchar16 result; - + x = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7)); y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))), (vec_uint4)vec_sub(vec_splat_u8(8), x))); - + result = vec_or(vec_sll((qword)(a), x), y); return ((qword)(result)); } @@ -1652,10 +1652,10 @@ static __inline qword si_shufb(qword a, qword b, qword pattern) { vec_uchar16 pat; - pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), + pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr((vec_uchar16)(pattern), vec_splat_u8(3)), vec_sra((vec_uchar16)(pattern), vec_splat_u8(7))); - return ((qword)(vec_perm(vec_perm(a, b, pattern), + return ((qword)(vec_perm(vec_perm(a, b, pattern), ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 
0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}), pat))); @@ -1831,7 +1831,7 @@ static __inline qword si_sumb(qword a, qword b) { vec_uint4 zero = (vec_uint4){0}; vec_ushort8 sum_a, sum_b; - + sum_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), zero); sum_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), zero); @@ -1848,19 +1848,19 @@ static __inline qword si_xor(qword a, qword b) static __inline qword si_xorbi(qword a, unsigned char b) { - return ((qword)(vec_xor((vec_uchar16)(a), + return ((qword)(vec_xor((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_uchar(b)), 3)))); } static __inline qword si_xorhi(qword a, unsigned short b) { - return ((qword)(vec_xor((vec_ushort8)(a), + return ((qword)(vec_xor((vec_ushort8)(a), vec_splat((vec_ushort8)(si_from_ushort(b)), 1)))); } static __inline qword si_xori(qword a, unsigned int b) { - return ((qword)(vec_xor((vec_uint4)(a), + return ((qword)(vec_xor((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint(b)), 0)))); } @@ -2038,7 +2038,7 @@ static __inline void si_stqr(qword a, unsigned int imm) static __inline void si_stqx(qword a, qword b, qword c) { - vec_st((vec_uchar16)(a), + vec_st((vec_uchar16)(a), si_to_uint((qword)(vec_add((vec_uint4)(b), (vec_uint4)(c)))), (vector unsigned char *)(0)); } diff --git a/gcc/config/rs6000/spu2vmx.h b/gcc/config/rs6000/spu2vmx.h index b2573a9..7550dd9 100644 --- a/gcc/config/rs6000/spu2vmx.h +++ b/gcc/config/rs6000/spu2vmx.h @@ -3,7 +3,7 @@ This file is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your option) + Software Foundation; either version 3 of the License, or (at your option) any later version. This file is distributed in the hope that it will be useful, but WITHOUT @@ -202,7 +202,7 @@ static __inline vec_int4 spu_and(vec_int4 a, signed int b) * ======= */ #define spu_avg(_a, _b) vec_avg(_a, _b) - + /* spu_bisled * spu_bisled_d @@ -1070,12 +1070,12 @@ static __inline vec_float4 spu_nand(vec_float4 a, vec_float4 b) static __inline vec_ullong2 spu_nand(vec_ullong2 a, vec_ullong2 b) { - return ((vec_ullong2)(si_nand((qword)(a), (qword)(b)))); + return ((vec_ullong2)(si_nand((qword)(a), (qword)(b)))); } static __inline vec_llong2 spu_nand(vec_llong2 a, vec_llong2 b) { - return ((vec_llong2)(si_nand((qword)(a), (qword)(b)))); + return ((vec_llong2)(si_nand((qword)(a), (qword)(b)))); } static __inline vec_double2 spu_nand(vec_double2 a, vec_double2 b) @@ -1653,7 +1653,7 @@ static __inline vec_double2 spu_rlmaskqwbytebc(vec_double2 a, int count) static __inline vec_uchar16 spu_rlqwbyte(vec_uchar16 a, int count) { return ((vec_uchar16)(si_rotqby((qword)(a), si_from_int(count)))); -} +} static __inline vec_char16 spu_rlqwbyte(vec_char16 a, int count) { @@ -1663,7 +1663,7 @@ static __inline vec_char16 spu_rlqwbyte(vec_char16 a, int count) static __inline vec_ushort8 spu_rlqwbyte(vec_ushort8 a, int count) { return ((vec_ushort8)(si_rotqby((qword)(a), si_from_int(count)))); -} +} static __inline vec_short8 spu_rlqwbyte(vec_short8 a, int count) { @@ -2304,7 +2304,7 @@ static __inline vec_int4 spu_subx(vec_int4 a, vec_int4 b, vec_int4 c) static __inline vec_ushort8 spu_sumb(vec_uchar16 a, vec_uchar16 b) { return ((vec_ushort8)(si_sumb((qword)(a), (qword)(b)))); -} +} /* spu_sync diff --git a/gcc/config/rs6000/vec_types.h b/gcc/config/rs6000/vec_types.h index 6d629cc..e24dc26 100644 --- a/gcc/config/rs6000/vec_types.h +++ b/gcc/config/rs6000/vec_types.h @@ -3,7 +3,7 @@ This file is free 
software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your option) + Software Foundation; either version 3 of the License, or (at your option) any later version. This file is distributed in the hope that it will be useful, but WITHOUT @@ -20,7 +20,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -/* Single token vector data types for the PowerPC SIMD/Vector Multi-media +/* Single token vector data types for the PowerPC SIMD/Vector Multi-media eXtension */ #ifndef _VEC_TYPES_H_ diff --git a/gcc/config/rs6000/xcoff.h b/gcc/config/rs6000/xcoff.h index c22edd7..89bd462 100644 --- a/gcc/config/rs6000/xcoff.h +++ b/gcc/config/rs6000/xcoff.h @@ -28,11 +28,11 @@ #define OBJECT_FORMAT_COFF /* Define the magic numbers that we recognize as COFF. - + AIX 4.3 adds U803XTOCMAGIC (0757) for 64-bit objects and AIX V5 adds U64_TOCMAGIC (0767), but collect2.cc does not include files in the correct order to conditionally define the symbolic name in this macro. - + The AIX linker accepts import/export files as object files, so accept "#!" (0x2321) magic number. */ #define MY_ISCOFF(magic) \ @@ -233,7 +233,7 @@ /* This is how we tell the assembler that two symbols have the same value. */ #define SET_ASM_OP "\t.set " -/* This is how we tell the assembler to equate two values. +/* This is how we tell the assembler to equate two values. The semantic of AIX assembler's .set do not correspond to middle-end expectations. We output aliases as alternative symbols in the front of the definition via DECLARE_FUNCTION_NAME and DECLARE_OBJECT_NAME. diff --git a/gcc/config/rtems.h b/gcc/config/rtems.h index cd5db38..3eca9b5 100644 --- a/gcc/config/rtems.h +++ b/gcc/config/rtems.h @@ -1,4 +1,4 @@ -/* Configuration common to all targets running RTEMS. +/* Configuration common to all targets running RTEMS. Copyright (C) 2000-2024 Free Software Foundation, Inc. This file is part of GCC. diff --git a/gcc/config/rx/rx.cc b/gcc/config/rx/rx.cc index c84e139..00242e8 100644 --- a/gcc/config/rx/rx.cc +++ b/gcc/config/rx/rx.cc @@ -156,7 +156,7 @@ rx_legitimize_address (rtx x, if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS - && REG_P (XEXP (XEXP (x, 0), 0)) + && REG_P (XEXP (XEXP (x, 0), 0)) && REG_P (XEXP (x, 1))) return force_reg (SImode, x); @@ -232,7 +232,7 @@ rx_is_legitimate_address (machine_mode mode, rtx x, switch (GET_MODE_SIZE (mode)) { - default: + default: case 4: factor = 4; break; case 2: factor = 2; break; case 1: factor = 1; break; @@ -299,7 +299,7 @@ rx_is_restricted_memory_address (rtx mem, machine_mode mode) case PLUS: { rtx base, index; - + /* Only allow REG+INT addressing. */ base = XEXP (mem, 0); index = XEXP (mem, 1); @@ -688,7 +688,7 @@ rx_print_operand (FILE * file, rtx op, int letter) fprintf (file, "#"); /* Trickery to avoid problems with shifting 32 bits at a time. 
*/ v = v >> 16; - v = v >> 16; + v = v >> 16; rx_print_integer (file, v); break; } @@ -1002,14 +1002,14 @@ rx_gen_move_template (rtx * operands, bool is_movu) { gcc_assert (GET_MODE (src) != DImode); gcc_assert (GET_MODE (src) != DFmode); - + src_template = "(%A1 - __pid_base)[%P1]"; } else if (MEM_P (src) && rx_small_data_operand (XEXP (src, 0))) { gcc_assert (GET_MODE (src) != DImode); gcc_assert (GET_MODE (src) != DFmode); - + src_template = "%%gp(%A1)[%G1]"; } else @@ -1019,7 +1019,7 @@ rx_gen_move_template (rtx * operands, bool is_movu) { gcc_assert (GET_MODE (dest) != DImode); gcc_assert (GET_MODE (dest) != DFmode); - + dst_template = "%%gp(%A0)[%G0]"; } else @@ -1151,7 +1151,7 @@ rx_function_value (const_tree ret_type, && ! VECTOR_MODE_P (mode) ) return gen_rtx_REG (SImode, FUNC_RETURN_REGNUM); - + return gen_rtx_REG (mode, FUNC_RETURN_REGNUM); } @@ -1279,7 +1279,7 @@ rx_conditional_register_usage (void) /* This is for fast interrupt handlers. Any register in the range r10 to r13 (inclusive) that is currently - marked as fixed is now a viable, call-used register. */ + marked as fixed is now a viable, call-used register. */ for (r = 10; r <= 13; r++) if (fixed_regs[r]) { @@ -1363,7 +1363,7 @@ rx_set_current_function (tree fndecl) current_is_fast_interrupt = fndecl ? is_fast_interrupt_func (fndecl) : false; - + if (prev_was_fast_interrupt != current_is_fast_interrupt) { use_fixed_regs = current_is_fast_interrupt; @@ -1790,7 +1790,7 @@ rx_expand_prologue (void) break; } } - + /* We have assumed that there are at least two registers pushed... */ gcc_assert (acc_high != 0); @@ -1939,7 +1939,7 @@ rx_emit_stack_popm (rtx * operands, bool is_popm) gcc_assert (CONST_INT_P (operands[0])); stack_adjust = INTVAL (operands[0]); - + gcc_assert (GET_CODE (operands[1]) == PARALLEL); last_reg = XVECLEN (operands[1], 0) - (is_popm ? 2 : 3); @@ -1987,13 +1987,13 @@ gen_rx_rtsd_vector (unsigned int adjust, unsigned int low, unsigned int high) return vector; } - + /* Generate a PARALLEL which will satisfy the rx_load_multiple_vector predicate. */ static rtx gen_rx_popm_vector (unsigned int low, unsigned int high) { - unsigned int i; + unsigned int i; unsigned int count = (high - low) + 2; rtx vector; @@ -2877,7 +2877,7 @@ rx_func_attr_inlinable (const_tree decl) { return ! is_fast_interrupt_func (decl) && ! is_interrupt_func (decl) - && ! is_naked_func (decl); + && ! is_naked_func (decl); } static bool @@ -2961,7 +2961,7 @@ rx_is_legitimate_constant (machine_mode mode ATTRIBUTE_UNUSED, rtx x) gcc_unreachable (); } break; - + case LABEL_REF: case SYMBOL_REF: return true; @@ -3001,7 +3001,7 @@ rx_address_cost (rtx addr, machine_mode mode ATTRIBUTE_UNUSED, && ((INTVAL (b) > 128) || INTVAL (b) < -127)) /* Try to discourage REG + <large OFF> when optimizing for size. */ return COSTS_N_INSNS (2); - + return COSTS_N_INSNS (1); } @@ -3421,7 +3421,7 @@ rx_adjust_insn_length (rtx_insn *insn, int current_length) zero = false; factor = 2; break; - + case CODE_FOR_plussi3_zero_extendqi: case CODE_FOR_andsi3_zero_extendqi: case CODE_FOR_iorsi3_zero_extendqi: @@ -3436,7 +3436,7 @@ rx_adjust_insn_length (rtx_insn *insn, int current_length) zero = true; factor = 1; break; - + case CODE_FOR_plussi3_sign_extendqi: case CODE_FOR_andsi3_sign_extendqi: case CODE_FOR_iorsi3_sign_extendqi: @@ -3451,7 +3451,7 @@ rx_adjust_insn_length (rtx_insn *insn, int current_length) zero = false; factor = 1; break; - } + } /* We are expecting: (SET (REG) (<OP> (REG) (<EXTEND> (MEM)))). 
*/ extend = single_set (insn); @@ -3466,7 +3466,7 @@ rx_adjust_insn_length (rtx_insn *insn, int current_length) gcc_assert ((zero && (GET_CODE (extend) == ZERO_EXTEND)) || (! zero && (GET_CODE (extend) == SIGN_EXTEND))); - + mem = XEXP (extend, 0); gcc_checking_assert (MEM_P (mem)); if (REG_P (XEXP (mem, 0))) diff --git a/gcc/config/s390/s390-c.cc b/gcc/config/s390/s390-c.cc index 4521a86..0332fb4 100644 --- a/gcc/config/s390/s390-c.cc +++ b/gcc/config/s390/s390-c.cc @@ -29,6 +29,7 @@ #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h index b4646cc..e7ac59d 100644 --- a/gcc/config/s390/s390-protos.h +++ b/gcc/config/s390/s390-protos.h @@ -50,7 +50,6 @@ extern void s390_set_has_landing_pad_p (bool); extern bool s390_hard_regno_rename_ok (unsigned int, unsigned int); extern int s390_class_max_nregs (enum reg_class, machine_mode); extern bool s390_return_addr_from_memory(void); -extern rtx s390_gen_lowpart_subreg (machine_mode, rtx); extern bool s390_fma_allowed_p (machine_mode); #if S390_USE_TARGET_ATTRIBUTE extern tree s390_valid_target_attribute_tree (tree args, diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc index 47e1d5a..874b112 100644 --- a/gcc/config/s390/s390.cc +++ b/gcc/config/s390/s390.cc @@ -22,6 +22,7 @@ along with GCC; see the file COPYING3. If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" @@ -516,31 +517,6 @@ s390_return_addr_from_memory () return cfun_gpr_save_slot(RETURN_REGNUM) == SAVE_SLOT_STACK; } -/* Generate a SUBREG for the MODE lowpart of EXPR. - - In contrast to gen_lowpart it will always return a SUBREG - expression. This is useful to generate STRICT_LOW_PART - expressions. */ -rtx -s390_gen_lowpart_subreg (machine_mode mode, rtx expr) -{ - rtx lowpart = gen_lowpart (mode, expr); - - /* There might be no SUBREG in case it could be applied to the hard - REG rtx or it could be folded with a paradoxical subreg. Bring - it back. */ - if (!SUBREG_P (lowpart)) - { - machine_mode reg_mode = TARGET_ZARCH ? DImode : SImode; - gcc_assert (REG_P (lowpart)); - lowpart = gen_lowpart_SUBREG (mode, - gen_rtx_REG (reg_mode, - REGNO (lowpart))); - } - - return lowpart; -} - /* Return nonzero if it's OK to use fused multiply-add for MODE. */ bool s390_fma_allowed_p (machine_mode mode) @@ -3714,6 +3690,18 @@ s390_mem_constraint (const char *str, rtx op) if ((reload_completed || reload_in_progress) ? !offsettable_memref_p (op) : !offsettable_nonstrict_memref_p (op)) return 0; + /* offsettable_memref_p ensures only that any positive offset added to + the address forms a valid general address. For AQ and AR constraints + we also have to verify that the resulting displacement after adding + any positive offset less than the size of the object being referenced + is still valid. */ + if (str[1] == 'Q' || str[1] == 'R') + { + int o = GET_MODE_SIZE (GET_MODE (op)) - 1; + rtx tmp = adjust_address (op, QImode, o); + if (!s390_check_qrst_address (str[1], XEXP (tmp, 0), true)) + return 0; + } return s390_check_qrst_address (str[1], XEXP (op, 0), true); case 'B': /* Check for non-literal-pool variants of memory constraints. */ @@ -7112,15 +7100,21 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src) /* Emit a strict_low_part pattern if possible. 
*/ if (smode_bsize == bitsize && bitpos == mode_bsize - smode_bsize) { - rtx low_dest = s390_gen_lowpart_subreg (smode, dest); - rtx low_src = gen_lowpart (smode, src); - - switch (smode) + rtx low_dest = gen_lowpart (smode, dest); + if (SUBREG_P (low_dest) && !paradoxical_subreg_p (low_dest)) { - case E_QImode: emit_insn (gen_movstrictqi (low_dest, low_src)); return true; - case E_HImode: emit_insn (gen_movstricthi (low_dest, low_src)); return true; - case E_SImode: emit_insn (gen_movstrictsi (low_dest, low_src)); return true; - default: break; + poly_int64 offset = GET_MODE_SIZE (mode) - GET_MODE_SIZE (smode); + rtx low_src = adjust_address (src, smode, offset); + switch (smode) + { + case E_QImode: emit_insn (gen_movstrictqi (low_dest, low_src)); + return true; + case E_HImode: emit_insn (gen_movstricthi (low_dest, low_src)); + return true; + case E_SImode: emit_insn (gen_movstrictsi (low_dest, low_src)); + return true; + default: break; + } } } @@ -8607,7 +8601,6 @@ print_operand_address (FILE *file, rtx addr) 't': CONST_INT: "start" of contiguous bitmask X in SImode. 'x': print integer X as if it's an unsigned halfword. 'v': print register number as vector register (v1 instead of f1). - 'V': print the second word of a TFmode operand as vector register. */ void @@ -8861,13 +8854,13 @@ print_operand (FILE *file, rtx x, int code) case REG: /* Print FP regs as fx instead of vx when they are accessed through non-vector mode. */ - if ((code == 'v' || code == 'V') + if (code == 'v' || VECTOR_NOFP_REG_P (x) || (FP_REG_P (x) && VECTOR_MODE_P (GET_MODE (x))) || (VECTOR_REG_P (x) && (GET_MODE_SIZE (GET_MODE (x)) / s390_class_max_nregs (FP_REGS, GET_MODE (x))) > 8)) - fprintf (file, "%%v%s", reg_names[REGNO (x) + (code == 'V')] + 2); + fprintf (file, "%%v%s", reg_names[REGNO (x)] + 2); else fprintf (file, "%s", reg_names[REGNO (x)]); break; @@ -11350,13 +11343,6 @@ s390_can_change_mode_class (machine_mode from_mode, return true; } -/* Return true if we use LRA instead of reload pass. */ -static bool -s390_lra_p (void) -{ - return s390_lra_flag; -} - /* Return true if register FROM can be eliminated via register TO. */ static bool @@ -18452,9 +18438,6 @@ s390_c_mode_for_floating_type (enum tree_index ti) #undef TARGET_LEGITIMATE_CONSTANT_P #define TARGET_LEGITIMATE_CONSTANT_P s390_legitimate_constant_p -#undef TARGET_LRA_P -#define TARGET_LRA_P s390_lra_p - #undef TARGET_CAN_ELIMINATE #define TARGET_CAN_ELIMINATE s390_can_eliminate diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md index 3d5759d..4a225ae 100644 --- a/gcc/config/s390/s390.md +++ b/gcc/config/s390/s390.md @@ -241,6 +241,8 @@ UNSPEC_VEC_VFMIN UNSPEC_VEC_VFMAX + UNSPEC_TF_TO_FPRX2 + UNSPEC_NNPA_VCLFNHS_V8HI UNSPEC_NNPA_VCLFNLS_V8HI UNSPEC_NNPA_VCRNFS_V8HI @@ -1974,12 +1976,11 @@ "TARGET_ZARCH" "#" "&& reload_completed" - [(set (match_dup 2) (match_dup 4)) + [(set (match_dup 2) (match_dup 3)) (set (match_dup 0) (ashift:DI (match_dup 0) (const_int 32))) - (set (strict_low_part (match_dup 3)) (match_dup 5))] + (set (strict_low_part (match_dup 2)) (match_dup 4))] "operands[2] = gen_lowpart (SImode, operands[0]); - operands[3] = s390_gen_lowpart_subreg (SImode, operands[0]); - s390_split_access_reg (operands[1], &operands[5], &operands[4]);") + s390_split_access_reg (operands[1], &operands[4], &operands[3]);") ; Splitters for storing TLS pointer to %a0:DI. 
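The s390_mem_constraint change above rejects an AQ/AR operand when the displacement of its last byte would fall outside the addressable range. A standalone sketch of that end-of-object check, assuming the unsigned 12-bit short-displacement range purely for illustration (in the patch the real bound checking is done by s390_check_qrst_address on an adjust_address of the last byte):

#include <stdbool.h>

/* Illustrative only: for a short-displacement address disp(base), every
   byte of an nbytes-wide access must stay addressable, i.e. both disp
   and disp + nbytes - 1 must lie in the assumed 0..4095 range.  */
bool
short_disp_ok (long disp, long nbytes)
{
  long last = disp + nbytes - 1;
  return disp >= 0 && last <= 4095;
}

/* Example: an 8-byte access at displacement 4092 is rejected, since its
   last byte would sit at displacement 4099.  */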
@@ -5068,7 +5069,7 @@ (parallel [(set (strict_low_part (match_dup 2)) (match_dup 1)) (clobber (reg:CC CC_REGNUM))])] - "operands[2] = s390_gen_lowpart_subreg (HImode, operands[0]);") + "operands[2] = gen_lowpart (HImode, operands[0]);") (define_insn_and_split "*zero_extendqisi2_31" [(set (match_operand:SI 0 "register_operand" "=&d") @@ -5078,7 +5079,7 @@ "&& reload_completed" [(set (match_dup 0) (const_int 0)) (set (strict_low_part (match_dup 2)) (match_dup 1))] - "operands[2] = s390_gen_lowpart_subreg (QImode, operands[0]);") + "operands[2] = gen_lowpart (QImode, operands[0]);") ; ; zero_extendqihi2 instruction pattern(s). @@ -5110,7 +5111,7 @@ "&& reload_completed" [(set (match_dup 0) (const_int 0)) (set (strict_low_part (match_dup 2)) (match_dup 1))] - "operands[2] = s390_gen_lowpart_subreg (QImode, operands[0]);") + "operands[2] = gen_lowpart (QImode, operands[0]);") ; ; fixuns_trunc(dd|td|sf|df|tf)(si|di)2 expander diff --git a/gcc/config/s390/s390.opt b/gcc/config/s390/s390.opt index a5b5aa9..23ea4b8 100644 --- a/gcc/config/s390/s390.opt +++ b/gcc/config/s390/s390.opt @@ -229,10 +229,6 @@ Set the branch costs for conditional branch instructions. Reasonable values are small, non-negative integers. The default branch cost is 1. -mlra -Target Var(s390_lra_flag) Init(1) Save -Use LRA instead of reload. - mpic-data-is-text-relative Target Var(s390_pic_data_is_text_relative) Init(TARGET_DEFAULT_PIC_DATA_IS_TEXT_RELATIVE) Assume data segments are relative to text segment. diff --git a/gcc/config/s390/s390.opt.urls b/gcc/config/s390/s390.opt.urls index ab1e761..bc772d2 100644 --- a/gcc/config/s390/s390.opt.urls +++ b/gcc/config/s390/s390.opt.urls @@ -74,8 +74,6 @@ UrlSuffix(gcc/S_002f390-and-zSeries-Options.html#index-mzarch) ; skipping UrlSuffix for 'mbranch-cost=' due to finding no URLs -; skipping UrlSuffix for 'mlra' due to finding no URLs - ; skipping UrlSuffix for 'mpic-data-is-text-relative' due to finding no URLs ; skipping UrlSuffix for 'mindirect-branch=' due to finding no URLs diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index a75b7cb..e6f83d0 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -907,36 +907,45 @@ "vmrlg\t%0,%1,%2"; [(set_attr "op_type" "VRR")]) - -(define_insn "*tf_to_fprx2_0" - [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0) - (subreg:DF (match_operand:TF 1 "general_operand" "v") 0))] - "TARGET_VXE" - ; M4 == 1 corresponds to %v0[0] = %v1[0]; %v0[1] = %v0[1]; - "vpdi\t%v0,%v1,%v0,1" - [(set_attr "op_type" "VRR")]) - -(define_insn "*tf_to_fprx2_1" - [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 8) - (subreg:DF (match_operand:TF 1 "general_operand" "v") 8))] +(define_insn "tf_to_fprx2" + [(set (match_operand:FPRX2 0 "register_operand" "=f,f ,f") + (unspec:FPRX2 [(match_operand:TF 1 "general_operand" "v,AR,AT")] + UNSPEC_TF_TO_FPRX2))] "TARGET_VXE" - ; M4 == 5 corresponds to %V0[0] = %v1[1]; %V0[1] = %V0[1]; - "vpdi\t%V0,%v1,%V0,5" - [(set_attr "op_type" "VRR")]) - -(define_insn_and_split "tf_to_fprx2" - [(set (match_operand:FPRX2 0 "nonimmediate_operand" "=f,f") - (subreg:FPRX2 (match_operand:TF 1 "general_operand" "v,AR") 0))] - "TARGET_VXE" - "#" - "!(MEM_P (operands[1]) && MEM_VOLATILE_P (operands[1]))" - [(set (match_dup 2) (match_dup 3)) - (set (match_dup 4) (match_dup 5))] { - operands[2] = simplify_gen_subreg (DFmode, operands[0], FPRX2mode, 0); - operands[3] = simplify_gen_subreg (DFmode, operands[1], TFmode, 0); - operands[4] = simplify_gen_subreg (DFmode, 
operands[0], FPRX2mode, 8); - operands[5] = simplify_gen_subreg (DFmode, operands[1], TFmode, 8); + char buf[64]; + const char *reg_pair = reg_names[REGNO (operands[0]) + 1]; + switch (which_alternative) + { + case 0: + if (REGNO (operands[0]) == REGNO (operands[1])) + { + reg_pair += 2; // get rid of prefix %f + snprintf (buf, sizeof (buf), "vpdi\t%%%%v%s,%%v1,%%%%v%s,5", reg_pair, reg_pair); + output_asm_insn (buf, operands); + return ""; + } + else + { + reg_pair += 2; // get rid of prefix %f + snprintf (buf, sizeof (buf), "ldr\t%%f0,%%f1;vpdi\t%%%%v%s,%%v1,%%%%v%s,5", reg_pair, reg_pair); + output_asm_insn (buf, operands); + return ""; + } + case 1: + { + snprintf (buf, sizeof (buf), "ld\t%%f0,%%1;ld\t%%%s,8+%%1", reg_pair); + output_asm_insn (buf, operands); + return ""; + } + case 2: + { + snprintf (buf, sizeof (buf), "ldy\t%%f0,%%1;ldy\t%%%s,8+%%1", reg_pair); + output_asm_insn (buf, operands); + return ""; + } + default: gcc_unreachable (); + } }) ;; VECTOR REVERSE ELEMENTS V16QI @@ -2830,9 +2839,8 @@ ; There is no instruction for rounding an extended BFP operand in a VR into ; a signed integer, therefore copy it into a FPR pair first. (define_expand "fix_trunctf<mode>2_vr" - [(set (subreg:DF (match_dup 2) 0) - (subreg:DF (match_operand:TF 1 "register_operand" "") 0)) - (set (subreg:DF (match_dup 2) 8) (subreg:DF (match_dup 1) 8)) + [(set (match_dup 2) + (unspec:FPRX2 [(match_operand:TF 1 "register_operand")] UNSPEC_TF_TO_FPRX2)) (parallel [(set (match_operand:GPR 0 "register_operand" "") (fix:GPR (match_dup 2))) (unspec:GPR [(const_int BFP_RND_TOWARD_0)] UNSPEC_ROUND) @@ -2863,9 +2871,8 @@ ; There is no instruction for rounding an extended BFP operand in a VR into ; an unsigned integer, therefore copy it into a FPR pair first. (define_expand "fixuns_trunctf<mode>2_vr" - [(set (subreg:DF (match_dup 2) 0) - (subreg:DF (match_operand:TF 1 "register_operand" "") 0)) - (set (subreg:DF (match_dup 2) 8) (subreg:DF (match_dup 1) 8)) + [(set (match_dup 2) + (unspec:FPRX2 [(match_operand:TF 1 "register_operand")] UNSPEC_TF_TO_FPRX2)) (parallel [(set (match_operand:GPR 0 "register_operand" "") (unsigned_fix:GPR (match_dup 2))) (unspec:GPR [(const_int BFP_RND_TOWARD_0)] UNSPEC_ROUND) diff --git a/gcc/config/sh/elf.h b/gcc/config/sh/elf.h index 505c5d6..33a6906 100644 --- a/gcc/config/sh/elf.h +++ b/gcc/config/sh/elf.h @@ -33,7 +33,7 @@ along with GCC; see the file COPYING3. If not see #undef WCHAR_TYPE #define WCHAR_TYPE SH_ELF_WCHAR_TYPE - + #undef WCHAR_TYPE_SIZE #define WCHAR_TYPE_SIZE 32 diff --git a/gcc/config/sh/embed-elf.h b/gcc/config/sh/embed-elf.h index fef16de..844aff9 100644 --- a/gcc/config/sh/embed-elf.h +++ b/gcc/config/sh/embed-elf.h @@ -1,4 +1,4 @@ -/* Definitions of target machine for GNU compiler for Renesas / SuperH SH +/* Definitions of target machine for GNU compiler for Renesas / SuperH SH non-Linux embedded targets. Copyright (C) 2002-2024 Free Software Foundation, Inc. Contributed by J"orn Rennecke <joern.rennecke@superh.com> diff --git a/gcc/config/sh/netbsd-elf.h b/gcc/config/sh/netbsd-elf.h index f195710..b937155 100644 --- a/gcc/config/sh/netbsd-elf.h +++ b/gcc/config/sh/netbsd-elf.h @@ -62,7 +62,7 @@ along with GCC; see the file COPYING3. If not see /* Define because we use the label and we do not need them. 
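The new tf_to_fprx2 output code above builds its vpdi/ld templates at run time by taking reg_names[REGNO (...) + 1] for the second register of the FPR pair and skipping its "%f" prefix. A standalone sketch of just that string handling, with made-up register names (the doubled '%' characters in the patch exist only because its string is later fed through output_asm_insn):

#include <stdio.h>

int
main (void)
{
  /* Hypothetical FPR pair %f4/%f5, mimicking the "%f<N>" spelling.  */
  const char *reg_names[] = { "%f4", "%f5" };
  const char *pair = reg_names[1] + 2;   /* drop the "%f" prefix -> "5" */
  char buf[64];

  snprintf (buf, sizeof (buf), "vpdi\t%%v%s,%%v1,%%v%s,5", pair, pair);
  puts (buf);                            /* vpdi  %v5,%v1,%v5,5 */
  return 0;
}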
*/ #define NO_PROFILE_COUNTERS 1 - + #undef FUNCTION_PROFILER #define FUNCTION_PROFILER(STREAM,LABELNO) \ do \ diff --git a/gcc/config/sh/sh.cc b/gcc/config/sh/sh.cc index 7391b8d..663c9908 100644 --- a/gcc/config/sh/sh.cc +++ b/gcc/config/sh/sh.cc @@ -377,7 +377,7 @@ TARGET_GNU_ATTRIBUTES (sh_attribute_table, #define TARGET_PRINT_OPERAND_PUNCT_VALID_P sh_print_operand_punct_valid_p #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA sh_asm_output_addr_const_extra - + #undef TARGET_ASM_FUNCTION_EPILOGUE #define TARGET_ASM_FUNCTION_EPILOGUE sh_output_function_epilogue @@ -814,7 +814,7 @@ register_sh_passes (void) PASS_POS_INSERT_BEFORE, "sched2", 1); } -/* Implement TARGET_OPTION_OVERRIDE macro. Validate and override +/* Implement TARGET_OPTION_OVERRIDE macro. Validate and override various options, and do some machine dependent initialization. */ static void sh_option_override (void) @@ -1012,7 +1012,7 @@ sh_override_options_after_change (void) fetched as a pair from a longword boundary. For size use 16 bit alignment to get more compact code. Aligning all jumps increases the code size, even if it might - result in slightly faster code. Thus, it is set to the smallest + result in slightly faster code. Thus, it is set to the smallest alignment possible if not specified by the user. */ if (flag_align_loops && !str_align_loops) str_align_loops = optimize_size ? "2" : "4"; @@ -2265,7 +2265,7 @@ sh_eval_treg_value (rtx op) t = 1; else return -1; - + return t ^ (cmpval == cmpop); } @@ -2543,7 +2543,7 @@ output_movedouble (rtx insn ATTRIBUTE_UNUSED, rtx operands[], We punt for now, since this is likely very rare. */ gcc_assert (!REG_P (XEXP (inside, 1))); break; - + case LABEL_REF: return "mov.l %1,%0" "\n" " mov.l %1+4,%T0"; @@ -3016,7 +3016,7 @@ bool sh_ashlsi_clobbers_t_reg_p (rtx shift_amount) { gcc_assert (CONST_INT_P (shift_amount)); - + const int shift_amount_i = INTVAL (shift_amount) & 31; /* Special case for shift count of 31: use and-rotl sequence. */ @@ -3036,7 +3036,7 @@ sh_lshrsi_clobbers_t_reg_p (rtx shift_amount) /* For right shifts the constant might be negative. */ const int shift_amount_i = std::abs (INTVAL (shift_amount)) & 31; - + /* Special case for shift count of 31: use shll-movt sequence. */ if (shift_amount_i == 31) return true; @@ -3046,7 +3046,7 @@ sh_lshrsi_clobbers_t_reg_p (rtx shift_amount) } /* Return true if it is potentially beneficial to use a dynamic shift - instruction (shad / shar) instead of a combination of 1/2/8/16 + instruction (shad / shar) instead of a combination of 1/2/8/16 shift instructions for the specified shift count. If dynamic shifts are not available, always return false. */ bool @@ -3231,7 +3231,7 @@ multcosts (rtx x ATTRIBUTE_UNUSED) static bool sh_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code, int opno ATTRIBUTE_UNUSED, - int *total, bool speed ATTRIBUTE_UNUSED) + int *total, bool speed) { int code = GET_CODE (x); @@ -3240,7 +3240,7 @@ sh_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code, /* The lower-subreg pass decides whether to split multi-word regs into individual regs by looking at the cost for a SET of certain modes with the following patterns: - (set (reg) (reg)) + (set (reg) (reg)) (set (reg) (const_int 0)) On machines that support vector-move operations a multi-word move is the same cost as individual reg move. 
On SH there is no @@ -3264,10 +3264,12 @@ sh_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code, } return false; - /* The cost of a mem access is mainly the cost of the address mode. */ + /* The cost of a mem access is mainly the cost of the address mode on top + of the cost of the load/store insn itself. */ case MEM: *total = sh_address_cost (XEXP (x, 0), GET_MODE (x), MEM_ADDR_SPACE (x), - true); + speed) + + COSTS_N_INSNS (1); return true; case IF_THEN_ELSE: @@ -3317,7 +3319,8 @@ sh_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code, { *total = sh_address_cost (XEXP (XEXP (x, 0), 0), GET_MODE (XEXP (x, 0)), - MEM_ADDR_SPACE (XEXP (x, 0)), true); + MEM_ADDR_SPACE (XEXP (x, 0)), speed) + + COSTS_N_INSNS (1); return true; } return false; @@ -3333,9 +3336,10 @@ sh_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code, || GET_MODE (XEXP (x, 0)) == HImode)) { /* Handle SH2A's movu.b and movu.w insn. */ - *total = sh_address_cost (XEXP (XEXP (x, 0), 0), - GET_MODE (XEXP (x, 0)), - MEM_ADDR_SPACE (XEXP (x, 0)), true); + *total = sh_address_cost (XEXP (XEXP (x, 0), 0), + GET_MODE (XEXP (x, 0)), + MEM_ADDR_SPACE (XEXP (x, 0)), speed) + + COSTS_N_INSNS (1); return true; } return false; @@ -3348,16 +3352,18 @@ sh_rtx_costs (rtx x, machine_mode mode ATTRIBUTE_UNUSED, int outer_code, rtx xx = XVECEXP (x, 0, i); if (GET_CODE (xx) == SET && MEM_P (XEXP (xx, 0))) { - *total = sh_address_cost (XEXP (XEXP (xx, 0), 0), + *total = sh_address_cost (XEXP (XEXP (xx, 0), 0), GET_MODE (XEXP (xx, 0)), - MEM_ADDR_SPACE (XEXP (xx, 0)), true); + MEM_ADDR_SPACE (XEXP (xx, 0)), speed) + + COSTS_N_INSNS (1); return true; } if (GET_CODE (xx) == SET && MEM_P (XEXP (xx, 1))) { *total = sh_address_cost (XEXP (XEXP (xx, 1), 0), GET_MODE (XEXP (xx, 1)), - MEM_ADDR_SPACE (XEXP (xx, 1)), true); + MEM_ADDR_SPACE (XEXP (xx, 1)), speed) + + COSTS_N_INSNS (1); return true; } } @@ -3575,7 +3581,7 @@ sh_max_mov_insn_displacement (machine_mode mode, bool consider_sh2a) const int mov_insn_sz = mov_insn_size (mode, consider_sh2a); const int mode_sz = GET_MODE_SIZE (mode); int r = 15 * mov_insn_sz * disp_scale; - + /* If the mov insn will be split into multiple loads/stores, the maximum possible displacement is a bit smaller. */ if (mode_sz > mov_insn_sz) @@ -3645,7 +3651,7 @@ sh_address_cost (rtx x, machine_mode mode, return 3; } - /* 'reg + reg' addressing. Account a slightly higher cost because of + /* 'reg + reg' addressing. Account a slightly higher cost because of increased pressure on R0. */ if (GET_CODE (x) == PLUS && ! CONSTANT_P (XEXP (x, 1))) return 3; @@ -5225,7 +5231,7 @@ find_barrier (int num_mova, rtx_insn *mova, rtx_insn *from) from = PREV_INSN (from); /* Don't emit a constant table int the middle of global pointer setting, - since that that would move the addressing base GOT into another table. + since that that would move the addressing base GOT into another table. We need the first mov instruction before the _GLOBAL_OFFSET_TABLE_ in the pool anyway, so just move up the whole constant pool. @@ -6059,7 +6065,7 @@ sh_reorg (void) later insn. */ /* ??? We shouldn't have to use FOUNDINSN here. - This dates back to when we used LOG_LINKS to find + This dates back to when we used LOG_LINKS to find the most recent insn which sets the register. */ if (foundinsn @@ -6759,7 +6765,7 @@ output_stack_adjust (int size, rtx reg, int epilogue_p, if (temp < 0) { rtx adj_reg, tmp_reg, mem; - + /* If we reached here, the most likely case is the (sibcall) epilogue. 
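The sh_rtx_costs hunk above now prices a MEM as its address cost plus one instruction instead of the address cost alone. A toy version of that accounting; the COSTS_N_INSNS definition is assumed here to be the usual (N) * 4 scaling from rtl.h:

/* Toy model only: a memory access costs its addressing mode plus the
   load/store instruction itself.  */
#define COSTS_N_INSNS(n) ((n) * 4)   /* assumed to match rtl.h */

int
mem_rtx_cost (int address_cost)
{
  return address_cost + COSTS_N_INSNS (1);
}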
Put a special push/pop sequence for such case as the last resort. This looks lengthy but would not be problem @@ -6770,7 +6776,7 @@ output_stack_adjust (int size, rtx reg, int epilogue_p, r5 have been reserved as fixed registers or assigned as global registers, and they change during an interrupt. There are possible ways to handle this: - + - If we are adjusting the frame pointer (r14), we can do with a single temp register and an ordinary push / pop on the stack. @@ -7268,7 +7274,7 @@ sh_expand_epilogue (bool sibcall_p) /* For an ISR with RESBANK attribute assigned, don't pop PR register. */ if (TEST_HARD_REG_BIT (live_regs_mask, PR_REG) - && !sh_cfun_resbank_handler_p ()) + && !sh_cfun_resbank_handler_p ()) { if (!frame_pointer_needed) emit_insn (gen_blockage ()); @@ -7328,7 +7334,7 @@ sh_expand_epilogue (bool sibcall_p) fpscr_deferred = true; /* For an ISR with RESBANK attribute assigned, don't pop following registers, R0-R14, MACH, MACL and GBR. */ - else if (j != PR_REG && TEST_HARD_REG_BIT (live_regs_mask, j) + else if (j != PR_REG && TEST_HARD_REG_BIT (live_regs_mask, j) && ! (sh_cfun_resbank_handler_p () && ((j >= FIRST_GENERAL_REG && j < LAST_GENERAL_REG) @@ -9189,7 +9195,7 @@ legitimize_pic_address (rtx orig, machine_mode mode ATTRIBUTE_UNUSED, rtx reg) In some cases it is possible that a requested offset might seem unaligned or inappropriate for the mode size, like offset = 2 and mode size = 4. This is compensated by adjusting the base address so that the effective - address of the displacement move insn will be aligned. + address of the displacement move insn will be aligned. This is not the best possible way of rebasing the base address, as it does not look at other present displacement addressings around it. @@ -10405,7 +10411,7 @@ sh_vector_mode_supported_p (machine_mode mode ATTRIBUTE_UNUSED) bool sh_frame_pointer_required (void) { -/* If needed override this in other tm.h files to cope with various OS +/* If needed override this in other tm.h files to cope with various OS lossage requiring a frame pointer. */ if (SUBTARGET_FRAME_POINTER_REQUIRED) return true; @@ -11393,14 +11399,14 @@ sh_secondary_reload (bool in_p, rtx x, reg_class_t rclass_i, <= sh_max_mov_insn_displacement (mode, false)) return R0_REGS; - /* When reload is trying to address a QImode or HImode subreg on the stack, + /* When reload is trying to address a QImode or HImode subreg on the stack, force any subreg byte into R0_REGS, as this is going to become a displacement address. We could restrict this to SUBREG_BYTE (x) > 0, but if the actual reg is on the stack, the memref to it might already require a displacement and that has to be added to the final address. At this point we don't know the cumulative displacement so we assume the worst case. 
*/ - if ((mode == QImode || mode == HImode) && rclass != R0_REGS + if ((mode == QImode || mode == HImode) && rclass != R0_REGS && GET_CODE (x) == SUBREG && true_regnum (x) == -1) return R0_REGS; @@ -11439,7 +11445,7 @@ sh_legitimize_address_displacement (rtx *offset1, rtx *offset2, *offset2 = adj.mov_disp; return true; } - + return false; } @@ -11589,7 +11595,7 @@ base_reg_disp::base_reg_disp (rtx br, disp_t d) : reg_ (br), disp_ (d) { } - + inline bool base_reg_disp::is_reg (void) const { @@ -11934,7 +11940,7 @@ sh_is_logical_t_store_expr (rtx op, rtx_insn* insn) op_is_t_count++; } } - + return op_is_t_count == 2; } diff --git a/gcc/config/sh/sh.h b/gcc/config/sh/sh.h index 53cad85..e0ac35c 100644 --- a/gcc/config/sh/sh.h +++ b/gcc/config/sh/sh.h @@ -293,7 +293,7 @@ extern int code_for_indirect_jump_scratch; #else #define IS_LITTLE_ENDIAN_OPTION "%{!mb:" #endif - + #if TARGET_CPU_DEFAULT & MASK_HARD_SH2A #define UNSUPPORTED_SH2A IS_LITTLE_ENDIAN_OPTION \ "%{m2a*|!m1:%{!m2*:%{!m3*:%{!m4*:%eSH2a does not support little-endian}}}}}" @@ -1490,7 +1490,7 @@ extern bool current_function_interrupt; return X << (Y & 31); else return X >> (-Y) & 31); - + The dynamic shift library routines in lib1funcs.S do not use the sign bit like the hardware dynamic shifts and truncate the shift count to 31. We define SHIFT_COUNT_TRUNCATED to 0 and express the implied shift count diff --git a/gcc/config/sh/sh_treg_combine.cc b/gcc/config/sh/sh_treg_combine.cc index a26fcfb..db40573 100644 --- a/gcc/config/sh/sh_treg_combine.cc +++ b/gcc/config/sh/sh_treg_combine.cc @@ -634,7 +634,7 @@ sh_treg_combine::sh_treg_combine (gcc::context* ctx, bool split_insns, m_split_insns (split_insns), m_ccreg (NULL_RTX) { - // Overwrite default name in pass_data base class. + // Overwrite default name in pass_data base class. this->name = name; } diff --git a/gcc/config/sh/vxworks.h b/gcc/config/sh/vxworks.h index 7a07ce9..15786b2 100644 --- a/gcc/config/sh/vxworks.h +++ b/gcc/config/sh/vxworks.h @@ -1,8 +1,8 @@ /* Definitions of target machine for GCC, - for SuperH with targeting the VXWorks run time environment. + for SuperH with targeting the VXWorks run time environment. Copyright (C) 2003-2024 Free Software Foundation, Inc. Contributed by CodeSourcery, LLC. - + This file is part of GCC. GCC is free software; you can redistribute it and/or modify diff --git a/gcc/config/sol2-cxx.cc b/gcc/config/sol2-cxx.cc index aa558be..4f5fbc6 100644 --- a/gcc/config/sol2-cxx.cc +++ b/gcc/config/sol2-cxx.cc @@ -17,6 +17,7 @@ You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/sol2.cc b/gcc/config/sol2.cc index 4efa693..1cc525a 100644 --- a/gcc/config/sol2.cc +++ b/gcc/config/sol2.cc @@ -226,7 +226,7 @@ solaris_elf_asm_comdat_section (const char *name, unsigned int flags, tree decl) directive since Sun as treats undeclared sections as @progbits, which conflicts with .bss* sections which are @nobits. */ targetm.asm_out.named_section (section, flags & ~SECTION_LINKONCE, decl); - + /* Sun as separates declaration of a group section and of the group itself, using the .group directive and the #comdat flag. 
*/ fprintf (asm_out_file, "\t.group\t%s," SECTION_NAME_FORMAT ",#comdat\n", diff --git a/gcc/config/sparc/constraints.md b/gcc/config/sparc/constraints.md index 350ad8e..6cb7a30 100644 --- a/gcc/config/sparc/constraints.md +++ b/gcc/config/sparc/constraints.md @@ -145,51 +145,6 @@ (match_test "TARGET_ARCH32") (match_test "memory_ok_for_ldd (op)"))) -;; This awkward register constraint is necessary because it is not -;; possible to express the "must be even numbered register" condition -;; using register classes. The problem is that membership in a -;; register class requires that all registers of a multi-regno -;; register be included in the set. It is add_to_hard_reg_set -;; and in_hard_reg_set_p which populate and test regsets with these -;; semantics. -;; -;; So this means that we would have to put both the even and odd -;; register into the register class, which would not restrict things -;; at all. -;; -;; Using a combination of GENERAL_REGS and TARGET_HARD_REGNO_MODE_OK is -;; not a full solution either. In fact, even though IRA uses the macro -;; TARGET_HARD_REGNO_MODE_OK to calculate which registers are prohibited -;; from use in certain modes, it still can allocate an odd hard register -;; for DImode values. This is due to how IRA populates the table -;; ira_useful_class_mode_regs[][]. It suffers from the same problem -;; as using a register class to describe this restriction. Namely, it -;; sets both the odd and even part of an even register pair in the -;; regset. Therefore IRA can and will allocate odd registers for -;; DImode values on 32-bit. -;; -;; There are legitimate cases where DImode values can end up in odd -;; hard registers, the most notable example is argument passing. -;; -;; What saves us is reload and the DImode splitters. Both are -;; necessary. The odd register splitters cannot match if, for -;; example, we have a non-offsetable MEM. Reload will notice this -;; case and reload the address into a single hard register. -;; -;; The real downfall of this awkward register constraint is that it -;; does not evaluate to a true register class like a bonafide use of -;; define_register_constraint would. This means that we cannot use -;; it with LRA, since the constraint processing of LRA really depends -;; upon whether an extra constraint is for registers or not. It uses -;; reg_class_for_constraint, and checks it against NO_REGS. -(define_constraint "U" - "Pseudo-register or hard even-numbered integer register" - (and (match_code "reg") - (ior (match_test "REGNO (op) < FIRST_PSEUDO_REGISTER") - (not (match_test "reload_in_progress && reg_renumber [REGNO (op)] < 0"))) - (match_test "TARGET_ARCH32") - (match_test "register_ok_for_ldd (op)"))) - (define_memory_constraint "W" "A memory with only a base register" (match_operand 0 "mem_noofs_operand")) diff --git a/gcc/config/sparc/freebsd.h b/gcc/config/sparc/freebsd.h index 5396b32..ee2a210 100644 --- a/gcc/config/sparc/freebsd.h +++ b/gcc/config/sparc/freebsd.h @@ -55,7 +55,7 @@ along with GCC; see the file COPYING3. If not see /************************[ Target stuff ]***********************************/ -/* Define the actual types of some ANSI-mandated types. +/* Define the actual types of some ANSI-mandated types. Needs to agree with <machine/ansi.h>. GCC defaults come from c-decl.cc, c-common.cc, and config/<arch>/<arch>.h. */ @@ -111,7 +111,7 @@ along with GCC; see the file COPYING3. If not see /* DWARF bits. */ -/* Follow Irix 6 and not the Dwarf2 draft in using 64-bit offsets. 
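The long comment deleted above explains that the "U" constraint only existed to steer DImode values into even-numbered integer registers so ldd/std can treat them as a register pair. A minimal sketch of that invariant; the helper name below is illustrative, the real checks live in register_ok_for_ldd and memory_ok_for_ldd:

#include <stdbool.h>

/* Illustrative only: a 64-bit ldd/std needs an even/odd register pair,
   so the first hard register number must be even.  */
bool
even_pair_start_p (unsigned int regno)
{
  return (regno % 2) == 0;
}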
+/* Follow Irix 6 and not the Dwarf2 draft in using 64-bit offsets. Obviously the Dwarf2 folks havn't tried to actually build systems with their spec. On a 64-bit system, only 64-bit relocs become RELATIVE relocations. */ diff --git a/gcc/config/sparc/linux.h b/gcc/config/sparc/linux.h index 8cc5389..538845e 100644 --- a/gcc/config/sparc/linux.h +++ b/gcc/config/sparc/linux.h @@ -56,13 +56,13 @@ extern const char *host_detect_local_cpu (int argc, const char **argv); #undef SIZE_TYPE #define SIZE_TYPE "unsigned int" - + #undef PTRDIFF_TYPE #define PTRDIFF_TYPE "int" - + #undef WCHAR_TYPE #define WCHAR_TYPE "int" - + #undef WCHAR_TYPE_SIZE #define WCHAR_TYPE_SIZE 32 diff --git a/gcc/config/sparc/linux64.h b/gcc/config/sparc/linux64.h index 63543f9..dc6b408 100644 --- a/gcc/config/sparc/linux64.h +++ b/gcc/config/sparc/linux64.h @@ -254,7 +254,7 @@ do { \ /* DWARF bits. */ -/* Follow Irix 6 and not the Dwarf2 draft in using 64-bit offsets. +/* Follow Irix 6 and not the Dwarf2 draft in using 64-bit offsets. Obviously the Dwarf2 folks haven't tried to actually build systems with their spec. On a 64-bit system, only 64-bit relocs become RELATIVE relocations. */ diff --git a/gcc/config/sparc/sparc-protos.h b/gcc/config/sparc/sparc-protos.h index 399458a..bc30608 100644 --- a/gcc/config/sparc/sparc-protos.h +++ b/gcc/config/sparc/sparc-protos.h @@ -99,7 +99,7 @@ extern int register_ok_for_ldd (rtx); extern int memory_ok_for_ldd (rtx); extern int v9_regcmp_p (enum rtx_code); /* Function used for V8+ code generation. Returns 1 if the high - 32 bits of REG are 0 before INSN. */ + 32 bits of REG are 0 before INSN. */ extern int sparc_check_64 (rtx, rtx_insn *); extern rtx gen_df_reg (rtx, int); extern void sparc_expand_compare_and_swap (rtx op[]); diff --git a/gcc/config/sparc/sparc.cc b/gcc/config/sparc/sparc.cc index 3a4c13a..3935a97 100644 --- a/gcc/config/sparc/sparc.cc +++ b/gcc/config/sparc/sparc.cc @@ -61,6 +61,7 @@ along with GCC; see the file COPYING3. If not see #include "builtins.h" #include "tree-vector-builder.h" #include "opts.h" +#include "dwarf2out.h" /* This file should be included last. 
*/ #include "target-def.h" @@ -681,6 +682,9 @@ static rtx sparc_libcall_value (machine_mode, const_rtx); static bool sparc_function_value_regno_p (const unsigned int); static unsigned HOST_WIDE_INT sparc_asan_shadow_offset (void); static void sparc_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED; +static bool sparc_output_cfi_directive (FILE *, dw_cfi_ref); +static bool sparc_dw_cfi_oprnd1_desc (dwarf_call_frame_info, + dw_cfi_oprnd_type &); static void sparc_file_end (void); static bool sparc_frame_pointer_required (void); static bool sparc_can_eliminate (const int, const int); @@ -693,7 +697,6 @@ static const char *sparc_mangle_type (const_tree); static void sparc_trampoline_init (rtx, tree, rtx); static machine_mode sparc_preferred_simd_mode (scalar_mode); static reg_class_t sparc_preferred_reload_class (rtx x, reg_class_t rclass); -static bool sparc_lra_p (void); static bool sparc_print_operand_punct_valid_p (unsigned char); static void sparc_print_operand (FILE *, rtx, int); static void sparc_print_operand_address (FILE *, machine_mode, rtx); @@ -878,6 +881,12 @@ char sparc_hard_reg_printed[8]; #define TARGET_ASM_OUTPUT_DWARF_DTPREL sparc_output_dwarf_dtprel #endif +#undef TARGET_OUTPUT_CFI_DIRECTIVE +#define TARGET_OUTPUT_CFI_DIRECTIVE sparc_output_cfi_directive + +#undef TARGET_DW_CFI_OPRND1_DESC +#define TARGET_DW_CFI_OPRND1_DESC sparc_dw_cfi_oprnd1_desc + #undef TARGET_ASM_FILE_END #define TARGET_ASM_FILE_END sparc_file_end @@ -911,9 +920,6 @@ char sparc_hard_reg_printed[8]; #define TARGET_MANGLE_TYPE sparc_mangle_type #endif -#undef TARGET_LRA_P -#define TARGET_LRA_P sparc_lra_p - #undef TARGET_LEGITIMATE_ADDRESS_P #define TARGET_LEGITIMATE_ADDRESS_P sparc_legitimate_address_p @@ -1947,10 +1953,6 @@ sparc_option_override (void) if (TARGET_ARCH32) target_flags &= ~MASK_STACK_BIAS; - /* Use LRA instead of reload, unless otherwise instructed. */ - if (!(target_flags_explicit & MASK_LRA)) - target_flags |= MASK_LRA; - /* Enable applicable errata workarounds for LEON3FT. */ if (sparc_fix_ut699 || sparc_fix_ut700 || sparc_fix_gr712rc) { @@ -2168,7 +2170,7 @@ sparc_option_override (void) || sparc_cpu == PROCESSOR_M8) ? 128 : (sparc_cpu == PROCESSOR_NIAGARA7 ? 256 : 512))); - + /* Disable save slot sharing for call-clobbered registers by default. 
The IRA sharing algorithm works on single registers only and this @@ -10142,7 +10144,7 @@ supersparc_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, if (insn_type == TYPE_IALU || insn_type == TYPE_SHIFT) return 0; } - + return cost; } @@ -10384,7 +10386,7 @@ sparc_branch_cost (bool speed_p, bool predictable_p) return cost; } } - + static int set_extends (rtx_insn *insn) { @@ -11006,7 +11008,7 @@ enum sparc_builtins SPARC_BUILTIN_FPCMPUR16SHL, SPARC_BUILTIN_FPCMPUR32SHL, SPARC_BUILTIN_LAST_FPCMPSHL = SPARC_BUILTIN_FPCMPUR32SHL, - + SPARC_BUILTIN_MAX }; @@ -11553,7 +11555,7 @@ sparc_vis_init_builtins (void) def_builtin_const ("__builtin_vis_fpcmpugt32", CODE_FOR_fpcmpugt32si_vis, SPARC_BUILTIN_FPCMPUGT32, di_ftype_v2si_v2si); } - + def_builtin_const ("__builtin_vis_fpmax8", CODE_FOR_maxv8qi3, SPARC_BUILTIN_FPMAX8, v8qi_ftype_v8qi_v8qi); def_builtin_const ("__builtin_vis_fpmax16", CODE_FOR_maxv4hi3, @@ -11608,7 +11610,7 @@ sparc_vis_init_builtins (void) tree di_ftype_v2si_v2si_si = build_function_type_list (intDI_type_node, v2si, v2si, intSI_type_node, 0); - + def_builtin_const ("__builtin_vis_fpcmple8shl", CODE_FOR_fpcmple8dishl, SPARC_BUILTIN_FPCMPLE8SHL, di_ftype_v8qi_v8qi_si); def_builtin_const ("__builtin_vis_fpcmpgt8shl", CODE_FOR_fpcmpgt8dishl, @@ -11678,7 +11680,7 @@ sparc_vis_init_builtins (void) tree si_ftype_v2si_v2si_si = build_function_type_list (intSI_type_node, v2si, v2si, intSI_type_node, 0); - + def_builtin_const ("__builtin_vis_fpcmple8shl", CODE_FOR_fpcmple8sishl, SPARC_BUILTIN_FPCMPLE8SHL, si_ftype_v8qi_v8qi_si); def_builtin_const ("__builtin_vis_fpcmpgt8shl", CODE_FOR_fpcmpgt8sishl, @@ -12621,6 +12623,31 @@ sparc_output_dwarf_dtprel (FILE *file, int size, rtx x) fputs (")", file); } +/* Implement TARGET_OUTPUT_CFI_DIRECTIVE. */ +static bool +sparc_output_cfi_directive (FILE *f, dw_cfi_ref cfi) +{ + if (cfi->dw_cfi_opc == DW_CFA_GNU_window_save) + { + fprintf (f, "\t.cfi_window_save\n"); + return true; + } + return false; +} + +/* Implement TARGET_DW_CFI_OPRND1_DESC. */ +static bool +sparc_dw_cfi_oprnd1_desc (dwarf_call_frame_info cfi_opc, + dw_cfi_oprnd_type &oprnd_type) +{ + if (cfi_opc == DW_CFA_GNU_window_save) + { + oprnd_type = dw_cfi_oprnd_unused; + return true; + } + return false; +} + /* Do whatever processing is required at the end of a file. */ static void @@ -12995,7 +13022,7 @@ sparc_expand_vec_perm_bmask (machine_mode vmode, rtx sel) t_1 = force_reg (SImode, GEN_INT (0x01010101)); /* sel = { A*2, A*2+1, B*2, B*2+1, ... } */ break; - + case E_V8QImode: /* input = xAxBxCxDxExFxGxH */ sel = expand_simple_binop (DImode, AND, sel, @@ -13251,14 +13278,6 @@ sparc_preferred_reload_class (rtx x, reg_class_t rclass) return rclass; } -/* Return true if we use LRA instead of reload pass. */ - -static bool -sparc_lra_p (void) -{ - return TARGET_LRA; -} - /* Output a wide multiply instruction in V8+ mode. INSN is the instruction, OPERANDS are its operands and OPCODE is the mnemonic to be used. */ diff --git a/gcc/config/sparc/sparc.h b/gcc/config/sparc/sparc.h index 8612832..deb6c1c 100644 --- a/gcc/config/sparc/sparc.h +++ b/gcc/config/sparc/sparc.h @@ -429,7 +429,7 @@ along with GCC; see the file COPYING3. If not see (MASK_FPU + MASK_HARD_QUAD + MASK_VIS + MASK_VIS2 + MASK_VIS3 \ + MASK_VIS4 + MASK_CBCOND + MASK_FMAF + MASK_FSMULD \ + MASK_POPC + MASK_SUBXC) - + /* TARGET_HARD_MUL: Use 32-bit hardware multiply instructions but not %y. 
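The two SPARC hooks added above teach the DWARF output machinery to print GAS's .cfi_window_save directive for DW_CFA_GNU_window_save and to mark that opcode's operand as unused. A stripped-down sketch of the same output-hook shape outside of GCC; the opcode value is the standard GNU extension number, everything else is a stand-in:

#include <stdio.h>
#include <stdbool.h>

#define DW_CFA_GNU_window_save 0x2d   /* GNU DWARF extension opcode */

/* Print the directive and return true, or return false so that generic
   code handles the opcode; the same contract the new target hook
   follows.  */
bool
output_window_save (FILE *f, unsigned int opcode)
{
  if (opcode == DW_CFA_GNU_window_save)
    {
      fprintf (f, "\t.cfi_window_save\n");
      return true;
    }
  return false;
}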
*/ #define TARGET_HARD_MUL \ (TARGET_SPARCLITE || TARGET_SPARCLET \ diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md index 7363079..9703a20 100644 --- a/gcc/config/sparc/sparc.md +++ b/gcc/config/sparc/sparc.md @@ -265,12 +265,8 @@ (define_attr "cpu_feature" "none,fpu,fpunotv9,v9,vis,vis3,vis4,vis4b" (const_string "none")) -(define_attr "lra" "disabled,enabled" - (const_string "enabled")) - (define_attr "enabled" "" - (cond [(eq_attr "cpu_feature" "none") - (cond [(eq_attr "lra" "disabled") (symbol_ref "!TARGET_LRA")] (const_int 1)) + (cond [(eq_attr "cpu_feature" "none") (const_int 1) (eq_attr "cpu_feature" "fpu") (symbol_ref "TARGET_FPU") (eq_attr "cpu_feature" "fpunotv9") (symbol_ref "TARGET_FPU && !TARGET_V9") (eq_attr "cpu_feature" "v9") (symbol_ref "TARGET_V9") @@ -1835,9 +1831,9 @@ (define_insn "*movdi_insn_sp32" [(set (match_operand:DI 0 "nonimmediate_operand" - "=T,o,U,T,r,o,r,r,?*f, T,?*f, o,?*e,?*e, r,?*f,?*e, T,*b,*b") + "=T,o,r,o,r,r,?*f, T,?*f, o,?*e,?*e, r,?*f,?*e, T,*b,*b") (match_operand:DI 1 "input_operand" - " J,J,T,U,o,r,i,r, T,?*f, o,?*f, *e, *e,?*f, r, T,?*e, J, P"))] + " J,J,o,r,i,r, T,?*f, o,?*f, *e, *e,?*f, r, T,?*e, J, P"))] "TARGET_ARCH32 && (register_operand (operands[0], DImode) || register_or_zero_operand (operands[1], DImode))" @@ -1846,8 +1842,6 @@ # ldd\t%1, %0 std\t%1, %0 - ldd\t%1, %0 - std\t%1, %0 # # ldd\t%1, %0 @@ -1862,13 +1856,11 @@ std\t%1, %0 fzero\t%0 fone\t%0" - [(set_attr "type" "store,*,load,store,load,store,*,*,fpload,fpstore,*,*,fpmove,*,*,*,fpload,fpstore,visl, -visl") - (set_attr "subtype" "*,*,regular,*,regular,*,*,*,*,*,*,*,*,*,*,*,*,*,double,double") - (set_attr "length" "*,2,*,*,*,*,2,2,*,*,2,2,*,2,2,2,*,*,*,*") - (set_attr "fptype" "*,*,*,*,*,*,*,*,*,*,*,*,double,*,*,*,*,*,double,double") - (set_attr "cpu_feature" "v9,*,*,*,*,*,*,*,fpu,fpu,fpu,fpu,v9,fpunotv9,vis3,vis3,fpu,fpu,vis,vis") - (set_attr "lra" "*,*,disabled,disabled,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*")]) + [(set_attr "type" "store,*,load,store,*,*,fpload,fpstore,*,*,fpmove,*,*,*,fpload,fpstore,visl,visl") + (set_attr "subtype" "*,*,regular,*,*,*,*,*,*,*,*,*,*,*,*,*,double,double") + (set_attr "length" "*,2,*,*,2,2,*,*,2,2,*,2,2,2,*,*,*,*") + (set_attr "fptype" "*,*,*,*,*,*,*,*,*,*,double,*,*,*,*,*,double,double") + (set_attr "cpu_feature" "v9,*,*,*,*,*,fpu,fpu,fpu,fpu,v9,fpunotv9,vis3,vis3,fpu,fpu,vis,vis")]) (define_insn "*movdi_insn_sp64" [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,r, m, r,*e,?*e,?*e, m,b,b") @@ -2468,9 +2460,9 @@ visl") (define_insn "*movdf_insn_sp32" [(set (match_operand:DF 0 "nonimmediate_operand" - "=T,o,b,b,e,e,*r, f, e,T,U,T, f,o, *r,*r, o") + "=T,o,b,b,e,e,*r, f, e,T, f,o, *r,*r, o") (match_operand:DF 1 "input_operand" - " G,G,G,C,e,e, f,*r,T#F,e,T,U,o#F,f,*rF, o,*r"))] + " G,G,G,C,e,e, f,*r,T#F,e,o#F,f,*rF, o,*r"))] "TARGET_ARCH32 && (register_operand (operands[0], DFmode) || register_or_zero_or_all_ones_operand (operands[1], DFmode))" @@ -2485,19 +2477,16 @@ visl") # ldd\t%1, %0 std\t%1, %0 - ldd\t%1, %0 - std\t%1, %0 # # # ldd\t%1, %0 std\t%1, %0" - [(set_attr "type" "store,*,visl,visl,fpmove,*,*,*,fpload,fpstore,load,store,*,*,*,load,store") - (set_attr "subtype" "*,*,double,double,*,*,*,*,*,*,regular,*,*,*,*,regular,*") - (set_attr "length" "*,2,*,*,*,2,2,2,*,*,*,*,2,2,2,*,*") - (set_attr "fptype" "*,*,double,double,double,*,*,*,*,*,*,*,*,*,*,*,*") - (set_attr "cpu_feature" "v9,*,vis,vis,v9,fpunotv9,vis3,vis3,fpu,fpu,*,*,fpu,fpu,*,*,*") - (set_attr "lra" "*,*,*,*,*,*,*,*,*,*,disabled,disabled,*,*,*,*,*")]) + [(set_attr 
"type" "store,*,visl,visl,fpmove,*,*,*,fpload,fpstore,*,*,*,load,store") + (set_attr "subtype" "*,*,double,double,*,*,*,*,*,*,*,*,*,regular,*") + (set_attr "length" "*,2,*,*,*,2,2,2,*,*,2,2,2,*,*") + (set_attr "fptype" "*,*,double,double,double,*,*,*,*,*,*,*,*,*,*") + (set_attr "cpu_feature" "v9,*,vis,vis,v9,fpunotv9,vis3,vis3,fpu,fpu,fpu,fpu,*,*,*")]) (define_insn "*movdf_insn_sp64" [(set (match_operand:DF 0 "nonimmediate_operand" "=b,b,e,*r, e, e,m, *r,*r, m,*r") @@ -8499,9 +8488,9 @@ visl") (define_insn "*mov<VM64:mode>_insn_sp32" [(set (match_operand:VM64 0 "nonimmediate_operand" - "=T,o,e,e,e,*r, f,e,T,U,T,f,o,*r,*r, o") + "=T,o,e,e,e,*r, f,e,T,f,o,*r,*r, o") (match_operand:VM64 1 "input_operand" - " Y,Y,Y,Z,e, f,*r,T,e,T,U,o,f,*r, o,*r"))] + " Y,Y,Y,Z,e, f,*r,T,e,o,f,*r, o,*r"))] "TARGET_VIS && TARGET_ARCH32 && (register_operand (operands[0], <VM64:MODE>mode) @@ -8516,18 +8505,15 @@ visl") # ldd\t%1, %0 std\t%1, %0 - ldd\t%1, %0 - std\t%1, %0 # # # ldd\t%1, %0 std\t%1, %0" - [(set_attr "type" "store,*,visl,visl,vismv,*,*,fpload,fpstore,load,store,*,*,*,load,store") - (set_attr "subtype" "*,*,double,double,double,*,*,*,*,regular,*,*,*,*,regular,*") - (set_attr "length" "*,2,*,*,*,2,2,*,*,*,*,2,2,2,*,*") - (set_attr "cpu_feature" "*,*,vis,vis,vis,vis3,vis3,*,*,*,*,*,*,*,*,*") - (set_attr "lra" "*,*,*,*,*,*,*,*,*,disabled,disabled,*,*,*,*,*")]) + [(set_attr "type" "store,*,visl,visl,vismv,*,*,fpload,fpstore,*,*,*,load,store") + (set_attr "subtype" "*,*,double,double,double,*,*,*,*,*,*,*,regular,*") + (set_attr "length" "*,2,*,*,*,2,2,*,*,2,2,2,*,*") + (set_attr "cpu_feature" "*,*,vis,vis,vis,vis3,vis3,*,*,*,*,*,*,*")]) (define_split [(set (match_operand:VM64 0 "register_operand" "") diff --git a/gcc/config/sparc/sparc.opt b/gcc/config/sparc/sparc.opt index afede3f..235974c 100644 --- a/gcc/config/sparc/sparc.opt +++ b/gcc/config/sparc/sparc.opt @@ -57,10 +57,6 @@ msoft-quad-float Target RejectNegative InverseMask(HARD_QUAD) Do not use hardware quad fp instructions. -mlra -Target Mask(LRA) -Enable Local Register Allocation. - mv8plus Target Mask(V8PLUS) Compile for V8+ ABI. diff --git a/gcc/config/sparc/sparc.opt.urls b/gcc/config/sparc/sparc.opt.urls index 24cc22e..2a6ffa2 100644 --- a/gcc/config/sparc/sparc.opt.urls +++ b/gcc/config/sparc/sparc.opt.urls @@ -24,9 +24,6 @@ UrlSuffix(gcc/SPARC-Options.html#index-mhard-quad-float) msoft-quad-float UrlSuffix(gcc/SPARC-Options.html#index-msoft-quad-float) -mlra -UrlSuffix(gcc/SPARC-Options.html#index-mlra-3) - mv8plus UrlSuffix(gcc/SPARC-Options.html#index-mv8plus) diff --git a/gcc/config/sparc/sysv4.h b/gcc/config/sparc/sysv4.h index 391d9b1..fb43e3f 100644 --- a/gcc/config/sparc/sysv4.h +++ b/gcc/config/sparc/sysv4.h @@ -90,7 +90,7 @@ do { ASM_OUTPUT_ALIGN ((FILE), Pmode == SImode ? 2 : 3); \ #define FINI_SECTION_ASM_OP "\t.section\t\".fini\"" /* Define the pseudo-ops used to switch to the .ctors and .dtors sections. - + Note that we want to give these sections the SHF_WRITE attribute because these sections will actually contain data (i.e. tables of addresses of functions in the current root executable or shared library @@ -103,7 +103,7 @@ do { ASM_OUTPUT_ALIGN ((FILE), Pmode == SImode ? 2 : 3); \ use the `-z text' option when building a shared library, you will get errors unless the .ctors and .dtors sections are marked as writable via the SHF_WRITE attribute.) 
*/ - + #undef CTORS_SECTION_ASM_OP #define CTORS_SECTION_ASM_OP "\t.section\t\".ctors\",#alloc,#write" #undef DTORS_SECTION_ASM_OP diff --git a/gcc/config/stormy16/stormy16-protos.h b/gcc/config/stormy16/stormy16-protos.h index 1d3a8cf..4396616 100644 --- a/gcc/config/stormy16/stormy16-protos.h +++ b/gcc/config/stormy16/stormy16-protos.h @@ -55,13 +55,13 @@ extern void xstormy16_expand_andqi3 (rtx *); extern void xstormy16_split_cbranch (machine_mode, rtx, rtx, rtx); extern int short_memory_operand (rtx, machine_mode); extern bool nonimmediate_nonstack_operand (rtx, machine_mode); -extern enum reg_class xstormy16_secondary_reload_class +extern enum reg_class xstormy16_secondary_reload_class (enum reg_class, machine_mode, rtx); extern void xstormy16_split_move (machine_mode, rtx, rtx); extern void xstormy16_expand_move (machine_mode, rtx, rtx); -extern void xstormy16_expand_arith (machine_mode, enum rtx_code, +extern void xstormy16_expand_arith (machine_mode, enum rtx_code, rtx, rtx, rtx); -extern const char * xstormy16_output_shift (machine_mode, enum rtx_code, +extern const char * xstormy16_output_shift (machine_mode, enum rtx_code, rtx, rtx, rtx); extern bool xstormy16_below100_symbol (rtx, machine_mode); extern bool xstormy16_splittable_below100_operand (rtx, machine_mode); diff --git a/gcc/config/stormy16/stormy16.cc b/gcc/config/stormy16/stormy16.cc index 1016913..d04af9a 100644 --- a/gcc/config/stormy16/stormy16.cc +++ b/gcc/config/stormy16/stormy16.cc @@ -150,7 +150,7 @@ xstormy16_rtx_costs (rtx x, machine_mode mode, *total = COSTS_N_INSNS (speed_p ? 18 + 5 : 6); else if (mode == SImode) *total = COSTS_N_INSNS (speed_p ? 3 * 18 + 14 : 17); - else + else *total = COSTS_N_INSNS (speed_p ? 18 + 3 : 4); return false; diff --git a/gcc/config/stormy16/stormy16.h b/gcc/config/stormy16/stormy16.h index 3d5e21d..dbcb897 100644 --- a/gcc/config/stormy16/stormy16.h +++ b/gcc/config/stormy16/stormy16.h @@ -292,7 +292,7 @@ enum reg_class /* This declaration must be present, but it can be an abort if profiling is not implemented. */ - + #define FUNCTION_PROFILER(FILE, LABELNO) xstormy16_function_profiler () diff --git a/gcc/config/v850/predicates.md b/gcc/config/v850/predicates.md index 3e76bdaa..751cf72 100644 --- a/gcc/config/v850/predicates.md +++ b/gcc/config/v850/predicates.md @@ -182,7 +182,7 @@ */ - for (i = 2; i < count - (TARGET_LONG_CALLS ? 2: 1); i++) + for (i = 2; i < count - (TARGET_LONG_CALLS ? 
2 : 1); i++) { rtx dest; rtx src; diff --git a/gcc/config/v850/v850-c.cc b/gcc/config/v850/v850-c.cc index bafd6d9..b808710 100644 --- a/gcc/config/v850/v850-c.cc +++ b/gcc/config/v850/v850-c.cc @@ -90,7 +90,7 @@ static void mark_current_function_as_interrupt (void) { tree name; - + if (current_function_decl == NULL_TREE) { warning (0, "cannot set interrupt attribute: no current function"); @@ -104,7 +104,7 @@ mark_current_function_as_interrupt (void) warning (0, "cannot set interrupt attribute: no such identifier"); return; } - + decl_attributes (¤t_function_decl, tree_cons (name, NULL_TREE, NULL_TREE), 0); } @@ -125,9 +125,9 @@ ghs_pragma_section (cpp_reader * pfile ATTRIBUTE_UNUSED) tree sect_ident; const char *sect, *alias; enum GHS_section_kind kind; - + type = pragma_lex (&x); - + if (type == CPP_EOF && !repeat) goto reset; else if (type == CPP_NAME) @@ -138,20 +138,20 @@ ghs_pragma_section (cpp_reader * pfile ATTRIBUTE_UNUSED) else goto bad; repeat = 0; - + if (pragma_lex (&x) != CPP_EQ) goto bad; if (pragma_lex (&x) != CPP_NAME) goto bad; - + alias = IDENTIFIER_POINTER (x); - + type = pragma_lex (&x); if (type == CPP_COMMA) repeat = 1; else if (type != CPP_EOF) warning (OPT_Wpragmas, "junk at end of %<#pragma%> ghs section"); - + if (streq (sect, "data")) kind = GHS_SECTION_KIND_DATA; else if (streq (sect, "text")) kind = GHS_SECTION_KIND_TEXT; else if (streq (sect, "rodata")) kind = GHS_SECTION_KIND_RODATA; @@ -170,7 +170,7 @@ ghs_pragma_section (cpp_reader * pfile ATTRIBUTE_UNUSED) warning (0, "unrecognized section name %qE", sect_ident); return; } - + if (streq (alias, "default")) GHS_current_section_names [kind] = NULL; else @@ -188,7 +188,7 @@ ghs_pragma_section (cpp_reader * pfile ATTRIBUTE_UNUSED) /* #pragma ghs section \n: Reset all section names back to their defaults. 
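ghs_pragma_section above lexes a comma-separated list of kind=alias identifier pairs, with the alias "default" and the bare pragma form resetting names. A usage sketch inferred from that parsing; the aliases and the placement comments are illustrative:

/* Usage sketch; only the kinds visible above ("data", "text", "rodata")
   are used, and the intended effect is that later definitions of the
   matching kind land in the renamed sections.  */
#pragma ghs section data=my_data, rodata=my_rodata
int placed_by_alias;

#pragma ghs section data=default     /* one kind back to its default */
#pragma ghs section                  /* reset every kind */
int back_to_defaults;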
*/ { int i; - + for (i = COUNT_OF_GHS_SECTION_KINDS; i--;) GHS_current_section_names [i] = NULL; } @@ -198,10 +198,10 @@ void ghs_pragma_interrupt (cpp_reader * pfile ATTRIBUTE_UNUSED) { tree x; - + if (pragma_lex (&x) != CPP_EOF) warning (OPT_Wpragmas, "junk at end of %<#pragma%> ghs interrupt"); - + mark_current_function_as_interrupt (); } @@ -209,10 +209,10 @@ void ghs_pragma_starttda (cpp_reader * pfile ATTRIBUTE_UNUSED) { tree x; - + if (pragma_lex (&x) != CPP_EOF) warning (OPT_Wpragmas, "junk at end of %<#pragma%> ghs starttda"); - + push_data_area (DATA_AREA_TDA); } @@ -220,10 +220,10 @@ void ghs_pragma_startsda (cpp_reader * pfile ATTRIBUTE_UNUSED) { tree x; - + if (pragma_lex (&x) != CPP_EOF) warning (OPT_Wpragmas, "junk at end of %<#pragma%> ghs startsda"); - + push_data_area (DATA_AREA_SDA); } @@ -231,10 +231,10 @@ void ghs_pragma_startzda (cpp_reader * pfile ATTRIBUTE_UNUSED) { tree x; - + if (pragma_lex (&x) != CPP_EOF) warning (OPT_Wpragmas, "junk at end of %<#pragma%> ghs startzda"); - + push_data_area (DATA_AREA_ZDA); } @@ -242,10 +242,10 @@ void ghs_pragma_endtda (cpp_reader * pfile ATTRIBUTE_UNUSED) { tree x; - + if (pragma_lex (&x) != CPP_EOF) warning (OPT_Wpragmas, "junk at end of %<#pragma%> ghs endtda"); - + pop_data_area (DATA_AREA_TDA); } @@ -253,10 +253,10 @@ void ghs_pragma_endsda (cpp_reader * pfile ATTRIBUTE_UNUSED) { tree x; - + if (pragma_lex (&x) != CPP_EOF) warning (OPT_Wpragmas, "junk at end of %<#pragma%> ghs endsda"); - + pop_data_area (DATA_AREA_SDA); } @@ -264,9 +264,9 @@ void ghs_pragma_endzda (cpp_reader * pfile ATTRIBUTE_UNUSED) { tree x; - + if (pragma_lex (&x) != CPP_EOF) warning (OPT_Wpragmas, "junk at end of %<#pragma%> ghs endzda"); - + pop_data_area (DATA_AREA_ZDA); } diff --git a/gcc/config/v850/v850.cc b/gcc/config/v850/v850.cc index 35fa6b5..b39343c 100644 --- a/gcc/config/v850/v850.cc +++ b/gcc/config/v850/v850.cc @@ -60,7 +60,7 @@ static void v850_print_operand_address (FILE *, machine_mode, rtx); const char * GHS_default_section_names [(int) COUNT_OF_GHS_SECTION_KINDS]; const char * GHS_current_section_names [(int) COUNT_OF_GHS_SECTION_KINDS]; -/* Track the current data area set by the data area pragma (which +/* Track the current data area set by the data area pragma (which can be nested). Tested by check_default_data_area. */ data_area_stack_element * data_area_stack = NULL; @@ -193,7 +193,7 @@ v850_arg_partial_bytes (cumulative_args_t cum_v, const function_arg_info &arg) size = arg.promoted_size_in_bytes (); if (size < 1) size = 1; - + if (!TARGET_GCC_ABI) align = UNITS_PER_WORD; else if (arg.type) @@ -449,7 +449,7 @@ v850_print_operand (FILE * file, rtx x, int code) case CONST_INT: fprintf (file, "%d", (INTVAL (x) >= 0) ? 
0 : -1); break; - + case CONST_DOUBLE: const_double_split (x, &high, &low); fprintf (file, "%ld", (long) high); @@ -465,7 +465,7 @@ v850_print_operand (FILE * file, rtx x, int code) case CONST_INT: fprintf (file, "%ld", (long) INTVAL (x)); break; - + case CONST_DOUBLE: const_double_split (x, &high, &low); fprintf (file, "%ld", (long) low); @@ -483,12 +483,12 @@ v850_print_operand (FILE * file, rtx x, int code) break; case 'O': gcc_assert (special_symbolref_operand (x, VOIDmode)); - + if (GET_CODE (x) == CONST) x = XEXP (XEXP (x, 0), 0); else gcc_assert (GET_CODE (x) == SYMBOL_REF); - + if (SYMBOL_REF_ZDA_P (x)) fprintf (file, "zdaoff"); else if (SYMBOL_REF_SDA_P (x)) @@ -504,12 +504,12 @@ v850_print_operand (FILE * file, rtx x, int code) break; case 'Q': gcc_assert (special_symbolref_operand (x, VOIDmode)); - + if (GET_CODE (x) == CONST) x = XEXP (XEXP (x, 0), 0); else gcc_assert (GET_CODE (x) == SYMBOL_REF); - + if (SYMBOL_REF_ZDA_P (x)) fprintf (file, "r0"); else if (SYMBOL_REF_SDA_P (x)) @@ -534,7 +534,7 @@ v850_print_operand (FILE * file, rtx x, int code) fprintf (file, "[r0]"); } break; - + case CONST_INT: { unsigned HOST_WIDE_INT v = INTVAL (x); @@ -542,7 +542,7 @@ v850_print_operand (FILE * file, rtx x, int code) /* Trickery to avoid problems with shifting 32-bits at a time on a 32-bit host. */ v = v >> 16; - v = v >> 16; + v = v >> 16; fprintf (file, HOST_WIDE_INT_PRINT_HEX, v); break; } @@ -622,7 +622,7 @@ v850_print_operand (FILE * file, rtx x, int code) case CONST_DOUBLE: fprintf (file, HOST_WIDE_INT_PRINT_HEX, CONST_DOUBLE_LOW (x)); break; - + case CONST_INT: case SYMBOL_REF: case CONST: @@ -756,7 +756,7 @@ v850_print_operand_punct_valid_p (unsigned char code) output_addr_const will normally barf at this, but it is OK to omit the truncate and just emit the difference of the two labels. The .hword directive will automatically handle the truncation for us. - + Returns true if rtx was handled, false otherwise. */ static bool @@ -852,7 +852,7 @@ output_move_single (rtx * operands) || GET_CODE (src) == SYMBOL_REF || GET_CODE (src) == CONST) { - if (TARGET_V850E_UP) + if (TARGET_V850E_UP) return "mov hilo(%1),%0"; else return "movhi hi(%1),%.,%0\n\tmovea lo(%1),%0,%0"; @@ -1018,7 +1018,7 @@ ep_memory_offset (machine_mode mode, int unsignedp ATTRIBUTE_UNUSED) case E_SFmode: max_offset = (1 << 8); break; - + default: break; } @@ -1472,7 +1472,7 @@ compute_register_save_size (long * p_reg_saved) registers that need to be saved. To detect this we note that the helper functions always push at least register r29 (provided that the function is not an interrupt handler). */ - + if (TARGET_PROLOG_FUNCTION && (i == 2 || ((i >= 20) && (i < 30)))) { @@ -1510,7 +1510,7 @@ compute_register_save_size (long * p_reg_saved) } } } - + if (p_reg_saved) *p_reg_saved = reg_saved; @@ -1640,7 +1640,7 @@ expand_prologue (void) emit_insn (gen_save_interrupt ()); actual_fsize -= INTERRUPT_FIXED_SAVE_SIZE; - + if (((1L << LINK_POINTER_REGNUM) & reg_saved) != 0) actual_fsize -= INTERRUPT_ALL_SAVE_SIZE; @@ -1724,7 +1724,7 @@ expand_prologue (void) rtx insn = emit_insn (save_all); INSN_CODE (insn) = code; actual_fsize -= alloc_stack; - + } else save_all = NULL_RTX; @@ -1753,13 +1753,13 @@ expand_prologue (void) init_stack_alloc = compute_register_save_size (NULL); else init_stack_alloc = actual_fsize; - + /* Save registers at the beginning of the stack frame. */ offset = init_stack_alloc - 4; - + if (init_stack_alloc) increment_stack (- (signed) init_stack_alloc, true); - + /* Save the return pointer first. 
*/ if (num_save > 0 && REGNO (save_regs[num_save-1]) == LINK_POINTER_REGNUM) { @@ -1770,7 +1770,7 @@ expand_prologue (void) save_regs[--num_save])); offset -= 4; } - + for (i = 0; i < num_save; i++) { F (emit_move_insn (gen_rtx_MEM (SImode, @@ -1865,7 +1865,7 @@ expand_epilogue (void) } code = recog (restore_all, NULL, NULL); - + if (code >= 0) { rtx insn; @@ -1967,10 +1967,10 @@ v850_get_data_area (tree decl) { if (lookup_attribute ("sda", DECL_ATTRIBUTES (decl)) != NULL_TREE) return DATA_AREA_SDA; - + if (lookup_attribute ("tda", DECL_ATTRIBUTES (decl)) != NULL_TREE) return DATA_AREA_TDA; - + if (lookup_attribute ("zda", DECL_ATTRIBUTES (decl)) != NULL_TREE) return DATA_AREA_ZDA; @@ -1983,7 +1983,7 @@ static void v850_set_data_area (tree decl, v850_data_area data_area) { tree name; - + switch (data_area) { case DATA_AREA_SDA: name = get_identifier ("sda"); break; @@ -2036,7 +2036,7 @@ v850_handle_data_area_attribute (tree *node, tree name, data_area = DATA_AREA_ZDA; else gcc_unreachable (); - + switch (TREE_CODE (decl)) { case VAR_DECL: @@ -2059,7 +2059,7 @@ v850_handle_data_area_attribute (tree *node, tree name, *no_add_attrs = true; } break; - + default: break; } @@ -2113,7 +2113,7 @@ v850_encode_data_area (tree decl, rtx symbol) if (DECL_SECTION_NAME (decl)) { const char *name = DECL_SECTION_NAME (decl); - + if (streq (name, ".zdata") || streq (name, ".zbss")) v850_set_data_area (decl, DATA_AREA_ZDA); @@ -2140,7 +2140,7 @@ v850_encode_data_area (tree decl, rtx symbol) else if (size <= small_memory_max [(int) SMALL_MEMORY_ZDA]) v850_set_data_area (decl, DATA_AREA_ZDA); } - + if (v850_get_data_area (decl) == DATA_AREA_NORMAL) return; } @@ -2182,7 +2182,7 @@ construct_restore_jr (rtx op) unsigned long int last; int i; static char buff [256]; /* XXX */ - + if (count <= 2) { error ("bogus JR construction: %d", count); @@ -2194,7 +2194,7 @@ construct_restore_jr (rtx op) gcc_assert (GET_CODE (XVECEXP (op, 0, 1)) == SET); gcc_assert (GET_CODE (SET_SRC (XVECEXP (op, 0, 1))) == PLUS); gcc_assert (GET_CODE (XEXP (SET_SRC (XVECEXP (op, 0, 1)), 1)) == CONST_INT); - + stack_bytes = INTVAL (XEXP (SET_SRC (XVECEXP (op, 0, 1)), 1)); /* Each pop will remove 4 bytes from the stack.... */ @@ -2212,12 +2212,12 @@ construct_restore_jr (rtx op) for (i = 2; i < count; i++) { rtx vector_element = XVECEXP (op, 0, i); - + gcc_assert (GET_CODE (vector_element) == SET); gcc_assert (GET_CODE (SET_DEST (vector_element)) == REG); gcc_assert (register_is_ok_for_epilogue (SET_DEST (vector_element), SImode)); - + mask |= 1 << REGNO (SET_DEST (vector_element)); } @@ -2239,7 +2239,7 @@ construct_restore_jr (rtx op) { gcc_assert (!stack_bytes); gcc_assert (mask & (1 << 29)); - + last = 29; } @@ -2247,16 +2247,16 @@ construct_restore_jr (rtx op) We ignore this here, and generate a JR anyway. We will be popping more registers than is strictly necessary, but it does save code space. */ - + if (TARGET_LONG_CALLS) { char name[40]; - + if (first == last) sprintf (name, "__return_%s", reg_names [first]); else sprintf (name, "__return_%s_%s", reg_names [first], reg_names [last]); - + sprintf (buff, "movhi hi(%s), r0, r6\n\tmovea lo(%s), r6, r6\n\tjmp r6", name, name); } @@ -2267,7 +2267,7 @@ construct_restore_jr (rtx op) else sprintf (buff, "jr __return_%s_%s", reg_names [first], reg_names [last]); } - + return buff; } @@ -2287,8 +2287,8 @@ construct_save_jarl (rtx op) unsigned long int last; int i; static char buff [255]; /* XXX */ - - if (count <= (TARGET_LONG_CALLS ? 3 : 2)) + + if (count <= (TARGET_LONG_CALLS ? 
3 : 2)) { error ("bogus JARL construction: %d", count); return NULL; @@ -2299,7 +2299,7 @@ construct_save_jarl (rtx op) gcc_assert (GET_CODE (SET_SRC (XVECEXP (op, 0, 0))) == PLUS); gcc_assert (GET_CODE (XEXP (SET_SRC (XVECEXP (op, 0, 0)), 0)) == REG); gcc_assert (GET_CODE (XEXP (SET_SRC (XVECEXP (op, 0, 0)), 1)) == CONST_INT); - + /* Work out how many bytes to push onto the stack after storing the registers. */ stack_bytes = INTVAL (XEXP (SET_SRC (XVECEXP (op, 0, 0)), 1)); @@ -2319,16 +2319,16 @@ construct_save_jarl (rtx op) for (i = 1; i < count - (TARGET_LONG_CALLS ? 3 : 2); i++) { rtx vector_element = XVECEXP (op, 0, i); - + gcc_assert (GET_CODE (vector_element) == SET); gcc_assert (GET_CODE (SET_SRC (vector_element)) == REG); gcc_assert (register_is_ok_for_epilogue (SET_SRC (vector_element), SImode)); - + mask |= 1 << REGNO (SET_SRC (vector_element)); } - /* Scan for the first register to push. */ + /* Scan for the first register to push. */ for (first = 0; first < 32; first++) { if (mask & (1 << first)) @@ -2346,7 +2346,7 @@ construct_save_jarl (rtx op) { gcc_assert (!stack_bytes); gcc_assert (mask & (1 << 29)); - + last = 29; } @@ -2354,16 +2354,16 @@ construct_save_jarl (rtx op) We ignore this here, and generate a JARL anyway. We will be pushing more registers than is strictly necessary, but it does save code space. */ - + if (TARGET_LONG_CALLS) { char name[40]; - + if (first == last) sprintf (name, "__save_%s", reg_names [first]); else sprintf (name, "__save_%s_%s", reg_names [first], reg_names [last]); - + if (TARGET_V850E3V5_UP) sprintf (buff, "mov hilo(%s), r11\n\tjarl [r11], r10", name); else @@ -2404,12 +2404,12 @@ v850_output_aligned_bss (FILE * file, case DATA_AREA_TDA: switch_to_section (tdata_section); break; - + default: switch_to_section (bss_section); break; } - + ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT)); #ifdef ASM_DECLARE_OBJECT_NAME last_assemble_variable_decl = decl; @@ -2448,13 +2448,13 @@ v850_output_common (FILE * file, case DATA_AREA_TDA: fprintf (file, "%s", TCOMMON_ASM_OP); break; - + default: fprintf (file, "%s", COMMON_ASM_OP); break; } } - + assemble_name (file, name); fprintf (file, ",%u,%u\n", size, align / BITS_PER_UNIT); } @@ -2470,7 +2470,7 @@ v850_output_local (FILE * file, fprintf (file, "%s", LOCAL_ASM_OP); assemble_name (file, name); fprintf (file, "\n"); - + ASM_OUTPUT_ALIGNED_DECL_COMMON (file, decl, name, size, align); } @@ -2488,7 +2488,7 @@ v850_insert_attributes (tree decl, tree * attr_ptr ATTRIBUTE_UNUSED ) /* Initialize the default names of the v850 specific sections, if this has not been done before. */ - + if (GHS_default_section_names [(int) GHS_SECTION_KIND_SDATA] == NULL) { GHS_default_section_names [(int) GHS_SECTION_KIND_SDATA] @@ -2499,14 +2499,14 @@ v850_insert_attributes (tree decl, tree * attr_ptr ATTRIBUTE_UNUSED ) GHS_default_section_names [(int) GHS_SECTION_KIND_TDATA] = ".tdata"; - + GHS_default_section_names [(int) GHS_SECTION_KIND_ZDATA] = ".zdata"; GHS_default_section_names [(int) GHS_SECTION_KIND_ROZDATA] = ".rozdata"; } - + if (current_function_decl == NULL_TREE && (VAR_P (decl) || TREE_CODE (decl) == CONST_DECL @@ -2526,23 +2526,23 @@ v850_insert_attributes (tree decl, tree * attr_ptr ATTRIBUTE_UNUSED ) { default: gcc_unreachable (); - + case DATA_AREA_SDA: kind = ((TREE_READONLY (decl)) ? GHS_SECTION_KIND_ROSDATA : GHS_SECTION_KIND_SDATA); break; - + case DATA_AREA_TDA: kind = GHS_SECTION_KIND_TDATA; break; - + case DATA_AREA_ZDA: kind = ((TREE_READONLY (decl)) ? 
GHS_SECTION_KIND_ROZDATA : GHS_SECTION_KIND_ZDATA); break; - + case DATA_AREA_NORMAL: /* default data area */ if (TREE_READONLY (decl)) kind = GHS_SECTION_KIND_RODATA; @@ -2585,7 +2585,7 @@ construct_dispose_instruction (rtx op) int i; static char buff[ 120 ]; /* XXX */ int use_callt = 0; - + if (count <= 2) { error ("bogus DISPOSE construction: %d", count); @@ -2597,7 +2597,7 @@ construct_dispose_instruction (rtx op) gcc_assert (GET_CODE (XVECEXP (op, 0, 1)) == SET); gcc_assert (GET_CODE (SET_SRC (XVECEXP (op, 0, 1))) == PLUS); gcc_assert (GET_CODE (XEXP (SET_SRC (XVECEXP (op, 0, 1)), 1)) == CONST_INT); - + stack_bytes = INTVAL (XEXP (SET_SRC (XVECEXP (op, 0, 1)), 1)); /* Each pop will remove 4 bytes from the stack.... */ @@ -2617,7 +2617,7 @@ construct_dispose_instruction (rtx op) for (i = 2; i < count; i++) { rtx vector_element = XVECEXP (op, 0, i); - + gcc_assert (GET_CODE (vector_element) == SET); gcc_assert (GET_CODE (SET_DEST (vector_element)) == REG); gcc_assert (register_is_ok_for_epilogue (SET_DEST (vector_element), @@ -2642,7 +2642,7 @@ construct_dispose_instruction (rtx op) for (i = 20; i < 32; i++) if (mask & (1 << i)) break; - + if (i == 31) sprintf (buff, "callt ctoff(__callt_return_r31c)"); else @@ -2654,31 +2654,31 @@ construct_dispose_instruction (rtx op) { static char regs [100]; /* XXX */ int done_one; - + /* Generate the DISPOSE instruction. Note we could just issue the bit mask as a number as the assembler can cope with this, but for the sake of our readers we turn it into a textual description. */ regs[0] = 0; done_one = 0; - + for (i = 20; i < 32; i++) { if (mask & (1 << i)) { int first; - + if (done_one) strcat (regs, ", "); else done_one = 1; - + first = i; strcat (regs, reg_names[ first ]); - + for (i++; i < 32; i++) if ((mask & (1 << i)) == 0) break; - + if (i > first + 1) { strcat (regs, " - "); @@ -2686,10 +2686,10 @@ construct_dispose_instruction (rtx op) } } } - + sprintf (buff, "dispose %d {%s}, r31", stack_bytes / 4, regs); } - + return buff; } @@ -2706,7 +2706,7 @@ construct_prepare_instruction (rtx op) int i; static char buff[ 120 ]; /* XXX */ int use_callt = 0; - + if (XVECLEN (op, 0) <= 1) { error ("bogus PREPEARE construction: %d", XVECLEN (op, 0)); @@ -2718,7 +2718,7 @@ construct_prepare_instruction (rtx op) gcc_assert (GET_CODE (XVECEXP (op, 0, 0)) == SET); gcc_assert (GET_CODE (SET_SRC (XVECEXP (op, 0, 0))) == PLUS); gcc_assert (GET_CODE (XEXP (SET_SRC (XVECEXP (op, 0, 0)), 1)) == CONST_INT); - + stack_bytes = INTVAL (XEXP (SET_SRC (XVECEXP (op, 0, 0)), 1)); @@ -2736,10 +2736,10 @@ construct_prepare_instruction (rtx op) for (i = 1; i < XVECLEN (op, 0); i++) { rtx vector_element = XVECEXP (op, 0, i); - + if (GET_CODE (vector_element) == CLOBBER) continue; - + gcc_assert (GET_CODE (vector_element) == SET); gcc_assert (GET_CODE (SET_SRC (vector_element)) == REG); gcc_assert (register_is_ok_for_epilogue (SET_SRC (vector_element), @@ -2762,7 +2762,7 @@ construct_prepare_instruction (rtx op) sprintf (buff, "callt ctoff(__callt_save_r2_r%d)", (mask & (1 << 31)) ? 31 : 29 ); return buff; } - + for (i = 20; i < 32; i++) if (mask & (1 << i)) break; @@ -2778,31 +2778,31 @@ construct_prepare_instruction (rtx op) static char regs [100]; /* XXX */ int done_one; - + /* Generate the PREPARE instruction. Note we could just issue the bit mask as a number as the assembler can cope with this, but for - the sake of our readers we turn it into a textual description. */ + the sake of our readers we turn it into a textual description. 
*/ regs[0] = 0; done_one = 0; - + for (i = 20; i < 32; i++) { if (mask & (1 << i)) { int first; - + if (done_one) strcat (regs, ", "); else done_one = 1; - + first = i; strcat (regs, reg_names[ first ]); - + for (i++; i < 32; i++) if ((mask & (1 << i)) == 0) break; - + if (i > first + 1) { strcat (regs, " - "); @@ -2810,10 +2810,10 @@ construct_prepare_instruction (rtx op) } } } - + sprintf (buff, "prepare {%s}, %d", regs, (- stack_bytes) / 4); } - + return buff; } @@ -2914,7 +2914,7 @@ v850_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) /* Worker function for TARGET_FUNCTION_VALUE. */ static rtx -v850_function_value (const_tree valtype, +v850_function_value (const_tree valtype, const_tree fn_decl_or_type ATTRIBUTE_UNUSED, bool outgoing ATTRIBUTE_UNUSED) { @@ -3064,7 +3064,7 @@ v850_legitimate_address_p (machine_mode mode, rtx x, bool strict_p, + (GET_MODE_NUNITS (mode) * UNITS_PER_WORD)))) return true; - return false; + return false; } static int @@ -3176,7 +3176,7 @@ v850_gen_movdi (rtx * operands) if (REGNO (operands[1]) & 1) /* Use two store word instructions to synthesise a store double. */ return "st.w %1, %0 ; st.w %R1, %R0 "; - + return "st.dw %1, %0"; } diff --git a/gcc/config/v850/v850.h b/gcc/config/v850/v850.h index 2b31dd1..2721ea0 100644 --- a/gcc/config/v850/v850.h +++ b/gcc/config/v850/v850.h @@ -63,7 +63,7 @@ #if TARGET_CPU_DEFAULT == TARGET_CPU_v850e1 #undef MASK_DEFAULT -#define MASK_DEFAULT MASK_V850E /* No practical difference. */ +#define MASK_DEFAULT MASK_V850E /* No practical difference. */ #undef SUBTARGET_ASM_SPEC #define SUBTARGET_ASM_SPEC "%{!mv*:-mv850e1}" #undef SUBTARGET_CPP_SPEC @@ -72,7 +72,7 @@ #if TARGET_CPU_DEFAULT == TARGET_CPU_v850e2 #undef MASK_DEFAULT -#define MASK_DEFAULT MASK_V850E2 +#define MASK_DEFAULT MASK_V850E2 #undef SUBTARGET_ASM_SPEC #define SUBTARGET_ASM_SPEC "%{!mv*:-mv850e2}" #undef SUBTARGET_CPP_SPEC @@ -99,7 +99,7 @@ #define TARGET_VERSION fprintf (stderr, " (Renesas V850E3V5)"); #endif -#define TARGET_V850E3V5_UP ((TARGET_V850E3V5)) +#define TARGET_V850E3V5_UP ((TARGET_V850E3V5)) #define TARGET_V850E2V3_UP ((TARGET_V850E2V3) || TARGET_V850E3V5_UP) #define TARGET_V850E2_UP ((TARGET_V850E2) || TARGET_V850E2V3_UP) #define TARGET_V850E_UP ((TARGET_V850E) || TARGET_V850E2_UP) @@ -127,7 +127,7 @@ #define EXTRA_SPECS \ { "subtarget_asm_spec", SUBTARGET_ASM_SPEC }, \ - { "subtarget_cpp_spec", SUBTARGET_CPP_SPEC } + { "subtarget_cpp_spec", SUBTARGET_CPP_SPEC } /* Macro to decide when FPU instructions can be used. */ @@ -308,7 +308,7 @@ For any two classes, it is very desirable that there be another class that represents their union. */ - + enum reg_class { NO_REGS, EVEN_REGS, GENERAL_REGS, ALL_REGS, LIM_REG_CLASSES @@ -353,7 +353,7 @@ enum reg_class Since they use reg_renumber, they are safe only once reg_renumber has been allocated, which happens in reginfo.cc during register allocation. */ - + #define REGNO_OK_FOR_BASE_P(regno) \ (((regno) < FIRST_PSEUDO_REGISTER \ && (regno) != CC_REGNUM \ @@ -412,7 +412,7 @@ enum reg_class /* Register containing return address from latest function call. */ #define LINK_POINTER_REGNUM LP_REGNUM - + /* On some machines the offset between the frame pointer and starting offset of the automatic variables is not known until after register allocation has been done (for example, because the saved registers @@ -432,7 +432,7 @@ enum reg_class Do not define this macro if it would be the same as `FRAME_POINTER_REGNUM'. 
*/ -#undef HARD_FRAME_POINTER_REGNUM +#undef HARD_FRAME_POINTER_REGNUM #define HARD_FRAME_POINTER_REGNUM 29 /* Base register for access to arguments of the function. */ @@ -578,7 +578,7 @@ struct cum_arg { int nbytes; }; #define NO_FUNCTION_CSE 1 /* The four different data regions on the v850. */ -typedef enum +typedef enum { DATA_AREA_NORMAL, DATA_AREA_SDA, @@ -617,7 +617,7 @@ typedef enum #define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ asm_output_aligned_bss ((FILE), (DECL), (NAME), (SIZE), (ALIGN)) -#undef ASM_OUTPUT_ALIGNED_BSS +#undef ASM_OUTPUT_ALIGNED_BSS #define ASM_OUTPUT_ALIGNED_BSS(FILE, DECL, NAME, SIZE, ALIGN) \ v850_output_aligned_bss (FILE, DECL, NAME, SIZE, ALIGN) @@ -634,7 +634,7 @@ typedef enum #undef ASM_OUTPUT_LOCAL #define ASM_OUTPUT_ALIGNED_DECL_LOCAL(FILE, DECL, NAME, SIZE, ALIGN) \ v850_output_local (FILE, DECL, NAME, SIZE, ALIGN) - + /* Globalizing directive for a label. */ #define GLOBAL_ASM_OP "\t.global " @@ -767,26 +767,26 @@ typedef enum can appear in the "ghs section" pragma. These names are used to index into the GHS_default_section_names[] and GHS_current_section_names[] that are defined in v850.cc, and so the ordering of each must remain - consistent. + consistent. - These arrays give the default and current names for each kind of + These arrays give the default and current names for each kind of section defined by the GHS pragmas. The current names can be changed - by the "ghs section" pragma. If the current names are null, use + by the "ghs section" pragma. If the current names are null, use the default names. Note that the two arrays have different types. For the *normal* section kinds (like .data, .text, etc.) we do not want to explicitly force the name of these sections, but would rather - let the linker (or at least the back end) choose the name of the + let the linker (or at least the back end) choose the name of the section, UNLESS the user has forced a specific name for these section kinds. To accomplish this set the name in ghs_default_section_names to null. */ enum GHS_section_kind -{ +{ GHS_SECTION_KIND_DEFAULT, GHS_SECTION_KIND_TEXT, - GHS_SECTION_KIND_DATA, + GHS_SECTION_KIND_DATA, GHS_SECTION_KIND_RODATA, GHS_SECTION_KIND_BSS, GHS_SECTION_KIND_SDATA, diff --git a/gcc/config/vax/vax.opt.urls b/gcc/config/vax/vax.opt.urls index 10bee25..7813b88 100644 --- a/gcc/config/vax/vax.opt.urls +++ b/gcc/config/vax/vax.opt.urls @@ -19,5 +19,5 @@ munix UrlSuffix(gcc/VAX-Options.html#index-munix) mlra -UrlSuffix(gcc/VAX-Options.html#index-mlra-4) +UrlSuffix(gcc/VAX-Options.html#index-mlra-3) diff --git a/gcc/config/visium/visium.h b/gcc/config/visium/visium.h index afeb68f..c5cce1d 100644 --- a/gcc/config/visium/visium.h +++ b/gcc/config/visium/visium.h @@ -119,7 +119,7 @@ data area approach is no longer used, these pointers are no longer supported. - The macro and function pointers are described below. + The macro and function pointers are described below. INIT_EXPANDERS: @@ -1015,7 +1015,7 @@ struct visium_args A difficulty is setting the correct instruction parity at run time. - TRAMPOLINE_SIZE + TRAMPOLINE_SIZE A C expression for the size in bytes of the trampoline, as an integer. */ #define TRAMPOLINE_SIZE (visium_cpu == PROCESSOR_GR6 ? 24 : 20) diff --git a/gcc/config/vms/vms-c.cc b/gcc/config/vms/vms-c.cc index d0620b4..0e9fa2f 100644 --- a/gcc/config/vms/vms-c.cc +++ b/gcc/config/vms/vms-c.cc @@ -20,6 +20,7 @@ along with GCC; see the file COPYING3. 
If not see #define IN_TARGET_CODE 1 +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/vx-common.h b/gcc/config/vx-common.h index d727e85..98636e2 100644 --- a/gcc/config/vx-common.h +++ b/gcc/config/vx-common.h @@ -1,4 +1,4 @@ -/* Target-independent configuration for VxWorks and VxWorks AE. +/* Target-independent configuration for VxWorks and VxWorks AE. Copyright (C) 2005-2024 Free Software Foundation, Inc. Contributed by CodeSourcery, LLC. diff --git a/gcc/config/vxworks.cc b/gcc/config/vxworks.cc index fab92d3..80577e2 100644 --- a/gcc/config/vxworks.cc +++ b/gcc/config/vxworks.cc @@ -72,9 +72,9 @@ static tree vxworks_emutls_var_fields (tree type, tree *name) { tree field, next_field; - + *name = get_identifier ("__tls_var"); - + field = build_decl (BUILTINS_LOCATION, FIELD_DECL, get_identifier ("size"), unsigned_type_node); DECL_CONTEXT (field) = type; @@ -106,23 +106,23 @@ vxworks_emutls_var_init (tree var, tree decl, tree tmpl_addr) { vec<constructor_elt, va_gc> *v; vec_alloc (v, 3); - + tree type = TREE_TYPE (var); tree field = TYPE_FIELDS (type); - + constructor_elt elt = {field, fold_convert (TREE_TYPE (field), tmpl_addr)}; v->quick_push (elt); - + field = DECL_CHAIN (field); elt.index = field; elt.value = build_int_cst (TREE_TYPE (field), 0); v->quick_push (elt); - + field = DECL_CHAIN (field); elt.index = field; elt.value = fold_convert (TREE_TYPE (field), DECL_SIZE_UNIT (decl)); v->quick_push (elt); - + return build_constructor (type, v); } @@ -155,7 +155,7 @@ vxworks_override_options (void) the toolchain user is expected to provide whatever linker level glue is required to get things to operate properly. */ - targetm.have_ctors_dtors = + targetm.have_ctors_dtors = TARGET_VXWORKS_HAVE_CTORS_DTORS || HAVE_INITFINI_ARRAY_SUPPORT; /* PIC is only supported for RTPs. flags_pic might be < 0 here, in diff --git a/gcc/config/vxworksae.h b/gcc/config/vxworksae.h index b95f22d..2114928 100644 --- a/gcc/config/vxworksae.h +++ b/gcc/config/vxworksae.h @@ -45,7 +45,7 @@ along with GCC; see the file COPYING3. If not see #undef VXWORKS_LINK_SPEC #define VXWORKS_LINK_SPEC \ "-r %{v:-V}" - + #undef VXWORKS_LIBGCC_SPEC #define VXWORKS_LIBGCC_SPEC \ "-lgcc" diff --git a/gcc/config/xtensa/constraints.md b/gcc/config/xtensa/constraints.md index d855fb8..c96959e 100644 --- a/gcc/config/xtensa/constraints.md +++ b/gcc/config/xtensa/constraints.md @@ -32,7 +32,7 @@ General-purpose AR registers for indirect sibling calls, @code{a2}- @code{a8}.") -(define_register_constraint "d" "TARGET_DENSITY ? AR_REGS: NO_REGS" +(define_register_constraint "d" "TARGET_DENSITY ? AR_REGS : NO_REGS" "@internal All AR registers, including sp, but only if the Xtensa Code Density Option is configured.") @@ -53,7 +53,7 @@ General-purpose AR registers, but only if the Xtensa Sign Extend Option is configured.") -(define_register_constraint "C" "TARGET_MUL16 ? GR_REGS: NO_REGS" +(define_register_constraint "C" "TARGET_MUL16 ? GR_REGS : NO_REGS" "@internal General-purpose AR registers, but only if the Xtensa 16-Bit Integer Multiply Option is configured.") @@ -63,7 +63,7 @@ General-purpose AR registers, but only if the Xtensa Code Density Option is configured.") -(define_register_constraint "W" "TARGET_CONST16 ? GR_REGS: NO_REGS" +(define_register_constraint "W" "TARGET_CONST16 ? 
GR_REGS : NO_REGS" "@internal General-purpose AR registers, but only if the Xtensa Const16 Option is configured.") diff --git a/gcc/config/xtensa/xtensa-dynconfig.cc b/gcc/config/xtensa/xtensa-dynconfig.cc index 3bd2760..6ddc02a 100644 --- a/gcc/config/xtensa/xtensa-dynconfig.cc +++ b/gcc/config/xtensa/xtensa-dynconfig.cc @@ -17,6 +17,7 @@ along with GCC; see the file COPYING3. If not see <http://www.gnu.org/licenses/>. */ +#define INCLUDE_MEMORY #include "config.h" #include "system.h" #include "coretypes.h" diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc index 43b1332..d279382 100644 --- a/gcc/config/xtensa/xtensa.cc +++ b/gcc/config/xtensa/xtensa.cc @@ -363,6 +363,9 @@ static rtx xtensa_delegitimize_address (rtx); #undef TARGET_MAX_ANCHOR_OFFSET #define TARGET_MAX_ANCHOR_OFFSET 1020 +#undef TARGET_DIFFERENT_ADDR_DISPLACEMENT_P +#define TARGET_DIFFERENT_ADDR_DISPLACEMENT_P hook_bool_void_true + struct gcc_target targetm = TARGET_INITIALIZER; diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md index f19e1fd..2c08c7d6 100644 --- a/gcc/config/xtensa/xtensa.md +++ b/gcc/config/xtensa/xtensa.md @@ -1279,15 +1279,13 @@ }) (define_insn "movsi_internal" - [(set (match_operand:SI 0 "nonimmed_operand" "=D,D,D,a,U,D,R,R,a,q,a,a,W,a,*a,*A") - (match_operand:SI 1 "move_operand" "M,D,d,U,r,R,D,d,r,r,I,Y,i,T,*A,*r"))] + [(set (match_operand:SI 0 "nonimmed_operand" "=D,D,D,D,R,R,a,q,a,a,W,a,a,U,*a,*A") + (match_operand:SI 1 "move_operand" "M,D,d,R,D,d,r,r,I,Y,i,T,U,r,*A,*r"))] "xtensa_valid_move (SImode, operands)" "@ movi.n\t%0, %x1 mov.n\t%0, %1 mov.n\t%0, %1 - %v1l32i\t%0, %1 - %v0s32i\t%1, %0 %v1l32i.n\t%0, %1 %v0s32i.n\t%1, %0 %v0s32i.n\t%1, %0 @@ -1297,11 +1295,13 @@ movi\t%0, %1 const16\t%0, %t1\;const16\t%0, %b1 %v1l32r\t%0, %1 + %v1l32i\t%0, %1 + %v0s32i\t%1, %0 rsr\t%0, ACCLO wsr\t%1, ACCLO" - [(set_attr "type" "move,move,move,load,store,load,store,store,move,move,move,move,move,load,rsr,wsr") + [(set_attr "type" "move,move,move,load,store,store,move,move,move,move,move,load,load,store,rsr,wsr") (set_attr "mode" "SI") - (set_attr "length" "2,2,2,3,3,2,2,2,3,3,3,3,6,3,3,3")]) + (set_attr "length" "2,2,2,2,2,2,3,3,3,3,6,3,3,3,3,3")]) (define_split [(set (match_operand:SHI 0 "register_operand") |
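Not part of the patch above, but as context for the movsi_internal reordering and the new TARGET_DIFFERENT_ADDR_DISPLACEMENT_P definition: the narrow Code Density loads/stores (l32i.n/s32i.n) encode in 2 bytes but only accept word-aligned displacements 0..60, while the plain l32i/s32i forms take 3 bytes and reach displacements up to 1020. Placing the narrow alternatives first appears intended to prefer the shorter encodings when the displacement allows it, and returning true from the hook presumably tells LRA that the maximal legitimate displacement can differ between alternatives of the same address form. A minimal illustration follows; the structure and function names are hypothetical and the expected code generation is a sketch assuming an xtensa toolchain configured with the Code Density option and optimization enabled.

/* Hypothetical example: the small offset (12) is within the 0..60 range
   of the 2-byte l32i.n, while the large offset (800) exceeds it and is
   expected to use the 3-byte l32i, whose range is 0..1020.  */
struct small_rec { int a[4]; };
struct big_rec   { int pad[200]; int x; };

int load_small (struct small_rec *p) { return p->a[3]; /* offset 12  */ }
int load_big   (struct big_rec *p)   { return p->x;    /* offset 800 */ }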