author     Martin Liska <mliska@suse.cz>    2021-08-06 12:38:10 +0200
committer  Martin Liska <mliska@suse.cz>    2021-08-06 12:38:10 +0200
commit     01c909e1a5fca4988431a328454e9d8c0eea9ef6
tree       0cb8cd9cc4879749dc330119198ebe0d109767e5 /gcc
parent     f182597d273fe81ffae6dfece17fecadd01842f7
parent     3c94db20be9af3cb0376292e2d4672b515558231
Merge branch 'master' into devel/sphinx
Diffstat (limited to 'gcc')
29 files changed, 1822 insertions, 457 deletions
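Two patterns in the diff below are worth a note before the raw hunks. First, the H.J. Lu entry for PR target/99744 fixes x86gprintrin.h by bracketing the header with target pragmas so that everything in it is compiled using general-purpose registers only. The following is a minimal sketch of that wrapper pattern, not code from the patch itself; __gpr_only_parity is a hypothetical stand-in for the header's real intrinsics:

#pragma GCC push_options
#pragma GCC target ("general-regs-only")

/* Hypothetical stand-in for an x86gprintrin.h intrinsic.  The pragmas
   above disable all non-GPR ISAs (x87, MMX, SSE, AVX) for this
   definition, so calling it never drags FP or vector state into a
   GPR-only context.  */
extern __inline unsigned int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
__gpr_only_parity (unsigned int __x)
{
  return __builtin_parity (__x);
}

#pragma GCC pop_options

Second, the arm_neon.h hunks replace the macro-generated vst[234][q]_lane_* definitions with explicit inline functions that marshal the argument structure into the opaque __builtin_aarch64_simd_oi/ci/xi register-tuple types with a single __builtin_memcpy instead of per-lane __builtin_aarch64_set_qreg* calls; a fixed-size memcpy between layout-compatible objects is the well-defined way to re-type the bits, and GCC folds it into plain register moves.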
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d888dc5..6b7a77d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,99 @@ +2021-08-05 H.J. Lu <hjl.tools@gmail.com> + + PR target/99744 + * config/i386/i386.c (ix86_can_inline_p): Ignore MASK_80387 if + callee only uses GPRs. + * config/i386/ia32intrin.h: Revert commit 5463cee2770. + * config/i386/serializeintrin.h: Revert commit 71958f740f1. + * config/i386/x86gprintrin.h: Add + #pragma GCC target("general-regs-only") and #pragma GCC pop_options + to disable non-GPR ISAs. + +2021-08-05 Richard Sandiford <richard.sandiford@arm.com> + + PR middle-end/101787 + * doc/md.texi (cond_ashl, cond_ashr, cond_lshr): Document. + +2021-08-05 Richard Sandiford <richard.sandiford@arm.com> + + * tree-vectorizer.h (vect_is_store_elt_extraction, vect_is_reduction) + (vect_reduc_type, vect_embedded_comparison_type, vect_comparison_type) + (vect_is_extending_load, vect_is_integer_truncation): New functions, + moved from aarch64.c but given different names. + * config/aarch64/aarch64.c (aarch64_is_store_elt_extraction) + (aarch64_is_reduction, aarch64_reduc_type) + (aarch64_embedded_comparison_type, aarch64_comparison_type) + (aarch64_extending_load_p, aarch64_integer_truncation_p): Delete + in favor of the above. Update callers accordingly. + +2021-08-05 Richard Earnshaw <rearnsha@arm.com> + + PR target/101723 + * config/arm/arm-cpus.in (generic-armv7-a): Add quirk to suppress + writing .cpu directive in asm output. + * config/arm/arm.c (arm_identify_fpu_from_isa): New variable. + (arm_last_printed_arch_string): Delete. + (arm_last_printed_fpu_string): Delete. + (arm_configure_build_target): If use of floating-point/SIMD is + disabled, remove all fp/simd related features from the target ISA. + (last_arm_targ_options): New variable. + (arm_print_asm_arch_directives): Add new parameters. Change order + of emitted directives and handle all cases here. + (arm_file_start): Always call arm_print_asm_arch_directives, move + all generation of .arch/.arch_extension here. + (arm_file_end): Call arm_print_asm_arch. + (arm_declare_function_name): Call arm_print_asm_arch_directives + instead of printing .arch/.fpu directives directly. + +2021-08-05 Richard Earnshaw <rearnsha@arm.com> + + * config/arm/arm.c (arm_configure_build_target): Don't call + arm_option_reconfigure_globals. + (arm_option_restore): Call arm_option_reconfigure_globals after + reconfiguring the target. + * config/arm/arm-c.c (arm_pragma_target_parse): Likewise. + +2021-08-05 Richard Earnshaw <rearnsha@arm.com> + + * config/arm/arm.c (arm_configure_build_target): Ensure the target's + arch_name is always set. + +2021-08-05 Jonathan Wright <jonathan.wright@arm.com> + + * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost + of vec_select high-half from being added into Neon subtract + cost. + +2021-08-05 Jonathan Wright <jonathan.wright@arm.com> + + * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost + of vec_select high-half from being added into Neon add cost. + +2021-08-05 Kewen Lin <linkw@linux.ibm.com> + + * cfgloop.h (loops_list::loops_list): Add one optional argument + root and adjust accordingly, update loop tree walking and factor + out to ... + * cfgloop.c (loops_list::walk_loop_tree): ... this. New function. + +2021-08-05 Eric Botcazou <ebotcazou@gcc.gnu.org> + + PR tree-optimization/101626 + * tree-sra.c (propagate_subaccesses_from_rhs): Do not set the + reverse scalar storage order on a pointer or vector component. 
+ +2021-08-05 liuhongt <hongtao.liu@intel.com> + + * config/i386/sse.md (cond_<code><mode>): New expander. + +2021-08-05 liuhongt <hongtao.liu@intel.com> + + * config/i386/sse.md (cond_<code><mode>): New expander. + +2021-08-05 liuhongt <hongtao.liu@intel.com> + + * config/i386/sse.md (cond_<code><mode>): New expander. + 2021-08-04 David Malcolm <dmalcolm@redhat.com> PR analyzer/101570 diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index 6168f46..891ccf6 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20210805 +20210806 diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 92d22d1..d24bfdb 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -1489,7 +1489,8 @@ static tree c_parser_std_attribute_specifier_sequence (c_parser *); static void c_parser_external_declaration (c_parser *); static void c_parser_asm_definition (c_parser *); static void c_parser_declaration_or_fndef (c_parser *, bool, bool, bool, - bool, bool, tree *, vec<c_token>, + bool, bool, tree * = NULL, + vec<c_token> * = NULL, bool have_attrs = false, tree attrs = NULL, struct oacc_routine_data * = NULL, @@ -1774,13 +1775,12 @@ c_parser_external_declaration (c_parser *parser) an @interface or @protocol with prefix attributes). We can only tell which after parsing the declaration specifiers, if any, and the first declarator. */ - c_parser_declaration_or_fndef (parser, true, true, true, false, true, - NULL, vNULL); + c_parser_declaration_or_fndef (parser, true, true, true, false, true); break; } } -static void c_finish_omp_declare_simd (c_parser *, tree, tree, vec<c_token>); +static void c_finish_omp_declare_simd (c_parser *, tree, tree, vec<c_token> *); static void c_finish_oacc_routine (struct oacc_routine_data *, tree, bool); /* Build and add a DEBUG_BEGIN_STMT statement with location LOC. 
*/ @@ -1890,11 +1890,15 @@ static void c_parser_declaration_or_fndef (c_parser *parser, bool fndef_ok, bool static_assert_ok, bool empty_ok, bool nested, bool start_attr_ok, - tree *objc_foreach_object_declaration, - vec<c_token> omp_declare_simd_clauses, - bool have_attrs, tree attrs, - struct oacc_routine_data *oacc_routine_data, - bool *fallthru_attr_p) + tree *objc_foreach_object_declaration + /* = NULL */, + vec<c_token> *omp_declare_simd_clauses + /* = NULL */, + bool have_attrs /* = false */, + tree attrs /* = NULL_TREE */, + struct oacc_routine_data *oacc_routine_data + /* = NULL */, + bool *fallthru_attr_p /* = NULL */) { struct c_declspecs *specs; tree prefix_attrs; @@ -2150,7 +2154,7 @@ c_parser_declaration_or_fndef (c_parser *parser, bool fndef_ok, C_DTR_NORMAL, &dummy); if (declarator == NULL) { - if (omp_declare_simd_clauses.exists ()) + if (omp_declare_simd_clauses) c_finish_omp_declare_simd (parser, NULL_TREE, NULL_TREE, omp_declare_simd_clauses); if (oacc_routine_data) @@ -2250,7 +2254,7 @@ c_parser_declaration_or_fndef (c_parser *parser, bool fndef_ok, chainon (postfix_attrs, all_prefix_attrs)); if (!d) d = error_mark_node; - if (omp_declare_simd_clauses.exists ()) + if (omp_declare_simd_clauses) c_finish_omp_declare_simd (parser, d, NULL_TREE, omp_declare_simd_clauses); } @@ -2262,7 +2266,7 @@ c_parser_declaration_or_fndef (c_parser *parser, bool fndef_ok, chainon (postfix_attrs, all_prefix_attrs)); if (!d) d = error_mark_node; - if (omp_declare_simd_clauses.exists ()) + if (omp_declare_simd_clauses) c_finish_omp_declare_simd (parser, d, NULL_TREE, omp_declare_simd_clauses); init_loc = c_parser_peek_token (parser)->location; @@ -2342,7 +2346,7 @@ c_parser_declaration_or_fndef (c_parser *parser, bool fndef_ok, warn_parm_array_mismatch (lastloc, d, parms); } } - if (omp_declare_simd_clauses.exists ()) + if (omp_declare_simd_clauses) { tree parms = NULL_TREE; if (d && TREE_CODE (d) == FUNCTION_DECL) @@ -2496,9 +2500,9 @@ c_parser_declaration_or_fndef (c_parser *parser, bool fndef_ok, while (c_parser_next_token_is_not (parser, CPP_EOF) && c_parser_next_token_is_not (parser, CPP_OPEN_BRACE)) c_parser_declaration_or_fndef (parser, false, false, false, - true, false, NULL, vNULL); + true, false); store_parm_decls (); - if (omp_declare_simd_clauses.exists ()) + if (omp_declare_simd_clauses) c_finish_omp_declare_simd (parser, current_function_decl, NULL_TREE, omp_declare_simd_clauses); if (oacc_routine_data) @@ -5699,7 +5703,7 @@ c_parser_compound_statement_nostart (c_parser *parser) bool fallthru_attr_p = false; c_parser_declaration_or_fndef (parser, true, !have_std_attrs, true, true, true, NULL, - vNULL, have_std_attrs, std_attrs, + NULL, have_std_attrs, std_attrs, NULL, &fallthru_attr_p); if (last_stmt && !fallthru_attr_p) @@ -5731,7 +5735,7 @@ c_parser_compound_statement_nostart (c_parser *parser) last_label = false; mark_valid_location_for_stdc_pragma (false); c_parser_declaration_or_fndef (parser, true, true, true, true, - true, NULL, vNULL); + true); /* Following the old parser, __extension__ does not disable this diagnostic. 
*/ restore_extension_diagnostics (ext); @@ -6782,7 +6786,7 @@ c_parser_for_statement (c_parser *parser, bool ivdep, unsigned short unroll, || c_parser_nth_token_starts_std_attributes (parser, 1)) { c_parser_declaration_or_fndef (parser, true, true, true, true, true, - &object_expression, vNULL); + &object_expression); parser->objc_could_be_foreach_context = false; if (c_parser_next_token_is_keyword (parser, RID_IN)) @@ -6813,7 +6817,7 @@ c_parser_for_statement (c_parser *parser, bool ivdep, unsigned short unroll, ext = disable_extension_diagnostics (); c_parser_consume_token (parser); c_parser_declaration_or_fndef (parser, true, true, true, true, - true, &object_expression, vNULL); + true, &object_expression); parser->objc_could_be_foreach_context = false; restore_extension_diagnostics (ext); @@ -11277,7 +11281,7 @@ c_parser_objc_methodprotolist (c_parser *parser) } else c_parser_declaration_or_fndef (parser, false, false, true, - false, true, NULL, vNULL); + false, true); break; } } @@ -17281,12 +17285,12 @@ c_parser_oacc_routine (c_parser *parser, enum pragma_context context) while (c_parser_next_token_is (parser, CPP_KEYWORD) && c_parser_peek_token (parser)->keyword == RID_EXTENSION); c_parser_declaration_or_fndef (parser, true, true, true, false, true, - NULL, vNULL, false, NULL, &data); + NULL, NULL, false, NULL, &data); restore_extension_diagnostics (ext); } else c_parser_declaration_or_fndef (parser, true, true, true, false, true, - NULL, vNULL, false, NULL, &data); + NULL, NULL, false, NULL, &data); } } @@ -18393,8 +18397,7 @@ c_parser_omp_for_loop (location_t loc, c_parser *parser, enum tree_code code, vec_safe_push (for_block, c_begin_compound_stmt (true)); this_pre_body = push_stmt_list (); c_in_omp_for = true; - c_parser_declaration_or_fndef (parser, true, true, true, true, true, - NULL, vNULL); + c_parser_declaration_or_fndef (parser, true, true, true, true, true); c_in_omp_for = false; if (this_pre_body) { @@ -20335,12 +20338,12 @@ c_parser_omp_declare_simd (c_parser *parser, enum pragma_context context) while (c_parser_next_token_is (parser, CPP_KEYWORD) && c_parser_peek_token (parser)->keyword == RID_EXTENSION); c_parser_declaration_or_fndef (parser, true, true, true, false, true, - NULL, clauses); + NULL, &clauses); restore_extension_diagnostics (ext); } else c_parser_declaration_or_fndef (parser, true, true, true, false, true, - NULL, clauses); + NULL, &clauses); break; case pragma_struct: case pragma_param: @@ -20361,7 +20364,7 @@ c_parser_omp_declare_simd (c_parser *parser, enum pragma_context context) if (c_parser_next_tokens_start_declaration (parser)) { c_parser_declaration_or_fndef (parser, true, true, true, true, - true, NULL, clauses); + true, NULL, &clauses); restore_extension_diagnostics (ext); break; } @@ -20370,7 +20373,7 @@ c_parser_omp_declare_simd (c_parser *parser, enum pragma_context context) else if (c_parser_next_tokens_start_declaration (parser)) { c_parser_declaration_or_fndef (parser, true, true, true, true, true, - NULL, clauses); + NULL, &clauses); break; } error ("%<#pragma omp declare %s%> must be followed by " @@ -20851,8 +20854,10 @@ c_finish_omp_declare_variant (c_parser *parser, tree fndecl, tree parms) static void c_finish_omp_declare_simd (c_parser *parser, tree fndecl, tree parms, - vec<c_token> clauses) + vec<c_token> *pclauses) { + vec<c_token> &clauses = *pclauses; + /* Normally first token is CPP_NAME "simd" or "variant". 
CPP_EOF there indicates error has been reported and CPP_PRAGMA that c_finish_omp_declare_simd has already processed the tokens. */ diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 313b35f..390cf9a 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9206,257 +9206,1139 @@ __STRUCTN (float, 64, 4) #undef __STRUCTN -#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode, \ - qmode, ptr_mode, funcsuffix, signedtype) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst2_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_oi __o; \ - largetype __temp; \ - __temp.val[0] \ - = vcombine_##funcsuffix (__b.val[0], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[1] \ - = vcombine_##funcsuffix (__b.val[1], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __o = __builtin_aarch64_set_qregoi##qmode (__o, \ - (signedtype) __temp.val[0], 0); \ - __o = __builtin_aarch64_set_qregoi##qmode (__o, \ - (signedtype) __temp.val[1], 1); \ - __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __o, __c); \ -} - -__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16, - float16x8_t) -__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32, - float32x4_t) -__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64, - float64x2_t) -__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16, - int16x8_t) -__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64, - poly64x2_t) -__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64, - int64x2_t) -__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16, - int16x8_t) -__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32, - int32x4_t) -__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64, - int64x2_t) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_f16 (float16_t *__ptr, float16x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + float16x8x2_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev4hf ((__builtin_aarch64_simd_hf *) __ptr, __o, + __lane); +} -#define __ST2Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - union { intype __i; \ - __builtin_aarch64_simd_oi __o; } __temp = { __b }; \ - __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __temp.__o, __c); \ -} - -__ST2Q_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, 
f16) -__ST2Q_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) -__ST2Q_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) -__ST2Q_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) -__ST2Q_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) -__ST2Q_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64) -__ST2Q_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) -__ST2Q_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) -__ST2Q_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) -__ST2Q_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) -__ST2Q_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) -__ST2Q_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) -__ST2Q_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) -__ST2Q_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) - -#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \ - qmode, ptr_mode, funcsuffix, signedtype) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst3_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_ci __o; \ - largetype __temp; \ - __temp.val[0] \ - = vcombine_##funcsuffix (__b.val[0], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[1] \ - = vcombine_##funcsuffix (__b.val[1], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[2] \ - = vcombine_##funcsuffix (__b.val[2], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[0], 0); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[1], 1); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[2], 2); \ - __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __o, __c); \ -} - -__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16, - float16x8_t) -__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32, - float32x4_t) -__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64, - float64x2_t) -__ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, p16, - int16x8_t) -__ST3_LANE_FUNC (poly64x1x3_t, poly64x2x3_t, poly64_t, di, v2di_ssps, di, p64, - poly64x2_t) -__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__ST3_LANE_FUNC (int32x2x3_t, int32x4x3_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__ST3_LANE_FUNC (int64x1x3_t, int64x2x3_t, int64_t, di, v2di, di, s64, - int64x2_t) -__ST3_LANE_FUNC (uint8x8x3_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__ST3_LANE_FUNC (uint16x4x3_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, u16, - int16x8_t) -__ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v2si, v4si, si, u32, - int32x4_t) -__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64, - int64x2_t) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_f32 (float32_t *__ptr, float32x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + float32x4x2_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + 
__builtin_aarch64_st2_lanev2sf ((__builtin_aarch64_simd_sf *) __ptr, __o, + __lane); +} -#define __ST3Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - union { intype __i; \ - __builtin_aarch64_simd_ci __o; } __temp = { __b }; \ - __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __temp.__o, __c); \ -} - -__ST3Q_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16) -__ST3Q_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) -__ST3Q_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) -__ST3Q_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) -__ST3Q_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) -__ST3Q_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64) -__ST3Q_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8) -__ST3Q_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) -__ST3Q_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) -__ST3Q_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) -__ST3Q_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) -__ST3Q_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) -__ST3Q_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) -__ST3Q_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) - -#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \ - qmode, ptr_mode, funcsuffix, signedtype) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst4_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_xi __o; \ - largetype __temp; \ - __temp.val[0] \ - = vcombine_##funcsuffix (__b.val[0], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[1] \ - = vcombine_##funcsuffix (__b.val[1], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[2] \ - = vcombine_##funcsuffix (__b.val[2], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __temp.val[3] \ - = vcombine_##funcsuffix (__b.val[3], \ - vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[0], 0); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[1], 1); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[2], 2); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[3], 3); \ - __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __o, __c); \ -} - -__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16, - float16x8_t) -__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32, - float32x4_t) -__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64, - float64x2_t) -__ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, p16, - int16x8_t) -__ST4_LANE_FUNC (poly64x1x4_t, poly64x2x4_t, poly64_t, di, v2di_ssps, di, p64, - poly64x2_t) -__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__ST4_LANE_FUNC (int32x2x4_t, int32x4x4_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__ST4_LANE_FUNC (int64x1x4_t, int64x2x4_t, int64_t, di, v2di, di, s64, - int64x2_t) -__ST4_LANE_FUNC (uint8x8x4_t, uint8x16x4_t, 
uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__ST4_LANE_FUNC (uint16x4x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, u16, - int16x8_t) -__ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v2si, v4si, si, u32, - int32x4_t) -__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64, - int64x2_t) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_f64 (float64_t *__ptr, float64x1x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + float64x2x2_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanedf ((__builtin_aarch64_simd_df *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_p8 (poly8_t *__ptr, poly8x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + poly8x16x2_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_p16 (poly16_t *__ptr, poly16x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + poly16x8x2_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_p64 (poly64_t *__ptr, poly64x1x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + poly64x2x2_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_s8 (int8_t *__ptr, int8x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + int8x16x2_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_s16 (int16_t *__ptr, int16x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + int16x8x2_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev4hi 
((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_s32 (int32_t *__ptr, int32x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + int32x4x2_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_s64 (int64_t *__ptr, int64x1x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + int64x2x2_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_u8 (uint8_t *__ptr, uint8x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + uint8x16x2_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_u16 (uint16_t *__ptr, uint16x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + uint16x8x2_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_u32 (uint32_t *__ptr, uint32x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + uint32x4x2_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_u64 (uint64_t *__ptr, uint64x1x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + uint64x2x2_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_f16 (float16_t *__ptr, float16x8x2_t __val, const int __lane) +{ + 
__builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev8hf ((__builtin_aarch64_simd_hf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_f32 (float32_t *__ptr, float32x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev4sf ((__builtin_aarch64_simd_sf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_f64 (float64_t *__ptr, float64x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev2df ((__builtin_aarch64_simd_df *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_p8 (poly8_t *__ptr, poly8x16x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_p16 (poly16_t *__ptr, poly16x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_p64 (poly64_t *__ptr, poly64x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_s8 (int8_t *__ptr, int8x16x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_s16 (int16_t *__ptr, int16x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_s32 (int32_t *__ptr, int32x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_s64 (int64_t *__ptr, int64x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_u8 (uint8_t *__ptr, uint8x16x2_t __val, const int __lane) +{ + 
__builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_u16 (uint16_t *__ptr, uint16x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_u32 (uint32_t *__ptr, uint32x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_u64 (uint64_t *__ptr, uint64x2x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_f16 (float16_t *__ptr, float16x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + float16x8x3_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f16 (__val.val[2], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev4hf ((__builtin_aarch64_simd_hf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_f32 (float32_t *__ptr, float32x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + float32x4x3_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f32 (__val.val[2], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev2sf ((__builtin_aarch64_simd_sf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_f64 (float64_t *__ptr, float64x1x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + float64x2x3_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f64 (__val.val[2], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanedf ((__builtin_aarch64_simd_df *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_p8 (poly8_t *__ptr, poly8x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + poly8x16x3_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], + vcreate_p8 
(__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p8 (__val.val[2], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_p16 (poly16_t *__ptr, poly16x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + poly16x8x3_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p16 (__val.val[2], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_p64 (poly64_t *__ptr, poly64x1x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + poly64x2x3_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p64 (__val.val[2], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_s8 (int8_t *__ptr, int8x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + int8x16x3_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s8 (__val.val[2], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_s16 (int16_t *__ptr, int16x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + int16x8x3_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s16 (__val.val[2], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_s32 (int32_t *__ptr, int32x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + int32x4x3_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s32 (__val.val[2], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
+vst3_lane_s64 (int64_t *__ptr, int64x1x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + int64x2x3_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s64 (__val.val[2], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} -#define __ST4Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ -__extension__ extern __inline void \ -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ -vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \ - intype __b, const int __c) \ -{ \ - union { intype __i; \ - __builtin_aarch64_simd_xi __o; } __temp = { __b }; \ - __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \ - __ptr, __temp.__o, __c); \ -} - -__ST4Q_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16) -__ST4Q_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) -__ST4Q_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) -__ST4Q_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) -__ST4Q_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16) -__ST4Q_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64) -__ST4Q_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) -__ST4Q_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) -__ST4Q_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) -__ST4Q_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) -__ST4Q_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) -__ST4Q_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) -__ST4Q_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) -__ST4Q_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_u8 (uint8_t *__ptr, uint8x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + uint8x16x3_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u8 (__val.val[2], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_u16 (uint16_t *__ptr, uint16x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + uint16x8x3_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u16 (__val.val[2], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_u32 (uint32_t *__ptr, uint32x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + uint32x4x3_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u32 (__val.val[2], + vcreate_u32 (__AARCH64_UINT64_C 
(0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_u64 (uint64_t *__ptr, uint64x1x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + uint64x2x3_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u64 (__val.val[2], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_f16 (float16_t *__ptr, float16x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev8hf ((__builtin_aarch64_simd_hf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_f32 (float32_t *__ptr, float32x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev4sf ((__builtin_aarch64_simd_sf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_f64 (float64_t *__ptr, float64x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev2df ((__builtin_aarch64_simd_df *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_p8 (poly8_t *__ptr, poly8x16x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_p16 (poly16_t *__ptr, poly16x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_p64 (poly64_t *__ptr, poly64x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_s8 (int8_t *__ptr, int8x16x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_s16 (int16_t *__ptr, int16x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + 
__builtin_aarch64_st3_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_s32 (int32_t *__ptr, int32x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_s64 (int64_t *__ptr, int64x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_u8 (uint8_t *__ptr, uint8x16x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_u16 (uint16_t *__ptr, uint16x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_u32 (uint32_t *__ptr, uint32x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_u64 (uint64_t *__ptr, uint64x2x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_f16 (float16_t *__ptr, float16x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + float16x8x4_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f16 (__val.val[2], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f16 (__val.val[3], + vcreate_f16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev4hf ((__builtin_aarch64_simd_hf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_f32 (float32_t *__ptr, float32x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + float32x4x4_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f32 (__val.val[2], + vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f32 (__val.val[3], + vcreate_f32 (__AARCH64_UINT64_C (0))); + 
__builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev2sf ((__builtin_aarch64_simd_sf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_f64 (float64_t *__ptr, float64x1x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + float64x2x4_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f64 (__val.val[2], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f64 (__val.val[3], + vcreate_f64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanedf ((__builtin_aarch64_simd_df *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_p8 (poly8_t *__ptr, poly8x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + poly8x16x4_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p8 (__val.val[2], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p8 (__val.val[3], + vcreate_p8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_p16 (poly16_t *__ptr, poly16x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + poly16x8x4_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p16 (__val.val[2], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p16 (__val.val[3], + vcreate_p16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_p64 (poly64_t *__ptr, poly64x1x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + poly64x2x4_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p64 (__val.val[2], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p64 (__val.val[3], + vcreate_p64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_s8 (int8_t *__ptr, int8x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + int8x16x4_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s8 (__val.val[2], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = 
vcombine_s8 (__val.val[3], + vcreate_s8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_s16 (int16_t *__ptr, int16x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + int16x8x4_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s16 (__val.val[2], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_s16 (__val.val[3], + vcreate_s16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_s32 (int32_t *__ptr, int32x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + int32x4x4_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s32 (__val.val[2], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_s32 (__val.val[3], + vcreate_s32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_s64 (int64_t *__ptr, int64x1x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + int64x2x4_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_s64 (__val.val[2], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_s64 (__val.val[3], + vcreate_s64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_u8 (uint8_t *__ptr, uint8x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + uint8x16x4_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u8 (__val.val[2], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u8 (__val.val[3], + vcreate_u8 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_u16 (uint16_t *__ptr, uint16x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + uint16x8x4_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u16 
(__val.val[2], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u16 (__val.val[3], + vcreate_u16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_u32 (uint32_t *__ptr, uint32x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + uint32x4x4_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u32 (__val.val[2], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u32 (__val.val[3], + vcreate_u32 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_u64 (uint64_t *__ptr, uint64x1x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + uint64x2x4_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u64 (__val.val[2], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u64 (__val.val[3], + vcreate_u64 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_f16 (float16_t *__ptr, float16x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev8hf ((__builtin_aarch64_simd_hf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_f32 (float32_t *__ptr, float32x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev4sf ((__builtin_aarch64_simd_sf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_f64 (float64_t *__ptr, float64x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev2df ((__builtin_aarch64_simd_df *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_p8 (poly8_t *__ptr, poly8x16x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_p16 (poly16_t *__ptr, poly16x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern 
__inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_p64 (poly64_t *__ptr, poly64x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_s8 (int8_t *__ptr, int8x16x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_s16 (int16_t *__ptr, int16x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_s32 (int32_t *__ptr, int32x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_s64 (int64_t *__ptr, int64x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_u8 (uint8_t *__ptr, uint8x16x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_u16 (uint16_t *__ptr, uint16x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_u32 (uint32_t *__ptr, uint32x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_u64 (uint64_t *__ptr, uint64x2x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o, + __lane); +} __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -32957,8 +33839,7 @@ vst1_bf16_x2 (bfloat16_t * __a, bfloat16x4x2_t __val) bfloat16x8x2_t __temp; __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = 
__builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v4bf (__a, __o); } @@ -32967,8 +33848,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v8bf (__a, __o); } @@ -32981,9 +33861,7 @@ vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val) __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -32992,26 +33870,31 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t val) +vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t __val) { - union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t val) +vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t __val) { - union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } __extension__ extern __inline void @@ -33043,8 +33926,7 @@ vst2_bf16 (bfloat16_t * __a, bfloat16x4x2_t __val) bfloat16x8x2_t __temp; __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_bf16 
(__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2v4bf (__a, __o); } @@ -33053,8 +33935,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_bf16 (bfloat16_t * __a, bfloat16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v8bf (__a, __o); } @@ -33067,9 +33948,7 @@ vst3_bf16 (bfloat16_t * __a, bfloat16x4x3_t __val) __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -33078,9 +33957,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_bf16 (bfloat16_t * __a, bfloat16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -33094,10 +33971,7 @@ vst4_bf16 (bfloat16_t * __a, bfloat16x4x4_t __val) __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -33106,10 +33980,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_bf16 (bfloat16_t * __a, bfloat16x8x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -33723,15 +34594,86 @@ __LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, v8bf, bf, bf16, bfloat16x8_t) __LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) -__ST2_LANE_FUNC 
(bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf, - bf16, bfloat16x8_t) -__ST2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16) -__ST3_LANE_FUNC (bfloat16x4x3_t, bfloat16x8x3_t, bfloat16_t, v4bf, v8bf, bf, - bf16, bfloat16x8_t) -__ST3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16_t, v8bf, bf, bf16) -__ST4_LANE_FUNC (bfloat16x4x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, v8bf, bf, - bf16, bfloat16x8_t) -__ST4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16_t, v8bf, bf, bf16) +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_bf16 (bfloat16_t *__ptr, bfloat16x4x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + bfloat16x8x2_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st2_lanev4bf ((__builtin_aarch64_simd_bf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_bf16 (bfloat16_t *__ptr, bfloat16x8x2_t __val, const int __lane) +{ + __builtin_aarch64_simd_oi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st2_lanev8bf ((__builtin_aarch64_simd_bf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_bf16 (bfloat16_t *__ptr, bfloat16x4x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + bfloat16x8x3_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st3_lanev4bf ((__builtin_aarch64_simd_bf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_bf16 (bfloat16_t *__ptr, bfloat16x8x3_t __val, const int __lane) +{ + __builtin_aarch64_simd_ci __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st3_lanev8bf ((__builtin_aarch64_simd_bf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_bf16 (bfloat16_t *__ptr, bfloat16x4x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_bf16 (__val.val[3], + vcreate_bf16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st4_lanev4bf ((__builtin_aarch64_simd_bf *) __ptr, __o, + __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_bf16 (bfloat16_t *__ptr, bfloat16x8x4_t __val, const int __lane) +{ + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st4_lanev8bf ((__builtin_aarch64_simd_bf *) __ptr, __o, + __lane); +} #pragma GCC pop_options @@ -33952,11 
+34894,5 @@ vaddq_p128 (poly128_t __a, poly128_t __b) #undef __LD3Q_LANE_FUNC #undef __LD4_LANE_FUNC #undef __LD4Q_LANE_FUNC -#undef __ST2_LANE_FUNC -#undef __ST2Q_LANE_FUNC -#undef __ST3_LANE_FUNC -#undef __ST3Q_LANE_FUNC -#undef __ST4_LANE_FUNC -#undef __ST4Q_LANE_FUNC #endif diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 2de5a96..5b1c06b 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -4189,6 +4189,8 @@ rs6000_option_override_internal (bool global_init_p) else rs6000_long_double_type_size = default_long_double_size; } + else if (rs6000_long_double_type_size == FLOAT_PRECISION_TFmode) + ; /* The option value can be seen when cl_target_option_restore is called. */ else if (rs6000_long_double_type_size == 128) rs6000_long_double_type_size = FLOAT_PRECISION_TFmode; else if (global_options_set.x_rs6000_ieeequad) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ff99887..2551832 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -14764,17 +14764,17 @@ To optimize the program based on the collected profile information, use Register the profile information in the specified section instead of using a constructor/destructor. The section name is @var{name} if it is specified, otherwise the section name defaults to @code{.gcov_info}. A pointer to the -profile information generated by @option{-fprofile-arcs} or -@option{-ftest-coverage} is placed in the specified section for each -translation unit. This option disables the profile information registration -through a constructor and it disables the profile information processing -through a destructor. This option is not intended to be used in hosted -environments such as GNU/Linux. It targets systems with limited resources -which do not support constructors and destructors. The linker could collect -the input sections in a continuous memory block and define start and end -symbols. The runtime support could dump the profiling information registered -in this linker set during program termination to a serial line for example. A -GNU linker script example which defines a linker output section follows: +profile information generated by @option{-fprofile-arcs} is placed in the +specified section for each translation unit. This option disables the profile +information registration through a constructor and it disables the profile +information processing through a destructor. This option is not intended to be +used in hosted environments such as GNU/Linux. It targets free-standing +environments (for example embedded systems) with limited resources which do not +support constructors/destructors or the C library file I/O. + +The linker could collect the input sections in a continuous memory block and +define start and end symbols. 
A GNU linker script example which defines a +linker output section follows: @smallexample .gcov_info : @@ -14785,6 +14785,64 @@ GNU linker script example which defines a linker output section follows: @} @end smallexample +The program could dump the profiling information registered in this linker set +for example like this: + +@smallexample +#include <gcov.h> +#include <stdio.h> +#include <stdlib.h> + +extern const struct gcov_info *__gcov_info_start[]; +extern const struct gcov_info *__gcov_info_end[]; + +static void +filename (const char *f, void *arg) +@{ + puts (f); +@} + +static void +dump (const void *d, unsigned n, void *arg) +@{ + const unsigned char *c = d; + + for (unsigned i = 0; i < n; ++i) + printf ("%02x", c[i]); +@} + +static void * +allocate (unsigned length, void *arg) +@{ + return malloc (length); +@} + +static void +dump_gcov_info (void) +@{ + const struct gcov_info **info = __gcov_info_start; + const struct gcov_info **end = __gcov_info_end; + + /* Obfuscate variable to prevent compiler optimizations. */ + __asm__ ("" : "+r" (info)); + + while (info != end) + @{ + void *arg = NULL; + __gcov_info_to_gcda (*info, filename, dump, allocate, arg); + putchar ('\n'); + ++info; + @} +@} + +int +main() +@{ + dump_gcov_info(); + return 0; +@} +@end smallexample + @item -fprofile-note=@var{path} @opindex fprofile-note diff --git a/gcc/dominance.c b/gcc/dominance.c index 6a262ce..cc63391 100644 --- a/gcc/dominance.c +++ b/gcc/dominance.c @@ -1227,7 +1227,7 @@ recompute_dominator (enum cdi_direction dir, basic_block bb) from BBS. */ static void -prune_bbs_to_update_dominators (vec<basic_block> bbs, +prune_bbs_to_update_dominators (vec<basic_block> &bbs, bool conservative) { unsigned i; @@ -1379,7 +1379,7 @@ determine_dominators_for_sons (struct graph *g, vec<basic_block> bbs, a block of BBS in the current dominance tree dominate it. */ void -iterate_fix_dominators (enum cdi_direction dir, vec<basic_block> bbs, +iterate_fix_dominators (enum cdi_direction dir, vec<basic_block> &bbs, bool conservative) { unsigned i; diff --git a/gcc/dominance.h b/gcc/dominance.h index 1a8c248..970da02 100644 --- a/gcc/dominance.h +++ b/gcc/dominance.h @@ -78,7 +78,7 @@ checking_verify_dominators (cdi_direction dir) basic_block recompute_dominator (enum cdi_direction, basic_block); extern void iterate_fix_dominators (enum cdi_direction, - vec<basic_block> , bool); + vec<basic_block> &, bool); extern void add_to_dominance_info (enum cdi_direction, basic_block); extern void delete_from_dominance_info (enum cdi_direction, basic_block); extern basic_block first_dom_son (enum cdi_direction, basic_block); diff --git a/gcc/gcov-io.c b/gcc/gcov-io.c index 7819593..d3e56af 100644 --- a/gcc/gcov-io.c +++ b/gcc/gcov-io.c @@ -229,30 +229,25 @@ gcov_magic (gcov_unsigned_t magic, gcov_unsigned_t expected) #endif #if !IN_GCOV -/* Write unsigned VALUE to coverage file. */ +/* Write DATA of LENGTH characters to coverage file. */ GCOV_LINKAGE void -gcov_write_unsigned (gcov_unsigned_t value) +gcov_write (const void *data, unsigned length) { - gcov_unsigned_t r = fwrite (&value, sizeof (value), 1, gcov_var.file); + gcov_unsigned_t r = fwrite (data, length, 1, gcov_var.file); if (r != 1) gcov_var.error = 1; } -/* Write counter VALUE to coverage file. Sets error flag - appropriately. */ +/* Write unsigned VALUE to coverage file. 
*/ -#if IN_LIBGCOV GCOV_LINKAGE void -gcov_write_counter (gcov_type value) +gcov_write_unsigned (gcov_unsigned_t value) { - gcov_write_unsigned ((gcov_unsigned_t) value); - if (sizeof (value) > sizeof (gcov_unsigned_t)) - gcov_write_unsigned ((gcov_unsigned_t) (value >> 32)); - else - gcov_write_unsigned (0); + gcov_unsigned_t r = fwrite (&value, sizeof (value), 1, gcov_var.file); + if (r != 1) + gcov_var.error = 1; } -#endif /* IN_LIBGCOV */ #if !IN_LIBGCOV /* Write STRING to coverage file. Sets error flag on file @@ -349,22 +344,13 @@ gcov_write_length (gcov_position_t position) #else /* IN_LIBGCOV */ -/* Write a tag TAG and length LENGTH. */ - -GCOV_LINKAGE void -gcov_write_tag_length (gcov_unsigned_t tag, gcov_unsigned_t length) -{ - gcov_write_unsigned (tag); - gcov_write_unsigned (length); -} - -/* Write a summary structure to the gcov file. Return nonzero on - overflow. */ +/* Write a summary structure to the gcov file. */ GCOV_LINKAGE void gcov_write_summary (gcov_unsigned_t tag, const struct gcov_summary *summary) { - gcov_write_tag_length (tag, GCOV_TAG_SUMMARY_LENGTH); + gcov_write_unsigned (tag); + gcov_write_unsigned (GCOV_TAG_SUMMARY_LENGTH); gcov_write_unsigned (summary->runs); gcov_write_unsigned (summary->sum_max); } diff --git a/gcc/gcov-io.h b/gcc/gcov-io.h index 538bee8..99e1964 100644 --- a/gcc/gcov-io.h +++ b/gcc/gcov-io.h @@ -367,6 +367,7 @@ char *mangle_path (char const *base); #if !IN_GCOV /* Available outside gcov */ +GCOV_LINKAGE void gcov_write (const void *, unsigned) ATTRIBUTE_HIDDEN; GCOV_LINKAGE void gcov_write_unsigned (gcov_unsigned_t) ATTRIBUTE_HIDDEN; #endif diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE index 394530c..19ab2de 100644 --- a/gcc/go/gofrontend/MERGE +++ b/gcc/go/gofrontend/MERGE @@ -1,4 +1,4 @@ -b47bcf942daa9a0c252db9b57b8f138adbfcdaa2 +32590102c464679f845667b5554e1dcce2549ad2 The first line of this file holds the git revision number of the last merge done from the gofrontend repository. diff --git a/gcc/go/gofrontend/expressions.cc b/gcc/go/gofrontend/expressions.cc index 3e433d6..33177a7 100644 --- a/gcc/go/gofrontend/expressions.cc +++ b/gcc/go/gofrontend/expressions.cc @@ -11590,12 +11590,10 @@ Call_expression::intrinsify(Gogo* gogo, // sync/atomic functions and runtime/internal/atomic functions // are very similar. In order not to duplicate code, we just // redirect to the latter and let the code below to handle them. - // In case there is no equivalent functions (slight variance - // in types), we just make an artificial name (begin with '$'). // Note: no StorePointer, SwapPointer, and CompareAndSwapPointer, // as they need write barriers. 
if (name == "LoadInt32") - name = "$Loadint32"; + name = "Loadint32"; else if (name == "LoadInt64") name = "Loadint64"; else if (name == "LoadUint32") @@ -11607,9 +11605,9 @@ Call_expression::intrinsify(Gogo* gogo, else if (name == "LoadPointer") name = "Loadp"; else if (name == "StoreInt32") - name = "$Storeint32"; + name = "Storeint32"; else if (name == "StoreInt64") - name = "$Storeint64"; + name = "Storeint64"; else if (name == "StoreUint32") name = "Store"; else if (name == "StoreUint64") @@ -11617,7 +11615,7 @@ Call_expression::intrinsify(Gogo* gogo, else if (name == "StoreUintptr") name = "Storeuintptr"; else if (name == "AddInt32") - name = "$Xaddint32"; + name = "Xaddint32"; else if (name == "AddInt64") name = "Xaddint64"; else if (name == "AddUint32") @@ -11627,9 +11625,9 @@ Call_expression::intrinsify(Gogo* gogo, else if (name == "AddUintptr") name = "Xadduintptr"; else if (name == "SwapInt32") - name = "$Xchgint32"; + name = "Xchgint32"; else if (name == "SwapInt64") - name = "$Xchgint64"; + name = "Xchgint64"; else if (name == "SwapUint32") name = "Xchg"; else if (name == "SwapUint64") @@ -11637,9 +11635,9 @@ Call_expression::intrinsify(Gogo* gogo, else if (name == "SwapUintptr") name = "Xchguintptr"; else if (name == "CompareAndSwapInt32") - name = "$Casint32"; + name = "Casint32"; else if (name == "CompareAndSwapInt64") - name = "$Casint64"; + name = "Casint64"; else if (name == "CompareAndSwapUint32") name = "Cas"; else if (name == "CompareAndSwapUint64") @@ -11875,7 +11873,7 @@ Call_expression::intrinsify(Gogo* gogo, if ((name == "Load" || name == "Load64" || name == "Loadint64" || name == "Loadp" || name == "Loaduint" || name == "Loaduintptr" || name == "LoadAcq" - || name == "$Loadint32") + || name == "Loadint32") && this->args_ != NULL && this->args_->size() == 1) { if (int_size < 8 && (name == "Load64" || name == "Loadint64")) @@ -11895,7 +11893,7 @@ Call_expression::intrinsify(Gogo* gogo, code = Runtime::ATOMIC_LOAD_8; res_type = uint64_type; } - else if (name == "$Loadint32") + else if (name == "Loadint32") { code = Runtime::ATOMIC_LOAD_4; res_type = int32_type; @@ -11942,10 +11940,10 @@ Call_expression::intrinsify(Gogo* gogo, if ((name == "Store" || name == "Store64" || name == "StorepNoWB" || name == "Storeuintptr" || name == "StoreRel" - || name == "$Storeint32" || name == "$Storeint64") + || name == "Storeint32" || name == "Storeint64") && this->args_ != NULL && this->args_->size() == 2) { - if (int_size < 8 && (name == "Store64" || name == "$Storeint64")) + if (int_size < 8 && (name == "Store64" || name == "Storeint64")) return NULL; Runtime::Function code; @@ -11955,9 +11953,9 @@ Call_expression::intrinsify(Gogo* gogo, code = Runtime::ATOMIC_STORE_4; else if (name == "Store64") code = Runtime::ATOMIC_STORE_8; - else if (name == "$Storeint32") + else if (name == "Storeint32") code = Runtime::ATOMIC_STORE_4; - else if (name == "$Storeint64") + else if (name == "Storeint64") code = Runtime::ATOMIC_STORE_8; else if (name == "Storeuintptr") code = (ptr_size == 8 ? 
Runtime::ATOMIC_STORE_8 : Runtime::ATOMIC_STORE_4); @@ -11979,7 +11977,7 @@ Call_expression::intrinsify(Gogo* gogo, } if ((name == "Xchg" || name == "Xchg64" || name == "Xchguintptr" - || name == "$Xchgint32" || name == "$Xchgint64") + || name == "Xchgint32" || name == "Xchgint64") && this->args_ != NULL && this->args_->size() == 2) { if (int_size < 8 && (name == "Xchg64" || name == "Xchgint64")) @@ -11997,12 +11995,12 @@ Call_expression::intrinsify(Gogo* gogo, code = Runtime::ATOMIC_EXCHANGE_8; res_type = uint64_type; } - else if (name == "$Xchgint32") + else if (name == "Xchgint32") { code = Runtime::ATOMIC_EXCHANGE_4; res_type = int32_type; } - else if (name == "$Xchgint64") + else if (name == "Xchgint64") { code = Runtime::ATOMIC_EXCHANGE_8; res_type = int64_type; @@ -12025,10 +12023,10 @@ Call_expression::intrinsify(Gogo* gogo, if ((name == "Cas" || name == "Cas64" || name == "Casuintptr" || name == "Casp1" || name == "CasRel" - || name == "$Casint32" || name == "$Casint64") + || name == "Casint32" || name == "Casint64") && this->args_ != NULL && this->args_->size() == 3) { - if (int_size < 8 && (name == "Cas64" || name == "$Casint64")) + if (int_size < 8 && (name == "Cas64" || name == "Casint64")) return NULL; Runtime::Function code; @@ -12047,9 +12045,9 @@ Call_expression::intrinsify(Gogo* gogo, code = Runtime::ATOMIC_COMPARE_EXCHANGE_4; else if (name == "Cas64") code = Runtime::ATOMIC_COMPARE_EXCHANGE_8; - else if (name == "$Casint32") + else if (name == "Casint32") code = Runtime::ATOMIC_COMPARE_EXCHANGE_4; - else if (name == "$Casint64") + else if (name == "Casint64") code = Runtime::ATOMIC_COMPARE_EXCHANGE_8; else if (name == "Casuintptr") code = (ptr_size == 8 @@ -12077,7 +12075,7 @@ Call_expression::intrinsify(Gogo* gogo, } if ((name == "Xadd" || name == "Xadd64" || name == "Xaddint64" - || name == "Xadduintptr" || name == "$Xaddint32") + || name == "Xadduintptr" || name == "Xaddint32") && this->args_ != NULL && this->args_->size() == 2) { if (int_size < 8 && (name == "Xadd64" || name == "Xaddint64")) @@ -12095,7 +12093,7 @@ Call_expression::intrinsify(Gogo* gogo, code = Runtime::ATOMIC_ADD_FETCH_8; res_type = uint64_type; } - else if (name == "$Xaddint32") + else if (name == "Xaddint32") { code = Runtime::ATOMIC_ADD_FETCH_4; res_type = int32_type; diff --git a/gcc/ipa-prop.h b/gcc/ipa-prop.h index 19751f10..42842d9 100644 --- a/gcc/ipa-prop.h +++ b/gcc/ipa-prop.h @@ -499,10 +499,10 @@ public: get reallocated, the member vectors and the underlying auto_vecs would get out of sync. */ ipa_call_arg_values (ipa_auto_call_arg_values *aavals) - : m_known_vals (aavals->m_known_vals), - m_known_contexts (aavals->m_known_contexts), - m_known_aggs (aavals->m_known_aggs), - m_known_value_ranges (aavals->m_known_value_ranges) + : m_known_vals (aavals->m_known_vals.to_vec_legacy ()), + m_known_contexts (aavals->m_known_contexts.to_vec_legacy ()), + m_known_aggs (aavals->m_known_aggs.to_vec_legacy ()), + m_known_value_ranges (aavals->m_known_value_ranges.to_vec_legacy ()) {} /* If m_known_vals (vector of known "scalar" values) is sufficiantly long, diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 04b011b..d4c0307 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,85 @@ +2021-08-05 Jonathan Wakely <jwakely@redhat.com> + + * g++.old-deja/g++.other/inline7.C: Cast nodiscard call to void. + +2021-08-05 H.J. Lu <hjl.tools@gmail.com> + + PR target/99744 + * gcc.target/i386/pr99744-3.c: New test. + * gcc.target/i386/pr99744-4.c: Likewise. 
+ * gcc.target/i386/pr99744-5.c: Likewise. + * gcc.target/i386/pr99744-6.c: Likewise. + * gcc.target/i386/pr99744-7.c: Likewise. + * gcc.target/i386/pr99744-8.c: Likewise. + +2021-08-05 Richard Earnshaw <rearnsha@arm.com> + + PR target/101723 + * gcc.target/arm/cortex-m55-nofp-flag-hard.c: Update expected output. + * gcc.target/arm/cortex-m55-nofp-flag-softfp.c: Likewise. + * gcc.target/arm/cortex-m55-nofp-nomve-flag-softfp.c: Likewise. + * gcc.target/arm/mve/intrinsics/mve_fpu1.c: Convert to dg-do assemble. + Add a non-no-op function body. + * gcc.target/arm/mve/intrinsics/mve_fpu2.c: Likewise. + * gcc.target/arm/pr98636.c (dg-options): Add -mfloat-abi=softfp. + * gcc.target/arm/attr-neon.c: Tighten scan-assembler tests. + * gcc.target/arm/attr-neon2.c: Use -Ofast, convert test to use + check-function-bodies. + * gcc.target/arm/attr-neon3.c: Likewise. + * gcc.target/arm/pr69245.c: Tighten scan-assembler match, but allow + multiple instances. + * gcc.target/arm/pragma_fpu_attribute.c: Likewise. + * gcc.target/arm/pragma_fpu_attribute_2.c: Likewise. + +2021-08-05 Jonathan Wright <jonathan.wright@arm.com> + + * gcc.target/aarch64/vsubX_high_cost.c: New test. + +2021-08-05 Jonathan Wright <jonathan.wright@arm.com> + + * gcc.target/aarch64/vaddX_high_cost.c: New test. + +2021-08-05 Richard Biener <rguenther@suse.de> + + * gcc.dg/vect/bb-slp-pr101756.c: Add -w. + +2021-08-05 Eric Botcazou <ebotcazou@gcc.gnu.org> + + * gcc.dg/sso-15.c: New test. + +2021-08-05 liuhongt <hongtao.liu@intel.com> + + * gcc.target/i386/cond_op_anylogic_d-1.c: New test. + * gcc.target/i386/cond_op_anylogic_d-2.c: New test. + * gcc.target/i386/cond_op_anylogic_q-1.c: New test. + * gcc.target/i386/cond_op_anylogic_q-2.c: New test. + +2021-08-05 liuhongt <hongtao.liu@intel.com> + + * gcc.target/i386/cond_op_maxmin_double-1.c: New test. + * gcc.target/i386/cond_op_maxmin_double-2.c: New test. + * gcc.target/i386/cond_op_maxmin_float-1.c: New test. + * gcc.target/i386/cond_op_maxmin_float-2.c: New test. + +2021-08-05 liuhongt <hongtao.liu@intel.com> + + * gcc.target/i386/cond_op_maxmin_b-1.c: New test. + * gcc.target/i386/cond_op_maxmin_b-2.c: New test. + * gcc.target/i386/cond_op_maxmin_d-1.c: New test. + * gcc.target/i386/cond_op_maxmin_d-2.c: New test. + * gcc.target/i386/cond_op_maxmin_q-1.c: New test. + * gcc.target/i386/cond_op_maxmin_q-2.c: New test. + * gcc.target/i386/cond_op_maxmin_ub-1.c: New test. + * gcc.target/i386/cond_op_maxmin_ub-2.c: New test. + * gcc.target/i386/cond_op_maxmin_ud-1.c: New test. + * gcc.target/i386/cond_op_maxmin_ud-2.c: New test. + * gcc.target/i386/cond_op_maxmin_uq-1.c: New test. + * gcc.target/i386/cond_op_maxmin_uq-2.c: New test. + * gcc.target/i386/cond_op_maxmin_uw-1.c: New test. + * gcc.target/i386/cond_op_maxmin_uw-2.c: New test. + * gcc.target/i386/cond_op_maxmin_w-1.c: New test. + * gcc.target/i386/cond_op_maxmin_w-2.c: New test. + 2021-08-04 David Malcolm <dmalcolm@redhat.com> PR analyzer/101570 diff --git a/gcc/testsuite/gcc.dg/gcov-info-to-gcda.c b/gcc/testsuite/gcc.dg/gcov-info-to-gcda.c new file mode 100644 index 0000000..a42a768 --- /dev/null +++ b/gcc/testsuite/gcc.dg/gcov-info-to-gcda.c @@ -0,0 +1,60 @@ +/* { dg-do run } */ +/* { dg-skip-if "profile-info-section" { powerpc-ibm-aix* } } */ +/* { dg-options "-fprofile-arcs -fprofile-info-section" } */ + +#define assert(expr) \ + ((expr) \ + ? 
(void)0 \ + : (__builtin_printf ("%s:%i: Assertion `%s' failed.\n", \ + __FILE__, __LINE__, #expr), \ + __builtin_abort ())) + +struct gcov_info; + +extern void +__gcov_info_to_gcda (const struct gcov_info *__info, + void (*__filename_fn) (const char *, void *), + void (*__dump_fn) (const void *, unsigned, void *), + void *(*__allocate_fn) (unsigned, void *), + void *__arg); + +extern const struct gcov_info *my_info; + +static unsigned counter; + +static void +filename (const char *f, void *arg) +{ + assert (arg == &counter); + assert (__builtin_strstr (f, "gcov-info-to-gcda.c") == 0); +} + +static void +dump (const void *d, unsigned n, void *arg) +{ + unsigned *m = (unsigned *)arg; + assert (arg == &counter); + + if (*m == 0) + { + const unsigned *u = d; + assert (*u == 0x67636461); + } + + *m += n; +} + +static void * +allocate (unsigned length, void *arg) +{ + assert (arg == &counter); + return __builtin_malloc (length); +} + +int main() +{ + __asm__ volatile (".set my_info, .LPBX2"); + __gcov_info_to_gcda (my_info, filename, dump, allocate, &counter); + assert (counter > 4); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c index 60c53bc..3e7e572 100644 --- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c +++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O3" } */ +/* { dg-options "-O3 -march=armv8.2-a+bf16" } */ #include <arm_neon.h> @@ -95,6 +95,7 @@ TEST_STX (vst4q, int16x8x4_t, int16_t*, s16); TEST_STX (vst4q, uint16x8x4_t, uint16_t*, u16); TEST_STX (vst4q, poly16x8x4_t, poly16_t*, p16); TEST_STX (vst4q, float16x8x4_t, float16_t*, f16); +TEST_STX (vst4q, bfloat16x8x4_t, bfloat16_t*, bf16); TEST_STX (vst4q, int32x4x4_t, int32_t*, s32); TEST_STX (vst4q, uint32x4x4_t, uint32_t*, u32); TEST_STX (vst4q, float32x4x4_t, float32_t*, f32); @@ -110,6 +111,7 @@ TEST_STX (vst2q, int16x8x2_t, int16_t*, s16); TEST_STX (vst2q, uint16x8x2_t, uint16_t*, u16); TEST_STX (vst2q, poly16x8x2_t, poly16_t*, p16); TEST_STX (vst2q, float16x8x2_t, float16_t*, f16); +TEST_STX (vst2q, bfloat16x8x2_t, bfloat16_t*, bf16); TEST_STX (vst2q, int32x4x2_t, int32_t*, s32); TEST_STX (vst2q, uint32x4x2_t, uint32_t*, u32); TEST_STX (vst2q, float32x4x2_t, float32_t*, f32); @@ -131,6 +133,7 @@ TEST_ST3 (vst3q, int16x8x3_t, int16_t*, s16); TEST_ST3 (vst3q, uint16x8x3_t, uint16_t*, u16); TEST_ST3 (vst3q, poly16x8x3_t, poly16_t*, p16); TEST_ST3 (vst3q, float16x8x3_t, float16_t*, f16); +TEST_ST3 (vst3q, bfloat16x8x3_t, bfloat16_t*, bf16); TEST_ST3 (vst3q, int32x4x3_t, int32_t*, s32); TEST_ST3 (vst3q, uint32x4x3_t, uint32_t*, u32); TEST_ST3 (vst3q, float32x4x3_t, float32_t*, f32); @@ -139,6 +142,66 @@ TEST_ST3 (vst3q, uint64x2x3_t, uint64_t*, u64); TEST_ST3 (vst3q, float64x2x3_t, float64_t*, f64); TEST_ST3 (vst3q, poly64x2x3_t, poly64_t*, p64); +#define TEST_STX_LANE(name, tbltype, ptrtype, ts) \ + void test_ ## name ## _ ## ts (ptrtype a, tbltype b) \ + { \ + name ## _ ## ts (a, b, 1); \ + } + +TEST_STX_LANE (vst4q_lane, int8x16x4_t, int8_t*, s8); +TEST_STX_LANE (vst4q_lane, uint8x16x4_t, uint8_t*, u8); +TEST_STX_LANE (vst4q_lane, poly8x16x4_t, poly8_t*, p8); +TEST_STX_LANE (vst4q_lane, int16x8x4_t, int16_t*, s16); +TEST_STX_LANE (vst4q_lane, uint16x8x4_t, uint16_t*, u16); +TEST_STX_LANE (vst4q_lane, poly16x8x4_t, poly16_t*, p16); +TEST_STX_LANE (vst4q_lane, float16x8x4_t, float16_t*, f16); +TEST_STX_LANE (vst4q_lane, 
bfloat16x8x4_t, bfloat16_t*, bf16); +TEST_STX_LANE (vst4q_lane, int32x4x4_t, int32_t*, s32); +TEST_STX_LANE (vst4q_lane, uint32x4x4_t, uint32_t*, u32); +TEST_STX_LANE (vst4q_lane, float32x4x4_t, float32_t*, f32); +TEST_STX_LANE (vst4q_lane, int64x2x4_t, int64_t*, s64); +TEST_STX_LANE (vst4q_lane, uint64x2x4_t, uint64_t*, u64); +TEST_STX_LANE (vst4q_lane, float64x2x4_t, float64_t*, f64); +TEST_STX_LANE (vst4q_lane, poly64x2x4_t, poly64_t*, p64); + +TEST_STX_LANE (vst2q_lane, int8x16x2_t, int8_t*, s8); +TEST_STX_LANE (vst2q_lane, uint8x16x2_t, uint8_t*, u8); +TEST_STX_LANE (vst2q_lane, poly8x16x2_t, poly8_t*, p8); +TEST_STX_LANE (vst2q_lane, int16x8x2_t, int16_t*, s16); +TEST_STX_LANE (vst2q_lane, uint16x8x2_t, uint16_t*, u16); +TEST_STX_LANE (vst2q_lane, poly16x8x2_t, poly16_t*, p16); +TEST_STX_LANE (vst2q_lane, float16x8x2_t, float16_t*, f16); +TEST_STX_LANE (vst2q_lane, bfloat16x8x2_t, bfloat16_t*, bf16); +TEST_STX_LANE (vst2q_lane, int32x4x2_t, int32_t*, s32); +TEST_STX_LANE (vst2q_lane, uint32x4x2_t, uint32_t*, u32); +TEST_STX_LANE (vst2q_lane, float32x4x2_t, float32_t*, f32); +TEST_STX_LANE (vst2q_lane, int64x2x2_t, int64_t*, s64); +TEST_STX_LANE (vst2q_lane, uint64x2x2_t, uint64_t*, u64); +TEST_STX_LANE (vst2q_lane, float64x2x2_t, float64_t*, f64); +TEST_STX_LANE (vst2q_lane, poly64x2x2_t, poly64_t*, p64); + +#define TEST_ST3_LANE(name, tbltype, ptrtype, ts) \ + void test_ ## name ## _ ## ts (ptrtype a, int8x8_t dummy, tbltype b) \ + { \ + name ## _ ## ts (a, b, 1); \ + } + +TEST_ST3_LANE (vst3q_lane, int8x16x3_t, int8_t*, s8); +TEST_ST3_LANE (vst3q_lane, uint8x16x3_t, uint8_t*, u8); +TEST_ST3_LANE (vst3q_lane, poly8x16x3_t, poly8_t*, p8); +TEST_ST3_LANE (vst3q_lane, int16x8x3_t, int16_t*, s16); +TEST_ST3_LANE (vst3q_lane, uint16x8x3_t, uint16_t*, u16); +TEST_ST3_LANE (vst3q_lane, poly16x8x3_t, poly16_t*, p16); +TEST_ST3_LANE (vst3q_lane, float16x8x3_t, float16_t*, f16); +TEST_ST3_LANE (vst3q_lane, bfloat16x8x3_t, bfloat16_t*, bf16); +TEST_ST3_LANE (vst3q_lane, int32x4x3_t, int32_t*, s32); +TEST_ST3_LANE (vst3q_lane, uint32x4x3_t, uint32_t*, u32); +TEST_ST3_LANE (vst3q_lane, float32x4x3_t, float32_t*, f32); +TEST_ST3_LANE (vst3q_lane, int64x2x3_t, int64_t*, s64); +TEST_ST3_LANE (vst3q_lane, uint64x2x3_t, uint64_t*, u64); +TEST_ST3_LANE (vst3q_lane, float64x2x3_t, float64_t*, f64); +TEST_ST3_LANE (vst3q_lane, poly64x2x3_t, poly64_t*, p64); + #define TEST_ST1xN(name, tbltype, ptrtype, ts, xn) \ void test_ ## name ## _ ## ts ## _ ## xn (ptrtype a, tbltype b) \ { \ @@ -152,6 +215,7 @@ TEST_ST1xN (vst1q, int16x8x4_t, int16_t*, s16, x4); TEST_ST1xN (vst1q, uint16x8x4_t, uint16_t*, u16, x4); TEST_ST1xN (vst1q, poly16x8x4_t, poly16_t*, p16, x4); TEST_ST1xN (vst1q, float16x8x4_t, float16_t*, f16, x4); +TEST_ST1xN (vst1q, bfloat16x8x4_t, bfloat16_t*, bf16, x4); TEST_ST1xN (vst1q, int32x4x4_t, int32_t*, s32, x4); TEST_ST1xN (vst1q, uint32x4x4_t, uint32_t*, u32, x4); TEST_ST1xN (vst1q, float32x4x4_t, float32_t*, f32, x4); @@ -167,6 +231,7 @@ TEST_ST1xN (vst1q, int16x8x2_t, int16_t*, s16, x2); TEST_ST1xN (vst1q, uint16x8x2_t, uint16_t*, u16, x2); TEST_ST1xN (vst1q, poly16x8x2_t, poly16_t*, p16, x2); TEST_ST1xN (vst1q, float16x8x2_t, float16_t*, f16, x2); +TEST_ST1xN (vst1q, bfloat16x8x2_t, bfloat16_t*, bf16, x2); TEST_ST1xN (vst1q, int32x4x2_t, int32_t*, s32, x2); TEST_ST1xN (vst1q, uint32x4x2_t, uint32_t*, u32, x2); TEST_ST1xN (vst1q, float32x4x2_t, float32_t*, f32, x2); @@ -189,6 +254,7 @@ TEST_ST1x3 (vst1q, int16x8x3_t, int16_t*, s16, x3); TEST_ST1x3 (vst1q, uint16x8x3_t, uint16_t*, u16, x3); 
TEST_ST1x3 (vst1q, poly16x8x3_t, poly16_t*, p16, x3); TEST_ST1x3 (vst1q, float16x8x3_t, float16_t*, f16, x3); +TEST_ST1x3 (vst1q, bfloat16x8x3_t, bfloat16_t*, bf16, x3); TEST_ST1x3 (vst1q, int32x4x3_t, int32_t*, s32, x3); TEST_ST1x3 (vst1q, uint32x4x3_t, uint32_t*, u32, x3); TEST_ST1x3 (vst1q, float32x4x3_t, float32_t*, f32, x3); @@ -201,7 +267,7 @@ TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3); /* { dg-final { scan-assembler-times "tbl\\t" 18} } */ /* { dg-final { scan-assembler-times "tbx\\t" 18} } */ -/* { dg-final { scan-assembler-times "st4\\t" 14} } */ -/* { dg-final { scan-assembler-times "st3\\t" 14} } */ -/* { dg-final { scan-assembler-times "st2\\t" 14} } */ -/* { dg-final { scan-assembler-times "st1\\t" 42} } */ +/* { dg-final { scan-assembler-times "st4\\t" 30} } */ +/* { dg-final { scan-assembler-times "st3\\t" 30} } */ +/* { dg-final { scan-assembler-times "st2\\t" 30} } */ +/* { dg-final { scan-assembler-times "st1\\t" 45} } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pragma-optimize.c b/gcc/testsuite/gcc.target/powerpc/pragma-optimize.c new file mode 100644 index 0000000..e8ba63a --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pragma-optimize.c @@ -0,0 +1,13 @@ +/* { dg-options "-O2 -mlong-double-128 -mabi=ibmlongdouble" } */ + +extern unsigned long int x; +extern float f (float); +extern __typeof (f) f_power8; +extern __typeof (f) f_power9; +extern __typeof (f) f __attribute__ ((ifunc ("f_ifunc"))); +static __attribute__ ((optimize (1))) __typeof (f) * +f_ifunc (void) +{ + __typeof (f) *res = x ? f_power9 : f_power8; + return res; +} diff --git a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 b/gcc/testsuite/gfortran.dg/vect/vect-8.f90 index cc1aebf..c8a7d89 100644 --- a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 +++ b/gcc/testsuite/gfortran.dg/vect/vect-8.f90 @@ -704,7 +704,6 @@ CALL track('KERNEL ') RETURN END SUBROUTINE kernel -! { dg-final { scan-tree-dump-times "vectorized 24 loops" 1 "vect" { target aarch64_sve } } } -! { dg-final { scan-tree-dump-times "vectorized 23 loops" 1 "vect" { target { aarch64*-*-* && { ! aarch64_sve } } } } } +! { dg-final { scan-tree-dump-times "vectorized 24 loops" 1 "vect" { target aarch64*-*-* } } } ! { dg-final { scan-tree-dump-times "vectorized 2\[234\] loops" 1 "vect" { target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } } ! { dg-final { scan-tree-dump-times "vectorized 17 loops" 1 "vect" { target { { ! vect_intdouble_cvt } && { ! aarch64*-*-* } } } } } diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c index b6f7828..e061baa 100644 --- a/gcc/tree-data-ref.c +++ b/gcc/tree-data-ref.c @@ -404,8 +404,7 @@ print_dist_vectors (FILE *outf, vec<lambda_vector> dist_vects, /* Dump function for a DATA_DEPENDENCE_RELATION structure. */ DEBUG_FUNCTION void -dump_data_dependence_relation (FILE *outf, - struct data_dependence_relation *ddr) +dump_data_dependence_relation (FILE *outf, const data_dependence_relation *ddr) { struct data_reference *dra, *drb; @@ -479,7 +478,7 @@ dump_data_dependence_relation (FILE *outf, /* Debug version. */ DEBUG_FUNCTION void -debug_data_dependence_relation (struct data_dependence_relation *ddr) +debug_data_dependence_relation (const struct data_dependence_relation *ddr) { dump_data_dependence_relation (stderr, ddr); } @@ -487,10 +486,9 @@ debug_data_dependence_relation (struct data_dependence_relation *ddr) /* Dump into FILE all the dependence relations from DDRS. 
*/ DEBUG_FUNCTION void -dump_data_dependence_relations (FILE *file, - vec<ddr_p> ddrs) +dump_data_dependence_relations (FILE *file, const vec<ddr_p> &ddrs) { - for (data_dependence_relation *ddr : ddrs) + for (auto ddr : ddrs) dump_data_dependence_relation (file, ddr); } diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h index de45f25..685f33d 100644 --- a/gcc/tree-data-ref.h +++ b/gcc/tree-data-ref.h @@ -528,8 +528,8 @@ extern void debug_data_reference (struct data_reference *); extern void debug_data_references (vec<data_reference_p> ); extern void debug (vec<data_reference_p> &ref); extern void debug (vec<data_reference_p> *ptr); -extern void debug_data_dependence_relation (struct data_dependence_relation *); -extern void dump_data_dependence_relations (FILE *, vec<ddr_p> ); +extern void debug_data_dependence_relation (const data_dependence_relation *); +extern void dump_data_dependence_relations (FILE *, const vec<ddr_p> &); extern void debug (vec<ddr_p> &ref); extern void debug (vec<ddr_p> *ptr); extern void debug_data_dependence_relations (vec<ddr_p> ); diff --git a/gcc/tree-predcom.c b/gcc/tree-predcom.c index bed30d2..6b195d1 100644 --- a/gcc/tree-predcom.c +++ b/gcc/tree-predcom.c @@ -639,9 +639,8 @@ dump_chain (FILE *file, chain_p chain) /* Dumps CHAINS to FILE. */ -extern void dump_chains (FILE *, vec<chain_p> ); void -dump_chains (FILE *file, vec<chain_p> chains) +dump_chains (FILE *file, const vec<chain_p> &chains) { chain_p chain; unsigned i; @@ -2049,7 +2048,7 @@ finalize_eliminated_stores (class loop *loop, chain_p chain) static void initialize_root_vars_lm (class loop *loop, dref root, bool written, - vec<tree> *vars, vec<tree> inits, + vec<tree> *vars, const vec<tree> &inits, bitmap tmp_vars) { unsigned i; @@ -2324,7 +2323,7 @@ pcom_worker::execute_pred_commoning_chain (chain_p chain, optimized. */ static unsigned -determine_unroll_factor (vec<chain_p> chains) +determine_unroll_factor (const vec<chain_p> &chains) { chain_p chain; unsigned factor = 1, af, nfactor, i; @@ -2401,7 +2400,7 @@ pcom_worker::execute_pred_commoning (bitmap tmp_vars) phi node, record the ssa name that is defined by it. */ static void -replace_phis_by_defined_names (vec<chain_p> chains) +replace_phis_by_defined_names (vec<chain_p> &chains) { chain_p chain; dref a; @@ -3276,7 +3275,7 @@ pcom_worker::prepare_finalizers () /* Insert all initializing gimple stmts into LOOP's entry edge. 
*/ static void -insert_init_seqs (class loop *loop, vec<chain_p> chains) +insert_init_seqs (class loop *loop, vec<chain_p> &chains) { unsigned i; edge entry = loop_preheader_edge (loop); @@ -3387,7 +3386,7 @@ pcom_worker::tree_predictive_commoning_loop (bool allow_unroll_p) fprintf (dump_file, "Unrolling %u times.\n", unroll_factor); dta.tmp_vars = tmp_vars; - dta.chains = m_chains; + dta.chains = m_chains.to_vec_legacy (); dta.worker = this; /* Cfg manipulations performed in tree_transform_and_unroll_loop before diff --git a/gcc/tree-ssa-pre.c b/gcc/tree-ssa-pre.c index d2a7395..ebe95cc 100644 --- a/gcc/tree-ssa-pre.c +++ b/gcc/tree-ssa-pre.c @@ -3107,7 +3107,7 @@ create_expression_by_pieces (basic_block block, pre_expr expr, static bool insert_into_preds_of_block (basic_block block, unsigned int exprnum, - vec<pre_expr> avail) + vec<pre_expr> &avail) { pre_expr expr = expression_for_id (exprnum); pre_expr newphi; diff --git a/gcc/tree-ssa-threadbackward.c b/gcc/tree-ssa-threadbackward.c index 91ce443..e237eb4 100644 --- a/gcc/tree-ssa-threadbackward.c +++ b/gcc/tree-ssa-threadbackward.c @@ -92,7 +92,7 @@ public: private: void maybe_register_path (edge taken_edge); bool find_paths_to_names (basic_block bb, bitmap imports); - bool resolve_def (tree name, bitmap interesting, vec<tree> worklist); + bool resolve_def (tree name, bitmap interesting, vec<tree> &worklist); bool resolve_phi (gphi *phi, bitmap imports); edge find_taken_edge (const vec<basic_block> &path); edge find_taken_edge_cond (const vec<basic_block> &path, gcond *); @@ -240,7 +240,7 @@ back_threader::find_taken_edge_cond (const vec<basic_block> &path, // Populate a vector of trees from a bitmap. static inline void -populate_worklist (vec<tree> worklist, bitmap bits) +populate_worklist (vec<tree> &worklist, bitmap bits) { bitmap_iterator bi; unsigned i; @@ -317,7 +317,7 @@ back_threader::resolve_phi (gphi *phi, bitmap interesting) // current path to be constant, register the path, and return TRUE. 
bool -back_threader::resolve_def (tree name, bitmap interesting, vec<tree> worklist) +back_threader::resolve_def (tree name, bitmap interesting, vec<tree> &worklist) { gimple *def_stmt = SSA_NAME_DEF_STMT (name); diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index 17d24b4..d594c0a 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -212,7 +212,7 @@ vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo) static void vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value) { - vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo); + const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo); for (unsigned int i = 0; i < checks.length(); ++i) if (checks[i] == value) return; @@ -2349,8 +2349,8 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo) if (do_versioning) { - vec<stmt_vec_info> may_misalign_stmts - = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); + const vec<stmt_vec_info> &may_misalign_stmts + = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); stmt_vec_info stmt_info; /* It can now be assumed that the data references in the statements @@ -3364,7 +3364,8 @@ static void vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p, poly_uint64 min_value) { - vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); + vec<vec_lower_bound> &lower_bounds + = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); for (unsigned int i = 0; i < lower_bounds.length (); ++i) if (operand_equal_p (lower_bounds[i].expr, expr, 0)) { @@ -3466,10 +3467,10 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash; hash_set <tree_pair_hash> compared_objects; - vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); + const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); vec<dr_with_seg_len_pair_t> &comp_alias_ddrs = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); - vec<vec_object_pair> &check_unequal_addrs + const vec<vec_object_pair> &check_unequal_addrs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo); poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); @@ -5350,7 +5351,7 @@ vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, I4: 6 14 22 30 7 15 23 31. */ void -vect_permute_store_chain (vec_info *vinfo, vec<tree> dr_chain, +vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain, unsigned int length, stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c index ad209de..b8d09b7 100644 --- a/gcc/tree-vect-slp-patterns.c +++ b/gcc/tree-vect-slp-patterns.c @@ -746,7 +746,7 @@ vect_match_call_complex_mla (slp_tree node, unsigned child, of the negate node. 
*/ static inline bool -vect_normalize_conj_loc (vec<slp_tree> args, bool *neg_first_p = NULL) +vect_normalize_conj_loc (vec<slp_tree> &args, bool *neg_first_p = NULL) { gcc_assert (args.length () == 2); bool neg_found = false; diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index e6b81a0..94bdb74 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -4499,7 +4499,7 @@ static void vect_create_vectorized_demotion_stmts (vec_info *vinfo, vec<tree> *vec_oprnds, int multi_step_cvt, stmt_vec_info stmt_info, - vec<tree> vec_dsts, + vec<tree> &vec_dsts, gimple_stmt_iterator *gsi, slp_tree slp_node, enum tree_code code) { diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 686644b4..5571b3c 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1990,8 +1990,8 @@ extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT); extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool); extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT); extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool); -extern void vect_permute_store_chain (vec_info *, - vec<tree> ,unsigned int, stmt_vec_info, +extern void vect_permute_store_chain (vec_info *, vec<tree> &, + unsigned int, stmt_vec_info, gimple_stmt_iterator *, vec<tree> *); extern tree vect_setup_realignment (vec_info *, stmt_vec_info, gimple_stmt_iterator *, @@ -38,16 +38,6 @@ along with GCC; see the file COPYING3. If not see #include "diagnostic-core.h" #endif -/* vNULL is an empty type with a template cast operation that returns - a zero-initialized vec<T, A, L> instance. Use this when you want - to assign nil values to new vec instances or pass a nil vector as - a function call argument. - - We use this technique because vec<T, A, L> must be PODs (they are - stored in unions and passed in vararg functions), this means that - they cannot have ctors/dtors. */ -vnull vNULL; - /* Vector memory usage. */ class vec_usage: public mem_usage { @@ -282,6 +272,42 @@ safe_push_range (vec <int>&v, int start, int limit) v.safe_push (i); } +/* Verify forms of initialization. */ + +static void +test_init () +{ + { + vec<int> v1{ }; + ASSERT_EQ (0, v1.length ()); + + vec<int> v2 (v1); + ASSERT_EQ (0, v2.length ()); + } + + { + vec<int> v1 = vec<int>(); + ASSERT_EQ (0, v1.length ()); + + vec<int> v2 = v1; + ASSERT_EQ (0, v2.length ()); + } + + { + vec<int> v1 (vNULL); + ASSERT_EQ (0, v1.length ()); + v1.safe_push (1); + + vec<int> v2 (v1); + ASSERT_EQ (1, v1.length ()); + v2.safe_push (1); + + ASSERT_EQ (2, v1.length ()); + ASSERT_EQ (2, v2.length ()); + v1.release (); + } +} + /* Verify that vec::quick_push works correctly. */ static void @@ -547,6 +573,7 @@ test_auto_delete_vec () void vec_c_tests () { + test_init (); test_quick_push (); test_safe_push (); test_truncate (); @@ -541,18 +541,16 @@ vec_copy_construct (T *dst, const T *src, unsigned n) ::new (static_cast<void*>(dst)) T (*src); } -/* Type to provide NULL values for vec<T, A, L>. This is used to - provide nil initializers for vec instances. Since vec must be - a POD, we cannot have proper ctor/dtor for it. To initialize - a vec instance, you can assign it the value vNULL. This isn't - needed for file-scope and function-local static vectors, which - are zero-initialized by default. 
*/ -struct vnull -{ - template <typename T, typename A, typename L> - CONSTEXPR operator vec<T, A, L> () const { return vec<T, A, L>(); } -}; -extern vnull vNULL; +/* Type to provide zero-initialized values for vec<T, A, L>. This is + used to provide nil initializers for vec instances. Since vec must + be a trivially copyable type that can be copied by memcpy and zeroed + out by memset, it must have defaulted default and copy ctor and copy + assignment. To initialize a vec either use value initialization + (e.g., vec() or vec v{ };) or assign it the value vNULL. This isn't + needed for file-scope and function-local static vectors, which are + zero-initialized by default. */ +struct vnull { }; +constexpr vnull vNULL{ }; /* Embeddable vector. These vectors are suitable to be embedded @@ -1431,10 +1429,34 @@ gt_pch_nx (vec<T, A, vl_embed> *v, gt_pointer_operator op, void *cookie) As long as we use C++03, we cannot have constructors nor destructors in classes that are stored in unions. */ +template<typename T, size_t N = 0> +class auto_vec; + template<typename T> struct vec<T, va_heap, vl_ptr> { public: + /* Default ctors to ensure triviality. Use value-initialization + (e.g., vec() or vec v{ };) or vNULL to create a zero-initialized + instance. */ + vec () = default; + vec (const vec &) = default; + /* Initialization from the generic vNULL. */ + vec (vnull): m_vec () { } + /* Same as default ctor: vec storage must be released manually. */ + ~vec () = default; + + /* Defaulted same as copy ctor. */ + vec& operator= (const vec &) = default; + + /* Prevent implicit conversion from auto_vec. Use auto_vec::to_vec() + instead. */ + template <size_t N> + vec (auto_vec<T, N> &) = delete; + + template <size_t N> + void operator= (auto_vec<T, N> &) = delete; + /* Memory allocation and deallocation for the embedded vector. Needed because we cannot have proper ctors/dtors defined. */ void create (unsigned nelems CXX_MEM_STAT_INFO); @@ -1522,7 +1544,7 @@ public: want to ask for internal storage for vectors on the stack because if the size of the vector is larger than the internal storage that space is wasted. */ -template<typename T, size_t N = 0> +template<typename T, size_t N /* = 0 */> class auto_vec : public vec<T, va_heap> { public: @@ -1549,6 +1571,14 @@ public: this->release (); } + /* Explicitly convert to the base class. There is no conversion + from a const auto_vec because a copy of the returned vec can + be used to modify *THIS. + This is a legacy function not to be used in new code. */ + vec<T, va_heap> to_vec_legacy () { + return *static_cast<vec<T, va_heap> *>(this); + } + private: vec<T, va_heap, vl_embed> m_auto; T m_data[MAX (N - 1, 1)]; @@ -1602,6 +1632,14 @@ public: return *this; } + /* Explicitly convert to the base class. There is no conversion + from a const auto_vec because a copy of the returned vec can + be used to modify *THIS. + This is a legacy function not to be used in new code. */ + vec<T, va_heap> to_vec_legacy () { + return *static_cast<vec<T, va_heap> *>(this); + } + // You probably don't want to copy a vector, so these are deleted to prevent // unintentional use. If you really need a copy of the vectors contents you // can use copy (). @@ -1781,7 +1819,7 @@ template<typename T> inline vec<T, va_heap, vl_ptr> vec<T, va_heap, vl_ptr>::copy (ALONE_MEM_STAT_DECL) const { - vec<T, va_heap, vl_ptr> new_vec = vNULL; + vec<T, va_heap, vl_ptr> new_vec{ }; if (length ()) new_vec.m_vec = m_vec->copy (ALONE_PASS_MEM_STAT); return new_vec;
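
The vec.h hunks above make vec<T, va_heap, vl_ptr> trivially copyable and delete the implicit conversion from auto_vec. A minimal sketch of what this means for client code follows; it assumes GCC's internal vec.h environment (it does not compile standalone), and process_by_value, process_by_ref and sketch_vec_usage are hypothetical names that do not appear in the patch.

/* Sketch only: usage of the reworked vec/auto_vec interface, assuming
   GCC's vec.h.  The two consumers below are hypothetical.  */

static void process_by_value (vec<int>) { /* hypothetical by-value API */ }
static void process_by_ref (const vec<int> &) { /* hypothetical by-ref API */ }

static void
sketch_vec_usage (void)
{
  vec<int> v{ };       /* value initialization gives the zero state */
  vec<int> w = vNULL;  /* equivalent, via the vnull tag type */
  v.safe_push (1);
  w.safe_push (2);

  auto_vec<int, 8> a;
  a.safe_push (3);

  /* The implicit auto_vec -> vec conversion is deleted because a copy
     of the base vec can get out of sync with the auto_vec when it
     reallocates.  Callers either pass a reference ...  */
  process_by_ref (a);
  /* ... or spell the conversion out, as the tree-predcom.c and
     ipa-prop.h hunks above do with to_vec_legacy.  */
  process_by_value (a.to_vec_legacy ());

  /* Heap vec storage is still released manually; auto_vec releases its
     own storage in its destructor.  */
  v.release ();
  w.release ();
}

This also matches the vec<tree> &, vec<ddr_p> & and similar signature changes in dominance.c, tree-data-ref.c and the vectorizer above: passing by reference avoids the shallow copies that the old by-value signatures made, which is why those parameters were converted alongside the constructor changes.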