aboutsummaryrefslogtreecommitdiff
path: root/gcc/tree-vectorizer.h
AgeCommit message (Collapse)AuthorFilesLines
2023-08-24tree-optimization/111115 - SLP of masked storesRichard Biener1-0/+1
The following adds the capability to do SLP on .MASK_STORE, I do not plan to add interleaving support. PR tree-optimization/111115 gcc/ * tree-vectorizer.h (vect_slp_child_index_for_operand): New. * tree-vect-data-refs.cc (can_group_stmts_p): Also group .MASK_STORE. * tree-vect-slp.cc (arg3_arg2_map): New. (vect_get_operand_map): Handle IFN_MASK_STORE. (vect_slp_child_index_for_operand): New function. (vect_build_slp_tree_1): Handle statements with no LHS, masked store ifns. (vect_remove_slp_scalar_calls): Likewise. * tree-vect-stmts.cc (vect_check_store_rhs): Lookup the SLP child corresponding to the ifn value index. (vectorizable_store): Likewise for the mask index. Support masked stores. (vectorizable_load): Lookup the SLP child corresponding to the ifn mask index. gcc/testsuite/ * lib/target-supports.exp (check_effective_target_vect_masked_store): Supported with check_avx_available. * gcc.dg/vect/slp-mask-store-1.c: New testcase.
2023-08-16VECT: Apply MASK_LEN_{LOAD_LANES, STORE_LANES} into vectorizerJuzhe-Zhong1-2/+2
Hi, Richard and Richi. This patch is adding MASK_LEN_{LOAD_LANES,STORE_LANES} support into vectorizer. Consider this simple case: void __attribute__ ((noinline, noclone)) foo (int *__restrict a, int *__restrict b, int *__restrict c, int *__restrict d, int *__restrict e, int *__restrict f, int *__restrict g, int *__restrict h, int *__restrict j, int n) { for (int i = 0; i < n; ++i) { a[i] = j[i * 8]; b[i] = j[i * 8 + 1]; c[i] = j[i * 8 + 2]; d[i] = j[i * 8 + 3]; e[i] = j[i * 8 + 4]; f[i] = j[i * 8 + 5]; g[i] = j[i * 8 + 6]; h[i] = j[i * 8 + 7]; } } RVV Gimple IR: _79 = .SELECT_VL (ivtmp_81, POLY_INT_CST [4, 4]); ivtmp_125 = _79 * 32; vect_array.8 = .MASK_LEN_LOAD_LANES (vectp_j.6_124, 32B, { -1, ... }, _79, 0); vect__8.9_122 = vect_array.8[0]; vect__8.10_121 = vect_array.8[1]; vect__8.11_120 = vect_array.8[2]; vect__8.12_119 = vect_array.8[3]; vect__8.13_118 = vect_array.8[4]; vect__8.14_117 = vect_array.8[5]; vect__8.15_116 = vect_array.8[6]; vect__8.16_115 = vect_array.8[7]; vect_array.8 ={v} {CLOBBER}; ivtmp_114 = _79 * 4; .MASK_LEN_STORE (vectp_a.17_113, 32B, { -1, ... }, _79, 0, vect__8.9_122); .MASK_LEN_STORE (vectp_b.19_109, 32B, { -1, ... }, _79, 0, vect__8.10_121); .MASK_LEN_STORE (vectp_c.21_105, 32B, { -1, ... }, _79, 0, vect__8.11_120); .MASK_LEN_STORE (vectp_d.23_101, 32B, { -1, ... }, _79, 0, vect__8.12_119); .MASK_LEN_STORE (vectp_e.25_97, 32B, { -1, ... }, _79, 0, vect__8.13_118); .MASK_LEN_STORE (vectp_f.27_93, 32B, { -1, ... }, _79, 0, vect__8.14_117); .MASK_LEN_STORE (vectp_g.29_89, 32B, { -1, ... }, _79, 0, vect__8.15_116); .MASK_LEN_STORE (vectp_h.31_85, 32B, { -1, ... }, _79, 0, vect__8.16_115); ASM: foo: lw t4,8(sp) ld t5,0(sp) ble t4,zero,.L5 .L3: vsetvli t1,t4,e8,mf4,ta,ma vlseg8e32.v v8,(t5) slli t3,t1,2 slli t6,t1,5 vse32.v v8,0(a0) vse32.v v9,0(a1) vse32.v v10,0(a2) vse32.v v11,0(a3) vse32.v v12,0(a4) vse32.v v13,0(a5) vse32.v v14,0(a6) vse32.v v15,0(a7) sub t4,t4,t1 add t5,t5,t6 add a0,a0,t3 add a1,a1,t3 add a2,a2,t3 add a3,a3,t3 add a4,a4,t3 add a5,a5,t3 add a6,a6,t3 add a7,a7,t3 bne t4,zero,.L3 .L5: ret The details of the approach: Step 1 - Modifiy the LANES LOAD/STORE support function (vect_load_lanes_supported/vect_store_lanes_supported): +/* Return FN if vec_{masked_,mask_len,}load_lanes is available for COUNT + vectors of type VECTYPE. MASKED_P says whether the masked form is needed. */ -bool +internal_fn vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, bool masked_p) { - if (masked_p) - return vect_lanes_optab_supported_p ("vec_mask_load_lanes", - vec_mask_load_lanes_optab, - vectype, count); + if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes", + vec_mask_len_load_lanes_optab, + vectype, count)) + return IFN_MASK_LEN_LOAD_LANES; + else if (masked_p) + { + if (vect_lanes_optab_supported_p ("vec_mask_load_lanes", + vec_mask_load_lanes_optab, + vectype, count)) + return IFN_MASK_LOAD_LANES; + } else - return vect_lanes_optab_supported_p ("vec_load_lanes", - vec_load_lanes_optab, - vectype, count); + { + if (vect_lanes_optab_supported_p ("vec_load_lanes", + vec_load_lanes_optab, + vectype, count)) + return IFN_LOAD_LANES; + } + return IFN_LAST; } Instead of returning TRUE or FALSE whether target support the LANES LOAD/STORE. I change it into return internal_fn of the LANES LOAD/STORE that target support, If target didn't support any LANE LOAD/STORE optabs, return IFN_LAST. Step 2 - Compute IFN for LANES LOAD/STORE (Only compute once). if (!STMT_VINFO_STRIDED_P (first_stmt_info) && (can_overrun_p || !would_overrun_p) && compare_step_with_zero (vinfo, stmt_info) > 0) { /* First cope with the degenerate case of a single-element vector. */ if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)) ; else { /* Otherwise try using LOAD/STORE_LANES. */ *lanes_ifn = vls_type == VLS_LOAD ? vect_load_lanes_supported (vectype, group_size, masked_p) : vect_store_lanes_supported (vectype, group_size, masked_p); if (*lanes_ifn != IFN_LAST) { *memory_access_type = VMAT_LOAD_STORE_LANES; overrun_p = would_overrun_p; } /* If that fails, try using permuting loads. */ else if (vls_type == VLS_LOAD ? vect_grouped_load_supported (vectype, single_element_p, group_size) : vect_grouped_store_supported (vectype, group_size)) { *memory_access_type = VMAT_CONTIGUOUS_PERMUTE; overrun_p = would_overrun_p; } } } Step 3 - Build MASK_LEN_{LANES_LOAD,LANES_STORE} Gimple IR: + if (lanes_ifn == IFN_MASK_LEN_STORE_LANES) + { + if (loop_lens) + final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens, + ncopies, vectype, j, 1); + else + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype)); + signed char biasval + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); + bias = build_int_cst (intQI_type_node, biasval); + if (!final_mask) + { + mask_vectype = truth_type_for (vectype); + final_mask = build_minus_one_cst (mask_vectype); + } + } + gcall *call; - if (final_mask) + if (final_len && final_mask) + { + /* Emit: + MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK, + LEN, BIAS, VEC_ARRAY). */ + unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype)); + tree alias_ptr = build_int_cst (ref_type, align); + call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6, + dataref_ptr, alias_ptr, + final_mask, final_len, bias, + vec_array); + } + else if (final_mask) The LEN and MASK flow is totally the same as other MASK_LEN_* load/store. gcc/ChangeLog: * internal-fn.cc (internal_load_fn_p): Apply MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer. (internal_store_fn_p): Ditto. (internal_fn_len_index): Ditto. (internal_fn_mask_index): Ditto. (internal_fn_stored_value_index): Ditto. * tree-vect-data-refs.cc (vect_store_lanes_supported): Ditto. (vect_load_lanes_supported): Ditto. * tree-vect-loop.cc: Ditto. * tree-vect-slp.cc (vect_slp_prefer_store_lanes_p): Ditto. * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto. (get_group_load_store_type): Ditto. (vectorizable_store): Ditto. (vectorizable_load): Ditto. * tree-vectorizer.h (vect_store_lanes_supported): Ditto. (vect_load_lanes_supported): Ditto.
2023-08-15Support constants and externals in BB reduction vectorizationRichard Biener1-4/+5
The following supports vectorizing BB reductions involving a constant or an invariant. * tree-vectorizer.h (_slp_instance::remain_stmts): Change to ... (_slp_instance::remain_defs): ... this. (SLP_INSTANCE_REMAIN_STMTS): Rename to ... (SLP_INSTANCE_REMAIN_DEFS): ... this. (slp_root::remain): New. (slp_root::slp_root): Adjust. * tree-vect-slp.cc (vect_free_slp_instance): Adjust. (vect_build_slp_instance): Get extra remain parameter, adjust former handling of a cut off stmt. (vect_analyze_slp_instance): Adjust. (vect_analyze_slp): Likewise. (_bb_vec_info::~_bb_vec_info): Likewise. (vectorizable_bb_reduc_epilogue): Dump something if we fail. (vect_slp_check_for_constructors): Handle non-internal defs as remain defs of a reduction. (vectorize_slp_instance_root_stmt): Adjust. * gcc.dg/vect/bb-slp-75.c: New testcase.
2023-08-10Remove insert location argument from vectorizable_live_operationRichard Biener1-2/+1
The insert location argument isn't actually used but we compute that ourselves. There's a single spot, namely when asking for the loop mask via vect_get_loop_mask that the passed argument is used but that looks like an oversight. The following fixes that and adjusts vectorizable_live_operation and can_vectorize_live_stmts to no longer take a stmt iterator argument. * tree-vectorizer.h (vectorizable_live_operation): Remove gimple_stmt_iterator * argument. * tree-vect-loop.cc (vectorizable_live_operation): Likewise. Adjust plumbing around vect_get_loop_mask. (vect_analyze_loop_operations): Adjust. * tree-vect-slp.cc (vect_slp_analyze_node_operations_1): Likewise. (vect_bb_slp_mark_live_stmts): Likewise. (vect_schedule_slp_node): Likewise. * tree-vect-stmts.cc (can_vectorize_live_stmts): Likewise. Remove gimple_stmt_iterator * argument. (vect_transform_stmt): Adjust.
2023-08-08tree-optimization/49955 - BB reduction with odd number of lanesRichard Biener1-0/+5
The following enhances BB reduction vectorization to support vectorizing only a subset of the lanes, keeping the rest as scalar ops. For now we try to make the number of lanes even by leaving alone the "last" lane. That's because SLP discovery with all lanes will fail too soon to get us any hint on which lane to strip and likewise we don't know what vector modes the target supports so restricting ourselves to power-of-two or other cases isn't easy. This is enough to get at the vectorization opportunity for the testcase in the PR - albeit with the chosen lanes not optimal but at least vectorizable. PR tree-optimization/49955 * tree-vectorizer.h (_slp_instance::remain_stmts): New. (SLP_INSTANCE_REMAIN_STMTS): Likewise. * tree-vect-slp.cc (vect_free_slp_instance): Release SLP_INSTANCE_REMAIN_STMTS. (vect_build_slp_instance): Make the number of lanes of a BB reduction even. (vectorize_slp_instance_root_stmt): Handle unvectorized defs of a BB reduction. * gfortran.dg/vect/pr49955.f: New testcase.
2023-07-24Remove SLP_TREE_VEC_STMTS in favor of SLP_TREE_VEC_DEFSRichard Biener1-3/+4
The following unifies SLP_TREE_VEC_STMTS into SLP_TREE_VEC_DEFS which can handle all cases we need. * tree-vectorizer.h (_slp_tree::push_vec_def): Add. (_slp_tree::vec_stmts): Remove. (SLP_TREE_VEC_STMTS): Remove. * tree-vect-slp.cc (_slp_tree::push_vec_def): Define. (_slp_tree::_slp_tree): Adjust. (_slp_tree::~_slp_tree): Likewise. (vect_get_slp_vect_def): Simplify. (vect_get_slp_defs): Likewise. (vect_transform_slp_perm_load_1): Adjust. (vect_add_slp_permutation): Likewise. (vect_schedule_slp_node): Likewise. (vectorize_slp_instance_root_stmt): Likewise. (vect_schedule_scc): Likewise. * tree-vect-stmts.cc (vectorizable_bswap): Use push_vec_def. (vectorizable_call): Likewise. (vectorizable_call): Likewise. (vect_create_vectorized_demotion_stmts): Likewise. (vectorizable_conversion): Likewise. (vectorizable_assignment): Likewise. (vectorizable_shift): Likewise. (vectorizable_operation): Likewise. (vectorizable_load): Likewise. (vectorizable_condition): Likewise. (vectorizable_comparison): Likewise. * tree-vect-loop.cc (vect_create_epilog_for_reduction): Adjust. (vectorize_fold_left_reduction): Use push_vec_def. (vect_transform_reduction): Likewise. (vect_transform_cycle_phi): Likewise. (vectorizable_lc_phi): Likewise. (vectorizable_phi): Likewise. (vectorizable_recurr): Likewise. (vectorizable_induction): Likewise. (vectorizable_live_operation): Likewise.
2023-07-06tree-optimization/110563 - simplify epilogue VF checksRichard Biener1-2/+1
The following consolidates an assert that now hits for ppc64le with an earlier check we already do, simplifying vect_determine_partial_vectors_and_peeling and getting rid of its now redundant argument. PR tree-optimization/110563 * tree-vectorizer.h (vect_determine_partial_vectors_and_peeling): Remove second argument. * tree-vect-loop.cc (vect_determine_partial_vectors_and_peeling): Remove for_epilogue_p argument. Merge assert ... (vect_analyze_loop_2): ... with check done before determining partial vectors by moving it after. * tree-vect-loop-manip.cc (vect_do_peeling): Adjust.
2023-06-19AVX512 fully masked vectorizationRichard Biener1-3/+32
This implemens fully masked vectorization or a masked epilog for AVX512 style masks which single themselves out by representing each lane with a single bit and by using integer modes for the mask (both is much like GCN). AVX512 is also special in that it doesn't have any instruction to compute the mask from a scalar IV like SVE has with while_ult. Instead the masks are produced by vector compares and the loop control retains the scalar IV (mainly to avoid dependences on mask generation, a suitable mask test instruction is available). Like RVV code generation prefers a decrementing IV though IVOPTs messes things up in some cases removing that IV to eliminate it with an incrementing one used for address generation. One of the motivating testcases is from PR108410 which in turn is extracted from x264 where large size vectorization shows issues with small trip loops. Execution time there improves compared to classic AVX512 with AVX2 epilogues for the cases of less than 32 iterations. size scalar 128 256 512 512e 512f 1 9.42 11.32 9.35 11.17 15.13 16.89 2 5.72 6.53 6.66 6.66 7.62 8.56 3 4.49 5.10 5.10 5.74 5.08 5.73 4 4.10 4.33 4.29 5.21 3.79 4.25 6 3.78 3.85 3.86 4.76 2.54 2.85 8 3.64 1.89 3.76 4.50 1.92 2.16 12 3.56 2.21 3.75 4.26 1.26 1.42 16 3.36 0.83 1.06 4.16 0.95 1.07 20 3.39 1.42 1.33 4.07 0.75 0.85 24 3.23 0.66 1.72 4.22 0.62 0.70 28 3.18 1.09 2.04 4.20 0.54 0.61 32 3.16 0.47 0.41 0.41 0.47 0.53 34 3.16 0.67 0.61 0.56 0.44 0.50 38 3.19 0.95 0.95 0.82 0.40 0.45 42 3.09 0.58 1.21 1.13 0.36 0.40 'size' specifies the number of actual iterations, 512e is for a masked epilog and 512f for the fully masked loop. From 4 scalar iterations on the AVX512 masked epilog code is clearly the winner, the fully masked variant is clearly worse and it's size benefit is also tiny. This patch does not enable using fully masked loops or masked epilogues by default. More work on cost modeling and vectorization kind selection on x86_64 is necessary for this. Implementation wise this introduces LOOP_VINFO_PARTIAL_VECTORS_STYLE which could be exploited further to unify some of the flags we have right now but there didn't seem to be many easy things to merge, so I'm leaving this for followups. Mask requirements as registered by vect_record_loop_mask are kept in their original form and recorded in a hash_set now instead of being processed to a vector of rgroup_controls. Instead that's now left to the final analysis phase which tries forming the rgroup_controls vector using while_ult and if that fails now tries AVX512 style which needs a different organization and instead fills a hash_map with the relevant info. vect_get_loop_mask now has two implementations, one for the two mask styles we then have. I have decided against interweaving vect_set_loop_condition_partial_vectors with conditions to do AVX512 style masking and instead opted to "duplicate" this to vect_set_loop_condition_partial_vectors_avx512. Likewise for vect_verify_full_masking vs vect_verify_full_masking_avx512. The vect_prepare_for_masked_peels hunk might run into issues with SVE, I didn't check yet but using LOOP_VINFO_RGROUP_COMPARE_TYPE looked odd. Bootstrapped and tested on x86_64-unknown-linux-gnu. I've run the testsuite with --param vect-partial-vector-usage=2 with and without -fno-vect-cost-model and filed two bugs, one ICE (PR110221) and one latent wrong-code (PR110237). * tree-vectorizer.h (enum vect_partial_vector_style): New. (_loop_vec_info::partial_vector_style): Likewise. (LOOP_VINFO_PARTIAL_VECTORS_STYLE): Likewise. (rgroup_controls::compare_type): Add. (vec_loop_masks): Change from a typedef to auto_vec<> to a structure. * tree-vect-loop-manip.cc (vect_set_loop_condition_partial_vectors): Adjust. Convert niters_skip to compare_type. (vect_set_loop_condition_partial_vectors_avx512): New function implementing the AVX512 partial vector codegen. (vect_set_loop_condition): Dispatch to the correct vect_set_loop_condition_partial_vectors_* function based on LOOP_VINFO_PARTIAL_VECTORS_STYLE. (vect_prepare_for_masked_peels): Compute LOOP_VINFO_MASK_SKIP_NITERS in the original niter type. * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize partial_vector_style. (can_produce_all_loop_masks_p): Adjust. (vect_verify_full_masking): Produce the rgroup_controls vector here. Set LOOP_VINFO_PARTIAL_VECTORS_STYLE on success. (vect_verify_full_masking_avx512): New function implementing verification of AVX512 style masking. (vect_verify_loop_lens): Set LOOP_VINFO_PARTIAL_VECTORS_STYLE. (vect_analyze_loop_2): Also try AVX512 style masking. Adjust condition. (vect_estimate_min_profitable_iters): Implement AVX512 style mask producing cost. (vect_record_loop_mask): Do not build the rgroup_controls vector here but record masks in a hash-set. (vect_get_loop_mask): Implement AVX512 style mask query, complementing the existing while_ult style.
2023-06-19Add loop_vinfo argument to vect_get_loop_maskRichard Biener1-1/+2
This adds a loop_vinfo argument for future use, making the next patch smaller. * tree-vectorizer.h (vect_get_loop_mask): Add loop_vec_info argument. * tree-vect-loop.cc (vect_get_loop_mask): Likewise. (vectorize_fold_left_reduction): Adjust. (vect_transform_reduction): Likewise. (vectorizable_live_operation): Likewise. * tree-vect-stmts.cc (vectorizable_call): Likewise. (vectorizable_operation): Likewise. (vectorizable_store): Likewise. (vectorizable_load): Likewise. (vectorizable_condition): Likewise.
2023-06-10VECT: Add SELECT_VL supportJu-Zhe Zhong1-0/+6
This patch address comments from Richard && Richi and rebase to trunk. This patch is adding SELECT_VL middle-end support allow target have target dependent optimization in case of length calculation. This patch is inspired by RVV ISA and LLVM: https://reviews.llvm.org/D99750 The SELECT_VL is same behavior as LLVM "get_vector_length" with these following properties: 1. Only apply on single-rgroup. 2. non SLP. 3. adjust loop control IV. 4. adjust data reference IV. 5. allow non-vf elements processing in non-final iteration Code # void vvaddint32(size_t n, const int*x, const int*y, int*z) # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } } Take RVV codegen for example: Before this patch: vvaddint32: ble a0,zero,.L6 csrr a4,vlenb srli a6,a4,2 .L4: mv a5,a0 bleu a0,a6,.L3 mv a5,a6 .L3: vsetvli zero,a5,e32,m1,ta,ma vle32.v v2,0(a1) vle32.v v1,0(a2) vsetvli a7,zero,e32,m1,ta,ma sub a0,a0,a5 vadd.vv v1,v1,v2 vsetvli zero,a5,e32,m1,ta,ma vse32.v v1,0(a3) add a2,a2,a4 add a3,a3,a4 add a1,a1,a4 bne a0,zero,.L4 .L6: ret After this patch: vvaddint32: vsetvli t0, a0, e32, ta, ma # Set vector length based on 32-bit vectors vle32.v v0, (a1) # Get first vector sub a0, a0, t0 # Decrement number done slli t0, t0, 2 # Multiply number done by 4 bytes add a1, a1, t0 # Bump pointer vle32.v v1, (a2) # Get second vector add a2, a2, t0 # Bump pointer vadd.vv v2, v0, v1 # Sum vectors vse32.v v2, (a3) # Store result add a3, a3, t0 # Bump pointer bnez a0, vvaddint32 # Loop back ret # Finished Co-authored-by: Richard Sandiford<richard.sandiford@arm.com> Co-authored-by: Richard Biener <rguenther@suse.de> gcc/ChangeLog: * doc/md.texi: Add SELECT_VL support. * internal-fn.def (SELECT_VL): Ditto. * optabs.def (OPTAB_D): Ditto. * tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto. * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Ditto. * tree-vect-stmts.cc (get_select_vl_data_ref_ptr): Ditto. (vectorizable_store): Ditto. (vectorizable_load): Ditto. * tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): Ditto.
2023-06-05vect: Refactor to allow internal_fn'sAndre Vieira1-7/+9
Refactor vect-patterns to allow patterns to be internal_fns starting with widening_plus/minus patterns 2023-06-05 Andre Vieira <andre.simoesdiasvieira@arm.com> Joel Hutton <joel.hutton@arm.com> gcc/ChangeLog: * tree-vect-patterns.cc: Add include for gimple-iterator. (vect_recog_widen_op_pattern): Refactor to use code_helper. (vect_gimple_build): New function. * tree-vect-stmts.cc (simple_integer_narrowing): Refactor to use code_helper. (vectorizable_call): Likewise. (vect_gen_widened_results_half): Likewise. (vect_create_vectorized_demotion_stmts): Likewise. (vect_create_vectorized_promotion_stmts): Likewise. (vect_create_half_widening_stmts): Likewise. (vectorizable_conversion): Likewise. (supportable_widening_operation): Likewise. (supportable_narrowing_operation): Likewise. * tree-vectorizer.h (supportable_widening_operation): Change prototype to use code_helper. (supportable_narrowing_operation): Likewise. (vect_gimple_build): New function prototype. * tree.h (code_helper::safe_as_tree_code): New function. (code_helper::safe_as_fn_code): New function.
2023-05-31Enhance NARROW FLOAT_EXPR vectorization by truncating integer to lower ↵liuhongt1-0/+1
precision. Similar like WIDEN FLOAT_EXPR, when direct_optab is not existed, try intermediate integer type whenever gimple ranger can tell it's safe. .i.e. When there's no direct optab for vector long long -> vector float, but the value range of integer can be represented as int, try vector int -> vector float if availble. gcc/ChangeLog: PR tree-optimization/108804 * tree-vect-patterns.cc (vect_get_range_info): Remove static. * tree-vect-stmts.cc (vect_create_vectorized_demotion_stmts): Add new parameter narrow_src_p. (vectorizable_conversion): Enhance NARROW FLOAT_EXPR vectorization by truncating to lower precision. * tree-vectorizer.h (vect_get_range_info): New declare. gcc/testsuite/ChangeLog: * gcc.target/i386/pr108804.c: New test.
2023-05-25VECT: Add decrement IV iteration loop control by variable amount supportJu-Zhe Zhong1-0/+8
This patch is supporting decrement IV by following the flow designed by Richard: (1) In vect_set_loop_condition_partial_vectors, for the first iteration of: call vect_set_loop_controls_directly. (2) vect_set_loop_controls_directly calculates "step" as in your patch. If rgc has 1 control, this step is the SSA name created for that control. Otherwise the step is a fresh SSA name, as in your patch. (3) vect_set_loop_controls_directly stores this step somewhere for later use, probably in LOOP_VINFO. Let's use "S" to refer to this stored step. (4) After the vect_set_loop_controls_directly call above, and outside the "if" statement that now contains vect_set_loop_controls_directly, check whether rgc->controls.length () > 1. If so, use vect_adjust_loop_lens_control to set the controls based on S. Then the only caller of vect_adjust_loop_lens_control is vect_set_loop_condition_partial_vectors. And the starting step for vect_adjust_loop_lens_control is always S. This patch has well tested for single-rgroup and multiple-rgroup (SLP) and passed all testcase in RISC-V port. Signed-off-by: Ju-Zhe Zhong <juzhe.zhong@rivai.ai> Co-Authored-By: Richard Sandiford <richard.sandiford@arm.com> gcc/ChangeLog: * tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New function. (vect_set_loop_controls_directly): Add decrement IV support. (vect_set_loop_condition_partial_vectors): Ditto. * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): New variable. * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-3.c: New test. * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-4.c: New test. * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-3.c: New test. * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-4.c: New test.
2023-05-22VECT: Fix bug of multiple-rgroup for length is counting elementsJu-Zhe Zhong1-2/+3
Address comments from Richard that splits the patch of fixing multiple-rgroup handling of length counting elements. This patch is fixing issue of handling multiple-rgroup of length is counting elements Before this patch, multiple rgroup run fail: FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c execution test FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c execution test FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c execution test FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c execution test FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c execution test FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c execution test FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test After this patch, These tests are all passed. gcc/ChangeLog: * tree-vect-loop.cc (vect_get_loop_len): Fix issue for multiple-rgroup of length. * tree-vect-stmts.cc (vectorizable_store): Ditto. (vectorizable_load): Ditto. * tree-vectorizer.h (vect_get_loop_len): Ditto. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.c: New test. * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.h: New test. * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.c: New test. * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.h: New test. * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c: New test. * gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c: New test.
2023-02-16don't declare header-defined functions both static and inlinePatrick Palka1-32/+32
Many functions defined in our headers are declared 'static inline' which is a C idiom whose use predates our move to C++ as the implementation language. But in C++ the inline keyword is more than just a compiler hint, and is sufficient to give the function the intended semantics. In fact declaring a function both static and inline is a pessimization since static effectively disables the desired definition merging behavior enabled by inline, and is also a source of (harmless) ODR violations when a static inline function gets called from a non-static inline one (such as tree_operand_check calling tree_operand_length). This patch mechanically fixes the vast majority of occurrences of this anti-pattern throughout the compiler's headers via the command line sed -i 's/^static inline/inline/g' gcc/*.h gcc/*/*.h There's also a manual change to remove the redundant declarations of is_ivar and lookup_category in gcc/objc/objc-act.cc which would otherwise conflict with their modified definitions in objc-act.h (due to the difference in staticness). Besides fixing some ODR violations, this speeds up stage1 cc1plus by about 2% and reduces the size of its text segment by 1.5MB. gcc/ChangeLog: * addresses.h: Mechanically drop 'static' from 'static inline' functions via s/^static inline/inline/g. * asan.h: Likewise. * attribs.h: Likewise. * basic-block.h: Likewise. * bitmap.h: Likewise. * cfghooks.h: Likewise. * cfgloop.h: Likewise. * cgraph.h: Likewise. * cselib.h: Likewise. * data-streamer.h: Likewise. * debug.h: Likewise. * df.h: Likewise. * diagnostic.h: Likewise. * dominance.h: Likewise. * dumpfile.h: Likewise. * emit-rtl.h: Likewise. * except.h: Likewise. * expmed.h: Likewise. * expr.h: Likewise. * fixed-value.h: Likewise. * gengtype.h: Likewise. * gimple-expr.h: Likewise. * gimple-iterator.h: Likewise. * gimple-predict.h: Likewise. * gimple-range-fold.h: Likewise. * gimple-ssa.h: Likewise. * gimple.h: Likewise. * graphite.h: Likewise. * hard-reg-set.h: Likewise. * hash-map.h: Likewise. * hash-set.h: Likewise. * hash-table.h: Likewise. * hwint.h: Likewise. * input.h: Likewise. * insn-addr.h: Likewise. * internal-fn.h: Likewise. * ipa-fnsummary.h: Likewise. * ipa-icf-gimple.h: Likewise. * ipa-inline.h: Likewise. * ipa-modref.h: Likewise. * ipa-prop.h: Likewise. * ira-int.h: Likewise. * ira.h: Likewise. * lra-int.h: Likewise. * lra.h: Likewise. * lto-streamer.h: Likewise. * memmodel.h: Likewise. * omp-general.h: Likewise. * optabs-query.h: Likewise. * optabs.h: Likewise. * plugin.h: Likewise. * pretty-print.h: Likewise. * range.h: Likewise. * read-md.h: Likewise. * recog.h: Likewise. * regs.h: Likewise. * rtl-iter.h: Likewise. * rtl.h: Likewise. * sbitmap.h: Likewise. * sched-int.h: Likewise. * sel-sched-ir.h: Likewise. * sese.h: Likewise. * sparseset.h: Likewise. * ssa-iterators.h: Likewise. * system.h: Likewise. * target-globals.h: Likewise. * target.h: Likewise. * timevar.h: Likewise. * tree-chrec.h: Likewise. * tree-data-ref.h: Likewise. * tree-iterator.h: Likewise. * tree-outof-ssa.h: Likewise. * tree-phinodes.h: Likewise. * tree-scalar-evolution.h: Likewise. * tree-sra.h: Likewise. * tree-ssa-alias.h: Likewise. * tree-ssa-live.h: Likewise. * tree-ssa-loop-manip.h: Likewise. * tree-ssa-loop.h: Likewise. * tree-ssa-operands.h: Likewise. * tree-ssa-propagate.h: Likewise. * tree-ssa-sccvn.h: Likewise. * tree-ssa.h: Likewise. * tree-ssanames.h: Likewise. * tree-streamer.h: Likewise. * tree-switch-conversion.h: Likewise. * tree-vectorizer.h: Likewise. * tree.h: Likewise. * wide-int.h: Likewise. gcc/c-family/ChangeLog: * c-common.h: Mechanically drop static from static inline functions via s/^static inline/inline/g. gcc/c/ChangeLog: * c-parser.h: Mechanically drop static from static inline functions via s/^static inline/inline/g. gcc/cp/ChangeLog: * cp-tree.h: Mechanically drop static from static inline functions via s/^static inline/inline/g. gcc/fortran/ChangeLog: * gfortran.h: Mechanically drop static from static inline functions via s/^static inline/inline/g. gcc/jit/ChangeLog: * jit-dejagnu.h: Mechanically drop static from static inline functions via s/^static inline/inline/g. * jit-recording.h: Likewise. gcc/objc/ChangeLog: * objc-act.h: Mechanically drop static from static inline functions via s/^static inline/inline/g. * objc-map.h: Likewise. * objc-act.cc: Remove the redundant redeclarations of is_ivar and lookup_category.
2023-02-02Don't peel nonlinear iv(mult or shift) for epilog when vf is not constant.liuhongt1-3/+0
Normally when vf is not constant, it will be prevented by vectorizable_nonlinear_inductions, but for this case, it failed going into if (STMT_VINFO_RELEVANT_P (stmt_info)) { need_to_vectorize = true; if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def && ! PURE_SLP_STMT (stmt_info)) ok = vectorizable_induction (loop_vinfo, stmt_info, NULL, NULL, &cost_vec); since the iv is never used outside of the loop, and will be dce later, so vectorizer doesn't bother checking if it's vectorizable. it's true but hit gcc_assert in vect_can_peel_nonlinear_iv_p when vf is not constant. One solution is ignoring the nonlinear iv peeling if it's !STMT_VINFO_RELEVANT_P (stmt_info) just like the upper code, the other solution is returning false earlier in the vect_can_peel_nonlinear_iv_p when vf is not constant, the patch chooses the second incase there's other cases using vect_can_advance_ivs_p which calls vect_can_peel_nonlinear_iv_p. Also remove vect_peel_nonlinear_iv_p from vectorizable_nonlinear_inductions. gcc/ChangeLog: PR tree-optimization/108601 * tree-vectorizer.h (vect_can_peel_nonlinear_iv_p): Removed. * tree-vect-loop.cc (vectorizable_nonlinear_induction): Remove vect_can_peel_nonlinear_iv_p. (vect_can_peel_nonlinear_iv_p): Don't peel nonlinear iv(mult or shift) for epilog when vf is not constant and moved the defination to .. * tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p): .. Here. gcc/testsuite/ChangeLog: * gcc.target/aarch64/pr108601.c: New test.
2023-01-02Update copyright years.Jakub Jelinek1-1/+1
2022-10-17Vectorization of first-order recurrencesRichard Biener1-0/+4
The following picks up the prototype by Ju-Zhe Zhong for vectorizing first order recurrences. That solves two TSVC missed optimization PRs. There's a new scalar cycle def kind, vect_first_order_recurrence and it's handling of the backedge value vectorization is complicated by the fact that the vectorized value isn't the PHI but instead a (series of) permute(s) shifting in the recurring value from the previous iteration. I've implemented this by creating both the single vectorized PHI and the series of permutes when vectorizing the scalar PHI but leave the backedge values in both unassigned. The backedge values are (for the testcases) computed by a load which is also the place after which the permutes are inserted. That placement also restricts the cases we can handle (without resorting to code motion). I added both costing and SLP handling though SLP handling is restricted to the case where a single vectorized PHI is enough. Missing is epilogue handling - while prologue peeling would be handled transparently by adjusting iv_phi_p the epilogue case doesn't work with just inserting a scalar LC PHI since that a) keeps the scalar load live and b) that loads is the wrong one, it has to be the last, much like when we'd vectorize the LC PHI as live operation. Unfortunately LIVE compute/analysis happens too early before we decide on peeling. When using fully masked loop vectorization the vect-recurr-6.c works as expected though. I have tested this on x86_64 for now, but since epilogue handling is missing there's probably no practical cases. My prototype WHILE_ULT AVX512 patch can handle vect-recurr-6.c just fine but I didn't feel like running SPEC within SDE nor is the WHILE_ULT patch complete enough. PR tree-optimization/99409 PR tree-optimization/99394 * tree-vectorizer.h (vect_def_type::vect_first_order_recurrence): Add. (stmt_vec_info_type::recurr_info_type): Likewise. (vectorizable_recurr): New function. * tree-vect-loop.cc (vect_phi_first_order_recurrence_p): New function. (vect_analyze_scalar_cycles_1): Look for first order recurrences. (vect_analyze_loop_operations): Handle them. (vect_transform_loop): Likewise. (vectorizable_recurr): New function. (maybe_set_vectorized_backedge_value): Handle the backedge value setting in the first order recurrence PHI and the permutes. * tree-vect-stmts.cc (vect_analyze_stmt): Handle first order recurrences. (vect_transform_stmt): Likewise. (vect_is_simple_use): Likewise. (vect_is_simple_use): Likewise. * tree-vect-slp.cc (vect_get_and_check_slp_defs): Likewise. (vect_build_slp_tree_2): Likewise. (vect_schedule_scc): Handle the backedge value setting in the first order recurrence PHI and the permutes. * gcc.dg/vect/vect-recurr-1.c: New testcase. * gcc.dg/vect/vect-recurr-2.c: Likewise. * gcc.dg/vect/vect-recurr-3.c: Likewise. * gcc.dg/vect/vect-recurr-4.c: Likewise. * gcc.dg/vect/vect-recurr-5.c: Likewise. * gcc.dg/vect/vect-recurr-6.c: Likewise. * gcc.dg/vect/tsvc/vect-tsvc-s252.c: Un-XFAIL. * gcc.dg/vect/tsvc/vect-tsvc-s254.c: Likewise. * gcc.dg/vect/tsvc/vect-tsvc-s291.c: Likewise. Co-authored-by: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
2022-09-29Check nonlinear iv in vect_can_advance_ivs_p.liuhongt1-0/+3
vectorizable_nonlinear_induction doesn't always guard vect_peel_nonlinear_iv_init when it's called by vect_update_ivs_after_vectorizer. It's supposed to be guarded by vect_can_advance_ivs_p. gcc/ChangeLog: PR tree-optimization/107055 * tree-vect-loop-manip.cc (vect_can_advance_ivs_p): Check for nonlinear induction variables. * tree-vect-loop.cc (vect_can_peel_nonlinear_iv_p): New functions. (vectorizable_nonlinear_induction): Put part codes into vect_can_peel_nonlinear_iv_p. * tree-vectorizer.h (vect_can_peel_nonlinear_iv_p): Declare. gcc/testsuite/ChangeLog: * gcc.target/i386/pr107055.c: New test.
2022-09-07Extend vectorizer to handle nonlinear induction for neg, mul/lshift/rshift ↵liuhongt1-0/+15
with a constant. For neg, the patch create a vec_init as [ a, -a, a, -a, ... ] and no vec_step is needed to update vectorized iv since vf is always multiple of 2(negative * negative is positive). For shift, the patch create a vec_init as [ a, a >> c, a >> 2*c, ..] as vec_step as [ c * nunits, c * nunits, c * nunits, ... ], vectorized iv is updated as vec_def = vec_init >>/<< vec_step. For mul, the patch create a vec_init as [ a, a * c, a * pow(c, 2), ..] as vec_step as [ pow(c,nunits), pow(c,nunits),...] iv is updated as vec_def = vec_init * vec_step. The patch handles nonlinear iv for 1. Integer type only, floating point is not handled. 2. No slp_node. 3. iv_loop should be same as vector loop, not nested loop. 4. No UD is created, for mul, use unsigned mult to avoid UD, for shift, shift count should be less than type precision. gcc/ChangeLog: PR tree-optimization/103144 * tree-vect-loop.cc (vect_is_nonlinear_iv_evolution): New function. (vect_analyze_scalar_cycles_1): Detect nonlinear iv by upper function. (vect_create_nonlinear_iv_init): New function. (vect_peel_nonlinear_iv_init): Ditto. (vect_create_nonlinear_iv_step): Ditto (vect_create_nonlinear_iv_vec_step): Ditto (vect_update_nonlinear_iv): Ditto (vectorizable_nonlinear_induction): Ditto. (vectorizable_induction): Call vectorizable_nonlinear_induction when induction_type is not vect_step_op_add. * tree-vect-loop-manip.cc (vect_update_ivs_after_vectorizer): Update nonlinear iv for epilogue loop. * tree-vectorizer.h (enum vect_induction_op_type): New enum. (STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE): New Macro. gcc/testsuite/ChangeLog: * gcc.target/i386/pr103144-mul-1.c: New test. * gcc.target/i386/pr103144-mul-2.c: New test. * gcc.target/i386/pr103144-neg-1.c: New test. * gcc.target/i386/pr103144-neg-2.c: New test. * gcc.target/i386/pr103144-shift-1.c: New test. * gcc.target/i386/pr103144-shift-2.c: New test.
2022-08-30Extend SLP permutation optimisationsRichard Sandiford1-0/+2
Currently SLP tries to force permute operations "down" the graph from loads in the hope of reducing the total number of permutations needed or (in the best case) removing the need for the permutations entirely. This patch tries to extend it as follows: - Allow loads to take a different permutation from the one they started with, rather than choosing between "original permutation" and "no permutation". - Allow changes in both directions, if the target supports the reverse permutation. - Treat the placement of permutations as a two-way dataflow problem: after propagating information from leaves to roots (as now), propagate information back up the graph. - Take execution frequency into account when optimising for speed, so that (for example) permutations inside loops have a higher cost than permutations outside loops. - Try to reduce the total number of permutations when optimising for size, even if that increases the number of permutations on a given execution path. See the big block comment above vect_optimize_slp_pass for a detailed description. The original motivation for doing this was to add a framework that would allow other layout differences in future. The two main ones are: - Make it easier to represent predicated operations, including predicated operations with gaps. E.g.: a[0] += 1; a[1] += 1; a[3] += 1; could be a single load/add/store for SVE. We could handle this by representing a layout such as { 0, 1, _, 2 } or { 0, 1, _, 3 } (depending on what's being counted). We might need to move elements between lanes at various points, like with permutes. (This would first mean adding support for stores with gaps.) - Make it easier to switch between an even/odd and unpermuted layout when switching between wide and narrow elements. E.g. if a widening operation produces an even vector and an odd vector, we should try to keep operations on the wide elements in that order rather than force them to be permuted back "in order". To give some examples of what the patch does: int f1(int *__restrict a, int *__restrict b, int *__restrict c, int *__restrict d) { a[0] = (b[1] << c[3]) - d[1]; a[1] = (b[0] << c[2]) - d[0]; a[2] = (b[3] << c[1]) - d[3]; a[3] = (b[2] << c[0]) - d[2]; } continues to produce the same code as before when optimising for speed: b, c and d are permuted at load time. But when optimising for size we instead permute c into the same order as b+d and then permute the result of the arithmetic into the same order as a: ldr q1, [x2] ldr q0, [x1] ext v1.16b, v1.16b, v1.16b, #8 // <------ sshl v0.4s, v0.4s, v1.4s ldr q1, [x3] sub v0.4s, v0.4s, v1.4s rev64 v0.4s, v0.4s // <------ str q0, [x0] ret The following function: int f2(int *__restrict a, int *__restrict b, int *__restrict c, int *__restrict d) { a[0] = (b[3] << c[3]) - d[3]; a[1] = (b[2] << c[2]) - d[2]; a[2] = (b[1] << c[1]) - d[1]; a[3] = (b[0] << c[0]) - d[0]; } continues to push the reverse down to just before the store, like the previous code did. In: int f3(int *__restrict a, int *__restrict b, int *__restrict c, int *__restrict d) { for (int i = 0; i < 100; ++i) { a[0] = (a[0] + c[3]); a[1] = (a[1] + c[2]); a[2] = (a[2] + c[1]); a[3] = (a[3] + c[0]); c += 4; } } the loads of a are hoisted and the stores of a are sunk, so that only the load from c happens in the loop. When optimising for speed, we prefer to have the loop operate on the reversed layout, changing on entry and exit from the loop: mov x3, x0 adrp x0, .LC0 add x1, x2, 1600 ldr q2, [x0, #:lo12:.LC0] ldr q0, [x3] mov v1.16b, v0.16b tbl v0.16b, {v0.16b - v1.16b}, v2.16b // <-------- .p2align 3,,7 .L6: ldr q1, [x2], 16 add v0.4s, v0.4s, v1.4s cmp x2, x1 bne .L6 mov v1.16b, v0.16b adrp x0, .LC0 ldr q2, [x0, #:lo12:.LC0] tbl v0.16b, {v0.16b - v1.16b}, v2.16b // <-------- str q0, [x3] ret Similarly, for the very artificial testcase: int f4(int *__restrict a, int *__restrict b, int *__restrict c, int *__restrict d) { int a0 = a[0]; int a1 = a[1]; int a2 = a[2]; int a3 = a[3]; for (int i = 0; i < 100; ++i) { a0 ^= c[0]; a1 ^= c[1]; a2 ^= c[2]; a3 ^= c[3]; c += 4; for (int j = 0; j < 100; ++j) { a0 += d[1]; a1 += d[0]; a2 += d[3]; a3 += d[2]; d += 4; } b[0] = a0; b[1] = a1; b[2] = a2; b[3] = a3; b += 4; } a[0] = a0; a[1] = a1; a[2] = a2; a[3] = a3; } the a vector in the inner loop maintains the order { 1, 0, 3, 2 }, even though it's part of an SCC that includes the outer loop. In other words, this is a motivating case for not assigning permutes at SCC granularity. The code we get is: ldr q0, [x0] mov x4, x1 mov x5, x0 add x1, x3, 1600 add x3, x4, 1600 .p2align 3,,7 .L11: ldr q1, [x2], 16 sub x0, x1, #1600 eor v0.16b, v1.16b, v0.16b rev64 v0.4s, v0.4s // <--- .p2align 3,,7 .L10: ldr q1, [x0], 16 add v0.4s, v0.4s, v1.4s cmp x0, x1 bne .L10 rev64 v0.4s, v0.4s // <--- add x1, x0, 1600 str q0, [x4], 16 cmp x3, x4 bne .L11 str q0, [x5] ret bb-slp-layout-17.c is a collection of compile tests for problems I hit with earlier versions of the patch. The same prolems might show up elsewhere, but it seemed worth having the test anyway. In slp-11b.c we previously pushed the permutation of the in[i*4] group down from the load to just before the store. That didn't reduce the number or frequency of the permutations (or increase them either). But separating the permute from the load meant that we could no longer use load/store lanes. Whether load/store lanes are a good idea here is another question. If there were two sets of loads, and if we could use a single permutation instead of one per load, then avoiding load/store lanes should be a good thing even under the current abstract cost model. But I think under the current model we should try to avoid splitting up potential load/store lanes groups if there is no specific benefit to the split. Preferring load/store lanes is still a source of missed optimisations that we should fix one day... gcc/ * params.opt (-param=vect-max-layout-candidates=): New parameter. * doc/invoke.texi (vect-max-layout-candidates): Document it. * tree-vectorizer.h (auto_lane_permutation_t): New typedef. (auto_load_permutation_t): Likewise. * tree-vect-slp.cc (vect_slp_node_weight): New function. (slpg_layout_cost): New class. (slpg_vertex): Replace perm_in and perm_out with partition, out_degree, weight and out_weight. (slpg_partition_info, slpg_partition_layout_costs): New classes. (vect_optimize_slp_pass): Likewise, cannibalizing some part of the previous vect_optimize_slp. (vect_optimize_slp): Use it. gcc/testsuite/ * lib/target-supports.exp (check_effective_target_vect_var_shift): Return true for aarch64. * gcc.dg/vect/bb-slp-layout-1.c: New test. * gcc.dg/vect/bb-slp-layout-2.c: New test. * gcc.dg/vect/bb-slp-layout-3.c: New test. * gcc.dg/vect/bb-slp-layout-4.c: New test. * gcc.dg/vect/bb-slp-layout-5.c: New test. * gcc.dg/vect/bb-slp-layout-6.c: New test. * gcc.dg/vect/bb-slp-layout-7.c: New test. * gcc.dg/vect/bb-slp-layout-8.c: New test. * gcc.dg/vect/bb-slp-layout-9.c: New test. * gcc.dg/vect/bb-slp-layout-10.c: New test. * gcc.dg/vect/bb-slp-layout-11.c: New test. * gcc.dg/vect/bb-slp-layout-13.c: New test. * gcc.dg/vect/bb-slp-layout-14.c: New test. * gcc.dg/vect/bb-slp-layout-15.c: New test. * gcc.dg/vect/bb-slp-layout-16.c: New test. * gcc.dg/vect/bb-slp-layout-17.c: New test. * gcc.dg/vect/slp-11b.c: XFAIL SLP test for load-lanes targets.
2022-07-04Revert update-ssa assert in vectorizerRichard Biener1-0/+4
The following reverts the just added assert that virtual SSA does not need updating. It instead goes for a select whitelist of transforms known to be prone to difficulties with virtual SSA update. * tree-vect-loop-manip.cc (vect_do_peeling): Revert assert and update virtual SSA form again. Assert we do so for a known set of transforms only. * tree-vectorizer.h (vec_info::any_known_not_updated_vssa): New. * tree-vect-stmts.cc (vectorizable_load): When vectorizing using load-lanes allow virtual SSA update.
2022-02-22tree-optimization/104582 - make SLP node available in vector cost hookRichard Biener1-11/+17
This adjusts the vectorizer costing API to allow passing down the SLP node the vector stmt is created from. 2022-02-18 Richard Biener <rguenther@suse.de> PR tree-optimization/104582 * tree-vectorizer.h (stmt_info_for_cost::node): New field. (vector_costs::add_stmt_cost): Add SLP node parameter. (dump_stmt_cost): Likewise. (add_stmt_cost): Likewise, new overload and adjust. (add_stmt_costs): Adjust. (record_stmt_cost): New overload. * tree-vectorizer.cc (dump_stmt_cost): Dump the SLP node. (vector_costs::add_stmt_cost): Adjust. * tree-vect-loop.cc (vect_estimate_min_profitable_iters): Adjust. * tree-vect-slp.cc (vect_prologue_cost_for_slp): Record the SLP node for costing. (vectorizable_slp_permutation): Likewise. * tree-vect-stmts.cc (record_stmt_cost): Adjust and add new overloads. * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Adjust. * config/aarch64/aarch64.cc (aarch64_vector_costs::add_stmt_cost): Adjust. * config/rs6000/rs6000.cc (rs6000_vector_costs::add_stmt_cost): Adjust. (rs6000_cost_data::adjust_vect_cost_per_loop): Likewise.
2022-02-22tree-optimization/104582 - Simplify vectorizer cost API and fixesRichard Biener1-0/+12
This simplifies the vectorizer cost API by providing overloads to add_stmt_cost and record_stmt_cost suitable for scalar stmt and branch stmt costing which do not need information like a vector type or alignment. It also fixes two mistakes where costs for versioning tests were recorded as vector stmt rather than scalar stmt. This is a first patch to simplify the actual fix for PR104582. 2022-02-18 Richard Biener <rguenther@suse.de> PR tree-optimization/104582 * tree-vectorizer.h (add_stmt_cost): New overload. (record_stmt_cost): Likewise. * tree-vect-loop.cc (vect_compute_single_scalar_iteration_cost): Use add_stmt_costs. (vect_get_known_peeling_cost): Use new overloads. (vect_estimate_min_profitable_iters): Likewise. Consistently use scalar_stmt for costing versioning checks. * tree-vect-stmts.cc (record_stmt_cost): New overload.
2022-02-15vect+aarch64: Fix ldp_stp_* regressionsRichard Sandiford1-0/+35
ldp_stp_1.c, ldp_stp_4.c and ldp_stp_5.c have been failing since vectorisation was enabled at -O2. In all three cases SLP is generating vector code when scalar code would be better. The problem is that the target costs do not model whether STP could be used for the scalar or vector code, so the normal latency-based costs for store-heavy code can be way off. It would be good to fix that “properly” at some point, but it isn't easy; see the existing discussion in aarch64_sve_adjust_stmt_cost for more details. This patch therefore adds an on-the-side check for whether the code is doing nothing more than set-up+stores. It then applies STP-based costs to those cases only, in addition to the normal latency-based costs. (That is, the vector code has to win on both counts rather than on one count individually.) However, at the moment, SLP costs one vector set-up instruction for every vector in an SLP node, even if the contents are the same as a previous vector in the same node. Fixing the STP costs without fixing that would regress other cases, tested in the patch. The patch therefore makes the SLP costing code check for duplicates within a node. Ideally we'd check for duplicates more globally, but that would require a more global approach to costs: the cost of an initialisation should be amoritised across all trees that use the initialisation, rather than fully counted against one arbitrarily-chosen subtree. Back on aarch64: an earlier version of the patch tried to apply the new heuristic to constant stores. However, that didn't work too well in practice; see the comments for details. The patch therefore just tests the status quo for constant cases, leaving out a match if the current choice is dubious. ldp_stp_5.c was affected by the same thing. The test would be worth vectorising if we generated better vector code, but: (1) We do a bad job of moving the { -1, 1 } constant, given that we have { -1, -1 } and { 1, 1 } to hand. (2) The vector code has 6 pairable stores to misaligned offsets. We have peephole patterns to handle such misalignment for 4 pairable stores, but not 6. So the SLP decision isn't wrong as such. It's just being let down by later codegen. The patch therefore adds -mstrict-align to preserve the original intention of the test while adding ldp_stp_19.c to check for the preferred vector code (XFAILed for now). gcc/ * tree-vectorizer.h (vect_scalar_ops_slice): New struct. (vect_scalar_ops_slice_hash): Likewise. (vect_scalar_ops_slice::op): New function. * tree-vect-slp.cc (vect_scalar_ops_slice::all_same_p): New function. (vect_scalar_ops_slice_hash::hash): Likewise. (vect_scalar_ops_slice_hash::equal): Likewise. (vect_prologue_cost_for_slp): Check for duplicate vectors. * config/aarch64/aarch64.cc (aarch64_vector_costs::m_stp_sequence_cost): New member variable. (aarch64_aligned_constant_offset_p): New function. (aarch64_stp_sequence_cost): Likewise. (aarch64_vector_costs::add_stmt_cost): Handle new STP heuristic. (aarch64_vector_costs::finish_cost): Likewise. gcc/testsuite/ * gcc.target/aarch64/ldp_stp_5.c: Require -mstrict-align. * gcc.target/aarch64/ldp_stp_14.h, * gcc.target/aarch64/ldp_stp_14.c: New test. * gcc.target/aarch64/ldp_stp_15.c: Likewise. * gcc.target/aarch64/ldp_stp_16.c: Likewise. * gcc.target/aarch64/ldp_stp_17.c: Likewise. * gcc.target/aarch64/ldp_stp_18.c: Likewise. * gcc.target/aarch64/ldp_stp_19.c: Likewise.
2022-02-02vect: Simplify and extend the complex numbers validation routines.Tamar Christina1-1/+10
This patch boosts the analysis for complex mul,fma and fms in order to ensure that it doesn't create an incorrect output. Essentially it adds an extra verification to check that the two nodes it's going to combine do the same operations on compatible values. The reason it needs to do this is that if one computation differs from the other then with the current implementation we have no way to deal with it since we have to remove the permute. When we can keep the permute around we can probably handle these by unrolling. While implementing this since I have to do the traversal anyway I took advantage of it by simplifying the code a bit. Previously we would determine whether something is a conjugate and then try to figure out which conjugate it is and then try to see if the permutes match what we expect. Now the code that does the traversal will detect this in one go and return to us whether the operation is something that can be combined and whether a conjugate is present. Secondly because it does this I can now simplify the checking code itself to essentially just try to apply fixed patterns to each operation. The patterns represent the order operations should appear in. For instance a complex MUL operation combines : Left 1 + Right 1 Left 2 + Right 2 with a permute on the nodes consisting of: { Even, Even } + { Odd, Odd } { Even, Odd } + { Odd, Even } By abstracting over these patterns the checking code becomes quite simple. As part of this I was checking the order of the operands which was left in "slp" order. as in, the same order they showed up in during SLP, which means that the accumulator is first. However it looks like I didn't document this and the x86 optab was implemented assuming the same order as FMA, i.e. that the accumulator is last. I have this changed the order to match that of FMA and FMS which corrects the x86 codegen and will update the Arm targets. This has now also been documented. gcc/ChangeLog: PR tree-optimization/102819 PR tree-optimization/103169 * doc/md.texi: Update docs for cfms, cfma. * tree-data-ref.h (same_data_refs): Accept optional offset. * tree-vect-slp-patterns.cc (is_linear_load_p): Fix issue with repeating patterns. (vect_normalize_conj_loc): Remove. (is_eq_or_top): Change to take two nodes. (enum _conj_status, compatible_complex_nodes_p, vect_validate_multiplication): New. (class complex_add_pattern, complex_add_pattern::matches, complex_add_pattern::recognize, class complex_mul_pattern, complex_mul_pattern::recognize, class complex_fms_pattern, complex_fms_pattern::recognize, class complex_operations_pattern, complex_operations_pattern::recognize, addsub_pattern::recognize): Pass new cache. (complex_fms_pattern::matches, complex_mul_pattern::matches): Pass new cache and use new validation code. * tree-vect-slp.cc (vect_match_slp_patterns_2, vect_match_slp_patterns, vect_analyze_slp): Pass along cache. (compatible_calls_p): Expose. * tree-vectorizer.h (compatible_calls_p, slp_node_hash, slp_compat_nodes_map_t): New. (class vect_pattern): Update signatures include new cache. gcc/testsuite/ChangeLog: PR tree-optimization/102819 PR tree-optimization/103169 * g++.dg/vect/pr99149.cc: xfail for now. * gcc.dg/vect/complex/pr102819-1.c: New test. * gcc.dg/vect/complex/pr102819-2.c: New test. * gcc.dg/vect/complex/pr102819-3.c: New test. * gcc.dg/vect/complex/pr102819-4.c: New test. * gcc.dg/vect/complex/pr102819-5.c: New test. * gcc.dg/vect/complex/pr102819-6.c: New test. * gcc.dg/vect/complex/pr102819-7.c: New test. * gcc.dg/vect/complex/pr102819-8.c: New test. * gcc.dg/vect/complex/pr102819-9.c: New test. * gcc.dg/vect/complex/pr103169.c: New test.
2022-01-18[vect] Add main vectorized loop unrollingAndre Vieira1-1/+24
gcc/ChangeLog: * tree-vect-loop.cc (vect_estimate_min_profitable_iters): Pass new argument suggested_unroll_factor. (vect_analyze_loop_costing): Likewise. (_loop_vec_info::_loop_vec_info): Initialize new member suggested_unroll_factor. (vect_determine_partial_vectors_and_peeling): Make epilogue of unrolled main loop use partial vectors. (vect_analyze_loop_2): Pass and use new argument suggested_unroll_factor. (vect_analyze_loop_1): Change to intialize local suggested_unroll_factor and use it. (vectorizable_reduction): Don't use single_defuse_cycle when unrolling. * tree-vectorizer.h (_loop_vec_info::_loop_vec_info): Add new member suggested_unroll_factor. (vector_costs::vector_costs): Add new member m_suggested_unroll_factor. (vector_costs::suggested_unroll_factor): New getter function. (finish_cost): Set return argument suggested_unroll_factor.
2022-01-17Change references of .c files to .cc filesMartin Liska1-10/+10
ChangeLog: * MAINTAINERS: Rename .c names to .cc. contrib/ChangeLog: * filter-clang-warnings.py: Rename .c names to .cc. * gcc_update: Likewise. * paranoia.cc: Likewise. contrib/header-tools/ChangeLog: * README: Rename .c names to .cc. gcc/ChangeLog: * Makefile.in: Rename .c names to .cc. * alias.h: Likewise. * asan.cc: Likewise. * auto-profile.h: Likewise. * basic-block.h (struct basic_block_d): Likewise. * btfout.cc: Likewise. * builtins.cc (expand_builtin_longjmp): Likewise. (validate_arg): Likewise. (access_ref::offset_bounded): Likewise. * caller-save.cc (reg_restore_code): Likewise. (setup_save_areas): Likewise. * calls.cc (initialize_argument_information): Likewise. (expand_call): Likewise. (emit_library_call_value_1): Likewise. * cfg-flags.def (RTL): Likewise. (SIBCALL): Likewise. (CAN_FALLTHRU): Likewise. * cfganal.cc (post_order_compute): Likewise. * cfgcleanup.cc (try_simplify_condjump): Likewise. (merge_blocks_move_predecessor_nojumps): Likewise. (merge_blocks_move_successor_nojumps): Likewise. (merge_blocks_move): Likewise. (old_insns_match_p): Likewise. (try_crossjump_bb): Likewise. * cfgexpand.cc (expand_gimple_stmt): Likewise. * cfghooks.cc (split_block_before_cond_jump): Likewise. (profile_record_check_consistency): Likewise. * cfghooks.h: Likewise. * cfgrtl.cc (pass_free_cfg::execute): Likewise. (rtl_can_merge_blocks): Likewise. (try_redirect_by_replacing_jump): Likewise. (make_pass_outof_cfg_layout_mode): Likewise. (cfg_layout_can_merge_blocks_p): Likewise. * cgraph.cc (release_function_body): Likewise. (cgraph_node::get_fun): Likewise. * cgraph.h (struct cgraph_node): Likewise. (asmname_hasher::equal): Likewise. (cgraph_inline_failed_type): Likewise. (thunk_adjust): Likewise. (dump_callgraph_transformation): Likewise. (record_references_in_initializer): Likewise. (ipa_discover_variable_flags): Likewise. * cgraphclones.cc (GTY): Likewise. * cgraphunit.cc (symbol_table::finalize_compilation_unit): Likewise. * collect-utils.h (GCC_COLLECT_UTILS_H): Likewise. * collect2-aix.h (GCC_COLLECT2_AIX_H): Likewise. * collect2.cc (maybe_run_lto_and_relink): Likewise. * combine-stack-adj.cc: Likewise. * combine.cc (setup_incoming_promotions): Likewise. (combine_simplify_rtx): Likewise. (count_rtxs): Likewise. * common.opt: Likewise. * common/config/aarch64/aarch64-common.cc: Likewise. * common/config/arm/arm-common.cc (arm_asm_auto_mfpu): Likewise. * common/config/avr/avr-common.cc: Likewise. * common/config/i386/i386-isas.h (struct _isa_names_table): Likewise. * conditions.h: Likewise. * config.gcc: Likewise. * config/aarch64/aarch64-builtins.cc (aarch64_resolve_overloaded_memtag): Likewise. * config/aarch64/aarch64-protos.h (aarch64_classify_address): Likewise. (aarch64_get_extension_string_for_isa_flags): Likewise. * config/aarch64/aarch64-sve-builtins.cc (function_builder::add_function): Likewise. * config/aarch64/aarch64.cc (aarch64_regmode_natural_size): Likewise. (aarch64_sched_first_cycle_multipass_dfa_lookahead): Likewise. (aarch64_option_valid_attribute_p): Likewise. (aarch64_short_vector_p): Likewise. (aarch64_float_const_representable_p): Likewise. * config/aarch64/aarch64.h (DBX_REGISTER_NUMBER): Likewise. (ASM_OUTPUT_POOL_EPILOGUE): Likewise. (GTY): Likewise. * config/aarch64/cortex-a57-fma-steering.cc: Likewise. * config/aarch64/driver-aarch64.cc (contains_core_p): Likewise. * config/aarch64/t-aarch64: Likewise. * config/aarch64/x-aarch64: Likewise. * config/aarch64/x-darwin: Likewise. * config/alpha/alpha-protos.h: Likewise. * config/alpha/alpha.cc (alpha_scalar_mode_supported_p): Likewise. * config/alpha/alpha.h (LONG_DOUBLE_TYPE_SIZE): Likewise. (enum reg_class): Likewise. * config/alpha/alpha.md: Likewise. * config/alpha/driver-alpha.cc (AMASK_LOCKPFTCHOK): Likewise. * config/alpha/x-alpha: Likewise. * config/arc/arc-protos.h (arc_eh_uses): Likewise. * config/arc/arc.cc (ARC_OPT): Likewise. (arc_ccfsm_advance): Likewise. (arc_arg_partial_bytes): Likewise. (conditionalize_nonjump): Likewise. * config/arc/arc.md: Likewise. * config/arc/builtins.def: Likewise. * config/arc/t-arc: Likewise. * config/arm/arm-c.cc (arm_resolve_overloaded_builtin): Likewise. (arm_pragma_target_parse): Likewise. * config/arm/arm-protos.h (save_restore_target_globals): Likewise. (arm_cpu_cpp_builtins): Likewise. * config/arm/arm.cc (vfp3_const_double_index): Likewise. (shift_op): Likewise. (thumb2_final_prescan_insn): Likewise. (arm_final_prescan_insn): Likewise. (arm_asm_output_labelref): Likewise. (arm_small_register_classes_for_mode_p): Likewise. * config/arm/arm.h: Likewise. * config/arm/arm.md: Likewise. * config/arm/driver-arm.cc: Likewise. * config/arm/symbian.h: Likewise. * config/arm/t-arm: Likewise. * config/arm/thumb1.md: Likewise. * config/arm/x-arm: Likewise. * config/avr/avr-c.cc (avr_register_target_pragmas): Likewise. * config/avr/avr-fixed.md: Likewise. * config/avr/avr-log.cc (avr_log_vadump): Likewise. * config/avr/avr-mcus.def: Likewise. * config/avr/avr-modes.def (FRACTIONAL_INT_MODE): Likewise. * config/avr/avr-passes.def (INSERT_PASS_BEFORE): Likewise. * config/avr/avr-protos.h (make_avr_pass_casesi): Likewise. * config/avr/avr.cc (avr_option_override): Likewise. (avr_build_builtin_va_list): Likewise. (avr_mode_dependent_address_p): Likewise. (avr_function_arg_advance): Likewise. (avr_asm_output_aligned_decl_common): Likewise. * config/avr/avr.h (RETURN_ADDR_RTX): Likewise. (SUPPORTS_INIT_PRIORITY): Likewise. * config/avr/avr.md: Likewise. * config/avr/builtins.def: Likewise. * config/avr/gen-avr-mmcu-specs.cc (IN_GEN_AVR_MMCU_TEXI): Likewise. * config/avr/gen-avr-mmcu-texi.cc (IN_GEN_AVR_MMCU_TEXI): Likewise. (main): Likewise. * config/avr/t-avr: Likewise. * config/bfin/bfin.cc (frame_related_constant_load): Likewise. * config/bpf/bpf-protos.h (GCC_BPF_PROTOS_H): Likewise. * config/bpf/bpf.h (enum reg_class): Likewise. * config/bpf/t-bpf: Likewise. * config/c6x/c6x-protos.h (GCC_C6X_PROTOS_H): Likewise. * config/cr16/cr16-protos.h: Likewise. * config/cris/cris.cc (cris_address_cost): Likewise. (cris_side_effect_mode_ok): Likewise. (cris_init_machine_status): Likewise. (cris_emit_movem_store): Likewise. * config/cris/cris.h (INDEX_REG_CLASS): Likewise. (enum reg_class): Likewise. (struct cum_args): Likewise. * config/cris/cris.opt: Likewise. * config/cris/sync.md: Likewise. * config/csky/csky.cc (csky_expand_prologue): Likewise. * config/darwin-c.cc: Likewise. * config/darwin-f.cc: Likewise. * config/darwin-sections.def (zobj_const_section): Likewise. * config/darwin.cc (output_objc_section_asm_op): Likewise. (fprintf): Likewise. * config/darwin.h (GTY): Likewise. * config/elfos.h: Likewise. * config/epiphany/epiphany-sched.md: Likewise. * config/epiphany/epiphany.cc (epiphany_function_value): Likewise. * config/epiphany/epiphany.h (GTY): Likewise. (NO_FUNCTION_CSE): Likewise. * config/epiphany/mode-switch-use.cc: Likewise. * config/epiphany/predicates.md: Likewise. * config/epiphany/t-epiphany: Likewise. * config/fr30/fr30-protos.h: Likewise. * config/frv/frv-protos.h: Likewise. * config/frv/frv.cc (TLS_BIAS): Likewise. * config/frv/frv.h (ASM_OUTPUT_ALIGNED_LOCAL): Likewise. * config/ft32/ft32-protos.h: Likewise. * config/gcn/gcn-hsa.h (ASM_APP_OFF): Likewise. * config/gcn/gcn.cc (gcn_init_libfuncs): Likewise. * config/gcn/mkoffload.cc (copy_early_debug_info): Likewise. * config/gcn/t-gcn-hsa: Likewise. * config/gcn/t-omp-device: Likewise. * config/h8300/h8300-protos.h (GCC_H8300_PROTOS_H): Likewise. (same_cmp_following_p): Likewise. * config/h8300/h8300.cc (F): Likewise. * config/h8300/h8300.h (struct cum_arg): Likewise. (BRANCH_COST): Likewise. * config/i386/cygming.h (DEFAULT_PCC_STRUCT_RETURN): Likewise. * config/i386/djgpp.h (TARGET_ASM_LTO_END): Likewise. * config/i386/dragonfly.h (NO_PROFILE_COUNTERS): Likewise. * config/i386/driver-i386.cc (detect_caches_intel): Likewise. * config/i386/freebsd.h (NO_PROFILE_COUNTERS): Likewise. * config/i386/i386-c.cc (ix86_target_macros): Likewise. * config/i386/i386-expand.cc (get_mode_wider_vector): Likewise. * config/i386/i386-options.cc (ix86_set_func_type): Likewise. * config/i386/i386-protos.h (ix86_extract_perm_from_pool_constant): Likewise. (ix86_register_pragmas): Likewise. (ix86_d_has_stdcall_convention): Likewise. (i386_pe_seh_init_sections): Likewise. * config/i386/i386.cc (ix86_function_arg_regno_p): Likewise. (ix86_function_value_regno_p): Likewise. (ix86_compute_frame_layout): Likewise. (legitimize_pe_coff_symbol): Likewise. (output_pic_addr_const): Likewise. * config/i386/i386.h (defined): Likewise. (host_detect_local_cpu): Likewise. (CONSTANT_ADDRESS_P): Likewise. (DEFAULT_LARGE_SECTION_THRESHOLD): Likewise. (struct machine_frame_state): Likewise. * config/i386/i386.md: Likewise. * config/i386/lynx.h (ASM_OUTPUT_ALIGN): Likewise. * config/i386/mmx.md: Likewise. * config/i386/sse.md: Likewise. * config/i386/t-cygming: Likewise. * config/i386/t-djgpp: Likewise. * config/i386/t-gnu-property: Likewise. * config/i386/t-i386: Likewise. * config/i386/t-intelmic: Likewise. * config/i386/t-omp-device: Likewise. * config/i386/winnt-cxx.cc (i386_pe_type_dllimport_p): Likewise. (i386_pe_adjust_class_at_definition): Likewise. * config/i386/winnt.cc (gen_stdcall_or_fastcall_suffix): Likewise. (i386_pe_mangle_decl_assembler_name): Likewise. (i386_pe_encode_section_info): Likewise. * config/i386/x-cygwin: Likewise. * config/i386/x-darwin: Likewise. * config/i386/x-i386: Likewise. * config/i386/x-mingw32: Likewise. * config/i386/x86-tune-sched-core.cc: Likewise. * config/i386/x86-tune.def: Likewise. * config/i386/xm-djgpp.h (STANDARD_STARTFILE_PREFIX_1): Likewise. * config/ia64/freebsd.h: Likewise. * config/ia64/hpux.h (REGISTER_TARGET_PRAGMAS): Likewise. * config/ia64/ia64-protos.h (ia64_except_unwind_info): Likewise. * config/ia64/ia64.cc (ia64_function_value_regno_p): Likewise. (ia64_secondary_reload_class): Likewise. (bundling): Likewise. * config/ia64/ia64.h: Likewise. * config/ia64/ia64.md: Likewise. * config/ia64/predicates.md: Likewise. * config/ia64/sysv4.h: Likewise. * config/ia64/t-ia64: Likewise. * config/iq2000/iq2000.h (FUNCTION_MODE): Likewise. * config/iq2000/iq2000.md: Likewise. * config/linux.h (TARGET_HAS_BIONIC): Likewise. (if): Likewise. * config/m32c/m32c.cc (m32c_function_needs_enter): Likewise. * config/m32c/m32c.h (MAX_REGS_PER_ADDRESS): Likewise. * config/m32c/t-m32c: Likewise. * config/m32r/m32r-protos.h: Likewise. * config/m32r/m32r.cc (m32r_print_operand): Likewise. * config/m32r/m32r.h: Likewise. * config/m32r/m32r.md: Likewise. * config/m68k/m68k-isas.def: Likewise. * config/m68k/m68k-microarchs.def: Likewise. * config/m68k/m68k-protos.h (strict_low_part_peephole_ok): Likewise. (m68k_epilogue_uses): Likewise. * config/m68k/m68k.cc (m68k_call_tls_get_addr): Likewise. (m68k_sched_adjust_cost): Likewise. (m68k_sched_md_init): Likewise. * config/m68k/m68k.h (__transfer_from_trampoline): Likewise. (enum m68k_function_kind): Likewise. * config/m68k/m68k.md: Likewise. * config/m68k/m68kemb.h: Likewise. * config/m68k/uclinux.h (ENDFILE_SPEC): Likewise. * config/mcore/mcore-protos.h: Likewise. * config/mcore/mcore.cc (mcore_expand_insv): Likewise. (mcore_expand_prolog): Likewise. * config/mcore/mcore.h (TARGET_MCORE): Likewise. * config/mcore/mcore.md: Likewise. * config/microblaze/microblaze-protos.h: Likewise. * config/microblaze/microblaze.cc (microblaze_legitimate_pic_operand): Likewise. (microblaze_function_prologue): Likewise. (microblaze_function_epilogue): Likewise. (microblaze_select_section): Likewise. (microblaze_asm_output_mi_thunk): Likewise. (microblaze_eh_return): Likewise. * config/microblaze/microblaze.h: Likewise. * config/microblaze/microblaze.md: Likewise. * config/microblaze/t-microblaze: Likewise. * config/mips/driver-native.cc: Likewise. * config/mips/loongson2ef.md: Likewise. * config/mips/mips-protos.h (mips_expand_vec_cmp_expr): Likewise. * config/mips/mips.cc (mips_rtx_costs): Likewise. (mips_output_filename): Likewise. (mips_output_function_prologue): Likewise. (mips_output_function_epilogue): Likewise. (mips_output_mi_thunk): Likewise. * config/mips/mips.h: Likewise. * config/mips/mips.md: Likewise. * config/mips/t-mips: Likewise. * config/mips/x-native: Likewise. * config/mmix/mmix-protos.h: Likewise. * config/mmix/mmix.cc (mmix_option_override): Likewise. (mmix_dbx_register_number): Likewise. (mmix_expand_prologue): Likewise. * config/mmix/mmix.h: Likewise. * config/mmix/mmix.md: Likewise. * config/mmix/predicates.md: Likewise. * config/mn10300/mn10300.cc (mn10300_symbolic_operand): Likewise. (mn10300_legitimate_pic_operand_p): Likewise. * config/mn10300/mn10300.h (enum reg_class): Likewise. (NO_FUNCTION_CSE): Likewise. * config/moxie/moxie-protos.h: Likewise. * config/moxie/uclinux.h (TARGET_LIBC_HAS_FUNCTION): Likewise. * config/msp430/msp430-devices.cc (extract_devices_dir_from_exec_prefix): Likewise. * config/msp430/msp430.cc (msp430_gimplify_va_arg_expr): Likewise. (msp430_incoming_return_addr_rtx): Likewise. * config/msp430/msp430.h (msp430_get_linker_devices_include_path): Likewise. * config/msp430/t-msp430: Likewise. * config/nds32/nds32-cost.cc (nds32_rtx_costs_speed_prefer): Likewise. (nds32_rtx_costs_size_prefer): Likewise. (nds32_init_rtx_costs): Likewise. * config/nds32/nds32-doubleword.md: Likewise. * config/nds32/nds32.cc (nds32_memory_move_cost): Likewise. (nds32_builtin_decl): Likewise. * config/nds32/nds32.h (enum nds32_16bit_address_type): Likewise. (enum nds32_isr_nested_type): Likewise. (enum reg_class): Likewise. * config/nds32/predicates.md: Likewise. * config/nds32/t-nds32: Likewise. * config/nios2/nios2.cc (nios2_pragma_target_parse): Likewise. * config/nvptx/nvptx-protos.h: Likewise. * config/nvptx/nvptx.cc (nvptx_goacc_expand_var_decl): Likewise. * config/nvptx/nvptx.h (TARGET_CPU_CPP_BUILTINS): Likewise. * config/nvptx/t-nvptx: Likewise. * config/nvptx/t-omp-device: Likewise. * config/pa/elf.h: Likewise. * config/pa/pa-linux.h (GLOBAL_ASM_OP): Likewise. * config/pa/pa-netbsd.h (GLOBAL_ASM_OP): Likewise. * config/pa/pa-openbsd.h (TARGET_ASM_GLOBALIZE_LABEL): Likewise. * config/pa/pa-protos.h (pa_eh_return_handler_rtx): Likewise. (pa_legitimize_reload_address): Likewise. (pa_can_use_return_insn): Likewise. * config/pa/pa.cc (mem_shadd_or_shadd_rtx_p): Likewise. (som_output_text_section_asm_op): Likewise. * config/pa/pa.h (PROFILE_BEFORE_PROLOGUE): Likewise. * config/pa/pa.md: Likewise. * config/pa/som.h: Likewise. * config/pa/t-pa: Likewise. * config/pdp11/pdp11.cc (decode_pdp11_d): Likewise. * config/pdp11/pdp11.h: Likewise. * config/pdp11/pdp11.md: Likewise. * config/pdp11/t-pdp11: Likewise. * config/pru/pru.md: Likewise. * config/pru/t-pru: Likewise. * config/riscv/riscv-protos.h (NUM_SYMBOL_TYPES): Likewise. (riscv_gpr_save_operation_p): Likewise. (riscv_d_register_target_info): Likewise. (riscv_init_builtins): Likewise. * config/riscv/riscv.cc (riscv_output_mi_thunk): Likewise. * config/riscv/riscv.h (CSW_MAX_OFFSET): Likewise. * config/riscv/t-riscv: Likewise. * config/rl78/rl78.cc (rl78_asm_ctor_dtor): Likewise. * config/rl78/t-rl78: Likewise. * config/rs6000/aix.h: Likewise. * config/rs6000/aix71.h (ASM_SPEC_COMMON): Likewise. * config/rs6000/aix72.h (ASM_SPEC_COMMON): Likewise. * config/rs6000/aix73.h (ASM_SPEC_COMMON): Likewise. * config/rs6000/darwin.h (TARGET_ASM_GLOBALIZE_LABEL): Likewise. * config/rs6000/driver-rs6000.cc: Likewise. * config/rs6000/freebsd.h: Likewise. * config/rs6000/freebsd64.h: Likewise. * config/rs6000/lynx.h (ASM_OUTPUT_ALIGN): Likewise. * config/rs6000/rbtree.cc: Likewise. * config/rs6000/rbtree.h: Likewise. * config/rs6000/rs6000-c.cc (rs6000_target_modify_macros): Likewise. * config/rs6000/rs6000-call.cc (rs6000_invalid_builtin): Likewise. (rs6000_expand_builtin): Likewise. (rs6000_init_builtins): Likewise. * config/rs6000/rs6000-cpus.def: Likewise. * config/rs6000/rs6000-gen-builtins.cc (write_init_ovld_table): Likewise. * config/rs6000/rs6000-internal.h (ALTIVEC_REG_BIT): Likewise. (quad_address_offset_p): Likewise. * config/rs6000/rs6000-logue.cc (interesting_frame_related_regno): Likewise. (rs6000_emit_epilogue): Likewise. * config/rs6000/rs6000-overload.def: Likewise. * config/rs6000/rs6000-p8swap.cc: Likewise. * config/rs6000/rs6000-protos.h (GCC_RS6000_PROTOS_H): Likewise. (rs6000_const_f32_to_i32): Likewise. * config/rs6000/rs6000.cc (legitimate_lo_sum_address_p): Likewise. (rs6000_debug_legitimize_address): Likewise. (rs6000_mode_dependent_address): Likewise. (rs6000_adjust_priority): Likewise. (rs6000_c_mode_for_suffix): Likewise. * config/rs6000/rs6000.h (defined): Likewise. (LONG_DOUBLE_TYPE_SIZE): Likewise. * config/rs6000/rs6000.md: Likewise. * config/rs6000/sysv4.h: Likewise. * config/rs6000/t-linux: Likewise. * config/rs6000/t-linux64: Likewise. * config/rs6000/t-rs6000: Likewise. * config/rs6000/x-darwin: Likewise. * config/rs6000/x-darwin64: Likewise. * config/rs6000/x-rs6000: Likewise. * config/rs6000/xcoff.h (ASM_OUTPUT_LABELREF): Likewise. * config/rx/rx.cc (rx_expand_builtin): Likewise. * config/s390/constraints.md: Likewise. * config/s390/driver-native.cc: Likewise. * config/s390/htmxlintrin.h: Likewise. * config/s390/s390-builtins.def (B_DEF): Likewise. (OB_DEF_VAR): Likewise. * config/s390/s390-builtins.h: Likewise. * config/s390/s390-c.cc: Likewise. * config/s390/s390-opts.h: Likewise. * config/s390/s390-protos.h (s390_check_symref_alignment): Likewise. (s390_register_target_pragmas): Likewise. * config/s390/s390.cc (s390_init_builtins): Likewise. (s390_expand_plus_operand): Likewise. (s390_expand_atomic): Likewise. (s390_valid_target_attribute_inner_p): Likewise. * config/s390/s390.h (LONG_DOUBLE_TYPE_SIZE): Likewise. * config/s390/s390.md: Likewise. * config/s390/t-s390: Likewise. * config/s390/vx-builtins.md: Likewise. * config/s390/x-native: Likewise. * config/sh/divtab-sh4-300.cc (main): Likewise. * config/sh/divtab-sh4.cc (main): Likewise. * config/sh/divtab.cc (main): Likewise. * config/sh/elf.h: Likewise. * config/sh/sh-protos.h (sh_fsca_int2sf): Likewise. * config/sh/sh.cc (SYMBOL_FLAG_FUNCVEC_FUNCTION): Likewise. (sh_struct_value_rtx): Likewise. (sh_remove_reg_dead_or_unused_notes): Likewise. * config/sh/sh.h (MIN_UNITS_PER_WORD): Likewise. * config/sh/t-sh: Likewise. * config/sol2-protos.h (solaris_override_options): Likewise. * config/sol2.h: Likewise. * config/sparc/driver-sparc.cc: Likewise. * config/sparc/freebsd.h: Likewise. * config/sparc/sparc-protos.h (make_pass_work_around_errata): Likewise. * config/sparc/sparc.cc (sparc_output_mi_thunk): Likewise. (sparc_asan_shadow_offset): Likewise. * config/sparc/sparc.h: Likewise. * config/sparc/sparc.md: Likewise. * config/sparc/t-sparc: Likewise. * config/sparc/x-sparc: Likewise. * config/stormy16/stormy16.cc (xstormy16_mode_dependent_address_p): Likewise. * config/t-darwin: Likewise. * config/t-dragonfly: Likewise. * config/t-freebsd: Likewise. * config/t-glibc: Likewise. * config/t-linux: Likewise. * config/t-netbsd: Likewise. * config/t-openbsd: Likewise. * config/t-pnt16-warn: Likewise. * config/t-sol2: Likewise. * config/t-vxworks: Likewise. * config/t-winnt: Likewise. * config/tilegx/t-tilegx: Likewise. * config/tilegx/tilegx-c.cc: Likewise. * config/tilegx/tilegx-protos.h (tilegx_function_profiler): Likewise. * config/tilegx/tilegx.md: Likewise. * config/tilepro/t-tilepro: Likewise. * config/tilepro/tilepro-c.cc: Likewise. * config/v850/t-v850: Likewise. * config/v850/v850-protos.h: Likewise. * config/v850/v850.cc (F): Likewise. * config/v850/v850.h (enum reg_class): Likewise. (SLOW_BYTE_ACCESS): Likewise. * config/vax/vax.cc (vax_mode_dependent_address_p): Likewise. * config/vax/vax.h (enum reg_class): Likewise. * config/vax/vax.md: Likewise. * config/visium/visium.cc (visium_legitimate_address_p): Likewise. * config/visium/visium.h: Likewise. * config/vms/t-vms: Likewise. * config/vms/vms-crtlmap.map: Likewise. * config/vms/vms-protos.h (vms_c_get_vms_ver): Likewise. * config/vx-common.h: Likewise. * config/x-darwin: Likewise. * config/x-hpux: Likewise. * config/x-linux: Likewise. * config/x-netbsd: Likewise. * config/x-openbsd: Likewise. * config/x-solaris: Likewise. * config/xtensa/xtensa-protos.h (xtensa_mem_offset): Likewise. * config/xtensa/xtensa.cc (xtensa_option_override): Likewise. * config/xtensa/xtensa.h: Likewise. * configure.ac: Likewise. * context.cc: Likewise. * convert.h: Likewise. * coretypes.h: Likewise. * coverage.cc: Likewise. * coverage.h: Likewise. * cppdefault.h (struct default_include): Likewise. * cprop.cc (local_cprop_pass): Likewise. (one_cprop_pass): Likewise. * cse.cc (hash_rtx_cb): Likewise. (fold_rtx): Likewise. * ctfc.h (ctfc_get_num_vlen_bytes): Likewise. * data-streamer.h (bp_unpack_var_len_int): Likewise. (streamer_write_widest_int): Likewise. * dbgcnt.def: Likewise. * dbxout.cc (dbxout_early_global_decl): Likewise. (dbxout_common_check): Likewise. * dbxout.h: Likewise. * debug.h (struct gcc_debug_hooks): Likewise. (dump_go_spec_init): Likewise. * df-core.cc: Likewise. * df-scan.cc (df_insn_info_delete): Likewise. (df_insn_delete): Likewise. * df.h (debug_df_chain): Likewise. (can_move_insns_across): Likewise. * dfp.cc (decimal_from_binary): Likewise. * diagnostic-color.cc: Likewise. * diagnostic-event-id.h: Likewise. * diagnostic-show-locus.cc (test_one_liner_labels): Likewise. * diagnostic.cc (bt_callback): Likewise. (num_digits): Likewise. * doc/avr-mmcu.texi: Likewise. * doc/cfg.texi: Likewise. * doc/contrib.texi: Likewise. * doc/cppinternals.texi: Likewise. * doc/extend.texi: Likewise. * doc/generic.texi: Likewise. * doc/gimple.texi: Likewise. * doc/gty.texi: Likewise. * doc/invoke.texi: Likewise. * doc/loop.texi: Likewise. * doc/lto.texi: Likewise. * doc/match-and-simplify.texi: Likewise. * doc/md.texi: Likewise. * doc/optinfo.texi: Likewise. * doc/options.texi: Likewise. * doc/passes.texi: Likewise. * doc/plugins.texi: Likewise. * doc/rtl.texi: Likewise. * doc/sourcebuild.texi: Likewise. * doc/tm.texi: Likewise. * doc/tm.texi.in: Likewise. * doc/tree-ssa.texi: Likewise. * dojump.cc (do_jump): Likewise. * dojump.h: Likewise. * dumpfile.cc (test_impl_location): Likewise. (test_capture_of_dump_calls): Likewise. * dumpfile.h (enum dump_kind): Likewise. (class dump_location_t): Likewise. (dump_enabled_p): Likewise. (enable_rtl_dump_file): Likewise. (dump_combine_total_stats): Likewise. * dwarf2asm.cc (dw2_asm_output_delta_uleb128): Likewise. * dwarf2ctf.h (ctf_debug_finish): Likewise. * dwarf2out.cc (dwarf2out_begin_prologue): Likewise. (struct loc_descr_context): Likewise. (rtl_for_decl_location): Likewise. (gen_subprogram_die): Likewise. (gen_label_die): Likewise. (is_trivial_indirect_ref): Likewise. (dwarf2out_late_global_decl): Likewise. (dwarf_file_hasher::hash): Likewise. (dwarf2out_end_source_file): Likewise. (dwarf2out_define): Likewise. (dwarf2out_early_finish): Likewise. * dwarf2out.h (struct dw_fde_node): Likewise. (struct dw_discr_list_node): Likewise. (output_loc_sequence_raw): Likewise. * emit-rtl.cc (gen_raw_REG): Likewise. (maybe_set_max_label_num): Likewise. * emit-rtl.h (struct rtl_data): Likewise. * errors.cc (internal_error): Likewise. (trim_filename): Likewise. * et-forest.cc: Likewise. * except.cc (init_eh_for_function): Likewise. * explow.cc (promote_ssa_mode): Likewise. (get_dynamic_stack_size): Likewise. * explow.h: Likewise. * expmed.h: Likewise. * expr.cc (safe_from_p): Likewise. (expand_expr_real_2): Likewise. (expand_expr_real_1): Likewise. * file-prefix-map.cc (remap_filename): Likewise. * final.cc (app_enable): Likewise. (make_pass_compute_alignments): Likewise. (final_scan_insn_1): Likewise. (final_scan_insn): Likewise. * fixed-value.h (fixed_from_string): Likewise. * flag-types.h (NO_DEBUG): Likewise. (DWARF2_DEBUG): Likewise. (VMS_DEBUG): Likewise. (BTF_DEBUG): Likewise. (enum ctf_debug_info_levels): Likewise. * fold-const.cc (const_binop): Likewise. (fold_binary_loc): Likewise. (fold_checksum_tree): Likewise. * fp-test.cc: Likewise. * function.cc (expand_function_end): Likewise. * function.h (struct function): Likewise. * fwprop.cc (should_replace_address): Likewise. * gcc-main.cc: Likewise. * gcc-rich-location.h (class gcc_rich_location): Likewise. * gcc-symtab.h: Likewise. * gcc.cc (MIN_FATAL_STATUS): Likewise. (driver_handle_option): Likewise. (quote_spec_arg): Likewise. (driver::finalize): Likewise. * gcc.h (set_input): Likewise. * gcov-dump.cc: Likewise. * gcov.cc (solve_flow_graph): Likewise. * gcse-common.cc: Likewise. * gcse.cc (make_pass_rtl_hoist): Likewise. * genattr-common.cc: Likewise. * genattrtab.cc (min_fn): Likewise. (write_const_num_delay_slots): Likewise. * genautomata.cc: Likewise. * genconditions.cc (write_one_condition): Likewise. * genconstants.cc: Likewise. * genemit.cc (gen_exp): Likewise. * generic-match-head.cc: Likewise. * genextract.cc: Likewise. * gengenrtl.cc (always_void_p): Likewise. * gengtype-parse.cc (gtymarker_opt): Likewise. * gengtype-state.cc (state_writer::state_writer): Likewise. (write_state_trailer): Likewise. (equals_type_number): Likewise. (read_state): Likewise. * gengtype.cc (open_base_files): Likewise. (struct file_rule_st): Likewise. (header_dot_h_frul): Likewise. * gengtype.h: Likewise. * genmatch.cc (main): Likewise. * genmddeps.cc: Likewise. * genmodes.cc (emit_mode_inner): Likewise. (emit_mode_unit_size): Likewise. * genpeep.cc (gen_peephole): Likewise. * genpreds.cc (write_tm_preds_h): Likewise. * genrecog.cc (validate_pattern): Likewise. (write_header): Likewise. (main): Likewise. * gensupport.cc (change_subst_attribute): Likewise. (traverse_c_tests): Likewise. (add_predicate): Likewise. (init_predicate_table): Likewise. * gensupport.h (struct optab_pattern): Likewise. (get_num_insn_codes): Likewise. (maybe_eval_c_test): Likewise. (struct pred_data): Likewise. * ggc-internal.h: Likewise. * gimple-fold.cc (maybe_fold_reference): Likewise. (get_range_strlen_tree): Likewise. * gimple-fold.h (gimple_stmt_integer_valued_real_p): Likewise. * gimple-low.cc: Likewise. * gimple-match-head.cc (directly_supported_p): Likewise. * gimple-pretty-print.h: Likewise. * gimple-ssa-sprintf.cc (format_percent): Likewise. (adjust_range_for_overflow): Likewise. * gimple-streamer.h: Likewise. * gimple.h (struct GTY): Likewise. (is_gimple_resx): Likewise. * gimplify.cc (gimplify_expr): Likewise. (gimplify_init_constructor): Likewise. (omp_construct_selector_matches): Likewise. (gimplify_omp_target_update): Likewise. (gimplify_omp_ordered): Likewise. (gimplify_va_arg_expr): Likewise. * graphite-isl-ast-to-gimple.cc (should_copy_to_new_region): Likewise. * haifa-sched.cc (increase_insn_priority): Likewise. (try_ready): Likewise. (sched_create_recovery_edges): Likewise. * ifcvt.cc (find_if_case_1): Likewise. (find_if_case_2): Likewise. * inchash.h: Likewise. * incpath.cc (add_env_var_paths): Likewise. * input.cc (dump_location_info): Likewise. (assert_loceq): Likewise. (test_lexer_string_locations_concatenation_1): Likewise. (test_lexer_string_locations_concatenation_2): Likewise. (test_lexer_string_locations_concatenation_3): Likewise. * input.h (BUILTINS_LOCATION): Likewise. (class string_concat_db): Likewise. * internal-fn.cc (expand_MUL_OVERFLOW): Likewise. (expand_LOOP_VECTORIZED): Likewise. * ipa-cp.cc (make_pass_ipa_cp): Likewise. * ipa-fnsummary.cc (remap_freqcounting_preds_after_dup): Likewise. (ipa_fn_summary_t::duplicate): Likewise. (make_pass_ipa_fn_summary): Likewise. * ipa-fnsummary.h (enum ipa_hints_vals): Likewise. * ipa-free-lang-data.cc (fld_simplified_type): Likewise. (free_lang_data_in_decl): Likewise. * ipa-inline.cc (compute_inlined_call_time): Likewise. (inline_always_inline_functions): Likewise. * ipa-inline.h (free_growth_caches): Likewise. (inline_account_function_p): Likewise. * ipa-modref.cc (modref_access_analysis::analyze_stmt): Likewise. (modref_eaf_analysis::analyze_ssa_name): Likewise. * ipa-param-manipulation.cc (ipa_param_body_adjustments::mark_dead_statements): Likewise. (ipa_param_body_adjustments::remap_with_debug_expressions): Likewise. * ipa-prop.cc (ipa_set_node_agg_value_chain): Likewise. * ipa-prop.h (IPA_UNDESCRIBED_USE): Likewise. (unadjusted_ptr_and_unit_offset): Likewise. * ipa-reference.cc (make_pass_ipa_reference): Likewise. * ipa-reference.h (GCC_IPA_REFERENCE_H): Likewise. * ipa-split.cc (consider_split): Likewise. * ipa-sra.cc (isra_read_node_info): Likewise. * ipa-utils.h (struct ipa_dfs_info): Likewise. (recursive_call_p): Likewise. (ipa_make_function_pure): Likewise. * ira-build.cc (ira_create_allocno): Likewise. (ira_flattening): Likewise. * ira-color.cc (do_coloring): Likewise. (update_curr_costs): Likewise. * ira-conflicts.cc (process_regs_for_copy): Likewise. * ira-int.h (struct ira_emit_data): Likewise. (ira_prohibited_mode_move_regs): Likewise. (ira_get_dup_out_num): Likewise. (ira_destroy): Likewise. (ira_tune_allocno_costs): Likewise. (ira_implicitly_set_insn_hard_regs): Likewise. (ira_build_conflicts): Likewise. (ira_color): Likewise. * ira-lives.cc (process_bb_node_lives): Likewise. * ira.cc (class ira_spilled_reg_stack_slot): Likewise. (setup_uniform_class_p): Likewise. (def_dominates_uses): Likewise. * ira.h (ira_nullify_asm_goto): Likewise. * langhooks.cc (lhd_post_options): Likewise. * langhooks.h (class substring_loc): Likewise. (struct lang_hooks_for_tree_inlining): Likewise. (struct lang_hooks_for_types): Likewise. (struct lang_hooks): Likewise. * libfuncs.h (synchronize_libfunc): Likewise. * loop-doloop.cc (doloop_condition_get): Likewise. * loop-init.cc (fix_loop_structure): Likewise. * loop-invariant.cc: Likewise. * lower-subreg.h: Likewise. * lra-constraints.cc (curr_insn_transform): Likewise. * lra-int.h (struct lra_insn_reg): Likewise. (lra_undo_inheritance): Likewise. (lra_setup_reload_pseudo_preferenced_hard_reg): Likewise. (lra_split_hard_reg_for): Likewise. (lra_coalesce): Likewise. (lra_final_code_change): Likewise. * lra-spills.cc (lra_final_code_change): Likewise. * lra.cc (lra_process_new_insns): Likewise. * lto-compress.h (struct lto_compression_stream): Likewise. * lto-streamer-out.cc (DFS::DFS_write_tree_body): Likewise. (write_symbol): Likewise. * lto-streamer.h (enum LTO_tags): Likewise. (lto_value_range_error): Likewise. (lto_append_block): Likewise. (lto_streamer_hooks_init): Likewise. (stream_read_tree_ref): Likewise. (lto_prepare_function_for_streaming): Likewise. (select_what_to_stream): Likewise. (omp_lto_input_declare_variant_alt): Likewise. (cl_optimization_stream_in): Likewise. * lto-wrapper.cc (append_compiler_options): Likewise. * machmode.def: Likewise. * machmode.h (struct int_n_data_t): Likewise. * main.cc (main): Likewise. * match.pd: Likewise. * omp-builtins.def (BUILT_IN_GOMP_CRITICAL_NAME_END): Likewise. (BUILT_IN_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT): Likewise. * omp-expand.cc (expand_omp_atomic_fetch_op): Likewise. (make_pass_expand_omp_ssa): Likewise. * omp-low.cc (struct omp_context): Likewise. (struct omp_taskcopy_context): Likewise. (lower_omp): Likewise. * omp-oacc-neuter-broadcast.cc (omp_sese_active_worker_call): Likewise. (mask_name): Likewise. (omp_sese_dump_pars): Likewise. (worker_single_simple): Likewise. * omp-offload.cc (omp_finish_file): Likewise. (execute_oacc_loop_designation): Likewise. * optabs-query.cc (lshift_cheap_p): Likewise. * optc-gen.awk: Likewise. * optc-save-gen.awk: Likewise. * optinfo-emit-json.cc (optrecord_json_writer::optrecord_json_writer): Likewise. * opts-common.cc: Likewise. * output.h (app_enable): Likewise. (output_operand_lossage): Likewise. (insn_current_reference_address): Likewise. (get_insn_template): Likewise. (output_quoted_string): Likewise. * pass_manager.h (struct register_pass_info): Likewise. * plugin.cc: Likewise. * plugin.def (PLUGIN_ANALYZER_INIT): Likewise. * plugin.h (invoke_plugin_callbacks): Likewise. * pointer-query.cc (handle_mem_ref): Likewise. * postreload-gcse.cc (alloc_mem): Likewise. * predict.h (enum prediction): Likewise. (add_reg_br_prob_note): Likewise. * prefix.h: Likewise. * profile.h (get_working_sets): Likewise. * read-md.cc: Likewise. * read-md.h (struct mapping): Likewise. (class md_reader): Likewise. (class noop_reader): Likewise. * read-rtl-function.cc (function_reader::create_function): Likewise. (function_reader::extra_parsing_for_operand_code_0): Likewise. * read-rtl.cc (initialize_iterators): Likewise. * real.cc: Likewise. * real.h (struct real_value): Likewise. (format_helper::format_helper): Likewise. (real_hash): Likewise. (real_can_shorten_arithmetic): Likewise. * recog.cc (struct target_recog): Likewise. (offsettable_nonstrict_memref_p): Likewise. (constrain_operands): Likewise. * recog.h (MAX_RECOG_ALTERNATIVES): Likewise. (which_op_alt): Likewise. (struct insn_gen_fn): Likewise. * reg-notes.def (REG_NOTE): Likewise. * reg-stack.cc: Likewise. * regs.h (reg_is_parm_p): Likewise. * regset.h: Likewise. * reload.cc (push_reload): Likewise. (find_reloads): Likewise. (find_reloads_address_1): Likewise. (find_replacement): Likewise. (refers_to_regno_for_reload_p): Likewise. (refers_to_mem_for_reload_p): Likewise. * reload.h (push_reload): Likewise. (deallocate_reload_reg): Likewise. * reload1.cc (emit_input_reload_insns): Likewise. * reorg.cc (relax_delay_slots): Likewise. * rtl.def (UNKNOWN): Likewise. (SEQUENCE): Likewise. (BARRIER): Likewise. (ASM_OPERANDS): Likewise. (EQ_ATTR_ALT): Likewise. * rtl.h (struct GTY): Likewise. (LABEL_NAME): Likewise. (LABEL_ALT_ENTRY_P): Likewise. (SUBREG_BYTE): Likewise. (get_stack_check_protect): Likewise. (dump_rtx_statistics): Likewise. (unwrap_const_vec_duplicate): Likewise. (subreg_promoted_mode): Likewise. (gen_lowpart_common): Likewise. (operand_subword): Likewise. (immed_wide_int_const): Likewise. (decide_function_section): Likewise. (active_insn_p): Likewise. (delete_related_insns): Likewise. (try_split): Likewise. (val_signbit_known_clear_p): Likewise. (simplifiable_subregs): Likewise. (set_insn_deleted): Likewise. (subreg_get_info): Likewise. (remove_free_EXPR_LIST_node): Likewise. (finish_subregs_of_mode): Likewise. (get_mem_attrs): Likewise. (lookup_constant_def): Likewise. (rtx_to_tree_code): Likewise. (hash_rtx): Likewise. (condjump_in_parallel_p): Likewise. (validate_subreg): Likewise. (make_compound_operation): Likewise. (schedule_ebbs): Likewise. (print_inline_rtx): Likewise. (fixup_args_size_notes): Likewise. (expand_dec): Likewise. (prepare_copy_insn): Likewise. (mark_elimination): Likewise. (valid_mode_changes_for_regno): Likewise. (make_debug_expr_from_rtl): Likewise. (delete_vta_debug_insns): Likewise. (simplify_using_condition): Likewise. (set_insn_locations): Likewise. (fatal_insn_not_found): Likewise. (word_register_operation_p): Likewise. * rtlanal.cc (get_call_fndecl): Likewise. (side_effects_p): Likewise. (subreg_nregs): Likewise. (rtx_cost): Likewise. (canonicalize_condition): Likewise. * rtlanal.h (rtx_properties::try_to_add_note): Likewise. * run-rtl-passes.cc (run_rtl_passes): Likewise. * sanitizer.def (BUILT_IN_ASAN_VERSION_MISMATCH_CHECK): Likewise. * sched-deps.cc (add_dependence_1): Likewise. * sched-ebb.cc (begin_move_insn): Likewise. (add_deps_for_risky_insns): Likewise. (advance_target_bb): Likewise. * sched-int.h (reemit_notes): Likewise. (struct _haifa_insn_data): Likewise. (HID): Likewise. (DEP_CANCELLED): Likewise. (debug_ds): Likewise. (number_in_ready): Likewise. (schedule_ebbs_finish): Likewise. (find_modifiable_mems): Likewise. * sched-rgn.cc (debug_rgn_dependencies): Likewise. * sel-sched-dump.cc (dump_lv_set): Likewise. * sel-sched-dump.h: Likewise. * sel-sched-ir.cc (sel_insn_rtx_cost): Likewise. (setup_id_reg_sets): Likewise. (has_dependence_p): Likewise. (sel_num_cfg_preds_gt_1): Likewise. (bb_ends_ebb_p): Likewise. * sel-sched-ir.h (struct _list_node): Likewise. (struct idata_def): Likewise. (bb_next_bb): Likewise. * sel-sched.cc (vinsn_writes_one_of_regs_p): Likewise. (choose_best_pseudo_reg): Likewise. (verify_target_availability): Likewise. (can_speculate_dep_p): Likewise. (sel_rank_for_schedule): Likewise. * selftest-run-tests.cc (selftest::run_tests): Likewise. * selftest.h (class auto_fix_quotes): Likewise. * shrink-wrap.cc (handle_simple_exit): Likewise. * shrink-wrap.h: Likewise. * simplify-rtx.cc (simplify_context::simplify_associative_operation): Likewise. (simplify_context::simplify_gen_vec_select): Likewise. * spellcheck-tree.h: Likewise. * spellcheck.h: Likewise. * statistics.h (struct function): Likewise. * stmt.cc (conditional_probability): Likewise. * stmt.h: Likewise. * stor-layout.h: Likewise. * streamer-hooks.h: Likewise. * stringpool.h: Likewise. * symtab.cc (symbol_table::change_decl_assembler_name): Likewise. * target.def (HOOK_VECTOR_END): Likewise. (type.): Likewise. * target.h (union cumulative_args_t): Likewise. (by_pieces_ninsns): Likewise. (class predefined_function_abi): Likewise. * targhooks.cc (default_translate_mode_attribute): Likewise. * timevar.def: Likewise. * timevar.h (class timer): Likewise. * toplev.h (enable_rtl_dump_file): Likewise. * trans-mem.cc (collect_bb2reg): Likewise. * tree-call-cdce.cc (gen_conditions_for_pow): Likewise. * tree-cfg.cc (remove_bb): Likewise. (verify_gimple_debug): Likewise. (remove_edge_and_dominated_blocks): Likewise. (push_fndecl): Likewise. * tree-cfgcleanup.h (GCC_TREE_CFGCLEANUP_H): Likewise. * tree-complex.cc (expand_complex_multiplication): Likewise. (expand_complex_div_straight): Likewise. * tree-core.h (enum tree_index): Likewise. (enum operand_equal_flag): Likewise. * tree-eh.cc (honor_protect_cleanup_actions): Likewise. * tree-if-conv.cc (if_convertible_gimple_assign_stmt_p): Likewise. * tree-inline.cc (initialize_inlined_parameters): Likewise. * tree-inline.h (force_value_to_type): Likewise. * tree-nested.cc (get_chain_decl): Likewise. (walk_all_functions): Likewise. * tree-object-size.h: Likewise. * tree-outof-ssa.cc: Likewise. * tree-parloops.cc (create_parallel_loop): Likewise. * tree-pretty-print.cc (print_generic_expr_to_str): Likewise. (dump_generic_node): Likewise. * tree-profile.cc (tree_profiling): Likewise. * tree-sra.cc (maybe_add_sra_candidate): Likewise. * tree-ssa-address.cc: Likewise. * tree-ssa-alias.cc: Likewise. * tree-ssa-alias.h (ao_ref::max_size_known_p): Likewise. (dump_alias_stats): Likewise. * tree-ssa-ccp.cc: Likewise. * tree-ssa-coalesce.h: Likewise. * tree-ssa-live.cc (remove_unused_scope_block_p): Likewise. * tree-ssa-loop-manip.cc (copy_phi_node_args): Likewise. * tree-ssa-loop-unswitch.cc: Likewise. * tree-ssa-math-opts.cc: Likewise. * tree-ssa-operands.cc (class operands_scanner): Likewise. * tree-ssa-pre.cc: Likewise. * tree-ssa-reassoc.cc (optimize_ops_list): Likewise. (debug_range_entry): Likewise. * tree-ssa-sccvn.cc (eliminate_dom_walker::eliminate_stmt): Likewise. * tree-ssa-sccvn.h (TREE_SSA_SCCVN_H): Likewise. * tree-ssa-scopedtables.cc (add_expr_commutative): Likewise. (equal_mem_array_ref_p): Likewise. * tree-ssa-strlen.cc (is_strlen_related_p): Likewise. * tree-ssa-strlen.h (get_range_strlen_dynamic): Likewise. * tree-ssa-tail-merge.cc (stmt_local_def): Likewise. * tree-ssa-ter.h: Likewise. * tree-ssa-threadupdate.h (enum bb_dom_status): Likewise. * tree-streamer-in.cc (lto_input_ts_block_tree_pointers): Likewise. * tree-streamer-out.cc (pack_ts_block_value_fields): Likewise. (write_ts_block_tree_pointers): Likewise. * tree-streamer.h (struct streamer_tree_cache_d): Likewise. (streamer_read_tree_bitfields): Likewise. (streamer_write_integer_cst): Likewise. * tree-vect-patterns.cc (apply_binop_and_append_stmt): Likewise. (vect_synth_mult_by_constant): Likewise. * tree-vect-stmts.cc (vectorizable_operation): Likewise. * tree-vectorizer.cc: Likewise. * tree-vectorizer.h (class auto_purge_vect_location): Likewise. (vect_update_inits_of_drs): Likewise. (vect_get_mask_type_for_stmt): Likewise. (vect_rgroup_iv_might_wrap_p): Likewise. (cse_and_gimplify_to_preheader): Likewise. (vect_free_slp_tree): Likewise. (vect_pattern_recog): Likewise. (vect_stmt_dominates_stmt_p): Likewise. * tree.cc (initialize_tree_contains_struct): Likewise. (need_assembler_name_p): Likewise. (type_with_interoperable_signedness): Likewise. * tree.def (SWITCH_EXPR): Likewise. * tree.h (TYPE_SYMTAB_ADDRESS): Likewise. (poly_int_tree_p): Likewise. (inlined_function_outer_scope_p): Likewise. (tree_code_for_canonical_type_merging): Likewise. * value-prof.cc: Likewise. * value-prof.h (get_nth_most_common_value): Likewise. (find_func_by_profile_id): Likewise. * value-range.cc (vrp_operand_equal_p): Likewise. * value-range.h: Likewise. * var-tracking.cc: Likewise. * varasm.cc (default_function_section): Likewise. (function_section_1): Likewise. (assemble_variable): Likewise. (handle_vtv_comdat_section): Likewise. * vec.h (struct vec_prefix): Likewise. * vmsdbgout.cc (full_name): Likewise. * vtable-verify.cc: Likewise. * vtable-verify.h (struct vtv_graph_node): Likewise. * xcoffout.cc: Likewise. * xcoffout.h (DEBUG_SYMS_TEXT): Likewise. gcc/ada/ChangeLog: * Make-generated.in: Rename .c names to .cc. * adaint.c: Likewise. * ctrl_c.c (dummy_handler): Likewise. * gcc-interface/Makefile.in: Likewise. * gcc-interface/config-lang.in: Likewise. * gcc-interface/decl.cc (concat_name): Likewise. (init_gnat_decl): Likewise. * gcc-interface/gigi.h (concat_name): Likewise. (init_gnat_utils): Likewise. (build_call_raise_range): Likewise. (gnat_mark_addressable): Likewise. (gnat_protect_expr): Likewise. (gnat_rewrite_reference): Likewise. * gcc-interface/lang-specs.h (ADA_DUMPS_OPTIONS): Likewise. * gcc-interface/utils.cc (GTY): Likewise. (add_deferred_type_context): Likewise. (init_gnat_utils): Likewise. * gcc-interface/utils2.cc (gnat_stable_expr_p): Likewise. (gnat_protect_expr): Likewise. (gnat_stabilize_reference_1): Likewise. (gnat_rewrite_reference): Likewise. * gsocket.h: Likewise. * init.cc (__gnat_error_handler): Likewise. * libgnarl/s-intman.ads: Likewise. * libgnarl/s-osinte__android.ads: Likewise. * libgnarl/s-osinte__darwin.ads: Likewise. * libgnarl/s-osinte__hpux.ads: Likewise. * libgnarl/s-osinte__linux.ads: Likewise. * libgnarl/s-osinte__qnx.ads: Likewise. * libgnarl/s-taskin.ads: Likewise. * rtfinal.cc: Likewise. * s-oscons-tmplt.c (CND): Likewise. * set_targ.ads: Likewise. gcc/analyzer/ChangeLog: * analyzer.cc (is_special_named_call_p): Rename .c names to .cc. (is_named_call_p): Likewise. * region-model-asm.cc (deterministic_p): Likewise. * region.cc (field_region::get_relative_concrete_offset): Likewise. * sm-malloc.cc (method_p): Likewise. * supergraph.cc (superedge::dump_dot): Likewise. gcc/c-family/ChangeLog: * c-ada-spec.cc: Rename .c names to .cc. * c-ada-spec.h: Likewise. * c-common.cc (c_build_vec_convert): Likewise. (warning_candidate_p): Likewise. * c-common.h (enum rid): Likewise. (build_real_imag_expr): Likewise. (finish_label_address_expr): Likewise. (c_get_substring_location): Likewise. (c_build_bind_expr): Likewise. (conflict_marker_get_final_tok_kind): Likewise. (c_parse_error): Likewise. (check_missing_format_attribute): Likewise. (invalid_array_size_error): Likewise. (warn_for_multistatement_macros): Likewise. (build_attr_access_from_parms): Likewise. * c-cppbuiltin.cc (c_cpp_builtins): Likewise. * c-format.cc: Likewise. * c-gimplify.cc (c_gimplify_expr): Likewise. * c-indentation.h: Likewise. * c-objc.h (objc_prop_attr_kind_for_rid): Likewise. * c-omp.cc (c_omp_predetermined_mapping): Likewise. * c-opts.cc (c_common_post_options): Likewise. (set_std_cxx23): Likewise. * c-pragma.cc (handle_pragma_redefine_extname): Likewise. * c-pretty-print.h: Likewise. gcc/c/ChangeLog: * Make-lang.in: Rename .c names to .cc. * c-convert.cc: Likewise. * c-decl.cc (struct lang_identifier): Likewise. (pop_scope): Likewise. (finish_decl): Likewise. * c-objc-common.h (GCC_C_OBJC_COMMON): Likewise. * c-parser.cc (c_parser_skip_to_end_of_block_or_statement): Likewise. * c-parser.h (GCC_C_PARSER_H): Likewise. * c-tree.h (c_keyword_starts_typename): Likewise. (finish_declspecs): Likewise. (c_get_alias_set): Likewise. (enum c_oracle_request): Likewise. (tag_exists_p): Likewise. (set_c_expr_source_range): Likewise. * c-typeck.cc (c_common_type): Likewise. (c_finish_omp_clauses): Likewise. * config-lang.in: Likewise. gcc/cp/ChangeLog: * Make-lang.in: Rename .c names to .cc. * config-lang.in: Likewise. * constexpr.cc (cxx_eval_constant_expression): Likewise. * coroutines.cc (morph_fn_to_coro): Likewise. * cp-gimplify.cc (cp_gimplify_expr): Likewise. * cp-lang.cc (struct lang_hooks): Likewise. (get_template_argument_pack_elems_folded): Likewise. * cp-objcp-common.cc (cp_tree_size): Likewise. (cp_unit_size_without_reusable_padding): Likewise. (pop_file_scope): Likewise. (cp_pushdecl): Likewise. * cp-objcp-common.h (GCC_CP_OBJCP_COMMON): Likewise. (cxx_simulate_record_decl): Likewise. * cp-tree.h (struct named_label_entry): Likewise. (current_function_return_value): Likewise. (more_aggr_init_expr_args_p): Likewise. (get_function_version_dispatcher): Likewise. (common_enclosing_class): Likewise. (strip_fnptr_conv): Likewise. (current_decl_namespace): Likewise. (do_aggregate_paren_init): Likewise. (cp_check_const_attributes): Likewise. (qualified_name_lookup_error): Likewise. (generic_targs_for): Likewise. (mark_exp_read): Likewise. (is_global_friend): Likewise. (maybe_reject_flexarray_init): Likewise. (module_token_lang): Likewise. (handle_module_option): Likewise. (literal_integer_zerop): Likewise. (build_extra_args): Likewise. (build_if_nonnull): Likewise. (maybe_check_overriding_exception_spec): Likewise. (finish_omp_target_clauses): Likewise. (maybe_warn_zero_as_null_pointer_constant): Likewise. (cxx_print_error_function): Likewise. (decl_in_std_namespace_p): Likewise. (merge_exception_specifiers): Likewise. (mangle_module_global_init): Likewise. (cxx_block_may_fallthru): Likewise. (fold_builtin_source_location): Likewise. (enum cp_oracle_request): Likewise. (subsumes): Likewise. (cp_finish_injected_record_type): Likewise. (vtv_build_vtable_verify_fndecl): Likewise. (cp_tree_c_finish_parsing): Likewise. * cvt.cc (diagnose_ref_binding): Likewise. (convert_to_void): Likewise. (convert_force): Likewise. (type_promotes_to): Likewise. * decl.cc (make_unbound_class_template_raw): Likewise. (cxx_init_decl_processing): Likewise. (check_class_member_definition_namespace): Likewise. (cxx_maybe_build_cleanup): Likewise. * decl2.cc (maybe_emit_vtables): Likewise. * error.cc (dump_function_name): Likewise. * init.cc (is_class_type): Likewise. (build_new_1): Likewise. * lang-specs.h: Likewise. * method.cc (make_alias_for_thunk): Likewise. * module.cc (specialization_add): Likewise. (module_state::read_cluster): Likewise. * name-lookup.cc (check_extern_c_conflict): Likewise. * name-lookup.h (struct cxx_binding): Likewise. * parser.cc (cp_parser_identifier): Likewise. * parser.h (struct cp_parser): Likewise. * pt.cc (has_value_dependent_address): Likewise. (push_tinst_level_loc): Likewise. * semantics.cc (finish_omp_clauses): Likewise. (finish_omp_atomic): Likewise. * tree.cc (cp_save_expr): Likewise. (cp_free_lang_data): Likewise. * typeck.cc (cp_common_type): Likewise. (strip_array_domain): Likewise. (rationalize_conditional_expr): Likewise. (check_return_expr): Likewise. * vtable-class-hierarchy.cc: Likewise. gcc/d/ChangeLog: * d-gimplify.cc: Rename .c names to .cc. * d-incpath.cc: Likewise. * lang-specs.h: Likewise. gcc/fortran/ChangeLog: * check.cc (gfc_check_all_any): Rename .c names to .cc. * class.cc (find_intrinsic_vtab): Likewise. * config-lang.in: Likewise. * cpp.cc (cpp_define_builtins): Likewise. * data.cc (get_array_index): Likewise. * decl.cc (match_clist_expr): Likewise. (get_proc_name): Likewise. (gfc_verify_c_interop_param): Likewise. (gfc_get_pdt_instance): Likewise. (gfc_match_formal_arglist): Likewise. (gfc_get_type_attr_spec): Likewise. * dependency.cc: Likewise. * error.cc (gfc_format_decoder): Likewise. * expr.cc (check_restricted): Likewise. (gfc_build_default_init_expr): Likewise. * f95-lang.cc: Likewise. * gfc-internals.texi: Likewise. * gfortran.h (enum match): Likewise. (enum procedure_type): Likewise. (enum oacc_routine_lop): Likewise. (gfc_get_pdt_instance): Likewise. (gfc_end_source_files): Likewise. (gfc_mpz_set_hwi): Likewise. (gfc_get_option_string): Likewise. (gfc_find_sym_in_expr): Likewise. (gfc_errors_to_warnings): Likewise. (gfc_real_4_kind): Likewise. (gfc_free_finalizer): Likewise. (gfc_sym_get_dummy_args): Likewise. (gfc_check_intrinsic_standard): Likewise. (gfc_free_case_list): Likewise. (gfc_resolve_oacc_routines): Likewise. (gfc_check_vardef_context): Likewise. (gfc_free_association_list): Likewise. (gfc_implicit_pure_function): Likewise. (gfc_ref_dimen_size): Likewise. (gfc_compare_actual_formal): Likewise. (gfc_resolve_wait): Likewise. (gfc_dt_upper_string): Likewise. (gfc_generate_module_code): Likewise. (gfc_delete_bbt): Likewise. (debug): Likewise. (gfc_build_block_ns): Likewise. (gfc_dep_difference): Likewise. (gfc_invalid_null_arg): Likewise. (gfc_is_finalizable): Likewise. (gfc_fix_implicit_pure): Likewise. (gfc_is_size_zero_array): Likewise. (gfc_is_reallocatable_lhs): Likewise. * gfortranspec.cc: Likewise. * interface.cc (compare_actual_expr): Likewise. * intrinsic.cc (add_functions): Likewise. * iresolve.cc (gfc_resolve_matmul): Likewise. (gfc_resolve_alarm_sub): Likewise. * iso-c-binding.def: Likewise. * lang-specs.h: Likewise. * libgfortran.h (GFC_STDERR_UNIT_NUMBER): Likewise. * match.cc (gfc_match_label): Likewise. (gfc_match_symbol): Likewise. (match_derived_type_spec): Likewise. (copy_ts_from_selector_to_associate): Likewise. * match.h (gfc_match_call): Likewise. (gfc_get_common): Likewise. (gfc_match_omp_end_single): Likewise. (gfc_match_volatile): Likewise. (gfc_match_bind_c): Likewise. (gfc_match_literal_constant): Likewise. (gfc_match_init_expr): Likewise. (gfc_match_array_constructor): Likewise. (gfc_match_end_interface): Likewise. (gfc_match_print): Likewise. (gfc_match_expr): Likewise. * matchexp.cc (next_operator): Likewise. * mathbuiltins.def: Likewise. * module.cc (free_true_name): Likewise. * openmp.cc (gfc_resolve_omp_parallel_blocks): Likewise. (gfc_omp_save_and_clear_state): Likewise. * parse.cc (parse_union): Likewise. (set_syms_host_assoc): Likewise. * resolve.cc (resolve_actual_arglist): Likewise. (resolve_elemental_actual): Likewise. (check_host_association): Likewise. (resolve_typebound_function): Likewise. (resolve_typebound_subroutine): Likewise. (gfc_resolve_expr): Likewise. (resolve_assoc_var): Likewise. (resolve_typebound_procedures): Likewise. (resolve_equivalence_derived): Likewise. * simplify.cc (simplify_bound): Likewise. * symbol.cc (gfc_set_default_type): Likewise. (gfc_add_ext_attribute): Likewise. * target-memory.cc (gfc_target_interpret_expr): Likewise. * target-memory.h (gfc_target_interpret_expr): Likewise. * trans-array.cc (gfc_get_cfi_dim_sm): Likewise. (gfc_conv_shift_descriptor_lbound): Likewise. (gfc_could_be_alias): Likewise. (gfc_get_dataptr_offset): Likewise. * trans-const.cc: Likewise. * trans-decl.cc (trans_function_start): Likewise. (gfc_trans_deferred_vars): Likewise. (generate_local_decl): Likewise. (gfc_generate_function_code): Likewise. * trans-expr.cc (gfc_vptr_size_get): Likewise. (gfc_trans_class_array_init_assign): Likewise. (POWI_TABLE_SIZE): Likewise. (gfc_conv_procedure_call): Likewise. (gfc_trans_arrayfunc_assign): Likewise. * trans-intrinsic.cc (gfc_conv_intrinsic_len): Likewise. (gfc_conv_intrinsic_loc): Likewise. (conv_intrinsic_event_query): Likewise. * trans-io.cc (gfc_build_st_parameter): Likewise. * trans-openmp.cc (gfc_omp_check_optional_argument): Likewise. (gfc_omp_unshare_expr_r): Likewise. (gfc_trans_omp_array_section): Likewise. (gfc_trans_omp_clauses): Likewise. * trans-stmt.cc (trans_associate_var): Likewise. (gfc_trans_deallocate): Likewise. * trans-stmt.h (gfc_trans_class_init_assign): Likewise. (gfc_trans_deallocate): Likewise. (gfc_trans_oacc_declare): Likewise. * trans-types.cc: Likewise. * trans-types.h (enum gfc_packed): Likewise. * trans.cc (N_): Likewise. (trans_code): Likewise. * trans.h (gfc_build_compare_string): Likewise. (gfc_conv_expr_type): Likewise. (gfc_trans_deferred_vars): Likewise. (getdecls): Likewise. (gfc_get_array_descr_info): Likewise. (gfc_omp_firstprivatize_type_sizes): Likewise. (GTY): Likewise. gcc/go/ChangeLog: * config-lang.in: Rename .c names to .cc. * go-backend.cc: Likewise. * go-lang.cc: Likewise. * gospec.cc: Likewise. * lang-specs.h: Likewise. gcc/jit/ChangeLog: * config-lang.in: Rename .c names to .cc. * docs/_build/texinfo/libgccjit.texi: Likewise. * docs/internals/index.rst: Likewise. * jit-builtins.cc (builtins_manager::make_builtin_function): Likewise. * jit-playback.cc (fold_const_var): Likewise. (playback::context::~context): Likewise. (new_field): Likewise. (new_bitfield): Likewise. (new_compound_type): Likewise. (playback::compound_type::set_fields): Likewise. (global_set_init_rvalue): Likewise. (load_blob_in_ctor): Likewise. (new_global_initialized): Likewise. (double>): Likewise. (new_string_literal): Likewise. (as_truth_value): Likewise. (build_call): Likewise. (playback::context::build_cast): Likewise. (new_array_access): Likewise. (new_field_access): Likewise. (dereference): Likewise. (postprocess): Likewise. (add_jump): Likewise. (add_switch): Likewise. (build_goto_operands): Likewise. (playback::context::read_dump_file): Likewise. (init_types): Likewise. * jit-recording.cc (recording::context::get_int_type): Likewise. * jit-recording.h: Likewise. * libgccjit.cc (compatible_types): Likewise. (gcc_jit_context_acquire): Likewise. (gcc_jit_context_release): Likewise. (gcc_jit_context_new_child_context): Likewise. (gcc_jit_type_as_object): Likewise. (gcc_jit_context_get_type): Likewise. (gcc_jit_context_get_int_type): Likewise. (gcc_jit_type_get_pointer): Likewise. (gcc_jit_type_get_const): Likewise. (gcc_jit_type_get_volatile): Likewise. (gcc_jit_type_dyncast_array): Likewise. (gcc_jit_type_is_bool): Likewise. (gcc_jit_type_is_pointer): Likewise. (gcc_jit_type_is_integral): Likewise. (gcc_jit_type_dyncast_vector): Likewise. (gcc_jit_type_is_struct): Likewise. (gcc_jit_vector_type_get_num_units): Likewise. (gcc_jit_vector_type_get_element_type): Likewise. (gcc_jit_type_unqualified): Likewise. (gcc_jit_type_dyncast_function_ptr_type): Likewise. (gcc_jit_function_type_get_return_type): Likewise. (gcc_jit_function_type_get_param_count): Likewise. (gcc_jit_function_type_get_param_type): Likewise. (gcc_jit_context_new_array_type): Likewise. (gcc_jit_context_new_field): Likewise. (gcc_jit_field_as_object): Likewise. (gcc_jit_context_new_struct_type): Likewise. (gcc_jit_struct_as_type): Likewise. (gcc_jit_struct_set_fields): Likewise. (gcc_jit_struct_get_field_count): Likewise. (gcc_jit_context_new_union_type): Likewise. (gcc_jit_context_new_function_ptr_type): Likewise. (gcc_jit_param_as_rvalue): Likewise. (gcc_jit_context_new_function): Likewise. (gcc_jit_function_get_return_type): Likewise. (gcc_jit_function_dump_to_dot): Likewise. (gcc_jit_block_get_function): Likewise. (gcc_jit_global_set_initializer_rvalue): Likewise. (gcc_jit_rvalue_get_type): Likewise. (gcc_jit_context_new_rvalue_from_int): Likewise. (gcc_jit_context_one): Likewise. (gcc_jit_context_new_rvalue_from_double): Likewise. (gcc_jit_context_null): Likewise. (gcc_jit_context_new_string_literal): Likewise. (valid_binary_op_p): Likewise. (gcc_jit_context_new_binary_op): Likewise. (gcc_jit_context_new_comparison): Likewise. (gcc_jit_context_new_call): Likewise. (is_valid_cast): Likewise. (gcc_jit_context_new_cast): Likewise. (gcc_jit_object_get_context): Likewise. (gcc_jit_object_get_debug_string): Likewise. (gcc_jit_lvalue_access_field): Likewise. (gcc_jit_rvalue_access_field): Likewise. (gcc_jit_rvalue_dereference_field): Likewise. (gcc_jit_rvalue_dereference): Likewise. (gcc_jit_lvalue_get_address): Likewise. (gcc_jit_lvalue_set_tls_model): Likewise. (gcc_jit_lvalue_set_link_section): Likewise. (gcc_jit_function_new_local): Likewise. (gcc_jit_block_add_eval): Likewise. (gcc_jit_block_add_assignment): Likewise. (is_bool): Likewise. (gcc_jit_block_end_with_conditional): Likewise. (gcc_jit_block_add_comment): Likewise. (gcc_jit_block_end_with_jump): Likewise. (gcc_jit_block_end_with_return): Likewise. (gcc_jit_block_end_with_void_return): Likewise. (case_range_validator::case_range_validator): Likewise. (case_range_validator::validate): Likewise. (case_range_validator::get_wide_int): Likewise. (gcc_jit_block_end_with_switch): Likewise. (gcc_jit_context_set_str_option): Likewise. (gcc_jit_context_set_int_option): Likewise. (gcc_jit_context_set_bool_option): Likewise. (gcc_jit_context_set_bool_allow_unreachable_blocks): Likewise. (gcc_jit_context_set_bool_use_external_driver): Likewise. (gcc_jit_context_add_command_line_option): Likewise. (gcc_jit_context_add_driver_option): Likewise. (gcc_jit_context_enable_dump): Likewise. (gcc_jit_context_compile): Likewise. (gcc_jit_context_compile_to_file): Likewise. (gcc_jit_context_set_logfile): Likewise. (gcc_jit_context_dump_reproducer_to_file): Likewise. (gcc_jit_context_get_first_error): Likewise. (gcc_jit_context_get_last_error): Likewise. (gcc_jit_result_get_code): Likewise. (gcc_jit_result_get_global): Likewise. (gcc_jit_rvalue_set_bool_require_tail_call): Likewise. (gcc_jit_type_get_aligned): Likewise. (gcc_jit_type_get_vector): Likewise. (gcc_jit_function_get_address): Likewise. (gcc_jit_version_patchlevel): Likewise. (gcc_jit_block_add_extended_asm): Likewise. (gcc_jit_extended_asm_as_object): Likewise. (gcc_jit_extended_asm_set_volatile_flag): Likewise. (gcc_jit_extended_asm_set_inline_flag): Likewise. (gcc_jit_extended_asm_add_output_operand): Likewise. (gcc_jit_extended_asm_add_input_operand): Likewise. (gcc_jit_extended_asm_add_clobber): Likewise. * notes.txt: Likewise. gcc/lto/ChangeLog: * config-lang.in: Rename .c names to .cc. * lang-specs.h: Likewise. * lto-common.cc (gimple_register_canonical_type_1): Likewise. * lto-common.h: Likewise. * lto-dump.cc (lto_main): Likewise. * lto-lang.cc (handle_fnspec_attribute): Likewise. (lto_getdecls): Likewise. (lto_init): Likewise. * lto.cc (lto_main): Likewise. * lto.h: Likewise. gcc/objc/ChangeLog: * Make-lang.in: Rename .c names to .cc. * config-lang.in: Likewise. * lang-specs.h: Likewise. * objc-act.cc (objc_build_component_ref): Likewise. (objc_copy_binfo): Likewise. (lookup_method_in_hash_lists): Likewise. (objc_finish_foreach_loop): Likewise. * objc-act.h (objc_common_init_ts): Likewise. * objc-gnu-runtime-abi-01.cc: Likewise. * objc-lang.cc (struct lang_hooks): Likewise. * objc-map.cc: Likewise. * objc-next-runtime-abi-01.cc (generate_objc_symtab_decl): Likewise. * objc-runtime-shared-support.cc: Likewise. * objc-runtime-shared-support.h (build_protocol_initializer): Likewise. gcc/objcp/ChangeLog: * Make-lang.in: Rename .c names to .cc. * config-lang.in: Likewise. * lang-specs.h: Likewise. * objcp-decl.cc (objcp_end_compound_stmt): Likewise. * objcp-lang.cc (struct lang_hooks): Likewise. gcc/po/ChangeLog: * EXCLUDES: Rename .c names to .cc. libcpp/ChangeLog: * Makefile.in: Rename .c names to .cc. * charset.cc (convert_escape): Likewise. * directives.cc (directive_diagnostics): Likewise. (_cpp_handle_directive): Likewise. (lex_macro_node): Likewise. * include/cpplib.h (struct _cpp_file): Likewise. (PURE_ZERO): Likewise. (cpp_defined): Likewise. (cpp_error_at): Likewise. (cpp_forall_identifiers): Likewise. (cpp_compare_macros): Likewise. (cpp_get_converted_source): Likewise. (cpp_read_state): Likewise. (cpp_directive_only_process): Likewise. (struct cpp_decoded_char): Likewise. * include/line-map.h (enum lc_reason): Likewise. (enum location_aspect): Likewise. * include/mkdeps.h: Likewise. * init.cc (cpp_destroy): Likewise. (cpp_finish): Likewise. * internal.h (struct cpp_reader): Likewise. (_cpp_defined_macro_p): Likewise. (_cpp_backup_tokens_direct): Likewise. (_cpp_destroy_hashtable): Likewise. (_cpp_has_header): Likewise. (_cpp_expand_op_stack): Likewise. (_cpp_commit_buff): Likewise. (_cpp_restore_special_builtin): Likewise. (_cpp_bracket_include): Likewise. (_cpp_replacement_text_len): Likewise. (ufputs): Likewise. * line-map.cc (linemap_macro_loc_to_exp_point): Likewise. (linemap_check_files_exited): Likewise. (line_map_new_raw): Likewise. * traditional.cc (enum ls): Likewise.
2022-01-13vect: Add bias parameter for partial vectorizationRobin Dapp1-0/+10
This introduces a bias parameter for the len_load/len_store ifns as well as optabs that is meant to distinguish between Power and s390 variants. PowerPC's instructions require a bias of 0, while in s390's case vll/vstl do not support lengths of zero bytes and a bias of -1 must be used. gcc/ChangeLog: * internal-fn.c (expand_partial_load_optab_fn): Add bias. (expand_partial_store_optab_fn): Likewise. (internal_len_load_store_bias): New function. * internal-fn.h (VECT_PARTIAL_BIAS_UNSUPPORTED): New define. (internal_len_load_store_bias): New function. * tree-vect-loop-manip.c (vect_set_loop_controls_directly): Set bias. (vect_set_loop_condition_partial_vectors): Add header_seq parameter. * tree-vect-loop.c (vect_verify_loop_lens): Verify bias. (vect_estimate_min_profitable_iters): Account for bias. (vect_get_loop_len): Add bias-adjusted length. * tree-vect-stmts.c (vectorizable_store): Use. (vectorizable_load): Use. * tree-vectorizer.h (struct rgroup_controls): Add bias-adjusted length. (LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS): New macro. * config/rs6000/vsx.md: Use const0 bias predicate. * doc/md.texi: Document bias value.
2022-01-03Update copyright years.Jakub Jelinek1-1/+1
2021-12-03sve: combine nested if predicatesTamar Christina1-0/+9
The following example void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n) { for (int i = 0; i < n; i++) { float a = x[i]; float b = y[i]; if (a > b) { z0[i] = a + b; if (a > c) { z1[i] = a - b; } } } } generates currently: ptrue p3.b, all ld1w z1.s, p1/z, [x2, x5, lsl 2] ld1w z2.s, p1/z, [x3, x5, lsl 2] fcmgt p0.s, p3/z, z1.s, z0.s fcmgt p2.s, p1/z, z1.s, z2.s fcmgt p0.s, p0/z, z1.s, z2.s and p0.b, p0/z, p1.b, p1.b The conditions for a > b and a > c become separate comparisons. After this patch we generate: ld1w z1.s, p0/z, [x2, x5, lsl 2] ld1w z2.s, p0/z, [x3, x5, lsl 2] fcmgt p1.s, p0/z, z1.s, z2.s fcmgt p1.s, p1/z, z1.s, z0.s Where the condition a > b && a > c are folded by using the predicate result of the previous compare and thus allows the removal of one of the compares. When never a mask is being generated from an BIT_AND we mask the operands of the and instead and then just AND the result. This allows us to be able to CSE the masks and generate the right combination. However because re-assoc will try to re-order the masks in the & we have to now perform a small local CSE on the vectorized loop is vectorization is successful. Note: This patch series is working incrementally towards generating the most efficient code for this and other loops in small steps. gcc/ChangeLog: * tree-vect-stmts.c (prepare_load_store_mask): Rename to... (prepare_vec_mask): ...This and record operations that have already been masked. (vectorizable_call): Use it. (vectorizable_operation): Likewise. (vectorizable_store): Likewise. (vectorizable_load): Likewise. * tree-vectorizer.h (class _loop_vec_info): Add vec_cond_masked_set. (vec_cond_masked_set_type, tree_cond_mask_hash): New. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/pred-combine-and.c: New test.
2021-11-30vect: Fix vect_is_reductionRichard Sandiford1-2/+1
The current definition of vect_is_reduction (provided for target costing) misses some pattern statements. gcc/ * tree-vectorizer.h (vect_is_reduction): Use STMT_VINFO_REDUC_IDX. gcc/testsuite/ * gcc.target/aarch64/sve/cost_model_13.c: New test.
2021-11-30vect: Make reduction code handle callsRichard Sandiford1-4/+6
This patch extends the reduction code to handle calls. So far it's a structural change only; a later patch adds support for specific function reductions. Most of the patch consists of using code_helper and gimple_match_op to describe the reduction operations. The other main change is that vectorizable_call now needs to handle fully-predicated reductions. There are some new functions that are provided for ABI completeness and aren't currently used: first_commutative_argument commutative_ternary_op_p 1- and 3-argument forms of gimple_build gcc/ * builtins.h (associated_internal_fn): Declare overload that takes a (combined_cfn, return type) pair. * builtins.c (associated_internal_fn): Split new overload out of original fndecl version. Also provide an overload that takes a (combined_cfn, return type) pair. * internal-fn.h (commutative_binary_fn_p): Declare. (commutative_ternary_fn_p): Likewise. (associative_binary_fn_p): Likewise. * internal-fn.c (commutative_binary_fn_p, commutative_ternary_fn_p): New functions, split out from... (first_commutative_argument): ...here. (associative_binary_fn_p): New function. * gimple-match.h (code_helper): Add a constructor that takes internal functions. (commutative_binary_op_p): Declare. (commutative_ternary_op_p): Likewise. (first_commutative_argument): Likewise. (associative_binary_op_p): Likewise. (canonicalize_code): Likewise. (directly_supported_p): Likewise. (get_conditional_internal_fn): Likewise. (gimple_build): New overloads that takes a code_helper. * gimple-fold.c (gimple_build): Likewise. * gimple-match-head.c (commutative_binary_op_p): New function. (commutative_ternary_op_p): Likewise. (first_commutative_argument): Likewise. (associative_binary_op_p): Likewise. (canonicalize_code): Likewise. (directly_supported_p): Likewise. (get_conditional_internal_fn): Likewise. * tree-vectorizer.h: Include gimple-match.h. (neutral_op_for_reduction): Take a code_helper instead of a tree_code. (needs_fold_left_reduction_p): Likewise. (reduction_fn_for_scalar_code): Likewise. (vect_can_vectorize_without_simd_p): Declare a nNew overload that takes a code_helper. * tree-vect-loop.c: Include case-cfn-macros.h. (fold_left_reduction_fn): Take a code_helper instead of a tree_code. (reduction_fn_for_scalar_code): Likewise. (neutral_op_for_reduction): Likewise. (needs_fold_left_reduction_p): Likewise. (use_mask_by_cond_expr_p): Likewise. (build_vect_cond_expr): Likewise. (vect_create_partial_epilog): Likewise. Use gimple_build rather than gimple_build_assign. (check_reduction_path): Handle calls and operate on code_helpers rather than tree_codes. (vect_is_simple_reduction): Likewise. (vect_model_reduction_cost): Likewise. (vect_find_reusable_accumulator): Likewise. (vect_create_epilog_for_reduction): Likewise. (vect_transform_cycle_phi): Likewise. (vectorizable_reduction): Likewise. Make more use of lane_reduc_code_p. (vect_transform_reduction): Use gimple_extract_op but expect a tree_code for now. (vect_can_vectorize_without_simd_p): New overload that takes a code_helper. * tree-vect-stmts.c (vectorizable_call): Handle reductions in fully-masked loops. * tree-vect-patterns.c (vect_mark_pattern_stmts): Use gimple_extract_op when updating STMT_VINFO_REDUC_IDX.
2021-11-10AArch64: do not keep negated mask and inverse mask live at the same timeTamar Christina1-3/+7
The following example: void f11(double * restrict z, double * restrict w, double * restrict x, double * restrict y, int n) { for (int i = 0; i < n; i++) { z[i] = (w[i] > 0) ? w[i] : y[i]; } } Generates currently: ptrue p2.b, all ld1d z0.d, p0/z, [x1, x2, lsl 3] fcmgt p1.d, p2/z, z0.d, #0.0 bic p3.b, p2/z, p0.b, p1.b ld1d z1.d, p3/z, [x3, x2, lsl 3] and after the previous patches generates: ptrue p3.b, all ld1d z0.d, p0/z, [x1, x2, lsl 3] fcmgt p1.d, p0/z, z0.d, #0.0 fcmgt p2.d, p3/z, z0.d, #0.0 not p1.b, p0/z, p1.b ld1d z1.d, p1/z, [x3, x2, lsl 3] where a duplicate comparison is performed for w[i] > 0. This is because in the vectorizer we're emitting a comparison for both a and ~a where we just need to emit one of them and invert the other. After this patch we generate: ld1d z0.d, p0/z, [x1, x2, lsl 3] fcmgt p1.d, p0/z, z0.d, #0.0 mov p2.b, p1.b not p1.b, p0/z, p1.b ld1d z1.d, p1/z, [x3, x2, lsl 3] In order to perform the check I have to fully expand the NOT stmts when recording them as the SSA names for the top level expressions differ but their arguments don't. e.g. in _31 = ~_34 the value of _34 differs but not the operands in _34. But we only do this when the operation is an ordered one because mixing ordered and unordered expressions can lead to de-optimized code. Note: This patch series is working incrementally towards generating the most efficient code for this and other loops in small steps. The mov is created by postreload when it does a late CSE. gcc/ChangeLog: * tree-vectorizer.h (struct scalar_cond_masked_key): Add inverted_p. (default_hash_traits<scalar_conf_masked_key>): Likewise. * tree-vect-stmts.c (vectorizable_condition): Check if inverse of mask is live. * tree-vectorizer.c (scalar_cond_masked_key::get_cond_ops_from_tree): Register mask inverses. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/pred-not-gen-1.c: Update testcase. * gcc.target/aarch64/sve/pred-not-gen-2.c: Update testcase. * gcc.target/aarch64/sve/pred-not-gen-3.c: Update testcase. * gcc.target/aarch64/sve/pred-not-gen-4.c: Update testcase.
2021-11-10vect: Pass scalar_costs to finish_costRichard Sandiford1-5/+9
When finishing the vector costs, it can be useful to know what the associated scalar costs were. This allows targets to read information collected about the original scalar loop when trying to make a final judgement about the cost of the vector code. This patch therefore passes the scalar costs to vector_costs::finish_cost. The parameter is null for the scalar costs themselves. gcc/ * tree-vectorizer.h (vector_costs::finish_cost): Take the corresponding scalar costs as a parameter. (finish_cost): Likewise. * tree-vect-loop.c (vect_compute_single_scalar_iteration_cost) (vect_estimate_min_profitable_iters): Update accordingly. * tree-vect-slp.c (vect_bb_vectorization_profitable_p): Likewise. * tree-vectorizer.c (vector_costs::finish_cost): Likewise. * config/aarch64/aarch64.c (aarch64_vector_costs::finish_cost): Likewise. * config/rs6000/rs6000.c (rs6000_cost_data::finish_cost): Likewise.
2021-11-10vect: Keep scalar costs around longerRichard Sandiford1-4/+13
The scalar costs for a loop are fleeting, with only the final single_scalar_iteration_cost being kept for later comparison. This patch replaces single_scalar_iteration_cost with the cost structure, so that (with later patches) it's possible for targets to examine other target-specific cost properties as well. This will be done by passing the scalar costs to hooks where appropriate; targets shouldn't try to read the information directly from loop_vec_infos. gcc/ * tree-vectorizer.h (_loop_vec_info::scalar_costs): New member variable. (_loop_vec_info::single_scalar_iteration_cost): Delete. (LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST): Delete. (vector_costs::total_cost): New function. * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Update after above changes. (_loop_vec_info::~_loop_vec_info): Delete scalar_costs. (vect_compute_single_scalar_iteration_cost): Store the costs in loop_vinfo->scalar_costs. (vect_estimate_min_profitable_iters): Get the scalar cost from loop_vinfo->scalar_costs.
2021-11-10vect: Hookize better_loop_vinfo_pRichard Sandiford1-0/+17
One of the things we want to do on AArch64 is compare vector loops side-by-side and pick the best one. For some targets, we want this to be based on issue rates as well as the usual latency-based costs (at least for loops with relatively high iteration counts). The current approach to doing this is: when costing vectorisation candidate A, try to guess what the other main candidate B will look like and adjust A's latency-based cost up or down based on the likely difference between A and B's issue rates. This effectively means that we try to cost parts of B at the same time as A, without actually being able to see B. This is needlessly indirect and complex. It was a compromise due to the code being added (too) late in the GCC 11 cycle, so that target-independent changes weren't possible. The target-independent code already compares two candidate loop_vec_infos side-by-side, so that information about A and B above are available directly. This patch creates a way for targets to hook into this comparison. The AArch64 code can therefore hook into better_main_loop_than_p to compare issue rates. If the issue rate comparison isn't decisive, the code can fall back to the normal latency-based comparison instead. gcc/ * tree-vectorizer.h (vector_costs::better_main_loop_than_p) (vector_costs::better_epilogue_loop_than_p) (vector_costs::compare_inside_loop_cost) (vector_costs::compare_outside_loop_cost): Likewise. * tree-vectorizer.c (vector_costs::better_main_loop_than_p) (vector_costs::better_epilogue_loop_than_p) (vector_costs::compare_inside_loop_cost) (vector_costs::compare_outside_loop_cost): New functions, containing code moved from... * tree-vect-loop.c (vect_better_loop_vinfo_p): ...here.
2021-11-10vect: Remove vec_outside/inside_cost fieldsRichard Sandiford1-7/+9
The vector costs now use a common base class instead of being completely abstract. This means that there's no longer a need to record the inside and outside costs separately. gcc/ * tree-vectorizer.h (_loop_vec_info): Remove vec_outside_cost and vec_inside_cost. (vector_costs::outside_cost): New function. * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Update after above. (vect_estimate_min_profitable_iters): Likewise. (vect_better_loop_vinfo_p): Get the inside and outside costs from the loop_vec_infos' vector_costs.
2021-11-10vect: Move vector costs to loop_vec_infoRichard Sandiford1-4/+3
target_cost_data is in vec_info but is really specific to loop_vec_info. This patch moves it there and renames it to vector_costs, to distinguish it from scalar target costs. gcc/ * tree-vectorizer.h (vec_info::target_cost_data): Replace with... (_loop_vec_info::vector_costs): ...this. (LOOP_VINFO_TARGET_COST_DATA): Delete. * tree-vectorizer.c (vec_info::vec_info): Remove target_cost_data initialization. (vec_info::~vec_info): Remove corresponding delete. * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize vector_costs to null. (_loop_vec_info::~_loop_vec_info): Delete vector_costs. (vect_analyze_loop_operations): Update after above changes. (vect_analyze_loop_2): Likewise. (vect_estimate_min_profitable_iters): Likewise. * tree-vect-slp.c (vect_slp_analyze_operations): Likewise.
2021-11-10Extend is_cond_scalar_reduction to handle bit_and/bit_xor/bit_ior.liuhongt1-0/+1
This will enable transformation like - # sum1_50 = PHI <prephitmp_64(13), 0(4)> - # sum2_52 = PHI <sum2_21(13), 0(4)> + # sum1_50 = PHI <_87(13), 0(4)> + # sum2_52 = PHI <_89(13), 0(4)> # ivtmp_62 = PHI <ivtmp_61(13), 64(4)> i.2_7 = (long unsigned int) i_49; _8 = i.2_7 * 8; ... vec1_i_38 = vec1_29 >> _10; vec2_i_39 = vec2_31 >> _10; _11 = vec1_i_38 & 1; - _63 = tmp_37 ^ sum1_50; - prephitmp_64 = _11 == 0 ? sum1_50 : _63; + _ifc__86 = _11 != 0 ? tmp_37 : 0; + _87 = sum1_50 ^ _ifc__86; _12 = vec2_i_39 & 1; : so that vectorizer won't failed due to /* If this isn't a nested cycle or if the nested cycle reduction value is used ouside of the inner loop we cannot handle uses of the reduction value. */ if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "reduction used in loop.\n"); return NULL; } gcc/ChangeLog: PR tree-optimization/103126 * tree-vect-loop.c (neutral_op_for_reduction): Remove static. * tree-vectorizer.h (neutral_op_for_reduction): Declare. * tree-if-conv.c : Include tree-vectorizer.h. (is_cond_scalar_reduction): Handle BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR. (convert_scalar_cond_reduction): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/ifcvt-reduction-logic-op.c: New test.
2021-11-08tree-optimization/103102 - fix error in vectorizer refactoringRichard Biener1-1/+2
This fixes an oversight that caused vectorized epilogues to have versioning for niters applied. 2021-11-08 Richard Biener <rguenther@suse.de> * tree-vectorizer.h (vect_create_loop_vinfo): Add main_loop_info parameter. * tree-vect-loop.c (vect_create_loop_vinfo): Likewise. Set LOOP_VINFO_ORIG_LOOP_INFO and conditionalize set of LOOP_VINFO_NITERS_ASSUMPTIONS. (vect_analyze_loop_1): Adjust. (vect_analyze_loop): Move loop constraint setting and SCEV/niter reset here from vect_create_loop_vinfo to perform it only once. (vect_analyze_loop_form): Move dumping of symbolic niters here from vect_create_loop_vinfo.
2021-11-05Split vector loop analysis into main and epilogue analysisRichard Biener1-6/+4
As discussed this splits the analysis loop into two, first settling on a vector mode used for the main loop and only then analyzing the epilogue of that for possible vectorization. That makes it easier to put in support for unrolled main loops. On the way I've realized some cleanup opportunities, namely caching n_stmts in vec_info_shared (it's computed by dataref analysis) avoiding to pass that around and setting/clearing loop->aux during analysis - try_vectorize_loop_1 will ultimatively set it on those we vectorize. This also gets rid of the previously introduced callback in vect_analyze_loop_1 in favor of making that advance the mode iterator. I'm now pushing VOIDmode explicitely into the vector_modes array which makes the re-start on the epilogue side a bit more straight-forward. Note that will now use auto-detection of the vector mode in case the main loop used it and we want to try LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P and the first mode from the target array if not. I've added a comment that says we may want to make sure we don't try vectorizing the epilogue with a bigger vector size than the main loop but the situation isn't very likely to appear in practice I guess (and it was also present before this change). In principle this change should not change vectorization decisions but the way we handled re-analyzing epilogues as main loops makes me only 99% sure that it does. 2021-11-05 Richard Biener <rguenther@suse.de> * tree-vectorizer.h (vec_info_shared::n_stmts): Add. (LOOP_VINFO_N_STMTS): Likewise. (vec_info_for_bb): Remove unused function. * tree-vectorizer.c (vec_info_shared::vec_info_shared): Initialize n_stmts member. * tree-vect-loop.c: Remove INCLUDE_FUNCTIONAL. (vect_create_loop_vinfo): Do not set loop->aux. (vect_analyze_loop_2): Do not get n_stmts as argument, instead use LOOP_VINFO_N_STMTS. Set LOOP_VINFO_VECTORIZABLE_P here. (vect_analyze_loop_1): Remove callback, get the mode iterator and autodetected_vector_mode as argument, advancing the iterator and initializing autodetected_vector_mode here. (vect_analyze_loop): Split analysis loop into two, first processing main loops only and then epilogues.
2021-11-05First refactor of vect_analyze_loopRichard Biener1-2/+11
This refactors the main loop analysis part in vect_analyze_loop, re-purposing the existing vect_reanalyze_as_main_loop for this to reduce code duplication. Failure flow is a bit tricky since we want to extract info from the analyzed loop but I wanted to share the destruction part. Thus I add some std::function and lambda to funnel post-analysis for the case we want that (when analyzing from the main iteration but not when re-analyzing an epilogue as main). In addition I split vect_analyze_loop_form into analysis and vinfo creation so we can do the analysis only once, simplifying the new vect_analyze_loop_1. As discussed we probably want to change the loop over vector modes to first only analyze things as the main loop, picking the best (or simd VF) mode for the main loop and then analyze for a vectorized epilogue. The unroll would then integrate with the main loop vectorization. I think that currently we may fail to analyze the epilogue with the same mode as the main loop when using partial vectors since we increment mode_i before doing that. 2021-11-04 Richard Biener <rguenther@suse.de> * tree-vectorizer.h (struct vect_loop_form_info): New. (vect_analyze_loop_form): Adjust. (vect_create_loop_vinfo): New. * tree-parloops.c (gather_scalar_reductions): Adjust for vect_analyze_loop_form API change. * tree-vect-loop.c: Include <functional>. (vect_analyze_loop_form_1): Rename to vect_analyze_loop_form, take struct vect_loop_form_info as output parameter and adjust. (vect_analyze_loop_form): Rename to vect_create_loop_vinfo and split out call to the original vect_analyze_loop_form_1. (vect_reanalyze_as_main_loop): Rename to... (vect_analyze_loop_1): ... this, factor out the call to vect_analyze_loop_form and generalize to be able to use it twice ... (vect_analyze_loop): ... here. Perform vect_analyze_loop_form once only and here.
2021-11-04vect: Convert cost hooks to classesRichard Sandiford1-25/+116
The current vector cost interface has a quite a bit of redundancy built in. Each target that defines its own hooks has to replicate the basic unsigned[3] management. Currently each target also duplicates the cost adjustment for inner loops. This patch instead defines a vector_costs class for holding the scalar or vector cost and allows targets to subclass it. There is then only one costing hook: to create a new costs structure of the appropriate type. Everything else can be virtual functions, with common concepts implemented in the base class rather than in each target's derivation. This might seem like excess C++-ification, but it shaves ~100 LOC. I've also got some follow-on changes that become significantly easier with this patch. Maybe it could help with things like weighting blocks based on frequency too. This will clash with Andre's unrolling patches. His patches have priority so this patch should queue behind them. The x86 and rs6000 parts fully convert to a self-contained class. The equivalent aarch64 changes are more complex, so this patch just does the bare minimum. A later patch will rework the aarch64 bits. gcc/ * target.def (targetm.vectorize.init_cost): Replace with... (targetm.vectorize.create_costs): ...this. (targetm.vectorize.add_stmt_cost): Delete. (targetm.vectorize.finish_cost): Likewise. (targetm.vectorize.destroy_cost_data): Likewise. * doc/tm.texi.in (TARGET_VECTORIZE_INIT_COST): Replace with... (TARGET_VECTORIZE_CREATE_COSTS): ...this. (TARGET_VECTORIZE_ADD_STMT_COST): Delete. (TARGET_VECTORIZE_FINISH_COST): Likewise. (TARGET_VECTORIZE_DESTROY_COST_DATA): Likewise. * doc/tm.texi: Regenerate. * tree-vectorizer.h (vec_info::vec_info): Remove target_cost_data parameter. (vec_info::target_cost_data): Change from a void * to a vector_costs *. (vector_costs): New class. (init_cost): Take a vec_info and return a vector_costs. (dump_stmt_cost): Remove data parameter. (add_stmt_cost): Replace vinfo and data parameters with a vector_costs. (add_stmt_costs): Likewise. (finish_cost): Replace data parameter with a vector_costs. (destroy_cost_data): Delete. * tree-vectorizer.c (dump_stmt_cost): Remove data argument and don't print it. (vec_info::vec_info): Remove the target_cost_data parameter and initialize the member variable to null instead. (vec_info::~vec_info): Delete target_cost_data instead of calling destroy_cost_data. (vector_costs::add_stmt_cost): New function. (vector_costs::finish_cost): Likewise. (vector_costs::record_stmt_cost): Likewise. (vector_costs::adjust_cost_for_freq): Likewise. * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Update call to vec_info::vec_info. (vect_compute_single_scalar_iteration_cost): Update after above changes to costing interface. (vect_analyze_loop_operations): Likewise. (vect_estimate_min_profitable_iters): Likewise. (vect_analyze_loop_2): Initialize LOOP_VINFO_TARGET_COST_DATA at the start_over point, where it needs to be recreated after trying without slp. Update retry code accordingly. * tree-vect-slp.c (_bb_vec_info::_bb_vec_info): Update call to vec_info::vec_info. (vect_slp_analyze_operation): Update after above changes to costing interface. (vect_bb_vectorization_profitable_p): Likewise. * targhooks.h (default_init_cost): Replace with... (default_vectorize_create_costs): ...this. (default_add_stmt_cost): Delete. (default_finish_cost, default_destroy_cost_data): Likewise. * targhooks.c (default_init_cost): Replace with... (default_vectorize_create_costs): ...this. (default_add_stmt_cost): Delete, moving logic to vector_costs instead. (default_finish_cost, default_destroy_cost_data): Delete. * config/aarch64/aarch64.c (aarch64_vector_costs): Inherit from vector_costs. Add a constructor. (aarch64_init_cost): Replace with... (aarch64_vectorize_create_costs): ...this. (aarch64_add_stmt_cost): Replace with... (aarch64_vector_costs::add_stmt_cost): ...this. Use record_stmt_cost to adjust the cost for inner loops. (aarch64_finish_cost): Replace with... (aarch64_vector_costs::finish_cost): ...this. (aarch64_destroy_cost_data): Delete. (TARGET_VECTORIZE_INIT_COST): Replace with... (TARGET_VECTORIZE_CREATE_COSTS): ...this. (TARGET_VECTORIZE_ADD_STMT_COST): Delete. (TARGET_VECTORIZE_FINISH_COST): Likewise. (TARGET_VECTORIZE_DESTROY_COST_DATA): Likewise. * config/i386/i386.c (ix86_vector_costs): New structure. (ix86_init_cost): Replace with... (ix86_vectorize_create_costs): ...this. (ix86_add_stmt_cost): Replace with... (ix86_vector_costs::add_stmt_cost): ...this. Use adjust_cost_for_freq to adjust the cost for inner loops. (ix86_finish_cost, ix86_destroy_cost_data): Delete. (TARGET_VECTORIZE_INIT_COST): Replace with... (TARGET_VECTORIZE_CREATE_COSTS): ...this. (TARGET_VECTORIZE_ADD_STMT_COST): Delete. (TARGET_VECTORIZE_FINISH_COST): Likewise. (TARGET_VECTORIZE_DESTROY_COST_DATA): Likewise. * config/rs6000/rs6000.c (TARGET_VECTORIZE_INIT_COST): Replace with... (TARGET_VECTORIZE_CREATE_COSTS): ...this. (TARGET_VECTORIZE_ADD_STMT_COST): Delete. (TARGET_VECTORIZE_FINISH_COST): Likewise. (TARGET_VECTORIZE_DESTROY_COST_DATA): Likewise. (rs6000_cost_data): Inherit from vector_costs. Add a constructor. Drop loop_info, cost and costing_for_scalar in favor of the corresponding vector_costs member variables. Add "m_" to the names of the remaining member variables and initialize them. (rs6000_density_test): Replace with... (rs6000_cost_data::density_test): ...this. (rs6000_init_cost): Replace with... (rs6000_vectorize_create_costs): ...this. (rs6000_update_target_cost_per_stmt): Replace with... (rs6000_cost_data::update_target_cost_per_stmt): ...this. (rs6000_add_stmt_cost): Replace with... (rs6000_cost_data::add_stmt_cost): ...this. Use adjust_cost_for_freq to adjust the cost for inner loops. (rs6000_adjust_vect_cost_per_loop): Replace with... (rs6000_cost_data::adjust_vect_cost_per_loop): ...this. (rs6000_finish_cost): Replace with... (rs6000_cost_data::finish_cost): ...this. Group loop code into a single if statement and pass the loop_vinfo down to subroutines. (rs6000_destroy_cost_data): Delete.
2021-10-26Unify offset and byte_offset for vect_create_addr_base_for_vector_refRichard Biener1-2/+2
Now that both are measured in bytes we can unify the two parameters. 2021-10-26 Richard Biener <rguenther@suse.de> * tree-vectorizer.h (vect_create_addr_base_for_vector_ref): Remove byte_offset parameter. (vect_create_data_ref_ptr): Likewise. * tree-vect-data-refs.c (vect_create_addr_base_for_vector_ref): Likewise. (vect_create_data_ref_ptr): Likewise. * tree-vect-stmts.c (vectorizable_store): Adjust. (vectorizable_load): Likewise.
2021-10-26Move negative stride bias out of dr_misalignmentRichard Biener1-1/+2
This moves applying of a bias for negative stride accesses out of dr_misalignment in favor of a more general optional offset argument. The negative bias is now computed by get_load_store_type and applied accordingly to determine the alignment support scheme. Likewise the peeling/versioning code is adjusted albeit that still assumes we'll end up with VMAT_CONTIGUOUS_DOWN or VMAT_CONTIGUOUS_REVERSE but at least when not so (VMAT_STRIDED_SLP is one possibility) then get_load_store_type will _not_ falsely report an aligned access but instead an access with known misalignment. This fixes PR96109. 2021-10-25 Richard Biener <rguenther@suse.de> PR tree-optimization/96109 * tree-vectorizer.h (dr_misalignment): Add optional offset parameter. * tree-vect-data-refs.c (dr_misalignment): Likewise. Remove offset applied for negative stride accesses. (vect_enhance_data_refs_alignment): Compute negative stride access offset and pass it to dr_misalignment. * tree-vect-stmts.c (get_negative_load_store_type): Pass negative offset to dr_misalignment. (get_group_load_store_type): Likewise. (get_load_store_type): Likewise. (vectorizable_store): Remove asserts about alignment. (vectorizable_load): Likewise.
2021-10-19Refactor vect_supportable_dr_alignmentRichard Biener1-1/+1
This refactors vect_supportable_dr_alignment to get the misalignment as input parameter which allows us to elide modifying/restoring of DR_MISALIGNMENT during alignment peeling analysis which eventually makes it more straight-forward to split out the negative step handling. 2021-10-19 Richard Biener <rguenther@suse.de> * tree-vectorizer.h (vect_supportable_dr_alignment): Add misalignment parameter. * tree-vect-data-refs.c (vect_get_peeling_costs_all_drs): Do not change DR_MISALIGNMENT in place, instead pass the adjusted misalignment to vect_supportable_dr_alignment. (vect_peeling_supportable): Likewise. (vect_peeling_hash_get_lowest_cost): Adjust. (vect_enhance_data_refs_alignment): Likewise. (vect_vfa_access_size): Likewise. (vect_supportable_dr_alignment): Add misalignment parameter and simplify. * tree-vect-stmts.c (get_negative_load_store_type): Adjust. (get_group_load_store_type): Likewise. (get_load_store_type): Likewise.
2021-10-19Refactor load/store costingRichard Biener1-1/+3
This passes down the already available alignment scheme and misalignment to the load/store costing routines, removing redundant queries. 2021-10-19 Richard Biener <rguenther@suse.de> * tree-vectorizer.h (vect_get_store_cost): Adjust signature. (vect_get_load_cost): Likewise. * tree-vect-data-refs.c (vect_get_data_access_cost): Get alignment support scheme and misalignment as arguments and pass them down. (vect_get_peeling_costs_all_drs): Compute that info here and note that we shouldn't need to. * tree-vect-stmts.c (vect_model_store_cost): Get alignment support scheme and misalignment as arguments. (vect_get_store_cost): Likewise. (vect_model_load_cost): Likewise. (vect_get_load_cost): Likewise. (vectorizable_store): Pass down alignment support scheme and misalignment to costing. (vectorizable_load): Likewise.
2021-10-19Remove check_aligned parameter from vect_supportable_dr_alignmentRichard Biener1-1/+1
There are two calls with true as parameter, one is only relevant for the case of the misalignment being unknown which means the access is never aligned there, the other is in the peeling hash insert code used conditional on the unlimited cost model which adds an artificial count. But the way it works right now is that it boosts the count if the specific misalignment when not peeling is unsupported - in particular when the access is currently aligned we'll query the backend with a misalign value of zero. I've changed it to boost the peeling when unknown alignment is not supported instead and noted how we could in principle improve this. 2021-10-19 Richard Biener <rguenther@suse.de> * tree-vectorizer.h (vect_supportable_dr_alignment): Remove check_aligned argument. * tree-vect-data-refs.c (vect_supportable_dr_alignment): Likewise. (vect_peeling_hash_insert): Add supportable_if_not_aligned argument and do not call vect_supportable_dr_alignment here. (vect_peeling_supportable): Adjust. (vect_enhance_data_refs_alignment): Compute whether the access is supported with different alignment here and pass that down to vect_peeling_hash_insert. (vect_vfa_access_size): Adjust. * tree-vect-stmts.c (vect_get_store_cost): Likewise. (vect_get_load_cost): Likewise. (get_negative_load_store_type): Likewise. (get_group_load_store_type): Likewise. (get_load_store_type): Likewise.
2021-10-12vectorizer: Fix up -fsimd-cost-model= handlingJakub Jelinek1-4/+12
> * testsuite/libgomp.c++/scan-10.C: Add option -fvect-cost-model=cheap. I don't think this is the right thing to do. This just means that at some point between 2013 when -fsimd-cost-model has been introduced and now -fsimd-cost-model= option at least partially stopped working properly. As documented, -fsimd-cost-model= overrides the -fvect-cost-model= setting for OpenMP simd loops (loop->force_vectorize is true) if specified differently from default. In tree-vectorizer.h we have: static inline bool unlimited_cost_model (loop_p loop) { if (loop != NULL && loop->force_vectorize && flag_simd_cost_model != VECT_COST_MODEL_DEFAULT) return flag_simd_cost_model == VECT_COST_MODEL_UNLIMITED; return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED); } and use it in various places, but we also just use flag_vect_cost_model in lots of places (and in one spot use flag_simd_cost_model, not sure if we are sure it is a force_vectorize loop or what). So, IMHO we should change the above inline function to loop_cost_model and let it return the cost model and then just reimplement unlimited_cost_model as return loop_cost_model (loop) == VECT_COST_MODEL_UNLIMITED; and then adjust the direct uses of the flag and revert these changes. 2021-10-12 Jakub Jelinek <jakub@redhat.com> gcc/ * tree-vectorizer.h (loop_cost_model): New function. (unlimited_cost_model): Use it. * tree-vect-loop.c (vect_analyze_loop_costing): Use loop_cost_model call instead of flag_vect_cost_model. * tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Likewise. (vect_prune_runtime_alias_test_list): Likewise. Also use it instead of flag_simd_cost_model. gcc/testsuite/ * gcc.dg/gomp/simd-2.c: Remove option -fvect-cost-model=cheap. * gcc.dg/gomp/simd-3.c: Likewise. libgomp/ * testsuite/libgomp.c/scan-11.c: Remove option -fvect-cost-model=cheap. * testsuite/libgomp.c/scan-12.c: Likewise. * testsuite/libgomp.c/scan-13.c: Likewise. * testsuite/libgomp.c/scan-14.c: Likewise. * testsuite/libgomp.c/scan-15.c: Likewise. * testsuite/libgomp.c/scan-16.c: Likewise. * testsuite/libgomp.c/scan-17.c: Likewise. * testsuite/libgomp.c/scan-18.c: Likewise. * testsuite/libgomp.c/scan-19.c: Likewise. * testsuite/libgomp.c/scan-20.c: Likewise. * testsuite/libgomp.c/scan-21.c: Likewise. * testsuite/libgomp.c/scan-22.c: Likewise. * testsuite/libgomp.c++/scan-9.C: Likewise. * testsuite/libgomp.c++/scan-10.C: Likewise. * testsuite/libgomp.c++/scan-11.C: Likewise. * testsuite/libgomp.c++/scan-12.C: Likewise. * testsuite/libgomp.c++/scan-13.C: Likewise. * testsuite/libgomp.c++/scan-14.C: Likewise. * testsuite/libgomp.c++/scan-15.C: Likewise. * testsuite/libgomp.c++/scan-16.C: Likewise.