|
The following adds the capability to do SLP on .MASK_STORE, I do not
plan to add interleaving support.
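For illustration, a kernel of the shape this enables (a sketch; the
committed testcase may differ):
void
f (int *restrict x, int *restrict y)
{
  for (int i = 0; i < 1024; i += 2)
    {
      /* After if-conversion these become two adjacent .MASK_STOREs
         which can now form an SLP group.  */
      if (y[i] > 0)
        x[i] = y[i];
      if (y[i + 1] > 0)
        x[i + 1] = y[i + 1];
    }
}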
PR tree-optimization/111115
gcc/
* tree-vectorizer.h (vect_slp_child_index_for_operand): New.
* tree-vect-data-refs.cc (can_group_stmts_p): Also group
.MASK_STORE.
* tree-vect-slp.cc (arg3_arg2_map): New.
(vect_get_operand_map): Handle IFN_MASK_STORE.
(vect_slp_child_index_for_operand): New function.
(vect_build_slp_tree_1): Handle statements with no LHS,
masked store ifns.
(vect_remove_slp_scalar_calls): Likewise.
* tree-vect-stmts.cc (vect_check_store_rhs): Lookup the
SLP child corresponding to the ifn value index.
(vectorizable_store): Likewise for the mask index. Support
masked stores.
(vectorizable_load): Lookup the SLP child corresponding to the
ifn mask index.
gcc/testsuite/
* lib/target-supports.exp (check_effective_target_vect_masked_store):
Supported with check_avx_available.
* gcc.dg/vect/slp-mask-store-1.c: New testcase.
|
|
Hi, Richard and Richi.
This patch adds MASK_LEN_{LOAD_LANES,STORE_LANES} support to the vectorizer.
Consider this simple case:
void __attribute__ ((noinline, noclone))
foo (int *__restrict a, int *__restrict b, int *__restrict c,
int *__restrict d, int *__restrict e, int *__restrict f,
int *__restrict g, int *__restrict h, int *__restrict j, int n)
{
for (int i = 0; i < n; ++i)
{
a[i] = j[i * 8];
b[i] = j[i * 8 + 1];
c[i] = j[i * 8 + 2];
d[i] = j[i * 8 + 3];
e[i] = j[i * 8 + 4];
f[i] = j[i * 8 + 5];
g[i] = j[i * 8 + 6];
h[i] = j[i * 8 + 7];
}
}
RVV Gimple IR:
_79 = .SELECT_VL (ivtmp_81, POLY_INT_CST [4, 4]);
ivtmp_125 = _79 * 32;
vect_array.8 = .MASK_LEN_LOAD_LANES (vectp_j.6_124, 32B, { -1, ... }, _79, 0);
vect__8.9_122 = vect_array.8[0];
vect__8.10_121 = vect_array.8[1];
vect__8.11_120 = vect_array.8[2];
vect__8.12_119 = vect_array.8[3];
vect__8.13_118 = vect_array.8[4];
vect__8.14_117 = vect_array.8[5];
vect__8.15_116 = vect_array.8[6];
vect__8.16_115 = vect_array.8[7];
vect_array.8 ={v} {CLOBBER};
ivtmp_114 = _79 * 4;
.MASK_LEN_STORE (vectp_a.17_113, 32B, { -1, ... }, _79, 0, vect__8.9_122);
.MASK_LEN_STORE (vectp_b.19_109, 32B, { -1, ... }, _79, 0, vect__8.10_121);
.MASK_LEN_STORE (vectp_c.21_105, 32B, { -1, ... }, _79, 0, vect__8.11_120);
.MASK_LEN_STORE (vectp_d.23_101, 32B, { -1, ... }, _79, 0, vect__8.12_119);
.MASK_LEN_STORE (vectp_e.25_97, 32B, { -1, ... }, _79, 0, vect__8.13_118);
.MASK_LEN_STORE (vectp_f.27_93, 32B, { -1, ... }, _79, 0, vect__8.14_117);
.MASK_LEN_STORE (vectp_g.29_89, 32B, { -1, ... }, _79, 0, vect__8.15_116);
.MASK_LEN_STORE (vectp_h.31_85, 32B, { -1, ... }, _79, 0, vect__8.16_115);
ASM:
foo:
lw t4,8(sp)
ld t5,0(sp)
ble t4,zero,.L5
.L3:
vsetvli t1,t4,e8,mf4,ta,ma
vlseg8e32.v v8,(t5)
slli t3,t1,2
slli t6,t1,5
vse32.v v8,0(a0)
vse32.v v9,0(a1)
vse32.v v10,0(a2)
vse32.v v11,0(a3)
vse32.v v12,0(a4)
vse32.v v13,0(a5)
vse32.v v14,0(a6)
vse32.v v15,0(a7)
sub t4,t4,t1
add t5,t5,t6
add a0,a0,t3
add a1,a1,t3
add a2,a2,t3
add a3,a3,t3
add a4,a4,t3
add a5,a5,t3
add a6,a6,t3
add a7,a7,t3
bne t4,zero,.L3
.L5:
ret
The details of the approach:
Step 1 - Modify the LANES LOAD/STORE support functions (vect_load_lanes_supported/vect_store_lanes_supported):
+/* Return FN if vec_{masked_,mask_len_,}load_lanes is available for COUNT
+ vectors of type VECTYPE. MASKED_P says whether the masked form is needed. */
-bool
+internal_fn
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
bool masked_p)
{
- if (masked_p)
- return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
- vec_mask_load_lanes_optab,
- vectype, count);
+ if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
+ vec_mask_len_load_lanes_optab,
+ vectype, count))
+ return IFN_MASK_LEN_LOAD_LANES;
+ else if (masked_p)
+ {
+ if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
+ vec_mask_load_lanes_optab,
+ vectype, count))
+ return IFN_MASK_LOAD_LANES;
+ }
else
- return vect_lanes_optab_supported_p ("vec_load_lanes",
- vec_load_lanes_optab,
- vectype, count);
+ {
+ if (vect_lanes_optab_supported_p ("vec_load_lanes",
+ vec_load_lanes_optab,
+ vectype, count))
+ return IFN_LOAD_LANES;
+ }
+ return IFN_LAST;
}
Instead of returning TRUE or FALSE depending on whether the target
supports LANES LOAD/STORE, the function now returns the internal_fn of
the LANES LOAD/STORE variant the target supports, or IFN_LAST if the
target supports none of the LANES LOAD/STORE optabs.
Step 2 - Compute the IFN for LANES LOAD/STORE (computed only once).
if (!STMT_VINFO_STRIDED_P (first_stmt_info)
&& (can_overrun_p || !would_overrun_p)
&& compare_step_with_zero (vinfo, stmt_info) > 0)
{
/* First cope with the degenerate case of a single-element
vector. */
if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U))
;
else
{
/* Otherwise try using LOAD/STORE_LANES. */
*lanes_ifn
= vls_type == VLS_LOAD
? vect_load_lanes_supported (vectype, group_size, masked_p)
: vect_store_lanes_supported (vectype, group_size,
masked_p);
if (*lanes_ifn != IFN_LAST)
{
*memory_access_type = VMAT_LOAD_STORE_LANES;
overrun_p = would_overrun_p;
}
/* If that fails, try using permuting loads. */
else if (vls_type == VLS_LOAD
? vect_grouped_load_supported (vectype,
single_element_p,
group_size)
: vect_grouped_store_supported (vectype, group_size))
{
*memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
overrun_p = would_overrun_p;
}
}
}
Step 3 - Build MASK_LEN_{LANES_LOAD,LANES_STORE} Gimple IR:
+ if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)
+ {
+ if (loop_lens)
+ final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+ ncopies, vectype, j, 1);
+ else
+ final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+ signed char biasval
+ = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+ bias = build_int_cst (intQI_type_node, biasval);
+ if (!final_mask)
+ {
+ mask_vectype = truth_type_for (vectype);
+ final_mask = build_minus_one_cst (mask_vectype);
+ }
+ }
+
gcall *call;
- if (final_mask)
+ if (final_len && final_mask)
+ {
+ /* Emit:
+ MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
+ LEN, BIAS, VEC_ARRAY). */
+ unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
+ tree alias_ptr = build_int_cst (ref_type, align);
+ call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
+ dataref_ptr, alias_ptr,
+ final_mask, final_len, bias,
+ vec_array);
+ }
+ else if (final_mask)
The LEN and MASK flow is the same as for the other MASK_LEN_* loads/stores.
gcc/ChangeLog:
* internal-fn.cc (internal_load_fn_p): Handle
MASK_LEN_{LOAD_LANES,STORE_LANES}.
(internal_store_fn_p): Ditto.
(internal_fn_len_index): Ditto.
(internal_fn_mask_index): Ditto.
(internal_fn_stored_value_index): Ditto.
* tree-vect-data-refs.cc (vect_store_lanes_supported): Ditto.
(vect_load_lanes_supported): Ditto.
* tree-vect-loop.cc: Ditto.
* tree-vect-slp.cc (vect_slp_prefer_store_lanes_p): Ditto.
* tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
(get_group_load_store_type): Ditto.
(vectorizable_store): Ditto.
(vectorizable_load): Ditto.
* tree-vectorizer.h (vect_store_lanes_supported): Ditto.
(vect_load_lanes_supported): Ditto.
|
|
The following supports vectorizing BB reductions involving a
constant or an invariant.
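For example (a sketch of the kind of code now handled; the committed
testcase may differ):
int
f (int *a, int k)
{
  /* The a[] loads form the vectorized part of the BB reduction; the
     invariant k and the constant 5 become "remain defs" that are added
     to the scalar result of the vectorized sum.  */
  return a[0] + a[1] + a[2] + a[3] + k + 5;
}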
* tree-vectorizer.h (_slp_instance::remain_stmts): Change
to ...
(_slp_instance::remain_defs): ... this.
(SLP_INSTANCE_REMAIN_STMTS): Rename to ...
(SLP_INSTANCE_REMAIN_DEFS): ... this.
(slp_root::remain): New.
(slp_root::slp_root): Adjust.
* tree-vect-slp.cc (vect_free_slp_instance): Adjust.
(vect_build_slp_instance): Get extra remain parameter,
adjust former handling of a cut off stmt.
(vect_analyze_slp_instance): Adjust.
(vect_analyze_slp): Likewise.
(_bb_vec_info::~_bb_vec_info): Likewise.
(vectorizable_bb_reduc_epilogue): Dump something if we fail.
(vect_slp_check_for_constructors): Handle non-internal
defs as remain defs of a reduction.
(vectorize_slp_instance_root_stmt): Adjust.
* gcc.dg/vect/bb-slp-75.c: New testcase.
|
|
The insert location argument isn't actually used; we compute
the location ourselves. There's a single spot, namely when asking
for the loop mask via vect_get_loop_mask, where the passed argument
is used, but that looks like an oversight. The following fixes that
and adjusts vectorizable_live_operation and can_vectorize_live_stmts
to no longer take a stmt iterator argument.
* tree-vectorizer.h (vectorizable_live_operation): Remove
gimple_stmt_iterator * argument.
* tree-vect-loop.cc (vectorizable_live_operation): Likewise.
Adjust plumbing around vect_get_loop_mask.
(vect_analyze_loop_operations): Adjust.
* tree-vect-slp.cc (vect_slp_analyze_node_operations_1): Likewise.
(vect_bb_slp_mark_live_stmts): Likewise.
(vect_schedule_slp_node): Likewise.
* tree-vect-stmts.cc (can_vectorize_live_stmts): Likewise.
Remove gimple_stmt_iterator * argument.
(vect_transform_stmt): Adjust.
|
|
The following enhances BB reduction vectorization to support
vectorizing only a subset of the lanes, keeping the rest as
scalar ops. For now we try to make the number of lanes even
by leaving alone the "last" lane. That's because SLP discovery
with all lanes will fail too soon to get us any hint on which
lane to strip and likewise we don't know what vector modes the
target supports so restricting ourselves to power-of-two or
other cases isn't easy.
This is enough to get at the vectorization opportunity for the
testcase in the PR - albeit with the chosen lanes not optimal
but at least vectorizable.
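A sketch of the shape of the opportunity (in C for brevity; the
committed testcase is Fortran):
double
f (double *d)
{
  /* Five lanes: the first four can be vectorized, the "last" lane d[4]
     is left alone as a scalar op to make the number of lanes even.  */
  return d[0] + d[1] + d[2] + d[3] + d[4];
}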
PR tree-optimization/49955
* tree-vectorizer.h (_slp_instance::remain_stmts): New.
(SLP_INSTANCE_REMAIN_STMTS): Likewise.
* tree-vect-slp.cc (vect_free_slp_instance): Release
SLP_INSTANCE_REMAIN_STMTS.
(vect_build_slp_instance): Make the number of lanes of
a BB reduction even.
(vectorize_slp_instance_root_stmt): Handle unvectorized
defs of a BB reduction.
* gfortran.dg/vect/pr49955.f: New testcase.
|
|
The following unifies SLP_TREE_VEC_STMTS into SLP_TREE_VEC_DEFS
which can handle all cases we need.
* tree-vectorizer.h (_slp_tree::push_vec_def): Add.
(_slp_tree::vec_stmts): Remove.
(SLP_TREE_VEC_STMTS): Remove.
* tree-vect-slp.cc (_slp_tree::push_vec_def): Define.
(_slp_tree::_slp_tree): Adjust.
(_slp_tree::~_slp_tree): Likewise.
(vect_get_slp_vect_def): Simplify.
(vect_get_slp_defs): Likewise.
(vect_transform_slp_perm_load_1): Adjust.
(vect_add_slp_permutation): Likewise.
(vect_schedule_slp_node): Likewise.
(vectorize_slp_instance_root_stmt): Likewise.
(vect_schedule_scc): Likewise.
* tree-vect-stmts.cc (vectorizable_bswap): Use push_vec_def.
(vectorizable_call): Likewise.
(vectorizable_call): Likewise.
(vect_create_vectorized_demotion_stmts): Likewise.
(vectorizable_conversion): Likewise.
(vectorizable_assignment): Likewise.
(vectorizable_shift): Likewise.
(vectorizable_operation): Likewise.
(vectorizable_load): Likewise.
(vectorizable_condition): Likewise.
(vectorizable_comparison): Likewise.
* tree-vect-loop.cc (vect_create_epilog_for_reduction): Adjust.
(vectorize_fold_left_reduction): Use push_vec_def.
(vect_transform_reduction): Likewise.
(vect_transform_cycle_phi): Likewise.
(vectorizable_lc_phi): Likewise.
(vectorizable_phi): Likewise.
(vectorizable_recurr): Likewise.
(vectorizable_induction): Likewise.
(vectorizable_live_operation): Likewise.
|
|
The following consolidates an assert that now hits for ppc64le
with an earlier check we already do, simplifying
vect_determine_partial_vectors_and_peeling and getting rid of
its now redundant argument.
PR tree-optimization/110563
* tree-vectorizer.h (vect_determine_partial_vectors_and_peeling):
Remove second argument.
* tree-vect-loop.cc (vect_determine_partial_vectors_and_peeling):
Remove for_epilogue_p argument. Merge assert ...
(vect_analyze_loop_2): ... with check done before determining
partial vectors by moving it after.
* tree-vect-loop-manip.cc (vect_do_peeling): Adjust.
|
|
This implements fully masked vectorization or a masked epilog for
AVX512 style masks, which single themselves out by representing
each lane with a single bit and by using integer modes for the mask
(both much like GCN).
AVX512 is also special in that it doesn't have any instruction
to compute the mask from a scalar IV like SVE has with while_ult.
Instead the masks are produced by vector compares and the loop
control retains the scalar IV (mainly to avoid dependences on
mask generation, a suitable mask test instruction is available).
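Schematically the mask computation looks like this (a sketch with
made-up SSA names; the details live in
vect_set_loop_condition_partial_vectors_avx512):
  /* The scalar IV retains the loop control ...  */
  rem_77 = niters_75 - i_76;
  /* ... and the mask is produced by a vector compare against it.  */
  mask_78 = { 0, 1, 2, 3 } < { rem_77, ... };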
Like RVV, code generation prefers a decrementing IV, though IVOPTs
messes things up in some cases, removing that IV to eliminate
it with an incrementing one used for address generation.
One of the motivating testcases is from PR108410 which in turn
is extracted from x264 where large size vectorization shows
issues with small trip loops. Execution time there improves
compared to classic AVX512 with AVX2 epilogues for the cases
of less than 32 iterations.
size scalar 128 256 512 512e 512f
1 9.42 11.32 9.35 11.17 15.13 16.89
2 5.72 6.53 6.66 6.66 7.62 8.56
3 4.49 5.10 5.10 5.74 5.08 5.73
4 4.10 4.33 4.29 5.21 3.79 4.25
6 3.78 3.85 3.86 4.76 2.54 2.85
8 3.64 1.89 3.76 4.50 1.92 2.16
12 3.56 2.21 3.75 4.26 1.26 1.42
16 3.36 0.83 1.06 4.16 0.95 1.07
20 3.39 1.42 1.33 4.07 0.75 0.85
24 3.23 0.66 1.72 4.22 0.62 0.70
28 3.18 1.09 2.04 4.20 0.54 0.61
32 3.16 0.47 0.41 0.41 0.47 0.53
34 3.16 0.67 0.61 0.56 0.44 0.50
38 3.19 0.95 0.95 0.82 0.40 0.45
42 3.09 0.58 1.21 1.13 0.36 0.40
'size' specifies the number of actual iterations; 512e is for
a masked epilog and 512f for the fully masked loop. From
4 scalar iterations on, the AVX512 masked epilog code is clearly
the winner; the fully masked variant is clearly worse and
its size benefit is also tiny.
This patch does not enable using fully masked loops or
masked epilogues by default. More work on cost modeling
and vectorization kind selection on x86_64 is necessary
for this.
Implementation-wise this introduces LOOP_VINFO_PARTIAL_VECTORS_STYLE
which could be exploited further to unify some of the flags
we have right now but there didn't seem to be many easy things
to merge, so I'm leaving this for followups.
Mask requirements as registered by vect_record_loop_mask are kept in their
original form and recorded in a hash_set now instead of being
processed to a vector of rgroup_controls. Instead that's now
left to the final analysis phase which tries forming the rgroup_controls
vector using while_ult and if that fails now tries AVX512 style
which needs a different organization and instead fills a hash_map
with the relevant info. vect_get_loop_mask now has two implementations,
one for the two mask styles we then have.
I have decided against interweaving vect_set_loop_condition_partial_vectors
with conditions to do AVX512 style masking and instead opted to
"duplicate" this to vect_set_loop_condition_partial_vectors_avx512.
Likewise for vect_verify_full_masking vs vect_verify_full_masking_avx512.
The vect_prepare_for_masked_peels hunk might run into issues with
SVE, I didn't check yet but using LOOP_VINFO_RGROUP_COMPARE_TYPE
looked odd.
Bootstrapped and tested on x86_64-unknown-linux-gnu. I've run
the testsuite with --param vect-partial-vector-usage=2 with and
without -fno-vect-cost-model and filed two bugs, one ICE (PR110221)
and one latent wrong-code (PR110237).
* tree-vectorizer.h (enum vect_partial_vector_style): New.
(_loop_vec_info::partial_vector_style): Likewise.
(LOOP_VINFO_PARTIAL_VECTORS_STYLE): Likewise.
(rgroup_controls::compare_type): Add.
(vec_loop_masks): Change from a typedef to auto_vec<>
to a structure.
* tree-vect-loop-manip.cc (vect_set_loop_condition_partial_vectors):
Adjust. Convert niters_skip to compare_type.
(vect_set_loop_condition_partial_vectors_avx512): New function
implementing the AVX512 partial vector codegen.
(vect_set_loop_condition): Dispatch to the correct
vect_set_loop_condition_partial_vectors_* function based on
LOOP_VINFO_PARTIAL_VECTORS_STYLE.
(vect_prepare_for_masked_peels): Compute LOOP_VINFO_MASK_SKIP_NITERS
in the original niter type.
* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize
partial_vector_style.
(can_produce_all_loop_masks_p): Adjust.
(vect_verify_full_masking): Produce the rgroup_controls vector
here. Set LOOP_VINFO_PARTIAL_VECTORS_STYLE on success.
(vect_verify_full_masking_avx512): New function implementing
verification of AVX512 style masking.
(vect_verify_loop_lens): Set LOOP_VINFO_PARTIAL_VECTORS_STYLE.
(vect_analyze_loop_2): Also try AVX512 style masking.
Adjust condition.
(vect_estimate_min_profitable_iters): Implement AVX512 style
mask producing cost.
(vect_record_loop_mask): Do not build the rgroup_controls
vector here but record masks in a hash-set.
(vect_get_loop_mask): Implement AVX512 style mask query,
complementing the existing while_ult style.
|
|
This adds a loop_vinfo argument for future use, making the next
patch smaller.
* tree-vectorizer.h (vect_get_loop_mask): Add loop_vec_info
argument.
* tree-vect-loop.cc (vect_get_loop_mask): Likewise.
(vectorize_fold_left_reduction): Adjust.
(vect_transform_reduction): Likewise.
(vectorizable_live_operation): Likewise.
* tree-vect-stmts.cc (vectorizable_call): Likewise.
(vectorizable_operation): Likewise.
(vectorizable_store): Likewise.
(vectorizable_load): Likewise.
(vectorizable_condition): Likewise.
|
|
This patch addresses comments from Richard && Richi and rebases to trunk.
It adds SELECT_VL middle-end support, allowing targets to apply
target-dependent optimizations to the length calculation.
This patch is inspired by RVV ISA and LLVM:
https://reviews.llvm.org/D99750
SELECT_VL has the same behavior as LLVM's "get_vector_length", with
the following properties (see the GIMPLE sketch after this list):
1. Applies only to a single rgroup.
2. Non-SLP only.
3. Adjusts the loop control IV.
4. Adjusts the data reference IVs.
5. Allows processing fewer than vf elements in a non-final iteration.
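For a single rgroup the vectorized loop control then has this shape
(a sketch following the MASK_LEN example earlier in this log; names
are illustrative):
  _vl = .SELECT_VL (ivtmp, POLY_INT_CST [4, 4]);
  ... loads/stores use _vl as their length ...
  ivtmp = ivtmp - _vl;              /* adjust the loop control IV */
  vectp = vectp + _vl * 4;          /* adjust the data reference IV */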
Code
# void vvaddint32(size_t n, const int*x, const int*y, int*z)
# { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
Take RVV codegen for example:
Before this patch:
vvaddint32:
ble a0,zero,.L6
csrr a4,vlenb
srli a6,a4,2
.L4:
mv a5,a0
bleu a0,a6,.L3
mv a5,a6
.L3:
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v2,0(a1)
vle32.v v1,0(a2)
vsetvli a7,zero,e32,m1,ta,ma
sub a0,a0,a5
vadd.vv v1,v1,v2
vsetvli zero,a5,e32,m1,ta,ma
vse32.v v1,0(a3)
add a2,a2,a4
add a3,a3,a4
add a1,a1,a4
bne a0,zero,.L4
.L6:
ret
After this patch:
vvaddint32:
vsetvli t0, a0, e32, ta, ma # Set vector length based on 32-bit vectors
vle32.v v0, (a1) # Get first vector
sub a0, a0, t0 # Decrement number done
slli t0, t0, 2 # Multiply number done by 4 bytes
add a1, a1, t0 # Bump pointer
vle32.v v1, (a2) # Get second vector
add a2, a2, t0 # Bump pointer
vadd.vv v2, v0, v1 # Sum vectors
vse32.v v2, (a3) # Store result
add a3, a3, t0 # Bump pointer
bnez a0, vvaddint32 # Loop back
ret # Finished
Co-authored-by: Richard Sandiford<richard.sandiford@arm.com>
Co-authored-by: Richard Biener <rguenther@suse.de>
gcc/ChangeLog:
* doc/md.texi: Add SELECT_VL support.
* internal-fn.def (SELECT_VL): Ditto.
* optabs.def (OPTAB_D): Ditto.
* tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto.
* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Ditto.
* tree-vect-stmts.cc (get_select_vl_data_ref_ptr): Ditto.
(vectorizable_store): Ditto.
(vectorizable_load): Ditto.
* tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): Ditto.
|
|
Refactor vect-patterns to allow patterns to be internal_fns starting
with widening_plus/minus patterns
2023-06-05 Andre Vieira <andre.simoesdiasvieira@arm.com>
Joel Hutton <joel.hutton@arm.com>
gcc/ChangeLog:
* tree-vect-patterns.cc: Add include for gimple-iterator.
(vect_recog_widen_op_pattern): Refactor to use code_helper.
(vect_gimple_build): New function.
* tree-vect-stmts.cc (simple_integer_narrowing): Refactor to use
code_helper.
(vectorizable_call): Likewise.
(vect_gen_widened_results_half): Likewise.
(vect_create_vectorized_demotion_stmts): Likewise.
(vect_create_vectorized_promotion_stmts): Likewise.
(vect_create_half_widening_stmts): Likewise.
(vectorizable_conversion): Likewise.
(supportable_widening_operation): Likewise.
(supportable_narrowing_operation): Likewise.
* tree-vectorizer.h (supportable_widening_operation): Change
prototype to use code_helper.
(supportable_narrowing_operation): Likewise.
(vect_gimple_build): New function prototype.
* tree.h (code_helper::safe_as_tree_code): New function.
(code_helper::safe_as_fn_code): New function.
|
|
Enhance NARROW FLOAT_EXPR vectorization by truncating to lower
precision.
Similar to WIDEN FLOAT_EXPR: when no direct optab exists, try an
intermediate integer type whenever the gimple ranger can tell it's safe.
I.e. when there's no direct optab for vector long long -> vector float,
but the value range of the integer can be represented as int, try
vector int -> vector float if available.
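For example (a sketch; the committed testcase may differ):
void
f (long long *x, float *r, int n)
{
  for (int i = 0; i < n; i++)
    {
      long long v = x[i] & 0xffff; /* ranger proves 0 <= v <= 65535 */
      r[i] = v;                    /* so DI -> SF can go DI -> SI -> SF */
    }
}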
gcc/ChangeLog:
PR tree-optimization/108804
* tree-vect-patterns.cc (vect_get_range_info): Remove static.
* tree-vect-stmts.cc (vect_create_vectorized_demotion_stmts):
Add new parameter narrow_src_p.
(vectorizable_conversion): Enhance NARROW FLOAT_EXPR
vectorization by truncating to lower precision.
* tree-vectorizer.h (vect_get_range_info): New declare.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr108804.c: New test.
|
|
This patch supports a decrementing IV by following the flow designed by
Richard (a C-level sketch follows the steps):
(1) In vect_set_loop_condition_partial_vectors, for the first iteration,
call vect_set_loop_controls_directly.
(2) vect_set_loop_controls_directly calculates "step" as in your patch.
If rgc has 1 control, this step is the SSA name created for that
control. Otherwise the step is a fresh SSA name, as in your patch.
(3) vect_set_loop_controls_directly stores this step somewhere for later
use, probably in LOOP_VINFO. Let's use "S" to refer to this stored
step.
(4) After the vect_set_loop_controls_directly call above, and outside
the "if" statement that now contains vect_set_loop_controls_directly,
check whether rgc->controls.length () > 1. If so, use
vect_adjust_loop_lens_control to set the controls based on S.
Then the only caller of vect_adjust_loop_lens_control is
vect_set_loop_condition_partial_vectors. And the starting
step for vect_adjust_loop_lens_control is always S.
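At the C level the single-rgroup flow is (a sketch; MIN and VF stand in
for the generated length computation):
  size_t remaining = n;
  do
    {
      size_t len = MIN (remaining, VF); /* the step S */
      /* ... process len elements ...  */
      remaining -= len;                 /* decrementing IV */
    }
  while (remaining != 0);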
This patch has been well tested for single-rgroup and multiple-rgroup
(SLP) and passed all testcases in the RISC-V port.
Signed-off-by: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
Co-Authored-By: Richard Sandiford <richard.sandiford@arm.com>
gcc/ChangeLog:
* tree-vect-loop-manip.cc (vect_adjust_loop_lens_control): New
function.
(vect_set_loop_controls_directly): Add decrement IV support.
(vect_set_loop_condition_partial_vectors): Ditto.
* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): New
variable.
* tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New
macro.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-3.c: New test.
* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-4.c: New test.
* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-3.c: New test.
* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-4.c: New test.
|
|
This addresses comments from Richard by splitting out the patch that
fixes multiple-rgroup handling when the length is counting elements.
Before this patch, the multiple-rgroup runs failed:
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c execution test
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c execution test
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c execution test
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c execution test
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c execution test
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c execution test
FAIL: gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c execution test
After this patch, these tests all pass.
gcc/ChangeLog:
* tree-vect-loop.cc (vect_get_loop_len): Fix issue with
multiple-rgroup handling of lengths.
* tree-vect-stmts.cc (vectorizable_store): Ditto.
(vectorizable_load): Ditto.
* tree-vectorizer.h (vect_get_loop_len): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.c: New
test.
* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-1.h: New
test.
* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.c: New
test.
* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup-2.h: New
test.
* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-1.c:
New test.
* gcc.target/riscv/rvv/autovec/partial/multiple_rgroup_run-2.c:
New test.
|
|
Many functions defined in our headers are declared 'static inline' which
is a C idiom whose use predates our move to C++ as the implementation
language. But in C++ the inline keyword is more than just a compiler
hint, and is sufficient to give the function the intended semantics.
In fact declaring a function both static and inline is a pessimization
since static effectively disables the desired definition merging
behavior enabled by inline, and is also a source of (harmless) ODR
violations when a static inline function gets called from a non-static
inline one (such as tree_operand_check calling tree_operand_length).
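For example (with a made-up function):
  /* Before: 'static' gives every TU its own local copy.  */
  static inline int twice (int x) { return 2 * x; }
  /* After: 'inline' alone lets definitions merge across TUs.  */
  inline int twice (int x) { return 2 * x; }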
This patch mechanically fixes the vast majority of occurrences of this
anti-pattern throughout the compiler's headers via the command line
sed -i 's/^static inline/inline/g' gcc/*.h gcc/*/*.h
There's also a manual change to remove the redundant declarations
of is_ivar and lookup_category in gcc/objc/objc-act.cc which would
otherwise conflict with their modified definitions in objc-act.h
(due to the difference in staticness).
Besides fixing some ODR violations, this speeds up stage1 cc1plus by
about 2% and reduces the size of its text segment by 1.5MB.
gcc/ChangeLog:
* addresses.h: Mechanically drop 'static' from 'static inline'
functions via s/^static inline/inline/g.
* asan.h: Likewise.
* attribs.h: Likewise.
* basic-block.h: Likewise.
* bitmap.h: Likewise.
* cfghooks.h: Likewise.
* cfgloop.h: Likewise.
* cgraph.h: Likewise.
* cselib.h: Likewise.
* data-streamer.h: Likewise.
* debug.h: Likewise.
* df.h: Likewise.
* diagnostic.h: Likewise.
* dominance.h: Likewise.
* dumpfile.h: Likewise.
* emit-rtl.h: Likewise.
* except.h: Likewise.
* expmed.h: Likewise.
* expr.h: Likewise.
* fixed-value.h: Likewise.
* gengtype.h: Likewise.
* gimple-expr.h: Likewise.
* gimple-iterator.h: Likewise.
* gimple-predict.h: Likewise.
* gimple-range-fold.h: Likewise.
* gimple-ssa.h: Likewise.
* gimple.h: Likewise.
* graphite.h: Likewise.
* hard-reg-set.h: Likewise.
* hash-map.h: Likewise.
* hash-set.h: Likewise.
* hash-table.h: Likewise.
* hwint.h: Likewise.
* input.h: Likewise.
* insn-addr.h: Likewise.
* internal-fn.h: Likewise.
* ipa-fnsummary.h: Likewise.
* ipa-icf-gimple.h: Likewise.
* ipa-inline.h: Likewise.
* ipa-modref.h: Likewise.
* ipa-prop.h: Likewise.
* ira-int.h: Likewise.
* ira.h: Likewise.
* lra-int.h: Likewise.
* lra.h: Likewise.
* lto-streamer.h: Likewise.
* memmodel.h: Likewise.
* omp-general.h: Likewise.
* optabs-query.h: Likewise.
* optabs.h: Likewise.
* plugin.h: Likewise.
* pretty-print.h: Likewise.
* range.h: Likewise.
* read-md.h: Likewise.
* recog.h: Likewise.
* regs.h: Likewise.
* rtl-iter.h: Likewise.
* rtl.h: Likewise.
* sbitmap.h: Likewise.
* sched-int.h: Likewise.
* sel-sched-ir.h: Likewise.
* sese.h: Likewise.
* sparseset.h: Likewise.
* ssa-iterators.h: Likewise.
* system.h: Likewise.
* target-globals.h: Likewise.
* target.h: Likewise.
* timevar.h: Likewise.
* tree-chrec.h: Likewise.
* tree-data-ref.h: Likewise.
* tree-iterator.h: Likewise.
* tree-outof-ssa.h: Likewise.
* tree-phinodes.h: Likewise.
* tree-scalar-evolution.h: Likewise.
* tree-sra.h: Likewise.
* tree-ssa-alias.h: Likewise.
* tree-ssa-live.h: Likewise.
* tree-ssa-loop-manip.h: Likewise.
* tree-ssa-loop.h: Likewise.
* tree-ssa-operands.h: Likewise.
* tree-ssa-propagate.h: Likewise.
* tree-ssa-sccvn.h: Likewise.
* tree-ssa.h: Likewise.
* tree-ssanames.h: Likewise.
* tree-streamer.h: Likewise.
* tree-switch-conversion.h: Likewise.
* tree-vectorizer.h: Likewise.
* tree.h: Likewise.
* wide-int.h: Likewise.
gcc/c-family/ChangeLog:
* c-common.h: Mechanically drop static from static inline
functions via s/^static inline/inline/g.
gcc/c/ChangeLog:
* c-parser.h: Mechanically drop static from static inline
functions via s/^static inline/inline/g.
gcc/cp/ChangeLog:
* cp-tree.h: Mechanically drop static from static inline
functions via s/^static inline/inline/g.
gcc/fortran/ChangeLog:
* gfortran.h: Mechanically drop static from static inline
functions via s/^static inline/inline/g.
gcc/jit/ChangeLog:
* jit-dejagnu.h: Mechanically drop static from static inline
functions via s/^static inline/inline/g.
* jit-recording.h: Likewise.
gcc/objc/ChangeLog:
* objc-act.h: Mechanically drop static from static inline
functions via s/^static inline/inline/g.
* objc-map.h: Likewise.
* objc-act.cc: Remove the redundant redeclarations of is_ivar
and lookup_category.
|
|
Normally, when vf is not constant, this is rejected by
vectorizable_nonlinear_induction, but in this case we never enter
if (STMT_VINFO_RELEVANT_P (stmt_info))
{
need_to_vectorize = true;
if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
&& ! PURE_SLP_STMT (stmt_info))
ok = vectorizable_induction (loop_vinfo,
stmt_info, NULL, NULL,
&cost_vec);
since the iv is never used outside of the loop and will be DCEd later,
the vectorizer doesn't bother checking whether it's vectorizable. That
is fine in itself, but we then hit the gcc_assert in
vect_can_peel_nonlinear_iv_p when vf is not constant. One solution is
ignoring the nonlinear iv peeling when it's !STMT_VINFO_RELEVANT_P
(stmt_info), just like the code above; the other is returning false
earlier in vect_can_peel_nonlinear_iv_p when vf is not constant. The
patch chooses the second, in case other users of vect_can_advance_ivs_p
(which calls vect_can_peel_nonlinear_iv_p) are affected.
Also remove vect_can_peel_nonlinear_iv_p from
vectorizable_nonlinear_induction.
gcc/ChangeLog:
PR tree-optimization/108601
* tree-vectorizer.h (vect_can_peel_nonlinear_iv_p): Removed.
* tree-vect-loop.cc
(vectorizable_nonlinear_induction): Remove
vect_can_peel_nonlinear_iv_p.
(vect_can_peel_nonlinear_iv_p): Don't peel
nonlinear iv (mult or shift) for the epilog when vf is not
constant, and move the definition to ...
* tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p):
... here.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/pr108601.c: New test.
|
|
|
The following picks up the prototype by Ju-Zhe Zhong for vectorizing
first order recurrences. That solves two TSVC missed optimization PRs.
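A first order recurrence is a loop of the following shape (a sketch in
the spirit of the TSVC cases; the committed testcases may differ):
void
f (int *a, int *b, int n)
{
  int t = 0;
  for (int i = 0; i < n; i++)
    {
      a[i] = t + b[i];
      t = b[i]; /* t carries b[i] into the next iteration */
    }
}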
There's a new scalar cycle def kind, vect_first_order_recurrence
and its handling of the backedge value vectorization is complicated
by the fact that the vectorized value isn't the PHI but instead
a (series of) permute(s) shifting in the recurring value from the
previous iteration. I've implemented this by creating both the
single vectorized PHI and the series of permutes when vectorizing
the scalar PHI but leave the backedge values in both unassigned.
The backedge values are (for the testcases) computed by a load
which is also the place after which the permutes are inserted.
That placement also restricts the cases we can handle (without
resorting to code motion).
I added both costing and SLP handling though SLP handling is
restricted to the case where a single vectorized PHI is enough.
Missing is epilogue handling - while prologue peeling would
be handled transparently by adjusting iv_phi_p the epilogue
case doesn't work with just inserting a scalar LC PHI since
that a) keeps the scalar load live and b) that load is the
wrong one; it has to be the last, much like when we'd vectorize
the LC PHI as a live operation. Unfortunately LIVE
compute/analysis happens too early before we decide on
peeling. When using fully masked loop vectorization the
vect-recurr-6.c works as expected though.
I have tested this on x86_64 for now, but since epilogue
handling is missing there's probably no practical cases.
My prototype WHILE_ULT AVX512 patch can handle vect-recurr-6.c
just fine but I didn't feel like running SPEC within SDE nor
is the WHILE_ULT patch complete enough.
PR tree-optimization/99409
PR tree-optimization/99394
* tree-vectorizer.h (vect_def_type::vect_first_order_recurrence): Add.
(stmt_vec_info_type::recurr_info_type): Likewise.
(vectorizable_recurr): New function.
* tree-vect-loop.cc (vect_phi_first_order_recurrence_p): New
function.
(vect_analyze_scalar_cycles_1): Look for first order
recurrences.
(vect_analyze_loop_operations): Handle them.
(vect_transform_loop): Likewise.
(vectorizable_recurr): New function.
(maybe_set_vectorized_backedge_value): Handle the backedge value
setting in the first order recurrence PHI and the permutes.
* tree-vect-stmts.cc (vect_analyze_stmt): Handle first order
recurrences.
(vect_transform_stmt): Likewise.
(vect_is_simple_use): Likewise.
(vect_is_simple_use): Likewise.
* tree-vect-slp.cc (vect_get_and_check_slp_defs): Likewise.
(vect_build_slp_tree_2): Likewise.
(vect_schedule_scc): Handle the backedge value setting in the
first order recurrence PHI and the permutes.
* gcc.dg/vect/vect-recurr-1.c: New testcase.
* gcc.dg/vect/vect-recurr-2.c: Likewise.
* gcc.dg/vect/vect-recurr-3.c: Likewise.
* gcc.dg/vect/vect-recurr-4.c: Likewise.
* gcc.dg/vect/vect-recurr-5.c: Likewise.
* gcc.dg/vect/vect-recurr-6.c: Likewise.
* gcc.dg/vect/tsvc/vect-tsvc-s252.c: Un-XFAIL.
* gcc.dg/vect/tsvc/vect-tsvc-s254.c: Likewise.
* gcc.dg/vect/tsvc/vect-tsvc-s291.c: Likewise.
Co-authored-by: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
|
|
vectorizable_nonlinear_induction doesn't always guard
vect_peel_nonlinear_iv_init when the latter is called by
vect_update_ivs_after_vectorizer; that call is supposed to be
guarded by vect_can_advance_ivs_p.
gcc/ChangeLog:
PR tree-optimization/107055
* tree-vect-loop-manip.cc (vect_can_advance_ivs_p): Check for
nonlinear induction variables.
* tree-vect-loop.cc (vect_can_peel_nonlinear_iv_p): New
functions.
(vectorizable_nonlinear_induction): Move part of the code into
vect_can_peel_nonlinear_iv_p.
* tree-vectorizer.h (vect_can_peel_nonlinear_iv_p): Declare.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr107055.c: New test.
|
|
Vectorize nonlinear induction variables (neg, mul, shift) with a
constant.
For neg, the patch creates a vec_init as [ a, -a, a, -a, ... ] and no
vec_step is needed to update the vectorized iv since vf is always a
multiple of 2 (negative * negative is positive).
For shift, the patch creates a vec_init as [ a, a >> c, a >> 2*c, .. ]
and a vec_step as [ c * nunits, c * nunits, c * nunits, ... ]; the
vectorized iv is updated as vec_def = vec_init >>/<< vec_step.
For mul, the patch creates a vec_init as [ a, a * c, a * pow(c, 2), .. ]
and a vec_step as [ pow(c, nunits), pow(c, nunits), ... ]; the iv is
updated as vec_def = vec_init * vec_step.
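A worked example for the mul case (with assumed values a = 3, c = 2,
nunits = 4):
  vec_init = { 3, 6, 12, 24 }   /* a, a*c, a*c^2, a*c^3 */
  vec_step = { 16, 16, 16, 16 } /* pow (c, nunits) = 2^4 */
  vec_def = vec_init * vec_step /* { 48, 96, 192, 384 },
				   i.e. a*c^4 .. a*c^7 */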
The patch handles nonlinear ivs for:
1. Integer types only; floating point is not handled.
2. No slp_node.
3. iv_loop must be the same as the vector loop, not a nested loop.
4. No UB is created: for mul, unsigned mult is used to avoid UB; for
shift, the shift count must be less than the type precision.
gcc/ChangeLog:
PR tree-optimization/103144
* tree-vect-loop.cc (vect_is_nonlinear_iv_evolution): New function.
(vect_analyze_scalar_cycles_1): Detect nonlinear iv by upper function.
(vect_create_nonlinear_iv_init): New function.
(vect_peel_nonlinear_iv_init): Ditto.
(vect_create_nonlinear_iv_step): Ditto
(vect_create_nonlinear_iv_vec_step): Ditto
(vect_update_nonlinear_iv): Ditto
(vectorizable_nonlinear_induction): Ditto.
(vectorizable_induction): Call
vectorizable_nonlinear_induction when induction_type is not
vect_step_op_add.
* tree-vect-loop-manip.cc (vect_update_ivs_after_vectorizer):
Update nonlinear iv for epilogue loop.
* tree-vectorizer.h (enum vect_induction_op_type): New enum.
(STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE): New Macro.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr103144-mul-1.c: New test.
* gcc.target/i386/pr103144-mul-2.c: New test.
* gcc.target/i386/pr103144-neg-1.c: New test.
* gcc.target/i386/pr103144-neg-2.c: New test.
* gcc.target/i386/pr103144-shift-1.c: New test.
* gcc.target/i386/pr103144-shift-2.c: New test.
|
|
Currently SLP tries to force permute operations "down" the graph
from loads in the hope of reducing the total number of permutations
needed or (in the best case) removing the need for the permutations
entirely. This patch tries to extend it as follows:
- Allow loads to take a different permutation from the one they
started with, rather than choosing between "original permutation"
and "no permutation".
- Allow changes in both directions, if the target supports the
reverse permutation.
- Treat the placement of permutations as a two-way dataflow problem:
after propagating information from leaves to roots (as now), propagate
information back up the graph.
- Take execution frequency into account when optimising for speed,
so that (for example) permutations inside loops have a higher
cost than permutations outside loops.
- Try to reduce the total number of permutations when optimising for
size, even if that increases the number of permutations on a given
execution path.
See the big block comment above vect_optimize_slp_pass for
a detailed description.
The original motivation for doing this was to add a framework that would
allow other layout differences in future. The two main ones are:
- Make it easier to represent predicated operations, including
predicated operations with gaps. E.g.:
a[0] += 1;
a[1] += 1;
a[3] += 1;
could be a single load/add/store for SVE. We could handle this
by representing a layout such as { 0, 1, _, 2 } or { 0, 1, _, 3 }
(depending on what's being counted). We might need to move
elements between lanes at various points, like with permutes.
(This would first mean adding support for stores with gaps.)
- Make it easier to switch between an even/odd and unpermuted layout
when switching between wide and narrow elements. E.g. if a widening
operation produces an even vector and an odd vector, we should try
to keep operations on the wide elements in that order rather than
force them to be permuted back "in order".
To give some examples of what the patch does:
int f1(int *__restrict a, int *__restrict b, int *__restrict c,
int *__restrict d)
{
a[0] = (b[1] << c[3]) - d[1];
a[1] = (b[0] << c[2]) - d[0];
a[2] = (b[3] << c[1]) - d[3];
a[3] = (b[2] << c[0]) - d[2];
}
continues to produce the same code as before when optimising for
speed: b, c and d are permuted at load time. But when optimising
for size we instead permute c into the same order as b+d and then
permute the result of the arithmetic into the same order as a:
ldr q1, [x2]
ldr q0, [x1]
ext v1.16b, v1.16b, v1.16b, #8 // <------
sshl v0.4s, v0.4s, v1.4s
ldr q1, [x3]
sub v0.4s, v0.4s, v1.4s
rev64 v0.4s, v0.4s // <------
str q0, [x0]
ret
The following function:
int f2(int *__restrict a, int *__restrict b, int *__restrict c,
int *__restrict d)
{
a[0] = (b[3] << c[3]) - d[3];
a[1] = (b[2] << c[2]) - d[2];
a[2] = (b[1] << c[1]) - d[1];
a[3] = (b[0] << c[0]) - d[0];
}
continues to push the reverse down to just before the store,
like the previous code did.
In:
int f3(int *__restrict a, int *__restrict b, int *__restrict c,
int *__restrict d)
{
for (int i = 0; i < 100; ++i)
{
a[0] = (a[0] + c[3]);
a[1] = (a[1] + c[2]);
a[2] = (a[2] + c[1]);
a[3] = (a[3] + c[0]);
c += 4;
}
}
the loads of a are hoisted and the stores of a are sunk, so that
only the load from c happens in the loop. When optimising for
speed, we prefer to have the loop operate on the reversed layout,
changing on entry and exit from the loop:
mov x3, x0
adrp x0, .LC0
add x1, x2, 1600
ldr q2, [x0, #:lo12:.LC0]
ldr q0, [x3]
mov v1.16b, v0.16b
tbl v0.16b, {v0.16b - v1.16b}, v2.16b // <--------
.p2align 3,,7
.L6:
ldr q1, [x2], 16
add v0.4s, v0.4s, v1.4s
cmp x2, x1
bne .L6
mov v1.16b, v0.16b
adrp x0, .LC0
ldr q2, [x0, #:lo12:.LC0]
tbl v0.16b, {v0.16b - v1.16b}, v2.16b // <--------
str q0, [x3]
ret
Similarly, for the very artificial testcase:
int f4(int *__restrict a, int *__restrict b, int *__restrict c,
int *__restrict d)
{
int a0 = a[0];
int a1 = a[1];
int a2 = a[2];
int a3 = a[3];
for (int i = 0; i < 100; ++i)
{
a0 ^= c[0];
a1 ^= c[1];
a2 ^= c[2];
a3 ^= c[3];
c += 4;
for (int j = 0; j < 100; ++j)
{
a0 += d[1];
a1 += d[0];
a2 += d[3];
a3 += d[2];
d += 4;
}
b[0] = a0;
b[1] = a1;
b[2] = a2;
b[3] = a3;
b += 4;
}
a[0] = a0;
a[1] = a1;
a[2] = a2;
a[3] = a3;
}
the a vector in the inner loop maintains the order { 1, 0, 3, 2 },
even though it's part of an SCC that includes the outer loop.
In other words, this is a motivating case for not assigning
permutes at SCC granularity. The code we get is:
ldr q0, [x0]
mov x4, x1
mov x5, x0
add x1, x3, 1600
add x3, x4, 1600
.p2align 3,,7
.L11:
ldr q1, [x2], 16
sub x0, x1, #1600
eor v0.16b, v1.16b, v0.16b
rev64 v0.4s, v0.4s // <---
.p2align 3,,7
.L10:
ldr q1, [x0], 16
add v0.4s, v0.4s, v1.4s
cmp x0, x1
bne .L10
rev64 v0.4s, v0.4s // <---
add x1, x0, 1600
str q0, [x4], 16
cmp x3, x4
bne .L11
str q0, [x5]
ret
bb-slp-layout-17.c is a collection of compile tests for problems
I hit with earlier versions of the patch. The same problems might
show up elsewhere, but it seemed worth having the test anyway.
In slp-11b.c we previously pushed the permutation of the in[i*4]
group down from the load to just before the store. That didn't
reduce the number or frequency of the permutations (or increase
them either). But separating the permute from the load meant
that we could no longer use load/store lanes.
Whether load/store lanes are a good idea here is another question.
If there were two sets of loads, and if we could use a single
permutation instead of one per load, then avoiding load/store
lanes should be a good thing even under the current abstract
cost model. But I think under the current model we should
try to avoid splitting up potential load/store lanes groups
if there is no specific benefit to the split.
Preferring load/store lanes is still a source of missed optimisations
that we should fix one day...
gcc/
* params.opt (-param=vect-max-layout-candidates=): New parameter.
* doc/invoke.texi (vect-max-layout-candidates): Document it.
* tree-vectorizer.h (auto_lane_permutation_t): New typedef.
(auto_load_permutation_t): Likewise.
* tree-vect-slp.cc (vect_slp_node_weight): New function.
(slpg_layout_cost): New class.
(slpg_vertex): Replace perm_in and perm_out with partition,
out_degree, weight and out_weight.
(slpg_partition_info, slpg_partition_layout_costs): New classes.
(vect_optimize_slp_pass): Likewise, cannibalizing some part of
the previous vect_optimize_slp.
(vect_optimize_slp): Use it.
gcc/testsuite/
* lib/target-supports.exp (check_effective_target_vect_var_shift):
Return true for aarch64.
* gcc.dg/vect/bb-slp-layout-1.c: New test.
* gcc.dg/vect/bb-slp-layout-2.c: New test.
* gcc.dg/vect/bb-slp-layout-3.c: New test.
* gcc.dg/vect/bb-slp-layout-4.c: New test.
* gcc.dg/vect/bb-slp-layout-5.c: New test.
* gcc.dg/vect/bb-slp-layout-6.c: New test.
* gcc.dg/vect/bb-slp-layout-7.c: New test.
* gcc.dg/vect/bb-slp-layout-8.c: New test.
* gcc.dg/vect/bb-slp-layout-9.c: New test.
* gcc.dg/vect/bb-slp-layout-10.c: New test.
* gcc.dg/vect/bb-slp-layout-11.c: New test.
* gcc.dg/vect/bb-slp-layout-13.c: New test.
* gcc.dg/vect/bb-slp-layout-14.c: New test.
* gcc.dg/vect/bb-slp-layout-15.c: New test.
* gcc.dg/vect/bb-slp-layout-16.c: New test.
* gcc.dg/vect/bb-slp-layout-17.c: New test.
* gcc.dg/vect/slp-11b.c: XFAIL SLP test for load-lanes targets.
|
|
The following reverts the just added assert that virtual SSA does not
need updating. It instead goes for a select whitelist of transforms
known to be prone to difficulties with virtual SSA update.
* tree-vect-loop-manip.cc (vect_do_peeling): Revert assert
and update virtual SSA form again. Assert we do so for
a known set of transforms only.
* tree-vectorizer.h (vec_info::any_known_not_updated_vssa): New.
* tree-vect-stmts.cc (vectorizable_load): When vectorizing
using load-lanes allow virtual SSA update.
|
|
This adjusts the vectorizer costing API to allow passing down the
SLP node the vector stmt is created from.
2022-02-18 Richard Biener <rguenther@suse.de>
PR tree-optimization/104582
* tree-vectorizer.h (stmt_info_for_cost::node): New field.
(vector_costs::add_stmt_cost): Add SLP node parameter.
(dump_stmt_cost): Likewise.
(add_stmt_cost): Likewise, new overload and adjust.
(add_stmt_costs): Adjust.
(record_stmt_cost): New overload.
* tree-vectorizer.cc (dump_stmt_cost): Dump the SLP node.
(vector_costs::add_stmt_cost): Adjust.
* tree-vect-loop.cc (vect_estimate_min_profitable_iters):
Adjust.
* tree-vect-slp.cc (vect_prologue_cost_for_slp): Record
the SLP node for costing.
(vectorizable_slp_permutation): Likewise.
* tree-vect-stmts.cc (record_stmt_cost): Adjust and add
new overloads.
* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
Adjust.
* config/aarch64/aarch64.cc (aarch64_vector_costs::add_stmt_cost):
Adjust.
* config/rs6000/rs6000.cc (rs6000_vector_costs::add_stmt_cost):
Adjust.
(rs6000_cost_data::adjust_vect_cost_per_loop): Likewise.
|
|
This simplifies the vectorizer cost API by providing overloads
to add_stmt_cost and record_stmt_cost suitable for scalar stmt
and branch stmt costing which do not need information like
a vector type or alignment. It also fixes two mistakes where
costs for versioning tests were recorded as vector stmt rather
than scalar stmt.
This is a first patch to simplify the actual fix for PR104582.
2022-02-18 Richard Biener <rguenther@suse.de>
PR tree-optimization/104582
* tree-vectorizer.h (add_stmt_cost): New overload.
(record_stmt_cost): Likewise.
* tree-vect-loop.cc (vect_compute_single_scalar_iteration_cost):
Use add_stmt_costs.
(vect_get_known_peeling_cost): Use new overloads.
(vect_estimate_min_profitable_iters): Likewise. Consistently
use scalar_stmt for costing versioning checks.
* tree-vect-stmts.cc (record_stmt_cost): New overload.
|
|
ldp_stp_1.c, ldp_stp_4.c and ldp_stp_5.c have been failing since
vectorisation was enabled at -O2. In all three cases SLP is
generating vector code when scalar code would be better.
The problem is that the target costs do not model whether STP could
be used for the scalar or vector code, so the normal latency-based
costs for store-heavy code can be way off. It would be good to fix
that “properly” at some point, but it isn't easy; see the existing
discussion in aarch64_sve_adjust_stmt_cost for more details.
This patch therefore adds an on-the-side check for whether the
code is doing nothing more than set-up+stores. It then applies
STP-based costs to those cases only, in addition to the normal
latency-based costs. (That is, the vector code has to win on
both counts rather than on one count individually.)
However, at the moment, SLP costs one vector set-up instruction
for every vector in an SLP node, even if the contents are the
same as a previous vector in the same node. Fixing the STP costs
without fixing that would regress other cases, tested in the patch.
The patch therefore makes the SLP costing code check for duplicates
within a node. Ideally we'd check for duplicates more globally,
but that would require a more global approach to costs: the cost
of an initialisation should be amortised across all trees that
use the initialisation, rather than fully counted against one
arbitrarily-chosen subtree.
Back on aarch64: an earlier version of the patch tried to apply
the new heuristic to constant stores. However, that didn't work
too well in practice; see the comments for details. The patch
therefore just tests the status quo for constant cases, leaving out
a match if the current choice is dubious.
ldp_stp_5.c was affected by the same thing. The test would be
worth vectorising if we generated better vector code, but:
(1) We do a bad job of moving the { -1, 1 } constant, given that
we have { -1, -1 } and { 1, 1 } to hand.
(2) The vector code has 6 pairable stores to misaligned offsets.
We have peephole patterns to handle such misalignment for
4 pairable stores, but not 6.
So the SLP decision isn't wrong as such. It's just being let
down by later codegen.
The patch therefore adds -mstrict-align to preserve the original
intention of the test while adding ldp_stp_19.c to check for the
preferred vector code (XFAILed for now).
gcc/
* tree-vectorizer.h (vect_scalar_ops_slice): New struct.
(vect_scalar_ops_slice_hash): Likewise.
(vect_scalar_ops_slice::op): New function.
* tree-vect-slp.cc (vect_scalar_ops_slice::all_same_p): New function.
(vect_scalar_ops_slice_hash::hash): Likewise.
(vect_scalar_ops_slice_hash::equal): Likewise.
(vect_prologue_cost_for_slp): Check for duplicate vectors.
* config/aarch64/aarch64.cc
(aarch64_vector_costs::m_stp_sequence_cost): New member variable.
(aarch64_aligned_constant_offset_p): New function.
(aarch64_stp_sequence_cost): Likewise.
(aarch64_vector_costs::add_stmt_cost): Handle new STP heuristic.
(aarch64_vector_costs::finish_cost): Likewise.
gcc/testsuite/
* gcc.target/aarch64/ldp_stp_5.c: Require -mstrict-align.
* gcc.target/aarch64/ldp_stp_14.h,
* gcc.target/aarch64/ldp_stp_14.c: New test.
* gcc.target/aarch64/ldp_stp_15.c: Likewise.
* gcc.target/aarch64/ldp_stp_16.c: Likewise.
* gcc.target/aarch64/ldp_stp_17.c: Likewise.
* gcc.target/aarch64/ldp_stp_18.c: Likewise.
* gcc.target/aarch64/ldp_stp_19.c: Likewise.
|
|
This patch boosts the analysis for complex mul,fma and fms in order to ensure
that it doesn't create an incorrect output.
Essentially it adds an extra verification to check that the two nodes it's going
to combine do the same operations on compatible values. The reason it needs to
do this is that if one computation differs from the other then with the current
implementation we have no way to deal with it since we have to remove the
permute.
When we can keep the permute around we can probably handle these by unrolling.
While implementing this, since I have to do the traversal anyway, I took
advantage of it by simplifying the code a bit. Previously we would determine whether
something is a conjugate and then try to figure out which conjugate it is and
then try to see if the permutes match what we expect.
Now the code that does the traversal will detect this in one go and return to us
whether the operation is something that can be combined and whether a conjugate
is present.
Secondly because it does this I can now simplify the checking code itself to
essentially just try to apply fixed patterns to each operation.
The patterns represent the order operations should appear in. For instance a
complex MUL operation combines:
Left 1 + Right 1
Left 2 + Right 2
with a permute on the nodes consisting of:
{ Even, Even } + { Odd, Odd }
{ Even, Odd } + { Odd, Even }
By abstracting over these patterns the checking code becomes quite simple.
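Concretely, with real parts in even lanes and imaginary parts in odd
lanes, (a + bi) * (c + di) decomposes as:
  real: a*c - b*d  /* { even, even } combined with { odd, odd } */
  imag: a*d + b*c  /* { even, odd } combined with { odd, even } */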
As part of this I was checking the order of the operands, which was left
in "slp" order, as in, the same order they showed up in during SLP, which
means that the accumulator is first. However, it looks like I didn't
document this, and the x86 optab was implemented assuming the same order
as FMA, i.e. that the accumulator is last. I have therefore changed the
order to match that of FMA and FMS, which corrects the x86 codegen; the
Arm targets will be updated separately. This has now also been
documented.
gcc/ChangeLog:
PR tree-optimization/102819
PR tree-optimization/103169
* doc/md.texi: Update docs for cfms, cfma.
* tree-data-ref.h (same_data_refs): Accept optional offset.
* tree-vect-slp-patterns.cc (is_linear_load_p): Fix issue with repeating
patterns.
(vect_normalize_conj_loc): Remove.
(is_eq_or_top): Change to take two nodes.
(enum _conj_status, compatible_complex_nodes_p,
vect_validate_multiplication): New.
(class complex_add_pattern, complex_add_pattern::matches,
complex_add_pattern::recognize, class complex_mul_pattern,
complex_mul_pattern::recognize, class complex_fms_pattern,
complex_fms_pattern::recognize, class complex_operations_pattern,
complex_operations_pattern::recognize, addsub_pattern::recognize): Pass
new cache.
(complex_fms_pattern::matches, complex_mul_pattern::matches): Pass new
cache and use new validation code.
* tree-vect-slp.cc (vect_match_slp_patterns_2, vect_match_slp_patterns,
vect_analyze_slp): Pass along cache.
(compatible_calls_p): Expose.
* tree-vectorizer.h (compatible_calls_p, slp_node_hash,
slp_compat_nodes_map_t): New.
(class vect_pattern): Update signatures include new cache.
gcc/testsuite/ChangeLog:
PR tree-optimization/102819
PR tree-optimization/103169
* g++.dg/vect/pr99149.cc: xfail for now.
* gcc.dg/vect/complex/pr102819-1.c: New test.
* gcc.dg/vect/complex/pr102819-2.c: New test.
* gcc.dg/vect/complex/pr102819-3.c: New test.
* gcc.dg/vect/complex/pr102819-4.c: New test.
* gcc.dg/vect/complex/pr102819-5.c: New test.
* gcc.dg/vect/complex/pr102819-6.c: New test.
* gcc.dg/vect/complex/pr102819-7.c: New test.
* gcc.dg/vect/complex/pr102819-8.c: New test.
* gcc.dg/vect/complex/pr102819-9.c: New test.
* gcc.dg/vect/complex/pr103169.c: New test.
|
|
gcc/ChangeLog:
* tree-vect-loop.cc (vect_estimate_min_profitable_iters): Pass new
argument suggested_unroll_factor.
(vect_analyze_loop_costing): Likewise.
(_loop_vec_info::_loop_vec_info): Initialize new member
suggested_unroll_factor.
(vect_determine_partial_vectors_and_peeling): Make epilogue of unrolled
main loop use partial vectors.
(vect_analyze_loop_2): Pass and use new argument
suggested_unroll_factor.
(vect_analyze_loop_1): Change to initialize local
suggested_unroll_factor and use it.
(vectorizable_reduction): Don't use single_defuse_cycle when unrolling.
* tree-vectorizer.h (_loop_vec_info::_loop_vec_info): Add new member
suggested_unroll_factor.
(vector_costs::vector_costs): Add new member m_suggested_unroll_factor.
(vector_costs::suggested_unroll_factor): New getter function.
(finish_cost): Set return argument suggested_unroll_factor.
|
|
ChangeLog:
* MAINTAINERS: Rename .c names to .cc.
contrib/ChangeLog:
* filter-clang-warnings.py: Rename .c names to .cc.
* gcc_update: Likewise.
* paranoia.cc: Likewise.
contrib/header-tools/ChangeLog:
* README: Rename .c names to .cc.
gcc/ChangeLog:
* Makefile.in: Rename .c names to .cc.
* alias.h: Likewise.
* asan.cc: Likewise.
* auto-profile.h: Likewise.
* basic-block.h (struct basic_block_d): Likewise.
* btfout.cc: Likewise.
* builtins.cc (expand_builtin_longjmp): Likewise.
(validate_arg): Likewise.
(access_ref::offset_bounded): Likewise.
* caller-save.cc (reg_restore_code): Likewise.
(setup_save_areas): Likewise.
* calls.cc (initialize_argument_information): Likewise.
(expand_call): Likewise.
(emit_library_call_value_1): Likewise.
* cfg-flags.def (RTL): Likewise.
(SIBCALL): Likewise.
(CAN_FALLTHRU): Likewise.
* cfganal.cc (post_order_compute): Likewise.
* cfgcleanup.cc (try_simplify_condjump): Likewise.
(merge_blocks_move_predecessor_nojumps): Likewise.
(merge_blocks_move_successor_nojumps): Likewise.
(merge_blocks_move): Likewise.
(old_insns_match_p): Likewise.
(try_crossjump_bb): Likewise.
* cfgexpand.cc (expand_gimple_stmt): Likewise.
* cfghooks.cc (split_block_before_cond_jump): Likewise.
(profile_record_check_consistency): Likewise.
* cfghooks.h: Likewise.
* cfgrtl.cc (pass_free_cfg::execute): Likewise.
(rtl_can_merge_blocks): Likewise.
(try_redirect_by_replacing_jump): Likewise.
(make_pass_outof_cfg_layout_mode): Likewise.
(cfg_layout_can_merge_blocks_p): Likewise.
* cgraph.cc (release_function_body): Likewise.
(cgraph_node::get_fun): Likewise.
* cgraph.h (struct cgraph_node): Likewise.
(asmname_hasher::equal): Likewise.
(cgraph_inline_failed_type): Likewise.
(thunk_adjust): Likewise.
(dump_callgraph_transformation): Likewise.
(record_references_in_initializer): Likewise.
(ipa_discover_variable_flags): Likewise.
* cgraphclones.cc (GTY): Likewise.
* cgraphunit.cc (symbol_table::finalize_compilation_unit): Likewise.
* collect-utils.h (GCC_COLLECT_UTILS_H): Likewise.
* collect2-aix.h (GCC_COLLECT2_AIX_H): Likewise.
* collect2.cc (maybe_run_lto_and_relink): Likewise.
* combine-stack-adj.cc: Likewise.
* combine.cc (setup_incoming_promotions): Likewise.
(combine_simplify_rtx): Likewise.
(count_rtxs): Likewise.
* common.opt: Likewise.
* common/config/aarch64/aarch64-common.cc: Likewise.
* common/config/arm/arm-common.cc (arm_asm_auto_mfpu): Likewise.
* common/config/avr/avr-common.cc: Likewise.
* common/config/i386/i386-isas.h (struct _isa_names_table): Likewise.
* conditions.h: Likewise.
* config.gcc: Likewise.
* config/aarch64/aarch64-builtins.cc (aarch64_resolve_overloaded_memtag): Likewise.
* config/aarch64/aarch64-protos.h (aarch64_classify_address): Likewise.
(aarch64_get_extension_string_for_isa_flags): Likewise.
* config/aarch64/aarch64-sve-builtins.cc (function_builder::add_function): Likewise.
* config/aarch64/aarch64.cc (aarch64_regmode_natural_size): Likewise.
(aarch64_sched_first_cycle_multipass_dfa_lookahead): Likewise.
(aarch64_option_valid_attribute_p): Likewise.
(aarch64_short_vector_p): Likewise.
(aarch64_float_const_representable_p): Likewise.
* config/aarch64/aarch64.h (DBX_REGISTER_NUMBER): Likewise.
(ASM_OUTPUT_POOL_EPILOGUE): Likewise.
(GTY): Likewise.
* config/aarch64/cortex-a57-fma-steering.cc: Likewise.
* config/aarch64/driver-aarch64.cc (contains_core_p): Likewise.
* config/aarch64/t-aarch64: Likewise.
* config/aarch64/x-aarch64: Likewise.
* config/aarch64/x-darwin: Likewise.
* config/alpha/alpha-protos.h: Likewise.
* config/alpha/alpha.cc (alpha_scalar_mode_supported_p): Likewise.
* config/alpha/alpha.h (LONG_DOUBLE_TYPE_SIZE): Likewise.
(enum reg_class): Likewise.
* config/alpha/alpha.md: Likewise.
* config/alpha/driver-alpha.cc (AMASK_LOCKPFTCHOK): Likewise.
* config/alpha/x-alpha: Likewise.
* config/arc/arc-protos.h (arc_eh_uses): Likewise.
* config/arc/arc.cc (ARC_OPT): Likewise.
(arc_ccfsm_advance): Likewise.
(arc_arg_partial_bytes): Likewise.
(conditionalize_nonjump): Likewise.
* config/arc/arc.md: Likewise.
* config/arc/builtins.def: Likewise.
* config/arc/t-arc: Likewise.
* config/arm/arm-c.cc (arm_resolve_overloaded_builtin): Likewise.
(arm_pragma_target_parse): Likewise.
* config/arm/arm-protos.h (save_restore_target_globals): Likewise.
(arm_cpu_cpp_builtins): Likewise.
* config/arm/arm.cc (vfp3_const_double_index): Likewise.
(shift_op): Likewise.
(thumb2_final_prescan_insn): Likewise.
(arm_final_prescan_insn): Likewise.
(arm_asm_output_labelref): Likewise.
(arm_small_register_classes_for_mode_p): Likewise.
* config/arm/arm.h: Likewise.
* config/arm/arm.md: Likewise.
* config/arm/driver-arm.cc: Likewise.
* config/arm/symbian.h: Likewise.
* config/arm/t-arm: Likewise.
* config/arm/thumb1.md: Likewise.
* config/arm/x-arm: Likewise.
* config/avr/avr-c.cc (avr_register_target_pragmas): Likewise.
* config/avr/avr-fixed.md: Likewise.
* config/avr/avr-log.cc (avr_log_vadump): Likewise.
* config/avr/avr-mcus.def: Likewise.
* config/avr/avr-modes.def (FRACTIONAL_INT_MODE): Likewise.
* config/avr/avr-passes.def (INSERT_PASS_BEFORE): Likewise.
* config/avr/avr-protos.h (make_avr_pass_casesi): Likewise.
* config/avr/avr.cc (avr_option_override): Likewise.
(avr_build_builtin_va_list): Likewise.
(avr_mode_dependent_address_p): Likewise.
(avr_function_arg_advance): Likewise.
(avr_asm_output_aligned_decl_common): Likewise.
* config/avr/avr.h (RETURN_ADDR_RTX): Likewise.
(SUPPORTS_INIT_PRIORITY): Likewise.
* config/avr/avr.md: Likewise.
* config/avr/builtins.def: Likewise.
* config/avr/gen-avr-mmcu-specs.cc (IN_GEN_AVR_MMCU_TEXI): Likewise.
* config/avr/gen-avr-mmcu-texi.cc (IN_GEN_AVR_MMCU_TEXI): Likewise.
(main): Likewise.
* config/avr/t-avr: Likewise.
* config/bfin/bfin.cc (frame_related_constant_load): Likewise.
* config/bpf/bpf-protos.h (GCC_BPF_PROTOS_H): Likewise.
* config/bpf/bpf.h (enum reg_class): Likewise.
* config/bpf/t-bpf: Likewise.
* config/c6x/c6x-protos.h (GCC_C6X_PROTOS_H): Likewise.
* config/cr16/cr16-protos.h: Likewise.
* config/cris/cris.cc (cris_address_cost): Likewise.
(cris_side_effect_mode_ok): Likewise.
(cris_init_machine_status): Likewise.
(cris_emit_movem_store): Likewise.
* config/cris/cris.h (INDEX_REG_CLASS): Likewise.
(enum reg_class): Likewise.
(struct cum_args): Likewise.
* config/cris/cris.opt: Likewise.
* config/cris/sync.md: Likewise.
* config/csky/csky.cc (csky_expand_prologue): Likewise.
* config/darwin-c.cc: Likewise.
* config/darwin-f.cc: Likewise.
* config/darwin-sections.def (zobj_const_section): Likewise.
* config/darwin.cc (output_objc_section_asm_op): Likewise.
(fprintf): Likewise.
* config/darwin.h (GTY): Likewise.
* config/elfos.h: Likewise.
* config/epiphany/epiphany-sched.md: Likewise.
* config/epiphany/epiphany.cc (epiphany_function_value): Likewise.
* config/epiphany/epiphany.h (GTY): Likewise.
(NO_FUNCTION_CSE): Likewise.
* config/epiphany/mode-switch-use.cc: Likewise.
* config/epiphany/predicates.md: Likewise.
* config/epiphany/t-epiphany: Likewise.
* config/fr30/fr30-protos.h: Likewise.
* config/frv/frv-protos.h: Likewise.
* config/frv/frv.cc (TLS_BIAS): Likewise.
* config/frv/frv.h (ASM_OUTPUT_ALIGNED_LOCAL): Likewise.
* config/ft32/ft32-protos.h: Likewise.
* config/gcn/gcn-hsa.h (ASM_APP_OFF): Likewise.
* config/gcn/gcn.cc (gcn_init_libfuncs): Likewise.
* config/gcn/mkoffload.cc (copy_early_debug_info): Likewise.
* config/gcn/t-gcn-hsa: Likewise.
* config/gcn/t-omp-device: Likewise.
* config/h8300/h8300-protos.h (GCC_H8300_PROTOS_H): Likewise.
(same_cmp_following_p): Likewise.
* config/h8300/h8300.cc (F): Likewise.
* config/h8300/h8300.h (struct cum_arg): Likewise.
(BRANCH_COST): Likewise.
* config/i386/cygming.h (DEFAULT_PCC_STRUCT_RETURN): Likewise.
* config/i386/djgpp.h (TARGET_ASM_LTO_END): Likewise.
* config/i386/dragonfly.h (NO_PROFILE_COUNTERS): Likewise.
* config/i386/driver-i386.cc (detect_caches_intel): Likewise.
* config/i386/freebsd.h (NO_PROFILE_COUNTERS): Likewise.
* config/i386/i386-c.cc (ix86_target_macros): Likewise.
* config/i386/i386-expand.cc (get_mode_wider_vector): Likewise.
* config/i386/i386-options.cc (ix86_set_func_type): Likewise.
* config/i386/i386-protos.h (ix86_extract_perm_from_pool_constant): Likewise.
(ix86_register_pragmas): Likewise.
(ix86_d_has_stdcall_convention): Likewise.
(i386_pe_seh_init_sections): Likewise.
* config/i386/i386.cc (ix86_function_arg_regno_p): Likewise.
(ix86_function_value_regno_p): Likewise.
(ix86_compute_frame_layout): Likewise.
(legitimize_pe_coff_symbol): Likewise.
(output_pic_addr_const): Likewise.
* config/i386/i386.h (defined): Likewise.
(host_detect_local_cpu): Likewise.
(CONSTANT_ADDRESS_P): Likewise.
(DEFAULT_LARGE_SECTION_THRESHOLD): Likewise.
(struct machine_frame_state): Likewise.
* config/i386/i386.md: Likewise.
* config/i386/lynx.h (ASM_OUTPUT_ALIGN): Likewise.
* config/i386/mmx.md: Likewise.
* config/i386/sse.md: Likewise.
* config/i386/t-cygming: Likewise.
* config/i386/t-djgpp: Likewise.
* config/i386/t-gnu-property: Likewise.
* config/i386/t-i386: Likewise.
* config/i386/t-intelmic: Likewise.
* config/i386/t-omp-device: Likewise.
* config/i386/winnt-cxx.cc (i386_pe_type_dllimport_p): Likewise.
(i386_pe_adjust_class_at_definition): Likewise.
* config/i386/winnt.cc (gen_stdcall_or_fastcall_suffix): Likewise.
(i386_pe_mangle_decl_assembler_name): Likewise.
(i386_pe_encode_section_info): Likewise.
* config/i386/x-cygwin: Likewise.
* config/i386/x-darwin: Likewise.
* config/i386/x-i386: Likewise.
* config/i386/x-mingw32: Likewise.
* config/i386/x86-tune-sched-core.cc: Likewise.
* config/i386/x86-tune.def: Likewise.
* config/i386/xm-djgpp.h (STANDARD_STARTFILE_PREFIX_1): Likewise.
* config/ia64/freebsd.h: Likewise.
* config/ia64/hpux.h (REGISTER_TARGET_PRAGMAS): Likewise.
* config/ia64/ia64-protos.h (ia64_except_unwind_info): Likewise.
* config/ia64/ia64.cc (ia64_function_value_regno_p): Likewise.
(ia64_secondary_reload_class): Likewise.
(bundling): Likewise.
* config/ia64/ia64.h: Likewise.
* config/ia64/ia64.md: Likewise.
* config/ia64/predicates.md: Likewise.
* config/ia64/sysv4.h: Likewise.
* config/ia64/t-ia64: Likewise.
* config/iq2000/iq2000.h (FUNCTION_MODE): Likewise.
* config/iq2000/iq2000.md: Likewise.
* config/linux.h (TARGET_HAS_BIONIC): Likewise.
(if): Likewise.
* config/m32c/m32c.cc (m32c_function_needs_enter): Likewise.
* config/m32c/m32c.h (MAX_REGS_PER_ADDRESS): Likewise.
* config/m32c/t-m32c: Likewise.
* config/m32r/m32r-protos.h: Likewise.
* config/m32r/m32r.cc (m32r_print_operand): Likewise.
* config/m32r/m32r.h: Likewise.
* config/m32r/m32r.md: Likewise.
* config/m68k/m68k-isas.def: Likewise.
* config/m68k/m68k-microarchs.def: Likewise.
* config/m68k/m68k-protos.h (strict_low_part_peephole_ok): Likewise.
(m68k_epilogue_uses): Likewise.
* config/m68k/m68k.cc (m68k_call_tls_get_addr): Likewise.
(m68k_sched_adjust_cost): Likewise.
(m68k_sched_md_init): Likewise.
* config/m68k/m68k.h (__transfer_from_trampoline): Likewise.
(enum m68k_function_kind): Likewise.
* config/m68k/m68k.md: Likewise.
* config/m68k/m68kemb.h: Likewise.
* config/m68k/uclinux.h (ENDFILE_SPEC): Likewise.
* config/mcore/mcore-protos.h: Likewise.
* config/mcore/mcore.cc (mcore_expand_insv): Likewise.
(mcore_expand_prolog): Likewise.
* config/mcore/mcore.h (TARGET_MCORE): Likewise.
* config/mcore/mcore.md: Likewise.
* config/microblaze/microblaze-protos.h: Likewise.
* config/microblaze/microblaze.cc (microblaze_legitimate_pic_operand): Likewise.
(microblaze_function_prologue): Likewise.
(microblaze_function_epilogue): Likewise.
(microblaze_select_section): Likewise.
(microblaze_asm_output_mi_thunk): Likewise.
(microblaze_eh_return): Likewise.
* config/microblaze/microblaze.h: Likewise.
* config/microblaze/microblaze.md: Likewise.
* config/microblaze/t-microblaze: Likewise.
* config/mips/driver-native.cc: Likewise.
* config/mips/loongson2ef.md: Likewise.
* config/mips/mips-protos.h (mips_expand_vec_cmp_expr): Likewise.
* config/mips/mips.cc (mips_rtx_costs): Likewise.
(mips_output_filename): Likewise.
(mips_output_function_prologue): Likewise.
(mips_output_function_epilogue): Likewise.
(mips_output_mi_thunk): Likewise.
* config/mips/mips.h: Likewise.
* config/mips/mips.md: Likewise.
* config/mips/t-mips: Likewise.
* config/mips/x-native: Likewise.
* config/mmix/mmix-protos.h: Likewise.
* config/mmix/mmix.cc (mmix_option_override): Likewise.
(mmix_dbx_register_number): Likewise.
(mmix_expand_prologue): Likewise.
* config/mmix/mmix.h: Likewise.
* config/mmix/mmix.md: Likewise.
* config/mmix/predicates.md: Likewise.
* config/mn10300/mn10300.cc (mn10300_symbolic_operand): Likewise.
(mn10300_legitimate_pic_operand_p): Likewise.
* config/mn10300/mn10300.h (enum reg_class): Likewise.
(NO_FUNCTION_CSE): Likewise.
* config/moxie/moxie-protos.h: Likewise.
* config/moxie/uclinux.h (TARGET_LIBC_HAS_FUNCTION): Likewise.
* config/msp430/msp430-devices.cc (extract_devices_dir_from_exec_prefix): Likewise.
* config/msp430/msp430.cc (msp430_gimplify_va_arg_expr): Likewise.
(msp430_incoming_return_addr_rtx): Likewise.
* config/msp430/msp430.h (msp430_get_linker_devices_include_path): Likewise.
* config/msp430/t-msp430: Likewise.
* config/nds32/nds32-cost.cc (nds32_rtx_costs_speed_prefer): Likewise.
(nds32_rtx_costs_size_prefer): Likewise.
(nds32_init_rtx_costs): Likewise.
* config/nds32/nds32-doubleword.md: Likewise.
* config/nds32/nds32.cc (nds32_memory_move_cost): Likewise.
(nds32_builtin_decl): Likewise.
* config/nds32/nds32.h (enum nds32_16bit_address_type): Likewise.
(enum nds32_isr_nested_type): Likewise.
(enum reg_class): Likewise.
* config/nds32/predicates.md: Likewise.
* config/nds32/t-nds32: Likewise.
* config/nios2/nios2.cc (nios2_pragma_target_parse): Likewise.
* config/nvptx/nvptx-protos.h: Likewise.
* config/nvptx/nvptx.cc (nvptx_goacc_expand_var_decl): Likewise.
* config/nvptx/nvptx.h (TARGET_CPU_CPP_BUILTINS): Likewise.
* config/nvptx/t-nvptx: Likewise.
* config/nvptx/t-omp-device: Likewise.
* config/pa/elf.h: Likewise.
* config/pa/pa-linux.h (GLOBAL_ASM_OP): Likewise.
* config/pa/pa-netbsd.h (GLOBAL_ASM_OP): Likewise.
* config/pa/pa-openbsd.h (TARGET_ASM_GLOBALIZE_LABEL): Likewise.
* config/pa/pa-protos.h (pa_eh_return_handler_rtx): Likewise.
(pa_legitimize_reload_address): Likewise.
(pa_can_use_return_insn): Likewise.
* config/pa/pa.cc (mem_shadd_or_shadd_rtx_p): Likewise.
(som_output_text_section_asm_op): Likewise.
* config/pa/pa.h (PROFILE_BEFORE_PROLOGUE): Likewise.
* config/pa/pa.md: Likewise.
* config/pa/som.h: Likewise.
* config/pa/t-pa: Likewise.
* config/pdp11/pdp11.cc (decode_pdp11_d): Likewise.
* config/pdp11/pdp11.h: Likewise.
* config/pdp11/pdp11.md: Likewise.
* config/pdp11/t-pdp11: Likewise.
* config/pru/pru.md: Likewise.
* config/pru/t-pru: Likewise.
* config/riscv/riscv-protos.h (NUM_SYMBOL_TYPES): Likewise.
(riscv_gpr_save_operation_p): Likewise.
(riscv_d_register_target_info): Likewise.
(riscv_init_builtins): Likewise.
* config/riscv/riscv.cc (riscv_output_mi_thunk): Likewise.
* config/riscv/riscv.h (CSW_MAX_OFFSET): Likewise.
* config/riscv/t-riscv: Likewise.
* config/rl78/rl78.cc (rl78_asm_ctor_dtor): Likewise.
* config/rl78/t-rl78: Likewise.
* config/rs6000/aix.h: Likewise.
* config/rs6000/aix71.h (ASM_SPEC_COMMON): Likewise.
* config/rs6000/aix72.h (ASM_SPEC_COMMON): Likewise.
* config/rs6000/aix73.h (ASM_SPEC_COMMON): Likewise.
* config/rs6000/darwin.h (TARGET_ASM_GLOBALIZE_LABEL): Likewise.
* config/rs6000/driver-rs6000.cc: Likewise.
* config/rs6000/freebsd.h: Likewise.
* config/rs6000/freebsd64.h: Likewise.
* config/rs6000/lynx.h (ASM_OUTPUT_ALIGN): Likewise.
* config/rs6000/rbtree.cc: Likewise.
* config/rs6000/rbtree.h: Likewise.
* config/rs6000/rs6000-c.cc (rs6000_target_modify_macros): Likewise.
* config/rs6000/rs6000-call.cc (rs6000_invalid_builtin): Likewise.
(rs6000_expand_builtin): Likewise.
(rs6000_init_builtins): Likewise.
* config/rs6000/rs6000-cpus.def: Likewise.
* config/rs6000/rs6000-gen-builtins.cc (write_init_ovld_table): Likewise.
* config/rs6000/rs6000-internal.h (ALTIVEC_REG_BIT): Likewise.
(quad_address_offset_p): Likewise.
* config/rs6000/rs6000-logue.cc (interesting_frame_related_regno): Likewise.
(rs6000_emit_epilogue): Likewise.
* config/rs6000/rs6000-overload.def: Likewise.
* config/rs6000/rs6000-p8swap.cc: Likewise.
* config/rs6000/rs6000-protos.h (GCC_RS6000_PROTOS_H): Likewise.
(rs6000_const_f32_to_i32): Likewise.
* config/rs6000/rs6000.cc (legitimate_lo_sum_address_p): Likewise.
(rs6000_debug_legitimize_address): Likewise.
(rs6000_mode_dependent_address): Likewise.
(rs6000_adjust_priority): Likewise.
(rs6000_c_mode_for_suffix): Likewise.
* config/rs6000/rs6000.h (defined): Likewise.
(LONG_DOUBLE_TYPE_SIZE): Likewise.
* config/rs6000/rs6000.md: Likewise.
* config/rs6000/sysv4.h: Likewise.
* config/rs6000/t-linux: Likewise.
* config/rs6000/t-linux64: Likewise.
* config/rs6000/t-rs6000: Likewise.
* config/rs6000/x-darwin: Likewise.
* config/rs6000/x-darwin64: Likewise.
* config/rs6000/x-rs6000: Likewise.
* config/rs6000/xcoff.h (ASM_OUTPUT_LABELREF): Likewise.
* config/rx/rx.cc (rx_expand_builtin): Likewise.
* config/s390/constraints.md: Likewise.
* config/s390/driver-native.cc: Likewise.
* config/s390/htmxlintrin.h: Likewise.
* config/s390/s390-builtins.def (B_DEF): Likewise.
(OB_DEF_VAR): Likewise.
* config/s390/s390-builtins.h: Likewise.
* config/s390/s390-c.cc: Likewise.
* config/s390/s390-opts.h: Likewise.
* config/s390/s390-protos.h (s390_check_symref_alignment): Likewise.
(s390_register_target_pragmas): Likewise.
* config/s390/s390.cc (s390_init_builtins): Likewise.
(s390_expand_plus_operand): Likewise.
(s390_expand_atomic): Likewise.
(s390_valid_target_attribute_inner_p): Likewise.
* config/s390/s390.h (LONG_DOUBLE_TYPE_SIZE): Likewise.
* config/s390/s390.md: Likewise.
* config/s390/t-s390: Likewise.
* config/s390/vx-builtins.md: Likewise.
* config/s390/x-native: Likewise.
* config/sh/divtab-sh4-300.cc (main): Likewise.
* config/sh/divtab-sh4.cc (main): Likewise.
* config/sh/divtab.cc (main): Likewise.
* config/sh/elf.h: Likewise.
* config/sh/sh-protos.h (sh_fsca_int2sf): Likewise.
* config/sh/sh.cc (SYMBOL_FLAG_FUNCVEC_FUNCTION): Likewise.
(sh_struct_value_rtx): Likewise.
(sh_remove_reg_dead_or_unused_notes): Likewise.
* config/sh/sh.h (MIN_UNITS_PER_WORD): Likewise.
* config/sh/t-sh: Likewise.
* config/sol2-protos.h (solaris_override_options): Likewise.
* config/sol2.h: Likewise.
* config/sparc/driver-sparc.cc: Likewise.
* config/sparc/freebsd.h: Likewise.
* config/sparc/sparc-protos.h (make_pass_work_around_errata): Likewise.
* config/sparc/sparc.cc (sparc_output_mi_thunk): Likewise.
(sparc_asan_shadow_offset): Likewise.
* config/sparc/sparc.h: Likewise.
* config/sparc/sparc.md: Likewise.
* config/sparc/t-sparc: Likewise.
* config/sparc/x-sparc: Likewise.
* config/stormy16/stormy16.cc (xstormy16_mode_dependent_address_p): Likewise.
* config/t-darwin: Likewise.
* config/t-dragonfly: Likewise.
* config/t-freebsd: Likewise.
* config/t-glibc: Likewise.
* config/t-linux: Likewise.
* config/t-netbsd: Likewise.
* config/t-openbsd: Likewise.
* config/t-pnt16-warn: Likewise.
* config/t-sol2: Likewise.
* config/t-vxworks: Likewise.
* config/t-winnt: Likewise.
* config/tilegx/t-tilegx: Likewise.
* config/tilegx/tilegx-c.cc: Likewise.
* config/tilegx/tilegx-protos.h (tilegx_function_profiler): Likewise.
* config/tilegx/tilegx.md: Likewise.
* config/tilepro/t-tilepro: Likewise.
* config/tilepro/tilepro-c.cc: Likewise.
* config/v850/t-v850: Likewise.
* config/v850/v850-protos.h: Likewise.
* config/v850/v850.cc (F): Likewise.
* config/v850/v850.h (enum reg_class): Likewise.
(SLOW_BYTE_ACCESS): Likewise.
* config/vax/vax.cc (vax_mode_dependent_address_p): Likewise.
* config/vax/vax.h (enum reg_class): Likewise.
* config/vax/vax.md: Likewise.
* config/visium/visium.cc (visium_legitimate_address_p): Likewise.
* config/visium/visium.h: Likewise.
* config/vms/t-vms: Likewise.
* config/vms/vms-crtlmap.map: Likewise.
* config/vms/vms-protos.h (vms_c_get_vms_ver): Likewise.
* config/vx-common.h: Likewise.
* config/x-darwin: Likewise.
* config/x-hpux: Likewise.
* config/x-linux: Likewise.
* config/x-netbsd: Likewise.
* config/x-openbsd: Likewise.
* config/x-solaris: Likewise.
* config/xtensa/xtensa-protos.h (xtensa_mem_offset): Likewise.
* config/xtensa/xtensa.cc (xtensa_option_override): Likewise.
* config/xtensa/xtensa.h: Likewise.
* configure.ac: Likewise.
* context.cc: Likewise.
* convert.h: Likewise.
* coretypes.h: Likewise.
* coverage.cc: Likewise.
* coverage.h: Likewise.
* cppdefault.h (struct default_include): Likewise.
* cprop.cc (local_cprop_pass): Likewise.
(one_cprop_pass): Likewise.
* cse.cc (hash_rtx_cb): Likewise.
(fold_rtx): Likewise.
* ctfc.h (ctfc_get_num_vlen_bytes): Likewise.
* data-streamer.h (bp_unpack_var_len_int): Likewise.
(streamer_write_widest_int): Likewise.
* dbgcnt.def: Likewise.
* dbxout.cc (dbxout_early_global_decl): Likewise.
(dbxout_common_check): Likewise.
* dbxout.h: Likewise.
* debug.h (struct gcc_debug_hooks): Likewise.
(dump_go_spec_init): Likewise.
* df-core.cc: Likewise.
* df-scan.cc (df_insn_info_delete): Likewise.
(df_insn_delete): Likewise.
* df.h (debug_df_chain): Likewise.
(can_move_insns_across): Likewise.
* dfp.cc (decimal_from_binary): Likewise.
* diagnostic-color.cc: Likewise.
* diagnostic-event-id.h: Likewise.
* diagnostic-show-locus.cc (test_one_liner_labels): Likewise.
* diagnostic.cc (bt_callback): Likewise.
(num_digits): Likewise.
* doc/avr-mmcu.texi: Likewise.
* doc/cfg.texi: Likewise.
* doc/contrib.texi: Likewise.
* doc/cppinternals.texi: Likewise.
* doc/extend.texi: Likewise.
* doc/generic.texi: Likewise.
* doc/gimple.texi: Likewise.
* doc/gty.texi: Likewise.
* doc/invoke.texi: Likewise.
* doc/loop.texi: Likewise.
* doc/lto.texi: Likewise.
* doc/match-and-simplify.texi: Likewise.
* doc/md.texi: Likewise.
* doc/optinfo.texi: Likewise.
* doc/options.texi: Likewise.
* doc/passes.texi: Likewise.
* doc/plugins.texi: Likewise.
* doc/rtl.texi: Likewise.
* doc/sourcebuild.texi: Likewise.
* doc/tm.texi: Likewise.
* doc/tm.texi.in: Likewise.
* doc/tree-ssa.texi: Likewise.
* dojump.cc (do_jump): Likewise.
* dojump.h: Likewise.
* dumpfile.cc (test_impl_location): Likewise.
(test_capture_of_dump_calls): Likewise.
* dumpfile.h (enum dump_kind): Likewise.
(class dump_location_t): Likewise.
(dump_enabled_p): Likewise.
(enable_rtl_dump_file): Likewise.
(dump_combine_total_stats): Likewise.
* dwarf2asm.cc (dw2_asm_output_delta_uleb128): Likewise.
* dwarf2ctf.h (ctf_debug_finish): Likewise.
* dwarf2out.cc (dwarf2out_begin_prologue): Likewise.
(struct loc_descr_context): Likewise.
(rtl_for_decl_location): Likewise.
(gen_subprogram_die): Likewise.
(gen_label_die): Likewise.
(is_trivial_indirect_ref): Likewise.
(dwarf2out_late_global_decl): Likewise.
(dwarf_file_hasher::hash): Likewise.
(dwarf2out_end_source_file): Likewise.
(dwarf2out_define): Likewise.
(dwarf2out_early_finish): Likewise.
* dwarf2out.h (struct dw_fde_node): Likewise.
(struct dw_discr_list_node): Likewise.
(output_loc_sequence_raw): Likewise.
* emit-rtl.cc (gen_raw_REG): Likewise.
(maybe_set_max_label_num): Likewise.
* emit-rtl.h (struct rtl_data): Likewise.
* errors.cc (internal_error): Likewise.
(trim_filename): Likewise.
* et-forest.cc: Likewise.
* except.cc (init_eh_for_function): Likewise.
* explow.cc (promote_ssa_mode): Likewise.
(get_dynamic_stack_size): Likewise.
* explow.h: Likewise.
* expmed.h: Likewise.
* expr.cc (safe_from_p): Likewise.
(expand_expr_real_2): Likewise.
(expand_expr_real_1): Likewise.
* file-prefix-map.cc (remap_filename): Likewise.
* final.cc (app_enable): Likewise.
(make_pass_compute_alignments): Likewise.
(final_scan_insn_1): Likewise.
(final_scan_insn): Likewise.
* fixed-value.h (fixed_from_string): Likewise.
* flag-types.h (NO_DEBUG): Likewise.
(DWARF2_DEBUG): Likewise.
(VMS_DEBUG): Likewise.
(BTF_DEBUG): Likewise.
(enum ctf_debug_info_levels): Likewise.
* fold-const.cc (const_binop): Likewise.
(fold_binary_loc): Likewise.
(fold_checksum_tree): Likewise.
* fp-test.cc: Likewise.
* function.cc (expand_function_end): Likewise.
* function.h (struct function): Likewise.
* fwprop.cc (should_replace_address): Likewise.
* gcc-main.cc: Likewise.
* gcc-rich-location.h (class gcc_rich_location): Likewise.
* gcc-symtab.h: Likewise.
* gcc.cc (MIN_FATAL_STATUS): Likewise.
(driver_handle_option): Likewise.
(quote_spec_arg): Likewise.
(driver::finalize): Likewise.
* gcc.h (set_input): Likewise.
* gcov-dump.cc: Likewise.
* gcov.cc (solve_flow_graph): Likewise.
* gcse-common.cc: Likewise.
* gcse.cc (make_pass_rtl_hoist): Likewise.
* genattr-common.cc: Likewise.
* genattrtab.cc (min_fn): Likewise.
(write_const_num_delay_slots): Likewise.
* genautomata.cc: Likewise.
* genconditions.cc (write_one_condition): Likewise.
* genconstants.cc: Likewise.
* genemit.cc (gen_exp): Likewise.
* generic-match-head.cc: Likewise.
* genextract.cc: Likewise.
* gengenrtl.cc (always_void_p): Likewise.
* gengtype-parse.cc (gtymarker_opt): Likewise.
* gengtype-state.cc (state_writer::state_writer): Likewise.
(write_state_trailer): Likewise.
(equals_type_number): Likewise.
(read_state): Likewise.
* gengtype.cc (open_base_files): Likewise.
(struct file_rule_st): Likewise.
(header_dot_h_frul): Likewise.
* gengtype.h: Likewise.
* genmatch.cc (main): Likewise.
* genmddeps.cc: Likewise.
* genmodes.cc (emit_mode_inner): Likewise.
(emit_mode_unit_size): Likewise.
* genpeep.cc (gen_peephole): Likewise.
* genpreds.cc (write_tm_preds_h): Likewise.
* genrecog.cc (validate_pattern): Likewise.
(write_header): Likewise.
(main): Likewise.
* gensupport.cc (change_subst_attribute): Likewise.
(traverse_c_tests): Likewise.
(add_predicate): Likewise.
(init_predicate_table): Likewise.
* gensupport.h (struct optab_pattern): Likewise.
(get_num_insn_codes): Likewise.
(maybe_eval_c_test): Likewise.
(struct pred_data): Likewise.
* ggc-internal.h: Likewise.
* gimple-fold.cc (maybe_fold_reference): Likewise.
(get_range_strlen_tree): Likewise.
* gimple-fold.h (gimple_stmt_integer_valued_real_p): Likewise.
* gimple-low.cc: Likewise.
* gimple-match-head.cc (directly_supported_p): Likewise.
* gimple-pretty-print.h: Likewise.
* gimple-ssa-sprintf.cc (format_percent): Likewise.
(adjust_range_for_overflow): Likewise.
* gimple-streamer.h: Likewise.
* gimple.h (struct GTY): Likewise.
(is_gimple_resx): Likewise.
* gimplify.cc (gimplify_expr): Likewise.
(gimplify_init_constructor): Likewise.
(omp_construct_selector_matches): Likewise.
(gimplify_omp_target_update): Likewise.
(gimplify_omp_ordered): Likewise.
(gimplify_va_arg_expr): Likewise.
* graphite-isl-ast-to-gimple.cc (should_copy_to_new_region): Likewise.
* haifa-sched.cc (increase_insn_priority): Likewise.
(try_ready): Likewise.
(sched_create_recovery_edges): Likewise.
* ifcvt.cc (find_if_case_1): Likewise.
(find_if_case_2): Likewise.
* inchash.h: Likewise.
* incpath.cc (add_env_var_paths): Likewise.
* input.cc (dump_location_info): Likewise.
(assert_loceq): Likewise.
(test_lexer_string_locations_concatenation_1): Likewise.
(test_lexer_string_locations_concatenation_2): Likewise.
(test_lexer_string_locations_concatenation_3): Likewise.
* input.h (BUILTINS_LOCATION): Likewise.
(class string_concat_db): Likewise.
* internal-fn.cc (expand_MUL_OVERFLOW): Likewise.
(expand_LOOP_VECTORIZED): Likewise.
* ipa-cp.cc (make_pass_ipa_cp): Likewise.
* ipa-fnsummary.cc (remap_freqcounting_preds_after_dup): Likewise.
(ipa_fn_summary_t::duplicate): Likewise.
(make_pass_ipa_fn_summary): Likewise.
* ipa-fnsummary.h (enum ipa_hints_vals): Likewise.
* ipa-free-lang-data.cc (fld_simplified_type): Likewise.
(free_lang_data_in_decl): Likewise.
* ipa-inline.cc (compute_inlined_call_time): Likewise.
(inline_always_inline_functions): Likewise.
* ipa-inline.h (free_growth_caches): Likewise.
(inline_account_function_p): Likewise.
* ipa-modref.cc (modref_access_analysis::analyze_stmt): Likewise.
(modref_eaf_analysis::analyze_ssa_name): Likewise.
* ipa-param-manipulation.cc (ipa_param_body_adjustments::mark_dead_statements): Likewise.
(ipa_param_body_adjustments::remap_with_debug_expressions): Likewise.
* ipa-prop.cc (ipa_set_node_agg_value_chain): Likewise.
* ipa-prop.h (IPA_UNDESCRIBED_USE): Likewise.
(unadjusted_ptr_and_unit_offset): Likewise.
* ipa-reference.cc (make_pass_ipa_reference): Likewise.
* ipa-reference.h (GCC_IPA_REFERENCE_H): Likewise.
* ipa-split.cc (consider_split): Likewise.
* ipa-sra.cc (isra_read_node_info): Likewise.
* ipa-utils.h (struct ipa_dfs_info): Likewise.
(recursive_call_p): Likewise.
(ipa_make_function_pure): Likewise.
* ira-build.cc (ira_create_allocno): Likewise.
(ira_flattening): Likewise.
* ira-color.cc (do_coloring): Likewise.
(update_curr_costs): Likewise.
* ira-conflicts.cc (process_regs_for_copy): Likewise.
* ira-int.h (struct ira_emit_data): Likewise.
(ira_prohibited_mode_move_regs): Likewise.
(ira_get_dup_out_num): Likewise.
(ira_destroy): Likewise.
(ira_tune_allocno_costs): Likewise.
(ira_implicitly_set_insn_hard_regs): Likewise.
(ira_build_conflicts): Likewise.
(ira_color): Likewise.
* ira-lives.cc (process_bb_node_lives): Likewise.
* ira.cc (class ira_spilled_reg_stack_slot): Likewise.
(setup_uniform_class_p): Likewise.
(def_dominates_uses): Likewise.
* ira.h (ira_nullify_asm_goto): Likewise.
* langhooks.cc (lhd_post_options): Likewise.
* langhooks.h (class substring_loc): Likewise.
(struct lang_hooks_for_tree_inlining): Likewise.
(struct lang_hooks_for_types): Likewise.
(struct lang_hooks): Likewise.
* libfuncs.h (synchronize_libfunc): Likewise.
* loop-doloop.cc (doloop_condition_get): Likewise.
* loop-init.cc (fix_loop_structure): Likewise.
* loop-invariant.cc: Likewise.
* lower-subreg.h: Likewise.
* lra-constraints.cc (curr_insn_transform): Likewise.
* lra-int.h (struct lra_insn_reg): Likewise.
(lra_undo_inheritance): Likewise.
(lra_setup_reload_pseudo_preferenced_hard_reg): Likewise.
(lra_split_hard_reg_for): Likewise.
(lra_coalesce): Likewise.
(lra_final_code_change): Likewise.
* lra-spills.cc (lra_final_code_change): Likewise.
* lra.cc (lra_process_new_insns): Likewise.
* lto-compress.h (struct lto_compression_stream): Likewise.
* lto-streamer-out.cc (DFS::DFS_write_tree_body): Likewise.
(write_symbol): Likewise.
* lto-streamer.h (enum LTO_tags): Likewise.
(lto_value_range_error): Likewise.
(lto_append_block): Likewise.
(lto_streamer_hooks_init): Likewise.
(stream_read_tree_ref): Likewise.
(lto_prepare_function_for_streaming): Likewise.
(select_what_to_stream): Likewise.
(omp_lto_input_declare_variant_alt): Likewise.
(cl_optimization_stream_in): Likewise.
* lto-wrapper.cc (append_compiler_options): Likewise.
* machmode.def: Likewise.
* machmode.h (struct int_n_data_t): Likewise.
* main.cc (main): Likewise.
* match.pd: Likewise.
* omp-builtins.def (BUILT_IN_GOMP_CRITICAL_NAME_END): Likewise.
(BUILT_IN_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT): Likewise.
* omp-expand.cc (expand_omp_atomic_fetch_op): Likewise.
(make_pass_expand_omp_ssa): Likewise.
* omp-low.cc (struct omp_context): Likewise.
(struct omp_taskcopy_context): Likewise.
(lower_omp): Likewise.
* omp-oacc-neuter-broadcast.cc (omp_sese_active_worker_call): Likewise.
(mask_name): Likewise.
(omp_sese_dump_pars): Likewise.
(worker_single_simple): Likewise.
* omp-offload.cc (omp_finish_file): Likewise.
(execute_oacc_loop_designation): Likewise.
* optabs-query.cc (lshift_cheap_p): Likewise.
* optc-gen.awk: Likewise.
* optc-save-gen.awk: Likewise.
* optinfo-emit-json.cc (optrecord_json_writer::optrecord_json_writer): Likewise.
* opts-common.cc: Likewise.
* output.h (app_enable): Likewise.
(output_operand_lossage): Likewise.
(insn_current_reference_address): Likewise.
(get_insn_template): Likewise.
(output_quoted_string): Likewise.
* pass_manager.h (struct register_pass_info): Likewise.
* plugin.cc: Likewise.
* plugin.def (PLUGIN_ANALYZER_INIT): Likewise.
* plugin.h (invoke_plugin_callbacks): Likewise.
* pointer-query.cc (handle_mem_ref): Likewise.
* postreload-gcse.cc (alloc_mem): Likewise.
* predict.h (enum prediction): Likewise.
(add_reg_br_prob_note): Likewise.
* prefix.h: Likewise.
* profile.h (get_working_sets): Likewise.
* read-md.cc: Likewise.
* read-md.h (struct mapping): Likewise.
(class md_reader): Likewise.
(class noop_reader): Likewise.
* read-rtl-function.cc (function_reader::create_function): Likewise.
(function_reader::extra_parsing_for_operand_code_0): Likewise.
* read-rtl.cc (initialize_iterators): Likewise.
* real.cc: Likewise.
* real.h (struct real_value): Likewise.
(format_helper::format_helper): Likewise.
(real_hash): Likewise.
(real_can_shorten_arithmetic): Likewise.
* recog.cc (struct target_recog): Likewise.
(offsettable_nonstrict_memref_p): Likewise.
(constrain_operands): Likewise.
* recog.h (MAX_RECOG_ALTERNATIVES): Likewise.
(which_op_alt): Likewise.
(struct insn_gen_fn): Likewise.
* reg-notes.def (REG_NOTE): Likewise.
* reg-stack.cc: Likewise.
* regs.h (reg_is_parm_p): Likewise.
* regset.h: Likewise.
* reload.cc (push_reload): Likewise.
(find_reloads): Likewise.
(find_reloads_address_1): Likewise.
(find_replacement): Likewise.
(refers_to_regno_for_reload_p): Likewise.
(refers_to_mem_for_reload_p): Likewise.
* reload.h (push_reload): Likewise.
(deallocate_reload_reg): Likewise.
* reload1.cc (emit_input_reload_insns): Likewise.
* reorg.cc (relax_delay_slots): Likewise.
* rtl.def (UNKNOWN): Likewise.
(SEQUENCE): Likewise.
(BARRIER): Likewise.
(ASM_OPERANDS): Likewise.
(EQ_ATTR_ALT): Likewise.
* rtl.h (struct GTY): Likewise.
(LABEL_NAME): Likewise.
(LABEL_ALT_ENTRY_P): Likewise.
(SUBREG_BYTE): Likewise.
(get_stack_check_protect): Likewise.
(dump_rtx_statistics): Likewise.
(unwrap_const_vec_duplicate): Likewise.
(subreg_promoted_mode): Likewise.
(gen_lowpart_common): Likewise.
(operand_subword): Likewise.
(immed_wide_int_const): Likewise.
(decide_function_section): Likewise.
(active_insn_p): Likewise.
(delete_related_insns): Likewise.
(try_split): Likewise.
(val_signbit_known_clear_p): Likewise.
(simplifiable_subregs): Likewise.
(set_insn_deleted): Likewise.
(subreg_get_info): Likewise.
(remove_free_EXPR_LIST_node): Likewise.
(finish_subregs_of_mode): Likewise.
(get_mem_attrs): Likewise.
(lookup_constant_def): Likewise.
(rtx_to_tree_code): Likewise.
(hash_rtx): Likewise.
(condjump_in_parallel_p): Likewise.
(validate_subreg): Likewise.
(make_compound_operation): Likewise.
(schedule_ebbs): Likewise.
(print_inline_rtx): Likewise.
(fixup_args_size_notes): Likewise.
(expand_dec): Likewise.
(prepare_copy_insn): Likewise.
(mark_elimination): Likewise.
(valid_mode_changes_for_regno): Likewise.
(make_debug_expr_from_rtl): Likewise.
(delete_vta_debug_insns): Likewise.
(simplify_using_condition): Likewise.
(set_insn_locations): Likewise.
(fatal_insn_not_found): Likewise.
(word_register_operation_p): Likewise.
* rtlanal.cc (get_call_fndecl): Likewise.
(side_effects_p): Likewise.
(subreg_nregs): Likewise.
(rtx_cost): Likewise.
(canonicalize_condition): Likewise.
* rtlanal.h (rtx_properties::try_to_add_note): Likewise.
* run-rtl-passes.cc (run_rtl_passes): Likewise.
* sanitizer.def (BUILT_IN_ASAN_VERSION_MISMATCH_CHECK): Likewise.
* sched-deps.cc (add_dependence_1): Likewise.
* sched-ebb.cc (begin_move_insn): Likewise.
(add_deps_for_risky_insns): Likewise.
(advance_target_bb): Likewise.
* sched-int.h (reemit_notes): Likewise.
(struct _haifa_insn_data): Likewise.
(HID): Likewise.
(DEP_CANCELLED): Likewise.
(debug_ds): Likewise.
(number_in_ready): Likewise.
(schedule_ebbs_finish): Likewise.
(find_modifiable_mems): Likewise.
* sched-rgn.cc (debug_rgn_dependencies): Likewise.
* sel-sched-dump.cc (dump_lv_set): Likewise.
* sel-sched-dump.h: Likewise.
* sel-sched-ir.cc (sel_insn_rtx_cost): Likewise.
(setup_id_reg_sets): Likewise.
(has_dependence_p): Likewise.
(sel_num_cfg_preds_gt_1): Likewise.
(bb_ends_ebb_p): Likewise.
* sel-sched-ir.h (struct _list_node): Likewise.
(struct idata_def): Likewise.
(bb_next_bb): Likewise.
* sel-sched.cc (vinsn_writes_one_of_regs_p): Likewise.
(choose_best_pseudo_reg): Likewise.
(verify_target_availability): Likewise.
(can_speculate_dep_p): Likewise.
(sel_rank_for_schedule): Likewise.
* selftest-run-tests.cc (selftest::run_tests): Likewise.
* selftest.h (class auto_fix_quotes): Likewise.
* shrink-wrap.cc (handle_simple_exit): Likewise.
* shrink-wrap.h: Likewise.
* simplify-rtx.cc (simplify_context::simplify_associative_operation): Likewise.
(simplify_context::simplify_gen_vec_select): Likewise.
* spellcheck-tree.h: Likewise.
* spellcheck.h: Likewise.
* statistics.h (struct function): Likewise.
* stmt.cc (conditional_probability): Likewise.
* stmt.h: Likewise.
* stor-layout.h: Likewise.
* streamer-hooks.h: Likewise.
* stringpool.h: Likewise.
* symtab.cc (symbol_table::change_decl_assembler_name): Likewise.
* target.def (HOOK_VECTOR_END): Likewise.
(type.): Likewise.
* target.h (union cumulative_args_t): Likewise.
(by_pieces_ninsns): Likewise.
(class predefined_function_abi): Likewise.
* targhooks.cc (default_translate_mode_attribute): Likewise.
* timevar.def: Likewise.
* timevar.h (class timer): Likewise.
* toplev.h (enable_rtl_dump_file): Likewise.
* trans-mem.cc (collect_bb2reg): Likewise.
* tree-call-cdce.cc (gen_conditions_for_pow): Likewise.
* tree-cfg.cc (remove_bb): Likewise.
(verify_gimple_debug): Likewise.
(remove_edge_and_dominated_blocks): Likewise.
(push_fndecl): Likewise.
* tree-cfgcleanup.h (GCC_TREE_CFGCLEANUP_H): Likewise.
* tree-complex.cc (expand_complex_multiplication): Likewise.
(expand_complex_div_straight): Likewise.
* tree-core.h (enum tree_index): Likewise.
(enum operand_equal_flag): Likewise.
* tree-eh.cc (honor_protect_cleanup_actions): Likewise.
* tree-if-conv.cc (if_convertible_gimple_assign_stmt_p): Likewise.
* tree-inline.cc (initialize_inlined_parameters): Likewise.
* tree-inline.h (force_value_to_type): Likewise.
* tree-nested.cc (get_chain_decl): Likewise.
(walk_all_functions): Likewise.
* tree-object-size.h: Likewise.
* tree-outof-ssa.cc: Likewise.
* tree-parloops.cc (create_parallel_loop): Likewise.
* tree-pretty-print.cc (print_generic_expr_to_str): Likewise.
(dump_generic_node): Likewise.
* tree-profile.cc (tree_profiling): Likewise.
* tree-sra.cc (maybe_add_sra_candidate): Likewise.
* tree-ssa-address.cc: Likewise.
* tree-ssa-alias.cc: Likewise.
* tree-ssa-alias.h (ao_ref::max_size_known_p): Likewise.
(dump_alias_stats): Likewise.
* tree-ssa-ccp.cc: Likewise.
* tree-ssa-coalesce.h: Likewise.
* tree-ssa-live.cc (remove_unused_scope_block_p): Likewise.
* tree-ssa-loop-manip.cc (copy_phi_node_args): Likewise.
* tree-ssa-loop-unswitch.cc: Likewise.
* tree-ssa-math-opts.cc: Likewise.
* tree-ssa-operands.cc (class operands_scanner): Likewise.
* tree-ssa-pre.cc: Likewise.
* tree-ssa-reassoc.cc (optimize_ops_list): Likewise.
(debug_range_entry): Likewise.
* tree-ssa-sccvn.cc (eliminate_dom_walker::eliminate_stmt): Likewise.
* tree-ssa-sccvn.h (TREE_SSA_SCCVN_H): Likewise.
* tree-ssa-scopedtables.cc (add_expr_commutative): Likewise.
(equal_mem_array_ref_p): Likewise.
* tree-ssa-strlen.cc (is_strlen_related_p): Likewise.
* tree-ssa-strlen.h (get_range_strlen_dynamic): Likewise.
* tree-ssa-tail-merge.cc (stmt_local_def): Likewise.
* tree-ssa-ter.h: Likewise.
* tree-ssa-threadupdate.h (enum bb_dom_status): Likewise.
* tree-streamer-in.cc (lto_input_ts_block_tree_pointers): Likewise.
* tree-streamer-out.cc (pack_ts_block_value_fields): Likewise.
(write_ts_block_tree_pointers): Likewise.
* tree-streamer.h (struct streamer_tree_cache_d): Likewise.
(streamer_read_tree_bitfields): Likewise.
(streamer_write_integer_cst): Likewise.
* tree-vect-patterns.cc (apply_binop_and_append_stmt): Likewise.
(vect_synth_mult_by_constant): Likewise.
* tree-vect-stmts.cc (vectorizable_operation): Likewise.
* tree-vectorizer.cc: Likewise.
* tree-vectorizer.h (class auto_purge_vect_location): Likewise.
(vect_update_inits_of_drs): Likewise.
(vect_get_mask_type_for_stmt): Likewise.
(vect_rgroup_iv_might_wrap_p): Likewise.
(cse_and_gimplify_to_preheader): Likewise.
(vect_free_slp_tree): Likewise.
(vect_pattern_recog): Likewise.
(vect_stmt_dominates_stmt_p): Likewise.
* tree.cc (initialize_tree_contains_struct): Likewise.
(need_assembler_name_p): Likewise.
(type_with_interoperable_signedness): Likewise.
* tree.def (SWITCH_EXPR): Likewise.
* tree.h (TYPE_SYMTAB_ADDRESS): Likewise.
(poly_int_tree_p): Likewise.
(inlined_function_outer_scope_p): Likewise.
(tree_code_for_canonical_type_merging): Likewise.
* value-prof.cc: Likewise.
* value-prof.h (get_nth_most_common_value): Likewise.
(find_func_by_profile_id): Likewise.
* value-range.cc (vrp_operand_equal_p): Likewise.
* value-range.h: Likewise.
* var-tracking.cc: Likewise.
* varasm.cc (default_function_section): Likewise.
(function_section_1): Likewise.
(assemble_variable): Likewise.
(handle_vtv_comdat_section): Likewise.
* vec.h (struct vec_prefix): Likewise.
* vmsdbgout.cc (full_name): Likewise.
* vtable-verify.cc: Likewise.
* vtable-verify.h (struct vtv_graph_node): Likewise.
* xcoffout.cc: Likewise.
* xcoffout.h (DEBUG_SYMS_TEXT): Likewise.
gcc/ada/ChangeLog:
* Make-generated.in: Rename .c names to .cc.
* adaint.c: Likewise.
* ctrl_c.c (dummy_handler): Likewise.
* gcc-interface/Makefile.in: Likewise.
* gcc-interface/config-lang.in: Likewise.
* gcc-interface/decl.cc (concat_name): Likewise.
(init_gnat_decl): Likewise.
* gcc-interface/gigi.h (concat_name): Likewise.
(init_gnat_utils): Likewise.
(build_call_raise_range): Likewise.
(gnat_mark_addressable): Likewise.
(gnat_protect_expr): Likewise.
(gnat_rewrite_reference): Likewise.
* gcc-interface/lang-specs.h (ADA_DUMPS_OPTIONS): Likewise.
* gcc-interface/utils.cc (GTY): Likewise.
(add_deferred_type_context): Likewise.
(init_gnat_utils): Likewise.
* gcc-interface/utils2.cc (gnat_stable_expr_p): Likewise.
(gnat_protect_expr): Likewise.
(gnat_stabilize_reference_1): Likewise.
(gnat_rewrite_reference): Likewise.
* gsocket.h: Likewise.
* init.cc (__gnat_error_handler): Likewise.
* libgnarl/s-intman.ads: Likewise.
* libgnarl/s-osinte__android.ads: Likewise.
* libgnarl/s-osinte__darwin.ads: Likewise.
* libgnarl/s-osinte__hpux.ads: Likewise.
* libgnarl/s-osinte__linux.ads: Likewise.
* libgnarl/s-osinte__qnx.ads: Likewise.
* libgnarl/s-taskin.ads: Likewise.
* rtfinal.cc: Likewise.
* s-oscons-tmplt.c (CND): Likewise.
* set_targ.ads: Likewise.
gcc/analyzer/ChangeLog:
* analyzer.cc (is_special_named_call_p): Rename .c names to .cc.
(is_named_call_p): Likewise.
* region-model-asm.cc (deterministic_p): Likewise.
* region.cc (field_region::get_relative_concrete_offset): Likewise.
* sm-malloc.cc (method_p): Likewise.
* supergraph.cc (superedge::dump_dot): Likewise.
gcc/c-family/ChangeLog:
* c-ada-spec.cc: Rename .c names to .cc.
* c-ada-spec.h: Likewise.
* c-common.cc (c_build_vec_convert): Likewise.
(warning_candidate_p): Likewise.
* c-common.h (enum rid): Likewise.
(build_real_imag_expr): Likewise.
(finish_label_address_expr): Likewise.
(c_get_substring_location): Likewise.
(c_build_bind_expr): Likewise.
(conflict_marker_get_final_tok_kind): Likewise.
(c_parse_error): Likewise.
(check_missing_format_attribute): Likewise.
(invalid_array_size_error): Likewise.
(warn_for_multistatement_macros): Likewise.
(build_attr_access_from_parms): Likewise.
* c-cppbuiltin.cc (c_cpp_builtins): Likewise.
* c-format.cc: Likewise.
* c-gimplify.cc (c_gimplify_expr): Likewise.
* c-indentation.h: Likewise.
* c-objc.h (objc_prop_attr_kind_for_rid): Likewise.
* c-omp.cc (c_omp_predetermined_mapping): Likewise.
* c-opts.cc (c_common_post_options): Likewise.
(set_std_cxx23): Likewise.
* c-pragma.cc (handle_pragma_redefine_extname): Likewise.
* c-pretty-print.h: Likewise.
gcc/c/ChangeLog:
* Make-lang.in: Rename .c names to .cc.
* c-convert.cc: Likewise.
* c-decl.cc (struct lang_identifier): Likewise.
(pop_scope): Likewise.
(finish_decl): Likewise.
* c-objc-common.h (GCC_C_OBJC_COMMON): Likewise.
* c-parser.cc (c_parser_skip_to_end_of_block_or_statement): Likewise.
* c-parser.h (GCC_C_PARSER_H): Likewise.
* c-tree.h (c_keyword_starts_typename): Likewise.
(finish_declspecs): Likewise.
(c_get_alias_set): Likewise.
(enum c_oracle_request): Likewise.
(tag_exists_p): Likewise.
(set_c_expr_source_range): Likewise.
* c-typeck.cc (c_common_type): Likewise.
(c_finish_omp_clauses): Likewise.
* config-lang.in: Likewise.
gcc/cp/ChangeLog:
* Make-lang.in: Rename .c names to .cc.
* config-lang.in: Likewise.
* constexpr.cc (cxx_eval_constant_expression): Likewise.
* coroutines.cc (morph_fn_to_coro): Likewise.
* cp-gimplify.cc (cp_gimplify_expr): Likewise.
* cp-lang.cc (struct lang_hooks): Likewise.
(get_template_argument_pack_elems_folded): Likewise.
* cp-objcp-common.cc (cp_tree_size): Likewise.
(cp_unit_size_without_reusable_padding): Likewise.
(pop_file_scope): Likewise.
(cp_pushdecl): Likewise.
* cp-objcp-common.h (GCC_CP_OBJCP_COMMON): Likewise.
(cxx_simulate_record_decl): Likewise.
* cp-tree.h (struct named_label_entry): Likewise.
(current_function_return_value): Likewise.
(more_aggr_init_expr_args_p): Likewise.
(get_function_version_dispatcher): Likewise.
(common_enclosing_class): Likewise.
(strip_fnptr_conv): Likewise.
(current_decl_namespace): Likewise.
(do_aggregate_paren_init): Likewise.
(cp_check_const_attributes): Likewise.
(qualified_name_lookup_error): Likewise.
(generic_targs_for): Likewise.
(mark_exp_read): Likewise.
(is_global_friend): Likewise.
(maybe_reject_flexarray_init): Likewise.
(module_token_lang): Likewise.
(handle_module_option): Likewise.
(literal_integer_zerop): Likewise.
(build_extra_args): Likewise.
(build_if_nonnull): Likewise.
(maybe_check_overriding_exception_spec): Likewise.
(finish_omp_target_clauses): Likewise.
(maybe_warn_zero_as_null_pointer_constant): Likewise.
(cxx_print_error_function): Likewise.
(decl_in_std_namespace_p): Likewise.
(merge_exception_specifiers): Likewise.
(mangle_module_global_init): Likewise.
(cxx_block_may_fallthru): Likewise.
(fold_builtin_source_location): Likewise.
(enum cp_oracle_request): Likewise.
(subsumes): Likewise.
(cp_finish_injected_record_type): Likewise.
(vtv_build_vtable_verify_fndecl): Likewise.
(cp_tree_c_finish_parsing): Likewise.
* cvt.cc (diagnose_ref_binding): Likewise.
(convert_to_void): Likewise.
(convert_force): Likewise.
(type_promotes_to): Likewise.
* decl.cc (make_unbound_class_template_raw): Likewise.
(cxx_init_decl_processing): Likewise.
(check_class_member_definition_namespace): Likewise.
(cxx_maybe_build_cleanup): Likewise.
* decl2.cc (maybe_emit_vtables): Likewise.
* error.cc (dump_function_name): Likewise.
* init.cc (is_class_type): Likewise.
(build_new_1): Likewise.
* lang-specs.h: Likewise.
* method.cc (make_alias_for_thunk): Likewise.
* module.cc (specialization_add): Likewise.
(module_state::read_cluster): Likewise.
* name-lookup.cc (check_extern_c_conflict): Likewise.
* name-lookup.h (struct cxx_binding): Likewise.
* parser.cc (cp_parser_identifier): Likewise.
* parser.h (struct cp_parser): Likewise.
* pt.cc (has_value_dependent_address): Likewise.
(push_tinst_level_loc): Likewise.
* semantics.cc (finish_omp_clauses): Likewise.
(finish_omp_atomic): Likewise.
* tree.cc (cp_save_expr): Likewise.
(cp_free_lang_data): Likewise.
* typeck.cc (cp_common_type): Likewise.
(strip_array_domain): Likewise.
(rationalize_conditional_expr): Likewise.
(check_return_expr): Likewise.
* vtable-class-hierarchy.cc: Likewise.
gcc/d/ChangeLog:
* d-gimplify.cc: Rename .c names to .cc.
* d-incpath.cc: Likewise.
* lang-specs.h: Likewise.
gcc/fortran/ChangeLog:
* check.cc (gfc_check_all_any): Rename .c names to .cc.
* class.cc (find_intrinsic_vtab): Likewise.
* config-lang.in: Likewise.
* cpp.cc (cpp_define_builtins): Likewise.
* data.cc (get_array_index): Likewise.
* decl.cc (match_clist_expr): Likewise.
(get_proc_name): Likewise.
(gfc_verify_c_interop_param): Likewise.
(gfc_get_pdt_instance): Likewise.
(gfc_match_formal_arglist): Likewise.
(gfc_get_type_attr_spec): Likewise.
* dependency.cc: Likewise.
* error.cc (gfc_format_decoder): Likewise.
* expr.cc (check_restricted): Likewise.
(gfc_build_default_init_expr): Likewise.
* f95-lang.cc: Likewise.
* gfc-internals.texi: Likewise.
* gfortran.h (enum match): Likewise.
(enum procedure_type): Likewise.
(enum oacc_routine_lop): Likewise.
(gfc_get_pdt_instance): Likewise.
(gfc_end_source_files): Likewise.
(gfc_mpz_set_hwi): Likewise.
(gfc_get_option_string): Likewise.
(gfc_find_sym_in_expr): Likewise.
(gfc_errors_to_warnings): Likewise.
(gfc_real_4_kind): Likewise.
(gfc_free_finalizer): Likewise.
(gfc_sym_get_dummy_args): Likewise.
(gfc_check_intrinsic_standard): Likewise.
(gfc_free_case_list): Likewise.
(gfc_resolve_oacc_routines): Likewise.
(gfc_check_vardef_context): Likewise.
(gfc_free_association_list): Likewise.
(gfc_implicit_pure_function): Likewise.
(gfc_ref_dimen_size): Likewise.
(gfc_compare_actual_formal): Likewise.
(gfc_resolve_wait): Likewise.
(gfc_dt_upper_string): Likewise.
(gfc_generate_module_code): Likewise.
(gfc_delete_bbt): Likewise.
(debug): Likewise.
(gfc_build_block_ns): Likewise.
(gfc_dep_difference): Likewise.
(gfc_invalid_null_arg): Likewise.
(gfc_is_finalizable): Likewise.
(gfc_fix_implicit_pure): Likewise.
(gfc_is_size_zero_array): Likewise.
(gfc_is_reallocatable_lhs): Likewise.
* gfortranspec.cc: Likewise.
* interface.cc (compare_actual_expr): Likewise.
* intrinsic.cc (add_functions): Likewise.
* iresolve.cc (gfc_resolve_matmul): Likewise.
(gfc_resolve_alarm_sub): Likewise.
* iso-c-binding.def: Likewise.
* lang-specs.h: Likewise.
* libgfortran.h (GFC_STDERR_UNIT_NUMBER): Likewise.
* match.cc (gfc_match_label): Likewise.
(gfc_match_symbol): Likewise.
(match_derived_type_spec): Likewise.
(copy_ts_from_selector_to_associate): Likewise.
* match.h (gfc_match_call): Likewise.
(gfc_get_common): Likewise.
(gfc_match_omp_end_single): Likewise.
(gfc_match_volatile): Likewise.
(gfc_match_bind_c): Likewise.
(gfc_match_literal_constant): Likewise.
(gfc_match_init_expr): Likewise.
(gfc_match_array_constructor): Likewise.
(gfc_match_end_interface): Likewise.
(gfc_match_print): Likewise.
(gfc_match_expr): Likewise.
* matchexp.cc (next_operator): Likewise.
* mathbuiltins.def: Likewise.
* module.cc (free_true_name): Likewise.
* openmp.cc (gfc_resolve_omp_parallel_blocks): Likewise.
(gfc_omp_save_and_clear_state): Likewise.
* parse.cc (parse_union): Likewise.
(set_syms_host_assoc): Likewise.
* resolve.cc (resolve_actual_arglist): Likewise.
(resolve_elemental_actual): Likewise.
(check_host_association): Likewise.
(resolve_typebound_function): Likewise.
(resolve_typebound_subroutine): Likewise.
(gfc_resolve_expr): Likewise.
(resolve_assoc_var): Likewise.
(resolve_typebound_procedures): Likewise.
(resolve_equivalence_derived): Likewise.
* simplify.cc (simplify_bound): Likewise.
* symbol.cc (gfc_set_default_type): Likewise.
(gfc_add_ext_attribute): Likewise.
* target-memory.cc (gfc_target_interpret_expr): Likewise.
* target-memory.h (gfc_target_interpret_expr): Likewise.
* trans-array.cc (gfc_get_cfi_dim_sm): Likewise.
(gfc_conv_shift_descriptor_lbound): Likewise.
(gfc_could_be_alias): Likewise.
(gfc_get_dataptr_offset): Likewise.
* trans-const.cc: Likewise.
* trans-decl.cc (trans_function_start): Likewise.
(gfc_trans_deferred_vars): Likewise.
(generate_local_decl): Likewise.
(gfc_generate_function_code): Likewise.
* trans-expr.cc (gfc_vptr_size_get): Likewise.
(gfc_trans_class_array_init_assign): Likewise.
(POWI_TABLE_SIZE): Likewise.
(gfc_conv_procedure_call): Likewise.
(gfc_trans_arrayfunc_assign): Likewise.
* trans-intrinsic.cc (gfc_conv_intrinsic_len): Likewise.
(gfc_conv_intrinsic_loc): Likewise.
(conv_intrinsic_event_query): Likewise.
* trans-io.cc (gfc_build_st_parameter): Likewise.
* trans-openmp.cc (gfc_omp_check_optional_argument): Likewise.
(gfc_omp_unshare_expr_r): Likewise.
(gfc_trans_omp_array_section): Likewise.
(gfc_trans_omp_clauses): Likewise.
* trans-stmt.cc (trans_associate_var): Likewise.
(gfc_trans_deallocate): Likewise.
* trans-stmt.h (gfc_trans_class_init_assign): Likewise.
(gfc_trans_deallocate): Likewise.
(gfc_trans_oacc_declare): Likewise.
* trans-types.cc: Likewise.
* trans-types.h (enum gfc_packed): Likewise.
* trans.cc (N_): Likewise.
(trans_code): Likewise.
* trans.h (gfc_build_compare_string): Likewise.
(gfc_conv_expr_type): Likewise.
(gfc_trans_deferred_vars): Likewise.
(getdecls): Likewise.
(gfc_get_array_descr_info): Likewise.
(gfc_omp_firstprivatize_type_sizes): Likewise.
(GTY): Likewise.
gcc/go/ChangeLog:
* config-lang.in: Rename .c names to .cc.
* go-backend.cc: Likewise.
* go-lang.cc: Likewise.
* gospec.cc: Likewise.
* lang-specs.h: Likewise.
gcc/jit/ChangeLog:
* config-lang.in: Rename .c names to .cc.
* docs/_build/texinfo/libgccjit.texi: Likewise.
* docs/internals/index.rst: Likewise.
* jit-builtins.cc (builtins_manager::make_builtin_function): Likewise.
* jit-playback.cc (fold_const_var): Likewise.
(playback::context::~context): Likewise.
(new_field): Likewise.
(new_bitfield): Likewise.
(new_compound_type): Likewise.
(playback::compound_type::set_fields): Likewise.
(global_set_init_rvalue): Likewise.
(load_blob_in_ctor): Likewise.
(new_global_initialized): Likewise.
(double>): Likewise.
(new_string_literal): Likewise.
(as_truth_value): Likewise.
(build_call): Likewise.
(playback::context::build_cast): Likewise.
(new_array_access): Likewise.
(new_field_access): Likewise.
(dereference): Likewise.
(postprocess): Likewise.
(add_jump): Likewise.
(add_switch): Likewise.
(build_goto_operands): Likewise.
(playback::context::read_dump_file): Likewise.
(init_types): Likewise.
* jit-recording.cc (recording::context::get_int_type): Likewise.
* jit-recording.h: Likewise.
* libgccjit.cc (compatible_types): Likewise.
(gcc_jit_context_acquire): Likewise.
(gcc_jit_context_release): Likewise.
(gcc_jit_context_new_child_context): Likewise.
(gcc_jit_type_as_object): Likewise.
(gcc_jit_context_get_type): Likewise.
(gcc_jit_context_get_int_type): Likewise.
(gcc_jit_type_get_pointer): Likewise.
(gcc_jit_type_get_const): Likewise.
(gcc_jit_type_get_volatile): Likewise.
(gcc_jit_type_dyncast_array): Likewise.
(gcc_jit_type_is_bool): Likewise.
(gcc_jit_type_is_pointer): Likewise.
(gcc_jit_type_is_integral): Likewise.
(gcc_jit_type_dyncast_vector): Likewise.
(gcc_jit_type_is_struct): Likewise.
(gcc_jit_vector_type_get_num_units): Likewise.
(gcc_jit_vector_type_get_element_type): Likewise.
(gcc_jit_type_unqualified): Likewise.
(gcc_jit_type_dyncast_function_ptr_type): Likewise.
(gcc_jit_function_type_get_return_type): Likewise.
(gcc_jit_function_type_get_param_count): Likewise.
(gcc_jit_function_type_get_param_type): Likewise.
(gcc_jit_context_new_array_type): Likewise.
(gcc_jit_context_new_field): Likewise.
(gcc_jit_field_as_object): Likewise.
(gcc_jit_context_new_struct_type): Likewise.
(gcc_jit_struct_as_type): Likewise.
(gcc_jit_struct_set_fields): Likewise.
(gcc_jit_struct_get_field_count): Likewise.
(gcc_jit_context_new_union_type): Likewise.
(gcc_jit_context_new_function_ptr_type): Likewise.
(gcc_jit_param_as_rvalue): Likewise.
(gcc_jit_context_new_function): Likewise.
(gcc_jit_function_get_return_type): Likewise.
(gcc_jit_function_dump_to_dot): Likewise.
(gcc_jit_block_get_function): Likewise.
(gcc_jit_global_set_initializer_rvalue): Likewise.
(gcc_jit_rvalue_get_type): Likewise.
(gcc_jit_context_new_rvalue_from_int): Likewise.
(gcc_jit_context_one): Likewise.
(gcc_jit_context_new_rvalue_from_double): Likewise.
(gcc_jit_context_null): Likewise.
(gcc_jit_context_new_string_literal): Likewise.
(valid_binary_op_p): Likewise.
(gcc_jit_context_new_binary_op): Likewise.
(gcc_jit_context_new_comparison): Likewise.
(gcc_jit_context_new_call): Likewise.
(is_valid_cast): Likewise.
(gcc_jit_context_new_cast): Likewise.
(gcc_jit_object_get_context): Likewise.
(gcc_jit_object_get_debug_string): Likewise.
(gcc_jit_lvalue_access_field): Likewise.
(gcc_jit_rvalue_access_field): Likewise.
(gcc_jit_rvalue_dereference_field): Likewise.
(gcc_jit_rvalue_dereference): Likewise.
(gcc_jit_lvalue_get_address): Likewise.
(gcc_jit_lvalue_set_tls_model): Likewise.
(gcc_jit_lvalue_set_link_section): Likewise.
(gcc_jit_function_new_local): Likewise.
(gcc_jit_block_add_eval): Likewise.
(gcc_jit_block_add_assignment): Likewise.
(is_bool): Likewise.
(gcc_jit_block_end_with_conditional): Likewise.
(gcc_jit_block_add_comment): Likewise.
(gcc_jit_block_end_with_jump): Likewise.
(gcc_jit_block_end_with_return): Likewise.
(gcc_jit_block_end_with_void_return): Likewise.
(case_range_validator::case_range_validator): Likewise.
(case_range_validator::validate): Likewise.
(case_range_validator::get_wide_int): Likewise.
(gcc_jit_block_end_with_switch): Likewise.
(gcc_jit_context_set_str_option): Likewise.
(gcc_jit_context_set_int_option): Likewise.
(gcc_jit_context_set_bool_option): Likewise.
(gcc_jit_context_set_bool_allow_unreachable_blocks): Likewise.
(gcc_jit_context_set_bool_use_external_driver): Likewise.
(gcc_jit_context_add_command_line_option): Likewise.
(gcc_jit_context_add_driver_option): Likewise.
(gcc_jit_context_enable_dump): Likewise.
(gcc_jit_context_compile): Likewise.
(gcc_jit_context_compile_to_file): Likewise.
(gcc_jit_context_set_logfile): Likewise.
(gcc_jit_context_dump_reproducer_to_file): Likewise.
(gcc_jit_context_get_first_error): Likewise.
(gcc_jit_context_get_last_error): Likewise.
(gcc_jit_result_get_code): Likewise.
(gcc_jit_result_get_global): Likewise.
(gcc_jit_rvalue_set_bool_require_tail_call): Likewise.
(gcc_jit_type_get_aligned): Likewise.
(gcc_jit_type_get_vector): Likewise.
(gcc_jit_function_get_address): Likewise.
(gcc_jit_version_patchlevel): Likewise.
(gcc_jit_block_add_extended_asm): Likewise.
(gcc_jit_extended_asm_as_object): Likewise.
(gcc_jit_extended_asm_set_volatile_flag): Likewise.
(gcc_jit_extended_asm_set_inline_flag): Likewise.
(gcc_jit_extended_asm_add_output_operand): Likewise.
(gcc_jit_extended_asm_add_input_operand): Likewise.
(gcc_jit_extended_asm_add_clobber): Likewise.
* notes.txt: Likewise.
gcc/lto/ChangeLog:
* config-lang.in: Rename .c names to .cc.
* lang-specs.h: Likewise.
* lto-common.cc (gimple_register_canonical_type_1): Likewise.
* lto-common.h: Likewise.
* lto-dump.cc (lto_main): Likewise.
* lto-lang.cc (handle_fnspec_attribute): Likewise.
(lto_getdecls): Likewise.
(lto_init): Likewise.
* lto.cc (lto_main): Likewise.
* lto.h: Likewise.
gcc/objc/ChangeLog:
* Make-lang.in: Rename .c names to .cc.
* config-lang.in: Likewise.
* lang-specs.h: Likewise.
* objc-act.cc (objc_build_component_ref): Likewise.
(objc_copy_binfo): Likewise.
(lookup_method_in_hash_lists): Likewise.
(objc_finish_foreach_loop): Likewise.
* objc-act.h (objc_common_init_ts): Likewise.
* objc-gnu-runtime-abi-01.cc: Likewise.
* objc-lang.cc (struct lang_hooks): Likewise.
* objc-map.cc: Likewise.
* objc-next-runtime-abi-01.cc (generate_objc_symtab_decl): Likewise.
* objc-runtime-shared-support.cc: Likewise.
* objc-runtime-shared-support.h (build_protocol_initializer): Likewise.
gcc/objcp/ChangeLog:
* Make-lang.in: Rename .c names to .cc.
* config-lang.in: Likewise.
* lang-specs.h: Likewise.
* objcp-decl.cc (objcp_end_compound_stmt): Likewise.
* objcp-lang.cc (struct lang_hooks): Likewise.
gcc/po/ChangeLog:
* EXCLUDES: Rename .c names to .cc.
libcpp/ChangeLog:
* Makefile.in: Rename .c names to .cc.
* charset.cc (convert_escape): Likewise.
* directives.cc (directive_diagnostics): Likewise.
(_cpp_handle_directive): Likewise.
(lex_macro_node): Likewise.
* include/cpplib.h (struct _cpp_file): Likewise.
(PURE_ZERO): Likewise.
(cpp_defined): Likewise.
(cpp_error_at): Likewise.
(cpp_forall_identifiers): Likewise.
(cpp_compare_macros): Likewise.
(cpp_get_converted_source): Likewise.
(cpp_read_state): Likewise.
(cpp_directive_only_process): Likewise.
(struct cpp_decoded_char): Likewise.
* include/line-map.h (enum lc_reason): Likewise.
(enum location_aspect): Likewise.
* include/mkdeps.h: Likewise.
* init.cc (cpp_destroy): Likewise.
(cpp_finish): Likewise.
* internal.h (struct cpp_reader): Likewise.
(_cpp_defined_macro_p): Likewise.
(_cpp_backup_tokens_direct): Likewise.
(_cpp_destroy_hashtable): Likewise.
(_cpp_has_header): Likewise.
(_cpp_expand_op_stack): Likewise.
(_cpp_commit_buff): Likewise.
(_cpp_restore_special_builtin): Likewise.
(_cpp_bracket_include): Likewise.
(_cpp_replacement_text_len): Likewise.
(ufputs): Likewise.
* line-map.cc (linemap_macro_loc_to_exp_point): Likewise.
(linemap_check_files_exited): Likewise.
(line_map_new_raw): Likewise.
* traditional.cc (enum ls): Likewise.
|
|
This introduces a bias parameter for the len_load/len_store ifns as well as
the optabs, meant to distinguish between the Power and s390 variants:
PowerPC's instructions require a bias of 0, while s390's vll/vstl do not
support lengths of zero bytes, so a bias of -1 must be used there.
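As an illustration, the s390 flavour might look like this in the IR (a
hedged sketch; the operand order and the use of MIN_EXPR are assumptions,
with the trailing constant being the bias):
_26 = MIN_EXPR <ivtmp_24, 16>;
vect__1.6_23 = .LEN_LOAD (vectp_a.4_21, 8B, _26, -1);
.LEN_STORE (vectp_b.8_25, 8B, _26, vect__1.6_23, -1);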
gcc/ChangeLog:
* internal-fn.c (expand_partial_load_optab_fn): Add bias.
(expand_partial_store_optab_fn): Likewise.
(internal_len_load_store_bias): New function.
* internal-fn.h (VECT_PARTIAL_BIAS_UNSUPPORTED): New define.
(internal_len_load_store_bias): New function.
* tree-vect-loop-manip.c (vect_set_loop_controls_directly): Set bias.
(vect_set_loop_condition_partial_vectors): Add header_seq parameter.
* tree-vect-loop.c (vect_verify_loop_lens): Verify bias.
(vect_estimate_min_profitable_iters): Account for bias.
(vect_get_loop_len): Add bias-adjusted length.
* tree-vect-stmts.c (vectorizable_store): Use.
(vectorizable_load): Use.
* tree-vectorizer.h (struct rgroup_controls): Add bias-adjusted length.
(LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS): New macro.
* config/rs6000/vsx.md: Use const0 bias predicate.
* doc/md.texi: Document bias value.
|
|
|
|
The following example
void f5(float * restrict z0, float * restrict z1, float *restrict x,
float * restrict y, float c, int n)
{
for (int i = 0; i < n; i++) {
float a = x[i];
float b = y[i];
if (a > b) {
z0[i] = a + b;
if (a > c) {
z1[i] = a - b;
}
}
}
}
generates currently:
ptrue p3.b, all
ld1w z1.s, p1/z, [x2, x5, lsl 2]
ld1w z2.s, p1/z, [x3, x5, lsl 2]
fcmgt p0.s, p3/z, z1.s, z0.s
fcmgt p2.s, p1/z, z1.s, z2.s
fcmgt p0.s, p0/z, z1.s, z2.s
and p0.b, p0/z, p1.b, p1.b
The conditions for a > b and a > c become separate comparisons.
After this patch we generate:
ld1w z1.s, p0/z, [x2, x5, lsl 2]
ld1w z2.s, p0/z, [x3, x5, lsl 2]
fcmgt p1.s, p0/z, z1.s, z2.s
fcmgt p1.s, p1/z, z1.s, z0.s
where the conditions a > b && a > c are folded by using the predicate result of
the previous compare, which allows one of the compares to be removed.
Whenever a mask is generated from a BIT_AND we instead mask the operands of
the AND and then just AND the results.
This allows us to CSE the masks and generate the right combination.
However, because re-assoc will try to re-order the masks in the &, we now have
to perform a small local CSE on the vectorized loop if vectorization is
successful.
Note: This patch series is working incrementally towards generating the most
efficient code for this and other loops in small steps.
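For illustration, here is a minimal sketch of the caching idea (shapes
assumed from the ChangeLog below, not taken verbatim from the patch): a set
keyed on the (loop mask, vector mask) pair lets later statements reuse a
mask that has already been combined with the loop mask:
static tree
prepare_vec_mask (loop_vec_info loop_vinfo, tree mask_type,
                  tree loop_mask, tree vec_mask, gimple_stmt_iterator *gsi)
{
  if (!loop_mask)
    return vec_mask;
  /* If VEC_MASK was already ANDed with LOOP_MASK, reuse it as-is.
     (The set is populated where the masked operations are created.)  */
  tree_pair cond (loop_mask, vec_mask);
  if (loop_vinfo->vec_cond_masked_set.contains (cond))
    return vec_mask;
  tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
  gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
                                          vec_mask, loop_mask);
  gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
  return and_res;
}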
gcc/ChangeLog:
* tree-vect-stmts.c (prepare_load_store_mask): Rename to...
(prepare_vec_mask): ...This and record operations that have already been
masked.
(vectorizable_call): Use it.
(vectorizable_operation): Likewise.
(vectorizable_store): Likewise.
(vectorizable_load): Likewise.
* tree-vectorizer.h (class _loop_vec_info): Add vec_cond_masked_set.
(vec_cond_masked_set_type, tree_cond_mask_hash): New.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/sve/pred-combine-and.c: New test.
|
|
The current definition of vect_is_reduction (provided for target
costing) misses some pattern statements.
gcc/
* tree-vectorizer.h (vect_is_reduction): Use STMT_VINFO_REDUC_IDX.
gcc/testsuite/
* gcc.target/aarch64/sve/cost_model_13.c: New test.
|
|
This patch extends the reduction code to handle calls. So far
it's a structural change only; a later patch adds support for
specific function reductions.
Most of the patch consists of using code_helper and gimple_match_op
to describe the reduction operations. The other main change is that
vectorizable_call now needs to handle fully-predicated reductions.
There are some new functions that are provided for ABI completeness
and aren't currently used:
first_commutative_argument
commutative_ternary_op_p
1- and 3-argument forms of gimple_build
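As a rough illustration of how code_helper unifies the two kinds of
operations (a sketch; reduction_code_of is a hypothetical helper, not patch
code):
static code_helper
reduction_code_of (gimple *stmt)
{
  if (gassign *assign = dyn_cast <gassign *> (stmt))
    return gimple_assign_rhs_code (assign);   /* e.g. PLUS_EXPR  */
  if (gcall *call = dyn_cast <gcall *> (stmt))
    if (gimple_call_internal_p (call))
      return gimple_call_internal_fn (call);  /* e.g. IFN_FMIN  */
  return ERROR_MARK;
}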
gcc/
* builtins.h (associated_internal_fn): Declare overload that
takes a (combined_cfn, return type) pair.
* builtins.c (associated_internal_fn): Split new overload out
of original fndecl version. Also provide an overload that takes
a (combined_cfn, return type) pair.
* internal-fn.h (commutative_binary_fn_p): Declare.
(commutative_ternary_fn_p): Likewise.
(associative_binary_fn_p): Likewise.
* internal-fn.c (commutative_binary_fn_p, commutative_ternary_fn_p):
New functions, split out from...
(first_commutative_argument): ...here.
(associative_binary_fn_p): New function.
* gimple-match.h (code_helper): Add a constructor that takes
internal functions.
(commutative_binary_op_p): Declare.
(commutative_ternary_op_p): Likewise.
(first_commutative_argument): Likewise.
(associative_binary_op_p): Likewise.
(canonicalize_code): Likewise.
(directly_supported_p): Likewise.
(get_conditional_internal_fn): Likewise.
(gimple_build): New overloads that take a code_helper.
* gimple-fold.c (gimple_build): Likewise.
* gimple-match-head.c (commutative_binary_op_p): New function.
(commutative_ternary_op_p): Likewise.
(first_commutative_argument): Likewise.
(associative_binary_op_p): Likewise.
(canonicalize_code): Likewise.
(directly_supported_p): Likewise.
(get_conditional_internal_fn): Likewise.
* tree-vectorizer.h: Include gimple-match.h.
(neutral_op_for_reduction): Take a code_helper instead of a tree_code.
(needs_fold_left_reduction_p): Likewise.
(reduction_fn_for_scalar_code): Likewise.
(vect_can_vectorize_without_simd_p): Declare a new overload that takes
a code_helper.
* tree-vect-loop.c: Include case-cfn-macros.h.
(fold_left_reduction_fn): Take a code_helper instead of a tree_code.
(reduction_fn_for_scalar_code): Likewise.
(neutral_op_for_reduction): Likewise.
(needs_fold_left_reduction_p): Likewise.
(use_mask_by_cond_expr_p): Likewise.
(build_vect_cond_expr): Likewise.
(vect_create_partial_epilog): Likewise. Use gimple_build rather
than gimple_build_assign.
(check_reduction_path): Handle calls and operate on code_helpers
rather than tree_codes.
(vect_is_simple_reduction): Likewise.
(vect_model_reduction_cost): Likewise.
(vect_find_reusable_accumulator): Likewise.
(vect_create_epilog_for_reduction): Likewise.
(vect_transform_cycle_phi): Likewise.
(vectorizable_reduction): Likewise. Make more use of
lane_reduc_code_p.
(vect_transform_reduction): Use gimple_extract_op but expect
a tree_code for now.
(vect_can_vectorize_without_simd_p): New overload that takes
a code_helper.
* tree-vect-stmts.c (vectorizable_call): Handle reductions in
fully-masked loops.
* tree-vect-patterns.c (vect_mark_pattern_stmts): Use
gimple_extract_op when updating STMT_VINFO_REDUC_IDX.
|
|
The following example:
void f11(double * restrict z, double * restrict w, double * restrict x,
double * restrict y, int n)
{
for (int i = 0; i < n; i++) {
z[i] = (w[i] > 0) ? w[i] : y[i];
}
}
Generates currently:
ptrue p2.b, all
ld1d z0.d, p0/z, [x1, x2, lsl 3]
fcmgt p1.d, p2/z, z0.d, #0.0
bic p3.b, p2/z, p0.b, p1.b
ld1d z1.d, p3/z, [x3, x2, lsl 3]
and after the previous patches generates:
ptrue p3.b, all
ld1d z0.d, p0/z, [x1, x2, lsl 3]
fcmgt p1.d, p0/z, z0.d, #0.0
fcmgt p2.d, p3/z, z0.d, #0.0
not p1.b, p0/z, p1.b
ld1d z1.d, p1/z, [x3, x2, lsl 3]
where a duplicate comparison is performed for w[i] > 0.
This is because in the vectorizer we're emitting a comparison for both a and ~a
where we just need to emit one of them and invert the other. After this patch
we generate:
ld1d z0.d, p0/z, [x1, x2, lsl 3]
fcmgt p1.d, p0/z, z0.d, #0.0
mov p2.b, p1.b
not p1.b, p0/z, p1.b
ld1d z1.d, p1/z, [x3, x2, lsl 3]
In order to perform the check I have to fully expand the NOT stmts when
recording them, as the SSA names of the top-level expressions differ while
their underlying arguments don't: e.g. in _31 = ~_34 the name _34 differs
between occurrences, but the operands in _34's definition do not.
But we only do this when the operation is an ordered one because mixing
ordered and unordered expressions can lead to de-optimized code.
Note: This patch series is working incrementally towards generating the most
efficient code for this and other loops in small steps. The mov is
created by postreload when it does a late CSE.
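A hedged sketch of the recording side (inverted_p comes from the ChangeLog
below; the exact call site is an assumption): when a condition is recorded,
its inverse is registered too, so a later ~cond finds the mask live:
scalar_cond_masked_key cond (cond_expr, ncopies);
loop_vinfo->scalar_cond_masked_set.add (cond);
/* Also record the inverse so that e.g. _31 = ~_34 can reuse the
   predicate produced for _34 instead of emitting a second compare.  */
cond.inverted_p = true;
loop_vinfo->scalar_cond_masked_set.add (cond);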
gcc/ChangeLog:
* tree-vectorizer.h (struct scalar_cond_masked_key): Add inverted_p.
(default_hash_traits<scalar_cond_masked_key>): Likewise.
* tree-vect-stmts.c (vectorizable_condition): Check if inverse of mask
is live.
* tree-vectorizer.c (scalar_cond_masked_key::get_cond_ops_from_tree):
Register mask inverses.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/sve/pred-not-gen-1.c: Update testcase.
* gcc.target/aarch64/sve/pred-not-gen-2.c: Update testcase.
* gcc.target/aarch64/sve/pred-not-gen-3.c: Update testcase.
* gcc.target/aarch64/sve/pred-not-gen-4.c: Update testcase.
|
|
When finishing the vector costs, it can be useful to know
what the associated scalar costs were. This allows targets
to read information collected about the original scalar loop
when trying to make a final judgement about the cost of the
vector code.
This patch therefore passes the scalar costs to
vector_costs::finish_cost. The parameter is null for the
scalar costs themselves.
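A sketch of a target override under the new interface (the class name is a
placeholder; the signature follows the ChangeLog below):
void
my_vector_costs::finish_cost (const vector_costs *scalar_costs)
{
  /* SCALAR_COSTS is null when finishing the scalar costs themselves.  */
  if (scalar_costs)
    {
      /* ... read information collected about the scalar loop here ...  */
    }
  vector_costs::finish_cost (scalar_costs);
}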
gcc/
* tree-vectorizer.h (vector_costs::finish_cost): Take the
corresponding scalar costs as a parameter.
(finish_cost): Likewise.
* tree-vect-loop.c (vect_compute_single_scalar_iteration_cost)
(vect_estimate_min_profitable_iters): Update accordingly.
* tree-vect-slp.c (vect_bb_vectorization_profitable_p): Likewise.
* tree-vectorizer.c (vector_costs::finish_cost): Likewise.
* config/aarch64/aarch64.c (aarch64_vector_costs::finish_cost):
Likewise.
* config/rs6000/rs6000.c (rs6000_cost_data::finish_cost): Likewise.
|
|
The scalar costs for a loop are fleeting, with only the final
single_scalar_iteration_cost being kept for later comparison.
This patch replaces single_scalar_iteration_cost with the cost
structure, so that (with later patches) it's possible for targets
to examine other target-specific cost properties as well. This will
be done by passing the scalar costs to hooks where appropriate;
targets shouldn't try to read the information directly from
loop_vec_infos.
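Inside the vectorizer this looks roughly like the following sketch (using
total_cost from the ChangeLog below):
/* Instead of LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST, read the total
   from the stored cost structure.  */
unsigned int scalar_single_iter_cost
  = loop_vinfo->scalar_costs->total_cost ();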
gcc/
* tree-vectorizer.h (_loop_vec_info::scalar_costs): New member
variable.
(_loop_vec_info::single_scalar_iteration_cost): Delete.
(LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST): Delete.
(vector_costs::total_cost): New function.
* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Update
after above changes.
(_loop_vec_info::~_loop_vec_info): Delete scalar_costs.
(vect_compute_single_scalar_iteration_cost): Store the costs
in loop_vinfo->scalar_costs.
(vect_estimate_min_profitable_iters): Get the scalar cost from
loop_vinfo->scalar_costs.
|
|
One of the things we want to do on AArch64 is compare vector loops
side-by-side and pick the best one. For some targets, we want this
to be based on issue rates as well as the usual latency-based costs
(at least for loops with relatively high iteration counts).
The current approach to doing this is: when costing vectorisation
candidate A, try to guess what the other main candidate B will look
like and adjust A's latency-based cost up or down based on the likely
difference between A and B's issue rates. This effectively means
that we try to cost parts of B at the same time as A, without actually
being able to see B.
This is needlessly indirect and complex. It was a compromise due
to the code being added (too) late in the GCC 11 cycle, so that
target-independent changes weren't possible.
The target-independent code already compares two candidate loop_vec_infos
side-by-side, so that information about A and B above are available
directly. This patch creates a way for targets to hook into this
comparison.
The AArch64 code can therefore hook into better_main_loop_than_p to
compare issue rates. If the issue rate comparison isn't decisive,
the code can fall back to the normal latency-based comparison instead.
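A sketch of the shape of such an override (compare_issue_rates is a
hypothetical target helper, not part of the patch):
bool
my_vector_costs::better_main_loop_than_p (const vector_costs *other) const
{
  /* Prefer the candidate with the better estimated issue rate.  */
  if (int cmp = compare_issue_rates (other))
    return cmp < 0;
  /* If that isn't decisive, fall back to the latency-based comparison.  */
  return vector_costs::better_main_loop_than_p (other);
}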
gcc/
* tree-vectorizer.h (vector_costs::better_main_loop_than_p)
(vector_costs::better_epilogue_loop_than_p)
(vector_costs::compare_inside_loop_cost)
(vector_costs::compare_outside_loop_cost): Declare.
* tree-vectorizer.c (vector_costs::better_main_loop_than_p)
(vector_costs::better_epilogue_loop_than_p)
(vector_costs::compare_inside_loop_cost)
(vector_costs::compare_outside_loop_cost): New functions,
containing code moved from...
* tree-vect-loop.c (vect_better_loop_vinfo_p): ...here.
|
|
The vector costs now use a common base class instead of being
completely abstract. This means that there's no longer a
need to record the inside and outside costs separately.
gcc/
* tree-vectorizer.h (_loop_vec_info): Remove vec_outside_cost
and vec_inside_cost.
(vector_costs::outside_cost): New function.
* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Update
after above.
(vect_estimate_min_profitable_iters): Likewise.
(vect_better_loop_vinfo_p): Get the inside and outside costs
from the loop_vec_infos' vector_costs.
|
|
target_cost_data is in vec_info but is really specific to
loop_vec_info. This patch moves it there and renames it to
vector_costs, to distinguish it from scalar target costs.
gcc/
* tree-vectorizer.h (vec_info::target_cost_data): Replace with...
(_loop_vec_info::vector_costs): ...this.
(LOOP_VINFO_TARGET_COST_DATA): Delete.
* tree-vectorizer.c (vec_info::vec_info): Remove target_cost_data
initialization.
(vec_info::~vec_info): Remove corresponding delete.
* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
vector_costs to null.
(_loop_vec_info::~_loop_vec_info): Delete vector_costs.
(vect_analyze_loop_operations): Update after above changes.
(vect_analyze_loop_2): Likewise.
(vect_estimate_min_profitable_iters): Likewise.
* tree-vect-slp.c (vect_slp_analyze_operations): Likewise.
|
|
This will enable transformations like
- # sum1_50 = PHI <prephitmp_64(13), 0(4)>
- # sum2_52 = PHI <sum2_21(13), 0(4)>
+ # sum1_50 = PHI <_87(13), 0(4)>
+ # sum2_52 = PHI <_89(13), 0(4)>
# ivtmp_62 = PHI <ivtmp_61(13), 64(4)>
i.2_7 = (long unsigned int) i_49;
_8 = i.2_7 * 8;
...
vec1_i_38 = vec1_29 >> _10;
vec2_i_39 = vec2_31 >> _10;
_11 = vec1_i_38 & 1;
- _63 = tmp_37 ^ sum1_50;
- prephitmp_64 = _11 == 0 ? sum1_50 : _63;
+ _ifc__86 = _11 != 0 ? tmp_37 : 0;
+ _87 = sum1_50 ^ _ifc__86;
_12 = vec2_i_39 & 1;
:
so that the vectorizer won't fail due to
/* If this isn't a nested cycle or if the nested cycle reduction value
is used ouside of the inner loop we cannot handle uses of the reduction
value. */
if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"reduction used in loop.\n");
return NULL;
}
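A minimal source loop of the shape in question (an assumed example; the
actual testcase for PR103126 may differ):
void
foo (unsigned long long vec1, unsigned long long vec2,
     int tmp, int *res1, int *res2)
{
  int sum1 = 0, sum2 = 0;
  for (int i = 0; i < 64; i++)
    {
      /* Conditional XOR reductions, if-converted as shown above.  */
      if ((vec1 >> i) & 1)
        sum1 ^= tmp;
      if ((vec2 >> i) & 1)
        sum2 ^= tmp;
    }
  *res1 = sum1;
  *res2 = sum2;
}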
gcc/ChangeLog:
PR tree-optimization/103126
* tree-vect-loop.c (neutral_op_for_reduction): Remove static.
* tree-vectorizer.h (neutral_op_for_reduction): Declare.
* tree-if-conv.c : Include tree-vectorizer.h.
(is_cond_scalar_reduction): Handle
BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR.
(convert_scalar_cond_reduction): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/ifcvt-reduction-logic-op.c: New test.
|
|
This fixes an oversight that caused vectorized epilogues to have
versioning for niters applied.
2021-11-08 Richard Biener <rguenther@suse.de>
* tree-vectorizer.h (vect_create_loop_vinfo): Add main_loop_info
parameter.
* tree-vect-loop.c (vect_create_loop_vinfo): Likewise. Set
LOOP_VINFO_ORIG_LOOP_INFO and conditionalize set of
LOOP_VINFO_NITERS_ASSUMPTIONS.
(vect_analyze_loop_1): Adjust.
(vect_analyze_loop): Move loop constraint setting and
SCEV/niter reset here from vect_create_loop_vinfo to perform
it only once.
(vect_analyze_loop_form): Move dumping of symbolic niters
here from vect_create_loop_vinfo.
|
|
As discussed this splits the analysis loop into two, first settling
on a vector mode used for the main loop and only then analyzing
the epilogue of that for possible vectorization. That makes it
easier to put in support for unrolled main loops.
On the way I've realized some cleanup opportunities, namely caching
n_stmts in vec_info_shared (it's computed by dataref analysis), which
avoids passing that around, and no longer setting/clearing loop->aux
during analysis - try_vectorize_loop_1 will ultimately set it
on those loops we vectorize.
This also gets rid of the previously introduced callback in
vect_analyze_loop_1 in favor of making that advance the mode iterator.
I'm now pushing VOIDmode explicitly into the vector_modes array,
which makes the re-start on the epilogue side a bit more
straightforward. Note that we will now use auto-detection of the
vector mode in case the main loop used it and we want to try
LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P, and the first mode from
the target array if not. I've added a comment that says we may
want to make sure we don't try vectorizing the epilogue with a
bigger vector size than the main loop, but the situation isn't
very likely to appear in practice I guess (and it was also present
before this change).
In principle this change should not change vectorization decisions,
but the way we handled re-analyzing epilogues as main loops makes
me only 99% sure that it doesn't.
2021-11-05 Richard Biener <rguenther@suse.de>
* tree-vectorizer.h (vec_info_shared::n_stmts): Add.
(LOOP_VINFO_N_STMTS): Likewise.
(vec_info_for_bb): Remove unused function.
* tree-vectorizer.c (vec_info_shared::vec_info_shared):
Initialize n_stmts member.
* tree-vect-loop.c: Remove INCLUDE_FUNCTIONAL.
(vect_create_loop_vinfo): Do not set loop->aux.
(vect_analyze_loop_2): Do not get n_stmts as argument,
instead use LOOP_VINFO_N_STMTS. Set LOOP_VINFO_VECTORIZABLE_P
here.
(vect_analyze_loop_1): Remove callback, get the mode iterator
and autodetected_vector_mode as argument, advancing the
iterator and initializing autodetected_vector_mode here.
(vect_analyze_loop): Split analysis loop into two, first
processing main loops only and then epilogues.
|
|
This refactors the main loop analysis part in vect_analyze_loop,
re-purposing the existing vect_reanalyze_as_main_loop for this
to reduce code duplication. Failure flow is a bit tricky since
we want to extract info from the analyzed loop but I wanted to
share the destruction part. Thus I add some std::function and
lambda to funnel post-analysis for the case we want that
(when analyzing from the main iteration but not when re-analyzing
an epilogue as main).
In addition I split vect_analyze_loop_form into analysis and
vinfo creation so we can do the analysis only once, simplifying
the new vect_analyze_loop_1.
As discussed we probably want to change the loop over vector
modes to first only analyze things as the main loop, picking
the best (or simd VF) mode for the main loop and then analyze
for a vectorized epilogue. The unroll would then integrate
with the main loop vectorization. I think that currently
we may fail to analyze the epilogue with the same mode as
the main loop when using partial vectors since we increment
mode_i before doing that.
2021-11-04 Richard Biener <rguenther@suse.de>
* tree-vectorizer.h (struct vect_loop_form_info): New.
(vect_analyze_loop_form): Adjust.
(vect_create_loop_vinfo): New.
* tree-parloops.c (gather_scalar_reductions): Adjust for
vect_analyze_loop_form API change.
* tree-vect-loop.c: Include <functional>.
(vect_analyze_loop_form_1): Rename to vect_analyze_loop_form,
take struct vect_loop_form_info as output parameter and adjust.
(vect_analyze_loop_form): Rename to vect_create_loop_vinfo and
split out call to the original vect_analyze_loop_form_1.
(vect_reanalyze_as_main_loop): Rename to...
(vect_analyze_loop_1): ... this, factor out the call to
vect_analyze_loop_form and generalize to be able to use it twice ...
(vect_analyze_loop): ... here. Perform vect_analyze_loop_form
once only and here.
|
|
The current vector cost interface has quite a bit of redundancy
built in. Each target that defines its own hooks has to replicate
the basic unsigned[3] management. Currently each target also
duplicates the cost adjustment for inner loops.
This patch instead defines a vector_costs class for holding
the scalar or vector cost and allows targets to subclass it.
There is then only one costing hook: to create a new costs
structure of the appropriate type. Everything else can be
virtual functions, with common concepts implemented in the
base class rather than in each target's derivation.
This might seem like excess C++-ification, but it shaves
~100 LOC. I've also got some follow-on changes that become
significantly easier with this patch. Maybe it could help
with things like weighting blocks based on frequency too.
This will clash with Andre's unrolling patches. His patches
have priority so this patch should queue behind them.
The x86 and rs6000 parts fully convert to a self-contained class.
The equivalent aarch64 changes are more complex, so this patch
just does the bare minimum. A later patch will rework the
aarch64 bits.
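A sketch of what a converted target looks like (my_* names are
placeholders; the virtual signatures are assumed from the ChangeLog below):
class my_vector_costs : public vector_costs
{
public:
  using vector_costs::vector_costs;
  unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
                              stmt_vec_info stmt_info, tree vectype,
                              int misalign,
                              vect_cost_model_location where) override;
};

/* The single remaining hook, targetm.vectorize.create_costs.  */
static vector_costs *
my_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
{
  return new my_vector_costs (vinfo, costing_for_scalar);
}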
gcc/
* target.def (targetm.vectorize.init_cost): Replace with...
(targetm.vectorize.create_costs): ...this.
(targetm.vectorize.add_stmt_cost): Delete.
(targetm.vectorize.finish_cost): Likewise.
(targetm.vectorize.destroy_cost_data): Likewise.
* doc/tm.texi.in (TARGET_VECTORIZE_INIT_COST): Replace with...
(TARGET_VECTORIZE_CREATE_COSTS): ...this.
(TARGET_VECTORIZE_ADD_STMT_COST): Delete.
(TARGET_VECTORIZE_FINISH_COST): Likewise.
(TARGET_VECTORIZE_DESTROY_COST_DATA): Likewise.
* doc/tm.texi: Regenerate.
* tree-vectorizer.h (vec_info::vec_info): Remove target_cost_data
parameter.
(vec_info::target_cost_data): Change from a void * to a vector_costs *.
(vector_costs): New class.
(init_cost): Take a vec_info and return a vector_costs.
(dump_stmt_cost): Remove data parameter.
(add_stmt_cost): Replace vinfo and data parameters with a vector_costs.
(add_stmt_costs): Likewise.
(finish_cost): Replace data parameter with a vector_costs.
(destroy_cost_data): Delete.
* tree-vectorizer.c (dump_stmt_cost): Remove data argument and
don't print it.
(vec_info::vec_info): Remove the target_cost_data parameter and
initialize the member variable to null instead.
(vec_info::~vec_info): Delete target_cost_data instead of calling
destroy_cost_data.
(vector_costs::add_stmt_cost): New function.
(vector_costs::finish_cost): Likewise.
(vector_costs::record_stmt_cost): Likewise.
(vector_costs::adjust_cost_for_freq): Likewise.
* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Update
call to vec_info::vec_info.
(vect_compute_single_scalar_iteration_cost): Update after above
changes to costing interface.
(vect_analyze_loop_operations): Likewise.
(vect_estimate_min_profitable_iters): Likewise.
(vect_analyze_loop_2): Initialize LOOP_VINFO_TARGET_COST_DATA
at the start_over point, where it needs to be recreated after
trying without slp. Update retry code accordingly.
* tree-vect-slp.c (_bb_vec_info::_bb_vec_info): Update call
to vec_info::vec_info.
(vect_slp_analyze_operation): Update after above changes to costing
interface.
(vect_bb_vectorization_profitable_p): Likewise.
* targhooks.h (default_init_cost): Replace with...
(default_vectorize_create_costs): ...this.
(default_add_stmt_cost): Delete.
(default_finish_cost, default_destroy_cost_data): Likewise.
* targhooks.c (default_init_cost): Replace with...
(default_vectorize_create_costs): ...this.
(default_add_stmt_cost): Delete, moving logic to vector_costs instead.
(default_finish_cost, default_destroy_cost_data): Delete.
* config/aarch64/aarch64.c (aarch64_vector_costs): Inherit from
vector_costs. Add a constructor.
(aarch64_init_cost): Replace with...
(aarch64_vectorize_create_costs): ...this.
(aarch64_add_stmt_cost): Replace with...
(aarch64_vector_costs::add_stmt_cost): ...this. Use record_stmt_cost
to adjust the cost for inner loops.
(aarch64_finish_cost): Replace with...
(aarch64_vector_costs::finish_cost): ...this.
(aarch64_destroy_cost_data): Delete.
(TARGET_VECTORIZE_INIT_COST): Replace with...
(TARGET_VECTORIZE_CREATE_COSTS): ...this.
(TARGET_VECTORIZE_ADD_STMT_COST): Delete.
(TARGET_VECTORIZE_FINISH_COST): Likewise.
(TARGET_VECTORIZE_DESTROY_COST_DATA): Likewise.
* config/i386/i386.c (ix86_vector_costs): New structure.
(ix86_init_cost): Replace with...
(ix86_vectorize_create_costs): ...this.
(ix86_add_stmt_cost): Replace with...
(ix86_vector_costs::add_stmt_cost): ...this. Use adjust_cost_for_freq
to adjust the cost for inner loops.
(ix86_finish_cost, ix86_destroy_cost_data): Delete.
(TARGET_VECTORIZE_INIT_COST): Replace with...
(TARGET_VECTORIZE_CREATE_COSTS): ...this.
(TARGET_VECTORIZE_ADD_STMT_COST): Delete.
(TARGET_VECTORIZE_FINISH_COST): Likewise.
(TARGET_VECTORIZE_DESTROY_COST_DATA): Likewise.
* config/rs6000/rs6000.c (TARGET_VECTORIZE_INIT_COST): Replace with...
(TARGET_VECTORIZE_CREATE_COSTS): ...this.
(TARGET_VECTORIZE_ADD_STMT_COST): Delete.
(TARGET_VECTORIZE_FINISH_COST): Likewise.
(TARGET_VECTORIZE_DESTROY_COST_DATA): Likewise.
(rs6000_cost_data): Inherit from vector_costs.
Add a constructor. Drop loop_info, cost and costing_for_scalar
in favor of the corresponding vector_costs member variables.
Add "m_" to the names of the remaining member variables and
initialize them.
(rs6000_density_test): Replace with...
(rs6000_cost_data::density_test): ...this.
(rs6000_init_cost): Replace with...
(rs6000_vectorize_create_costs): ...this.
(rs6000_update_target_cost_per_stmt): Replace with...
(rs6000_cost_data::update_target_cost_per_stmt): ...this.
(rs6000_add_stmt_cost): Replace with...
(rs6000_cost_data::add_stmt_cost): ...this. Use adjust_cost_for_freq
to adjust the cost for inner loops.
(rs6000_adjust_vect_cost_per_loop): Replace with...
(rs6000_cost_data::adjust_vect_cost_per_loop): ...this.
(rs6000_finish_cost): Replace with...
(rs6000_cost_data::finish_cost): ...this. Group loop code
into a single if statement and pass the loop_vinfo down to
subroutines.
(rs6000_destroy_cost_data): Delete.
|
|
Now that both are measured in bytes we can unify the two parameters.
2021-10-26 Richard Biener <rguenther@suse.de>
* tree-vectorizer.h (vect_create_addr_base_for_vector_ref):
Remove byte_offset parameter.
(vect_create_data_ref_ptr): Likewise.
* tree-vect-data-refs.c (vect_create_addr_base_for_vector_ref):
Likewise.
(vect_create_data_ref_ptr): Likewise.
* tree-vect-stmts.c (vectorizable_store): Adjust.
(vectorizable_load): Likewise.
|
|
This moves the application of a bias for negative-stride accesses out of
dr_misalignment in favor of a more general optional offset argument.
The negative bias is now computed by get_load_store_type and applied
accordingly to determine the alignment support scheme. Likewise
the peeling/versioning code is adjusted, albeit it still assumes
we'll end up with VMAT_CONTIGUOUS_DOWN or VMAT_CONTIGUOUS_REVERSE;
but at least when that's not so (VMAT_STRIDED_SLP is one possibility),
get_load_store_type will _not_ falsely report an aligned access but
instead an access with known misalignment.
This fixes PR96109.
2021-10-25 Richard Biener <rguenther@suse.de>
PR tree-optimization/96109
* tree-vectorizer.h (dr_misalignment): Add optional offset
parameter.
* tree-vect-data-refs.c (dr_misalignment): Likewise. Remove
offset applied for negative stride accesses.
(vect_enhance_data_refs_alignment): Compute negative stride
access offset and pass it to dr_misalignment.
* tree-vect-stmts.c (get_negative_load_store_type): Pass
negative offset to dr_misalignment.
(get_group_load_store_type): Likewise.
(get_load_store_type): Likewise.
(vectorizable_store): Remove asserts about alignment.
(vectorizable_load): Likewise.
|
|
This refactors vect_supportable_dr_alignment to take the misalignment
as an input parameter, which allows us to elide modifying/restoring
of DR_MISALIGNMENT during alignment peeling analysis and eventually
makes it more straightforward to split out the negative step
handling.
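The call shape after the change, illustratively (a sketch; the exact
parameter lists are assumptions):
/* Compute the misalignment once - possibly adjusted for peeling -
   and hand it to the support query instead of letting the query
   read DR_MISALIGNMENT itself.  */
int misalignment = dr_misalignment (dr_info, vectype);
enum dr_alignment_support support
  = vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment);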
2021-10-19 Richard Biener <rguenther@suse.de>
* tree-vectorizer.h (vect_supportable_dr_alignment): Add
misalignment parameter.
* tree-vect-data-refs.c (vect_get_peeling_costs_all_drs):
Do not change DR_MISALIGNMENT in place, instead pass the
adjusted misalignment to vect_supportable_dr_alignment.
(vect_peeling_supportable): Likewise.
(vect_peeling_hash_get_lowest_cost): Adjust.
(vect_enhance_data_refs_alignment): Likewise.
(vect_vfa_access_size): Likewise.
(vect_supportable_dr_alignment): Add misalignment
parameter and simplify.
* tree-vect-stmts.c (get_negative_load_store_type): Adjust.
(get_group_load_store_type): Likewise.
(get_load_store_type): Likewise.
|
|
This passes down the already available alignment scheme and
misalignment to the load/store costing routines, removing
redundant queries.
2021-10-19 Richard Biener <rguenther@suse.de>
* tree-vectorizer.h (vect_get_store_cost): Adjust signature.
(vect_get_load_cost): Likewise.
* tree-vect-data-refs.c (vect_get_data_access_cost): Get
alignment support scheme and misalignment as arguments
and pass them down.
(vect_get_peeling_costs_all_drs): Compute that info here
and note that we shouldn't need to.
* tree-vect-stmts.c (vect_model_store_cost): Get
alignment support scheme and misalignment as arguments.
(vect_get_store_cost): Likewise.
(vect_model_load_cost): Likewise.
(vect_get_load_cost): Likewise.
(vectorizable_store): Pass down alignment support scheme
and misalignment to costing.
(vectorizable_load): Likewise.
|
|
There are two calls with true as the parameter. One is only relevant
for the case of the misalignment being unknown, which means the
access is never aligned there; the other is in the peeling hash
insert code, used conditionally on the unlimited cost model, where it
adds an artificial count. But the way it works right now is
that it boosts the count if the specific misalignment when not peeling
is unsupported - in particular, when the access is currently aligned
we'll query the backend with a misalign value of zero. I've
changed it to instead boost the peeling count when unknown alignment is
not supported, and noted how we could in principle improve this.
2021-10-19 Richard Biener <rguenther@suse.de>
* tree-vectorizer.h (vect_supportable_dr_alignment): Remove
check_aligned argument.
* tree-vect-data-refs.c (vect_supportable_dr_alignment):
Likewise.
(vect_peeling_hash_insert): Add supportable_if_not_aligned
argument and do not call vect_supportable_dr_alignment here.
(vect_peeling_supportable): Adjust.
(vect_enhance_data_refs_alignment): Compute whether the
access is supported with different alignment here and
pass that down to vect_peeling_hash_insert.
(vect_vfa_access_size): Adjust.
* tree-vect-stmts.c (vect_get_store_cost): Likewise.
(vect_get_load_cost): Likewise.
(get_negative_load_store_type): Likewise.
(get_group_load_store_type): Likewise.
(get_load_store_type): Likewise.
|
|
> * testsuite/libgomp.c++/scan-10.C: Add option -fvect-cost-model=cheap.
I don't think this is the right thing to do.
This just means that at some point between 2013, when -fsimd-cost-model
was introduced, and now, the -fsimd-cost-model= option at least partially
stopped working properly.
As documented, -fsimd-cost-model= overrides the -fvect-cost-model= setting
for OpenMP simd loops (loop->force_vectorize is true) if specified differently
from default.
In tree-vectorizer.h we have:
static inline bool
unlimited_cost_model (loop_p loop)
{
if (loop != NULL && loop->force_vectorize
&& flag_simd_cost_model != VECT_COST_MODEL_DEFAULT)
return flag_simd_cost_model == VECT_COST_MODEL_UNLIMITED;
return (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED);
}
and use it in various places, but we also just use flag_vect_cost_model
in lots of places (and in one spot use flag_simd_cost_model, not sure if
we are sure it is a force_vectorize loop or what).
So, IMHO we should change the above inline function to
loop_cost_model and let it return the cost model and then just
reimplement unlimited_cost_model as
return loop_cost_model (loop) == VECT_COST_MODEL_UNLIMITED;
and then adjust the direct uses of the flag and revert these changes.
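The suggested shape, spelled out (a sketch assembled from the quoted code
above, not verified against the committed patch):
static inline enum vect_cost_model
loop_cost_model (loop_p loop)
{
  if (loop != NULL
      && loop->force_vectorize
      && flag_simd_cost_model != VECT_COST_MODEL_DEFAULT)
    return flag_simd_cost_model;
  return flag_vect_cost_model;
}

static inline bool
unlimited_cost_model (loop_p loop)
{
  return loop_cost_model (loop) == VECT_COST_MODEL_UNLIMITED;
}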
2021-10-12 Jakub Jelinek <jakub@redhat.com>
gcc/
* tree-vectorizer.h (loop_cost_model): New function.
(unlimited_cost_model): Use it.
* tree-vect-loop.c (vect_analyze_loop_costing): Use loop_cost_model
call instead of flag_vect_cost_model.
* tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Likewise.
(vect_prune_runtime_alias_test_list): Likewise. Also use it instead
of flag_simd_cost_model.
gcc/testsuite/
* gcc.dg/gomp/simd-2.c: Remove option -fvect-cost-model=cheap.
* gcc.dg/gomp/simd-3.c: Likewise.
libgomp/
* testsuite/libgomp.c/scan-11.c: Remove option -fvect-cost-model=cheap.
* testsuite/libgomp.c/scan-12.c: Likewise.
* testsuite/libgomp.c/scan-13.c: Likewise.
* testsuite/libgomp.c/scan-14.c: Likewise.
* testsuite/libgomp.c/scan-15.c: Likewise.
* testsuite/libgomp.c/scan-16.c: Likewise.
* testsuite/libgomp.c/scan-17.c: Likewise.
* testsuite/libgomp.c/scan-18.c: Likewise.
* testsuite/libgomp.c/scan-19.c: Likewise.
* testsuite/libgomp.c/scan-20.c: Likewise.
* testsuite/libgomp.c/scan-21.c: Likewise.
* testsuite/libgomp.c/scan-22.c: Likewise.
* testsuite/libgomp.c++/scan-9.C: Likewise.
* testsuite/libgomp.c++/scan-10.C: Likewise.
* testsuite/libgomp.c++/scan-11.C: Likewise.
* testsuite/libgomp.c++/scan-12.C: Likewise.
* testsuite/libgomp.c++/scan-13.C: Likewise.
* testsuite/libgomp.c++/scan-14.C: Likewise.
* testsuite/libgomp.c++/scan-15.C: Likewise.
* testsuite/libgomp.c++/scan-16.C: Likewise.
|