From 06039e71f02e478280ed63a63cbf5e76f3897513 Mon Sep 17 00:00:00 2001 From: Richard Ball Date: Mon, 18 Jul 2022 11:30:04 +0100 Subject: Replace manual swapping idiom with std::swap in aarch64.cc gcc/config/aarch64/aarch64.cc has a few manual swapping idioms of the form: x = in0, in0 = in1, in1 = x; The preferred way is using the standard: std::swap (in0, in1); We should just fix these to use std::swap. This will also allow us to eliminate the x temporary rtx. gcc/ChangeLog: * config/aarch64/aarch64.cc (aarch64_evpc_trn): Use std:swap. (aarch64_evpc_uzp): Likewise. (aarch64_evpc_zip): Likewise. --- gcc/config/aarch64/aarch64.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'gcc') diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 1a514c1..4b486ae 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -23498,7 +23498,7 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d) { HOST_WIDE_INT odd; poly_uint64 nelt = d->perm.length (); - rtx out, in0, in1, x; + rtx out, in0, in1; machine_mode vmode = d->vmode; if (GET_MODE_UNIT_SIZE (vmode) > 8) @@ -23522,7 +23522,7 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d) at the head of aarch64-sve.md for details. */ if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD) { - x = in0, in0 = in1, in1 = x; + std::swap (in0, in1); odd = !odd; } out = d->target; @@ -23592,7 +23592,7 @@ static bool aarch64_evpc_uzp (struct expand_vec_perm_d *d) { HOST_WIDE_INT odd; - rtx out, in0, in1, x; + rtx out, in0, in1; machine_mode vmode = d->vmode; if (GET_MODE_UNIT_SIZE (vmode) > 8) @@ -23615,7 +23615,7 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d) at the head of aarch64-sve.md for details. */ if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD) { - x = in0, in0 = in1, in1 = x; + std::swap (in0, in1); odd = !odd; } out = d->target; @@ -23631,7 +23631,7 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d) { unsigned int high; poly_uint64 nelt = d->perm.length (); - rtx out, in0, in1, x; + rtx out, in0, in1; machine_mode vmode = d->vmode; if (GET_MODE_UNIT_SIZE (vmode) > 8) @@ -23656,7 +23656,7 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d) at the head of aarch64-sve.md for details. */ if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD) { - x = in0, in0 = in1, in1 = x; + std::swap (in0, in1); high = !high; } out = d->target; -- cgit v1.1 From ce92603fbe3b4870e0a38efee1ee766d62942065 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 18 Jul 2022 12:06:00 +0200 Subject: Improve common reduction vs builtin code generation in loop distribution loop distribution currently cannot handle the situation when the last partition is a builtin but there's a common reduction in all partitions (like the final IV value). The following lifts this restriction by making the last non-builtin partition provide the definitions for the loop-closed PHI nodes. Since we have heuristics in place to avoid code generating builtins last writing a testcase is difficult (but I ran into a case with other pending patches that made the heuristic ineffective). What's remaining is the inability to preserve common reductions when all partitions could be builtins (in some cases final value replacement could come to the rescue here). * tree-loop-distribution.cc (copy_loop_before): Add the ability to replace the original LC PHI defs. (generate_loops_for_partition): Pass through a flag whether to redirect original LC PHI defs. (generate_code_for_partition): Likewise. 
(loop_distribution::distribute_loop): Compute the partition that should provide the LC PHI defs for common reductions and pass that down. --- gcc/tree-loop-distribution.cc | 64 ++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 19 deletions(-) (limited to 'gcc') diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc index ed7f432..0714bc4 100644 --- a/gcc/tree-loop-distribution.cc +++ b/gcc/tree-loop-distribution.cc @@ -942,7 +942,7 @@ stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt) /* Return a copy of LOOP placed before LOOP. */ static class loop * -copy_loop_before (class loop *loop) +copy_loop_before (class loop *loop, bool redirect_lc_phi_defs) { class loop *res; edge preheader = loop_preheader_edge (loop); @@ -950,6 +950,24 @@ copy_loop_before (class loop *loop) initialize_original_copy_tables (); res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader); gcc_assert (res != NULL); + + /* When a not last partition is supposed to keep the LC PHIs computed + adjust their definitions. */ + if (redirect_lc_phi_defs) + { + edge exit = single_exit (loop); + for (gphi_iterator si = gsi_start_phis (exit->dest); !gsi_end_p (si); + gsi_next (&si)) + { + gphi *phi = si.phi (); + if (virtual_operand_p (gimple_phi_result (phi))) + continue; + use_operand_p use_p = PHI_ARG_DEF_PTR_FROM_EDGE (phi, exit); + tree new_def = get_current_def (USE_FROM_PTR (use_p)); + SET_USE (use_p, new_def); + } + } + free_original_copy_tables (); delete_update_ssa (); @@ -977,7 +995,7 @@ create_bb_after_loop (class loop *loop) static void generate_loops_for_partition (class loop *loop, partition *partition, - bool copy_p) + bool copy_p, bool keep_lc_phis_p) { unsigned i; basic_block *bbs; @@ -985,7 +1003,7 @@ generate_loops_for_partition (class loop *loop, partition *partition, if (copy_p) { int orig_loop_num = loop->orig_loop_num; - loop = copy_loop_before (loop); + loop = copy_loop_before (loop, keep_lc_phis_p); gcc_assert (loop != NULL); loop->orig_loop_num = orig_loop_num; create_preheader (loop, CP_SIMPLE_PREHEADERS); @@ -1336,7 +1354,8 @@ destroy_loop (class loop *loop) static bool generate_code_for_partition (class loop *loop, - partition *partition, bool copy_p) + partition *partition, bool copy_p, + bool keep_lc_phis_p) { switch (partition->kind) { @@ -1345,7 +1364,8 @@ generate_code_for_partition (class loop *loop, /* Reductions all have to be in the last partition. */ gcc_assert (!partition_reduction_p (partition) || !copy_p); - generate_loops_for_partition (loop, partition, copy_p); + generate_loops_for_partition (loop, partition, copy_p, + keep_lc_phis_p); return false; case PKIND_MEMSET: @@ -3013,6 +3033,7 @@ loop_distribution::distribute_loop (class loop *loop, bool any_builtin = false; bool reduction_in_all = false; + int reduction_partition_num = -1; FOR_EACH_VEC_ELT (partitions, i, partition) { reduction_in_all @@ -3092,10 +3113,13 @@ loop_distribution::distribute_loop (class loop *loop, } /* Put a non-builtin partition last if we need to preserve a reduction. - ??? This is a workaround that makes sort_partitions_by_post_order do - the correct thing while in reality it should sort each component - separately and then put the component with a reduction or a non-builtin - last. */ + In most cases this helps to keep a normal partition last avoiding to + spill a reduction result across builtin calls. + ??? 
The proper way would be to use dependences to see whether we + can move builtin partitions earlier during merge_dep_scc_partitions + and its sort_partitions_by_post_order. Especially when the + dependence graph is composed of multiple independent subgraphs the + heuristic does not work reliably. */ if (reduction_in_all && partition_builtin_p (partitions.last())) FOR_EACH_VEC_ELT (partitions, i, partition) @@ -3126,19 +3150,20 @@ loop_distribution::distribute_loop (class loop *loop, finalize_partitions (loop, &partitions, &alias_ddrs); - /* If there is a reduction in all partitions make sure the last one - is not classified for builtin code generation. */ + /* If there is a reduction in all partitions make sure the last + non-builtin partition provides the LC PHI defs. */ if (reduction_in_all) { - partition = partitions.last (); - if (only_patterns_p - && partition_builtin_p (partition) - && !partition_builtin_p (partitions[0])) + FOR_EACH_VEC_ELT (partitions, i, partition) + if (!partition_builtin_p (partition)) + reduction_partition_num = i; + if (reduction_partition_num == -1) { - nbp = 0; - goto ldist_done; + /* If all partitions are builtin, force the last one to + be code generated as normal partition. */ + partition = partitions.last (); + partition->kind = PKIND_NORMAL; } - partition->kind = PKIND_NORMAL; } nbp = partitions.length (); @@ -3164,7 +3189,8 @@ loop_distribution::distribute_loop (class loop *loop, { if (partition_builtin_p (partition)) (*nb_calls)++; - *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1); + *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1, + i == reduction_partition_num); } ldist_done: -- cgit v1.1 From 9c8349ee1a35dac61b84bbae115ee6a1eeb6ddbd Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Mon, 18 Jul 2022 14:32:41 +0300 Subject: arc: Fix interrupt's epilogue. The stack pointer adjustment in interrupt epilogue is happening after restoring the ZOL registers which is wrong. Fixing this. gcc/ * config/arc/arc.cc (arc_expand_epilogue): Adjust the frame pointer first when in interrupts. gcc/testsuite/ * gcc.target/arc/interrupt-13.c: New file. Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc.cc | 2 +- gcc/testsuite/gcc.target/arc/interrupt-13.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/arc/interrupt-13.c (limited to 'gcc') diff --git a/gcc/config/arc/arc.cc b/gcc/config/arc/arc.cc index fbc17e6..77730c8 100644 --- a/gcc/config/arc/arc.cc +++ b/gcc/config/arc/arc.cc @@ -3965,7 +3965,7 @@ arc_expand_epilogue (int sibcall_p) if (size) emit_insn (gen_blockage ()); - if (ARC_INTERRUPT_P (fn_type) && restore_fp) + if (ARC_INTERRUPT_P (fn_type)) { /* We need to restore FP before any SP operation in an interrupt. 
*/ diff --git a/gcc/testsuite/gcc.target/arc/interrupt-13.c b/gcc/testsuite/gcc.target/arc/interrupt-13.c new file mode 100644 index 0000000..0ed8451 --- /dev/null +++ b/gcc/testsuite/gcc.target/arc/interrupt-13.c @@ -0,0 +1,15 @@ +/* { dg-options "-O2" } */ + +extern int foo (int *); + +void __attribute__((interrupt("ilink"))) +irq (void) +{ + struct { + int x0; + int x1; + } a = {1 ,2}; + foo ((int *)&a); +} + +/* { dg-final { scan-assembler "add_s\\s+sp,sp,8.*pop_s\\s+r0" } } */ -- cgit v1.1 From 7313381d2ce44b72b4c9f70bd5670e5d78d1f631 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Mon, 18 Jul 2022 12:57:10 +0100 Subject: arm: Replace arm_builtin_vectorized_function [PR106253] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch extends the fix for PR106253 to AArch32. As with AArch64, we were using ACLE intrinsics to vectorise scalar built-ins, even though the two sometimes have different ECF_* flags. (That in turn is because the ACLE intrinsics should follow the instruction semantics as closely as possible, whereas the scalar built-ins follow language specs.) The patch also removes the copysignf built-in, which only existed for this purpose and wasn't a “real” arm_neon.h built-in. Doing this also has the side-effect of enabling vectorisation of rint and roundeven. Logically that should be a separate patch, but making it one would have meant adding a new int iterator for the original set of instructions and then removing it again when including new functions. I've restricted the bswap tests to little-endian because we end up with excessive spilling on big-endian. E.g.: sub sp, sp, #8 vstr d1, [sp] vldr d16, [sp] vrev16.8 d16, d16 vstr d16, [sp] vldr d0, [sp] add sp, sp, #8 @ sp needed bx lr Similarly, the copysign tests require little-endian because on big-endian we unnecessarily load the constant from the constant pool: vldr.32 s15, .L3 vdup.32 d0, d7[1] vbsl d0, d2, d1 bx lr .L3: .word -2147483648 gcc/ PR target/106253 * config/arm/arm-builtins.cc (arm_builtin_vectorized_function): Delete. * config/arm/arm-protos.h (arm_builtin_vectorized_function): Delete. * config/arm/arm.cc (TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION): Delete. * config/arm/arm_neon_builtins.def (copysignf): Delete. * config/arm/iterators.md (nvrint_pattern): New attribute. * config/arm/neon.md (2): New pattern. (l2): Likewise. (neon_copysignf): Rename to... (copysign3): ...this. gcc/testsuite/ PR target/106253 * gcc.target/arm/vect_unary_1.c: New test. * gcc.target/arm/vect_binary_1.c: Likewise. 
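As a rough illustration of what the switch to standard pattern names buys
(this sketch is not part of the patch; the committed tests cover the same
ground more thoroughly), scalar loops over __builtin_rintf and
__builtin_roundevenf along the lines below are the kind of code that can
now be vectorised to vrintx.f32 and vrintn.f32, given the options the new
tests use (-Ofast -mfloat-abi=hard plus the Armv8 NEON options):

/* Illustrative only: rint and roundeven were not handled by the old
   hook, so loops like these previously stayed scalar.  */
void
apply_rintf (float *x, int n)
{
  for (int i = 0; i < n; i++)
    x[i] = __builtin_rintf (x[i]);
}

void
apply_roundevenf (float *x, int n)
{
  for (int i = 0; i < n; i++)
    x[i] = __builtin_roundevenf (x[i]);
}

The new vect_unary_1.c test below checks the equivalent element-wise
operations against the expected vrintx.f32/vrintn.f32 sequences via
check-function-bodies.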
--- gcc/config/arm/arm-builtins.cc | 123 --------------- gcc/config/arm/arm-protos.h | 1 - gcc/config/arm/arm.cc | 4 - gcc/config/arm/arm_neon_builtins.def | 1 - gcc/config/arm/iterators.md | 7 + gcc/config/arm/neon.md | 17 +- gcc/testsuite/gcc.target/arm/vect_binary_1.c | 50 ++++++ gcc/testsuite/gcc.target/arm/vect_unary_1.c | 224 +++++++++++++++++++++++++++ 8 files changed, 297 insertions(+), 130 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/vect_binary_1.c create mode 100644 gcc/testsuite/gcc.target/arm/vect_unary_1.c (limited to 'gcc') diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc index d917137..8f8155c 100644 --- a/gcc/config/arm/arm-builtins.cc +++ b/gcc/config/arm/arm-builtins.cc @@ -4026,129 +4026,6 @@ arm_expand_builtin (tree exp, return NULL_RTX; } -tree -arm_builtin_vectorized_function (unsigned int fn, tree type_out, tree type_in) -{ - machine_mode in_mode, out_mode; - int in_n, out_n; - bool out_unsigned_p = TYPE_UNSIGNED (type_out); - - /* Can't provide any vectorized builtins when we can't use NEON. */ - if (!TARGET_NEON) - return NULL_TREE; - - if (TREE_CODE (type_out) != VECTOR_TYPE - || TREE_CODE (type_in) != VECTOR_TYPE) - return NULL_TREE; - - out_mode = TYPE_MODE (TREE_TYPE (type_out)); - out_n = TYPE_VECTOR_SUBPARTS (type_out); - in_mode = TYPE_MODE (TREE_TYPE (type_in)); - in_n = TYPE_VECTOR_SUBPARTS (type_in); - -/* ARM_CHECK_BUILTIN_MODE and ARM_FIND_VRINT_VARIANT are used to find the - decl of the vectorized builtin for the appropriate vector mode. - NULL_TREE is returned if no such builtin is available. */ -#undef ARM_CHECK_BUILTIN_MODE -#define ARM_CHECK_BUILTIN_MODE(C) \ - (TARGET_VFP5 \ - && flag_unsafe_math_optimizations \ - && ARM_CHECK_BUILTIN_MODE_1 (C)) - -#undef ARM_CHECK_BUILTIN_MODE_1 -#define ARM_CHECK_BUILTIN_MODE_1(C) \ - (out_mode == SFmode && out_n == C \ - && in_mode == SFmode && in_n == C) - -#undef ARM_FIND_VRINT_VARIANT -#define ARM_FIND_VRINT_VARIANT(N) \ - (ARM_CHECK_BUILTIN_MODE (2) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v2sf, false) \ - : (ARM_CHECK_BUILTIN_MODE (4) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v4sf, false) \ - : NULL_TREE)) - - switch (fn) - { - CASE_CFN_FLOOR: - return ARM_FIND_VRINT_VARIANT (vrintm); - CASE_CFN_CEIL: - return ARM_FIND_VRINT_VARIANT (vrintp); - CASE_CFN_TRUNC: - return ARM_FIND_VRINT_VARIANT (vrintz); - CASE_CFN_ROUND: - return ARM_FIND_VRINT_VARIANT (vrinta); -#undef ARM_CHECK_BUILTIN_MODE_1 -#define ARM_CHECK_BUILTIN_MODE_1(C) \ - (out_mode == SImode && out_n == C \ - && in_mode == SFmode && in_n == C) - -#define ARM_FIND_VCVT_VARIANT(N) \ - (ARM_CHECK_BUILTIN_MODE (2) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v2sfv2si, false) \ - : (ARM_CHECK_BUILTIN_MODE (4) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##v4sfv4si, false) \ - : NULL_TREE)) - -#define ARM_FIND_VCVTU_VARIANT(N) \ - (ARM_CHECK_BUILTIN_MODE (2) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##uv2sfv2si, false) \ - : (ARM_CHECK_BUILTIN_MODE (4) \ - ? arm_builtin_decl(ARM_BUILTIN_NEON_##N##uv4sfv4si, false) \ - : NULL_TREE)) - CASE_CFN_LROUND: - return (out_unsigned_p - ? ARM_FIND_VCVTU_VARIANT (vcvta) - : ARM_FIND_VCVT_VARIANT (vcvta)); - CASE_CFN_LCEIL: - return (out_unsigned_p - ? ARM_FIND_VCVTU_VARIANT (vcvtp) - : ARM_FIND_VCVT_VARIANT (vcvtp)); - CASE_CFN_LFLOOR: - return (out_unsigned_p - ? 
ARM_FIND_VCVTU_VARIANT (vcvtm) - : ARM_FIND_VCVT_VARIANT (vcvtm)); -#undef ARM_CHECK_BUILTIN_MODE -#define ARM_CHECK_BUILTIN_MODE(C, N) \ - (out_mode == N##mode && out_n == C \ - && in_mode == N##mode && in_n == C) - case CFN_BUILT_IN_BSWAP16: - if (ARM_CHECK_BUILTIN_MODE (4, HI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv4hi, false); - else if (ARM_CHECK_BUILTIN_MODE (8, HI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv8hi, false); - else - return NULL_TREE; - case CFN_BUILT_IN_BSWAP32: - if (ARM_CHECK_BUILTIN_MODE (2, SI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv2si, false); - else if (ARM_CHECK_BUILTIN_MODE (4, SI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv4si, false); - else - return NULL_TREE; - case CFN_BUILT_IN_BSWAP64: - if (ARM_CHECK_BUILTIN_MODE (2, DI)) - return arm_builtin_decl (ARM_BUILTIN_NEON_bswapv2di, false); - else - return NULL_TREE; - CASE_CFN_COPYSIGN: - if (ARM_CHECK_BUILTIN_MODE (2, SF)) - return arm_builtin_decl (ARM_BUILTIN_NEON_copysignfv2sf, false); - else if (ARM_CHECK_BUILTIN_MODE (4, SF)) - return arm_builtin_decl (ARM_BUILTIN_NEON_copysignfv4sf, false); - else - return NULL_TREE; - - default: - return NULL_TREE; - } - return NULL_TREE; -} -#undef ARM_FIND_VCVT_VARIANT -#undef ARM_FIND_VCVTU_VARIANT -#undef ARM_CHECK_BUILTIN_MODE -#undef ARM_FIND_VRINT_VARIANT - void arm_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) { diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h index 9d14209..f8aabbd 100644 --- a/gcc/config/arm/arm-protos.h +++ b/gcc/config/arm/arm-protos.h @@ -103,7 +103,6 @@ extern void neon_pairwise_reduce (rtx, rtx, machine_mode, rtx (*) (rtx, rtx, rtx)); extern rtx mve_bool_vec_to_const (rtx const_vec); extern rtx neon_make_constant (rtx, bool generate = true); -extern tree arm_builtin_vectorized_function (unsigned int, tree, tree); extern void neon_expand_vector_init (rtx, rtx); extern void neon_lane_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT, const_tree); extern void arm_const_bounds (rtx, HOST_WIDE_INT, HOST_WIDE_INT); diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc index 33fb98d..eca99c9 100644 --- a/gcc/config/arm/arm.cc +++ b/gcc/config/arm/arm.cc @@ -739,10 +739,6 @@ static const struct attribute_spec arm_attribute_table[] = #undef TARGET_VECTORIZE_BUILTINS #define TARGET_VECTORIZE_BUILTINS -#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION -#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ - arm_builtin_vectorized_function - #undef TARGET_VECTOR_ALIGNMENT #define TARGET_VECTOR_ALIGNMENT arm_vector_alignment diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 445b2bf..2e642cc 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -264,7 +264,6 @@ VAR1 (UNOP, vcvtv4hf, v4sf) VAR10 (TERNOP, vbsl, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) VAR2 (TERNOP, vbsl, v8hf, v4hf) -VAR2 (UNOP, copysignf, v2sf, v4sf) VAR2 (UNOP, vrintn, v2sf, v4sf) VAR2 (UNOP, vrinta, v2sf, v4sf) VAR2 (UNOP, vrintp, v2sf, v4sf) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 37cf797..29062cd 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -1150,6 +1150,13 @@ (UNSPEC_VRINTA "unconditional") (UNSPEC_VRINTM "unconditional") (UNSPEC_VRINTR "nocond") (UNSPEC_VRINTX "nocond")]) +(define_int_attr nvrint_pattern [(UNSPEC_NVRINTZ "btrunc") + (UNSPEC_NVRINTP "ceil") + (UNSPEC_NVRINTA "round") + (UNSPEC_NVRINTM "floor") + (UNSPEC_NVRINTX 
"rint") + (UNSPEC_NVRINTN "roundeven")]) + (define_int_attr nvrint_variant [(UNSPEC_NVRINTZ "z") (UNSPEC_NVRINTP "p") (UNSPEC_NVRINTA "a") (UNSPEC_NVRINTM "m") (UNSPEC_NVRINTX "x") (UNSPEC_NVRINTN "n")]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 275bcc1..e1dae28 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -635,6 +635,13 @@ [(set_attr "type" "neon_fp_mla_s")] ) +(define_expand "2" + [(set (match_operand:VCVTF 0 "s_register_operand") + (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand")] + NEON_VRINT))] + "TARGET_NEON && TARGET_VFP5 && flag_unsafe_math_optimizations" +) + (define_insn "neon_vrint" [(set (match_operand:VCVTF 0 "s_register_operand" "=w") (unspec:VCVTF [(match_operand:VCVTF 1 @@ -645,6 +652,14 @@ [(set_attr "type" "neon_fp_round_")] ) +(define_expand "l2" + [(set (match_operand: 0 "register_operand") + (FIXUORS: + (unspec:VCVTF [(match_operand:VCVTF 1 "register_operand")] + NEON_VCVT)))] + "TARGET_NEON && TARGET_VFP5 && flag_unsafe_math_optimizations" +) + (define_insn "neon_vcvt" [(set (match_operand: 0 "register_operand" "=w") (FIXUORS: (unspec:VCVTF @@ -3059,7 +3074,7 @@ "TARGET_I8MM" ) -(define_expand "neon_copysignf" +(define_expand "copysign3" [(match_operand:VCVTF 0 "register_operand") (match_operand:VCVTF 1 "register_operand") (match_operand:VCVTF 2 "register_operand")] diff --git a/gcc/testsuite/gcc.target/arm/vect_binary_1.c b/gcc/testsuite/gcc.target/arm/vect_binary_1.c new file mode 100644 index 0000000..c1fc905 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/vect_binary_1.c @@ -0,0 +1,50 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_hard_ok } */ +/* { dg-require-effective-target arm_v8_neon_ok } */ +/* { dg-add-options arm_v8_neon } */ +/* { dg-additional-options "-O3 -mfloat-abi=hard" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include + +#define TEST2(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 2))) \ +test2_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(IN) * 2))) y, \ + IN __attribute__((vector_size(sizeof(IN) * 2))) z) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 2))) x; \ + x[0] = __builtin_##NAME (y[0], z[0]); \ + x[1] = __builtin_##NAME (y[1], z[1]); \ + return x; \ +} + +#define TEST4(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 4))) \ +test4_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(OUT) * 4))) y, \ + IN __attribute__((vector_size(sizeof(OUT) * 4))) z) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 4))) x; \ + x[0] = __builtin_##NAME (y[0], z[0]); \ + x[1] = __builtin_##NAME (y[1], z[1]); \ + x[2] = __builtin_##NAME (y[2], z[2]); \ + x[3] = __builtin_##NAME (y[3], z[3]); \ + return x; \ +} + +/* +** test2_float_copysignf_float: { target arm_little_endian } +** vmov.i32 d0, #(0x80000000|2147483648)(\s+.*) +** vbsl d0, d2, d1 +** bx lr +*/ +TEST2 (float, copysignf, float) + +/* +** test4_float_copysignf_float: { target arm_little_endian } +** vmov.i32 q0, #(0x80000000|2147483648)(\s+.*) +** vbsl q0, q2, q1 +** bx lr +*/ +TEST4 (float, copysignf, float) diff --git a/gcc/testsuite/gcc.target/arm/vect_unary_1.c b/gcc/testsuite/gcc.target/arm/vect_unary_1.c new file mode 100644 index 0000000..4677180 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/vect_unary_1.c @@ -0,0 +1,224 @@ +/* { dg-do compile { target { arm*-*-* } } } */ +/* { dg-require-effective-target arm_hard_ok } */ +/* { dg-require-effective-target 
arm_v8_neon_ok } */ +/* { dg-add-options arm_v8_neon } */ +/* { dg-additional-options "-Ofast -mfloat-abi=hard" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#include + +#define TEST2(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 2))) \ +test2_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(IN) * 2))) y) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 2))) x; \ + x[0] = __builtin_##NAME (y[0]); \ + x[1] = __builtin_##NAME (y[1]); \ + return x; \ +} + +#define TEST4(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 4))) \ +test4_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(OUT) * 4))) y) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 4))) x; \ + x[0] = __builtin_##NAME (y[0]); \ + x[1] = __builtin_##NAME (y[1]); \ + x[2] = __builtin_##NAME (y[2]); \ + x[3] = __builtin_##NAME (y[3]); \ + return x; \ +} + +#define TEST8(OUT, NAME, IN) \ +OUT __attribute__((vector_size(sizeof(OUT) * 8))) \ +test8_##OUT##_##NAME##_##IN (float dummy, \ + IN __attribute__((vector_size(sizeof(OUT) * 8))) y) \ +{ \ + OUT __attribute__((vector_size(sizeof(OUT) * 8))) x; \ + x[0] = __builtin_##NAME (y[0]); \ + x[1] = __builtin_##NAME (y[1]); \ + x[2] = __builtin_##NAME (y[2]); \ + x[3] = __builtin_##NAME (y[3]); \ + x[4] = __builtin_##NAME (y[4]); \ + x[5] = __builtin_##NAME (y[5]); \ + x[6] = __builtin_##NAME (y[6]); \ + x[7] = __builtin_##NAME (y[7]); \ + return x; \ +} + +/* +** test2_float_truncf_float: +** vrintz.f32 d0, d1 +** bx lr +*/ +TEST2 (float, truncf, float) + +/* +** test4_float_truncf_float: +** vrintz.f32 q0, q1 +** bx lr +*/ +TEST4 (float, truncf, float) + +/* +** test2_float_roundf_float: +** vrinta.f32 d0, d1 +** bx lr +*/ +TEST2 (float, roundf, float) + +/* +** test4_float_roundf_float: +** vrinta.f32 q0, q1 +** bx lr +*/ +TEST4 (float, roundf, float) + +/* +** test2_float_floorf_float: +** vrintm.f32 d0, d1 +** bx lr +*/ +TEST2 (float, floorf, float) + +/* +** test4_float_floorf_float: +** vrintm.f32 q0, q1 +** bx lr +*/ +TEST4 (float, floorf, float) + +/* +** test2_float_ceilf_float: +** vrintp.f32 d0, d1 +** bx lr +*/ +TEST2 (float, ceilf, float) + +/* +** test4_float_ceilf_float: +** vrintp.f32 q0, q1 +** bx lr +*/ +TEST4 (float, ceilf, float) + +/* +** test2_float_rintf_float: +** vrintx.f32 d0, d1 +** bx lr +*/ +TEST2 (float, rintf, float) + +/* +** test4_float_rintf_float: +** vrintx.f32 q0, q1 +** bx lr +*/ +TEST4 (float, rintf, float) + +/* +** test2_float_roundevenf_float: +** vrintn.f32 d0, d1 +** bx lr +*/ +TEST2 (float, roundevenf, float) + +/* +** test4_float_roundevenf_float: +** vrintn.f32 q0, q1 +** bx lr +*/ +TEST4 (float, roundevenf, float) + +/* +** test2_int_roundf_float: +** vcvta.s32.f32 d0, d1 +** bx lr +*/ +TEST2 (int, roundf, float) + +/* +** test4_int_roundf_float: +** vcvta.s32.f32 q0, q1 +** bx lr +*/ +TEST4 (int, roundf, float) + +/* +** test2_int_floorf_float: +** vcvtm.s32.f32 d0, d1 +** bx lr +*/ +TEST2 (int, floorf, float) + +/* +** test4_int_floorf_float: +** vcvtm.s32.f32 q0, q1 +** bx lr +*/ +TEST4 (int, floorf, float) + +/* +** test2_int_ceilf_float: +** vcvtp.s32.f32 d0, d1 +** bx lr +*/ +TEST2 (int, ceilf, float) + +/* +** test4_int_ceilf_float: +** vcvtp.s32.f32 q0, q1 +** bx lr +*/ +TEST4 (int, ceilf, float) + +/* +** test2_int_clz_int: +** vclz.i32 d0, d1 +** bx lr +*/ +TEST2 (int, clz, int) + +/* +** test4_int_clz_int: +** vclz.i32 q0, q1 +** bx lr +*/ +TEST4 (int, clz, int) + +/* +** test4_int16_t_bswap16_int16_t: { target 
arm_little_endian } +** vrev16.8 d0, d1 +** bx lr +*/ +TEST4 (int16_t, bswap16, int16_t) + +/* +** test8_int16_t_bswap16_int16_t: { target arm_little_endian } +** vrev16.8 q0, q1 +** bx lr +*/ +TEST8 (int16_t, bswap16, int16_t) + +/* +** test2_int_bswap32_int: { target arm_little_endian } +** vrev32.8 d0, d1 +** bx lr +*/ +TEST2 (int, bswap32, int) + +/* +** test4_int_bswap32_int: { target arm_little_endian } +** vrev32.8 q0, q1 +** bx lr +*/ +TEST4 (int, bswap32, int) + +/* +** test2_int64_t_bswap64_int64_t: { target arm_little_endian } +** vrev64.8 q0, q1 +** bx lr +*/ +TEST2 (int64_t, bswap64, int64_t) -- cgit v1.1 From 87f46a16ec05beb51439f55a4d3c36d64b95b00f Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 18 Jul 2022 13:09:57 +0200 Subject: Fix builtin vs non-builtin partition merge in loop distribution When r7-6373-g40b6bff965d004 fixed a costing issue it failed to make the logic symmetric which means that we now fuse normal vs. builtin when the cost model says so but we don't fuse builtin vs. normal. The following fixes that, also allowing the cost model to decide to fuse two builtin partitions as otherwise an intermediate non-builtin can result in a partial merge as well. * tree-loop-distribution.cc (loop_distribution::distribute_loop): When computing cost-based merging do not disregard builtin classified partitions in some cases. * gcc.dg/tree-ssa/ldist-24.c: XFAIL. * gcc.dg/tree-ssa/ldist-36.c: Adjust expected outcome. --- gcc/testsuite/gcc.dg/tree-ssa/ldist-24.c | 5 +++-- gcc/testsuite/gcc.dg/tree-ssa/ldist-36.c | 3 ++- gcc/tree-loop-distribution.cc | 5 +---- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'gcc') diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-24.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-24.c index 75f7b8f..2403a24 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-24.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-24.c @@ -20,5 +20,6 @@ void foo () } } -/* { dg-final { scan-tree-dump "generated memcpy" "ldist" } } */ -/* { dg-final { scan-tree-dump "generated memset zero" "ldist" } } */ +/* The cost modeling does not consider WAR as beneficial to split. */ +/* { dg-final { scan-tree-dump "generated memcpy" "ldist" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump "generated memset zero" "ldist" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-36.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-36.c index 07393f0..6d56006 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-36.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-36.c @@ -25,4 +25,5 @@ foo (struct st * restrict p) } } -/* { dg-final { scan-tree-dump-times "Loop nest . distributed: split to 0 loops and 3 library" 1 "ldist" } } */ +/* The cost modeling doesn't consider splitting a WAR re-use profitable. */ +/* { dg-final { scan-tree-dump-times "Loop nest . 
distributed: split to 1 loops and 1 library" 1 "ldist" } } */ diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc index 0714bc4..0ee441c 100644 --- a/gcc/tree-loop-distribution.cc +++ b/gcc/tree-loop-distribution.cc @@ -3090,10 +3090,7 @@ loop_distribution::distribute_loop (class loop *loop, for (i = 0; partitions.iterate (i, &into); ++i) { bool changed = false; - if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET) - continue; - for (int j = i + 1; - partitions.iterate (j, &partition); ++j) + for (int j = i + 1; partitions.iterate (j, &partition); ++j) { if (share_memory_accesses (rdg, into, partition)) { -- cgit v1.1 From 7501eec65c60701f72621d04eeb5342bad2fe4fb Mon Sep 17 00:00:00 2001 From: Claudiu Zissulescu Date: Mon, 18 Jul 2022 15:07:00 +0300 Subject: arc: Add ARCHS release 310a tune variant. Add mtune and mcpu options for ARCHS release 310a type CPU. The mtune=release31a is designed to be used as an alternative to the mcpu=hs4x_rel31 option. ARCHS4x release 31a uses DSP instructions which are implemented a bit different than mpy9. Hence, use safer mpy2 option. gcc/ * config/arc/arc-arch.h (arc_tune_attr): Add ARC_TUNE_ARCHS4X_REL31A variant. * config/arc/arc.cc (arc_override_options): Tune options for release 310a. (arc_sched_issue_rate): Use correct enum. (arc600_corereg_hazard): Textual change. (arc_hazard): Add release 310a tunning. * config/arc/arc.md (tune): Update and take into consideration new tune option. (tune_dspmpy): Likewise. (tune_store): New attribute. * config/arc/arc.opt (mtune): New tune option. * config/arc/arcHS4x.md (hs4x_brcc0, hs4x_brcc1): New cpu units. (hs4x_brcc_op): New instruction rezervation. (hs4x_data_store_1_op): Likewise. * config/arc/arc-cpus.def (hs4x_rel31): New cpu variant. * config/arc/arc-tables.opt: Regenerate. * config/arc/t-multilib: Likewise. * doc/invoke.texi (ARC): Update mcpu and tune sections. 
Signed-off-by: Claudiu Zissulescu --- gcc/config/arc/arc-arch.h | 3 +- gcc/config/arc/arc-cpus.def | 1 + gcc/config/arc/arc-tables.opt | 3 + gcc/config/arc/arc.cc | 192 ++++++++++++++++++++++++++---------------- gcc/config/arc/arc.md | 32 ++++--- gcc/config/arc/arc.opt | 3 + gcc/config/arc/arcHS4x.md | 17 +++- gcc/config/arc/t-multilib | 4 +- gcc/doc/invoke.texi | 16 ++++ 9 files changed, 181 insertions(+), 90 deletions(-) (limited to 'gcc') diff --git a/gcc/config/arc/arc-arch.h b/gcc/config/arc/arc-arch.h index 4c728a8..83b156e 100644 --- a/gcc/config/arc/arc-arch.h +++ b/gcc/config/arc/arc-arch.h @@ -77,7 +77,8 @@ enum arc_tune_attr ARC_TUNE_CORE_3, ARC_TUNE_ARCHS4X, ARC_TUNE_ARCHS4XD, - ARC_TUNE_ARCHS4XD_SLOW + ARC_TUNE_ARCHS4XD_SLOW, + ARC_TUNE_ARCHS4X_REL31A }; /* Extra options for a processor template to hold any CPU specific diff --git a/gcc/config/arc/arc-cpus.def b/gcc/config/arc/arc-cpus.def index baf61db..5668b0f 100644 --- a/gcc/config/arc/arc-cpus.def +++ b/gcc/config/arc/arc-cpus.def @@ -64,6 +64,7 @@ ARC_CPU (hs38, hs, FL_MPYOPT_9|FL_DIVREM|FL_LL64, NONE, NONE) ARC_CPU (hs38_linux, hs, FL_MPYOPT_9|FL_DIVREM|FL_LL64|FL_FPU_FPUD_ALL, NONE, NONE) ARC_CPU (hs4x, hs, FL_MPYOPT_9|FL_DIVREM|FL_LL64, NONE, ARCHS4X) ARC_CPU (hs4xd, hs, FL_MPYOPT_9|FL_DIVREM|FL_LL64, NONE, ARCHS4XD) +ARC_CPU (hs4x_rel31, hs, FL_MPYOPT_2|FL_DIVREM|FL_LL64, NONE, ARCHS4X_REL31A) ARC_CPU (arc600, 6xx, FL_BS, NONE, ARC600) ARC_CPU (arc600_norm, 6xx, FL_BS|FL_NORM, NONE, ARC600) diff --git a/gcc/config/arc/arc-tables.opt b/gcc/config/arc/arc-tables.opt index 8cc5135..0a0d354 100644 --- a/gcc/config/arc/arc-tables.opt +++ b/gcc/config/arc/arc-tables.opt @@ -70,6 +70,9 @@ EnumValue Enum(processor_type) String(hs4xd) Value(PROCESSOR_hs4xd) EnumValue +Enum(processor_type) String(hs4x_rel31) Value(PROCESSOR_hs4x_rel31) + +EnumValue Enum(processor_type) String(arc600) Value(PROCESSOR_arc600) EnumValue diff --git a/gcc/config/arc/arc.cc b/gcc/config/arc/arc.cc index 77730c8..064790b 100644 --- a/gcc/config/arc/arc.cc +++ b/gcc/config/arc/arc.cc @@ -646,8 +646,8 @@ arc_sched_issue_rate (void) { switch (arc_tune) { - case TUNE_ARCHS4X: - case TUNE_ARCHS4XD: + case ARC_TUNE_ARCHS4X: + case ARC_TUNE_ARCHS4XD: return 3; default: break; @@ -1458,6 +1458,12 @@ arc_override_options (void) if (!OPTION_SET_P (unaligned_access) && TARGET_HS) unaligned_access = 1; + if (TARGET_HS && (arc_tune == ARC_TUNE_ARCHS4X_REL31A)) + { + TARGET_CODE_DENSITY_FRAME = 0; + flag_delayed_branch = 0; + } + /* These need to be done at start up. It's convenient to do them here. */ arc_init (); } @@ -7817,6 +7823,115 @@ arc_store_addr_hazard_p (rtx_insn* producer, rtx_insn* consumer) return arc_store_addr_hazard_internal_p (producer, consumer); } +/* Return length adjustment for INSN. + For ARC600: + A write to a core reg greater or equal to 32 must not be immediately + followed by a use. Anticipate the length requirement to insert a nop + between PRED and SUCC to prevent a hazard. 
*/ + +static int +arc600_corereg_hazard (rtx_insn *pred, rtx_insn *succ) +{ + if (!TARGET_ARC600) + return 0; + if (GET_CODE (PATTERN (pred)) == SEQUENCE) + pred = as_a (PATTERN (pred))->insn (1); + if (GET_CODE (PATTERN (succ)) == SEQUENCE) + succ = as_a (PATTERN (succ))->insn (0); + if (recog_memoized (pred) == CODE_FOR_mulsi_600 + || recog_memoized (pred) == CODE_FOR_umul_600 + || recog_memoized (pred) == CODE_FOR_mac_600 + || recog_memoized (pred) == CODE_FOR_mul64_600 + || recog_memoized (pred) == CODE_FOR_mac64_600 + || recog_memoized (pred) == CODE_FOR_umul64_600 + || recog_memoized (pred) == CODE_FOR_umac64_600) + return 0; + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, PATTERN (pred), NONCONST) + { + const_rtx x = *iter; + switch (GET_CODE (x)) + { + case SET: case POST_INC: case POST_DEC: case PRE_INC: case PRE_DEC: + break; + default: + /* This is also fine for PRE/POST_MODIFY, because they + contain a SET. */ + continue; + } + rtx dest = XEXP (x, 0); + /* Check if this sets a an extension register. N.B. we use 61 for the + condition codes, which is definitely not an extension register. */ + if (REG_P (dest) && REGNO (dest) >= 32 && REGNO (dest) < 61 + /* Check if the same register is used by the PAT. */ + && (refers_to_regno_p + (REGNO (dest), + REGNO (dest) + (GET_MODE_SIZE (GET_MODE (dest)) + 3) / 4U, + PATTERN (succ), 0))) + return 4; + } + return 0; +} + +/* For ARC600: + A write to a core reg greater or equal to 32 must not be immediately + followed by a use. Anticipate the length requirement to insert a nop + between PRED and SUCC to prevent a hazard. */ + +int +arc_hazard (rtx_insn *pred, rtx_insn *succ) +{ + if (!pred || !INSN_P (pred) || !succ || !INSN_P (succ)) + return 0; + + if (TARGET_ARC600) + return arc600_corereg_hazard (pred, succ); + + return 0; +} + +/* When compiling for release 310a, insert a nop before any + conditional jump. */ + +static int +arc_check_release31a (rtx_insn *pred, rtx_insn *succ) +{ + if (!pred || !INSN_P (pred) || !succ || !INSN_P (succ)) + return 0; + + if (!JUMP_P (pred) && !single_set (pred)) + return 0; + + if (!JUMP_P (succ) && !single_set (succ)) + return 0; + + if (TARGET_HS && (arc_tune == ARC_TUNE_ARCHS4X_REL31A)) + switch (get_attr_type (pred)) + { + case TYPE_STORE: + switch (get_attr_type (succ)) + { + case TYPE_BRCC: + case TYPE_BRCC_NO_DELAY_SLOT: + case TYPE_LOOP_END: + return 1; + default: + break; + } + break; + case TYPE_BRCC: + case TYPE_BRCC_NO_DELAY_SLOT: + case TYPE_LOOP_END: + if (get_attr_type (succ) == TYPE_STORE) + return 1; + break; + default: + break; + } + + return 0; +} + /* The same functionality as arc_hazard. It is called in machine reorg before any other optimization. Hence, the NOP size is taken into account when doing branch shortening. */ @@ -7830,10 +7945,8 @@ workaround_arc_anomaly (void) for (insn = get_insns (); insn; insn = NEXT_INSN (insn)) { succ0 = next_real_insn (insn); - if (arc_hazard (insn, succ0)) - { - emit_insn_before (gen_nopv (), succ0); - } + if (arc_hazard (insn, succ0) || arc_check_release31a (insn, succ0)) + emit_insn_before (gen_nopv (), succ0); } if (!TARGET_ARC700) @@ -9324,56 +9437,6 @@ disi_highpart (rtx in) return simplify_gen_subreg (SImode, in, DImode, TARGET_BIG_ENDIAN ? 0 : 4); } -/* Return length adjustment for INSN. - For ARC600: - A write to a core reg greater or equal to 32 must not be immediately - followed by a use. Anticipate the length requirement to insert a nop - between PRED and SUCC to prevent a hazard. 
*/ - -static int -arc600_corereg_hazard (rtx_insn *pred, rtx_insn *succ) -{ - if (!TARGET_ARC600) - return 0; - if (GET_CODE (PATTERN (pred)) == SEQUENCE) - pred = as_a (PATTERN (pred))->insn (1); - if (GET_CODE (PATTERN (succ)) == SEQUENCE) - succ = as_a (PATTERN (succ))->insn (0); - if (recog_memoized (pred) == CODE_FOR_mulsi_600 - || recog_memoized (pred) == CODE_FOR_umul_600 - || recog_memoized (pred) == CODE_FOR_mac_600 - || recog_memoized (pred) == CODE_FOR_mul64_600 - || recog_memoized (pred) == CODE_FOR_mac64_600 - || recog_memoized (pred) == CODE_FOR_umul64_600 - || recog_memoized (pred) == CODE_FOR_umac64_600) - return 0; - subrtx_iterator::array_type array; - FOR_EACH_SUBRTX (iter, array, PATTERN (pred), NONCONST) - { - const_rtx x = *iter; - switch (GET_CODE (x)) - { - case SET: case POST_INC: case POST_DEC: case PRE_INC: case PRE_DEC: - break; - default: - /* This is also fine for PRE/POST_MODIFY, because they - contain a SET. */ - continue; - } - rtx dest = XEXP (x, 0); - /* Check if this sets an extension register. N.B. we use 61 for the - condition codes, which is definitely not an extension register. */ - if (REG_P (dest) && REGNO (dest) >= 32 && REGNO (dest) < 61 - /* Check if the same register is used by the PAT. */ - && (refers_to_regno_p - (REGNO (dest), - REGNO (dest) + (GET_MODE_SIZE (GET_MODE (dest)) + 3) / 4U, - PATTERN (succ), 0))) - return 4; - } - return 0; -} - /* Given a rtx, check if it is an assembly instruction or not. */ static int @@ -9408,23 +9471,6 @@ arc_asm_insn_p (rtx x) return 0; } -/* For ARC600: - A write to a core reg greater or equal to 32 must not be immediately - followed by a use. Anticipate the length requirement to insert a nop - between PRED and SUCC to prevent a hazard. */ - -int -arc_hazard (rtx_insn *pred, rtx_insn *succ) -{ - if (!pred || !INSN_P (pred) || !succ || !INSN_P (succ)) - return 0; - - if (TARGET_ARC600) - return arc600_corereg_hazard (pred, succ); - - return 0; -} - /* Return length adjustment for INSN. */ int diff --git a/gcc/config/arc/arc.md b/gcc/config/arc/arc.md index 39b3580..7170445 100644 --- a/gcc/config/arc/arc.md +++ b/gcc/config/arc/arc.md @@ -645,22 +645,21 @@ ;; is made that makes conditional execution required. 
(define_attr "tune" "none,arc600,arc7xx,arc700_4_2_std,arc700_4_2_xmac, \ -core_3, archs4x, archs4xd, archs4xd_slow" +archs4x, archs4xd" (const - (cond [(symbol_ref "arc_tune == TUNE_ARC600") + (cond [(symbol_ref "arc_tune == ARC_TUNE_ARC600") (const_string "arc600") (symbol_ref "arc_tune == ARC_TUNE_ARC7XX") (const_string "arc7xx") - (symbol_ref "arc_tune == TUNE_ARC700_4_2_STD") + (symbol_ref "arc_tune == ARC_TUNE_ARC700_4_2_STD") (const_string "arc700_4_2_std") - (symbol_ref "arc_tune == TUNE_ARC700_4_2_XMAC") + (symbol_ref "arc_tune == ARC_TUNE_ARC700_4_2_XMAC") (const_string "arc700_4_2_xmac") - (symbol_ref "arc_tune == ARC_TUNE_CORE_3") - (const_string "core_3") - (symbol_ref "arc_tune == TUNE_ARCHS4X") + (ior (symbol_ref "arc_tune == ARC_TUNE_ARCHS4X") + (symbol_ref "arc_tune == ARC_TUNE_ARCHS4X_REL31A")) (const_string "archs4x") - (ior (symbol_ref "arc_tune == TUNE_ARCHS4XD") - (symbol_ref "arc_tune == TUNE_ARCHS4XD_SLOW")) + (ior (symbol_ref "arc_tune == ARC_TUNE_ARCHS4XD") + (symbol_ref "arc_tune == ARC_TUNE_ARCHS4XD_SLOW")) (const_string "archs4xd")] (const_string "none")))) @@ -671,13 +670,22 @@ core_3, archs4x, archs4xd, archs4xd_slow" (define_attr "tune_dspmpy" "none, slow, fast" (const - (cond [(ior (symbol_ref "arc_tune == TUNE_ARCHS4X") - (symbol_ref "arc_tune == TUNE_ARCHS4XD")) + (cond [(ior (symbol_ref "arc_tune == ARC_TUNE_ARCHS4X") + (symbol_ref "arc_tune == ARC_TUNE_ARCHS4XD")) (const_string "fast") - (symbol_ref "arc_tune == TUNE_ARCHS4XD_SLOW") + (symbol_ref "arc_tune == ARC_TUNE_ARCHS4XD_SLOW") (const_string "slow")] (const_string "none")))) +(define_attr "tune_store" "none, normal, rel31a" + (const + (cond [(ior (symbol_ref "arc_tune == ARC_TUNE_ARCHS4X") + (symbol_ref "arc_tune == ARC_TUNE_ARCHS4XD")) + (const_string "normal") + (symbol_ref "arc_tune == ARC_TUNE_ARCHS4X_REL31A") + (const_string "rel31a")] + (const_string "none")))) + ;; Move instructions. (define_expand "movqi" [(set (match_operand:QI 0 "move_dest_operand" "") diff --git a/gcc/config/arc/arc.opt b/gcc/config/arc/arc.opt index eb85f49..0add5a2 100644 --- a/gcc/config/arc/arc.opt +++ b/gcc/config/arc/arc.opt @@ -276,6 +276,9 @@ Enum(arc_tune_attr) String(arc750d) Value(ARC_TUNE_ARC700_4_2_XMAC) EnumValue Enum(arc_tune_attr) String(core3) Value(ARC_TUNE_CORE_3) +EnumValue +Enum(arc_tune_attr) String(release31a) Value(ARC_TUNE_ARCHS4X_REL31A) + mindexed-loads Target Var(TARGET_INDEXED_LOADS) Init(TARGET_INDEXED_LOADS_DEFAULT) Enable the use of indexed loads. 
diff --git a/gcc/config/arc/arcHS4x.md b/gcc/config/arc/arcHS4x.md index 5136eba..1009833 100644 --- a/gcc/config/arc/arcHS4x.md +++ b/gcc/config/arc/arcHS4x.md @@ -27,14 +27,21 @@ (define_cpu_unit "hs4x_mult" "ARCHS4x") (define_cpu_unit "hs4x_x1, hs4x_x2" "ARCHS4x") (define_cpu_unit "hs4x_y1, hs4x_y2" "ARCHS4x") +(define_cpu_unit "hs4x_brcc0, hs4x_brcc1" "ARCHS4x") (define_insn_reservation "hs4x_brj_op" 1 (and (match_test "TARGET_HS") (eq_attr "tune" "archs4x, archs4xd") (eq_attr "type" "call, call_no_delay_slot, uncond_branch, jump, \ -branch, brcc,brcc_no_delay_slot, sfunc")) +branch, sfunc")) "hs4x_issue0") +(define_insn_reservation "hs4x_brcc_op" 1 + (and (match_test "TARGET_HS") + (eq_attr "tune" "archs4x, archs4xd") + (eq_attr "type" "brcc,brcc_no_delay_slot,loop_end")) + "hs4x_issue0 + hs4x_brcc0 + hs4x_brcc1") + (define_insn_reservation "hs4x_data_load_op" 4 (and (match_test "TARGET_HS") (eq_attr "tune" "archs4x, archs4xd") @@ -43,10 +50,16 @@ branch, brcc,brcc_no_delay_slot, sfunc")) (define_insn_reservation "hs4x_data_store_op" 1 (and (match_test "TARGET_HS") - (eq_attr "tune" "archs4x, archs4xd") + (eq_attr "tune_store" "normal") (eq_attr "type" "store")) "hs4x_issue1 + hs4x_ld_st") +(define_insn_reservation "hs4x_data_store_1_op" 2 + (and (match_test "TARGET_HS") + (eq_attr "tune_store" "rel31a") + (eq_attr "type" "store")) + "hs4x_issue1 + hs4x_ld_st + hs4x_brcc0, hs4x_brcc1") + ;; Advanced ALU (define_insn_reservation "hs4x_adv_alue_op" 4 (and (match_test "TARGET_HS") diff --git a/gcc/config/arc/t-multilib b/gcc/config/arc/t-multilib index 8d97ad1..921945e 100644 --- a/gcc/config/arc/t-multilib +++ b/gcc/config/arc/t-multilib @@ -21,9 +21,9 @@ # along with GCC; see the file COPYING3. If not see # . -MULTILIB_OPTIONS = mcpu=em/mcpu=em_mini/mcpu=arcem/mcpu=em4/mcpu=em4_dmips/mcpu=em4_fpus/mcpu=em4_fpuda/mcpu=quarkse_em/mcpu=hs/mcpu=archs/mcpu=hs34/mcpu=hs38/mcpu=hs38_linux/mcpu=hs4x/mcpu=hs4xd/mcpu=arc600/mcpu=arc600_norm/mcpu=arc600_mul64/mcpu=arc600_mul32x16/mcpu=arc601/mcpu=arc601_norm/mcpu=arc601_mul64/mcpu=arc601_mul32x16/mcpu=arc700/mcpu=nps400 +MULTILIB_OPTIONS = mcpu=em/mcpu=em_mini/mcpu=arcem/mcpu=em4/mcpu=em4_dmips/mcpu=em4_fpus/mcpu=em4_fpuda/mcpu=quarkse_em/mcpu=hs/mcpu=archs/mcpu=hs34/mcpu=hs38/mcpu=hs38_linux/mcpu=hs4x/mcpu=hs4xd/mcpu=hs4x_rel31/mcpu=arc600/mcpu=arc600_norm/mcpu=arc600_mul64/mcpu=arc600_mul32x16/mcpu=arc601/mcpu=arc601_norm/mcpu=arc601_mul64/mcpu=arc601_mul32x16/mcpu=arc700/mcpu=nps400 -MULTILIB_DIRNAMES = em em_mini arcem em4 em4_dmips em4_fpus em4_fpuda quarkse_em hs archs hs34 hs38 hs38_linux hs4x hs4xd arc600 arc600_norm arc600_mul64 arc600_mul32x16 arc601 arc601_norm arc601_mul64 arc601_mul32x16 arc700 nps400 +MULTILIB_DIRNAMES = em em_mini arcem em4 em4_dmips em4_fpus em4_fpuda quarkse_em hs archs hs34 hs38 hs38_linux hs4x hs4xd hs4x_rel31 arc600 arc600_norm arc600_mul64 arc600_mul32x16 arc601 arc601_norm arc601_mul64 arc601_mul32x16 arc700 nps400 # Aliases: MULTILIB_MATCHES = mcpu?arc600=mcpu?ARC600 diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 84d6f0f..94fe57a 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -20053,6 +20053,15 @@ Compile for ARC HS38 CPU. @item hs38_linux Compile for ARC HS38 CPU with all hardware extensions on. +@item hs4x +Compile for ARC HS4x CPU. + +@item hs4xd +Compile for ARC HS4xD CPU. + +@item hs4x_rel31 +Compile for ARC HS4x CPU release 3.10a. + @item arc600_norm Compile for ARC 600 CPU with @code{norm} instructions enabled. @@ -20662,6 +20671,13 @@ Tune for ARC725D CPU. 
@item ARC750D Tune for ARC750D CPU. +@item core3 +Tune for ARCv2 core3 type CPU. This option enable usage of +@code{dbnz} instruction. + +@item release31a +Tune for ARC4x release 3.10a. + @end table @item -mmultcost=@var{num} -- cgit v1.1 From 7df79970bfec96e186cd50ac951f7017c2109a13 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 18 Jul 2022 16:47:20 +0100 Subject: RISC-V/doc: Correct the name of `-mriscv-attribute' Correct the name of the `-mriscv-attribute' invocation option, including a typo in the negated form. gcc/ * doc/invoke.texi (Option Summary): Fix `-mno-riscv-attribute'. (RISC-V Options): Likewise, and `-mriscv-attribute'. --- gcc/doc/invoke.texi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'gcc') diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 94fe57a..9f2f97c 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -1219,7 +1219,7 @@ See RS/6000 and PowerPC Options. -mcmodel=medlow -mcmodel=medany @gol -mexplicit-relocs -mno-explicit-relocs @gol -mrelax -mno-relax @gol --mriscv-attribute -mmo-riscv-attribute @gol +-mriscv-attribute -mno-riscv-attribute @gol -malign-data=@var{type} @gol -mbig-endian -mlittle-endian @gol -mstack-protector-guard=@var{guard} -mstack-protector-guard-reg=@var{reg} @gol @@ -28377,8 +28377,8 @@ Take advantage of linker relaxations to reduce the number of instructions required to materialize symbol addresses. The default is to take advantage of linker relaxations. -@item -memit-attribute -@itemx -mno-emit-attribute +@item -mriscv-attribute +@itemx -mno-riscv-attribute Emit (do not emit) RISC-V attribute to record extra information into ELF objects. This feature requires at least binutils 2.32. -- cgit v1.1 From fa16bb8ac0aba681bc9242f9a9717824c4867f91 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 18 Jul 2022 16:47:20 +0100 Subject: RISC-V/doc: Correct the formatting of `-mstack-protector-guard-reg=' Add missing second space around the `-mstack-protector-guard-reg=' invocation option. gcc/ * doc/invoke.texi (Option Summary): Add missing second space around `-mstack-protector-guard-reg='. --- gcc/doc/invoke.texi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 9f2f97c..1f6c73ec 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -1222,7 +1222,7 @@ See RS/6000 and PowerPC Options. -mriscv-attribute -mno-riscv-attribute @gol -malign-data=@var{type} @gol -mbig-endian -mlittle-endian @gol --mstack-protector-guard=@var{guard} -mstack-protector-guard-reg=@var{reg} @gol +-mstack-protector-guard=@var{guard} -mstack-protector-guard-reg=@var{reg} @gol -mstack-protector-guard-offset=@var{offset}} @emph{RL78 Options} -- cgit v1.1 From e9ee752bbe2cc5632b803b01dc7c98ff214aede9 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 18 Jul 2022 16:47:21 +0100 Subject: RISC-V/doc: Add index references for `mrelax' and `mriscv-attribute' Add missing index references for the `-mrelax' and `-mriscv-attribute' invocation options. gcc/ * doc/invoke.texi (RISC-V Options): Add index references for `mrelax' and `mriscv-attribute'. --- gcc/doc/invoke.texi | 2 ++ 1 file changed, 2 insertions(+) (limited to 'gcc') diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 1f6c73ec..94689be 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -28373,12 +28373,14 @@ limit optimization. 
@item -mrelax @itemx -mno-relax +@opindex mrelax Take advantage of linker relaxations to reduce the number of instructions required to materialize symbol addresses. The default is to take advantage of linker relaxations. @item -mriscv-attribute @itemx -mno-riscv-attribute +@opindex mriscv-attribute Emit (do not emit) RISC-V attribute to record extra information into ELF objects. This feature requires at least binutils 2.32. -- cgit v1.1 From 5e47c9333df6df1aa9da861f07e68f985d7d28fb Mon Sep 17 00:00:00 2001 From: Andrew MacLeod Date: Thu, 14 Jul 2022 12:35:55 -0400 Subject: Check if transitives need to be registered. Whenever a relation is added, register_transitive is always called. If neither operand was in a relation before, or this is not a new relation, then there is no need to register transitives. PR tree-optimization/106280 * value-relation.cc (dom_oracle::register_relation): Register transitives only when it is possible for there to be one. (dom_oracle::set_one_relation): Return NULL if this is an existing relation. --- gcc/value-relation.cc | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'gcc') diff --git a/gcc/value-relation.cc b/gcc/value-relation.cc index 13ce441..bd34425 100644 --- a/gcc/value-relation.cc +++ b/gcc/value-relation.cc @@ -967,8 +967,12 @@ dom_oracle::register_relation (basic_block bb, relation_kind k, tree op1, equiv_oracle::register_relation (bb, k, op1, op2); else { + // if neither op1 nor op2 are in a relation before this is registered, + // there will be no transitive. + bool check = bitmap_bit_p (m_relation_set, SSA_NAME_VERSION (op1)) + || bitmap_bit_p (m_relation_set, SSA_NAME_VERSION (op2)); relation_chain *ptr = set_one_relation (bb, k, op1, op2); - if (ptr) + if (ptr && check) register_transitives (bb, *ptr); } } @@ -1010,13 +1014,16 @@ dom_oracle::set_one_relation (basic_block bb, relation_kind k, tree op1, // Check into whether we can simply replace the relation rather than // intersecting it. THis may help with some optimistic iterative // updating algorithms. - ptr->intersect (vr); + bool new_rel = ptr->intersect (vr); if (dump_file && (dump_flags & TDF_DETAILS)) { fprintf (dump_file, " to produce "); ptr->dump (dump_file); - fprintf (dump_file, "\n"); + fprintf (dump_file, " %s.\n", new_rel ? "Updated" : "No Change"); } + // If there was no change, return no record.. + if (!new_rel) + return NULL; } else { -- cgit v1.1 From 79fb1124c8c31da3ca70ee3a07bf15f3d2d87ab7 Mon Sep 17 00:00:00 2001 From: GCC Administrator Date: Tue, 19 Jul 2022 00:16:32 +0000 Subject: Daily bump. --- gcc/ChangeLog | 112 ++++++++++++++++++++++++++++++++++++++++++++++++ gcc/DATESTAMP | 2 +- gcc/testsuite/ChangeLog | 21 +++++++++ 3 files changed, 134 insertions(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 994875f..43b70ba 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,115 @@ +2022-07-18 Andrew MacLeod + + PR tree-optimization/106280 + * value-relation.cc (dom_oracle::register_relation): Register + transitives only when it is possible for there to be one. + (dom_oracle::set_one_relation): Return NULL if this is an + existing relation. + +2022-07-18 Maciej W. Rozycki + + * doc/invoke.texi (RISC-V Options): Add index references for + `mrelax' and `mriscv-attribute'. + +2022-07-18 Maciej W. Rozycki + + * doc/invoke.texi (Option Summary): Add missing second space + around `-mstack-protector-guard-reg='. + +2022-07-18 Maciej W. 
Rozycki + + * doc/invoke.texi (Option Summary): Fix `-mno-riscv-attribute'. + (RISC-V Options): Likewise, and `-mriscv-attribute'. + +2022-07-18 Claudiu Zissulescu + + * config/arc/arc-arch.h (arc_tune_attr): Add + ARC_TUNE_ARCHS4X_REL31A variant. + * config/arc/arc.cc (arc_override_options): Tune options for + release 310a. + (arc_sched_issue_rate): Use correct enum. + (arc600_corereg_hazard): Textual change. + (arc_hazard): Add release 310a tunning. + * config/arc/arc.md (tune): Update and take into consideration new + tune option. + (tune_dspmpy): Likewise. + (tune_store): New attribute. + * config/arc/arc.opt (mtune): New tune option. + * config/arc/arcHS4x.md (hs4x_brcc0, hs4x_brcc1): New cpu units. + (hs4x_brcc_op): New instruction rezervation. + (hs4x_data_store_1_op): Likewise. + * config/arc/arc-cpus.def (hs4x_rel31): New cpu variant. + * config/arc/arc-tables.opt: Regenerate. + * config/arc/t-multilib: Likewise. + * doc/invoke.texi (ARC): Update mcpu and tune sections. + +2022-07-18 Richard Biener + + * tree-loop-distribution.cc (loop_distribution::distribute_loop): + When computing cost-based merging do not disregard builtin + classified partitions in some cases. + +2022-07-18 Richard Sandiford + + PR target/106253 + * config/arm/arm-builtins.cc (arm_builtin_vectorized_function): + Delete. + * config/arm/arm-protos.h (arm_builtin_vectorized_function): Delete. + * config/arm/arm.cc (TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION): + Delete. + * config/arm/arm_neon_builtins.def (copysignf): Delete. + * config/arm/iterators.md (nvrint_pattern): New attribute. + * config/arm/neon.md (2): + New pattern. + (l2): + Likewise. + (neon_copysignf): Rename to... + (copysign3): ...this. + +2022-07-18 Claudiu Zissulescu + + * config/arc/arc.cc (arc_expand_epilogue): Adjust the frame + pointer first when in interrupts. + +2022-07-18 Richard Biener + + * tree-loop-distribution.cc (copy_loop_before): Add + the ability to replace the original LC PHI defs. + (generate_loops_for_partition): Pass through a flag + whether to redirect original LC PHI defs. + (generate_code_for_partition): Likewise. + (loop_distribution::distribute_loop): Compute the partition + that should provide the LC PHI defs for common reductions + and pass that down. + +2022-07-18 Richard Ball + + * config/aarch64/aarch64.cc (aarch64_evpc_trn): Use std:swap. + (aarch64_evpc_uzp): Likewise. + (aarch64_evpc_zip): Likewise. + +2022-07-18 Roger Sayle + + PR target/106231 + * config/i386/i386.md (*ctzsidi2_ext): New insn_and_split + to recognize any_extend:DI of ctz:SI which is implicitly extended. + (*ctzsidi2_ext_falsedep): New define_insn to model a DImode + extended ctz:SI that has preceding xor to break false dependency. + +2022-07-18 Roger Sayle + + * config/i386/predicates.md (x86_64_const_vector_operand): + Check the operand's mode matches the specified mode argument. + +2022-07-18 Roger Sayle + + * config/i386/sse.md (kunpckhi): Add UNSPEC_MASKOP unspec. + (kunpcksi): Likewise, add UNSPEC_MASKOP unspec. + (kunpckdi): Likewise, add UNSPEC_MASKOP unspec. + (vec_pack_trunc_qi): Update to specify the now required + UNSPEC_MASKOP unspec. + (vec_pack_trunc_): Likewise. 
+ 2022-07-16 Takayuki 'January June' Suwa * config/xtensa/xtensa.md diff --git a/gcc/DATESTAMP b/gcc/DATESTAMP index 2ac5479..a394c7a 100644 --- a/gcc/DATESTAMP +++ b/gcc/DATESTAMP @@ -1 +1 @@ -20220718 +20220719 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index cc8ea71..36913da 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,24 @@ +2022-07-18 Richard Biener + + * gcc.dg/tree-ssa/ldist-24.c: XFAIL. + * gcc.dg/tree-ssa/ldist-36.c: Adjust expected outcome. + +2022-07-18 Richard Sandiford + + PR target/106253 + * gcc.target/arm/vect_unary_1.c: New test. + * gcc.target/arm/vect_binary_1.c: Likewise. + +2022-07-18 Claudiu Zissulescu + + * gcc.target/arc/interrupt-13.c: New file. + +2022-07-18 Roger Sayle + + PR target/106231 + * gcc.target/i386/pr106231-1.c: New test case. + * gcc.target/i386/pr106231-2.c: New test case. + 2022-07-15 H.J. Lu PR target/85620 -- cgit v1.1 From 2180cdd8a0e65c2790a7732c82de87f83478487b Mon Sep 17 00:00:00 2001 From: Takayuki 'January June' Suwa Date: Mon, 18 Jul 2022 21:43:45 +0900 Subject: xtensa: Correct the relative RTX cost that corresponds to the Move Immediate "MOVI" instruction This patch corrects the overestimation of the relative cost of '(set (reg) (const_int N))' where N fits into the instruction itself. In fact, such overestimation confuses the RTL loop invariant motion pass. As a result, it brings almost no negative impact from the speed point of view, but addtiional reg-reg move instructions and register allocation pressure about the size. /* example, optimized for size */ extern int foo(void); extern int array[16]; void test_0(void) { unsigned int i; for (i = 0; i < sizeof(array)/sizeof(*array); ++i) array[i] = 1024; } void test_1(void) { unsigned int i; for (i = 0; i < sizeof(array)/sizeof(*array); ++i) array[i] = array[i] ? 1024 : 0; } void test_2(void) { unsigned int i; for (i = 0; i < sizeof(array)/sizeof(*array); ++i) array[i] = foo() ? 
0 : 1024; } ;; before .literal_position .literal .LC0, array test_0: l32r a3, .LC0 movi.n a2, 0 movi a4, 0x400 // OK .L2: s32i.n a4, a3, 0 addi.n a2, a2, 1 addi.n a3, a3, 4 bnei a2, 16, .L2 ret.n .literal_position .literal .LC1, array test_1: l32r a2, .LC1 movi.n a3, 0 movi a5, 0x400 // NG .L6: l32i.n a4, a2, 0 beqz.n a4, .L5 mov.n a4, a5 // should be "movi a4, 0x400" .L5: s32i.n a4, a2, 0 addi.n a3, a3, 1 addi.n a2, a2, 4 bnei a3, 16, .L6 ret.n .literal_position .literal .LC2, array test_2: addi sp, sp, -32 s32i.n a12, sp, 24 l32r a12, .LC2 s32i.n a13, sp, 20 s32i.n a14, sp, 16 s32i.n a15, sp, 12 s32i.n a0, sp, 28 addi a13, a12, 64 movi.n a15, 0 // NG movi a14, 0x400 // and wastes callee-saved registers (only 4) .L11: call0 foo mov.n a3, a14 // should be "movi a3, 0x400" movnez a3, a15, a2 s32i.n a3, a12, 0 addi.n a12, a12, 4 bne a12, a13, .L11 l32i.n a0, sp, 28 l32i.n a12, sp, 24 l32i.n a13, sp, 20 l32i.n a14, sp, 16 l32i.n a15, sp, 12 addi sp, sp, 32 ret.n ;; after .literal_position .literal .LC0, array test_0: l32r a3, .LC0 movi.n a2, 0 movi a4, 0x400 // OK .L2: s32i.n a4, a3, 0 addi.n a2, a2, 1 addi.n a3, a3, 4 bnei a2, 16, .L2 ret.n .literal_position .literal .LC1, array test_1: l32r a2, .LC1 movi.n a3, 0 .L6: l32i.n a4, a2, 0 beqz.n a4, .L5 movi a4, 0x400 // OK .L5: s32i.n a4, a2, 0 addi.n a3, a3, 1 addi.n a2, a2, 4 bnei a3, 16, .L6 ret.n .literal_position .literal .LC2, array test_2: addi sp, sp, -16 s32i.n a12, sp, 8 l32r a12, .LC2 s32i.n a13, sp, 4 s32i.n a0, sp, 12 addi a13, a12, 64 .L11: call0 foo movi.n a3, 0 // OK movi a4, 0x400 // and less register allocation pressure moveqz a3, a4, a2 s32i.n a3, a12, 0 addi.n a12, a12, 4 bne a12, a13, .L11 l32i.n a0, sp, 12 l32i.n a12, sp, 8 l32i.n a13, sp, 4 addi sp, sp, 16 ret.n gcc/ChangeLog: * config/xtensa/xtensa.cc (xtensa_rtx_costs): Change the relative cost of '(set (reg) (const_int N))' where N fits into signed 12-bit from 4 to 0 if optimizing for size. And use the appropriate macro instead of the bare number 4. --- gcc/config/xtensa/xtensa.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc index 9433745..a851a7a 100644 --- a/gcc/config/xtensa/xtensa.cc +++ b/gcc/config/xtensa/xtensa.cc @@ -4073,7 +4073,7 @@ xtensa_rtx_costs (rtx x, machine_mode mode, int outer_code, case SET: if (xtensa_simm12b (INTVAL (x))) { - *total = 4; + *total = speed ? COSTS_N_INSNS (1) : 0; return true; } break; -- cgit v1.1 From 40f6e5912288256ee8ac41474f2dce7b6881c111 Mon Sep 17 00:00:00 2001 From: Roger Sayle Date: Tue, 19 Jul 2022 08:39:43 +0100 Subject: PR c/106264: Silence warnings from __builtin_modf et al. This middle-end patch resolves PR c/106264 which is a spurious warning regression caused by the tree-level expansion of modf, frexp and remquo producing "expression has no-effect" when the built-in function's result is ignored. When these built-ins were first expanded at tree-level, fold_builtin_n would blindly set TREE_NO_WARNING for all built-ins. Now that we're more discerning, we should precisely call suppress_warning selectively on those COMPOUND_EXPRs that need them. 2022-07-19 Roger Sayle Richard Biener gcc/ChangeLog PR c/106264 * builtins.cc (fold_builtin_frexp): Call suppress_warning on COMPOUND_EXPR to silence spurious warning if result isn't used. (fold_builtin_modf): Likewise. (do_mpfr_remquo): Likewise. gcc/testsuite/ChangeLog PR c/106264 * gcc.dg/pr106264.c: New test case. 
--- gcc/builtins.cc | 19 +++++++++++++------ gcc/testsuite/gcc.dg/pr106264.c | 27 +++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/pr106264.c (limited to 'gcc') diff --git a/gcc/builtins.cc b/gcc/builtins.cc index 35b9197..91b9c9f 100644 --- a/gcc/builtins.cc +++ b/gcc/builtins.cc @@ -8625,7 +8625,7 @@ fold_builtin_frexp (location_t loc, tree arg0, tree arg1, tree rettype) if (TYPE_MAIN_VARIANT (TREE_TYPE (arg1)) == integer_type_node) { const REAL_VALUE_TYPE *const value = TREE_REAL_CST_PTR (arg0); - tree frac, exp; + tree frac, exp, res; switch (value->cl) { @@ -8656,7 +8656,9 @@ fold_builtin_frexp (location_t loc, tree arg0, tree arg1, tree rettype) /* Create the COMPOUND_EXPR (*arg1 = trunc, frac). */ arg1 = fold_build2_loc (loc, MODIFY_EXPR, rettype, arg1, exp); TREE_SIDE_EFFECTS (arg1) = 1; - return fold_build2_loc (loc, COMPOUND_EXPR, rettype, arg1, frac); + res = fold_build2_loc (loc, COMPOUND_EXPR, rettype, arg1, frac); + suppress_warning (res, OPT_Wunused_value); + return res; } return NULL_TREE; @@ -8682,6 +8684,7 @@ fold_builtin_modf (location_t loc, tree arg0, tree arg1, tree rettype) { const REAL_VALUE_TYPE *const value = TREE_REAL_CST_PTR (arg0); REAL_VALUE_TYPE trunc, frac; + tree res; switch (value->cl) { @@ -8711,8 +8714,10 @@ fold_builtin_modf (location_t loc, tree arg0, tree arg1, tree rettype) arg1 = fold_build2_loc (loc, MODIFY_EXPR, rettype, arg1, build_real (rettype, trunc)); TREE_SIDE_EFFECTS (arg1) = 1; - return fold_build2_loc (loc, COMPOUND_EXPR, rettype, arg1, - build_real (rettype, frac)); + res = fold_build2_loc (loc, COMPOUND_EXPR, rettype, arg1, + build_real (rettype, frac)); + suppress_warning (res, OPT_Wunused_value); + return res; } return NULL_TREE; @@ -10673,8 +10678,10 @@ do_mpfr_remquo (tree arg0, tree arg1, tree arg_quo) integer_quo)); TREE_SIDE_EFFECTS (result_quo) = 1; /* Combine the quo assignment with the rem. */ - result = non_lvalue (fold_build2 (COMPOUND_EXPR, type, - result_quo, result_rem)); + result = fold_build2 (COMPOUND_EXPR, type, + result_quo, result_rem); + suppress_warning (result, OPT_Wunused_value); + result = non_lvalue (result); } } } diff --git a/gcc/testsuite/gcc.dg/pr106264.c b/gcc/testsuite/gcc.dg/pr106264.c new file mode 100644 index 0000000..6b4af49 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr106264.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -Wall" } */ +double frexp (double, int*); +double modf (double, double*); +double remquo (double, double, int*); + +int f (void) +{ + int y; + frexp (1.0, &y); + return y; +} + +double g (void) +{ + double y; + modf (1.0, &y); + return y; +} + +int h (void) +{ + int y; + remquo (1.0, 1.0, &y); + return y; +} + -- cgit v1.1 From 0f129766fdb687394f0eea04f69268b5cc034cda Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Tue, 19 Jul 2022 10:02:40 +0200 Subject: lto/106334 - relax assert during WPA tree merging The dwarf2out map of tree to symbol + offset is populated too early when streaming in trees so that when WPA tree merging decides to recycle them the mapping prevails and if we are unlucky the same address is used for another tree with a symbol + offset DIE to record. The following mitigates the resulting ICE by relaxing the assert, allowing re-use of a slot during WPA. Delaying the register would be better but it's already somewhat hairy and uglifying this further doesn't look too important right now. 
PR lto/106334 * dwarf2out.cc (dwarf2out_register_external_die): Allow map entry re-use during WPA. --- gcc/dwarf2out.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'gcc') diff --git a/gcc/dwarf2out.cc b/gcc/dwarf2out.cc index e3920c8..3ac39c1 100644 --- a/gcc/dwarf2out.cc +++ b/gcc/dwarf2out.cc @@ -6069,7 +6069,11 @@ dwarf2out_register_external_die (tree decl, const char *sym, if (!external_die_map) external_die_map = hash_map::create_ggc (1000); - gcc_checking_assert (!external_die_map->get (decl)); + /* When we do tree merging during WPA we can end up re-using GC memory + as there's currently no way to unregister external DIEs. Ideally + we'd register them only after merging finished but allowing override + here is easiest. See PR106334. */ + gcc_checking_assert (flag_wpa || !external_die_map->get (decl)); sym_off_pair p = { IDENTIFIER_POINTER (get_identifier (sym)), off }; external_die_map->put (decl, p); } -- cgit v1.1 From e4ff11a8f2e80adb8ada69bf35ee6a1ab18a9c85 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Tue, 19 Jul 2022 09:57:22 +0200 Subject: middle-end/106331 - fix mem attributes for string op arguments get_memory_rtx tries hard to come up with a MEM_EXPR to record in the memory attributes but in the last fallback fails to properly account for an unknown offset and thus, as visible in this testcase, incorrect alignment computed from set_mem_attributes. The following rectifies both parts. PR middle-end/106331 * builtins.cc (get_memory_rtx): Compute alignment from the original address and set MEM_OFFSET to unknown when we create a MEM_EXPR from the base object of the address. * gfortran.dg/pr106331.f90: New testcase. --- gcc/builtins.cc | 13 +++++++++---- gcc/testsuite/gfortran.dg/pr106331.f90 | 7 +++++++ 2 files changed, 16 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gfortran.dg/pr106331.f90 (limited to 'gcc') diff --git a/gcc/builtins.cc b/gcc/builtins.cc index 91b9c9f..0d13197 100644 --- a/gcc/builtins.cc +++ b/gcc/builtins.cc @@ -1360,7 +1360,7 @@ expand_builtin_prefetch (tree exp) rtx get_memory_rtx (tree exp, tree len) { - tree orig_exp = exp; + tree orig_exp = exp, base; rtx addr, mem; /* When EXP is not resolved SAVE_EXPR, MEM_ATTRS can be still derived @@ -1391,10 +1391,11 @@ get_memory_rtx (tree exp, tree len) if (is_gimple_mem_ref_addr (TREE_OPERAND (exp, 0))) set_mem_attributes (mem, exp, 0); else if (TREE_CODE (TREE_OPERAND (exp, 0)) == ADDR_EXPR - && (exp = get_base_address (TREE_OPERAND (TREE_OPERAND (exp, 0), - 0)))) + && (base = get_base_address (TREE_OPERAND (TREE_OPERAND (exp, 0), + 0)))) { - exp = build_fold_addr_expr (exp); + unsigned int align = get_pointer_alignment (TREE_OPERAND (exp, 0)); + exp = build_fold_addr_expr (base); exp = fold_build2 (MEM_REF, build_array_type (char_type_node, build_range_type (sizetype, @@ -1402,6 +1403,10 @@ get_memory_rtx (tree exp, tree len) NULL)), exp, build_int_cst (ptr_type_node, 0)); set_mem_attributes (mem, exp, 0); + /* Since we stripped parts make sure the offset is unknown and the + alignment is computed from the original address. */ + clear_mem_offset (mem); + set_mem_align (mem, align); } set_mem_alias_set (mem, 0); return mem; diff --git a/gcc/testsuite/gfortran.dg/pr106331.f90 b/gcc/testsuite/gfortran.dg/pr106331.f90 new file mode 100644 index 0000000..3873863 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/pr106331.f90 @@ -0,0 +1,7 @@ +! { dg-do run } +! 
{ dg-options "-Og" }
+
+PROGRAM main
+  CHARACTER(LEN=24) :: a(2)
+  a = ''
+END PROGRAM
--
cgit v1.1

From 4c3231302577445417715a7c22e879e4159376d3 Mon Sep 17 00:00:00 2001
From: Prathamesh Kulkarni
Date: Tue, 19 Jul 2022 17:43:26 +0530
Subject: forwprop: Use lhs type instead of arg0 in folding VEC_PERM_EXPR.

gcc/ChangeLog:

	* tree-ssa-forwprop.cc (simplify_permutation): Use lhs type
	instead of TREE_TYPE (arg0) as result type in folding VEC_PERM_EXPR.
---
 gcc/tree-ssa-forwprop.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'gcc')

diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index fdc4bc8..d04cf4b 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -2661,7 +2661,7 @@ simplify_permutation (gimple_stmt_iterator *gsi)
 
       /* Shuffle of a constructor.  */
       bool ret = false;
-      tree res_type = TREE_TYPE (arg0);
+      tree res_type = TREE_TYPE (gimple_assign_lhs (stmt));
       tree opt = fold_ternary (VEC_PERM_EXPR, res_type, arg0, arg1, op2);
       if (!opt
	  || (TREE_CODE (opt) != CONSTRUCTOR && TREE_CODE (opt) != VECTOR_CST))
--
cgit v1.1

From edf0c132b19f73e5739715c2ac90c4ae1e96dc31 Mon Sep 17 00:00:00 2001
From: Martin Liska
Date: Tue, 19 Jul 2022 15:40:58 +0200
Subject: Remove trailing : for subheading.

gcc/ChangeLog:

	* doc/extend.texi: Remove trailing :.
---
 gcc/doc/extend.texi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'gcc')

diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index dfbe33a..4222e76 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -11385,7 +11385,7 @@ keyword after the declarator.
 It is up to you to make sure that the assembler names you choose do not
 conflict with any other assembler symbols, or reference registers.
 
-@subsubheading Assembler names for data:
+@subsubheading Assembler names for data
 
 This sample shows how to specify the assembler name for data:
 
@@ -11407,7 +11407,7 @@ since such variables do not have assembler names.  If you are trying to
 put the variable in a particular register, see @ref{Explicit Register
 Variables}.
 
-@subsubheading Assembler names for functions:
+@subsubheading Assembler names for functions
 
 To specify the assembler name for functions, write a declaration for the
 function before its definition and put @code{asm} there, like this:
--
cgit v1.1
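As a quick illustration of the asm-label extension documented by the
extend.texi section touched in the last patch above, here is a minimal C
sketch; the assembler names "xyz_counter" and "xyz_bump" are invented for
this example and are not taken from any of the patches.

/* Minimal sketch of GCC's asm-label extension (illustrative names only).  */

/* Data: the object below is emitted under the assembler name
   "xyz_counter" instead of the default name derived from "counter".  */
int counter asm ("xyz_counter") = 0;

/* Functions: write a declaration carrying the asm label before the
   definition; the definition is then emitted as "xyz_bump".  */
extern int bump (int) asm ("xyz_bump");

int
bump (int x)
{
  counter += x;
  return counter;
}

Compiling this with "gcc -S" and inspecting the assembly should show the
symbols xyz_counter and xyz_bump rather than counter and bump.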